diff --git a/.github/actions/verify-tag-version/action.yml b/.github/actions/verify-tag-version/action.yml new file mode 100644 index 0000000000..1b34bdba03 --- /dev/null +++ b/.github/actions/verify-tag-version/action.yml @@ -0,0 +1,41 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Verify that the pushed tag version matches the workspace package version in Cargo.toml. +# Example: tags v0.2.0 and v0.2.0-rc1 both pass when Cargo.toml declares 0.2.0, because the -rc* suffix is stripped before the versions are compared. +# Requires: checkout before this step (Cargo.toml in workspace root). Use on tag push (GITHUB_REF like refs/tags/v0.1.0). + +name: 'Verify tag matches crate version' +description: 'Exits with error if GITHUB_REF tag base version does not match [workspace.package] version in Cargo.toml (strips -rc*).' 
+ +runs: + using: 'composite' + steps: + - run: | + echo "Tag and crate version match: $TAG_VERSION" + shell: bash +# Temporarily disabled for 0.1.0-incubating-rc0; restore the check below once the incubating suffix is no longer needed. +# - run: | +# TAG_VERSION="${GITHUB_REF#refs/tags/v}" +# CRATE_VERSION=$(sed -n '/^\[workspace.package\]/,/^\[/p' Cargo.toml | grep '^\s*version\s*=' | head -1 | sed -E 's/.*"([^"]+)".*/\1/') +# base() { echo "$1" | sed -E 's/-rc(\.[0-9]+|[0-9]+)$//'; } +# if [ "$(base "$TAG_VERSION")" != "$(base "$CRATE_VERSION")" ]; then +# echo "::error::Tag version ($TAG_VERSION) does not match Cargo.toml version ($CRATE_VERSION). Run scripts/bump-version.sh before tagging, or tag the version that is in Cargo.toml." +# exit 1 +# fi +# echo "Tag and crate version match: $TAG_VERSION" +# shell: bash diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000000..714e644bd5 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,30 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +version: 2 +updates: + # GitHub Actions used by the repository's workflows + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "monthly" + + # Rust client workspace + - package-ecosystem: "cargo" + directory: "/fluss-rust" + schedule: + interval: "monthly" diff --git a/.github/release.yml b/.github/release.yml new file mode 100644 index 0000000000..3ca2be6277 --- /dev/null +++ b/.github/release.yml @@ -0,0 +1,42 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to you under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Configures "Generate release notes" on GitHub Releases. 
+# https://docs.github.com/en/repositories/releasing-projects-on-github/automatically-generated-release-notes + +changelog: + categories: + - title: Added + labels: + - feat + - feature + - title: Changed + labels: + - refactor + - title: Fixed + labels: + - fix + - bugfix + - title: Docs + labels: + - docs + - documentation + - title: CI / Build + labels: + - ci + - build + - title: Chore + labels: + - chore diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 4ef7d372de..ee4a269d73 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -25,11 +25,13 @@ on: paths-ignore: - 'website/**' - 'helm/**' + - 'fluss-rust/**' - '**/*.md' pull_request: paths-ignore: - 'website/**' - 'helm/**' + - 'fluss-rust/**' - '**/*.md' concurrency: diff --git a/.github/workflows/client-integration.yml b/.github/workflows/client-integration.yml new file mode 100644 index 0000000000..6d77be877f --- /dev/null +++ b/.github/workflows/client-integration.yml @@ -0,0 +1,366 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Client integration tests against a SAME-REVISION server (FIP-40 §3.2). 
+# Builds the Fluss server image from this source tree ONCE, caches + saves it, +# then fans out the Rust / Python / C++ / Elixir integration suites against that +# fluss:dev image (build-once-fan-out, à la Temporal/PyFlink). The image build is +# cached on server/proto hashes, so client-only PRs reuse it instead of rebuilding. + +name: Client Integration + +on: + push: + branches: + - main + paths: + - 'fluss-rpc/src/main/proto/**' + - 'fluss-server/**' + - 'fluss-common/**' + - 'fluss-dist/**' + - 'docker/fluss/**' + - 'fluss-rust/crates/**' + - 'fluss-rust/bindings/**' + - 'fluss-rust/Cargo.toml' + - 'fluss-rust/Cargo.lock' + - '.github/workflows/client-integration.yml' + pull_request: + branches: + - main + paths: + - 'fluss-rpc/src/main/proto/**' + - 'fluss-server/**' + - 'fluss-common/**' + - 'fluss-dist/**' + - 'docker/fluss/**' + - 'fluss-rust/crates/**' + - 'fluss-rust/bindings/**' + - 'fluss-rust/Cargo.toml' + - 'fluss-rust/Cargo.lock' + - '.github/workflows/client-integration.yml' + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }} + cancel-in-progress: true + +jobs: + # Decide which client suites to run, mirroring the per-binding scoping the + # standalone fluss-rust repo had: a binding suite runs only when its own + # binding, the core fluss-rs crate, or the server/proto changed. On non-PR + # events (push to main, manual) everything runs. 
+ detect-changes: + runs-on: ubuntu-latest + outputs: + rust: ${{ steps.filter.outputs.rust }} + python: ${{ steps.filter.outputs.python }} + cpp: ${{ steps.filter.outputs.cpp }} + elixir: ${{ steps.filter.outputs.elixir }} + steps: + - uses: actions/checkout@v6 + with: + fetch-depth: 0 + - id: filter + run: | + if [ "${{ github.event_name }}" != "pull_request" ]; then + all=true; changed="" + else + all=false + changed=$(git diff --name-only "${{ github.event.pull_request.base.sha }}...HEAD") + fi + echo "Changed files:"; echo "$changed" + has() { echo "$changed" | grep -qE "$1"; } + protocol=false; core=false; py=false; cpp=false; ex=false + has '^(fluss-rpc/src/main/proto/|fluss-server/|fluss-common/|fluss-dist/|docker/fluss/)' && protocol=true || true + has '^(fluss-rust/crates/|fluss-rust/Cargo\.)' && core=true || true + has '^fluss-rust/bindings/python/' && py=true || true + has '^fluss-rust/bindings/cpp/' && cpp=true || true + has '^fluss-rust/bindings/elixir/' && ex=true || true + # a suite runs if: non-PR (all) OR core crate OR server/proto OR its own binding changed + gate() { if [ "$all" = true ] || [ "$core" = true ] || [ "$protocol" = true ] || [ "$1" = true ]; then echo true; else echo false; fi; } + { + echo "rust=$(gate false)" + echo "python=$(gate $py)" + echo "cpp=$(gate $cpp)" + echo "elixir=$(gate $ex)" + } >> "$GITHUB_OUTPUT" + + # Build the server image from THIS source tree once; cache it on server/proto + # hashes so client-only PRs restore it instead of rebuilding. The saved image + # is uploaded as an artifact and loaded by every client integration job. 
+ build-server-image: + needs: detect-changes + if: needs.detect-changes.outputs.rust == 'true' || needs.detect-changes.outputs.python == 'true' || needs.detect-changes.outputs.cpp == 'true' || needs.detect-changes.outputs.elixir == 'true' + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + + - name: Cache server image + id: image-cache + uses: actions/cache@v4 + with: + path: /tmp/fluss-dev.tar + key: fluss-dev-image-${{ hashFiles('fluss-server/**', 'fluss-common/**', 'fluss-rpc/**', 'fluss-dist/**', 'docker/fluss/**', 'pom.xml') }} + + - name: Set up JDK 17 + if: steps.image-cache.outputs.cache-hit != 'true' + uses: actions/setup-java@v5 + with: + java-version: '17' + distribution: 'temurin' + cache: maven + + - name: Build server image (fluss:dev) from source + if: steps.image-cache.outputs.cache-hit != 'true' + run: | + ./mvnw -B --no-transfer-progress clean package -pl fluss-dist -am -DskipTests + rm -rf docker/fluss/build-target + mkdir -p docker/fluss/build-target + cp -r build-target/* docker/fluss/build-target/ + docker build -t fluss:dev docker/fluss + docker save fluss:dev -o /tmp/fluss-dev.tar + + - name: Upload server image + uses: actions/upload-artifact@v4 + with: + name: fluss-dev-image + path: /tmp/fluss-dev.tar + retention-days: 1 + + rust-integration: + needs: [detect-changes, build-server-image] + if: needs.detect-changes.outputs.rust == 'true' + timeout-minutes: 60 + runs-on: ubuntu-latest + defaults: + run: + working-directory: fluss-rust + env: + FLUSS_IMAGE: fluss + FLUSS_VERSION: dev + steps: + - uses: actions/checkout@v6 + - uses: actions/download-artifact@v4 + with: + name: fluss-dev-image + path: /tmp + - name: Load server image + run: docker load -i /tmp/fluss-dev.tar + - name: Install protoc + uses: arduino/setup-protoc@c65c819552d16ad3c9b72d9dfd5ba5237b9c906b # v3.0.0 + with: + repo-token: ${{ secrets.GITHUB_TOKEN }} + - name: Rust Cache + uses: Swatinem/rust-cache@c19371144df3bb44fab255c43d04cbc2ab54d1c4 # v2.9.1 + 
with: + workspaces: fluss-rust + - name: Integration tests + run: cargo test --features integration_tests --test test_fluss -p fluss-rs + env: + RUST_LOG: DEBUG + RUST_BACKTRACE: full + + python-integration: + needs: [detect-changes, build-server-image] + if: needs.detect-changes.outputs.python == 'true' + timeout-minutes: 60 + runs-on: ubuntu-latest + strategy: + matrix: + python: ["3.9", "3.10", "3.11", "3.12"] + defaults: + run: + working-directory: fluss-rust + env: + FLUSS_TEST_CLUSTER_BIN: ${{ github.workspace }}/fluss-rust/target/debug/fluss-test-cluster + FLUSS_IMAGE: fluss + FLUSS_VERSION: dev + steps: + - uses: actions/checkout@v6 + - uses: actions/download-artifact@v4 + with: + name: fluss-dev-image + path: /tmp + - name: Load server image + run: docker load -i /tmp/fluss-dev.tar + - name: Set up Python + uses: actions/setup-python@v6 + with: + python-version: ${{ matrix.python }} + - name: Install uv + uses: astral-sh/setup-uv@37802adc94f370d6bfd71619e3f0bf239e1f3b78 + - name: Install protoc + uses: arduino/setup-protoc@c65c819552d16ad3c9b72d9dfd5ba5237b9c906b # v3.0.0 + with: + repo-token: ${{ secrets.GITHUB_TOKEN }} + - name: Rust Cache + uses: Swatinem/rust-cache@c19371144df3bb44fab255c43d04cbc2ab54d1c4 # v2.9.1 + with: + workspaces: fluss-rust + - name: Build fluss-test-cluster binary + run: cargo build -p fluss-test-cluster + - name: Build Python bindings + working-directory: fluss-rust/bindings/python + run: | + uv sync --extra dev --no-install-project + uv run --no-sync maturin develop --uv + - name: Run tests (parallel) + working-directory: fluss-rust/bindings/python + run: uv run --no-sync pytest test/ -v -n 2 --dist=loadfile + env: + RUST_LOG: DEBUG + RUST_BACKTRACE: full + FLUSS_SKIP_CLUSTER_TEARDOWN: "1" + - name: Dump fluss cluster container logs + if: always() + run: | + mkdir -p cluster-logs + for c in $(docker ps -a --filter "name=shared-test" --format '{{.Names}}'); do + docker logs "$c" > "cluster-logs/$c.log" 2>&1 || true + done + - 
uses: actions/upload-artifact@v4 + if: always() + with: + name: cluster-logs-${{ matrix.python }} + path: fluss-rust/cluster-logs/ + if-no-files-found: ignore + retention-days: 3 + + cpp-integration: + needs: [detect-changes, build-server-image] + if: needs.detect-changes.outputs.cpp == 'true' + timeout-minutes: 60 + runs-on: ubuntu-latest + defaults: + run: + working-directory: fluss-rust + env: + FLUSS_TEST_CLUSTER_BIN: ${{ github.workspace }}/fluss-rust/target/debug/fluss-test-cluster + FLUSS_IMAGE: fluss + FLUSS_VERSION: dev + steps: + - uses: actions/checkout@v6 + - uses: actions/download-artifact@v4 + with: + name: fluss-dev-image + path: /tmp + - name: Load server image + run: docker load -i /tmp/fluss-dev.tar + - name: Install protoc + uses: arduino/setup-protoc@c65c819552d16ad3c9b72d9dfd5ba5237b9c906b # v3.0.0 + with: + repo-token: ${{ secrets.GITHUB_TOKEN }} + - name: Install Apache Arrow C++ + run: | + sudo apt-get install -y -V ca-certificates lsb-release wget + wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb + sudo apt-get install -y -V ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb + sudo apt-get update + sudo apt-get install -y -V libarrow-dev + - name: Rust Cache + uses: Swatinem/rust-cache@c19371144df3bb44fab255c43d04cbc2ab54d1c4 # v2.9.1 + with: + workspaces: fluss-rust + - name: Setup sccache + uses: mozilla-actions/sccache-action@7d986dd989559c6ecdb630a3fd2557667be217ad # v0.0.9 + - name: Build fluss-test-cluster binary + run: cargo build -p fluss-test-cluster + - name: Build C++ bindings and tests + working-directory: fluss-rust/bindings/cpp + env: + SCCACHE_GHA_ENABLED: "true" + run: | + cmake -B build \ + -DFLUSS_ENABLE_TESTING=ON \ + -DCMAKE_BUILD_TYPE=Debug \ + -DCMAKE_C_COMPILER_LAUNCHER=sccache \ + -DCMAKE_CXX_COMPILER_LAUNCHER=sccache + cmake --build build --parallel + sccache --show-stats + - name: Run 
C++ integration tests (parallel) + working-directory: fluss-rust/bindings/cpp + run: cd build && ctest -j$(nproc) --output-on-failure --timeout 300 + env: + RUST_LOG: DEBUG + RUST_BACKTRACE: full + + elixir-integration: + needs: [detect-changes, build-server-image] + if: needs.detect-changes.outputs.elixir == 'true' + timeout-minutes: 60 + runs-on: ubuntu-latest + defaults: + run: + working-directory: fluss-rust + env: + OTP_VERSION: "28.0.2" + ELIXIR_VERSION: "1.19.5" + FLUSS_TEST_CLUSTER_BIN: ${{ github.workspace }}/fluss-rust/target/debug/fluss-test-cluster + MIX_ENV: test + FLUSS_IMAGE: fluss + FLUSS_VERSION: dev + steps: + - uses: actions/checkout@v6 + - uses: actions/download-artifact@v4 + with: + name: fluss-dev-image + path: /tmp + - name: Load server image + run: docker load -i /tmp/fluss-dev.tar + - name: Set up BEAM + uses: erlef/setup-beam@fc68ffb90438ef2936bbb3251622353b3dcb2f93 # v1.24.0 + with: + otp-version: ${{ env.OTP_VERSION }} + elixir-version: ${{ env.ELIXIR_VERSION }} + - name: Install protoc + run: sudo apt-get update && sudo apt-get install -y protobuf-compiler + - name: Rust Cache + uses: Swatinem/rust-cache@c19371144df3bb44fab255c43d04cbc2ab54d1c4 # v2.9.1 + with: + workspaces: fluss-rust + - name: Cache Mix deps and build + uses: actions/cache@v4 + with: + path: | + fluss-rust/bindings/elixir/deps + fluss-rust/bindings/elixir/_build + key: ${{ runner.os }}-mix-otp${{ env.OTP_VERSION }}-elixir${{ env.ELIXIR_VERSION }}-${{ hashFiles('fluss-rust/bindings/elixir/mix.lock') }} + restore-keys: | + ${{ runner.os }}-mix-otp${{ env.OTP_VERSION }}-elixir${{ env.ELIXIR_VERSION }}- + - name: Build fluss-test-cluster binary + run: cargo build -p fluss-test-cluster + - name: Fetch Elixir deps + working-directory: fluss-rust/bindings/elixir + run: mix deps.get + - name: Check formatting + working-directory: fluss-rust/bindings/elixir + run: mix format --check-formatted + - name: Compile (warnings as errors) + working-directory: 
fluss-rust/bindings/elixir + run: mix compile --warnings-as-errors + - name: Credo + working-directory: fluss-rust/bindings/elixir + run: mix credo + - name: Run unit tests + working-directory: fluss-rust/bindings/elixir + run: mix test + - name: Run integration tests + working-directory: fluss-rust/bindings/elixir + run: mix test --include integration --only integration + env: + RUST_LOG: DEBUG + RUST_BACKTRACE: full diff --git a/.github/workflows/license-check.yml b/.github/workflows/license-check.yml index 5f52ffa3d9..aa69703eec 100644 --- a/.github/workflows/license-check.yml +++ b/.github/workflows/license-check.yml @@ -17,7 +17,17 @@ name: Check License permissions: contents: read -on: [push, pull_request] +on: + push: + paths-ignore: + - 'fluss-rust/**' + - 'website/**' + - '**/*.md' + pull_request: + paths-ignore: + - 'fluss-rust/**' + - 'website/**' + - '**/*.md' concurrency: group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.event.number || github.run_id }} diff --git a/.github/workflows/python-release.yml b/.github/workflows/python-release.yml new file mode 100644 index 0000000000..ddbc4f0cf9 --- /dev/null +++ b/.github/workflows/python-release.yml @@ -0,0 +1,180 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +# Publish the fluss Python binding to PyPI. +# Trigger: push tag only (e.g. v0.1.0). +# Pre-release tags (containing '-') publish to TestPyPI; release tags publish to PyPI. +# +# Token auth: add secrets PYPI_API_TOKEN / TEST_PYPI_API_TOKEN for publishing. + +name: Release Python + +on: + push: + tags: + - "v*" # Only version-like tags (e.g. v0.1.0, v0.1.0-rc1); avoids running on arbitrary tags + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }} + cancel-in-progress: true + +permissions: + contents: read + +jobs: + version-check: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + - uses: ./.github/actions/verify-tag-version + + sdist: + runs-on: ubuntu-latest + needs: [version-check] + steps: + - uses: actions/checkout@v6 + + - name: Install protoc + run: sudo apt-get update && sudo apt-get install -y protobuf-compiler + + # Vendor the canonical proto so the sdist builds standalone from source. 
+ - name: Vendor canonical proto into the crate + working-directory: fluss-rust + run: scripts/vendor-proto.sh + + - uses: PyO3/maturin-action@v1 + with: + working-directory: fluss-rust/bindings/python + command: sdist + args: -o dist + + - name: Upload sdist + uses: actions/upload-artifact@v7 + with: + name: wheels-sdist + path: fluss-rust/bindings/python/dist + + wheels: + runs-on: ${{ matrix.os }} + needs: [version-check] + strategy: + matrix: + include: + - { os: windows-latest } + - { os: macos-15-intel, target: "x86_64-apple-darwin" } + - { os: macos-15, target: "aarch64-apple-darwin" } + - { os: ubuntu-latest, target: "x86_64" } + - { os: ubuntu-latest, target: "aarch64", manylinux: "manylinux_2_28" } + steps: + - uses: actions/checkout@v6 + + - name: Install protoc (Linux) + if: runner.os == 'Linux' + run: sudo apt-get update && sudo apt-get install -y protobuf-compiler + + - name: Install protoc (macOS) + if: runner.os == 'macOS' + run: brew install protobuf + + - name: Install protoc (Windows) + if: runner.os == 'Windows' + run: choco install protoc -y + shell: pwsh + + # Install protoc in manylinux container (x86_64/aarch64); script shared via YAML anchor + - uses: PyO3/maturin-action@v1 + with: + working-directory: fluss-rust/bindings/python + target: ${{ matrix.target }} + command: build + args: --release -o dist -i python3.9 + manylinux: ${{ matrix.manylinux || 'auto' }} + before-script-linux: &protoc-install | + set -e + ARCH=$(uname -m) + case "$ARCH" in + x86_64) ZIP=protoc-27.1-linux-x86_64.zip ;; + aarch64) ZIP=protoc-27.1-linux-aarch_64.zip ;; + *) echo "Unsupported arch $ARCH"; exit 1 ;; + esac + curl -sLO "https://github.com/protocolbuffers/protobuf/releases/download/v27.1/${ZIP}" + python3 -c "import zipfile; zipfile.ZipFile('${ZIP}').extractall('/tmp/protoc_install')" + chmod +x /tmp/protoc_install/bin/protoc + rm -f "${ZIP}" + export PATH="/tmp/protoc_install/bin:$PATH" + export PROTOC=/tmp/protoc_install/bin/protoc + - uses: 
PyO3/maturin-action@v1 + with: + working-directory: fluss-rust/bindings/python + target: ${{ matrix.target }} + command: build + args: --release -o dist -i python3.10 + manylinux: ${{ matrix.manylinux || 'auto' }} + before-script-linux: *protoc-install + - uses: PyO3/maturin-action@v1 + with: + working-directory: fluss-rust/bindings/python + target: ${{ matrix.target }} + command: build + args: --release -o dist -i python3.11 + manylinux: ${{ matrix.manylinux || 'auto' }} + before-script-linux: *protoc-install + - uses: PyO3/maturin-action@v1 + with: + working-directory: fluss-rust/bindings/python + target: ${{ matrix.target }} + command: build + args: --release -o dist -i python3.12 + manylinux: ${{ matrix.manylinux || 'auto' }} + before-script-linux: *protoc-install + + - name: Upload wheels + uses: actions/upload-artifact@v7 + with: + name: wheels-${{ matrix.os }}-${{ matrix.target || 'native' }} + path: fluss-rust/bindings/python/dist + + release: + name: Publish to PyPI + runs-on: ubuntu-latest + permissions: + contents: read + needs: [version-check, sdist, wheels] + if: startsWith(github.ref, 'refs/tags/') + steps: + - uses: actions/download-artifact@v8 + with: + pattern: wheels-* + merge-multiple: true + path: fluss-rust/bindings/python/dist + + - name: Publish to TestPyPI + if: contains(github.ref, '-') + uses: pypa/gh-action-pypi-publish@ed0c53931b1dc9bd32cbe73a98c7f6766f8a527e + with: + repository-url: https://test.pypi.org/legacy/ + skip-existing: true + packages-dir: fluss-rust/bindings/python/dist + password: ${{ secrets.TEST_PYPI_API_TOKEN }} + + - name: Publish to PyPI + if: ${{ !contains(github.ref, '-') }} + uses: pypa/gh-action-pypi-publish@ed0c53931b1dc9bd32cbe73a98c7f6766f8a527e + with: + skip-existing: true + packages-dir: fluss-rust/bindings/python/dist + password: ${{ secrets.PYPI_API_TOKEN }} diff --git a/.github/workflows/rust-build-and-test.yml b/.github/workflows/rust-build-and-test.yml new file mode 100644 index 0000000000..d59fadce59 
--- /dev/null +++ b/.github/workflows/rust-build-and-test.yml @@ -0,0 +1,82 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +name: Rust Build and Tests + +on: + push: + branches: + - main + paths: + - 'fluss-rust/crates/**' + - 'fluss-rust/Cargo.toml' + - 'fluss-rust/Cargo.lock' + - 'fluss-rust/rust-toolchain.toml' + - 'fluss-rust/.cargo/**' + - 'fluss-rpc/src/main/proto/**' + - '.github/workflows/rust-build-and-test.yml' + pull_request: + branches: + - main + paths: + - 'fluss-rust/crates/**' + - 'fluss-rust/Cargo.toml' + - 'fluss-rust/Cargo.lock' + - 'fluss-rust/rust-toolchain.toml' + - 'fluss-rust/.cargo/**' + - 'fluss-rpc/src/main/proto/**' + - '.github/workflows/rust-build-and-test.yml' + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }} + cancel-in-progress: true + +defaults: + run: + working-directory: fluss-rust + +jobs: + build-and-unit-test: + timeout-minutes: 60 + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: + - ubuntu-latest + - macos-latest + steps: + - uses: actions/checkout@v6 + + - name: Install protoc + uses: arduino/setup-protoc@c65c819552d16ad3c9b72d9dfd5ba5237b9c906b # v3.0.0 + with: + repo-token: ${{ secrets.GITHUB_TOKEN }} + + - name: 
Rust Cache + uses: Swatinem/rust-cache@c19371144df3bb44fab255c43d04cbc2ab54d1c4 # v2.9.1 + with: + workspaces: fluss-rust + + - name: Build + run: cargo build --workspace --all-targets --exclude fluss_python --exclude fluss-cpp --exclude fluss_nif + + - name: Unit Test + run: cargo test --all-targets --workspace --exclude fluss_python --exclude fluss-cpp --exclude fluss_nif + env: + RUST_LOG: DEBUG + RUST_BACKTRACE: full diff --git a/.github/workflows/rust-docs-check.yml b/.github/workflows/rust-docs-check.yml new file mode 100644 index 0000000000..e2e6e72059 --- /dev/null +++ b/.github/workflows/rust-docs-check.yml @@ -0,0 +1,51 @@ +################################################################################ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +# Checks for broken links in the fluss-rust client documentation. 
+name: Rust Documentation Check +permissions: + contents: read +on: + pull_request: + branches: [main] + paths: + - 'fluss-rust/website/**' + - '.github/workflows/rust-docs-check.yml' + push: + branches: [main] + paths: + - 'fluss-rust/website/**' + - '.github/workflows/rust-docs-check.yml' + +jobs: + check-documentation: + runs-on: ubuntu-latest + defaults: + run: + working-directory: fluss-rust/website + steps: + - uses: actions/checkout@v6 + with: + fetch-depth: 0 + - uses: actions/setup-node@v6 + with: + node-version: 24 + - name: Install dependencies + run: npm install + - name: Test build website + run: npm run build -- --no-minify diff --git a/.github/workflows/rust-license-and-format.yml b/.github/workflows/rust-license-and-format.yml new file mode 100644 index 0000000000..2c2d4f6b41 --- /dev/null +++ b/.github/workflows/rust-license-and-format.yml @@ -0,0 +1,98 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +name: Rust License and Formatting Check + +on: + push: + branches: + - main + paths: + - 'fluss-rust/crates/**' + - 'fluss-rust/bindings/**' + - 'fluss-rust/Cargo.toml' + - 'fluss-rust/Cargo.lock' + - 'fluss-rust/deny.toml' + - 'fluss-rust/.licenserc.yaml' + - 'fluss-rust/rustfmt.toml' + - 'fluss-rust/rust-toolchain.toml' + - 'fluss-rpc/src/main/proto/**' + - '.github/workflows/rust-license-and-format.yml' + pull_request: + branches: + - main + paths: + - 'fluss-rust/crates/**' + - 'fluss-rust/bindings/**' + - 'fluss-rust/Cargo.toml' + - 'fluss-rust/Cargo.lock' + - 'fluss-rust/deny.toml' + - 'fluss-rust/.licenserc.yaml' + - 'fluss-rust/rustfmt.toml' + - 'fluss-rust/rust-toolchain.toml' + - 'fluss-rpc/src/main/proto/**' + - '.github/workflows/rust-license-and-format.yml' + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }} + cancel-in-progress: true + +defaults: + run: + working-directory: fluss-rust + +jobs: + check-license-and-formatting: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + + - name: Check License Header + uses: apache/skywalking-eyes/header@61275cc80d0798a405cb070f7d3a8aaf7cf2c2c1 # v0.8.0 + with: + config: fluss-rust/.licenserc.yaml + + - name: Install cargo-deny + uses: taiki-e/install-action@v2 + with: + tool: cargo-deny@0.14.22 + + - name: Check dependency licenses (Apache-compatible) + run: cargo deny check licenses + + - name: Install protoc + uses: arduino/setup-protoc@c65c819552d16ad3c9b72d9dfd5ba5237b9c906b # v3.0.0 + with: + repo-token: ${{ secrets.GITHUB_TOKEN }} + + - name: Rust Cache + uses: Swatinem/rust-cache@c19371144df3bb44fab255c43d04cbc2ab54d1c4 # v2.9.1 + with: + workspaces: fluss-rust + + - name: Format + run: cargo fmt --all -- --check + + - name: Clippy + run: cargo clippy --all-targets --workspace -- -D warnings + + - name: Rustdoc + # fluss_python is excluded: its [lib] name = "fluss" collides with fluss-rs + run: cargo doc --workspace --no-deps 
--exclude fluss_python + env: + RUSTDOCFLAGS: -D warnings diff --git a/.github/workflows/rust-release.yml b/.github/workflows/rust-release.yml new file mode 100644 index 0000000000..d2f8901400 --- /dev/null +++ b/.github/workflows/rust-release.yml @@ -0,0 +1,60 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Publish the fluss Rust crate to crates.io. +# Trigger: push tag only (e.g. v0.1.0). +# Pre-release tags (containing '-') do not publish; release tags publish to crates.io. +# +# Token auth: add secret CARGO_REGISTRY_TOKEN for crates.io publishing. + +name: Release Rust + +on: + push: + tags: + - "v*" # Only version-like tags (e.g. v0.1.0, v0.1.0-rc1); avoids running on arbitrary tags + +defaults: + run: + working-directory: fluss-rust + +jobs: + publish: + runs-on: ubuntu-latest + permissions: + contents: read + steps: + - uses: actions/checkout@v6 + + - uses: ./.github/actions/verify-tag-version + + - name: Install protoc + run: sudo apt-get update && sudo apt-get install -y protobuf-compiler + + # build.rs reads the canonical proto from the in-repo fluss-rpc, which is + # outside the published crate; vendor it so the crate publishes standalone. 
+ - name: Vendor canonical proto into the crate + run: scripts/vendor-proto.sh + + - name: Dry run (crates/fluss) + run: cargo publish -p fluss-rs --dry-run --allow-dirty + + - name: Publish fluss-rs to crates.io + if: startsWith(github.ref, 'refs/tags/') && !contains(github.ref, '-') + run: cargo publish -p fluss-rs --allow-dirty + env: + CARGO_REGISTRY_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }} diff --git a/fluss-rust/.cargo/config.toml b/fluss-rust/.cargo/config.toml new file mode 100644 index 0000000000..57efc7ff75 --- /dev/null +++ b/fluss-rust/.cargo/config.toml @@ -0,0 +1,22 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +[target.aarch64-apple-darwin] +rustflags = [ + "-C", "link-arg=-undefined", + "-C", "link-arg=dynamic_lookup", +] \ No newline at end of file diff --git a/fluss-rust/.gitignore b/fluss-rust/.gitignore new file mode 100644 index 0000000000..eb3a06e6b1 --- /dev/null +++ b/fluss-rust/.gitignore @@ -0,0 +1,54 @@ +.DS_Store +# Generated by Cargo +# will have compiled files and executables +debug/ +target/ + +# These are backup files generated by rustfmt +**/*.rs.bk + +# MSVC Windows builds of rustc generate these, which store debugging information +*.pdb + +# RustRover +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +.idea/ +.vscode/ + +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +*.dylib +*.dSYM/ +*.egg-info/ +dist/ +build/ +.venv/ +uv.lock + +# CPP +*CMakeFiles/ +.cache/ + +# Website (Docusaurus) +website/node_modules +website/build +website/.docusaurus +website/.cache-loader +website/.env.local +website/.env.development.local +website/.env.test.local +website/.env.production.local +website/npm-debug.log* +website/yarn-debug.log* +website/yarn-error.log* +website/package-lock.json +website/versioned_docs +website/versioned_sidebars +website/versions.json +website/pnpm-lock.yaml diff --git a/fluss-rust/.licenserc.yaml b/fluss-rust/.licenserc.yaml new file mode 100644 index 0000000000..a3647d7f27 --- /dev/null +++ b/fluss-rust/.licenserc.yaml @@ -0,0 +1,39 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +header: + license: + spdx-id: Apache-2.0 + copyright-owner: Apache Software Foundation + + paths: + - 'fluss-rust/**' + + paths-ignore: + # bare (gitignore-style) patterns match the basename at any depth + - '.gitignore' + - 'Cargo.lock' + - 'LICENSE' + - 'NOTICE' + - 'DISCLAIMER' + - 'fluss-rust/bindings/python/fluss/py.typed' + - 'fluss-rust/**/mix.lock' + - 'fluss-rust/website/**' + - '**/*.md' + - 'fluss-rust/**/DEPENDENCIES.*.tsv' + - 'fluss-rust/**/*.env' + comment: on-failure diff --git a/fluss-rust/Cargo.lock b/fluss-rust/Cargo.lock new file mode 100644 index 0000000000..4570d4d81c --- /dev/null +++ b/fluss-rust/Cargo.lock @@ -0,0 +1,4743 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+version = 4 + +[[package]] +name = "ahash" +version = "0.8.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" +dependencies = [ + "cfg-if", + "const-random", + "getrandom 0.3.4", + "once_cell", + "version_check", + "zerocopy", +] + +[[package]] +name = "aho-corasick" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" +dependencies = [ + "memchr", +] + +[[package]] +name = "android_system_properties" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc", +] + +[[package]] +name = "anstream" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "824a212faf96e9acacdbd09febd34438f8f711fb84e09a8916013cd7815ca28d" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000" + +[[package]] +name = "anstyle-parse" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52ce7f38b242319f7cabaa6813055467063ecdc9d355bbb4ce0c68908cd8130e" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" +dependencies = [ + "windows-sys 0.61.2", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" +dependencies = [ + "anstyle", + "once_cell_polyfill", + "windows-sys 0.61.2", +] + +[[package]] +name = "anyhow" +version = "1.0.102" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" + +[[package]] +name = "arrow" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4754a624e5ae42081f464514be454b39711daae0458906dacde5f4c632f33a8" +dependencies = [ + "arrow-arith", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-csv", + "arrow-data", + "arrow-ipc", + "arrow-json", + "arrow-ord", + "arrow-row", + "arrow-schema", + "arrow-select", + "arrow-string", +] + +[[package]] +name = "arrow-arith" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7b3141e0ec5145a22d8694ea8b6d6f69305971c4fa1c1a13ef0195aef2d678b" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "chrono", + "num-traits", +] + +[[package]] +name = "arrow-array" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c8955af33b25f3b175ee10af580577280b4bd01f7e823d94c7cdef7cf8c9aef" +dependencies = [ + "ahash", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "chrono", + "half", + "hashbrown 0.16.1", + "num-complex", + "num-integer", + "num-traits", +] + +[[package]] +name = "arrow-buffer" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c697ddca96183182f35b3a18e50b9110b11e916d7b7799cbfd4d34662f2c56c2" +dependencies = [ + "bytes", + "half", + "num-bigint", + "num-traits", +] + +[[package]] +name = "arrow-cast" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "646bbb821e86fd57189c10b4fcdaa941deaf4181924917b0daa92735baa6ada5" +dependencies = [ + "arrow-array", + 
"arrow-buffer", + "arrow-data", + "arrow-ord", + "arrow-schema", + "arrow-select", + "atoi", + "base64 0.22.1", + "chrono", + "half", + "lexical-core", + "num-traits", + "ryu", +] + +[[package]] +name = "arrow-csv" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8da746f4180004e3ce7b83c977daf6394d768332349d3d913998b10a120b790a" +dependencies = [ + "arrow-array", + "arrow-cast", + "arrow-schema", + "chrono", + "csv", + "csv-core", + "regex", +] + +[[package]] +name = "arrow-data" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fdd994a9d28e6365aa78e15da3f3950c0fdcea6b963a12fa1c391afb637b304" +dependencies = [ + "arrow-buffer", + "arrow-schema", + "half", + "num-integer", + "num-traits", +] + +[[package]] +name = "arrow-ipc" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "abf7df950701ab528bf7c0cf7eeadc0445d03ef5d6ffc151eaae6b38a58feff1" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "flatbuffers", + "lz4_flex", + "zstd", +] + +[[package]] +name = "arrow-json" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ff8357658bedc49792b13e2e862b80df908171275f8e6e075c460da5ee4bf86" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-schema", + "chrono", + "half", + "indexmap 2.13.1", + "itoa", + "lexical-core", + "memchr", + "num-traits", + "ryu", + "serde_core", + "serde_json", + "simdutf8", +] + +[[package]] +name = "arrow-ord" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7d8f1870e03d4cbed632959498bcc84083b5a24bded52905ae1695bd29da45b" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", +] + +[[package]] +name = "arrow-pyarrow" +version = "57.3.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "d18c442b4c266aaf3d7f7dd40fd7ae058cef7f113b00ff0cd8256e1e218ec544" +dependencies = [ + "arrow-array", + "arrow-data", + "arrow-schema", + "pyo3", +] + +[[package]] +name = "arrow-row" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "18228633bad92bff92a95746bbeb16e5fc318e8382b75619dec26db79e4de4c0" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "half", +] + +[[package]] +name = "arrow-schema" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c872d36b7bf2a6a6a2b40de9156265f0242910791db366a2c17476ba8330d68" +dependencies = [ + "bitflags", +] + +[[package]] +name = "arrow-select" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68bf3e3efbd1278f770d67e5dc410257300b161b93baedb3aae836144edcaf4b" +dependencies = [ + "ahash", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "num-traits", +] + +[[package]] +name = "arrow-string" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85e968097061b3c0e9fe3079cf2e703e487890700546b5b0647f60fca1b5a8d8" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "memchr", + "num-traits", + "regex", + "regex-syntax", +] + +[[package]] +name = "astral-tokio-tar" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c23f3af104b40a3430ccb90ed5f7bd877a8dc5c26fc92fde51a22b40890dcf9" +dependencies = [ + "filetime", + "futures-core", + "libc", + "portable-atomic", + "rustc-hash", + "tokio", + "tokio-stream", + "xattr", +] + +[[package]] +name = "async-stream" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b5a71a6f37880a80d1d7f19efd781e4b5de42c88f0722cc13bcb6cc2cfe8476" 
+dependencies = [ + "async-stream-impl", + "futures-core", + "pin-project-lite", +] + +[[package]] +name = "async-stream-impl" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "async-trait" +version = "0.1.89" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "atoi" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f28d99ec8bfea296261ca1af174f24225171fea9664ba9003cbebee704810528" +dependencies = [ + "num-traits", +] + +[[package]] +name = "atomic-waker" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" + +[[package]] +name = "autocfg" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" + +[[package]] +name = "axum" +version = "0.8.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b52af3cb4058c895d37317bb27508dccc8e5f2d39454016b297bf4a400597b8" +dependencies = [ + "axum-core", + "bytes", + "futures-util", + "http", + "http-body", + "http-body-util", + "itoa", + "matchit", + "memchr", + "mime", + "percent-encoding", + "pin-project-lite", + "serde_core", + "sync_wrapper", + "tower", + "tower-layer", + "tower-service", +] + +[[package]] +name = "axum-core" +version = "0.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08c78f31d7b1291f7ee735c1c6780ccde7785daae9a9206026862dab7d8792d1" +dependencies = [ + "bytes", + "futures-core", + "http", + "http-body", + 
"http-body-util", + "mime", + "pin-project-lite", + "sync_wrapper", + "tower-layer", + "tower-service", +] + +[[package]] +name = "backon" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cffb0e931875b666fc4fcb20fee52e9bbd1ef836fd9e9e04ec21555f9f85f7ef" +dependencies = [ + "fastrand", + "gloo-timers", + "tokio", +] + +[[package]] +name = "base64" +version = "0.21.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" + +[[package]] +name = "base64" +version = "0.22.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" + +[[package]] +name = "bigdecimal" +version = "0.4.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4d6867f1565b3aad85681f1015055b087fcfd840d6aeee6eee7f2da317603695" +dependencies = [ + "autocfg", + "libm", + "num-bigint", + "num-integer", + "num-traits", + "serde", +] + +[[package]] +name = "bitflags" +version = "2.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "843867be96c8daad0d758b57df9392b6d8d271134fce549de6ce169ff98a92af" + +[[package]] +name = "bitvec" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bc2832c24239b0141d5674bb9174f9d68a8b5b3f2753311927c172ca46f7e9c" +dependencies = [ + "funty", + "radium", + "tap", + "wyz", +] + +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array", +] + +[[package]] +name = "bollard" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee04c4c84f1f811b017f2fbb7dd8815c976e7ca98593de9c1e2afad0f636bff4" +dependencies = [ + "async-stream", 
+ "base64 0.22.1", + "bitflags", + "bollard-buildkit-proto", + "bollard-stubs", + "bytes", + "futures-core", + "futures-util", + "hex", + "home", + "http", + "http-body-util", + "hyper", + "hyper-named-pipe", + "hyper-rustls", + "hyper-util", + "hyperlocal", + "log", + "num", + "pin-project-lite", + "rand 0.9.3", + "rustls", + "rustls-native-certs", + "rustls-pki-types", + "serde", + "serde_derive", + "serde_json", + "serde_urlencoded", + "thiserror 2.0.18", + "time", + "tokio", + "tokio-stream", + "tokio-util", + "tonic", + "tower-service", + "url", + "winapi", +] + +[[package]] +name = "bollard-buildkit-proto" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85a885520bf6249ab931a764ffdb87b0ceef48e6e7d807cfdb21b751e086e1ad" +dependencies = [ + "prost", + "prost-types", + "tonic", + "tonic-prost", + "ureq", +] + +[[package]] +name = "bollard-stubs" +version = "1.52.1-rc.29.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f0a8ca8799131c1837d1282c3f81f31e76ceb0ce426e04a7fe1ccee3287c066" +dependencies = [ + "base64 0.22.1", + "bollard-buildkit-proto", + "bytes", + "prost", + "serde", + "serde_json", + "serde_repr", + "time", +] + +[[package]] +name = "bumpalo" +version = "3.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb" + +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + +[[package]] +name = "bytes" +version = "1.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" + +[[package]] +name = "cc" +version = "1.2.59" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"b7a4d3ec6524d28a329fc53654bbadc9bdd7b0431f5d65f1a56ffb28a1ee5283" +dependencies = [ + "find-msvc-tools", + "jobserver", + "libc", + "shlex", +] + +[[package]] +name = "cfg-if" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" + +[[package]] +name = "cfg_aliases" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" + +[[package]] +name = "chrono" +version = "0.4.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c673075a2e0e5f4a1dde27ce9dee1ea4558c7ffe648f576438a20ca1d2acc4b0" +dependencies = [ + "iana-time-zone", + "js-sys", + "num-traits", + "serde", + "wasm-bindgen", + "windows-link", +] + +[[package]] +name = "clap" +version = "4.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b193af5b67834b676abd72466a96c1024e6a6ad978a1f484bd90b85c94041351" +dependencies = [ + "clap_builder", + "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "714a53001bf66416adb0e2ef5ac857140e7dc3a0c48fb28b2f10762fc4b5069f" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim", +] + +[[package]] +name = "clap_derive" +version = "4.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1110bd8a634a1ab8cb04345d8d878267d57c3cf1b38d91b71af6686408bbca6a" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "clap_lex" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9" + +[[package]] +name = "codespan-reporting" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"af491d569909a7e4dee0ad7db7f5341fef5c614d5b8ec8cf765732aba3cff681" +dependencies = [ + "serde", + "termcolor", + "unicode-width", +] + +[[package]] +name = "colorchoice" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570" + +[[package]] +name = "const-oid" +version = "0.9.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2459377285ad874054d797f3ccebf984978aa39129f6eafde5cdc8315b612f8" + +[[package]] +name = "const-random" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87e00182fe74b066627d63b85fd550ac2998d4b0bd86bfed477a0ae4c7c71359" +dependencies = [ + "const-random-macro", +] + +[[package]] +name = "const-random-macro" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e" +dependencies = [ + "getrandom 0.2.17", + "once_cell", + "tiny-keccak", +] + +[[package]] +name = "core-foundation" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2a6cd9ae233e7f62ba4e9353e81a88df7fc8a5987b8d445b4d90c879bd156f6" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "core-foundation-sys" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" + +[[package]] +name = "cpufeatures" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" +dependencies = [ + "libc", +] + +[[package]] +name = "crc32c" +version = "0.6.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a47af21622d091a8f0fb295b88bc886ac74efcc613efc19f5d0b21de5c89e47" +dependencies = [ + 
"rustc_version", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + +[[package]] +name = "crunchy" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" + +[[package]] +name = "crypto-common" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a" +dependencies = [ + "generic-array", + "typenum", +] + +[[package]] +name = "csv" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52cd9d68cf7efc6ddfaaee42e7288d3a99d613d4b50f76ce9827ae0c6e14f938" +dependencies = [ + "csv-core", + "itoa", + "ryu", + "serde_core", +] + +[[package]] +name = "csv-core" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "704a3c26996a80471189265814dbc2c257598b96b8a7feae2d31ace646bb9782" +dependencies = [ + "memchr", +] + +[[package]] +name = "cxx" +version = "1.0.194" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "747d8437319e3a2f43d93b341c137927ca70c0f5dabeea7a005a73665e247c7e" +dependencies = [ + "cc", + "cxx-build", + "cxxbridge-cmd", + "cxxbridge-flags", + "cxxbridge-macro", + "foldhash 0.2.0", + "link-cplusplus", +] + +[[package]] +name = "cxx-build" +version = "1.0.194" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0f4697d190a142477b16aef7da8a99bfdc41e7e8b1687583c0d23a79c7afc1e" +dependencies = [ + "cc", + 
"codespan-reporting", + "indexmap 2.13.1", + "proc-macro2", + "quote", + "scratch", + "syn", +] + +[[package]] +name = "cxxbridge-cmd" +version = "1.0.194" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0956799fa8678d4c50eed028f2de1c0552ae183c76e976cf7ca8c4e36a7c328" +dependencies = [ + "clap", + "codespan-reporting", + "indexmap 2.13.1", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "cxxbridge-flags" +version = "1.0.194" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23384a836ab4f0ad98ace7e3955ad2de39de42378ab487dc28d3990392cb283a" + +[[package]] +name = "cxxbridge-macro" +version = "1.0.194" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6acc6b5822b9526adfb4fc377b67128fdd60aac757cc4a741a6278603f763cf" +dependencies = [ + "indexmap 2.13.1", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "darling" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25ae13da2f202d56bd7f91c25fba009e7717a1e4a1cc98a76d844b65ae912e9d" +dependencies = [ + "darling_core", + "darling_macro", +] + +[[package]] +name = "darling_core" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9865a50f7c335f53564bb694ef660825eb8610e0a53d3e11bf1b0d3df31e03b0" +dependencies = [ + "ident_case", + "proc-macro2", + "quote", + "strsim", + "syn", +] + +[[package]] +name = "darling_macro" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3984ec7bd6cfa798e62b4a642426a5be0e68f9401cfc2a01e3fa9ea2fcdb8d" +dependencies = [ + "darling_core", + "quote", + "syn", +] + +[[package]] +name = "dashmap" +version = "6.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf" +dependencies = [ + "cfg-if", + "crossbeam-utils", + "hashbrown 0.14.5", + "lock_api", 
+ "once_cell", + "parking_lot_core", +] + +[[package]] +name = "delegate" +version = "0.13.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "780eb241654bf097afb00fc5f054a09b687dad862e485fdcf8399bb056565370" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "deranged" +version = "0.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7cd812cc2bc1d69d4764bd80df88b4317eaef9e773c75226407d9bc0876b211c" +dependencies = [ + "powerfmt", + "serde_core", +] + +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer", + "const-oid", + "crypto-common", + "subtle", +] + +[[package]] +name = "displaydoc" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "dlv-list" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "442039f5147480ba31067cb00ada1adae6892028e40e45fc5de7b7df6dcc1b5f" +dependencies = [ + "const-random", +] + +[[package]] +name = "docker_credential" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d89dfcba45b4afad7450a99b39e751590463e45c04728cf555d36bb66940de8" +dependencies = [ + "base64 0.21.7", + "serde", + "serde_json", +] + +[[package]] +name = "dyn-clone" +version = "1.0.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0881ea181b1df73ff77ffaaf9c7544ecc11e82fba9b5f27b262a3c73a332555" + +[[package]] +name = "either" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" + 
+[[package]] +name = "endian-type" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c34f04666d835ff5d62e058c3995147c06f42fe86ff053337632bca83e42702d" + +[[package]] +name = "equivalent" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" + +[[package]] +name = "erased-serde" +version = "0.4.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2add8a07dd6a8d93ff627029c51de145e12686fbc36ecb298ac22e74cf02dec" +dependencies = [ + "serde", + "serde_core", + "typeid", +] + +[[package]] +name = "errno" +version = "0.3.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" +dependencies = [ + "libc", + "windows-sys 0.61.2", +] + +[[package]] +name = "etcetera" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de48cc4d1c1d97a20fd819def54b890cadde72ed3ad0c614822a0a433361be96" +dependencies = [ + "cfg-if", + "windows-sys 0.61.2", +] + +[[package]] +name = "fastrand" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a043dc74da1e37d6afe657061213aa6f425f855399a11d3463c6ecccc4dfda1f" + +[[package]] +name = "ferroid" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb330bbd4cb7a5b9f559427f06f98a4f853a137c8298f3bd3f8ca57663e21986" +dependencies = [ + "portable-atomic", + "rand 0.9.3", + "web-time", +] + +[[package]] +name = "filetime" +version = "0.2.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f98844151eee8917efc50bd9e8318cb963ae8b297431495d3f758616ea5c57db" +dependencies = [ + "cfg-if", + "libc", + "libredox", +] + +[[package]] +name = "find-msvc-tools" +version = "0.1.9" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" + +[[package]] +name = "fixedbitset" +version = "0.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99" + +[[package]] +name = "flatbuffers" +version = "25.12.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35f6839d7b3b98adde531effaf34f0c2badc6f4735d26fe74709d8e513a96ef3" +dependencies = [ + "bitflags", + "rustc_version", +] + +[[package]] +name = "fluss-cpp" +version = "1.0.0" +dependencies = [ + "anyhow", + "arrow", + "bigdecimal", + "cxx", + "cxx-build", + "fluss-rs", + "tokio", +] + +[[package]] +name = "fluss-examples" +version = "1.0.0" +dependencies = [ + "clap", + "fluss-rs", + "tikv-jemallocator", + "tokio", +] + +[[package]] +name = "fluss-rs" +version = "1.0.0" +dependencies = [ + "arrow", + "arrow-schema", + "bigdecimal", + "bitvec", + "byteorder", + "bytes", + "clap", + "crc32c", + "dashmap", + "delegate", + "fluss-test-cluster", + "futures", + "jiff", + "linked-hash-map", + "log", + "metrics", + "metrics-util", + "opendal", + "ordered-float", + "parking_lot", + "parse-display 0.10.0", + "prost", + "prost-build", + "rand 0.9.3", + "scopeguard", + "serde", + "serde_json", + "snafu", + "strum", + "strum_macros", + "tempfile", + "thiserror 1.0.69", + "tokio", + "url", + "uuid", +] + +[[package]] +name = "fluss-test-cluster" +version = "1.0.0" +dependencies = [ + "clap", + "fluss-rs", + "serde", + "serde_json", + "testcontainers", + "tokio", +] + +[[package]] +name = "fluss_nif" +version = "1.0.0" +dependencies = [ + "bigdecimal", + "fluss-rs", + "rustler", + "tokio", +] + +[[package]] +name = "fluss_python" +version = "1.0.0" +dependencies = [ + "arrow", + "arrow-array", + "arrow-pyarrow", + "arrow-schema", + "bigdecimal", + "fluss-rs", + "indexmap 2.13.1", + "jiff", + "pyo3", + 
"pyo3-async-runtimes", + "tokio", +] + +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + +[[package]] +name = "foldhash" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" + +[[package]] +name = "foldhash" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb" + +[[package]] +name = "form_urlencoded" +version = "1.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb4cb245038516f5f85277875cdaa4f7d2c9a0fa0468de06ed190163b1581fcf" +dependencies = [ + "percent-encoding", +] + +[[package]] +name = "funty" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c" + +[[package]] +name = "futures" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b147ee9d1f6d097cef9ce628cd2ee62288d963e16fb287bd9286455b241382d" +dependencies = [ + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-channel" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07bbe89c50d7a535e539b8c17bc0b49bdb77747034daa8087407d655f3f7cc1d" +dependencies = [ + "futures-core", + "futures-sink", +] + +[[package]] +name = "futures-core" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e3450815272ef58cec6d564423f6e755e25379b217b0bc688e295ba24df6b1d" + +[[package]] +name = "futures-executor" +version = "0.3.32" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf29c38818342a3b26b5b923639e7b1f4a61fc5e76102d4b1981c6dc7a7579d" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-io" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cecba35d7ad927e23624b22ad55235f2239cfa44fd10428eecbeba6d6a717718" + +[[package]] +name = "futures-macro" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e835b70203e41293343137df5c0664546da5745f82ec9b84d40be8336958447b" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "futures-sink" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c39754e157331b013978ec91992bde1ac089843443c49cbc7f46150b0fad0893" + +[[package]] +name = "futures-task" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "037711b3d59c33004d3856fbdc83b99d4ff37a24768fa1be9ce3538a1cde4393" + +[[package]] +name = "futures-util" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "389ca41296e6190b48053de0321d02a77f32f8a5d2461dd38762c0593805c6d6" +dependencies = [ + "futures-channel", + "futures-core", + "futures-io", + "futures-macro", + "futures-sink", + "futures-task", + "memchr", + "pin-project-lite", + "slab", +] + +[[package]] +name = "generic-array" +version = "0.14.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +dependencies = [ + "typenum", + "version_check", +] + +[[package]] +name = "getrandom" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0" +dependencies = [ + "cfg-if", + "js-sys", + "libc", + "wasi", + "wasm-bindgen", +] + 
+[[package]] +name = "getrandom" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" +dependencies = [ + "cfg-if", + "js-sys", + "libc", + "r-efi 5.3.0", + "wasip2", + "wasm-bindgen", +] + +[[package]] +name = "getrandom" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555" +dependencies = [ + "cfg-if", + "libc", + "r-efi 6.0.0", + "wasip2", + "wasip3", +] + +[[package]] +name = "gloo-timers" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbb143cf96099802033e0d4f4963b19fd2e0b728bcf076cd9cf7f6634f092994" +dependencies = [ + "futures-channel", + "futures-core", + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "h2" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f44da3a8150a6703ed5d34e164b875fd14c2cdab9af1252a9a1020bde2bdc54" +dependencies = [ + "atomic-waker", + "bytes", + "fnv", + "futures-core", + "futures-sink", + "http", + "indexmap 2.13.1", + "slab", + "tokio", + "tokio-util", + "tracing", +] + +[[package]] +name = "half" +version = "2.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b" +dependencies = [ + "cfg-if", + "crunchy", + "num-traits", + "zerocopy", +] + +[[package]] +name = "hashbrown" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" + +[[package]] +name = "hashbrown" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" + +[[package]] +name = "hashbrown" +version = "0.15.5" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" +dependencies = [ + "foldhash 0.1.5", +] + +[[package]] +name = "hashbrown" +version = "0.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" +dependencies = [ + "foldhash 0.2.0", +] + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "hex" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" + +[[package]] +name = "hmac" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" +dependencies = [ + "digest", +] + +[[package]] +name = "home" +version = "0.5.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc627f471c528ff0c4a49e1d5e60450c8f6461dd6d10ba9dcd3a61d3dff7728d" +dependencies = [ + "windows-sys 0.61.2", +] + +[[package]] +name = "http" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3ba2a386d7f85a81f119ad7498ebe444d2e22c2af0b86b069416ace48b3311a" +dependencies = [ + "bytes", + "itoa", +] + +[[package]] +name = "http-body" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184" +dependencies = [ + "bytes", + "http", +] + +[[package]] +name = "http-body-util" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a" +dependencies = [ + "bytes", + "futures-core", 
+ "http", + "http-body", + "pin-project-lite", +] + +[[package]] +name = "httparse" +version = "1.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87" + +[[package]] +name = "httpdate" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" + +[[package]] +name = "hyper" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6299f016b246a94207e63da54dbe807655bf9e00044f73ded42c3ac5305fbcca" +dependencies = [ + "atomic-waker", + "bytes", + "futures-channel", + "futures-core", + "h2", + "http", + "http-body", + "httparse", + "httpdate", + "itoa", + "pin-project-lite", + "smallvec", + "tokio", + "want", +] + +[[package]] +name = "hyper-named-pipe" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73b7d8abf35697b81a825e386fc151e0d503e8cb5fcb93cc8669c376dfd6f278" +dependencies = [ + "hex", + "hyper", + "hyper-util", + "pin-project-lite", + "tokio", + "tower-service", + "winapi", +] + +[[package]] +name = "hyper-rustls" +version = "0.27.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3c93eb611681b207e1fe55d5a71ecf91572ec8a6705cdb6857f7d8d5242cf58" +dependencies = [ + "http", + "hyper", + "hyper-util", + "rustls", + "rustls-pki-types", + "tokio", + "tokio-rustls", + "tower-service", + "webpki-roots", +] + +[[package]] +name = "hyper-timeout" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b90d566bffbce6a75bd8b09a05aa8c2cb1fabb6cb348f8840c9e4c90a0d83b0" +dependencies = [ + "hyper", + "hyper-util", + "pin-project-lite", + "tokio", + "tower-service", +] + +[[package]] +name = "hyper-util" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"96547c2556ec9d12fb1578c4eaf448b04993e7fb79cbaad930a656880a6bdfa0" +dependencies = [ + "base64 0.22.1", + "bytes", + "futures-channel", + "futures-util", + "http", + "http-body", + "hyper", + "ipnet", + "libc", + "percent-encoding", + "pin-project-lite", + "socket2", + "tokio", + "tower-service", + "tracing", +] + +[[package]] +name = "hyperlocal" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "986c5ce3b994526b3cd75578e62554abd09f0899d6206de48b3e96ab34ccc8c7" +dependencies = [ + "hex", + "http-body-util", + "hyper", + "hyper-util", + "pin-project-lite", + "tokio", + "tower-service", +] + +[[package]] +name = "iana-time-zone" +version = "0.1.65" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e31bc9ad994ba00e440a8aa5c9ef0ec67d5cb5e5cb0cc7f8b744a35b389cc470" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "log", + "wasm-bindgen", + "windows-core", +] + +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" +dependencies = [ + "cc", +] + +[[package]] +name = "icu_collections" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2984d1cd16c883d7935b9e07e44071dca8d917fd52ecc02c04d5fa0b5a3f191c" +dependencies = [ + "displaydoc", + "potential_utf", + "utf8_iter", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_locale_core" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92219b62b3e2b4d88ac5119f8904c10f8f61bf7e95b640d25ba3075e6cac2c29" +dependencies = [ + "displaydoc", + "litemap", + "tinystr", + "writeable", + "zerovec", +] + +[[package]] +name = "icu_normalizer" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"c56e5ee99d6e3d33bd91c5d85458b6005a22140021cc324cea84dd0e72cff3b4" +dependencies = [ + "icu_collections", + "icu_normalizer_data", + "icu_properties", + "icu_provider", + "smallvec", + "zerovec", +] + +[[package]] +name = "icu_normalizer_data" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da3be0ae77ea334f4da67c12f149704f19f81d1adf7c51cf482943e84a2bad38" + +[[package]] +name = "icu_properties" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bee3b67d0ea5c2cca5003417989af8996f8604e34fb9ddf96208a033901e70de" +dependencies = [ + "icu_collections", + "icu_locale_core", + "icu_properties_data", + "icu_provider", + "zerotrie", + "zerovec", +] + +[[package]] +name = "icu_properties_data" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e2bbb201e0c04f7b4b3e14382af113e17ba4f63e2c9d2ee626b720cbce54a14" + +[[package]] +name = "icu_provider" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "139c4cf31c8b5f33d7e199446eff9c1e02decfc2f0eec2c8d71f65befa45b421" +dependencies = [ + "displaydoc", + "icu_locale_core", + "writeable", + "yoke", + "zerofrom", + "zerotrie", + "zerovec", +] + +[[package]] +name = "id-arena" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954" + +[[package]] +name = "ident_case" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" + +[[package]] +name = "idna" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b0875f23caa03898994f6ddc501886a45c7d3d62d04d2d90788d47be1b1e4de" +dependencies = [ + "idna_adapter", + "smallvec", + "utf8_iter", +] + +[[package]] +name = "idna_adapter" +version = "1.2.1" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3acae9609540aa318d1bc588455225fb2085b9ed0c4f6bd0d9d5bcd86f1a0344" +dependencies = [ + "icu_normalizer", + "icu_properties", +] + +[[package]] +name = "indexmap" +version = "1.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" +dependencies = [ + "autocfg", + "hashbrown 0.12.3", + "serde", +] + +[[package]] +name = "indexmap" +version = "2.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "45a8a2b9cb3e0b0c1803dbb0758ffac5de2f425b23c28f518faabd9d805342ff" +dependencies = [ + "equivalent", + "hashbrown 0.16.1", + "serde", + "serde_core", +] + +[[package]] +name = "indoc" +version = "2.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79cf5c93f93228cf8efb3ba362535fb11199ac548a09ce117c9b1adc3030d706" +dependencies = [ + "rustversion", +] + +[[package]] +name = "inventory" +version = "0.3.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4f0c30c76f2f4ccee3fe55a2435f691ca00c0e4bd87abe4f4a851b1d4dac39b" +dependencies = [ + "rustversion", +] + +[[package]] +name = "ipnet" +version = "2.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d98f6fed1fde3f8c21bc40a1abb88dd75e67924f9cffc3ef95607bad8017f8e2" + +[[package]] +name = "iri-string" +version = "0.7.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25e659a4bb38e810ebc252e53b5814ff908a8c58c2a9ce2fae1bbec24cbf4e20" +dependencies = [ + "memchr", + "serde", +] + +[[package]] +name = "is_terminal_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" + +[[package]] +name = "itertools" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285" +dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" + +[[package]] +name = "jiff" +version = "0.2.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a3546dc96b6d42c5f24902af9e2538e82e39ad350b0c766eb3fbf2d8f3d8359" +dependencies = [ + "jiff-static", + "jiff-tzdb-platform", + "js-sys", + "log", + "portable-atomic", + "portable-atomic-util", + "serde_core", + "wasm-bindgen", + "windows-sys 0.61.2", +] + +[[package]] +name = "jiff-static" +version = "0.2.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a8c8b344124222efd714b73bb41f8b5120b27a7cc1c75593a6ff768d9d05aa4" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "jiff-tzdb" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c900ef84826f1338a557697dc8fc601df9ca9af4ac137c7fb61d4c6f2dfd3076" + +[[package]] +name = "jiff-tzdb-platform" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "875a5a69ac2bab1a891711cf5eccbec1ce0341ea805560dcd90b7a2e925132e8" +dependencies = [ + "jiff-tzdb", +] + +[[package]] +name = "jobserver" +version = "0.1.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33" +dependencies = [ + "getrandom 0.3.4", + "libc", +] + +[[package]] +name = "js-sys" +version = "0.3.94" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e04e2ef80ce82e13552136fabeef8a5ed1f985a96805761cbb9a2c34e7664d9" +dependencies = [ + "cfg-if", + "futures-util", + "once_cell", + "wasm-bindgen", +] + +[[package]] +name = "leb128fmt" +version = "0.1.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" + +[[package]] +name = "lexical-core" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d8d125a277f807e55a77304455eb7b1cb52f2b18c143b60e766c120bd64a594" +dependencies = [ + "lexical-parse-float", + "lexical-parse-integer", + "lexical-util", + "lexical-write-float", + "lexical-write-integer", +] + +[[package]] +name = "lexical-parse-float" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52a9f232fbd6f550bc0137dcb5f99ab674071ac2d690ac69704593cb4abbea56" +dependencies = [ + "lexical-parse-integer", + "lexical-util", +] + +[[package]] +name = "lexical-parse-integer" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a7a039f8fb9c19c996cd7b2fcce303c1b2874fe1aca544edc85c4a5f8489b34" +dependencies = [ + "lexical-util", +] + +[[package]] +name = "lexical-util" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2604dd126bb14f13fb5d1bd6a66155079cb9fa655b37f875b3a742c705dbed17" + +[[package]] +name = "lexical-write-float" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50c438c87c013188d415fbabbb1dceb44249ab81664efbd31b14ae55dabb6361" +dependencies = [ + "lexical-util", + "lexical-write-integer", +] + +[[package]] +name = "lexical-write-integer" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "409851a618475d2d5796377cad353802345cba92c867d9fbcde9cf4eac4e14df" +dependencies = [ + "lexical-util", +] + +[[package]] +name = "libc" +version = "0.2.184" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48f5d2a454e16a5ea0f4ced81bd44e4cfc7bd3a507b61887c99fd3538b28e4af" + +[[package]] +name = "libloading" +version = "0.9.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "754ca22de805bb5744484a5b151a9e1a8e837d5dc232c2d7d8c2e3492edc8b60" +dependencies = [ + "cfg-if", + "windows-link", +] + +[[package]] +name = "libm" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981" + +[[package]] +name = "libredox" +version = "0.1.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ddbf48fd451246b1f8c2610bd3b4ac0cc6e149d89832867093ab69a17194f08" +dependencies = [ + "bitflags", + "libc", + "plain", + "redox_syscall 0.7.3", +] + +[[package]] +name = "link-cplusplus" +version = "1.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f78c730aaa7d0b9336a299029ea49f9ee53b0ed06e9202e8cb7db9bae7b8c82" +dependencies = [ + "cc", +] + +[[package]] +name = "linked-hash-map" +version = "0.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0717cef1bc8b636c6e1c1bbdefc09e6322da8a9321966e8928ef80d20f7f770f" + +[[package]] +name = "linux-raw-sys" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53" + +[[package]] +name = "litemap" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92daf443525c4cce67b150400bc2316076100ce0b3686209eb8cf3c31612e6f0" + +[[package]] +name = "lock_api" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965" +dependencies = [ + "scopeguard", +] + +[[package]] +name = "log" +version = "0.4.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" +dependencies = [ + "value-bag", +] + +[[package]] +name = 
"lru-slab" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154" + +[[package]] +name = "lz4_flex" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "98c23545df7ecf1b16c303910a69b079e8e251d60f7dd2cc9b4177f2afaf1746" +dependencies = [ + "twox-hash", +] + +[[package]] +name = "matchit" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47e1ffaa40ddd1f3ed91f717a33c8c0ee23fff369e3aa8772b9605cc1d22f4c3" + +[[package]] +name = "md-5" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf" +dependencies = [ + "cfg-if", + "digest", +] + +[[package]] +name = "memchr" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" + +[[package]] +name = "memoffset" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a" +dependencies = [ + "autocfg", +] + +[[package]] +name = "metrics" +version = "0.24.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff56c2e7dce6bd462e3b8919986a617027481b1dcc703175b58cf9dd98a2f071" +dependencies = [ + "portable-atomic", + "rapidhash", +] + +[[package]] +name = "metrics-util" +version = "0.20.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e56997f084e57b045edf17c3ed8ba7f9f779c670df8206dfd1c736f4c02dc4a" +dependencies = [ + "aho-corasick", + "crossbeam-epoch", + "crossbeam-utils", + "hashbrown 0.16.1", + "indexmap 2.13.1", + "metrics", + "ordered-float", + "quanta", + "radix_trie", + "rand 0.9.3", + "rand_xoshiro", + "rapidhash", + "sketches-ddsketch", +] + 
+[[package]] +name = "mime" +version = "0.3.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" + +[[package]] +name = "mio" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50b7e5b27aa02a74bac8c3f23f448f8d87ff11f92d3aac1a6ed369ee08cc56c1" +dependencies = [ + "libc", + "wasi", + "windows-sys 0.61.2", +] + +[[package]] +name = "multimap" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d87ecb2933e8aeadb3e3a02b828fed80a7528047e68b4f424523a0981a3a084" + +[[package]] +name = "nibble_vec" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77a5d83df9f36fe23f0c3648c6bbb8b0298bb5f1939c8f2704431371f4b84d43" +dependencies = [ + "smallvec", +] + +[[package]] +name = "num" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23" +dependencies = [ + "num-bigint", + "num-complex", + "num-integer", + "num-iter", + "num-rational", + "num-traits", +] + +[[package]] +name = "num-bigint" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" +dependencies = [ + "num-integer", + "num-traits", +] + +[[package]] +name = "num-complex" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-conv" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c6673768db2d862beb9b39a78fdcb1a69439615d5794a1be50caa9bc92c81967" + +[[package]] +name = "num-integer" +version = "0.1.46" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-iter" +version = "0.1.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-rational" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824" +dependencies = [ + "num-bigint", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", + "libm", +] + +[[package]] +name = "once_cell" +version = "1.21.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" + +[[package]] +name = "once_cell_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" + +[[package]] +name = "opendal" +version = "0.55.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d075ab8a203a6ab4bc1bce0a4b9fe486a72bf8b939037f4b78d95386384bc80a" +dependencies = [ + "anyhow", + "backon", + "base64 0.22.1", + "bytes", + "crc32c", + "futures", + "getrandom 0.2.17", + "http", + "http-body", + "jiff", + "log", + "md-5", + "percent-encoding", + "quick-xml 0.38.4", + "reqsign", + "reqwest", + "serde", + "serde_json", + "tokio", + "url", + "uuid", +] + +[[package]] +name = "openssl-probe" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum 
= "7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe" + +[[package]] +name = "ordered-float" +version = "5.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7d950ca161dc355eaf28f82b11345ed76c6e1f6eb1f4f4479e0323b9e2fbd0e" +dependencies = [ + "num-traits", + "rand 0.8.5", + "serde", +] + +[[package]] +name = "ordered-multimap" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49203cdcae0030493bad186b28da2fa25645fa276a51b6fec8010d281e02ef79" +dependencies = [ + "dlv-list", + "hashbrown 0.14.5", +] + +[[package]] +name = "parking_lot" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93857453250e3077bd71ff98b6a65ea6621a19bb0f559a85248955ac12c45a1a" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall 0.5.18", + "smallvec", + "windows-link", +] + +[[package]] +name = "parse-display" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "914a1c2265c98e2446911282c6ac86d8524f495792c38c5bd884f80499c7538a" +dependencies = [ + "parse-display-derive 0.9.1", + "regex", + "regex-syntax", +] + +[[package]] +name = "parse-display" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "287d8d3ebdce117b8539f59411e4ed9ec226e0a4153c7f55495c6070d68e6f72" +dependencies = [ + "parse-display-derive 0.10.0", + "regex", + "regex-syntax", +] + +[[package]] +name = "parse-display-derive" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2ae7800a4c974efd12df917266338e79a7a74415173caf7e70aa0a0707345281" +dependencies = [ + "proc-macro2", + "quote", + 
"regex", + "regex-syntax", + "structmeta", + "syn", +] + +[[package]] +name = "parse-display-derive" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fc048687be30d79502dea2f623d052f3a074012c6eac41726b7ab17213616b1" +dependencies = [ + "proc-macro2", + "quote", + "regex", + "regex-syntax", + "structmeta", + "syn", +] + +[[package]] +name = "percent-encoding" +version = "2.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" + +[[package]] +name = "petgraph" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8701b58ea97060d5e5b155d383a69952a60943f0e6dfe30b04c287beb0b27455" +dependencies = [ + "fixedbitset", + "hashbrown 0.15.5", + "indexmap 2.13.1", +] + +[[package]] +name = "pin-project" +version = "1.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1749c7ed4bcaf4c3d0a3efc28538844fb29bcdd7d2b67b2be7e20ba861ff517" +dependencies = [ + "pin-project-internal", +] + +[[package]] +name = "pin-project-internal" +version = "1.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9b20ed30f105399776b9c883e68e536ef602a16ae6f596d2c473591d6ad64c6" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "pin-project-lite" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd" + +[[package]] +name = "pkg-config" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" + +[[package]] +name = "plain" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4596b6d070b27117e987119b4dac604f3c58cfb0b191112e24771b2faeac1a6" + +[[package]] +name = 
"portable-atomic" +version = "1.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49" + +[[package]] +name = "portable-atomic-util" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "091397be61a01d4be58e7841595bd4bfedb15f1cd54977d79b8271e94ed799a3" +dependencies = [ + "portable-atomic", +] + +[[package]] +name = "potential_utf" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0103b1cef7ec0cf76490e969665504990193874ea05c85ff9bab8b911d0a0564" +dependencies = [ + "zerovec", +] + +[[package]] +name = "powerfmt" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" + +[[package]] +name = "ppv-lite86" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" +dependencies = [ + "zerocopy", +] + +[[package]] +name = "prettyplease" +version = "0.2.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" +dependencies = [ + "proc-macro2", + "syn", +] + +[[package]] +name = "proc-macro2" +version = "1.0.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "prost" +version = "0.14.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2ea70524a2f82d518bce41317d0fae74151505651af45faf1ffbd6fd33f0568" +dependencies = [ + "bytes", + "prost-derive", +] + +[[package]] +name = "prost-build" +version = "0.14.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"343d3bd7056eda839b03204e68deff7d1b13aba7af2b2fd16890697274262ee7" +dependencies = [ + "heck", + "itertools", + "log", + "multimap", + "petgraph", + "prettyplease", + "prost", + "prost-types", + "regex", + "syn", + "tempfile", +] + +[[package]] +name = "prost-derive" +version = "0.14.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "27c6023962132f4b30eb4c172c91ce92d933da334c59c23cddee82358ddafb0b" +dependencies = [ + "anyhow", + "itertools", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "prost-types" +version = "0.14.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8991c4cbdb8bc5b11f0b074ffe286c30e523de90fee5ba8132f1399f23cb3dd7" +dependencies = [ + "prost", +] + +[[package]] +name = "pyo3" +version = "0.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ba0117f4212101ee6544044dae45abe1083d30ce7b29c4b5cbdfa2354e07383" +dependencies = [ + "indoc", + "libc", + "memoffset", + "once_cell", + "portable-atomic", + "pyo3-build-config", + "pyo3-ffi", + "pyo3-macros", + "unindent", +] + +[[package]] +name = "pyo3-async-runtimes" +version = "0.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6ee6d4cb3e8d5b925f5cdb38da183e0ff18122eb2048d4041c9e7034d026e23" +dependencies = [ + "futures", + "once_cell", + "pin-project-lite", + "pyo3", + "tokio", +] + +[[package]] +name = "pyo3-build-config" +version = "0.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fc6ddaf24947d12a9aa31ac65431fb1b851b8f4365426e182901eabfb87df5f" +dependencies = [ + "python3-dll-a", + "target-lexicon", +] + +[[package]] +name = "pyo3-ffi" +version = "0.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "025474d3928738efb38ac36d4744a74a400c901c7596199e20e45d98eb194105" +dependencies = [ + "libc", + "pyo3-build-config", +] + +[[package]] +name = "pyo3-macros" +version = "0.26.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e64eb489f22fe1c95911b77c44cc41e7c19f3082fc81cce90f657cdc42ffded" +dependencies = [ + "proc-macro2", + "pyo3-macros-backend", + "quote", + "syn", +] + +[[package]] +name = "pyo3-macros-backend" +version = "0.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "100246c0ecf400b475341b8455a9213344569af29a3c841d29270e53102e0fcf" +dependencies = [ + "heck", + "proc-macro2", + "pyo3-build-config", + "quote", + "syn", +] + +[[package]] +name = "python3-dll-a" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d381ef313ae70b4da5f95f8a4de773c6aa5cd28f73adec4b4a31df70b66780d8" +dependencies = [ + "cc", +] + +[[package]] +name = "quanta" +version = "0.12.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3ab5a9d756f0d97bdc89019bd2e4ea098cf9cde50ee7564dde6b81ccc8f06c7" +dependencies = [ + "crossbeam-utils", + "libc", + "once_cell", + "raw-cpuid", + "wasi", + "web-sys", + "winapi", +] + +[[package]] +name = "quick-xml" +version = "0.37.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "331e97a1af0bf59823e6eadffe373d7b27f485be8748f71471c662c1f269b7fb" +dependencies = [ + "memchr", + "serde", +] + +[[package]] +name = "quick-xml" +version = "0.38.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b66c2058c55a409d601666cffe35f04333cf1013010882cec174a7467cd4e21c" +dependencies = [ + "memchr", + "serde", +] + +[[package]] +name = "quinn" +version = "0.11.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e20a958963c291dc322d98411f541009df2ced7b5a4f2bd52337638cfccf20" +dependencies = [ + "bytes", + "cfg_aliases", + "pin-project-lite", + "quinn-proto", + "quinn-udp", + "rustc-hash", + "rustls", + "socket2", + "thiserror 2.0.18", + "tokio", + "tracing", + "web-time", +] + +[[package]] +name = "quinn-proto" +version 
= "0.11.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "434b42fec591c96ef50e21e886936e66d3cc3f737104fdb9b737c40ffb94c098" +dependencies = [ + "bytes", + "getrandom 0.3.4", + "lru-slab", + "rand 0.9.3", + "ring", + "rustc-hash", + "rustls", + "rustls-pki-types", + "slab", + "thiserror 2.0.18", + "tinyvec", + "tracing", + "web-time", +] + +[[package]] +name = "quinn-udp" +version = "0.5.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "addec6a0dcad8a8d96a771f815f0eaf55f9d1805756410b39f5fa81332574cbd" +dependencies = [ + "cfg_aliases", + "libc", + "once_cell", + "socket2", + "tracing", + "windows-sys 0.60.2", +] + +[[package]] +name = "quote" +version = "1.0.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "r-efi" +version = "5.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" + +[[package]] +name = "r-efi" +version = "6.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf" + +[[package]] +name = "radium" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc33ff2d4973d518d823d61aa239014831e521c75da58e3df4840d3f47749d09" + +[[package]] +name = "radix_trie" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c069c179fcdc6a2fe24d8d18305cf085fdbd4f922c041943e203685d6a1c58fd" +dependencies = [ + "endian-type", + "nibble_vec", +] + +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha 
0.3.1", + "rand_core 0.6.4", + "serde", +] + +[[package]] +name = "rand" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ec095654a25171c2124e9e3393a930bddbffdc939556c914957a4c3e0a87166" +dependencies = [ + "rand_chacha 0.9.0", + "rand_core 0.9.5", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core 0.6.4", +] + +[[package]] +name = "rand_chacha" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" +dependencies = [ + "ppv-lite86", + "rand_core 0.9.5", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom 0.2.17", + "serde", +] + +[[package]] +name = "rand_core" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c" +dependencies = [ + "getrandom 0.3.4", +] + +[[package]] +name = "rand_xoshiro" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f703f4665700daf5512dcca5f43afa6af89f09db47fb56be587f80636bda2d41" +dependencies = [ + "rand_core 0.9.5", +] + +[[package]] +name = "rapidhash" +version = "4.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5e48930979c155e2f33aa36ab3119b5ee81332beb6482199a8ecd6029b80b59" +dependencies = [ + "rustversion", +] + +[[package]] +name = "raw-cpuid" +version = "11.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "498cd0dc59d73224351ee52a95fee0f1a617a2eae0e7d9d720cc622c73a54186" 
+dependencies = [ + "bitflags", +] + +[[package]] +name = "redox_syscall" +version = "0.5.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" +dependencies = [ + "bitflags", +] + +[[package]] +name = "redox_syscall" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ce70a74e890531977d37e532c34d45e9055d2409ed08ddba14529471ed0be16" +dependencies = [ + "bitflags", +] + +[[package]] +name = "ref-cast" +version = "1.0.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f354300ae66f76f1c85c5f84693f0ce81d747e2c3f21a45fef496d89c960bf7d" +dependencies = [ + "ref-cast-impl", +] + +[[package]] +name = "ref-cast-impl" +version = "1.0.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7186006dcb21920990093f30e3dea63b7d6e977bf1256be20c3563a5db070da" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "regex" +version = "1.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-lite" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cab834c73d247e67f4fae452806d17d3c7501756d98c8808d7c9c7aa7d18f973" + +[[package]] +name = "regex-syntax" +version = "0.8.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" + +[[package]] +name = 
"reqsign" +version = "0.16.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43451dbf3590a7590684c25fb8d12ecdcc90ed3ac123433e500447c7d77ed701" +dependencies = [ + "anyhow", + "async-trait", + "base64 0.22.1", + "chrono", + "form_urlencoded", + "getrandom 0.2.17", + "hex", + "hmac", + "home", + "http", + "log", + "once_cell", + "percent-encoding", + "quick-xml 0.37.5", + "rand 0.8.5", + "reqwest", + "rust-ini", + "serde", + "serde_json", + "sha1", + "sha2", + "tokio", +] + +[[package]] +name = "reqwest" +version = "0.12.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eddd3ca559203180a307f12d114c268abf583f59b03cb906fd0b3ff8646c1147" +dependencies = [ + "base64 0.22.1", + "bytes", + "futures-core", + "futures-util", + "http", + "http-body", + "http-body-util", + "hyper", + "hyper-rustls", + "hyper-util", + "js-sys", + "log", + "percent-encoding", + "pin-project-lite", + "quinn", + "rustls", + "rustls-pki-types", + "serde", + "serde_json", + "serde_urlencoded", + "sync_wrapper", + "tokio", + "tokio-rustls", + "tokio-util", + "tower", + "tower-http", + "tower-service", + "url", + "wasm-bindgen", + "wasm-bindgen-futures", + "wasm-streams", + "web-sys", + "webpki-roots", +] + +[[package]] +name = "ring" +version = "0.17.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7" +dependencies = [ + "cc", + "cfg-if", + "getrandom 0.2.17", + "libc", + "untrusted", + "windows-sys 0.52.0", +] + +[[package]] +name = "rust-ini" +version = "0.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "796e8d2b6696392a43bea58116b667fb4c29727dc5abd27d6acf338bb4f688c7" +dependencies = [ + "cfg-if", + "ordered-multimap", +] + +[[package]] +name = "rustc-hash" +version = "2.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"94300abf3f1ae2e2b8ffb7b58043de3d399c73fa6f4b73826402a5c457614dbe" + +[[package]] +name = "rustc_version" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" +dependencies = [ + "semver", +] + +[[package]] +name = "rustix" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys", + "windows-sys 0.61.2", +] + +[[package]] +name = "rustler" +version = "0.37.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c779e2cbfa2987990205d0d8fc142163739e45a4c6592dc637896c77fec01280" +dependencies = [ + "inventory", + "libloading", + "regex-lite", + "rustler_codegen", +] + +[[package]] +name = "rustler_codegen" +version = "0.37.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6e120f8936c779b6c2e09992a2dfa9a4e8bcd0794c02bb654fde03e03ce8c31" +dependencies = [ + "heck", + "inventory", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "rustls" +version = "0.23.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "758025cb5fccfd3bc2fd74708fd4682be41d99e5dff73c377c0646c6012c73a4" +dependencies = [ + "log", + "once_cell", + "ring", + "rustls-pki-types", + "rustls-webpki", + "subtle", + "zeroize", +] + +[[package]] +name = "rustls-native-certs" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "612460d5f7bea540c490b2b6395d8e34a953e52b491accd6c86c8164c5932a63" +dependencies = [ + "openssl-probe", + "rustls-pki-types", + "schannel", + "security-framework", +] + +[[package]] +name = "rustls-pki-types" +version = "1.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"be040f8b0a225e40375822a563fa9524378b9d63112f53e19ffff34df5d33fdd" +dependencies = [ + "web-time", + "zeroize", +] + +[[package]] +name = "rustls-webpki" +version = "0.103.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61c429a8649f110dddef65e2a5ad240f747e85f7758a6bccc7e5777bd33f756e" +dependencies = [ + "ring", + "rustls-pki-types", + "untrusted", +] + +[[package]] +name = "rustversion" +version = "1.0.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" + +[[package]] +name = "ryu" +version = "1.0.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" + +[[package]] +name = "schannel" +version = "0.1.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91c1b7e4904c873ef0710c1f407dde2e6287de2bebc1bbbf7d430bb7cbffd939" +dependencies = [ + "windows-sys 0.61.2", +] + +[[package]] +name = "schemars" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4cd191f9397d57d581cddd31014772520aa448f65ef991055d7f61582c65165f" +dependencies = [ + "dyn-clone", + "ref-cast", + "serde", + "serde_json", +] + +[[package]] +name = "schemars" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2b42f36aa1cd011945615b92222f6bf73c599a102a300334cd7f8dbeec726cc" +dependencies = [ + "dyn-clone", + "ref-cast", + "serde", + "serde_json", +] + +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + +[[package]] +name = "scratch" +version = "1.0.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d68f2ec51b097e4c1a75b681a8bec621909b5e91f15bb7b840c4f2f7b01148b2" + +[[package]] +name = 
"security-framework" +version = "3.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7f4bc775c73d9a02cde8bf7b2ec4c9d12743edf609006c7facc23998404cd1d" +dependencies = [ + "bitflags", + "core-foundation", + "core-foundation-sys", + "libc", + "security-framework-sys", +] + +[[package]] +name = "security-framework-sys" +version = "2.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ce2691df843ecc5d231c0b14ece2acc3efb62c0a398c7e1d875f3983ce020e3" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "semver" +version = "1.0.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a7852d02fc848982e0c167ef163aaff9cd91dc640ba85e263cb1ce46fae51cd" + +[[package]] +name = "serde" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", + "serde_derive", +] + +[[package]] +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_fmt" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e497af288b3b95d067a23a4f749f2861121ffcb2f6d8379310dcda040c345ed" +dependencies = [ + "serde_core", +] + +[[package]] +name = "serde_json" +version = "1.0.149" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86" +dependencies = [ + "itoa", + 
"memchr", + "serde", + "serde_core", + "zmij", +] + +[[package]] +name = "serde_repr" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "175ee3e80ae9982737ca543e96133087cbd9a485eecc3bc4de9c1a37b47ea59c" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_urlencoded" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" +dependencies = [ + "form_urlencoded", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "serde_with" +version = "3.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dd5414fad8e6907dbdd5bc441a50ae8d6e26151a03b1de04d89a5576de61d01f" +dependencies = [ + "base64 0.22.1", + "chrono", + "hex", + "indexmap 1.9.3", + "indexmap 2.13.1", + "schemars 0.9.0", + "schemars 1.2.1", + "serde_core", + "serde_json", + "serde_with_macros", + "time", +] + +[[package]] +name = "serde_with_macros" +version = "3.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3db8978e608f1fe7357e211969fd9abdcae80bac1ba7a3369bb7eb6b404eb65" +dependencies = [ + "darling", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "sha1" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + +[[package]] +name = "sha2" +version = "0.10.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + +[[package]] 
+name = "signal-hook-registry" +version = "1.4.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4db69cba1110affc0e9f7bcd48bbf87b3f4fc7c61fc9155afd4c469eb3d6c1b" +dependencies = [ + "errno", + "libc", +] + +[[package]] +name = "simdutf8" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e" + +[[package]] +name = "sketches-ddsketch" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c6f73aeb92d671e0cc4dca167e59b2deb6387c375391bc99ee743f326994a2b" + +[[package]] +name = "slab" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5" + +[[package]] +name = "smallvec" +version = "1.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" + +[[package]] +name = "snafu" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e84b3f4eacbf3a1ce05eac6763b4d629d60cbc94d632e4092c54ade71f1e1a2" +dependencies = [ + "snafu-derive", +] + +[[package]] +name = "snafu-derive" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1c97747dbf44bb1ca44a561ece23508e99cb592e862f22222dcf42f51d1e451" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "socket2" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a766e1110788c36f4fa1c2b71b387a7815aa65f88ce0229841826633d93723e" +dependencies = [ + "libc", + "windows-sys 0.61.2", +] + +[[package]] +name = "stable_deref_trait" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" 
+ +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + +[[package]] +name = "structmeta" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e1575d8d40908d70f6fd05537266b90ae71b15dbbe7a8b7dffa2b759306d329" +dependencies = [ + "proc-macro2", + "quote", + "structmeta-derive", + "syn", +] + +[[package]] +name = "structmeta-derive" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "152a0b65a590ff6c3da95cabe2353ee04e6167c896b28e3b14478c2636c922fc" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "strum" +version = "0.26.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06" + +[[package]] +name = "strum_macros" +version = "0.26.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "rustversion", + "syn", +] + +[[package]] +name = "subtle" +version = "2.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" + +[[package]] +name = "sval" +version = "2.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2eb9318255ebd817902d7e279d8f8e39b35b1b9954decd5eb9ea0e30e5fd2b6a" + +[[package]] +name = "sval_buffer" +version = "2.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "12571299185e653fdb0fbfe36cd7f6529d39d4e747a60b15a3f34574b7b97c61" +dependencies = [ + "sval", + "sval_ref", +] + +[[package]] +name = "sval_dynamic" +version = "2.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum 
= "39526f24e997706c0de7f03fb7371f7f5638b66a504ded508e20ad173d0a3677" +dependencies = [ + "sval", +] + +[[package]] +name = "sval_fmt" +version = "2.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "933dd3bb26965d682280fcc49400ac2a05036f4ee1e6dbd61bf8402d5a5c3a54" +dependencies = [ + "itoa", + "ryu", + "sval", +] + +[[package]] +name = "sval_json" +version = "2.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a0cda08f6d5c9948024a6551077557b1fdcc3880ff2f20ae839667d2ec2d87ed" +dependencies = [ + "itoa", + "ryu", + "sval", +] + +[[package]] +name = "sval_nested" +version = "2.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88d49d5e6c1f9fd0e53515819b03a97ca4eb1bff5c8ee097c43391c09ecfb19f" +dependencies = [ + "sval", + "sval_buffer", + "sval_ref", +] + +[[package]] +name = "sval_ref" +version = "2.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14f876c5a78405375b4e19cbb9554407513b59c93dea12dc6a4af4e1d30899ca" +dependencies = [ + "sval", +] + +[[package]] +name = "sval_serde" +version = "2.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f9ccd3b7f7200239a655e517dd3fd48d960b9111ad24bd6a5e055bef17607c7" +dependencies = [ + "serde_core", + "sval", + "sval_nested", +] + +[[package]] +name = "syn" +version = "2.0.117" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "sync_wrapper" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bf256ce5efdfa370213c1dabab5935a12e49f2c58d15e9eac2870d3b4f27263" +dependencies = [ + "futures-core", +] + +[[package]] +name = "synstructure" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tap" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" + +[[package]] +name = "target-lexicon" +version = "0.13.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "adb6935a6f5c20170eeceb1a3835a49e12e19d792f6dd344ccc76a985ca5a6ca" + +[[package]] +name = "tempfile" +version = "3.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd" +dependencies = [ + "fastrand", + "getrandom 0.4.2", + "once_cell", + "rustix", + "windows-sys 0.61.2", +] + +[[package]] +name = "termcolor" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06794f8f6c5c898b3275aebefa6b8a1cb24cd2c6c79397ab15774837a0bc5755" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "testcontainers" +version = "0.27.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bd36b06a2a6c0c3c81a83be1ab05fe86460d054d4d51bf513bc56b3e15bdc22" +dependencies = [ + "astral-tokio-tar", + "async-trait", + "bollard", + "bytes", + "docker_credential", + "either", + "etcetera", + "ferroid", + "futures", + "http", + "itertools", + "log", + "memchr", + "parse-display 0.9.1", + "pin-project-lite", + "serde", + "serde_json", + "serde_with", + "thiserror 2.0.18", + "tokio", + "tokio-stream", + "tokio-util", + "url", +] + +[[package]] +name = "thiserror" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" +dependencies = [ + "thiserror-impl 1.0.69", +] + +[[package]] +name = "thiserror" +version = "2.0.18" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4" +dependencies = [ + "thiserror-impl 2.0.18", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "thiserror-impl" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tikv-jemalloc-sys" +version = "0.6.1+5.3.0-1-ge13ca993e8ccb9ba9847cc330696e02839f328f7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd8aa5b2ab86a2cefa406d889139c162cbb230092f7d1d7cbc1716405d852a3b" +dependencies = [ + "cc", + "libc", +] + +[[package]] +name = "tikv-jemallocator" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0359b4327f954e0567e69fb191cf1436617748813819c94b8cd4a431422d053a" +dependencies = [ + "libc", + "tikv-jemalloc-sys", +] + +[[package]] +name = "time" +version = "0.3.47" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "743bd48c283afc0388f9b8827b976905fb217ad9e647fae3a379a9283c4def2c" +dependencies = [ + "deranged", + "itoa", + "num-conv", + "powerfmt", + "serde_core", + "time-core", + "time-macros", +] + +[[package]] +name = "time-core" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7694e1cfe791f8d31026952abf09c69ca6f6fa4e1a1229e18988f06a04a12dca" + +[[package]] +name = "time-macros" +version = "0.2.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e70e4c5a0e0a8a4823ad65dfe1a6930e4f4d756dcd9dd7939022b5e8c501215" +dependencies = 
[ + "num-conv", + "time-core", +] + +[[package]] +name = "tiny-keccak" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237" +dependencies = [ + "crunchy", +] + +[[package]] +name = "tinystr" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8323304221c2a851516f22236c5722a72eaa19749016521d6dff0824447d96d" +dependencies = [ + "displaydoc", + "zerovec", +] + +[[package]] +name = "tinyvec" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e61e67053d25a4e82c844e8424039d9745781b3fc4f32b8d55ed50f5f667ef3" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" + +[[package]] +name = "tokio" +version = "1.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2bd1c4c0fc4a7ab90fc15ef6daaa3ec3b893f004f915f2392557ed23237820cd" +dependencies = [ + "bytes", + "libc", + "mio", + "parking_lot", + "pin-project-lite", + "signal-hook-registry", + "socket2", + "tokio-macros", + "windows-sys 0.61.2", +] + +[[package]] +name = "tokio-macros" +version = "2.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "385a6cb71ab9ab790c5fe8d67f1645e6c450a7ce006a33de03daa956cf70a496" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tokio-rustls" +version = "0.26.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1729aa945f29d91ba541258c8df89027d5792d85a8841fb65e8bf0f4ede4ef61" +dependencies = [ + "rustls", + "tokio", +] + +[[package]] +name = "tokio-stream" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"32da49809aab5c3bc678af03902d4ccddea2a87d028d86392a4b1560c6906c70" +dependencies = [ + "futures-core", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "tokio-util" +version = "0.7.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ae9cec805b01e8fc3fd2fe289f89149a9b66dd16786abd8b19cfa7b48cb0098" +dependencies = [ + "bytes", + "futures-core", + "futures-sink", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "tonic" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fec7c61a0695dc1887c1b53952990f3ad2e3a31453e1f49f10e75424943a93ec" +dependencies = [ + "async-trait", + "axum", + "base64 0.22.1", + "bytes", + "h2", + "http", + "http-body", + "http-body-util", + "hyper", + "hyper-timeout", + "hyper-util", + "percent-encoding", + "pin-project", + "socket2", + "sync_wrapper", + "tokio", + "tokio-stream", + "tower", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tonic-prost" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a55376a0bbaa4975a3f10d009ad763d8f4108f067c7c2e74f3001fb49778d309" +dependencies = [ + "bytes", + "prost", + "tonic", +] + +[[package]] +name = "tower" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebe5ef63511595f1344e2d5cfa636d973292adc0eec1f0ad45fae9f0851ab1d4" +dependencies = [ + "futures-core", + "futures-util", + "indexmap 2.13.1", + "pin-project-lite", + "slab", + "sync_wrapper", + "tokio", + "tokio-util", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tower-http" +version = "0.6.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4e6559d53cc268e5031cd8429d05415bc4cb4aefc4aa5d6cc35fbf5b924a1f8" +dependencies = [ + "bitflags", + "bytes", + "futures-util", + "http", + "http-body", + "iri-string", + "pin-project-lite", + "tower", + "tower-layer", + "tower-service", 
+] + +[[package]] +name = "tower-layer" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e" + +[[package]] +name = "tower-service" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3" + +[[package]] +name = "tracing" +version = "0.1.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63e71662fa4b2a2c3a26f570f037eb95bb1f85397f3cd8076caed2f026a6d100" +dependencies = [ + "pin-project-lite", + "tracing-attributes", + "tracing-core", +] + +[[package]] +name = "tracing-attributes" +version = "0.1.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tracing-core" +version = "0.1.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db97caf9d906fbde555dd62fa95ddba9eecfd14cb388e4f491a66d74cd5fb79a" +dependencies = [ + "once_cell", +] + +[[package]] +name = "try-lock" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" + +[[package]] +name = "twox-hash" +version = "2.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ea3136b675547379c4bd395ca6b938e5ad3c3d20fad76e7fe85f9e0d011419c" + +[[package]] +name = "typeid" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc7d623258602320d5c55d1bc22793b57daff0ec7efc270ea7d55ce1d5f5471c" + +[[package]] +name = "typenum" +version = "1.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb" + 
+[[package]] +name = "unicode-ident" +version = "1.0.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" + +[[package]] +name = "unicode-width" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254" + +[[package]] +name = "unicode-xid" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" + +[[package]] +name = "unindent" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3" + +[[package]] +name = "untrusted" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" + +[[package]] +name = "ureq" +version = "3.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dea7109cdcd5864d4eeb1b58a1648dc9bf520360d7af16ec26d0a9354bafcfc0" +dependencies = [ + "base64 0.22.1", + "log", + "percent-encoding", + "rustls", + "rustls-pki-types", + "ureq-proto", + "utf8-zero", +] + +[[package]] +name = "ureq-proto" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e994ba84b0bd1b1b0cf92878b7ef898a5c1760108fe7b6010327e274917a808c" +dependencies = [ + "base64 0.22.1", + "http", + "httparse", + "log", +] + +[[package]] +name = "url" +version = "2.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff67a8a4397373c3ef660812acab3268222035010ab8680ec4215f38ba3d0eed" +dependencies = [ + "form_urlencoded", + "idna", + "percent-encoding", + "serde", + "serde_derive", +] + +[[package]] +name = "utf8-zero" +version = "0.8.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8c0a043c9540bae7c578c88f91dda8bd82e59ae27c21baca69c8b191aaf5a6e" + +[[package]] +name = "utf8_iter" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" + +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + +[[package]] +name = "uuid" +version = "1.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ac8b6f42ead25368cf5b098aeb3dc8a1a2c05a3eee8a9a1a68c640edbfc79d9" +dependencies = [ + "getrandom 0.4.2", + "js-sys", + "serde_core", + "wasm-bindgen", +] + +[[package]] +name = "value-bag" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ba6f5989077681266825251a52748b8c1d8a4ad098cc37e440103d0ea717fc0" +dependencies = [ + "value-bag-serde1", + "value-bag-sval2", +] + +[[package]] +name = "value-bag-serde1" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16530907bfe2999a1773ca5900a65101e092c70f642f25cc23ca0c43573262c5" +dependencies = [ + "erased-serde", + "serde_core", + "serde_fmt", +] + +[[package]] +name = "value-bag-sval2" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d00ae130edd690eaa877e4f40605d534790d1cf1d651e7685bd6a144521b251f" +dependencies = [ + "sval", + "sval_buffer", + "sval_dynamic", + "sval_fmt", + "sval_json", + "sval_ref", + "sval_serde", +] + +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + +[[package]] +name = "want" +version = "0.3.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa7760aed19e106de2c7c0b581b509f2f25d3dacaf737cb82ac61bc6d760b0e" +dependencies = [ + "try-lock", +] + +[[package]] +name = "wasi" +version = "0.11.1+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" + +[[package]] +name = "wasip2" +version = "1.0.2+wasi-0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9517f9239f02c069db75e65f174b3da828fe5f5b945c4dd26bd25d89c03ebcf5" +dependencies = [ + "wit-bindgen", +] + +[[package]] +name = "wasip3" +version = "0.4.0+wasi-0.3.0-rc-2026-01-06" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5" +dependencies = [ + "wit-bindgen", +] + +[[package]] +name = "wasm-bindgen" +version = "0.2.117" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0551fc1bb415591e3372d0bc4780db7e587d84e2a7e79da121051c5c4b89d0b0" +dependencies = [ + "cfg-if", + "once_cell", + "rustversion", + "wasm-bindgen-macro", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-futures" +version = "0.4.67" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03623de6905b7206edd0a75f69f747f134b7f0a2323392d664448bf2d3c5d87e" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.117" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fbdf9a35adf44786aecd5ff89b4563a90325f9da0923236f6104e603c7e86be" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.117" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dca9693ef2bab6d4e6707234500350d8dad079eb508dca05530c85dc3a529ff2" +dependencies = [ + "bumpalo", + 
"proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.117" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39129a682a6d2d841b6c429d0c51e5cb0ed1a03829d8b3d1e69a011e62cb3d3b" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "wasm-encoder" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319" +dependencies = [ + "leb128fmt", + "wasmparser", +] + +[[package]] +name = "wasm-metadata" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909" +dependencies = [ + "anyhow", + "indexmap 2.13.1", + "wasm-encoder", + "wasmparser", +] + +[[package]] +name = "wasm-streams" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "15053d8d85c7eccdbefef60f06769760a563c7f0a9d6902a13d35c7800b0ad65" +dependencies = [ + "futures-util", + "js-sys", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", +] + +[[package]] +name = "wasmparser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" +dependencies = [ + "bitflags", + "hashbrown 0.15.5", + "indexmap 2.13.1", + "semver", +] + +[[package]] +name = "web-sys" +version = "0.3.94" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd70027e39b12f0849461e08ffc50b9cd7688d942c1c8e3c7b22273236b4dd0a" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "web-time" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = 
"webpki-roots" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22cfaf3c063993ff62e73cb4311efde4db1efb31ab78a3e5c457939ad5cc0bed" +dependencies = [ + "rustls-pki-types", +] + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-util" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" +dependencies = [ + "windows-sys 0.61.2", +] + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows-core" +version = "0.62.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8e83a14d34d0623b51dce9581199302a221863196a1dde71a7663a4c2be9deb" +dependencies = [ + "windows-implement", + "windows-interface", + "windows-link", + "windows-result", + "windows-strings", +] + +[[package]] +name = "windows-implement" +version = "0.60.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "windows-interface" +version = "0.59.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" 
+dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "windows-link" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" + +[[package]] +name = "windows-result" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7781fa89eaf60850ac3d2da7af8e5242a5ea78d1a11c49bf2910bb5a73853eb5" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-strings" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7837d08f69c77cf6b07689544538e017c1bfcf57e34b4c0ff58e6c2cd3b37091" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-sys" +version = "0.60.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb" +dependencies = [ + "windows-targets 0.53.5", +] + +[[package]] +name = "windows-sys" +version = "0.61.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", + "windows_i686_gnullvm 0.52.6", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", +] + +[[package]] +name = 
"windows-targets" +version = "0.53.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4945f9f551b88e0d65f3db0bc25c33b8acea4d9e41163edf90dcd0b19f9069f3" +dependencies = [ + "windows-link", + "windows_aarch64_gnullvm 0.53.1", + "windows_aarch64_msvc 0.53.1", + "windows_i686_gnu 0.53.1", + "windows_i686_gnullvm 0.53.1", + "windows_i686_msvc 0.53.1", + "windows_x86_64_gnu 0.53.1", + "windows_x86_64_gnullvm 0.53.1", + "windows_x86_64_msvc 0.53.1", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnu" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "960e6da069d81e09becb0ca57a65220ddff016ff2d6af6a223cf372a506593a3" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = 
"windows_i686_gnullvm" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_i686_msvc" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" + +[[package]] +name = "wit-bindgen" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" +dependencies = [ + "wit-bindgen-rust-macro", +] + +[[package]] +name = "wit-bindgen-core" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc" +dependencies = [ + "anyhow", + "heck", + "wit-parser", +] + +[[package]] +name = "wit-bindgen-rust" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21" +dependencies = [ + "anyhow", + "heck", + "indexmap 2.13.1", + "prettyplease", + "syn", + "wasm-metadata", + "wit-bindgen-core", + "wit-component", +] + +[[package]] +name = "wit-bindgen-rust-macro" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a" +dependencies = [ + "anyhow", + "prettyplease", + "proc-macro2", + "quote", + "syn", + "wit-bindgen-core", + "wit-bindgen-rust", +] + +[[package]] +name = "wit-component" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" +dependencies = [ + "anyhow", + "bitflags", + "indexmap 2.13.1", + "log", + "serde", + "serde_derive", + "serde_json", + "wasm-encoder", + "wasm-metadata", + "wasmparser", + "wit-parser", +] + +[[package]] +name = "wit-parser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736" +dependencies = [ + "anyhow", + "id-arena", + "indexmap 2.13.1", + "log", + "semver", + "serde", + "serde_derive", + "serde_json", + "unicode-xid", + "wasmparser", +] + +[[package]] +name = "writeable" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"1ffae5123b2d3fc086436f8834ae3ab053a283cfac8fe0a0b8eaae044768a4c4" + +[[package]] +name = "wyz" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05f360fc0b24296329c78fda852a1e9ae82de9cf7b27dae4b7f62f118f77b9ed" +dependencies = [ + "tap", +] + +[[package]] +name = "xattr" +version = "1.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32e45ad4206f6d2479085147f02bc2ef834ac85886624a23575ae137c8aa8156" +dependencies = [ + "libc", + "rustix", +] + +[[package]] +name = "yoke" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "abe8c5fda708d9ca3df187cae8bfb9ceda00dd96231bed36e445a1a48e66f9ca" +dependencies = [ + "stable_deref_trait", + "yoke-derive", + "zerofrom", +] + +[[package]] +name = "yoke-derive" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de844c262c8848816172cef550288e7dc6c7b7814b4ee56b3e1553f275f1858e" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "synstructure", +] + +[[package]] +name = "zerocopy" +version = "0.8.48" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eed437bf9d6692032087e337407a86f04cd8d6a16a37199ed57949d415bd68e9" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.8.48" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70e3cd084b1788766f53af483dd21f93881ff30d7320490ec3ef7526d203bad4" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "zerofrom" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69faa1f2a1ea75661980b013019ed6687ed0e83d069bc1114e2cc74c6c04c4df" +dependencies = [ + "zerofrom-derive", +] + +[[package]] +name = "zerofrom-derive" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"11532158c46691caf0f2593ea8358fed6bbf68a0315e80aae9bd41fbade684a1" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "synstructure", +] + +[[package]] +name = "zeroize" +version = "1.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0" + +[[package]] +name = "zerotrie" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f9152d31db0792fa83f70fb2f83148effb5c1f5b8c7686c3459e361d9bc20bf" +dependencies = [ + "displaydoc", + "yoke", + "zerofrom", +] + +[[package]] +name = "zerovec" +version = "0.11.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90f911cbc359ab6af17377d242225f4d75119aec87ea711a880987b18cd7b239" +dependencies = [ + "yoke", + "zerofrom", + "zerovec-derive", +] + +[[package]] +name = "zerovec-derive" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "625dc425cab0dca6dc3c3319506e6593dcb08a9f387ea3b284dbd52a92c40555" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "zmij" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" + +[[package]] +name = "zstd" +version = "0.13.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e91ee311a569c327171651566e07972200e76fcfe2242a4fa446149a3881c08a" +dependencies = [ + "zstd-safe", +] + +[[package]] +name = "zstd-safe" +version = "7.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f49c4d5f0abb602a93fb8736af2a4f4dd9512e36f7f570d66e65ff867ed3b9d" +dependencies = [ + "zstd-sys", +] + +[[package]] +name = "zstd-sys" +version = "2.0.16+zstd.1.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e19ebc2adc8f83e43039e79776e3fda8ca919132d68a1fed6a5faca2683748" 
+dependencies = [ + "cc", + "pkg-config", +] diff --git a/fluss-rust/Cargo.toml b/fluss-rust/Cargo.toml new file mode 100644 index 0000000000..a555a9198a --- /dev/null +++ b/fluss-rust/Cargo.toml @@ -0,0 +1,43 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +[workspace.package] +authors = ["Apache Fluss "] +categories = ["api-bindings", "database"] +edition = "2024" +homepage = "https://clients.fluss.apache.org/" +license = "Apache-2.0" +repository = "https://github.com/apache/fluss-rust" +rust-version = "1.85" +version = "1.0.0" +keywords = ["fluss", "streaming-storage", "datalake"] + +[workspace] +resolver = "2" +members = ["crates/fluss", "crates/fluss-test-cluster", "crates/examples", "bindings/python", "bindings/cpp", "bindings/elixir/native/fluss_nif"] + +[workspace.dependencies] +fluss = { package = "fluss-rs", version = "1.0.0", path = "crates/fluss", features = ["storage-all"] } +tokio = { version = "1.44.2", features = ["full"] } +clap = { version = "4.5.37", features = ["derive"] } +arrow = { version = "57.0.0", features = ["ipc_compression", "ffi"] } +bigdecimal = "0.4" +serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0" +metrics = "0.24" +opendal = "0.53" +jiff = { version = "0.2" } diff --git a/fluss-rust/DEPENDENCIES.rust.tsv b/fluss-rust/DEPENDENCIES.rust.tsv new file mode 100644 index 0000000000..b46eeac210 --- /dev/null +++ b/fluss-rust/DEPENDENCIES.rust.tsv @@ -0,0 +1,325 @@ +crate Apache-2.0 Apache-2.0 WITH LLVM-exception BSD-2-Clause BSD-3-Clause BSL-1.0 CC0-1.0 CDLA-Permissive-2.0 ISC LGPL-2.1-or-later MIT Unicode-3.0 Unlicense Zlib +ahash@0.8.12 X X +aho-corasick@1.1.4 X X +android_system_properties@0.1.5 X X +anstream@1.0.0 X X +anstyle@1.0.14 X X +anstyle-parse@1.0.0 X X +anstyle-query@1.1.5 X X +anstyle-wincon@3.0.11 X X +anyhow@1.0.102 X X +arrow@57.3.0 X +arrow-arith@57.3.0 X +arrow-array@57.3.0 X +arrow-buffer@57.3.0 X +arrow-cast@57.3.0 X +arrow-csv@57.3.0 X +arrow-data@57.3.0 X +arrow-ipc@57.3.0 X +arrow-json@57.3.0 X +arrow-ord@57.3.0 X +arrow-pyarrow@57.3.0 X +arrow-row@57.3.0 X +arrow-schema@57.3.0 X +arrow-select@57.3.0 X +arrow-string@57.3.0 X +async-trait@0.1.89 X X +atoi@2.0.0 X +atomic-waker@1.1.2 X X +autocfg@1.5.0 X X +backon@1.6.0 X +base64@0.22.1 X X 
+bigdecimal@0.4.10 X X +bitflags@2.11.0 X X +bitvec@1.0.1 X +block-buffer@0.10.4 X X +bumpalo@3.20.2 X X +byteorder@1.5.0 X X +bytes@1.11.1 X +cc@1.2.57 X X +cfg-if@1.0.4 X X +chrono@0.4.44 X X +clap@4.6.0 X X +clap_builder@4.6.0 X X +clap_derive@4.6.0 X X +clap_lex@1.1.0 X X +codespan-reporting@0.13.1 X +colorchoice@1.0.5 X X +const-oid@0.9.6 X X +const-random@0.1.18 X X +const-random-macro@0.1.16 X X +core-foundation-sys@0.8.7 X X +cpufeatures@0.2.17 X X +crc32c@0.6.8 X X +crossbeam-utils@0.8.21 X X +crunchy@0.2.4 X +crypto-common@0.1.7 X X +csv@1.4.0 X X +csv-core@0.1.13 X X +cxx@1.0.194 X X +cxx-build@1.0.194 X X +cxxbridge-flags@1.0.194 X X +cxxbridge-macro@1.0.194 X X +dashmap@6.1.0 X +delegate@0.13.5 X X +digest@0.10.7 X X +displaydoc@0.2.5 X X +either@1.15.0 X X +equivalent@1.0.2 X X +errno@0.3.14 X X +fastrand@2.3.0 X X +find-msvc-tools@0.1.9 X X +fixedbitset@0.5.7 X X +flatbuffers@25.12.19 X +fluss-cpp@0.1.0 X +fluss-examples@0.1.0 X +fluss-rs@0.1.0 X +fluss_python@0.1.0 X +fnv@1.0.7 X X +foldhash@0.1.5 X +foldhash@0.2.0 X +form_urlencoded@1.2.2 X X +funty@2.0.0 X +futures@0.3.32 X X +futures-channel@0.3.32 X X +futures-core@0.3.32 X X +futures-executor@0.3.32 X X +futures-io@0.3.32 X X +futures-macro@0.3.32 X X +futures-sink@0.3.32 X X +futures-task@0.3.32 X X +futures-util@0.3.32 X X +generic-array@0.14.7 X +getrandom@0.2.17 X X +getrandom@0.3.4 X X +getrandom@0.4.2 X X +gloo-timers@0.3.0 X X +h2@0.4.13 X +half@2.7.1 X X +hashbrown@0.14.5 X X +hashbrown@0.15.5 X X +hashbrown@0.16.1 X X +heck@0.5.0 X X +hex@0.4.3 X X +hmac@0.12.1 X X +home@0.5.12 X X +http@1.4.0 X X +http-body@1.0.1 X +http-body-util@0.1.3 X +httparse@1.10.1 X X +httpdate@1.0.3 X X +hyper@1.8.1 X +hyper-rustls@0.27.7 X X X +hyper-util@0.1.20 X +iana-time-zone@0.1.65 X X +iana-time-zone-haiku@0.1.2 X X +icu_collections@2.1.1 X +icu_locale_core@2.1.1 X +icu_normalizer@2.1.1 X +icu_normalizer_data@2.1.1 X +icu_properties@2.1.2 X +icu_properties_data@2.1.2 X +icu_provider@2.1.1 X +idna@1.1.0 
X X +idna_adapter@1.2.1 X X +indexmap@2.13.0 X X +indoc@2.0.7 X X +ipnet@2.12.0 X X +iri-string@0.7.11 X X +is_terminal_polyfill@1.70.2 X X +itertools@0.14.0 X X +itoa@1.0.18 X X +jiff@0.2.23 X X +jiff-tzdb@0.1.6 X X +jiff-tzdb-platform@0.1.3 X X +jobserver@0.1.34 X X +js-sys@0.3.91 X X +lexical-core@1.0.6 X X +lexical-parse-float@1.0.6 X X +lexical-parse-integer@1.0.6 X X +lexical-util@1.0.7 X X +lexical-write-float@1.0.6 X X +lexical-write-integer@1.0.6 X X +libc@0.2.183 X X +libm@0.2.16 X +link-cplusplus@1.0.12 X X +linked-hash-map@0.5.6 X X +linux-raw-sys@0.12.1 X X X +litemap@0.8.1 X +lock_api@0.4.14 X X +log@0.4.29 X X +lz4_flex@0.12.1 X +md-5@0.10.6 X X +memchr@2.8.0 X X +memoffset@0.9.1 X +mio@1.1.1 X +multimap@0.10.1 X X +num-bigint@0.4.6 X X +num-complex@0.4.6 X X +num-integer@0.1.46 X X +num-traits@0.2.19 X X +once_cell@1.21.4 X X +once_cell_polyfill@1.70.2 X X +opendal@0.55.0 X +ordered-float@5.1.0 X +parking_lot@0.12.5 X X +parking_lot_core@0.9.12 X X +parse-display@0.10.0 X X +parse-display-derive@0.10.0 X X +percent-encoding@2.3.2 X X +petgraph@0.8.3 X X +pin-project-lite@0.2.17 X X +pin-utils@0.1.0 X X +pkg-config@0.3.32 X X +portable-atomic@1.13.1 X X +portable-atomic-util@0.2.6 X X +potential_utf@0.1.4 X +ppv-lite86@0.2.21 X X +prettyplease@0.2.37 X X +proc-macro2@1.0.106 X X +prost@0.14.3 X +prost-build@0.14.3 X +prost-derive@0.14.3 X +prost-types@0.14.3 X +pyo3@0.26.0 X X +pyo3-async-runtimes@0.26.0 X +pyo3-build-config@0.26.0 X X +pyo3-ffi@0.26.0 X X +pyo3-macros@0.26.0 X X +pyo3-macros-backend@0.26.0 X X +python3-dll-a@0.2.14 X +quick-xml@0.37.5 X +quick-xml@0.38.4 X +quote@1.0.45 X X +r-efi@5.3.0 X X X +r-efi@6.0.0 X X X +radium@0.7.0 X +rand@0.8.5 X X +rand@0.9.2 X X +rand_chacha@0.3.1 X X +rand_chacha@0.9.0 X X +rand_core@0.6.4 X X +rand_core@0.9.5 X X +redox_syscall@0.5.18 X +regex@1.12.3 X X +regex-automata@0.4.14 X X +regex-syntax@0.8.10 X X +reqsign@0.16.5 X +reqwest@0.12.28 X X +ring@0.17.14 X X +rustc_version@0.4.1 X X +rustix@1.1.4 X 
X X +rustls@0.23.37 X X X +rustls-pki-types@1.14.0 X X +rustls-webpki@0.103.10 X +rustversion@1.0.22 X X +ryu@1.0.23 X X +scopeguard@1.2.0 X X +scratch@1.0.9 X X +semver@1.0.27 X X +serde@1.0.228 X X +serde_core@1.0.228 X X +serde_derive@1.0.228 X X +serde_json@1.0.149 X X +serde_urlencoded@0.7.1 X X +sha1@0.10.6 X X +sha2@0.10.9 X X +shlex@1.3.0 X X +signal-hook-registry@1.4.8 X X +simdutf8@0.1.5 X X +slab@0.4.12 X +smallvec@1.15.1 X X +snafu@0.8.9 X X +snafu-derive@0.8.9 X X +socket2@0.6.3 X X +stable_deref_trait@1.2.1 X X +strsim@0.11.1 X +structmeta@0.3.0 X X +structmeta-derive@0.3.0 X X +strum@0.26.3 X +strum_macros@0.26.4 X +subtle@2.6.1 X +syn@2.0.117 X X +sync_wrapper@1.0.2 X +synstructure@0.13.2 X +tap@1.0.1 X +target-lexicon@0.13.5 X +tempfile@3.27.0 X X +termcolor@1.4.1 X X +thiserror@1.0.69 X X +thiserror-impl@1.0.69 X X +tikv-jemalloc-sys@0.6.1+5.3.0-1-ge13ca993e8ccb9ba9847cc330696e02839f328f7 X X +tikv-jemallocator@0.6.1 X X +tiny-keccak@2.0.2 X +tinystr@0.8.2 X +tokio@1.50.0 X +tokio-macros@2.6.1 X +tokio-rustls@0.26.4 X X +tokio-util@0.7.18 X +tower@0.5.3 X +tower-http@0.6.8 X +tower-layer@0.3.3 X +tower-service@0.3.3 X +tracing@0.1.44 X +tracing-attributes@0.1.31 X +tracing-core@0.1.36 X +try-lock@0.2.5 X +twox-hash@2.1.2 X +typenum@1.19.0 X X +unicode-ident@1.0.24 X X X +unicode-width@0.2.2 X X +unindent@0.2.4 X X +untrusted@0.9.0 X +url@2.5.8 X X +utf8_iter@1.0.4 X X +utf8parse@0.2.2 X X +uuid@1.22.0 X X +value-bag@1.12.0 X X +version_check@0.9.5 X X +want@0.3.1 X +wasi@0.11.1+wasi-snapshot-preview1 X X X +wasip2@1.0.2+wasi-0.2.9 X X X +wasip3@0.4.0+wasi-0.3.0-rc-2026-01-06 X X X +wasm-bindgen@0.2.114 X X +wasm-bindgen-futures@0.4.64 X X +wasm-bindgen-macro@0.2.114 X X +wasm-bindgen-macro-support@0.2.114 X X +wasm-bindgen-shared@0.2.114 X X +wasm-streams@0.4.2 X X +web-sys@0.3.91 X X +webpki-roots@1.0.6 X +winapi-util@0.1.11 X X +windows-core@0.62.2 X X +windows-implement@0.60.2 X X +windows-interface@0.59.3 X X +windows-link@0.2.1 X X 
+windows-result@0.4.1 X X +windows-strings@0.5.1 X X +windows-sys@0.52.0 X X +windows-sys@0.61.2 X X +windows-targets@0.52.6 X X +windows_aarch64_gnullvm@0.52.6 X X +windows_aarch64_msvc@0.52.6 X X +windows_i686_gnu@0.52.6 X X +windows_i686_gnullvm@0.52.6 X X +windows_i686_msvc@0.52.6 X X +windows_x86_64_gnu@0.52.6 X X +windows_x86_64_gnullvm@0.52.6 X X +windows_x86_64_msvc@0.52.6 X X +wit-bindgen@0.51.0 X X X +writeable@0.6.2 X +wyz@0.5.1 X +yoke@0.8.1 X +yoke-derive@0.8.1 X +zerocopy@0.8.47 X X X +zerocopy-derive@0.8.47 X X X +zerofrom@0.1.6 X +zerofrom-derive@0.1.6 X +zeroize@1.8.2 X X +zerotrie@0.2.3 X +zerovec@0.11.5 X +zerovec-derive@0.11.2 X +zmij@1.0.21 X +zstd@0.13.3 X +zstd-safe@7.2.4 X X +zstd-sys@2.0.16+zstd.1.5.7 X X diff --git a/fluss-rust/DEVELOPMENT.md b/fluss-rust/DEVELOPMENT.md new file mode 100644 index 0000000000..a1180d6f6c --- /dev/null +++ b/fluss-rust/DEVELOPMENT.md @@ -0,0 +1,106 @@ +# Development Guide + +Welcome to the development guide of `fluss-rust`! This project builds the `fluss-rust` client and language-specific bindings. + +## Pre-requisites + +- protobuf +- rust + +You can install these using your favourite package / version manager. Example installation using mise: + +```bash +mise install protobuf +mise install rust +``` + +## IDE Setup + +We recommend the [RustRover](https://www.jetbrains.com/rust/) IDE for working with the fluss-rust code base. + +### Importing fluss-rust + +1. On your terminal, clone the fluss-rust project from GitHub + ```bash + git clone https://github.com/apache/fluss-rust.git + ``` +1. Open RustRover, on the `Projects` tab, click `Open` and navigate to the root directory of fluss-rust +1. Click `Open` + +### Copyright Profile + +Fluss and Fluss-rust are Apache projects, and as such every file needs to have an Apache license header. This can be automated in RustRover by adding a Copyright profile: + +1. Go to `Settings` -> `Editor` -> `Copyright` -> `Copyright Profiles`. +1. Add a new profile and name it `Apache`. +1. 
Add the following text as the license text: + ``` + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + ``` +1. Go to `Editor` -> `Copyright` and choose the `Apache` profile as the default profile for this project. +1. Click `Apply` + +We also use line comment formatting for licence headers. +1. Go to `Editor` -> `Copyright` -> `Formatting` -> `Rust` +1. Choose `Use custom formatting` +1. Choose `Use line comment` + +## Project directories + +Source files are organized in the following manner: + +1. `crates/fluss` - fluss rust client crate source +1. `crates/examples` - fluss rust client examples +1. `bindings` - bindings to other languages e.g. C++ under `bindings/cpp` and Python under `bindings/python` + + +## Building & Testing + +See [quickstart](README.md#quick-start) for steps to run example code. + +Running all unit tests for fluss rust client: + +```bash +cargo test --workspace +``` + +Running all integration test cases: + +```bash +cargo test --features integration_tests --workspace +``` + + +### License check (cargo-deny) + +We use [cargo-deny](https://embarkstudios.github.io/cargo-deny/) to ensure all dependency licenses are Apache-compatible. 
When present, configuration lives in a `deny.toml` file at the repo root and should enforce an Apache-compatible license policy. + +```bash +cargo install cargo-deny --locked +cargo deny check licenses +``` + +### Formatting and Clippy + +Our CI runs cargo formatting and clippy to help keep the code base styling tidy and readable. Run the following commands and address any errors or warnings to ensure that your PR can complete CI successfully. + +```bash +cargo fmt --all +cargo clippy --all-targets --fix --allow-dirty --allow-staged +``` + diff --git a/fluss-rust/MODULE.bazel b/fluss-rust/MODULE.bazel new file mode 100644 index 0000000000..f0e6025073 --- /dev/null +++ b/fluss-rust/MODULE.bazel @@ -0,0 +1,65 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Required at repository root for root module mode (`bazel_dep(name = "fluss-cpp", ...)`). +# Consumer examples use `local_path_override(..., path = "/path/to/fluss-rust")`, so +# Bazel resolves the module from the repository root. This also matches the Rust +# workspace layout used by `bindings/cpp` during cargo-based Bazel/CMake builds. +# `0.0.0` is a local-development placeholder in this repository branch. +# Consumers should depend on a published release version. 
+module( + name = "fluss-cpp", + version = "0.0.0", +) + +bazel_dep(name = "rules_cc", version = "0.0.17") +bazel_dep(name = "platforms", version = "0.0.10") +bazel_dep(name = "rules_foreign_cc", version = "0.15.1") +bazel_dep(name = "rules_python", version = "1.2.0") + +python = use_extension("@rules_python//python/extensions:python.bzl", "python") +python.toolchain(python_version = "3.12") +use_repo(python, "python_3_12") + +foreign_cc_tools = use_extension("@rules_foreign_cc//foreign_cc:extensions.bzl", "tools") +use_repo( + foreign_cc_tools, + "cmake_3.31.8_toolchains", + "cmake_src", + "ninja_1.13.0_toolchains", + "ninja_build_src", + "rules_foreign_cc_framework_toolchains", +) + +register_toolchains( + "@rules_foreign_cc_framework_toolchains//:all", + "@cmake_3.31.8_toolchains//:all", + "@ninja_1.13.0_toolchains//:all", + "@python_3_12//:all", + "@rules_foreign_cc//toolchains:all", +) + +cpp_sdk = use_extension("//bindings/cpp/bazel/cpp:deps.bzl", "cpp_sdk") +cpp_sdk.config( + mode = "build", + arrow_cpp_version = "19.0.1", + protobuf_version = "3.25.5", + ep_cmake_ranlib = "/usr/bin/ranlib", + ep_cmake_ar = "/usr/bin/ar", + ep_cmake_nm = "/usr/bin/nm", +) +use_repo(cpp_sdk, "apache_arrow_cpp") diff --git a/fluss-rust/README.md b/fluss-rust/README.md new file mode 100644 index 0000000000..a88ec2f3ae --- /dev/null +++ b/fluss-rust/README.md @@ -0,0 +1,125 @@ +# Apache Fluss™ Rust (Incubating) + +![Experimental](https://img.shields.io/badge/status-experimental-orange) + +Rust implementation of [Apache Fluss™](https://fluss.apache.org/). + + +## Why Fluss? +[Fluss](https://fluss.apache.org/) is a streaming storage built for real-time analytics which can serve as the real-time data layer for Lakehouse architectures. +It bridges the gap between streaming data and the data Lakehouse by enabling low-latency, high-throughput data ingestion and processing while seamlessly integrating with popular compute engines. 
+ +## Why Fluss Rust Client +It's an official Rust client for interacting with Fluss. This client provides foundational capabilities for table management and log streaming operations, enabling developers to explore Fluss within Rust ecosystems. + +## Quick-Start + +### Step1 Start Fluss cluster +#### Requirements +Fluss runs on all UNIX-like environments, e.g. Linux, Mac OS X. Before you start to set up the system, make sure you have the following software installed on your test machine: + +Java 17 or higher (Java 8 and Java 11 are not recommended) +If your cluster does not fulfill these software requirements you will need to install/upgrade it. + +Fluss requires the JAVA_HOME environment variable to be set on all nodes and point to the directory of your Java installation. + +#### Fluss Setup +Go to the [downloads](https://fluss.apache.org/downloads/) page and download the latest Fluss release (currently 0.8.0). Make sure to pick the Fluss package matching your Java version. After downloading the latest release, extract it: +```shell +tar -xzf fluss-0.8.0-incubating-bin.tgz +cd fluss-0.8.0-incubating/ +``` +You can start a local Fluss cluster by running the following command: +```shell +./bin/local-cluster.sh start +``` +After that, the Fluss local cluster is started. + +### Run Provided Example +The example only supports Linux and macOS. You will need to [install Rust](https://www.rust-lang.org/tools/install) first. 
+ +After that, go to the project directory, build the example, and run it: +```shell +cargo build --example example-table --release +cd target/release/examples +./example-table +``` +The example code is as follows: +```rust +#[tokio::main] +pub async fn main() -> Result<()> { + // 1: create the table; + let mut args = Args::default(); + args.bootstrap_servers = "127.0.0.1:9123".to_string(); + let conn_config = ConnectionConfig::from_args(args); + let conn = FlussConnection::new(conn_config).await; + + let admin = conn.get_admin(); + + let table_descriptor = TableDescriptor::builder() + .schema( + Schema::builder() + .column("c1", DataTypes::int()) + .column("c2", DataTypes::string()) + .build(), + ) + .build(); + + let table_path = TablePath::new("fluss".to_owned(), "rust_test".to_owned()); + + admin + .create_table(&table_path, &table_descriptor, true) + .await + .unwrap(); + + // 2: get the table + let table_info = admin.get_table_info(&table_path).await.unwrap(); + print!("Get created table:\n {}\n", table_info); + + // let's sleep 2 seconds to wait leader ready + thread::sleep(Duration::from_secs(2)); + + // 3: append log to the table + let table = conn.get_table(&table_path).await; + let append_writer = table.new_append().create_writer(); + let batch = record_batch!(("c1", Int32, [1, 2, 3, 4, 5, 6]), ("c2", Utf8, ["a1", "a2", "a3", "a4", "a5", "a6"])).unwrap(); + append_writer.append(batch)?; + append_writer.flush().await?; + println!("Start to scan log records......"); + // 4: scan the records + let log_scanner = table.new_scan().create_log_scanner(); + log_scanner.subscribe(0, 0).await; + + loop { + let scan_records = log_scanner.poll(Duration::from_secs(10)).await?; + println!("Start to poll records......"); + for record in scan_records { + let row = record.row(); + println!( + "{{{}, {}}}@{}", + row.get_int(0), + row.get_string(1), + record.offset() + ); + } + } + Ok(()) +} +``` + +You can change it according to your needs. Have fun! 
+ +#### Clear environment +Then, stop your Fluss cluster. Go to your Fluss home, stop it via the following commands: +```shell +./bin/local-cluster.sh stop +``` + +## Documentation + +- [Development Guide](DEVELOPMENT.md) – Build, test, and contribute to fluss-rust. +- [Release Guide](website/docs/release/create-release.md) – How to build, release, and sign official Fluss client packages (Rust, Python, C++). + +## License + +Licensed under the [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0) \ No newline at end of file diff --git a/fluss-rust/bindings/cpp/.bazelrc b/fluss-rust/bindings/cpp/.bazelrc new file mode 100644 index 0000000000..ce7d81f82a --- /dev/null +++ b/fluss-rust/bindings/cpp/.bazelrc @@ -0,0 +1,37 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# Bazel configuration for fluss-rust C++ bindings + +# Enable BzlMod +common --enable_bzlmod + +# Debug configuration (matches BUILD.bazel settings) +build:debug --compilation_mode=dbg +build:debug --copt=-g3 +build:debug --copt=-ggdb +build:debug --copt=-O0 +build:debug --copt=-fno-omit-frame-pointer +build:debug --copt=-DDEBUG +build:debug --strip=never +build:debug --linkopt=-g + +# Release configuration +build:release --compilation_mode=opt +build:release --copt=-O2 +build:release --copt=-DNDEBUG +build:release --strip=always diff --git a/fluss-rust/bindings/cpp/.clang-format b/fluss-rust/bindings/cpp/.clang-format new file mode 100644 index 0000000000..1c31900ec4 --- /dev/null +++ b/fluss-rust/bindings/cpp/.clang-format @@ -0,0 +1,21 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +--- +BasedOnStyle: Google +ColumnLimit: 100 +IndentWidth: 4 diff --git a/fluss-rust/bindings/cpp/.gitignore b/fluss-rust/bindings/cpp/.gitignore new file mode 100644 index 0000000000..1f1632b95c --- /dev/null +++ b/fluss-rust/bindings/cpp/.gitignore @@ -0,0 +1,27 @@ +build/ +cmake-build-*/ +CMakeFiles/ +.idea/ +*.o +*.a +*.so +*.dylib + +# Bazel build outputs +bazel-build/ +bazel-bin +bazel-out +bazel-testlogs +bazel-cpp +bazel-* +MODULE.bazel.lock + +# Keep versioned Bazel consumer examples (name starts with bazel-). +!examples/bazel-consumer/ +!examples/bazel-consumer/** +# `build/` is ignored globally above; keep this fixture path visible. +!examples/bazel-consumer/build/ +!examples/bazel-consumer/build/** +examples/bazel-consumer/**/MODULE.bazel.lock +examples/bazel-consumer/**/bazel-* +examples/bazel-consumer/**/tmp.log diff --git a/fluss-rust/bindings/cpp/BUILD.bazel b/fluss-rust/bindings/cpp/BUILD.bazel new file mode 100644 index 0000000000..d247baf18c --- /dev/null +++ b/fluss-rust/bindings/cpp/BUILD.bazel @@ -0,0 +1,436 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +licenses(["notice"]) + +load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_import", "cc_library") + +config_setting( + name = "debug_mode", + values = {"compilation_mode": "dbg"}, +) + +config_setting( + name = "fastbuild_mode", + values = {"compilation_mode": "fastbuild"}, +) + +config_setting( + name = "release_mode", + values = {"compilation_mode": "opt"}, +) + +_PROTOC_SETUP_SNIPPET = """ + set -e + if [ -n "$${CARGO:-}" ]; then + if [ ! -x "$$CARGO" ]; then + echo "Error: CARGO is set but not executable: $$CARGO" >&2 + exit 1 + fi + CARGO_BIN="$$CARGO" + else + CARGO_BIN=$$(command -v cargo || true) + if [ -z "$$CARGO_BIN" ]; then + echo "Error: cargo not found in PATH and CARGO is not set" >&2 + exit 1 + fi + fi + if [ -n "$${PROTOC:-}" ]; then + if [ ! -x "$$PROTOC" ]; then + echo "Error: PROTOC is set but not executable: $$PROTOC" >&2 + exit 1 + fi + export PROTOC + else + PROTOC_BIN=$$(command -v protoc || true) + if [ -z "$$PROTOC_BIN" ]; then + echo "Error: protoc not found in PATH and PROTOC is not set" >&2 + exit 1 + fi + export PROTOC="$$PROTOC_BIN" + fi +""" + +genrule( + name = "cargo_build_debug", + srcs = glob([ + "src/**/*.rs", + "Cargo.toml", + ]), + outs = [ + "rust_lib_debug.a", + "rust_bridge_cc_debug.cc", + "rust_bridge_h_debug.h", + "src/lib.rs_debug.h", + "cxxbridge/rust/cxx_debug.h", + ], + cmd = _PROTOC_SETUP_SNIPPET + """ + EXECROOT=$$(pwd) + OUTPUT_LIB=$(location rust_lib_debug.a) + OUTPUT_CC=$(location rust_bridge_cc_debug.cc) + OUTPUT_H=$(location rust_bridge_h_debug.h) + OUTPUT_SRC_H=$(location src/lib.rs_debug.h) + OUTPUT_CXX_H=$(location cxxbridge/rust/cxx_debug.h) + # Resolve real source path from sandbox symlink + SANDBOX_CARGO=$(location Cargo.toml) + REAL_CARGO=$$(readlink -f $$SANDBOX_CARGO 2>/dev/null || python3 -c "import os; print(os.path.realpath('$$SANDBOX_CARGO'))") + CARGO_DIR=$$(dirname $$REAL_CARGO) + # Find Cargo workspace root (fluss-rust directory, 2 levels up from bindings/cpp) + WORKSPACE_ROOT=$$(cd 
$$CARGO_DIR/../.. && pwd) + if [ ! -f $$WORKSPACE_ROOT/Cargo.toml ]; then + echo "Error: Cannot find workspace root Cargo.toml at $$WORKSPACE_ROOT" >&2 + exit 1 + fi + cd $$WORKSPACE_ROOT + "$$CARGO_BIN" build --manifest-path $$CARGO_DIR/Cargo.toml + CARGO_TARGET_DIR=$$WORKSPACE_ROOT/target + # cxxbridge uses the Cargo package name (with hyphen): fluss-cpp + RUST_BRIDGE_DIR=$$CARGO_TARGET_DIR/cxxbridge/fluss-cpp/src + # Cargo converts hyphens to underscores in library file names: libfluss_cpp.a + RUST_LIB=$$CARGO_TARGET_DIR/debug/libfluss_cpp.a + if [ ! -f $$RUST_LIB ]; then + echo "Error: Rust library not found at $$RUST_LIB" >&2 + exit 1 + fi + if [ ! -f $$RUST_BRIDGE_DIR/lib.rs.cc ]; then + echo "Error: cxxbridge CC file not found at $$RUST_BRIDGE_DIR/lib.rs.cc" >&2 + exit 1 + fi + if [ ! -f $$RUST_BRIDGE_DIR/lib.rs.h ]; then + echo "Error: cxxbridge header file not found at $$RUST_BRIDGE_DIR/lib.rs.h" >&2 + exit 1 + fi + cd $$EXECROOT + mkdir -p $$(dirname $$OUTPUT_SRC_H) $$(dirname $$OUTPUT_CXX_H) + cp $$RUST_LIB $$OUTPUT_LIB || (echo "Failed to copy $$RUST_LIB to $$OUTPUT_LIB" >&2; exit 1) + cp $$RUST_BRIDGE_DIR/lib.rs.cc $$OUTPUT_CC || (echo "Failed to copy $$RUST_BRIDGE_DIR/lib.rs.cc to $$OUTPUT_CC" >&2; exit 1) + cp $$RUST_BRIDGE_DIR/lib.rs.h $$OUTPUT_H || (echo "Failed to copy $$RUST_BRIDGE_DIR/lib.rs.h to $$OUTPUT_H" >&2; exit 1) + cp $$RUST_BRIDGE_DIR/lib.rs.h $$OUTPUT_SRC_H || (echo "Failed to copy $$RUST_BRIDGE_DIR/lib.rs.h to $$OUTPUT_SRC_H" >&2; exit 1) + CXX_H_SOURCE=$$CARGO_TARGET_DIR/cxxbridge/rust/cxx.h + if [ ! -f $$CXX_H_SOURCE ] && [ ! 
-L $$CXX_H_SOURCE ]; then + echo "Error: cxx.h not found at $$CXX_H_SOURCE" >&2 + exit 1 + fi + cp -L $$CXX_H_SOURCE $$OUTPUT_CXX_H || (echo "Failed to copy $$CXX_H_SOURCE to $$OUTPUT_CXX_H" >&2; exit 1) + """, + message = "Building Rust library (debug) with cargo...", + local = 1, +) + +genrule( + name = "cargo_build_release", + srcs = glob([ + "src/**/*.rs", + "Cargo.toml", + ]), + outs = [ + "rust_lib_release.a", + "rust_bridge_cc_release.cc", + "rust_bridge_h_release.h", + "src/lib.rs_release.h", + "cxxbridge/rust/cxx_release.h", + ], + cmd = _PROTOC_SETUP_SNIPPET + """ + EXECROOT=$$(pwd) + OUTPUT_LIB=$(location rust_lib_release.a) + OUTPUT_CC=$(location rust_bridge_cc_release.cc) + OUTPUT_H=$(location rust_bridge_h_release.h) + OUTPUT_SRC_H=$(location src/lib.rs_release.h) + OUTPUT_CXX_H=$(location cxxbridge/rust/cxx_release.h) + # Resolve real source path from sandbox symlink + SANDBOX_CARGO=$(location Cargo.toml) + REAL_CARGO=$$(readlink -f $$SANDBOX_CARGO 2>/dev/null || python3 -c "import os; print(os.path.realpath('$$SANDBOX_CARGO'))") + CARGO_DIR=$$(dirname $$REAL_CARGO) + # Find Cargo workspace root (fluss-rust directory, 2 levels up from bindings/cpp) + WORKSPACE_ROOT=$$(cd $$CARGO_DIR/../.. && pwd) + if [ ! -f $$WORKSPACE_ROOT/Cargo.toml ]; then + echo "Error: Cannot find workspace root Cargo.toml at $$WORKSPACE_ROOT" >&2 + exit 1 + fi + cd $$WORKSPACE_ROOT + "$$CARGO_BIN" build --release --manifest-path $$CARGO_DIR/Cargo.toml + CARGO_TARGET_DIR=$$WORKSPACE_ROOT/target + # cxxbridge uses the Cargo package name (with hyphen): fluss-cpp + RUST_BRIDGE_DIR=$$CARGO_TARGET_DIR/cxxbridge/fluss-cpp/src + # Cargo converts hyphens to underscores in library file names: libfluss_cpp.a + RUST_LIB=$$CARGO_TARGET_DIR/release/libfluss_cpp.a + if [ ! -f $$RUST_LIB ]; then + echo "Error: Rust library not found at $$RUST_LIB" >&2 + exit 1 + fi + if [ ! 
-f $$RUST_BRIDGE_DIR/lib.rs.cc ]; then + echo "Error: cxxbridge CC file not found at $$RUST_BRIDGE_DIR/lib.rs.cc" >&2 + exit 1 + fi + if [ ! -f $$RUST_BRIDGE_DIR/lib.rs.h ]; then + echo "Error: cxxbridge header file not found at $$RUST_BRIDGE_DIR/lib.rs.h" >&2 + exit 1 + fi + cd $$EXECROOT + mkdir -p $$(dirname $$OUTPUT_SRC_H) $$(dirname $$OUTPUT_CXX_H) + cp $$RUST_LIB $$OUTPUT_LIB || (echo "Failed to copy $$RUST_LIB to $$OUTPUT_LIB" >&2; exit 1) + cp $$RUST_BRIDGE_DIR/lib.rs.cc $$OUTPUT_CC || (echo "Failed to copy $$RUST_BRIDGE_DIR/lib.rs.cc to $$OUTPUT_CC" >&2; exit 1) + cp $$RUST_BRIDGE_DIR/lib.rs.h $$OUTPUT_H || (echo "Failed to copy $$RUST_BRIDGE_DIR/lib.rs.h to $$OUTPUT_H" >&2; exit 1) + cp $$RUST_BRIDGE_DIR/lib.rs.h $$OUTPUT_SRC_H || (echo "Failed to copy $$RUST_BRIDGE_DIR/lib.rs.h to $$OUTPUT_SRC_H" >&2; exit 1) + CXX_H_SOURCE=$$CARGO_TARGET_DIR/cxxbridge/rust/cxx.h + if [ ! -f $$CXX_H_SOURCE ] && [ ! -L $$CXX_H_SOURCE ]; then + echo "Error: cxx.h not found at $$CXX_H_SOURCE" >&2 + exit 1 + fi + cp -L $$CXX_H_SOURCE $$OUTPUT_CXX_H || (echo "Failed to copy $$CXX_H_SOURCE to $$OUTPUT_CXX_H" >&2; exit 1) + """, + message = "Building Rust library (release) with cargo...", + local = 1, +) + +filegroup( + name = "lib_rs_h_selected", + srcs = select({ + ":debug_mode": [":src/lib.rs_debug.h"], + ":fastbuild_mode": [":src/lib.rs_debug.h"], + ":release_mode": [":src/lib.rs_release.h"], + }), +) + +genrule( + name = "lib_rs_h_unified", + srcs = [":lib_rs_h_selected"], + outs = ["src/lib.rs.h"], + cmd = "cp $(location :lib_rs_h_selected) $(location src/lib.rs.h)", + message = "Unifying lib.rs.h for C++ includes", +) + +filegroup( + name = "rust_bridge_cc_selected", + srcs = select({ + ":debug_mode": [":rust_bridge_cc_debug.cc"], + ":fastbuild_mode": [":rust_bridge_cc_debug.cc"], + ":release_mode": [":rust_bridge_cc_release.cc"], + }), +) + +genrule( + name = "rust_bridge_cc_unified", + srcs = [":rust_bridge_cc_selected"], + outs = ["rust_bridge_cc.cc"], + cmd = "cp 
$(location :rust_bridge_cc_selected) $(location rust_bridge_cc.cc)", + message = "Unifying rust_bridge_cc.cc for C++ compilation", +) + +filegroup( + name = "rust_bridge_h_selected", + srcs = select({ + ":debug_mode": [":rust_bridge_h_debug.h"], + ":fastbuild_mode": [":rust_bridge_h_debug.h"], + ":release_mode": [":rust_bridge_h_release.h"], + }), +) + +genrule( + name = "rust_bridge_h_unified", + srcs = [":rust_bridge_h_selected"], + outs = ["rust_bridge_h.h"], + cmd = "cp $(location :rust_bridge_h_selected) $(location rust_bridge_h.h)", + message = "Unifying rust_bridge_h.h for C++ includes", +) + +filegroup( + name = "cxx_h_selected", + srcs = select({ + ":debug_mode": [":cxxbridge/rust/cxx_debug.h"], + ":fastbuild_mode": [":cxxbridge/rust/cxx_debug.h"], + ":release_mode": [":cxxbridge/rust/cxx_release.h"], + }), +) + +genrule( + name = "cxx_h_unified", + srcs = [":cxx_h_selected"], + outs = ["cxxbridge/rust/cxx.h"], + cmd = "mkdir -p $$(dirname $(location cxxbridge/rust/cxx.h)) && cp $(location :cxx_h_selected) $(location cxxbridge/rust/cxx.h)", + message = "Unifying cxx.h for C++ includes", +) + +cc_import( + name = "rust_lib", + static_library = select({ + ":debug_mode": ":rust_lib_debug.a", + ":fastbuild_mode": ":rust_lib_debug.a", + ":release_mode": ":rust_lib_release.a", + }), + alwayslink = True, +) + +cc_library( + name = "fluss_cpp", + srcs = [ + "src/admin.cpp", + "src/connection.cpp", + "src/table.cpp", + ], + hdrs = [ + "include/fluss.hpp", + ], + textual_hdrs = [ + "src/ffi_converter.hpp", + ":rust_bridge_h_unified", + ":lib_rs_h_unified", + ":cxx_h_unified", + ], + strip_include_prefix = "include", + copts = [ + "-std=c++17", + ] + select({ + ":debug_mode": [ + "-g3", + "-O0", + "-ggdb", + "-fno-omit-frame-pointer", + "-DDEBUG", + ], + ":fastbuild_mode": [ + "-g", + "-O0", + ], + ":release_mode": [ + "-O2", + "-DNDEBUG", + ], + }), + includes = [ + "src", + "cxxbridge", + ], + linkopts = [ + "-ldl", + "-lpthread", + ] + select({ + ":debug_mode": 
["-g"], + ":fastbuild_mode": ["-g"], + ":release_mode": [], + }) + select({ + "@platforms//os:macos": [ + "-framework", "CoreFoundation", + "-framework", "Security", + ], + "//conditions:default": [], + }), + deps = [ + ":rust_lib", + "//bindings/cpp/bazel/cpp:arrow_cpp_dep", + ], + visibility = ["//visibility:public"], +) + +cc_binary( + name = "fluss_cpp_example", + srcs = [ + "examples/example.cpp", + ], + deps = [":fluss_cpp"], + copts = [ + "-std=c++17", + ] + select({ + ":debug_mode": [ + "-g3", + "-O0", + "-ggdb", + "-fno-omit-frame-pointer", + "-DDEBUG", + ], + ":fastbuild_mode": [ + "-g", + "-O0", + ], + ":release_mode": [ + "-O2", + "-DNDEBUG", + ], + }), + linkopts = select({ + ":debug_mode": ["-g"], + ":fastbuild_mode": ["-g"], + ":release_mode": [], + }), + visibility = ["//visibility:public"], +) + +cc_binary( + name = "fluss_cpp_admin_example", + srcs = [ + "examples/admin_example.cpp", + ], + deps = [":fluss_cpp"], + copts = [ + "-std=c++17", + ] + select({ + ":debug_mode": [ + "-g3", + "-O0", + "-ggdb", + "-fno-omit-frame-pointer", + "-DDEBUG", + ], + ":fastbuild_mode": [ + "-g", + "-O0", + ], + ":release_mode": [ + "-O2", + "-DNDEBUG", + ], + }), + linkopts = select({ + ":debug_mode": ["-g"], + ":fastbuild_mode": ["-g"], + ":release_mode": [], + }), + visibility = ["//visibility:public"], +) + +cc_binary( + name = "fluss_cpp_kv_example", + srcs = [ + "examples/kv_example.cpp", + ], + deps = [":fluss_cpp"], + copts = [ + "-std=c++17", + ] + select({ + ":debug_mode": [ + "-g3", + "-O0", + "-ggdb", + "-fno-omit-frame-pointer", + "-DDEBUG", + ], + ":fastbuild_mode": [ + "-g", + "-O0", + ], + ":release_mode": [ + "-O2", + "-DNDEBUG", + ], + }), + linkopts = select({ + ":debug_mode": ["-g"], + ":fastbuild_mode": ["-g"], + ":release_mode": [], + }), + visibility = ["//visibility:public"], +) diff --git a/fluss-rust/bindings/cpp/CMakeLists.txt b/fluss-rust/bindings/cpp/CMakeLists.txt new file mode 100644 index 0000000000..44407ac860 --- /dev/null +++ 
b/fluss-rust/bindings/cpp/CMakeLists.txt @@ -0,0 +1,293 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +cmake_minimum_required(VERSION 3.22) + +if (POLICY CMP0135) + cmake_policy(SET CMP0135 NEW) +endif() + +project(fluss-cpp LANGUAGES CXX) + +include(FetchContent) +set(FLUSS_GOOGLETEST_VERSION 1.15.2 CACHE STRING "version of GoogleTest") +set(FLUSS_NLOHMANN_JSON_VERSION 3.12.0 CACHE STRING "version of nlohmann/json") +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +set(FLUSS_CPP_DEP_MODE "system" CACHE STRING "Dependency provisioning mode for fluss-cpp (system|build)") +set_property(CACHE FLUSS_CPP_DEP_MODE PROPERTY STRINGS system build) +set(FLUSS_CPP_ARROW_VERSION "19.0.1" CACHE STRING "Arrow C++ version baseline for fluss-cpp") +set(FLUSS_CPP_PROTOBUF_VERSION "3.25.5" CACHE STRING "Protobuf/protoc version baseline for fluss-cpp") +set(FLUSS_CPP_ARROW_SYSTEM_ROOT "" CACHE PATH "Optional Arrow installation prefix for system mode") +set(FLUSS_CPP_ARROW_SOURCE_URL + "https://github.com/apache/arrow/archive/refs/tags/apache-arrow-19.0.1.tar.gz" + CACHE STRING + "Arrow source archive URL used in build mode") +set(FLUSS_CPP_ARROW_SOURCE_SHA256 + "4c898504958841cc86b6f8710ecb2919f96b5e10fa8989ac10ac4fca8362d86a" + CACHE STRING + "SHA256 
for the Arrow source archive used in build mode") + +find_package(Threads REQUIRED) + +if (NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Release) +endif() + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +option(FLUSS_ENABLE_ADDRESS_SANITIZER "Enable address sanitizer" OFF) +option(FLUSS_ENABLE_TESTING "Enable building test binary for fluss" OFF) +option(FLUSS_DEV "Enable dev mode" OFF) + +if (FLUSS_DEV) + set(FLUSS_ENABLE_ADDRESS_SANITIZER ON) + set(FLUSS_ENABLE_TESTING ON) +endif() + +if (NOT FLUSS_CPP_DEP_MODE STREQUAL "system" AND NOT FLUSS_CPP_DEP_MODE STREQUAL "build") + message(FATAL_ERROR "Unsupported FLUSS_CPP_DEP_MODE='${FLUSS_CPP_DEP_MODE}'. Expected 'system' or 'build'.") +endif() + +find_program(FLUSS_PROTOC_EXECUTABLE NAMES protoc) +if (NOT FLUSS_PROTOC_EXECUTABLE) + message(FATAL_ERROR "protoc not found. Install protoc or set it in PATH. (Fluss baseline: ${FLUSS_CPP_PROTOBUF_VERSION})") +endif() + +if (DEFINED ENV{CARGO} AND NOT "$ENV{CARGO}" STREQUAL "" AND EXISTS "$ENV{CARGO}") + set(FLUSS_CARGO_EXECUTABLE "$ENV{CARGO}") +else() + if (DEFINED ENV{CARGO} AND NOT "$ENV{CARGO}" STREQUAL "") + get_filename_component(_FLUSS_CARGO_HINT_DIR "$ENV{CARGO}" DIRECTORY) + endif() + find_program(FLUSS_CARGO_EXECUTABLE NAMES cargo HINTS "${_FLUSS_CARGO_HINT_DIR}") +endif() +if (NOT FLUSS_CARGO_EXECUTABLE) + message(FATAL_ERROR "cargo not found. Install Rust toolchain or set CARGO/PATH.") +endif() + +execute_process( + COMMAND ${FLUSS_PROTOC_EXECUTABLE} --version + OUTPUT_VARIABLE FLUSS_PROTOC_VERSION_OUTPUT + OUTPUT_STRIP_TRAILING_WHITESPACE + ERROR_QUIET +) +string(REGEX MATCH "([0-9]+\\.[0-9]+\\.[0-9]+)" FLUSS_PROTOC_VERSION "${FLUSS_PROTOC_VERSION_OUTPUT}") +set(FLUSS_PROTOC_VERSION_NORM "${FLUSS_PROTOC_VERSION}") +set(FLUSS_CPP_PROTOBUF_VERSION_NORM "${FLUSS_CPP_PROTOBUF_VERSION}") +string(REGEX REPLACE "^3\\." "" FLUSS_PROTOC_VERSION_NORM "${FLUSS_PROTOC_VERSION_NORM}") +string(REGEX REPLACE "^3\\." 
"" FLUSS_CPP_PROTOBUF_VERSION_NORM "${FLUSS_CPP_PROTOBUF_VERSION_NORM}") +if (FLUSS_PROTOC_VERSION AND + NOT FLUSS_PROTOC_VERSION VERSION_EQUAL FLUSS_CPP_PROTOBUF_VERSION AND + NOT FLUSS_PROTOC_VERSION_NORM VERSION_EQUAL FLUSS_CPP_PROTOBUF_VERSION_NORM) + message(WARNING + "protoc version (${FLUSS_PROTOC_VERSION}) does not match Fluss baseline " + "(${FLUSS_CPP_PROTOBUF_VERSION}). Build may still work, but this is outside the tested baseline.") +endif() + +message(STATUS "Fluss C++ dependency mode: ${FLUSS_CPP_DEP_MODE}") +message(STATUS "Fluss C++ protoc executable: ${FLUSS_PROTOC_EXECUTABLE} (${FLUSS_PROTOC_VERSION_OUTPUT})") +message(STATUS "Fluss C++ cargo executable: ${FLUSS_CARGO_EXECUTABLE}") + +if (FLUSS_CPP_DEP_MODE STREQUAL "system") + if (FLUSS_CPP_ARROW_SYSTEM_ROOT) + list(APPEND CMAKE_PREFIX_PATH "${FLUSS_CPP_ARROW_SYSTEM_ROOT}") + set(Arrow_ROOT "${FLUSS_CPP_ARROW_SYSTEM_ROOT}") + endif() + + find_package(Arrow REQUIRED) + + if (DEFINED Arrow_VERSION AND Arrow_VERSION AND NOT Arrow_VERSION VERSION_EQUAL FLUSS_CPP_ARROW_VERSION) + message(WARNING + "Arrow version (${Arrow_VERSION}) does not match Fluss baseline " + "(${FLUSS_CPP_ARROW_VERSION}). Build may still work, but this is outside the tested baseline.") + endif() +else() + # Build mode: provision Arrow C++ from source in-tree. 
+ set(ARROW_BUILD_SHARED ON CACHE BOOL "" FORCE) + set(ARROW_BUILD_STATIC OFF CACHE BOOL "" FORCE) + set(ARROW_BUILD_TESTS OFF CACHE BOOL "" FORCE) + set(ARROW_BUILD_EXAMPLES OFF CACHE BOOL "" FORCE) + set(ARROW_BUILD_BENCHMARKS OFF CACHE BOOL "" FORCE) + set(ARROW_BUILD_INTEGRATION OFF CACHE BOOL "" FORCE) + set(ARROW_BUILD_UTILITIES OFF CACHE BOOL "" FORCE) + set(ARROW_COMPUTE OFF CACHE BOOL "" FORCE) + set(ARROW_CSV OFF CACHE BOOL "" FORCE) + set(ARROW_DATASET OFF CACHE BOOL "" FORCE) + set(ARROW_FILESYSTEM OFF CACHE BOOL "" FORCE) + set(ARROW_JSON OFF CACHE BOOL "" FORCE) + set(ARROW_PARQUET OFF CACHE BOOL "" FORCE) + set(ARROW_IPC ON CACHE BOOL "" FORCE) + # Reduce third-party sub-build complexity in build mode. + set(ARROW_JEMALLOC OFF CACHE BOOL "" FORCE) + set(ARROW_MIMALLOC OFF CACHE BOOL "" FORCE) + set(ARROW_DEPENDENCY_SOURCE BUNDLED CACHE STRING "" FORCE) + set(ARROW_SIMD_LEVEL NONE CACHE STRING "" FORCE) + set(ARROW_RUNTIME_SIMD_LEVEL NONE CACHE STRING "" FORCE) + + FetchContent_Declare( + apache_arrow_src + URL ${FLUSS_CPP_ARROW_SOURCE_URL} + URL_HASH SHA256=${FLUSS_CPP_ARROW_SOURCE_SHA256} + SOURCE_SUBDIR cpp + ) + FetchContent_MakeAvailable(apache_arrow_src) + set(FLUSS_CPP_ARROW_EXTRA_INCLUDE_DIRS + "${apache_arrow_src_SOURCE_DIR}/cpp/src" + "${apache_arrow_src_BINARY_DIR}/src") + + if (TARGET arrow_shared AND NOT TARGET Arrow::arrow_shared) + add_library(Arrow::arrow_shared ALIAS arrow_shared) + endif() + if (NOT TARGET Arrow::arrow_shared) + message(FATAL_ERROR "Arrow build mode did not produce target Arrow::arrow_shared (or arrow_shared).") + endif() +endif() + +# Get cargo target dir +execute_process(COMMAND ${FLUSS_CARGO_EXECUTABLE} locate-project --workspace --message-format plain + OUTPUT_VARIABLE CARGO_MANIFEST_PATH + OUTPUT_STRIP_TRAILING_WHITESPACE + WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}) +if (NOT CARGO_MANIFEST_PATH) + message(FATAL_ERROR + "Failed to resolve Cargo workspace target dir via '${FLUSS_CARGO_EXECUTABLE} locate-project'. 
" + "Check Rust toolchain installation and PATH/CARGO.") +endif() +get_filename_component(CARGO_WORKSPACE_DIR "${CARGO_MANIFEST_PATH}" DIRECTORY) +set(CARGO_TARGET_DIR "${CARGO_WORKSPACE_DIR}/target") + +set(CARGO_MANIFEST ${PROJECT_SOURCE_DIR}/Cargo.toml) +set(RUST_SOURCE_FILE ${PROJECT_SOURCE_DIR}/src/lib.rs) +set(RUST_BRIDGE_CPP ${CARGO_TARGET_DIR}/cxxbridge/fluss-cpp/src/lib.rs.cc) +set(RUST_HEADER_FILE ${CARGO_TARGET_DIR}/cxxbridge/fluss-cpp/src/lib.rs.h) + +if (CMAKE_BUILD_TYPE STREQUAL "Debug") + set(RUST_LIB ${CARGO_TARGET_DIR}/debug/${CMAKE_STATIC_LIBRARY_PREFIX}fluss_cpp${CMAKE_STATIC_LIBRARY_SUFFIX}) +else() + set(RUST_LIB ${CARGO_TARGET_DIR}/release/${CMAKE_STATIC_LIBRARY_PREFIX}fluss_cpp${CMAKE_STATIC_LIBRARY_SUFFIX}) +endif() + +set(CPP_INCLUDE_DIR ${PROJECT_SOURCE_DIR}/include + ${PROJECT_SOURCE_DIR}/src + ${CARGO_TARGET_DIR}/cxxbridge + ${CARGO_TARGET_DIR}/cxxbridge/fluss-cpp/src) + +file(GLOB CPP_SOURCE_FILE "src/*.cpp") +file(GLOB CPP_HEADER_FILE "include/*.hpp") + +if (NOT CMAKE_BUILD_TYPE STREQUAL "Debug") + list(APPEND CARGO_BUILD_FLAGS "--release") +endif() + +add_custom_target(cargo_build + COMMAND ${CMAKE_COMMAND} -E env PROTOC=${FLUSS_PROTOC_EXECUTABLE} ${FLUSS_CARGO_EXECUTABLE} build --manifest-path ${CARGO_MANIFEST} ${CARGO_BUILD_FLAGS} + BYPRODUCTS ${RUST_BRIDGE_CPP} ${RUST_LIB} ${RUST_HEADER_FILE} + DEPENDS ${RUST_SOURCE_FILE} + USES_TERMINAL + COMMENT "Running cargo..." 
+) + +add_library(fluss_cpp STATIC ${CPP_SOURCE_FILE} ${RUST_BRIDGE_CPP}) +target_sources(fluss_cpp PUBLIC ${CPP_HEADER_FILE}) +target_sources(fluss_cpp PRIVATE ${RUST_HEADER_FILE}) +target_include_directories(fluss_cpp PUBLIC ${CPP_INCLUDE_DIR}) +if (FLUSS_CPP_ARROW_EXTRA_INCLUDE_DIRS) + target_include_directories(fluss_cpp PUBLIC ${FLUSS_CPP_ARROW_EXTRA_INCLUDE_DIRS}) +endif() +target_link_libraries(fluss_cpp PUBLIC ${RUST_LIB}) +target_link_libraries(fluss_cpp PRIVATE ${CMAKE_DL_LIBS} Threads::Threads) +target_link_libraries(fluss_cpp PUBLIC Arrow::arrow_shared) +target_compile_definitions(fluss_cpp PRIVATE ARROW_FOUND) +if(APPLE) + target_link_libraries(fluss_cpp PUBLIC "-framework CoreFoundation" "-framework Security") +endif() + +add_executable(fluss_cpp_example examples/example.cpp) +target_link_libraries(fluss_cpp_example PRIVATE fluss_cpp) +target_link_libraries(fluss_cpp_example PRIVATE Arrow::arrow_shared) +target_compile_definitions(fluss_cpp_example PRIVATE ARROW_FOUND) +target_include_directories(fluss_cpp_example PUBLIC ${CPP_INCLUDE_DIR}) + +add_executable(fluss_cpp_admin_example examples/admin_example.cpp) +target_link_libraries(fluss_cpp_admin_example PRIVATE fluss_cpp) +target_link_libraries(fluss_cpp_admin_example PRIVATE Arrow::arrow_shared) +target_compile_definitions(fluss_cpp_admin_example PRIVATE ARROW_FOUND) +target_include_directories(fluss_cpp_admin_example PUBLIC ${CPP_INCLUDE_DIR}) + +add_executable(fluss_cpp_kv_example examples/kv_example.cpp) +target_link_libraries(fluss_cpp_kv_example PRIVATE fluss_cpp) +target_link_libraries(fluss_cpp_kv_example PRIVATE Arrow::arrow_shared) +target_compile_definitions(fluss_cpp_kv_example PRIVATE ARROW_FOUND) +target_include_directories(fluss_cpp_kv_example PUBLIC ${CPP_INCLUDE_DIR}) + +if (CARGO_TARGET_DIR) + set_target_properties(fluss_cpp + PROPERTIES ADDITIONAL_CLEAN_FILES "${CARGO_TARGET_DIR}" + ) +endif() +add_dependencies(fluss_cpp cargo_build) + +if (FLUSS_ENABLE_ADDRESS_SANITIZER) + 
target_compile_options(fluss_cpp PRIVATE -fsanitize=leak,address,undefined -fno-omit-frame-pointer -fno-common -O1) + target_link_options(fluss_cpp PRIVATE -fsanitize=leak,address,undefined) +endif() + +if (FLUSS_ENABLE_TESTING) + FetchContent_Declare( + googletest + URL https://github.com/google/googletest/archive/refs/tags/v${FLUSS_GOOGLETEST_VERSION}.tar.gz + ) + set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) + + FetchContent_MakeAvailable(googletest) + + if (NOT TARGET nlohmann_json::nlohmann_json) + set(JSON_BuildTests OFF CACHE INTERNAL "") + FetchContent_Declare( + nlohmann_json + URL https://github.com/nlohmann/json/archive/refs/tags/v${FLUSS_NLOHMANN_JSON_VERSION}.tar.gz + URL_HASH SHA256=4b92eb0c06d10683f7447ce9406cb97cd4b453be18d7279320f7b2f025c10187 + ) + FetchContent_MakeAvailable(nlohmann_json) + endif() + + enable_testing() + include(GoogleTest) + + file(GLOB TEST_SOURCE_FILES "test/*.cpp") + add_executable(fluss_cpp_test ${TEST_SOURCE_FILES}) + target_link_libraries(fluss_cpp_test PRIVATE fluss_cpp GTest::gtest nlohmann_json::nlohmann_json) + target_link_libraries(fluss_cpp_test PRIVATE Arrow::arrow_shared) + target_compile_definitions(fluss_cpp_test PRIVATE ARROW_FOUND) + target_include_directories(fluss_cpp_test PRIVATE + ${CPP_INCLUDE_DIR} + ${PROJECT_SOURCE_DIR}/test + ) + + gtest_discover_tests(fluss_cpp_test + PROPERTIES + TIMEOUT 120 + FIXTURES_REQUIRED fluss_cluster + ) + + add_test(NAME fluss_cluster_cleanup COMMAND fluss_cpp_test --cleanup) + set_tests_properties(fluss_cluster_cleanup PROPERTIES + FIXTURES_CLEANUP fluss_cluster + ) +endif() diff --git a/fluss-rust/bindings/cpp/Cargo.toml b/fluss-rust/bindings/cpp/Cargo.toml new file mode 100644 index 0000000000..26816522fe --- /dev/null +++ b/fluss-rust/bindings/cpp/Cargo.toml @@ -0,0 +1,38 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +[package] +name = "fluss-cpp" +version.workspace = true +edition.workspace = true +license.workspace = true +rust-version.workspace = true +publish = false + +[lib] +crate-type = ["staticlib"] + +[dependencies] +anyhow = "1.0" +arrow = { workspace = true, features = ["ffi"] } +bigdecimal = { workspace = true } +cxx = "1.0" +fluss = { workspace = true, features = ["storage-all"] } +tokio = { workspace = true, features = ["rt-multi-thread", "macros"] } + +[build-dependencies] +cxx-build = "1.0" diff --git a/fluss-rust/bindings/cpp/DEPENDENCIES.rust.tsv b/fluss-rust/bindings/cpp/DEPENDENCIES.rust.tsv new file mode 100644 index 0000000000..89dbf76539 --- /dev/null +++ b/fluss-rust/bindings/cpp/DEPENDENCIES.rust.tsv @@ -0,0 +1,309 @@ +crate Apache-2.0 Apache-2.0 WITH LLVM-exception BSD-2-Clause BSD-3-Clause BSL-1.0 CC0-1.0 CDLA-Permissive-2.0 ISC LGPL-2.1-or-later MIT Unicode-3.0 Unlicense Zlib +ahash@0.8.12 X X +aho-corasick@1.1.4 X X +android_system_properties@0.1.5 X X +anstream@1.0.0 X X +anstyle@1.0.14 X X +anstyle-parse@1.0.0 X X +anstyle-query@1.1.5 X X +anstyle-wincon@3.0.11 X X +anyhow@1.0.102 X X +arrow@57.3.0 X +arrow-arith@57.3.0 X +arrow-array@57.3.0 X +arrow-buffer@57.3.0 X +arrow-cast@57.3.0 X +arrow-csv@57.3.0 X +arrow-data@57.3.0 X +arrow-ipc@57.3.0 X +arrow-json@57.3.0 X 
+arrow-ord@57.3.0 X +arrow-row@57.3.0 X +arrow-schema@57.3.0 X +arrow-select@57.3.0 X +arrow-string@57.3.0 X +async-trait@0.1.89 X X +atoi@2.0.0 X +atomic-waker@1.1.2 X X +autocfg@1.5.0 X X +backon@1.6.0 X +base64@0.22.1 X X +bigdecimal@0.4.10 X X +bitflags@2.11.0 X X +bitvec@1.0.1 X +block-buffer@0.10.4 X X +bumpalo@3.20.2 X X +byteorder@1.5.0 X X +bytes@1.11.1 X +cc@1.2.57 X X +cfg-if@1.0.4 X X +chrono@0.4.44 X X +clap@4.6.0 X X +clap_builder@4.6.0 X X +clap_derive@4.6.0 X X +clap_lex@1.1.0 X X +codespan-reporting@0.13.1 X +colorchoice@1.0.5 X X +const-oid@0.9.6 X X +const-random@0.1.18 X X +const-random-macro@0.1.16 X X +core-foundation-sys@0.8.7 X X +cpufeatures@0.2.17 X X +crc32c@0.6.8 X X +crossbeam-utils@0.8.21 X X +crunchy@0.2.4 X +crypto-common@0.1.7 X X +csv@1.4.0 X X +csv-core@0.1.13 X X +cxx@1.0.194 X X +cxx-build@1.0.194 X X +cxxbridge-flags@1.0.194 X X +cxxbridge-macro@1.0.194 X X +dashmap@6.1.0 X +delegate@0.13.5 X X +digest@0.10.7 X X +displaydoc@0.2.5 X X +either@1.15.0 X X +equivalent@1.0.2 X X +errno@0.3.14 X X +fastrand@2.3.0 X X +find-msvc-tools@0.1.9 X X +fixedbitset@0.5.7 X X +flatbuffers@25.12.19 X +fluss-cpp@0.1.0 X +fluss-rs@0.1.0 X +fnv@1.0.7 X X +foldhash@0.1.5 X +foldhash@0.2.0 X +form_urlencoded@1.2.2 X X +funty@2.0.0 X +futures@0.3.32 X X +futures-channel@0.3.32 X X +futures-core@0.3.32 X X +futures-executor@0.3.32 X X +futures-io@0.3.32 X X +futures-macro@0.3.32 X X +futures-sink@0.3.32 X X +futures-task@0.3.32 X X +futures-util@0.3.32 X X +generic-array@0.14.7 X +getrandom@0.2.17 X X +getrandom@0.3.4 X X +getrandom@0.4.2 X X +gloo-timers@0.3.0 X X +h2@0.4.13 X +half@2.7.1 X X +hashbrown@0.14.5 X X +hashbrown@0.15.5 X X +hashbrown@0.16.1 X X +heck@0.5.0 X X +hex@0.4.3 X X +hmac@0.12.1 X X +home@0.5.12 X X +http@1.4.0 X X +http-body@1.0.1 X +http-body-util@0.1.3 X +httparse@1.10.1 X X +httpdate@1.0.3 X X +hyper@1.8.1 X +hyper-rustls@0.27.7 X X X +hyper-util@0.1.20 X +iana-time-zone@0.1.65 X X +iana-time-zone-haiku@0.1.2 X X 
+icu_collections@2.1.1 X +icu_locale_core@2.1.1 X +icu_normalizer@2.1.1 X +icu_normalizer_data@2.1.1 X +icu_properties@2.1.2 X +icu_properties_data@2.1.2 X +icu_provider@2.1.1 X +idna@1.1.0 X X +idna_adapter@1.2.1 X X +indexmap@2.13.0 X X +ipnet@2.12.0 X X +iri-string@0.7.11 X X +is_terminal_polyfill@1.70.2 X X +itertools@0.14.0 X X +itoa@1.0.18 X X +jiff@0.2.23 X X +jiff-tzdb@0.1.6 X X +jiff-tzdb-platform@0.1.3 X X +jobserver@0.1.34 X X +js-sys@0.3.91 X X +lexical-core@1.0.6 X X +lexical-parse-float@1.0.6 X X +lexical-parse-integer@1.0.6 X X +lexical-util@1.0.7 X X +lexical-write-float@1.0.6 X X +lexical-write-integer@1.0.6 X X +libc@0.2.183 X X +libm@0.2.16 X +link-cplusplus@1.0.12 X X +linked-hash-map@0.5.6 X X +linux-raw-sys@0.12.1 X X X +litemap@0.8.1 X +lock_api@0.4.14 X X +log@0.4.29 X X +lz4_flex@0.12.1 X +md-5@0.10.6 X X +memchr@2.8.0 X X +mio@1.1.1 X +multimap@0.10.1 X X +num-bigint@0.4.6 X X +num-complex@0.4.6 X X +num-integer@0.1.46 X X +num-traits@0.2.19 X X +once_cell@1.21.4 X X +once_cell_polyfill@1.70.2 X X +opendal@0.55.0 X +ordered-float@5.1.0 X +parking_lot@0.12.5 X X +parking_lot_core@0.9.12 X X +parse-display@0.10.0 X X +parse-display-derive@0.10.0 X X +percent-encoding@2.3.2 X X +petgraph@0.8.3 X X +pin-project-lite@0.2.17 X X +pin-utils@0.1.0 X X +pkg-config@0.3.32 X X +portable-atomic@1.13.1 X X +portable-atomic-util@0.2.6 X X +potential_utf@0.1.4 X +ppv-lite86@0.2.21 X X +prettyplease@0.2.37 X X +proc-macro2@1.0.106 X X +prost@0.14.3 X +prost-build@0.14.3 X +prost-derive@0.14.3 X +prost-types@0.14.3 X +quick-xml@0.37.5 X +quick-xml@0.38.4 X +quote@1.0.45 X X +r-efi@5.3.0 X X X +r-efi@6.0.0 X X X +radium@0.7.0 X +rand@0.8.5 X X +rand@0.9.2 X X +rand_chacha@0.3.1 X X +rand_chacha@0.9.0 X X +rand_core@0.6.4 X X +rand_core@0.9.5 X X +redox_syscall@0.5.18 X +regex@1.12.3 X X +regex-automata@0.4.14 X X +regex-syntax@0.8.10 X X +reqsign@0.16.5 X +reqwest@0.12.28 X X +ring@0.17.14 X X +rustc_version@0.4.1 X X +rustix@1.1.4 X X X +rustls@0.23.37 X X 
X +rustls-pki-types@1.14.0 X X +rustls-webpki@0.103.10 X +rustversion@1.0.22 X X +ryu@1.0.23 X X +scopeguard@1.2.0 X X +scratch@1.0.9 X X +semver@1.0.27 X X +serde@1.0.228 X X +serde_core@1.0.228 X X +serde_derive@1.0.228 X X +serde_json@1.0.149 X X +serde_urlencoded@0.7.1 X X +sha1@0.10.6 X X +sha2@0.10.9 X X +shlex@1.3.0 X X +signal-hook-registry@1.4.8 X X +simdutf8@0.1.5 X X +slab@0.4.12 X +smallvec@1.15.1 X X +snafu@0.8.9 X X +snafu-derive@0.8.9 X X +socket2@0.6.3 X X +stable_deref_trait@1.2.1 X X +strsim@0.11.1 X +structmeta@0.3.0 X X +structmeta-derive@0.3.0 X X +strum@0.26.3 X +strum_macros@0.26.4 X +subtle@2.6.1 X +syn@2.0.117 X X +sync_wrapper@1.0.2 X +synstructure@0.13.2 X +tap@1.0.1 X +tempfile@3.27.0 X X +termcolor@1.4.1 X X +thiserror@1.0.69 X X +thiserror-impl@1.0.69 X X +tiny-keccak@2.0.2 X +tinystr@0.8.2 X +tokio@1.50.0 X +tokio-macros@2.6.1 X +tokio-rustls@0.26.4 X X +tokio-util@0.7.18 X +tower@0.5.3 X +tower-http@0.6.8 X +tower-layer@0.3.3 X +tower-service@0.3.3 X +tracing@0.1.44 X +tracing-attributes@0.1.31 X +tracing-core@0.1.36 X +try-lock@0.2.5 X +twox-hash@2.1.2 X +typenum@1.19.0 X X +unicode-ident@1.0.24 X X X +unicode-width@0.2.2 X X +untrusted@0.9.0 X +url@2.5.8 X X +utf8_iter@1.0.4 X X +utf8parse@0.2.2 X X +uuid@1.22.0 X X +value-bag@1.12.0 X X +version_check@0.9.5 X X +want@0.3.1 X +wasi@0.11.1+wasi-snapshot-preview1 X X X +wasip2@1.0.2+wasi-0.2.9 X X X +wasip3@0.4.0+wasi-0.3.0-rc-2026-01-06 X X X +wasm-bindgen@0.2.114 X X +wasm-bindgen-futures@0.4.64 X X +wasm-bindgen-macro@0.2.114 X X +wasm-bindgen-macro-support@0.2.114 X X +wasm-bindgen-shared@0.2.114 X X +wasm-streams@0.4.2 X X +web-sys@0.3.91 X X +webpki-roots@1.0.6 X +winapi-util@0.1.11 X X +windows-core@0.62.2 X X +windows-implement@0.60.2 X X +windows-interface@0.59.3 X X +windows-link@0.2.1 X X +windows-result@0.4.1 X X +windows-strings@0.5.1 X X +windows-sys@0.52.0 X X +windows-sys@0.61.2 X X +windows-targets@0.52.6 X X +windows_aarch64_gnullvm@0.52.6 X X 
+windows_aarch64_msvc@0.52.6 X X +windows_i686_gnu@0.52.6 X X +windows_i686_gnullvm@0.52.6 X X +windows_i686_msvc@0.52.6 X X +windows_x86_64_gnu@0.52.6 X X +windows_x86_64_gnullvm@0.52.6 X X +windows_x86_64_msvc@0.52.6 X X +wit-bindgen@0.51.0 X X X +writeable@0.6.2 X +wyz@0.5.1 X +yoke@0.8.1 X +yoke-derive@0.8.1 X +zerocopy@0.8.47 X X X +zerocopy-derive@0.8.47 X X X +zerofrom@0.1.6 X +zerofrom-derive@0.1.6 X +zeroize@1.8.2 X X +zerotrie@0.2.3 X +zerovec@0.11.5 X +zerovec-derive@0.11.2 X +zmij@1.0.21 X +zstd@0.13.3 X +zstd-safe@7.2.4 X X +zstd-sys@2.0.16+zstd.1.5.7 X X diff --git a/fluss-rust/bindings/cpp/README.md b/fluss-rust/bindings/cpp/README.md new file mode 100644 index 0000000000..1a8d9f2f64 --- /dev/null +++ b/fluss-rust/bindings/cpp/README.md @@ -0,0 +1,42 @@ +# Apache Fluss™ C++ Bindings (Incubating) + +C++ bindings for Fluss, built on top of the [fluss-rust](../../crates/fluss) client. The API is exposed via a C++ header ([include/fluss.hpp](include/fluss.hpp)) and implemented with Rust FFI. + +## Requirements + +- Rust (see [rust-toolchain.toml](../../rust-toolchain.toml) at repo root) +- C++17-capable compiler +- CMake 3.18+ and/or Bazel +- Apache Arrow (for Arrow-based APIs) + +## Build + +From the repository root or from `bindings/cpp`: + +**With CMake:** + +```bash +cd bindings/cpp +mkdir build && cd build +cmake .. +cmake --build . +``` + +By default, CMake now uses `Release` when `CMAKE_BUILD_TYPE` is not specified. + +**With Bazel:** + +```bash +cd bindings/cpp +bazel build //... +``` +`ci.sh` defaults to optimized builds via `-c opt` (override with `BAZEL_BUILD_FLAGS` if needed). +See [ci.sh](ci.sh) for the CI build sequence. + + +## TODO + +- [ ] How to introduce fluss-cpp in your own project; https://github.com/apache/opendal/blob/main/bindings/cpp/README.md is a good reference. +- [ ] Add CMake/Bazel install and packaging instructions. +- [ ] Document API usage and minimal example in this README.
+- [ ] Add more C++ examples (log scan, upsert, etc.). diff --git a/fluss-rust/bindings/cpp/bazel/cpp/BUILD.bazel b/fluss-rust/bindings/cpp/bazel/cpp/BUILD.bazel new file mode 100644 index 0000000000..e4b730dc9b --- /dev/null +++ b/fluss-rust/bindings/cpp/bazel/cpp/BUILD.bazel @@ -0,0 +1,26 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +package(default_visibility = ["//visibility:public"]) + +# Stable indirection target for the Arrow C++ dependency. The implementation +# repo name can change across modes (registry/build/system) without touching +# bindings/cpp/BUILD.bazel. +alias( + name = "arrow_cpp_dep", + actual = "@apache_arrow_cpp//:arrow_cpp", +) diff --git a/fluss-rust/bindings/cpp/bazel/cpp/deps.bzl b/fluss-rust/bindings/cpp/bazel/cpp/deps.bzl new file mode 100644 index 0000000000..6dd5e1b635 --- /dev/null +++ b/fluss-rust/bindings/cpp/bazel/cpp/deps.bzl @@ -0,0 +1,349 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Bzlmod extension for fluss C++ SDK dependency provisioning.""" + +load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") + +_ARROW_BUILD_FILE_TEMPLATE = """ +load("@rules_foreign_cc//foreign_cc:defs.bzl", "cmake") + +package(default_visibility = ["//visibility:public"]) + +filegroup( + name = "all_srcs", + srcs = glob( + ["**"], + exclude = [ + "**/BUILD", + "**/BUILD.bazel", + ], + ), +) + +cmake( + name = "arrow_cpp", + lib_source = ":all_srcs", + working_directory = "cpp", + generate_args = ["-GUnix Makefiles"], + cache_entries = { + "CMAKE_BUILD_TYPE": "Release", + "CMAKE_INSTALL_LIBDIR": "lib", + "CMAKE_POSITION_INDEPENDENT_CODE": "ON", + "ARROW_BUILD_SHARED": "ON", + "ARROW_BUILD_STATIC": "OFF", + "ARROW_BUILD_TESTS": "OFF", + "ARROW_BUILD_EXAMPLES": "OFF", + "ARROW_BUILD_BENCHMARKS": "OFF", + "ARROW_BUILD_INTEGRATION": "OFF", + "ARROW_BUILD_UTILITIES": "OFF", + "ARROW_COMPUTE": "OFF", + "ARROW_CSV": "OFF", + "ARROW_DATASET": "OFF", + "ARROW_FILESYSTEM": "OFF", + "ARROW_JSON": "OFF", + "ARROW_PARQUET": "OFF", + "ARROW_IPC": "ON", + "ARROW_JEMALLOC": "OFF", + "ARROW_MIMALLOC": "OFF", + "ARROW_SIMD_LEVEL": "NONE", + "ARROW_RUNTIME_SIMD_LEVEL": "NONE", + "ARROW_DEPENDENCY_SOURCE": "BUNDLED", + # Temporary workarounds for older images / Bazel sandbox toolchain detection. 
+ "EP_CMAKE_RANLIB": "__EP_CMAKE_RANLIB__", + "EP_CMAKE_AR": "__EP_CMAKE_AR__", + "EP_CMAKE_NM": "__EP_CMAKE_NM__", + }, + out_include_dir = "include", + out_lib_dir = "lib", + out_shared_libs = select({ + "@platforms//os:macos": [ + "libarrow.dylib", + "libarrow.1900.dylib", + ], + "//conditions:default": [ + "libarrow.so", + "libarrow.so.1900", + "libarrow.so.1900.1.0", + ], + }), +) +""" + +_ARROW_PATCH_CMDS = [ + "sed -i.bak 's|#define ARROW_CXX_COMPILER_FLAGS \"@CMAKE_CXX_FLAGS@\"|#define ARROW_CXX_COMPILER_FLAGS \"\"|' cpp/src/arrow/util/config.h.cmake && rm -f cpp/src/arrow/util/config.h.cmake.bak", +] + +_SYSTEM_ARROW_BUILD_FILE_TEMPLATE = """ +load("@rules_cc//cc:defs.bzl", "cc_import", "cc_library") + +package(default_visibility = ["//visibility:public"]) + +cc_import( + name = "arrow_shared_import", + shared_library = "__SYSTEM_ARROW_SHARED_LIBRARY__", +) + +filegroup( + name = "arrow_runtime_libs", + srcs = [ +__SYSTEM_ARROW_RUNTIME_SRCS__ + ], +) + +cc_library( + name = "arrow_cpp", + hdrs = [ +__SYSTEM_ARROW_HDRS__ + ], + includes = ["__SYSTEM_ARROW_INCLUDE_DIR__"], + data = [":arrow_runtime_libs"], + deps = [":arrow_shared_import"], +) +""" + +_ARROW_BUILD_VERSIONS = { + "19.0.1": { + "urls": ["https://github.com/apache/arrow/archive/refs/tags/apache-arrow-19.0.1.tar.gz"], + "strip_prefix": "arrow-apache-arrow-19.0.1", + "integrity": "sha256-TImFBJWIQcyGtvhxDsspGflrXhD6iYmsEKxPyoNi2Go=", + }, +} + +_config_tag = tag_class(attrs = { + "mode": attr.string(default = "build"), + "arrow_cpp_version": attr.string(default = "19.0.1"), + "protobuf_version": attr.string(default = "3.25.5"), + "ep_cmake_ranlib": attr.string(default = "ranlib"), + "ep_cmake_ar": attr.string(default = "ar"), + "ep_cmake_nm": attr.string(default = "nm"), + "system_arrow_prefix": attr.string(default = "/usr"), + "system_arrow_include_dir": attr.string(default = "include"), + "system_arrow_shared_library": attr.string(default = "lib/x86_64-linux-gnu/libarrow.so"), + 
def _list_files(repo_ctx, base_dir, suffixes):
    """Lists files and symlinks under base_dir whose path ends with one of suffixes.

    Args:
      repo_ctx: repository context; only its execute() method is used here.
      base_dir: directory handed to /usr/bin/find as the search root.
      suffixes: list of string suffixes; a path is kept when it ends with at
        least one of them (so [""] keeps every path find prints).

    Returns:
      The matching paths, exactly as printed by find, in sorted order.
    """
    find_cmd = ["/usr/bin/find", base_dir, "(", "-type", "f", "-o", "-type", "l", ")"]
    found = repo_ctx.execute(find_cmd)
    if found.return_code != 0:
        fail("failed to enumerate files under %s: %s" % (base_dir, found.stderr))

    # Each path is emitted at most once, however many suffixes it matches.
    return sorted([
        path
        for path in found.stdout.splitlines()
        if any([path.endswith(suffix) for suffix in suffixes])
    ])
def _system_arrow_repo_impl(repo_ctx):
    """Repository rule implementation that wraps a pre-installed Arrow C++.

    Copies the Arrow headers and shared libraries found under the configured
    prefix into a local `sysroot/` directory and writes a BUILD.bazel rendered
    from the system-Arrow template.

    Args:
      repo_ctx: repository context. Reads the `prefix`, `include_dir`,
        `shared_library` and `runtime_glob` string attributes.
    """
    prefix = repo_ctx.attr.prefix.rstrip("/")
    include_dir = repo_ctx.attr.include_dir
    shared_library = repo_ctx.attr.shared_library
    runtime_glob = repo_ctx.attr.runtime_glob

    mkdir_res = repo_ctx.execute(["/bin/mkdir", "-p", "sysroot"])
    if mkdir_res.return_code != 0:
        fail("failed to create sysroot directory: %s" % mkdir_res.stderr)

    # Collect Arrow headers (<prefix>/<include_dir>/arrow/**), recording both the
    # prefix-relative path (used for copying) and the sysroot path (used in BUILD).
    include_dir_for_scan = include_dir
    if include_dir_for_scan.endswith("/"):
        include_dir_for_scan = include_dir_for_scan[:-1]
    header_root = prefix + "/" + include_dir_for_scan + "/arrow"
    headers = _list_files(repo_ctx, header_root, [".h", ".hpp"])
    header_srcs_rel = []
    header_srcs = []
    for h in headers:
        if not h.startswith(prefix + "/"):
            fail("header path %s is outside prefix %s" % (h, prefix))
        rel = h[len(prefix) + 1:]
        header_srcs_rel.append(rel)
        header_srcs.append("sysroot/" + rel)

    # Split the glob into its directory and the basename with "*" removed; the
    # latter is used as a plain basename prefix filter, not as a real glob.
    # NOTE(review): both rsplit(...)[1] lookups below assume the attribute
    # contains a "/"; a bare filename would raise an index error.
    runtime_dir = runtime_glob.rsplit("/", 1)[0]
    runtime_prefix = runtime_glob.rsplit("/", 1)[1].replace("*", "")
    runtime_files = _list_files(repo_ctx, prefix + "/" + runtime_dir, [""])
    runtime_srcs_rel = []
    runtime_srcs = []
    for f in runtime_files:
        rel = f[len(prefix) + 1:] if f.startswith(prefix + "/") else None
        if rel == None:
            continue
        if rel.startswith(runtime_dir + "/") and rel.rsplit("/", 1)[1].startswith(runtime_prefix):
            runtime_srcs_rel.append(rel)
            runtime_srcs.append("sysroot/" + rel)
    runtime_srcs_rel = sorted(runtime_srcs_rel)
    runtime_srcs = sorted(runtime_srcs)

    # Prefer a versioned soname file as the imported shared library so Bazel
    # runfiles contain the exact filename required by the runtime loader.
    shared_import_rel = "sysroot/" + shared_library
    shared_basename = shared_library.rsplit("/", 1)[1]
    soname_candidates = []
    for rel in runtime_srcs_rel:
        base = rel.rsplit("/", 1)[1]
        if base == shared_basename:
            continue
        if base.startswith(shared_basename + "."):
            soname_candidates.append("sysroot/" + rel)
    if soname_candidates:
        # Prefer shortest suffix first (e.g. libarrow.so.1900 before
        # libarrow.so.1900.1.0) to match ELF SONAME naming when available.
        soname_candidates = sorted(soname_candidates, key = lambda s: (len(s), s))
        shared_import_rel = soname_candidates[0]

    # Copy only required Arrow artifacts instead of mirroring the full system prefix.
    # The dict is used as a set so each relative path is copied once.
    copy_rel_paths = {}
    for rel in header_srcs_rel + runtime_srcs_rel + [shared_library]:
        copy_rel_paths[rel] = True
    for rel in sorted(copy_rel_paths.keys()):
        _copy_file_to_sysroot(repo_ctx, prefix, rel)

    # The override is passed without its "sysroot/" prefix because the renderer
    # prepends "sysroot/" itself.
    build_file = _render_system_arrow_build_file(repo_ctx.attr, shared_library_override = shared_import_rel[len("sysroot/"):]).replace(
        "__SYSTEM_ARROW_HDRS__",
        _starlark_string_list(header_srcs),
    ).replace(
        "__SYSTEM_ARROW_RUNTIME_SRCS__",
        _starlark_string_list(runtime_srcs),
    )
    repo_ctx.file("BUILD.bazel", build_file)
def _cpp_sdk_impl(ctx):
    """Module extension implementation: provisions the `apache_arrow_cpp` repo.

    The effective `config` tag is chosen by `_select_config`. Depending on its
    `mode`:
      * no tag, or "registry": nothing is declared here.
      * "system": `apache_arrow_cpp` is declared via `_system_arrow_repository`.
      * "build": `apache_arrow_cpp` is an `http_archive` of the Arrow sources
        listed in `_ARROW_BUILD_VERSIONS`.
      * anything else: the extension fails.

    Args:
      ctx: module extension context.
    """
    config = _select_config(ctx)
    if config == None or config.mode == "registry":
        return

    mode = config.mode
    if mode == "system":
        _system_arrow_repository(
            name = "apache_arrow_cpp",
            prefix = config.system_arrow_prefix,
            include_dir = config.system_arrow_include_dir,
            shared_library = config.system_arrow_shared_library,
            runtime_glob = config.system_arrow_runtime_glob,
        )
    elif mode == "build":
        release = _ARROW_BUILD_VERSIONS.get(config.arrow_cpp_version)
        if release == None:
            fail("unsupported arrow_cpp_version for build mode: %s" % config.arrow_cpp_version)
        http_archive(
            name = "apache_arrow_cpp",
            urls = release["urls"],
            strip_prefix = release["strip_prefix"],
            integrity = release["integrity"],
            patch_cmds = _ARROW_PATCH_CMDS,
            build_file_content = _render_arrow_build_file(config),
        )
    else:
        fail("unsupported cpp_sdk mode: %s" % mode)
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +fn main() { + cxx_build::bridge("src/lib.rs") + .std("c++17") + .compile("fluss-cpp-bridge"); + + println!("cargo:rerun-if-changed=src/lib.rs"); +} diff --git a/fluss-rust/bindings/cpp/ci.sh b/fluss-rust/bindings/cpp/ci.sh new file mode 100755 index 0000000000..ebf5f09205 --- /dev/null +++ b/fluss-rust/bindings/cpp/ci.sh @@ -0,0 +1,101 @@ +#!/bin/bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+
+set -xe
+
+DIR="$(cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd)"
+BAZEL_BUILD_FLAGS="${BAZEL_BUILD_FLAGS:--c opt}"
+
+# Set Bazel output base to bazel-build directory
+# This ensures all Bazel outputs are in bazel-build/.bazel-output-base
+BAZEL_OUTPUT_BASE="$DIR/bazel-build/.bazel-output-base"
+
+# Create output base directory if it doesn't exist
+mkdir -p "$BAZEL_OUTPUT_BASE"
+
+# Wrapper function to run bazel with --output_base
+bazel() {
+    command bazel --output_base="$BAZEL_OUTPUT_BASE" "$@"
+}
+
+# Build the fluss_cpp library target.
+# BAZEL_BUILD_FLAGS is left unquoted on purpose so it splits into separate flags.
+compile() {
+    bazel build ${BAZEL_BUILD_FLAGS} //:fluss_cpp
+}
+
+# Build the example binary target.
+build_example() {
+    bazel build ${BAZEL_BUILD_FLAGS} //:fluss_cpp_example
+}
+
+# Build the example binary, then run it through bazel.
+run_example() {
+    build_example
+    bazel run ${BAZEL_BUILD_FLAGS} //:fluss_cpp_example
+}
+
+# Remove every Bazel artifact this script creates under $DIR:
+# the bazel-* convenience symlinks and the bazel-build output directory.
+clean() {
+    bazel clean
+    # Remove bazel-* symlinks (Bazel automatically creates these).
+    # The glob also matches the bazel-build directory created above; plain
+    # `rm -f` fails on a directory, which under `set -e` would abort the script
+    # before the directory is removed, so only symlinks are deleted here.
+    local entry
+    for entry in "$DIR"/bazel-*; do
+        if [ -L "$entry" ]; then
+            rm -f "$entry"
+        fi
+    done
+    # Also remove the bazel-build directory if it exists
+    if [ -d "$DIR/bazel-build" ]; then
+        rm -rf "$DIR/bazel-build"
+    fi
+    echo "Cleaned all Bazel outputs and symlinks"
+}
+
+# Print the output files of both targets; fall back to a hint when cquery fails.
+show_outputs() {
+    echo "=== Library outputs ==="
+    bazel cquery //:fluss_cpp --output=files 2>/dev/null || echo "Run 'bazel build //:fluss_cpp' first"
+    echo ""
+    echo "=== Example binary outputs ==="
+    bazel cquery //:fluss_cpp_example --output=files 2>/dev/null || echo "Run 'bazel build //:fluss_cpp_example' first"
+    echo ""
+    echo "=== To run the example ==="
+    echo " bazel run //:fluss_cpp_example"
+    echo ""
+    echo "=== To find outputs manually ==="
+    echo " bazel info bazel-bin"
+}
+
+# Dispatch on the first argument; anything unrecognised prints usage and exits 1.
+case $1 in
+    compile )
+        compile
+        ;;
+    example )
+        build_example
+        ;;
+    run )
+        run_example
+        ;;
+    outputs )
+        show_outputs
+        ;;
+    clean )
+        clean
+        ;;
+    * )
+        echo "Usage: $0 {compile|example|run|outputs|clean}"
+        echo ""
+        echo "Commands:"
+        echo " compile - Build the fluss_cpp library"
+        echo " example - Build the example binary"
+        echo " run - Build and run the example binary"
+        echo " outputs - Show the location of build outputs"
+        echo " clean - Clean all Bazel outputs"
+        exit 1
+        ;;
+esac
diff --git a/fluss-rust/bindings/cpp/examples/admin_example.cpp b/fluss-rust/bindings/cpp/examples/admin_example.cpp
new file mode 100644
index 0000000000..37683b9513
--- /dev/null
+++ b/fluss-rust/bindings/cpp/examples/admin_example.cpp
@@ -0,0 +1,122 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+ +#include +#include +#include +#include + +#include "fluss.hpp" + +static void check(const char* step, const fluss::Result& r) { + if (!r.Ok()) { + std::cerr << step << " failed: code=" << r.error_code << " msg=" << r.error_message + << std::endl; + std::exit(1); + } +} + +int main() { + const std::string db_name = "admin_example_db"; + const std::string table_name = "admin_example_table"; + + // 1) Connect and get Admin + fluss::Configuration config; + config.bootstrap_servers = "127.0.0.1:9123"; + + fluss::Connection conn; + check("create", fluss::Connection::Create(config, conn)); + + fluss::Admin admin; + check("get_admin", conn.GetAdmin(admin)); + + // 2) Database operations + std::cout << "--- Database operations ---" << std::endl; + + bool exists = false; + check("database_exists (before create)", admin.DatabaseExists(db_name, exists)); + std::cout << "Database " << db_name << " exists before create: " << (exists ? "yes" : "no") + << std::endl; + + fluss::DatabaseDescriptor db_desc; + db_desc.comment = "Example database for Admin API"; + db_desc.properties["owner"] = "admin_example"; + check("create_database", admin.CreateDatabase(db_name, db_desc, true)); + + check("database_exists (after create)", admin.DatabaseExists(db_name, exists)); + std::cout << "Database " << db_name << " exists after create: " << (exists ? 
"yes" : "no") + << std::endl; + + fluss::DatabaseInfo db_info; + check("get_database_info", admin.GetDatabaseInfo(db_name, db_info)); + std::cout << "Database info: name=" << db_info.database_name << " comment=" << db_info.comment + << " created_time=" << db_info.created_time << std::endl; + + std::vector databases; + check("list_databases", admin.ListDatabases(databases)); + std::cout << "List databases (" << databases.size() << "): "; + for (size_t i = 0; i < databases.size(); ++i) { + if (i > 0) std::cout << ", "; + std::cout << databases[i]; + } + std::cout << std::endl; + + // 3) Table operations in the new database + std::cout << "--- Table operations ---" << std::endl; + + fluss::TablePath table_path(db_name, table_name); + + bool table_exists_flag = false; + check("table_exists (before create)", admin.TableExists(table_path, table_exists_flag)); + std::cout << "Table " << db_name << "." << table_name + << " exists before create: " << (table_exists_flag ? "yes" : "no") << std::endl; + + auto schema = fluss::Schema::NewBuilder() + .AddColumn("id", fluss::DataType::Int()) + .AddColumn("name", fluss::DataType::String()) + .Build(); + auto descriptor = fluss::TableDescriptor::NewBuilder() + .SetSchema(schema) + .SetBucketCount(1) + .SetComment("admin example table") + .Build(); + + check("create_table", admin.CreateTable(table_path, descriptor, true)); + + check("table_exists (after create)", admin.TableExists(table_path, table_exists_flag)); + std::cout << "Table exists after create: " << (table_exists_flag ? 
"yes" : "no") << std::endl; + + std::vector tables; + check("list_tables", admin.ListTables(db_name, tables)); + std::cout << "List tables in " << db_name << " (" << tables.size() << "): "; + for (size_t i = 0; i < tables.size(); ++i) { + if (i > 0) std::cout << ", "; + std::cout << tables[i]; + } + std::cout << std::endl; + + // 4) Cleanup: drop table, then drop database + std::cout << "--- Cleanup ---" << std::endl; + check("drop_table", admin.DropTable(table_path, true)); + check("drop_database", admin.DropDatabase(db_name, true, true)); + + check("database_exists (after drop)", admin.DatabaseExists(db_name, exists)); + std::cout << "Database exists after drop: " << (exists ? "yes" : "no") << std::endl; + + std::cout << "Admin example completed successfully." << std::endl; + return 0; +} diff --git a/fluss-rust/bindings/cpp/examples/bazel-consumer/build/BUILD.bazel b/fluss-rust/bindings/cpp/examples/bazel-consumer/build/BUILD.bazel new file mode 100644 index 0000000000..afd35edd7e --- /dev/null +++ b/fluss-rust/bindings/cpp/examples/bazel-consumer/build/BUILD.bazel @@ -0,0 +1,25 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+
+load("@rules_cc//cc:defs.bzl", "cc_binary")
+
+# Example consumer binary: compiles main.cc as C++17 and links the fluss C++
+# bindings target exported by the fluss-cpp module.
+cc_binary(
+    name = "consumer_build",
+    srcs = ["main.cc"],
+    copts = ["-std=c++17"],
+    deps = ["@fluss-cpp//bindings/cpp:fluss_cpp"],
+)
diff --git a/fluss-rust/bindings/cpp/examples/bazel-consumer/build/MODULE.bazel b/fluss-rust/bindings/cpp/examples/bazel-consumer/build/MODULE.bazel
new file mode 100644
index 0000000000..f31165c1cd
--- /dev/null
+++ b/fluss-rust/bindings/cpp/examples/bazel-consumer/build/MODULE.bazel
@@ -0,0 +1,39 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Root module of the build-mode consumer example.
+module(name = "fluss_cpp_consumer_build")
+
+bazel_dep(name = "rules_cc", version = "0.2.14")
+bazel_dep(name = "fluss-cpp", version = "0.1.0")
+
+# Local override for repository-local validation only.
+local_path_override(
+    module_name = "fluss-cpp",
+    # Repository root path (the directory containing `bindings/cpp`).
+    path = "../../../../../",
+)
+
+# Configure the cpp_sdk module extension in "build" mode and import the
+# `apache_arrow_cpp` repository it defines.
+# NOTE(review): the ep_cmake_* values look like host tool paths handed to the
+# Arrow build - confirm their exact use in deps.bzl.
+fluss_cpp = use_extension("@fluss-cpp//bindings/cpp/bazel/cpp:deps.bzl", "cpp_sdk")
+fluss_cpp.config(
+    mode = "build",
+    protobuf_version = "3.25.5",
+    arrow_cpp_version = "19.0.1",
+    ep_cmake_ranlib = "/usr/bin/ranlib",
+    ep_cmake_ar = "/usr/bin/ar",
+    ep_cmake_nm = "/usr/bin/nm",
+)
+use_repo(fluss_cpp, "apache_arrow_cpp")
diff --git a/fluss-rust/bindings/cpp/examples/bazel-consumer/build/main.cc b/fluss-rust/bindings/cpp/examples/bazel-consumer/build/main.cc
new file mode 100644
index 0000000000..87e5b6820f
--- /dev/null
+++ b/fluss-rust/bindings/cpp/examples/bazel-consumer/build/main.cc
@@ -0,0 +1,28 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+ +#include "fluss.hpp" + +#include + +int main() { + fluss::TablePath table_path("demo_db", "demo_table"); + std::cout << "Bazel build-mode dependency example ready: " + << table_path.ToString() << std::endl; + return 0; +} + diff --git a/fluss-rust/bindings/cpp/examples/bazel-consumer/system/BUILD.bazel b/fluss-rust/bindings/cpp/examples/bazel-consumer/system/BUILD.bazel new file mode 100644 index 0000000000..2f24e6dec7 --- /dev/null +++ b/fluss-rust/bindings/cpp/examples/bazel-consumer/system/BUILD.bazel @@ -0,0 +1,25 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +load("@rules_cc//cc:defs.bzl", "cc_binary") + +cc_binary( + name = "consumer_system", + srcs = ["main.cc"], + copts = ["-std=c++17"], + deps = ["@fluss-cpp//bindings/cpp:fluss_cpp"], +) diff --git a/fluss-rust/bindings/cpp/examples/bazel-consumer/system/MODULE.bazel b/fluss-rust/bindings/cpp/examples/bazel-consumer/system/MODULE.bazel new file mode 100644 index 0000000000..2a4d6a6584 --- /dev/null +++ b/fluss-rust/bindings/cpp/examples/bazel-consumer/system/MODULE.bazel @@ -0,0 +1,44 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Root module of the system-mode consumer example.
+module(name = "fluss_cpp_consumer_system")
+
+bazel_dep(name = "rules_cc", version = "0.2.14")
+bazel_dep(name = "fluss-cpp", version = "0.1.0")
+
+# Repository-local example path (repository root containing `bindings/cpp`).
+# If you copy this example out of tree, replace this with an absolute path
+# (for example: /path/to/fluss-rust).
+local_path_override(
+    module_name = "fluss-cpp",
+    path = "../../../../../",
+)
+
+# Intended interface for preinstalled protoc + Arrow C++ environments.
+# The system_arrow_* arguments that follow describe where that Arrow
+# installation lives; the include dir, shared library and runtime glob are
+# written relative to `system_arrow_prefix` in this example.
+fluss_cpp = use_extension("@fluss-cpp//bindings/cpp/bazel/cpp:deps.bzl", "cpp_sdk")
+fluss_cpp.config(
+    mode = "system",
+    protobuf_version = "3.25.5",
+    arrow_cpp_version = "19.0.1",
+    # Adjust these paths for your environment.
+    # Ubuntu 22.04 (apt / custom package) commonly uses lib/x86_64-linux-gnu.
+ system_arrow_prefix = "/usr", + system_arrow_include_dir = "include", + system_arrow_shared_library = "lib/x86_64-linux-gnu/libarrow.so", + system_arrow_runtime_glob = "lib/x86_64-linux-gnu/libarrow.so*", +) +use_repo(fluss_cpp, "apache_arrow_cpp") diff --git a/fluss-rust/bindings/cpp/examples/bazel-consumer/system/main.cc b/fluss-rust/bindings/cpp/examples/bazel-consumer/system/main.cc new file mode 100644 index 0000000000..b1f0b70b84 --- /dev/null +++ b/fluss-rust/bindings/cpp/examples/bazel-consumer/system/main.cc @@ -0,0 +1,27 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "fluss.hpp" + +#include + +int main() { + fluss::TablePath table_path("demo_db", "demo_table"); + std::cout << "Bazel system-mode dependency example ready: " + << table_path.ToString() << std::endl; + return 0; +} diff --git a/fluss-rust/bindings/cpp/examples/example.cpp b/fluss-rust/bindings/cpp/examples/example.cpp new file mode 100644 index 0000000000..d86ee5cda7 --- /dev/null +++ b/fluss-rust/bindings/cpp/examples/example.cpp @@ -0,0 +1,790 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "fluss.hpp" + +static void check(const char* step, const fluss::Result& r) { + if (!r.Ok()) { + std::cerr << step << " failed: code=" << r.error_code << " msg=" << r.error_message + << std::endl; + std::exit(1); + } +} + +int main() { + // 1) Connect + fluss::Configuration config; + config.bootstrap_servers = "127.0.0.1:9123"; + + fluss::Connection conn; + check("create", fluss::Connection::Create(config, conn)); + + // 2) Admin + fluss::Admin admin; + check("get_admin", conn.GetAdmin(admin)); + + fluss::TablePath table_path("fluss", "sample_table_cpp_v1"); + + // 2.1) Drop table if exists + std::cout << "Dropping table if exists..." 
<< std::endl; + auto drop_result = admin.DropTable(table_path, true); + if (drop_result.Ok()) { + std::cout << "Table dropped successfully" << std::endl; + } else { + std::cout << "Table drop result: " << drop_result.error_message << std::endl; + } + + // 3) Schema with scalar and temporal columns + auto schema = fluss::Schema::NewBuilder() + .AddColumn("id", fluss::DataType::Int()) + .AddColumn("name", fluss::DataType::String()) + .AddColumn("score", fluss::DataType::Float()) + .AddColumn("age", fluss::DataType::Int()) + .AddColumn("event_date", fluss::DataType::Date()) + .AddColumn("event_time", fluss::DataType::Time()) + .AddColumn("created_at", fluss::DataType::Timestamp()) + .AddColumn("updated_at", fluss::DataType::TimestampLtz()) + .Build(); + + auto descriptor = fluss::TableDescriptor::NewBuilder() + .SetSchema(schema) + .SetBucketCount(3) + .SetComment("cpp example table with 3 buckets") + .Build(); + + std::cout << "Creating table with 3 buckets..." << std::endl; + check("create_table", admin.CreateTable(table_path, descriptor, false)); + + // 4) Get table + fluss::Table table; + check("get_table", conn.GetTable(table_path, table)); + + // 5) Write rows with scalar and temporal values + fluss::AppendWriter writer; + check("new_append_writer", table.NewAppend().CreateWriter(writer)); + + struct RowData { + int id; + const char* name; + float score; + int age; + fluss::Date date; + fluss::Time time; + fluss::Timestamp ts_ntz; + fluss::Timestamp ts_ltz; + }; + + auto tp_now = std::chrono::system_clock::now(); + std::vector rows = { + {1, "Alice", 95.2f, 25, fluss::Date::FromYMD(2024, 6, 15), fluss::Time::FromHMS(14, 30, 45), + fluss::Timestamp::FromTimePoint(tp_now), fluss::Timestamp::FromMillis(1718467200000)}, + {2, "Bob", 87.2f, 30, fluss::Date::FromYMD(2025, 1, 1), fluss::Time::FromHMS(0, 0, 0), + fluss::Timestamp::FromMillis(1735689600000), + fluss::Timestamp::FromMillisNanos(1735689600000, 500000)}, + {3, "Charlie", 92.1f, 35, 
fluss::Date::FromYMD(1999, 12, 31), + fluss::Time::FromHMS(23, 59, 59), fluss::Timestamp::FromMillis(946684799999), + fluss::Timestamp::FromMillis(946684799999)}, + }; + + // Fire-and-forget: queue rows, flush at end + for (const auto& r : rows) { + fluss::GenericRow row; + row.SetInt32(0, r.id); + row.SetString(1, r.name); + row.SetFloat32(2, r.score); + row.SetInt32(3, r.age); + row.SetDate(4, r.date); + row.SetTime(5, r.time); + row.SetTimestampNtz(6, r.ts_ntz); + row.SetTimestampLtz(7, r.ts_ltz); + check("append", writer.Append(row)); + } + check("flush", writer.Flush()); + std::cout << "Wrote " << rows.size() << " rows (fire-and-forget + flush)" << std::endl; + + // Per-record acknowledgment + { + fluss::GenericRow row; + row.SetInt32(0, 100); + row.SetString(1, "AckTest"); + row.SetFloat32(2, 99.9f); + row.SetInt32(3, 42); + row.SetDate(4, fluss::Date::FromYMD(2025, 3, 1)); + row.SetTime(5, fluss::Time::FromHMS(12, 0, 0)); + row.SetTimestampNtz(6, fluss::Timestamp::FromMillis(1740787200000)); + row.SetTimestampLtz(7, fluss::Timestamp::FromMillis(1740787200000)); + fluss::WriteResult wr; + check("append", writer.Append(row, wr)); + check("wait", wr.Wait()); + std::cout << "Row acknowledged by server" << std::endl; + } + + // Append a row with all fields null (matches Rust log_table.rs all_supported_datatypes) + { + fluss::GenericRow row; + size_t field_count = 8; + for (size_t i = 0; i < field_count; ++i) { + row.SetNull(i); + } + check("append_null_row", writer.Append(row)); + } + check("flush_null", writer.Flush()); + std::cout << "Wrote row with all fields null" << std::endl; + + // 6) Full scan — verify all column types including temporal + fluss::LogScanner scanner; + check("new_log_scanner", table.NewScan().CreateLogScanner(scanner)); + + auto info = table.GetTableInfo(); + int buckets = info.num_buckets; + for (int b = 0; b < buckets; ++b) { + check("subscribe", scanner.Subscribe(b, 0)); + } + + fluss::ScanRecords records; + check("poll", 
scanner.Poll(5000, records)); + + // Flat iteration over all records (regardless of bucket) + std::cout << "Scanned records: " << records.Count() << " across " << records.BucketCount() + << " buckets" << std::endl; + for (const auto& rec : records) { + std::cout << " offset=" << rec.offset << " timestamp=" << rec.timestamp << std::endl; + } + + // Per-bucket access (with type verification) + bool scan_ok = true; + bool found_null_row = false; + for (const auto& tb : records.Buckets()) { + auto view = records.Records(tb); + std::cout << " Bucket " << tb.bucket_id; + if (tb.partition_id.has_value()) { + std::cout << " (partition=" << *tb.partition_id << ")"; + } + std::cout << ": " << view.Size() << " records" << std::endl; + for (const auto& rec : view) { + // Check if this is the all-null row + if (rec.row.IsNull(0)) { + found_null_row = true; + for (size_t i = 0; i < rec.row.FieldCount(); ++i) { + if (!rec.row.IsNull(i)) { + std::cerr << "ERROR: column " << i << " should be null" << std::endl; + scan_ok = false; + } + } + std::cout << " [null row] all " << rec.row.FieldCount() << " fields are null" + << std::endl; + continue; + } + + // Non-null rows: verify types + if (rec.row.GetType(4) != fluss::TypeId::Date) { + std::cerr << "ERROR: field 4 expected Date, got " + << static_cast(rec.row.GetType(4)) << std::endl; + scan_ok = false; + } + if (rec.row.GetType(5) != fluss::TypeId::Time) { + std::cerr << "ERROR: field 5 expected Time, got " + << static_cast(rec.row.GetType(5)) << std::endl; + scan_ok = false; + } + if (rec.row.GetType(6) != fluss::TypeId::Timestamp) { + std::cerr << "ERROR: field 6 expected Timestamp, got " + << static_cast(rec.row.GetType(6)) << std::endl; + scan_ok = false; + } + if (rec.row.GetType(7) != fluss::TypeId::TimestampLtz) { + std::cerr << "ERROR: field 7 expected TimestampLtz, got " + << static_cast(rec.row.GetType(7)) << std::endl; + scan_ok = false; + } + + // Name-based getters + auto date = rec.row.GetDate("event_date"); + auto 
time = rec.row.GetTime("event_time"); + auto ts_ntz = rec.row.GetTimestamp("created_at"); + auto ts_ltz = rec.row.GetTimestamp("updated_at"); + + std::cout << " id=" << rec.row.GetInt32("id") + << " name=" << rec.row.GetString("name") + << " score=" << rec.row.GetFloat32("score") + << " age=" << rec.row.GetInt32("age") << " date=" << date.Year() << "-" + << date.Month() << "-" << date.Day() << " time=" << time.Hour() << ":" + << time.Minute() << ":" << time.Second() << " ts_ntz=" << ts_ntz.epoch_millis + << " ts_ltz=" << ts_ltz.epoch_millis << "+" << ts_ltz.nano_of_millisecond + << "ns" << std::endl; + } + } + + if (!found_null_row) { + std::cerr << "ERROR: did not find the all-null row" << std::endl; + scan_ok = false; + } + + if (!scan_ok) { + std::cerr << "Full scan type verification FAILED!" << std::endl; + std::exit(1); + } + + // 7a) Projected scan by index — project [id, updated_at(TimestampLtz)] to verify + // NTZ/LTZ disambiguation works with column index remapping + std::vector projected_columns = {0, 7}; + fluss::LogScanner projected_scanner; + check("new_log_scanner_with_projection", + table.NewScan().ProjectByIndex(projected_columns).CreateLogScanner(projected_scanner)); + + for (int b = 0; b < buckets; ++b) { + check("subscribe_projected", projected_scanner.Subscribe(b, 0)); + } + + fluss::ScanRecords projected_records; + check("poll_projected", projected_scanner.Poll(5000, projected_records)); + + std::cout << "Projected records: " << projected_records.Count() << std::endl; + for (const auto& tb : projected_records.Buckets()) { + for (const auto& rec : projected_records.Records(tb)) { + if (rec.row.FieldCount() != 2) { + std::cerr << "ERROR: expected 2 fields, got " << rec.row.FieldCount() << std::endl; + scan_ok = false; + continue; + } + // Skip the all-null row + if (rec.row.IsNull(0)) { + std::cout << " [null row] skipped" << std::endl; + continue; + } + if (rec.row.GetType(0) != fluss::TypeId::Int) { + std::cerr << "ERROR: projected field 0 
expected Int, got " + << static_cast(rec.row.GetType(0)) << std::endl; + scan_ok = false; + } + if (rec.row.GetType(1) != fluss::TypeId::TimestampLtz) { + std::cerr << "ERROR: projected field 1 expected TimestampLtz, got " + << static_cast(rec.row.GetType(1)) << std::endl; + scan_ok = false; + } + + auto ts = rec.row.GetTimestamp(1); + std::cout << " id=" << rec.row.GetInt32(0) << " updated_at=" << ts.epoch_millis << "+" + << ts.nano_of_millisecond << "ns" << std::endl; + } + } + + // 7b) Projected scan by column names — same columns as above but using names + fluss::LogScanner name_projected_scanner; + check("project_by_name_scanner", table.NewScan() + .ProjectByName({"id", "updated_at"}) + .CreateLogScanner(name_projected_scanner)); + + for (int b = 0; b < buckets; ++b) { + check("subscribe_name_projected", name_projected_scanner.Subscribe(b, 0)); + } + + fluss::ScanRecords name_projected_records; + check("poll_name_projected", name_projected_scanner.Poll(5000, name_projected_records)); + + std::cout << "Name-projected records: " << name_projected_records.Count() << std::endl; + for (const auto& tb : name_projected_records.Buckets()) { + for (const auto& rec : name_projected_records.Records(tb)) { + if (rec.row.FieldCount() != 2) { + std::cerr << "ERROR: expected 2 fields, got " << rec.row.FieldCount() << std::endl; + scan_ok = false; + continue; + } + // Skip the all-null row + if (rec.row.IsNull(0)) { + std::cout << " [null row] skipped" << std::endl; + continue; + } + if (rec.row.GetType(0) != fluss::TypeId::Int) { + std::cerr << "ERROR: name-projected field 0 expected Int, got " + << static_cast(rec.row.GetType(0)) << std::endl; + scan_ok = false; + } + if (rec.row.GetType(1) != fluss::TypeId::TimestampLtz) { + std::cerr << "ERROR: name-projected field 1 expected TimestampLtz, got " + << static_cast(rec.row.GetType(1)) << std::endl; + scan_ok = false; + } + + auto ts = rec.row.GetTimestamp(1); + std::cout << " id=" << rec.row.GetInt32(0) << " updated_at=" << 
ts.epoch_millis << "+" + << ts.nano_of_millisecond << "ns" << std::endl; + } + } + + if (scan_ok) { + std::cout << "Scan verification passed!" << std::endl; + } else { + std::cerr << "Scan verification FAILED!" << std::endl; + std::exit(1); + } + + // 8) List offsets examples + std::cout << "\n=== List Offsets Examples ===" << std::endl; + + std::vector all_bucket_ids; + all_bucket_ids.reserve(buckets); + for (int b = 0; b < buckets; ++b) { + all_bucket_ids.push_back(b); + } + + std::unordered_map earliest_offsets; + check("list_earliest_offsets", + admin.ListOffsets(table_path, all_bucket_ids, fluss::OffsetSpec::Earliest(), + earliest_offsets)); + std::cout << "Earliest offsets:" << std::endl; + for (const auto& [bucket_id, offset] : earliest_offsets) { + std::cout << " Bucket " << bucket_id << ": offset=" << offset << std::endl; + } + + std::unordered_map latest_offsets; + check("list_latest_offsets", admin.ListOffsets(table_path, all_bucket_ids, + fluss::OffsetSpec::Latest(), latest_offsets)); + std::cout << "Latest offsets:" << std::endl; + for (const auto& [bucket_id, offset] : latest_offsets) { + std::cout << " Bucket " << bucket_id << ": offset=" << offset << std::endl; + } + + auto now = std::chrono::system_clock::now(); + auto one_hour_ago = now - std::chrono::hours(1); + auto timestamp_ms = + std::chrono::duration_cast(one_hour_ago.time_since_epoch()) + .count(); + + std::unordered_map timestamp_offsets; + check("list_timestamp_offsets", + admin.ListOffsets(table_path, all_bucket_ids, fluss::OffsetSpec::Timestamp(timestamp_ms), + timestamp_offsets)); + std::cout << "Offsets for timestamp " << timestamp_ms << " (1 hour ago):" << std::endl; + for (const auto& [bucket_id, offset] : timestamp_offsets) { + std::cout << " Bucket " << bucket_id << ": offset=" << offset << std::endl; + } + + // 9) Batch subscribe + std::cout << "\n=== Batch Subscribe Example ===" << std::endl; + fluss::LogScanner batch_scanner; + check("new_log_scanner_for_batch", 
table.NewScan().CreateLogScanner(batch_scanner)); + + std::vector subscriptions; + for (const auto& [bucket_id, offset] : earliest_offsets) { + subscriptions.push_back({bucket_id, offset}); + std::cout << "Preparing subscription: bucket=" << bucket_id << ", offset=" << offset + << std::endl; + } + + check("subscribe_buckets", batch_scanner.Subscribe(subscriptions)); + std::cout << "Batch subscribed to " << subscriptions.size() << " buckets" << std::endl; + + fluss::ScanRecords batch_records; + check("poll_batch", batch_scanner.Poll(5000, batch_records)); + + std::cout << "Scanned " << batch_records.Count() << " records from batch subscription" + << std::endl; + for (const auto& tb : batch_records.Buckets()) { + size_t shown = 0; + for (const auto& rec : batch_records.Records(tb)) { + if (shown < 5) { + std::cout << " bucket_id=" << tb.bucket_id << ", offset=" << rec.offset + << ", timestamp=" << rec.timestamp << std::endl; + } + ++shown; + } + if (shown > 5) { + std::cout << " ... and " << (shown - 5) << " more records in bucket " << tb.bucket_id + << std::endl; + } + } + + // 9.1) Unsubscribe from a bucket + std::cout << "\n=== Unsubscribe Example ===" << std::endl; + check("unsubscribe", batch_scanner.Unsubscribe(subscriptions[0].bucket_id)); + std::cout << "Unsubscribed from bucket " << subscriptions[0].bucket_id << std::endl; + + // 10) Arrow record batch polling + std::cout << "\n=== Testing Arrow Record Batch Polling ===" << std::endl; + + fluss::LogScanner arrow_scanner; + check("new_record_batch_log_scanner", + table.NewScan().CreateRecordBatchLogScanner(arrow_scanner)); + + for (int b = 0; b < buckets; ++b) { + check("subscribe_arrow", arrow_scanner.Subscribe(b, 0)); + } + + fluss::ArrowRecordBatches arrow_batches; + check("poll_record_batch", arrow_scanner.PollRecordBatch(5000, arrow_batches)); + + std::cout << "Polled " << arrow_batches.Size() << " Arrow record batches" << std::endl; + for (size_t i = 0; i < arrow_batches.Size(); ++i) { + const auto& 
batch = arrow_batches[i]; + if (batch->Available()) { + std::cout << " Batch " << i << ": " << batch->GetArrowRecordBatch()->num_rows() + << " rows" << std::endl; + } else { + std::cout << " Batch " << i << ": not available" << std::endl; + } + } + + // 11) Arrow record batch polling with projection + std::cout << "\n=== Testing Arrow Record Batch Polling with Projection ===" << std::endl; + + fluss::LogScanner projected_arrow_scanner; + check("new_record_batch_log_scanner_with_projection", + table.NewScan() + .ProjectByIndex(projected_columns) + .CreateRecordBatchLogScanner(projected_arrow_scanner)); + + for (int b = 0; b < buckets; ++b) { + check("subscribe_projected_arrow", projected_arrow_scanner.Subscribe(b, 0)); + } + + fluss::ArrowRecordBatches projected_arrow_batches; + check("poll_projected_record_batch", + projected_arrow_scanner.PollRecordBatch(5000, projected_arrow_batches)); + + std::cout << "Polled " << projected_arrow_batches.Size() << " projected Arrow record batches" + << std::endl; + for (size_t i = 0; i < projected_arrow_batches.Size(); ++i) { + const auto& batch = projected_arrow_batches[i]; + if (batch->Available()) { + std::cout << " Batch " << i << ": " << batch->GetArrowRecordBatch()->num_rows() + << " rows" << std::endl; + } else { + std::cout << " Batch " << i << ": not available" << std::endl; + } + } + + // 12) AppendArrowBatch — write an Arrow RecordBatch directly + std::cout << "\n=== AppendArrowBatch Example ===" << std::endl; + { + // Build an Arrow RecordBatch matching sample_table_cpp_v1 schema: + // id:INT, name:STRING, score:FLOAT, age:INT, + // event_date:DATE, event_time:TIME, created_at:TIMESTAMP, updated_at:TIMESTAMP_LTZ + auto arrow_schema = arrow::schema({ + arrow::field("id", arrow::int32()), + arrow::field("name", arrow::utf8()), + arrow::field("score", arrow::float32()), + arrow::field("age", arrow::int32()), + arrow::field("event_date", arrow::date32()), + arrow::field("event_time", 
arrow::time32(arrow::TimeUnit::MILLI)), + arrow::field("created_at", arrow::timestamp(arrow::TimeUnit::MICRO)), + arrow::field("updated_at", arrow::timestamp(arrow::TimeUnit::MICRO)), + }); + + arrow::Int32Builder id_builder; + arrow::StringBuilder name_builder; + arrow::FloatBuilder score_builder; + arrow::Int32Builder age_builder; + arrow::Date32Builder date_builder; + arrow::Time32Builder time_builder(arrow::time32(arrow::TimeUnit::MILLI), + arrow::default_memory_pool()); + arrow::TimestampBuilder ts_ntz_builder(arrow::timestamp(arrow::TimeUnit::MICRO), + arrow::default_memory_pool()); + arrow::TimestampBuilder ts_ltz_builder(arrow::timestamp(arrow::TimeUnit::MICRO), + arrow::default_memory_pool()); + + // Row 1 + (void)id_builder.Append(200); + (void)name_builder.Append("ArrowAlice"); + (void)score_builder.Append(88.5f); + (void)age_builder.Append(28); + (void)date_builder.Append(19888); // days since epoch (2024-06-15 ≈ 19888) + (void)time_builder.Append(52245000); // 14:30:45 in ms + (void)ts_ntz_builder.Append(1718467200000000); // micros + (void)ts_ltz_builder.Append(1718467200000000); + + // Row 2 + (void)id_builder.Append(201); + (void)name_builder.Append("ArrowBob"); + (void)score_builder.Append(91.3f); + (void)age_builder.Append(33); + (void)date_builder.Append(20089); // 2025-01-02 + (void)time_builder.Append(3600000); // 01:00:00 + (void)ts_ntz_builder.Append(1735689600000000); + (void)ts_ltz_builder.Append(1735689600000000); + + auto batch_result = arrow::RecordBatch::Make( + arrow_schema, 2, + {id_builder.Finish().ValueOrDie(), name_builder.Finish().ValueOrDie(), + score_builder.Finish().ValueOrDie(), age_builder.Finish().ValueOrDie(), + date_builder.Finish().ValueOrDie(), time_builder.Finish().ValueOrDie(), + ts_ntz_builder.Finish().ValueOrDie(), ts_ltz_builder.Finish().ValueOrDie()}); + + check("append_arrow_batch", writer.AppendArrowBatch(batch_result)); + check("flush_arrow", writer.Flush()); + std::cout << "Wrote 2 rows via AppendArrowBatch" << 
std::endl; + + // Verify by scanning from latest offsets + fluss::LogScanner arrow_write_scanner; + check("new_arrow_write_scanner", table.NewScan().CreateLogScanner(arrow_write_scanner)); + for (const auto& [bid, off] : latest_offsets) { + check("subscribe_arrow_write", arrow_write_scanner.Subscribe(bid, off)); + } + + fluss::ScanRecords arrow_write_records; + check("poll_arrow_write", arrow_write_scanner.Poll(5000, arrow_write_records)); + std::cout << "Scanned " << arrow_write_records.Count() + << " records written via AppendArrowBatch:" << std::endl; + for (const auto& tb : arrow_write_records.Buckets()) { + for (const auto& rec : arrow_write_records.Records(tb)) { + std::cout << " id=" << rec.row.GetInt32(0) << " name=" << rec.row.GetString(1) + << " score=" << rec.row.GetFloat32(2) << std::endl; + } + } + } + + // 13) Decimal support example + std::cout << "\n=== Decimal Support Example ===" << std::endl; + + fluss::TablePath decimal_table_path("fluss", "decimal_table_cpp_v1"); + + // Drop table if exists + admin.DropTable(decimal_table_path, true); + + // Create schema with decimal columns + auto decimal_schema = fluss::Schema::NewBuilder() + .AddColumn("id", fluss::DataType::Int()) + .AddColumn("price", fluss::DataType::Decimal(10, 2)) // compact + .AddColumn("amount", fluss::DataType::Decimal(28, 8)) // i128 + .Build(); + + auto decimal_descriptor = fluss::TableDescriptor::NewBuilder() + .SetSchema(decimal_schema) + .SetBucketCount(1) + .SetComment("cpp decimal example table") + .Build(); + + check("create_decimal_table", admin.CreateTable(decimal_table_path, decimal_descriptor, false)); + + // Get table and writer + fluss::Table decimal_table; + check("get_decimal_table", conn.GetTable(decimal_table_path, decimal_table)); + + fluss::AppendWriter decimal_writer; + check("new_decimal_writer", decimal_table.NewAppend().CreateWriter(decimal_writer)); + + // Just provide the value — Rust resolves (p,s) from schema + { + fluss::GenericRow row; + row.SetInt32(0, 
1); + row.SetDecimal(1, "123.45"); // Rust knows DECIMAL(10,2) + row.SetDecimal(2, "1.00000000"); // Rust knows DECIMAL(28,8) + check("append_decimal", decimal_writer.Append(row)); + } + { + fluss::GenericRow row; + row.SetInt32(0, 2); + row.SetDecimal(1, "-999.99"); + row.SetDecimal(2, "3.14159265"); + check("append_decimal", decimal_writer.Append(row)); + } + { + fluss::GenericRow row; + row.SetInt32(0, 3); + row.SetDecimal(1, "500.00"); + row.SetDecimal(2, "2.71828182"); + check("append_decimal", decimal_writer.Append(row)); + } + check("flush_decimal", decimal_writer.Flush()); + std::cout << "Wrote 3 decimal rows" << std::endl; + + // Scan and read back + fluss::LogScanner decimal_scanner; + check("new_decimal_scanner", decimal_table.NewScan().CreateLogScanner(decimal_scanner)); + check("subscribe_decimal", decimal_scanner.Subscribe(0, 0)); + + fluss::ScanRecords decimal_records; + check("poll_decimal", decimal_scanner.Poll(5000, decimal_records)); + + std::cout << "Scanned decimal records: " << decimal_records.Count() << std::endl; + for (const auto& tb : decimal_records.Buckets()) { + for (const auto& rec : decimal_records.Records(tb)) { + std::cout << " id=" << rec.row.GetInt32(0) << " price=" << rec.row.GetDecimalString(1) + << " amount=" << rec.row.GetDecimalString(2) + << " is_decimal=" << rec.row.IsDecimal(1) << std::endl; + } + } + + // 14) Partitioned table example + std::cout << "\n=== Partitioned Table Example ===" << std::endl; + + fluss::TablePath partitioned_table_path("fluss", "partitioned_table_cpp_v1"); + + // Drop if exists + check("drop_partitioned_table_if_exists", admin.DropTable(partitioned_table_path, true)); + + // Create a partitioned table with a "region" partition key + auto partitioned_schema = fluss::Schema::NewBuilder() + .AddColumn("id", fluss::DataType::Int()) + .AddColumn("region", fluss::DataType::String()) + .AddColumn("value", fluss::DataType::BigInt()) + .Build(); + + auto partitioned_descriptor = 
fluss::TableDescriptor::NewBuilder() + .SetSchema(partitioned_schema) + .SetPartitionKeys({"region"}) + .SetBucketCount(1) + .SetComment("cpp partitioned table example") + .Build(); + + check("create_partitioned_table", + admin.CreateTable(partitioned_table_path, partitioned_descriptor, false)); + std::cout << "Created partitioned table" << std::endl; + + // Create partitions + check("create_partition_US", + admin.CreatePartition(partitioned_table_path, {{"region", "US"}}, true)); + check("create_partition_EU", + admin.CreatePartition(partitioned_table_path, {{"region", "EU"}}, true)); + std::cout << "Created partitions: US, EU" << std::endl; + + // List all partitions + std::vector partition_infos; + check("list_partition_infos", + admin.ListPartitionInfos(partitioned_table_path, partition_infos)); + for (const auto& pi : partition_infos) { + std::cout << " Partition: " << pi.partition_name << " (id=" << pi.partition_id << ")" + << std::endl; + } + + // List partitions with partial spec filter + std::vector us_partition_infos; + check("list_partition_infos_with_spec", + admin.ListPartitionInfos(partitioned_table_path, {{"region", "US"}}, us_partition_infos)); + std::cout << " Filtered (region=US): " << us_partition_infos.size() << " partition(s)" + << std::endl; + + // Write data to partitioned table + fluss::Table partitioned_table; + check("get_partitioned_table", conn.GetTable(partitioned_table_path, partitioned_table)); + + fluss::AppendWriter partitioned_writer; + check("new_partitioned_writer", partitioned_table.NewAppend().CreateWriter(partitioned_writer)); + + struct PartitionedRow { + int id; + const char* region; + int64_t value; + }; + + std::vector partitioned_rows = { + {1, "US", 100}, + {2, "US", 200}, + {3, "EU", 300}, + {4, "EU", 400}, + }; + + for (const auto& r : partitioned_rows) { + fluss::GenericRow row; + row.SetInt32(0, r.id); + row.SetString(1, r.region); + row.SetInt64(2, r.value); + check("append_partitioned", 
partitioned_writer.Append(row)); + } + check("flush_partitioned", partitioned_writer.Flush()); + std::cout << "Wrote " << partitioned_rows.size() << " rows to partitioned table" << std::endl; + + // 14.1) subscribe_partition_buckets: subscribe to each partition individually + std::cout << "\n--- Testing SubscribePartitionBuckets ---" << std::endl; + fluss::LogScanner partition_scanner; + check("new_partition_scanner", partitioned_table.NewScan().CreateLogScanner(partition_scanner)); + + for (const auto& pi : partition_infos) { + check("subscribe_partition_buckets", + partition_scanner.SubscribePartitionBuckets(pi.partition_id, 0, 0)); + std::cout << "Subscribed to partition " << pi.partition_name << std::endl; + } + + fluss::ScanRecords partition_records; + check("poll_partitioned", partition_scanner.Poll(5000, partition_records)); + std::cout << "Scanned " << partition_records.Count() << " records from partitioned table" + << std::endl; + for (const auto& tb : partition_records.Buckets()) { + for (const auto& rec : partition_records.Records(tb)) { + std::cout << " partition_id=" + << (tb.partition_id.has_value() ? 
std::to_string(*tb.partition_id) : "none") + << ", id=" << rec.row.GetInt32(0) << ", region=" << rec.row.GetString(1) + << ", value=" << rec.row.GetInt64(2) << std::endl; + } + } + + // 14.2) subscribe_partition_buckets: batch subscribe to all partitions at once + std::cout << "\n--- Testing SubscribePartitionBuckets (batch) ---" << std::endl; + fluss::LogScanner partition_batch_scanner; + check("new_partition_batch_scanner", + partitioned_table.NewScan().CreateLogScanner(partition_batch_scanner)); + + std::vector partition_subs; + for (const auto& pi : partition_infos) { + partition_subs.push_back({pi.partition_id, 0, 0}); + } + check("subscribe_partition_buckets", + partition_batch_scanner.SubscribePartitionBuckets(partition_subs)); + std::cout << "Batch subscribed to " << partition_subs.size() << " partition+bucket combinations" + << std::endl; + + fluss::ScanRecords partition_batch_records; + check("poll_partition_batch", partition_batch_scanner.Poll(5000, partition_batch_records)); + std::cout << "Scanned " << partition_batch_records.Count() + << " records from batch partition subscription" << std::endl; + for (const auto& tb : partition_batch_records.Buckets()) { + for (const auto& rec : partition_batch_records.Records(tb)) { + std::cout << " id=" << rec.row.GetInt32(0) << ", region=" << rec.row.GetString(1) + << ", value=" << rec.row.GetInt64(2) << std::endl; + } + } + + // 14.3) UnsubscribePartition: unsubscribe from one partition, verify remaining + std::cout << "\n--- Testing UnsubscribePartition ---" << std::endl; + fluss::LogScanner unsub_partition_scanner; + check("new_unsub_partition_scanner", + partitioned_table.NewScan().CreateLogScanner(unsub_partition_scanner)); + + for (const auto& pi : partition_infos) { + check("subscribe_for_unsub", + unsub_partition_scanner.SubscribePartitionBuckets(pi.partition_id, 0, 0)); + } + // Unsubscribe from the first partition + check("unsubscribe_partition", + 
unsub_partition_scanner.UnsubscribePartition(partition_infos[0].partition_id, 0)); + std::cout << "Unsubscribed from partition " << partition_infos[0].partition_name << std::endl; + + fluss::ScanRecords unsub_records; + check("poll_after_unsub", unsub_partition_scanner.Poll(5000, unsub_records)); + std::cout << "After unsubscribe, scanned " << unsub_records.Count() << " records" << std::endl; + for (const auto& tb : unsub_records.Buckets()) { + for (const auto& rec : unsub_records.Records(tb)) { + std::cout << " id=" << rec.row.GetInt32(0) << ", region=" << rec.row.GetString(1) + << ", value=" << rec.row.GetInt64(2) << std::endl; + } + } + + // Cleanup + check("drop_partitioned_table", admin.DropTable(partitioned_table_path, true)); + std::cout << "Dropped partitioned table" << std::endl; + return 0; +} diff --git a/fluss-rust/bindings/cpp/examples/kv_example.cpp b/fluss-rust/bindings/cpp/examples/kv_example.cpp new file mode 100644 index 0000000000..46ed01f682 --- /dev/null +++ b/fluss-rust/bindings/cpp/examples/kv_example.cpp @@ -0,0 +1,537 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include + +#include "fluss.hpp" + +static void check(const char* step, const fluss::Result& r) { + if (!r.Ok()) { + std::cerr << step << " failed: code=" << r.error_code << " msg=" << r.error_message + << std::endl; + std::exit(1); + } +} + +int main() { + // 1) Connect and get Admin + fluss::Configuration config; + config.bootstrap_servers = "127.0.0.1:9123"; + + fluss::Connection conn; + check("create", fluss::Connection::Create(config, conn)); + + fluss::Admin admin; + check("get_admin", conn.GetAdmin(admin)); + + fluss::TablePath kv_table_path("fluss", "kv_table_cpp_v1"); + + // Drop if exists + admin.DropTable(kv_table_path, true); + + // 2) Create a KV table with primary key, including decimal and temporal types + auto kv_schema = fluss::Schema::NewBuilder() + .AddColumn("user_id", fluss::DataType::Int()) + .AddColumn("name", fluss::DataType::String()) + .AddColumn("email", fluss::DataType::String()) + .AddColumn("score", fluss::DataType::Float()) + .AddColumn("balance", fluss::DataType::Decimal(10, 2)) + .AddColumn("birth_date", fluss::DataType::Date()) + .AddColumn("login_time", fluss::DataType::Time()) + .AddColumn("created_at", fluss::DataType::Timestamp()) + .AddColumn("last_seen", fluss::DataType::TimestampLtz()) + .SetPrimaryKeys({"user_id"}) + .Build(); + + auto kv_descriptor = fluss::TableDescriptor::NewBuilder() + .SetSchema(kv_schema) + .SetBucketCount(3) + .SetComment("cpp kv table example") + .Build(); + + check("create_kv_table", admin.CreateTable(kv_table_path, kv_descriptor, false)); + std::cout << "Created KV table with primary key" << std::endl; + + fluss::Table kv_table; + check("get_kv_table", conn.GetTable(kv_table_path, kv_table)); + + // 3) Upsert rows using name-based Set() + // - Set("balance", "1234.56") auto-routes to SetDecimal (schema-aware) + // - Set("created_at", ts) auto-routes to SetTimestampNtz (schema-aware) + // - Set("last_seen", ts) auto-routes to SetTimestampLtz (schema-aware) + std::cout << 
"\n--- Upsert Rows ---" << std::endl; + fluss::UpsertWriter upsert_writer; + check("new_upsert_writer", kv_table.NewUpsert().CreateWriter(upsert_writer)); + + // Fire-and-forget upserts (flush at the end) + { + auto row = kv_table.NewRow(); + row.Set("user_id", 1); + row.Set("name", "Alice"); + row.Set("email", "alice@example.com"); + row.Set("score", 95.5f); + row.Set("balance", "1234.56"); + row.Set("birth_date", fluss::Date::FromYMD(1990, 3, 15)); + row.Set("login_time", fluss::Time::FromHMS(9, 30, 0)); + row.Set("created_at", fluss::Timestamp::FromMillis(1700000000000)); + row.Set("last_seen", fluss::Timestamp::FromMillis(1700000060000)); + check("upsert_1", upsert_writer.Upsert(row)); + } + { + auto row = kv_table.NewRow(); + row.Set("user_id", 2); + row.Set("name", "Bob"); + row.Set("email", "bob@example.com"); + row.Set("score", 87.3f); + row.Set("balance", "567.89"); + row.Set("birth_date", fluss::Date::FromYMD(1985, 7, 22)); + row.Set("login_time", fluss::Time::FromHMS(14, 15, 30)); + row.Set("created_at", fluss::Timestamp::FromMillis(1700000100000)); + row.Set("last_seen", fluss::Timestamp::FromMillis(1700000200000)); + check("upsert_2", upsert_writer.Upsert(row)); + } + + // Per-record acknowledgment + { + auto row = kv_table.NewRow(); + row.Set("user_id", 3); + row.Set("name", "Charlie"); + row.Set("email", "charlie@example.com"); + row.Set("score", 92.0f); + row.Set("balance", "99999.99"); + row.Set("birth_date", fluss::Date::FromYMD(2000, 1, 1)); + row.Set("login_time", fluss::Time::FromHMS(23, 59, 59)); + row.Set("created_at", fluss::Timestamp::FromMillis(1700000300000)); + row.Set("last_seen", fluss::Timestamp::FromMillis(1700000400000)); + fluss::WriteResult wr; + check("upsert_3", upsert_writer.Upsert(row, wr)); + check("upsert_3_wait", wr.Wait()); + std::cout << "Upsert acknowledged by server" << std::endl; + } + + check("upsert_flush", upsert_writer.Flush()); + std::cout << "Upserted 3 rows" << std::endl; + + // 4) Lookup by primary key — verify 
all types round-trip + std::cout << "\n--- Lookup by Primary Key ---" << std::endl; + fluss::Lookuper lookuper; + check("new_lookuper", kv_table.NewLookup().CreateLookuper(lookuper)); + + // Lookup existing key + { + auto pk_row = kv_table.NewRow(); + pk_row.Set("user_id", 1); + + fluss::LookupResult result; + check("lookup_1", lookuper.Lookup(pk_row, result)); + if (result.Found()) { + // Name-based getters — same data as index-based but self-documenting + auto date = result.GetDate("birth_date"); + auto time = result.GetTime("login_time"); + auto created = result.GetTimestamp("created_at"); + auto seen = result.GetTimestamp("last_seen"); + std::cout << "Found user_id=1:" + << "\n name=" << result.GetString("name") + << "\n email=" << result.GetString("email") + << "\n score=" << result.GetFloat32("score") + << "\n balance=" << result.GetDecimalString("balance") + << "\n birth_date=" << date.Year() << "-" << date.Month() << "-" + << date.Day() << "\n login_time=" << time.Hour() << ":" << time.Minute() + << ":" << time.Second() << "\n created_at(ms)=" << created.epoch_millis + << "\n last_seen(ms)=" << seen.epoch_millis << std::endl; + } else { + std::cerr << "ERROR: Expected to find user_id=1" << std::endl; + std::exit(1); + } + } + + // Lookup non-existing key + { + auto pk_row = kv_table.NewRow(); + pk_row.Set("user_id", 999); + + fluss::LookupResult result; + check("lookup_999", lookuper.Lookup(pk_row, result)); + if (!result.Found()) { + std::cout << "user_id=999 not found (expected)" << std::endl; + } else { + std::cerr << "ERROR: Expected user_id=999 to not be found" << std::endl; + std::exit(1); + } + } + + // 4b) Null row round-trip (matches Rust kv_table.rs all_supported_datatypes) + // Upsert a row with all non-PK fields null, lookup, verify IsNull + std::cout << "\n--- Null Row Round-Trip ---" << std::endl; + { + auto row = kv_table.NewRow(); + row.Set("user_id", 100); + row.SetNull(1); // name + row.SetNull(2); // email + row.SetNull(3); // score + 
row.SetNull(4); // balance + row.SetNull(5); // birth_date + row.SetNull(6); // login_time + row.SetNull(7); // created_at + row.SetNull(8); // last_seen + fluss::WriteResult wr; + check("upsert_null_row", upsert_writer.Upsert(row, wr)); + check("upsert_null_row_wait", wr.Wait()); + } + { + auto pk_row = kv_table.NewRow(); + pk_row.Set("user_id", 100); + + fluss::LookupResult result; + check("lookup_null_row", lookuper.Lookup(pk_row, result)); + if (!result.Found()) { + std::cerr << "ERROR: Expected to find user_id=100 (null row)" << std::endl; + std::exit(1); + } + + // Verify PK is not null + if (result.IsNull(0)) { + std::cerr << "ERROR: PK (user_id) should not be null" << std::endl; + std::exit(1); + } + + // Verify all nullable columns are null (matches Rust is_null_at assertions) + bool null_ok = true; + for (size_t i = 1; i < result.FieldCount(); ++i) { + if (!result.IsNull(i)) { + std::cerr << "ERROR: column " << i << " should be null" << std::endl; + null_ok = false; + } + } + if (null_ok) { + std::cout << "Null row verified: all " << (result.FieldCount() - 1) + << " nullable fields are null" << std::endl; + } else { + std::exit(1); + } + } + + // 5) Update via upsert (overwrite existing key) + std::cout << "\n--- Update via Upsert ---" << std::endl; + { + auto row = kv_table.NewRow(); + row.Set("user_id", 1); + row.Set("name", "Alice Updated"); + row.Set("email", "alice.new@example.com"); + row.Set("score", 99.0f); + row.Set("balance", "9999.00"); + row.Set("birth_date", fluss::Date::FromYMD(1990, 3, 15)); + row.Set("login_time", fluss::Time::FromHMS(10, 0, 0)); + row.Set("created_at", fluss::Timestamp::FromMillis(1700000000000)); + row.Set("last_seen", fluss::Timestamp::FromMillis(1700000500000)); + fluss::WriteResult wr; + check("upsert_update", upsert_writer.Upsert(row, wr)); + check("upsert_update_wait", wr.Wait()); + } + + // Verify update + { + auto pk_row = kv_table.NewRow(); + pk_row.Set("user_id", 1); + + fluss::LookupResult result; + 
check("lookup_updated", lookuper.Lookup(pk_row, result)); + if (result.Found() && result.GetString(1) == "Alice Updated") { + std::cout << "Update verified: name=" << result.GetString(1) + << " balance=" << result.GetDecimalString(4) + << " last_seen(ms)=" << result.GetTimestamp(8).epoch_millis << std::endl; + } else { + std::cerr << "ERROR: Update verification failed" << std::endl; + std::exit(1); + } + } + + // 6) Delete by primary key + std::cout << "\n--- Delete by Primary Key ---" << std::endl; + { + auto pk_row = kv_table.NewRow(); + pk_row.Set("user_id", 2); + fluss::WriteResult wr; + check("delete_2", upsert_writer.Delete(pk_row, wr)); + check("delete_2_wait", wr.Wait()); + std::cout << "Deleted user_id=2" << std::endl; + } + + // Verify deletion + { + auto pk_row = kv_table.NewRow(); + pk_row.Set("user_id", 2); + + fluss::LookupResult result; + check("lookup_deleted", lookuper.Lookup(pk_row, result)); + if (!result.Found()) { + std::cout << "Delete verified: user_id=2 not found" << std::endl; + } else { + std::cerr << "ERROR: Expected user_id=2 to be deleted" << std::endl; + std::exit(1); + } + } + + // 7) Partial update by column names + std::cout << "\n--- Partial Update by Column Names ---" << std::endl; + fluss::UpsertWriter partial_writer; + check("new_partial_upsert_writer", kv_table.NewUpsert() + .PartialUpdateByName({"user_id", "balance", "last_seen"}) + .CreateWriter(partial_writer)); + + { + auto row = kv_table.NewRow(); + row.Set("user_id", 3); + row.Set("balance", "50000.00"); + row.Set("last_seen", fluss::Timestamp::FromMillis(1700000999000)); + fluss::WriteResult wr; + check("partial_upsert", partial_writer.Upsert(row, wr)); + check("partial_upsert_wait", wr.Wait()); + std::cout << "Partial update: set balance=50000.00, last_seen for user_id=3" << std::endl; + } + + // Verify partial update (other fields unchanged) + { + auto pk_row = kv_table.NewRow(); + pk_row.Set("user_id", 3); + + fluss::LookupResult result; + check("lookup_partial", 
lookuper.Lookup(pk_row, result)); + if (result.Found()) { + std::cout << "Partial update verified:" + << "\n name=" << result.GetString(1) << " (unchanged)" + << "\n balance=" << result.GetDecimalString(4) << " (updated)" + << "\n last_seen(ms)=" << result.GetTimestamp(8).epoch_millis << " (updated)" + << std::endl; + } else { + std::cerr << "ERROR: Expected to find user_id=3" << std::endl; + std::exit(1); + } + } + + // 8) Partial update by column indices (using index-based setters for lower overhead) + std::cout << "\n--- Partial Update by Column Indices ---" << std::endl; + fluss::UpsertWriter partial_writer_idx; + // Columns: 0=user_id (PK), 1=name — update name only + check("new_partial_upsert_writer_idx", + kv_table.NewUpsert().PartialUpdateByIndex({0, 1}).CreateWriter(partial_writer_idx)); + + { + // Index-based setters: lighter than name-based, useful for hot paths + fluss::GenericRow row; + row.SetInt32(0, 3); // user_id (PK) + row.SetString(1, "Charlie Updated"); // name + fluss::WriteResult wr; + check("partial_upsert_idx", partial_writer_idx.Upsert(row, wr)); + check("partial_upsert_idx_wait", wr.Wait()); + std::cout << "Partial update by indices: set name='Charlie Updated' for user_id=3" + << std::endl; + } + + // Verify: name changed, balance/last_seen unchanged from previous partial update + { + auto pk_row = kv_table.NewRow(); + pk_row.Set("user_id", 3); + + fluss::LookupResult result; + check("lookup_partial_idx", lookuper.Lookup(pk_row, result)); + if (result.Found()) { + std::cout << "Partial update by indices verified:" + << "\n name=" << result.GetString(1) << " (updated)" + << "\n balance=" << result.GetDecimalString(4) << " (unchanged)" + << "\n last_seen(ms)=" << result.GetTimestamp(8).epoch_millis + << " (unchanged)" << std::endl; + } else { + std::cerr << "ERROR: Expected to find user_id=3" << std::endl; + std::exit(1); + } + } + + // Cleanup + check("drop_kv_table", admin.DropTable(kv_table_path, true)); + + // 9) Partitioned KV table + 
std::cout << "\n--- Partitioned KV Table ---" << std::endl; + fluss::TablePath partitioned_kv_path("fluss", "partitioned_kv_cpp_v1"); + admin.DropTable(partitioned_kv_path, true); + + // PK columns intentionally interleaved with non-PK columns to verify + // that lookup correctly builds a dense PK-only row (not sparse full-width). + auto partitioned_kv_schema = fluss::Schema::NewBuilder() + .AddColumn("region", fluss::DataType::String()) + .AddColumn("score", fluss::DataType::BigInt()) + .AddColumn("user_id", fluss::DataType::Int()) + .AddColumn("name", fluss::DataType::String()) + .SetPrimaryKeys({"region", "user_id"}) + .Build(); + + auto partitioned_kv_descriptor = fluss::TableDescriptor::NewBuilder() + .SetSchema(partitioned_kv_schema) + .SetPartitionKeys({"region"}) + .SetComment("partitioned kv table example") + .Build(); + + check("create_partitioned_kv", + admin.CreateTable(partitioned_kv_path, partitioned_kv_descriptor, false)); + std::cout << "Created partitioned KV table" << std::endl; + + // Create partitions + check("create_US", admin.CreatePartition(partitioned_kv_path, {{"region", "US"}})); + check("create_EU", admin.CreatePartition(partitioned_kv_path, {{"region", "EU"}})); + check("create_APAC", admin.CreatePartition(partitioned_kv_path, {{"region", "APAC"}})); + std::cout << "Created partitions: US, EU, APAC" << std::endl; + + fluss::Table partitioned_kv_table; + check("get_partitioned_kv_table", conn.GetTable(partitioned_kv_path, partitioned_kv_table)); + + fluss::UpsertWriter partitioned_writer; + check("new_partitioned_writer", + partitioned_kv_table.NewUpsert().CreateWriter(partitioned_writer)); + + // Upsert rows across partitions + // Column order: region(0), score(1), user_id(2), name(3) + struct TestRow { + const char* region; + int64_t score; + int32_t user_id; + const char* name; + }; + TestRow test_data[] = { + {"US", 100, 1, "Gustave"}, {"US", 200, 2, "Lune"}, {"EU", 150, 1, "Sciel"}, + {"EU", 250, 2, "Maelle"}, {"APAC", 300, 1, 
"Noco"}, + }; + + for (const auto& td : test_data) { + auto row = partitioned_kv_table.NewRow(); + row.Set("region", td.region); + row.Set("score", td.score); + row.Set("user_id", td.user_id); + row.Set("name", td.name); + check("partitioned_upsert", partitioned_writer.Upsert(row)); + } + check("partitioned_flush", partitioned_writer.Flush()); + std::cout << "Upserted 5 rows across 3 partitions" << std::endl; + + // Lookup all rows + fluss::Lookuper partitioned_lookuper; + check("new_partitioned_lookuper", + partitioned_kv_table.NewLookup().CreateLookuper(partitioned_lookuper)); + + for (const auto& td : test_data) { + auto pk = partitioned_kv_table.NewRow(); + pk.Set("region", td.region); + pk.Set("user_id", td.user_id); + + fluss::LookupResult result; + check("partitioned_lookup", partitioned_lookuper.Lookup(pk, result)); + if (!result.Found()) { + std::cerr << "ERROR: Expected to find region=" << td.region << " user_id=" << td.user_id + << std::endl; + std::exit(1); + } + if (result.GetString(3) != td.name || result.GetInt64(1) != td.score) { + std::cerr << "ERROR: Data mismatch for region=" << td.region + << " user_id=" << td.user_id << std::endl; + std::exit(1); + } + } + std::cout << "All 5 rows verified across partitions" << std::endl; + + // Update within a partition + { + auto row = partitioned_kv_table.NewRow(); + row.Set("region", "US"); + row.Set("score", static_cast(999)); + row.Set("user_id", 1); + row.Set("name", "Gustave Updated"); + fluss::WriteResult wr; + check("partitioned_update", partitioned_writer.Upsert(row, wr)); + check("partitioned_update_wait", wr.Wait()); + } + { + auto pk = partitioned_kv_table.NewRow(); + pk.Set("region", "US"); + pk.Set("user_id", 1); + fluss::LookupResult result; + check("partitioned_lookup_updated", partitioned_lookuper.Lookup(pk, result)); + if (!result.Found() || result.GetString(3) != "Gustave Updated" || + result.GetInt64(1) != 999) { + std::cerr << "ERROR: Partition update verification failed" << std::endl; + 
std::exit(1); + } + std::cout << "Update verified: US/1 name=" << result.GetString(3) + << " score=" << result.GetInt64(1) << std::endl; + } + + // Lookup in non-existent partition + { + auto pk = partitioned_kv_table.NewRow(); + pk.Set("region", "UNKNOWN"); + pk.Set("user_id", 1); + fluss::LookupResult result; + check("partitioned_lookup_unknown", partitioned_lookuper.Lookup(pk, result)); + if (result.Found()) { + std::cerr << "ERROR: Expected UNKNOWN partition lookup to return not found" + << std::endl; + std::exit(1); + } + std::cout << "UNKNOWN partition lookup: not found (expected)" << std::endl; + } + + // Delete within a partition + { + auto pk = partitioned_kv_table.NewRow(); + pk.Set("region", "EU"); + pk.Set("user_id", 1); + fluss::WriteResult wr; + check("partitioned_delete", partitioned_writer.Delete(pk, wr)); + check("partitioned_delete_wait", wr.Wait()); + } + { + auto pk = partitioned_kv_table.NewRow(); + pk.Set("region", "EU"); + pk.Set("user_id", 1); + fluss::LookupResult result; + check("partitioned_lookup_deleted", partitioned_lookuper.Lookup(pk, result)); + if (result.Found()) { + std::cerr << "ERROR: Expected EU/1 to be deleted" << std::endl; + std::exit(1); + } + std::cout << "Delete verified: EU/1 not found" << std::endl; + } + + // Verify other record in same partition still exists + { + auto pk = partitioned_kv_table.NewRow(); + pk.Set("region", "EU"); + pk.Set("user_id", 2); + fluss::LookupResult result; + check("partitioned_lookup_eu2", partitioned_lookuper.Lookup(pk, result)); + if (!result.Found() || result.GetString(3) != "Maelle") { + std::cerr << "ERROR: Expected EU/2 (Maelle) to still exist" << std::endl; + std::exit(1); + } + std::cout << "EU/2 still exists: name=" << result.GetString(3) << std::endl; + } + + check("drop_partitioned_kv", admin.DropTable(partitioned_kv_path, true)); + std::cout << "\nKV table example completed successfully!" 
<< std::endl; + + return 0; +} diff --git a/fluss-rust/bindings/cpp/include/fluss.hpp b/fluss-rust/bindings/cpp/include/fluss.hpp new file mode 100644 index 0000000000..d019b42787 --- /dev/null +++ b/fluss-rust/bindings/cpp/include/fluss.hpp @@ -0,0 +1,1633 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// Forward declare Arrow classes to avoid including heavy Arrow headers in header +namespace arrow { +class RecordBatch; +} + +namespace fluss { + +namespace ffi { +struct Connection; +struct Admin; +struct Table; +struct AppendWriter; +struct WriteResult; +struct LogScanner; +struct UpsertWriter; +struct Lookuper; +struct ScanResultInner; +struct GenericRowInner; +struct LookupResultInner; +struct ArrayWriterInner; +struct ArrayViewInner; +} // namespace ffi + +/// Named constants for Fluss API error codes. +/// +/// Server API errors have error_code > 0 or == -1. +/// Client-side errors have error_code == CLIENT_ERROR (-2). +/// These constants match the Rust core FlussError enum and are stable across protocol versions. 
/// Codes introduced by newer servers still pass through untouched: error_code is carried as a
/// raw int rather than a closed enum, so the names below are a convenience, not a complete list.
struct ErrorCode {
    // ── Client-side / sentinel values ────────────────────────────────
    /// Error raised on the client side (not part of the server API protocol); inspect
    /// error_message for details.
    static constexpr int CLIENT_ERROR = -2;
    /// Success, no error.
    static constexpr int NONE = 0;
    /// The server hit an unexpected error while processing the request.
    static constexpr int UNKNOWN_SERVER_ERROR = -1;

    // ── Server API error codes ───────────────────────────────────────
    /// The server disconnected before a response was received.
    static constexpr int NETWORK_EXCEPTION = 1;
    /// The version of API is not supported.
    static constexpr int UNSUPPORTED_VERSION = 2;
    /// The message failed its CRC checksum, exceeds the valid size, or is otherwise corrupt.
    static constexpr int CORRUPT_MESSAGE = 3;
    /// The database does not exist.
    static constexpr int DATABASE_NOT_EXIST = 4;
    /// The database is not empty.
    static constexpr int DATABASE_NOT_EMPTY = 5;
    /// The database already exists.
    static constexpr int DATABASE_ALREADY_EXIST = 6;
    /// The table does not exist.
    static constexpr int TABLE_NOT_EXIST = 7;
    /// The table already exists.
    static constexpr int TABLE_ALREADY_EXIST = 8;
    /// The schema does not exist.
    static constexpr int SCHEMA_NOT_EXIST = 9;
    /// Exception occurred while storing data for log in server.
    static constexpr int LOG_STORAGE_EXCEPTION = 10;
    /// Exception occurred while storing data for kv in server.
    static constexpr int KV_STORAGE_EXCEPTION = 11;
    /// Not leader or follower.
    static constexpr int NOT_LEADER_OR_FOLLOWER = 12;
    /// The record is too large.
    static constexpr int RECORD_TOO_LARGE_EXCEPTION = 13;
    /// The record is corrupt.
    static constexpr int CORRUPT_RECORD_EXCEPTION = 14;
    /// The client attempted an operation on an invalid table.
    static constexpr int INVALID_TABLE_EXCEPTION = 15;
    /// The client attempted an operation on an invalid database.
    static constexpr int INVALID_DATABASE_EXCEPTION = 16;
    /// The replication factor is larger than the number of available tablet servers.
    static constexpr int INVALID_REPLICATION_FACTOR = 17;
    /// Produce request specified an invalid value for required acks.
    static constexpr int INVALID_REQUIRED_ACKS = 18;
    /// The log offset is out of range.
    static constexpr int LOG_OFFSET_OUT_OF_RANGE_EXCEPTION = 19;
    /// The table is not a primary key table.
    static constexpr int NON_PRIMARY_KEY_TABLE_EXCEPTION = 20;
    /// The table or bucket does not exist.
    static constexpr int UNKNOWN_TABLE_OR_BUCKET_EXCEPTION = 21;
    /// The update version is invalid.
    static constexpr int INVALID_UPDATE_VERSION_EXCEPTION = 22;
    /// The coordinator is invalid.
    static constexpr int INVALID_COORDINATOR_EXCEPTION = 23;
    /// The leader epoch is invalid.
    static constexpr int FENCED_LEADER_EPOCH_EXCEPTION = 24;
    /// The request timed out.
    static constexpr int REQUEST_TIME_OUT = 25;
    /// The general storage exception.
    static constexpr int STORAGE_EXCEPTION = 26;
    /// The server did not attempt to execute this operation.
    static constexpr int OPERATION_NOT_ATTEMPTED_EXCEPTION = 27;
    /// Records were written to the server, but to fewer in-sync replicas than required.
    static constexpr int NOT_ENOUGH_REPLICAS_AFTER_APPEND_EXCEPTION = 28;
    /// Messages were rejected because there are fewer in-sync replicas than required.
    static constexpr int NOT_ENOUGH_REPLICAS_EXCEPTION = 29;
    /// Get file access security token exception.
    static constexpr int SECURITY_TOKEN_EXCEPTION = 30;
    /// The tablet server received an out of order sequence batch.
    static constexpr int OUT_OF_ORDER_SEQUENCE_EXCEPTION = 31;
    /// The tablet server received a duplicate sequence batch.
    static constexpr int DUPLICATE_SEQUENCE_EXCEPTION = 32;
    /// The tablet server could not locate the writer metadata.
    static constexpr int UNKNOWN_WRITER_ID_EXCEPTION = 33;
    /// The requested column projection is invalid.
    static constexpr int INVALID_COLUMN_PROJECTION = 34;
    /// The requested target column to write is invalid.
    static constexpr int INVALID_TARGET_COLUMN = 35;
    /// The partition does not exist.
    static constexpr int PARTITION_NOT_EXISTS = 36;
    /// The table is not partitioned.
    static constexpr int TABLE_NOT_PARTITIONED_EXCEPTION = 37;
    /// The timestamp is invalid.
    static constexpr int INVALID_TIMESTAMP_EXCEPTION = 38;
    /// The config is invalid.
    static constexpr int INVALID_CONFIG_EXCEPTION = 39;
    /// The lake storage is not configured.
    static constexpr int LAKE_STORAGE_NOT_CONFIGURED_EXCEPTION = 40;
    /// The kv snapshot does not exist.
    static constexpr int KV_SNAPSHOT_NOT_EXIST = 41;
    /// The partition already exists.
    static constexpr int PARTITION_ALREADY_EXISTS = 42;
    /// The partition spec is invalid.
    static constexpr int PARTITION_SPEC_INVALID_EXCEPTION = 43;
    /// There is no currently available leader for the given partition.
    static constexpr int LEADER_NOT_AVAILABLE_EXCEPTION = 44;
    /// Exceed the maximum number of partitions.
    static constexpr int PARTITION_MAX_NUM_EXCEPTION = 45;
    /// Authentication failed.
    static constexpr int AUTHENTICATE_EXCEPTION = 46;
    /// Security is disabled.
    static constexpr int SECURITY_DISABLED_EXCEPTION = 47;
    /// Authorization failed.
    static constexpr int AUTHORIZATION_EXCEPTION = 48;
    /// Exceed the maximum number of buckets.
    static constexpr int BUCKET_MAX_NUM_EXCEPTION = 49;
    /// The tiering epoch is invalid.
    static constexpr int FENCED_TIERING_EPOCH_EXCEPTION = 50;
    /// Authentication failed with retriable exception.
    static constexpr int RETRIABLE_AUTHENTICATE_EXCEPTION = 51;
    /// The server rack info is invalid.
    static constexpr int INVALID_SERVER_RACK_INFO_EXCEPTION = 52;
    /// The lake snapshot does not exist.
    static constexpr int LAKE_SNAPSHOT_NOT_EXIST = 53;
    /// The lake table already exists.
    static constexpr int LAKE_TABLE_ALREADY_EXIST = 54;
    /// The new ISR contains at least one ineligible replica.
    static constexpr int INELIGIBLE_REPLICA_EXCEPTION = 55;
    /// The alter table is invalid.
    static constexpr int INVALID_ALTER_TABLE_EXCEPTION = 56;
    /// Deletion operations are disabled on this table.
    static constexpr int DELETION_DISABLED_EXCEPTION = 57;

    /// Reports whether a request that failed with `code` may succeed when retried.
    /// Mirrors Java's RetriableException hierarchy. Any code not listed in the
    /// switch below (including NONE and CLIENT_ERROR) yields false.
    static constexpr bool IsRetriable(int32_t code) {
        switch (code) {
            case NETWORK_EXCEPTION:
            case CORRUPT_MESSAGE:
            case SCHEMA_NOT_EXIST:
            case LOG_STORAGE_EXCEPTION:
            case KV_STORAGE_EXCEPTION:
            case NOT_LEADER_OR_FOLLOWER:
            case CORRUPT_RECORD_EXCEPTION:
            case UNKNOWN_TABLE_OR_BUCKET_EXCEPTION:
            case REQUEST_TIME_OUT:
            case STORAGE_EXCEPTION:
            case NOT_ENOUGH_REPLICAS_AFTER_APPEND_EXCEPTION:
            case NOT_ENOUGH_REPLICAS_EXCEPTION:
            case LEADER_NOT_AVAILABLE_EXCEPTION:
                return true;
            default:
                return false;
        }
    }
};
/// A point in time held as whole milliseconds since the epoch plus a
/// sub-millisecond remainder expressed in nanoseconds.
struct Timestamp {
    /// Largest value accepted for nano_of_millisecond.
    static constexpr int32_t kMaxNanoOfMillisecond = 999999;
    /// Number of nanoseconds in one millisecond.
    static constexpr int64_t kNanosPerMilli = 1000000;

    int64_t epoch_millis{0};
    int32_t nano_of_millisecond{0};

    /// Builds a timestamp from whole milliseconds; the nanosecond part is zero.
    static Timestamp FromMillis(int64_t ms) { return {ms, 0}; }

    /// Builds a timestamp from milliseconds plus a nanosecond remainder.
    /// `nanos` is clamped into [0, kMaxNanoOfMillisecond] rather than rejected.
    static Timestamp FromMillisNanos(int64_t ms, int32_t nanos) {
        if (nanos < 0) {
            nanos = 0;
        } else if (nanos > kMaxNanoOfMillisecond) {
            nanos = kMaxNanoOfMillisecond;
        }
        return {ms, nanos};
    }

    /// Converts a system_clock time point.
    ///
    /// The split is a floor division: for time points before the epoch the
    /// millisecond part is rounded toward negative infinity so that
    /// nano_of_millisecond always ends up in [0, kNanosPerMilli).
    static Timestamp FromTimePoint(std::chrono::system_clock::time_point tp) {
        const int64_t total_ns =
            std::chrono::duration_cast<std::chrono::nanoseconds>(tp.time_since_epoch()).count();
        int64_t millis = total_ns / kNanosPerMilli;
        // C++ integer division truncates toward zero, so the remainder is
        // negative for pre-epoch inputs; fix it up in 64-bit before narrowing.
        int64_t remainder_ns = total_ns % kNanosPerMilli;
        if (remainder_ns < 0) {
            remainder_ns += kNanosPerMilli;
            millis -= 1;
        }
        return {millis, static_cast<int32_t>(remainder_ns)};
    }
};
static DataType String() { return DataType(TypeId::String); } + static DataType Bytes() { return DataType(TypeId::Bytes); } + static DataType Date() { return DataType(TypeId::Date); } + static DataType Time() { return DataType(TypeId::Time); } + static DataType Timestamp(int32_t precision = 6) { + return DataType(TypeId::Timestamp, precision, 0); + } + static DataType TimestampLtz(int32_t precision = 6) { + return DataType(TypeId::TimestampLtz, precision, 0); + } + static DataType Decimal(int32_t precision, int32_t scale) { + return DataType(TypeId::Decimal, precision, scale); + } + static DataType Char(int32_t length) { return DataType(TypeId::Char, length, 0); } + static DataType Binary(int32_t length) { return DataType(TypeId::Binary, length, 0); } + /// Constructs an `ARRAY` type. The element DataType (possibly + /// itself an array) is deep-copied into a shared owning handle so that + /// copies of the outer DataType remain cheap while the element lives + /// as long as any reference exists. + static DataType Array(DataType element) { + DataType dt(TypeId::Array, 0, 0); + dt.element_type_ = std::make_shared(std::move(element)); + return dt; + } + + TypeId id() const { return id_; } + int32_t precision() const { return precision_; } + int32_t scale() const { return scale_; } + bool nullable() const { return nullable_; } + /// Returns the element type of an ARRAY. Returns `nullptr` for non-array + /// types. The returned pointer is valid as long as this DataType (or a + /// copy holding the same shared element) is alive. + const DataType* element_type() const { return element_type_.get(); } + + /// Returns a copy of this DataType with nullable set to false. 
/// Names a table by the database it lives in and its own table name.
struct TablePath {
    std::string database_name;
    std::string table_name;

    /// Creates a path with both names empty.
    TablePath() = default;

    /// Creates a path from a database name and a table name; both strings are
    /// moved into the corresponding members.
    TablePath(std::string db, std::string tbl)
        : database_name(std::move(db)), table_name(std::move(tbl)) {}

    /// Renders the path as the database name, a dot, then the table name.
    std::string ToString() const {
        std::string rendered(database_name);
        rendered += ".";
        rendered += table_name;
        return rendered;
    }
};
TableDescriptor{std::move(schema_), std::move(partition_keys_), + bucket_count_, std::move(bucket_keys_), + std::move(properties_), std::move(custom_properties_), + std::move(comment_)}; + } + + private: + Schema schema_; + std::vector partition_keys_; + int32_t bucket_count_{0}; + std::vector bucket_keys_; + std::unordered_map properties_; + std::unordered_map custom_properties_; + std::string comment_; + }; + + static Builder NewBuilder() { return Builder(); } +}; + +struct TableInfo { + int64_t table_id; + int32_t schema_id; + TablePath table_path; + int64_t created_time; + int64_t modified_time; + std::vector primary_keys; + std::vector bucket_keys; + std::vector partition_keys; + int32_t num_buckets; + bool has_primary_key; + bool is_partitioned; + std::unordered_map properties; + std::unordered_map custom_properties; + std::string comment; + Schema schema; +}; + +namespace detail { +struct ColumnInfo { + size_t index; + TypeId type_id; +}; +using ColumnMap = std::unordered_map; + +inline size_t ResolveColumn(const ColumnMap& map, const std::string& name) { + auto it = map.find(name); + if (it == map.end()) { + throw std::runtime_error("Unknown column '" + name + "'"); + } + return it->second.index; +} + +// Forward declaration so NamedGetters can declare GetArrayView(...) even +// though the concrete class is defined further down. +} // namespace detail +class ArrayView; +namespace detail { + +/// CRTP mixin that adds name-based getters to any class with index-based getters. +/// Derived must provide: `size_t Resolve(const std::string&) const` +/// and all the index-based getters (IsNull(idx), GetBool(idx), etc.). 
+template +struct NamedGetters { + bool IsNull(const std::string& n) const { return Self().IsNull(Self().Resolve(n)); } + bool GetBool(const std::string& n) const { return Self().GetBool(Self().Resolve(n)); } + int32_t GetInt32(const std::string& n) const { return Self().GetInt32(Self().Resolve(n)); } + int64_t GetInt64(const std::string& n) const { return Self().GetInt64(Self().Resolve(n)); } + float GetFloat32(const std::string& n) const { return Self().GetFloat32(Self().Resolve(n)); } + double GetFloat64(const std::string& n) const { return Self().GetFloat64(Self().Resolve(n)); } + std::string_view GetString(const std::string& n) const { + return Self().GetString(Self().Resolve(n)); + } + std::pair GetBytes(const std::string& n) const { + return Self().GetBytes(Self().Resolve(n)); + } + fluss::Date GetDate(const std::string& n) const { return Self().GetDate(Self().Resolve(n)); } + fluss::Time GetTime(const std::string& n) const { return Self().GetTime(Self().Resolve(n)); } + fluss::Timestamp GetTimestamp(const std::string& n) const { + return Self().GetTimestamp(Self().Resolve(n)); + } + std::string GetDecimalString(const std::string& n) const { + return Self().GetDecimalString(Self().Resolve(n)); + } + size_t GetArraySize(const std::string& n) const { + return Self().GetArraySize(Self().Resolve(n)); + } + TypeId GetArrayElementType(const std::string& n) const { + return Self().GetArrayElementType(Self().Resolve(n)); + } + bool IsArrayElementNull(const std::string& n, size_t element) const { + return Self().IsArrayElementNull(Self().Resolve(n), element); + } + bool GetArrayBool(const std::string& n, size_t element) const { + return Self().GetArrayBool(Self().Resolve(n), element); + } + int32_t GetArrayInt32(const std::string& n, size_t element) const { + return Self().GetArrayInt32(Self().Resolve(n), element); + } + int64_t GetArrayInt64(const std::string& n, size_t element) const { + return Self().GetArrayInt64(Self().Resolve(n), element); + } + float 
GetArrayFloat32(const std::string& n, size_t element) const { + return Self().GetArrayFloat32(Self().Resolve(n), element); + } + double GetArrayFloat64(const std::string& n, size_t element) const { + return Self().GetArrayFloat64(Self().Resolve(n), element); + } + std::string GetArrayString(const std::string& n, size_t element) const { + return Self().GetArrayString(Self().Resolve(n), element); + } + std::vector GetArrayBytes(const std::string& n, size_t element) const { + return Self().GetArrayBytes(Self().Resolve(n), element); + } + fluss::Date GetArrayDate(const std::string& n, size_t element) const { + return Self().GetArrayDate(Self().Resolve(n), element); + } + fluss::Time GetArrayTime(const std::string& n, size_t element) const { + return Self().GetArrayTime(Self().Resolve(n), element); + } + fluss::Timestamp GetArrayTimestamp(const std::string& n, size_t element) const { + return Self().GetArrayTimestamp(Self().Resolve(n), element); + } + std::string GetArrayDecimalString(const std::string& n, size_t element) const { + return Self().GetArrayDecimalString(Self().Resolve(n), element); + } + // Definition appears below the ArrayView class; return-by-value requires + // the complete type so we cannot inline the body here. + ArrayView GetArrayView(const std::string& n) const; + + private: + const Derived& Self() const { return static_cast(*this); } +}; + +struct ScanData { + ffi::ScanResultInner* raw; + ColumnMap columns; + + ScanData(ffi::ScanResultInner* r, ColumnMap cols) : raw(r), columns(std::move(cols)) {} + ~ScanData(); + + ScanData(const ScanData&) = delete; + ScanData& operator=(const ScanData&) = delete; +}; +} // namespace detail + +/** + * @brief Read-only view over a FlussArray column value. + * + * Obtained from RowView::GetArrayView() / LookupResult::GetArrayView(), and + * recursively from ArrayView::GetArray() for nested `ARRAY>` + * columns. Owns an opaque Rust handle (FlussArray + element DataType) that + * is released on destruction. 
Move-only. + */ +class ArrayView { + public: + ~ArrayView() noexcept; + + ArrayView(const ArrayView&) = delete; + ArrayView& operator=(const ArrayView&) = delete; + ArrayView(ArrayView&& other) noexcept; + ArrayView& operator=(ArrayView&& other) noexcept; + + size_t Size() const noexcept; + TypeId ElementType() const noexcept; + bool IsNull(size_t element) const; + + bool GetBool(size_t element) const; + int32_t GetInt32(size_t element) const; + int64_t GetInt64(size_t element) const; + float GetFloat32(size_t element) const; + double GetFloat64(size_t element) const; + std::string GetString(size_t element) const; + std::vector GetBytes(size_t element) const; + fluss::Date GetDate(size_t element) const; + fluss::Time GetTime(size_t element) const; + fluss::Timestamp GetTimestampNtz(size_t element) const; + fluss::Timestamp GetTimestampLtz(size_t element) const; + std::string GetDecimalString(size_t element) const; + ArrayView GetArray(size_t element) const; + + private: + friend class RowView; + friend class LookupResult; + explicit ArrayView(ffi::ArrayViewInner* inner) : inner_(inner) {} + void Destroy() noexcept; + ffi::ArrayViewInner* inner_{nullptr}; +}; + +namespace detail { +template +inline ArrayView NamedGetters::GetArrayView(const std::string& n) const { + return Self().GetArrayView(Self().Resolve(n)); +} +} // namespace detail + +class ArrayWriter { + public: + ArrayWriter(size_t size, DataType element_type); + ~ArrayWriter() noexcept; + + ArrayWriter(const ArrayWriter&) = delete; + ArrayWriter& operator=(const ArrayWriter&) = delete; + ArrayWriter(ArrayWriter&& other) noexcept; + ArrayWriter& operator=(ArrayWriter&& other) noexcept; + + bool Available() const; + size_t Size() const noexcept; + + void SetNull(size_t idx); + void SetBool(size_t idx, bool v); + void SetInt32(size_t idx, int32_t v); + void SetInt64(size_t idx, int64_t v); + void SetFloat32(size_t idx, float v); + void SetFloat64(size_t idx, double v); + void SetString(size_t idx, const 
std::string& v); + void SetBytes(size_t idx, const std::vector& v); + void SetDate(size_t idx, fluss::Date d); + void SetTime(size_t idx, fluss::Time t); + void SetTimestampNtz(size_t idx, fluss::Timestamp ts); + void SetTimestampLtz(size_t idx, fluss::Timestamp ts); + void SetDecimal(size_t idx, const std::string& value); + void SetArray(size_t idx, ArrayWriter&& nested); + + private: + friend class GenericRow; + void Destroy() noexcept; + ffi::ArrayWriterInner* inner_{nullptr}; + DataType element_type_; +}; + +class GenericRow { + public: + GenericRow(); + explicit GenericRow(size_t field_count); + ~GenericRow() noexcept; + + GenericRow(const GenericRow&) = delete; + GenericRow& operator=(const GenericRow&) = delete; + GenericRow(GenericRow&& other) noexcept; + GenericRow& operator=(GenericRow&& other) noexcept; + + bool Available() const; + void Reset(); + + // ── Index-based setters ────────────────────────────────────────── + void SetNull(size_t idx); + void SetBool(size_t idx, bool v); + void SetInt32(size_t idx, int32_t v); + void SetInt64(size_t idx, int64_t v); + void SetFloat32(size_t idx, float v); + void SetFloat64(size_t idx, double v); + void SetString(size_t idx, std::string v); + void SetBytes(size_t idx, std::vector v); + void SetDate(size_t idx, fluss::Date d); + void SetTime(size_t idx, fluss::Time t); + void SetTimestampNtz(size_t idx, fluss::Timestamp ts); + void SetTimestampLtz(size_t idx, fluss::Timestamp ts); + void SetDecimal(size_t idx, const std::string& value); + void SetArray(size_t idx, ArrayWriter&& writer); + + // ── Name-based setters (require schema — see Table::NewRow()) ─── + void Set(const std::string& name, std::nullptr_t) { SetNull(Resolve(name)); } + void Set(const std::string& name, bool v) { SetBool(Resolve(name), v); } + void Set(const std::string& name, int32_t v) { SetInt32(Resolve(name), v); } + void Set(const std::string& name, int64_t v) { SetInt64(Resolve(name), v); } + void Set(const std::string& name, float v) { 
SetFloat32(Resolve(name), v); } + void Set(const std::string& name, double v) { SetFloat64(Resolve(name), v); } + // const char* overload to prevent "string literal" -> bool conversion + void Set(const std::string& name, const char* v) { + auto [idx, type] = ResolveColumn(name); + if (type == TypeId::Decimal) { + SetDecimal(idx, v); + } else if (type == TypeId::String) { + SetString(idx, v); + } else { + throw std::runtime_error("GenericRow::Set: column '" + name + + "' is not a string or decimal column"); + } + } + void Set(const std::string& name, std::string v) { + auto [idx, type] = ResolveColumn(name); + if (type == TypeId::Decimal) { + SetDecimal(idx, v); + } else if (type == TypeId::String) { + SetString(idx, std::move(v)); + } else { + throw std::runtime_error("GenericRow::Set: column '" + name + + "' is not a string or decimal column"); + } + } + void Set(const std::string& name, std::vector v) { + SetBytes(Resolve(name), std::move(v)); + } + void Set(const std::string& name, fluss::Date d) { SetDate(Resolve(name), d); } + void Set(const std::string& name, fluss::Time t) { SetTime(Resolve(name), t); } + void Set(const std::string& name, fluss::Timestamp ts) { + auto [idx, type] = ResolveColumn(name); + if (type == TypeId::TimestampLtz) { + SetTimestampLtz(idx, ts); + } else if (type == TypeId::Timestamp) { + SetTimestampNtz(idx, ts); + } else { + throw std::runtime_error("GenericRow::Set: column '" + name + + "' is not a timestamp column"); + } + } + void Set(const std::string& name, ArrayWriter&& writer) { SetArray(Resolve(name), std::move(writer)); } + + private: + friend class Table; + friend class AppendWriter; + friend class UpsertWriter; + friend class Lookuper; + + using ColumnInfo = detail::ColumnInfo; + using ColumnMap = detail::ColumnMap; + + size_t Resolve(const std::string& name) const { return ResolveColumn(name).index; } + + const ColumnInfo& ResolveColumn(const std::string& name) const { + if (!column_map_) { + throw std::runtime_error( + 
"GenericRow: name-based Set() requires a schema. " + "Use Table::NewRow() to create a schema-aware row."); + } + auto it = column_map_->find(name); + if (it == column_map_->end()) { + throw std::runtime_error("GenericRow: unknown column '" + name + "'"); + } + return it->second; + } + + void Destroy() noexcept; + + ffi::GenericRowInner* inner_{nullptr}; + std::shared_ptr column_map_; +}; + +/// Read-only row view for scan results. Zero-copy access to string and bytes data. +/// +/// RowView shares ownership of the underlying scan data via reference counting, +/// so it can safely outlive the ScanRecords that produced it. +class RowView : public detail::NamedGetters { + friend struct detail::NamedGetters; + + public: + RowView(std::shared_ptr data, size_t bucket_idx, size_t rec_idx) + : data_(std::move(data)), bucket_idx_(bucket_idx), rec_idx_(rec_idx) {} + + // ── Index-based getters ────────────────────────────────────────── + size_t FieldCount() const; + TypeId GetType(size_t idx) const; + bool IsNull(size_t idx) const; + bool GetBool(size_t idx) const; + int32_t GetInt32(size_t idx) const; + int64_t GetInt64(size_t idx) const; + float GetFloat32(size_t idx) const; + double GetFloat64(size_t idx) const; + std::string_view GetString(size_t idx) const; + std::pair GetBytes(size_t idx) const; + fluss::Date GetDate(size_t idx) const; + fluss::Time GetTime(size_t idx) const; + fluss::Timestamp GetTimestamp(size_t idx) const; + bool IsDecimal(size_t idx) const; + std::string GetDecimalString(size_t idx) const; + + // ── Array getters ──────────────────────────────────────────────── + size_t GetArraySize(size_t idx) const; + TypeId GetArrayElementType(size_t idx) const; + bool IsArrayElementNull(size_t idx, size_t element) const; + bool GetArrayBool(size_t idx, size_t element) const; + int32_t GetArrayInt32(size_t idx, size_t element) const; + int64_t GetArrayInt64(size_t idx, size_t element) const; + float GetArrayFloat32(size_t idx, size_t element) const; + double 
GetArrayFloat64(size_t idx, size_t element) const; + std::string GetArrayString(size_t idx, size_t element) const; + std::vector GetArrayBytes(size_t idx, size_t element) const; + fluss::Date GetArrayDate(size_t idx, size_t element) const; + fluss::Time GetArrayTime(size_t idx, size_t element) const; + fluss::Timestamp GetArrayTimestamp(size_t idx, size_t element) const; + std::string GetArrayDecimalString(size_t idx, size_t element) const; + /// Returns an owning ArrayView over the array column at `idx`. ArrayView + /// supports nested arrays via ArrayView::GetArray(). Parity with Python's + /// recursive list return from `row.get_array(i)`. + ArrayView GetArrayView(size_t idx) const; + + // Name-based getters inherited from detail::NamedGetters + using detail::NamedGetters::IsNull; + using detail::NamedGetters::GetBool; + using detail::NamedGetters::GetInt32; + using detail::NamedGetters::GetInt64; + using detail::NamedGetters::GetFloat32; + using detail::NamedGetters::GetFloat64; + using detail::NamedGetters::GetString; + using detail::NamedGetters::GetBytes; + using detail::NamedGetters::GetDate; + using detail::NamedGetters::GetTime; + using detail::NamedGetters::GetTimestamp; + using detail::NamedGetters::GetDecimalString; + using detail::NamedGetters::GetArraySize; + using detail::NamedGetters::GetArrayElementType; + using detail::NamedGetters::IsArrayElementNull; + using detail::NamedGetters::GetArrayBool; + using detail::NamedGetters::GetArrayInt32; + using detail::NamedGetters::GetArrayInt64; + using detail::NamedGetters::GetArrayFloat32; + using detail::NamedGetters::GetArrayFloat64; + using detail::NamedGetters::GetArrayString; + using detail::NamedGetters::GetArrayBytes; + using detail::NamedGetters::GetArrayDate; + using detail::NamedGetters::GetArrayTime; + using detail::NamedGetters::GetArrayTimestamp; + using detail::NamedGetters::GetArrayDecimalString; + using detail::NamedGetters::GetArrayView; + + private: + size_t Resolve(const std::string& 
/// Identifies a specific bucket, optionally within a partition.
///
/// `partition_id` is empty when no partition is specified.
struct TableBucket {
    int64_t table_id;
    int32_t bucket_id;
    // Element type restored as int64_t to match the `int64_t partition_id`
    // fields of the sibling structs in this header.
    // NOTE(review): confirm against the upstream header.
    std::optional<int64_t> partition_id;

    /// Two buckets are equal when table id, bucket id and partition id all
    /// match; an empty partition id only equals another empty partition id.
    bool operator==(const TableBucket& other) const {
        if (table_id != other.table_id) {
            return false;
        }
        if (bucket_id != other.bucket_id) {
            return false;
        }
        return partition_id == other.partition_id;
    }

    /// Negation of operator==.
    bool operator!=(const TableBucket& other) const { return !(*this == other); }
};
+ ScanRecord operator[](size_t idx) const; + + class Iterator { + public: + ScanRecord operator*() const; + Iterator& operator++() { + ++idx_; + return *this; + } + bool operator!=(const Iterator& other) const { return idx_ != other.idx_; } + + private: + friend class BucketRecords; + Iterator(const BucketRecords* owner, size_t idx) : owner_(owner), idx_(idx) {} + const BucketRecords* owner_; + size_t idx_; + }; + + Iterator begin() const { return Iterator(this, 0); } + Iterator end() const { return Iterator(this, count_); } + + private: + std::shared_ptr data_; + TableBucket bucket_; + size_t bucket_idx_; + size_t count_; +}; + +class ScanRecords { + public: + ScanRecords() noexcept = default; + ~ScanRecords() noexcept = default; + + ScanRecords(const ScanRecords&) = delete; + ScanRecords& operator=(const ScanRecords&) = delete; + ScanRecords(ScanRecords&&) noexcept = default; + ScanRecords& operator=(ScanRecords&&) noexcept = default; + + /// Total number of records across all buckets. + size_t Count() const; + bool IsEmpty() const; + + /// Number of distinct buckets with records. + size_t BucketCount() const; + + /// List of distinct buckets that have records. + std::vector Buckets() const; + + /// Get records for a specific bucket. + /// + /// Returns an empty BucketRecords if the bucket is not present (matches Rust/Java). + /// Note: O(B) linear scan. For iteration over all buckets, prefer BucketAt(idx). + BucketRecords Records(const TableBucket& bucket) const; + + /// Get records by bucket index (0-based). O(1). + /// + /// Throws std::out_of_range if idx >= BucketCount(). + BucketRecords BucketAt(size_t idx) const; + + /// Flat iterator over all records across all buckets (matches Java Iterable). 
+ class Iterator { + public: + ScanRecord operator*() const; + Iterator& operator++(); + bool operator!=(const Iterator& other) const { + return bucket_idx_ != other.bucket_idx_ || rec_idx_ != other.rec_idx_; + } + + private: + friend class ScanRecords; + Iterator(const ScanRecords* owner, size_t bucket_idx, size_t rec_idx) + : owner_(owner), bucket_idx_(bucket_idx), rec_idx_(rec_idx) {} + const ScanRecords* owner_; + size_t bucket_idx_; + size_t rec_idx_; + }; + + Iterator begin() const; + Iterator end() const { return Iterator(this, BucketCount(), 0); } + + private: + friend class LogScanner; + ScanRecord RecordAt(size_t bucket, size_t rec_idx) const; + std::shared_ptr data_; +}; + +class ArrowRecordBatch { + public: + std::shared_ptr GetArrowRecordBatch() const { return batch_; } + + bool Available() const; + + // Get number of rows in the batch + int64_t NumRows() const; + + // Get ScanBatch metadata + int64_t GetTableId() const; + int64_t GetPartitionId() const; + int32_t GetBucketId() const; + int64_t GetBaseOffset() const; + int64_t GetLastOffset() const; + + private: + friend class LogScanner; + explicit ArrowRecordBatch(std::shared_ptr batch, int64_t table_id, + int64_t partition_id, int32_t bucket_id, + int64_t base_offset) noexcept; + + std::shared_ptr batch_{nullptr}; + + int64_t table_id_; + int64_t partition_id_; + int32_t bucket_id_; + int64_t base_offset_; +}; + +struct ArrowRecordBatches { + std::vector> batches; + + size_t Size() const { return batches.size(); } + bool Empty() const { return batches.empty(); } + const std::unique_ptr& operator[](size_t idx) const { return batches[idx]; } + + auto begin() const { return batches.begin(); } + auto end() const { return batches.end(); } +}; + +struct BucketOffset { + int64_t table_id; + int64_t partition_id; + int32_t bucket_id; + int64_t offset; +}; + +struct BucketSubscription { + int32_t bucket_id; + int64_t offset; +}; + +struct PartitionBucketSubscription { + int64_t partition_id; + int32_t 
bucket_id; + int64_t offset; +}; + +struct LakeSnapshot { + int64_t snapshot_id; + std::vector bucket_offsets; +}; + +struct PartitionInfo { + int64_t partition_id; + std::string partition_name; +}; + +struct ServerNode { + int32_t id; + std::string host; + uint32_t port; + std::string server_type; + std::string uid; +}; + +/// Descriptor for create_database (optional). Leave comment and properties empty for default. +struct DatabaseDescriptor { + std::string comment; + std::unordered_map properties; +}; + +/// Metadata returned by GetDatabaseInfo. +struct DatabaseInfo { + std::string database_name; + std::string comment; + std::unordered_map properties; + int64_t created_time{0}; + int64_t modified_time{0}; +}; + +/// Read-only result for lookup operations. +class LookupResult : public detail::NamedGetters { + friend struct detail::NamedGetters; + + public: + LookupResult() noexcept; + ~LookupResult() noexcept; + + LookupResult(const LookupResult&) = delete; + LookupResult& operator=(const LookupResult&) = delete; + LookupResult(LookupResult&& other) noexcept; + LookupResult& operator=(LookupResult&& other) noexcept; + + bool Found() const; + size_t FieldCount() const; + + // ── Index-based getters ────────────────────────────────────────── + TypeId GetType(size_t idx) const; + bool IsNull(size_t idx) const; + bool GetBool(size_t idx) const; + int32_t GetInt32(size_t idx) const; + int64_t GetInt64(size_t idx) const; + float GetFloat32(size_t idx) const; + double GetFloat64(size_t idx) const; + std::string_view GetString(size_t idx) const; + std::pair GetBytes(size_t idx) const; + fluss::Date GetDate(size_t idx) const; + fluss::Time GetTime(size_t idx) const; + fluss::Timestamp GetTimestamp(size_t idx) const; + bool IsDecimal(size_t idx) const; + std::string GetDecimalString(size_t idx) const; + + // ── Array getters ──────────────────────────────────────────────── + size_t GetArraySize(size_t idx) const; + TypeId GetArrayElementType(size_t idx) const; + bool 
IsArrayElementNull(size_t idx, size_t element) const; + bool GetArrayBool(size_t idx, size_t element) const; + int32_t GetArrayInt32(size_t idx, size_t element) const; + int64_t GetArrayInt64(size_t idx, size_t element) const; + float GetArrayFloat32(size_t idx, size_t element) const; + double GetArrayFloat64(size_t idx, size_t element) const; + std::string GetArrayString(size_t idx, size_t element) const; + std::vector GetArrayBytes(size_t idx, size_t element) const; + fluss::Date GetArrayDate(size_t idx, size_t element) const; + fluss::Time GetArrayTime(size_t idx, size_t element) const; + fluss::Timestamp GetArrayTimestamp(size_t idx, size_t element) const; + std::string GetArrayDecimalString(size_t idx, size_t element) const; + /// See RowView::GetArrayView for semantics. Supports nested arrays. + ArrayView GetArrayView(size_t idx) const; + + // Name-based getters inherited from detail::NamedGetters + using detail::NamedGetters::IsNull; + using detail::NamedGetters::GetBool; + using detail::NamedGetters::GetInt32; + using detail::NamedGetters::GetInt64; + using detail::NamedGetters::GetFloat32; + using detail::NamedGetters::GetFloat64; + using detail::NamedGetters::GetString; + using detail::NamedGetters::GetBytes; + using detail::NamedGetters::GetDate; + using detail::NamedGetters::GetTime; + using detail::NamedGetters::GetTimestamp; + using detail::NamedGetters::GetDecimalString; + using detail::NamedGetters::GetArraySize; + using detail::NamedGetters::GetArrayElementType; + using detail::NamedGetters::IsArrayElementNull; + using detail::NamedGetters::GetArrayBool; + using detail::NamedGetters::GetArrayInt32; + using detail::NamedGetters::GetArrayInt64; + using detail::NamedGetters::GetArrayFloat32; + using detail::NamedGetters::GetArrayFloat64; + using detail::NamedGetters::GetArrayString; + using detail::NamedGetters::GetArrayBytes; + using detail::NamedGetters::GetArrayDate; + using detail::NamedGetters::GetArrayTime; + using 
detail::NamedGetters::GetArrayTimestamp; + using detail::NamedGetters::GetArrayDecimalString; + using detail::NamedGetters::GetArrayView; + + private: + friend class Lookuper; + size_t Resolve(const std::string& name) const { + if (!column_map_) { + BuildColumnMap(); + } + return detail::ResolveColumn(*column_map_, name); + } + void Destroy() noexcept; + void BuildColumnMap() const; + ffi::LookupResultInner* inner_{nullptr}; + mutable std::shared_ptr column_map_; +}; + +class AppendWriter; +class UpsertWriter; +class Lookuper; +class WriteResult; +class LogScanner; +class Admin; +class Table; +class TableAppend; +class TableUpsert; +class TableLookup; +class TableScan; + +struct Configuration { + // Coordinator server address + std::string bootstrap_servers{"127.0.0.1:9123"}; + // Max request size in bytes (10 MB) + int32_t writer_request_max_size{10 * 1024 * 1024}; + // Writer acknowledgment mode: "all", "0", "1", or "-1" + std::string writer_acks{"all"}; + // Max number of writer retries + int32_t writer_retries{std::numeric_limits::max()}; + // Writer batch size in bytes (2 MB), also the upper bound when dynamic sizing is on + int32_t writer_batch_size{2 * 1024 * 1024}; + // Tune the per-table writer batch size from observed fill ratios + bool writer_dynamic_batch_size_enabled{true}; + // Lower bound (256 KB) for the dynamic batch size estimator + int32_t writer_dynamic_batch_size_min{256 * 1024}; + // Bucket assigner for tables without bucket keys: "sticky" or "round_robin" + std::string writer_bucket_no_key_assigner{"sticky"}; + // Number of remote log batches to prefetch during scanning + size_t scanner_remote_log_prefetch_num{4}; + // Number of threads for downloading remote log data + size_t remote_file_download_thread_num{3}; + // Remote log read concurrency within one file (streaming read path) + size_t scanner_remote_log_read_concurrency{4}; + // Maximum number of records returned in a single call to Poll() for LogScanner + size_t 
scanner_log_max_poll_records{500}; + // Maximum bytes per fetch response for LogScanner (16 MB) + int32_t scanner_log_fetch_max_bytes{16 * 1024 * 1024}; + // Minimum bytes to accumulate before server returns a fetch response + int32_t scanner_log_fetch_min_bytes{1}; + // Maximum time (ms) the server may wait to satisfy min bytes + int32_t scanner_log_fetch_wait_max_time_ms{500}; + // Maximum bytes per fetch response per bucket for LogScanner (1 MB) + int32_t scanner_log_fetch_max_bytes_for_bucket{1024 * 1024}; + int64_t writer_batch_timeout_ms{100}; + // Whether to enable idempotent writes + bool writer_enable_idempotence{true}; + // Maximum number of in-flight requests per bucket for idempotent writes + size_t writer_max_inflight_requests_per_bucket{5}; + // Total memory available for buffering write batches (default 64MB) + size_t writer_buffer_memory_size{64 * 1024 * 1024}; + // Maximum time in milliseconds to block waiting for buffer memory + uint64_t writer_buffer_wait_timeout_ms{std::numeric_limits<uint64_t>::max()}; + // Connect timeout in milliseconds for TCP transport connect + uint64_t connect_timeout_ms{120000}; + // Security protocol: "PLAINTEXT" (default, no auth) or "sasl" (SASL auth) + std::string security_protocol{"PLAINTEXT"}; + // SASL mechanism (only "PLAIN" is supported) + std::string security_sasl_mechanism{"PLAIN"}; + // SASL username (required when security_protocol is "sasl") + std::string security_sasl_username; + // SASL password (required when security_protocol is "sasl") + std::string security_sasl_password; + // Maximum number of pending lookup operations + size_t lookup_queue_size{25600}; + // Maximum batch size of merging lookup operations to one lookup request + size_t lookup_max_batch_size{128}; + // Maximum time to wait for the lookup batch to fill (in milliseconds) + uint64_t lookup_batch_timeout_ms{100}; + // Maximum number of unacknowledged lookup requests + size_t lookup_max_inflight_requests{128}; + // Maximum number of lookup retries + 
int32_t lookup_max_retries{std::numeric_limits::max()}; +}; + +class Connection { + public: + Connection() noexcept; + ~Connection() noexcept; + + Connection(const Connection&) = delete; + Connection& operator=(const Connection&) = delete; + Connection(Connection&& other) noexcept; + Connection& operator=(Connection&& other) noexcept; + + static Result Create(const Configuration& config, Connection& out); + + bool Available() const; + + Result GetAdmin(Admin& out); + Result GetTable(const TablePath& table_path, Table& out); + + private: + void Destroy() noexcept; + ffi::Connection* conn_{nullptr}; +}; + +class Admin { + public: + Admin() noexcept; + ~Admin() noexcept; + + Admin(const Admin&) = delete; + Admin& operator=(const Admin&) = delete; + Admin(Admin&& other) noexcept; + Admin& operator=(Admin&& other) noexcept; + + bool Available() const; + + Result CreateTable(const TablePath& table_path, const TableDescriptor& descriptor, + bool ignore_if_exists = false); + + Result DropTable(const TablePath& table_path, bool ignore_if_not_exists = false); + + Result GetTableInfo(const TablePath& table_path, TableInfo& out); + + Result GetLatestLakeSnapshot(const TablePath& table_path, LakeSnapshot& out); + + Result ListOffsets(const TablePath& table_path, const std::vector& bucket_ids, + const OffsetSpec& offset_spec, std::unordered_map& out); + + Result ListPartitionOffsets(const TablePath& table_path, const std::string& partition_name, + const std::vector& bucket_ids, + const OffsetSpec& offset_spec, + std::unordered_map& out); + + Result ListPartitionInfos(const TablePath& table_path, std::vector& out); + + Result ListPartitionInfos(const TablePath& table_path, + const std::unordered_map& partition_spec, + std::vector& out); + + Result CreatePartition(const TablePath& table_path, + const std::unordered_map& partition_spec, + bool ignore_if_exists = false); + + Result DropPartition(const TablePath& table_path, + const std::unordered_map& partition_spec, + bool 
ignore_if_not_exists = false); + + Result CreateDatabase(const std::string& database_name, const DatabaseDescriptor& descriptor, + bool ignore_if_exists = false); + + Result DropDatabase(const std::string& database_name, bool ignore_if_not_exists = false, + bool cascade = true); + + Result ListDatabases(std::vector& out); + + Result DatabaseExists(const std::string& database_name, bool& out); + + Result GetDatabaseInfo(const std::string& database_name, DatabaseInfo& out); + + Result ListTables(const std::string& database_name, std::vector& out); + + Result TableExists(const TablePath& table_path, bool& out); + + Result GetServerNodes(std::vector& out); + + private: + Result DoListOffsets(const TablePath& table_path, const std::vector& bucket_ids, + const OffsetSpec& offset_spec, std::unordered_map& out, + const std::string* partition_name = nullptr); + + friend class Connection; + Admin(ffi::Admin* admin) noexcept; + + void Destroy() noexcept; + ffi::Admin* admin_{nullptr}; +}; + +class Table { + public: + Table() noexcept; + ~Table() noexcept; + + Table(const Table&) = delete; + Table& operator=(const Table&) = delete; + Table(Table&& other) noexcept; + Table& operator=(Table&& other) noexcept; + + bool Available() const; + + GenericRow NewRow() const; + + TableAppend NewAppend(); + TableUpsert NewUpsert(); + TableLookup NewLookup(); + TableScan NewScan(); + + TableInfo GetTableInfo() const; + TablePath GetTablePath() const; + bool HasPrimaryKey() const; + + private: + friend class Connection; + friend class TableAppend; + friend class TableUpsert; + friend class TableLookup; + friend class TableScan; + Table(ffi::Table* table) noexcept; + + void Destroy() noexcept; + const std::shared_ptr& GetColumnMap() const; + + ffi::Table* table_{nullptr}; + mutable std::shared_ptr column_map_; +}; + +class TableAppend { + public: + TableAppend(const TableAppend&) = delete; + TableAppend& operator=(const TableAppend&) = delete; + TableAppend(TableAppend&&) noexcept = default; 
+ TableAppend& operator=(TableAppend&&) noexcept = default; + + Result CreateWriter(AppendWriter& out); + + private: + friend class Table; + explicit TableAppend(ffi::Table* table) noexcept; + + ffi::Table* table_{nullptr}; +}; + +class TableUpsert { + public: + TableUpsert(const TableUpsert&) = delete; + TableUpsert& operator=(const TableUpsert&) = delete; + TableUpsert(TableUpsert&&) noexcept = default; + TableUpsert& operator=(TableUpsert&&) noexcept = default; + + TableUpsert& PartialUpdateByIndex(std::vector column_indices); + TableUpsert& PartialUpdateByName(std::vector column_names); + + Result CreateWriter(UpsertWriter& out); + + private: + friend class Table; + explicit TableUpsert(ffi::Table* table) noexcept; + + std::vector ResolveNameProjection() const; + + ffi::Table* table_{nullptr}; + std::vector column_indices_; + std::vector column_names_; +}; + +class TableLookup { + public: + TableLookup(const TableLookup&) = delete; + TableLookup& operator=(const TableLookup&) = delete; + TableLookup(TableLookup&&) noexcept = default; + TableLookup& operator=(TableLookup&&) noexcept = default; + + Result CreateLookuper(Lookuper& out); + + private: + friend class Table; + explicit TableLookup(ffi::Table* table) noexcept; + + ffi::Table* table_{nullptr}; +}; + +class TableScan { + public: + TableScan(const TableScan&) = delete; + TableScan& operator=(const TableScan&) = delete; + TableScan(TableScan&&) noexcept = default; + TableScan& operator=(TableScan&&) noexcept = default; + + TableScan& ProjectByIndex(std::vector column_indices); + TableScan& ProjectByName(std::vector column_names); + + Result CreateLogScanner(LogScanner& out); + Result CreateRecordBatchLogScanner(LogScanner& out); + + private: + friend class Table; + explicit TableScan(ffi::Table* table) noexcept; + + std::vector ResolveNameProjection() const; + Result DoCreateScanner(LogScanner& out, bool is_record_batch); + + ffi::Table* table_{nullptr}; + std::vector projection_; + std::vector 
name_projection_; +}; + +class WriteResult { + public: + WriteResult() noexcept; + ~WriteResult() noexcept; + + WriteResult(const WriteResult&) = delete; + WriteResult& operator=(const WriteResult&) = delete; + WriteResult(WriteResult&& other) noexcept; + WriteResult& operator=(WriteResult&& other) noexcept; + + bool Available() const; + + /// Wait for server acknowledgment of the write. + /// For fire-and-forget, simply let the WriteResult go out of scope. + Result Wait(); + + private: + friend class AppendWriter; + friend class UpsertWriter; + WriteResult(ffi::WriteResult* inner) noexcept; + + void Destroy() noexcept; + ffi::WriteResult* inner_{nullptr}; +}; + +class AppendWriter { + public: + AppendWriter() noexcept; + ~AppendWriter() noexcept; + + AppendWriter(const AppendWriter&) = delete; + AppendWriter& operator=(const AppendWriter&) = delete; + AppendWriter(AppendWriter&& other) noexcept; + AppendWriter& operator=(AppendWriter&& other) noexcept; + + bool Available() const; + + Result Append(const GenericRow& row); + Result Append(const GenericRow& row, WriteResult& out); + Result AppendArrowBatch(const std::shared_ptr& batch); + Result AppendArrowBatch(const std::shared_ptr& batch, WriteResult& out); + Result Flush(); + + private: + friend class Table; + friend class TableAppend; + AppendWriter(ffi::AppendWriter* writer) noexcept; + + void Destroy() noexcept; + ffi::AppendWriter* writer_{nullptr}; +}; + +class UpsertWriter { + public: + UpsertWriter() noexcept; + ~UpsertWriter() noexcept; + + UpsertWriter(const UpsertWriter&) = delete; + UpsertWriter& operator=(const UpsertWriter&) = delete; + UpsertWriter(UpsertWriter&& other) noexcept; + UpsertWriter& operator=(UpsertWriter&& other) noexcept; + + bool Available() const; + + Result Upsert(const GenericRow& row); + Result Upsert(const GenericRow& row, WriteResult& out); + Result Delete(const GenericRow& row); + Result Delete(const GenericRow& row, WriteResult& out); + Result Flush(); + + private: + friend 
class Table; + friend class TableUpsert; + UpsertWriter(ffi::UpsertWriter* writer) noexcept; + void Destroy() noexcept; + ffi::UpsertWriter* writer_{nullptr}; +}; + +class Lookuper { + public: + Lookuper() noexcept; + ~Lookuper() noexcept; + + Lookuper(const Lookuper&) = delete; + Lookuper& operator=(const Lookuper&) = delete; + Lookuper(Lookuper&& other) noexcept; + Lookuper& operator=(Lookuper&& other) noexcept; + + bool Available() const; + + Result Lookup(const GenericRow& pk_row, LookupResult& out); + + private: + friend class Table; + friend class TableLookup; + Lookuper(ffi::Lookuper* lookuper) noexcept; + void Destroy() noexcept; + ffi::Lookuper* lookuper_{nullptr}; +}; + +class LogScanner { + public: + LogScanner() noexcept; + ~LogScanner() noexcept; + + LogScanner(const LogScanner&) = delete; + LogScanner& operator=(const LogScanner&) = delete; + LogScanner(LogScanner&& other) noexcept; + LogScanner& operator=(LogScanner&& other) noexcept; + + bool Available() const; + + Result Subscribe(int32_t bucket_id, int64_t start_offset); + Result Subscribe(const std::vector& bucket_offsets); + Result SubscribePartitionBuckets(int64_t partition_id, int32_t bucket_id, int64_t start_offset); + Result SubscribePartitionBuckets(const std::vector& subscriptions); + Result Unsubscribe(int32_t bucket_id); + Result UnsubscribePartition(int64_t partition_id, int32_t bucket_id); + Result Poll(int64_t timeout_ms, ScanRecords& out); + Result PollRecordBatch(int64_t timeout_ms, ArrowRecordBatches& out); + + private: + friend class Table; + friend class TableScan; + LogScanner(ffi::LogScanner* scanner) noexcept; + + void Destroy() noexcept; + ffi::LogScanner* scanner_{nullptr}; +}; + +} // namespace fluss diff --git a/fluss-rust/bindings/cpp/scripts/ensure_protoc.sh b/fluss-rust/bindings/cpp/scripts/ensure_protoc.sh new file mode 100755 index 0000000000..3210bcc7a5 --- /dev/null +++ b/fluss-rust/bindings/cpp/scripts/ensure_protoc.sh @@ -0,0 +1,277 @@ +#!/bin/bash +# Licensed to 
the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -euo pipefail + +PROTOBUF_BASELINE_VERSION="${PROTOBUF_BASELINE_VERSION:-3.25.5}" +if [[ -n "${XDG_CACHE_HOME:-}" ]]; then + _PROTOC_DEFAULT_CACHE_BASE="${XDG_CACHE_HOME}" +elif [[ -n "${HOME:-}" ]]; then + _PROTOC_DEFAULT_CACHE_BASE="${HOME}/.cache" +else + _PROTOC_DEFAULT_CACHE_BASE="/tmp" +fi + +_PROTOC_UNAME_S="$(uname -s | tr '[:upper:]' '[:lower:]')" +case "${_PROTOC_UNAME_S}" in + linux*) + _PROTOC_DEFAULT_OS="linux" + ;; + darwin*) + _PROTOC_DEFAULT_OS="osx" + ;; + *) + echo "ERROR: unsupported host OS '${_PROTOC_UNAME_S}'. Please set PROTOC_OS explicitly." >&2 + exit 1 + ;; +esac + +_PROTOC_UNAME_M="$(uname -m)" +case "${_PROTOC_UNAME_M}" in + x86_64|amd64) + _PROTOC_DEFAULT_ARCH="x86_64" + ;; + aarch64|arm64) + _PROTOC_DEFAULT_ARCH="aarch_64" + ;; + *) + echo "ERROR: unsupported host arch '${_PROTOC_UNAME_M}'. Please set PROTOC_ARCH explicitly." 
>&2 + exit 1 + ;; +esac + +PROTOC_INSTALL_ROOT="${PROTOC_INSTALL_ROOT:-${_PROTOC_DEFAULT_CACHE_BASE}/fluss-cpp-tools}" +PROTOC_OS="${PROTOC_OS:-${_PROTOC_DEFAULT_OS}}" +PROTOC_ARCH="${PROTOC_ARCH:-${_PROTOC_DEFAULT_ARCH}}" +PROTOC_FORCE_INSTALL="${PROTOC_FORCE_INSTALL:-0}" +PROTOC_PRINT_PATH_ONLY="${PROTOC_PRINT_PATH_ONLY:-0}" +PROTOC_ALLOW_INSECURE_DOWNLOAD="${PROTOC_ALLOW_INSECURE_DOWNLOAD:-0}" +PROTOC_SKIP_CHECKSUM_VERIFY="${PROTOC_SKIP_CHECKSUM_VERIFY:-0}" + +usage() { + cat <<'EOF' +Usage: bindings/cpp/scripts/ensure_protoc.sh [--print-path] + +Ensures a protoc binary matching the configured protobuf baseline is available. +Installs into a local cache directory (default: $XDG_CACHE_HOME/fluss-cpp-tools or +$HOME/.cache/fluss-cpp-tools) and prints +the protoc path on stdout. + +Env vars: + PROTOBUF_BASELINE_VERSION Baseline protobuf version (default: 3.25.5) + PROTOC_INSTALL_ROOT Local cache root (default: XDG/HOME cache dir) + PROTOC_OS protoc package OS (default: auto-detect host: linux/osx) + PROTOC_ARCH protoc package arch (default: auto-detect host: x86_64/aarch_64) + PROTOC_FORCE_INSTALL 1 to force re-download + PROTOC_ALLOW_INSECURE_DOWNLOAD + 1 to disable TLS verification (not recommended) + PROTOC_SKIP_CHECKSUM_VERIFY + 1 to skip pinned archive checksum verification + BAZEL_PROXY_URL Optional proxy (sets curl/wget proxy envs if present) +EOF +} + +for arg in "$@"; do + case "$arg" in + --print-path) + PROTOC_PRINT_PATH_ONLY=1 + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "Unknown argument: $arg" >&2 + usage >&2 + exit 1 + ;; + esac +done + +setup_proxy_env() { + if [[ -n "${BAZEL_PROXY_URL:-}" ]]; then + export http_proxy="${http_proxy:-$BAZEL_PROXY_URL}" + export https_proxy="${https_proxy:-$BAZEL_PROXY_URL}" + export HTTP_PROXY="${HTTP_PROXY:-$http_proxy}" + export HTTPS_PROXY="${HTTPS_PROXY:-$https_proxy}" + fi +} + +normalize_version_for_protoc_release() { + local v="$1" + # Protobuf release packaging switched from v3.x.y to vX.Y for newer 
versions. + # For our current agreed baseline (3.25.5), the protoc archive/tag is 25.5. + if [[ "$v" =~ ^3\.([0-9]+\.[0-9]+)$ ]]; then + local stripped="${BASH_REMATCH[1]}" + local major="${stripped%%.*}" + if [[ "$major" -ge 21 ]]; then + echo "$stripped" + return 0 + fi + fi + echo "$v" +} + +version_matches_baseline() { + local actual="$1" + local baseline="$2" + local actual_norm baseline_norm + actual_norm="$(normalize_version_for_protoc_release "$actual")" + baseline_norm="$(normalize_version_for_protoc_release "$baseline")" + [[ "$actual" == "$baseline" || "$actual_norm" == "$baseline_norm" ]] +} + +lookup_protoc_archive_sha256() { + local release_version="$1" + local os="$2" + local arch="$3" + case "${release_version}:${os}:${arch}" in + 25.5:linux:aarch_64) + echo "dc715bb5aab2ebf9653d7d3efbe55e01a035e45c26f391ff6d9b7923e22914b7" + ;; + 25.5:linux:x86_64) + echo "e1ed237a17b2e851cf9662cb5ad02b46e70ff8e060e05984725bc4b4228c6b28" + ;; + 25.5:osx:aarch_64) + echo "781a6fc4c265034872cadc65e63dd3c0fc49245b70917821b60e2d457a6876ab" + ;; + 25.5:osx:x86_64) + echo "c5447e4f0d5caffb18d9ff21eae7bc7faf2bb2000083d6f49e5b6000b30fceae" + ;; + *) + return 1 + ;; + esac +} + +verify_download_sha256() { + local file="$1" + local expected="$2" + local actual="" + if command -v sha256sum >/dev/null 2>&1; then + actual="$(sha256sum "$file" | awk '{print $1}')" + elif command -v shasum >/dev/null 2>&1; then + actual="$(shasum -a 256 "$file" | awk '{print $1}')" + else + echo "ERROR: neither sha256sum nor shasum is available for checksum verification." >&2 + return 1 + fi + if [[ "$actual" != "$expected" ]]; then + echo "ERROR: protoc archive checksum mismatch." 
>&2 + echo " expected: $expected" >&2 + echo " actual: $actual" >&2 + return 1 + fi +} + +download_file() { + local url="$1" + local out="$2" + + if command -v curl >/dev/null 2>&1; then + local curl_args=(-fL) + if [[ "${PROTOC_ALLOW_INSECURE_DOWNLOAD}" == "1" ]]; then + curl_args+=(-k) + fi + curl "${curl_args[@]}" "$url" -o "$out" + return 0 + fi + + if command -v wget >/dev/null 2>&1; then + local wget_args=() + if [[ -n "${https_proxy:-}" || -n "${http_proxy:-}" ]]; then + wget_args+=(-e use_proxy=yes) + if [[ -n "${https_proxy:-}" ]]; then + wget_args+=(-e "https_proxy=${https_proxy}") + fi + if [[ -n "${http_proxy:-}" ]]; then + wget_args+=(-e "http_proxy=${http_proxy}") + fi + fi + if [[ "${PROTOC_ALLOW_INSECURE_DOWNLOAD}" == "1" ]]; then + wget_args+=(--no-check-certificate) + fi + wget "${wget_args[@]}" -O "$out" "$url" + return 0 + fi + + echo "ERROR: neither curl nor wget is available for downloading protoc." >&2 + return 1 +} + +ensure_zip_tools() { + command -v unzip >/dev/null 2>&1 || { + echo "ERROR: unzip not found." 
>&2 + exit 1 + } +} + +setup_proxy_env +ensure_zip_tools + +if command -v protoc >/dev/null 2>&1; then + existing_out="$(protoc --version 2>/dev/null || true)" + if [[ "$existing_out" =~ ([0-9]+\.[0-9]+\.[0-9]+) ]]; then + existing_ver="${BASH_REMATCH[1]}" + if version_matches_baseline "$existing_ver" "$PROTOBUF_BASELINE_VERSION"; then + command -v protoc + exit 0 + fi + fi +fi + +PROTOC_RELEASE_VERSION="$(normalize_version_for_protoc_release "$PROTOBUF_BASELINE_VERSION")" +PROTOC_ARCHIVE="protoc-${PROTOC_RELEASE_VERSION}-${PROTOC_OS}-${PROTOC_ARCH}.zip" +PROTOC_URL="https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOC_RELEASE_VERSION}/${PROTOC_ARCHIVE}" +PROTOC_PREFIX="${PROTOC_INSTALL_ROOT}/protoc-${PROTOC_RELEASE_VERSION}-${PROTOC_OS}-${PROTOC_ARCH}" +PROTOC_BIN="${PROTOC_PREFIX}/bin/protoc" + +if [[ "${PROTOC_FORCE_INSTALL}" != "1" && -x "${PROTOC_BIN}" ]]; then + if [[ "${PROTOC_PRINT_PATH_ONLY}" == "1" ]]; then + echo "${PROTOC_BIN}" + else + echo "${PROTOC_BIN}" + fi + exit 0 +fi + +mkdir -p "${PROTOC_INSTALL_ROOT}" +tmpdir="$(mktemp -d "${PROTOC_INSTALL_ROOT}/.protoc-download.XXXXXX")" +trap 'rm -rf "${tmpdir}"' EXIT + +archive_path="${tmpdir}/${PROTOC_ARCHIVE}" +download_file "${PROTOC_URL}" "${archive_path}" +if [[ "${PROTOC_SKIP_CHECKSUM_VERIFY}" != "1" ]]; then + if expected_sha256="$(lookup_protoc_archive_sha256 "${PROTOC_RELEASE_VERSION}" "${PROTOC_OS}" "${PROTOC_ARCH}")"; then + verify_download_sha256 "${archive_path}" "${expected_sha256}" + else + echo "ERROR: no pinned checksum for protoc archive ${PROTOC_ARCHIVE}. Set PROTOC_SKIP_CHECKSUM_VERIFY=1 to bypass." >&2 + exit 1 + fi +fi + +extract_dir="${tmpdir}/extract" +mkdir -p "${extract_dir}" +unzip -q "${archive_path}" -d "${extract_dir}" + +rm -rf "${PROTOC_PREFIX}" +mkdir -p "${PROTOC_PREFIX}" +cp -a "${extract_dir}/." 
"${PROTOC_PREFIX}/" +chmod +x "${PROTOC_BIN}" + +echo "${PROTOC_BIN}" diff --git a/fluss-rust/bindings/cpp/src/admin.cpp b/fluss-rust/bindings/cpp/src/admin.cpp new file mode 100644 index 0000000000..a689c6143a --- /dev/null +++ b/fluss-rust/bindings/cpp/src/admin.cpp @@ -0,0 +1,372 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include "ffi_converter.hpp" +#include "fluss.hpp" +#include "lib.rs.h" +#include "rust/cxx.h" +#include + +namespace fluss { + +Admin::Admin() noexcept = default; + +Admin::Admin(ffi::Admin* admin) noexcept : admin_(admin) {} + +Admin::~Admin() noexcept { Destroy(); } + +void Admin::Destroy() noexcept { + if (admin_) { + ffi::delete_admin(admin_); + admin_ = nullptr; + } +} + +Admin::Admin(Admin&& other) noexcept : admin_(other.admin_) { other.admin_ = nullptr; } + +Admin& Admin::operator=(Admin&& other) noexcept { + if (this != &other) { + Destroy(); + admin_ = other.admin_; + other.admin_ = nullptr; + } + return *this; +} + +bool Admin::Available() const { return admin_ != nullptr; } + +Result Admin::CreateTable(const TablePath& table_path, const TableDescriptor& descriptor, + bool ignore_if_exists) { + if (!Available()) { + return utils::make_client_error("Admin not available"); + } + + auto ffi_path = utils::to_ffi_table_path(table_path); + auto ffi_desc = utils::to_ffi_table_descriptor(descriptor); + + auto ffi_result = admin_->create_table(ffi_path, ffi_desc, ignore_if_exists); + return utils::from_ffi_result(ffi_result); +} + +Result Admin::DropTable(const TablePath& table_path, bool ignore_if_not_exists) { + if (!Available()) { + return utils::make_client_error("Admin not available"); + } + + auto ffi_path = utils::to_ffi_table_path(table_path); + auto ffi_result = admin_->drop_table(ffi_path, ignore_if_not_exists); + return utils::from_ffi_result(ffi_result); +} + +Result Admin::GetTableInfo(const TablePath& table_path, TableInfo& out) { + if (!Available()) { + return utils::make_client_error("Admin not available"); + } + + auto ffi_path = utils::to_ffi_table_path(table_path); + auto ffi_result = admin_->get_table_info(ffi_path); + + auto result = utils::from_ffi_result(ffi_result.result); + if (result.Ok()) { + try { + out = utils::from_ffi_table_info(ffi_result.table_info); + } catch (const std::exception& e) { + return 
utils::make_client_error(std::string("Failed to parse table metadata: ") + e.what()); + } + } + + return result; +} + +Result Admin::GetLatestLakeSnapshot(const TablePath& table_path, LakeSnapshot& out) { + if (!Available()) { + return utils::make_client_error("Admin not available"); + } + + auto ffi_path = utils::to_ffi_table_path(table_path); + auto ffi_result = admin_->get_latest_lake_snapshot(ffi_path); + + auto result = utils::from_ffi_result(ffi_result.result); + if (result.Ok()) { + out = utils::from_ffi_lake_snapshot(ffi_result.lake_snapshot); + } + + return result; +} + +// function for common list offsets functionality +Result Admin::DoListOffsets(const TablePath& table_path, const std::vector& bucket_ids, + const OffsetSpec& offset_spec, + std::unordered_map& out, + const std::string* partition_name) { + if (!Available()) { + return utils::make_client_error("Admin not available"); + } + + auto ffi_path = utils::to_ffi_table_path(table_path); + + rust::Vec rust_bucket_ids; + for (int32_t id : bucket_ids) { + rust_bucket_ids.push_back(id); + } + + ffi::FfiOffsetQuery ffi_query; + ffi_query.offset_type = static_cast(offset_spec.type); + ffi_query.timestamp = offset_spec.timestamp; + + ffi::FfiListOffsetsResult ffi_result; + if (partition_name != nullptr) { + ffi_result = admin_->list_partition_offsets(ffi_path, rust::String(*partition_name), + std::move(rust_bucket_ids), ffi_query); + } else { + ffi_result = admin_->list_offsets(ffi_path, std::move(rust_bucket_ids), ffi_query); + } + + auto result = utils::from_ffi_result(ffi_result.result); + if (result.Ok()) { + out.clear(); + for (const auto& pair : ffi_result.bucket_offsets) { + out[pair.bucket_id] = pair.offset; + } + } + + return result; +} + +Result Admin::ListOffsets(const TablePath& table_path, const std::vector& bucket_ids, + const OffsetSpec& offset_spec, + std::unordered_map& out) { + return DoListOffsets(table_path, bucket_ids, offset_spec, out); +} + +Result Admin::ListPartitionOffsets(const 
TablePath& table_path, const std::string& partition_name, + const std::vector& bucket_ids, + const OffsetSpec& offset_spec, + std::unordered_map& out) { + return DoListOffsets(table_path, bucket_ids, offset_spec, out, &partition_name); +} + +Result Admin::ListPartitionInfos(const TablePath& table_path, std::vector& out) { + if (!Available()) { + return utils::make_client_error("Admin not available"); + } + + auto ffi_path = utils::to_ffi_table_path(table_path); + auto ffi_result = admin_->list_partition_infos(ffi_path); + + auto result = utils::from_ffi_result(ffi_result.result); + if (result.Ok()) { + out.clear(); + out.reserve(ffi_result.partition_infos.size()); + for (const auto& pi : ffi_result.partition_infos) { + out.push_back({pi.partition_id, std::string(pi.partition_name)}); + } + } + + return result; +} + +Result Admin::ListPartitionInfos(const TablePath& table_path, + const std::unordered_map& partition_spec, + std::vector& out) { + if (!Available()) { + return utils::make_client_error("Admin not available"); + } + + auto ffi_path = utils::to_ffi_table_path(table_path); + + rust::Vec rust_spec; + for (const auto& [key, value] : partition_spec) { + ffi::FfiPartitionKeyValue kv; + kv.key = rust::String(key); + kv.value = rust::String(value); + rust_spec.push_back(std::move(kv)); + } + + auto ffi_result = admin_->list_partition_infos_with_spec(ffi_path, std::move(rust_spec)); + + auto result = utils::from_ffi_result(ffi_result.result); + if (result.Ok()) { + out.clear(); + out.reserve(ffi_result.partition_infos.size()); + for (const auto& pi : ffi_result.partition_infos) { + out.push_back({pi.partition_id, std::string(pi.partition_name)}); + } + } + + return result; +} + +Result Admin::CreatePartition(const TablePath& table_path, + const std::unordered_map& partition_spec, + bool ignore_if_exists) { + if (!Available()) { + return utils::make_client_error("Admin not available"); + } + + auto ffi_path = utils::to_ffi_table_path(table_path); + + rust::Vec 
rust_spec; + for (const auto& [key, value] : partition_spec) { + ffi::FfiPartitionKeyValue kv; + kv.key = rust::String(key); + kv.value = rust::String(value); + rust_spec.push_back(std::move(kv)); + } + + auto ffi_result = admin_->create_partition(ffi_path, std::move(rust_spec), ignore_if_exists); + return utils::from_ffi_result(ffi_result); +} + +Result Admin::DropPartition(const TablePath& table_path, + const std::unordered_map& partition_spec, + bool ignore_if_not_exists) { + if (!Available()) { + return utils::make_client_error("Admin not available"); + } + + auto ffi_path = utils::to_ffi_table_path(table_path); + + rust::Vec rust_spec; + for (const auto& [key, value] : partition_spec) { + ffi::FfiPartitionKeyValue kv; + kv.key = rust::String(key); + kv.value = rust::String(value); + rust_spec.push_back(std::move(kv)); + } + + auto ffi_result = admin_->drop_partition(ffi_path, std::move(rust_spec), ignore_if_not_exists); + return utils::from_ffi_result(ffi_result); +} + +Result Admin::CreateDatabase(const std::string& database_name, const DatabaseDescriptor& descriptor, + bool ignore_if_exists) { + if (!Available()) { + return utils::make_client_error("Admin not available"); + } + + auto ffi_desc = utils::to_ffi_database_descriptor(descriptor); + auto ffi_result = admin_->create_database(rust::Str(database_name), ffi_desc, ignore_if_exists); + return utils::from_ffi_result(ffi_result); +} + +Result Admin::DropDatabase(const std::string& database_name, bool ignore_if_not_exists, + bool cascade) { + if (!Available()) { + return utils::make_client_error("Admin not available"); + } + + auto ffi_result = + admin_->drop_database(rust::Str(database_name), ignore_if_not_exists, cascade); + return utils::from_ffi_result(ffi_result); +} + +Result Admin::ListDatabases(std::vector& out) { + if (!Available()) { + return utils::make_client_error("Admin not available"); + } + + auto ffi_result = admin_->list_databases(); + auto result = 
utils::from_ffi_result(ffi_result.result); + if (result.Ok()) { + out.clear(); + out.reserve(ffi_result.database_names.size()); + for (const auto& name : ffi_result.database_names) { + out.push_back(std::string(name)); + } + } + return result; +} + +Result Admin::DatabaseExists(const std::string& database_name, bool& out) { + if (!Available()) { + return utils::make_client_error("Admin not available"); + } + + auto ffi_result = admin_->database_exists(rust::Str(database_name)); + auto result = utils::from_ffi_result(ffi_result.result); + if (result.Ok()) { + out = ffi_result.value; + } + return result; +} + +Result Admin::GetDatabaseInfo(const std::string& database_name, DatabaseInfo& out) { + if (!Available()) { + return utils::make_client_error("Admin not available"); + } + + auto ffi_result = admin_->get_database_info(rust::Str(database_name)); + auto result = utils::from_ffi_result(ffi_result.result); + if (result.Ok()) { + out = utils::from_ffi_database_info(ffi_result.database_info); + } + return result; +} + +Result Admin::ListTables(const std::string& database_name, std::vector& out) { + if (!Available()) { + return utils::make_client_error("Admin not available"); + } + + auto ffi_result = admin_->list_tables(rust::Str(database_name)); + auto result = utils::from_ffi_result(ffi_result.result); + if (result.Ok()) { + out.clear(); + out.reserve(ffi_result.table_names.size()); + for (const auto& name : ffi_result.table_names) { + out.push_back(std::string(name)); + } + } + return result; +} + +Result Admin::TableExists(const TablePath& table_path, bool& out) { + if (!Available()) { + return utils::make_client_error("Admin not available"); + } + + auto ffi_path = utils::to_ffi_table_path(table_path); + auto ffi_result = admin_->table_exists(ffi_path); + auto result = utils::from_ffi_result(ffi_result.result); + if (result.Ok()) { + out = ffi_result.value; + } + return result; +} + +Result Admin::GetServerNodes(std::vector& out) { + if (!Available()) { + return 
utils::make_client_error("Admin not available"); + } + + auto ffi_result = admin_->get_server_nodes(); + auto result = utils::from_ffi_result(ffi_result.result); + if (result.Ok()) { + out.clear(); + out.reserve(ffi_result.server_nodes.size()); + for (const auto& node : ffi_result.server_nodes) { + out.push_back({node.node_id, std::string(node.host), node.port, + std::string(node.server_type), std::string(node.uid)}); + } + } + return result; +} + +} // namespace fluss diff --git a/fluss-rust/bindings/cpp/src/connection.cpp b/fluss-rust/bindings/cpp/src/connection.cpp new file mode 100644 index 0000000000..6cd73017f5 --- /dev/null +++ b/fluss-rust/bindings/cpp/src/connection.cpp @@ -0,0 +1,88 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include "ffi_converter.hpp" +#include "fluss.hpp" +#include "lib.rs.h" +#include "rust/cxx.h" + +namespace fluss { + +Connection::Connection() noexcept = default; + +Connection::~Connection() noexcept { Destroy(); } + +void Connection::Destroy() noexcept { + if (conn_) { + ffi::delete_connection(conn_); + conn_ = nullptr; + } +} + +Connection::Connection(Connection&& other) noexcept : conn_(other.conn_) { other.conn_ = nullptr; } + +Connection& Connection::operator=(Connection&& other) noexcept { + if (this != &other) { + Destroy(); + conn_ = other.conn_; + other.conn_ = nullptr; + } + return *this; +} + +Result Connection::Create(const Configuration& config, Connection& out) { + auto ffi_config = utils::to_ffi_config(config); + auto ffi_result = ffi::new_connection(ffi_config); + auto result = utils::from_ffi_result(ffi_result.result); + if (result.Ok()) { + out.conn_ = utils::ptr_from_ffi(ffi_result); + } + return result; +} + +bool Connection::Available() const { return conn_ != nullptr; } + +Result Connection::GetAdmin(Admin& out) { + if (!Available()) { + return utils::make_client_error("Connection not available"); + } + + auto ffi_result = conn_->get_admin(); + auto result = utils::from_ffi_result(ffi_result.result); + if (result.Ok()) { + out.admin_ = utils::ptr_from_ffi(ffi_result); + } + return result; +} + +Result Connection::GetTable(const TablePath& table_path, Table& out) { + if (!Available()) { + return utils::make_client_error("Connection not available"); + } + + auto ffi_path = utils::to_ffi_table_path(table_path); + auto ffi_result = conn_->get_table(ffi_path); + auto result = utils::from_ffi_result(ffi_result.result); + if (result.Ok()) { + out.table_ = utils::ptr_from_ffi(ffi_result); + } + return result; +} + +} // namespace fluss diff --git a/fluss-rust/bindings/cpp/src/ffi_converter.hpp b/fluss-rust/bindings/cpp/src/ffi_converter.hpp new file mode 100644 index 0000000000..47453d998a --- /dev/null +++ 
b/fluss-rust/bindings/cpp/src/ffi_converter.hpp @@ -0,0 +1,407 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include +#include + +#include "fluss.hpp" +#include "lib.rs.h" + +namespace fluss { +namespace utils { + +/// Compact FFI representation of a (possibly nested) array type. +/// +/// `nesting` counts the number of ARRAY wrappers stripped to reach the leaf +/// element type. `leaf_type`/`leaf_precision`/`leaf_scale` describe that leaf +/// scalar. A non-array input produces a zero-initialised value (nesting == 0). +/// `array_nullability` has `nesting + 1` entries: one per ARRAY wrapper +/// (outermost first) plus a trailing entry for the leaf scalar's nullability. +/// +/// Using a flat representation — rather than serialising a recursive +/// `DataType` — keeps the cxx bridge contract small while preserving schema +/// fidelity across the FFI boundary when paired with rebuild_array_type(). +struct FlattenedArrayType { + int32_t nesting{0}; + int32_t leaf_type{0}; + int32_t leaf_precision{0}; + int32_t leaf_scale{0}; + std::vector array_nullability; +}; + +/// Flattens an `ARRAY>>` DataType into a FlattenedArrayType. 
+/// +/// Contract: +/// - If `data_type` is not an ARRAY, returns a zero-valued FlattenedArrayType +/// and callers must use the column's own `id/precision/scale` instead. +/// - If `data_type` is an ARRAY but has a null element_type() chain (which +/// should only happen on malformed input), returns a zero-valued result to +/// signal the caller to reject the schema. +/// - Otherwise, `nesting >= 1`, array_nullability has `nesting + 1` entries +/// (last = leaf scalar nullability), and leaf_* describe the innermost scalar. +inline FlattenedArrayType flatten_array_type(const DataType& data_type) { + FlattenedArrayType out; + if (data_type.id() != TypeId::Array) { + return out; + } + + const DataType* current = &data_type; + while (current && current->id() == TypeId::Array) { + out.nesting += 1; + out.array_nullability.push_back(current->nullable() ? 1 : 0); + current = current->element_type(); + } + if (!current) { + return FlattenedArrayType{}; + } + + out.leaf_type = static_cast(current->id()); + out.leaf_precision = current->precision(); + out.leaf_scale = current->scale(); + out.array_nullability.push_back(current->nullable() ? 1 : 0); + return out; +} + +/// Inverse of flatten_array_type: rebuilds an `ARRAY>>` type +/// from the compact flat form. Requires `flat.nesting >= 1`; callers handle +/// the `nesting == 0` case by using a plain scalar DataType directly. +/// `array_nullability` must have `nesting + 1` entries (last = leaf). +inline DataType rebuild_array_type(const FlattenedArrayType& flat) { + bool leaf_nullable = (static_cast(flat.nesting) < flat.array_nullability.size()) + ? (flat.array_nullability[static_cast(flat.nesting)] != 0) + : true; + DataType dt(static_cast(flat.leaf_type), flat.leaf_precision, flat.leaf_scale, + leaf_nullable); + for (int32_t i = flat.nesting - 1; i >= 0; --i) { + bool nullable = (static_cast(i) < flat.array_nullability.size()) + ? 
(flat.array_nullability[static_cast(i)] != 0) + : true; + auto arr = DataType::Array(std::move(dt)); + if (!nullable) { + arr = arr.NotNull(); + } + dt = std::move(arr); + } + return dt; +} + +inline Result make_error(int32_t code, std::string msg) { return Result{code, std::move(msg)}; } + +inline Result make_client_error(std::string msg) { + return Result{ErrorCode::CLIENT_ERROR, std::move(msg)}; +} + +inline Result make_ok() { return Result{0, {}}; } + +inline Result from_ffi_result(const ffi::FfiResult& ffi_result) { + return Result{ffi_result.error_code, std::string(ffi_result.error_message)}; +} + +template +inline T* ptr_from_ffi(const ffi::FfiPtrResult& r) { + assert(r.ptr != 0 && "ptr_from_ffi: null pointer in FfiPtrResult"); + return reinterpret_cast(r.ptr); +} + +inline ffi::FfiTablePath to_ffi_table_path(const TablePath& path) { + ffi::FfiTablePath ffi_path; + ffi_path.database_name = rust::String(path.database_name); + ffi_path.table_name = rust::String(path.table_name); + return ffi_path; +} + +inline ffi::FfiConfig to_ffi_config(const Configuration& config) { + ffi::FfiConfig ffi_config; + ffi_config.bootstrap_servers = rust::String(config.bootstrap_servers); + ffi_config.writer_request_max_size = config.writer_request_max_size; + ffi_config.writer_acks = rust::String(config.writer_acks); + ffi_config.writer_retries = config.writer_retries; + ffi_config.writer_batch_size = config.writer_batch_size; + ffi_config.writer_dynamic_batch_size_enabled = config.writer_dynamic_batch_size_enabled; + ffi_config.writer_dynamic_batch_size_min = config.writer_dynamic_batch_size_min; + ffi_config.writer_bucket_no_key_assigner = rust::String(config.writer_bucket_no_key_assigner); + ffi_config.scanner_remote_log_prefetch_num = config.scanner_remote_log_prefetch_num; + ffi_config.remote_file_download_thread_num = config.remote_file_download_thread_num; + ffi_config.scanner_remote_log_read_concurrency = config.scanner_remote_log_read_concurrency; + 
ffi_config.scanner_log_max_poll_records = config.scanner_log_max_poll_records; + ffi_config.scanner_log_fetch_max_bytes = config.scanner_log_fetch_max_bytes; + ffi_config.scanner_log_fetch_min_bytes = config.scanner_log_fetch_min_bytes; + ffi_config.scanner_log_fetch_wait_max_time_ms = config.scanner_log_fetch_wait_max_time_ms; + ffi_config.scanner_log_fetch_max_bytes_for_bucket = config.scanner_log_fetch_max_bytes_for_bucket; + ffi_config.writer_batch_timeout_ms = config.writer_batch_timeout_ms; + ffi_config.writer_enable_idempotence = config.writer_enable_idempotence; + ffi_config.writer_max_inflight_requests_per_bucket = + config.writer_max_inflight_requests_per_bucket; + ffi_config.writer_buffer_memory_size = config.writer_buffer_memory_size; + ffi_config.writer_buffer_wait_timeout_ms = config.writer_buffer_wait_timeout_ms; + ffi_config.connect_timeout_ms = config.connect_timeout_ms; + ffi_config.security_protocol = rust::String(config.security_protocol); + ffi_config.security_sasl_mechanism = rust::String(config.security_sasl_mechanism); + ffi_config.security_sasl_username = rust::String(config.security_sasl_username); + ffi_config.security_sasl_password = rust::String(config.security_sasl_password); + ffi_config.lookup_queue_size = config.lookup_queue_size; + ffi_config.lookup_max_batch_size = config.lookup_max_batch_size; + ffi_config.lookup_batch_timeout_ms = config.lookup_batch_timeout_ms; + ffi_config.lookup_max_inflight_requests = config.lookup_max_inflight_requests; + ffi_config.lookup_max_retries = config.lookup_max_retries; + return ffi_config; +} + +inline ffi::FfiColumn to_ffi_column(const Column& col) { + ffi::FfiColumn ffi_col; + ffi_col.name = rust::String(col.name); + ffi_col.data_type = static_cast(col.data_type.id()); + ffi_col.nullable = col.data_type.nullable(); + ffi_col.comment = rust::String(col.comment); + ffi_col.precision = col.data_type.precision(); + ffi_col.scale = col.data_type.scale(); + auto flat = 
flatten_array_type(col.data_type); + ffi_col.array_nesting = flat.nesting; + for (auto nullable : flat.array_nullability) { + ffi_col.array_nullability.push_back(nullable); + } + if (flat.nesting > 0 && flat.leaf_type != 0) { + ffi_col.element_data_type = flat.leaf_type; + ffi_col.element_precision = flat.leaf_precision; + ffi_col.element_scale = flat.leaf_scale; + } else { + ffi_col.element_data_type = 0; + ffi_col.element_precision = 0; + ffi_col.element_scale = 0; + } + return ffi_col; +} + +inline ffi::FfiSchema to_ffi_schema(const Schema& schema) { + ffi::FfiSchema ffi_schema; + + rust::Vec cols; + for (const auto& col : schema.columns) { + cols.push_back(to_ffi_column(col)); + } + ffi_schema.columns = std::move(cols); + + rust::Vec pks; + for (const auto& pk : schema.primary_keys) { + pks.push_back(rust::String(pk)); + } + ffi_schema.primary_keys = std::move(pks); + + return ffi_schema; +} + +inline ffi::FfiTableDescriptor to_ffi_table_descriptor(const TableDescriptor& desc) { + ffi::FfiTableDescriptor ffi_desc; + + ffi_desc.schema = to_ffi_schema(desc.schema); + + rust::Vec partition_keys; + for (const auto& pk : desc.partition_keys) { + partition_keys.push_back(rust::String(pk)); + } + ffi_desc.partition_keys = std::move(partition_keys); + + ffi_desc.bucket_count = desc.bucket_count; + + rust::Vec bucket_keys; + for (const auto& bk : desc.bucket_keys) { + bucket_keys.push_back(rust::String(bk)); + } + ffi_desc.bucket_keys = std::move(bucket_keys); + + rust::Vec props; + for (const auto& [k, v] : desc.properties) { + ffi::HashMapValue prop; + prop.key = rust::String(k); + prop.value = rust::String(v); + props.push_back(prop); + } + ffi_desc.properties = std::move(props); + + rust::Vec custom_props; + for (const auto& [k, v] : desc.custom_properties) { + ffi::HashMapValue prop; + prop.key = rust::String(k); + prop.value = rust::String(v); + custom_props.push_back(prop); + } + ffi_desc.custom_properties = std::move(custom_props); + + ffi_desc.comment = 
rust::String(desc.comment); + + return ffi_desc; +} + +inline Column from_ffi_column(const ffi::FfiColumn& ffi_col) { + auto type_id = static_cast(ffi_col.data_type); + if (type_id == TypeId::Array) { + if (ffi_col.element_data_type == 0) { + throw std::runtime_error("Malformed ARRAY column '" + std::string(ffi_col.name) + + "': missing element_data_type"); + } + if (ffi_col.array_nesting < 0) { + throw std::runtime_error("Malformed ARRAY column '" + std::string(ffi_col.name) + + "': array_nesting must be non-negative"); + } + if (ffi_col.element_data_type == static_cast(TypeId::Array)) { + throw std::runtime_error("Malformed ARRAY column '" + std::string(ffi_col.name) + + "': leaf element_data_type cannot be ARRAY"); + } + auto is_supported_leaf_type = [](int32_t leaf_type) { + switch (static_cast(leaf_type)) { + case TypeId::Boolean: + case TypeId::TinyInt: + case TypeId::SmallInt: + case TypeId::Int: + case TypeId::BigInt: + case TypeId::Float: + case TypeId::Double: + case TypeId::String: + case TypeId::Bytes: + case TypeId::Date: + case TypeId::Time: + case TypeId::Timestamp: + case TypeId::TimestampLtz: + case TypeId::Decimal: + case TypeId::Char: + case TypeId::Binary: + return true; + default: + return false; + } + }; + if (!is_supported_leaf_type(ffi_col.element_data_type)) { + throw std::runtime_error("Malformed ARRAY column '" + std::string(ffi_col.name) + + "': unsupported leaf element_data_type " + + std::to_string(ffi_col.element_data_type)); + } + + int32_t nesting = ffi_col.array_nesting > 0 ? 
ffi_col.array_nesting : 1; + std::vector array_nullability; + for (auto nullable : ffi_col.array_nullability) { + array_nullability.push_back(nullable); + } + auto dt = rebuild_array_type( + FlattenedArrayType{ + nesting, + ffi_col.element_data_type, + ffi_col.element_precision, + ffi_col.element_scale, + std::move(array_nullability), + }); + return Column{std::string(ffi_col.name), std::move(dt), std::string(ffi_col.comment)}; + } + DataType dt(type_id, ffi_col.precision, ffi_col.scale, ffi_col.nullable); + return Column{std::string(ffi_col.name), std::move(dt), std::string(ffi_col.comment)}; +} + +inline Schema from_ffi_schema(const ffi::FfiSchema& ffi_schema) { + Schema schema; + + for (const auto& col : ffi_schema.columns) { + schema.columns.push_back(from_ffi_column(col)); + } + + for (const auto& pk : ffi_schema.primary_keys) { + schema.primary_keys.push_back(std::string(pk)); + } + + return schema; +} + +inline TableInfo from_ffi_table_info(const ffi::FfiTableInfo& ffi_info) { + TableInfo info; + + info.table_id = ffi_info.table_id; + info.schema_id = ffi_info.schema_id; + info.table_path = TablePath{std::string(ffi_info.table_path.database_name), + std::string(ffi_info.table_path.table_name)}; + info.created_time = ffi_info.created_time; + info.modified_time = ffi_info.modified_time; + + for (const auto& pk : ffi_info.primary_keys) { + info.primary_keys.push_back(std::string(pk)); + } + + for (const auto& bk : ffi_info.bucket_keys) { + info.bucket_keys.push_back(std::string(bk)); + } + + for (const auto& pk : ffi_info.partition_keys) { + info.partition_keys.push_back(std::string(pk)); + } + + info.num_buckets = ffi_info.num_buckets; + info.has_primary_key = ffi_info.has_primary_key; + info.is_partitioned = ffi_info.is_partitioned; + + for (const auto& prop : ffi_info.properties) { + info.properties[std::string(prop.key)] = std::string(prop.value); + } + + for (const auto& prop : ffi_info.custom_properties) { + info.custom_properties[std::string(prop.key)] = 
std::string(prop.value); + } + + info.comment = std::string(ffi_info.comment); + info.schema = from_ffi_schema(ffi_info.schema); + + return info; +} + +inline LakeSnapshot from_ffi_lake_snapshot(const ffi::FfiLakeSnapshot& ffi_snapshot) { + LakeSnapshot snapshot; + snapshot.snapshot_id = ffi_snapshot.snapshot_id; + + for (const auto& offset : ffi_snapshot.bucket_offsets) { + snapshot.bucket_offsets.push_back( + BucketOffset{offset.table_id, offset.partition_id, offset.bucket_id, offset.offset}); + } + + return snapshot; +} + +inline ffi::FfiDatabaseDescriptor to_ffi_database_descriptor(const DatabaseDescriptor& desc) { + ffi::FfiDatabaseDescriptor ffi_desc; + ffi_desc.comment = rust::String(desc.comment); + for (const auto& [k, v] : desc.properties) { + ffi::HashMapValue kv; + kv.key = rust::String(k); + kv.value = rust::String(v); + ffi_desc.properties.push_back(std::move(kv)); + } + return ffi_desc; +} + +inline DatabaseInfo from_ffi_database_info(const ffi::FfiDatabaseInfo& ffi_info) { + DatabaseInfo info; + info.database_name = std::string(ffi_info.database_name); + info.comment = std::string(ffi_info.comment); + info.created_time = ffi_info.created_time; + info.modified_time = ffi_info.modified_time; + for (const auto& prop : ffi_info.properties) { + info.properties[std::string(prop.key)] = std::string(prop.value); + } + return info; +} + +} // namespace utils +} // namespace fluss diff --git a/fluss-rust/bindings/cpp/src/lib.rs b/fluss-rust/bindings/cpp/src/lib.rs new file mode 100644 index 0000000000..ed575244f6 --- /dev/null +++ b/fluss-rust/bindings/cpp/src/lib.rs @@ -0,0 +1,3650 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +mod types; + +use std::str::FromStr; +use std::sync::{Arc, LazyLock}; +use std::time::Duration; + +use fluss as fcore; +use fluss::PartitionId; +use fluss::error::Error; +use fluss::rpc::FlussError as CoreFlussError; + +static RUNTIME: LazyLock = LazyLock::new(|| { + tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build() + .unwrap() +}); + +#[cxx::bridge(namespace = "fluss::ffi")] +mod ffi { + struct HashMapValue { + key: String, + value: String, + } + + struct FfiConfig { + bootstrap_servers: String, + writer_request_max_size: i32, + writer_acks: String, + writer_retries: i32, + writer_batch_size: i32, + writer_dynamic_batch_size_enabled: bool, + writer_dynamic_batch_size_min: i32, + writer_bucket_no_key_assigner: String, + scanner_remote_log_prefetch_num: usize, + remote_file_download_thread_num: usize, + scanner_remote_log_read_concurrency: usize, + scanner_log_max_poll_records: usize, + scanner_log_fetch_max_bytes: i32, + scanner_log_fetch_min_bytes: i32, + scanner_log_fetch_wait_max_time_ms: i32, + scanner_log_fetch_max_bytes_for_bucket: i32, + writer_batch_timeout_ms: i64, + writer_enable_idempotence: bool, + writer_max_inflight_requests_per_bucket: usize, + writer_buffer_memory_size: usize, + writer_buffer_wait_timeout_ms: u64, + connect_timeout_ms: u64, + security_protocol: String, + security_sasl_mechanism: String, + security_sasl_username: String, + 
security_sasl_password: String, + lookup_queue_size: usize, + lookup_max_batch_size: usize, + lookup_batch_timeout_ms: u64, + lookup_max_inflight_requests: usize, + lookup_max_retries: i32, + } + + struct FfiResult { + error_code: i32, + error_message: String, + } + + struct FfiTablePath { + database_name: String, + table_name: String, + } + + struct FfiColumn { + name: String, + data_type: i32, + nullable: bool, + comment: String, + precision: i32, + scale: i32, + array_nesting: i32, + array_nullability: Vec, + element_data_type: i32, + element_precision: i32, + element_scale: i32, + } + + struct FfiSchema { + columns: Vec, + primary_keys: Vec, + } + + struct FfiTableDescriptor { + schema: FfiSchema, + partition_keys: Vec, + bucket_count: i32, + bucket_keys: Vec, + properties: Vec, + custom_properties: Vec, + comment: String, + } + + struct FfiTableInfo { + table_id: i64, + schema_id: i32, + table_path: FfiTablePath, + created_time: i64, + modified_time: i64, + primary_keys: Vec, + bucket_keys: Vec, + partition_keys: Vec, + num_buckets: i32, + has_primary_key: bool, + is_partitioned: bool, + properties: Vec, + custom_properties: Vec, + comment: String, + schema: FfiSchema, + } + + struct FfiTableInfoResult { + result: FfiResult, + table_info: FfiTableInfo, + } + + // NOTE: FfiDatum, FfiGenericRow, FfiScanRecord, FfiScanRecords, FfiScanRecordsResult + // have been replaced by opaque types below (ScanResultInner, GenericRowInner, LookupResultInner). 
+ + struct FfiArrowRecordBatch { + array_ptr: usize, + schema_ptr: usize, + table_id: i64, + partition_id: i64, + bucket_id: i32, + base_offset: i64, + } + + struct FfiArrowRecordBatches { + batches: Vec, + } + + struct FfiArrowRecordBatchesResult { + result: FfiResult, + arrow_batches: FfiArrowRecordBatches, + } + + struct FfiLakeSnapshot { + snapshot_id: i64, + bucket_offsets: Vec, + } + + struct FfiBucketOffset { + table_id: i64, + partition_id: i64, + bucket_id: i32, + offset: i64, + } + + struct FfiOffsetQuery { + offset_type: i32, + timestamp: i64, + } + + struct FfiBucketInfo { + table_id: i64, + bucket_id: i32, + has_partition_id: bool, + partition_id: i64, + record_count: usize, + } + + struct FfiBucketSubscription { + bucket_id: i32, + offset: i64, + } + + struct FfiPartitionBucketSubscription { + partition_id: i64, + bucket_id: i32, + offset: i64, + } + + struct FfiBucketOffsetPair { + bucket_id: i32, + offset: i64, + } + + struct FfiListOffsetsResult { + result: FfiResult, + bucket_offsets: Vec, + } + + // NOTE: FfiLookupResult replaced by opaque LookupResultInner below. 
+ + struct FfiLakeSnapshotResult { + result: FfiResult, + lake_snapshot: FfiLakeSnapshot, + } + + struct FfiPartitionKeyValue { + key: String, + value: String, + } + + struct FfiPartitionInfo { + partition_id: i64, + partition_name: String, + } + + struct FfiListPartitionInfosResult { + result: FfiResult, + partition_infos: Vec, + } + + struct FfiDatabaseDescriptor { + comment: String, + properties: Vec, + } + + struct FfiDatabaseInfo { + database_name: String, + comment: String, + properties: Vec, + created_time: i64, + modified_time: i64, + } + + struct FfiDatabaseInfoResult { + result: FfiResult, + database_info: FfiDatabaseInfo, + } + + struct FfiListDatabasesResult { + result: FfiResult, + database_names: Vec, + } + + struct FfiListTablesResult { + result: FfiResult, + table_names: Vec, + } + + struct FfiBoolResult { + result: FfiResult, + value: bool, + } + + struct FfiServerNode { + node_id: i32, + host: String, + port: u32, + server_type: String, + uid: String, + } + + struct FfiServerNodesResult { + result: FfiResult, + server_nodes: Vec, + } + + struct FfiPtrResult { + result: FfiResult, + ptr: usize, + } + + extern "Rust" { + type Connection; + type Admin; + type Table; + type AppendWriter; + type WriteResult; + type LogScanner; + type UpsertWriter; + type Lookuper; + + // Opaque types for optimized FFI + type ScanResultInner; + type GenericRowInner; + type LookupResultInner; + type ArrayWriterInner; + type ArrayViewInner; + + // Connection + fn new_connection(config: &FfiConfig) -> FfiPtrResult; + unsafe fn delete_connection(conn: *mut Connection); + fn get_admin(self: &Connection) -> FfiPtrResult; + fn get_table(self: &Connection, table_path: &FfiTablePath) -> FfiPtrResult; + + // Admin + unsafe fn delete_admin(admin: *mut Admin); + fn create_table( + self: &Admin, + table_path: &FfiTablePath, + descriptor: &FfiTableDescriptor, + ignore_if_exists: bool, + ) -> FfiResult; + fn drop_table( + self: &Admin, + table_path: &FfiTablePath, + 
ignore_if_not_exists: bool, + ) -> FfiResult; + fn get_table_info(self: &Admin, table_path: &FfiTablePath) -> FfiTableInfoResult; + fn get_latest_lake_snapshot( + self: &Admin, + table_path: &FfiTablePath, + ) -> FfiLakeSnapshotResult; + fn list_offsets( + self: &Admin, + table_path: &FfiTablePath, + bucket_ids: Vec, + offset_query: &FfiOffsetQuery, + ) -> FfiListOffsetsResult; + fn list_partition_offsets( + self: &Admin, + table_path: &FfiTablePath, + partition_name: String, + bucket_ids: Vec, + offset_query: &FfiOffsetQuery, + ) -> FfiListOffsetsResult; + fn list_partition_infos( + self: &Admin, + table_path: &FfiTablePath, + ) -> FfiListPartitionInfosResult; + fn list_partition_infos_with_spec( + self: &Admin, + table_path: &FfiTablePath, + partition_spec: Vec, + ) -> FfiListPartitionInfosResult; + fn create_partition( + self: &Admin, + table_path: &FfiTablePath, + partition_spec: Vec, + ignore_if_exists: bool, + ) -> FfiResult; + fn drop_partition( + self: &Admin, + table_path: &FfiTablePath, + partition_spec: Vec, + ignore_if_not_exists: bool, + ) -> FfiResult; + fn create_database( + self: &Admin, + database_name: &str, + descriptor: &FfiDatabaseDescriptor, + ignore_if_exists: bool, + ) -> FfiResult; + fn drop_database( + self: &Admin, + database_name: &str, + ignore_if_not_exists: bool, + cascade: bool, + ) -> FfiResult; + fn list_databases(self: &Admin) -> FfiListDatabasesResult; + fn database_exists(self: &Admin, database_name: &str) -> FfiBoolResult; + fn get_database_info(self: &Admin, database_name: &str) -> FfiDatabaseInfoResult; + fn list_tables(self: &Admin, database_name: &str) -> FfiListTablesResult; + fn table_exists(self: &Admin, table_path: &FfiTablePath) -> FfiBoolResult; + fn get_server_nodes(self: &Admin) -> FfiServerNodesResult; + + // Table + unsafe fn delete_table(table: *mut Table); + fn new_append_writer(self: &Table) -> FfiPtrResult; + fn create_scanner(self: &Table, column_indices: Vec, batch: bool) -> FfiPtrResult; + fn 
get_table_info_from_table(self: &Table) -> FfiTableInfo; + fn get_table_path(self: &Table) -> FfiTablePath; + fn has_primary_key(self: &Table) -> bool; + fn create_upsert_writer(self: &Table, column_indices: Vec) -> FfiPtrResult; + fn new_lookuper(self: &Table) -> FfiPtrResult; + + // GenericRowInner — opaque row for writes + fn new_generic_row(field_count: usize) -> Box; + fn gr_reset(self: &mut GenericRowInner); + fn gr_set_null(self: &mut GenericRowInner, idx: usize); + fn gr_set_bool(self: &mut GenericRowInner, idx: usize, val: bool); + fn gr_set_i32(self: &mut GenericRowInner, idx: usize, val: i32); + fn gr_set_i64(self: &mut GenericRowInner, idx: usize, val: i64); + fn gr_set_f32(self: &mut GenericRowInner, idx: usize, val: f32); + fn gr_set_f64(self: &mut GenericRowInner, idx: usize, val: f64); + fn gr_set_str(self: &mut GenericRowInner, idx: usize, val: &str); + fn gr_set_bytes(self: &mut GenericRowInner, idx: usize, val: &[u8]); + fn gr_set_date(self: &mut GenericRowInner, idx: usize, days: i32); + fn gr_set_time(self: &mut GenericRowInner, idx: usize, millis: i32); + fn gr_set_ts_ntz(self: &mut GenericRowInner, idx: usize, millis: i64, nanos: i32); + fn gr_set_ts_ltz(self: &mut GenericRowInner, idx: usize, millis: i64, nanos: i32); + fn gr_set_decimal_str(self: &mut GenericRowInner, idx: usize, val: &str); + fn gr_set_array( + self: &mut GenericRowInner, + idx: usize, + writer: &mut ArrayWriterInner, + ) -> Result<()>; + + // ArrayWriterInner — opaque array builder for writes + fn new_array_writer( + size: usize, + element_leaf_type_id: i32, + precision: u32, + scale: u32, + array_nesting: u32, + ) -> Result>; + fn aw_size(self: &ArrayWriterInner) -> usize; + fn aw_set_null(self: &mut ArrayWriterInner, idx: usize) -> Result<()>; + fn aw_set_bool(self: &mut ArrayWriterInner, idx: usize, val: bool) -> Result<()>; + fn aw_set_i32(self: &mut ArrayWriterInner, idx: usize, val: i32) -> Result<()>; + fn aw_set_i64(self: &mut ArrayWriterInner, idx: usize, val: 
i64) -> Result<()>; + fn aw_set_f32(self: &mut ArrayWriterInner, idx: usize, val: f32) -> Result<()>; + fn aw_set_f64(self: &mut ArrayWriterInner, idx: usize, val: f64) -> Result<()>; + fn aw_set_str(self: &mut ArrayWriterInner, idx: usize, val: &str) -> Result<()>; + fn aw_set_bytes(self: &mut ArrayWriterInner, idx: usize, val: &[u8]) -> Result<()>; + fn aw_set_date(self: &mut ArrayWriterInner, idx: usize, days: i32) -> Result<()>; + fn aw_set_time(self: &mut ArrayWriterInner, idx: usize, millis: i32) -> Result<()>; + fn aw_set_ts_ntz( + self: &mut ArrayWriterInner, + idx: usize, + millis: i64, + nanos: i32, + ) -> Result<()>; + fn aw_set_ts_ltz( + self: &mut ArrayWriterInner, + idx: usize, + millis: i64, + nanos: i32, + ) -> Result<()>; + fn aw_set_decimal_str(self: &mut ArrayWriterInner, idx: usize, val: &str) -> Result<()>; + fn aw_set_array( + self: &mut ArrayWriterInner, + idx: usize, + nested: &mut ArrayWriterInner, + ) -> Result<()>; + + // AppendWriter + unsafe fn delete_append_writer(writer: *mut AppendWriter); + fn append(self: &mut AppendWriter, row: &GenericRowInner) -> FfiPtrResult; + fn append_arrow_batch( + self: &mut AppendWriter, + array_ptr: usize, + schema_ptr: usize, + ) -> FfiPtrResult; + fn flush(self: &mut AppendWriter) -> FfiResult; + + // WriteResult + unsafe fn delete_write_result(wr: *mut WriteResult); + fn wait(self: &mut WriteResult) -> FfiResult; + + // UpsertWriter + unsafe fn delete_upsert_writer(writer: *mut UpsertWriter); + fn upsert(self: &mut UpsertWriter, row: &GenericRowInner) -> FfiPtrResult; + fn delete_row(self: &mut UpsertWriter, row: &GenericRowInner) -> FfiPtrResult; + fn upsert_flush(self: &mut UpsertWriter) -> FfiResult; + + // Lookuper + unsafe fn delete_lookuper(lookuper: *mut Lookuper); + fn lookup(self: &mut Lookuper, pk_row: &GenericRowInner) -> Box; + + // LookupResultInner accessors + fn lv_has_error(self: &LookupResultInner) -> bool; + fn lv_error_code(self: &LookupResultInner) -> i32; + fn 
lv_error_message(self: &LookupResultInner) -> &str; + fn lv_found(self: &LookupResultInner) -> bool; + fn lv_field_count(self: &LookupResultInner) -> usize; + fn lv_column_name(self: &LookupResultInner, field: usize) -> Result<&str>; + fn lv_column_type(self: &LookupResultInner, field: usize) -> Result; + fn lv_is_null(self: &LookupResultInner, field: usize) -> Result; + fn lv_get_bool(self: &LookupResultInner, field: usize) -> Result; + fn lv_get_i32(self: &LookupResultInner, field: usize) -> Result; + fn lv_get_i64(self: &LookupResultInner, field: usize) -> Result; + fn lv_get_f32(self: &LookupResultInner, field: usize) -> Result; + fn lv_get_f64(self: &LookupResultInner, field: usize) -> Result; + fn lv_get_str(self: &LookupResultInner, field: usize) -> Result<&str>; + fn lv_get_bytes(self: &LookupResultInner, field: usize) -> Result<&[u8]>; + fn lv_get_date_days(self: &LookupResultInner, field: usize) -> Result; + fn lv_get_time_millis(self: &LookupResultInner, field: usize) -> Result; + fn lv_get_ts_millis(self: &LookupResultInner, field: usize) -> Result; + fn lv_get_ts_nanos(self: &LookupResultInner, field: usize) -> Result; + fn lv_is_ts_ltz(self: &LookupResultInner, field: usize) -> Result; + fn lv_get_decimal_str(self: &LookupResultInner, field: usize) -> Result; + + fn lv_get_array_size(self: &LookupResultInner, field: usize) -> Result; + fn lv_get_array_is_null( + self: &LookupResultInner, + field: usize, + element: usize, + ) -> Result; + fn lv_get_array_bool( + self: &LookupResultInner, + field: usize, + element: usize, + ) -> Result; + fn lv_get_array_i32(self: &LookupResultInner, field: usize, element: usize) -> Result; + fn lv_get_array_i64(self: &LookupResultInner, field: usize, element: usize) -> Result; + fn lv_get_array_f32(self: &LookupResultInner, field: usize, element: usize) -> Result; + fn lv_get_array_f64(self: &LookupResultInner, field: usize, element: usize) -> Result; + fn lv_get_array_str( + self: &LookupResultInner, + field: usize, + 
element: usize, + ) -> Result; + fn lv_get_array_bytes( + self: &LookupResultInner, + field: usize, + element: usize, + ) -> Result>; + fn lv_get_array_date_days( + self: &LookupResultInner, + field: usize, + element: usize, + ) -> Result; + fn lv_get_array_time_millis( + self: &LookupResultInner, + field: usize, + element: usize, + ) -> Result; + fn lv_get_array_ts_millis( + self: &LookupResultInner, + field: usize, + element: usize, + ) -> Result; + fn lv_get_array_ts_nanos( + self: &LookupResultInner, + field: usize, + element: usize, + ) -> Result; + fn lv_get_array_decimal_str( + self: &LookupResultInner, + field: usize, + element: usize, + ) -> Result; + fn lv_get_array_element_type(self: &LookupResultInner, field: usize) -> Result; + fn lv_get_array_view(self: &LookupResultInner, field: usize) + -> Result>; + + // ArrayViewInner — opaque recursive array reader for C++ bindings + fn av_size(self: &ArrayViewInner) -> usize; + fn av_element_type_id(self: &ArrayViewInner) -> i32; + fn av_is_null(self: &ArrayViewInner, element: usize) -> Result; + fn av_get_bool(self: &ArrayViewInner, element: usize) -> Result; + fn av_get_i32(self: &ArrayViewInner, element: usize) -> Result; + fn av_get_i64(self: &ArrayViewInner, element: usize) -> Result; + fn av_get_f32(self: &ArrayViewInner, element: usize) -> Result; + fn av_get_f64(self: &ArrayViewInner, element: usize) -> Result; + fn av_get_str(self: &ArrayViewInner, element: usize) -> Result; + fn av_get_bytes(self: &ArrayViewInner, element: usize) -> Result>; + fn av_get_date_days(self: &ArrayViewInner, element: usize) -> Result; + fn av_get_time_millis(self: &ArrayViewInner, element: usize) -> Result; + fn av_get_ts_millis(self: &ArrayViewInner, element: usize) -> Result; + fn av_get_ts_nanos(self: &ArrayViewInner, element: usize) -> Result; + fn av_get_decimal_str(self: &ArrayViewInner, element: usize) -> Result; + fn av_get_nested(self: &ArrayViewInner, element: usize) -> Result>; + + // LogScanner + unsafe fn 
delete_log_scanner(scanner: *mut LogScanner); + fn subscribe(self: &LogScanner, bucket_id: i32, start_offset: i64) -> FfiResult; + fn subscribe_buckets( + self: &LogScanner, + subscriptions: Vec, + ) -> FfiResult; + fn subscribe_partition( + self: &LogScanner, + partition_id: i64, + bucket_id: i32, + start_offset: i64, + ) -> FfiResult; + fn subscribe_partition_buckets( + self: &LogScanner, + subscriptions: Vec, + ) -> FfiResult; + fn unsubscribe(self: &LogScanner, bucket_id: i32) -> FfiResult; + fn unsubscribe_partition(self: &LogScanner, partition_id: i64, bucket_id: i32) + -> FfiResult; + fn poll(self: &LogScanner, timeout_ms: i64) -> Box; + fn poll_record_batch(self: &LogScanner, timeout_ms: i64) -> FfiArrowRecordBatchesResult; + fn free_arrow_ffi_structures(array_ptr: usize, schema_ptr: usize); + + // ScanResultInner accessors + fn sv_has_error(self: &ScanResultInner) -> bool; + fn sv_error_code(self: &ScanResultInner) -> i32; + fn sv_error_message(self: &ScanResultInner) -> &str; + fn sv_record_count(self: &ScanResultInner) -> usize; + fn sv_column_count(self: &ScanResultInner) -> usize; + fn sv_column_name(self: &ScanResultInner, field: usize) -> Result<&str>; + fn sv_column_type(self: &ScanResultInner, field: usize) -> Result; + fn sv_offset(self: &ScanResultInner, bucket: usize, rec: usize) -> i64; + fn sv_timestamp(self: &ScanResultInner, bucket: usize, rec: usize) -> i64; + fn sv_change_type(self: &ScanResultInner, bucket: usize, rec: usize) -> i32; + fn sv_field_count(self: &ScanResultInner) -> usize; + fn sv_is_null( + self: &ScanResultInner, + bucket: usize, + rec: usize, + field: usize, + ) -> Result; + fn sv_get_bool( + self: &ScanResultInner, + bucket: usize, + rec: usize, + field: usize, + ) -> Result; + fn sv_get_i32( + self: &ScanResultInner, + bucket: usize, + rec: usize, + field: usize, + ) -> Result; + fn sv_get_i64( + self: &ScanResultInner, + bucket: usize, + rec: usize, + field: usize, + ) -> Result; + fn sv_get_f32( + self: 
&ScanResultInner, + bucket: usize, + rec: usize, + field: usize, + ) -> Result; + fn sv_get_f64( + self: &ScanResultInner, + bucket: usize, + rec: usize, + field: usize, + ) -> Result; + fn sv_get_str( + self: &ScanResultInner, + bucket: usize, + rec: usize, + field: usize, + ) -> Result<&str>; + fn sv_get_bytes( + self: &ScanResultInner, + bucket: usize, + rec: usize, + field: usize, + ) -> Result<&[u8]>; + fn sv_get_date_days( + self: &ScanResultInner, + bucket: usize, + rec: usize, + field: usize, + ) -> Result; + fn sv_get_time_millis( + self: &ScanResultInner, + bucket: usize, + rec: usize, + field: usize, + ) -> Result; + fn sv_get_ts_millis( + self: &ScanResultInner, + bucket: usize, + rec: usize, + field: usize, + ) -> Result; + fn sv_get_ts_nanos( + self: &ScanResultInner, + bucket: usize, + rec: usize, + field: usize, + ) -> Result; + fn sv_is_ts_ltz( + self: &ScanResultInner, + bucket: usize, + rec: usize, + field: usize, + ) -> Result; + fn sv_get_decimal_str( + self: &ScanResultInner, + bucket: usize, + rec: usize, + field: usize, + ) -> Result; + + fn sv_get_array_size( + self: &ScanResultInner, + bucket: usize, + rec: usize, + field: usize, + ) -> Result; + fn sv_get_array_is_null( + self: &ScanResultInner, + bucket: usize, + rec: usize, + field: usize, + element: usize, + ) -> Result; + fn sv_get_array_bool( + self: &ScanResultInner, + bucket: usize, + rec: usize, + field: usize, + element: usize, + ) -> Result; + fn sv_get_array_i32( + self: &ScanResultInner, + bucket: usize, + rec: usize, + field: usize, + element: usize, + ) -> Result; + fn sv_get_array_i64( + self: &ScanResultInner, + bucket: usize, + rec: usize, + field: usize, + element: usize, + ) -> Result; + fn sv_get_array_f32( + self: &ScanResultInner, + bucket: usize, + rec: usize, + field: usize, + element: usize, + ) -> Result; + fn sv_get_array_f64( + self: &ScanResultInner, + bucket: usize, + rec: usize, + field: usize, + element: usize, + ) -> Result; + fn sv_get_array_str( + self: 
&ScanResultInner, + bucket: usize, + rec: usize, + field: usize, + element: usize, + ) -> Result; + fn sv_get_array_bytes( + self: &ScanResultInner, + bucket: usize, + rec: usize, + field: usize, + element: usize, + ) -> Result>; + fn sv_get_array_date_days( + self: &ScanResultInner, + bucket: usize, + rec: usize, + field: usize, + element: usize, + ) -> Result; + fn sv_get_array_time_millis( + self: &ScanResultInner, + bucket: usize, + rec: usize, + field: usize, + element: usize, + ) -> Result; + fn sv_get_array_ts_millis( + self: &ScanResultInner, + bucket: usize, + rec: usize, + field: usize, + element: usize, + ) -> Result; + fn sv_get_array_ts_nanos( + self: &ScanResultInner, + bucket: usize, + rec: usize, + field: usize, + element: usize, + ) -> Result; + fn sv_get_array_decimal_str( + self: &ScanResultInner, + bucket: usize, + rec: usize, + field: usize, + element: usize, + ) -> Result; + fn sv_get_array_element_type(self: &ScanResultInner, field: usize) -> Result; + fn sv_get_array_view( + self: &ScanResultInner, + bucket: usize, + rec: usize, + field: usize, + ) -> Result>; + + fn sv_bucket_infos(self: &ScanResultInner) -> &Vec; + } +} + +pub struct Connection { + inner: Arc, +} + +pub struct Admin { + inner: Arc, +} + +pub struct Table { + connection: Arc, + metadata: Arc, + table_info: fcore::metadata::TableInfo, + table_path: fcore::metadata::TablePath, + has_pk: bool, +} + +pub struct AppendWriter { + inner: fcore::client::AppendWriter, + table_info: fcore::metadata::TableInfo, +} + +pub struct WriteResult { + inner: Option, +} + +enum ScannerKind { + Record(fcore::client::LogScanner), + Batch(fcore::client::RecordBatchLogScanner), +} + +pub struct LogScanner { + scanner: ScannerKind, + /// Fluss columns matching the projected Arrow fields (1:1 by index). + /// For non-projected scanners this is the full table schema columns. 
+ projected_columns: Vec, +} + +pub struct UpsertWriter { + inner: fcore::client::UpsertWriter, + table_info: fcore::metadata::TableInfo, +} + +pub struct Lookuper { + inner: fcore::client::Lookuper, + table_info: fcore::metadata::TableInfo, +} + +/// Error code for client-side errors that did not originate from the server API protocol. +/// Must be non-zero so that CPP `Result::Ok()` (which checks `error_code == 0`) correctly +/// detects client-side errors as failures. The value -2 is outside the server API error +/// code range (-1 .. 57+), so it will never collide with current or future API codes. +const CLIENT_ERROR_CODE: i32 = -2; + +fn ok_result() -> ffi::FfiResult { + ffi::FfiResult { + error_code: 0, + error_message: String::new(), + } +} + +fn err_result(code: i32, msg: String) -> ffi::FfiResult { + ffi::FfiResult { + error_code: code, + error_message: msg, + } +} + +/// Create a client-side error result (not from server API). +fn client_err(msg: String) -> ffi::FfiResult { + err_result(CLIENT_ERROR_CODE, msg) +} + +fn err_from_core_error(e: &Error) -> ffi::FfiResult { + // Transport failures map to `NetworkException` (Java parity, + // retriable). + match e { + Error::FlussAPIError { api_error } => err_result(api_error.code, api_error.message.clone()), + Error::RpcError { .. 
} => { + err_result(CoreFlussError::NetworkException.code(), e.to_string()) + } + _ => client_err(e.to_string()), + } +} + +fn ok_ptr(ptr: usize) -> ffi::FfiPtrResult { + ffi::FfiPtrResult { + result: ok_result(), + ptr, + } +} + +fn client_err_ptr(msg: String) -> ffi::FfiPtrResult { + ffi::FfiPtrResult { + result: client_err(msg), + ptr: 0usize, + } +} + +fn err_ptr_from_core(e: &fcore::error::Error) -> ffi::FfiPtrResult { + ffi::FfiPtrResult { + result: err_from_core_error(e), + ptr: 0usize, + } +} + +// Connection implementation +fn new_connection(config: &ffi::FfiConfig) -> ffi::FfiPtrResult { + let assigner_type = match config + .writer_bucket_no_key_assigner + .parse::() + { + Ok(v) => v, + Err(e) => return client_err_ptr(format!("Invalid bucket assigner type: {e}")), + }; + let config_core = fluss::config::Config { + bootstrap_servers: config.bootstrap_servers.to_string(), + writer_request_max_size: config.writer_request_max_size, + writer_acks: config.writer_acks.to_string(), + writer_retries: config.writer_retries, + writer_batch_size: config.writer_batch_size, + writer_dynamic_batch_size_enabled: config.writer_dynamic_batch_size_enabled, + writer_dynamic_batch_size_min: config.writer_dynamic_batch_size_min, + writer_batch_timeout_ms: config.writer_batch_timeout_ms, + writer_bucket_no_key_assigner: assigner_type, + scanner_remote_log_prefetch_num: config.scanner_remote_log_prefetch_num, + remote_file_download_thread_num: config.remote_file_download_thread_num, + scanner_remote_log_read_concurrency: config.scanner_remote_log_read_concurrency, + scanner_log_max_poll_records: config.scanner_log_max_poll_records, + scanner_log_fetch_max_bytes: config.scanner_log_fetch_max_bytes, + scanner_log_fetch_min_bytes: config.scanner_log_fetch_min_bytes, + scanner_log_fetch_wait_max_time_ms: config.scanner_log_fetch_wait_max_time_ms, + scanner_log_fetch_max_bytes_for_bucket: config.scanner_log_fetch_max_bytes_for_bucket, + writer_enable_idempotence: 
config.writer_enable_idempotence, + writer_max_inflight_requests_per_bucket: config.writer_max_inflight_requests_per_bucket, + writer_buffer_memory_size: config.writer_buffer_memory_size, + writer_buffer_wait_timeout_ms: config.writer_buffer_wait_timeout_ms, + connect_timeout_ms: config.connect_timeout_ms, + security_protocol: config.security_protocol.to_string(), + security_sasl_mechanism: config.security_sasl_mechanism.to_string(), + security_sasl_username: config.security_sasl_username.to_string(), + security_sasl_password: config.security_sasl_password.to_string(), + lookup_queue_size: config.lookup_queue_size, + lookup_max_batch_size: config.lookup_max_batch_size, + lookup_batch_timeout_ms: config.lookup_batch_timeout_ms, + lookup_max_inflight_requests: config.lookup_max_inflight_requests, + lookup_max_retries: config.lookup_max_retries, + }; + + let conn = RUNTIME.block_on(async { fcore::client::FlussConnection::new(config_core).await }); + + match conn { + Ok(c) => { + let ptr = Box::into_raw(Box::new(Connection { inner: Arc::new(c) })); + ok_ptr(ptr as usize) + } + Err(e) => err_ptr_from_core(&e), + } +} + +unsafe fn delete_connection(conn: *mut Connection) { + if !conn.is_null() { + unsafe { + drop(Box::from_raw(conn)); + } + } +} + +impl Connection { + fn get_admin(&self) -> ffi::FfiPtrResult { + let admin_result = self.inner.get_admin(); + + match admin_result { + Ok(admin) => { + let ptr = Box::into_raw(Box::new(Admin { inner: admin })); + ok_ptr(ptr as usize) + } + Err(e) => err_ptr_from_core(&e), + } + } + + fn get_table(&self, table_path: &ffi::FfiTablePath) -> ffi::FfiPtrResult { + let path = fcore::metadata::TablePath::new( + table_path.database_name.clone(), + table_path.table_name.clone(), + ); + + let table_result = RUNTIME.block_on(async { self.inner.get_table(&path).await }); + + match table_result { + Ok(t) => { + let ptr = Box::into_raw(Box::new(Table { + connection: self.inner.clone(), + metadata: t.metadata().clone(), + table_info: 
t.get_table_info().clone(), + table_path: t.table_path().clone(), + has_pk: t.has_primary_key(), + })); + ok_ptr(ptr as usize) + } + Err(e) => err_ptr_from_core(&e), + } + } +} + +// Admin implementation +unsafe fn delete_admin(admin: *mut Admin) { + if !admin.is_null() { + unsafe { + drop(Box::from_raw(admin)); + } + } +} + +impl Admin { + fn create_table( + &self, + table_path: &ffi::FfiTablePath, + descriptor: &ffi::FfiTableDescriptor, + ignore_if_exists: bool, + ) -> ffi::FfiResult { + let path = fcore::metadata::TablePath::new( + table_path.database_name.clone(), + table_path.table_name.clone(), + ); + + let core_descriptor = match types::ffi_descriptor_to_core(descriptor) { + Ok(d) => d, + Err(e) => return client_err(e.to_string()), + }; + + let result = RUNTIME.block_on(async { + self.inner + .create_table(&path, &core_descriptor, ignore_if_exists) + .await + }); + + match result { + Ok(_) => ok_result(), + Err(e) => err_from_core_error(&e), + } + } + + fn drop_table( + &self, + table_path: &ffi::FfiTablePath, + ignore_if_not_exists: bool, + ) -> ffi::FfiResult { + let path = fcore::metadata::TablePath::new( + table_path.database_name.clone(), + table_path.table_name.clone(), + ); + + let result = + RUNTIME.block_on(async { self.inner.drop_table(&path, ignore_if_not_exists).await }); + + match result { + Ok(_) => ok_result(), + Err(e) => err_from_core_error(&e), + } + } + + fn get_table_info(&self, table_path: &ffi::FfiTablePath) -> ffi::FfiTableInfoResult { + let path = fcore::metadata::TablePath::new( + table_path.database_name.clone(), + table_path.table_name.clone(), + ); + + let result = RUNTIME.block_on(async { self.inner.get_table_info(&path).await }); + + match result { + Ok(info) => ffi::FfiTableInfoResult { + result: ok_result(), + table_info: types::core_table_info_to_ffi(&info), + }, + Err(e) => ffi::FfiTableInfoResult { + result: err_from_core_error(&e), + table_info: types::empty_table_info(), + }, + } + } + + fn get_latest_lake_snapshot( + 
&self, + table_path: &ffi::FfiTablePath, + ) -> ffi::FfiLakeSnapshotResult { + let path = fcore::metadata::TablePath::new( + table_path.database_name.clone(), + table_path.table_name.clone(), + ); + + let result = RUNTIME.block_on(async { self.inner.get_latest_lake_snapshot(&path).await }); + + match result { + Ok(snapshot) => ffi::FfiLakeSnapshotResult { + result: ok_result(), + lake_snapshot: types::core_lake_snapshot_to_ffi(&snapshot), + }, + Err(e) => ffi::FfiLakeSnapshotResult { + result: err_from_core_error(&e), + lake_snapshot: ffi::FfiLakeSnapshot { + snapshot_id: -1, + bucket_offsets: vec![], + }, + }, + } + } + + // Helper function for common list offsets functionality + fn do_list_offsets( + &self, + table_path: &ffi::FfiTablePath, + partition_name: Option<&str>, + bucket_ids: Vec, + offset_query: &ffi::FfiOffsetQuery, + ) -> ffi::FfiListOffsetsResult { + use fcore::rpc::message::OffsetSpec; + + let path = fcore::metadata::TablePath::new( + table_path.database_name.clone(), + table_path.table_name.clone(), + ); + + let offset_spec = match offset_query.offset_type { + 0 => OffsetSpec::Earliest, + 1 => OffsetSpec::Latest, + 2 => OffsetSpec::Timestamp(offset_query.timestamp), + _ => { + return ffi::FfiListOffsetsResult { + result: client_err(format!( + "Invalid offset_type: {}", + offset_query.offset_type + )), + bucket_offsets: vec![], + }; + } + }; + + let result = RUNTIME.block_on(async { + if let Some(part_name) = partition_name { + self.inner + .list_partition_offsets(&path, part_name, &bucket_ids, offset_spec) + .await + } else { + self.inner + .list_offsets(&path, &bucket_ids, offset_spec) + .await + } + }); + + match result { + Ok(offsets) => { + let bucket_offsets: Vec = offsets + .into_iter() + .map(|(bucket_id, offset)| ffi::FfiBucketOffsetPair { bucket_id, offset }) + .collect(); + ffi::FfiListOffsetsResult { + result: ok_result(), + bucket_offsets, + } + } + Err(e) => ffi::FfiListOffsetsResult { + result: err_from_core_error(&e), + 
bucket_offsets: vec![], + }, + } + } + + fn list_offsets( + &self, + table_path: &ffi::FfiTablePath, + bucket_ids: Vec, + offset_query: &ffi::FfiOffsetQuery, + ) -> ffi::FfiListOffsetsResult { + self.do_list_offsets(table_path, None, bucket_ids, offset_query) + } + + fn list_partition_offsets( + &self, + table_path: &ffi::FfiTablePath, + partition_name: String, + bucket_ids: Vec, + offset_query: &ffi::FfiOffsetQuery, + ) -> ffi::FfiListOffsetsResult { + self.do_list_offsets(table_path, Some(&partition_name), bucket_ids, offset_query) + } + + fn list_partition_infos( + &self, + table_path: &ffi::FfiTablePath, + ) -> ffi::FfiListPartitionInfosResult { + self.do_list_partition_infos(table_path, None) + } + + fn list_partition_infos_with_spec( + &self, + table_path: &ffi::FfiTablePath, + partition_spec: Vec, + ) -> ffi::FfiListPartitionInfosResult { + let spec_map: std::collections::HashMap = partition_spec + .into_iter() + .map(|kv| (kv.key, kv.value)) + .collect(); + let spec = fcore::metadata::PartitionSpec::new(spec_map); + self.do_list_partition_infos(table_path, Some(&spec)) + } + fn create_partition( + &self, + table_path: &ffi::FfiTablePath, + partition_spec: Vec, + ignore_if_exists: bool, + ) -> ffi::FfiResult { + let path = fcore::metadata::TablePath::new( + table_path.database_name.clone(), + table_path.table_name.clone(), + ); + let spec_map: std::collections::HashMap = partition_spec + .into_iter() + .map(|kv| (kv.key, kv.value)) + .collect(); + let partition_spec = fcore::metadata::PartitionSpec::new(spec_map); + + let result = RUNTIME.block_on(async { + self.inner + .create_partition(&path, &partition_spec, ignore_if_exists) + .await + }); + + match result { + Ok(_) => ok_result(), + Err(e) => err_from_core_error(&e), + } + } + + fn drop_partition( + &self, + table_path: &ffi::FfiTablePath, + partition_spec: Vec, + ignore_if_not_exists: bool, + ) -> ffi::FfiResult { + let path = fcore::metadata::TablePath::new( + table_path.database_name.clone(), + 
table_path.table_name.clone(), + ); + let spec_map: std::collections::HashMap = partition_spec + .into_iter() + .map(|kv| (kv.key, kv.value)) + .collect(); + let partition_spec = fcore::metadata::PartitionSpec::new(spec_map); + + let result = RUNTIME.block_on(async { + self.inner + .drop_partition(&path, &partition_spec, ignore_if_not_exists) + .await + }); + + match result { + Ok(_) => ok_result(), + Err(e) => err_from_core_error(&e), + } + } + + fn create_database( + &self, + database_name: &str, + descriptor: &ffi::FfiDatabaseDescriptor, + ignore_if_exists: bool, + ) -> ffi::FfiResult { + let descriptor_opt = types::ffi_database_descriptor_to_core(descriptor); + + let result = RUNTIME.block_on(async { + self.inner + .create_database(database_name, descriptor_opt.as_ref(), ignore_if_exists) + .await + }); + + match result { + Ok(_) => ok_result(), + Err(e) => err_from_core_error(&e), + } + } + + fn drop_database( + &self, + database_name: &str, + ignore_if_not_exists: bool, + cascade: bool, + ) -> ffi::FfiResult { + let result = RUNTIME.block_on(async { + self.inner + .drop_database(database_name, ignore_if_not_exists, cascade) + .await + }); + + match result { + Ok(_) => ok_result(), + Err(e) => err_from_core_error(&e), + } + } + + fn list_databases(&self) -> ffi::FfiListDatabasesResult { + let result = RUNTIME.block_on(async { self.inner.list_databases().await }); + + match result { + Ok(names) => ffi::FfiListDatabasesResult { + result: ok_result(), + database_names: names, + }, + Err(e) => ffi::FfiListDatabasesResult { + result: err_from_core_error(&e), + database_names: vec![], + }, + } + } + + fn database_exists(&self, database_name: &str) -> ffi::FfiBoolResult { + let result = RUNTIME.block_on(async { self.inner.database_exists(database_name).await }); + + match result { + Ok(exists) => ffi::FfiBoolResult { + result: ok_result(), + value: exists, + }, + Err(e) => ffi::FfiBoolResult { + result: err_from_core_error(&e), + value: false, + }, + } + } + + fn 
get_database_info(&self, database_name: &str) -> ffi::FfiDatabaseInfoResult { + let result = RUNTIME.block_on(async { self.inner.get_database_info(database_name).await }); + + match result { + Ok(info) => ffi::FfiDatabaseInfoResult { + result: ok_result(), + database_info: types::core_database_info_to_ffi(&info), + }, + Err(e) => ffi::FfiDatabaseInfoResult { + result: err_from_core_error(&e), + database_info: ffi::FfiDatabaseInfo { + database_name: String::new(), + comment: String::new(), + properties: vec![], + created_time: 0, + modified_time: 0, + }, + }, + } + } + + fn list_tables(&self, database_name: &str) -> ffi::FfiListTablesResult { + let result = RUNTIME.block_on(async { self.inner.list_tables(database_name).await }); + + match result { + Ok(names) => ffi::FfiListTablesResult { + result: ok_result(), + table_names: names, + }, + Err(e) => ffi::FfiListTablesResult { + result: err_from_core_error(&e), + table_names: vec![], + }, + } + } + + fn table_exists(&self, table_path: &ffi::FfiTablePath) -> ffi::FfiBoolResult { + let path = fcore::metadata::TablePath::new( + table_path.database_name.clone(), + table_path.table_name.clone(), + ); + + let result = RUNTIME.block_on(async { self.inner.table_exists(&path).await }); + + match result { + Ok(exists) => ffi::FfiBoolResult { + result: ok_result(), + value: exists, + }, + Err(e) => ffi::FfiBoolResult { + result: err_from_core_error(&e), + value: false, + }, + } + } + + fn do_list_partition_infos( + &self, + table_path: &ffi::FfiTablePath, + partial_partition_spec: Option<&fcore::metadata::PartitionSpec>, + ) -> ffi::FfiListPartitionInfosResult { + let path = fcore::metadata::TablePath::new( + table_path.database_name.clone(), + table_path.table_name.clone(), + ); + let result = RUNTIME.block_on(async { + self.inner + .list_partition_infos_with_spec(&path, partial_partition_spec) + .await + }); + match result { + Ok(infos) => { + let partition_infos: Vec = infos + .into_iter() + .map(|info| ffi::FfiPartitionInfo 
{ + partition_id: info.get_partition_id(), + partition_name: info.get_partition_name(), + }) + .collect(); + ffi::FfiListPartitionInfosResult { + result: ok_result(), + partition_infos, + } + } + Err(e) => ffi::FfiListPartitionInfosResult { + result: err_from_core_error(&e), + partition_infos: vec![], + }, + } + } + + fn get_server_nodes(&self) -> ffi::FfiServerNodesResult { + let result = RUNTIME.block_on(async { self.inner.get_server_nodes().await }); + + match result { + Ok(nodes) => { + let server_nodes: Vec = nodes + .into_iter() + .map(|node| ffi::FfiServerNode { + node_id: node.id(), + host: node.host().to_string(), + port: node.port(), + server_type: node.server_type().to_string(), + uid: node.uid().to_string(), + }) + .collect(); + ffi::FfiServerNodesResult { + result: ok_result(), + server_nodes, + } + } + Err(e) => ffi::FfiServerNodesResult { + result: err_from_core_error(&e), + server_nodes: vec![], + }, + } + } +} + +// Table implementation +unsafe fn delete_table(table: *mut Table) { + if !table.is_null() { + unsafe { + drop(Box::from_raw(table)); + } + } +} + +impl Table { + fn fluss_table(&self) -> fcore::client::FlussTable<'_> { + fcore::client::FlussTable::new( + &self.connection, + self.metadata.clone(), + self.table_info.clone(), + ) + } + + fn resolve_projected_columns( + &self, + indices: &[usize], + ) -> Result, String> { + let all_columns = self.table_info.get_schema().columns(); + indices + .iter() + .map(|&i| { + all_columns.get(i).cloned().ok_or_else(|| { + format!( + "Invalid column index {i}: schema has {} columns", + all_columns.len() + ) + }) + }) + .collect() + } + + fn new_append_writer(&self) -> ffi::FfiPtrResult { + let _enter = RUNTIME.enter(); + + let table_append = match self.fluss_table().new_append() { + Ok(a) => a, + Err(e) => return err_ptr_from_core(&e), + }; + + let writer = match table_append.create_writer() { + Ok(w) => w, + Err(e) => return err_ptr_from_core(&e), + }; + + let ptr = Box::into_raw(Box::new(AppendWriter { 
+ inner: writer, + table_info: self.table_info.clone(), + })); + ok_ptr(ptr as usize) + } + + fn create_scanner(&self, column_indices: Vec, batch: bool) -> ffi::FfiPtrResult { + RUNTIME.block_on(async { + let fluss_table = self.fluss_table(); + let scan = fluss_table.new_scan(); + + let (projected_columns, scan) = if column_indices.is_empty() { + (self.table_info.get_schema().columns().to_vec(), scan) + } else { + let cols = match self.resolve_projected_columns(&column_indices) { + Ok(c) => c, + Err(e) => return client_err_ptr(e), + }; + let scan = match scan.project(&column_indices) { + Ok(s) => s, + Err(e) => return err_ptr_from_core(&e), + }; + (cols, scan) + }; + + let scanner = if batch { + match scan.create_record_batch_log_scanner() { + Ok(s) => ScannerKind::Batch(s), + Err(e) => return err_ptr_from_core(&e), + } + } else { + match scan.create_log_scanner() { + Ok(s) => ScannerKind::Record(s), + Err(e) => return err_ptr_from_core(&e), + } + }; + + let ptr = Box::into_raw(Box::new(LogScanner { + scanner, + projected_columns, + })); + ok_ptr(ptr as usize) + }) + } + + fn get_table_info_from_table(&self) -> ffi::FfiTableInfo { + types::core_table_info_to_ffi(&self.table_info) + } + + fn get_table_path(&self) -> ffi::FfiTablePath { + ffi::FfiTablePath { + database_name: self.table_path.database().to_string(), + table_name: self.table_path.table().to_string(), + } + } + + fn has_primary_key(&self) -> bool { + self.has_pk + } + + fn create_upsert_writer(&self, column_indices: Vec) -> ffi::FfiPtrResult { + let _enter = RUNTIME.enter(); + + let table_upsert = match self.fluss_table().new_upsert() { + Ok(u) => u, + Err(e) => return err_ptr_from_core(&e), + }; + + let table_upsert = if column_indices.is_empty() { + table_upsert + } else { + match table_upsert.partial_update(Some(column_indices)) { + Ok(u) => u, + Err(e) => return err_ptr_from_core(&e), + } + }; + + let writer = match table_upsert.create_writer() { + Ok(w) => w, + Err(e) => return 
err_ptr_from_core(&e), + }; + + let ptr = Box::into_raw(Box::new(UpsertWriter { + inner: writer, + table_info: self.table_info.clone(), + })); + ok_ptr(ptr as usize) + } + + fn new_lookuper(&self) -> ffi::FfiPtrResult { + let _enter = RUNTIME.enter(); + + let table_lookup = match self.fluss_table().new_lookup() { + Ok(l) => l, + Err(e) => return err_ptr_from_core(&e), + }; + + let lookuper = match table_lookup.create_lookuper() { + Ok(l) => l, + Err(e) => return err_ptr_from_core(&e), + }; + + let ptr = Box::into_raw(Box::new(Lookuper { + inner: lookuper, + table_info: self.table_info.clone(), + })); + ok_ptr(ptr as usize) + } +} + +// AppendWriter implementation +unsafe fn delete_append_writer(writer: *mut AppendWriter) { + if !writer.is_null() { + unsafe { + drop(Box::from_raw(writer)); + } + } +} + +impl AppendWriter { + fn append(&mut self, row: &GenericRowInner) -> ffi::FfiPtrResult { + let schema = self.table_info.get_schema(); + let generic_row = match types::resolve_row_types(&row.row, Some(schema)) { + Ok(r) => r, + Err(e) => return client_err_ptr(e.to_string()), + }; + + let result_future = match self.inner.append(&generic_row) { + Ok(f) => f, + Err(e) => return err_ptr_from_core(&e), + }; + + let ptr = Box::into_raw(Box::new(WriteResult { + inner: Some(result_future), + })); + ok_ptr(ptr as usize) + } + + fn append_arrow_batch(&mut self, array_ptr: usize, schema_ptr: usize) -> ffi::FfiPtrResult { + use arrow::ffi::{FFI_ArrowArray, FFI_ArrowSchema}; + + // Safety: C++ allocates these via `new ArrowArray/ArrowSchema` after a + // successful `ExportRecordBatch`, so both pointers are valid heap + // allocations that we take ownership of here. + let ffi_array = unsafe { *Box::from_raw(array_ptr as *mut FFI_ArrowArray) }; + let ffi_schema = unsafe { Box::from_raw(schema_ptr as *mut FFI_ArrowSchema) }; + + // Safety: `from_ffi` requires that the array and schema conform to the + // Arrow C Data Interface, which is guaranteed by C++'s ExportRecordBatch. 
+ let array_data = match unsafe { arrow::ffi::from_ffi(ffi_array, &ffi_schema) } { + Ok(d) => d, + Err(e) => return client_err_ptr(format!("Failed to import Arrow batch: {e}")), + }; + // ffi_array is consumed by from_ffi; ffi_schema is dropped here (Box goes out of scope) + + // Reconstruct RecordBatch from the imported StructArray data + let struct_array = arrow::array::StructArray::from(array_data); + let batch = arrow::record_batch::RecordBatch::from(struct_array); + + let result_future = match self.inner.append_arrow_batch(batch) { + Ok(f) => f, + Err(e) => return err_ptr_from_core(&e), + }; + + let ptr = Box::into_raw(Box::new(WriteResult { + inner: Some(result_future), + })); + ok_ptr(ptr as usize) + } + + fn flush(&mut self) -> ffi::FfiResult { + let result = RUNTIME.block_on(async { self.inner.flush().await }); + + match result { + Ok(_) => ok_result(), + Err(e) => err_from_core_error(&e), + } + } +} + +unsafe fn delete_write_result(wr: *mut WriteResult) { + if !wr.is_null() { + unsafe { + drop(Box::from_raw(wr)); + } + } +} + +impl WriteResult { + fn wait(&mut self) -> ffi::FfiResult { + if let Some(future) = self.inner.take() { + let result = RUNTIME.block_on(future); + match result { + Ok(_) => ok_result(), + Err(e) => err_from_core_error(&e), + } + } else { + client_err("WriteResult already consumed".to_string()) + } + } +} + +// UpsertWriter implementation +unsafe fn delete_upsert_writer(writer: *mut UpsertWriter) { + if !writer.is_null() { + unsafe { + drop(Box::from_raw(writer)); + } + } +} + +impl UpsertWriter { + /// Pad row with Null to full schema width. + /// This allows callers to only set the fields they care about. 
+ fn pad_row<'a>(&self, mut row: fcore::row::GenericRow<'a>) -> fcore::row::GenericRow<'a> { + let num_columns = self.table_info.get_schema().columns().len(); + if row.values.len() < num_columns { + row.values.resize(num_columns, fcore::row::Datum::Null); + } + row + } + + fn upsert(&mut self, row: &GenericRowInner) -> ffi::FfiPtrResult { + let schema = self.table_info.get_schema(); + let generic_row = match types::resolve_row_types(&row.row, Some(schema)) { + Ok(r) => r, + Err(e) => return client_err_ptr(e.to_string()), + }; + let generic_row = self.pad_row(generic_row); + + let result_future = match self.inner.upsert(&generic_row) { + Ok(f) => f, + Err(e) => return err_ptr_from_core(&e), + }; + + let ptr = Box::into_raw(Box::new(WriteResult { + inner: Some(result_future), + })); + ok_ptr(ptr as usize) + } + + fn delete_row(&mut self, row: &GenericRowInner) -> ffi::FfiPtrResult { + let schema = self.table_info.get_schema(); + let generic_row = match types::resolve_row_types(&row.row, Some(schema)) { + Ok(r) => r, + Err(e) => return client_err_ptr(e.to_string()), + }; + let generic_row = self.pad_row(generic_row); + + let result_future = match self.inner.delete(&generic_row) { + Ok(f) => f, + Err(e) => return err_ptr_from_core(&e), + }; + + let ptr = Box::into_raw(Box::new(WriteResult { + inner: Some(result_future), + })); + ok_ptr(ptr as usize) + } + + fn upsert_flush(&mut self) -> ffi::FfiResult { + let result = RUNTIME.block_on(async { self.inner.flush().await }); + + match result { + Ok(_) => ok_result(), + Err(e) => err_from_core_error(&e), + } + } +} + +// Lookuper implementation +unsafe fn delete_lookuper(lookuper: *mut Lookuper) { + if !lookuper.is_null() { + unsafe { + drop(Box::from_raw(lookuper)); + } + } +} + +impl Lookuper { + /// Build a dense PK-only row from a (possibly sparse) input row. + /// The user may set PK values at their full schema positions (e.g. [0, 2]) + /// via name-based Set(). 
We compact them into [0, 1, …] to match + /// the lookup_row_type the core KeyEncoder expects. + fn dense_pk_row<'a>(&self, mut row: fcore::row::GenericRow<'a>) -> fcore::row::GenericRow<'a> { + let pk_indices = self.table_info.get_schema().primary_key_indexes(); + let mut dense = fcore::row::GenericRow::new(pk_indices.len()); + for (dense_idx, &schema_idx) in pk_indices.iter().enumerate() { + if schema_idx < row.values.len() { + dense.values[dense_idx] = + std::mem::replace(&mut row.values[schema_idx], fcore::row::Datum::Null); + } + } + dense + } + + fn lookup(&mut self, pk_row: &GenericRowInner) -> Box { + let schema = self.table_info.get_schema(); + let generic_row = match types::resolve_row_types(&pk_row.row, Some(schema)) { + Ok(r) => self.dense_pk_row(r), + Err(e) => { + return Box::new(LookupResultInner::from_error( + CLIENT_ERROR_CODE, + e.to_string(), + )); + } + }; + + let lookup_result = match RUNTIME.block_on(self.inner.lookup(&generic_row)) { + Ok(r) => r, + Err(e) => { + let ffi_err = err_from_core_error(&e); + return Box::new(LookupResultInner::from_error( + ffi_err.error_code, + ffi_err.error_message, + )); + } + }; + + let columns = self.table_info.get_schema().columns().to_vec(); + match lookup_result.get_single_row() { + Ok(Some(row)) => match types::compacted_row_to_owned(&row, &self.table_info) { + Ok(owned_row) => Box::new(LookupResultInner { + error: None, + found: true, + row: Some(owned_row), + columns, + }), + Err(e) => Box::new(LookupResultInner::from_error( + CLIENT_ERROR_CODE, + e.to_string(), + )), + }, + Ok(None) => Box::new(LookupResultInner { + error: None, + found: false, + row: None, + columns, + }), + Err(e) => { + let ffi_err = err_from_core_error(&e); + Box::new(LookupResultInner::from_error( + ffi_err.error_code, + ffi_err.error_message, + )) + } + } + } +} + +// LogScanner implementation +unsafe fn delete_log_scanner(scanner: *mut LogScanner) { + if !scanner.is_null() { + unsafe { + drop(Box::from_raw(scanner)); + } + } +} + 
+// Helper function to free the Arrow FFI structures separately (for use after ImportRecordBatch) +pub extern "C" fn free_arrow_ffi_structures(array_ptr: usize, schema_ptr: usize) { + use arrow::ffi::{FFI_ArrowArray, FFI_ArrowSchema}; + if array_ptr != 0 { + let _array = unsafe { Box::from_raw(array_ptr as *mut FFI_ArrowArray) }; + } + if schema_ptr != 0 { + let _schema = unsafe { Box::from_raw(schema_ptr as *mut FFI_ArrowSchema) }; + } +} + +/// Dispatch a method call to whichever scanner variant is active. +/// Both LogScanner and RecordBatchLogScanner share the same subscribe/unsubscribe interface. +macro_rules! dispatch_scanner { + ($self:expr, $method:ident($($arg:expr),*)) => { + match RUNTIME.block_on(async { + match &$self.scanner { + ScannerKind::Record(s) => s.$method($($arg),*).await, + ScannerKind::Batch(s) => s.$method($($arg),*).await, + } + }) { + Ok(_) => ok_result(), + Err(e) => err_from_core_error(&e), + } + }; +} + +impl LogScanner { + fn subscribe(&self, bucket_id: i32, start_offset: i64) -> ffi::FfiResult { + dispatch_scanner!(self, subscribe(bucket_id, start_offset)) + } + + fn subscribe_buckets(&self, subscriptions: Vec) -> ffi::FfiResult { + use std::collections::HashMap; + let bucket_offsets: HashMap = subscriptions + .into_iter() + .map(|s| (s.bucket_id, s.offset)) + .collect(); + dispatch_scanner!(self, subscribe_buckets(&bucket_offsets)) + } + + fn subscribe_partition( + &self, + partition_id: PartitionId, + bucket_id: i32, + start_offset: i64, + ) -> ffi::FfiResult { + dispatch_scanner!( + self, + subscribe_partition(partition_id, bucket_id, start_offset) + ) + } + + fn subscribe_partition_buckets( + &self, + subscriptions: Vec, + ) -> ffi::FfiResult { + use std::collections::HashMap; + let offsets: HashMap<(PartitionId, i32), i64> = subscriptions + .into_iter() + .map(|s| ((s.partition_id, s.bucket_id), s.offset)) + .collect(); + dispatch_scanner!(self, subscribe_partition_buckets(&offsets)) + } + + fn unsubscribe(&self, bucket_id: 
i32) -> ffi::FfiResult { + dispatch_scanner!(self, unsubscribe(bucket_id)) + } + + fn unsubscribe_partition(&self, partition_id: PartitionId, bucket_id: i32) -> ffi::FfiResult { + dispatch_scanner!(self, unsubscribe_partition(partition_id, bucket_id)) + } + + fn poll(&self, timeout_ms: i64) -> Box { + let ScannerKind::Record(ref inner) = self.scanner else { + return Box::new(ScanResultInner::from_error( + CLIENT_ERROR_CODE, + "Record-based scanner not available".to_string(), + )); + }; + + let timeout = Duration::from_millis(timeout_ms.max(0) as u64); + let result = RUNTIME.block_on(async { inner.poll(timeout).await }); + + match result { + Ok(records) => { + let columns = self.projected_columns.clone(); + let mut total_count = 0usize; + let mut buckets = Vec::new(); + let mut bucket_infos = Vec::new(); + for (table_bucket, bucket_records) in records.into_records_by_buckets() { + let count = bucket_records.len(); + total_count += count; + bucket_infos.push(ffi::FfiBucketInfo { + table_id: table_bucket.table_id(), + bucket_id: table_bucket.bucket_id(), + has_partition_id: table_bucket.partition_id().is_some(), + partition_id: table_bucket.partition_id().unwrap_or(0), + record_count: count, + }); + buckets.push((table_bucket, bucket_records)); + } + Box::new(ScanResultInner { + error: None, + buckets, + columns, + bucket_infos, + total_count, + }) + } + Err(e) => { + let ffi_err = err_from_core_error(&e); + Box::new(ScanResultInner::from_error( + ffi_err.error_code, + ffi_err.error_message, + )) + } + } + } + + fn poll_record_batch(&self, timeout_ms: i64) -> ffi::FfiArrowRecordBatchesResult { + let ScannerKind::Batch(ref inner_batch) = self.scanner else { + return ffi::FfiArrowRecordBatchesResult { + result: client_err("Batch-based scanner not available".to_string()), + arrow_batches: ffi::FfiArrowRecordBatches { batches: vec![] }, + }; + }; + + let timeout = Duration::from_millis(timeout_ms.max(0) as u64); + let result = RUNTIME.block_on(async { 
inner_batch.poll(timeout).await }); + + match result { + Ok(batches) => match types::core_scan_batches_to_ffi(&batches) { + Ok(arrow_batches) => ffi::FfiArrowRecordBatchesResult { + result: ok_result(), + arrow_batches, + }, + Err(e) => ffi::FfiArrowRecordBatchesResult { + result: client_err(e), + arrow_batches: ffi::FfiArrowRecordBatches { batches: vec![] }, + }, + }, + Err(e) => ffi::FfiArrowRecordBatchesResult { + result: err_from_core_error(&e), + arrow_batches: ffi::FfiArrowRecordBatches { batches: vec![] }, + }, + } + } +} + +// ============================================================================ +// Opaque types: GenericRowInner (write path) +// ============================================================================ + +pub struct GenericRowInner { + row: fcore::row::GenericRow<'static>, +} + +fn new_generic_row(field_count: usize) -> Box { + Box::new(GenericRowInner { + row: fcore::row::GenericRow::new(field_count), + }) +} + +impl GenericRowInner { + fn gr_reset(&mut self) { + let len = self.row.values.len(); + self.row = fcore::row::GenericRow::new(len); + } + + fn gr_set_null(&mut self, idx: usize) { + self.ensure_size(idx); + self.row.set_field(idx, fcore::row::Datum::Null); + } + + fn gr_set_bool(&mut self, idx: usize, val: bool) { + self.ensure_size(idx); + self.row.set_field(idx, fcore::row::Datum::Bool(val)); + } + + fn gr_set_i32(&mut self, idx: usize, val: i32) { + self.ensure_size(idx); + self.row.set_field(idx, fcore::row::Datum::Int32(val)); + } + + fn gr_set_i64(&mut self, idx: usize, val: i64) { + self.ensure_size(idx); + self.row.set_field(idx, fcore::row::Datum::Int64(val)); + } + + fn gr_set_f32(&mut self, idx: usize, val: f32) { + self.ensure_size(idx); + self.row + .set_field(idx, fcore::row::Datum::Float32(val.into())); + } + + fn gr_set_f64(&mut self, idx: usize, val: f64) { + self.ensure_size(idx); + self.row + .set_field(idx, fcore::row::Datum::Float64(val.into())); + } + + fn gr_set_str(&mut self, idx: usize, val: &str) 
{ + self.ensure_size(idx); + self.row.set_field( + idx, + fcore::row::Datum::String(std::borrow::Cow::Owned(val.to_string())), + ); + } + + fn gr_set_bytes(&mut self, idx: usize, val: &[u8]) { + self.ensure_size(idx); + self.row.set_field( + idx, + fcore::row::Datum::Blob(std::borrow::Cow::Owned(val.to_vec())), + ); + } + + fn gr_set_date(&mut self, idx: usize, days: i32) { + self.ensure_size(idx); + self.row + .set_field(idx, fcore::row::Datum::Date(fcore::row::Date::new(days))); + } + + fn gr_set_time(&mut self, idx: usize, millis: i32) { + self.ensure_size(idx); + self.row + .set_field(idx, fcore::row::Datum::Time(fcore::row::Time::new(millis))); + } + + fn gr_set_ts_ntz(&mut self, idx: usize, millis: i64, nanos: i32) { + self.ensure_size(idx); + // Use from_millis_nanos, falling back to millis-only on error + let ts = fcore::row::TimestampNtz::from_millis_nanos(millis, nanos) + .unwrap_or_else(|_| fcore::row::TimestampNtz::new(millis)); + self.row.set_field(idx, fcore::row::Datum::TimestampNtz(ts)); + } + + fn gr_set_ts_ltz(&mut self, idx: usize, millis: i64, nanos: i32) { + self.ensure_size(idx); + let ts = fcore::row::TimestampLtz::from_millis_nanos(millis, nanos) + .unwrap_or_else(|_| fcore::row::TimestampLtz::new(millis)); + self.row.set_field(idx, fcore::row::Datum::TimestampLtz(ts)); + } + + fn gr_set_decimal_str(&mut self, idx: usize, val: &str) { + self.ensure_size(idx); + // Store as string; resolve_row_types() will parse and validate against schema + self.row.set_field( + idx, + fcore::row::Datum::String(std::borrow::Cow::Owned(val.to_string())), + ); + } + + fn gr_set_array(&mut self, idx: usize, writer: &mut ArrayWriterInner) -> Result<(), String> { + self.ensure_size(idx); + writer.complete_if_needed()?; + let arr = writer.completed.take().ok_or_else(|| { + "ArrayWriter invariant violation: completed array missing after finalize".to_string() + })?; + self.row.set_field(idx, fcore::row::Datum::Array(arr)); + Ok(()) + } + + fn ensure_size(&mut self, 
idx: usize) { + if self.row.values.len() <= idx { + self.row.values.resize(idx + 1, fcore::row::Datum::Null); + } + } +} + +// ============================================================================ +// Shared row-reading helpers (used by both ScanResultInner and LookupResultInner) +// ============================================================================ + +mod row_reader { + use super::array_reader; + use fcore::row::InternalRow; + use fluss as fcore; + + use crate::types; + + /// Get column at `field`, or error if out of bounds. + fn get_column( + columns: &[fcore::metadata::Column], + field: usize, + ) -> Result<&fcore::metadata::Column, String> { + columns.get(field).ok_or_else(|| { + format!( + "field index {field} out of range ({} columns)", + columns.len() + ) + }) + } + + /// Validate bounds, null, and type compatibility in a single pass. + /// Returns the data type on success for callers that need to dispatch on it. + fn validate<'a>( + row: &dyn InternalRow, + columns: &'a [fcore::metadata::Column], + field: usize, + getter: &str, + allowed: impl FnOnce(&fcore::metadata::DataType) -> bool, + ) -> Result<&'a fcore::metadata::DataType, String> { + let col = get_column(columns, field)?; + if row.is_null_at(field).map_err(|e| e.to_string())? 
{ + return Err(format!("field {field} is null")); + } + let dt = col.data_type(); + if !allowed(dt) { + return Err(format!( + "{getter}: column {field} has incompatible type {dt}" + )); + } + Ok(dt) + } + + pub fn column_type(columns: &[fcore::metadata::Column], field: usize) -> Result { + Ok(types::core_data_type_to_ffi( + get_column(columns, field)?.data_type(), + )) + } + + pub fn column_name(columns: &[fcore::metadata::Column], field: usize) -> Result<&str, String> { + Ok(get_column(columns, field)?.name()) + } + + pub fn is_null( + row: &dyn InternalRow, + columns: &[fcore::metadata::Column], + field: usize, + ) -> Result { + get_column(columns, field)?; + row.is_null_at(field).map_err(|e| e.to_string()) + } + + pub fn get_bool( + row: &dyn InternalRow, + columns: &[fcore::metadata::Column], + field: usize, + ) -> Result { + validate(row, columns, field, "get_bool", |dt| { + matches!(dt, fcore::metadata::DataType::Boolean(_)) + })?; + row.get_boolean(field).map_err(|e| e.to_string()) + } + + pub fn get_i32( + row: &dyn InternalRow, + columns: &[fcore::metadata::Column], + field: usize, + ) -> Result { + let dt = validate(row, columns, field, "get_i32", |dt| { + matches!( + dt, + fcore::metadata::DataType::TinyInt(_) + | fcore::metadata::DataType::SmallInt(_) + | fcore::metadata::DataType::Int(_) + ) + })?; + match dt { + fcore::metadata::DataType::TinyInt(_) => row + .get_byte(field) + .map(|v| v as i32) + .map_err(|e| e.to_string()), + fcore::metadata::DataType::SmallInt(_) => row + .get_short(field) + .map(|v| v as i32) + .map_err(|e| e.to_string()), + _ => row.get_int(field).map_err(|e| e.to_string()), + } + } + + pub fn get_i64( + row: &dyn InternalRow, + columns: &[fcore::metadata::Column], + field: usize, + ) -> Result { + validate(row, columns, field, "get_i64", |dt| { + matches!(dt, fcore::metadata::DataType::BigInt(_)) + })?; + row.get_long(field).map_err(|e| e.to_string()) + } + + pub fn get_f32( + row: &dyn InternalRow, + columns: 
&[fcore::metadata::Column], + field: usize, + ) -> Result { + validate(row, columns, field, "get_f32", |dt| { + matches!(dt, fcore::metadata::DataType::Float(_)) + })?; + row.get_float(field).map_err(|e| e.to_string()) + } + + pub fn get_f64( + row: &dyn InternalRow, + columns: &[fcore::metadata::Column], + field: usize, + ) -> Result { + validate(row, columns, field, "get_f64", |dt| { + matches!(dt, fcore::metadata::DataType::Double(_)) + })?; + row.get_double(field).map_err(|e| e.to_string()) + } + + pub fn get_str<'a>( + row: &'a dyn InternalRow, + columns: &'a [fcore::metadata::Column], + field: usize, + ) -> Result<&'a str, String> { + let dt = validate(row, columns, field, "get_str", |dt| { + matches!( + dt, + fcore::metadata::DataType::Char(_) | fcore::metadata::DataType::String(_) + ) + })?; + match dt { + fcore::metadata::DataType::Char(ct) => row + .get_char(field, ct.length() as usize) + .map_err(|e| e.to_string()), + _ => row.get_string(field).map_err(|e| e.to_string()), + } + } + + pub fn get_bytes<'a>( + row: &'a dyn InternalRow, + columns: &'a [fcore::metadata::Column], + field: usize, + ) -> Result<&'a [u8], String> { + let dt = validate(row, columns, field, "get_bytes", |dt| { + matches!( + dt, + fcore::metadata::DataType::Binary(_) | fcore::metadata::DataType::Bytes(_) + ) + })?; + match dt { + fcore::metadata::DataType::Binary(bt) => row + .get_binary(field, bt.length()) + .map_err(|e| e.to_string()), + _ => row.get_bytes(field).map_err(|e| e.to_string()), + } + } + + pub fn get_date_days( + row: &dyn InternalRow, + columns: &[fcore::metadata::Column], + field: usize, + ) -> Result { + validate(row, columns, field, "get_date_days", |dt| { + matches!(dt, fcore::metadata::DataType::Date(_)) + })?; + row.get_date(field) + .map(|d| d.get_inner()) + .map_err(|e| e.to_string()) + } + + pub fn get_time_millis( + row: &dyn InternalRow, + columns: &[fcore::metadata::Column], + field: usize, + ) -> Result { + validate(row, columns, field, 
"get_time_millis", |dt| { + matches!(dt, fcore::metadata::DataType::Time(_)) + })?; + row.get_time(field) + .map(|t| t.get_inner()) + .map_err(|e| e.to_string()) + } + + pub fn get_ts_millis( + row: &dyn InternalRow, + columns: &[fcore::metadata::Column], + field: usize, + ) -> Result { + let dt = validate(row, columns, field, "get_ts_millis", |dt| { + matches!( + dt, + fcore::metadata::DataType::Timestamp(_) + | fcore::metadata::DataType::TimestampLTz(_) + ) + })?; + match dt { + fcore::metadata::DataType::TimestampLTz(ts) => row + .get_timestamp_ltz(field, ts.precision()) + .map(|v| v.get_epoch_millisecond()) + .map_err(|e| e.to_string()), + fcore::metadata::DataType::Timestamp(ts) => row + .get_timestamp_ntz(field, ts.precision()) + .map(|v| v.get_millisecond()) + .map_err(|e| e.to_string()), + dt => Err(format!("get_ts_millis: unexpected type {dt}")), + } + } + + pub fn get_ts_nanos( + row: &dyn InternalRow, + columns: &[fcore::metadata::Column], + field: usize, + ) -> Result { + let dt = validate(row, columns, field, "get_ts_nanos", |dt| { + matches!( + dt, + fcore::metadata::DataType::Timestamp(_) + | fcore::metadata::DataType::TimestampLTz(_) + ) + })?; + match dt { + fcore::metadata::DataType::TimestampLTz(ts) => row + .get_timestamp_ltz(field, ts.precision()) + .map(|v| v.get_nano_of_millisecond()) + .map_err(|e| e.to_string()), + fcore::metadata::DataType::Timestamp(ts) => row + .get_timestamp_ntz(field, ts.precision()) + .map(|v| v.get_nano_of_millisecond()) + .map_err(|e| e.to_string()), + dt => Err(format!("get_ts_nanos: unexpected type {dt}")), + } + } + + pub fn is_ts_ltz(columns: &[fcore::metadata::Column], field: usize) -> Result { + Ok(matches!( + get_column(columns, field)?.data_type(), + fcore::metadata::DataType::TimestampLTz(_) + )) + } + + pub fn get_decimal_str( + row: &dyn InternalRow, + columns: &[fcore::metadata::Column], + field: usize, + ) -> Result { + let dt = validate(row, columns, field, "get_decimal_str", |dt| { + matches!(dt, 
fcore::metadata::DataType::Decimal(_)) + })?; + match dt { + fcore::metadata::DataType::Decimal(dd) => { + let decimal = row + .get_decimal(field, dd.precision() as usize, dd.scale() as usize) + .map_err(|e| e.to_string())?; + Ok(decimal.to_big_decimal().to_string()) + } + dt => Err(format!("get_decimal_str: unexpected type {dt}")), + } + } + + fn get_fluss_array( + row: &dyn InternalRow, + columns: &[fcore::metadata::Column], + field: usize, + ) -> Result { + validate(row, columns, field, "get_array", |dt| { + matches!(dt, fcore::metadata::DataType::Array(_)) + })?; + row.get_array(field).map_err(|e| e.to_string()) + } + + pub fn get_array_element_type( + columns: &[fcore::metadata::Column], + field: usize, + ) -> Result<&fcore::metadata::DataType, String> { + let col = get_column(columns, field)?; + match col.data_type() { + fcore::metadata::DataType::Array(at) => Ok(at.get_element_type()), + dt => Err(format!("get_array: column {field} is not Array, got {dt}")), + } + } + + pub fn get_array_size( + row: &dyn InternalRow, + columns: &[fcore::metadata::Column], + field: usize, + ) -> Result { + let arr = get_fluss_array(row, columns, field)?; + Ok(arr.size()) + } + + pub fn get_array_and_elem_type<'a>( + row: &dyn InternalRow, + columns: &'a [fcore::metadata::Column], + field: usize, + ) -> Result< + ( + fcore::row::binary_array::FlussArray, + &'a fcore::metadata::DataType, + ), + String, + > { + let arr = get_fluss_array(row, columns, field)?; + let elem = get_array_element_type(columns, field)?; + Ok((arr, elem)) + } + + pub fn get_array_is_null( + row: &dyn InternalRow, + columns: &[fcore::metadata::Column], + field: usize, + element: usize, + ) -> Result { + let arr = get_fluss_array(row, columns, field)?; + array_reader::is_null(&arr, element) + } + + pub fn get_array_bool( + row: &dyn InternalRow, + columns: &[fcore::metadata::Column], + field: usize, + element: usize, + ) -> Result { + let (arr, elem) = get_array_and_elem_type(row, columns, field)?; + 
array_reader::get_bool(&arr, elem, element) + } + + pub fn get_array_i32( + row: &dyn InternalRow, + columns: &[fcore::metadata::Column], + field: usize, + element: usize, + ) -> Result { + let (arr, elem) = get_array_and_elem_type(row, columns, field)?; + array_reader::get_i32(&arr, elem, element) + } + + pub fn get_array_i64( + row: &dyn InternalRow, + columns: &[fcore::metadata::Column], + field: usize, + element: usize, + ) -> Result { + let (arr, elem) = get_array_and_elem_type(row, columns, field)?; + array_reader::get_i64(&arr, elem, element) + } + + pub fn get_array_f32( + row: &dyn InternalRow, + columns: &[fcore::metadata::Column], + field: usize, + element: usize, + ) -> Result { + let (arr, elem) = get_array_and_elem_type(row, columns, field)?; + array_reader::get_f32(&arr, elem, element) + } + + pub fn get_array_f64( + row: &dyn InternalRow, + columns: &[fcore::metadata::Column], + field: usize, + element: usize, + ) -> Result { + let (arr, elem) = get_array_and_elem_type(row, columns, field)?; + array_reader::get_f64(&arr, elem, element) + } + + pub fn get_array_str( + row: &dyn InternalRow, + columns: &[fcore::metadata::Column], + field: usize, + element: usize, + ) -> Result { + let (arr, elem) = get_array_and_elem_type(row, columns, field)?; + array_reader::get_str(&arr, elem, element) + } + + pub fn get_array_bytes( + row: &dyn InternalRow, + columns: &[fcore::metadata::Column], + field: usize, + element: usize, + ) -> Result, String> { + let (arr, elem) = get_array_and_elem_type(row, columns, field)?; + array_reader::get_bytes(&arr, elem, element) + } + + pub fn get_array_date_days( + row: &dyn InternalRow, + columns: &[fcore::metadata::Column], + field: usize, + element: usize, + ) -> Result { + let (arr, elem) = get_array_and_elem_type(row, columns, field)?; + array_reader::get_date_days(&arr, elem, element) + } + + pub fn get_array_time_millis( + row: &dyn InternalRow, + columns: &[fcore::metadata::Column], + field: usize, + element: usize, + 
) -> Result { + let (arr, elem) = get_array_and_elem_type(row, columns, field)?; + array_reader::get_time_millis(&arr, elem, element) + } + + pub fn get_array_ts_millis( + row: &dyn InternalRow, + columns: &[fcore::metadata::Column], + field: usize, + element: usize, + ) -> Result { + let (arr, elem) = get_array_and_elem_type(row, columns, field)?; + array_reader::get_ts_millis(&arr, elem, element) + } + + pub fn get_array_ts_nanos( + row: &dyn InternalRow, + columns: &[fcore::metadata::Column], + field: usize, + element: usize, + ) -> Result { + let (arr, elem) = get_array_and_elem_type(row, columns, field)?; + array_reader::get_ts_nanos(&arr, elem, element) + } + + pub fn get_array_decimal_str( + row: &dyn InternalRow, + columns: &[fcore::metadata::Column], + field: usize, + element: usize, + ) -> Result { + let (arr, elem) = get_array_and_elem_type(row, columns, field)?; + array_reader::get_decimal_str(&arr, elem, element) + } + + pub fn get_array_element_type_id( + columns: &[fcore::metadata::Column], + field: usize, + ) -> Result { + let elem_type = get_array_element_type(columns, field)?; + Ok(crate::types::core_data_type_to_ffi(elem_type)) + } +} + +// ============================================================================ +// array_reader — low-level accessors over an already-resolved FlussArray +// +// Shared by the top-level `row_reader::get_array_*` wrappers and by +// `ArrayViewInner` (which exposes recursive/nested access to C++). Keeping +// one implementation here guarantees identical bounds-checking, null +// validation, type checking, and type dispatch across flat and nested reads. 
// ============================================================================

// NOTE(review): several `Result`, `Vec`, `Option` and `Box` types in this
// section appear without their generic arguments (e.g. `-> Result {`). That
// looks like extraction damage rather than intent; confirm against upstream
// before relying on these signatures.

/// Element-level readers over a `FlussArray`.
///
/// Every typed getter validates, in this order: the element index, the
/// declared element type, and that the element is not null. Failures are
/// reported as `String` errors prefixed with the operation name.
mod array_reader {
    use super::fcore;

    /// Returns `Ok(())` when `element < arr.size()`, otherwise an
    /// out-of-bounds error that names `op`, the index and the array size.
    fn validate_index(
        arr: &fcore::row::binary_array::FlussArray,
        element: usize,
        op: &str,
    ) -> Result<(), String> {
        if element < arr.size() {
            Ok(())
        } else {
            Err(format!(
                "{op}: element index out of bounds: element={element}, size={}",
                arr.size()
            ))
        }
    }

    /// Fails when the element at `element` is null; the message points the
    /// caller at `array_is_null`.
    fn ensure_non_null(
        arr: &fcore::row::binary_array::FlussArray,
        element: usize,
        op: &str,
    ) -> Result<(), String> {
        if arr.is_null_at(element) {
            Err(format!(
                "{op}: element at index {element} is null; call array_is_null first"
            ))
        } else {
            Ok(())
        }
    }

    /// Fails unless the `allowed` predicate accepts `elem_type`. `expected`
    /// is only used to build the error message.
    fn ensure_type(
        elem_type: &fcore::metadata::DataType,
        op: &str,
        expected: &str,
        allowed: impl FnOnce(&fcore::metadata::DataType) -> bool,
    ) -> Result<(), String> {
        if allowed(elem_type) {
            Ok(())
        } else {
            Err(format!(
                "{op}: element type is {elem_type}, expected {expected}"
            ))
        }
    }

    /// Combined precondition for typed getters: index check, then type
    /// check, then null check. The first failing check decides the error.
    fn ensure_readable(
        arr: &fcore::row::binary_array::FlussArray,
        elem_type: &fcore::metadata::DataType,
        element: usize,
        op: &str,
        expected: &str,
        allowed: impl FnOnce(&fcore::metadata::DataType) -> bool,
    ) -> Result<(), String> {
        validate_index(arr, element, op)?;
        ensure_type(elem_type, op, expected, allowed)?;
        ensure_non_null(arr, element, op)
    }

    /// Reports whether the element is null, after bounds-checking the index.
    pub fn is_null(
        arr: &fcore::row::binary_array::FlussArray,
        element: usize,
    ) -> Result {
        validate_index(arr, element, "array_is_null")?;
        Ok(arr.is_null_at(element))
    }

    /// Reads a BOOLEAN element.
    pub fn get_bool(
        arr: &fcore::row::binary_array::FlussArray,
        elem_type: &fcore::metadata::DataType,
        element: usize,
    ) -> Result {
        ensure_readable(arr, elem_type, element, "array_bool", "BOOLEAN", |dt| {
            matches!(dt, fcore::metadata::DataType::Boolean(_))
        })?;
        arr.get_boolean(element).map_err(|e| e.to_string())
    }

    /// Reads a TINYINT, SMALLINT or INT element, widening the two narrower
    /// types to `i32` with `as`.
    pub fn get_i32(
        arr: &fcore::row::binary_array::FlussArray,
        elem_type: &fcore::metadata::DataType,
        element: usize,
    ) -> Result {
        ensure_readable(
            arr,
            elem_type,
            element,
            "array_i32",
            "TINYINT/SMALLINT/INT",
            |dt| {
                matches!(
                    dt,
                    fcore::metadata::DataType::TinyInt(_)
                        | fcore::metadata::DataType::SmallInt(_)
                        | fcore::metadata::DataType::Int(_)
                )
            },
        )?;
        match elem_type {
            fcore::metadata::DataType::TinyInt(_) => arr
                .get_byte(element)
                .map(|v| v as i32)
                .map_err(|e| e.to_string()),
            fcore::metadata::DataType::SmallInt(_) => arr
                .get_short(element)
                .map(|v| v as i32)
                .map_err(|e| e.to_string()),
            fcore::metadata::DataType::Int(_) => arr.get_int(element).map_err(|e| e.to_string()),
            // Any other variant was already rejected by the predicate above.
            _ => unreachable!("type validated by ensure_readable"),
        }
    }

    /// Reads a BIGINT element.
    pub fn get_i64(
        arr: &fcore::row::binary_array::FlussArray,
        elem_type: &fcore::metadata::DataType,
        element: usize,
    ) -> Result {
        ensure_readable(arr, elem_type, element, "array_i64", "BIGINT", |dt| {
            matches!(dt, fcore::metadata::DataType::BigInt(_))
        })?;
        arr.get_long(element).map_err(|e| e.to_string())
    }

    /// Reads a FLOAT element.
    pub fn get_f32(
        arr: &fcore::row::binary_array::FlussArray,
        elem_type: &fcore::metadata::DataType,
        element: usize,
    ) -> Result {
        ensure_readable(arr, elem_type, element, "array_f32", "FLOAT", |dt| {
            matches!(dt, fcore::metadata::DataType::Float(_))
        })?;
        arr.get_float(element).map_err(|e| e.to_string())
    }

    /// Reads a DOUBLE element.
    pub fn get_f64(
        arr: &fcore::row::binary_array::FlussArray,
        elem_type: &fcore::metadata::DataType,
        element: usize,
    ) -> Result {
        ensure_readable(arr, elem_type, element, "array_f64", "DOUBLE", |dt| {
            matches!(dt, fcore::metadata::DataType::Double(_))
        })?;
        arr.get_double(element).map_err(|e| e.to_string())
    }

    /// Reads a STRING or CHAR element and returns it via `to_string()`.
    pub fn get_str(
        arr: &fcore::row::binary_array::FlussArray,
        elem_type: &fcore::metadata::DataType,
        element: usize,
    ) -> Result {
        ensure_readable(arr, elem_type, element, "array_str", "STRING/CHAR", |dt| {
            matches!(
                dt,
                fcore::metadata::DataType::String(_) | fcore::metadata::DataType::Char(_)
            )
        })?;
        arr.get_string(element)
            .map(|s| s.to_string())
            .map_err(|e| e.to_string())
    }

    /// Reads a BYTES or BINARY element and returns it via `to_vec()`.
    pub fn get_bytes(
        arr: &fcore::row::binary_array::FlussArray,
        elem_type: &fcore::metadata::DataType,
        element: usize,
    ) -> Result, String> {
        ensure_readable(
            arr,
            elem_type,
            element,
            "array_bytes",
            "BYTES/BINARY",
            |dt| {
                matches!(
                    dt,
                    fcore::metadata::DataType::Bytes(_) | fcore::metadata::DataType::Binary(_)
                )
            },
        )?;
        arr.get_binary(element)
            .map(|b| b.to_vec())
            .map_err(|e| e.to_string())
    }

    /// Reads a DATE element and returns the date's inner value
    /// (presumably days since epoch, per the function name — TODO confirm).
    pub fn get_date_days(
        arr: &fcore::row::binary_array::FlussArray,
        elem_type: &fcore::metadata::DataType,
        element: usize,
    ) -> Result {
        ensure_readable(arr, elem_type, element, "array_date", "DATE", |dt| {
            matches!(dt, fcore::metadata::DataType::Date(_))
        })?;
        arr.get_date(element)
            .map(|d| d.get_inner())
            .map_err(|e| e.to_string())
    }

    /// Reads a TIME element and returns the time's inner value
    /// (presumably milliseconds, per the function name — TODO confirm).
    pub fn get_time_millis(
        arr: &fcore::row::binary_array::FlussArray,
        elem_type: &fcore::metadata::DataType,
        element: usize,
    ) -> Result {
        ensure_readable(arr, elem_type, element, "array_time", "TIME", |dt| {
            matches!(dt, fcore::metadata::DataType::Time(_))
        })?;
        arr.get_time(element)
            .map(|t| t.get_inner())
            .map_err(|e| e.to_string())
    }

    /// Reads the millisecond component of a TIMESTAMP or TIMESTAMP_LTZ
    /// element, decoding with the precision carried by the element type.
    pub fn get_ts_millis(
        arr: &fcore::row::binary_array::FlussArray,
        elem_type: &fcore::metadata::DataType,
        element: usize,
    ) -> Result {
        ensure_readable(
            arr,
            elem_type,
            element,
            "array_ts_millis",
            "TIMESTAMP/TIMESTAMP_LTZ",
            |dt| {
                matches!(
                    dt,
                    fcore::metadata::DataType::Timestamp(_)
                        | fcore::metadata::DataType::TimestampLTz(_)
                )
            },
        )?;
        match elem_type {
            fcore::metadata::DataType::TimestampLTz(ts) => arr
                .get_timestamp_ltz(element, ts.precision())
                .map(|v| v.get_epoch_millisecond())
                .map_err(|e| e.to_string()),
            fcore::metadata::DataType::Timestamp(ts) => arr
                .get_timestamp_ntz(element, ts.precision())
                .map(|v| v.get_millisecond())
                .map_err(|e| e.to_string()),
            _ => unreachable!("type validated by ensure_readable"),
        }
    }

    /// Reads the nano-of-millisecond component of a TIMESTAMP or
    /// TIMESTAMP_LTZ element; the companion of `get_ts_millis`.
    pub fn get_ts_nanos(
        arr: &fcore::row::binary_array::FlussArray,
        elem_type: &fcore::metadata::DataType,
        element: usize,
    ) -> Result {
        ensure_readable(
            arr,
            elem_type,
            element,
            "array_ts_nanos",
            "TIMESTAMP/TIMESTAMP_LTZ",
            |dt| {
                matches!(
                    dt,
                    fcore::metadata::DataType::Timestamp(_)
                        | fcore::metadata::DataType::TimestampLTz(_)
                )
            },
        )?;
        match elem_type {
            fcore::metadata::DataType::TimestampLTz(ts) => arr
                .get_timestamp_ltz(element, ts.precision())
                .map(|v| v.get_nano_of_millisecond())
                .map_err(|e| e.to_string()),
            fcore::metadata::DataType::Timestamp(ts) => arr
                .get_timestamp_ntz(element, ts.precision())
                .map(|v| v.get_nano_of_millisecond())
                .map_err(|e| e.to_string()),
            _ => unreachable!("type validated by ensure_readable"),
        }
    }

    /// Reads a DECIMAL element using the precision and scale from the
    /// element type and renders it through `to_big_decimal().to_string()`.
    pub fn get_decimal_str(
        arr: &fcore::row::binary_array::FlussArray,
        elem_type: &fcore::metadata::DataType,
        element: usize,
    ) -> Result {
        ensure_readable(arr, elem_type, element, "array_decimal", "DECIMAL", |dt| {
            matches!(dt, fcore::metadata::DataType::Decimal(_))
        })?;
        match elem_type {
            fcore::metadata::DataType::Decimal(dd) => {
                let decimal = arr
                    .get_decimal(element, dd.precision(), dd.scale())
                    .map_err(|e| e.to_string())?;
                Ok(decimal.to_big_decimal().to_string())
            }
            _ => unreachable!("type validated by ensure_readable"),
        }
    }

    /// Reads an ARRAY element and returns the nested array together with a
    /// clone of the nested element type.
    pub fn get_nested_array(
        arr: &fcore::row::binary_array::FlussArray,
        elem_type: &fcore::metadata::DataType,
        element: usize,
    ) -> Result<
        (
            fcore::row::binary_array::FlussArray,
            fcore::metadata::DataType,
        ),
        String,
    > {
        ensure_readable(arr, elem_type, element, "array_nested", "ARRAY", |dt| {
            matches!(dt, fcore::metadata::DataType::Array(_))
        })?;
        match elem_type {
            fcore::metadata::DataType::Array(at) => {
                let nested = arr.get_array(element).map_err(|e| e.to_string())?;
                Ok((nested, at.get_element_type().clone()))
            }
            _ => unreachable!("type validated by ensure_readable"),
        }
    }
}

// ============================================================================
// Macros that generate uniform sv_/lv_ array element getters (thin wrappers
// that only forward to `row_reader::get_array_*`).
// ============================================================================

// Expands each `method, reader_fn, ret;` triple into a scan-path method
// taking (bucket, rec, field, element). The row is looked up with
// `self.resolve`, so the expansion must sit inside `impl ScanResultInner`.
macro_rules! sv_array_element_getters {
    ($( $method:ident, $reader_fn:ident, $ret:ty; )+) => {
        $(
            fn $method(
                &self,
                bucket: usize,
                rec: usize,
                field: usize,
                element: usize,
            ) -> Result<$ret, String> {
                row_reader::$reader_fn(
                    self.resolve(bucket, rec).row(),
                    &self.columns,
                    field,
                    element,
                )
            }
        )+
    };
}

// Lookup-path counterpart: each generated method takes (field, element) and
// fails early through `self.lv_row()` when no row is present.
macro_rules! lv_array_element_getters {
    ($( $method:ident, $reader_fn:ident, $ret:ty; )+) => {
        $(
            fn $method(&self, field: usize, element: usize) -> Result<$ret, String> {
                let r = self.lv_row()?;
                row_reader::$reader_fn(r, &self.columns, field, element)
            }
        )+
    };
}

// ============================================================================
// Opaque types: ScanResultInner (scan read path)
// ============================================================================

/// Result of a scan: either an `(code, message)` error or per-bucket records
/// plus the column metadata used to decode them.
pub struct ScanResultInner {
    error: Option<(i32, String)>,
    buckets: Vec<(fcore::metadata::TableBucket, Vec)>,
    columns: Vec,
    bucket_infos: Vec,
    total_count: usize,
}

impl ScanResultInner {
    /// Builds an error-only result with no buckets, columns or records.
    fn from_error(code: i32, msg: String) -> Self {
        Self {
            error: Some((code, msg)),
            buckets: Vec::new(),
            columns: Vec::new(),
            bucket_infos: Vec::new(),
            total_count: 0,
        }
    }

    /// Returns the record at `rec` inside bucket `bucket` by direct
    /// indexing; no bounds handling happens here, so callers are expected to
    /// pass valid indices (see the accessor note below).
    fn resolve(&self, bucket: usize, rec: usize) -> &fcore::record::ScanRecord {
        &self.buckets[bucket].1[rec]
    }

    fn sv_has_error(&self) -> bool {
        self.error.is_some()
    }

    /// Error code, or 0 when there is no error.
    fn sv_error_code(&self) -> i32 {
        self.error.as_ref().map_or(0, |e| e.0)
    }

    /// Error message, or the empty string when there is no error.
    fn sv_error_message(&self) -> &str {
        self.error.as_ref().map_or("", |e| e.1.as_str())
    }

    fn sv_record_count(&self) -> usize {
        self.total_count
    }

    fn sv_column_count(&self) -> usize {
        self.columns.len()
    }
    fn sv_column_name(&self, field: usize) -> Result<&str, String> {
        row_reader::column_name(&self.columns, field)
    }
    fn sv_column_type(&self, field: usize) -> Result {
        row_reader::column_type(&self.columns, field)
    }

    // Per-record metadata.
    fn sv_offset(&self, bucket: usize, rec: usize) -> i64 {
        self.resolve(bucket, rec).offset()
    }
    fn sv_timestamp(&self, bucket: usize, rec: usize) -> i64 {
        self.resolve(bucket, rec).timestamp()
    }
    fn sv_change_type(&self, bucket: usize, rec: usize) -> i32 {
        self.resolve(bucket, rec).change_type().to_byte_value() as i32
    }
    // Same value as `sv_column_count`.
    fn sv_field_count(&self) -> usize {
        self.columns.len()
    }

    // Field accessors — C++ validates bounds in BucketRecords/RecordAt, validate() checks field.
    // Each one resolves the record and forwards to the matching `row_reader` helper.
    fn sv_is_null(&self, bucket: usize, rec: usize, field: usize) -> Result {
        row_reader::is_null(self.resolve(bucket, rec).row(), &self.columns, field)
    }
    fn sv_get_bool(&self, bucket: usize, rec: usize, field: usize) -> Result {
        row_reader::get_bool(self.resolve(bucket, rec).row(), &self.columns, field)
    }
    fn sv_get_i32(&self, bucket: usize, rec: usize, field: usize) -> Result {
        row_reader::get_i32(self.resolve(bucket, rec).row(), &self.columns, field)
    }
    fn sv_get_i64(&self, bucket: usize, rec: usize, field: usize) -> Result {
        row_reader::get_i64(self.resolve(bucket, rec).row(), &self.columns, field)
    }
    fn sv_get_f32(&self, bucket: usize, rec: usize, field: usize) -> Result {
        row_reader::get_f32(self.resolve(bucket, rec).row(), &self.columns, field)
    }
    fn sv_get_f64(&self, bucket: usize, rec: usize, field: usize) -> Result {
        row_reader::get_f64(self.resolve(bucket, rec).row(), &self.columns, field)
    }
    fn sv_get_str(&self, bucket: usize, rec: usize, field: usize) -> Result<&str, String> {
        row_reader::get_str(self.resolve(bucket, rec).row(), &self.columns, field)
    }
    fn sv_get_bytes(&self, bucket: usize, rec: usize, field: usize) -> Result<&[u8], String> {
        row_reader::get_bytes(self.resolve(bucket, rec).row(), &self.columns, field)
    }
    fn sv_get_date_days(&self, bucket: usize, rec: usize, field: usize) -> Result {
        row_reader::get_date_days(self.resolve(bucket, rec).row(), &self.columns, field)
    }
    fn sv_get_time_millis(&self, bucket: usize, rec: usize, field: usize) -> Result {
        row_reader::get_time_millis(self.resolve(bucket, rec).row(), &self.columns, field)
    }
    fn sv_get_ts_millis(&self, bucket: usize, rec: usize, field: usize) -> Result {
        row_reader::get_ts_millis(self.resolve(bucket, rec).row(), &self.columns, field)
    }
    fn sv_get_ts_nanos(&self, bucket: usize, rec: usize, field: usize) -> Result {
        row_reader::get_ts_nanos(self.resolve(bucket, rec).row(), &self.columns, field)
    }
    // Depends only on column metadata; the bucket and record indices are ignored.
    fn sv_is_ts_ltz(&self, _bucket: usize, _rec: usize, field: usize) -> Result {
        row_reader::is_ts_ltz(&self.columns, field)
    }
    fn sv_get_decimal_str(
        &self,
        bucket: usize,
        rec: usize,
        field: usize,
    ) -> Result {
        row_reader::get_decimal_str(self.resolve(bucket, rec).row(), &self.columns, field)
    }

    fn sv_get_array_size(&self, bucket: usize, rec: usize, field: usize) -> Result {
        row_reader::get_array_size(self.resolve(bucket, rec).row(), &self.columns, field)
    }
    // Generated (bucket, rec, field, element) getters; see the macro above.
    sv_array_element_getters! {
        sv_get_array_is_null, get_array_is_null, bool;
        sv_get_array_bool, get_array_bool, bool;
        sv_get_array_i32, get_array_i32, i32;
        sv_get_array_i64, get_array_i64, i64;
        sv_get_array_f32, get_array_f32, f32;
        sv_get_array_f64, get_array_f64, f64;
        sv_get_array_str, get_array_str, String;
        sv_get_array_bytes, get_array_bytes, Vec;
        sv_get_array_date_days, get_array_date_days, i32;
        sv_get_array_time_millis, get_array_time_millis, i32;
        sv_get_array_ts_millis, get_array_ts_millis, i64;
        sv_get_array_ts_nanos, get_array_ts_nanos, i32;
        sv_get_array_decimal_str, get_array_decimal_str, String;
    }
    fn sv_get_array_element_type(&self, field: usize) -> Result {
        row_reader::get_array_element_type_id(&self.columns, field)
    }
    /// Wraps the array stored in `field` of the addressed record, together
    /// with a clone of its element type, in a boxed `ArrayViewInner`.
    fn sv_get_array_view(
        &self,
        bucket: usize,
        rec: usize,
        field: usize,
    ) -> Result, String> {
        let (arr, elem) = row_reader::get_array_and_elem_type(
            self.resolve(bucket, rec).row(),
            &self.columns,
            field,
        )?;
        Ok(Box::new(ArrayViewInner {
            array: arr,
            element_type: elem.clone(),
        }))
    }

    fn sv_bucket_infos(&self) -> &Vec {
        &self.bucket_infos
    }
}

// ============================================================================
// Opaque types: LookupResultInner (lookup read path)
// ============================================================================

/// Result of a lookup: an optional `(code, message)` error, a `found` flag,
/// at most one row, and the column metadata used to decode it.
pub struct LookupResultInner {
    error: Option<(i32, String)>,
    found: bool,
    row: Option>,
    columns: Vec,
}

impl LookupResultInner {
    /// Builds an error-only result: not found, no row, no columns.
    fn from_error(code: i32, msg: String) -> Self {
        Self {
            error: Some((code, msg)),
            found: false,
            row: None,
            columns: Vec::new(),
        }
    }

    fn lv_has_error(&self) -> bool {
        self.error.is_some()
    }

    /// Error code, or 0 when there is no error.
    fn lv_error_code(&self) -> i32 {
        self.error.as_ref().map_or(0, |e| e.0)
    }

    /// Error message, or the empty string when there is no error.
    fn lv_error_message(&self) -> &str {
        self.error.as_ref().map_or("", |e| e.1.as_str())
    }

    fn lv_found(&self) -> bool {
        self.found
    }

    fn lv_field_count(&self) -> usize {
        self.columns.len()
    }

    fn lv_column_type(&self, field: usize) -> Result {
        row_reader::column_type(&self.columns, field)
    }

    fn lv_column_name(&self, field: usize) -> Result<&str, String> {
        row_reader::column_name(&self.columns, field)
    }

    /// Borrows the stored row, or returns an error when `row` is `None`.
    fn lv_row(&self) -> Result<&fcore::row::GenericRow<'static>, String> {
        self.row
            .as_ref()
            .ok_or_else(|| "no row available (not found or error)".to_string())
    }

    // Field accessors — delegate to shared row_reader helpers.
    // All but `lv_is_ts_ltz` fail first when no row is available.
    fn lv_is_null(&self, field: usize) -> Result {
        let r = self.lv_row()?;
        row_reader::is_null(r, &self.columns, field)
    }
    fn lv_get_bool(&self, field: usize) -> Result {
        let r = self.lv_row()?;
        row_reader::get_bool(r, &self.columns, field)
    }
    fn lv_get_i32(&self, field: usize) -> Result {
        let r = self.lv_row()?;
        row_reader::get_i32(r, &self.columns, field)
    }
    fn lv_get_i64(&self, field: usize) -> Result {
        let r = self.lv_row()?;
        row_reader::get_i64(r, &self.columns, field)
    }
    fn lv_get_f32(&self, field: usize) -> Result {
        let r = self.lv_row()?;
        row_reader::get_f32(r, &self.columns, field)
    }
    fn lv_get_f64(&self, field: usize) -> Result {
        let r = self.lv_row()?;
        row_reader::get_f64(r, &self.columns, field)
    }
    fn lv_get_str(&self, field: usize) -> Result<&str, String> {
        let r = self.lv_row()?;
        row_reader::get_str(r, &self.columns, field)
    }
    fn lv_get_bytes(&self, field: usize) -> Result<&[u8], String> {
        let r = self.lv_row()?;
        row_reader::get_bytes(r, &self.columns, field)
    }
    fn lv_get_date_days(&self, field: usize) -> Result {
        let r = self.lv_row()?;
        row_reader::get_date_days(r, &self.columns, field)
    }
    fn lv_get_time_millis(&self, field: usize) -> Result {
        let r = self.lv_row()?;
        row_reader::get_time_millis(r, &self.columns, field)
    }
    fn lv_get_ts_millis(&self, field: usize) -> Result {
        let r = self.lv_row()?;
        row_reader::get_ts_millis(r, &self.columns, field)
    }
    fn lv_get_ts_nanos(&self, field: usize) -> Result {
        let r = self.lv_row()?;
        row_reader::get_ts_nanos(r, &self.columns, field)
    }
    // Depends only on column metadata, so it works without a row.
    fn lv_is_ts_ltz(&self, field: usize) -> Result {
        row_reader::is_ts_ltz(&self.columns, field)
    }
    fn lv_get_decimal_str(&self, field: usize) -> Result {
        let r = self.lv_row()?;
        row_reader::get_decimal_str(r, &self.columns, field)
    }
    fn lv_get_array_size(&self, field: usize) -> Result {
        let r = self.lv_row()?;
        row_reader::get_array_size(r, &self.columns, field)
    }
    // Generated (field, element) getters; see the macro above.
    lv_array_element_getters! {
        lv_get_array_is_null, get_array_is_null, bool;
        lv_get_array_bool, get_array_bool, bool;
        lv_get_array_i32, get_array_i32, i32;
        lv_get_array_i64, get_array_i64, i64;
        lv_get_array_f32, get_array_f32, f32;
        lv_get_array_f64, get_array_f64, f64;
        lv_get_array_str, get_array_str, String;
        lv_get_array_bytes, get_array_bytes, Vec;
        lv_get_array_date_days, get_array_date_days, i32;
        lv_get_array_time_millis, get_array_time_millis, i32;
        lv_get_array_ts_millis, get_array_ts_millis, i64;
        lv_get_array_ts_nanos, get_array_ts_nanos, i32;
        lv_get_array_decimal_str, get_array_decimal_str, String;
    }
    fn lv_get_array_element_type(&self, field: usize) -> Result {
        row_reader::get_array_element_type_id(&self.columns, field)
    }
    /// Wraps the array stored in `field` of the looked-up row, together with
    /// a clone of its element type, in a boxed `ArrayViewInner`.
    fn lv_get_array_view(&self, field: usize) -> Result, String> {
        let r = self.lv_row()?;
        let (arr, elem) = row_reader::get_array_and_elem_type(r, &self.columns, field)?;
        Ok(Box::new(ArrayViewInner {
            array: arr,
            element_type: elem.clone(),
        }))
    }
}

// ============================================================================
// Opaque types: ArrayViewInner (recursive array reader)
//
// Wraps an owned `FlussArray` plus its element `DataType` and exposes the
// same accessors as `row_reader::get_array_*`, delegating to the shared
// `array_reader` primitives. Enables C++ bindings to recurse into nested
// arrays without per-level FFI scaffolding.
+// ============================================================================ + +pub struct ArrayViewInner { + array: fcore::row::binary_array::FlussArray, + element_type: fcore::metadata::DataType, +} + +impl ArrayViewInner { + fn av_size(&self) -> usize { + self.array.size() + } + + fn av_element_type_id(&self) -> i32 { + crate::types::core_data_type_to_ffi(&self.element_type) + } + + fn av_is_null(&self, element: usize) -> Result { + array_reader::is_null(&self.array, element) + } + + fn av_get_bool(&self, element: usize) -> Result { + array_reader::get_bool(&self.array, &self.element_type, element) + } + + fn av_get_i32(&self, element: usize) -> Result { + array_reader::get_i32(&self.array, &self.element_type, element) + } + + fn av_get_i64(&self, element: usize) -> Result { + array_reader::get_i64(&self.array, &self.element_type, element) + } + + fn av_get_f32(&self, element: usize) -> Result { + array_reader::get_f32(&self.array, &self.element_type, element) + } + + fn av_get_f64(&self, element: usize) -> Result { + array_reader::get_f64(&self.array, &self.element_type, element) + } + + fn av_get_str(&self, element: usize) -> Result { + array_reader::get_str(&self.array, &self.element_type, element) + } + + fn av_get_bytes(&self, element: usize) -> Result, String> { + array_reader::get_bytes(&self.array, &self.element_type, element) + } + + fn av_get_date_days(&self, element: usize) -> Result { + array_reader::get_date_days(&self.array, &self.element_type, element) + } + + fn av_get_time_millis(&self, element: usize) -> Result { + array_reader::get_time_millis(&self.array, &self.element_type, element) + } + + fn av_get_ts_millis(&self, element: usize) -> Result { + array_reader::get_ts_millis(&self.array, &self.element_type, element) + } + + fn av_get_ts_nanos(&self, element: usize) -> Result { + array_reader::get_ts_nanos(&self.array, &self.element_type, element) + } + + fn av_get_decimal_str(&self, element: usize) -> Result { + 
array_reader::get_decimal_str(&self.array, &self.element_type, element) + } + + fn av_get_nested(&self, element: usize) -> Result, String> { + let (arr, elem) = array_reader::get_nested_array(&self.array, &self.element_type, element)?; + Ok(Box::new(ArrayViewInner { + array: arr, + element_type: elem, + })) + } +} + +// ============================================================================ +// Opaque types: ArrayWriterInner (array builder for writes) +// ============================================================================ + +pub struct ArrayWriterInner { + writer: Option, + completed: Option, + element_type: fcore::metadata::DataType, + num_elements: usize, +} + +fn new_array_writer( + size: usize, + element_leaf_type_id: i32, + precision: u32, + scale: u32, + array_nesting: u32, +) -> Result, String> { + let element_type = + types::element_type_from_ffi(element_leaf_type_id, precision, scale, array_nesting) + .map_err(|e| e.to_string())?; + let writer = fcore::row::binary_array::FlussArrayWriter::new(size, &element_type); + Ok(Box::new(ArrayWriterInner { + writer: Some(writer), + completed: None, + element_type, + num_elements: size, + })) +} + +impl ArrayWriterInner { + fn writer_mut(&mut self) -> Result<&mut fcore::row::binary_array::FlussArrayWriter, String> { + self.writer + .as_mut() + .ok_or_else(|| "ArrayWriter is already finalized".to_string()) + } + + fn validate_index(&self, idx: usize) -> Result<(), String> { + if idx < self.num_elements { + Ok(()) + } else { + Err(format!( + "ArrayWriter index out of bounds: idx={idx}, size={}", + self.num_elements + )) + } + } + + fn complete_if_needed(&mut self) -> Result<(), String> { + if self.completed.is_none() { + let w = self + .writer + .take() + .ok_or_else(|| "ArrayWriter has already been finalized".to_string())?; + self.completed = Some(w.complete().map_err(|e| e.to_string())?); + } + Ok(()) + } + + /// Checks writer liveness first, then the element index. 
Returning the + /// clearest finalization error before a bounds error keeps diagnostics + /// aligned with the caller's intent when a writer is misused after + /// completion. + fn ensure_writable(&self, idx: usize) -> Result<(), String> { + if self.writer.is_none() { + return Err("ArrayWriter is already finalized".to_string()); + } + self.validate_index(idx) + } + + fn aw_size(&self) -> usize { + self.num_elements + } + + fn aw_set_null(&mut self, idx: usize) -> Result<(), String> { + self.ensure_writable(idx)?; + self.writer_mut()?.set_null_at(idx); + Ok(()) + } + + fn aw_set_bool(&mut self, idx: usize, val: bool) -> Result<(), String> { + self.ensure_writable(idx)?; + if !matches!(self.element_type, fcore::metadata::DataType::Boolean(_)) { + return Err(format!( + "ArrayWriter type mismatch: expected BOOLEAN element, got {}", + self.element_type + )); + } + self.writer_mut()?.write_boolean(idx, val); + Ok(()) + } + + fn aw_set_i32(&mut self, idx: usize, val: i32) -> Result<(), String> { + self.ensure_writable(idx)?; + match &self.element_type { + fcore::metadata::DataType::TinyInt(_) => { + let v = i8::try_from(val) + .map_err(|_| format!("Value {val} does not fit TINYINT element"))?; + self.writer_mut()?.write_byte(idx, v); + } + fcore::metadata::DataType::SmallInt(_) => { + let v = i16::try_from(val) + .map_err(|_| format!("Value {val} does not fit SMALLINT element"))?; + self.writer_mut()?.write_short(idx, v); + } + fcore::metadata::DataType::Int(_) => { + self.writer_mut()?.write_int(idx, val); + } + _ => { + return Err(format!( + "ArrayWriter type mismatch: expected TINYINT/SMALLINT/INT element, got {}", + self.element_type + )); + } + } + Ok(()) + } + + fn aw_set_i64(&mut self, idx: usize, val: i64) -> Result<(), String> { + self.ensure_writable(idx)?; + if !matches!(self.element_type, fcore::metadata::DataType::BigInt(_)) { + return Err(format!( + "ArrayWriter type mismatch: expected BIGINT element, got {}", + self.element_type + )); + } + 
self.writer_mut()?.write_long(idx, val); + Ok(()) + } + + fn aw_set_f32(&mut self, idx: usize, val: f32) -> Result<(), String> { + self.ensure_writable(idx)?; + if !matches!(self.element_type, fcore::metadata::DataType::Float(_)) { + return Err(format!( + "ArrayWriter type mismatch: expected FLOAT element, got {}", + self.element_type + )); + } + self.writer_mut()?.write_float(idx, val); + Ok(()) + } + + fn aw_set_f64(&mut self, idx: usize, val: f64) -> Result<(), String> { + self.ensure_writable(idx)?; + if !matches!(self.element_type, fcore::metadata::DataType::Double(_)) { + return Err(format!( + "ArrayWriter type mismatch: expected DOUBLE element, got {}", + self.element_type + )); + } + self.writer_mut()?.write_double(idx, val); + Ok(()) + } + + fn aw_set_str(&mut self, idx: usize, val: &str) -> Result<(), String> { + self.ensure_writable(idx)?; + if !matches!( + self.element_type, + fcore::metadata::DataType::String(_) | fcore::metadata::DataType::Char(_) + ) { + return Err(format!( + "ArrayWriter type mismatch: expected STRING/CHAR element, got {}", + self.element_type + )); + } + self.writer_mut()?.write_string(idx, val); + Ok(()) + } + + fn aw_set_bytes(&mut self, idx: usize, val: &[u8]) -> Result<(), String> { + self.ensure_writable(idx)?; + if !matches!( + self.element_type, + fcore::metadata::DataType::Bytes(_) | fcore::metadata::DataType::Binary(_) + ) { + return Err(format!( + "ArrayWriter type mismatch: expected BYTES/BINARY element, got {}", + self.element_type + )); + } + self.writer_mut()?.write_binary_bytes(idx, val); + Ok(()) + } + + fn aw_set_date(&mut self, idx: usize, days: i32) -> Result<(), String> { + self.ensure_writable(idx)?; + if !matches!(self.element_type, fcore::metadata::DataType::Date(_)) { + return Err(format!( + "ArrayWriter type mismatch: expected DATE element, got {}", + self.element_type + )); + } + self.writer_mut()? 
+ .write_date(idx, fcore::row::Date::new(days)); + Ok(()) + } + + fn aw_set_time(&mut self, idx: usize, millis: i32) -> Result<(), String> { + self.ensure_writable(idx)?; + if !matches!(self.element_type, fcore::metadata::DataType::Time(_)) { + return Err(format!( + "ArrayWriter type mismatch: expected TIME element, got {}", + self.element_type + )); + } + self.writer_mut()? + .write_time(idx, fcore::row::Time::new(millis)); + Ok(()) + } + + fn aw_set_ts_ntz(&mut self, idx: usize, millis: i64, nanos: i32) -> Result<(), String> { + self.ensure_writable(idx)?; + let precision = match &self.element_type { + fcore::metadata::DataType::Timestamp(ts) => ts.precision(), + _ => { + return Err(format!( + "ArrayWriter type mismatch: expected TIMESTAMP element, got {}", + self.element_type + )); + } + }; + let ts = fcore::row::TimestampNtz::from_millis_nanos(millis, nanos) + .map_err(|e| e.to_string())?; + self.writer_mut()?.write_timestamp_ntz(idx, &ts, precision); + Ok(()) + } + + fn aw_set_ts_ltz(&mut self, idx: usize, millis: i64, nanos: i32) -> Result<(), String> { + self.ensure_writable(idx)?; + let precision = match &self.element_type { + fcore::metadata::DataType::TimestampLTz(ts) => ts.precision(), + _ => { + return Err(format!( + "ArrayWriter type mismatch: expected TIMESTAMP_LTZ element, got {}", + self.element_type + )); + } + }; + let ts = fcore::row::TimestampLtz::from_millis_nanos(millis, nanos) + .map_err(|e| e.to_string())?; + self.writer_mut()?.write_timestamp_ltz(idx, &ts, precision); + Ok(()) + } + + fn aw_set_decimal_str(&mut self, idx: usize, val: &str) -> Result<(), String> { + self.ensure_writable(idx)?; + let (precision, scale) = match &self.element_type { + fcore::metadata::DataType::Decimal(d) => (d.precision(), d.scale()), + _ => { + return Err(format!( + "ArrayWriter type mismatch: expected DECIMAL element, got {}", + self.element_type + )); + } + }; + let bd = bigdecimal::BigDecimal::from_str(val).map_err(|e| e.to_string())?; + let decimal = 
fcore::row::Decimal::from_big_decimal(bd, precision, scale) + .map_err(|e| e.to_string())?; + self.writer_mut()?.write_decimal(idx, &decimal, precision); + Ok(()) + } + + fn aw_set_array(&mut self, idx: usize, nested: &mut ArrayWriterInner) -> Result<(), String> { + self.ensure_writable(idx)?; + let expected_inner = match &self.element_type { + fcore::metadata::DataType::Array(at) => at.get_element_type(), + _ => { + return Err(format!( + "ArrayWriter type mismatch: expected ARRAY element, got {}", + self.element_type + )); + } + }; + if !structurally_compatible(expected_inner, &nested.element_type) { + return Err(format!( + "Nested ArrayWriter type mismatch: expected nested element type {}, got {}", + expected_inner, nested.element_type + )); + } + nested.complete_if_needed()?; + let arr = nested.completed.as_ref().ok_or_else(|| { + "ArrayWriter invariant violation: nested completed array missing after finalize" + .to_string() + })?; + self.writer_mut()?.write_array(idx, arr); + Ok(()) + } +} + +/// Structural type equivalence that ignores nullability flags but preserves +/// variant and precision/scale semantics. Used to compare ArrayWriter element +/// types on the binding boundary. Nullability is ignored in structural comparison +/// because the Rust-side element type is always reconstructed as nullable +/// (encoding doesn't depend on it). 
fn structurally_compatible(a: &fcore::metadata::DataType, b: &fcore::metadata::DataType) -> bool {
    use fcore::metadata::DataType as Dt;

    // Types without parameters are compatible whenever the variants match.
    let same_plain_variant = matches!(
        (a, b),
        (Dt::Boolean(_), Dt::Boolean(_))
            | (Dt::TinyInt(_), Dt::TinyInt(_))
            | (Dt::SmallInt(_), Dt::SmallInt(_))
            | (Dt::Int(_), Dt::Int(_))
            | (Dt::BigInt(_), Dt::BigInt(_))
            | (Dt::Float(_), Dt::Float(_))
            | (Dt::Double(_), Dt::Double(_))
            | (Dt::String(_), Dt::String(_))
            | (Dt::Bytes(_), Dt::Bytes(_))
            | (Dt::Date(_), Dt::Date(_))
            | (Dt::Time(_), Dt::Time(_))
    );
    if same_plain_variant {
        return true;
    }

    // Parameterized types must also agree on their parameters.
    match (a, b) {
        // Arrays recurse into their element types.
        (Dt::Array(lhs), Dt::Array(rhs)) => {
            structurally_compatible(lhs.get_element_type(), rhs.get_element_type())
        }
        (Dt::Decimal(lhs), Dt::Decimal(rhs)) => {
            lhs.precision() == rhs.precision() && lhs.scale() == rhs.scale()
        }
        (Dt::Timestamp(lhs), Dt::Timestamp(rhs)) => lhs.precision() == rhs.precision(),
        (Dt::TimestampLTz(lhs), Dt::TimestampLTz(rhs)) => lhs.precision() == rhs.precision(),
        (Dt::Char(lhs), Dt::Char(rhs)) => lhs.length() == rhs.length(),
        (Dt::Binary(lhs), Dt::Binary(rhs)) => lhs.length() == rhs.length(),
        // Different variants, or a variant not listed above.
        _ => false,
    }
}
diff --git a/fluss-rust/bindings/cpp/src/table.cpp b/fluss-rust/bindings/cpp/src/table.cpp new file mode 100644 index 0000000000..f389f7ac90 --- /dev/null +++ b/fluss-rust/bindings/cpp/src/table.cpp @@ -0,0 +1,1619 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include + +#include +#include + +#include "ffi_converter.hpp" +#include "fluss.hpp" +#include "lib.rs.h" +#include "rust/cxx.h" +// todo: bindings/cpp/BUILD.bazel still doesn't declare Arrow include/link dependencies. +// In environments where Bazel does not already have Arrow available, this will fail at compile/link +// time. +#include + +namespace fluss { + +static constexpr int kSecondsPerDay = 24 * 60 * 60; + +static std::time_t timegm_utc(std::tm* tm) { +#if defined(_WIN32) + return _mkgmtime(tm); +#else + return ::timegm(tm); +#endif +} + +static std::tm gmtime_utc(std::time_t epoch_seconds) { + std::tm tm{}; +#if defined(_WIN32) + gmtime_s(&tm, &epoch_seconds); +#else + ::gmtime_r(&epoch_seconds, &tm); +#endif + return tm; +} + +Date Date::FromYMD(int year, int month, int day) { + std::tm tm{}; + tm.tm_year = year - 1900; + tm.tm_mon = month - 1; + tm.tm_mday = day; + std::time_t epoch_seconds = timegm_utc(&tm); + return {static_cast(epoch_seconds / kSecondsPerDay)}; +} + +int Date::Year() const { + std::time_t epoch_seconds = static_cast(days_since_epoch) * kSecondsPerDay; + std::tm tm = gmtime_utc(epoch_seconds); + return tm.tm_year + 1900; +} + +int Date::Month() const { + std::time_t epoch_seconds = static_cast(days_since_epoch) * kSecondsPerDay; + std::tm tm = gmtime_utc(epoch_seconds); + return tm.tm_mon + 1; +} + +int Date::Day() const { + std::time_t epoch_seconds = static_cast(days_since_epoch) * kSecondsPerDay; + std::tm tm = gmtime_utc(epoch_seconds); + return tm.tm_mday; +} + +// 
NOLINTNEXTLINE(cppcoreguidelines-macro-usage) +#define CHECK_INNER(name) \ + do { \ + if (!inner_) throw std::logic_error(name ": not available (moved-from or null)"); \ + } while (0) + +// ============================================================================ +// ArrayWriter — builder for array values backed by Rust ArrayWriterInner +// ============================================================================ + +ArrayWriter::ArrayWriter(size_t size, DataType element_type) : element_type_(std::move(element_type)) { + auto flat = utils::flatten_array_type(element_type_); + int32_t leaf_type_id = flat.nesting > 0 ? flat.leaf_type : static_cast(element_type_.id()); + uint32_t leaf_precision = static_cast(flat.nesting > 0 ? flat.leaf_precision + : element_type_.precision()); + uint32_t leaf_scale = static_cast(flat.nesting > 0 ? flat.leaf_scale : element_type_.scale()); + uint32_t array_nesting = static_cast(flat.nesting); + + auto box = ffi::new_array_writer(size, leaf_type_id, leaf_precision, leaf_scale, array_nesting); + inner_ = box.into_raw(); +} + +ArrayWriter::~ArrayWriter() noexcept { Destroy(); } + +void ArrayWriter::Destroy() noexcept { + if (inner_) { + rust::Box::from_raw(inner_); + inner_ = nullptr; + } +} + +ArrayWriter::ArrayWriter(ArrayWriter&& other) noexcept + : inner_(other.inner_), element_type_(std::move(other.element_type_)) { + other.inner_ = nullptr; +} + +ArrayWriter& ArrayWriter::operator=(ArrayWriter&& other) noexcept { + if (this != &other) { + Destroy(); + inner_ = other.inner_; + element_type_ = std::move(other.element_type_); + other.inner_ = nullptr; + } + return *this; +} + +bool ArrayWriter::Available() const { return inner_ != nullptr; } + +size_t ArrayWriter::Size() const noexcept { + assert(inner_ && "ArrayWriter::Size called on moved-from instance"); + return inner_->aw_size(); +} + +// NOLINTNEXTLINE(cppcoreguidelines-macro-usage) +#define CHECK_AW(name) \ + do { \ + if (!inner_) throw std::logic_error(name ": not 
available (moved-from or null)"); \ + } while (0) + +void ArrayWriter::SetNull(size_t idx) { CHECK_AW("ArrayWriter"); inner_->aw_set_null(idx); } +void ArrayWriter::SetBool(size_t idx, bool v) { CHECK_AW("ArrayWriter"); inner_->aw_set_bool(idx, v); } +void ArrayWriter::SetInt32(size_t idx, int32_t v) { CHECK_AW("ArrayWriter"); inner_->aw_set_i32(idx, v); } +void ArrayWriter::SetInt64(size_t idx, int64_t v) { CHECK_AW("ArrayWriter"); inner_->aw_set_i64(idx, v); } +void ArrayWriter::SetFloat32(size_t idx, float v) { CHECK_AW("ArrayWriter"); inner_->aw_set_f32(idx, v); } +void ArrayWriter::SetFloat64(size_t idx, double v) { CHECK_AW("ArrayWriter"); inner_->aw_set_f64(idx, v); } + +void ArrayWriter::SetString(size_t idx, const std::string& v) { + CHECK_AW("ArrayWriter"); + inner_->aw_set_str(idx, v); +} + +void ArrayWriter::SetBytes(size_t idx, const std::vector& v) { + CHECK_AW("ArrayWriter"); + inner_->aw_set_bytes(idx, rust::Slice(v.data(), v.size())); +} + +void ArrayWriter::SetDate(size_t idx, fluss::Date d) { + CHECK_AW("ArrayWriter"); + inner_->aw_set_date(idx, d.days_since_epoch); +} + +void ArrayWriter::SetTime(size_t idx, fluss::Time t) { + CHECK_AW("ArrayWriter"); + inner_->aw_set_time(idx, t.millis_since_midnight); +} + +void ArrayWriter::SetTimestampNtz(size_t idx, fluss::Timestamp ts) { + CHECK_AW("ArrayWriter"); + inner_->aw_set_ts_ntz(idx, ts.epoch_millis, ts.nano_of_millisecond); +} + +void ArrayWriter::SetTimestampLtz(size_t idx, fluss::Timestamp ts) { + CHECK_AW("ArrayWriter"); + inner_->aw_set_ts_ltz(idx, ts.epoch_millis, ts.nano_of_millisecond); +} + +void ArrayWriter::SetDecimal(size_t idx, const std::string& value) { + CHECK_AW("ArrayWriter"); + inner_->aw_set_decimal_str(idx, value); +} + +void ArrayWriter::SetArray(size_t idx, ArrayWriter&& nested) { + CHECK_AW("ArrayWriter"); + if (!nested.inner_) { + throw std::logic_error("ArrayWriter::SetArray: nested writer not available"); + } + inner_->aw_set_array(idx, *nested.inner_); + 
nested.Destroy(); +} + +// ============================================================================ +// ArrayView — read-only recursive view into an array column value +// ============================================================================ + +ArrayView::~ArrayView() noexcept { Destroy(); } + +void ArrayView::Destroy() noexcept { + if (inner_) { + rust::Box::from_raw(inner_); + inner_ = nullptr; + } +} + +ArrayView::ArrayView(ArrayView&& other) noexcept : inner_(other.inner_) { other.inner_ = nullptr; } + +ArrayView& ArrayView::operator=(ArrayView&& other) noexcept { + if (this != &other) { + Destroy(); + inner_ = other.inner_; + other.inner_ = nullptr; + } + return *this; +} + +// NOLINTNEXTLINE(cppcoreguidelines-macro-usage) +#define CHECK_AV() \ + do { \ + if (!inner_) throw std::logic_error("ArrayView: not available (moved-from)"); \ + } while (0) + +size_t ArrayView::Size() const noexcept { + assert(inner_ && "ArrayView::Size called on moved-from instance"); + return inner_->av_size(); +} + +TypeId ArrayView::ElementType() const noexcept { + assert(inner_ && "ArrayView::ElementType called on moved-from instance"); + return static_cast(inner_->av_element_type_id()); +} + +bool ArrayView::IsNull(size_t element) const { + CHECK_AV(); + return inner_->av_is_null(element); +} + +bool ArrayView::GetBool(size_t element) const { + CHECK_AV(); + return inner_->av_get_bool(element); +} + +int32_t ArrayView::GetInt32(size_t element) const { + CHECK_AV(); + return inner_->av_get_i32(element); +} + +int64_t ArrayView::GetInt64(size_t element) const { + CHECK_AV(); + return inner_->av_get_i64(element); +} + +float ArrayView::GetFloat32(size_t element) const { + CHECK_AV(); + return inner_->av_get_f32(element); +} + +double ArrayView::GetFloat64(size_t element) const { + CHECK_AV(); + return inner_->av_get_f64(element); +} + +std::string ArrayView::GetString(size_t element) const { + CHECK_AV(); + return std::string(inner_->av_get_str(element)); +} + +std::vector 
ArrayView::GetBytes(size_t element) const { + CHECK_AV(); + auto rv = inner_->av_get_bytes(element); + return {rv.data(), rv.data() + rv.size()}; +} + +fluss::Date ArrayView::GetDate(size_t element) const { + CHECK_AV(); + return fluss::Date{inner_->av_get_date_days(element)}; +} + +fluss::Time ArrayView::GetTime(size_t element) const { + CHECK_AV(); + return fluss::Time{inner_->av_get_time_millis(element)}; +} + +fluss::Timestamp ArrayView::GetTimestampNtz(size_t element) const { + CHECK_AV(); + return fluss::Timestamp{inner_->av_get_ts_millis(element), + inner_->av_get_ts_nanos(element)}; +} + +fluss::Timestamp ArrayView::GetTimestampLtz(size_t element) const { + CHECK_AV(); + return fluss::Timestamp{inner_->av_get_ts_millis(element), + inner_->av_get_ts_nanos(element)}; +} + +std::string ArrayView::GetDecimalString(size_t element) const { + CHECK_AV(); + return std::string(inner_->av_get_decimal_str(element)); +} + +ArrayView ArrayView::GetArray(size_t element) const { + CHECK_AV(); + auto box = inner_->av_get_nested(element); + return ArrayView(box.into_raw()); +} + +#undef CHECK_AV + +// ============================================================================ +// GenericRow — write-only row backed by opaque Rust GenericRowInner +// ============================================================================ + +GenericRow::GenericRow() { + auto box = ffi::new_generic_row(0); + inner_ = box.into_raw(); +} + +GenericRow::GenericRow(size_t field_count) { + auto box = ffi::new_generic_row(field_count); + inner_ = box.into_raw(); +} + +GenericRow::~GenericRow() noexcept { Destroy(); } + +void GenericRow::Destroy() noexcept { + if (inner_) { + rust::Box::from_raw(inner_); + inner_ = nullptr; + } + column_map_.reset(); +} + +GenericRow::GenericRow(GenericRow&& other) noexcept + : inner_(other.inner_), column_map_(std::move(other.column_map_)) { + other.inner_ = nullptr; +} + +GenericRow& GenericRow::operator=(GenericRow&& other) noexcept { + if (this != &other) { 
+ Destroy(); + inner_ = other.inner_; + column_map_ = std::move(other.column_map_); + other.inner_ = nullptr; + } + return *this; +} + +bool GenericRow::Available() const { return inner_ != nullptr; } + +void GenericRow::Reset() { + CHECK_INNER("GenericRow"); + inner_->gr_reset(); +} + +void GenericRow::SetNull(size_t idx) { + CHECK_INNER("GenericRow"); + inner_->gr_set_null(idx); +} +void GenericRow::SetBool(size_t idx, bool v) { + CHECK_INNER("GenericRow"); + inner_->gr_set_bool(idx, v); +} +void GenericRow::SetInt32(size_t idx, int32_t v) { + CHECK_INNER("GenericRow"); + inner_->gr_set_i32(idx, v); +} +void GenericRow::SetInt64(size_t idx, int64_t v) { + CHECK_INNER("GenericRow"); + inner_->gr_set_i64(idx, v); +} +void GenericRow::SetFloat32(size_t idx, float v) { + CHECK_INNER("GenericRow"); + inner_->gr_set_f32(idx, v); +} +void GenericRow::SetFloat64(size_t idx, double v) { + CHECK_INNER("GenericRow"); + inner_->gr_set_f64(idx, v); +} + +void GenericRow::SetString(size_t idx, std::string v) { + CHECK_INNER("GenericRow"); + inner_->gr_set_str(idx, v); +} + +void GenericRow::SetBytes(size_t idx, std::vector v) { + CHECK_INNER("GenericRow"); + inner_->gr_set_bytes(idx, rust::Slice(v.data(), v.size())); +} + +void GenericRow::SetDate(size_t idx, fluss::Date d) { + CHECK_INNER("GenericRow"); + inner_->gr_set_date(idx, d.days_since_epoch); +} + +void GenericRow::SetTime(size_t idx, fluss::Time t) { + CHECK_INNER("GenericRow"); + inner_->gr_set_time(idx, t.millis_since_midnight); +} + +void GenericRow::SetTimestampNtz(size_t idx, fluss::Timestamp ts) { + CHECK_INNER("GenericRow"); + inner_->gr_set_ts_ntz(idx, ts.epoch_millis, ts.nano_of_millisecond); +} + +void GenericRow::SetTimestampLtz(size_t idx, fluss::Timestamp ts) { + CHECK_INNER("GenericRow"); + inner_->gr_set_ts_ltz(idx, ts.epoch_millis, ts.nano_of_millisecond); +} + +void GenericRow::SetDecimal(size_t idx, const std::string& value) { + CHECK_INNER("GenericRow"); + inner_->gr_set_decimal_str(idx, value); +} 
+ +void GenericRow::SetArray(size_t idx, ArrayWriter&& writer) { + CHECK_INNER("GenericRow"); + if (!writer.inner_) { + throw std::logic_error("GenericRow::SetArray: ArrayWriter not available"); + } + inner_->gr_set_array(idx, *writer.inner_); + writer.Destroy(); +} + +// ============================================================================ +// ScanData — destructor must live in .cpp where rust::Box is visible +// ============================================================================ + +detail::ScanData::~ScanData() { + if (raw) { + rust::Box::from_raw(raw); + } +} + +// ============================================================================ +// RowView — zero-copy read-only row view for scan results +// ============================================================================ + +// NOLINTNEXTLINE(cppcoreguidelines-macro-usage) +#define CHECK_DATA(name) \ + do { \ + if (!data_) throw std::logic_error(name ": not available (moved-from or null)"); \ + } while (0) + +size_t RowView::FieldCount() const { return data_ ? 
data_->raw->sv_field_count() : 0; } + +TypeId RowView::GetType(size_t idx) const { + CHECK_DATA("RowView"); + return static_cast(data_->raw->sv_column_type(idx)); +} + +bool RowView::IsNull(size_t idx) const { + CHECK_DATA("RowView"); + return data_->raw->sv_is_null(bucket_idx_, rec_idx_, idx); +} +bool RowView::GetBool(size_t idx) const { + CHECK_DATA("RowView"); + return data_->raw->sv_get_bool(bucket_idx_, rec_idx_, idx); +} +int32_t RowView::GetInt32(size_t idx) const { + CHECK_DATA("RowView"); + return data_->raw->sv_get_i32(bucket_idx_, rec_idx_, idx); +} +int64_t RowView::GetInt64(size_t idx) const { + CHECK_DATA("RowView"); + return data_->raw->sv_get_i64(bucket_idx_, rec_idx_, idx); +} +float RowView::GetFloat32(size_t idx) const { + CHECK_DATA("RowView"); + return data_->raw->sv_get_f32(bucket_idx_, rec_idx_, idx); +} +double RowView::GetFloat64(size_t idx) const { + CHECK_DATA("RowView"); + return data_->raw->sv_get_f64(bucket_idx_, rec_idx_, idx); +} + +std::string_view RowView::GetString(size_t idx) const { + CHECK_DATA("RowView"); + auto s = data_->raw->sv_get_str(bucket_idx_, rec_idx_, idx); + return std::string_view(s.data(), s.size()); +} + +std::pair RowView::GetBytes(size_t idx) const { + CHECK_DATA("RowView"); + auto bytes = data_->raw->sv_get_bytes(bucket_idx_, rec_idx_, idx); + return {bytes.data(), bytes.size()}; +} + +Date RowView::GetDate(size_t idx) const { + CHECK_DATA("RowView"); + return Date{data_->raw->sv_get_date_days(bucket_idx_, rec_idx_, idx)}; +} + +Time RowView::GetTime(size_t idx) const { + CHECK_DATA("RowView"); + return Time{data_->raw->sv_get_time_millis(bucket_idx_, rec_idx_, idx)}; +} + +Timestamp RowView::GetTimestamp(size_t idx) const { + CHECK_DATA("RowView"); + return Timestamp{data_->raw->sv_get_ts_millis(bucket_idx_, rec_idx_, idx), + data_->raw->sv_get_ts_nanos(bucket_idx_, rec_idx_, idx)}; +} + +bool RowView::IsDecimal(size_t idx) const { return GetType(idx) == TypeId::Decimal; } + +std::string 
RowView::GetDecimalString(size_t idx) const { + CHECK_DATA("RowView"); + return std::string(data_->raw->sv_get_decimal_str(bucket_idx_, rec_idx_, idx)); +} + +size_t RowView::GetArraySize(size_t idx) const { + CHECK_DATA("RowView"); + return data_->raw->sv_get_array_size(bucket_idx_, rec_idx_, idx); +} + +TypeId RowView::GetArrayElementType(size_t idx) const { + CHECK_DATA("RowView"); + return static_cast(data_->raw->sv_get_array_element_type(idx)); +} + +bool RowView::IsArrayElementNull(size_t idx, size_t element) const { + CHECK_DATA("RowView"); + return data_->raw->sv_get_array_is_null(bucket_idx_, rec_idx_, idx, element); +} + +bool RowView::GetArrayBool(size_t idx, size_t element) const { + CHECK_DATA("RowView"); + return data_->raw->sv_get_array_bool(bucket_idx_, rec_idx_, idx, element); +} + +int32_t RowView::GetArrayInt32(size_t idx, size_t element) const { + CHECK_DATA("RowView"); + return data_->raw->sv_get_array_i32(bucket_idx_, rec_idx_, idx, element); +} + +int64_t RowView::GetArrayInt64(size_t idx, size_t element) const { + CHECK_DATA("RowView"); + return data_->raw->sv_get_array_i64(bucket_idx_, rec_idx_, idx, element); +} + +float RowView::GetArrayFloat32(size_t idx, size_t element) const { + CHECK_DATA("RowView"); + return data_->raw->sv_get_array_f32(bucket_idx_, rec_idx_, idx, element); +} + +double RowView::GetArrayFloat64(size_t idx, size_t element) const { + CHECK_DATA("RowView"); + return data_->raw->sv_get_array_f64(bucket_idx_, rec_idx_, idx, element); +} + +std::string RowView::GetArrayString(size_t idx, size_t element) const { + CHECK_DATA("RowView"); + return std::string(data_->raw->sv_get_array_str(bucket_idx_, rec_idx_, idx, element)); +} + +std::vector RowView::GetArrayBytes(size_t idx, size_t element) const { + CHECK_DATA("RowView"); + auto rv = data_->raw->sv_get_array_bytes(bucket_idx_, rec_idx_, idx, element); + return {rv.data(), rv.data() + rv.size()}; +} + +fluss::Date RowView::GetArrayDate(size_t idx, size_t element) const { + 
CHECK_DATA("RowView"); + return fluss::Date{data_->raw->sv_get_array_date_days(bucket_idx_, rec_idx_, idx, element)}; +} + +fluss::Time RowView::GetArrayTime(size_t idx, size_t element) const { + CHECK_DATA("RowView"); + return fluss::Time{data_->raw->sv_get_array_time_millis(bucket_idx_, rec_idx_, idx, element)}; +} + +fluss::Timestamp RowView::GetArrayTimestamp(size_t idx, size_t element) const { + CHECK_DATA("RowView"); + auto millis = data_->raw->sv_get_array_ts_millis(bucket_idx_, rec_idx_, idx, element); + auto nanos = data_->raw->sv_get_array_ts_nanos(bucket_idx_, rec_idx_, idx, element); + return fluss::Timestamp{millis, nanos}; +} + +std::string RowView::GetArrayDecimalString(size_t idx, size_t element) const { + CHECK_DATA("RowView"); + return std::string(data_->raw->sv_get_array_decimal_str(bucket_idx_, rec_idx_, idx, element)); +} + +ArrayView RowView::GetArrayView(size_t idx) const { + CHECK_DATA("RowView"); + auto box = data_->raw->sv_get_array_view(bucket_idx_, rec_idx_, idx); + return ArrayView(box.into_raw()); +} + +// ============================================================================ +// ScanRecords — backed by opaque Rust ScanResultInner +// ============================================================================ + +// ScanRecords constructor, destructor, move operations are all defaulted in the header. + +size_t ScanRecords::Count() const { return data_ ? 
data_->raw->sv_record_count() : 0; }
+
+bool ScanRecords::IsEmpty() const { return Count() == 0; }
+
+ScanRecord ScanRecords::RecordAt(size_t bucket, size_t rec_idx) const {
+    if (!data_) {
+        throw std::logic_error("ScanRecords: not available (moved-from or null)");
+    }
+    return ScanRecord{data_->raw->sv_offset(bucket, rec_idx),
+                      data_->raw->sv_timestamp(bucket, rec_idx),
+                      static_cast(data_->raw->sv_change_type(bucket, rec_idx)),
+                      RowView(data_, bucket, rec_idx)};
+}
+
+static TableBucket to_table_bucket(const ffi::FfiBucketInfo& g) {
+    return TableBucket{g.table_id, g.bucket_id,
+                       g.has_partition_id ? std::optional(g.partition_id) : std::nullopt};
+}
+
+size_t ScanRecords::BucketCount() const { return data_ ? data_->raw->sv_bucket_infos().size() : 0; }
+
+ScanRecord ScanRecords::Iterator::operator*() const {
+    return owner_->RecordAt(bucket_idx_, rec_idx_);
+}
+
+// Returns an iterator positioned on the first record.
+//
+// Leading buckets whose record_count is zero are skipped, applying the same
+// normalisation operator++ performs, so the returned iterator never addresses
+// a record slot that does not exist. Without data, or when every bucket is
+// empty, the position is (bucket count, 0) -- the position operator++ stops at
+// after the last record.
+// NOTE(review): presumably that is also what end() yields; end() is defined in
+// the header -- confirm.
+ScanRecords::Iterator ScanRecords::begin() const {
+    size_t first_bucket = 0;
+    if (data_) {
+        const auto& infos = data_->raw->sv_bucket_infos();
+        while (first_bucket < infos.size() && infos[first_bucket].record_count == 0) {
+            ++first_bucket;
+        }
+    }
+    return Iterator(this, first_bucket, 0);
+}
+
+ScanRecords::Iterator& ScanRecords::Iterator::operator++() {
+    ++rec_idx_;
+    if (owner_->data_) {
+        const auto& infos = owner_->data_->raw->sv_bucket_infos();
+        while (bucket_idx_ < infos.size() && rec_idx_ >= infos[bucket_idx_].record_count) {
+            rec_idx_ = 0;
+            ++bucket_idx_;
+        }
+    }
+    return *this;
+}
+
+std::vector ScanRecords::Buckets() const {
+    std::vector result;
+    if (!data_) return result;
+    const auto& infos = data_->raw->sv_bucket_infos();
+    result.reserve(infos.size());
+    for (const auto& g : infos) {
+        result.push_back(to_table_bucket(g));
+    }
+    return result;
+}
+
+BucketRecords ScanRecords::Records(const TableBucket& bucket) const {
+    if (!data_) {
+        return BucketRecords({}, bucket, 0, 0);
+    }
+    const auto& infos = data_->raw->sv_bucket_infos();
+    for (size_t i = 0; i < infos.size(); ++i) {
+        TableBucket tb = to_table_bucket(infos[i]);
+        if (tb == bucket) {
+            return BucketRecords(data_, std::move(tb), i, infos[i].record_count);
+        }
+    }
+    return BucketRecords({}, bucket, 0, 0);
+}
+
+BucketRecords
ScanRecords::BucketAt(size_t idx) const { + if (!data_) { + throw std::logic_error("ScanRecords: not available (moved-from or null)"); + } + const auto& infos = data_->raw->sv_bucket_infos(); + if (idx >= infos.size()) { + throw std::out_of_range("ScanRecords::BucketAt: index " + std::to_string(idx) + + " out of range (" + std::to_string(infos.size()) + " buckets)"); + } + return BucketRecords(data_, to_table_bucket(infos[idx]), idx, infos[idx].record_count); +} + +ScanRecord BucketRecords::operator[](size_t idx) const { + if (idx >= count_) { + throw std::out_of_range("BucketRecords: index " + std::to_string(idx) + " out of range (" + + std::to_string(count_) + " records)"); + } + return ScanRecord{data_->raw->sv_offset(bucket_idx_, idx), + data_->raw->sv_timestamp(bucket_idx_, idx), + static_cast(data_->raw->sv_change_type(bucket_idx_, idx)), + RowView(data_, bucket_idx_, idx)}; +} + +ScanRecord BucketRecords::Iterator::operator*() const { return owner_->operator[](idx_); } + +// ============================================================================ +// LookupResult — backed by opaque Rust LookupResultInner +// ============================================================================ + +LookupResult::LookupResult() noexcept = default; + +LookupResult::~LookupResult() noexcept { Destroy(); } + +void LookupResult::Destroy() noexcept { + if (inner_) { + rust::Box::from_raw(inner_); + inner_ = nullptr; + column_map_.reset(); + } +} + +LookupResult::LookupResult(LookupResult&& other) noexcept + : inner_(other.inner_), column_map_(std::move(other.column_map_)) { + other.inner_ = nullptr; +} + +LookupResult& LookupResult::operator=(LookupResult&& other) noexcept { + if (this != &other) { + Destroy(); + inner_ = other.inner_; + column_map_ = std::move(other.column_map_); + other.inner_ = nullptr; + } + return *this; +} + +void LookupResult::BuildColumnMap() const { + if (!inner_) return; + auto map = std::make_shared(); + auto count = inner_->lv_field_count(); + 
for (size_t i = 0; i < count; ++i) { + auto name = inner_->lv_column_name(i); + (*map)[std::string(name.data(), name.size())] = { + i, static_cast(inner_->lv_column_type(i))}; + } + column_map_ = std::move(map); +} + +bool LookupResult::Found() const { return inner_ && inner_->lv_found(); } + +size_t LookupResult::FieldCount() const { return inner_ ? inner_->lv_field_count() : 0; } + +TypeId LookupResult::GetType(size_t idx) const { + CHECK_INNER("LookupResult"); + return static_cast(inner_->lv_column_type(idx)); +} + +bool LookupResult::IsNull(size_t idx) const { + CHECK_INNER("LookupResult"); + return inner_->lv_is_null(idx); +} +bool LookupResult::GetBool(size_t idx) const { + CHECK_INNER("LookupResult"); + return inner_->lv_get_bool(idx); +} +int32_t LookupResult::GetInt32(size_t idx) const { + CHECK_INNER("LookupResult"); + return inner_->lv_get_i32(idx); +} +int64_t LookupResult::GetInt64(size_t idx) const { + CHECK_INNER("LookupResult"); + return inner_->lv_get_i64(idx); +} +float LookupResult::GetFloat32(size_t idx) const { + CHECK_INNER("LookupResult"); + return inner_->lv_get_f32(idx); +} +double LookupResult::GetFloat64(size_t idx) const { + CHECK_INNER("LookupResult"); + return inner_->lv_get_f64(idx); +} + +std::string_view LookupResult::GetString(size_t idx) const { + CHECK_INNER("LookupResult"); + auto s = inner_->lv_get_str(idx); + return std::string_view(s.data(), s.size()); +} + +std::pair LookupResult::GetBytes(size_t idx) const { + CHECK_INNER("LookupResult"); + auto bytes = inner_->lv_get_bytes(idx); + return {bytes.data(), bytes.size()}; +} + +Date LookupResult::GetDate(size_t idx) const { + CHECK_INNER("LookupResult"); + return Date{inner_->lv_get_date_days(idx)}; +} + +Time LookupResult::GetTime(size_t idx) const { + CHECK_INNER("LookupResult"); + return Time{inner_->lv_get_time_millis(idx)}; +} + +Timestamp LookupResult::GetTimestamp(size_t idx) const { + CHECK_INNER("LookupResult"); + return Timestamp{inner_->lv_get_ts_millis(idx), 
inner_->lv_get_ts_nanos(idx)}; +} + +bool LookupResult::IsDecimal(size_t idx) const { return GetType(idx) == TypeId::Decimal; } + +std::string LookupResult::GetDecimalString(size_t idx) const { + CHECK_INNER("LookupResult"); + return std::string(inner_->lv_get_decimal_str(idx)); +} + +size_t LookupResult::GetArraySize(size_t idx) const { + CHECK_INNER("LookupResult"); + return inner_->lv_get_array_size(idx); +} + +TypeId LookupResult::GetArrayElementType(size_t idx) const { + CHECK_INNER("LookupResult"); + return static_cast(inner_->lv_get_array_element_type(idx)); +} + +bool LookupResult::IsArrayElementNull(size_t idx, size_t element) const { + CHECK_INNER("LookupResult"); + return inner_->lv_get_array_is_null(idx, element); +} + +bool LookupResult::GetArrayBool(size_t idx, size_t element) const { + CHECK_INNER("LookupResult"); + return inner_->lv_get_array_bool(idx, element); +} + +int32_t LookupResult::GetArrayInt32(size_t idx, size_t element) const { + CHECK_INNER("LookupResult"); + return inner_->lv_get_array_i32(idx, element); +} + +int64_t LookupResult::GetArrayInt64(size_t idx, size_t element) const { + CHECK_INNER("LookupResult"); + return inner_->lv_get_array_i64(idx, element); +} + +float LookupResult::GetArrayFloat32(size_t idx, size_t element) const { + CHECK_INNER("LookupResult"); + return inner_->lv_get_array_f32(idx, element); +} + +double LookupResult::GetArrayFloat64(size_t idx, size_t element) const { + CHECK_INNER("LookupResult"); + return inner_->lv_get_array_f64(idx, element); +} + +std::string LookupResult::GetArrayString(size_t idx, size_t element) const { + CHECK_INNER("LookupResult"); + return std::string(inner_->lv_get_array_str(idx, element)); +} + +std::vector LookupResult::GetArrayBytes(size_t idx, size_t element) const { + CHECK_INNER("LookupResult"); + auto rv = inner_->lv_get_array_bytes(idx, element); + return {rv.data(), rv.data() + rv.size()}; +} + +fluss::Date LookupResult::GetArrayDate(size_t idx, size_t element) const { + 
CHECK_INNER("LookupResult"); + return fluss::Date{inner_->lv_get_array_date_days(idx, element)}; +} + +fluss::Time LookupResult::GetArrayTime(size_t idx, size_t element) const { + CHECK_INNER("LookupResult"); + return fluss::Time{inner_->lv_get_array_time_millis(idx, element)}; +} + +fluss::Timestamp LookupResult::GetArrayTimestamp(size_t idx, size_t element) const { + CHECK_INNER("LookupResult"); + auto millis = inner_->lv_get_array_ts_millis(idx, element); + auto nanos = inner_->lv_get_array_ts_nanos(idx, element); + return fluss::Timestamp{millis, nanos}; +} + +std::string LookupResult::GetArrayDecimalString(size_t idx, size_t element) const { + CHECK_INNER("LookupResult"); + return std::string(inner_->lv_get_array_decimal_str(idx, element)); +} + +ArrayView LookupResult::GetArrayView(size_t idx) const { + CHECK_INNER("LookupResult"); + auto box = inner_->lv_get_array_view(idx); + return ArrayView(box.into_raw()); +} + +// ============================================================================ +// Table +// ============================================================================ + +Table::Table() noexcept = default; + +Table::Table(ffi::Table* table) noexcept : table_(table) {} + +Table::~Table() noexcept { Destroy(); } + +void Table::Destroy() noexcept { + if (table_) { + ffi::delete_table(table_); + table_ = nullptr; + } +} + +Table::Table(Table&& other) noexcept + : table_(other.table_), column_map_(std::move(other.column_map_)) { + other.table_ = nullptr; +} + +Table& Table::operator=(Table&& other) noexcept { + if (this != &other) { + Destroy(); + table_ = other.table_; + column_map_ = std::move(other.column_map_); + other.table_ = nullptr; + } + return *this; +} + +bool Table::Available() const { return table_ != nullptr; } + +TableAppend Table::NewAppend() { return TableAppend(table_); } + +TableUpsert Table::NewUpsert() { return TableUpsert(table_); } + +TableLookup Table::NewLookup() { return TableLookup(table_); } + +TableScan Table::NewScan() 
{ return TableScan(table_); } + +const std::shared_ptr& Table::GetColumnMap() const { + if (!column_map_ && Available()) { + auto info = GetTableInfo(); + column_map_ = std::make_shared(); + for (size_t i = 0; i < info.schema.columns.size(); ++i) { + (*column_map_)[info.schema.columns[i].name] = {i, + info.schema.columns[i].data_type.id()}; + } + } + return column_map_; +} + +GenericRow Table::NewRow() const { + GenericRow row; + row.column_map_ = GetColumnMap(); + return row; +} + +TableInfo Table::GetTableInfo() const { + if (!Available()) { + return TableInfo{}; + } + auto ffi_info = table_->get_table_info_from_table(); + return utils::from_ffi_table_info(ffi_info); +} + +TablePath Table::GetTablePath() const { + if (!Available()) { + return TablePath{}; + } + auto ffi_path = table_->get_table_path(); + return TablePath{std::string(ffi_path.database_name), std::string(ffi_path.table_name)}; +} + +bool Table::HasPrimaryKey() const { + if (!Available()) { + return false; + } + return table_->has_primary_key(); +} + +// ============================================================================ +// TableAppend +// ============================================================================ + +TableAppend::TableAppend(ffi::Table* table) noexcept : table_(table) {} + +Result TableAppend::CreateWriter(AppendWriter& out) { + if (table_ == nullptr) { + return utils::make_client_error("Table not available"); + } + + auto ffi_result = table_->new_append_writer(); + auto result = utils::from_ffi_result(ffi_result.result); + if (result.Ok()) { + out = AppendWriter(utils::ptr_from_ffi(ffi_result)); + } + return result; +} + +// ============================================================================ +// TableUpsert +// ============================================================================ + +TableUpsert::TableUpsert(ffi::Table* table) noexcept : table_(table) {} + +TableUpsert& TableUpsert::PartialUpdateByIndex(std::vector column_indices) { + if 
(column_indices.empty()) { + throw std::invalid_argument("PartialUpdateByIndex requires at least one column"); + } + column_indices_ = std::move(column_indices); + column_names_.clear(); + return *this; +} + +TableUpsert& TableUpsert::PartialUpdateByName(std::vector column_names) { + if (column_names.empty()) { + throw std::invalid_argument("PartialUpdateByName requires at least one column"); + } + column_names_ = std::move(column_names); + column_indices_.clear(); + return *this; +} + +std::vector TableUpsert::ResolveNameProjection() const { + auto ffi_info = table_->get_table_info_from_table(); + const auto& columns = ffi_info.schema.columns; + + std::vector indices; + for (const auto& name : column_names_) { + bool found = false; + for (size_t i = 0; i < columns.size(); ++i) { + if (std::string(columns[i].name) == name) { + indices.push_back(i); + found = true; + break; + } + } + if (!found) { + throw std::runtime_error("Column '" + name + "' not found"); + } + } + return indices; +} + +Result TableUpsert::CreateWriter(UpsertWriter& out) { + if (table_ == nullptr) { + return utils::make_client_error("Table not available"); + } + + try { + auto resolved_indices = !column_names_.empty() ? 
ResolveNameProjection() : column_indices_; + + rust::Vec rust_indices; + for (size_t idx : resolved_indices) { + rust_indices.push_back(idx); + } + auto ffi_result = table_->create_upsert_writer(std::move(rust_indices)); + auto result = utils::from_ffi_result(ffi_result.result); + if (result.Ok()) { + out = UpsertWriter(utils::ptr_from_ffi(ffi_result)); + } + return result; + } catch (const std::exception& e) { + // ResolveNameProjection() may throw + return utils::make_client_error(e.what()); + } +} + +// ============================================================================ +// TableLookup +// ============================================================================ + +TableLookup::TableLookup(ffi::Table* table) noexcept : table_(table) {} + +Result TableLookup::CreateLookuper(Lookuper& out) { + if (table_ == nullptr) { + return utils::make_client_error("Table not available"); + } + + auto ffi_result = table_->new_lookuper(); + auto result = utils::from_ffi_result(ffi_result.result); + if (result.Ok()) { + out = Lookuper(utils::ptr_from_ffi(ffi_result)); + } + return result; +} + +// ============================================================================ +// TableScan +// ============================================================================ + +TableScan::TableScan(ffi::Table* table) noexcept : table_(table) {} + +TableScan& TableScan::ProjectByIndex(std::vector column_indices) { + projection_ = std::move(column_indices); + name_projection_.clear(); + return *this; +} + +TableScan& TableScan::ProjectByName(std::vector column_names) { + name_projection_ = std::move(column_names); + projection_.clear(); + return *this; +} + +std::vector TableScan::ResolveNameProjection() const { + auto ffi_info = table_->get_table_info_from_table(); + const auto& columns = ffi_info.schema.columns; + + std::vector indices; + for (const auto& name : name_projection_) { + bool found = false; + for (size_t i = 0; i < columns.size(); ++i) { + if 
(std::string(columns[i].name) == name) {
+                indices.push_back(i);
+                found = true;
+                break;
+            }
+        }
+        if (!found) {
+            throw std::runtime_error("Column '" + name + "' not found");
+        }
+    }
+    return indices;
+}
+
+Result TableScan::CreateLogScanner(LogScanner& out) { return DoCreateScanner(out, false); }
+
+Result TableScan::CreateRecordBatchLogScanner(LogScanner& out) {
+    return DoCreateScanner(out, true);
+}
+
+// Shared implementation behind CreateLogScanner / CreateRecordBatchLogScanner.
+//
+// Resolves the configured projection (names take precedence over indices when
+// a name projection is set), asks the table for a scanner and, on success,
+// stores the new handle in `out`.
+//
+// Params:
+//   out             - receives the scanner; left untouched on failure.
+//   is_record_batch - forwarded to create_scanner to select the scanner kind.
+// Returns the Result converted from the FFI call. A client error is returned
+// when the table handle is null or when projection resolution throws (for
+// example on an unknown column name).
+Result TableScan::DoCreateScanner(LogScanner& out, bool is_record_batch) {
+    if (table_ == nullptr) {
+        return utils::make_client_error("Table not available");
+    }
+
+    try {
+        auto resolved_indices = !name_projection_.empty() ? ResolveNameProjection() : projection_;
+        rust::Vec rust_indices;
+        for (size_t idx : resolved_indices) {
+            rust_indices.push_back(idx);
+        }
+        auto ffi_result = table_->create_scanner(std::move(rust_indices), is_record_batch);
+        auto result = utils::from_ffi_result(ffi_result.result);
+        if (result.Ok()) {
+            // Release any scanner `out` already owns before adopting the new
+            // handle; overwriting the raw pointer would leak the previous one.
+            // Same Destroy()-then-assign order AppendWriter::AppendArrowBatch
+            // uses for its WriteResult out-parameter.
+            // NOTE(review): assumes LogScanner exposes Destroy() to TableScan
+            // like the other wrappers in this file -- confirm in fluss.hpp.
+            out.Destroy();
+            out.scanner_ = utils::ptr_from_ffi(ffi_result);
+        }
+        return result;
+    } catch (const std::exception& e) {
+        // ResolveNameProjection() may throw
+        return utils::make_client_error(e.what());
+    }
+}
+
+// ============================================================================
+// WriteResult
+// ============================================================================
+
+WriteResult::WriteResult() noexcept = default;
+
+WriteResult::WriteResult(ffi::WriteResult* inner) noexcept : inner_(inner) {}
+
+WriteResult::~WriteResult() noexcept { Destroy(); }
+
+void WriteResult::Destroy() noexcept {
+    if (inner_) {
+        ffi::delete_write_result(inner_);
+        inner_ = nullptr;
+    }
+}
+
+WriteResult::WriteResult(WriteResult&& other) noexcept : inner_(other.inner_) {
+    other.inner_ = nullptr;
+}
+
+WriteResult& WriteResult::operator=(WriteResult&& other) noexcept {
+    if (this != &other) {
+        Destroy();
+        inner_ = other.inner_;
+        other.inner_ = nullptr;
+    }
+    return *this;
+}
+
+bool WriteResult::Available() const { return inner_ != nullptr; }
+
+Result WriteResult::Wait() { + if (!Available()) { + return utils::make_ok(); + } + + auto ffi_result = inner_->wait(); + return utils::from_ffi_result(ffi_result); +} + +// ============================================================================ +// AppendWriter +// ============================================================================ + +AppendWriter::AppendWriter() noexcept = default; + +AppendWriter::AppendWriter(ffi::AppendWriter* writer) noexcept : writer_(writer) {} + +AppendWriter::~AppendWriter() noexcept { Destroy(); } + +void AppendWriter::Destroy() noexcept { + if (writer_) { + ffi::delete_append_writer(writer_); + writer_ = nullptr; + } +} + +AppendWriter::AppendWriter(AppendWriter&& other) noexcept : writer_(other.writer_) { + other.writer_ = nullptr; +} + +AppendWriter& AppendWriter::operator=(AppendWriter&& other) noexcept { + if (this != &other) { + Destroy(); + writer_ = other.writer_; + other.writer_ = nullptr; + } + return *this; +} + +bool AppendWriter::Available() const { return writer_ != nullptr; } + +Result AppendWriter::Append(const GenericRow& row) { + WriteResult wr; + return Append(row, wr); +} + +Result AppendWriter::Append(const GenericRow& row, WriteResult& out) { + if (!Available()) { + return utils::make_client_error("AppendWriter not available"); + } + if (!row.Available()) { + return utils::make_client_error("GenericRow not available"); + } + + auto ffi_result = writer_->append(*row.inner_); + auto result = utils::from_ffi_result(ffi_result.result); + if (result.Ok()) { + out = WriteResult(utils::ptr_from_ffi(ffi_result)); + } + return result; +} + +Result AppendWriter::AppendArrowBatch(const std::shared_ptr& batch) { + WriteResult wr; + return AppendArrowBatch(batch, wr); +} + +Result AppendWriter::AppendArrowBatch(const std::shared_ptr& batch, + WriteResult& out) { + if (!Available()) { + return utils::make_client_error("AppendWriter not available"); + } + if (!batch) { + return utils::make_client_error("Arrow 
RecordBatch is null"); + } + + // Export via Arrow C Data Interface + struct ArrowArray c_array; + struct ArrowSchema c_schema; + auto status = arrow::ExportRecordBatch(*batch, &c_array, &c_schema); + if (!status.ok()) { + return utils::make_client_error("Failed to export Arrow batch: " + status.ToString()); + } + + // Heap-allocate for Rust ownership transfer + auto* array_heap = new ArrowArray(std::move(c_array)); + auto* schema_heap = new ArrowSchema(std::move(c_schema)); + + // Rust takes ownership of both pointers immediately via Box::from_raw(), + // so after this call C++ must NOT free them. + auto ffi_result = writer_->append_arrow_batch(reinterpret_cast(array_heap), + reinterpret_cast(schema_heap)); + auto result = utils::from_ffi_result(ffi_result.result); + if (result.Ok()) { + out.Destroy(); + out.inner_ = utils::ptr_from_ffi(ffi_result); + } + return result; +} + +Result AppendWriter::Flush() { + if (!Available()) { + return utils::make_client_error("AppendWriter not available"); + } + + auto ffi_result = writer_->flush(); + return utils::from_ffi_result(ffi_result); +} + +// ============================================================================ +// UpsertWriter +// ============================================================================ + +UpsertWriter::UpsertWriter() noexcept = default; + +UpsertWriter::UpsertWriter(ffi::UpsertWriter* writer) noexcept : writer_(writer) {} + +UpsertWriter::~UpsertWriter() noexcept { Destroy(); } + +void UpsertWriter::Destroy() noexcept { + if (writer_) { + ffi::delete_upsert_writer(writer_); + writer_ = nullptr; + } +} + +UpsertWriter::UpsertWriter(UpsertWriter&& other) noexcept : writer_(other.writer_) { + other.writer_ = nullptr; +} + +UpsertWriter& UpsertWriter::operator=(UpsertWriter&& other) noexcept { + if (this != &other) { + Destroy(); + writer_ = other.writer_; + other.writer_ = nullptr; + } + return *this; +} + +bool UpsertWriter::Available() const { return writer_ != nullptr; } + +Result 
UpsertWriter::Upsert(const GenericRow& row) { + WriteResult wr; + return Upsert(row, wr); +} + +Result UpsertWriter::Upsert(const GenericRow& row, WriteResult& out) { + if (!Available()) { + return utils::make_client_error("UpsertWriter not available"); + } + if (!row.Available()) { + return utils::make_client_error("GenericRow not available"); + } + + auto ffi_result = writer_->upsert(*row.inner_); + auto result = utils::from_ffi_result(ffi_result.result); + if (result.Ok()) { + out = WriteResult(utils::ptr_from_ffi(ffi_result)); + } + return result; +} + +Result UpsertWriter::Delete(const GenericRow& row) { + WriteResult wr; + return Delete(row, wr); +} + +Result UpsertWriter::Delete(const GenericRow& row, WriteResult& out) { + if (!Available()) { + return utils::make_client_error("UpsertWriter not available"); + } + if (!row.Available()) { + return utils::make_client_error("GenericRow not available"); + } + + auto ffi_result = writer_->delete_row(*row.inner_); + auto result = utils::from_ffi_result(ffi_result.result); + if (result.Ok()) { + out = WriteResult(utils::ptr_from_ffi(ffi_result)); + } + return result; +} + +Result UpsertWriter::Flush() { + if (!Available()) { + return utils::make_client_error("UpsertWriter not available"); + } + + auto ffi_result = writer_->upsert_flush(); + return utils::from_ffi_result(ffi_result); +} + +// ============================================================================ +// Lookuper +// ============================================================================ + +Lookuper::Lookuper() noexcept = default; + +Lookuper::Lookuper(ffi::Lookuper* lookuper) noexcept : lookuper_(lookuper) {} + +Lookuper::~Lookuper() noexcept { Destroy(); } + +void Lookuper::Destroy() noexcept { + if (lookuper_) { + ffi::delete_lookuper(lookuper_); + lookuper_ = nullptr; + } +} + +Lookuper::Lookuper(Lookuper&& other) noexcept : lookuper_(other.lookuper_) { + other.lookuper_ = nullptr; +} + +Lookuper& Lookuper::operator=(Lookuper&& other) 
noexcept { + if (this != &other) { + Destroy(); + lookuper_ = other.lookuper_; + other.lookuper_ = nullptr; + } + return *this; +} + +bool Lookuper::Available() const { return lookuper_ != nullptr; } + +Result Lookuper::Lookup(const GenericRow& pk_row, LookupResult& out) { + if (!Available()) { + return utils::make_client_error("Lookuper not available"); + } + if (!pk_row.Available()) { + return utils::make_client_error("GenericRow not available"); + } + + auto result_box = lookuper_->lookup(*pk_row.inner_); + if (result_box->lv_has_error()) { + return utils::make_error(result_box->lv_error_code(), + std::string(result_box->lv_error_message())); + } + + out.Destroy(); + out.inner_ = result_box.into_raw(); + return utils::make_ok(); +} + +// ============================================================================ +// LogScanner +// ============================================================================ + +LogScanner::LogScanner() noexcept = default; + +LogScanner::LogScanner(ffi::LogScanner* scanner) noexcept : scanner_(scanner) {} + +LogScanner::~LogScanner() noexcept { Destroy(); } + +void LogScanner::Destroy() noexcept { + if (scanner_) { + ffi::delete_log_scanner(scanner_); + scanner_ = nullptr; + } +} + +LogScanner::LogScanner(LogScanner&& other) noexcept : scanner_(other.scanner_) { + other.scanner_ = nullptr; +} + +LogScanner& LogScanner::operator=(LogScanner&& other) noexcept { + if (this != &other) { + Destroy(); + scanner_ = other.scanner_; + other.scanner_ = nullptr; + } + return *this; +} + +bool LogScanner::Available() const { return scanner_ != nullptr; } + +Result LogScanner::Subscribe(int32_t bucket_id, int64_t start_offset) { + if (!Available()) { + return utils::make_client_error("LogScanner not available"); + } + + auto ffi_result = scanner_->subscribe(bucket_id, start_offset); + return utils::from_ffi_result(ffi_result); +} + +Result LogScanner::Subscribe(const std::vector& bucket_offsets) { + if (!Available()) { + return 
utils::make_client_error("LogScanner not available"); + } + + rust::Vec rust_subs; + for (const auto& sub : bucket_offsets) { + ffi::FfiBucketSubscription ffi_sub; + ffi_sub.bucket_id = sub.bucket_id; + ffi_sub.offset = sub.offset; + rust_subs.push_back(ffi_sub); + } + + auto ffi_result = scanner_->subscribe_buckets(std::move(rust_subs)); + return utils::from_ffi_result(ffi_result); +} + +Result LogScanner::SubscribePartitionBuckets(int64_t partition_id, int32_t bucket_id, + int64_t start_offset) { + if (!Available()) { + return utils::make_client_error("LogScanner not available"); + } + + auto ffi_result = scanner_->subscribe_partition(partition_id, bucket_id, start_offset); + return utils::from_ffi_result(ffi_result); +} + +Result LogScanner::SubscribePartitionBuckets( + const std::vector& subscriptions) { + if (!Available()) { + return utils::make_client_error("LogScanner not available"); + } + + rust::Vec rust_subs; + for (const auto& sub : subscriptions) { + ffi::FfiPartitionBucketSubscription ffi_sub; + ffi_sub.partition_id = sub.partition_id; + ffi_sub.bucket_id = sub.bucket_id; + ffi_sub.offset = sub.offset; + rust_subs.push_back(ffi_sub); + } + + auto ffi_result = scanner_->subscribe_partition_buckets(std::move(rust_subs)); + return utils::from_ffi_result(ffi_result); +} + +Result LogScanner::Unsubscribe(int32_t bucket_id) { + if (!Available()) { + return utils::make_client_error("LogScanner not available"); + } + + auto ffi_result = scanner_->unsubscribe(bucket_id); + return utils::from_ffi_result(ffi_result); +} + +Result LogScanner::UnsubscribePartition(int64_t partition_id, int32_t bucket_id) { + if (!Available()) { + return utils::make_client_error("LogScanner not available"); + } + + auto ffi_result = scanner_->unsubscribe_partition(partition_id, bucket_id); + return utils::from_ffi_result(ffi_result); +} + +Result LogScanner::Poll(int64_t timeout_ms, ScanRecords& out) { + if (!Available()) { + return utils::make_client_error("LogScanner not 
available"); + } + + auto result_box = scanner_->poll(timeout_ms); + if (result_box->sv_has_error()) { + return utils::make_error(result_box->sv_error_code(), + std::string(result_box->sv_error_message())); + } + + // Wrap raw pointer in ScanData immediately so it's never leaked on exception. + auto data = std::make_shared(result_box.into_raw(), detail::ColumnMap{}); + // Build column map eagerly — shared by all RowViews/BucketRecords. + auto col_count = data->raw->sv_column_count(); + for (size_t i = 0; i < col_count; ++i) { + auto name = data->raw->sv_column_name(i); + data->columns[std::string(name.data(), name.size())] = { + i, static_cast(data->raw->sv_column_type(i))}; + } + out.data_ = std::move(data); + return utils::make_ok(); +} + +ArrowRecordBatch::ArrowRecordBatch(std::shared_ptr batch, int64_t table_id, + int64_t partition_id, int32_t bucket_id, + int64_t base_offset) noexcept + : batch_(std::move(batch)), + table_id_(table_id), + partition_id_(partition_id), + bucket_id_(bucket_id), + base_offset_(base_offset) {} + +bool ArrowRecordBatch::Available() const { return batch_ != nullptr; } + +int64_t ArrowRecordBatch::NumRows() const { + if (!Available()) return 0; + return batch_->num_rows(); +} + +int64_t ArrowRecordBatch::GetTableId() const { + if (!Available()) return 0; + return this->table_id_; +} + +int64_t ArrowRecordBatch::GetPartitionId() const { + if (!Available()) return -1; + return this->partition_id_; +} + +int32_t ArrowRecordBatch::GetBucketId() const { + if (!Available()) return -1; + return this->bucket_id_; +} + +int64_t ArrowRecordBatch::GetBaseOffset() const { + if (!Available()) return -1; + return this->base_offset_; +} + +int64_t ArrowRecordBatch::GetLastOffset() const { + if (!Available()) return -1; + return this->base_offset_ + this->NumRows() - 1; +} + +Result LogScanner::PollRecordBatch(int64_t timeout_ms, ArrowRecordBatches& out) { + if (!Available()) { + return utils::make_client_error("LogScanner not available"); + } + + 
auto ffi_result = scanner_->poll_record_batch(timeout_ms); + auto result = utils::from_ffi_result(ffi_result.result); + if (!result.Ok()) { + return result; + } + + // Convert the FFI Arrow record batches to C++ ArrowRecordBatch objects + out.batches.clear(); + for (const auto& ffi_batch : ffi_result.arrow_batches.batches) { + auto* c_array = reinterpret_cast(ffi_batch.array_ptr); + auto* c_schema = reinterpret_cast(ffi_batch.schema_ptr); + + auto import_result = arrow::ImportRecordBatch(c_array, c_schema); + if (import_result.ok()) { + auto batch_ptr = import_result.ValueOrDie(); + auto batch_wrapper = std::unique_ptr(new ArrowRecordBatch( + std::move(batch_ptr), ffi_batch.table_id, ffi_batch.partition_id, + ffi_batch.bucket_id, ffi_batch.base_offset)); + out.batches.push_back(std::move(batch_wrapper)); + + // Free the container structures that were allocated in Rust after successful import + ffi::free_arrow_ffi_structures(ffi_batch.array_ptr, ffi_batch.schema_ptr); + } else { + // Import failed, free the container structures to avoid leaks and return error + ffi::free_arrow_ffi_structures(ffi_batch.array_ptr, ffi_batch.schema_ptr); + + // Return an error indicating that the import failed + std::string error_msg = + "Failed to import Arrow record batch: " + import_result.status().ToString(); + return utils::make_client_error(error_msg); + } + } + + return utils::make_ok(); +} + +} // namespace fluss diff --git a/fluss-rust/bindings/cpp/src/types.rs b/fluss-rust/bindings/cpp/src/types.rs new file mode 100644 index 0000000000..23ac636d4c --- /dev/null +++ b/fluss-rust/bindings/cpp/src/types.rs @@ -0,0 +1,646 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::ffi; +use anyhow::{Result, anyhow}; +use arrow::array::Array; +use arrow::ffi::{FFI_ArrowArray, FFI_ArrowSchema}; +use fluss as fcore; +use std::borrow::Cow; +use std::str::FromStr; + +pub const DATA_TYPE_BOOLEAN: i32 = 1; +pub const DATA_TYPE_TINYINT: i32 = 2; +pub const DATA_TYPE_SMALLINT: i32 = 3; +pub const DATA_TYPE_INT: i32 = 4; +pub const DATA_TYPE_BIGINT: i32 = 5; +pub const DATA_TYPE_FLOAT: i32 = 6; +pub const DATA_TYPE_DOUBLE: i32 = 7; +pub const DATA_TYPE_STRING: i32 = 8; +pub const DATA_TYPE_BYTES: i32 = 9; +pub const DATA_TYPE_DATE: i32 = 10; +pub const DATA_TYPE_TIME: i32 = 11; +pub const DATA_TYPE_TIMESTAMP: i32 = 12; +pub const DATA_TYPE_TIMESTAMP_LTZ: i32 = 13; +pub const DATA_TYPE_DECIMAL: i32 = 14; +pub const DATA_TYPE_CHAR: i32 = 15; +pub const DATA_TYPE_BINARY: i32 = 16; +pub const DATA_TYPE_ARRAY: i32 = 17; + +/// Separates scalar and array type specs so each variant only carries +/// the fields it actually needs — no zeroed-out placeholders. +enum FfiDataTypeSpec { + Scalar { + data_type: i32, + precision: u32, + scale: u32, + nullable: bool, + }, + Array { + element_data_type: i32, + element_precision: u32, + element_scale: u32, + array_nesting: u32, + /// `nesting` entries for each ARRAY wrapper (outermost first) plus + /// one trailing entry for the leaf scalar. Length = `nesting + 1`. 
+ array_nullability: Vec, + }, +} + +fn ffi_column_to_core_data_type(col: &ffi::FfiColumn) -> Result { + if col.data_type == DATA_TYPE_ARRAY { + ffi_data_type_to_core(FfiDataTypeSpec::Array { + element_data_type: col.element_data_type, + element_precision: col.element_precision as u32, + element_scale: col.element_scale as u32, + array_nesting: col.array_nesting.max(0) as u32, + array_nullability: col.array_nullability.clone(), + }) + } else { + ffi_data_type_to_core(FfiDataTypeSpec::Scalar { + data_type: col.data_type, + precision: col.precision as u32, + scale: col.scale as u32, + nullable: col.nullable, + }) + } +} + +fn type_precision_scale(dt: &fcore::metadata::DataType) -> (i32, i32) { + match dt { + fcore::metadata::DataType::Decimal(d) => (d.precision() as i32, d.scale() as i32), + fcore::metadata::DataType::Timestamp(ts) => (ts.precision() as i32, 0), + fcore::metadata::DataType::TimestampLTz(ts) => (ts.precision() as i32, 0), + fcore::metadata::DataType::Char(ch) => (ch.length() as i32, 0), + fcore::metadata::DataType::Binary(bin) => (bin.length() as i32, 0), + _ => (0, 0), + } +} + +struct FlattenedLeafType { + nesting: i32, + leaf_type: i32, + leaf_precision: i32, + leaf_scale: i32, + /// `nesting` entries for ARRAY wrappers (outermost first) plus one + /// trailing entry for the leaf scalar. Length = `nesting + 1`. 
+ array_nullability: Vec, +} + +fn flatten_array_leaf_type(dt: &fcore::metadata::DataType) -> Result { + let mut nesting = 0_i32; + let mut leaf = dt; + let mut array_nullability = Vec::new(); + while let fcore::metadata::DataType::Array(at) = leaf { + nesting += 1; + array_nullability.push(u8::from(leaf.is_nullable())); + leaf = at.get_element_type(); + } + if nesting == 0 { + return Err(anyhow!("Expected ARRAY data type, got {dt}")); + } + let leaf_type = core_data_type_to_ffi(leaf); + if leaf_type == 0 { + return Err(anyhow!( + "Unsupported ARRAY leaf type for C++ bindings: {leaf}" + )); + } + array_nullability.push(u8::from(leaf.is_nullable())); + let (leaf_precision, leaf_scale) = type_precision_scale(leaf); + Ok(FlattenedLeafType { + nesting, + leaf_type, + leaf_precision, + leaf_scale, + array_nullability, + }) +} + +fn build_array_type_from_leaf( + element_data_type: i32, + element_precision: u32, + element_scale: u32, + array_nesting: u32, + array_nullability: &[u8], +) -> Result { + if array_nesting == 0 { + return Err(anyhow!("ARRAY nesting must be >= 1")); + } + let leaf_nullable = array_nullability + .get(array_nesting as usize) + .map(|v| *v != 0) + .unwrap_or(true); + let mut dt = ffi_data_type_to_core(FfiDataTypeSpec::Scalar { + data_type: element_data_type, + precision: element_precision, + scale: element_scale, + nullable: leaf_nullable, + })?; + for i in (0..array_nesting).rev() { + let nullable = array_nullability + .get(i as usize) + .map(|v| *v != 0) + .unwrap_or(true); + dt = fcore::metadata::DataType::Array(fcore::metadata::ArrayType::with_nullable( + nullable, dt, + )); + } + Ok(dt) +} + +fn ffi_data_type_to_core(spec: FfiDataTypeSpec) -> Result { + match spec { + FfiDataTypeSpec::Scalar { + data_type, + precision, + scale, + nullable, + } => { + let dt = match data_type { + DATA_TYPE_BOOLEAN => fcore::metadata::DataTypes::boolean(), + DATA_TYPE_TINYINT => fcore::metadata::DataTypes::tinyint(), + DATA_TYPE_SMALLINT => 
fcore::metadata::DataTypes::smallint(), + DATA_TYPE_INT => fcore::metadata::DataTypes::int(), + DATA_TYPE_BIGINT => fcore::metadata::DataTypes::bigint(), + DATA_TYPE_FLOAT => fcore::metadata::DataTypes::float(), + DATA_TYPE_DOUBLE => fcore::metadata::DataTypes::double(), + DATA_TYPE_STRING => fcore::metadata::DataTypes::string(), + DATA_TYPE_BYTES => fcore::metadata::DataTypes::bytes(), + DATA_TYPE_DATE => fcore::metadata::DataTypes::date(), + DATA_TYPE_TIME => fcore::metadata::DataTypes::time(), + DATA_TYPE_TIMESTAMP => { + fcore::metadata::DataTypes::timestamp_with_precision(precision) + } + DATA_TYPE_TIMESTAMP_LTZ => { + fcore::metadata::DataTypes::timestamp_ltz_with_precision(precision) + } + DATA_TYPE_DECIMAL => { + let dt = fcore::metadata::DecimalType::new(precision, scale)?; + fcore::metadata::DataType::Decimal(dt) + } + DATA_TYPE_CHAR => fcore::metadata::DataTypes::char(precision), + DATA_TYPE_BINARY => fcore::metadata::DataTypes::binary(precision as usize), + _ => return Err(anyhow!("Unknown data type: {}", data_type)), + }; + if nullable { + Ok(dt) + } else { + Ok(dt.as_non_nullable()) + } + } + FfiDataTypeSpec::Array { + element_data_type, + element_precision, + element_scale, + array_nesting, + ref array_nullability, + } => build_array_type_from_leaf( + element_data_type, + element_precision, + element_scale, + array_nesting, + array_nullability, + ), + } +} + +pub fn core_data_type_to_ffi(dt: &fcore::metadata::DataType) -> i32 { + match dt { + fcore::metadata::DataType::Boolean(_) => DATA_TYPE_BOOLEAN, + fcore::metadata::DataType::TinyInt(_) => DATA_TYPE_TINYINT, + fcore::metadata::DataType::SmallInt(_) => DATA_TYPE_SMALLINT, + fcore::metadata::DataType::Int(_) => DATA_TYPE_INT, + fcore::metadata::DataType::BigInt(_) => DATA_TYPE_BIGINT, + fcore::metadata::DataType::Float(_) => DATA_TYPE_FLOAT, + fcore::metadata::DataType::Double(_) => DATA_TYPE_DOUBLE, + fcore::metadata::DataType::String(_) => DATA_TYPE_STRING, + fcore::metadata::DataType::Bytes(_) 
=> DATA_TYPE_BYTES, + fcore::metadata::DataType::Date(_) => DATA_TYPE_DATE, + fcore::metadata::DataType::Time(_) => DATA_TYPE_TIME, + fcore::metadata::DataType::Timestamp(_) => DATA_TYPE_TIMESTAMP, + fcore::metadata::DataType::TimestampLTz(_) => DATA_TYPE_TIMESTAMP_LTZ, + fcore::metadata::DataType::Decimal(_) => DATA_TYPE_DECIMAL, + fcore::metadata::DataType::Char(_) => DATA_TYPE_CHAR, + fcore::metadata::DataType::Binary(_) => DATA_TYPE_BINARY, + fcore::metadata::DataType::Array(_) => DATA_TYPE_ARRAY, + _ => 0, + } +} + +fn core_column_to_ffi(col: &fcore::metadata::Column) -> ffi::FfiColumn { + let (precision, scale) = type_precision_scale(col.data_type()); + + let flat = match col.data_type() { + fcore::metadata::DataType::Array(_) => flatten_array_leaf_type(col.data_type()).ok(), + _ => None, + }; + + ffi::FfiColumn { + name: col.name().to_string(), + data_type: core_data_type_to_ffi(col.data_type()), + nullable: col.data_type().is_nullable(), + comment: col.comment().unwrap_or("").to_string(), + precision, + scale, + array_nesting: flat.as_ref().map_or(0, |f| f.nesting), + array_nullability: flat + .as_ref() + .map_or_else(Vec::new, |f| f.array_nullability.clone()), + element_data_type: flat.as_ref().map_or(0, |f| f.leaf_type), + element_precision: flat.as_ref().map_or(0, |f| f.leaf_precision), + element_scale: flat.as_ref().map_or(0, |f| f.leaf_scale), + } +} + +pub fn ffi_descriptor_to_core( + descriptor: &ffi::FfiTableDescriptor, +) -> Result { + let mut schema_builder = fcore::metadata::Schema::builder(); + + for col in &descriptor.schema.columns { + if col.precision < 0 || col.scale < 0 || col.array_nesting < 0 { + return Err(anyhow!( + "Column '{}': precision, scale, and array_nesting must be non-negative", + col.name + )); + } + let dt = ffi_column_to_core_data_type(col)?; + schema_builder = schema_builder.column(&col.name, dt); + if !col.comment.is_empty() { + schema_builder = schema_builder.with_comment(&col.comment); + } + } + + if 
!descriptor.schema.primary_keys.is_empty() { + schema_builder = schema_builder.primary_key(descriptor.schema.primary_keys.clone()); + } + + let schema = schema_builder.build()?; + + let mut builder = fcore::metadata::TableDescriptor::builder() + .schema(schema) + .partitioned_by(descriptor.partition_keys.clone()); + + if descriptor.bucket_count > 0 { + builder = builder.distributed_by( + Some(descriptor.bucket_count), + descriptor.bucket_keys.clone(), + ); + } else { + builder = builder.distributed_by(None, descriptor.bucket_keys.clone()); + } + + for prop in &descriptor.properties { + builder = builder.property(&prop.key, &prop.value); + } + + if !descriptor.custom_properties.is_empty() { + let custom: std::collections::HashMap = descriptor + .custom_properties + .iter() + .map(|kv| (kv.key.clone(), kv.value.clone())) + .collect(); + builder = builder.custom_properties(custom); + } + + if !descriptor.comment.is_empty() { + builder = builder.comment(&descriptor.comment); + } + + Ok(builder.build()?) 
+} + +pub fn core_table_info_to_ffi(info: &fcore::metadata::TableInfo) -> ffi::FfiTableInfo { + let schema = info.get_schema(); + let columns: Vec = schema.columns().iter().map(core_column_to_ffi).collect(); + + let primary_keys: Vec = schema + .primary_key() + .map(|pk| pk.column_names().to_vec()) + .unwrap_or_default(); + + let properties: Vec = info + .get_properties() + .iter() + .map(|(k, v)| ffi::HashMapValue { + key: k.clone(), + value: v.clone(), + }) + .collect(); + + let custom_properties: Vec = info + .get_custom_properties() + .iter() + .map(|(k, v)| ffi::HashMapValue { + key: k.clone(), + value: v.clone(), + }) + .collect(); + + ffi::FfiTableInfo { + table_id: info.get_table_id(), + schema_id: info.get_schema_id(), + table_path: ffi::FfiTablePath { + database_name: info.get_table_path().database().to_string(), + table_name: info.get_table_path().table().to_string(), + }, + created_time: info.get_created_time(), + modified_time: info.get_modified_time(), + primary_keys: info.get_primary_keys().clone(), + bucket_keys: info.get_bucket_keys().to_vec(), + partition_keys: info.get_partition_keys().to_vec(), + num_buckets: info.get_num_buckets(), + has_primary_key: info.has_primary_key(), + is_partitioned: info.is_partitioned(), + properties, + custom_properties, + comment: info.get_comment().unwrap_or("").to_string(), + schema: ffi::FfiSchema { + columns, + primary_keys, + }, + } +} + +pub fn empty_table_info() -> ffi::FfiTableInfo { + ffi::FfiTableInfo { + table_id: 0, + schema_id: 0, + table_path: ffi::FfiTablePath { + database_name: String::new(), + table_name: String::new(), + }, + created_time: 0, + modified_time: 0, + primary_keys: vec![], + bucket_keys: vec![], + partition_keys: vec![], + num_buckets: 0, + has_primary_key: false, + is_partitioned: false, + properties: vec![], + custom_properties: vec![], + comment: String::new(), + schema: ffi::FfiSchema { + columns: vec![], + primary_keys: vec![], + }, + } +} + +/// Convert element type tag + 
precision/scale to core DataType. +/// Used by ArrayWriterInner construction from C++. +/// +/// Nullability is hardcoded to `true` (the default) because `ArrayWriter` +/// only needs the type for encoding — the binary array format does not +/// vary based on nullability. Nullability is a schema-level constraint +/// enforced elsewhere (column definition, primary key normalization). +pub fn element_type_from_ffi( + leaf_dt: i32, + precision: u32, + scale: u32, + array_nesting: u32, +) -> Result { + if array_nesting == 0 { + ffi_data_type_to_core(FfiDataTypeSpec::Scalar { + data_type: leaf_dt, + precision, + scale, + nullable: true, + }) + } else { + let array_nullability = vec![1u8; (array_nesting + 1) as usize]; + build_array_type_from_leaf(leaf_dt, precision, scale, array_nesting, &array_nullability) + } +} + +/// Convert FFI database descriptor to core. Returns None if descriptor is effectively empty +/// (no comment and no properties), so create_database can pass Option::None to core. +pub fn ffi_database_descriptor_to_core( + d: &ffi::FfiDatabaseDescriptor, +) -> Option { + if d.comment.is_empty() && d.properties.is_empty() { + return None; + } + let mut builder = fcore::metadata::DatabaseDescriptor::builder(); + if !d.comment.is_empty() { + builder = builder.comment(&d.comment); + } + if !d.properties.is_empty() { + let props: std::collections::HashMap = d + .properties + .iter() + .map(|kv| (kv.key.clone(), kv.value.clone())) + .collect(); + builder = builder.custom_properties(props); + } + Some(builder.build()) +} + +/// Convert core DatabaseInfo to FFI. 
+pub fn core_database_info_to_ffi(info: &fcore::metadata::DatabaseInfo) -> ffi::FfiDatabaseInfo { + let desc = info.database_descriptor(); + let properties: Vec = desc + .custom_properties() + .iter() + .map(|(k, v)| ffi::HashMapValue { + key: k.clone(), + value: v.clone(), + }) + .collect(); + ffi::FfiDatabaseInfo { + database_name: info.database_name().to_string(), + comment: desc.comment().unwrap_or("").to_string(), + properties, + created_time: info.created_time(), + modified_time: info.modified_time(), + } +} + +/// Resolve types in a GenericRow using schema metadata. +/// Narrows Int32 → Int8/Int16, parses decimal strings, etc. +/// Used by both AppendWriter and UpsertWriter. +pub fn resolve_row_types( + row: &fcore::row::GenericRow<'_>, + schema: Option<&fcore::metadata::Schema>, +) -> Result> { + use fcore::row::Datum; + + let mut out = fcore::row::GenericRow::new(row.values.len()); + + for (idx, datum) in row.values.iter().enumerate() { + let resolved = match datum { + Datum::Null => Datum::Null, + Datum::Bool(v) => Datum::Bool(*v), + Datum::Int32(v) => match schema + .and_then(|s| s.columns().get(idx)) + .map(|c| c.data_type()) + { + Some(fcore::metadata::DataType::TinyInt(_)) => Datum::Int8( + i8::try_from(*v).map_err(|_| anyhow!("Column {idx}: {v} overflows TinyInt"))?, + ), + Some(fcore::metadata::DataType::SmallInt(_)) => Datum::Int16( + i16::try_from(*v) + .map_err(|_| anyhow!("Column {idx}: {v} overflows SmallInt"))?, + ), + _ => Datum::Int32(*v), + }, + Datum::Int64(v) => Datum::Int64(*v), + Datum::Float32(v) => Datum::Float32(*v), + Datum::Float64(v) => Datum::Float64(*v), + Datum::Int8(v) => Datum::Int8(*v), + Datum::Int16(v) => Datum::Int16(*v), + Datum::String(cow) => { + // Check if the schema column is Decimal — if so, parse the string as decimal + match schema + .and_then(|s| s.columns().get(idx)) + .map(|c| c.data_type()) + { + Some(fcore::metadata::DataType::Decimal(dt)) => { + let (precision, scale) = (dt.precision(), dt.scale()); + let 
bd = bigdecimal::BigDecimal::from_str(cow.as_ref()).map_err(|e| { + anyhow!("Column {idx}: invalid decimal string '{cow}': {e}") + })?; + let decimal = fcore::row::Decimal::from_big_decimal(bd, precision, scale) + .map_err(|e| anyhow!("Column {idx}: {e}"))?; + Datum::Decimal(decimal) + } + _ => Datum::String(Cow::Owned(cow.to_string())), + } + } + Datum::Blob(cow) => Datum::Blob(Cow::Owned(cow.to_vec())), + Datum::Decimal(d) => Datum::Decimal(d.clone()), + Datum::Date(d) => Datum::Date(*d), + Datum::Time(t) => Datum::Time(*t), + Datum::TimestampNtz(ts) => Datum::TimestampNtz(*ts), + Datum::TimestampLtz(ts) => Datum::TimestampLtz(*ts), + Datum::Array(a) => Datum::Array(a.clone()), + Datum::Map(m) => Datum::Map(m.clone()), + Datum::Row(_) => return Err(anyhow!("Row datum is not yet supported in C++ bindings")), + }; + out.set_field(idx, resolved); + } + + Ok(out) +}
+
+/// Convert a CompactedRow (lookup result) to an owned GenericRow<'static>.
+/// One copy for strings/bytes (Cow::Owned), but no second copy into FfiDatum.
+///
+/// Columns are read in schema order using the column's declared data type to pick the
+/// matching typed getter on `row`. A column reported as null by `is_null_at` is stored
+/// as `Datum::Null` without calling a typed getter.
+///
+/// # Errors
+/// Propagates any error returned by the row getters, and fails with
+/// "Unsupported data type for column {i}" for data types that have no arm below.
+pub fn compacted_row_to_owned(
+    row: &dyn fcore::row::InternalRow,
+    table_info: &fcore::metadata::TableInfo,
+) -> Result<fcore::row::GenericRow<'static>> {
+    use fcore::metadata::DataType;
+    use fcore::row::Datum;
+
+    let schema = table_info.get_schema();
+    let columns = schema.columns();
+    let mut out = fcore::row::GenericRow::new(columns.len());
+
+    for (i, col) in columns.iter().enumerate() {
+        if row.is_null_at(i)? {
+            out.set_field(i, Datum::Null);
+            continue;
+        }
+
+        let datum = match col.data_type() {
+            DataType::Boolean(_) => Datum::Bool(row.get_boolean(i)?),
+            DataType::TinyInt(_) => Datum::Int8(row.get_byte(i)?),
+            DataType::SmallInt(_) => Datum::Int16(row.get_short(i)?),
+            DataType::Int(_) => Datum::Int32(row.get_int(i)?),
+            DataType::BigInt(_) => Datum::Int64(row.get_long(i)?),
+            DataType::Float(_) => Datum::Float32(row.get_float(i)?.into()),
+            DataType::Double(_) => Datum::Float64(row.get_double(i)?.into()),
+            // Strings and byte slices borrow from `row`; copy them so the result is 'static.
+            DataType::String(_) => Datum::String(Cow::Owned(row.get_string(i)?.to_string())),
+            DataType::Bytes(_) => Datum::Blob(Cow::Owned(row.get_bytes(i)?.to_vec())),
+            DataType::Date(_) => Datum::Date(row.get_date(i)?),
+            DataType::Time(_) => Datum::Time(row.get_time(i)?),
+            DataType::Timestamp(dt) => {
+                Datum::TimestampNtz(row.get_timestamp_ntz(i, dt.precision())?)
+            }
+            DataType::TimestampLTz(dt) => {
+                Datum::TimestampLtz(row.get_timestamp_ltz(i, dt.precision())?)
+            }
+            DataType::Decimal(dt) => {
+                let decimal = row.get_decimal(i, dt.precision() as usize, dt.scale() as usize)?;
+                Datum::Decimal(decimal)
+            }
+            DataType::Char(dt) => Datum::String(Cow::Owned(
+                row.get_char(i, dt.length() as usize)?.to_string(),
+            )),
+            DataType::Binary(dt) => {
+                Datum::Blob(Cow::Owned(row.get_binary(i, dt.length())?.to_vec()))
+            }
+            DataType::Array(_) => Datum::Array(row.get_array(i)?),
+            DataType::Map(_) => Datum::Map(row.get_map(i)?),
+            other => return Err(anyhow!("Unsupported data type for column {i}: {other:?}")),
+        };
+
+        out.set_field(i, datum);
+    }
+
+    Ok(out)
+}
+
+/// Convert a core `LakeSnapshot` into its FFI representation.
+///
+/// Every entry of `table_buckets_offset` becomes one `FfiBucketOffset`. A bucket without
+/// a partition id is encoded with `partition_id == -1`.
+pub fn core_lake_snapshot_to_ffi(snapshot: &fcore::metadata::LakeSnapshot) -> ffi::FfiLakeSnapshot {
+    let bucket_offsets: Vec<ffi::FfiBucketOffset> = snapshot
+        .table_buckets_offset
+        .iter()
+        .map(|(bucket, offset)| ffi::FfiBucketOffset {
+            table_id: bucket.table_id(),
+            partition_id: bucket.partition_id().unwrap_or(-1),
+            bucket_id: bucket.bucket_id(),
+            offset: *offset,
+        })
+        .collect();
+
+    ffi::FfiLakeSnapshot {
+        snapshot_id: snapshot.snapshot_id,
+        bucket_offsets,
+    }
+}
+
+/// Export scan batches as Arrow C Data Interface structs for the FFI boundary.
+///
+/// Each batch becomes one `FfiArrowRecordBatch` holding two heap pointers (stored as
+/// `usize`) obtained from `Box::into_raw`, together with the bucket identity and the
+/// batch base offset. A bucket without a partition id is encoded with
+/// `partition_id == -1`.
+///
+/// Ownership of every exported pointer moves into the returned value; this function does
+/// not free them afterwards.
+/// NOTE(review): the consumer is presumably responsible for importing/releasing both
+/// pointers — confirm against the C++ side.
+///
+/// # Errors
+/// Returns the stringified error if a batch schema cannot be converted to
+/// `FFI_ArrowSchema`. All fallible conversions run before any pointer is detached with
+/// `Box::into_raw`, so a failure on a later batch drops the exports already built for
+/// earlier batches instead of leaking them.
+pub fn core_scan_batches_to_ffi(
+    batches: &[fcore::record::ScanBatch],
+) -> Result<ffi::FfiArrowRecordBatches, String> {
+    // Phase 1 (fallible): build every export while it is still owned by a `Box`, so an
+    // early return via `?` drops whatever has been built so far.
+    let mut exported = Vec::with_capacity(batches.len());
+    for batch in batches {
+        let record_batch = batch.batch();
+        // Convert RecordBatch to StructArray first, then get the data
+        let struct_array = arrow::array::StructArray::from(record_batch.clone());
+        let ffi_array = Box::new(FFI_ArrowArray::new(&struct_array.into_data()));
+        let ffi_schema = Box::new(
+            FFI_ArrowSchema::try_from(record_batch.schema().as_ref()).map_err(|e| e.to_string())?,
+        );
+        exported.push((batch, ffi_array, ffi_schema));
+    }
+
+    // Phase 2 (infallible): export as raw pointers now that nothing can fail anymore.
+    let ffi_batches = exported
+        .into_iter()
+        .map(|(batch, ffi_array, ffi_schema)| {
+            let bucket = batch.bucket();
+            ffi::FfiArrowRecordBatch {
+                array_ptr: Box::into_raw(ffi_array) as usize,
+                schema_ptr: Box::into_raw(ffi_schema) as usize,
+                table_id: bucket.table_id(),
+                partition_id: bucket.partition_id().unwrap_or(-1),
+                bucket_id: bucket.bucket_id(),
+                base_offset: batch.base_offset(),
+            }
+        })
+        .collect();
+
+    Ok(ffi::FfiArrowRecordBatches {
+        batches: ffi_batches,
+    })
+}
diff --git a/fluss-rust/bindings/cpp/test/test_admin.cpp b/fluss-rust/bindings/cpp/test/test_admin.cpp new file mode 100644 index 0000000000..99f93fcf1e --- /dev/null +++ b/fluss-rust/bindings/cpp/test/test_admin.cpp @@ -0,0 +1,356 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License.
+ */ + +#include + +#include "test_utils.h" + +class AdminTest : public ::testing::Test { + protected: + fluss::Admin& admin() { return fluss_test::FlussTestEnvironment::Instance()->GetAdmin(); } +}; + +TEST_F(AdminTest, CreateDatabase) { + auto& adm = admin(); + + std::string db_name = "test_create_database_cpp"; + + // Database should not exist initially + bool exists = true; + ASSERT_OK(adm.DatabaseExists(db_name, exists)); + ASSERT_FALSE(exists); + + // Create database with descriptor + fluss::DatabaseDescriptor descriptor; + descriptor.comment = "test_db"; + descriptor.properties = {{"k1", "v1"}, {"k2", "v2"}}; + ASSERT_OK(adm.CreateDatabase(db_name, descriptor, false)); + + // Database should exist now + ASSERT_OK(adm.DatabaseExists(db_name, exists)); + ASSERT_TRUE(exists); + + // Get database info + fluss::DatabaseInfo db_info; + ASSERT_OK(adm.GetDatabaseInfo(db_name, db_info)); + EXPECT_EQ(db_info.database_name, db_name); + EXPECT_EQ(db_info.comment, "test_db"); + EXPECT_EQ(db_info.properties.at("k1"), "v1"); + EXPECT_EQ(db_info.properties.at("k2"), "v2"); + + // Drop database + ASSERT_OK(adm.DropDatabase(db_name, false, true)); + + // Database should not exist now + ASSERT_OK(adm.DatabaseExists(db_name, exists)); + ASSERT_FALSE(exists); +} + +TEST_F(AdminTest, CreateTable) { + auto& adm = admin(); + + std::string db_name = "test_create_table_cpp_db"; + fluss::DatabaseDescriptor db_desc; + db_desc.comment = "Database for test_create_table"; + + bool exists = false; + ASSERT_OK(adm.DatabaseExists(db_name, exists)); + ASSERT_FALSE(exists); + + ASSERT_OK(adm.CreateDatabase(db_name, db_desc, false)); + + std::string table_name = "test_user_table"; + fluss::TablePath table_path(db_name, table_name); + + // Build schema + auto schema = fluss::Schema::NewBuilder() + .AddColumn("id", fluss::DataType::Int()) + .AddColumn("name", fluss::DataType::String()) + .AddColumn("age", fluss::DataType::Int(), "User's age (optional)") + .AddColumn("email", 
fluss::DataType::String()) + .SetPrimaryKeys({"id"}) + .Build(); + + // Build table descriptor + auto table_descriptor = fluss::TableDescriptor::NewBuilder() + .SetSchema(schema) + .SetComment("Test table for user data (id, name, age, email)") + .SetBucketCount(3) + .SetBucketKeys({"id"}) + .SetProperty("table.replication.factor", "1") + .SetLogFormat("arrow") + .SetKvFormat("indexed") + .Build(); + + // Create table + ASSERT_OK(adm.CreateTable(table_path, table_descriptor, false)); + + // Table should exist + ASSERT_OK(adm.TableExists(table_path, exists)); + ASSERT_TRUE(exists); + + // List tables + std::vector tables; + ASSERT_OK(adm.ListTables(db_name, tables)); + ASSERT_EQ(tables.size(), 1u); + EXPECT_TRUE(std::find(tables.begin(), tables.end(), table_name) != tables.end()); + + // Get table info + fluss::TableInfo table_info; + ASSERT_OK(adm.GetTableInfo(table_path, table_info)); + + EXPECT_EQ(table_info.comment, "Test table for user data (id, name, age, email)"); + EXPECT_EQ(table_info.primary_keys, std::vector{"id"}); + EXPECT_EQ(table_info.num_buckets, 3); + EXPECT_EQ(table_info.bucket_keys, std::vector{"id"}); + + // Drop table + ASSERT_OK(adm.DropTable(table_path, false)); + ASSERT_OK(adm.TableExists(table_path, exists)); + ASSERT_FALSE(exists); + + // Drop database + ASSERT_OK(adm.DropDatabase(db_name, false, true)); + ASSERT_OK(adm.DatabaseExists(db_name, exists)); + ASSERT_FALSE(exists); +} + +TEST_F(AdminTest, PartitionApis) { + auto& adm = admin(); + + std::string db_name = "test_partition_apis_cpp_db"; + fluss::DatabaseDescriptor db_desc; + db_desc.comment = "Database for test_partition_apis"; + ASSERT_OK(adm.CreateDatabase(db_name, db_desc, true)); + + fluss::TablePath table_path(db_name, "partitioned_table"); + + auto schema = fluss::Schema::NewBuilder() + .AddColumn("id", fluss::DataType::Int()) + .AddColumn("name", fluss::DataType::String()) + .AddColumn("dt", fluss::DataType::String()) + .AddColumn("region", fluss::DataType::String()) + 
.SetPrimaryKeys({"id", "dt", "region"}) + .Build(); + + auto table_descriptor = fluss::TableDescriptor::NewBuilder() + .SetSchema(schema) + .SetBucketCount(3) + .SetBucketKeys({"id"}) + .SetPartitionKeys({"dt", "region"}) + .SetProperty("table.replication.factor", "1") + .SetLogFormat("arrow") + .SetKvFormat("compacted") + .Build(); + + ASSERT_OK(adm.CreateTable(table_path, table_descriptor, true)); + + // No partitions initially + std::vector partitions; + ASSERT_OK(adm.ListPartitionInfos(table_path, partitions)); + ASSERT_TRUE(partitions.empty()); + + // Create a partition + std::unordered_map partition_spec = { + {"dt", "2024-01-15"}, {"region", "EMEA"}}; + ASSERT_OK(adm.CreatePartition(table_path, partition_spec, false)); + + // Should have one partition + ASSERT_OK(adm.ListPartitionInfos(table_path, partitions)); + ASSERT_EQ(partitions.size(), 1u); + EXPECT_EQ(partitions[0].partition_name, "2024-01-15$EMEA"); + + // List with partial spec filter - should find the partition + std::unordered_map partial_spec = {{"dt", "2024-01-15"}}; + std::vector partitions_with_spec; + ASSERT_OK(adm.ListPartitionInfos(table_path, partial_spec, partitions_with_spec)); + ASSERT_EQ(partitions_with_spec.size(), 1u); + EXPECT_EQ(partitions_with_spec[0].partition_name, "2024-01-15$EMEA"); + + // List with non-matching spec - should find no partitions + std::unordered_map non_matching_spec = {{"dt", "2024-01-16"}}; + std::vector empty_partitions; + ASSERT_OK(adm.ListPartitionInfos(table_path, non_matching_spec, empty_partitions)); + ASSERT_TRUE(empty_partitions.empty()); + + // Drop partition + ASSERT_OK(adm.DropPartition(table_path, partition_spec, false)); + + ASSERT_OK(adm.ListPartitionInfos(table_path, partitions)); + ASSERT_TRUE(partitions.empty()); + + // Cleanup + ASSERT_OK(adm.DropTable(table_path, true)); + ASSERT_OK(adm.DropDatabase(db_name, true, true)); +} + +TEST_F(AdminTest, FlussErrorResponse) { + auto& adm = admin(); + + fluss::TablePath table_path("fluss", 
"not_exist_cpp"); + + fluss::TableInfo info; + auto result = adm.GetTableInfo(table_path, info); + ASSERT_FALSE(result.Ok()); + EXPECT_EQ(result.error_code, fluss::ErrorCode::TABLE_NOT_EXIST); +} + +TEST_F(AdminTest, ErrorDatabaseNotExist) { + auto& adm = admin(); + + // get_database_info for non-existent database + fluss::DatabaseInfo info; + auto result = adm.GetDatabaseInfo("no_such_db_cpp", info); + ASSERT_FALSE(result.Ok()); + EXPECT_EQ(result.error_code, fluss::ErrorCode::DATABASE_NOT_EXIST); + + // drop_database without ignore flag + result = adm.DropDatabase("no_such_db_cpp", false, false); + ASSERT_FALSE(result.Ok()); + EXPECT_EQ(result.error_code, fluss::ErrorCode::DATABASE_NOT_EXIST); + + // list_tables for non-existent database + std::vector tables; + result = adm.ListTables("no_such_db_cpp", tables); + ASSERT_FALSE(result.Ok()); + EXPECT_EQ(result.error_code, fluss::ErrorCode::DATABASE_NOT_EXIST); +} + +TEST_F(AdminTest, ErrorDatabaseAlreadyExist) { + auto& adm = admin(); + + std::string db_name = "test_error_db_already_exist_cpp"; + fluss::DatabaseDescriptor descriptor; + + ASSERT_OK(adm.CreateDatabase(db_name, descriptor, false)); + + // Create same database again without ignore flag + auto result = adm.CreateDatabase(db_name, descriptor, false); + ASSERT_FALSE(result.Ok()); + EXPECT_EQ(result.error_code, fluss::ErrorCode::DATABASE_ALREADY_EXIST); + + // With ignore flag should succeed + ASSERT_OK(adm.CreateDatabase(db_name, descriptor, true)); + + // Cleanup + ASSERT_OK(adm.DropDatabase(db_name, true, true)); +} + +TEST_F(AdminTest, ErrorTableAlreadyExist) { + auto& adm = admin(); + + std::string db_name = "test_error_tbl_already_exist_cpp_db"; + fluss::DatabaseDescriptor db_desc; + ASSERT_OK(adm.CreateDatabase(db_name, db_desc, true)); + + fluss::TablePath table_path(db_name, "my_table"); + auto schema = fluss::Schema::NewBuilder() + .AddColumn("id", fluss::DataType::Int()) + .AddColumn("name", fluss::DataType::String()) + .Build(); + auto 
table_desc = fluss::TableDescriptor::NewBuilder() + .SetSchema(schema) + .SetBucketCount(1) + .SetProperty("table.replication.factor", "1") + .Build(); + + ASSERT_OK(adm.CreateTable(table_path, table_desc, false)); + + // Create same table again without ignore flag + auto result = adm.CreateTable(table_path, table_desc, false); + ASSERT_FALSE(result.Ok()); + EXPECT_EQ(result.error_code, fluss::ErrorCode::TABLE_ALREADY_EXIST); + + // With ignore flag should succeed + ASSERT_OK(adm.CreateTable(table_path, table_desc, true)); + + // Cleanup + ASSERT_OK(adm.DropTable(table_path, true)); + ASSERT_OK(adm.DropDatabase(db_name, true, true)); +} + +TEST_F(AdminTest, GetServerNodes) { + auto& adm = admin(); + + std::vector nodes; + ASSERT_OK(adm.GetServerNodes(nodes)); + + ASSERT_GT(nodes.size(), 0u) << "Expected at least one server node"; + + bool has_coordinator = false; + bool has_tablet = false; + for (const auto& node : nodes) { + EXPECT_FALSE(node.host.empty()) << "Server node host should not be empty"; + EXPECT_GT(node.port, 0u) << "Server node port should be > 0"; + EXPECT_FALSE(node.uid.empty()) << "Server node uid should not be empty"; + + if (node.server_type == "CoordinatorServer") { + has_coordinator = true; + } else if (node.server_type == "TabletServer") { + has_tablet = true; + } + } + EXPECT_TRUE(has_coordinator) << "Expected a coordinator server node"; + EXPECT_TRUE(has_tablet) << "Expected at least one tablet server node"; +} + +TEST_F(AdminTest, ErrorTableNotExist) { + auto& adm = admin(); + + fluss::TablePath table_path("fluss", "no_such_table_cpp"); + + // Drop without ignore flag + auto result = adm.DropTable(table_path, false); + ASSERT_FALSE(result.Ok()); + EXPECT_EQ(result.error_code, fluss::ErrorCode::TABLE_NOT_EXIST); + + // Drop with ignore flag should succeed + ASSERT_OK(adm.DropTable(table_path, true)); +} + +TEST_F(AdminTest, ErrorTableNotPartitioned) { + auto& adm = admin(); + + std::string db_name = "test_error_not_partitioned_cpp_db"; + 
fluss::DatabaseDescriptor db_desc; + ASSERT_OK(adm.CreateDatabase(db_name, db_desc, true)); + + fluss::TablePath table_path(db_name, "non_partitioned_table"); + auto schema = fluss::Schema::NewBuilder() + .AddColumn("id", fluss::DataType::Int()) + .AddColumn("name", fluss::DataType::String()) + .Build(); + auto table_desc = fluss::TableDescriptor::NewBuilder() + .SetSchema(schema) + .SetBucketCount(1) + .SetProperty("table.replication.factor", "1") + .Build(); + + ASSERT_OK(adm.CreateTable(table_path, table_desc, false)); + + // list_partition_infos on non-partitioned table + std::vector partitions; + auto result = adm.ListPartitionInfos(table_path, partitions); + ASSERT_FALSE(result.Ok()); + EXPECT_EQ(result.error_code, fluss::ErrorCode::TABLE_NOT_PARTITIONED_EXCEPTION); + + // Cleanup + ASSERT_OK(adm.DropTable(table_path, true)); + ASSERT_OK(adm.DropDatabase(db_name, true, true)); +} diff --git a/fluss-rust/bindings/cpp/test/test_ffi_converter.cpp b/fluss-rust/bindings/cpp/test/test_ffi_converter.cpp new file mode 100644 index 0000000000..2078bdabb4 --- /dev/null +++ b/fluss-rust/bindings/cpp/test/test_ffi_converter.cpp @@ -0,0 +1,240 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include +#include + +#include "ffi_converter.hpp" + +namespace { + +fluss::ffi::FfiColumn MakeArrayColumn(int32_t nesting, int32_t element_type, + bool nullable = true, bool leaf_nullable = true, + std::vector per_level_nullability = {}) { + fluss::ffi::FfiColumn col; + col.name = rust::String("bad_array"); + col.data_type = static_cast(fluss::TypeId::Array); + col.nullable = nullable; + col.comment = rust::String(""); + col.precision = 0; + col.scale = 0; + col.array_nesting = nesting; + if (!per_level_nullability.empty()) { + for (auto v : per_level_nullability) { + col.array_nullability.push_back(v); + } + } else { + for (int32_t i = 0; i < nesting; ++i) { + col.array_nullability.push_back((i == 0 ? nullable : true) ? 1 : 0); + } + col.array_nullability.push_back(leaf_nullable ? 1 : 0); + } + col.element_data_type = element_type; + col.element_precision = 0; + col.element_scale = 0; + return col; +} + +fluss::ffi::FfiColumn MakeScalarColumn(const char* name, fluss::TypeId type_id, + bool nullable = true, int32_t precision = 0, + int32_t scale = 0) { + fluss::ffi::FfiColumn col; + col.name = rust::String(name); + col.data_type = static_cast(type_id); + col.nullable = nullable; + col.comment = rust::String(""); + col.precision = precision; + col.scale = scale; + col.array_nesting = 0; + col.element_data_type = 0; + col.element_precision = 0; + col.element_scale = 0; + return col; +} + +} // namespace + +TEST(FfiConverterTest, RejectsArrayWithoutElementType) { + auto col = MakeArrayColumn(1, 0); + EXPECT_THROW((void)fluss::utils::from_ffi_column(col), std::runtime_error); +} + +TEST(FfiConverterTest, RejectsArrayWithArrayLeafType) { + auto col = MakeArrayColumn(2, static_cast(fluss::TypeId::Array)); + EXPECT_THROW((void)fluss::utils::from_ffi_column(col), std::runtime_error); +} + +TEST(FfiConverterTest, RejectsArrayWithUnknownLeafType) { + auto col = MakeArrayColumn(1, 999); + EXPECT_THROW((void)fluss::utils::from_ffi_column(col), std::runtime_error); +} 
+ +TEST(FfiConverterTest, SupportsLegacyOneLevelArrayMetadata) { + auto col = MakeArrayColumn(0, static_cast(fluss::TypeId::Int)); + auto converted = fluss::utils::from_ffi_column(col); + EXPECT_EQ(converted.data_type.id(), fluss::TypeId::Array); + ASSERT_NE(converted.data_type.element_type(), nullptr); + EXPECT_EQ(converted.data_type.element_type()->id(), fluss::TypeId::Int); +} + +// --- Nullability tests --- + +TEST(DataTypeTest, DefaultNullable) { + auto dt = fluss::DataType::Int(); + EXPECT_TRUE(dt.nullable()); +} + +TEST(DataTypeTest, NotNullMethod) { + auto dt = fluss::DataType::Int().NotNull(); + EXPECT_FALSE(dt.nullable()); + EXPECT_EQ(dt.id(), fluss::TypeId::Int); +} + +TEST(DataTypeTest, NotNullPreservesPrecisionScale) { + auto dt = fluss::DataType::Decimal(10, 2).NotNull(); + EXPECT_FALSE(dt.nullable()); + EXPECT_EQ(dt.precision(), 10); + EXPECT_EQ(dt.scale(), 2); +} + +TEST(DataTypeTest, ArrayElementNullability) { + auto dt = fluss::DataType::Array(fluss::DataType::Int().NotNull()); + EXPECT_TRUE(dt.nullable()); + ASSERT_NE(dt.element_type(), nullptr); + EXPECT_FALSE(dt.element_type()->nullable()); +} + +TEST(DataTypeTest, NotNullArrayNullableElement) { + auto dt = fluss::DataType::Array(fluss::DataType::Int()).NotNull(); + EXPECT_FALSE(dt.nullable()); + ASSERT_NE(dt.element_type(), nullptr); + EXPECT_TRUE(dt.element_type()->nullable()); +} + +TEST(DataTypeTest, NotNullArrayNotNullElement) { + auto dt = fluss::DataType::Array(fluss::DataType::Int().NotNull()).NotNull(); + EXPECT_FALSE(dt.nullable()); + ASSERT_NE(dt.element_type(), nullptr); + EXPECT_FALSE(dt.element_type()->nullable()); +} + +TEST(FfiConverterTest, ScalarNullableRoundTrip) { + fluss::Column col{"id", fluss::DataType::Int(), ""}; + auto ffi_col = fluss::utils::to_ffi_column(col); + EXPECT_TRUE(ffi_col.nullable); + auto back = fluss::utils::from_ffi_column(ffi_col); + EXPECT_TRUE(back.data_type.nullable()); +} + +TEST(FfiConverterTest, ScalarNotNullRoundTrip) { + fluss::Column col{"id", 
fluss::DataType::Int().NotNull(), ""}; + auto ffi_col = fluss::utils::to_ffi_column(col); + EXPECT_FALSE(ffi_col.nullable); + auto back = fluss::utils::from_ffi_column(ffi_col); + EXPECT_FALSE(back.data_type.nullable()); +} + +TEST(FfiConverterTest, ArrayNotNullElementRoundTrip) { + fluss::Column col{"tags", fluss::DataType::Array(fluss::DataType::String().NotNull()), ""}; + auto ffi_col = fluss::utils::to_ffi_column(col); + EXPECT_TRUE(ffi_col.nullable); + ASSERT_EQ(ffi_col.array_nullability.size(), 2u); + EXPECT_EQ(ffi_col.array_nullability[1], 0); + auto back = fluss::utils::from_ffi_column(ffi_col); + EXPECT_TRUE(back.data_type.nullable()); + ASSERT_NE(back.data_type.element_type(), nullptr); + EXPECT_FALSE(back.data_type.element_type()->nullable()); +} + +TEST(FfiConverterTest, NotNullArrayNullableElementRoundTrip) { + fluss::Column col{"ids", fluss::DataType::Array(fluss::DataType::Int()).NotNull(), ""}; + auto ffi_col = fluss::utils::to_ffi_column(col); + EXPECT_FALSE(ffi_col.nullable); + ASSERT_EQ(ffi_col.array_nullability.size(), 2u); + EXPECT_EQ(ffi_col.array_nullability[1], 1); + auto back = fluss::utils::from_ffi_column(ffi_col); + EXPECT_FALSE(back.data_type.nullable()); + ASSERT_NE(back.data_type.element_type(), nullptr); + EXPECT_TRUE(back.data_type.element_type()->nullable()); +} + +TEST(FfiConverterTest, NotNullArrayNotNullElementRoundTrip) { + fluss::Column col{ + "strict_ids", + fluss::DataType::Array(fluss::DataType::Int().NotNull()).NotNull(), + "", + }; + auto ffi_col = fluss::utils::to_ffi_column(col); + EXPECT_FALSE(ffi_col.nullable); + ASSERT_EQ(ffi_col.array_nullability.size(), 2u); + EXPECT_EQ(ffi_col.array_nullability[1], 0); + auto back = fluss::utils::from_ffi_column(ffi_col); + EXPECT_FALSE(back.data_type.nullable()); + ASSERT_NE(back.data_type.element_type(), nullptr); + EXPECT_FALSE(back.data_type.element_type()->nullable()); +} + +TEST(FfiConverterTest, NestedArrayIntermediateNullabilityRoundTrip) { + fluss::Column col{ + "nested", 
+ fluss::DataType::Array(fluss::DataType::Array(fluss::DataType::Int()).NotNull()), + "", + }; + auto ffi_col = fluss::utils::to_ffi_column(col); + auto back = fluss::utils::from_ffi_column(ffi_col); + + EXPECT_TRUE(back.data_type.nullable()); + ASSERT_NE(back.data_type.element_type(), nullptr); + EXPECT_FALSE(back.data_type.element_type()->nullable()); + ASSERT_NE(back.data_type.element_type()->element_type(), nullptr); + EXPECT_TRUE(back.data_type.element_type()->element_type()->nullable()); +} + +TEST(FfiConverterTest, NestedArrayAllLevelsNullabilityRoundTrip) { + fluss::Column col{ + "strict_nested", + fluss::DataType::Array( + fluss::DataType::Array(fluss::DataType::Int().NotNull()).NotNull()) + .NotNull(), + "", + }; + auto ffi_col = fluss::utils::to_ffi_column(col); + auto back = fluss::utils::from_ffi_column(ffi_col); + + EXPECT_FALSE(back.data_type.nullable()); + ASSERT_NE(back.data_type.element_type(), nullptr); + EXPECT_FALSE(back.data_type.element_type()->nullable()); + ASSERT_NE(back.data_type.element_type()->element_type(), nullptr); + EXPECT_FALSE(back.data_type.element_type()->element_type()->nullable()); +} + +TEST(FfiConverterTest, FfiColumnNonNullableScalarReconstructed) { + auto col = MakeScalarColumn("id", fluss::TypeId::Int, false); + auto converted = fluss::utils::from_ffi_column(col); + EXPECT_FALSE(converted.data_type.nullable()); + EXPECT_EQ(converted.data_type.id(), fluss::TypeId::Int); +} + +TEST(FfiConverterTest, FfiColumnNonNullableArrayReconstructed) { + auto col = MakeArrayColumn(1, static_cast(fluss::TypeId::String), false, false); + auto converted = fluss::utils::from_ffi_column(col); + EXPECT_FALSE(converted.data_type.nullable()); + ASSERT_NE(converted.data_type.element_type(), nullptr); + EXPECT_FALSE(converted.data_type.element_type()->nullable()); +} diff --git a/fluss-rust/bindings/cpp/test/test_kv_table.cpp b/fluss-rust/bindings/cpp/test/test_kv_table.cpp new file mode 100644 index 0000000000..5cc8f79d23 --- /dev/null +++ 
b/fluss-rust/bindings/cpp/test/test_kv_table.cpp @@ -0,0 +1,892 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include + +#include "test_utils.h" + +class KvTableTest : public ::testing::Test { + protected: + fluss::Admin& admin() { return fluss_test::FlussTestEnvironment::Instance()->GetAdmin(); } + + fluss::Connection& connection() { + return fluss_test::FlussTestEnvironment::Instance()->GetConnection(); + } +}; + +TEST_F(KvTableTest, UpsertDeleteAndLookup) { + auto& adm = admin(); + auto& conn = connection(); + + fluss::TablePath table_path("fluss", "test_upsert_and_lookup_cpp"); + + auto schema = fluss::Schema::NewBuilder() + .AddColumn("id", fluss::DataType::Int()) + .AddColumn("name", fluss::DataType::String()) + .AddColumn("age", fluss::DataType::BigInt()) + .SetPrimaryKeys({"id"}) + .Build(); + + auto table_descriptor = fluss::TableDescriptor::NewBuilder() + .SetSchema(schema) + .SetProperty("table.replication.factor", "1") + .Build(); + + fluss_test::CreateTable(adm, table_path, table_descriptor); + + fluss::Table table; + ASSERT_OK(conn.GetTable(table_path, table)); + + // Create upsert writer + auto table_upsert = table.NewUpsert(); + fluss::UpsertWriter upsert_writer; + 
ASSERT_OK(table_upsert.CreateWriter(upsert_writer)); + + // Upsert 3 rows (fire-and-forget, then flush) + struct TestData { + int32_t id; + std::string name; + int64_t age; + }; + std::vector test_data = {{1, "Verso", 32}, {2, "Noco", 25}, {3, "Esquie", 35}}; + + for (const auto& d : test_data) { + fluss::GenericRow row(3); + row.SetInt32(0, d.id); + row.SetString(1, d.name); + row.SetInt64(2, d.age); + ASSERT_OK(upsert_writer.Upsert(row)); + } + ASSERT_OK(upsert_writer.Flush()); + + // Create lookuper + fluss::Lookuper lookuper; + ASSERT_OK(table.NewLookup().CreateLookuper(lookuper)); + + // Verify lookup results + for (const auto& d : test_data) { + fluss::GenericRow key(3); + key.SetInt32(0, d.id); + + fluss::LookupResult result; + ASSERT_OK(lookuper.Lookup(key, result)); + ASSERT_TRUE(result.Found()) << "Row with id=" << d.id << " should exist"; + + EXPECT_EQ(result.GetInt32(0), d.id) << "id mismatch"; + EXPECT_EQ(result.GetString(1), d.name) << "name mismatch"; + EXPECT_EQ(result.GetInt64(2), d.age) << "age mismatch"; + } + + // Update record with id=1 (await acknowledgment) + { + fluss::GenericRow updated_row(3); + updated_row.SetInt32(0, 1); + updated_row.SetString(1, "Verso"); + updated_row.SetInt64(2, 33); + fluss::WriteResult wr; + ASSERT_OK(upsert_writer.Upsert(updated_row, wr)); + ASSERT_OK(wr.Wait()); + } + + // Verify the update + { + fluss::GenericRow key(3); + key.SetInt32(0, 1); + fluss::LookupResult result; + ASSERT_OK(lookuper.Lookup(key, result)); + ASSERT_TRUE(result.Found()); + EXPECT_EQ(result.GetInt64(2), 33) << "Age should be updated"; + EXPECT_EQ(result.GetString(1), "Verso") << "Name should remain unchanged"; + } + + // Delete record with id=1 (await acknowledgment) + { + fluss::GenericRow delete_row(3); + delete_row.SetInt32(0, 1); + fluss::WriteResult wr; + ASSERT_OK(upsert_writer.Delete(delete_row, wr)); + ASSERT_OK(wr.Wait()); + } + + // Verify deletion + { + fluss::GenericRow key(3); + key.SetInt32(0, 1); + fluss::LookupResult 
result; + ASSERT_OK(lookuper.Lookup(key, result)); + ASSERT_FALSE(result.Found()) << "Record 1 should not exist after delete"; + } + + // Verify other records still exist + for (int id : {2, 3}) { + fluss::GenericRow key(3); + key.SetInt32(0, id); + fluss::LookupResult result; + ASSERT_OK(lookuper.Lookup(key, result)); + ASSERT_TRUE(result.Found()) << "Record " << id + << " should still exist after deleting record 1"; + } + + // Lookup non-existent key + { + fluss::GenericRow key(3); + key.SetInt32(0, 999); + fluss::LookupResult result; + ASSERT_OK(lookuper.Lookup(key, result)); + ASSERT_FALSE(result.Found()) << "Non-existent key should return not found"; + } + + ASSERT_OK(adm.DropTable(table_path, false)); +} + +TEST_F(KvTableTest, LookupWithNestedArrayArrayView) { + auto& adm = admin(); + auto& conn = connection(); + + fluss::TablePath table_path("fluss", "test_lookup_nested_array_cpp"); + + auto schema = fluss::Schema::NewBuilder() + .AddColumn("id", fluss::DataType::Int()) + .AddColumn("matrix", + fluss::DataType::Array(fluss::DataType::Array(fluss::DataType::Int()))) + .SetPrimaryKeys({"id"}) + .Build(); + + auto table_descriptor = fluss::TableDescriptor::NewBuilder() + .SetSchema(schema) + .SetProperty("table.replication.factor", "1") + .Build(); + + fluss_test::CreateTable(adm, table_path, table_descriptor); + + fluss::Table table; + ASSERT_OK(conn.GetTable(table_path, table)); + + auto upsert = table.NewUpsert(); + fluss::UpsertWriter writer; + ASSERT_OK(upsert.CreateWriter(writer)); + + { + auto row = table.NewRow(); + row.Set("id", 1); + + fluss::ArrayWriter inner1(2, fluss::DataType::Int()); + inner1.SetInt32(0, 11); + inner1.SetInt32(1, 12); + + fluss::ArrayWriter inner2(2, fluss::DataType::Int()); + inner2.SetInt32(0, 21); + inner2.SetInt32(1, 22); + + fluss::ArrayWriter outer(2, fluss::DataType::Array(fluss::DataType::Int())); + outer.SetArray(0, std::move(inner1)); + outer.SetArray(1, std::move(inner2)); + row.Set("matrix", std::move(outer)); + + 
ASSERT_OK(writer.Upsert(row)); + ASSERT_OK(writer.Flush()); + } + + fluss::Lookuper lookuper; + ASSERT_OK(table.NewLookup().CreateLookuper(lookuper)); + + auto key = table.NewRow(); + key.Set("id", 1); + + fluss::LookupResult result; + ASSERT_OK(lookuper.Lookup(key, result)); + ASSERT_TRUE(result.Found()); + EXPECT_EQ(result.GetArraySize("matrix"), 2u); + EXPECT_EQ(result.GetArrayElementType("matrix"), fluss::TypeId::Array); + + auto outer = result.GetArrayView("matrix"); + ASSERT_EQ(outer.Size(), 2u); + EXPECT_EQ(outer.ElementType(), fluss::TypeId::Array); + + auto first = outer.GetArray(0); + ASSERT_EQ(first.Size(), 2u); + EXPECT_EQ(first.ElementType(), fluss::TypeId::Int); + EXPECT_EQ(first.GetInt32(0), 11); + EXPECT_EQ(first.GetInt32(1), 12); + + auto second = outer.GetArray(1); + ASSERT_EQ(second.Size(), 2u); + EXPECT_EQ(second.ElementType(), fluss::TypeId::Int); + EXPECT_EQ(second.GetInt32(0), 21); + EXPECT_EQ(second.GetInt32(1), 22); + + ASSERT_OK(adm.DropTable(table_path, false)); +} + +TEST_F(KvTableTest, LookupArrayValidationErrors) { + auto& adm = admin(); + auto& conn = connection(); + + fluss::TablePath table_path("fluss", "test_lookup_array_validation_errors_cpp"); + + auto schema = fluss::Schema::NewBuilder() + .AddColumn("id", fluss::DataType::Int()) + .AddColumn("vals", fluss::DataType::Array(fluss::DataType::Int())) + .SetPrimaryKeys({"id"}) + .Build(); + auto table_descriptor = fluss::TableDescriptor::NewBuilder() + .SetSchema(schema) + .SetProperty("table.replication.factor", "1") + .Build(); + fluss_test::CreateTable(adm, table_path, table_descriptor); + + fluss::Table table; + ASSERT_OK(conn.GetTable(table_path, table)); + auto upsert = table.NewUpsert(); + fluss::UpsertWriter writer; + ASSERT_OK(upsert.CreateWriter(writer)); + + auto row = table.NewRow(); + row.Set("id", 1); + fluss::ArrayWriter vals(2, fluss::DataType::Int()); + vals.SetInt32(0, 99); + vals.SetNull(1); + row.Set("vals", std::move(vals)); + ASSERT_OK(writer.Upsert(row)); + 
ASSERT_OK(writer.Flush()); + + fluss::Lookuper lookuper; + ASSERT_OK(table.NewLookup().CreateLookuper(lookuper)); + + auto key = table.NewRow(); + key.Set("id", 1); + fluss::LookupResult result; + ASSERT_OK(lookuper.Lookup(key, result)); + ASSERT_TRUE(result.Found()); + + bool wrong_type_threw = false; + try { + (void)result.GetArrayInt64("vals", 0); + } catch (const std::exception&) { + wrong_type_threw = true; + } + EXPECT_TRUE(wrong_type_threw); + + bool null_typed_getter_threw = false; + try { + (void)result.GetArrayInt32("vals", 1); + } catch (const std::exception&) { + null_typed_getter_threw = true; + } + EXPECT_TRUE(null_typed_getter_threw); + + auto view = result.GetArrayView("vals"); + EXPECT_EQ(view.Size(), 2u); + EXPECT_TRUE(view.IsNull(1)); + + bool view_wrong_type_threw = false; + try { + (void)view.GetInt64(0); + } catch (const std::exception&) { + view_wrong_type_threw = true; + } + EXPECT_TRUE(view_wrong_type_threw); + + bool view_null_typed_getter_threw = false; + try { + (void)view.GetInt32(1); + } catch (const std::exception&) { + view_null_typed_getter_threw = true; + } + EXPECT_TRUE(view_null_typed_getter_threw); + + ASSERT_OK(adm.DropTable(table_path, false)); +} + +TEST_F(KvTableTest, CompositePrimaryKeys) { + auto& adm = admin(); + auto& conn = connection(); + + fluss::TablePath table_path("fluss", "test_composite_pk_cpp"); + + auto schema = fluss::Schema::NewBuilder() + .AddColumn("region", fluss::DataType::String()) + .AddColumn("score", fluss::DataType::BigInt()) + .AddColumn("user_id", fluss::DataType::Int()) + .SetPrimaryKeys({"region", "user_id"}) + .Build(); + + auto table_descriptor = fluss::TableDescriptor::NewBuilder() + .SetSchema(schema) + .SetProperty("table.replication.factor", "1") + .Build(); + + fluss_test::CreateTable(adm, table_path, table_descriptor); + + fluss::Table table; + ASSERT_OK(conn.GetTable(table_path, table)); + + auto table_upsert = table.NewUpsert(); + fluss::UpsertWriter upsert_writer; + 
ASSERT_OK(table_upsert.CreateWriter(upsert_writer)); + + // Insert records with composite keys + struct TestData { + std::string region; + int32_t user_id; + int64_t score; + }; + std::vector test_data = { + {"US", 1, 100}, {"US", 2, 200}, {"EU", 1, 150}, {"EU", 2, 250}}; + + for (const auto& d : test_data) { + auto row = table.NewRow(); + row.Set("region", d.region); + row.Set("score", d.score); + row.Set("user_id", d.user_id); + ASSERT_OK(upsert_writer.Upsert(row)); + } + ASSERT_OK(upsert_writer.Flush()); + + // Create lookuper + fluss::Lookuper lookuper; + ASSERT_OK(table.NewLookup().CreateLookuper(lookuper)); + + // Lookup (US, 1) - should return score 100 + { + auto key = table.NewRow(); + key.Set("region", "US"); + key.Set("user_id", 1); + fluss::LookupResult result; + ASSERT_OK(lookuper.Lookup(key, result)); + ASSERT_TRUE(result.Found()); + EXPECT_EQ(result.GetInt64("score"), 100) << "Score for (US, 1) should be 100"; + } + + // Lookup (EU, 2) - should return score 250 + { + auto key = table.NewRow(); + key.Set("region", "EU"); + key.Set("user_id", 2); + fluss::LookupResult result; + ASSERT_OK(lookuper.Lookup(key, result)); + ASSERT_TRUE(result.Found()); + EXPECT_EQ(result.GetInt64("score"), 250) << "Score for (EU, 2) should be 250"; + } + + // Update (US, 1) score (await acknowledgment) + { + auto update_row = table.NewRow(); + update_row.Set("region", "US"); + update_row.Set("user_id", 1); + update_row.Set("score", static_cast(500)); + fluss::WriteResult wr; + ASSERT_OK(upsert_writer.Upsert(update_row, wr)); + ASSERT_OK(wr.Wait()); + } + + // Verify update + { + auto key = table.NewRow(); + key.Set("region", "US"); + key.Set("user_id", 1); + fluss::LookupResult result; + ASSERT_OK(lookuper.Lookup(key, result)); + ASSERT_TRUE(result.Found()); + EXPECT_EQ(result.GetInt64("score"), 500) << "Row score should be updated"; + } + + ASSERT_OK(adm.DropTable(table_path, false)); +} + +TEST_F(KvTableTest, PartialUpdate) { + auto& adm = admin(); + auto& conn = 
connection(); + + fluss::TablePath table_path("fluss", "test_partial_update_cpp"); + + auto schema = fluss::Schema::NewBuilder() + .AddColumn("id", fluss::DataType::Int()) + .AddColumn("name", fluss::DataType::String()) + .AddColumn("age", fluss::DataType::BigInt()) + .AddColumn("score", fluss::DataType::BigInt()) + .SetPrimaryKeys({"id"}) + .Build(); + + auto table_descriptor = fluss::TableDescriptor::NewBuilder() + .SetSchema(schema) + .SetProperty("table.replication.factor", "1") + .Build(); + + fluss_test::CreateTable(adm, table_path, table_descriptor); + + fluss::Table table; + ASSERT_OK(conn.GetTable(table_path, table)); + + // Insert initial record with all columns + auto table_upsert = table.NewUpsert(); + fluss::UpsertWriter upsert_writer; + ASSERT_OK(table_upsert.CreateWriter(upsert_writer)); + + { + fluss::GenericRow row(4); + row.SetInt32(0, 1); + row.SetString(1, "Verso"); + row.SetInt64(2, 32); + row.SetInt64(3, 6942); + fluss::WriteResult wr; + ASSERT_OK(upsert_writer.Upsert(row, wr)); + ASSERT_OK(wr.Wait()); + } + + // Verify initial record + fluss::Lookuper lookuper; + ASSERT_OK(table.NewLookup().CreateLookuper(lookuper)); + + { + fluss::GenericRow key(4); + key.SetInt32(0, 1); + fluss::LookupResult result; + ASSERT_OK(lookuper.Lookup(key, result)); + ASSERT_TRUE(result.Found()); + EXPECT_EQ(result.GetInt32(0), 1); + EXPECT_EQ(result.GetString(1), "Verso"); + EXPECT_EQ(result.GetInt64(2), 32); + EXPECT_EQ(result.GetInt64(3), 6942); + } + + // Create partial update writer to update only score column + auto partial_upsert = table.NewUpsert(); + partial_upsert.PartialUpdateByName({"id", "score"}); + fluss::UpsertWriter partial_writer; + ASSERT_OK(partial_upsert.CreateWriter(partial_writer)); + + // Update only the score column (await acknowledgment) + { + fluss::GenericRow partial_row(4); + partial_row.SetInt32(0, 1); + partial_row.SetNull(1); // not in partial update + partial_row.SetNull(2); // not in partial update + partial_row.SetInt64(3, 420); + 
fluss::WriteResult wr; + ASSERT_OK(partial_writer.Upsert(partial_row, wr)); + ASSERT_OK(wr.Wait()); + } + + // Verify partial update - name and age should remain unchanged + { + fluss::GenericRow key(4); + key.SetInt32(0, 1); + fluss::LookupResult result; + ASSERT_OK(lookuper.Lookup(key, result)); + ASSERT_TRUE(result.Found()); + EXPECT_EQ(result.GetInt32(0), 1) << "id should remain 1"; + EXPECT_EQ(result.GetString(1), "Verso") << "name should remain unchanged"; + EXPECT_EQ(result.GetInt64(2), 32) << "age should remain unchanged"; + EXPECT_EQ(result.GetInt64(3), 420) << "score should be updated to 420"; + } + + ASSERT_OK(adm.DropTable(table_path, false)); +} + +TEST_F(KvTableTest, PartialUpdateByIndex) { + auto& adm = admin(); + auto& conn = connection(); + + fluss::TablePath table_path("fluss", "test_partial_update_by_index_cpp"); + + auto schema = fluss::Schema::NewBuilder() + .AddColumn("id", fluss::DataType::Int()) + .AddColumn("name", fluss::DataType::String()) + .AddColumn("age", fluss::DataType::BigInt()) + .AddColumn("score", fluss::DataType::BigInt()) + .SetPrimaryKeys({"id"}) + .Build(); + + auto table_descriptor = fluss::TableDescriptor::NewBuilder() + .SetSchema(schema) + .SetProperty("table.replication.factor", "1") + .Build(); + + fluss_test::CreateTable(adm, table_path, table_descriptor); + + fluss::Table table; + ASSERT_OK(conn.GetTable(table_path, table)); + + // Insert initial record with all columns + auto table_upsert = table.NewUpsert(); + fluss::UpsertWriter upsert_writer; + ASSERT_OK(table_upsert.CreateWriter(upsert_writer)); + + { + fluss::GenericRow row(4); + row.SetInt32(0, 1); + row.SetString(1, "Verso"); + row.SetInt64(2, 32); + row.SetInt64(3, 6942); + fluss::WriteResult wr; + ASSERT_OK(upsert_writer.Upsert(row, wr)); + ASSERT_OK(wr.Wait()); + } + + // Verify initial record + fluss::Lookuper lookuper; + ASSERT_OK(table.NewLookup().CreateLookuper(lookuper)); + + { + fluss::GenericRow key(4); + key.SetInt32(0, 1); + fluss::LookupResult 
result; + ASSERT_OK(lookuper.Lookup(key, result)); + ASSERT_TRUE(result.Found()); + EXPECT_EQ(result.GetInt32(0), 1); + EXPECT_EQ(result.GetString(1), "Verso"); + EXPECT_EQ(result.GetInt64(2), 32); + EXPECT_EQ(result.GetInt64(3), 6942); + } + + // Create partial update writer using column indices: 0 (id) and 3 (score) + auto partial_upsert = table.NewUpsert(); + partial_upsert.PartialUpdateByIndex({0, 3}); + fluss::UpsertWriter partial_writer; + ASSERT_OK(partial_upsert.CreateWriter(partial_writer)); + + // Update only the score column (await acknowledgment) + { + fluss::GenericRow partial_row(4); + partial_row.SetInt32(0, 1); + partial_row.SetNull(1); // not in partial update + partial_row.SetNull(2); // not in partial update + partial_row.SetInt64(3, 420); + fluss::WriteResult wr; + ASSERT_OK(partial_writer.Upsert(partial_row, wr)); + ASSERT_OK(wr.Wait()); + } + + // Verify partial update - name and age should remain unchanged + { + fluss::GenericRow key(4); + key.SetInt32(0, 1); + fluss::LookupResult result; + ASSERT_OK(lookuper.Lookup(key, result)); + ASSERT_TRUE(result.Found()); + EXPECT_EQ(result.GetInt32(0), 1) << "id should remain 1"; + EXPECT_EQ(result.GetString(1), "Verso") << "name should remain unchanged"; + EXPECT_EQ(result.GetInt64(2), 32) << "age should remain unchanged"; + EXPECT_EQ(result.GetInt64(3), 420) << "score should be updated to 420"; + } + + ASSERT_OK(adm.DropTable(table_path, false)); +} + +TEST_F(KvTableTest, PartitionedTableUpsertAndLookup) { + auto& adm = admin(); + auto& conn = connection(); + + fluss::TablePath table_path("fluss", "test_partitioned_kv_table_cpp"); + + // Create a partitioned KV table with region as partition key + auto schema = fluss::Schema::NewBuilder() + .AddColumn("region", fluss::DataType::String()) + .AddColumn("user_id", fluss::DataType::Int()) + .AddColumn("name", fluss::DataType::String()) + .AddColumn("score", fluss::DataType::BigInt()) + .SetPrimaryKeys({"region", "user_id"}) + .Build(); + + auto 
table_descriptor = fluss::TableDescriptor::NewBuilder() + .SetSchema(schema) + .SetPartitionKeys({"region"}) + .SetProperty("table.replication.factor", "1") + .Build(); + + fluss_test::CreateTable(adm, table_path, table_descriptor); + + // Create partitions + fluss_test::CreatePartitions(adm, table_path, "region", {"US", "EU", "APAC"}); + + fluss::Table table; + ASSERT_OK(conn.GetTable(table_path, table)); + + auto table_upsert = table.NewUpsert(); + fluss::UpsertWriter upsert_writer; + ASSERT_OK(table_upsert.CreateWriter(upsert_writer)); + + // Insert records with different partitions + struct TestData { + std::string region; + int32_t user_id; + std::string name; + int64_t score; + }; + std::vector test_data = {{"US", 1, "Gustave", 100}, {"US", 2, "Lune", 200}, + {"EU", 1, "Sciel", 150}, {"EU", 2, "Maelle", 250}, + {"APAC", 1, "Noco", 300}}; + + for (const auto& d : test_data) { + fluss::GenericRow row(4); + row.SetString(0, d.region); + row.SetInt32(1, d.user_id); + row.SetString(2, d.name); + row.SetInt64(3, d.score); + ASSERT_OK(upsert_writer.Upsert(row)); + } + ASSERT_OK(upsert_writer.Flush()); + + // Create lookuper + fluss::Lookuper lookuper; + ASSERT_OK(table.NewLookup().CreateLookuper(lookuper)); + + // Lookup records + for (const auto& d : test_data) { + fluss::GenericRow key(4); + key.SetString(0, d.region); + key.SetInt32(1, d.user_id); + + fluss::LookupResult result; + ASSERT_OK(lookuper.Lookup(key, result)); + ASSERT_TRUE(result.Found()); + + EXPECT_EQ(std::string(result.GetString(0)), d.region) << "region mismatch"; + EXPECT_EQ(result.GetInt32(1), d.user_id) << "user_id mismatch"; + EXPECT_EQ(std::string(result.GetString(2)), d.name) << "name mismatch"; + EXPECT_EQ(result.GetInt64(3), d.score) << "score mismatch"; + } + + // Update within a partition (await acknowledgment) + { + fluss::GenericRow updated_row(4); + updated_row.SetString(0, "US"); + updated_row.SetInt32(1, 1); + updated_row.SetString(2, "Gustave Updated"); + updated_row.SetInt64(3, 
999); + fluss::WriteResult wr; + ASSERT_OK(upsert_writer.Upsert(updated_row, wr)); + ASSERT_OK(wr.Wait()); + } + + // Verify the update + { + fluss::GenericRow key(4); + key.SetString(0, "US"); + key.SetInt32(1, 1); + fluss::LookupResult result; + ASSERT_OK(lookuper.Lookup(key, result)); + ASSERT_TRUE(result.Found()); + EXPECT_EQ(std::string(result.GetString(2)), "Gustave Updated"); + EXPECT_EQ(result.GetInt64(3), 999); + } + + // Lookup in non-existent partition should return not found + { + fluss::GenericRow key(4); + key.SetString(0, "UNKNOWN_REGION"); + key.SetInt32(1, 1); + fluss::LookupResult result; + ASSERT_OK(lookuper.Lookup(key, result)); + ASSERT_FALSE(result.Found()) << "Lookup in non-existent partition should return not found"; + } + + // Delete a record within a partition (await acknowledgment) + { + fluss::GenericRow delete_key(4); + delete_key.SetString(0, "EU"); + delete_key.SetInt32(1, 1); + fluss::WriteResult wr; + ASSERT_OK(upsert_writer.Delete(delete_key, wr)); + ASSERT_OK(wr.Wait()); + } + + // Verify deletion + { + fluss::GenericRow key(4); + key.SetString(0, "EU"); + key.SetInt32(1, 1); + fluss::LookupResult result; + ASSERT_OK(lookuper.Lookup(key, result)); + ASSERT_FALSE(result.Found()) << "Deleted record should not exist"; + } + + // Verify other records in same partition still exist + { + fluss::GenericRow key(4); + key.SetString(0, "EU"); + key.SetInt32(1, 2); + fluss::LookupResult result; + ASSERT_OK(lookuper.Lookup(key, result)); + ASSERT_TRUE(result.Found()); + EXPECT_EQ(std::string(result.GetString(2)), "Maelle"); + } + + ASSERT_OK(adm.DropTable(table_path, false)); +} + +TEST_F(KvTableTest, AllSupportedDatatypes) { + auto& adm = admin(); + auto& conn = connection(); + + fluss::TablePath table_path("fluss", "test_all_datatypes_cpp"); + + // Create a table with all supported datatypes + auto schema = fluss::Schema::NewBuilder() + .AddColumn("pk_int", fluss::DataType::Int()) + .AddColumn("col_boolean", fluss::DataType::Boolean()) + 
.AddColumn("col_tinyint", fluss::DataType::TinyInt()) + .AddColumn("col_smallint", fluss::DataType::SmallInt()) + .AddColumn("col_int", fluss::DataType::Int()) + .AddColumn("col_bigint", fluss::DataType::BigInt()) + .AddColumn("col_float", fluss::DataType::Float()) + .AddColumn("col_double", fluss::DataType::Double()) + .AddColumn("col_char", fluss::DataType::Char(10)) + .AddColumn("col_string", fluss::DataType::String()) + .AddColumn("col_decimal", fluss::DataType::Decimal(10, 2)) + .AddColumn("col_date", fluss::DataType::Date()) + .AddColumn("col_time", fluss::DataType::Time()) + .AddColumn("col_timestamp", fluss::DataType::Timestamp()) + .AddColumn("col_timestamp_ltz", fluss::DataType::TimestampLtz()) + .AddColumn("col_bytes", fluss::DataType::Bytes()) + .AddColumn("col_binary", fluss::DataType::Binary(20)) + .SetPrimaryKeys({"pk_int"}) + .Build(); + + auto table_descriptor = fluss::TableDescriptor::NewBuilder() + .SetSchema(schema) + .SetProperty("table.replication.factor", "1") + .Build(); + + fluss_test::CreateTable(adm, table_path, table_descriptor); + + fluss::Table table; + ASSERT_OK(conn.GetTable(table_path, table)); + + auto table_upsert = table.NewUpsert(); + fluss::UpsertWriter upsert_writer; + ASSERT_OK(table_upsert.CreateWriter(upsert_writer)); + + // Test data + int32_t pk_int = 1; + bool col_boolean = true; + int32_t col_tinyint = 127; + int32_t col_smallint = 32767; + int32_t col_int = 2147483647; + int64_t col_bigint = 9223372036854775807LL; + float col_float = 3.14f; + double col_double = 2.718281828459045; + std::string col_char = "hello"; + std::string col_string = "world of fluss rust client"; + std::string col_decimal = "123.45"; + auto col_date = fluss::Date::FromDays(20476); // 2026-01-23 + auto col_time = fluss::Time::FromMillis(36827000); // 10:13:47 + auto col_timestamp = fluss::Timestamp::FromMillis(1769163227123); // 2026-01-23 10:13:47.123 + auto col_timestamp_ltz = fluss::Timestamp::FromMillis(1769163227123); + std::vector col_bytes 
= {'b', 'i', 'n', 'a', 'r', 'y', ' ', 'd', 'a', 't', 'a'}; + std::vector col_binary = {'f', 'i', 'x', 'e', 'd', ' ', 'b', 'i', 'n', 'a', + 'r', 'y', ' ', 'd', 'a', 't', 'a', '!', '!', '!'}; + + // Upsert a row with all datatypes + { + fluss::GenericRow row(17); + row.SetInt32(0, pk_int); + row.SetBool(1, col_boolean); + row.SetInt32(2, col_tinyint); + row.SetInt32(3, col_smallint); + row.SetInt32(4, col_int); + row.SetInt64(5, col_bigint); + row.SetFloat32(6, col_float); + row.SetFloat64(7, col_double); + row.SetString(8, col_char); + row.SetString(9, col_string); + row.SetDecimal(10, col_decimal); + row.SetDate(11, col_date); + row.SetTime(12, col_time); + row.SetTimestampNtz(13, col_timestamp); + row.SetTimestampLtz(14, col_timestamp_ltz); + row.SetBytes(15, col_bytes); + row.SetBytes(16, col_binary); + fluss::WriteResult wr; + ASSERT_OK(upsert_writer.Upsert(row, wr)); + ASSERT_OK(wr.Wait()); + } + + // Lookup the record + fluss::Lookuper lookuper; + ASSERT_OK(table.NewLookup().CreateLookuper(lookuper)); + + { + fluss::GenericRow key(17); + key.SetInt32(0, pk_int); + + fluss::LookupResult result; + ASSERT_OK(lookuper.Lookup(key, result)); + ASSERT_TRUE(result.Found()); + + // Verify all datatypes + EXPECT_EQ(result.GetInt32(0), pk_int) << "pk_int mismatch"; + EXPECT_EQ(result.GetBool(1), col_boolean) << "col_boolean mismatch"; + EXPECT_EQ(result.GetInt32(2), col_tinyint) << "col_tinyint mismatch"; + EXPECT_EQ(result.GetInt32(3), col_smallint) << "col_smallint mismatch"; + EXPECT_EQ(result.GetInt32(4), col_int) << "col_int mismatch"; + EXPECT_EQ(result.GetInt64(5), col_bigint) << "col_bigint mismatch"; + EXPECT_NEAR(result.GetFloat32(6), col_float, 1e-6f) << "col_float mismatch"; + EXPECT_NEAR(result.GetFloat64(7), col_double, 1e-15) << "col_double mismatch"; + EXPECT_EQ(result.GetString(8), col_char) << "col_char mismatch"; + EXPECT_EQ(result.GetString(9), col_string) << "col_string mismatch"; + EXPECT_EQ(result.GetDecimalString(10), col_decimal) << "col_decimal 
mismatch"; + EXPECT_EQ(result.GetDate(11).days_since_epoch, col_date.days_since_epoch) << "col_date mismatch"; + EXPECT_EQ(result.GetTime(12).millis_since_midnight, col_time.millis_since_midnight) << "col_time mismatch"; + EXPECT_EQ(result.GetTimestamp(13).epoch_millis, col_timestamp.epoch_millis) + << "col_timestamp mismatch"; + EXPECT_EQ(result.GetTimestamp(14).epoch_millis, col_timestamp_ltz.epoch_millis) + << "col_timestamp_ltz mismatch"; + + auto [bytes_ptr, bytes_len] = result.GetBytes(15); + EXPECT_EQ(bytes_len, col_bytes.size()) << "col_bytes length mismatch"; + EXPECT_TRUE(std::memcmp(bytes_ptr, col_bytes.data(), bytes_len) == 0) + << "col_bytes mismatch"; + + auto [binary_ptr, binary_len] = result.GetBytes(16); + EXPECT_EQ(binary_len, col_binary.size()) << "col_binary length mismatch"; + EXPECT_TRUE(std::memcmp(binary_ptr, col_binary.data(), binary_len) == 0) + << "col_binary mismatch"; + } + + // Test with null values for nullable columns + { + fluss::GenericRow row_with_nulls(17); + row_with_nulls.SetInt32(0, 2); // pk_int = 2 + for (size_t i = 1; i < 17; ++i) { + row_with_nulls.SetNull(i); + } + fluss::WriteResult wr; + ASSERT_OK(upsert_writer.Upsert(row_with_nulls, wr)); + ASSERT_OK(wr.Wait()); + } + + // Lookup row with nulls + { + fluss::GenericRow key(17); + key.SetInt32(0, 2); + + fluss::LookupResult result; + ASSERT_OK(lookuper.Lookup(key, result)); + ASSERT_TRUE(result.Found()); + + EXPECT_EQ(result.GetInt32(0), 2) << "pk_int mismatch"; + for (size_t i = 1; i < 17; ++i) { + EXPECT_TRUE(result.IsNull(i)) << "column " << i << " should be null"; + } + } + + ASSERT_OK(adm.DropTable(table_path, false)); +} diff --git a/fluss-rust/bindings/cpp/test/test_log_table.cpp b/fluss-rust/bindings/cpp/test/test_log_table.cpp new file mode 100644 index 0000000000..5678e4bb10 --- /dev/null +++ b/fluss-rust/bindings/cpp/test/test_log_table.cpp @@ -0,0 +1,1523 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license 
agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "test_utils.h" + +class LogTableTest : public ::testing::Test { + protected: + fluss::Admin& admin() { return fluss_test::FlussTestEnvironment::Instance()->GetAdmin(); } + + fluss::Connection& connection() { + return fluss_test::FlussTestEnvironment::Instance()->GetConnection(); + } +}; + +TEST_F(LogTableTest, AppendRecordBatchAndScan) { + auto& adm = admin(); + auto& conn = connection(); + + fluss::TablePath table_path("fluss", "test_append_record_batch_and_scan_cpp"); + + auto schema = fluss::Schema::NewBuilder() + .AddColumn("c1", fluss::DataType::Int()) + .AddColumn("c2", fluss::DataType::String()) + .Build(); + + auto table_descriptor = fluss::TableDescriptor::NewBuilder() + .SetSchema(schema) + .SetBucketCount(3) + .SetBucketKeys({"c1"}) + .SetProperty("table.replication.factor", "1") + .Build(); + + fluss_test::CreateTable(adm, table_path, table_descriptor); + + fluss::Table table; + ASSERT_OK(conn.GetTable(table_path, table)); + + // Create append writer + auto table_append = table.NewAppend(); + fluss::AppendWriter append_writer; + ASSERT_OK(table_append.CreateWriter(append_writer)); + + // Append Arrow record batches + { + auto c1 = 
arrow::Int32Builder(); + c1.AppendValues({1, 2, 3}).ok(); + auto c2 = arrow::StringBuilder(); + c2.AppendValues({"a1", "a2", "a3"}).ok(); + + auto batch = arrow::RecordBatch::Make( + arrow::schema({arrow::field("c1", arrow::int32()), arrow::field("c2", arrow::utf8())}), + 3, {c1.Finish().ValueOrDie(), c2.Finish().ValueOrDie()}); + + ASSERT_OK(append_writer.AppendArrowBatch(batch)); + } + + { + auto c1 = arrow::Int32Builder(); + c1.AppendValues({4, 5, 6}).ok(); + auto c2 = arrow::StringBuilder(); + c2.AppendValues({"a4", "a5", "a6"}).ok(); + + auto batch = arrow::RecordBatch::Make( + arrow::schema({arrow::field("c1", arrow::int32()), arrow::field("c2", arrow::utf8())}), + 3, {c1.Finish().ValueOrDie(), c2.Finish().ValueOrDie()}); + + ASSERT_OK(append_writer.AppendArrowBatch(batch)); + } + + ASSERT_OK(append_writer.Flush()); + + // Create scanner and subscribe to all 3 buckets + fluss::Table scan_table; + ASSERT_OK(conn.GetTable(table_path, scan_table)); + int32_t num_buckets = scan_table.GetTableInfo().num_buckets; + ASSERT_EQ(num_buckets, 3) << "Table should have 3 buckets"; + + auto table_scan = scan_table.NewScan(); + fluss::LogScanner log_scanner; + ASSERT_OK(table_scan.CreateLogScanner(log_scanner)); + + for (int32_t bucket_id = 0; bucket_id < num_buckets; ++bucket_id) { + ASSERT_OK(log_scanner.Subscribe(bucket_id, fluss::EARLIEST_OFFSET)); + } + + // Poll for records across all buckets + std::vector> records; + fluss_test::PollRecords(log_scanner, 6, [](const fluss::ScanRecord& rec) { + return std::make_pair(rec.row.GetInt32(0), std::string(rec.row.GetString(1))); + }, records); + ASSERT_EQ(records.size(), 6u) << "Expected 6 records"; + std::sort(records.begin(), records.end()); + + std::vector> expected = { + {1, "a1"}, {2, "a2"}, {3, "a3"}, {4, "a4"}, {5, "a5"}, {6, "a6"}}; + EXPECT_EQ(records, expected); + + // Verify per-bucket iteration via BucketRecords + { + fluss::Table bucket_table; + ASSERT_OK(conn.GetTable(table_path, bucket_table)); + auto 
bucket_scan = bucket_table.NewScan(); + fluss::LogScanner bucket_scanner; + ASSERT_OK(bucket_scan.CreateLogScanner(bucket_scanner)); + + for (int32_t bid = 0; bid < num_buckets; ++bid) { + ASSERT_OK(bucket_scanner.Subscribe(bid, fluss::EARLIEST_OFFSET)); + } + + std::vector> bucket_records; + auto bucket_deadline = std::chrono::steady_clock::now() + std::chrono::seconds(10); + size_t buckets_with_data = 0; + while (bucket_records.size() < 6 && std::chrono::steady_clock::now() < bucket_deadline) { + fluss::ScanRecords scan_records; + ASSERT_OK(bucket_scanner.Poll(500, scan_records)); + + // Iterate by bucket + for (size_t b = 0; b < scan_records.BucketCount(); ++b) { + auto bkt_records = scan_records.BucketAt(b); + if (!bkt_records.Empty()) { + buckets_with_data++; + } + for (auto rec : bkt_records) { + bucket_records.emplace_back(rec.row.GetInt32(0), + std::string(rec.row.GetString(1))); + } + } + } + + ASSERT_EQ(bucket_records.size(), 6u) << "Expected 6 records via per-bucket iteration"; + EXPECT_GT(buckets_with_data, 1u) << "Records should be distributed across multiple buckets"; + + std::sort(bucket_records.begin(), bucket_records.end()); + EXPECT_EQ(bucket_records, expected); + } + + // Test unsubscribe + ASSERT_OK(log_scanner.Unsubscribe(0)); + + // Verify unsubscribe_partition fails on a non-partitioned table + auto unsub_result = log_scanner.UnsubscribePartition(0, 0); + ASSERT_FALSE(unsub_result.Ok()) + << "unsubscribe_partition should fail on a non-partitioned table"; + + ASSERT_OK(adm.DropTable(table_path, false)); +} + +TEST_F(LogTableTest, ListOffsets) { + auto& adm = admin(); + auto& conn = connection(); + + fluss::TablePath table_path("fluss", "test_list_offsets_cpp"); + + auto schema = fluss::Schema::NewBuilder() + .AddColumn("id", fluss::DataType::Int()) + .AddColumn("name", fluss::DataType::String()) + .Build(); + + auto table_descriptor = fluss::TableDescriptor::NewBuilder() + .SetSchema(schema) + .SetProperty("table.replication.factor", "1") + 
.Build(); + + fluss_test::CreateTable(adm, table_path, table_descriptor); + + // Wait for table initialization + std::this_thread::sleep_for(std::chrono::seconds(2)); + + // Earliest offset should be 0 for empty table + std::unordered_map earliest_offsets; + ASSERT_OK(adm.ListOffsets(table_path, {0}, fluss::OffsetSpec::Earliest(), earliest_offsets)); + EXPECT_EQ(earliest_offsets[0], 0) << "Earliest offset should be 0 for bucket 0"; + + // Latest offset should be 0 for empty table + std::unordered_map latest_offsets; + ASSERT_OK(adm.ListOffsets(table_path, {0}, fluss::OffsetSpec::Latest(), latest_offsets)); + EXPECT_EQ(latest_offsets[0], 0) << "Latest offset should be 0 for empty table"; + + auto before_append_ms = + std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + + // Append records + fluss::Table table; + ASSERT_OK(conn.GetTable(table_path, table)); + auto table_append = table.NewAppend(); + fluss::AppendWriter append_writer; + ASSERT_OK(table_append.CreateWriter(append_writer)); + + { + auto id_builder = arrow::Int32Builder(); + id_builder.AppendValues({1, 2, 3}).ok(); + auto name_builder = arrow::StringBuilder(); + name_builder.AppendValues({"alice", "bob", "charlie"}).ok(); + + auto batch = arrow::RecordBatch::Make( + arrow::schema( + {arrow::field("id", arrow::int32()), arrow::field("name", arrow::utf8())}), + 3, {id_builder.Finish().ValueOrDie(), name_builder.Finish().ValueOrDie()}); + + ASSERT_OK(append_writer.AppendArrowBatch(batch)); + } + ASSERT_OK(append_writer.Flush()); + + std::this_thread::sleep_for(std::chrono::seconds(1)); + + auto after_append_ms = + std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + + // Latest offset after appending should be 3 + std::unordered_map latest_after; + ASSERT_OK(adm.ListOffsets(table_path, {0}, fluss::OffsetSpec::Latest(), latest_after)); + EXPECT_EQ(latest_after[0], 3) << "Latest offset should be 3 after appending 3 
records"; + + // Earliest offset should still be 0 + std::unordered_map earliest_after; + ASSERT_OK(adm.ListOffsets(table_path, {0}, fluss::OffsetSpec::Earliest(), earliest_after)); + EXPECT_EQ(earliest_after[0], 0) << "Earliest offset should still be 0"; + + // Timestamp before append should resolve to offset 0 + std::unordered_map ts_offsets; + ASSERT_OK(adm.ListOffsets(table_path, {0}, fluss::OffsetSpec::Timestamp(before_append_ms), + ts_offsets)); + EXPECT_EQ(ts_offsets[0], 0) + << "Timestamp before append should resolve to offset 0"; + + // Timestamp after append should resolve to offset 3 + std::unordered_map ts_after_offsets; + ASSERT_OK(adm.ListOffsets(table_path, {0}, fluss::OffsetSpec::Timestamp(after_append_ms), + ts_after_offsets)); + EXPECT_EQ(ts_after_offsets[0], 3) + << "Timestamp after append should resolve to offset 3"; + + ASSERT_OK(adm.DropTable(table_path, false)); +} + +TEST_F(LogTableTest, TestProject) { + auto& adm = admin(); + auto& conn = connection(); + + fluss::TablePath table_path("fluss", "test_project_cpp"); + + auto schema = fluss::Schema::NewBuilder() + .AddColumn("col_a", fluss::DataType::Int()) + .AddColumn("col_b", fluss::DataType::String()) + .AddColumn("col_c", fluss::DataType::Int()) + .Build(); + + auto table_descriptor = fluss::TableDescriptor::NewBuilder() + .SetSchema(schema) + .SetProperty("table.replication.factor", "1") + .Build(); + + fluss_test::CreateTable(adm, table_path, table_descriptor); + + fluss::Table table; + ASSERT_OK(conn.GetTable(table_path, table)); + + // Append 3 records + auto table_append = table.NewAppend(); + fluss::AppendWriter append_writer; + ASSERT_OK(table_append.CreateWriter(append_writer)); + + { + auto col_a_builder = arrow::Int32Builder(); + col_a_builder.AppendValues({1, 2, 3}).ok(); + auto col_b_builder = arrow::StringBuilder(); + col_b_builder.AppendValues({"x", "y", "z"}).ok(); + auto col_c_builder = arrow::Int32Builder(); + col_c_builder.AppendValues({10, 20, 30}).ok(); + + auto batch = 
arrow::RecordBatch::Make( + arrow::schema({arrow::field("col_a", arrow::int32()), + arrow::field("col_b", arrow::utf8()), + arrow::field("col_c", arrow::int32())}), + 3, + {col_a_builder.Finish().ValueOrDie(), col_b_builder.Finish().ValueOrDie(), + col_c_builder.Finish().ValueOrDie()}); + + ASSERT_OK(append_writer.AppendArrowBatch(batch)); + } + ASSERT_OK(append_writer.Flush()); + + // Test project_by_name: select col_b and col_c only + { + fluss::Table proj_table; + ASSERT_OK(conn.GetTable(table_path, proj_table)); + auto scan = proj_table.NewScan(); + scan.ProjectByName({"col_b", "col_c"}); + fluss::LogScanner scanner; + ASSERT_OK(scan.CreateLogScanner(scanner)); + + ASSERT_OK(scanner.Subscribe(0, 0)); + + fluss::ScanRecords records; + ASSERT_OK(scanner.Poll(10000, records)); + + ASSERT_EQ(records.Count(), 3u) << "Should have 3 records with project_by_name"; + + std::vector expected_col_b = {"x", "y", "z"}; + std::vector expected_col_c = {10, 20, 30}; + + // Collect and sort by col_c to get deterministic order + std::vector> collected; + for (auto rec : records) { + collected.emplace_back(std::string(rec.row.GetString(0)), rec.row.GetInt32(1)); + } + std::sort(collected.begin(), collected.end(), + [](const auto& a, const auto& b) { return a.second < b.second; }); + + for (size_t i = 0; i < 3; ++i) { + EXPECT_EQ(collected[i].first, expected_col_b[i]) << "col_b mismatch at index " << i; + EXPECT_EQ(collected[i].second, expected_col_c[i]) << "col_c mismatch at index " << i; + } + } + + // Test project by column indices: select col_b (1) and col_a (0) in that order + { + fluss::Table proj_table; + ASSERT_OK(conn.GetTable(table_path, proj_table)); + auto scan = proj_table.NewScan(); + scan.ProjectByIndex({1, 0}); + fluss::LogScanner scanner; + ASSERT_OK(scan.CreateLogScanner(scanner)); + + ASSERT_OK(scanner.Subscribe(0, 0)); + + fluss::ScanRecords records; + ASSERT_OK(scanner.Poll(10000, records)); + + ASSERT_EQ(records.Count(), 3u); + + std::vector expected_col_b = 
{"x", "y", "z"}; + std::vector expected_col_a = {1, 2, 3}; + + std::vector> collected; + for (auto rec : records) { + collected.emplace_back(std::string(rec.row.GetString(0)), rec.row.GetInt32(1)); + } + std::sort(collected.begin(), collected.end(), + [](const auto& a, const auto& b) { return a.second < b.second; }); + + for (size_t i = 0; i < 3; ++i) { + EXPECT_EQ(collected[i].first, expected_col_b[i]) << "col_b mismatch at index " << i; + EXPECT_EQ(collected[i].second, expected_col_a[i]) << "col_a mismatch at index " << i; + } + } + + ASSERT_OK(adm.DropTable(table_path, false)); +} + +TEST_F(LogTableTest, TestPollBatches) { + auto& adm = admin(); + auto& conn = connection(); + + fluss::TablePath table_path("fluss", "test_poll_batches_cpp"); + + auto schema = fluss::Schema::NewBuilder() + .AddColumn("id", fluss::DataType::Int()) + .AddColumn("name", fluss::DataType::String()) + .Build(); + + auto table_descriptor = fluss::TableDescriptor::NewBuilder() + .SetSchema(schema) + .SetProperty("table.replication.factor", "1") + .Build(); + + fluss_test::CreateTable(adm, table_path, table_descriptor); + + std::this_thread::sleep_for(std::chrono::seconds(1)); + + fluss::Table table; + ASSERT_OK(conn.GetTable(table_path, table)); + + auto scan = table.NewScan(); + fluss::LogScanner scanner; + ASSERT_OK(scan.CreateRecordBatchLogScanner(scanner)); + ASSERT_OK(scanner.Subscribe(0, 0)); + + // Test 1: Empty table should return empty result + { + fluss::ArrowRecordBatches batches; + ASSERT_OK(scanner.PollRecordBatch(500, batches)); + ASSERT_TRUE(batches.Empty()); + } + + // Append data + auto table_append = table.NewAppend(); + fluss::AppendWriter writer; + ASSERT_OK(table_append.CreateWriter(writer)); + + auto make_batch = [](std::vector ids, std::vector names) { + auto id_builder = arrow::Int32Builder(); + id_builder.AppendValues(ids).ok(); + auto name_builder = arrow::StringBuilder(); + name_builder.AppendValues(names).ok(); + return arrow::RecordBatch::Make( + arrow::schema( 
+ {arrow::field("id", arrow::int32()), arrow::field("name", arrow::utf8())}), + static_cast(ids.size()), + {id_builder.Finish().ValueOrDie(), name_builder.Finish().ValueOrDie()}); + }; + + ASSERT_OK(writer.AppendArrowBatch(make_batch({1, 2}, {"a", "b"}))); + ASSERT_OK(writer.AppendArrowBatch(make_batch({3, 4}, {"c", "d"}))); + ASSERT_OK(writer.AppendArrowBatch(make_batch({5, 6}, {"e", "f"}))); + ASSERT_OK(writer.Flush()); + + // Extract ids from Arrow batches + auto extract_ids = [](const fluss::ArrowRecordBatches& batches) { + std::vector ids; + for (const auto& batch : batches) { + auto arr = + std::static_pointer_cast(batch->GetArrowRecordBatch()->column(0)); + for (int64_t i = 0; i < arr->length(); ++i) { + ids.push_back(arr->Value(i)); + } + } + return ids; + }; + + // Test 2: Poll until we get all 6 records + std::vector all_ids; + fluss_test::PollRecordBatches(scanner, 6, extract_ids, all_ids); + ASSERT_EQ(all_ids, (std::vector{1, 2, 3, 4, 5, 6})); + + // Test 3: Append more and verify offset continuation (no duplicates) + ASSERT_OK(writer.AppendArrowBatch(make_batch({7, 8}, {"g", "h"}))); + ASSERT_OK(writer.Flush()); + + std::vector new_ids; + fluss_test::PollRecordBatches(scanner, 2, extract_ids, new_ids); + ASSERT_EQ(new_ids, (std::vector{7, 8})); + + // Test 4: Subscribing from mid-offset should truncate batch + { + fluss::Table trunc_table; + ASSERT_OK(conn.GetTable(table_path, trunc_table)); + auto trunc_scan = trunc_table.NewScan(); + fluss::LogScanner trunc_scanner; + ASSERT_OK(trunc_scan.CreateRecordBatchLogScanner(trunc_scanner)); + ASSERT_OK(trunc_scanner.Subscribe(0, 3)); + + std::vector trunc_ids; + fluss_test::PollRecordBatches(trunc_scanner, 5, extract_ids, trunc_ids); + ASSERT_EQ(trunc_ids, (std::vector{4, 5, 6, 7, 8})); + } + + // Test 5: Projection should only return requested columns + { + fluss::Table proj_table; + ASSERT_OK(conn.GetTable(table_path, proj_table)); + auto proj_scan = proj_table.NewScan(); + proj_scan.ProjectByName({"id"}); 
+ fluss::LogScanner proj_scanner; + ASSERT_OK(proj_scan.CreateRecordBatchLogScanner(proj_scanner)); + ASSERT_OK(proj_scanner.Subscribe(0, 0)); + + fluss::ArrowRecordBatches proj_batches; + ASSERT_OK(proj_scanner.PollRecordBatch(10000, proj_batches)); + + ASSERT_FALSE(proj_batches.Empty()); + EXPECT_EQ(proj_batches[0]->GetArrowRecordBatch()->num_columns(), 1) + << "Projected batch should have 1 column (id), not 2"; + } + + ASSERT_OK(adm.DropTable(table_path, false)); +} + +TEST_F(LogTableTest, AllSupportedDatatypes) { + auto& adm = admin(); + auto& conn = connection(); + + fluss::TablePath table_path("fluss", "test_log_all_datatypes_cpp"); + + // Create a log table with all supported datatypes + auto schema = + fluss::Schema::NewBuilder() + .AddColumn("col_tinyint", fluss::DataType::TinyInt()) + .AddColumn("col_smallint", fluss::DataType::SmallInt()) + .AddColumn("col_int", fluss::DataType::Int()) + .AddColumn("col_bigint", fluss::DataType::BigInt()) + .AddColumn("col_float", fluss::DataType::Float()) + .AddColumn("col_double", fluss::DataType::Double()) + .AddColumn("col_boolean", fluss::DataType::Boolean()) + .AddColumn("col_char", fluss::DataType::Char(10)) + .AddColumn("col_string", fluss::DataType::String()) + .AddColumn("col_decimal", fluss::DataType::Decimal(10, 2)) + .AddColumn("col_date", fluss::DataType::Date()) + .AddColumn("col_time", fluss::DataType::Time()) + .AddColumn("col_timestamp", fluss::DataType::Timestamp()) + .AddColumn("col_timestamp_ltz", fluss::DataType::TimestampLtz()) + .AddColumn("col_bytes", fluss::DataType::Bytes()) + .AddColumn("col_binary", fluss::DataType::Binary(4)) + .Build(); + + auto table_descriptor = fluss::TableDescriptor::NewBuilder() + .SetSchema(schema) + .SetProperty("table.replication.factor", "1") + .Build(); + + fluss_test::CreateTable(adm, table_path, table_descriptor); + + fluss::Table table; + ASSERT_OK(conn.GetTable(table_path, table)); + + size_t field_count = table.GetTableInfo().schema.columns.size(); + + auto 
table_append = table.NewAppend(); + fluss::AppendWriter append_writer; + ASSERT_OK(table_append.CreateWriter(append_writer)); + + // Test data + int32_t col_tinyint = 127; + int32_t col_smallint = 32767; + int32_t col_int = 2147483647; + int64_t col_bigint = 9223372036854775807LL; + float col_float = 3.14f; + double col_double = 2.718281828459045; + bool col_boolean = true; + std::string col_char = "hello"; + std::string col_string = "world of fluss rust client"; + std::string col_decimal = "123.45"; + auto col_date = fluss::Date::FromDays(20476); // 2026-01-23 + auto col_time = fluss::Time::FromMillis(36827000); // 10:13:47 + auto col_timestamp = fluss::Timestamp::FromMillisNanos(1769163227123, 456000); + auto col_timestamp_ltz = fluss::Timestamp::FromMillisNanos(1769163227123, 456000); + std::vector col_bytes = {'b', 'i', 'n', 'a', 'r', 'y', ' ', 'd', 'a', 't', 'a'}; + std::vector col_binary = {0xDE, 0xAD, 0xBE, 0xEF}; + + // Append a row with all datatypes + { + fluss::GenericRow row(field_count); + row.SetInt32(0, col_tinyint); + row.SetInt32(1, col_smallint); + row.SetInt32(2, col_int); + row.SetInt64(3, col_bigint); + row.SetFloat32(4, col_float); + row.SetFloat64(5, col_double); + row.SetBool(6, col_boolean); + row.SetString(7, col_char); + row.SetString(8, col_string); + row.SetDecimal(9, col_decimal); + row.SetDate(10, col_date); + row.SetTime(11, col_time); + row.SetTimestampNtz(12, col_timestamp); + row.SetTimestampLtz(13, col_timestamp_ltz); + row.SetBytes(14, col_bytes); + row.SetBytes(15, col_binary); + ASSERT_OK(append_writer.Append(row)); + } + + // Append a row with null values + { + fluss::GenericRow row_with_nulls(field_count); + for (size_t i = 0; i < field_count; ++i) { + row_with_nulls.SetNull(i); + } + ASSERT_OK(append_writer.Append(row_with_nulls)); + } + + ASSERT_OK(append_writer.Flush()); + + // Scan the records + fluss::Table scan_table; + ASSERT_OK(conn.GetTable(table_path, scan_table)); + auto table_scan = scan_table.NewScan(); + 
fluss::LogScanner log_scanner; + ASSERT_OK(table_scan.CreateLogScanner(log_scanner)); + ASSERT_OK(log_scanner.Subscribe(0, 0)); + + // Poll until we get 2 records + std::vector all_records; + fluss_test::PollRecords(log_scanner, 2, + [](const fluss::ScanRecord& rec) { return rec; }, all_records); + ASSERT_EQ(all_records.size(), 2u) << "Expected 2 records"; + + // Verify first record (all values) + auto& row = all_records[0].row; + + EXPECT_EQ(row.GetInt32(0), col_tinyint) << "col_tinyint mismatch"; + EXPECT_EQ(row.GetInt32(1), col_smallint) << "col_smallint mismatch"; + EXPECT_EQ(row.GetInt32(2), col_int) << "col_int mismatch"; + EXPECT_EQ(row.GetInt64(3), col_bigint) << "col_bigint mismatch"; + EXPECT_NEAR(row.GetFloat32(4), col_float, 1e-6f) << "col_float mismatch"; + EXPECT_NEAR(row.GetFloat64(5), col_double, 1e-15) << "col_double mismatch"; + EXPECT_EQ(row.GetBool(6), col_boolean) << "col_boolean mismatch"; + EXPECT_EQ(row.GetString(7), col_char) << "col_char mismatch"; + EXPECT_EQ(row.GetString(8), col_string) << "col_string mismatch"; + EXPECT_EQ(row.GetDecimalString(9), col_decimal) << "col_decimal mismatch"; + EXPECT_EQ(row.GetDate(10).days_since_epoch, col_date.days_since_epoch) << "col_date mismatch"; + EXPECT_EQ(row.GetTime(11).millis_since_midnight, col_time.millis_since_midnight) + << "col_time mismatch"; + EXPECT_EQ(row.GetTimestamp(12).epoch_millis, col_timestamp.epoch_millis) + << "col_timestamp millis mismatch"; + EXPECT_EQ(row.GetTimestamp(12).nano_of_millisecond, col_timestamp.nano_of_millisecond) + << "col_timestamp nanos mismatch"; + EXPECT_EQ(row.GetTimestamp(13).epoch_millis, col_timestamp_ltz.epoch_millis) + << "col_timestamp_ltz millis mismatch"; + EXPECT_EQ(row.GetTimestamp(13).nano_of_millisecond, col_timestamp_ltz.nano_of_millisecond) + << "col_timestamp_ltz nanos mismatch"; + + auto [bytes_ptr, bytes_len] = row.GetBytes(14); + EXPECT_EQ(bytes_len, col_bytes.size()) << "col_bytes length mismatch"; + EXPECT_TRUE(std::memcmp(bytes_ptr, 
col_bytes.data(), bytes_len) == 0) + << "col_bytes mismatch"; + + auto [binary_ptr, binary_len] = row.GetBytes(15); + EXPECT_EQ(binary_len, col_binary.size()) << "col_binary length mismatch"; + EXPECT_TRUE(std::memcmp(binary_ptr, col_binary.data(), binary_len) == 0) + << "col_binary mismatch"; + + // Verify second record (all nulls) + auto& null_row = all_records[1].row; + for (size_t i = 0; i < field_count; ++i) { + EXPECT_TRUE(null_row.IsNull(i)) << "column " << i << " should be null"; + } + + ASSERT_OK(adm.DropTable(table_path, false)); +} + +TEST_F(LogTableTest, PartitionedTableAppendScan) { + auto& adm = admin(); + auto& conn = connection(); + + fluss::TablePath table_path("fluss", "test_partitioned_log_append_cpp"); + + // Create a partitioned log table + auto schema = fluss::Schema::NewBuilder() + .AddColumn("id", fluss::DataType::Int()) + .AddColumn("region", fluss::DataType::String()) + .AddColumn("value", fluss::DataType::BigInt()) + .Build(); + + auto table_descriptor = fluss::TableDescriptor::NewBuilder() + .SetSchema(schema) + .SetPartitionKeys({"region"}) + .SetProperty("table.replication.factor", "1") + .Build(); + + fluss_test::CreateTable(adm, table_path, table_descriptor); + + // Create partitions + fluss_test::CreatePartitions(adm, table_path, "region", {"US", "EU"}); + + // Wait for partitions + std::this_thread::sleep_for(std::chrono::seconds(2)); + + fluss::Table table; + ASSERT_OK(conn.GetTable(table_path, table)); + + auto table_append = table.NewAppend(); + fluss::AppendWriter append_writer; + ASSERT_OK(table_append.CreateWriter(append_writer)); + + // Append rows + struct TestData { + int32_t id; + std::string region; + int64_t value; + }; + std::vector test_data = {{1, "US", 100}, {2, "US", 200}, {3, "EU", 300}, {4, "EU", 400}}; + + for (const auto& d : test_data) { + fluss::GenericRow row(3); + row.SetInt32(0, d.id); + row.SetString(1, d.region); + row.SetInt64(2, d.value); + ASSERT_OK(append_writer.Append(row)); + } + 
ASSERT_OK(append_writer.Flush()); + + // Append arrow batches per partition + { + auto id_builder = arrow::Int32Builder(); + id_builder.AppendValues({5, 6}).ok(); + auto region_builder = arrow::StringBuilder(); + region_builder.AppendValues({"US", "US"}).ok(); + auto value_builder = arrow::Int64Builder(); + value_builder.AppendValues({500, 600}).ok(); + + auto batch = arrow::RecordBatch::Make( + arrow::schema({arrow::field("id", arrow::int32()), + arrow::field("region", arrow::utf8()), + arrow::field("value", arrow::int64())}), + 2, + {id_builder.Finish().ValueOrDie(), region_builder.Finish().ValueOrDie(), + value_builder.Finish().ValueOrDie()}); + + ASSERT_OK(append_writer.AppendArrowBatch(batch)); + } + + { + auto id_builder = arrow::Int32Builder(); + id_builder.AppendValues({7, 8}).ok(); + auto region_builder = arrow::StringBuilder(); + region_builder.AppendValues({"EU", "EU"}).ok(); + auto value_builder = arrow::Int64Builder(); + value_builder.AppendValues({700, 800}).ok(); + + auto batch = arrow::RecordBatch::Make( + arrow::schema({arrow::field("id", arrow::int32()), + arrow::field("region", arrow::utf8()), + arrow::field("value", arrow::int64())}), + 2, + {id_builder.Finish().ValueOrDie(), region_builder.Finish().ValueOrDie(), + value_builder.Finish().ValueOrDie()}); + + ASSERT_OK(append_writer.AppendArrowBatch(batch)); + } + ASSERT_OK(append_writer.Flush()); + + // Test list partition offsets + std::unordered_map us_offsets; + ASSERT_OK(adm.ListPartitionOffsets(table_path, "US", {0}, fluss::OffsetSpec::Latest(), + us_offsets)); + EXPECT_EQ(us_offsets[0], 4) << "US partition should have 4 records"; + + std::unordered_map eu_offsets; + ASSERT_OK(adm.ListPartitionOffsets(table_path, "EU", {0}, fluss::OffsetSpec::Latest(), + eu_offsets)); + EXPECT_EQ(eu_offsets[0], 4) << "EU partition should have 4 records"; + + // Subscribe to all partitions and scan + fluss::Table scan_table; + ASSERT_OK(conn.GetTable(table_path, scan_table)); + auto table_scan = 
scan_table.NewScan(); + fluss::LogScanner log_scanner; + ASSERT_OK(table_scan.CreateLogScanner(log_scanner)); + + std::vector partition_infos; + ASSERT_OK(adm.ListPartitionInfos(table_path, partition_infos)); + + for (const auto& pi : partition_infos) { + ASSERT_OK(log_scanner.SubscribePartitionBuckets(pi.partition_id, 0, 0)); + } + + // Collect all records + using Record = std::tuple; + auto extract_record = [](const fluss::ScanRecord& rec) -> Record { + return {rec.row.GetInt32(0), std::string(rec.row.GetString(1)), rec.row.GetInt64(2)}; + }; + std::vector collected; + fluss_test::PollRecords(log_scanner, 8, extract_record, collected); + + ASSERT_EQ(collected.size(), 8u) << "Expected 8 records total"; + std::sort(collected.begin(), collected.end()); + + std::vector expected = {{1, "US", 100}, {2, "US", 200}, {3, "EU", 300}, + {4, "EU", 400}, {5, "US", 500}, {6, "US", 600}, + {7, "EU", 700}, {8, "EU", 800}}; + EXPECT_EQ(collected, expected); + + // Test unsubscribe_partition: unsubscribe EU, should only get US data + { + fluss::Table unsub_table; + ASSERT_OK(conn.GetTable(table_path, unsub_table)); + auto unsub_scan = unsub_table.NewScan(); + fluss::LogScanner unsub_scanner; + ASSERT_OK(unsub_scan.CreateLogScanner(unsub_scanner)); + + int64_t eu_partition_id = -1; + for (const auto& pi : partition_infos) { + ASSERT_OK(unsub_scanner.SubscribePartitionBuckets(pi.partition_id, 0, 0)); + if (pi.partition_name == "EU") { + eu_partition_id = pi.partition_id; + } + } + ASSERT_GE(eu_partition_id, 0) << "EU partition should exist"; + + ASSERT_OK(unsub_scanner.UnsubscribePartition(eu_partition_id, 0)); + + std::vector us_only; + fluss_test::PollRecords(unsub_scanner, 4, extract_record, us_only); + + ASSERT_EQ(us_only.size(), 4u) << "Should receive exactly 4 US records"; + for (const auto& [id, region, val] : us_only) { + EXPECT_EQ(region, "US") << "After unsubscribe EU, only US data should be read"; + } + } + + // Test subscribe_partition_buckets (batch subscribe) + { + 
fluss::Table batch_table; + ASSERT_OK(conn.GetTable(table_path, batch_table)); + auto batch_scan = batch_table.NewScan(); + fluss::LogScanner batch_scanner; + ASSERT_OK(batch_scan.CreateLogScanner(batch_scanner)); + + std::vector subs; + for (const auto& pi : partition_infos) { + subs.push_back({pi.partition_id, 0, 0}); + } + ASSERT_OK(batch_scanner.SubscribePartitionBuckets(subs)); + + std::vector batch_collected; + fluss_test::PollRecords(batch_scanner, 8, extract_record, batch_collected); + ASSERT_EQ(batch_collected.size(), 8u); + std::sort(batch_collected.begin(), batch_collected.end()); + EXPECT_EQ(batch_collected, expected); + } + + ASSERT_OK(adm.DropTable(table_path, false)); +} + +// ============================================================================ +// Array data type tests +// ============================================================================ + +TEST_F(LogTableTest, AppendAndScanWithArray) { + auto& adm = admin(); + auto& conn = connection(); + + fluss::TablePath table_path("fluss", "test_append_scan_with_array_cpp"); + + auto schema = fluss::Schema::NewBuilder() + .AddColumn("id", fluss::DataType::Int()) + .AddColumn("tags", fluss::DataType::Array(fluss::DataType::String())) + .AddColumn("scores", fluss::DataType::Array(fluss::DataType::Int())) + .Build(); + + auto table_descriptor = fluss::TableDescriptor::NewBuilder() + .SetSchema(schema) + .SetBucketCount(1) + .SetBucketKeys({"id"}) + .SetProperty("table.replication.factor", "1") + .Build(); + + fluss_test::CreateTable(adm, table_path, table_descriptor); + + fluss::Table table; + ASSERT_OK(conn.GetTable(table_path, table)); + + auto info = table.GetTableInfo(); + ASSERT_GE(info.schema.columns.size(), 3u); + const auto& tags_type = info.schema.columns[1].data_type; + ASSERT_EQ(tags_type.id(), fluss::TypeId::Array); + ASSERT_NE(tags_type.element_type(), nullptr); + ASSERT_EQ(tags_type.element_type()->id(), fluss::TypeId::String); + const auto& scores_type = 
info.schema.columns[2].data_type; + ASSERT_EQ(scores_type.id(), fluss::TypeId::Array); + ASSERT_NE(scores_type.element_type(), nullptr); + ASSERT_EQ(scores_type.element_type()->id(), fluss::TypeId::Int); + + fluss::AppendWriter append_writer; + ASSERT_OK(table.NewAppend().CreateWriter(append_writer)); + + { + auto row = table.NewRow(); + row.Set("id", 1); + + fluss::ArrayWriter tags(2, fluss::DataType::String()); + tags.SetString(0, "hello"); + tags.SetString(1, "world"); + row.SetArray(1, std::move(tags)); + + fluss::ArrayWriter scores(3, fluss::DataType::Int()); + scores.SetInt32(0, 10); + scores.SetInt32(1, 20); + scores.SetInt32(2, 30); + row.SetArray(2, std::move(scores)); + + ASSERT_OK(append_writer.Append(row)); + } + { + auto row = table.NewRow(); + row.Set("id", 2); + + fluss::ArrayWriter tags(1, fluss::DataType::String()); + tags.SetNull(0); + row.SetArray(1, std::move(tags)); + + fluss::ArrayWriter scores(0, fluss::DataType::Int()); + row.SetArray(2, std::move(scores)); + + ASSERT_OK(append_writer.Append(row)); + } + + ASSERT_OK(append_writer.Flush()); + + auto scan = table.NewScan(); + fluss::LogScanner scanner; + ASSERT_OK(scan.CreateLogScanner(scanner)); + ASSERT_OK(scanner.Subscribe(0, 0)); + + struct Record { + int32_t id; + size_t tag_count; + std::vector tags; + size_t score_count; + std::vector scores; + }; + + std::vector collected; + auto extract = [](const fluss::ScanRecord& scan_rec) { + const auto& rv = scan_rec.row; + Record rec; + rec.id = rv.GetInt32(0); + + rec.tag_count = rv.GetArraySize(1); + for (size_t i = 0; i < rec.tag_count; ++i) { + if (rv.IsArrayElementNull(1, i)) { + rec.tags.push_back(""); + } else { + rec.tags.push_back(rv.GetArrayString(1, i)); + } + } + + rec.score_count = rv.GetArraySize(2); + for (size_t i = 0; i < rec.score_count; ++i) { + rec.scores.push_back(rv.GetArrayInt32(2, i)); + } + + return rec; + }; + + fluss_test::PollRecords(scanner, 2, extract, collected); + + ASSERT_EQ(collected.size(), 2u); + + 
std::sort(collected.begin(), collected.end(), + [](const Record& a, const Record& b) { return a.id < b.id; }); + + EXPECT_EQ(collected[0].id, 1); + ASSERT_EQ(collected[0].tag_count, 2u); + EXPECT_EQ(collected[0].tags[0], "hello"); + EXPECT_EQ(collected[0].tags[1], "world"); + ASSERT_EQ(collected[0].score_count, 3u); + EXPECT_EQ(collected[0].scores[0], 10); + EXPECT_EQ(collected[0].scores[1], 20); + EXPECT_EQ(collected[0].scores[2], 30); + + EXPECT_EQ(collected[1].id, 2); + ASSERT_EQ(collected[1].tag_count, 1u); + EXPECT_EQ(collected[1].tags[0], ""); + ASSERT_EQ(collected[1].score_count, 0u); + + ASSERT_OK(adm.DropTable(table_path, false)); +} + +TEST_F(LogTableTest, AppendAndScanWithNestedArray) { + auto& adm = admin(); + auto& conn = connection(); + + fluss::TablePath table_path("fluss", "test_append_scan_nested_array_cpp"); + + auto schema = + fluss::Schema::NewBuilder() + .AddColumn("id", fluss::DataType::Int()) + .AddColumn("matrix", + fluss::DataType::Array(fluss::DataType::Array(fluss::DataType::Int()))) + .Build(); + + auto table_descriptor = fluss::TableDescriptor::NewBuilder() + .SetSchema(schema) + .SetBucketCount(1) + .SetBucketKeys({"id"}) + .SetProperty("table.replication.factor", "1") + .Build(); + + fluss_test::CreateTable(adm, table_path, table_descriptor); + + fluss::Table table; + ASSERT_OK(conn.GetTable(table_path, table)); + + fluss::AppendWriter append_writer; + ASSERT_OK(table.NewAppend().CreateWriter(append_writer)); + + { + auto row = table.NewRow(); + row.Set("id", 1); + + fluss::ArrayWriter inner1(2, fluss::DataType::Int()); + inner1.SetInt32(0, 1); + inner1.SetInt32(1, 2); + + fluss::ArrayWriter inner2(2, fluss::DataType::Int()); + inner2.SetInt32(0, 3); + inner2.SetInt32(1, 4); + + fluss::ArrayWriter outer(2, fluss::DataType::Array(fluss::DataType::Int())); + outer.SetArray(0, std::move(inner1)); + outer.SetArray(1, std::move(inner2)); + + row.SetArray(1, std::move(outer)); + ASSERT_OK(append_writer.Append(row)); + } + + 
ASSERT_OK(append_writer.Flush()); + + auto scan = table.NewScan(); + fluss::LogScanner scanner; + ASSERT_OK(scan.CreateLogScanner(scanner)); + ASSERT_OK(scanner.Subscribe(0, 0)); + + struct Record { + int32_t id; + size_t outer_count; + fluss::TypeId element_type; + std::vector> values; + }; + + std::vector collected; + auto extract = [](const fluss::ScanRecord& scan_rec) { + const auto& rv = scan_rec.row; + Record rec; + rec.id = rv.GetInt32(0); + rec.outer_count = rv.GetArraySize(1); + rec.element_type = rv.GetArrayElementType(1); + auto outer = rv.GetArrayView(1); + rec.values.reserve(outer.Size()); + for (size_t i = 0; i < outer.Size(); ++i) { + auto inner = outer.GetArray(i); + std::vector row; + row.reserve(inner.Size()); + for (size_t j = 0; j < inner.Size(); ++j) { + row.push_back(inner.GetInt32(j)); + } + rec.values.push_back(std::move(row)); + } + return rec; + }; + + fluss_test::PollRecords(scanner, 1, extract, collected); + ASSERT_EQ(collected.size(), 1u); + EXPECT_EQ(collected[0].id, 1); + EXPECT_EQ(collected[0].outer_count, 2u); + EXPECT_EQ(collected[0].element_type, fluss::TypeId::Array); + ASSERT_EQ(collected[0].values.size(), 2u); + EXPECT_EQ(collected[0].values[0], (std::vector{1, 2})); + EXPECT_EQ(collected[0].values[1], (std::vector{3, 4})); + + ASSERT_OK(adm.DropTable(table_path, false)); +} + +TEST_F(LogTableTest, AppendAndScanWithArrayRichTypes) { + auto& adm = admin(); + auto& conn = connection(); + + fluss::TablePath table_path("fluss", "test_append_scan_array_rich_types_cpp"); + + auto schema = + fluss::Schema::NewBuilder() + .AddColumn("id", fluss::DataType::Int()) + .AddColumn("arr_bytes", fluss::DataType::Array(fluss::DataType::Bytes())) + .AddColumn("arr_date", fluss::DataType::Array(fluss::DataType::Date())) + .AddColumn("arr_time", fluss::DataType::Array(fluss::DataType::Time())) + .AddColumn("arr_ts", fluss::DataType::Array(fluss::DataType::Timestamp(6))) + .AddColumn("arr_decimal", 
fluss::DataType::Array(fluss::DataType::Decimal(10, 2))) + .Build(); + + auto table_descriptor = fluss::TableDescriptor::NewBuilder() + .SetSchema(schema) + .SetBucketCount(1) + .SetBucketKeys({"id"}) + .SetProperty("table.replication.factor", "1") + .Build(); + fluss_test::CreateTable(adm, table_path, table_descriptor); + + fluss::Table table; + ASSERT_OK(conn.GetTable(table_path, table)); + fluss::AppendWriter append_writer; + ASSERT_OK(table.NewAppend().CreateWriter(append_writer)); + + { + auto row = table.NewRow(); + row.Set("id", 1); + + fluss::ArrayWriter arr_bytes(2, fluss::DataType::Bytes()); + arr_bytes.SetBytes(0, std::vector{0x10, 0x20, 0x30}); + arr_bytes.SetNull(1); + row.SetArray(1, std::move(arr_bytes)); + + fluss::ArrayWriter arr_date(2, fluss::DataType::Date()); + auto d0 = fluss::Date::FromDays(20000); + arr_date.SetDate(0, d0); + arr_date.SetNull(1); + row.SetArray(2, std::move(arr_date)); + + fluss::ArrayWriter arr_time(1, fluss::DataType::Time()); + auto t0 = fluss::Time::FromMillis(3600000); + arr_time.SetTime(0, t0); + row.SetArray(3, std::move(arr_time)); + + fluss::ArrayWriter arr_ts(1, fluss::DataType::Timestamp(6)); + auto ts0 = fluss::Timestamp::FromMillisNanos(1769163227123, 456000); + arr_ts.SetTimestampNtz(0, ts0); + row.SetArray(4, std::move(arr_ts)); + + fluss::ArrayWriter arr_decimal(2, fluss::DataType::Decimal(10, 2)); + arr_decimal.SetDecimal(0, "123.45"); + arr_decimal.SetNull(1); + row.SetArray(5, std::move(arr_decimal)); + + ASSERT_OK(append_writer.Append(row)); + } + + ASSERT_OK(append_writer.Flush()); + + auto scan = table.NewScan(); + fluss::LogScanner scanner; + ASSERT_OK(scan.CreateLogScanner(scanner)); + ASSERT_OK(scanner.Subscribe(0, 0)); + + fluss::ScanRecords records; + ASSERT_OK(scanner.Poll(10000, records)); + ASSERT_EQ(records.Count(), 1u); + + auto it = records.begin(); + ASSERT_TRUE(it != records.end()); + auto rec = *it; + const auto& rv = rec.row; + + EXPECT_EQ(rv.GetArraySize(1), 2u); + auto bytes0 = 
rv.GetArrayBytes(1, 0); + ASSERT_EQ(bytes0.size(), 3u); + EXPECT_EQ(bytes0[0], 0x10); + EXPECT_EQ(bytes0[1], 0x20); + EXPECT_EQ(bytes0[2], 0x30); + EXPECT_TRUE(rv.IsArrayElementNull(1, 1)); + + EXPECT_EQ(rv.GetArraySize(2), 2u); + EXPECT_EQ(rv.GetArrayDate(2, 0).days_since_epoch, fluss::Date::FromDays(20000).days_since_epoch); + EXPECT_TRUE(rv.IsArrayElementNull(2, 1)); + + EXPECT_EQ(rv.GetArraySize(3), 1u); + EXPECT_EQ(rv.GetArrayTime(3, 0).millis_since_midnight, fluss::Time::FromMillis(3600000).millis_since_midnight); + + EXPECT_EQ(rv.GetArraySize(4), 1u); + auto ts = rv.GetArrayTimestamp(4, 0); + EXPECT_EQ(ts.epoch_millis, 1769163227123); + EXPECT_EQ(ts.nano_of_millisecond, 456000); + + EXPECT_EQ(rv.GetArraySize(5), 2u); + EXPECT_EQ(rv.GetArrayDecimalString(5, 0), "123.45"); + EXPECT_TRUE(rv.IsArrayElementNull(5, 1)); + + ASSERT_OK(adm.DropTable(table_path, false)); +} + +TEST_F(LogTableTest, ArrayApiValidationErrors) { + // Type mismatch setter should fail through FFI Result propagation. 
+ { + fluss::ArrayWriter bool_array(1, fluss::DataType::Boolean()); + bool threw = false; + try { + bool_array.SetInt32(0, 42); + } catch (const std::exception&) { + threw = true; + } + EXPECT_TRUE(threw); + } + + auto& adm = admin(); + auto& conn = connection(); + fluss::TablePath table_path("fluss", "test_array_api_validation_errors_cpp"); + + auto schema = fluss::Schema::NewBuilder() + .AddColumn("id", fluss::DataType::Int()) + .AddColumn("vals", fluss::DataType::Array(fluss::DataType::Int())) + .Build(); + auto table_descriptor = fluss::TableDescriptor::NewBuilder() + .SetSchema(schema) + .SetBucketCount(1) + .SetBucketKeys({"id"}) + .SetProperty("table.replication.factor", "1") + .Build(); + fluss_test::CreateTable(adm, table_path, table_descriptor); + + fluss::Table table; + ASSERT_OK(conn.GetTable(table_path, table)); + fluss::AppendWriter append_writer; + ASSERT_OK(table.NewAppend().CreateWriter(append_writer)); + auto row = table.NewRow(); + row.Set("id", 1); + fluss::ArrayWriter vals(2, fluss::DataType::Int()); + vals.SetInt32(0, 7); + vals.SetNull(1); + row.SetArray(1, std::move(vals)); + ASSERT_OK(append_writer.Append(row)); + ASSERT_OK(append_writer.Flush()); + + auto scan = table.NewScan(); + fluss::LogScanner scanner; + ASSERT_OK(scan.CreateLogScanner(scanner)); + ASSERT_OK(scanner.Subscribe(0, 0)); + fluss::ScanRecords records; + ASSERT_OK(scanner.Poll(10000, records)); + ASSERT_EQ(records.Count(), 1u); + auto it = records.begin(); + ASSERT_TRUE(it != records.end()); + auto rec = *it; + + bool oob_threw = false; + try { + (void)rec.row.GetArrayInt32(1, 5); + } catch (const std::exception&) { + oob_threw = true; + } + EXPECT_TRUE(oob_threw); + + bool wrong_type_threw = false; + try { + (void)rec.row.GetArrayInt64(1, 0); + } catch (const std::exception&) { + wrong_type_threw = true; + } + EXPECT_TRUE(wrong_type_threw); + + bool null_typed_getter_threw = false; + try { + (void)rec.row.GetArrayInt32(1, 1); + } catch (const std::exception&) { + 
null_typed_getter_threw = true; + } + EXPECT_TRUE(null_typed_getter_threw); + + auto view = rec.row.GetArrayView(1); + EXPECT_EQ(view.Size(), 2u); + EXPECT_TRUE(view.IsNull(1)); + + bool view_wrong_type_threw = false; + try { + (void)view.GetInt64(0); + } catch (const std::exception&) { + view_wrong_type_threw = true; + } + EXPECT_TRUE(view_wrong_type_threw); + + bool view_null_typed_getter_threw = false; + try { + (void)view.GetInt32(1); + } catch (const std::exception&) { + view_null_typed_getter_threw = true; + } + EXPECT_TRUE(view_null_typed_getter_threw); + + ASSERT_OK(adm.DropTable(table_path, false)); +} + +TEST_F(LogTableTest, AppendAndScanWithArrayEncodingEdgeCases) { + auto& adm = admin(); + auto& conn = connection(); + + fluss::TablePath table_path("fluss", "test_array_encoding_edge_cases_cpp"); + + auto schema = + fluss::Schema::NewBuilder() + .AddColumn("id", fluss::DataType::Int()) + .AddColumn("arr_long_str", fluss::DataType::Array(fluss::DataType::String())) + .AddColumn("arr_big_decimal", fluss::DataType::Array(fluss::DataType::Decimal(22, 5))) + .AddColumn("arr_ts_nano", fluss::DataType::Array(fluss::DataType::Timestamp(9))) + .AddColumn("arr_float", fluss::DataType::Array(fluss::DataType::Float())) + .AddColumn("arr_double", fluss::DataType::Array(fluss::DataType::Double())) + .AddColumn("arr_binary", fluss::DataType::Array(fluss::DataType::Binary(4))) + .Build(); + + auto table_descriptor = fluss::TableDescriptor::NewBuilder() + .SetSchema(schema) + .SetBucketCount(1) + .SetBucketKeys({"id"}) + .SetProperty("table.replication.factor", "1") + .Build(); + fluss_test::CreateTable(adm, table_path, table_descriptor); + + fluss::Table table; + ASSERT_OK(conn.GetTable(table_path, table)); + fluss::AppendWriter append_writer; + ASSERT_OK(table.NewAppend().CreateWriter(append_writer)); + + { + auto row = table.NewRow(); + row.Set("id", 1); + + // >= 8 bytes forces the heap-pointer variable-length path (threshold: 7) + fluss::ArrayWriter arr_long_str(2, 
fluss::DataType::String()); + arr_long_str.SetString(0, "abcdefgh"); + arr_long_str.SetString(1, "this is a much longer string that definitely exceeds inline"); + row.SetArray(1, std::move(arr_long_str)); + + // precision > 18 forces non-compact decimal encoding + fluss::ArrayWriter arr_big_decimal(2, fluss::DataType::Decimal(22, 5)); + arr_big_decimal.SetDecimal(0, "12345678901234567.12345"); + arr_big_decimal.SetDecimal(1, "-99999999999999999.99999"); + row.SetArray(2, std::move(arr_big_decimal)); + + // precision > 3 forces non-compact timestamp (millis + nanos-of-millis) + fluss::ArrayWriter arr_ts_nano(1, fluss::DataType::Timestamp(9)); + auto ts_nano = fluss::Timestamp::FromMillisNanos(1769163227123, 456789); + arr_ts_nano.SetTimestampNtz(0, ts_nano); + row.SetArray(3, std::move(arr_ts_nano)); + + // IEEE 754 special values: NaN, +Infinity, -Infinity + fluss::ArrayWriter arr_float(3, fluss::DataType::Float()); + arr_float.SetFloat32(0, std::numeric_limits::quiet_NaN()); + arr_float.SetFloat32(1, std::numeric_limits::infinity()); + arr_float.SetFloat32(2, -std::numeric_limits::infinity()); + row.SetArray(4, std::move(arr_float)); + + fluss::ArrayWriter arr_double(3, fluss::DataType::Double()); + arr_double.SetFloat64(0, std::numeric_limits::quiet_NaN()); + arr_double.SetFloat64(1, std::numeric_limits::infinity()); + arr_double.SetFloat64(2, -std::numeric_limits::infinity()); + row.SetArray(5, std::move(arr_double)); + + // Fixed-length binary + fluss::ArrayWriter arr_binary(2, fluss::DataType::Binary(4)); + arr_binary.SetBytes(0, std::vector{0xDE, 0xAD, 0xBE, 0xEF}); + arr_binary.SetNull(1); + row.SetArray(6, std::move(arr_binary)); + + ASSERT_OK(append_writer.Append(row)); + } + + ASSERT_OK(append_writer.Flush()); + + auto scan = table.NewScan(); + fluss::LogScanner scanner; + ASSERT_OK(scan.CreateLogScanner(scanner)); + ASSERT_OK(scanner.Subscribe(0, 0)); + + fluss::ScanRecords records; + ASSERT_OK(scanner.Poll(10000, records)); + ASSERT_EQ(records.Count(), 
1u); + + auto it = records.begin(); + ASSERT_TRUE(it != records.end()); + auto rec = *it; + const auto& rv = rec.row; + + // Long strings: heap-encoded variable-length round-trip + EXPECT_EQ(rv.GetArraySize(1), 2u); + EXPECT_EQ(rv.GetArrayString(1, 0), "abcdefgh"); + EXPECT_EQ(rv.GetArrayString(1, 1), "this is a much longer string that definitely exceeds inline"); + + // Non-compact decimal (precision 22 > MAX_COMPACT_PRECISION 18) + EXPECT_EQ(rv.GetArraySize(2), 2u); + EXPECT_EQ(rv.GetArrayDecimalString(2, 0), "12345678901234567.12345"); + EXPECT_EQ(rv.GetArrayDecimalString(2, 1), "-99999999999999999.99999"); + + // Non-compact timestamp (precision 9 > MAX_COMPACT_TIMESTAMP_PRECISION 3) + EXPECT_EQ(rv.GetArraySize(3), 1u); + auto ts = rv.GetArrayTimestamp(3, 0); + EXPECT_EQ(ts.epoch_millis, 1769163227123); + EXPECT_EQ(ts.nano_of_millisecond, 456789); + + // Float NaN / Infinity round-trip + EXPECT_EQ(rv.GetArraySize(4), 3u); + EXPECT_TRUE(std::isnan(rv.GetArrayFloat32(4, 0))); + EXPECT_TRUE(std::isinf(rv.GetArrayFloat32(4, 1))); + EXPECT_GT(rv.GetArrayFloat32(4, 1), 0.0f); + EXPECT_TRUE(std::isinf(rv.GetArrayFloat32(4, 2))); + EXPECT_LT(rv.GetArrayFloat32(4, 2), 0.0f); + + // Double NaN / Infinity round-trip + EXPECT_EQ(rv.GetArraySize(5), 3u); + EXPECT_TRUE(std::isnan(rv.GetArrayFloat64(5, 0))); + EXPECT_TRUE(std::isinf(rv.GetArrayFloat64(5, 1))); + EXPECT_GT(rv.GetArrayFloat64(5, 1), 0.0); + EXPECT_TRUE(std::isinf(rv.GetArrayFloat64(5, 2))); + EXPECT_LT(rv.GetArrayFloat64(5, 2), 0.0); + + // Fixed-length binary round-trip + EXPECT_EQ(rv.GetArraySize(6), 2u); + auto bin = rv.GetArrayBytes(6, 0); + ASSERT_EQ(bin.size(), 4u); + EXPECT_EQ(bin[0], 0xDE); + EXPECT_EQ(bin[1], 0xAD); + EXPECT_EQ(bin[2], 0xBE); + EXPECT_EQ(bin[3], 0xEF); + EXPECT_TRUE(rv.IsArrayElementNull(6, 1)); + + ASSERT_OK(adm.DropTable(table_path, false)); +} + +TEST_F(LogTableTest, ArrayWriterOverflowDetection) { + // SetInt32 on TINYINT array must throw when value overflows i8 range (-128..127) + 
{ + fluss::ArrayWriter tinyint_arr(1, fluss::DataType::TinyInt()); + EXPECT_EQ(tinyint_arr.Size(), 1u); + bool threw = false; + try { + tinyint_arr.SetInt32(0, 1000); + } catch (const std::exception& e) { + threw = true; + std::string msg(e.what()); + EXPECT_NE(msg.find("TINYINT"), std::string::npos); + } + EXPECT_TRUE(threw); + } + + // SetInt32 on SMALLINT array must throw when value overflows i16 range (-32768..32767) + { + fluss::ArrayWriter smallint_arr(1, fluss::DataType::SmallInt()); + bool threw = false; + try { + smallint_arr.SetInt32(0, 40000); + } catch (const std::exception& e) { + threw = true; + std::string msg(e.what()); + EXPECT_NE(msg.find("SMALLINT"), std::string::npos); + } + EXPECT_TRUE(threw); + } + + // Negative overflow: -200 doesn't fit TINYINT + { + fluss::ArrayWriter tinyint_arr(1, fluss::DataType::TinyInt()); + bool threw = false; + try { + tinyint_arr.SetInt32(0, -200); + } catch (const std::exception&) { + threw = true; + } + EXPECT_TRUE(threw); + } + + // Values within range must succeed + { + fluss::ArrayWriter tinyint_arr(1, fluss::DataType::TinyInt()); + EXPECT_NO_THROW(tinyint_arr.SetInt32(0, 127)); + } + { + fluss::ArrayWriter tinyint_arr(1, fluss::DataType::TinyInt()); + EXPECT_NO_THROW(tinyint_arr.SetInt32(0, -128)); + } + { + fluss::ArrayWriter smallint_arr(1, fluss::DataType::SmallInt()); + EXPECT_NO_THROW(smallint_arr.SetInt32(0, 32767)); + } +} + +TEST_F(LogTableTest, NullabilityPreservedInTableInfo) { + auto& adm = admin(); + auto& conn = connection(); + + fluss::TablePath table_path("fluss", "test_nullability_table_info_cpp"); + + auto schema = + fluss::Schema::NewBuilder() + .AddColumn("id", fluss::DataType::Int()) + .AddColumn("name", fluss::DataType::String()) + .AddColumn("tags", fluss::DataType::Array(fluss::DataType::String().NotNull())) + .AddColumn("ids", fluss::DataType::Array(fluss::DataType::Int()).NotNull()) + .AddColumn("nested", + fluss::DataType::Array( + 
fluss::DataType::Array(fluss::DataType::Int()).NotNull())) + .SetPrimaryKeys({"id"}) + .Build(); + + auto table_descriptor = fluss::TableDescriptor::NewBuilder() + .SetSchema(schema) + .SetProperty("table.replication.factor", "1") + .Build(); + + fluss_test::CreateTable(adm, table_path, table_descriptor); + + fluss::Table table; + ASSERT_OK(conn.GetTable(table_path, table)); + auto info = table.GetTableInfo(); + + ASSERT_EQ(info.schema.columns.size(), 5u); + EXPECT_EQ(info.primary_keys, std::vector{"id"}); + + // Primary key columns are forced NOT NULL by schema normalization. + EXPECT_EQ(info.schema.columns[0].data_type.id(), fluss::TypeId::Int); + EXPECT_FALSE(info.schema.columns[0].data_type.nullable()); + + // "name" STRING (nullable) + EXPECT_EQ(info.schema.columns[1].data_type.id(), fluss::TypeId::String); + EXPECT_TRUE(info.schema.columns[1].data_type.nullable()); + + // "tags" ARRAY (outer nullable) + EXPECT_EQ(info.schema.columns[2].data_type.id(), fluss::TypeId::Array); + EXPECT_TRUE(info.schema.columns[2].data_type.nullable()); + ASSERT_NE(info.schema.columns[2].data_type.element_type(), nullptr); + EXPECT_FALSE(info.schema.columns[2].data_type.element_type()->nullable()); + + // "ids" ARRAY NOT NULL (outer not null, element nullable) + EXPECT_EQ(info.schema.columns[3].data_type.id(), fluss::TypeId::Array); + EXPECT_FALSE(info.schema.columns[3].data_type.nullable()); + ASSERT_NE(info.schema.columns[3].data_type.element_type(), nullptr); + EXPECT_TRUE(info.schema.columns[3].data_type.element_type()->nullable()); + + // "nested" ARRAY NOT NULL> (outer nullable, inner array not null) + EXPECT_EQ(info.schema.columns[4].data_type.id(), fluss::TypeId::Array); + EXPECT_TRUE(info.schema.columns[4].data_type.nullable()); + ASSERT_NE(info.schema.columns[4].data_type.element_type(), nullptr); + EXPECT_FALSE(info.schema.columns[4].data_type.element_type()->nullable()); + ASSERT_NE(info.schema.columns[4].data_type.element_type()->element_type(), nullptr); + 
EXPECT_TRUE(info.schema.columns[4].data_type.element_type()->element_type()->nullable()); + + ASSERT_OK(adm.DropTable(table_path, false)); +} diff --git a/fluss-rust/bindings/cpp/test/test_main.cpp b/fluss-rust/bindings/cpp/test/test_main.cpp new file mode 100644 index 0000000000..48d1050b8d --- /dev/null +++ b/fluss-rust/bindings/cpp/test/test_main.cpp @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include + +#include "test_utils.h" + +int main(int argc, char** argv) { + for (int i = 1; i < argc; ++i) { + if (std::string(argv[i]) == "--cleanup") { + fluss_test::FlussTestCluster::StopAll(); + return 0; + } + } + + ::testing::InitGoogleTest(&argc, argv); + ::testing::AddGlobalTestEnvironment(fluss_test::FlussTestEnvironment::Instance()); + return RUN_ALL_TESTS(); +} diff --git a/fluss-rust/bindings/cpp/test/test_sasl_auth.cpp b/fluss-rust/bindings/cpp/test/test_sasl_auth.cpp new file mode 100644 index 0000000000..5a52a1ab79 --- /dev/null +++ b/fluss-rust/bindings/cpp/test/test_sasl_auth.cpp @@ -0,0 +1,124 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include + +#include "test_utils.h" + +class SaslAuthTest : public ::testing::Test { + protected: + const std::string& sasl_servers() { + return fluss_test::FlussTestEnvironment::Instance()->GetSaslBootstrapServers(); + } + const std::string& plaintext_servers() { + return fluss_test::FlussTestEnvironment::Instance()->GetBootstrapServers(); + } +}; + +TEST_F(SaslAuthTest, SaslConnectWithValidCredentials) { + fluss::Configuration config; + config.bootstrap_servers = sasl_servers(); + config.security_protocol = "sasl"; + config.security_sasl_mechanism = "PLAIN"; + config.security_sasl_username = "admin"; + config.security_sasl_password = "admin-secret"; + + fluss::Connection conn; + ASSERT_OK(fluss::Connection::Create(config, conn)); + + fluss::Admin admin; + ASSERT_OK(conn.GetAdmin(admin)); + + // Perform a basic operation to confirm the connection is fully functional + std::string db_name = "cpp_sasl_test_valid_db"; + fluss::DatabaseDescriptor descriptor; + descriptor.comment = "created via SASL auth"; + ASSERT_OK(admin.CreateDatabase(db_name, descriptor, true)); + + bool exists = false; + ASSERT_OK(admin.DatabaseExists(db_name, exists)); + ASSERT_TRUE(exists); + + ASSERT_OK(admin.DropDatabase(db_name, true, true)); +} + +TEST_F(SaslAuthTest, 
SaslConnectWithSecondUser) { + fluss::Configuration config; + config.bootstrap_servers = sasl_servers(); + config.security_protocol = "sasl"; + config.security_sasl_mechanism = "PLAIN"; + config.security_sasl_username = "alice"; + config.security_sasl_password = "alice-secret"; + + fluss::Connection conn; + ASSERT_OK(fluss::Connection::Create(config, conn)); + + fluss::Admin admin; + ASSERT_OK(conn.GetAdmin(admin)); + + // Basic operation to confirm functional connection + bool exists = false; + ASSERT_OK(admin.DatabaseExists("some_nonexistent_db_alice", exists)); + ASSERT_FALSE(exists); +} + +TEST_F(SaslAuthTest, SaslConnectWithWrongPassword) { + fluss::Configuration config; + config.bootstrap_servers = sasl_servers(); + config.security_protocol = "sasl"; + config.security_sasl_mechanism = "PLAIN"; + config.security_sasl_username = "admin"; + config.security_sasl_password = "wrong-password"; + + fluss::Connection conn; + auto result = fluss::Connection::Create(config, conn); + ASSERT_FALSE(result.Ok()); + EXPECT_EQ(result.error_code, fluss::ErrorCode::AUTHENTICATE_EXCEPTION); + EXPECT_NE(result.error_message.find("Authentication failed"), std::string::npos) + << "Expected 'Authentication failed' in: " << result.error_message; +} + +TEST_F(SaslAuthTest, SaslConnectWithUnknownUser) { + fluss::Configuration config; + config.bootstrap_servers = sasl_servers(); + config.security_protocol = "sasl"; + config.security_sasl_mechanism = "PLAIN"; + config.security_sasl_username = "nonexistent_user"; + config.security_sasl_password = "some-password"; + + fluss::Connection conn; + auto result = fluss::Connection::Create(config, conn); + ASSERT_FALSE(result.Ok()); + EXPECT_EQ(result.error_code, fluss::ErrorCode::AUTHENTICATE_EXCEPTION); + EXPECT_NE(result.error_message.find("Authentication failed"), std::string::npos) + << "Expected 'Authentication failed' in: " << result.error_message; +} + +TEST_F(SaslAuthTest, SaslClientToPlaintextServer) { + fluss::Configuration config; + 
config.bootstrap_servers = plaintext_servers(); + config.security_protocol = "sasl"; + config.security_sasl_mechanism = "PLAIN"; + config.security_sasl_username = "admin"; + config.security_sasl_password = "admin-secret"; + + fluss::Connection conn; + auto result = fluss::Connection::Create(config, conn); + ASSERT_FALSE(result.Ok()) << "SASL client connecting to plaintext server should fail"; +} diff --git a/fluss-rust/bindings/cpp/test/test_utils.h b/fluss-rust/bindings/cpp/test/test_utils.h new file mode 100644 index 0000000000..5d40afbaf3 --- /dev/null +++ b/fluss-rust/bindings/cpp/test/test_utils.h @@ -0,0 +1,239 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#pragma once + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "fluss.hpp" + +#define ASSERT_OK(result) ASSERT_TRUE((result).Ok()) << (result).error_message +#define EXPECT_OK(result) EXPECT_TRUE((result).Ok()) << (result).error_message + +namespace fluss_test { + +inline std::string FindCliBinary() { + const char* env_bin = std::getenv("FLUSS_TEST_CLUSTER_BIN"); + if (env_bin && std::strlen(env_bin) > 0) { + if (std::ifstream(env_bin).good()) { + return env_bin; + } + std::cerr << "FLUSS_TEST_CLUSTER_BIN is set to '" << env_bin + << "' but that file does not exist." << std::endl; + std::abort(); + } + FILE* pipe = popen("cargo locate-project --workspace --message-format plain", "r"); + if (pipe) { + char buf[512]; + std::string root; + while (fgets(buf, sizeof(buf), pipe)) root += buf; + if (pclose(pipe) == 0) { + // cargo returns path to Cargo.toml; strip filename + trailing whitespace. + while (!root.empty() && (root.back() == '\n' || root.back() == '\r')) root.pop_back(); + auto slash = root.rfind('/'); + if (slash != std::string::npos) { + std::string dir = root.substr(0, slash); + for (const char* profile : {"debug", "release"}) { + std::string path = dir + "/target/" + profile + "/fluss-test-cluster"; + if (std::ifstream(path).good()) return path; + } + } + } + } + return "fluss-test-cluster"; +} + +constexpr const char* kClusterName = "shared-test"; + +inline std::string CliStartCmd() { + return FindCliBinary() + " start --sasl --name " + kClusterName; +} + +constexpr const char* kClusterJsonPrefix = "CLUSTER_JSON: "; + +inline bool ParseClusterJson(const std::string& output, std::string& bootstrap, + std::string& sasl_bootstrap) { + // Look for the CLUSTER_JSON: token in output lines. 
+ std::istringstream stream(output); + std::string line; + while (std::getline(stream, line)) { + if (line.rfind(kClusterJsonPrefix, 0) != 0) continue; + std::string json_str = line.substr(std::strlen(kClusterJsonPrefix)); + try { + auto info = nlohmann::json::parse(json_str); + bootstrap = info.at("bootstrap_servers").get(); + if (info.contains("sasl_bootstrap_servers") && + !info["sasl_bootstrap_servers"].is_null()) { + sasl_bootstrap = info["sasl_bootstrap_servers"].get(); + } + return true; + } catch (const nlohmann::json::exception& e) { + std::cerr << "Failed to parse cluster JSON: " << e.what() << "\n" + << "Line: " << line << std::endl; + return false; + } + } + std::cerr << "No CLUSTER_JSON token found in output:\n" << output << std::endl; + return false; +} + +class FlussTestCluster { + public: + FlussTestCluster() = default; + + bool Start() { + const char* env = std::getenv("FLUSS_BOOTSTRAP_SERVERS"); + if (env && std::strlen(env) > 0) { + bootstrap_servers_ = env; + const char* env_sasl = std::getenv("FLUSS_SASL_BOOTSTRAP_SERVERS"); + sasl_bootstrap_servers_ = (env_sasl && std::strlen(env_sasl) > 0) ? 
env_sasl : env; + return true; + } + + std::string cli_cmd = CliStartCmd(); + FILE* pipe = popen(cli_cmd.c_str(), "r"); + if (!pipe) { + std::cerr << "Failed to launch fluss-test-cluster binary" << std::endl; + return false; + } + std::string output; + char buf[512]; + while (fgets(buf, sizeof(buf), pipe)) output += buf; + int rc = pclose(pipe); + if (rc != 0) { + std::cerr << "fluss-test-cluster start failed (exit " << rc << "):\n" + << output << std::endl; + return false; + } + if (!ParseClusterJson(output, bootstrap_servers_, sasl_bootstrap_servers_)) { + std::cerr << "Failed to parse cluster JSON from:\n" << output << std::endl; + return false; + } + return true; + } + + static void StopAll() { + std::string cmd = FindCliBinary() + " stop --name " + kClusterName; + system(cmd.c_str()); + } + + const std::string& GetBootstrapServers() const { return bootstrap_servers_; } + const std::string& GetSaslBootstrapServers() const { return sasl_bootstrap_servers_; } + + private: + std::string bootstrap_servers_; + std::string sasl_bootstrap_servers_; +}; + +class FlussTestEnvironment : public ::testing::Environment { + public: + static FlussTestEnvironment* Instance() { + static FlussTestEnvironment* instance = nullptr; + if (!instance) { + instance = new FlussTestEnvironment(); + } + return instance; + } + + void SetUp() override { + if (!cluster_.Start()) { + GTEST_SKIP() << "Failed to start Fluss cluster. 
Skipping integration tests."; + } + + fluss::Configuration config; + config.bootstrap_servers = cluster_.GetBootstrapServers(); + auto result = fluss::Connection::Create(config, connection_); + if (!result.Ok()) { + GTEST_SKIP() << "Failed to connect: " << result.error_message; + } + auto admin_result = connection_.GetAdmin(admin_); + if (!admin_result.Ok()) { + GTEST_SKIP() << "Failed to get admin: " << admin_result.error_message; + } + } + + void TearDown() override {} + + fluss::Connection& GetConnection() { return connection_; } + fluss::Admin& GetAdmin() { return admin_; } + const std::string& GetBootstrapServers() { return cluster_.GetBootstrapServers(); } + const std::string& GetSaslBootstrapServers() { return cluster_.GetSaslBootstrapServers(); } + + private: + FlussTestEnvironment() = default; + + FlussTestCluster cluster_; + fluss::Connection connection_; + fluss::Admin admin_; +}; + +inline void CreateTable(fluss::Admin& admin, const fluss::TablePath& path, + const fluss::TableDescriptor& descriptor) { + admin.DropTable(path, true); // ignore if not exists + auto result = admin.CreateTable(path, descriptor, false); + ASSERT_OK(result); +} + +inline void CreatePartitions(fluss::Admin& admin, const fluss::TablePath& path, + const std::string& partition_column, + const std::vector& values) { + for (const auto& value : values) { + std::unordered_map spec; + spec[partition_column] = value; + auto result = admin.CreatePartition(path, spec, true); + ASSERT_OK(result); + } +} + +template +void PollRecords(fluss::LogScanner& scanner, size_t expected_count, ExtractFn extract_fn, + std::vector& out) { + auto deadline = std::chrono::steady_clock::now() + std::chrono::seconds(10); + while (out.size() < expected_count && std::chrono::steady_clock::now() < deadline) { + fluss::ScanRecords records; + ASSERT_OK(scanner.Poll(1000, records)); + for (auto rec : records) { + out.push_back(extract_fn(rec)); + } + } +} + +template +void PollRecordBatches(fluss::LogScanner& 
scanner, size_t expected_count, ExtractFn extract_fn, + std::vector& out) { + auto deadline = std::chrono::steady_clock::now() + std::chrono::seconds(10); + while (out.size() < expected_count && std::chrono::steady_clock::now() < deadline) { + fluss::ArrowRecordBatches batches; + ASSERT_OK(scanner.PollRecordBatch(1000, batches)); + auto items = extract_fn(batches); + out.insert(out.end(), items.begin(), items.end()); + } +} + +} // namespace fluss_test diff --git a/fluss-rust/bindings/elixir/.formatter.exs b/fluss-rust/bindings/elixir/.formatter.exs new file mode 100644 index 0000000000..dd63ff521c --- /dev/null +++ b/fluss-rust/bindings/elixir/.formatter.exs @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +[ + inputs: ["{mix,.formatter}.exs", "{config,lib,test}/**/*.{ex,exs}"] +] diff --git a/fluss-rust/bindings/elixir/.gitignore b/fluss-rust/bindings/elixir/.gitignore new file mode 100644 index 0000000000..90277ffb16 --- /dev/null +++ b/fluss-rust/bindings/elixir/.gitignore @@ -0,0 +1,9 @@ +# Elixir build artifacts +_build/ +deps/ + +# Generated NIF shared library +priv/native/ + +# Crash dumps +erl_crash.dump diff --git a/fluss-rust/bindings/elixir/README.md b/fluss-rust/bindings/elixir/README.md new file mode 100644 index 0000000000..656b03c51d --- /dev/null +++ b/fluss-rust/bindings/elixir/README.md @@ -0,0 +1,60 @@ +# Fluss Elixir Client + +Elixir client for [Apache Fluss (Incubating)](https://fluss.apache.org/), built on the official Rust client via [Rustler](https://github.com/rusterlium/rustler) NIFs. + +Currently supports **log tables** (append + scan). Primary key (KV) table support is planned. + +## Requirements + +- Elixir >= 1.15 +- Rust stable toolchain (for compiling the NIF) + +## Quick Start + +```elixir +config = Fluss.Config.new("localhost:9123") +conn = Fluss.Connection.new!(config) +admin = Fluss.Admin.new!(conn) + +schema = + Fluss.Schema.build() + |> Fluss.Schema.column("ts", :bigint) + |> Fluss.Schema.column("message", :string) + |> Fluss.Schema.build!() + +:ok = Fluss.Admin.create_table(admin, "my_db", "events", Fluss.TableDescriptor.new!(schema)) + +table = Fluss.Table.get!(conn, "my_db", "events") +writer = Fluss.AppendWriter.new!(table) +Fluss.AppendWriter.append(writer, [1_700_000_000, "hello"]) +:ok = Fluss.AppendWriter.flush(writer) + +scanner = Fluss.LogScanner.new!(table) +:ok = Fluss.LogScanner.subscribe(scanner, 0, Fluss.earliest_offset()) +:ok = Fluss.LogScanner.poll(scanner, 5_000) + +receive do + {:fluss_records, records} -> + for record <- records, do: IO.inspect(record[:row]) +end +``` + +## Data Types + +Simple: `:boolean`, `:tinyint`, `:smallint`, `:int`, `:bigint`, `:float`, `:double`, `:string`, `:bytes`, `:date`, 
`:time`, `:timestamp`, `:timestamp_ltz`
+
+Parameterized: `{:decimal, precision, scale}`, `{:char, length}`, `{:binary, length}`
+
+## Development
+
+```bash
+cd bindings/elixir
+mix test # unit tests
+mix test --include integration # starts Docker cluster
+```
+
+Set `FLUSS_BOOTSTRAP_SERVERS` to use an existing cluster.
+
+## License
+
+Apache License 2.0
diff --git a/fluss-rust/bindings/elixir/lib/fluss.ex b/fluss-rust/bindings/elixir/lib/fluss.ex
new file mode 100644
index 0000000000..25aa649160
--- /dev/null
+++ b/fluss-rust/bindings/elixir/lib/fluss.ex
@@ -0,0 +1,52 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+defmodule Fluss do
+  @moduledoc """
+  Elixir client for Apache Fluss (Incubating).
+
+  ## Examples
+
+      config = Fluss.Config.new("localhost:9123")
+      conn = Fluss.Connection.new!(config)
+      admin = Fluss.Admin.new!(conn)
+
+      schema =
+        Fluss.Schema.new()
+        |> Fluss.Schema.column("ts", :bigint)
+        |> Fluss.Schema.column("message", :string)
+
+      :ok = Fluss.Admin.create_table(admin, "my_db", "events", Fluss.TableDescriptor.new!(schema))
+
+      table = Fluss.Table.get!(conn, "my_db", "events")
+      writer = Fluss.AppendWriter.new!(table)
+      Fluss.AppendWriter.append(writer, [1_700_000_000, "hello"])
+      :ok = Fluss.AppendWriter.flush(writer)
+
+      scanner = Fluss.LogScanner.new!(table)
+      :ok = Fluss.LogScanner.subscribe(scanner, 0, Fluss.earliest_offset())
+      :ok = Fluss.LogScanner.poll(scanner, 5_000)
+      receive do
+        {:fluss_records, records} -> records
+      end
+
+  """
+
+  alias Fluss.Native
+
+  @doc """
+  Returns the value of `Fluss.Native.earliest_offset/0`.
+
+  The module example passes it as the third argument of
+  `Fluss.LogScanner.subscribe/3`. Going by the name it designates the
+  earliest available offset of a bucket; the concrete value is defined by
+  the native layer and is not visible here (confirm there before relying
+  on it).
+  """
+  def earliest_offset, do: Native.earliest_offset()
+end
diff --git a/fluss-rust/bindings/elixir/lib/fluss/admin.ex b/fluss-rust/bindings/elixir/lib/fluss/admin.ex
new file mode 100644
index 0000000000..6dbdb3a9c1
--- /dev/null
+++ b/fluss-rust/bindings/elixir/lib/fluss/admin.ex
@@ -0,0 +1,111 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+defmodule Fluss.Admin do
+  @moduledoc """
+  Admin client for DDL operations (create/drop databases and tables).
+
+  ## Examples
+
+      admin = Fluss.Admin.new!(conn)
+      :ok = Fluss.Admin.create_database(admin, "my_db")
+
+      schema = Fluss.Schema.new() |> Fluss.Schema.column("ts", :bigint)
+      descriptor = Fluss.TableDescriptor.new!(schema)
+      :ok = Fluss.Admin.create_table(admin, "my_db", "events", descriptor)
+
+  """
+
+  alias Fluss.Native
+
+  @type t :: reference()
+
+  @doc """
+  Creates an admin handle for `conn`.
+
+  An `{:error, _}` tuple from the native call is returned unchanged; any
+  other value is wrapped as `{:ok, admin}`.
+  """
+  @spec new(Fluss.Connection.t()) :: {:ok, t()} | {:error, Fluss.Error.t()}
+  def new(conn), do: wrap_result(Native.admin_new(conn))
+
+  @doc """
+  Same as `new/1`, but returns the bare handle and raises the
+  `Fluss.Error` on failure.
+  """
+  @spec new!(Fluss.Connection.t()) :: t()
+  def new!(conn) do
+    case new(conn) do
+      {:ok, handle} -> handle
+      {:error, %Fluss.Error{} = error} -> raise error
+    end
+  end
+
+  @doc """
+  Creates database `name`. `ignore_if_exists` defaults to `true`.
+  """
+  @spec create_database(t(), String.t(), boolean()) :: :ok | {:error, Fluss.Error.t()}
+  def create_database(admin, name, ignore_if_exists \\ true) do
+    Native.await_nif(Native.admin_create_database(admin, name, ignore_if_exists))
+  end
+
+  @doc """
+  Drops database `name`. `ignore_if_not_exists` defaults to `true`.
+  """
+  @spec drop_database(t(), String.t(), boolean()) :: :ok | {:error, Fluss.Error.t()}
+  def drop_database(admin, name, ignore_if_not_exists \\ true) do
+    Native.await_nif(Native.admin_drop_database(admin, name, ignore_if_not_exists))
+  end
+
+  @doc """
+  Lists the database names known to the cluster.
+  """
+  @spec list_databases(t()) :: {:ok, [String.t()]} | {:error, Fluss.Error.t()}
+  def list_databases(admin) do
+    Native.await_nif(Native.admin_list_databases(admin))
+  end
+
+  @doc """
+  Same as `list_databases/1`, but returns the bare list and raises the
+  `Fluss.Error` on failure.
+  """
+  @spec list_databases!(t()) :: [String.t()]
+  def list_databases!(admin) do
+    case list_databases(admin) do
+      {:ok, names} -> names
+      {:error, %Fluss.Error{} = error} -> raise error
+    end
+  end
+
+  @doc """
+  Creates `table` in `database` from `descriptor`. `ignore_if_exists`
+  defaults to `true`.
+  """
+  @spec create_table(t(), String.t(), String.t(), Fluss.TableDescriptor.t(), boolean()) ::
+          :ok | {:error, Fluss.Error.t()}
+  def create_table(admin, database, table, descriptor, ignore_if_exists \\ true) do
+    Native.await_nif(
+      Native.admin_create_table(admin, database, table, descriptor, ignore_if_exists)
+    )
+  end
+
+  @doc """
+  Drops `table` from `database`. `ignore_if_not_exists` defaults to `true`.
+  """
+  @spec drop_table(t(), String.t(), String.t(), boolean()) :: :ok | {:error, Fluss.Error.t()}
+  def drop_table(admin, database, table, ignore_if_not_exists \\ true) do
+    Native.await_nif(Native.admin_drop_table(admin, database, table, ignore_if_not_exists))
+  end
+
+  @doc """
+  Lists the table names in `database`.
+  """
+  @spec list_tables(t(), String.t()) :: {:ok, [String.t()]} | {:error, Fluss.Error.t()}
+  def list_tables(admin, database) do
+    Native.await_nif(Native.admin_list_tables(admin, database))
+  end
+
+  @doc """
+  Same as `list_tables/2`, but returns the bare list and raises the
+  `Fluss.Error` on failure.
+  """
+  @spec list_tables!(t(), String.t()) :: [String.t()]
+  def list_tables!(admin, database) do
+    case list_tables(admin, database) do
+      {:ok, names} -> names
+      {:error, %Fluss.Error{} = error} -> raise error
+    end
+  end
+
+  # Passes error tuples through and tags every other native return as :ok.
+  defp wrap_result({:error, _} = error), do: error
+  defp wrap_result(resource), do: {:ok, resource}
+end
diff --git a/fluss-rust/bindings/elixir/lib/fluss/append_writer.ex b/fluss-rust/bindings/elixir/lib/fluss/append_writer.ex
new file mode 100644
index 0000000000..5dddbf7b1d
--- /dev/null
+++ b/fluss-rust/bindings/elixir/lib/fluss/append_writer.ex
@@ -0,0 +1,75 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+defmodule Fluss.AppendWriter do
+  @moduledoc """
+  Writer for appending records to a log table.
+
+  Values are passed as a list in column order. Use `nil` for null values.
+  `append/2` returns a `Fluss.WriteHandle` — drop it for fire-and-forget,
+  or call `Fluss.WriteHandle.wait/1` for per-record acknowledgment.
+
+  ## Examples
+
+      writer = Fluss.AppendWriter.new!(table)
+
+      # Fire-and-forget
+      Fluss.AppendWriter.append(writer, [1_700_000_000, "hello"])
+      Fluss.AppendWriter.append(writer, [1_700_000_001, "world"])
+      :ok = Fluss.AppendWriter.flush(writer)
+
+      # Per-record ack
+      {:ok, handle} = Fluss.AppendWriter.append(writer, [1_700_000_002, "critical"])
+      :ok = Fluss.WriteHandle.wait(handle)
+
+  """
+
+  alias Fluss.Native
+
+  @type t :: reference()
+
+  @doc """
+  Creates an append writer for `table`.
+
+  An `{:error, _}` tuple from the native call is returned unchanged; any
+  other value is wrapped as `{:ok, writer}`.
+  """
+  @spec new(Fluss.Table.t()) :: {:ok, t()} | {:error, Fluss.Error.t()}
+  def new(table), do: wrap_result(Native.append_writer_new(table))
+
+  @doc """
+  Same as `new/1`, but returns the bare writer and raises the
+  `Fluss.Error` on failure.
+  """
+  @spec new!(Fluss.Table.t()) :: t()
+  def new!(table) do
+    case new(table) do
+      {:ok, handle} -> handle
+      {:error, %Fluss.Error{} = error} -> raise error
+    end
+  end
+
+  @doc """
+  Appends one row, given as a list of `values` in column order, and
+  returns `{:ok, write_handle}` or the native `{:error, _}` tuple.
+  """
+  @spec append(t(), list()) :: {:ok, Fluss.WriteHandle.t()} | {:error, Fluss.Error.t()}
+  def append(writer, values) when is_list(values) do
+    wrap_result(Native.append_writer_append(writer, values))
+  end
+
+  @doc """
+  Flushes the writer and returns the awaited result of the native call.
+  """
+  @spec flush(t()) :: :ok | {:error, Fluss.Error.t()}
+  def flush(writer) do
+    Native.await_nif(Native.append_writer_flush(writer))
+  end
+
+  # Passes error tuples through and tags every other native return as :ok.
+  defp wrap_result({:error, _} = error), do: error
+  defp wrap_result(resource), do: {:ok, resource}
+end
diff --git a/fluss-rust/bindings/elixir/lib/fluss/config.ex b/fluss-rust/bindings/elixir/lib/fluss/config.ex
new file mode 100644
index 0000000000..8aaacf7993
--- /dev/null
+++ b/fluss-rust/bindings/elixir/lib/fluss/config.ex
@@ -0,0 +1,239 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.
defmodule Fluss.Config do
  @moduledoc """
  Client configuration for connecting to a Fluss cluster.

  Fields left as `nil` use the client's defaults.

  Every setter returns an updated copy of the struct, so calls can be piped.
  Setters for numeric options accept only non-negative integers, matching
  their typespecs; any other value raises `FunctionClauseError` at the call
  site.

  ## Examples

      config = Fluss.Config.new("localhost:9123")

      config =
        Fluss.Config.new("host1:9123,host2:9123")
        |> Fluss.Config.set_writer_batch_size(1_048_576)

  """

  @enforce_keys [:bootstrap_servers]
  defstruct bootstrap_servers: nil,
            connect_timeout_ms: nil,
            remote_file_download_thread_num: nil,
            scanner_log_fetch_max_bytes: nil,
            scanner_log_fetch_max_bytes_for_bucket: nil,
            scanner_log_fetch_min_bytes: nil,
            scanner_log_fetch_wait_max_time_ms: nil,
            scanner_log_max_poll_records: nil,
            scanner_remote_log_prefetch_num: nil,
            scanner_remote_log_read_concurrency: nil,
            security_protocol: nil,
            security_sasl_mechanism: nil,
            security_sasl_password: nil,
            security_sasl_username: nil,
            writer_acks: nil,
            writer_batch_size: nil,
            writer_batch_timeout_ms: nil,
            writer_bucket_no_key_assigner: nil,
            writer_buffer_memory_size: nil,
            writer_buffer_wait_timeout_ms: nil,
            writer_dynamic_batch_size_enabled: nil,
            writer_dynamic_batch_size_min: nil,
            writer_enable_idempotence: nil,
            writer_max_inflight_requests_per_bucket: nil,
            writer_request_max_size: nil,
            writer_retries: nil

  @type t :: %__MODULE__{
          bootstrap_servers: String.t(),
          connect_timeout_ms: non_neg_integer() | nil,
          remote_file_download_thread_num: non_neg_integer() | nil,
          scanner_log_fetch_max_bytes: non_neg_integer() | nil,
          scanner_log_fetch_max_bytes_for_bucket: non_neg_integer() | nil,
          scanner_log_fetch_min_bytes: non_neg_integer() | nil,
          scanner_log_fetch_wait_max_time_ms: non_neg_integer() | nil,
          scanner_log_max_poll_records: non_neg_integer() | nil,
          scanner_remote_log_prefetch_num: non_neg_integer() | nil,
          scanner_remote_log_read_concurrency: non_neg_integer() | nil,
          security_protocol: String.t() | nil,
          security_sasl_mechanism: String.t() | nil,
          security_sasl_password: String.t() | nil,
          security_sasl_username: String.t() | nil,
          writer_acks: String.t() | nil,
          writer_batch_size: non_neg_integer() | nil,
          writer_batch_timeout_ms: non_neg_integer() | nil,
          writer_bucket_no_key_assigner: :sticky | :round_robin | nil,
          writer_buffer_memory_size: non_neg_integer() | nil,
          writer_buffer_wait_timeout_ms: non_neg_integer() | nil,
          writer_dynamic_batch_size_enabled: boolean() | nil,
          writer_dynamic_batch_size_min: non_neg_integer() | nil,
          writer_enable_idempotence: boolean() | nil,
          writer_max_inflight_requests_per_bucket: non_neg_integer() | nil,
          writer_request_max_size: non_neg_integer() | nil,
          writer_retries: non_neg_integer() | nil
        }

  # Single guard shared by every numeric setter so the runtime check agrees
  # with the `non_neg_integer()` typespecs instead of accepting any integer.
  defguardp is_non_neg_integer(value) when is_integer(value) and value >= 0

  @doc "Builds a config with the given bootstrap servers; every other field is `nil`."
  @spec new(String.t()) :: t()
  def new(bootstrap_servers) when is_binary(bootstrap_servers) do
    %__MODULE__{bootstrap_servers: bootstrap_servers}
  end

  @doc "Builds a config whose bootstrap servers are the empty string."
  @spec default() :: t()
  def default, do: %__MODULE__{bootstrap_servers: ""}

  @doc "Sets `:bootstrap_servers`."
  @spec set_bootstrap_servers(t(), String.t()) :: t()
  def set_bootstrap_servers(%__MODULE__{} = config, servers) when is_binary(servers),
    do: %{config | bootstrap_servers: servers}

  @doc "Sets `:connect_timeout_ms`."
  @spec set_connect_timeout_ms(t(), non_neg_integer()) :: t()
  def set_connect_timeout_ms(%__MODULE__{} = config, ms) when is_non_neg_integer(ms),
    do: %{config | connect_timeout_ms: ms}

  @doc "Sets `:remote_file_download_thread_num`."
  @spec set_remote_file_download_thread_num(t(), non_neg_integer()) :: t()
  def set_remote_file_download_thread_num(%__MODULE__{} = config, threads)
      when is_non_neg_integer(threads),
      do: %{config | remote_file_download_thread_num: threads}

  @doc "Sets `:scanner_log_fetch_max_bytes`."
  @spec set_scanner_log_fetch_max_bytes(t(), non_neg_integer()) :: t()
  def set_scanner_log_fetch_max_bytes(%__MODULE__{} = config, max_bytes)
      when is_non_neg_integer(max_bytes),
      do: %{config | scanner_log_fetch_max_bytes: max_bytes}

  @doc "Sets `:scanner_log_fetch_max_bytes_for_bucket`."
  @spec set_scanner_log_fetch_max_bytes_for_bucket(t(), non_neg_integer()) :: t()
  def set_scanner_log_fetch_max_bytes_for_bucket(%__MODULE__{} = config, max_bytes)
      when is_non_neg_integer(max_bytes),
      do: %{config | scanner_log_fetch_max_bytes_for_bucket: max_bytes}

  @doc "Sets `:scanner_log_fetch_min_bytes`."
  @spec set_scanner_log_fetch_min_bytes(t(), non_neg_integer()) :: t()
  def set_scanner_log_fetch_min_bytes(%__MODULE__{} = config, min_bytes)
      when is_non_neg_integer(min_bytes),
      do: %{config | scanner_log_fetch_min_bytes: min_bytes}

  @doc "Sets `:scanner_log_fetch_wait_max_time_ms`."
  @spec set_scanner_log_fetch_wait_max_time_ms(t(), non_neg_integer()) :: t()
  def set_scanner_log_fetch_wait_max_time_ms(%__MODULE__{} = config, wait_ms)
      when is_non_neg_integer(wait_ms),
      do: %{config | scanner_log_fetch_wait_max_time_ms: wait_ms}

  @doc "Sets `:scanner_log_max_poll_records`."
  @spec set_scanner_log_max_poll_records(t(), non_neg_integer()) :: t()
  def set_scanner_log_max_poll_records(%__MODULE__{} = config, num)
      when is_non_neg_integer(num),
      do: %{config | scanner_log_max_poll_records: num}

  @doc "Sets `:scanner_remote_log_prefetch_num`."
  @spec set_scanner_remote_log_prefetch_num(t(), non_neg_integer()) :: t()
  def set_scanner_remote_log_prefetch_num(%__MODULE__{} = config, num)
      when is_non_neg_integer(num),
      do: %{config | scanner_remote_log_prefetch_num: num}

  @doc "Sets `:scanner_remote_log_read_concurrency`."
  @spec set_scanner_remote_log_read_concurrency(t(), non_neg_integer()) :: t()
  def set_scanner_remote_log_read_concurrency(%__MODULE__{} = config, concurrency)
      when is_non_neg_integer(concurrency),
      do: %{config | scanner_remote_log_read_concurrency: concurrency}

  @doc "Sets `:security_protocol`."
  @spec set_security_protocol(t(), String.t()) :: t()
  def set_security_protocol(%__MODULE__{} = config, protocol) when is_binary(protocol),
    do: %{config | security_protocol: protocol}

  @doc "Sets `:security_sasl_mechanism`."
  @spec set_security_sasl_mechanism(t(), String.t()) :: t()
  def set_security_sasl_mechanism(%__MODULE__{} = config, mechanism) when is_binary(mechanism),
    do: %{config | security_sasl_mechanism: mechanism}

  @doc "Sets `:security_sasl_password`. The value is redacted when the config is inspected."
  @spec set_security_sasl_password(t(), String.t()) :: t()
  def set_security_sasl_password(%__MODULE__{} = config, pass) when is_binary(pass),
    do: %{config | security_sasl_password: pass}

  @doc "Sets `:security_sasl_username`."
  @spec set_security_sasl_username(t(), String.t()) :: t()
  def set_security_sasl_username(%__MODULE__{} = config, username) when is_binary(username),
    do: %{config | security_sasl_username: username}

  @doc "Sets `:writer_acks`."
  @spec set_writer_acks(t(), String.t()) :: t()
  def set_writer_acks(%__MODULE__{} = config, acks) when is_binary(acks),
    do: %{config | writer_acks: acks}

  @doc "Sets `:writer_batch_size`."
  @spec set_writer_batch_size(t(), non_neg_integer()) :: t()
  def set_writer_batch_size(%__MODULE__{} = config, size) when is_non_neg_integer(size),
    do: %{config | writer_batch_size: size}

  @doc "Sets `:writer_batch_timeout_ms`."
  @spec set_writer_batch_timeout_ms(t(), non_neg_integer()) :: t()
  def set_writer_batch_timeout_ms(%__MODULE__{} = config, ms) when is_non_neg_integer(ms),
    do: %{config | writer_batch_timeout_ms: ms}

  @doc "Sets `:writer_bucket_no_key_assigner` to `:sticky` or `:round_robin`."
  @spec set_writer_bucket_no_key_assigner(t(), :sticky | :round_robin) :: t()
  def set_writer_bucket_no_key_assigner(%__MODULE__{} = config, assigner)
      when assigner in [:sticky, :round_robin],
      do: %{config | writer_bucket_no_key_assigner: assigner}

  @doc "Sets `:writer_buffer_memory_size`."
  @spec set_writer_buffer_memory_size(t(), non_neg_integer()) :: t()
  def set_writer_buffer_memory_size(%__MODULE__{} = config, size)
      when is_non_neg_integer(size),
      do: %{config | writer_buffer_memory_size: size}

  @doc "Sets `:writer_buffer_wait_timeout_ms`."
  @spec set_writer_buffer_wait_timeout_ms(t(), non_neg_integer()) :: t()
  def set_writer_buffer_wait_timeout_ms(%__MODULE__{} = config, ms)
      when is_non_neg_integer(ms),
      do: %{config | writer_buffer_wait_timeout_ms: ms}

  @doc "Sets `:writer_dynamic_batch_size_enabled`."
  @spec set_writer_dynamic_batch_size_enabled(t(), boolean()) :: t()
  def set_writer_dynamic_batch_size_enabled(%__MODULE__{} = config, enabled)
      when is_boolean(enabled),
      do: %{config | writer_dynamic_batch_size_enabled: enabled}

  @doc "Sets `:writer_dynamic_batch_size_min`."
  @spec set_writer_dynamic_batch_size_min(t(), non_neg_integer()) :: t()
  def set_writer_dynamic_batch_size_min(%__MODULE__{} = config, size)
      when is_non_neg_integer(size),
      do: %{config | writer_dynamic_batch_size_min: size}

  @doc "Sets `:writer_enable_idempotence`."
  @spec set_writer_enable_idempotence(t(), boolean()) :: t()
  def set_writer_enable_idempotence(%__MODULE__{} = config, enabled)
      when is_boolean(enabled),
      do: %{config | writer_enable_idempotence: enabled}

  @doc "Sets `:writer_max_inflight_requests_per_bucket`."
  @spec set_writer_max_inflight_requests_per_bucket(t(), non_neg_integer()) :: t()
  def set_writer_max_inflight_requests_per_bucket(%__MODULE__{} = config, n)
      when is_non_neg_integer(n),
      do: %{config | writer_max_inflight_requests_per_bucket: n}

  @doc "Sets `:writer_request_max_size`."
  @spec set_writer_request_max_size(t(), non_neg_integer()) :: t()
  def set_writer_request_max_size(%__MODULE__{} = config, size)
      when is_non_neg_integer(size),
      do: %{config | writer_request_max_size: size}

  @doc "Sets `:writer_retries`."
  @spec set_writer_retries(t(), non_neg_integer()) :: t()
  def set_writer_retries(%__MODULE__{} = config, n) when is_non_neg_integer(n),
    do: %{config | writer_retries: n}

  @doc "Returns the configured `:bootstrap_servers` string."
  @spec get_bootstrap_servers(t()) :: String.t()
  def get_bootstrap_servers(%__MODULE__{bootstrap_servers: servers}), do: servers
end

defimpl Inspect, for: Fluss.Config do
  import Inspect.Algebra

  # Renders the struct as `%Fluss.Config{...}`, but swaps a configured SASL
  # password for a placeholder first so it never ends up in logs or IEx output.
  def inspect(%Fluss.Config{} = config, opts) do
    sanitized = %{config | security_sasl_password: redact(config.security_sasl_password)}

    fields = sanitized |> Map.from_struct() |> Map.to_list()

    container_doc(
      "%Fluss.Config{",
      fields,
      "}",
      opts,
      fn {key, value}, opts ->
        concat([Atom.to_string(key), ": ", to_doc(value, opts)])
      end,
      separator: ","
    )
  end

  # An unset password stays `nil` so the output still shows it was never configured.
  defp redact(nil), do: nil
  defp redact(_), do: "[REDACTED]"
end
defmodule Fluss.Connection do
  @moduledoc """
  Handle to a Fluss cluster connection.

  Failures are reported per operation rather than per connection. When the
  server cannot be reached, the individual call fails while the connection
  itself recovers transparently, so it never has to be recreated.

  ## Examples

      config = Fluss.Config.new("localhost:9123")
      {:ok, conn} = Fluss.Connection.new(config)

  """

  alias Fluss.Native

  @type t :: reference()

  @doc """
  Opens a connection described by `config`.

  Hands the config to the native layer and then waits for its reply.
  """
  @spec new(Fluss.Config.t()) :: {:ok, t()} | {:error, Fluss.Error.t()}
  def new(%Fluss.Config{} = config) do
    pending = Native.connection_new(config)
    Native.await_nif(pending)
  end

  @doc """
  Same as `new/1`, but returns the bare connection and raises the
  `Fluss.Error` on failure.
  """
  @spec new!(Fluss.Config.t()) :: t()
  def new!(%Fluss.Config{} = config) do
    case new(config) do
      {:error, %Fluss.Error{} = reason} -> raise reason
      {:ok, connection} -> connection
    end
  end
end
defmodule Fluss.Error do
  @moduledoc """
  Structured error returned from Fluss operations.

  Fields:

    * `:code` — stable atom intended for pattern matching.
    * `:error_code` — the raw integer code. Protocol codes are `0..57`;
      `-1` means `:unknown_server_error` and `-2` means `:client_error`.
    * `:message` — human-readable description.

  The struct is also an exception, so `raise err` works.

  `:client_error` covers every failure that did not originate from the server
  API (bad input, transport, I/O, decode, consumed write handle, etc.). It is
  not retriable, matching the Python and C++ bindings.
  """

  defexception [:code, :error_code, :message]

  @typedoc "Error code atom."
  @type code ::
          :none
          | :unknown_server_error
          | :network_exception
          | :unsupported_version
          | :corrupt_message
          | :database_not_exist
          | :database_not_empty
          | :database_already_exist
          | :table_not_exist
          | :table_already_exist
          | :schema_not_exist
          | :log_storage_exception
          | :kv_storage_exception
          | :not_leader_or_follower
          | :record_too_large_exception
          | :corrupt_record_exception
          | :invalid_table_exception
          | :invalid_database_exception
          | :invalid_replication_factor
          | :invalid_required_acks
          | :log_offset_out_of_range_exception
          | :non_primary_key_table_exception
          | :unknown_table_or_bucket_exception
          | :invalid_update_version_exception
          | :invalid_coordinator_exception
          | :fenced_leader_epoch_exception
          | :request_time_out
          | :storage_exception
          | :operation_not_attempted_exception
          | :not_enough_replicas_after_append_exception
          | :not_enough_replicas_exception
          | :security_token_exception
          | :out_of_order_sequence_exception
          | :duplicate_sequence_exception
          | :unknown_writer_id_exception
          | :invalid_column_projection
          | :invalid_target_column
          | :partition_not_exists
          | :table_not_partitioned_exception
          | :invalid_timestamp_exception
          | :invalid_config_exception
          | :lake_storage_not_configured_exception
          | :kv_snapshot_not_exist
          | :partition_already_exists
          | :partition_spec_invalid_exception
          | :leader_not_available_exception
          | :partition_max_num_exception
          | :authenticate_exception
          | :security_disabled_exception
          | :authorization_exception
          | :bucket_max_num_exception
          | :fenced_tiering_epoch_exception
          | :retriable_authenticate_exception
          | :invalid_server_rack_info_exception
          | :lake_snapshot_not_exist
          | :lake_table_already_exist
          | :ineligible_replica_exception
          | :invalid_alter_table_exception
          | :deletion_disabled_exception
          | :client_error

  @type t :: %__MODULE__{code: code(), error_code: integer(), message: String.t()}

  # Codes for which `retriable?/1` answers `true`.
  @retriable_codes ~w(
    network_exception
    corrupt_message
    schema_not_exist
    log_storage_exception
    kv_storage_exception
    not_leader_or_follower
    corrupt_record_exception
    unknown_table_or_bucket_exception
    request_time_out
    storage_exception
    not_enough_replicas_after_append_exception
    not_enough_replicas_exception
    leader_not_available_exception
  )a

  @impl true
  def message(%__MODULE__{} = error) do
    "Fluss error [#{error.code}]: #{error.message}"
  end

  @doc "Returns `true` if retrying the operation may succeed."
  @spec retriable?(t()) :: boolean()
  def retriable?(%__MODULE__{code: code}) when code in @retriable_codes, do: true
  def retriable?(%__MODULE__{}), do: false
end

defmodule Fluss.LogScanner do
  @moduledoc """
  Scanner that reads records from a log table.

  `poll/2` does not block: it returns `:ok` right away and the outcome is
  delivered to the calling process as `{:fluss_records, records}` or
  `{:fluss_poll_error, %Fluss.Error{}}`. No dirty scheduler threads are held
  during the wait.

  Every record is an atom-keyed map with `:offset`, `:timestamp`,
  `:change_type` and `:row`. Row values are atom-keyed as well (column names
  are interned as atoms).

  ## Examples

      scanner = Fluss.LogScanner.new!(table)
      :ok = Fluss.LogScanner.subscribe(scanner, 0, Fluss.earliest_offset())
      :ok = Fluss.LogScanner.poll(scanner, 5_000)

      receive do
        {:fluss_records, records} ->
          for record <- records, do: IO.inspect(record[:row])
        {:fluss_poll_error, %Fluss.Error{code: code, message: msg}} ->
          IO.puts("poll error [\#{code}]: \#{msg}")
      end

  """

  alias Fluss.Native

  @type t :: reference()
  @type record :: %{atom() => term()}

  @doc "Creates a scanner for `table`, wrapping the native handle in `{:ok, scanner}`."
  @spec new(Fluss.Table.t()) :: {:ok, t()} | {:error, Fluss.Error.t()}
  def new(table) do
    table
    |> Native.log_scanner_new()
    |> wrap_scanner()
  end

  @doc "Same as `new/1`, but returns the bare scanner and raises the `Fluss.Error` on failure."
  @spec new!(Fluss.Table.t()) :: t()
  def new!(table) do
    case new(table) do
      {:error, %Fluss.Error{} = reason} -> raise reason
      {:ok, scanner} -> scanner
    end
  end

  @doc "Subscribes the scanner to `bucket` starting at `offset` and waits for the native reply."
  @spec subscribe(t(), integer(), integer()) :: :ok | {:error, Fluss.Error.t()}
  def subscribe(scanner, bucket, offset) do
    pending = Native.log_scanner_subscribe(scanner, bucket, offset)
    Native.await_nif(pending)
  end

  @doc """
  Subscribes to multiple buckets. Takes a list of `{bucket_id, offset}` tuples.
  """
  @spec subscribe_buckets(t(), [{integer(), integer()}]) :: :ok | {:error, Fluss.Error.t()}
  def subscribe_buckets(scanner, bucket_offsets) when is_list(bucket_offsets) do
    pending = Native.log_scanner_subscribe_buckets(scanner, bucket_offsets)
    Native.await_nif(pending)
  end

  @doc "Removes the scanner's subscription to `bucket` and waits for the native reply."
  @spec unsubscribe(t(), integer()) :: :ok | {:error, Fluss.Error.t()}
  def unsubscribe(scanner, bucket) do
    pending = Native.log_scanner_unsubscribe(scanner, bucket)
    Native.await_nif(pending)
  end

  @doc """
  Starts a non-blocking poll. Returns `:ok` immediately.
  Results arrive as `{:fluss_records, [record]}` or
  `{:fluss_poll_error, %Fluss.Error{}}`.
  """
  @spec poll(t(), non_neg_integer()) :: :ok
  def poll(scanner, timeout_ms) do
    Native.log_scanner_poll(scanner, timeout_ms)
  end

  # An error tuple from the NIF is passed through; anything else is the scanner handle.
  defp wrap_scanner({:error, _} = failure), do: failure
  defp wrap_scanner(scanner), do: {:ok, scanner}
end
defmodule Fluss.Native do
  @moduledoc false
  use Rustler, otp_app: :fluss, crate: "fluss_nif"

  # Every function below except `await_nif/1` is a placeholder that raises
  # `:nif_not_loaded`, following the Rustler stub convention.

  # --- Connection ---
  def connection_new(_cfg), do: :erlang.nif_error(:nif_not_loaded)

  # --- Admin ---
  def admin_new(_connection), do: :erlang.nif_error(:nif_not_loaded)

  def admin_create_database(_admin, _database, _ignore_if_exists),
    do: :erlang.nif_error(:nif_not_loaded)

  def admin_drop_database(_admin, _database, _ignore_if_not_exists),
    do: :erlang.nif_error(:nif_not_loaded)

  def admin_list_databases(_admin), do: :erlang.nif_error(:nif_not_loaded)

  def admin_create_table(_admin, _database, _table, _descriptor, _ignore_if_exists),
    do: :erlang.nif_error(:nif_not_loaded)

  def admin_drop_table(_admin, _database, _table, _ignore_if_not_exists),
    do: :erlang.nif_error(:nif_not_loaded)

  def admin_list_tables(_admin, _database), do: :erlang.nif_error(:nif_not_loaded)

  # --- Schema / TableDescriptor ---
  def table_descriptor_new(_schema, _bucket_count, _properties),
    do: :erlang.nif_error(:nif_not_loaded)

  # --- Table ---
  def table_get(_connection, _database, _table), do: :erlang.nif_error(:nif_not_loaded)
  def table_has_primary_key(_table), do: :erlang.nif_error(:nif_not_loaded)
  def table_column_names(_table), do: :erlang.nif_error(:nif_not_loaded)

  # --- AppendWriter ---
  def append_writer_new(_table), do: :erlang.nif_error(:nif_not_loaded)
  def append_writer_append(_writer, _row), do: :erlang.nif_error(:nif_not_loaded)
  def append_writer_flush(_writer), do: :erlang.nif_error(:nif_not_loaded)

  # --- LogScanner ---
  def log_scanner_new(_table), do: :erlang.nif_error(:nif_not_loaded)

  def log_scanner_subscribe(_scanner, _bucket, _offset),
    do: :erlang.nif_error(:nif_not_loaded)

  def log_scanner_subscribe_buckets(_scanner, _bucket_offsets),
    do: :erlang.nif_error(:nif_not_loaded)

  def log_scanner_unsubscribe(_scanner, _bucket), do: :erlang.nif_error(:nif_not_loaded)
  def log_scanner_poll(_scanner, _timeout_ms), do: :erlang.nif_error(:nif_not_loaded)

  # --- WriteHandle ---
  def write_handle_wait(_handle), do: :erlang.nif_error(:nif_not_loaded)

  # --- Constants ---
  def earliest_offset, do: :erlang.nif_error(:nif_not_loaded)

  # Waits, with no timeout, for a `{ref, reply}` message tagged with `ref`
  # and returns `reply`.
  @doc false
  def await_nif(ref) do
    receive do
      {^ref, reply} -> reply
    end
  end
end

defmodule Fluss.Schema do
  @moduledoc """
  Schema definition for a Fluss table.

  Simple types: `:boolean`, `:tinyint`, `:smallint`, `:int`, `:bigint`,
  `:float`, `:double`, `:string`, `:bytes`, `:date`, `:time`, `:timestamp`, `:timestamp_ltz`

  Parameterized types: `{:decimal, precision, scale}`, `{:char, length}`, `{:binary, length}`

  Columns are kept in the order in which they were added.

  ## Examples

      schema =
        Fluss.Schema.new()
        |> Fluss.Schema.column("id", :int)
        |> Fluss.Schema.column("name", :string)
        |> Fluss.Schema.column("amount", {:decimal, 10, 2})

  """

  defstruct columns: [], primary_key: []

  @type data_type ::
          :boolean
          | :tinyint
          | :smallint
          | :int
          | :bigint
          | :float
          | :double
          | :string
          | :bytes
          | :date
          | :time
          | :timestamp
          | :timestamp_ltz
          | {:decimal, non_neg_integer(), non_neg_integer()}
          | {:char, non_neg_integer()}
          | {:binary, non_neg_integer()}

  @type t :: %__MODULE__{
          columns: [{String.t(), data_type()}],
          primary_key: [String.t()]
        }

  @doc "Returns an empty schema: no columns and no primary key."
  @spec new() :: t()
  def new do
    %__MODULE__{}
  end

  @doc "Appends a `{name, data_type}` column after the columns already present."
  @spec column(t(), String.t(), data_type()) :: t()
  def column(%__MODULE__{columns: existing} = schema, name, data_type) when is_binary(name) do
    %__MODULE__{schema | columns: existing ++ [{name, data_type}]}
  end

  @doc "Replaces the primary key with the given list of column names."
  @spec primary_key(t(), [String.t()]) :: t()
  def primary_key(%__MODULE__{} = schema, keys) when is_list(keys) do
    %__MODULE__{schema | primary_key: keys}
  end
end
defmodule Fluss.Table do
  @moduledoc """
  A handle to a Fluss table, used to create writers and scanners.
  """

  alias Fluss.Native

  @type t :: reference()

  @doc "Looks up `table` in `database` over `conn` and waits for the native reply."
  @spec get(Fluss.Connection.t(), String.t(), String.t()) ::
          {:ok, t()} | {:error, Fluss.Error.t()}
  def get(conn, database, table) do
    pending = Native.table_get(conn, database, table)
    Native.await_nif(pending)
  end

  @doc "Same as `get/3`, but returns the bare handle and raises the `Fluss.Error` on failure."
  @spec get!(Fluss.Connection.t(), String.t(), String.t()) :: t()
  def get!(conn, database, table) do
    case get(conn, database, table) do
      {:error, %Fluss.Error{} = reason} -> raise reason
      {:ok, handle} -> handle
    end
  end

  @doc "Returns whatever the native layer reports for the table's primary-key flag."
  @spec has_primary_key?(t()) :: boolean()
  def has_primary_key?(table) do
    Native.table_has_primary_key(table)
  end

  @doc "Returns the table's column names as reported by the native layer."
  @spec column_names(t()) :: [String.t()]
  def column_names(table) do
    Native.table_column_names(table)
  end
end
defmodule Fluss.TableDescriptor do
  @moduledoc """
  Descriptor used when creating a Fluss table.

  Options: `:bucket_count`, `:properties` (list of `{key, value}` string tuples).

  ## Examples

      Fluss.TableDescriptor.new!(schema)
      Fluss.TableDescriptor.new!(schema, bucket_count: 3)

  """

  alias Fluss.Native

  @type t :: reference()

  @doc """
  Builds a descriptor from `schema` and `opts`.

  `:bucket_count` is passed as `nil` when absent and `:properties` defaults to
  `[]`. Raises the `Fluss.Error` if the native layer returns one.
  """
  @spec new!(Fluss.Schema.t(), keyword()) :: t()
  def new!(%Fluss.Schema{} = schema, opts \\ []) do
    schema
    |> Native.table_descriptor_new(
      Keyword.get(opts, :bucket_count),
      Keyword.get(opts, :properties, [])
    )
    |> case do
      {:error, %Fluss.Error{} = reason} -> raise reason
      descriptor -> descriptor
    end
  end
end
defmodule Fluss.WriteHandle do
  @moduledoc """
  Handle for a pending write operation.

  Returned by `Fluss.AppendWriter.append/2`. Drop it for fire-and-forget, or
  call `wait/1` to get per-record server acknowledgment.
  """

  alias Fluss.Native

  @type t :: reference()

  @doc "Waits for the outcome of the write behind `handle`."
  @spec wait(t()) :: :ok | {:error, Fluss.Error.t()}
  def wait(handle) do
    pending = Native.write_handle_wait(handle)
    Native.await_nif(pending)
  end

  @doc "Same as `wait/1`, but raises the `Fluss.Error` instead of returning it."
  @spec wait!(t()) :: :ok
  def wait!(handle) do
    case wait(handle) do
      {:error, %Fluss.Error{} = reason} -> raise reason
      :ok -> :ok
    end
  end
end
defmodule Fluss.MixProject do
  use Mix.Project

  @version "1.0.0"

  # Mix project definition for the `:fluss` application.
  def project do
    env = Mix.env()

    [
      app: :fluss,
      version: @version,
      elixir: "~> 1.15",
      start_permanent: env == :prod,
      elixirc_paths: elixirc_paths(env),
      deps: deps(),
      description: "Elixir client for Apache Fluss",
      package: package()
    ]
  end

  # OTP application settings; only `:logger` is requested as an extra application.
  def application, do: [extra_applications: [:logger]]

  # The test environment additionally compiles the helpers under test/support.
  defp elixirc_paths(env) do
    case env do
      :test -> ["lib", "test/support"]
      _other -> ["lib"]
    end
  end

  # ex_doc and credo are tooling only (`runtime: false`).
  defp deps do
    [
      {:rustler, "~> 0.37"},
      {:ex_doc, "~> 0.31", only: :dev, runtime: false},
      {:credo, "~> 1.7", only: [:dev, :test], runtime: false}
    ]
  end

  # Package metadata: license and project links.
  defp package do
    [
      licenses: ["Apache-2.0"],
      links: %{"GitHub" => "https://github.com/apache/fluss-rust"}
    ]
  end
end
:earmark_parser, repo: "hexpm", optional: false]}, {:makeup_c, ">= 0.1.0", [hex: :makeup_c, repo: "hexpm", optional: true]}, {:makeup_elixir, "~> 0.14 or ~> 1.0", [hex: :makeup_elixir, repo: "hexpm", optional: false]}, {:makeup_erlang, "~> 0.1 or ~> 1.0", [hex: :makeup_erlang, repo: "hexpm", optional: false]}, {:makeup_html, ">= 0.1.0", [hex: :makeup_html, repo: "hexpm", optional: true]}], "hexpm", "bcef0e2d360d93ac19f01a85d58f91752d930c0a30e2681145feea6bd3516e00"}, + "file_system": {:hex, :file_system, "1.1.1", "31864f4685b0148f25bd3fbef2b1228457c0c89024ad67f7a81a3ffbc0bbad3a", [:mix], [], "hexpm", "7a15ff97dfe526aeefb090a7a9d3d03aa907e100e262a0f8f7746b78f8f87a5d"}, + "jason": {:hex, :jason, "1.4.4", "b9226785a9aa77b6857ca22832cffa5d5011a667207eb2a0ad56adb5db443b8a", [:mix], [{:decimal, "~> 1.0 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm", "c5eb0cab91f094599f94d55bc63409236a8ec69a21a67814529e8d5f6cc90b3b"}, + "makeup": {:hex, :makeup, "1.2.1", "e90ac1c65589ef354378def3ba19d401e739ee7ee06fb47f94c687016e3713d1", [:mix], [{:nimble_parsec, "~> 1.4", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "d36484867b0bae0fea568d10131197a4c2e47056a6fbe84922bf6ba71c8d17ce"}, + "makeup_elixir": {:hex, :makeup_elixir, "1.0.1", "e928a4f984e795e41e3abd27bfc09f51db16ab8ba1aebdba2b3a575437efafc2", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}, {:nimble_parsec, "~> 1.2.3 or ~> 1.3", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "7284900d412a3e5cfd97fdaed4f5ed389b8f2b4cb49efc0eb3bd10e2febf9507"}, + "makeup_erlang": {:hex, :makeup_erlang, "1.0.3", "4252d5d4098da7415c390e847c814bad3764c94a814a0b4245176215615e1035", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}], "hexpm", "953297c02582a33411ac6208f2c6e55f0e870df7f80da724ed613f10e6706afd"}, + "nimble_parsec": {:hex, :nimble_parsec, "1.4.2", "8efba0122db06df95bfaa78f791344a89352ba04baedd3849593bfce4d0dc1c6", [:mix], 
[], "hexpm", "4b21398942dda052b403bbe1da991ccd03a053668d147d53fb8c4e0efe09c973"}, + "rustler": {:hex, :rustler, "0.37.3", "5f4e6634d43b26f0a69834dd1d3ed4e1710b022a053bf4a670220c9540c92602", [:mix], [{:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}], "hexpm", "a6872c6f53dcf00486d1e7f9e046e20e01bf1654bdacc4193016c2e8002b32a2"}, +} diff --git a/fluss-rust/bindings/elixir/native/fluss_nif/Cargo.toml b/fluss-rust/bindings/elixir/native/fluss_nif/Cargo.toml new file mode 100644 index 0000000000..dd4d453506 --- /dev/null +++ b/fluss-rust/bindings/elixir/native/fluss_nif/Cargo.toml @@ -0,0 +1,34 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +[package] +name = "fluss_nif" +version.workspace = true +edition.workspace = true +license.workspace = true +rust-version.workspace = true + +[lib] +name = "fluss_nif" +path = "src/lib.rs" +crate-type = ["cdylib"] + +[dependencies] +bigdecimal = { workspace = true } +fluss = { workspace = true } +rustler = "0.37" +tokio = { workspace = true } diff --git a/fluss-rust/bindings/elixir/native/fluss_nif/src/admin.rs b/fluss-rust/bindings/elixir/native/fluss_nif/src/admin.rs new file mode 100644 index 0000000000..e3f29aebcd --- /dev/null +++ b/fluss-rust/bindings/elixir/native/fluss_nif/src/admin.rs @@ -0,0 +1,121 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+
+use crate::async_nif;
+use crate::atoms::to_nif_err;
+use crate::connection::ConnectionResource;
+use crate::schema::TableDescriptorResource;
+use fluss::client::FlussAdmin;
+use fluss::metadata::TablePath;
+use rustler::{Env, ResourceArc, Term};
+use std::sync::Arc;
+
+/// BEAM resource wrapping a shared [`FlussAdmin`] client.
+pub struct AdminResource {
+    pub inner: Arc<FlussAdmin>,
+}
+
+// Manual opt-in so `ResourceArc<AdminResource>` can cross rustler's
+// panic-catching NIF wrapper; the rationale is given at the top of lib.rs.
+impl std::panic::RefUnwindSafe for AdminResource {}
+
+#[rustler::resource_impl]
+impl rustler::Resource for AdminResource {}
+
+/// Builds an [`AdminResource`] from an existing connection.
+///
+/// Runs synchronously. A failure of `get_admin` is converted with
+/// `to_nif_err` and returned as the NIF error.
+#[rustler::nif]
+fn admin_new(
+    conn: ResourceArc<ConnectionResource>,
+) -> Result<ResourceArc<AdminResource>, rustler::Error> {
+    let inner = conn.inner.get_admin().map_err(to_nif_err)?;
+    Ok(ResourceArc::new(AdminResource { inner }))
+}
+
+/// Creates a database asynchronously.
+///
+/// Returns a reference immediately; the outcome is sent to the calling
+/// process by `async_nif::spawn_task`, tagged with that reference.
+#[rustler::nif]
+fn admin_create_database<'a>(
+    env: Env<'a>,
+    admin: ResourceArc<AdminResource>,
+    database_name: String,
+    ignore_if_exists: bool,
+) -> Term<'a> {
+    async_nif::spawn_task(env, async move {
+        // NOTE(review): the `None` second argument is presumably an optional
+        // database descriptor - confirm against `FlussAdmin::create_database`.
+        admin
+            .inner
+            .create_database(&database_name, None, ignore_if_exists)
+            .await
+    })
+}
+
+/// Drops a database asynchronously.
+///
+/// Returns a reference immediately; the outcome is sent to the calling
+/// process by `async_nif::spawn_task`, tagged with that reference.
+#[rustler::nif]
+fn admin_drop_database<'a>(
+    env: Env<'a>,
+    admin: ResourceArc<AdminResource>,
+    database_name: String,
+    ignore_if_not_exists: bool,
+) -> Term<'a> {
+    async_nif::spawn_task(env, async move {
+        // NOTE(review): the hard-coded `false` is presumably the cascade flag -
+        // confirm against `FlussAdmin::drop_database`.
+        admin
+            .inner
+            .drop_database(&database_name, ignore_if_not_exists, false)
+            .await
+    })
+}
+
+/// Lists databases asynchronously.
+///
+/// Returns a reference immediately; the value produced by `list_databases`
+/// is sent to the calling process by `async_nif::spawn_task_with_result`.
+#[rustler::nif]
+fn admin_list_databases<'a>(env: Env<'a>, admin: ResourceArc<AdminResource>) -> Term<'a> {
+    async_nif::spawn_task_with_result(env, async move { admin.inner.list_databases().await })
+}
+
+/// Creates the table `database_name.table_name` from `descriptor`, asynchronously.
+///
+/// Returns a reference immediately; the outcome is sent to the calling
+/// process by `async_nif::spawn_task`, tagged with that reference.
+#[rustler::nif]
+fn admin_create_table<'a>(
+    env: Env<'a>,
+    admin: ResourceArc<AdminResource>,
+    database_name: String,
+    table_name: String,
+    descriptor: ResourceArc<TableDescriptorResource>,
+    ignore_if_exists: bool,
+) -> Term<'a> {
+    async_nif::spawn_task(env, async move {
+        let path = TablePath::new(&database_name, &table_name);
+        admin
+            .inner
+            .create_table(&path, &descriptor.inner, ignore_if_exists)
+            .await
+    })
+}
+
+/// Drops the table `database_name.table_name` asynchronously.
+///
+/// Returns a reference immediately; the outcome is sent to the calling
+/// process by `async_nif::spawn_task`, tagged with that reference.
+#[rustler::nif]
+fn admin_drop_table<'a>(
+    env: Env<'a>,
+    admin: ResourceArc<AdminResource>,
+    database_name: String,
+    table_name: String,
+    ignore_if_not_exists: bool,
+) -> Term<'a> {
+    async_nif::spawn_task(env, async move {
+        let path = TablePath::new(&database_name, &table_name);
+        admin.inner.drop_table(&path, ignore_if_not_exists).await
+    })
+}
+
+/// Lists the tables of `database_name` asynchronously.
+///
+/// Returns a reference immediately; the value produced by `list_tables`
+/// is sent to the calling process by `async_nif::spawn_task_with_result`.
+#[rustler::nif]
+fn admin_list_tables<'a>(
+    env: Env<'a>,
+    admin: ResourceArc<AdminResource>,
+    database_name: String,
+) -> Term<'a> {
+    async_nif::spawn_task_with_result(
+        env,
+        async move { admin.inner.list_tables(&database_name).await },
+    )
+}
diff --git a/fluss-rust/bindings/elixir/native/fluss_nif/src/append_writer.rs b/fluss-rust/bindings/elixir/native/fluss_nif/src/append_writer.rs
new file mode 100644
index 0000000000..f26884419e
--- /dev/null
+++ b/fluss-rust/bindings/elixir/native/fluss_nif/src/append_writer.rs
@@ -0,0 +1,69 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::RUNTIME;
+use crate::async_nif;
+use crate::atoms::{client_err, to_nif_err};
+use crate::row_convert;
+use crate::table::TableResource;
+use crate::write_handle::WriteHandleResource;
+use fluss::client::AppendWriter;
+use fluss::metadata::Column;
+use rustler::{Env, ResourceArc, Term};
+
+/// BEAM resource holding an [`AppendWriter`] together with the table's
+/// columns, which are needed to convert Elixir terms into rows.
+pub struct AppendWriterResource {
+    pub inner: AppendWriter,
+    pub columns: Vec<Column>,
+}
+
+// Manual opt-in so `ResourceArc<AppendWriterResource>` can cross rustler's
+// panic-catching NIF wrapper; the rationale is given at the top of lib.rs.
+impl std::panic::RefUnwindSafe for AppendWriterResource {}
+
+#[rustler::resource_impl]
+impl rustler::Resource for AppendWriterResource {}
+
+/// Creates an append writer for `table` and captures a copy of its columns.
+///
+/// Runs synchronously. Failures of `new_append` / `create_writer` are
+/// converted with `to_nif_err`.
+#[rustler::nif]
+fn append_writer_new(
+    table: ResourceArc<TableResource>,
+) -> Result<ResourceArc<AppendWriterResource>, rustler::Error> {
+    // WriterClient::new() calls tokio::spawn internally.
+    let _guard = RUNTIME.enter();
+    let (inner, columns) = table.with_table(|t| {
+        let inner = t
+            .new_append()
+            .map_err(to_nif_err)?
+            .create_writer()
+            .map_err(to_nif_err)?;
+        Ok((inner, t.get_table_info().schema.columns().to_vec()))
+    })?;
+    Ok(ResourceArc::new(AppendWriterResource { inner, columns }))
+}
+
+/// Converts `values` into a row using the writer's columns and appends it.
+///
+/// A conversion failure is reported through `client_err`, an append failure
+/// through `to_nif_err`. On success the value returned by `append` is wrapped
+/// in a [`WriteHandleResource`].
+#[rustler::nif]
+fn append_writer_append<'a>(
+    env: Env<'a>,
+    writer: ResourceArc<AppendWriterResource>,
+    values: Term<'a>,
+) -> Result<ResourceArc<WriteHandleResource>, rustler::Error> {
+    let row = row_convert::term_to_row(env, values, &writer.columns).map_err(client_err)?;
+    let future = writer.inner.append(&row).map_err(to_nif_err)?;
+    Ok(ResourceArc::new(WriteHandleResource::new(future)))
+}
+
+/// Flushes the writer asynchronously.
+///
+/// Returns a reference immediately; the outcome is sent to the calling
+/// process by `async_nif::spawn_task`, tagged with that reference.
+#[rustler::nif]
+fn append_writer_flush<'a>(env: Env<'a>, writer: ResourceArc<AppendWriterResource>) -> Term<'a> {
+    async_nif::spawn_task(env, async move { writer.inner.flush().await })
+}
diff --git a/fluss-rust/bindings/elixir/native/fluss_nif/src/async_nif.rs b/fluss-rust/bindings/elixir/native/fluss_nif/src/async_nif.rs
new file mode 100644
index 0000000000..6b26eaaf8a
--- /dev/null
+++ b/fluss-rust/bindings/elixir/native/fluss_nif/src/async_nif.rs
@@ -0,0 +1,93 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.
See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Async NIF helpers — spawn on tokio, send `{ref, result}` back as a BEAM
+//! message instead of blocking dirty schedulers.
+
+use crate::RUNTIME;
+use crate::atoms::{self, NifFlussError};
+use fluss::error::Error as CoreError;
+use rustler::env::OwnedEnv;
+use rustler::{Encoder, Env, Term};
+use std::future::Future;
+
+/// Encodes a core error as the tuple `{:error, error_struct}`.
+fn encode_err<'a>(env: Env<'a>, err: CoreError) -> Term<'a> {
+    (atoms::error(), NifFlussError::from_core(&err)).encode(env)
+}
+
+/// Runs `future` on the shared runtime and reports completion by message.
+///
+/// Returns a fresh reference immediately. When the future finishes, the
+/// process that called the NIF is sent `{ref, :ok}` on success or
+/// `{ref, {:error, error}}` on failure. A failed send is ignored.
+pub fn spawn_task<'a, F>(env: Env<'a>, future: F) -> Term<'a>
+where
+    F: Future<Output = Result<(), CoreError>> + Send + 'static,
+{
+    let pid = env.pid();
+    let ref_term: Term<'a> = *env.make_ref();
+    // The reference is saved into an owned env so the spawned task can
+    // rebuild it when the reply is encoded.
+    let mut task_env = OwnedEnv::new();
+    let saved_ref = task_env.save(ref_term);
+
+    RUNTIME.spawn(async move {
+        let result = future.await;
+        let _ = task_env.send_and_clear(&pid, |env| {
+            let r = saved_ref.load(env);
+            match result {
+                Ok(()) => (r, atoms::ok()).encode(env),
+                Err(e) => (r, encode_err(env, e)).encode(env),
+            }
+        });
+    });
+
+    ref_term
+}
+
+/// Like [`spawn_task`], but the future yields a value that is sent back.
+///
+/// Returns a fresh reference immediately. When the future finishes, the
+/// process that called the NIF is sent `{ref, {:ok, value}}` on success or
+/// `{ref, {:error, error}}` on failure. A failed send is ignored.
+pub fn spawn_task_with_result<'a, F, T>(env: Env<'a>, future: F) -> Term<'a>
+where
+    F: Future<Output = Result<T, CoreError>> + Send + 'static,
+    T: Encoder + Send + 'static,
+{
+    let pid = env.pid();
+    let ref_term: Term<'a> = *env.make_ref();
+    let mut task_env = OwnedEnv::new();
+    let saved_ref = task_env.save(ref_term);
+
+    RUNTIME.spawn(async move {
+        let result = future.await;
+        let _ = task_env.send_and_clear(&pid, |env| {
+            let r = saved_ref.load(env);
+            match result {
+                Ok(val) => (r, (atoms::ok(), val)).encode(env),
+                Err(e) => (r, encode_err(env, e)).encode(env),
+            }
+        });
+    });
+
+    ref_term
+}
+
+/// Reports a client-side error through the same `{ref, {:error, error}}`
+/// message shape as the spawned tasks, without spawning anything.
+///
+/// The message is sent before this function returns the reference.
+pub fn send_client_error<'a>(env: Env<'a>, msg: &str) -> Term<'a> {
+    let pid = env.pid();
+    let ref_term: Term<'a> = *env.make_ref();
+    let mut task_env = OwnedEnv::new();
+    let saved_ref = task_env.save(ref_term);
+    let message = msg.to_string();
+
+    let _ = task_env.send_and_clear(&pid, |env| {
+        let r = saved_ref.load(env);
+        let err = NifFlussError::client(message);
+        (r, (atoms::error(), err)).encode(env)
+    });
+
+    ref_term
+}
diff --git a/fluss-rust/bindings/elixir/native/fluss_nif/src/atoms.rs b/fluss-rust/bindings/elixir/native/fluss_nif/src/atoms.rs
new file mode 100644
index 0000000000..45d5aa303a
--- /dev/null
+++ b/fluss-rust/bindings/elixir/native/fluss_nif/src/atoms.rs
@@ -0,0 +1,224 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use fluss::error::{Error as CoreError, FlussError};
+use rustler::{Atom, NifStruct};
+
+rustler::atoms! {
+    ok,
+    error,
+    nil,
+
+    // Change types
+    append_only,
+    insert,
+    update_before,
+    update_after,
+    delete,
+
+    // Poll result message tags
+    fluss_records,
+    fluss_poll_error,
+
+    // Record map keys
+    offset,
+    timestamp,
+    change_type,
+    row,
+
+    // Error code atoms (mirror of fluss::error::FlussError).
+    none,
+    unknown_server_error,
+    network_exception,
+    unsupported_version,
+    corrupt_message,
+    database_not_exist,
+    database_not_empty,
+    database_already_exist,
+    table_not_exist,
+    table_already_exist,
+    schema_not_exist,
+    log_storage_exception,
+    kv_storage_exception,
+    not_leader_or_follower,
+    record_too_large_exception,
+    corrupt_record_exception,
+    invalid_table_exception,
+    invalid_database_exception,
+    invalid_replication_factor,
+    invalid_required_acks,
+    log_offset_out_of_range_exception,
+    non_primary_key_table_exception,
+    unknown_table_or_bucket_exception,
+    invalid_update_version_exception,
+    invalid_coordinator_exception,
+    fenced_leader_epoch_exception,
+    request_time_out,
+    storage_exception,
+    operation_not_attempted_exception,
+    not_enough_replicas_after_append_exception,
+    not_enough_replicas_exception,
+    security_token_exception,
+    out_of_order_sequence_exception,
+    duplicate_sequence_exception,
+    unknown_writer_id_exception,
+    invalid_column_projection,
+    invalid_target_column,
+    partition_not_exists,
+    table_not_partitioned_exception,
+    invalid_timestamp_exception,
+    invalid_config_exception,
+    lake_storage_not_configured_exception,
+    kv_snapshot_not_exist,
+    partition_already_exists,
+    partition_spec_invalid_exception,
+    leader_not_available_exception,
+    partition_max_num_exception,
+    authenticate_exception,
+    security_disabled_exception,
+    authorization_exception,
+    bucket_max_num_exception,
+    fenced_tiering_epoch_exception,
+    retriable_authenticate_exception,
+    invalid_server_rack_info_exception,
+    lake_snapshot_not_exist,
+    lake_table_already_exist,
+    ineligible_replica_exception,
+    invalid_alter_table_exception,
+    deletion_disabled_exception,
+    client_error,
+}
+
+/// Numeric code attached to errors classified as `client_error`, i.e. every
+/// core error that is neither an API error nor an RPC error (see
+/// [`NifFlussError::from_core`]) and every error built with
+/// [`NifFlussError::client`].
+pub const CLIENT_ERROR_CODE: i32 = -2;
+
+// `__exception__` is the marker `defexception` sets. Rustler bypasses the
+// Elixir constructor, so we must serialize it explicitly or `raise err`
+// rejects the struct at the Elixir side.
+/// Rust-side mirror of the Elixir `Fluss.Error` struct.
+#[derive(NifStruct)]
+#[module = "Fluss.Error"]
+pub struct NifFlussError {
+    pub code: Atom,
+    pub error_code: i32,
+    pub message: String,
+    #[allow(non_snake_case)]
+    pub __exception__: bool,
+}
+
+impl NifFlussError {
+    /// Classifies a core error into `(code atom, numeric code)` and keeps its
+    /// `Display` text as the message.
+    pub fn from_core(error: &CoreError) -> Self {
+        // Transport failures map to `:network_exception` (Java parity,
+        // retriable).
+        let (code, error_code) = match error {
+            CoreError::FlussAPIError { api_error } => {
+                (api_error_atom(api_error.code), api_error.code)
+            }
+            CoreError::RpcError { .. } => {
+                (network_exception(), FlussError::NetworkException.code())
+            }
+            _ => (client_error(), CLIENT_ERROR_CODE),
+        };
+        Self {
+            code,
+            error_code,
+            message: error.to_string(),
+            __exception__: true,
+        }
+    }
+
+    /// Builds a `client_error` with [`CLIENT_ERROR_CODE`] and the given message.
+    pub fn client(message: String) -> Self {
+        Self {
+            code: client_error(),
+            error_code: CLIENT_ERROR_CODE,
+            message,
+            __exception__: true,
+        }
+    }
+}
+
+/// Maps a numeric API error code to its atom via `FlussError::for_code`.
+///
+/// The match has no wildcard arm, so adding a `FlussError` variant without
+/// adding its atom here is a compile error.
+fn api_error_atom(code: i32) -> Atom {
+    match FlussError::for_code(code) {
+        FlussError::UnknownServerError => unknown_server_error(),
+        FlussError::None => none(),
+        FlussError::NetworkException => network_exception(),
+        FlussError::UnsupportedVersion => unsupported_version(),
+        FlussError::CorruptMessage => corrupt_message(),
+        FlussError::DatabaseNotExist => database_not_exist(),
+        FlussError::DatabaseNotEmpty => database_not_empty(),
+        FlussError::DatabaseAlreadyExist => database_already_exist(),
+        FlussError::TableNotExist => table_not_exist(),
+        FlussError::TableAlreadyExist => table_already_exist(),
+        FlussError::SchemaNotExist => schema_not_exist(),
+        FlussError::LogStorageException => log_storage_exception(),
+        FlussError::KvStorageException => kv_storage_exception(),
+        FlussError::NotLeaderOrFollower => not_leader_or_follower(),
+        FlussError::RecordTooLargeException => record_too_large_exception(),
+        FlussError::CorruptRecordException => corrupt_record_exception(),
+        FlussError::InvalidTableException => invalid_table_exception(),
+        FlussError::InvalidDatabaseException => invalid_database_exception(),
+        FlussError::InvalidReplicationFactor => invalid_replication_factor(),
+        FlussError::InvalidRequiredAcks => invalid_required_acks(),
+        FlussError::LogOffsetOutOfRangeException => log_offset_out_of_range_exception(),
+        FlussError::NonPrimaryKeyTableException => non_primary_key_table_exception(),
+        FlussError::UnknownTableOrBucketException => unknown_table_or_bucket_exception(),
+        FlussError::InvalidUpdateVersionException => invalid_update_version_exception(),
+        FlussError::InvalidCoordinatorException => invalid_coordinator_exception(),
+        FlussError::FencedLeaderEpochException => fenced_leader_epoch_exception(),
+        FlussError::RequestTimeOut => request_time_out(),
+        FlussError::StorageException => storage_exception(),
+        FlussError::OperationNotAttemptedException => operation_not_attempted_exception(),
+        FlussError::NotEnoughReplicasAfterAppendException => {
+            not_enough_replicas_after_append_exception()
+        }
+        FlussError::NotEnoughReplicasException => not_enough_replicas_exception(),
+        FlussError::SecurityTokenException => security_token_exception(),
+        FlussError::OutOfOrderSequenceException => out_of_order_sequence_exception(),
+        FlussError::DuplicateSequenceException => duplicate_sequence_exception(),
+        FlussError::UnknownWriterIdException => unknown_writer_id_exception(),
+        FlussError::InvalidColumnProjection => invalid_column_projection(),
+        FlussError::InvalidTargetColumn => invalid_target_column(),
+        FlussError::PartitionNotExists => partition_not_exists(),
+        FlussError::TableNotPartitionedException => table_not_partitioned_exception(),
+        FlussError::InvalidTimestampException => invalid_timestamp_exception(),
+        FlussError::InvalidConfigException => invalid_config_exception(),
+        FlussError::LakeStorageNotConfiguredException => lake_storage_not_configured_exception(),
+        FlussError::KvSnapshotNotExist => kv_snapshot_not_exist(),
+        FlussError::PartitionAlreadyExists => partition_already_exists(),
+        FlussError::PartitionSpecInvalidException => partition_spec_invalid_exception(),
+        FlussError::LeaderNotAvailableException => leader_not_available_exception(),
+        FlussError::PartitionMaxNumException => partition_max_num_exception(),
+        FlussError::AuthenticateException => authenticate_exception(),
+        FlussError::SecurityDisabledException => security_disabled_exception(),
+        FlussError::AuthorizationException => authorization_exception(),
+        FlussError::BucketMaxNumException => bucket_max_num_exception(),
+        FlussError::FencedTieringEpochException => fenced_tiering_epoch_exception(),
+        FlussError::RetriableAuthenticateException => retriable_authenticate_exception(),
+        FlussError::InvalidServerRackInfoException => invalid_server_rack_info_exception(),
+        FlussError::LakeSnapshotNotExist => lake_snapshot_not_exist(),
+        FlussError::LakeTableAlreadyExist => lake_table_already_exist(),
+        FlussError::IneligibleReplicaException => ineligible_replica_exception(),
+        FlussError::InvalidAlterTableException => invalid_alter_table_exception(),
+        FlussError::DeletionDisabledException => deletion_disabled_exception(),
+    }
+}
+
+/// Wraps a core error as a `rustler::Error::Term` carrying a [`NifFlussError`].
+pub fn to_nif_err(e: CoreError) -> rustler::Error {
+    rustler::Error::Term(Box::new(NifFlussError::from_core(&e)))
+}
+
+/// Wraps a message as a `rustler::Error::Term` carrying a client-side
+/// [`NifFlussError`].
+pub fn client_err(msg: impl Into<String>) -> rustler::Error {
+    rustler::Error::Term(Box::new(NifFlussError::client(msg.into())))
+}
diff --git a/fluss-rust/bindings/elixir/native/fluss_nif/src/config.rs b/fluss-rust/bindings/elixir/native/fluss_nif/src/config.rs
new file mode 100644
index 0000000000..8c1bab51eb
--- /dev/null
+++ b/fluss-rust/bindings/elixir/native/fluss_nif/src/config.rs
@@ -0,0 +1,147 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use fluss::config::{Config, NoKeyAssigner};
+use rustler::{NifStruct, NifUnitEnum};
+
+/// Bucket-assigner strategy for tables without bucket keys.
+/// Maps to fluss::config::NoKeyAssigner.
+#[derive(NifUnitEnum)]
+pub enum NifNoKeyAssigner {
+    Sticky,
+    RoundRobin,
+}
+
+/// Decoded from `%Fluss.Config{}` Elixir struct.
+///
+/// Only `bootstrap_servers` is mandatory; every other field is optional and,
+/// when absent, the value from `Config::default()` is kept (see `into_core`).
+// NOTE(review): the type arguments of the `Option` fields below are missing in
+// this copy of the file (angle-bracketed parameters appear to have been
+// stripped); restore them from the upstream source before building.
+#[derive(NifStruct)]
+#[module = "Fluss.Config"]
+pub struct NifConfig {
+    pub bootstrap_servers: String,
+    pub connect_timeout_ms: Option,
+    pub remote_file_download_thread_num: Option,
+    pub scanner_log_fetch_max_bytes: Option,
+    pub scanner_log_fetch_max_bytes_for_bucket: Option,
+    pub scanner_log_fetch_min_bytes: Option,
+    pub scanner_log_fetch_wait_max_time_ms: Option,
+    pub scanner_log_max_poll_records: Option,
+    pub scanner_remote_log_prefetch_num: Option,
+    pub scanner_remote_log_read_concurrency: Option,
+    pub security_protocol: Option,
+    pub security_sasl_mechanism: Option,
+    pub security_sasl_password: Option,
+    pub security_sasl_username: Option,
+    pub writer_acks: Option,
+    pub writer_batch_size: Option,
+    pub writer_batch_timeout_ms: Option,
+    pub writer_bucket_no_key_assigner: Option,
+    pub writer_buffer_memory_size: Option,
+    pub writer_buffer_wait_timeout_ms: Option,
+    pub writer_dynamic_batch_size_enabled: Option,
+    pub writer_dynamic_batch_size_min: Option,
+    pub writer_enable_idempotence: Option,
+    pub writer_max_inflight_requests_per_bucket: Option,
+    pub writer_request_max_size: Option,
+    pub writer_retries: Option,
+}
+
+impl NifConfig {
+    /// Converts the decoded struct into the core `Config`.
+    ///
+    /// Starts from `Config::default()` with `bootstrap_servers` set, then
+    /// overwrites each field for which a value was supplied.
+    // NOTE(review): several fields are converted with `as usize`; whether that
+    // cast can wrap or truncate depends on the decoded integer types, which
+    // are not visible in this copy - confirm against the upstream source.
+    pub fn into_core(self) -> Config {
+        let mut config = Config {
+            bootstrap_servers: self.bootstrap_servers,
+            ..Config::default()
+        };
+        if let Some(timeout) = self.connect_timeout_ms {
+            config.connect_timeout_ms = timeout;
+        }
+        if let Some(n) = self.remote_file_download_thread_num {
+            config.remote_file_download_thread_num = n as usize;
+        }
+        if let Some(size) = self.scanner_log_fetch_max_bytes {
+            config.scanner_log_fetch_max_bytes = size;
+        }
+        if let Some(size) = self.scanner_log_fetch_max_bytes_for_bucket {
+            config.scanner_log_fetch_max_bytes_for_bucket = size;
+        }
+        if let Some(size) = self.scanner_log_fetch_min_bytes {
+            config.scanner_log_fetch_min_bytes = size;
+        }
+        if let Some(ms) = self.scanner_log_fetch_wait_max_time_ms {
+            config.scanner_log_fetch_wait_max_time_ms = ms;
+        }
+        if let Some(n) = self.scanner_log_max_poll_records {
+            config.scanner_log_max_poll_records = n as usize;
+        }
+        if let Some(n) = self.scanner_remote_log_prefetch_num {
+            config.scanner_remote_log_prefetch_num = n as usize;
+        }
+        if let Some(n) = self.scanner_remote_log_read_concurrency {
+            config.scanner_remote_log_read_concurrency = n as usize;
+        }
+        if let Some(protocol) = self.security_protocol {
+            config.security_protocol = protocol;
+        }
+        if let Some(mechanism) = self.security_sasl_mechanism {
+            config.security_sasl_mechanism = mechanism;
+        }
+        if let Some(password) = self.security_sasl_password {
+            config.security_sasl_password = password;
+        }
+        if let Some(username) = self.security_sasl_username {
+            config.security_sasl_username = username;
+        }
+        if let Some(size) = self.writer_batch_size {
+            config.writer_batch_size = size;
+        }
+        if let Some(ms) = self.writer_batch_timeout_ms {
+            config.writer_batch_timeout_ms = ms;
+        }
+        if let Some(enabled) = self.writer_dynamic_batch_size_enabled {
+            config.writer_dynamic_batch_size_enabled = enabled;
+        }
+        if let Some(size) = self.writer_dynamic_batch_size_min {
+            config.writer_dynamic_batch_size_min = size;
+        }
+        if let Some(acks) = self.writer_acks {
+            config.writer_acks = acks;
+        }
+        // The NIF-side enum is translated variant by variant into the core enum.
+        if let Some(assigner) = self.writer_bucket_no_key_assigner {
+            config.writer_bucket_no_key_assigner = match assigner {
+                NifNoKeyAssigner::Sticky => NoKeyAssigner::Sticky,
+                NifNoKeyAssigner::RoundRobin => NoKeyAssigner::RoundRobin,
+            };
+        }
+        if let Some(memory_size) = self.writer_buffer_memory_size {
+            config.writer_buffer_memory_size = memory_size as usize;
+        }
+        if let Some(timeout_ms) = self.writer_buffer_wait_timeout_ms {
+            config.writer_buffer_wait_timeout_ms = timeout_ms;
+        }
+        if let Some(enabled) = self.writer_enable_idempotence {
+            config.writer_enable_idempotence = enabled;
+        }
+        if let Some(requests_limit) = self.writer_max_inflight_requests_per_bucket {
+            config.writer_max_inflight_requests_per_bucket = requests_limit as usize;
+        }
+        if let Some(max_size) = self.writer_request_max_size {
+            config.writer_request_max_size = max_size;
+        }
+        if let Some(retries) = self.writer_retries {
+            config.writer_retries = retries;
+        }
+        config
+    }
+}
diff --git a/fluss-rust/bindings/elixir/native/fluss_nif/src/connection.rs b/fluss-rust/bindings/elixir/native/fluss_nif/src/connection.rs
new file mode 100644
index 0000000000..4c788eeec7
--- /dev/null
+++ b/fluss-rust/bindings/elixir/native/fluss_nif/src/connection.rs
@@ -0,0 +1,43 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::async_nif;
+use crate::config::NifConfig;
+use fluss::client::FlussConnection;
+use rustler::{Env, ResourceArc, Term};
+use std::sync::Arc;
+
+/// BEAM resource wrapping a shared [`FlussConnection`].
+pub struct ConnectionResource {
+    pub inner: Arc<FlussConnection>,
+}
+
+// Manual opt-in so `ResourceArc<ConnectionResource>` can cross rustler's
+// panic-catching NIF wrapper; the rationale is given at the top of lib.rs.
+impl std::panic::RefUnwindSafe for ConnectionResource {}
+
+#[rustler::resource_impl]
+impl rustler::Resource for ConnectionResource {}
+
+/// Opens a connection asynchronously from the decoded `Fluss.Config`.
+///
+/// Returns a reference immediately; the new [`ConnectionResource`] (or the
+/// error) is sent to the calling process by
+/// `async_nif::spawn_task_with_result`, tagged with that reference.
+#[rustler::nif]
+fn connection_new<'a>(env: Env<'a>, config: NifConfig) -> Term<'a> {
+    // Convert before spawning so only the core config moves into the task.
+    let core_config = config.into_core();
+    async_nif::spawn_task_with_result(env, async move {
+        FlussConnection::new(core_config).await.map(|conn| {
+            ResourceArc::new(ConnectionResource {
+                inner: Arc::new(conn),
+            })
+        })
+    })
+}
diff --git a/fluss-rust/bindings/elixir/native/fluss_nif/src/lib.rs b/fluss-rust/bindings/elixir/native/fluss_nif/src/lib.rs
new file mode 100644
index 0000000000..a843d65f21
--- /dev/null
+++ b/fluss-rust/bindings/elixir/native/fluss_nif/src/lib.rs
@@ -0,0 +1,47 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Rustler 0.37 wraps every NIF body in `std::panic::catch_unwind`, which requires
+// all captured values (including `ResourceArc`) to be `RefUnwindSafe`.
+// `ResourceArc` contains `*mut T`, so it is only `RefUnwindSafe` when `T` is.
+// Our resource types contain `parking_lot` locks (`UnsafeCell`) which opt out of
+// the auto-trait. We manually impl `RefUnwindSafe` on each resource type because
+// panic safety is already guaranteed by the NIF boundary — a panic is caught and
+// converted to an Erlang exception, never observed by Rust code.
+
+mod admin;
+mod append_writer;
+mod async_nif;
+mod atoms;
+mod config;
+mod connection;
+mod log_scanner;
+mod row_convert;
+mod schema;
+mod table;
+mod write_handle;
+
+use std::sync::LazyLock;
+
+/// Shared multi-threaded tokio runtime used by every NIF in this crate.
+///
+/// Built lazily on first access; panics at that point if the runtime cannot
+/// be created.
+static RUNTIME: LazyLock<tokio::runtime::Runtime> = LazyLock::new(|| {
+    tokio::runtime::Builder::new_multi_thread()
+        .enable_all()
+        .build()
+        .expect("failed to create tokio runtime")
+});
+
+rustler::init!("Elixir.Fluss.Native");
diff --git a/fluss-rust/bindings/elixir/native/fluss_nif/src/log_scanner.rs b/fluss-rust/bindings/elixir/native/fluss_nif/src/log_scanner.rs
new file mode 100644
index 0000000000..62614e0e67
--- /dev/null
+++ b/fluss-rust/bindings/elixir/native/fluss_nif/src/log_scanner.rs
@@ -0,0 +1,166 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::RUNTIME;
+use crate::async_nif;
+use crate::atoms::{self, NifFlussError, to_nif_err};
+use crate::row_convert;
+use crate::table::TableResource;
+use fluss::client::{EARLIEST_OFFSET, LogScanner};
+use fluss::error::Error;
+use fluss::metadata::Column;
+use fluss::record::{ChangeType, ScanRecords};
+use rustler::env::OwnedEnv;
+use rustler::types::LocalPid;
+use rustler::{Atom, Encoder, Env, ResourceArc, Term};
+use std::collections::HashMap;
+use std::time::Duration;
+
+/// BEAM resource holding a [`LogScanner`] together with the table's columns,
+/// which are needed to convert scanned rows into Elixir terms.
+pub struct LogScannerResource {
+    pub inner: LogScanner,
+    pub columns: Vec<Column>,
+}
+
+// Manual opt-in so `ResourceArc<LogScannerResource>` can cross rustler's
+// panic-catching NIF wrapper; the rationale is given at the top of lib.rs.
+impl std::panic::RefUnwindSafe for LogScannerResource {}
+
+#[rustler::resource_impl]
+impl rustler::Resource for LogScannerResource {}
+
+/// Creates a log scanner for `table` and captures a copy of its columns.
+///
+/// Runs synchronously. A failure of `create_log_scanner` is converted with
+/// `to_nif_err`.
+#[rustler::nif]
+fn log_scanner_new(
+    table: ResourceArc<TableResource>,
+) -> Result<ResourceArc<LogScannerResource>, rustler::Error> {
+    // NOTE(review): presumably scanner creation needs a tokio runtime context,
+    // like the append writer does - confirm.
+    let _guard = RUNTIME.enter();
+    let (inner, columns) = table.with_table(|t| {
+        let inner = t.new_scan().create_log_scanner().map_err(to_nif_err)?;
+        Ok((inner, t.get_table_info().schema.columns().to_vec()))
+    })?;
+    Ok(ResourceArc::new(LogScannerResource { inner, columns }))
+}
+
+/// Subscribes the scanner to one bucket starting at `offset`, asynchronously.
+///
+/// Returns a reference immediately; the outcome is sent to the calling
+/// process by `async_nif::spawn_task`, tagged with that reference.
+#[rustler::nif]
+fn log_scanner_subscribe<'a>(
+    env: Env<'a>,
+    scanner: ResourceArc<LogScannerResource>,
+    bucket: i32,
+    offset: i64,
+) -> Term<'a> {
+    async_nif::spawn_task(
+        env,
+        async move { scanner.inner.subscribe(bucket, offset).await },
+    )
+}
+
+/// Subscribes the scanner to several `(bucket, offset)` pairs, asynchronously.
+///
+/// The pairs are collected into a map first, so if a bucket appears more than
+/// once the last pair wins. Returns a reference immediately; the outcome is
+/// sent to the calling process by `async_nif::spawn_task`.
+#[rustler::nif]
+fn log_scanner_subscribe_buckets<'a>(
+    env: Env<'a>,
+    scanner: ResourceArc<LogScannerResource>,
+    bucket_offsets: Vec<(i32, i64)>,
+) -> Term<'a> {
+    let map: HashMap<i32, i64> = bucket_offsets.into_iter().collect();
+    async_nif::spawn_task(
+        env,
+        async move { scanner.inner.subscribe_buckets(&map).await },
+    )
+}
+
+/// Unsubscribes the scanner from `bucket`, asynchronously.
+///
+/// Returns a reference immediately; the outcome is sent to the calling
+/// process by `async_nif::spawn_task`, tagged with that reference.
+#[rustler::nif]
+fn log_scanner_unsubscribe<'a>(
+    env: Env<'a>,
+    scanner: ResourceArc<LogScannerResource>,
+    bucket: i32,
+) -> Term<'a> {
+    async_nif::spawn_task(env, async move { scanner.inner.unsubscribe(bucket).await })
+}
+
+/// Starts one poll with the given timeout and returns `:ok` immediately.
+///
+/// The result is delivered later to the calling process as
+/// `{:fluss_records, records}` or `{:fluss_poll_error, error}`
+/// (see `send_poll_result`). No reference is attached to these messages.
+#[rustler::nif]
+fn log_scanner_poll(env: Env, scanner: ResourceArc<LogScannerResource>, timeout_ms: u64) -> Atom {
+    let pid = env.pid();
+
+    // `scanner` is already an owned handle, so it is moved into the task
+    // directly; no extra clone is needed.
+    RUNTIME.spawn(async move {
+        let result = scanner.inner.poll(Duration::from_millis(timeout_ms)).await;
+        send_poll_result(&pid, result, &scanner.columns);
+    });
+
+    atoms::ok()
+}
+
+/// Sends the outcome of a poll to `pid`.
+///
+/// Success is encoded as `{:fluss_records, records}`. A poll error, or a
+/// failure while encoding the records, is encoded as
+/// `{:fluss_poll_error, error}`. A failed send is ignored.
+fn send_poll_result(pid: &LocalPid, result: Result<ScanRecords, Error>, columns: &[Column]) {
+    let mut msg_env = OwnedEnv::new();
+
+    match result {
+        Ok(scan_records) => {
+            let _ = msg_env.send_and_clear(pid, |env| {
+                match encode_scan_records(env, scan_records, columns) {
+                    Ok(records) => (atoms::fluss_records(), records).encode(env),
+                    Err(message) => {
+                        (atoms::fluss_poll_error(), NifFlussError::client(message)).encode(env)
+                    }
+                }
+            });
+        }
+        Err(e) => {
+            let _ = msg_env.send_and_clear(pid, |env| {
+                (atoms::fluss_poll_error(), NifFlussError::from_core(&e)).encode(env)
+            });
+        }
+    }
+}
+
+/// Encodes scanned records as a list of maps with the keys `offset`,
+/// `timestamp`, `change_type` and `row`.
+///
+/// Returns an error message if a row cannot be converted or a map cannot be
+/// built; conversion stops at the first failure.
+fn encode_scan_records<'a>(
+    env: Env<'a>,
+    scan_records: ScanRecords,
+    columns: &[Column],
+) -> Result<Term<'a>, String> {
+    // Column atoms are created once per batch and reused for every row.
+    let column_atoms = row_convert::intern_column_atoms(env, columns);
+    let mut result = Vec::new();
+
+    for record in scan_records {
+        let row_map = row_convert::row_to_term(env, record.row(), columns, &column_atoms)
+            .map_err(|e| format!("failed to convert row at offset {}: {e}", record.offset()))?;
+        let change_type_atom = match record.change_type() {
+            ChangeType::AppendOnly => atoms::append_only().encode(env),
+            ChangeType::Insert => atoms::insert().encode(env),
+            ChangeType::UpdateBefore => atoms::update_before().encode(env),
+            ChangeType::UpdateAfter => atoms::update_after().encode(env),
+            ChangeType::Delete => atoms::delete().encode(env),
+        };
+
+        let record_map = rustler::Term::map_from_pairs(
+            env,
+            &[
+                (atoms::offset().encode(env), record.offset().encode(env)),
+                (
+                    atoms::timestamp().encode(env),
+                    record.timestamp().encode(env),
+                ),
+                (atoms::change_type().encode(env), change_type_atom),
+                (atoms::row().encode(env), row_map),
+            ],
+        )
+        .map_err(|_| "failed to create record map".to_string())?;
+        result.push(record_map);
+    }
+
+    Ok(result.encode(env))
+}
+
+/// NIF that returns the `EARLIEST_OFFSET` constant to the caller.
+#[rustler::nif]
+fn earliest_offset() -> i64 {
+    EARLIEST_OFFSET
+}
diff --git a/fluss-rust/bindings/elixir/native/fluss_nif/src/row_convert.rs b/fluss-rust/bindings/elixir/native/fluss_nif/src/row_convert.rs
new file mode 100644
index 0000000000..c72395e900
--- /dev/null
+++ b/fluss-rust/bindings/elixir/native/fluss_nif/src/row_convert.rs
@@ -0,0 +1,267 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::str::FromStr;
+
+use fluss::metadata::{Column, DataType};
+use fluss::row::{Date, Decimal, GenericRow, InternalRow, Time, TimestampLtz, TimestampNtz};
+use rustler::types::binary::NewBinary;
+use rustler::{Encoder, Env, Term};
+
+use crate::atoms;
+
+/// Convert column names to BEAM atoms for use as map keys.
+///
+/// Note: BEAM atoms are never garbage-collected. This is safe because column
+/// names come from server-defined table schemas (bounded set), not arbitrary
+/// user input. The BEAM deduplicates atoms, so repeated calls with the same
+/// column names do not grow the atom table.
+pub fn intern_column_atoms<'a>(env: Env<'a>, columns: &[Column]) -> Vec {
+    // NOTE(review): `expect` panics if `Atom::from_str` rejects a column name;
+    // confirm every schema name is always a valid atom.
+    columns
+        .iter()
+        .map(|col| rustler::Atom::from_str(env, col.name()).expect("valid atom"))
+        .collect()
+}
+
+/// Builds a map term for one row: the key for column `i` is `column_atoms[i]`
+/// and the value is field `i` of `row` converted by `field_to_term`.
+///
+/// # Errors
+///
+/// Returns the first field conversion error, or `"failed to create map"` when
+/// the map term cannot be built.
+///
+/// # Panics
+///
+/// Indexes `column_atoms[i]` for every column, so `column_atoms` must be at
+/// least as long as `columns`.
+pub fn row_to_term<'a>(
+    env: Env<'a>,
+    row: &dyn InternalRow,
+    columns: &[Column],
+    column_atoms: &[rustler::Atom],
+) -> Result, String> {
+    let pairs: Vec<(Term<'a>, Term<'a>)> = columns
+        .iter()
+        .enumerate()
+        .map(|(i, col)| {
+            let key = column_atoms[i].encode(env);
+            let value = field_to_term(env, row, i, col.data_type())?;
+            Ok((key, value))
+        })
+        .collect::>()?;
+    Term::map_from_pairs(env, &pairs).map_err(|_| "failed to create map".to_string())
+}
+
+/// Converts the field at `pos` of `row` into a term according to `data_type`.
+///
+/// * A null field becomes the `nil` atom, whatever its declared type.
+/// * Boolean, integer, float, string and char fields are encoded directly.
+/// * `Bytes` / `Binary` fields are copied into a newly allocated binary.
+/// * `Date` and `Time` encode the value returned by `get_inner()`.
+/// * `Timestamp` and `TimestampLTz` encode a 2-tuple of milliseconds and
+///   nano-of-millisecond.
+/// * `Decimal` is encoded as its `to_string()` text.
+///
+/// # Errors
+///
+/// Returns the accessor's error as a string, or
+/// `"unsupported data type: …"` for any type not listed above.
+fn field_to_term<'a>(
+    env: Env<'a>,
+    row: &dyn InternalRow,
+    pos: usize,
+    data_type: &DataType,
+) -> Result, String> {
+    // Null check comes first so typed getters are never called on a null field.
+    if row.is_null_at(pos).map_err(|e| e.to_string())? {
+        return Ok(atoms::nil().encode(env));
+    }
+
+    match data_type {
+        DataType::Boolean(_) => {
+            let v = row.get_boolean(pos).map_err(|e| e.to_string())?;
+            Ok(v.encode(env))
+        }
+        DataType::TinyInt(_) => {
+            let v = row.get_byte(pos).map_err(|e| e.to_string())?;
+            Ok(v.encode(env))
+        }
+        DataType::SmallInt(_) => {
+            let v = row.get_short(pos).map_err(|e| e.to_string())?;
+            Ok(v.encode(env))
+        }
+        DataType::Int(_) => {
+            let v = row.get_int(pos).map_err(|e| e.to_string())?;
+            Ok(v.encode(env))
+        }
+        DataType::BigInt(_) => {
+            let v = row.get_long(pos).map_err(|e| e.to_string())?;
+            Ok(v.encode(env))
+        }
+        DataType::Float(_) => {
+            let v = row.get_float(pos).map_err(|e| e.to_string())?;
+            Ok(v.encode(env))
+        }
+        DataType::Double(_) => {
+            let v = row.get_double(pos).map_err(|e| e.to_string())?;
+            Ok(v.encode(env))
+        }
+        DataType::String(_) => {
+            let v = row.get_string(pos).map_err(|e| e.to_string())?;
+            Ok(v.encode(env))
+        }
+        DataType::Char(ct) => {
+            let v = row
+                .get_char(pos, ct.length() as usize)
+                .map_err(|e| e.to_string())?;
+            Ok(v.encode(env))
+        }
+        DataType::Bytes(_) => {
+            let v = row.get_bytes(pos).map_err(|e| e.to_string())?;
+            // Copy the bytes into a binary allocated for `env`.
+            let mut bin = NewBinary::new(env, v.len());
+            bin.as_mut_slice().copy_from_slice(v);
+            let binary: rustler::Binary = bin.into();
+            Ok(binary.encode(env))
+        }
+        DataType::Binary(bt) => {
+            let v = row
+                .get_binary(pos, bt.length())
+                .map_err(|e| e.to_string())?;
+            let mut bin = NewBinary::new(env, v.len());
+            bin.as_mut_slice().copy_from_slice(v);
+            let binary: rustler::Binary = bin.into();
+            Ok(binary.encode(env))
+        }
+        DataType::Date(_) => {
+            let v = row.get_date(pos).map_err(|e| e.to_string())?;
+            Ok(v.get_inner().encode(env))
+        }
+        DataType::Time(_) => {
+            let v = row.get_time(pos).map_err(|e| e.to_string())?;
+            Ok(v.get_inner().encode(env))
+        }
+        DataType::Timestamp(ts) => {
+            let v = row
+                .get_timestamp_ntz(pos, ts.precision())
+                .map_err(|e| e.to_string())?;
+            Ok((v.get_millisecond(), v.get_nano_of_millisecond()).encode(env))
+        }
+        DataType::TimestampLTz(ts) => {
+            let v = row
+                .get_timestamp_ltz(pos, ts.precision())
+                .map_err(|e| e.to_string())?;
+            Ok((v.get_epoch_millisecond(), v.get_nano_of_millisecond()).encode(env))
+        }
+        DataType::Decimal(dt) => {
+            let v = row
+                .get_decimal(pos, dt.precision() as usize, dt.scale() as usize)
+                .map_err(|e| e.to_string())?;
+            Ok(v.to_string().encode(env))
+        }
+        _ => Err(format!("unsupported data type: {data_type:?}")),
+    }
+}
+
+/// Decodes `values` (a list term) into a `GenericRow` with one field per column.
+///
+/// The list must have exactly `columns.len()` elements. An element that is the
+/// `nil` atom is skipped, leaving that field unset in the new row; every other
+/// element is written by `set_field_from_term` using the column's data type.
+///
+/// # Errors
+///
+/// Returns `"expected a list of values"` when `values` is not a list,
+/// `"expected N values, got M"` on a length mismatch, or the first field
+/// conversion error.
+pub fn term_to_row<'a>(
+    env: Env<'a>,
+    values: Term<'a>,
+    columns: &[Column],
+) -> Result, String> {
+    let list: Vec> = values
+        .decode()
+        .map_err(|_| "expected a list of values".to_string())?;
+    if list.len() != columns.len() {
+        return Err(format!(
+            "expected {} values, got {}",
+            columns.len(),
+            list.len()
+        ));
+    }
+
+    let mut row = GenericRow::new(columns.len());
+    for (i, (term, col)) in list.iter().zip(columns.iter()).enumerate() {
+        if term.is_atom()
+            && let Ok(atom) = term.decode::()
+            && atom == atoms::nil()
+        {
+            continue; // leave as null
+        }
+        set_field_from_term(env, &mut row, i, *term, col.data_type())?;
+    }
+    Ok(row)
+}
+
+/// Decodes `term` according to `data_type` and stores it in field `pos` of `row`.
+///
+/// Expected term shapes, as stated by the error messages below:
+/// integers for the integer types, `Date` (days since epoch) and `Time`
+/// (millis since midnight); a `{millis, nanos}` tuple for both timestamp
+/// types; a float for `Float` / `Double`; a string for `String`, `Char` and
+/// `Decimal`; a binary for `Bytes` / `Binary`.
+///
+/// # Errors
+///
+/// Returns a message describing the expected shape when decoding fails, the
+/// constructor's error for timestamps and decimals, or
+/// `"unsupported data type for writing: …"` for any other type.
+fn set_field_from_term<'a>(
+    _env: Env<'a>,
+    row: &mut GenericRow<'static>,
+    pos: usize,
+    term: Term<'a>,
+    data_type: &DataType,
+) -> Result<(), String> {
+    match data_type {
+        DataType::Boolean(_) => {
+            let v: bool = term.decode().map_err(|_| "expected boolean")?;
+            row.set_field(pos, v);
+        }
+        DataType::TinyInt(_) => {
+            let v: i8 = term
+                .decode()
+                .map_err(|_| "expected integer in range -128..127 for tinyint")?;
+            row.set_field(pos, v);
+        }
+        DataType::SmallInt(_) => {
+            let v: i16 = term
+                .decode()
+                .map_err(|_| "expected integer in range -32768..32767 for smallint")?;
+            row.set_field(pos, v);
+        }
+        DataType::Int(_) => {
+            let v: i32 = term.decode().map_err(|_| "expected integer")?;
+            row.set_field(pos, v);
+        }
+        DataType::BigInt(_) => {
+            let v: i64 = term.decode().map_err(|_| "expected integer")?;
+            row.set_field(pos, v);
+        }
+        DataType::Date(_) => {
+            let v: i32 = term
+                .decode()
+                .map_err(|_| "expected integer (days since epoch)")?;
+            row.set_field(pos, Date::new(v));
+        }
+        DataType::Time(_) => {
+            let v: i32 = term
+                .decode()
+                .map_err(|_| "expected integer (millis since midnight)")?;
+            row.set_field(pos, Time::new(v));
+        }
+        DataType::Timestamp(_) => {
+            let (millis, nanos): (i64, i32) = term
+                .decode()
+                .map_err(|_| "expected {millis, nanos} tuple for timestamp")?;
+            let ts = TimestampNtz::from_millis_nanos(millis, nanos).map_err(|e| e.to_string())?;
+            row.set_field(pos, ts);
+        }
+        DataType::TimestampLTz(_) => {
+            let (millis, nanos): (i64, i32) = term
+                .decode()
+                .map_err(|_| "expected {millis, nanos} tuple for timestamp_ltz")?;
+            let ts = TimestampLtz::from_millis_nanos(millis, nanos).map_err(|e| e.to_string())?;
+            row.set_field(pos, ts);
+        }
+        DataType::Float(_) => {
+            let v: f64 = term.decode().map_err(|_| "expected number for float")?;
+            // NOTE(review): `as f32` rounds and turns out-of-range doubles into
+            // infinity without an error, unlike the range-checked integer arms;
+            // confirm this is intended.
+            row.set_field(pos, v as f32);
+        }
+        DataType::Double(_) => {
+            let v: f64 = term.decode().map_err(|_| "expected number for double")?;
+            row.set_field(pos, v);
+        }
+        DataType::String(_) | DataType::Char(_) => {
+            let v: String = term.decode().map_err(|_| "expected string")?;
+            row.set_field(pos, v);
+        }
+        DataType::Decimal(dt) => {
+            // Decimals arrive as text and are parsed, then fitted to the
+            // column's precision and scale.
+            let v: String = term.decode().map_err(|_| "expected string for decimal")?;
+            let bd = bigdecimal::BigDecimal::from_str(&v)
+                .map_err(|e| format!("failed to parse decimal '{v}': {e}"))?;
+            let decimal = Decimal::from_big_decimal(bd, dt.precision(), dt.scale())
+                .map_err(|e| e.to_string())?;
+            row.set_field(pos, decimal);
+        }
+        DataType::Bytes(_) | DataType::Binary(_) => {
+            let bin: rustler::Binary = term.decode().map_err(|_| "expected binary")?;
+            row.set_field(pos, bin.as_slice().to_vec());
+        }
+        _ => return Err(format!("unsupported data type for writing: {data_type:?}")),
+    }
+    Ok(())
+}
diff --git a/fluss-rust/bindings/elixir/native/fluss_nif/src/schema.rs b/fluss-rust/bindings/elixir/native/fluss_nif/src/schema.rs
new file mode 100644
index 0000000000..5d61d29daf
--- /dev/null
+++ b/fluss-rust/bindings/elixir/native/fluss_nif/src/schema.rs
@@ -0,0 +1,110 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::atoms::to_nif_err;
+use fluss::metadata::{self, DataTypes, Schema, TableDescriptor};
+use rustler::{NifStruct, NifTaggedEnum, ResourceArc};
+
+/// Resource wrapper around a built `TableDescriptor`, registered below as a
+/// rustler `Resource`.
+pub struct TableDescriptorResource {
+    pub inner: TableDescriptor,
+}
+
+impl std::panic::RefUnwindSafe for TableDescriptorResource {}
+
+#[rustler::resource_impl]
+impl rustler::Resource for TableDescriptorResource {}
+
+/// Fluss data type for NIF interop.
+///
+/// Simple types map to atoms: `:int`, `:string`, etc.
+/// Parameterized types map to tuples: `{:decimal, 10, 2}`, `{:char, 20}`.
+#[derive(NifTaggedEnum)]
+pub enum DataType {
+    Boolean,
+    Tinyint,
+    Smallint,
+    Int,
+    Bigint,
+    Float,
+    Double,
+    String,
+    Bytes,
+    Date,
+    Time,
+    Timestamp,
+    TimestampLtz,
+    // Payload is (precision, scale), matching its use in `to_fluss_type`.
+    Decimal(u32, u32),
+    // Payload is the length.
+    Char(u32),
+    // Payload is the length.
+    Binary(usize),
+}
+
+/// Maps the NIF-side `DataType` onto the corresponding `metadata::DataType`
+/// by calling the matching `DataTypes` constructor, forwarding any
+/// precision / scale / length payload unchanged.
+fn to_fluss_type(dt: &DataType) -> metadata::DataType {
+    match dt {
+        DataType::Boolean => DataTypes::boolean(),
+        DataType::Tinyint => DataTypes::tinyint(),
+        DataType::Smallint => DataTypes::smallint(),
+        DataType::Int => DataTypes::int(),
+        DataType::Bigint => DataTypes::bigint(),
+        DataType::Float => DataTypes::float(),
+        DataType::Double => DataTypes::double(),
+        DataType::String => DataTypes::string(),
+        DataType::Bytes => DataTypes::bytes(),
+        DataType::Date => DataTypes::date(),
+        DataType::Time => DataTypes::time(),
+        DataType::Timestamp => DataTypes::timestamp(),
+        DataType::TimestampLtz => DataTypes::timestamp_ltz(),
+        DataType::Decimal(precision, scale) => DataTypes::decimal(*precision, *scale),
+        DataType::Char(length) => DataTypes::char(*length),
+        DataType::Binary(length) => DataTypes::binary(*length),
+    }
+}
+
+/// Decoded from `%Fluss.Schema{}` Elixir struct.
+#[derive(NifStruct)]
+#[module = "Fluss.Schema"]
+pub struct NifSchema {
+    /// Column name and type pairs, in declaration order.
+    pub columns: Vec<(String, DataType)>,
+    /// Primary-key entries; an empty list means no primary key is set.
+    pub primary_key: Vec,
+}
+
+/// NIF that builds a `TableDescriptor` from a decoded schema and wraps it in a
+/// `TableDescriptorResource`.
+///
+/// * `schema` - every column is added in order; the primary key is set only
+///   when `schema.primary_key` is non-empty.
+/// * `bucket_count` - when `Some`, passed to `distributed_by` with an empty
+///   key list; when `None`, `distributed_by` is not called.
+/// * `properties` - each pair is added with `property`.
+///
+/// # Errors
+///
+/// A failure from either `build()` call is converted with `to_nif_err`.
+#[rustler::nif]
+fn table_descriptor_new(
+    schema: NifSchema,
+    bucket_count: Option,
+    properties: Vec<(String, String)>,
+) -> Result, rustler::Error> {
+    let mut schema_builder = Schema::builder();
+    for (name, dt) in &schema.columns {
+        schema_builder = schema_builder.column(name, to_fluss_type(dt));
+    }
+    if !schema.primary_key.is_empty() {
+        schema_builder = schema_builder.primary_key(schema.primary_key);
+    }
+    let built_schema = schema_builder.build().map_err(to_nif_err)?;
+
+    let mut builder = TableDescriptor::builder().schema(built_schema);
+    if let Some(count) = bucket_count {
+        builder = builder.distributed_by(Some(count), vec![]);
+    }
+    for (key, value) in properties {
+        builder = builder.property(&key, &value);
+    }
+    let descriptor = builder.build().map_err(to_nif_err)?;
+    Ok(ResourceArc::new(TableDescriptorResource {
+        inner: descriptor,
+    }))
+}
diff --git a/fluss-rust/bindings/elixir/native/fluss_nif/src/table.rs b/fluss-rust/bindings/elixir/native/fluss_nif/src/table.rs
new file mode 100644
index 0000000000..d48ff7ab29
--- /dev/null
+++ b/fluss-rust/bindings/elixir/native/fluss_nif/src/table.rs
@@ -0,0 +1,89 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.
See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::async_nif;
+use crate::connection::ConnectionResource;
+use fluss::client::{FlussConnection, FlussTable, Metadata};
+use fluss::error::Error;
+use fluss::metadata::{Column, TableInfo, TablePath};
+use rustler::{Env, ResourceArc, Term};
+use std::sync::Arc;
+
+/// Holds the data needed to reconstruct FlussTable (which has a lifetime
+/// tied to FlussConnection). We store an `Arc` of the connection to keep
+/// it alive and reconstruct short-lived FlussTable instances on demand.
+pub struct TableResource {
+    pub connection: Arc,
+    pub metadata: Arc,
+    pub table_info: TableInfo,
+}
+
+impl std::panic::RefUnwindSafe for TableResource {}
+
+#[rustler::resource_impl]
+impl rustler::Resource for TableResource {}
+
+impl TableResource {
+    /// Returns the columns of the table's schema.
+    pub fn columns(&self) -> &[Column] {
+        self.table_info.schema.columns()
+    }
+
+    /// Builds a temporary `FlussTable` from the stored connection, metadata
+    /// and table info, passes it to `f`, and returns whatever `f` returns.
+    ///
+    /// The metadata handle and the table info are cloned on every call.
+    pub fn with_table(&self, f: impl FnOnce(&FlussTable<'_>) -> T) -> T {
+        let table = FlussTable::new(
+            &self.connection,
+            self.metadata.clone(),
+            self.table_info.clone(),
+        );
+        f(&table)
+    }
+}
+
+/// NIF that looks up `database_name.table_name` on the connection and hands
+/// the async work to `async_nif::spawn_task_with_result`.
+///
+/// On success the task yields a `TableResource` holding the connection handle
+/// plus clones of the table's metadata and table info; a `get_table` failure
+/// is propagated as the task's `Error`.
+#[rustler::nif]
+fn table_get<'a>(
+    env: Env<'a>,
+    conn: ResourceArc,
+    database_name: String,
+    table_name: String,
+) -> Term<'a> {
+    let conn_arc = conn.inner.clone();
+    async_nif::spawn_task_with_result(env, async move {
+        let path = TablePath::new(&database_name, &table_name);
+        // Only clones leave this scope, so `table` is dropped before
+        // `conn_arc` is moved into the resource below.
+        let (metadata, table_info) = {
+            let table = conn_arc.get_table(&path).await?;
+            (table.metadata().clone(), table.get_table_info().clone())
+        };
+        Ok::<_, Error>(ResourceArc::new(TableResource {
+            connection: conn_arc,
+            metadata,
+            table_info,
+        }))
+    })
+}
+
+/// NIF that reports whether the table's info declares a primary key.
+#[rustler::nif]
+fn table_has_primary_key(table: ResourceArc) -> bool {
+    table.table_info.has_primary_key()
+}
+
+/// NIF that returns the table's column names as owned strings, in schema order.
+#[rustler::nif]
+fn table_column_names(table: ResourceArc) -> Vec {
+    table
+        .columns()
+        .iter()
+        .map(|c| c.name().to_string())
+        .collect()
+}
diff --git
a/fluss-rust/bindings/elixir/native/fluss_nif/src/write_handle.rs b/fluss-rust/bindings/elixir/native/fluss_nif/src/write_handle.rs new file mode 100644 index 0000000000..08046660bf --- /dev/null +++ b/fluss-rust/bindings/elixir/native/fluss_nif/src/write_handle.rs @@ -0,0 +1,47 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+
+use crate::async_nif;
+use fluss::client::WriteResultFuture;
+use rustler::{Env, ResourceArc, Term};
+use std::sync::Mutex;
+
+/// Resource holding a pending `WriteResultFuture`.
+///
+/// The future sits in an `Option` behind a mutex so that
+/// `write_handle_wait` can take it out exactly once.
+pub struct WriteHandleResource {
+    inner: Mutex>,
+}
+
+impl std::panic::RefUnwindSafe for WriteHandleResource {}
+
+#[rustler::resource_impl]
+impl rustler::Resource for WriteHandleResource {}
+
+impl WriteHandleResource {
+    /// Wraps `future` so it can later be consumed by `write_handle_wait`.
+    pub fn new(future: WriteResultFuture) -> Self {
+        Self {
+            inner: Mutex::new(Some(future)),
+        }
+    }
+}
+
+/// NIF that consumes the handle's future and spawns it with
+/// `async_nif::spawn_task`.
+///
+/// The first call takes the future, leaving `None` behind; any later call on
+/// the same handle reports the client error `"WriteHandle already consumed"`.
+#[rustler::nif]
+fn write_handle_wait<'a>(env: Env<'a>, handle: ResourceArc) -> Term<'a> {
+    // The guard is a temporary, so the lock is released at the end of this
+    // statement. NOTE(review): `unwrap` panics if the mutex is poisoned.
+    let future = handle.inner.lock().unwrap().take();
+    match future {
+        Some(f) => async_nif::spawn_task(env, f),
+        None => async_nif::send_client_error(env, "WriteHandle already consumed"),
+    }
+}
diff --git a/fluss-rust/bindings/elixir/test/config_test.exs b/fluss-rust/bindings/elixir/test/config_test.exs
new file mode 100644
index 0000000000..f4b8a11ca1
--- /dev/null
+++ b/fluss-rust/bindings/elixir/test/config_test.exs
@@ -0,0 +1,228 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+defmodule Fluss.ConfigTest do
+  use ExUnit.Case, async: true
+
+  alias Fluss.Config
+
+  # Bootstrap address shared by every test; no connection is opened here.
+  @servers "localhost:9123"
+
+  test "new/1 creates config with bootstrap_servers; all other fields default to nil" do
+    assert Config.new(@servers) == %Config{bootstrap_servers: "localhost:9123"}
+  end
+
+  # Each setter test below builds a fresh config, applies one setter, and
+  # checks that the matching struct field holds the value that was passed in.
+
+  test "set_connect_timeout_ms/2 sets the connect timeout" do
+    config = Config.set_connect_timeout_ms(Config.new(@servers), 30_000)
+
+    assert config.connect_timeout_ms == 30_000
+  end
+
+  test "set_remote_file_download_thread_num/2 sets the download thread num" do
+    config = Config.set_remote_file_download_thread_num(Config.new(@servers), 4)
+
+    assert config.remote_file_download_thread_num == 4
+  end
+
+  test "set_scanner_log_fetch_max_bytes/2 sets the fetch max bytes" do
+    config = Config.set_scanner_log_fetch_max_bytes(Config.new(@servers), 16_777_216)
+
+    assert config.scanner_log_fetch_max_bytes == 16_777_216
+  end
+
+  test "set_scanner_log_fetch_max_bytes_for_bucket/2 sets the per-bucket fetch limit" do
+    config = Config.set_scanner_log_fetch_max_bytes_for_bucket(Config.new(@servers), 1_048_576)
+
+    assert config.scanner_log_fetch_max_bytes_for_bucket == 1_048_576
+  end
+
+  test "set_scanner_log_fetch_min_bytes/2 sets the fetch min bytes" do
+    config = Config.set_scanner_log_fetch_min_bytes(Config.new(@servers), 1)
+
+    assert config.scanner_log_fetch_min_bytes == 1
+  end
+
+  test "set_scanner_log_fetch_wait_max_time_ms/2 sets the max wait time" do
+    config = Config.set_scanner_log_fetch_wait_max_time_ms(Config.new(@servers), 500)
+
+    assert config.scanner_log_fetch_wait_max_time_ms == 500
+  end
+
+  test "set_scanner_log_max_poll_records/2 sets the max poll records" do
+    config = Config.set_scanner_log_max_poll_records(Config.new(@servers), 1000)
+
+    assert config.scanner_log_max_poll_records == 1000
+  end
+
+  test "set_scanner_remote_log_prefetch_num/2 sets the prefetch num" do
+    config = Config.set_scanner_remote_log_prefetch_num(Config.new(@servers), 2)
+
+    assert config.scanner_remote_log_prefetch_num == 2
+  end
+
+  test "set_scanner_remote_log_read_concurrency/2 sets the read concurrency" do
+    config = Config.set_scanner_remote_log_read_concurrency(Config.new(@servers), 4)
+
+    assert config.scanner_remote_log_read_concurrency == 4
+  end
+
+  test "set_security_protocol/2 sets the security protocol" do
+    config = Config.set_security_protocol(Config.new(@servers), "sasl")
+
+    assert config.security_protocol == "sasl"
+  end
+
+  test "set_security_sasl_mechanism/2 sets the SASL mechanism" do
+    config = Config.set_security_sasl_mechanism(Config.new(@servers), "PLAIN")
+
+    assert config.security_sasl_mechanism == "PLAIN"
+  end
+
+  test "set_security_sasl_username/2 sets the SASL username" do
+    config = Config.set_security_sasl_username(Config.new(@servers), "admin")
+
+    assert config.security_sasl_username == "admin"
+  end
+
+  test "set_security_sasl_password/2 sets the SASL password" do
+    config = Config.set_security_sasl_password(Config.new(@servers), "secret")
+
+    assert config.security_sasl_password == "secret"
+  end
+
+  test "inspect/1 redacts security_sasl_password when set" do
+    rendered =
+      @servers
+      |> Config.new()
+      |> Config.set_security_sasl_password("supersecret")
+      |> inspect()
+
+    # The secret itself must never appear; a placeholder takes its place.
+    refute rendered =~ "supersecret"
+    assert rendered =~ "[REDACTED]"
+  end
+
+  test "inspect/1 leaves nil security_sasl_password as nil" do
+    rendered = inspect(Config.new(@servers))
+
+    assert rendered =~ "security_sasl_password: nil"
+  end
+
+  test "set_writer_acks/2 sets the acks value" do
+    config = Config.set_writer_acks(Config.new(@servers), "all")
+
+    assert config.writer_acks == "all"
+  end
+
+  test "set_writer_bucket_no_key_assigner/2 sets a valid assigner" do
+    config = Config.set_writer_bucket_no_key_assigner(Config.new(@servers), :sticky)
+
+    assert config.writer_bucket_no_key_assigner == :sticky
+  end
+
+  test "set_writer_bucket_no_key_assigner/2 only accepts :sticky or :round_robin" do
+    base = Config.new(@servers)
+
+    # Any other atom matches no function clause.
+    assert_raise FunctionClauseError, fn ->
+      Config.set_writer_bucket_no_key_assigner(base, :custom)
+    end
+  end
+
+  test "set_writer_buffer_memory_size/2 sets the buffer memory size" do
+    config = Config.set_writer_buffer_memory_size(Config.new(@servers), 67_108_864)
+
+    assert config.writer_buffer_memory_size == 67_108_864
+  end
+
+  test "set_writer_buffer_wait_timeout_ms/2 sets the wait timeout" do
+    config = Config.set_writer_buffer_wait_timeout_ms(Config.new(@servers), 5_000)
+
+    assert config.writer_buffer_wait_timeout_ms == 5_000
+  end
+
+  test "set_writer_enable_idempotence/2 sets the idempotence flag" do
+    config = Config.set_writer_enable_idempotence(Config.new(@servers), false)
+
+    # Compared against `false` explicitly so an unset (nil) field would fail.
+    assert config.writer_enable_idempotence == false
+  end
+
+  test "set_writer_max_inflight_requests_per_bucket/2 sets the inflight limit" do
+    config = Config.set_writer_max_inflight_requests_per_bucket(Config.new(@servers), 3)
+
+    assert config.writer_max_inflight_requests_per_bucket == 3
+  end
+
+  test "set_writer_request_max_size/2 sets the request max size" do
+    config = Config.set_writer_request_max_size(Config.new(@servers), 2_097_152)
+
+    assert config.writer_request_max_size == 2_097_152
+  end
+
+  test "set_writer_retries/2 sets the retry count" do
+    config = Config.set_writer_retries(Config.new(@servers), 5)
+
+    assert config.writer_retries == 5
+  end
+
+  test "setters chain correctly" do
+    with_acks = Config.set_writer_acks(Config.new(@servers), "all")
+    with_retries = Config.set_writer_retries(with_acks, 3)
+    config = Config.set_writer_bucket_no_key_assigner(with_retries, :round_robin)
+
+    # Earlier settings must survive the later setters.
+    assert config.writer_acks == "all"
+    assert config.writer_retries == 3
+    assert config.writer_bucket_no_key_assigner == :round_robin
+  end
+end
diff --git a/fluss-rust/bindings/elixir/test/error_test.exs b/fluss-rust/bindings/elixir/test/error_test.exs
new file mode 100644
index 0000000000..d6d4017597
--- /dev/null
+++ b/fluss-rust/bindings/elixir/test/error_test.exs
@@ -0,0 +1,87 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+defmodule Fluss.ErrorTest do
+  use ExUnit.Case, async: true
+
+  # Codes for which `Fluss.Error.retriable?/1` is expected to return true.
+  @retriable_codes [
+    :network_exception,
+    :corrupt_message,
+    :schema_not_exist,
+    :log_storage_exception,
+    :kv_storage_exception,
+    :not_leader_or_follower,
+    :corrupt_record_exception,
+    :unknown_table_or_bucket_exception,
+    :request_time_out,
+    :storage_exception,
+    :not_enough_replicas_after_append_exception,
+    :not_enough_replicas_exception,
+    :leader_not_available_exception
+  ]
+
+  # Codes for which `Fluss.Error.retriable?/1` is expected to return false.
+  @non_retriable_codes [
+    :client_error,
+    :unknown_server_error,
+    :none,
+    :table_not_exist,
+    :authenticate_exception,
+    :authorization_exception,
+    :record_too_large_exception,
+    :deletion_disabled_exception,
+    :invalid_coordinator_exception,
+    :fenced_leader_epoch_exception,
+    :fenced_tiering_epoch_exception,
+    :retriable_authenticate_exception
+  ]
+
+  # Minimal error struct carrying only the code under test.
+  defp error_with(code) do
+    %Fluss.Error{code: code, error_code: 0, message: ""}
+  end
+
+  test "Exception.message/1 formats '[]: '" do
+    rendered =
+      Exception.message(%Fluss.Error{
+        code: :network_exception,
+        error_code: 1,
+        message: "disconnected"
+      })
+
+    assert rendered == "Fluss error [network_exception]: disconnected"
+  end
+
+  test "retriable?/1 returns true for transient protocol codes" do
+    Enum.each(@retriable_codes, fn code ->
+      assert Fluss.Error.retriable?(error_with(code)), "expected #{code} to be retriable"
+    end)
+  end
+
+  test "retriable?/1 returns false for :client_error and permanent codes" do
+    Enum.each(@non_retriable_codes, fn code ->
+      refute Fluss.Error.retriable?(error_with(code)), "expected #{code} to not be retriable"
+    end)
+  end
+
+  describe "NIF error surface" do
+    test "unreachable server returns %Fluss.Error{code: :network_exception, error_code: 1}" do
+      outcome =
+        "127.0.0.1:1"
+        |> Fluss.Config.new()
+        |> Fluss.Connection.new()
+
+      assert {:error, %Fluss.Error{code: :network_exception, error_code: 1}} = outcome
+    end
+
+    test "bang variant raises %Fluss.Error{}" do
+      unreachable = Fluss.Config.new("127.0.0.1:1")
+
+      assert_raise Fluss.Error, ~r/\[network_exception\]/, fn ->
+        Fluss.Connection.new!(unreachable)
+      end
+    end
+  end
+end
diff --git a/fluss-rust/bindings/elixir/test/fluss_test.exs b/fluss-rust/bindings/elixir/test/fluss_test.exs
new file mode 100644
index 0000000000..3eee273482
--- /dev/null
+++ b/fluss-rust/bindings/elixir/test/fluss_test.exs
@@ -0,0 +1,66 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+defmodule FlussTest do
+  use ExUnit.Case
+
+  # The descriptor tests carry no assertions: they pass as long as
+  # `Fluss.TableDescriptor.new!/1,2` does not raise.
+  describe "TableDescriptor" do
+    test "creates descriptor from schema" do
+      schema = Fluss.Schema.column(Fluss.Schema.new(), "id", :int)
+
+      Fluss.TableDescriptor.new!(schema)
+    end
+
+    test "creates descriptor with bucket count" do
+      schema = Fluss.Schema.column(Fluss.Schema.new(), "id", :int)
+
+      Fluss.TableDescriptor.new!(schema, bucket_count: 3)
+    end
+
+    test "accepts all simple data types" do
+      simple_columns = [
+        {"a", :boolean},
+        {"b", :tinyint},
+        {"c", :smallint},
+        {"d", :int},
+        {"e", :bigint},
+        {"f", :float},
+        {"g", :double},
+        {"h", :string},
+        {"i", :bytes},
+        {"j", :date},
+        {"k", :time},
+        {"l", :timestamp},
+        {"m", :timestamp_ltz}
+      ]
+
+      # Columns are added in list order, one `column/3` call per entry.
+      schema =
+        Enum.reduce(simple_columns, Fluss.Schema.new(), fn {name, type}, acc ->
+          Fluss.Schema.column(acc, name, type)
+        end)
+
+      Fluss.TableDescriptor.new!(schema)
+    end
+
+    test "accepts parameterized data types" do
+      parameterized_columns = [
+        {"amount", {:decimal, 10, 2}},
+        {"code", {:char, 5}},
+        {"data", {:binary, 16}}
+      ]
+
+      schema =
+        Enum.reduce(parameterized_columns, Fluss.Schema.new(), fn {name, type}, acc ->
+          Fluss.Schema.column(acc, name, type)
+        end)
+
+      Fluss.TableDescriptor.new!(schema)
+    end
+  end
+
+  describe "earliest_offset/0" do
+    test "returns -2" do
+      offset = Fluss.earliest_offset()
+
+      assert offset == -2
+    end
+  end
+end
diff --git a/fluss-rust/bindings/elixir/test/integration/log_table_test.exs b/fluss-rust/bindings/elixir/test/integration/log_table_test.exs
new file mode 100644
index 0000000000..b3041b9587
--- /dev/null
+++ b/fluss-rust/bindings/elixir/test/integration/log_table_test.exs
@@ -0,0 +1,413 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.
The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+defmodule Fluss.Integration.LogTableTest do
+  use ExUnit.Case, async: false
+
+  alias Fluss.Test.Cluster
+
+  @moduletag :integration
+
+  @database "fluss"
+
+  # Starts (or reuses) a cluster once for the whole module and shares the
+  # connection, admin handle and config with every test through the context.
+  setup_all do
+    case Cluster.ensure_started() do
+      {:ok, servers} ->
+        config = Fluss.Config.new(servers)
+
+        # Wait for cluster to be fully ready (connection + admin working)
+        {conn, admin} = connect_with_retry(config, 90)
+
+        %{conn: conn, admin: admin, config: config}
+
+      {:error, reason} ->
+        raise "Failed to start Fluss cluster: #{reason}"
+    end
+  end
+
+  describe "append and scan" do
+    test "append rows and scan with log scanner", %{conn: conn, admin: admin} do
+      table_name = "ex_test_append_and_scan_#{:rand.uniform(100_000)}"
+      cleanup_table(admin, table_name)
+
+      schema =
+        Fluss.Schema.new()
+        |> Fluss.Schema.column("c1", :int)
+        |> Fluss.Schema.column("c2", :string)
+
+      descriptor = Fluss.TableDescriptor.new!(schema)
+      :ok = Fluss.Admin.create_table(admin, @database, table_name, descriptor, false)
+
+      table = Fluss.Table.get!(conn, @database, table_name)
+      writer = Fluss.AppendWriter.new!(table)
+
+      # Append 6 rows
+      for {c1, c2} <- [{1, "a1"}, {2, "a2"}, {3, "a3"}, {4, "a4"}, {5, "a5"}, {6, "a6"}] do
+        {:ok, _} = Fluss.AppendWriter.append(writer, [c1, c2])
+      end
+
+      :ok = Fluss.AppendWriter.flush(writer)
+
+      # Scan all records
+      scanner = Fluss.LogScanner.new!(table)
+      :ok = Fluss.LogScanner.subscribe(scanner, 0, Fluss.earliest_offset())
+
+      records = poll_records(scanner, 6)
+
+      assert length(records) == 6
+
+      sorted = Enum.sort_by(records, fn r -> r[:row][:c1] end)
+
+      for {record, i} <- Enum.with_index(sorted, 1) do
+        assert record[:row][:c1] == i
+        assert record[:row][:c2] == "a#{i}"
+        assert record[:change_type] == :append_only
+      end
+
+      # Unsubscribe should not error
+      :ok = Fluss.LogScanner.unsubscribe(scanner, 0)
+
+      cleanup_table(admin, table_name)
+    end
+
+    test "append with nil values", %{conn: conn, admin: admin} do
+      table_name = "ex_test_append_nil_#{:rand.uniform(100_000)}"
+      cleanup_table(admin, table_name)
+
+      schema =
+        Fluss.Schema.new()
+        |> Fluss.Schema.column("id", :int)
+        |> Fluss.Schema.column("name", :string)
+
+      descriptor = Fluss.TableDescriptor.new!(schema)
+      :ok = Fluss.Admin.create_table(admin, @database, table_name, descriptor, false)
+
+      table = Fluss.Table.get!(conn, @database, table_name)
+      writer = Fluss.AppendWriter.new!(table)
+
+      {:ok, _} = Fluss.AppendWriter.append(writer, [1, nil])
+      {:ok, _} = Fluss.AppendWriter.append(writer, [2, "present"])
+      :ok = Fluss.AppendWriter.flush(writer)
+
+      scanner = Fluss.LogScanner.new!(table)
+      :ok = Fluss.LogScanner.subscribe(scanner, 0, Fluss.earliest_offset())
+
+      records = poll_records(scanner, 2)
+      assert length(records) == 2
+
+      sorted = Enum.sort_by(records, fn r -> r[:row][:id] end)
+      assert Enum.at(sorted, 0)[:row][:name] == nil
+      assert Enum.at(sorted, 1)[:row][:name] == "present"
+
+      cleanup_table(admin, table_name)
+    end
+  end
+
+  describe "multiple data types" do
+    test "tinyint, smallint, int, bigint, float, double, string, boolean", %{
+      conn: conn,
+      admin: admin
+    } do
+      table_name = "ex_test_data_types_#{:rand.uniform(100_000)}"
+      cleanup_table(admin, table_name)
+
+      schema =
+        Fluss.Schema.new()
+        |> Fluss.Schema.column("a_tinyint", :tinyint)
+        |> Fluss.Schema.column("b_smallint", :smallint)
+        |> Fluss.Schema.column("c_int", :int)
+        |> Fluss.Schema.column("d_bigint", :bigint)
+        |> Fluss.Schema.column("e_float", :float)
+        |> Fluss.Schema.column("f_double", :double)
+        |> Fluss.Schema.column("g_string", :string)
+        |> Fluss.Schema.column("h_bool", :boolean)
+
+      descriptor = Fluss.TableDescriptor.new!(schema)
+      :ok = Fluss.Admin.create_table(admin, @database, table_name, descriptor, false)
+
+      table = Fluss.Table.get!(conn, @database, table_name)
+      writer = Fluss.AppendWriter.new!(table)
+
+      {:ok, _} =
+        Fluss.AppendWriter.append(writer, [
+          127,
+          32_000,
+          42,
+          1_000_000_000_000,
+          3.14,
+          2.718281828,
+          "hello",
+          true
+        ])
+
+      {:ok, _} =
+        Fluss.AppendWriter.append(writer, [-128, -32_000, -1, -999, 0.0, -1.5, "", false])
+
+      :ok = Fluss.AppendWriter.flush(writer)
+
+      scanner = Fluss.LogScanner.new!(table)
+      :ok = Fluss.LogScanner.subscribe(scanner, 0, Fluss.earliest_offset())
+
+      records = poll_records(scanner, 2)
+      assert length(records) == 2
+
+      # Sorting by c_int puts the second appended row (-1) before the first (42).
+      sorted = Enum.sort_by(records, fn r -> r[:row][:c_int] end)
+      row1 = Enum.at(sorted, 0)[:row]
+      row2 = Enum.at(sorted, 1)[:row]
+
+      assert row1[:a_tinyint] == -128
+      assert row1[:b_smallint] == -32_000
+      assert row1[:c_int] == -1
+      assert row1[:d_bigint] == -999
+      # The test name promises float/double coverage, so check them as well.
+      # Compared with a tolerance because the float column is presumably
+      # single precision and will not round-trip 3.14 exactly.
+      assert_in_delta row1[:e_float], 0.0, 1.0e-6
+      assert_in_delta row1[:f_double], -1.5, 1.0e-12
+      assert row1[:g_string] == ""
+      assert row1[:h_bool] == false
+
+      assert row2[:a_tinyint] == 127
+      assert row2[:b_smallint] == 32_000
+      assert row2[:c_int] == 42
+      assert row2[:d_bigint] == 1_000_000_000_000
+      assert_in_delta row2[:e_float], 3.14, 1.0e-6
+      assert_in_delta row2[:f_double], 2.718281828, 1.0e-12
+      assert row2[:g_string] == "hello"
+      assert row2[:h_bool] == true
+
+      cleanup_table(admin, table_name)
+    end
+  end
+
+  describe "subscribe_buckets" do
+    test "subscribe to multiple buckets at once", %{conn: conn, admin: admin} do
+      table_name = "ex_test_subscribe_buckets_#{:rand.uniform(100_000)}"
+      cleanup_table(admin, table_name)
+
+      schema =
+        Fluss.Schema.new()
+        |> Fluss.Schema.column("id", :int)
+        |> Fluss.Schema.column("val", :string)
+
+      descriptor = Fluss.TableDescriptor.new!(schema, bucket_count: 3)
+      :ok = Fluss.Admin.create_table(admin, @database, table_name, descriptor, false)
+
+      table = Fluss.Table.get!(conn, @database, table_name)
+      writer = Fluss.AppendWriter.new!(table)
+
+      for i <- 1..9 do
+        {:ok, _} = Fluss.AppendWriter.append(writer, [i, "v#{i}"])
+      end
+
+      :ok = Fluss.AppendWriter.flush(writer)
+
+      scanner = Fluss.LogScanner.new!(table)
+      earliest = Fluss.earliest_offset()
+
+      :ok =
+        Fluss.LogScanner.subscribe_buckets(scanner, [
+          {0, earliest},
+          {1, earliest},
+          {2, earliest}
+        ])
+
+      records = poll_records(scanner, 9)
+      assert length(records) == 9
+
+      ids = records |> Enum.map(fn r -> r[:row][:id] end) |> Enum.sort()
+      assert ids == Enum.to_list(1..9)
+
+      cleanup_table(admin, table_name)
+    end
+  end
+
+  describe "admin operations" do
+    test "create and drop database", %{admin: admin} do
+      db_name = "ex_test_db_#{:rand.uniform(100_000)}"
+      :ok = Fluss.Admin.create_database(admin, db_name, true)
+
+      {:ok, databases} = Fluss.Admin.list_databases(admin)
+      assert db_name in databases
+
+      :ok = Fluss.Admin.drop_database(admin, db_name, true)
+
+      # Verify the drop actually took effect, not just that the call returned :ok.
+      {:ok, remaining} = Fluss.Admin.list_databases(admin)
+      refute db_name in remaining
+    end
+
+    test "list tables", %{admin: admin} do
+      table_name = "ex_test_list_tables_#{:rand.uniform(100_000)}"
+      cleanup_table(admin, table_name)
+
+      schema =
+        Fluss.Schema.new()
+        |> Fluss.Schema.column("id", :int)
+
+      descriptor = Fluss.TableDescriptor.new!(schema)
+      :ok = Fluss.Admin.create_table(admin, @database, table_name, descriptor, false)
+
+      {:ok, tables} = Fluss.Admin.list_tables(admin, @database)
+      assert table_name in tables
+
+      cleanup_table(admin, table_name)
+    end
+
+    test "table metadata", %{conn: conn, admin: admin} do
+      table_name = "ex_test_table_meta_#{:rand.uniform(100_000)}"
+      cleanup_table(admin, table_name)
+
+      schema =
+        Fluss.Schema.new()
+        |> Fluss.Schema.column("id", :int)
+        |> Fluss.Schema.column("name", :string)
+
+      descriptor = Fluss.TableDescriptor.new!(schema)
+      :ok = Fluss.Admin.create_table(admin, @database, table_name, descriptor, false)
+
+      table = Fluss.Table.get!(conn, @database, table_name)
+      assert Fluss.Table.has_primary_key?(table) == false
+      assert Fluss.Table.column_names(table) == ["id", "name"]
+
+      cleanup_table(admin, table_name)
+    end
+  end
+
+  describe "scan from offset" do
+    test "subscribe from specific offset skips earlier records", %{conn: conn, admin: admin} do
+      table_name = "ex_test_scan_offset_#{:rand.uniform(100_000)}"
+      cleanup_table(admin, table_name)
+
+      schema =
+        Fluss.Schema.new()
+        |> Fluss.Schema.column("id", :int)
+
+      descriptor = Fluss.TableDescriptor.new!(schema)
+      :ok = Fluss.Admin.create_table(admin, @database, table_name, descriptor, false)
+
+      table = Fluss.Table.get!(conn, @database, table_name)
+      writer = Fluss.AppendWriter.new!(table)
+
+      for i <- 1..5 do
+        {:ok, _} = Fluss.AppendWriter.append(writer, [i])
+      end
+
+      :ok = Fluss.AppendWriter.flush(writer)
+
+      # Subscribe from offset 3, should skip first 3 records
+      scanner = Fluss.LogScanner.new!(table)
+      :ok = Fluss.LogScanner.subscribe(scanner, 0, 3)
+
+      records = poll_records(scanner, 2)
+      assert length(records) == 2
+
+      ids = records |> Enum.map(fn r -> r[:row][:id] end) |> Enum.sort()
+      assert ids == [4, 5]
+
+      cleanup_table(admin, table_name)
+    end
+  end
+
+  describe "multiple flushes" do
+    test "append, flush, append more, flush, scan all", %{conn: conn, admin: admin} do
+      table_name = "ex_test_multi_flush_#{:rand.uniform(100_000)}"
+      cleanup_table(admin, table_name)
+
+      schema =
+        Fluss.Schema.new()
+        |> Fluss.Schema.column("id", :int)
+        |> Fluss.Schema.column("batch", :string)
+
+      descriptor = Fluss.TableDescriptor.new!(schema)
+      :ok = Fluss.Admin.create_table(admin, @database, table_name, descriptor, false)
+
+      table = Fluss.Table.get!(conn, @database, table_name)
+      writer = Fluss.AppendWriter.new!(table)
+
+      # First batch
+      {:ok, _} = Fluss.AppendWriter.append(writer, [1, "first"])
+      {:ok, _} = Fluss.AppendWriter.append(writer, [2, "first"])
+      :ok = Fluss.AppendWriter.flush(writer)
+
+      # Second batch
+      {:ok, _} = Fluss.AppendWriter.append(writer, [3, "second"])
+      {:ok, _} = Fluss.AppendWriter.append(writer, [4, "second"])
+      :ok = Fluss.AppendWriter.flush(writer)
+
+      scanner = Fluss.LogScanner.new!(table)
+      :ok = Fluss.LogScanner.subscribe(scanner, 0, Fluss.earliest_offset())
+
+      records = poll_records(scanner, 4)
+      assert length(records) == 4
+
+      sorted = Enum.sort_by(records, fn r -> r[:row][:id] end)
+      assert Enum.at(sorted, 0)[:row][:batch] == "first"
+      assert Enum.at(sorted, 1)[:row][:batch] == "first"
+      assert Enum.at(sorted, 2)[:row][:batch] == "second"
+      assert Enum.at(sorted, 3)[:row][:batch] == "second"
+
+      cleanup_table(admin, table_name)
+    end
+  end
+
+  # Polls `scanner` until at least `expected_count` records were collected or
+  # `timeout_ms` elapsed. Returns whatever was collected, so callers assert on
+  # the length themselves.
+  defp poll_records(scanner, expected_count, timeout_ms \\ 10_000) do
+    deadline = System.monotonic_time(:millisecond) + timeout_ms
+    do_poll(scanner, expected_count, deadline, [])
+  end
+
+  # Enough records gathered: stop polling.
+  defp do_poll(_scanner, expected_count, _deadline, acc) when length(acc) >= expected_count do
+    acc
+  end
+
+  # Issues one poll and waits for its result message in this process's mailbox.
+  # Poll errors are reported as warnings and the loop keeps going until the
+  # deadline; a missing reply ends the loop with the records gathered so far.
+  defp do_poll(scanner, expected_count, deadline, acc) do
+    remaining = deadline - System.monotonic_time(:millisecond)
+
+    if remaining <= 0 do
+      acc
+    else
+      :ok = Fluss.LogScanner.poll(scanner, min(5_000, remaining))
+
+      receive do
+        {:fluss_records, records} ->
+          do_poll(scanner, expected_count, deadline, acc ++ records)
+
+        {:fluss_poll_error, reason} ->
+          IO.warn("poll error during test: #{inspect(reason)}")
+          do_poll(scanner, expected_count, deadline, acc)
+      after
+        min(6_000, remaining) ->
+          acc
+      end
+    end
+  end
+
+  # Drops the table in @database; the result is deliberately not matched, so
+  # this is safe to call both before and after a test.
+  defp cleanup_table(admin, table_name) do
+    Fluss.Admin.drop_table(admin, @database, table_name, true)
+  end
+
+  # Retries connecting for up to `timeout_s` seconds and returns {conn, admin}.
+  # Raises once the deadline passes, including the last error seen.
+  defp connect_with_retry(config, timeout_s) do
+    deadline = System.monotonic_time(:second) + timeout_s
+    do_connect_retry(config, deadline, nil)
+  end
+
+  defp do_connect_retry(config, deadline, last_error) do
+    if System.monotonic_time(:second) >= deadline do
+      raise "Could not connect to Fluss cluster: #{inspect(last_error)}"
+    end
+
+    try do
+      conn = Fluss.Connection.new!(config)
+      admin = Fluss.Admin.new!(conn)
+      # Listing databases proves the admin path works, not just the connection.
+      {:ok, _databases} = Fluss.Admin.list_databases(admin)
+      {conn, admin}
+    rescue
+      e ->
+        Process.sleep(2_000)
+        do_connect_retry(config, deadline, e)
+    end
+  end
+end
diff --git a/fluss-rust/bindings/elixir/test/support/cluster.ex b/fluss-rust/bindings/elixir/test/support/cluster.ex
new file mode 100644
index 0000000000..40f0f68d35
--- /dev/null
+++ b/fluss-rust/bindings/elixir/test/support/cluster.ex
@@ -0,0 +1,130 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+defmodule Fluss.Test.Cluster do
+  @moduledoc false
+
+  # Shells out to the `fluss-test-cluster` CLI (from `crates/fluss-test-cluster`),
+  # the same binary used by the Python and C++ integration tests.
+
+  @cluster_name "shared-test"
+  @cluster_json_prefix "CLUSTER_JSON: "
+
+  # Returns {:ok, bootstrap_servers} or {:error, reason}.
+  # When FLUSS_BOOTSTRAP_SERVERS is set its value is used as-is and no cluster
+  # is started; otherwise the CLI is asked to start one.
+  def ensure_started do
+    case System.get_env("FLUSS_BOOTSTRAP_SERVERS") do
+      nil -> start_cluster()
+      servers -> {:ok, servers}
+    end
+  end
+
+  # Best-effort shutdown: always returns :ok. Does nothing when
+  # FLUSS_BOOTSTRAP_SERVERS is set or when the CLI binary cannot be located.
+  def stop do
+    if System.get_env("FLUSS_BOOTSTRAP_SERVERS") do
+      :ok
+    else
+      case find_cli_binary() do
+        {:ok, cli} ->
+          System.cmd(cli, ["stop", "--name", @cluster_name], stderr_to_stdout: true)
+          :ok
+
+        {:error, _} ->
+          :ok
+      end
+    end
+  end
+
+  # Runs `<cli> start --sasl --name <cluster>` and extracts the bootstrap
+  # address from its output.
+  defp start_cluster do
+    with {:ok, cli} <- find_cli_binary(),
+         {output, 0} <-
+           System.cmd(cli, ["start", "--sasl", "--name", @cluster_name], stderr_to_stdout: true),
+         {:ok, bootstrap} <- parse_cluster_json(output) do
+      {:ok, bootstrap}
+    else
+      # Non-zero exit from the CLI: {output, exit_code}.
+      {output, code} when is_binary(output) ->
+        {:error, "fluss-test-cluster start failed (exit #{code}):\n#{output}"}
+
+      {:error, _} = err ->
+        err
+    end
+  end
+
+  # Prefers an explicit FLUSS_TEST_CLUSTER_BIN; an unset or empty value falls
+  # back to looking in the cargo workspace's target directory.
+  defp find_cli_binary do
+    case System.get_env("FLUSS_TEST_CLUSTER_BIN") do
+      bin when is_binary(bin) and bin != "" ->
+        if File.regular?(bin),
+          do: {:ok, bin},
+          else: {:error, "FLUSS_TEST_CLUSTER_BIN=#{bin} does not exist"}
+
+      _ ->
+        locate_via_cargo()
+    end
+  end
+
+  # Asks cargo for the workspace manifest path and searches next to it.
+  defp locate_via_cargo do
+    # System.cmd/3 raises ErlangError (:enoent) when the executable is missing.
+    # Resolve cargo first so a missing toolchain surfaces as {:error, reason}
+    # like every other failure here (and so stop/0 stays best-effort).
+    case System.find_executable("cargo") do
+      nil ->
+        {:error,
+         "cargo not found on PATH. Install cargo or set FLUSS_TEST_CLUSTER_BIN to a built fluss-test-cluster binary"}
+
+      cargo ->
+        case System.cmd(
+               cargo,
+               ["locate-project", "--workspace", "--message-format", "plain"],
+               stderr_to_stdout: true
+             ) do
+          {output, 0} ->
+            output |> String.trim() |> Path.dirname() |> find_binary_in_target()
+
+          {output, code} ->
+            {:error, "cargo locate-project failed (exit #{code}): #{output}"}
+        end
+    end
+  end
+
+  # Checks target/debug before target/release; the first existing binary wins.
+  defp find_binary_in_target(root) do
+    Enum.find_value(
+      ["debug", "release"],
+      {:error, "fluss-test-cluster binary not found. Run: cargo build -p fluss-test-cluster"},
+      &check_binary(root, &1)
+    )
+  end
+
+  # Returns {:ok, path} when the binary exists for `profile`, otherwise nil so
+  # Enum.find_value/3 moves on to the next profile.
+  defp check_binary(root, profile) do
+    path = Path.join([root, "target", profile, "fluss-test-cluster"])
+    if File.regular?(path), do: {:ok, path}, else: nil
+  end
+
+  # Scans the CLI output line by line for the first line that carries the
+  # CLUSTER_JSON prefix and a parseable bootstrap address.
+  defp parse_cluster_json(output) do
+    output
+    |> String.split("\n", trim: true)
+    |> Enum.find_value(
+      {:error, "No #{@cluster_json_prefix} token in output:\n#{output}"},
+      &extract_bootstrap/1
+    )
+  end
+
+  # Returns {:ok, bootstrap} for a matching line, nil for any other line.
+  defp extract_bootstrap(line) do
+    case String.split(line, @cluster_json_prefix, parts: 2) do
+      [_, json] ->
+        case decode_bootstrap(json) do
+          {:ok, bootstrap} -> {:ok, bootstrap}
+          _ -> nil
+        end
+
+      _ ->
+        nil
+    end
+  end
+
+  # Minimal JSON extractor for `bootstrap_servers`: avoids adding a JSON dep just for tests.
+  defp decode_bootstrap(json) do
+    case Regex.run(~r/"bootstrap_servers"\s*:\s*"([^"]+)"/, json) do
+      [_, servers] -> {:ok, servers}
+      _ -> {:error, "no bootstrap_servers in: #{json}"}
+    end
+  end
+end
diff --git a/fluss-rust/bindings/elixir/test/test_helper.exs b/fluss-rust/bindings/elixir/test/test_helper.exs
new file mode 100644
index 0000000000..b15b1f44a8
--- /dev/null
+++ b/fluss-rust/bindings/elixir/test/test_helper.exs
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.
See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Integration tests are excluded unless explicitly requested, since they need
+# a Docker cluster.
+# Run with: mix test --include integration
+ExUnit.start(exclude: [:integration])
+
+# Once the suite has finished, stop the Docker containers (matches Python's
+# pytest_unconfigure). Nothing is stopped when FLUSS_BOOTSTRAP_SERVERS is set.
+ExUnit.after_suite(fn _suite_result ->
+  case System.get_env("FLUSS_BOOTSTRAP_SERVERS") do
+    nil -> Fluss.Test.Cluster.stop()
+    _bootstrap_servers -> nil
+  end
+end)
diff --git a/fluss-rust/bindings/python/Cargo.toml b/fluss-rust/bindings/python/Cargo.toml
new file mode 100644
index 0000000000..30ac0469bc
--- /dev/null
+++ b/fluss-rust/bindings/python/Cargo.toml
@@ -0,0 +1,40 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+ +[package] +name = "fluss_python" +edition.workspace = true +version.workspace = true +license.workspace = true +rust-version.workspace = true + +[lib] +name = "fluss" +crate-type = ["cdylib"] + +[dependencies] +pyo3 = { version = "0.26.0", features = ["extension-module", "generate-import-lib"] } +fluss = { workspace = true, features = ["storage-all"] } +tokio = { workspace = true } +arrow = { workspace = true } +arrow-pyarrow = "57.0.0" +arrow-schema = "57.0.0" +arrow-array = "57.0.0" +pyo3-async-runtimes = { version = "0.26.0", features = ["tokio-runtime"] } +jiff = { workspace = true } +bigdecimal = "0.4" +indexmap = "2" diff --git a/fluss-rust/bindings/python/DEPENDENCIES.rust.tsv b/fluss-rust/bindings/python/DEPENDENCIES.rust.tsv new file mode 100644 index 0000000000..bc7b9b78f2 --- /dev/null +++ b/fluss-rust/bindings/python/DEPENDENCIES.rust.tsv @@ -0,0 +1,310 @@ +crate Apache-2.0 Apache-2.0 WITH LLVM-exception BSD-2-Clause BSD-3-Clause BSL-1.0 CC0-1.0 CDLA-Permissive-2.0 ISC LGPL-2.1-or-later MIT Unicode-3.0 Unlicense Zlib +ahash@0.8.12 X X +aho-corasick@1.1.4 X X +android_system_properties@0.1.5 X X +anstream@1.0.0 X X +anstyle@1.0.14 X X +anstyle-parse@1.0.0 X X +anstyle-query@1.1.5 X X +anstyle-wincon@3.0.11 X X +anyhow@1.0.102 X X +arrow@57.3.0 X +arrow-arith@57.3.0 X +arrow-array@57.3.0 X +arrow-buffer@57.3.0 X +arrow-cast@57.3.0 X +arrow-csv@57.3.0 X +arrow-data@57.3.0 X +arrow-ipc@57.3.0 X +arrow-json@57.3.0 X +arrow-ord@57.3.0 X +arrow-pyarrow@57.3.0 X +arrow-row@57.3.0 X +arrow-schema@57.3.0 X +arrow-select@57.3.0 X +arrow-string@57.3.0 X +async-trait@0.1.89 X X +atoi@2.0.0 X +atomic-waker@1.1.2 X X +autocfg@1.5.0 X X +backon@1.6.0 X +base64@0.22.1 X X +bigdecimal@0.4.10 X X +bitflags@2.11.0 X X +bitvec@1.0.1 X +block-buffer@0.10.4 X X +bumpalo@3.20.2 X X +byteorder@1.5.0 X X +bytes@1.11.1 X +cc@1.2.57 X X +cfg-if@1.0.4 X X +chrono@0.4.44 X X +clap@4.6.0 X X +clap_builder@4.6.0 X X +clap_derive@4.6.0 X X +clap_lex@1.1.0 X X +colorchoice@1.0.5 X X 
+const-oid@0.9.6 X X +const-random@0.1.18 X X +const-random-macro@0.1.16 X X +core-foundation-sys@0.8.7 X X +cpufeatures@0.2.17 X X +crc32c@0.6.8 X X +crossbeam-utils@0.8.21 X X +crunchy@0.2.4 X +crypto-common@0.1.7 X X +csv@1.4.0 X X +csv-core@0.1.13 X X +dashmap@6.1.0 X +delegate@0.13.5 X X +digest@0.10.7 X X +displaydoc@0.2.5 X X +either@1.15.0 X X +equivalent@1.0.2 X X +errno@0.3.14 X X +fastrand@2.3.0 X X +find-msvc-tools@0.1.9 X X +fixedbitset@0.5.7 X X +flatbuffers@25.12.19 X +fluss-rs@0.1.0 X +fluss_python@0.1.0 X +fnv@1.0.7 X X +foldhash@0.1.5 X +form_urlencoded@1.2.2 X X +funty@2.0.0 X +futures@0.3.32 X X +futures-channel@0.3.32 X X +futures-core@0.3.32 X X +futures-executor@0.3.32 X X +futures-io@0.3.32 X X +futures-macro@0.3.32 X X +futures-sink@0.3.32 X X +futures-task@0.3.32 X X +futures-util@0.3.32 X X +generic-array@0.14.7 X +getrandom@0.2.17 X X +getrandom@0.3.4 X X +getrandom@0.4.2 X X +gloo-timers@0.3.0 X X +h2@0.4.13 X +half@2.7.1 X X +hashbrown@0.14.5 X X +hashbrown@0.15.5 X X +hashbrown@0.16.1 X X +heck@0.5.0 X X +hex@0.4.3 X X +hmac@0.12.1 X X +home@0.5.12 X X +http@1.4.0 X X +http-body@1.0.1 X +http-body-util@0.1.3 X +httparse@1.10.1 X X +httpdate@1.0.3 X X +hyper@1.8.1 X +hyper-rustls@0.27.7 X X X +hyper-util@0.1.20 X +iana-time-zone@0.1.65 X X +iana-time-zone-haiku@0.1.2 X X +icu_collections@2.1.1 X +icu_locale_core@2.1.1 X +icu_normalizer@2.1.1 X +icu_normalizer_data@2.1.1 X +icu_properties@2.1.2 X +icu_properties_data@2.1.2 X +icu_provider@2.1.1 X +idna@1.1.0 X X +idna_adapter@1.2.1 X X +indexmap@2.13.0 X X +indoc@2.0.7 X X +ipnet@2.12.0 X X +iri-string@0.7.11 X X +is_terminal_polyfill@1.70.2 X X +itertools@0.14.0 X X +itoa@1.0.18 X X +jiff@0.2.23 X X +jiff-tzdb@0.1.6 X X +jiff-tzdb-platform@0.1.3 X X +jobserver@0.1.34 X X +js-sys@0.3.91 X X +lexical-core@1.0.6 X X +lexical-parse-float@1.0.6 X X +lexical-parse-integer@1.0.6 X X +lexical-util@1.0.7 X X +lexical-write-float@1.0.6 X X +lexical-write-integer@1.0.6 X X +libc@0.2.183 X X 
+libm@0.2.16 X +linked-hash-map@0.5.6 X X +linux-raw-sys@0.12.1 X X X +litemap@0.8.1 X +lock_api@0.4.14 X X +log@0.4.29 X X +lz4_flex@0.12.1 X +md-5@0.10.6 X X +memchr@2.8.0 X X +memoffset@0.9.1 X +mio@1.1.1 X +multimap@0.10.1 X X +num-bigint@0.4.6 X X +num-complex@0.4.6 X X +num-integer@0.1.46 X X +num-traits@0.2.19 X X +once_cell@1.21.4 X X +once_cell_polyfill@1.70.2 X X +opendal@0.55.0 X +ordered-float@5.1.0 X +parking_lot@0.12.5 X X +parking_lot_core@0.9.12 X X +parse-display@0.10.0 X X +parse-display-derive@0.10.0 X X +percent-encoding@2.3.2 X X +petgraph@0.8.3 X X +pin-project-lite@0.2.17 X X +pin-utils@0.1.0 X X +pkg-config@0.3.32 X X +portable-atomic@1.13.1 X X +portable-atomic-util@0.2.6 X X +potential_utf@0.1.4 X +ppv-lite86@0.2.21 X X +prettyplease@0.2.37 X X +proc-macro2@1.0.106 X X +prost@0.14.3 X +prost-build@0.14.3 X +prost-derive@0.14.3 X +prost-types@0.14.3 X +pyo3@0.26.0 X X +pyo3-async-runtimes@0.26.0 X +pyo3-build-config@0.26.0 X X +pyo3-ffi@0.26.0 X X +pyo3-macros@0.26.0 X X +pyo3-macros-backend@0.26.0 X X +python3-dll-a@0.2.14 X +quick-xml@0.37.5 X +quick-xml@0.38.4 X +quote@1.0.45 X X +r-efi@5.3.0 X X X +r-efi@6.0.0 X X X +radium@0.7.0 X +rand@0.8.5 X X +rand@0.9.2 X X +rand_chacha@0.3.1 X X +rand_chacha@0.9.0 X X +rand_core@0.6.4 X X +rand_core@0.9.5 X X +redox_syscall@0.5.18 X +regex@1.12.3 X X +regex-automata@0.4.14 X X +regex-syntax@0.8.10 X X +reqsign@0.16.5 X +reqwest@0.12.28 X X +ring@0.17.14 X X +rustc_version@0.4.1 X X +rustix@1.1.4 X X X +rustls@0.23.37 X X X +rustls-pki-types@1.14.0 X X +rustls-webpki@0.103.10 X +rustversion@1.0.22 X X +ryu@1.0.23 X X +scopeguard@1.2.0 X X +semver@1.0.27 X X +serde@1.0.228 X X +serde_core@1.0.228 X X +serde_derive@1.0.228 X X +serde_json@1.0.149 X X +serde_urlencoded@0.7.1 X X +sha1@0.10.6 X X +sha2@0.10.9 X X +shlex@1.3.0 X X +signal-hook-registry@1.4.8 X X +simdutf8@0.1.5 X X +slab@0.4.12 X +smallvec@1.15.1 X X +snafu@0.8.9 X X +snafu-derive@0.8.9 X X +socket2@0.6.3 X X +stable_deref_trait@1.2.1 
X X +strsim@0.11.1 X +structmeta@0.3.0 X X +structmeta-derive@0.3.0 X X +strum@0.26.3 X +strum_macros@0.26.4 X +subtle@2.6.1 X +syn@2.0.117 X X +sync_wrapper@1.0.2 X +synstructure@0.13.2 X +tap@1.0.1 X +target-lexicon@0.13.5 X +tempfile@3.27.0 X X +thiserror@1.0.69 X X +thiserror-impl@1.0.69 X X +tiny-keccak@2.0.2 X +tinystr@0.8.2 X +tokio@1.50.0 X +tokio-macros@2.6.1 X +tokio-rustls@0.26.4 X X +tokio-util@0.7.18 X +tower@0.5.3 X +tower-http@0.6.8 X +tower-layer@0.3.3 X +tower-service@0.3.3 X +tracing@0.1.44 X +tracing-attributes@0.1.31 X +tracing-core@0.1.36 X +try-lock@0.2.5 X +twox-hash@2.1.2 X +typenum@1.19.0 X X +unicode-ident@1.0.24 X X X +unindent@0.2.4 X X +untrusted@0.9.0 X +url@2.5.8 X X +utf8_iter@1.0.4 X X +utf8parse@0.2.2 X X +uuid@1.22.0 X X +value-bag@1.12.0 X X +version_check@0.9.5 X X +want@0.3.1 X +wasi@0.11.1+wasi-snapshot-preview1 X X X +wasip2@1.0.2+wasi-0.2.9 X X X +wasip3@0.4.0+wasi-0.3.0-rc-2026-01-06 X X X +wasm-bindgen@0.2.114 X X +wasm-bindgen-futures@0.4.64 X X +wasm-bindgen-macro@0.2.114 X X +wasm-bindgen-macro-support@0.2.114 X X +wasm-bindgen-shared@0.2.114 X X +wasm-streams@0.4.2 X X +web-sys@0.3.91 X X +webpki-roots@1.0.6 X +windows-core@0.62.2 X X +windows-implement@0.60.2 X X +windows-interface@0.59.3 X X +windows-link@0.2.1 X X +windows-result@0.4.1 X X +windows-strings@0.5.1 X X +windows-sys@0.52.0 X X +windows-sys@0.61.2 X X +windows-targets@0.52.6 X X +windows_aarch64_gnullvm@0.52.6 X X +windows_aarch64_msvc@0.52.6 X X +windows_i686_gnu@0.52.6 X X +windows_i686_gnullvm@0.52.6 X X +windows_i686_msvc@0.52.6 X X +windows_x86_64_gnu@0.52.6 X X +windows_x86_64_gnullvm@0.52.6 X X +windows_x86_64_msvc@0.52.6 X X +wit-bindgen@0.51.0 X X X +writeable@0.6.2 X +wyz@0.5.1 X +yoke@0.8.1 X +yoke-derive@0.8.1 X +zerocopy@0.8.47 X X X +zerocopy-derive@0.8.47 X X X +zerofrom@0.1.6 X +zerofrom-derive@0.1.6 X +zeroize@1.8.2 X X +zerotrie@0.2.3 X +zerovec@0.11.5 X +zerovec-derive@0.11.2 X +zmij@1.0.21 X +zstd@0.13.3 X +zstd-safe@7.2.4 X X 
+zstd-sys@2.0.16+zstd.1.5.7 X X diff --git a/fluss-rust/bindings/python/DEVELOPMENT.md b/fluss-rust/bindings/python/DEVELOPMENT.md new file mode 100644 index 0000000000..cccd0d1ee6 --- /dev/null +++ b/fluss-rust/bindings/python/DEVELOPMENT.md @@ -0,0 +1,95 @@ +# Development + +## Requirements + +- Python 3.9+ +- Rust 1.70+ +- [uv](https://docs.astral.sh/uv/) package manager +- Linux or macOS + +> **Before you start:** +> Please make sure you can successfully build and run the [Fluss Rust client](../../crates/fluss/README.md) on your machine. +> The Python bindings require a working Fluss Rust backend and compatible environment. + +## Install Development Dependencies + +```bash +cd bindings/python +uv sync --all-extras +``` + +## Build Development Version + +```bash +source .venv/bin/activate +uv run maturin develop +``` + +## Build Release Version + +```bash +uv run maturin build --release +``` + +## Code Formatting and Linting + +```bash +uv run ruff format python/ +uv run ruff check python/ +``` + +## Type Checking + +```bash +uv run mypy python/ +``` + +## Run Examples + +```bash +uv run python example/example.py +``` + +## Build API Docs + +```bash +uv run pdoc fluss +``` + +## Release + +```bash +# Build wheel +uv run maturin build --release + +# Publish to PyPI +uv run maturin publish +``` + +## Project Structure + +``` +bindings/python/ +├── Cargo.toml # Rust dependency configuration +├── pyproject.toml # Python project configuration +├── README.md # User guide +├── DEVELOPMENT.md # This file +├── API_REFERENCE.md # API reference +├── src/ # Rust source code (PyO3 bindings) +│ ├── lib.rs +│ ├── config.rs +│ ├── connection.rs +│ ├── admin.rs +│ ├── table.rs +│ └── error.rs +├── fluss/ # Python package +│ ├── __init__.py +│ ├── __init__.pyi # Type stubs +│ └── py.typed +└── example/ + └── example.py +``` + +## License + +Apache 2.0 License diff --git a/fluss-rust/bindings/python/PYPI_README.md b/fluss-rust/bindings/python/PYPI_README.md new file mode 100644
index 0000000000..2e538f5ca7 --- /dev/null +++ b/fluss-rust/bindings/python/PYPI_README.md @@ -0,0 +1,28 @@ + + +# Fluss Python Client + +PyFluss is a Python library for programmatic access to Apache Fluss (Incubating). +It provides Python APIs to work with Fluss table metadata and read or write table data. + +The documentation is available at . + +## Get in Touch + +Join the Fluss community at . diff --git a/fluss-rust/bindings/python/README.md b/fluss-rust/bindings/python/README.md new file mode 100644 index 0000000000..54a167bc56 --- /dev/null +++ b/fluss-rust/bindings/python/README.md @@ -0,0 +1,21 @@ + + +# Fluss Python Client + +For full documentation, see the [Python user guide](../../website/docs/user-guide/python/). diff --git a/fluss-rust/bindings/python/example/example.py b/fluss-rust/bindings/python/example/example.py new file mode 100644 index 0000000000..23ccc6d1c1 --- /dev/null +++ b/fluss-rust/bindings/python/example/example.py @@ -0,0 +1,971 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
import asyncio
import traceback
from datetime import date, datetime
from datetime import time as dt_time
from decimal import Decimal

import pandas as pd
import pyarrow as pa

import fluss


async def _bucket_count(admin, table_path):
    """Return ``num_buckets`` from the table info ``admin`` reports for ``table_path``."""
    return (await admin.get_table_info(table_path)).num_buckets


async def _run_demos(conn, config):
    """Run every demo section against an already-open connection.

    Sections, in order: log table creation and metadata, append writes
    (PyArrow Table, RecordBatch, single rows, Pandas), batch and record
    scanning, unsubscribe, primary-key table upsert/lookup/delete/partial
    update, column projection, async context managers, table drops,
    a partitioned log table and a partitioned KV table.

    Args:
        conn: Connection returned by ``fluss.FlussConnection.create``.
        config: The ``fluss.Config`` used to open ``conn``; reused to open a
            second connection in the context-manager demo.

    Most sections catch and print their own exceptions so later sections
    still run. Unguarded calls (``get_table``, the context-manager demo)
    propagate to the caller, which is responsible for closing ``conn``.
    """
    # Define fields for PyArrow
    fields = [
        pa.field("id", pa.int32()),
        pa.field("name", pa.string()),
        pa.field("score", pa.float32()),
        pa.field("age", pa.int32()),
        pa.field("birth_date", pa.date32()),
        pa.field("check_in_time", pa.time32("ms")),
        pa.field("created_at", pa.timestamp("us")),  # TIMESTAMP (NTZ)
        pa.field("updated_at", pa.timestamp("us", tz="UTC")),  # TIMESTAMP_LTZ
        pa.field("salary", pa.decimal128(10, 2)),
    ]

    # Create a PyArrow schema
    schema = pa.schema(fields)

    # Create a Fluss Schema first (this is what TableDescriptor expects)
    fluss_schema = fluss.Schema(schema)

    # Create a Fluss TableDescriptor
    table_descriptor = fluss.TableDescriptor(fluss_schema)

    # Get the admin for Fluss
    admin = conn.get_admin()

    # Create a Fluss table
    table_path = fluss.TablePath("fluss", "sample_table_types")

    try:
        await admin.create_table(table_path, table_descriptor, True)
        print(f"Created table: {table_path}")
    except Exception as e:
        print(f"Table creation failed: {e}")

    # Get table information via admin
    try:
        table_info = await admin.get_table_info(table_path)
        print(f"Table info: {table_info}")
        print(f"Table ID: {table_info.table_id}")
        print(f"Schema ID: {table_info.schema_id}")
        print(f"Created time: {table_info.created_time}")
        print(f"Primary keys: {table_info.get_primary_keys()}")
    except Exception as e:
        print(f"Failed to get table info: {e}")

    # Demo: List offsets
    print("\n--- Testing list_offsets() ---")
    try:
        # Query latest offsets using OffsetSpec factory method
        offsets = await admin.list_offsets(
            table_path,
            bucket_ids=[0],
            offset_spec=fluss.OffsetSpec.latest()
        )
        print(f"Latest offsets for table (before writes): {offsets}")
    except Exception as e:
        print(f"Failed to list offsets: {e}")

    # Get the table instance
    table = await conn.get_table(table_path)
    print(f"Got table: {table}")

    # Create a writer for the table
    append_writer = table.new_append().create_writer()
    print(f"Created append writer: {append_writer}")

    try:
        # Demo: Write PyArrow Table
        print("\n--- Testing PyArrow Table write ---")
        pa_table = pa.Table.from_arrays(
            [
                pa.array([1, 2, 3], type=pa.int32()),
                pa.array(["Alice", "Bob", "Charlie"], type=pa.string()),
                pa.array([95.2, 87.2, 92.1], type=pa.float32()),
                pa.array([25, 30, 35], type=pa.int32()),
                pa.array(
                    [date(1999, 5, 15), date(1994, 3, 20), date(1989, 11, 8)],
                    type=pa.date32(),
                ),
                pa.array(
                    [dt_time(9, 0, 0), dt_time(9, 30, 0), dt_time(10, 0, 0)],
                    type=pa.time32("ms"),
                ),
                pa.array(
                    [
                        datetime(2024, 1, 15, 10, 30),
                        datetime(2024, 1, 15, 11, 0),
                        datetime(2024, 1, 15, 11, 30),
                    ],
                    type=pa.timestamp("us"),
                ),
                pa.array(
                    [
                        datetime(2024, 1, 15, 10, 30),
                        datetime(2024, 1, 15, 11, 0),
                        datetime(2024, 1, 15, 11, 30),
                    ],
                    type=pa.timestamp("us", tz="UTC"),
                ),
                pa.array(
                    [Decimal("75000.00"), Decimal("82000.50"), Decimal("95000.75")],
                    type=pa.decimal128(10, 2),
                ),
            ],
            schema=schema,
        )

        append_writer.write_arrow(pa_table)
        print("Successfully wrote PyArrow Table")

        # Demo: Write PyArrow RecordBatch
        print("\n--- Testing PyArrow RecordBatch write ---")
        pa_record_batch = pa.RecordBatch.from_arrays(
            [
                pa.array([4, 5], type=pa.int32()),
                pa.array(["David", "Eve"], type=pa.string()),
                pa.array([88.5, 91.0], type=pa.float32()),
                pa.array([28, 32], type=pa.int32()),
                pa.array([date(1996, 7, 22), date(1992, 12, 1)], type=pa.date32()),
                pa.array([dt_time(14, 15, 0), dt_time(8, 45, 0)], type=pa.time32("ms")),
                pa.array(
                    [datetime(2024, 1, 16, 9, 0), datetime(2024, 1, 16, 9, 30)],
                    type=pa.timestamp("us"),
                ),
                pa.array(
                    [datetime(2024, 1, 16, 9, 0), datetime(2024, 1, 16, 9, 30)],
                    type=pa.timestamp("us", tz="UTC"),
                ),
                pa.array(
                    [Decimal("68000.00"), Decimal("72500.25")],
                    type=pa.decimal128(10, 2),
                ),
            ],
            schema=schema,
        )

        append_writer.write_arrow_batch(pa_record_batch)
        print("Successfully wrote PyArrow RecordBatch")

        # Demo: Append single rows with Date, Time, Timestamp, Decimal
        print("\n--- Testing single row append with temporal/decimal types ---")
        # Dict input with all types including Date, Time, Timestamp, Decimal
        append_writer.append(
            {
                "id": 8,
                "name": "Helen",
                "score": 93.5,
                "age": 26,
                "birth_date": date(1998, 4, 10),
                "check_in_time": dt_time(11, 30, 45),
                "created_at": datetime(2024, 1, 17, 14, 0, 0),
                "updated_at": datetime(2024, 1, 17, 14, 0, 0),
                "salary": Decimal("88000.00"),
            }
        )
        print("Successfully appended row (dict with Date, Time, Timestamp, Decimal)")

        # List input with all types
        append_writer.append(
            [
                9,
                "Ivan",
                90.0,
                31,
                date(1993, 8, 25),
                dt_time(16, 45, 0),
                datetime(2024, 1, 17, 15, 30, 0),
                datetime(2024, 1, 17, 15, 30, 0),
                Decimal("91500.50"),
            ]
        )
        print("Successfully appended row (list with Date, Time, Timestamp, Decimal)")

        # Demo: Write Pandas DataFrame
        print("\n--- Testing Pandas DataFrame write ---")
        df = pd.DataFrame(
            {
                "id": [10, 11],
                "name": ["Frank", "Grace"],
                "score": [89.3, 94.7],
                "age": [29, 27],
                "birth_date": [date(1995, 2, 14), date(1997, 9, 30)],
                "check_in_time": [dt_time(10, 0, 0), dt_time(10, 30, 0)],
                "created_at": [
                    datetime(2024, 1, 18, 8, 0),
                    datetime(2024, 1, 18, 8, 30),
                ],
                "updated_at": [
                    datetime(2024, 1, 18, 8, 0),
                    datetime(2024, 1, 18, 8, 30),
                ],
                "salary": [Decimal("79000.00"), Decimal("85500.75")],
            }
        )

        append_writer.write_pandas(df)
        print("Successfully wrote Pandas DataFrame")

        # Derive the count from what was actually written so the message below
        # cannot drift from the data: Table rows + RecordBatch rows + the two
        # single-row appends + DataFrame rows.
        rows_written = pa_table.num_rows + pa_record_batch.num_rows + 2 + len(df)

        # Flush all pending data
        print("\n--- Flushing data ---")
        await append_writer.flush()
        print("Successfully flushed data")

        # Demo: Check offsets after writes
        print("\n--- Checking offsets after writes ---")
        try:
            offsets = await admin.list_offsets(
                table_path,
                bucket_ids=[0],
                offset_spec=fluss.OffsetSpec.latest()
            )
            print(f"Latest offsets after writing {rows_written} records: {offsets}")
        except Exception as e:
            print(f"Failed to list offsets: {e}")

    except Exception as e:
        print(f"Error during writing: {e}")

    # Now scan the table to verify data was written
    print("\n--- Scanning table (batch scanner) ---")
    try:
        # Use new_scan().create_record_batch_log_scanner() for batch-based operations
        batch_scanner = await table.new_scan().create_record_batch_log_scanner()
        print(f"Created batch scanner: {batch_scanner}")

        # Subscribe to buckets (required before to_arrow/to_pandas)
        # Use subscribe_buckets to subscribe all buckets from EARLIEST_OFFSET
        num_buckets = await _bucket_count(admin, table_path)
        batch_scanner.subscribe_buckets({i: fluss.EARLIEST_OFFSET for i in range(num_buckets)})
        print(f"Subscribed to {num_buckets} buckets from EARLIEST_OFFSET")

        # Read all data using to_arrow()
        print("Scanning results using to_arrow():")

        # Try to get as PyArrow Table
        try:
            pa_table_result = await batch_scanner.to_arrow()
            print(f"\nAs PyArrow Table: {pa_table_result}")
        except Exception as e:
            print(f"Could not convert to PyArrow: {e}")

        # Create a new batch scanner for to_pandas() test
        batch_scanner2 = await table.new_scan().create_record_batch_log_scanner()
        batch_scanner2.subscribe_buckets({i: fluss.EARLIEST_OFFSET for i in range(num_buckets)})

        # Try to get as Pandas DataFrame
        try:
            df_result = await batch_scanner2.to_pandas()
            print(f"\nAs Pandas DataFrame:\n{df_result}")
        except Exception as e:
            print(f"Could not convert to Pandas: {e}")

        # to_arrow_batch_reader() — returns a lazy PyArrow RecordBatchReader
        batch_scanner_reader = await table.new_scan().create_record_batch_log_scanner()
        batch_scanner_reader.subscribe_buckets(
            {i: fluss.EARLIEST_OFFSET for i in range(num_buckets)}
        )
        arrow_reader = batch_scanner_reader.to_arrow_batch_reader()
        reader_table = pa.Table.from_batches(list(arrow_reader), schema=arrow_reader.schema)
        print(f"\nVia to_arrow_batch_reader(): {reader_table.num_rows} rows")

        # TODO: support to_duckdb()

        # Test poll_arrow() method for incremental reading as Arrow Table
        print("\n--- Testing poll_arrow() method ---")
        batch_scanner3 = await table.new_scan().create_record_batch_log_scanner()
        batch_scanner3.subscribe(bucket_id=0, start_offset=fluss.EARLIEST_OFFSET)
        print(f"Subscribed to bucket 0 at EARLIEST_OFFSET ({fluss.EARLIEST_OFFSET})")

        # Poll with a timeout of 5000ms (5 seconds)
        # Note: poll_arrow() returns an empty table (not an error) on timeout
        try:
            poll_result = await batch_scanner3.poll_arrow(5000)
            print(f"Number of rows: {poll_result.num_rows}")

            if poll_result.num_rows > 0:
                poll_df = poll_result.to_pandas()
                print(f"Polled data:\n{poll_df}")
            else:
                print("Empty result (no records available)")
                # Empty table still has schema - this is useful!
                print(f"Schema: {poll_result.schema}")

        except Exception as e:
            print(f"Error during poll_arrow: {e}")

        # Test poll_record_batch() method for batches with metadata
        print("\n--- Testing poll_record_batch() method ---")
        batch_scanner4 = await table.new_scan().create_record_batch_log_scanner()
        batch_scanner4.subscribe_buckets({i: fluss.EARLIEST_OFFSET for i in range(num_buckets)})

        try:
            batches = await batch_scanner4.poll_record_batch(5000)
            print(f"Number of batches: {len(batches)}")

            for i, batch in enumerate(batches):
                print(f" Batch {i}: bucket={batch.bucket}, "
                      f"offsets={batch.base_offset}-{batch.last_offset}, "
                      f"rows={batch.batch.num_rows}")

        except Exception as e:
            print(f"Error during poll_record_batch: {e}")

    except Exception as e:
        print(f"Error during batch scanning: {e}")

    # Test record-based scanning with poll()
    print("\n--- Scanning table (record scanner) ---")
    try:
        # Use new_scan().create_log_scanner() for record-based operations
        record_scanner = await table.new_scan().create_log_scanner()
        print(f"Created record scanner: {record_scanner}")

        # Fetch the bucket count here rather than reusing the batch-scan
        # section's local, which is unbound if that section failed early.
        num_buckets = await _bucket_count(admin, table_path)
        record_scanner.subscribe_buckets({i: fluss.EARLIEST_OFFSET for i in range(num_buckets)})

        # Poll returns ScanRecords — records grouped by bucket
        print("\n--- Testing poll() method (record-by-record) ---")
        try:
            scan_records = await record_scanner.poll(5000)
            print(f"Total records: {scan_records.count()}, buckets: {len(scan_records.buckets())}")

            # Flat iteration over all records (regardless of bucket)
            print(f" Flat iteration: {scan_records.count()} records")
            for record in scan_records:
                print(f" offset={record.offset}, timestamp={record.timestamp}")

            # Per-bucket access
            for bucket in scan_records.buckets():
                bucket_recs = scan_records.records(bucket)
                print(f" Bucket {bucket}: {len(bucket_recs)} records")
                for record in bucket_recs[:3]:
                    print(f" offset={record.offset}, "
                          f"timestamp={record.timestamp}, "
                          f"change_type={record.change_type}, "
                          f"row={record.row}")

        except Exception as e:
            print(f"Error during poll: {e}")

    except Exception as e:
        print(f"Error during record scanning: {e}")

    # Demo: unsubscribe — unsubscribe from a bucket (non-partitioned tables)
    print("\n--- Testing unsubscribe ---")
    try:
        unsub_scanner = await table.new_scan().create_record_batch_log_scanner()
        # Same reasoning as the record-scanner section: do not depend on a
        # variable bound inside an earlier try block.
        num_buckets = await _bucket_count(admin, table_path)
        unsub_scanner.subscribe_buckets({i: fluss.EARLIEST_OFFSET for i in range(num_buckets)})
        print(f"Subscribed to {num_buckets} buckets")
        # Unsubscribe from bucket 0 — future polls will skip this bucket
        unsub_scanner.unsubscribe(bucket_id=0)
        print("Unsubscribed from bucket 0")
        remaining = await unsub_scanner.poll_arrow(5000)
        print(f"After unsubscribe, got {remaining.num_rows} records (from remaining buckets)")
    except Exception as e:
        print(f"Error during unsubscribe test: {e}")

    # =====================================================
    # Demo: Primary Key Table with Lookup and Upsert
    # =====================================================
    print("\n" + "=" * 60)
    print("--- Testing Primary Key Table (Lookup & Upsert) ---")
    print("=" * 60)

    # Create a primary key table for lookup/upsert tests
    # Include temporal and decimal types to test full conversion
    pk_table_fields = [
        pa.field("user_id", pa.int32()),
        pa.field("name", pa.string()),
        pa.field("email", pa.string()),
        pa.field("age", pa.int32()),
        pa.field("birth_date", pa.date32()),
        pa.field("login_time", pa.time32("ms")),
        pa.field("created_at", pa.timestamp("us")),  # TIMESTAMP (NTZ)
        pa.field("updated_at", pa.timestamp("us", tz="UTC")),  # TIMESTAMP_LTZ
        pa.field("balance", pa.decimal128(10, 2)),
    ]
    pk_schema = pa.schema(pk_table_fields)
    fluss_pk_schema = fluss.Schema(pk_schema, primary_keys=["user_id"])

    # Create table descriptor
    pk_table_descriptor = fluss.TableDescriptor(
        fluss_pk_schema,
        bucket_count=3,
    )

    pk_table_path = fluss.TablePath("fluss", "users_pk_table_v3")

    try:
        await admin.create_table(pk_table_path, pk_table_descriptor, True)
        print(f"Created PK table: {pk_table_path}")
    except Exception as e:
        print(f"PK Table creation failed (may already exist): {e}")

    # Get the PK table
    pk_table = await conn.get_table(pk_table_path)
    print(f"Got PK table: {pk_table}")
    print(f"Has primary key: {pk_table.has_primary_key()}")

    # --- Test Upsert ---
    print("\n--- Testing Upsert (fire-and-forget) ---")
    try:
        upsert_writer = pk_table.new_upsert().create_writer()
        print(f"Created upsert writer: {upsert_writer}")

        # Fire-and-forget: queue writes synchronously, flush at end.
        # Records are batched internally for efficiency.
        upsert_writer.upsert(
            {
                "user_id": 1,
                "name": "Alice",
                "email": "alice@example.com",
                "age": 25,
                "birth_date": date(1999, 5, 15),
                "login_time": dt_time(9, 30, 45, 123000),  # 09:30:45.123
                "created_at": datetime(
                    2024, 1, 15, 10, 30, 45, 123456
                ),  # with microseconds
                "updated_at": datetime(2024, 1, 15, 10, 30, 45, 123456),
                "balance": Decimal("1234.56"),
            }
        )
        print("Queued user_id=1 (Alice)")

        upsert_writer.upsert(
            {
                "user_id": 2,
                "name": "Bob",
                "email": "bob@example.com",
                "age": 30,
                "birth_date": date(1994, 3, 20),
                "login_time": dt_time(14, 15, 30, 500000),  # 14:15:30.500
                "created_at": datetime(2024, 1, 16, 11, 22, 33, 444555),
                "updated_at": datetime(2024, 1, 16, 11, 22, 33, 444555),
                "balance": Decimal("5678.91"),
            }
        )
        print("Queued user_id=2 (Bob)")

        upsert_writer.upsert(
            {
                "user_id": 3,
                "name": "Charlie",
                "email": "charlie@example.com",
                "age": 35,
                "birth_date": date(1989, 11, 8),
                "login_time": dt_time(16, 45, 59, 999000),  # 16:45:59.999
                "created_at": datetime(2024, 1, 17, 23, 59, 59, 999999),
                "updated_at": datetime(2024, 1, 17, 23, 59, 59, 999999),
                "balance": Decimal("9876.54"),
            }
        )
        print("Queued user_id=3 (Charlie)")

        # flush() waits for all queued writes to be acknowledged by the server
        await upsert_writer.flush()
        print("Flushed — all 3 rows acknowledged by server")

        # Per-record acknowledgment: await the returned handle to block until
        # the server confirms this specific write, useful when you need to
        # read-after-write or verify critical updates.
        print("\n--- Testing Upsert (per-record acknowledgment) ---")
        handle = upsert_writer.upsert(
            {
                "user_id": 1,
                "name": "Alice Updated",
                "email": "alice.new@example.com",
                "age": 26,
                "birth_date": date(1999, 5, 15),
                "login_time": dt_time(10, 11, 12, 345000),  # 10:11:12.345
                "created_at": datetime(2024, 1, 15, 10, 30, 45, 123456),  # unchanged
                "updated_at": datetime(
                    2024, 1, 20, 15, 45, 30, 678901
                ),  # new update time
                "balance": Decimal("2345.67"),
            }
        )
        await handle.wait()  # wait for server acknowledgment
        print("Updated user_id=1 (Alice -> Alice Updated) — server acknowledged")

    except Exception as e:
        print(f"Error during upsert: {e}")
        traceback.print_exc()

    # --- Test Lookup ---
    print("\n--- Testing Lookup ---")
    try:
        lookuper = pk_table.new_lookup().create_lookuper()
        print(f"Created lookuper: {lookuper}")

        result = await lookuper.lookup({"user_id": 1})
        if result:
            print("Lookup user_id=1: Found!")
            print(f" name: {result['name']}")
            print(f" email: {result['email']}")
            print(f" age: {result['age']}")
            print(
                f" birth_date: {result['birth_date']} (type: {type(result['birth_date']).__name__})"
            )
            print(
                f" login_time: {result['login_time']} (type: {type(result['login_time']).__name__})"
            )
            print(
                f" created_at: {result['created_at']} (type: {type(result['created_at']).__name__})"
            )
            print(
                f" updated_at: {result['updated_at']} (type: {type(result['updated_at']).__name__})"
            )
            print(
                f" balance: {result['balance']} (type: {type(result['balance']).__name__})"
            )
        else:
            print("Lookup user_id=1: Not found")

        # Lookup another row
        result = await lookuper.lookup({"user_id": 2})
        if result:
            print(f"Lookup user_id=2: Found! -> {result}")
        else:
            print("Lookup user_id=2: Not found")

        # Lookup non-existent row
        result = await lookuper.lookup({"user_id": 999})
        if result:
            print(f"Lookup user_id=999: Found! -> {result}")
        else:
            print("Lookup user_id=999: Not found (as expected)")

    except Exception as e:
        print(f"Error during lookup: {e}")
        traceback.print_exc()

    # --- Test Delete ---
    print("\n--- Testing Delete ---")
    try:
        upsert_writer = pk_table.new_upsert().create_writer()

        handle = upsert_writer.delete({"user_id": 3})
        await handle.wait()
        print("Deleted user_id=3 — server acknowledged")

        lookuper = pk_table.new_lookup().create_lookuper()
        result = await lookuper.lookup({"user_id": 3})
        if result:
            print(f"Lookup user_id=3 after delete: Still found! -> {result}")
        else:
            print("Lookup user_id=3 after delete: Not found (deletion confirmed)")

    except Exception as e:
        print(f"Error during delete: {e}")
        traceback.print_exc()

    # --- Test Partial Update by column names ---
    print("\n--- Testing Partial Update (by column names) ---")
    try:
        partial_writer = pk_table.new_upsert().partial_update_by_name(["user_id", "balance"]).create_writer()
        handle = partial_writer.upsert({"user_id": 1, "balance": Decimal("9999.99")})
        await handle.wait()
        print("Partial update: set balance=9999.99 for user_id=1")

        lookuper = pk_table.new_lookup().create_lookuper()
        result = await lookuper.lookup({"user_id": 1})
        if result:
            print(f"Partial update verified:"
                  f"\n name={result['name']} (unchanged)"
                  f"\n balance={result['balance']} (updated)")
        else:
            print("ERROR: Expected to find user_id=1")

    except Exception as e:
        print(f"Error during partial update by names: {e}")
        traceback.print_exc()

    # --- Test Partial Update by column indices ---
    print("\n--- Testing Partial Update (by column indices) ---")
    try:
        # Columns: 0=user_id (PK), 1=name — update name only
        partial_writer_idx = pk_table.new_upsert().partial_update_by_index([0, 1]).create_writer()
        handle = partial_writer_idx.upsert([1, "Alice Renamed"])
        await handle.wait()
        print("Partial update by indices: set name='Alice Renamed' for user_id=1")

        lookuper = pk_table.new_lookup().create_lookuper()
        result = await lookuper.lookup({"user_id": 1})
        if result:
            print(f"Partial update by indices verified:"
                  f"\n name={result['name']} (updated)"
                  f"\n balance={result['balance']} (unchanged)")
        else:
            print("ERROR: Expected to find user_id=1")

    except Exception as e:
        print(f"Error during partial update by indices: {e}")
        traceback.print_exc()

    # Demo: Column projection using builder pattern
    print("\n--- Testing Column Projection ---")
    try:
        # Get bucket count for subscriptions
        num_buckets = await _bucket_count(admin, table_path)

        # Project specific columns by index (using batch scanner for to_pandas)
        print("\n1. Projection by index [0, 1] (id, name):")
        scanner_index = await table.new_scan().project([0, 1]).create_record_batch_log_scanner()
        scanner_index.subscribe_buckets({i: fluss.EARLIEST_OFFSET for i in range(num_buckets)})
        df_projected = await scanner_index.to_pandas()
        print(df_projected.head())
        print(
            f" Projected {df_projected.shape[1]} columns: {list(df_projected.columns)}"
        )

        # Project specific columns by name (Pythonic!)
        print("\n2. Projection by name ['name', 'score'] (Pythonic):")
        scanner_names = await table.new_scan() \
            .project_by_name(["name", "score"]) \
            .create_record_batch_log_scanner()
        scanner_names.subscribe_buckets({i: fluss.EARLIEST_OFFSET for i in range(num_buckets)})
        df_named = await scanner_names.to_pandas()
        print(df_named.head())
        print(f" Projected {df_named.shape[1]} columns: {list(df_named.columns)}")

        # Test empty result schema with projection
        print("\n3. Testing empty result schema with projection:")
        scanner_proj = await table.new_scan().project([0, 2]).create_record_batch_log_scanner()
        scanner_proj.subscribe_buckets({i: fluss.EARLIEST_OFFSET for i in range(num_buckets)})
        # Quick poll that may return empty
        result = await scanner_proj.poll_arrow(100)
        print(f" Schema columns: {result.schema.names}")

    except Exception as e:
        print(f"Error during projection: {e}")

    print("\n--- New: async context manager demo ---")
    async with await fluss.FlussConnection.create(config) as demo_conn:
        demo_table = await demo_conn.get_table(table_path)
        async with demo_table.new_append().create_writer() as writer:
            writer.append(
                {
                    "id": 1,
                    "name": "demo",
                    "score": 1.0,
                    "age": 25,
                    "birth_date": date(2000, 1, 1),
                    "check_in_time": dt_time(12, 0, 0),
                    "created_at": datetime(2024, 1, 1, 12, 0, 0),
                    "updated_at": datetime(2024, 1, 1, 12, 0, 0),
                    "salary": Decimal("100.00"),
                }
            )
            # auto-flushes on exit

    # Demo: Drop tables
    print("\n--- Testing drop_table() ---")
    try:
        # Drop the log table
        await admin.drop_table(table_path, ignore_if_not_exists=True)
        print(f"Successfully dropped table: {table_path}")
        # Drop the PK table
        await admin.drop_table(pk_table_path, ignore_if_not_exists=True)
        print(f"Successfully dropped table: {pk_table_path}")
    except Exception as e:
        print(f"Failed to drop table: {e}")

    # =====================================================
    # Demo: Partitioned Table with list_partition_offsets
    # =====================================================
    print("\n" + "=" * 60)
    print("--- Testing Partitioned Table ---")
    print("=" * 60)

    # Create a partitioned log table
    partitioned_fields = [
        pa.field("id", pa.int32()),
        pa.field("region", pa.string()),  # partition key
        pa.field("value", pa.int64()),
    ]
    partitioned_schema = pa.schema(partitioned_fields)
    fluss_partitioned_schema = fluss.Schema(partitioned_schema)

    partitioned_table_descriptor = fluss.TableDescriptor(
        fluss_partitioned_schema,
        partition_keys=["region"],  # Partition by region
        bucket_count=1,
    )

    partitioned_table_path = fluss.TablePath("fluss", "partitioned_log_table_py")

    try:
        # Drop if exists first
        await admin.drop_table(partitioned_table_path, ignore_if_not_exists=True)
        print(f"Dropped existing table: {partitioned_table_path}")

        # Create the partitioned table
        await admin.create_table(partitioned_table_path, partitioned_table_descriptor, False)
        print(f"Created partitioned table: {partitioned_table_path}")

        # Create partitions for US and EU regions
        print("\n--- Creating partitions ---")
        await admin.create_partition(partitioned_table_path, {"region": "US"}, ignore_if_exists=True)
        print("Created partition: region=US")
        await admin.create_partition(partitioned_table_path, {"region": "EU"}, ignore_if_exists=True)
        print("Created partition: region=EU")

        # List partitions
        print("\n--- Listing partitions ---")
        partition_infos = await admin.list_partition_infos(partitioned_table_path)
        for p in partition_infos:
            print(f" {p}")  # PartitionInfo(partition_id=..., partition_name='region=...')

        # Get the table and write some data
        partitioned_table = await conn.get_table(partitioned_table_path)
        partitioned_writer = partitioned_table.new_append().create_writer()

        # Append data to US partition
        partitioned_writer.append({"id": 1, "region": "US", "value": 100})
        partitioned_writer.append({"id": 2, "region": "US", "value": 200})
        # Append data to EU partition
        partitioned_writer.append({"id": 3, "region": "EU", "value": 300})
        partitioned_writer.append({"id": 4, "region": "EU", "value": 400})
        await partitioned_writer.flush()
        print("\nWrote 4 records (2 to US, 2 to EU)")

        # Demo: list_partition_infos with partial spec filter
        print("\n--- Testing list_partition_infos with spec ---")
        us_partitions = await admin.list_partition_infos(
            partitioned_table_path, partition_spec={"region": "US"}
        )
        print(f"Filtered partitions (region=US): {us_partitions}")

        # Demo: list_partition_offsets
        print("\n--- Testing list_partition_offsets ---")

        # Query offsets for US partition
        # Note: partition_name is just the value (e.g., "US"), not "region=US"
        us_offsets = await admin.list_partition_offsets(
            partitioned_table_path,
            partition_name="US",
            bucket_ids=[0],
            offset_spec=fluss.OffsetSpec.latest()
        )
        print(f"US partition latest offsets: {us_offsets}")

        # Query offsets for EU partition
        eu_offsets = await admin.list_partition_offsets(
            partitioned_table_path,
            partition_name="EU",
            bucket_ids=[0],
            offset_spec=fluss.OffsetSpec.latest()
        )
        print(f"EU partition latest offsets: {eu_offsets}")

        # Demo: subscribe_partition for reading partitioned data
        print("\n--- Testing subscribe_partition + to_arrow() ---")
        partitioned_scanner = await partitioned_table.new_scan().create_record_batch_log_scanner()

        # Subscribe to each partition using partition_id
        for p in partition_infos:
            partitioned_scanner.subscribe_partition(
                partition_id=p.partition_id,
                bucket_id=0,
                start_offset=fluss.EARLIEST_OFFSET
            )
            print(f"Subscribed to partition {p.partition_name} (id={p.partition_id})")

        # Use to_arrow() - now works for partitioned tables!
        partitioned_arrow = await partitioned_scanner.to_arrow()
        print(f"\nto_arrow() returned {partitioned_arrow.num_rows} records from partitioned table:")
        print(partitioned_arrow.to_pandas())

        # Demo: subscribe_partition_buckets for batch subscribing to multiple partitions at once
        print("\n--- Testing subscribe_partition_buckets + to_arrow() ---")
        partitioned_scanner_batch = await partitioned_table.new_scan().create_record_batch_log_scanner()
        partition_bucket_offsets = {
            (p.partition_id, 0): fluss.EARLIEST_OFFSET for p in partition_infos
        }
        partitioned_scanner_batch.subscribe_partition_buckets(partition_bucket_offsets)
        print(f"Batch subscribed to {len(partition_bucket_offsets)} partition+bucket combinations")
        partitioned_batch_arrow = await partitioned_scanner_batch.to_arrow()
        print(f"to_arrow() returned {partitioned_batch_arrow.num_rows} records:")
        print(partitioned_batch_arrow.to_pandas())

        # Demo: unsubscribe_partition - unsubscribe from one partition, read remaining
        print("\n--- Testing unsubscribe_partition ---")
        partitioned_scanner3 = await partitioned_table.new_scan().create_record_batch_log_scanner()
        for p in partition_infos:
            partitioned_scanner3.subscribe_partition(p.partition_id, 0, fluss.EARLIEST_OFFSET)
        # Unsubscribe from the first partition
        first_partition = partition_infos[0]
        partitioned_scanner3.unsubscribe_partition(first_partition.partition_id, 0)
        print(f"Unsubscribed from partition {first_partition.partition_name} (id={first_partition.partition_id})")
        remaining_arrow = await partitioned_scanner3.to_arrow()
        print(f"After unsubscribe, to_arrow() returned {remaining_arrow.num_rows} records (from remaining partitions):")
        print(remaining_arrow.to_pandas())

        # Demo: to_pandas() also works for partitioned tables
        print("\n--- Testing to_pandas() on partitioned table ---")
        partitioned_scanner2 = await partitioned_table.new_scan().create_record_batch_log_scanner()
        for p in partition_infos:
            partitioned_scanner2.subscribe_partition(p.partition_id, 0, fluss.EARLIEST_OFFSET)
        partitioned_df = await partitioned_scanner2.to_pandas()
        print(f"to_pandas() returned {len(partitioned_df)} records:")
        print(partitioned_df)

        # Cleanup
        await admin.drop_table(partitioned_table_path, ignore_if_not_exists=True)
        print(f"\nDropped partitioned table: {partitioned_table_path}")

    except Exception as e:
        print(f"Error with partitioned table: {e}")
        traceback.print_exc()

    # =====================================================
    # Demo: Partitioned KV Table (Upsert, Lookup, Delete)
    # =====================================================
    print("\n" + "=" * 60)
    print("--- Testing Partitioned KV Table ---")
    print("=" * 60)

    partitioned_kv_fields = [
        pa.field("region", pa.string()),  # partition key + part of PK
        pa.field("user_id", pa.int32()),  # part of PK
        pa.field("name", pa.string()),
        pa.field("score", pa.int64()),
    ]
    partitioned_kv_schema = pa.schema(partitioned_kv_fields)
    fluss_partitioned_kv_schema = fluss.Schema(
        partitioned_kv_schema, primary_keys=["region", "user_id"]
    )

    partitioned_kv_descriptor = fluss.TableDescriptor(
        fluss_partitioned_kv_schema,
        partition_keys=["region"],
    )

    partitioned_kv_path = fluss.TablePath("fluss", "partitioned_kv_table_py")

    try:
        await admin.drop_table(partitioned_kv_path, ignore_if_not_exists=True)
        await admin.create_table(partitioned_kv_path, partitioned_kv_descriptor, False)
        print(f"Created partitioned KV table: {partitioned_kv_path}")

        # Create partitions
        await admin.create_partition(partitioned_kv_path, {"region": "US"})
        await admin.create_partition(partitioned_kv_path, {"region": "EU"})
        await admin.create_partition(partitioned_kv_path, {"region": "APAC"})
        print("Created partitions: US, EU, APAC")

        partitioned_kv_table = await conn.get_table(partitioned_kv_path)
        upsert_writer = partitioned_kv_table.new_upsert().create_writer()

        # Upsert rows across partitions
        test_data = [
            ("US", 1, "Gustave", 100),
            ("US", 2, "Lune", 200),
            ("EU", 1, "Sciel", 150),
            ("EU", 2, "Maelle", 250),
            ("APAC", 1, "Noco", 300),
        ]
        for region, user_id, name, score in test_data:
            upsert_writer.upsert({
                "region": region, "user_id": user_id,
                "name": name, "score": score,
            })
        await upsert_writer.flush()
        print(f"Upserted {len(test_data)} rows across 3 partitions")

        # Lookup all rows across partitions
        print("\n--- Lookup across partitions ---")
        lookuper = partitioned_kv_table.new_lookup().create_lookuper()
        for region, user_id, name, score in test_data:
            result = await lookuper.lookup({"region": region, "user_id": user_id})
            assert result is not None, f"Expected to find region={region} user_id={user_id}"
            assert result["name"] == name, f"Name mismatch: {result['name']} != {name}"
            assert result["score"] == score, f"Score mismatch: {result['score']} != {score}"
        print(f"All {len(test_data)} rows verified across partitions")

        # Update within a partition
        print("\n--- Update within partition ---")
        handle = upsert_writer.upsert({
            "region": "US", "user_id": 1,
            "name": "Gustave Updated", "score": 999,
        })
        await handle.wait()
        result = await lookuper.lookup({"region": "US", "user_id": 1})
        assert result is not None, "Expected to find region=US user_id=1 after update"
        assert result["name"] == "Gustave Updated"
        assert result["score"] == 999
        print(f"Update verified: US/1 name={result['name']} score={result['score']}")

        # Lookup in non-existent partition
        print("\n--- Lookup in non-existent partition ---")
        result = await lookuper.lookup({"region": "UNKNOWN", "user_id": 1})
        assert result is None, "Expected UNKNOWN partition lookup to return None"
        print("UNKNOWN partition lookup: not found (expected)")

        # Delete within a partition
        print("\n--- Delete within partition ---")
        handle = upsert_writer.delete({"region": "EU", "user_id": 1})
        await handle.wait()
        result = await lookuper.lookup({"region": "EU", "user_id": 1})
        assert result is None, "Expected EU/1 to be deleted"
        print("Delete verified: EU/1 not found")

        # Verify sibling record still exists
        result = await lookuper.lookup({"region": "EU", "user_id": 2})
        assert result is not None, "Expected EU/2 to still exist"
        assert result["name"] == "Maelle"
        print(f"EU/2 still exists: name={result['name']}")

        # Cleanup
        await admin.drop_table(partitioned_kv_path, ignore_if_not_exists=True)
        print(f"\nDropped partitioned KV table: {partitioned_kv_path}")

    except Exception as e:
        print(f"Error with partitioned KV table: {e}")
        traceback.print_exc()


async def main():
    """Connect to the Fluss cluster at 127.0.0.1:9123 and run all demos.

    The connection is closed in a ``finally`` block so it is released even
    when an unguarded demo step raises; the exception then propagates to
    ``asyncio.run``.
    """
    # Create connection configuration
    config_spec = {
        "bootstrap.servers": "127.0.0.1:9123",
        # Add other configuration options as needed
        "writer.request-max-size": "10485760",  # 10 MB
        "writer.acks": "all",  # Wait for all replicas to acknowledge
        "writer.retries": "3",  # Retry up to 3 times on failure
        "writer.batch-size": "1000",  # Batch size for writes
    }
    config = fluss.Config(config_spec)

    # Create connection using the static create method
    conn = await fluss.FlussConnection.create(config)
    try:
        await _run_demos(conn, config)
    finally:
        # Close connection
        await conn.close()
        print("\nConnection closed")


if __name__ == "__main__":
    # Run the async main function
    asyncio.run(main())
+ +from ._fluss import * + +__version__ = "0.1.0" diff --git a/fluss-rust/bindings/python/fluss/__init__.pyi b/fluss-rust/bindings/python/fluss/__init__.pyi new file mode 100644 index 0000000000..b5bfdfab28 --- /dev/null +++ b/fluss-rust/bindings/python/fluss/__init__.pyi @@ -0,0 +1,1156 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Type stubs for Fluss Python bindings.""" + +from enum import IntEnum +from types import TracebackType +from typing import ( + Any, + AsyncIterator, + Dict, + Iterator, + List, + Optional, + Tuple, + Union, + overload, +) + +import pandas as pd +import pyarrow as pa + +class ChangeType(IntEnum): + """Represents the type of change for a record in a log.""" + + AppendOnly = 0 + """Append-only operation""" + Insert = 1 + """Insert operation""" + UpdateBefore = 2 + """Update operation containing the previous content of the updated row""" + UpdateAfter = 3 + """Update operation containing the new content of the updated row""" + Delete = 4 + """Delete operation""" + + def short_string(self) -> str: + """Returns a short string representation (+A, +I, -U, +U, -D).""" + ... + +class ScanRecord: + """Represents a single scan record with metadata. 
+ + The bucket is the key in ScanRecords, not on the individual record + (matches Rust/Java). + """ + + @property + def offset(self) -> int: + """The position of this record in the log.""" + ... + @property + def timestamp(self) -> int: + """The timestamp of this record.""" + ... + @property + def change_type(self) -> ChangeType: + """The type of change (insert, update, delete, etc.).""" + ... + @property + def row(self) -> Dict[str, object]: + """The row data as a dictionary mapping column names to values.""" + ... + def __str__(self) -> str: ... + def __repr__(self) -> str: ... + +class RecordBatch: + """Represents a batch of records with metadata.""" + + @property + def batch(self) -> pa.RecordBatch: + """The Arrow RecordBatch containing the data.""" + ... + @property + def bucket(self) -> TableBucket: + """The bucket this batch belongs to.""" + ... + @property + def base_offset(self) -> int: + """The offset of the first record in this batch.""" + ... + @property + def last_offset(self) -> int: + """The offset of the last record in this batch.""" + ... + def __str__(self) -> str: ... + def __repr__(self) -> str: ... + +class ScanRecords: + """A collection of scan records grouped by bucket. + + Returned by ``LogScanner.poll()``. Supports flat iteration + (``for rec in records``) and per-bucket access (``records.records(bucket)``). + """ + + def buckets(self) -> List[TableBucket]: + """List of distinct buckets that have records.""" + ... + def records(self, bucket: TableBucket) -> List[ScanRecord]: + """Get records for a specific bucket. Returns empty list if bucket not present.""" + ... + def count(self) -> int: + """Total number of records across all buckets.""" + ... + def is_empty(self) -> bool: + """Whether the result set is empty.""" + ... + def keys(self) -> List[TableBucket]: + """Mapping protocol: alias for ``buckets()``.""" + ... 
+ def values(self) -> Iterator[List[ScanRecord]]: + """Mapping protocol: lazy iterator over record lists, one per bucket.""" + ... + def items(self) -> Iterator[Tuple[TableBucket, List[ScanRecord]]]: + """Mapping protocol: lazy iterator over ``(bucket, records)`` pairs.""" + ... + def __len__(self) -> int: ... + @overload + def __getitem__(self, index: int) -> ScanRecord: ... + @overload + def __getitem__(self, index: slice) -> List[ScanRecord]: ... + @overload + def __getitem__(self, bucket: TableBucket) -> List[ScanRecord]: ... + def __getitem__(self, key: Union[int, slice, TableBucket]) -> Union[ScanRecord, List[ScanRecord]]: ... + def __contains__(self, bucket: TableBucket) -> bool: ... + def __iter__(self) -> Iterator[ScanRecord]: ... + def __str__(self) -> str: ... + def __repr__(self) -> str: ... + +class Config: + def __init__(self, properties: Optional[Dict[str, str]] = None) -> None: ... + @property + def bootstrap_servers(self) -> str: ... + @bootstrap_servers.setter + def bootstrap_servers(self, server: str) -> None: ... + @property + def writer_request_max_size(self) -> int: ... + @writer_request_max_size.setter + def writer_request_max_size(self, size: int) -> None: ... + @property + def writer_acks(self) -> str: ... + @writer_acks.setter + def writer_acks(self, acks: str) -> None: ... + @property + def writer_retries(self) -> int: ... + @writer_retries.setter + def writer_retries(self, retries: int) -> None: ... + @property + def writer_batch_size(self) -> int: ... + @writer_batch_size.setter + def writer_batch_size(self, size: int) -> None: ... + @property + def writer_dynamic_batch_size_enabled(self) -> bool: ... + @writer_dynamic_batch_size_enabled.setter + def writer_dynamic_batch_size_enabled(self, enabled: bool) -> None: ... + @property + def writer_dynamic_batch_size_min(self) -> int: ... + @writer_dynamic_batch_size_min.setter + def writer_dynamic_batch_size_min(self, size: int) -> None: ... 
+ @property + def writer_bucket_no_key_assigner(self) -> str: ... + @writer_bucket_no_key_assigner.setter + def writer_bucket_no_key_assigner(self, value: str) -> None: ... + @property + def scanner_remote_log_prefetch_num(self) -> int: ... + @scanner_remote_log_prefetch_num.setter + def scanner_remote_log_prefetch_num(self, num: int) -> None: ... + @property + def remote_file_download_thread_num(self) -> int: ... + @remote_file_download_thread_num.setter + def remote_file_download_thread_num(self, num: int) -> None: ... + @property + def scanner_remote_log_read_concurrency(self) -> int: ... + @scanner_remote_log_read_concurrency.setter + def scanner_remote_log_read_concurrency(self, num: int) -> None: ... + @property + def scanner_log_max_poll_records(self) -> int: ... + @scanner_log_max_poll_records.setter + def scanner_log_max_poll_records(self, num: int) -> None: ... + @property + def scanner_log_fetch_max_bytes(self) -> int: ... + @scanner_log_fetch_max_bytes.setter + def scanner_log_fetch_max_bytes(self, bytes: int) -> None: ... + @property + def scanner_log_fetch_min_bytes(self) -> int: ... + @scanner_log_fetch_min_bytes.setter + def scanner_log_fetch_min_bytes(self, bytes: int) -> None: ... + @property + def scanner_log_fetch_wait_max_time_ms(self) -> int: ... + @scanner_log_fetch_wait_max_time_ms.setter + def scanner_log_fetch_wait_max_time_ms(self, ms: int) -> None: ... + @property + def scanner_log_fetch_max_bytes_for_bucket(self) -> int: ... + @scanner_log_fetch_max_bytes_for_bucket.setter + def scanner_log_fetch_max_bytes_for_bucket(self, bytes: int) -> None: ... + @property + def writer_batch_timeout_ms(self) -> int: ... + @writer_batch_timeout_ms.setter + def writer_batch_timeout_ms(self, timeout: int) -> None: ... + @property + def writer_enable_idempotence(self) -> bool: ... + @writer_enable_idempotence.setter + def writer_enable_idempotence(self, enabled: bool) -> None: ... 
+ @property + def writer_max_inflight_requests_per_bucket(self) -> int: ... + @writer_max_inflight_requests_per_bucket.setter + def writer_max_inflight_requests_per_bucket(self, num: int) -> None: ... + @property + def writer_buffer_memory_size(self) -> int: ... + @writer_buffer_memory_size.setter + def writer_buffer_memory_size(self, size: int) -> None: ... + @property + def writer_buffer_wait_timeout_ms(self) -> int: ... + @writer_buffer_wait_timeout_ms.setter + def writer_buffer_wait_timeout_ms(self, timeout: int) -> None: ... + @property + def connect_timeout_ms(self) -> int: ... + @connect_timeout_ms.setter + def connect_timeout_ms(self, timeout: int) -> None: ... + @property + def security_protocol(self) -> str: ... + @security_protocol.setter + def security_protocol(self, protocol: str) -> None: ... + @property + def security_sasl_mechanism(self) -> str: ... + @security_sasl_mechanism.setter + def security_sasl_mechanism(self, mechanism: str) -> None: ... + @property + def security_sasl_username(self) -> str: ... + @security_sasl_username.setter + def security_sasl_username(self, username: str) -> None: ... + @property + def security_sasl_password(self) -> str: ... + @security_sasl_password.setter + def security_sasl_password(self, password: str) -> None: ... + +class FlussConnection: + @staticmethod + async def create(config: Config) -> FlussConnection: ... + def get_admin(self) -> FlussAdmin: ... + async def get_table(self, table_path: TablePath) -> FlussTable: ... + async def close(self) -> None: ... + def __enter__(self) -> FlussConnection: ... + def __exit__( + self, + exc_type: Optional[type], + exc_value: Optional[BaseException], + traceback: Optional[TracebackType], + ) -> bool: ... + async def __aenter__(self) -> FlussConnection: ... + async def __aexit__( + self, + exc_type: Optional[type], + exc_value: Optional[BaseException], + traceback: Optional[TracebackType], + ) -> bool: ... + def __repr__(self) -> str: ... 
+ +class ServerNode: + """Information about a server node in the Fluss cluster.""" + + @property + def id(self) -> int: + """The server node ID.""" + ... + @property + def host(self) -> str: + """The hostname of the server.""" + ... + @property + def port(self) -> int: + """The port number of the server.""" + ... + @property + def server_type(self) -> str: + """The type of server ('CoordinatorServer' or 'TabletServer').""" + ... + @property + def uid(self) -> str: + """The unique identifier of the server (e.g. 'cs-0', 'ts-1').""" + ... + def __repr__(self) -> str: ... + +class FlussAdmin: + async def create_database( + self, + database_name: str, + database_descriptor: Optional["DatabaseDescriptor"] = None, + ignore_if_exists: bool = False, + ) -> None: + """Create a database.""" + ... + async def drop_database( + self, + database_name: str, + ignore_if_not_exists: bool = False, + cascade: bool = True, + ) -> None: + """Drop a database.""" + ... + async def list_databases(self) -> List[str]: + """List all databases.""" + ... + async def database_exists(self, database_name: str) -> bool: + """Check if a database exists.""" + ... + async def get_database_info(self, database_name: str) -> "DatabaseInfo": + """Get database information.""" + ... + async def list_tables(self, database_name: str) -> List[str]: + """List all tables in a database.""" + ... + async def table_exists(self, table_path: TablePath) -> bool: + """Check if a table exists.""" + ... + async def drop_partition( + self, + table_path: TablePath, + partition_spec: Dict[str, str], + ignore_if_not_exists: bool = False, + ) -> None: + """Drop a partition from a partitioned table.""" + ... + async def create_table( + self, + table_path: TablePath, + table_descriptor: TableDescriptor, + ignore_if_exists: Optional[bool] = False, + ) -> None: ... + async def get_table_info(self, table_path: TablePath) -> TableInfo: ... + async def get_latest_lake_snapshot(self, table_path: TablePath) -> LakeSnapshot: ... 
+ async def drop_table( + self, + table_path: TablePath, + ignore_if_not_exists: bool = False, + ) -> None: ... + async def list_offsets( + self, + table_path: TablePath, + bucket_ids: List[int], + offset_spec: "OffsetSpec", + ) -> Dict[int, int]: + """List offsets for the specified buckets. + + Args: + table_path: Path to the table + bucket_ids: List of bucket IDs to query + offset_spec: Offset specification (OffsetSpec.earliest(), OffsetSpec.latest(), + or OffsetSpec.timestamp(ts)) + + Returns: + Dict mapping bucket_id -> offset + """ + ... + async def list_partition_offsets( + self, + table_path: TablePath, + partition_name: str, + bucket_ids: List[int], + offset_spec: "OffsetSpec", + ) -> Dict[int, int]: + """List offsets for buckets in a specific partition. + + Args: + table_path: Path to the table + partition_name: Partition value (e.g., "US" not "region=US") + bucket_ids: List of bucket IDs to query + offset_spec: Offset specification (OffsetSpec.earliest(), OffsetSpec.latest(), + or OffsetSpec.timestamp(ts)) + + Returns: + Dict mapping bucket_id -> offset + """ + ... + async def create_partition( + self, + table_path: TablePath, + partition_spec: Dict[str, str], + ignore_if_exists: bool = False, + ) -> None: + """Create a partition for a partitioned table. + + Args: + table_path: Path to the table + partition_spec: Dict mapping partition column name to value (e.g., {"region": "US"}) + ignore_if_exists: If True, don't raise error if partition already exists + """ + ... + async def list_partition_infos( + self, + table_path: TablePath, + partition_spec: Optional[Dict[str, str]] = None, + ) -> List["PartitionInfo"]: + """List partitions for a partitioned table. + + Args: + table_path: Path to the table + partition_spec: Optional partial partition spec to filter results. + Dict mapping partition column name to value (e.g., {"region": "US"}). + If None, returns all partitions. + + Returns: + List of PartitionInfo objects + """ + ... 
+ async def get_server_nodes(self) -> List[ServerNode]: + """Get all alive server nodes in the cluster. + + Returns: + List of ServerNode objects (coordinator and tablet servers) + """ + ... + def __repr__(self) -> str: ... + + +class DatabaseDescriptor: + """Descriptor for a Fluss database (comment and custom properties).""" + + def __init__( + self, + comment: Optional[str] = None, + custom_properties: Optional[Dict[str, str]] = None, + ) -> None: ... + @property + def comment(self) -> Optional[str]: ... + def get_custom_properties(self) -> Dict[str, str]: ... + def __repr__(self) -> str: ... + + +class DatabaseInfo: + """Information about a Fluss database.""" + + @property + def database_name(self) -> str: ... + def get_database_descriptor(self) -> DatabaseDescriptor: ... + @property + def created_time(self) -> int: ... + @property + def modified_time(self) -> int: ... + def __repr__(self) -> str: ... + +class TableScan: + """Builder for creating log scanners with flexible configuration. + + Use this builder to configure projection before creating a log scanner. + Obtain a TableScan instance via `FlussTable.new_scan()`. + + Example: + ```python + # Record-based scanning with projection + scanner = await table.new_scan() \\ + .project([0, 1, 2]) \\ + .create_log_scanner() + + # Batch-based scanning with column names + scanner = await table.new_scan() \\ + .project_by_name(["id", "name"]) \\ + .create_record_batch_log_scanner() + ``` + """ + + def project(self, indices: List[int]) -> "TableScan": + """Project to specific columns by their indices. + + Args: + indices: List of column indices (0-based) to include in the scan. + + Returns: + Self for method chaining. + """ + ... + def project_by_name(self, names: List[str]) -> "TableScan": + """Project to specific columns by their names. + + Args: + names: List of column names to include in the scan. + + Returns: + Self for method chaining. + """ + ... 
+ async def create_log_scanner(self) -> LogScanner: + """Create a record-based log scanner. + + Use this scanner with `poll()` to get individual records with metadata + (offset, timestamp, change_type). + + Returns: + LogScanner for record-by-record scanning with `poll()` + """ + ... + async def create_record_batch_log_scanner(self) -> LogScanner: + """Create a batch-based log scanner. + + Use this scanner with `poll_arrow()` to get Arrow Tables, or with + `poll_record_batch()` to get individual batches with metadata. + + Returns: + LogScanner for batch-based scanning with `poll_arrow()` or `poll_record_batch()` + """ + ... + def __repr__(self) -> str: ... + +class FlussTable: + def new_scan(self) -> TableScan: + """Create a new table scan builder for configuring and creating log scanners. + + Use this method to create scanners with the builder pattern: + + Example: + ```python + # Record-based scanning + scanner = await table.new_scan() \\ + .project([0, 1]) \\ + .create_log_scanner() + + # Batch-based scanning + scanner = await table.new_scan() \\ + .project_by_name(["id", "name"]) \\ + .create_record_batch_log_scanner() + ``` + + Returns: + TableScan builder for configuring the scanner. + """ + ... + def new_append(self) -> TableAppend: ... + def new_upsert(self) -> TableUpsert: ... + def new_lookup(self) -> TableLookup: ... + def get_table_info(self) -> TableInfo: ... + def get_table_path(self) -> TablePath: ... + def has_primary_key(self) -> bool: ... + def __repr__(self) -> str: ... + +class TableAppend: + """Builder for creating an AppendWriter. + + Obtain via `FlussTable.new_append()`, then call `create_writer()`. + + Example: + writer = table.new_append().create_writer() + """ + + def create_writer(self) -> AppendWriter: ... + def __repr__(self) -> str: ... + +class TableUpsert: + """Builder for creating an UpsertWriter, with optional partial update. 
+ + Obtain via `FlussTable.new_upsert()`, then optionally call + `partial_update_by_name()` or `partial_update_by_index()`, + then call `create_writer()`. + + Example: + # Full row upsert + writer = table.new_upsert().create_writer() + + # Partial update by column names + writer = table.new_upsert().partial_update_by_name(["col1", "col2"]).create_writer() + + # Partial update by column indices + writer = table.new_upsert().partial_update_by_index([0, 1]).create_writer() + """ + + def partial_update_by_name(self, columns: List[str]) -> "TableUpsert": ... + def partial_update_by_index(self, column_indices: List[int]) -> "TableUpsert": ... + def create_writer(self) -> UpsertWriter: ... + def __repr__(self) -> str: ... + +class TableLookup: + """Builder for creating a Lookuper or PrefixLookuper. + + Obtain via `FlussTable.new_lookup()`, then call `create_lookuper()` + for primary key lookup, or `lookup_by(columns).create_lookuper()` + for prefix key lookup. + + Example: + lookuper = table.new_lookup().create_lookuper() + prefix_lookuper = table.new_lookup().lookup_by(["a", "b"]).create_lookuper() + """ + + def create_lookuper(self) -> Lookuper: ... + def lookup_by(self, column_names: List[str]) -> "TablePrefixLookup": + """Switch to prefix-scan mode for the given lookup columns. + + The columns must be the table's partition keys (if any) plus the + bucket keys, in that order. + + Args: + column_names: List of column names forming the prefix key. + + Returns: + TablePrefixLookup builder. Call `create_lookuper()` to get a PrefixLookuper. + """ + ... + def __repr__(self) -> str: ... + +class TablePrefixLookup: + """Builder for creating a PrefixLookuper. + + Obtain via `TableLookup.lookup_by(columns)`, then call `create_lookuper()`. + + Example: + prefix_lookuper = table.new_lookup().lookup_by(["a", "b"]).create_lookuper() + """ + + def create_lookuper(self) -> "PrefixLookuper": ... + def __repr__(self) -> str: ... 
+ +class AppendWriter: + def append(self, row: dict | list | tuple) -> WriteResultHandle: + """Append a single row to the table. + + Args: + row: Dictionary mapping field names to values, or + list/tuple of values in schema order + + Returns: + WriteResultHandle: Ignore for fire-and-forget, or await handle.wait() for acknowledgement. + + Supported Types: + - Boolean, TinyInt, SmallInt, Int, BigInt (integers) + - Float, Double (floating point) + - String, Char (text) + - Bytes, Binary (binary data) + - Date, Time, Timestamp, TimestampLTZ (temporal) + - Decimal (arbitrary precision) + - Null values + + Example: + writer.append({'id': 1, 'name': 'Alice', 'score': 95.5}) + writer.append([1, 'Alice', 95.5]) + + Note: + For high-throughput bulk loading, prefer write_arrow_batch(). + Use flush() to ensure all queued records are sent and acknowledged. + """ + ... + def write_arrow(self, table: pa.Table) -> None: ... + def write_arrow_batch(self, batch: pa.RecordBatch) -> WriteResultHandle: ... + def write_pandas(self, df: pd.DataFrame) -> None: ... + async def flush(self) -> None: ... + async def __aenter__(self) -> AppendWriter: + """ + Enter the async context manager. + + Returns: + The AppendWriter instance. + """ + ... + async def __aexit__( + self, + exc_type: Optional[type], + exc_value: Optional[BaseException], + traceback: Optional[TracebackType], + ) -> bool: + """ + Exit the async context manager. + + On exit, the writer is automatically flushed to ensure + all pending records are sent and acknowledged. + """ + ... + def __repr__(self) -> str: ... + +class UpsertWriter: + """Writer for upserting and deleting data in a Fluss primary key table.""" + + def upsert(self, row: dict | list | tuple) -> WriteResultHandle: + """Upsert a row into the table. + + If a row with the same primary key exists, it will be updated. + Otherwise, a new row will be inserted. 
+ + Args: + row: Dictionary mapping field names to values, or + list/tuple of values in schema order + + Returns: + WriteResultHandle: Ignore for fire-and-forget, or await handle.wait() for ack. + """ + ... + def delete(self, pk: dict | list | tuple) -> WriteResultHandle: + """Delete a row from the table by primary key. + + Args: + pk: Dictionary with PK column names as keys, or + list/tuple of PK values in PK column order + + Returns: + WriteResultHandle: Ignore for fire-and-forget, or await handle.wait() for ack. + """ + ... + async def flush(self) -> None: + """Flush all pending upsert/delete operations to the server.""" + ... + async def __aenter__(self) -> UpsertWriter: + """ + Enter the async context manager. + + Returns: + The UpsertWriter instance. + """ + ... + async def __aexit__( + self, + exc_type: Optional[type], + exc_value: Optional[BaseException], + traceback: Optional[TracebackType], + ) -> bool: + """ + Exit the async context manager. + + On exit, the writer is automatically flushed to ensure + all pending records are sent and acknowledged. + """ + ... + def __repr__(self) -> str: ... + + +class WriteResultHandle: + """Handle for a pending write (append/upsert/delete). Ignore for fire-and-forget, or await handle.wait() for ack.""" + + async def wait(self) -> None: + """Wait for server acknowledgment of this write.""" + ... + def __repr__(self) -> str: ... + + +class Lookuper: + """Lookuper for performing primary key lookups on a Fluss table.""" + + async def lookup(self, pk: dict | list | tuple) -> Optional[Dict[str, object]]: + """Lookup a row by its primary key. + + Args: + pk: Dictionary with PK column names as keys, or + list/tuple of PK values in PK column order + + Returns: + A dict containing the row data if found, None otherwise. + """ + ... + def __repr__(self) -> str: ... + +class PrefixLookuper: + """Lookuper for performing prefix key lookups on a Fluss table. + + Returns all rows whose primary key starts with the given prefix. 
+ Create via `table.new_lookup().lookup_by(columns).create_lookuper()`. + """ + + async def lookup(self, prefix: dict | list | tuple) -> List[Dict[str, object]]: + """Lookup all rows matching a prefix key. + + Args: + prefix: A dict, list, or tuple containing only the prefix key values + (the columns specified in lookup_by()). + For dict: keys are prefix column names. + For list/tuple: values in prefix column order. + + Returns: + A list of dicts, each containing the full row data. + Empty list if no matches. + """ + ... + def __repr__(self) -> str: ... + +class LogScanner: + """Scanner for reading log data from a Fluss table. + + This scanner supports two modes: + - Record-based scanning via `poll()` - returns individual records with metadata + - Batch-based scanning via `poll_arrow()` / `poll_record_batch()` - returns Arrow batches + + Create scanners using the builder pattern: + # Record-based scanning + scanner = await table.new_scan().create_log_scanner() + + # Batch-based scanning + scanner = await table.new_scan().create_record_batch_log_scanner() + + # With projection + scanner = await table.new_scan().project([0, 1]).create_log_scanner() + """ + + def subscribe(self, bucket_id: int, start_offset: int) -> None: + """Subscribe to a single bucket at a specific offset (non-partitioned tables). + + Args: + bucket_id: The bucket ID to subscribe to + start_offset: The offset to start reading from (use EARLIEST_OFFSET for beginning) + """ + ... + def subscribe_buckets(self, bucket_offsets: Dict[int, int]) -> None: + """Subscribe to multiple buckets at specified offsets (non-partitioned tables). + + Args: + bucket_offsets: Dict mapping bucket_id -> start_offset + """ + ... + def subscribe_partition( + self, partition_id: int, bucket_id: int, start_offset: int + ) -> None: + """Subscribe to a bucket within a specific partition (partitioned tables only). 
+ + Args: + partition_id: The partition ID (from PartitionInfo.partition_id) + bucket_id: The bucket ID within the partition + start_offset: The offset to start reading from (use EARLIEST_OFFSET for beginning) + """ + ... + def subscribe_partition_buckets( + self, partition_bucket_offsets: Dict[Tuple[int, int], int] + ) -> None: + """Subscribe to multiple partition+bucket combinations at once (partitioned tables only). + + Args: + partition_bucket_offsets: Dict mapping (partition_id, bucket_id) tuples to start_offsets. + Example: {(partition_id_1, 0): EARLIEST_OFFSET, (partition_id_2, 1): 100} + """ + ... + def unsubscribe(self, bucket_id: int) -> None: + """Unsubscribe from a specific bucket (non-partitioned tables only). + + Args: + bucket_id: The bucket ID to unsubscribe from + """ + ... + def unsubscribe_partition(self, partition_id: int, bucket_id: int) -> None: + """Unsubscribe from a specific partition bucket (partitioned tables only). + + Args: + partition_id: The partition ID to unsubscribe from + bucket_id: The bucket ID within the partition + """ + ... + async def poll(self, timeout_ms: int) -> ScanRecords: + """Poll for individual records with metadata. + + Requires a record-based scanner (created with new_scan().create_log_scanner()). + + Args: + timeout_ms: Timeout in milliseconds to wait for records. + + Returns: + ScanRecords grouped by bucket. Supports flat iteration + (``for rec in records``) and per-bucket access + (``records.buckets()``, ``records.records(bucket)``). + + Note: + Returns an empty ScanRecords if no records are available or timeout expires. + """ + ... + async def poll_record_batch(self, timeout_ms: int) -> List[RecordBatch]: + """Poll for batches with metadata. + + Requires a batch-based scanner (created with new_scan().create_record_batch_log_scanner()). + + Args: + timeout_ms: Timeout in milliseconds to wait for batches. 
+ + Returns: + List of RecordBatch objects, each containing the Arrow batch along with + bucket, base_offset, and last_offset metadata. + + Note: + Returns an empty list if no batches are available or timeout expires. + """ + ... + async def poll_arrow(self, timeout_ms: int) -> pa.Table: + """Poll for records as an Arrow Table. + + Requires a batch-based scanner (created with new_scan().create_record_batch_log_scanner()). + + Args: + timeout_ms: Timeout in milliseconds to wait for records. + + Returns: + PyArrow Table containing the polled records (batches merged). + + Note: + Returns an empty table (with correct schema) if no records are available + or timeout expires. + """ + ... + def to_arrow_batch_reader(self) -> pa.RecordBatchReader: + """Create a lazy Arrow RecordBatchReader that reads until latest offsets. + + Returns a ``pyarrow.RecordBatchReader`` that lazily polls batches one at + a time (streaming). Prefer this when you want to process batches without + holding the full result in memory at once. + + Do not call ``poll_arrow`` / ``poll_record_batch`` on this scanner while + iterating the reader; they share the same underlying scanner state. + Overlapping calls are not supported. Use one active + polling/consumption path at a time. + + Requires a batch-based scanner (created with ``new_scan().create_record_batch_log_scanner()``). + You must call ``subscribe()``, ``subscribe_buckets()``, ``subscribe_partition()``, + or ``subscribe_partition_buckets()`` first. + + Returns: + ``pyarrow.RecordBatchReader`` yielding ``RecordBatch`` objects. + """ + ... + async def to_pandas(self) -> pd.DataFrame: + """Convert all data to Pandas DataFrame. + + Requires a batch-based scanner (created with new_scan().create_record_batch_log_scanner()). + Reads from currently subscribed buckets until reaching their latest offsets. + + You must call subscribe(), subscribe_buckets(), subscribe_partition(), or subscribe_partition_buckets() first. + """ + ... 
+ async def to_arrow(self) -> pa.Table: + """Convert all data to Arrow Table. + + Batches are collected in Rust then combined into one table (no per-batch + Python iteration). Do not interleave with ``poll_arrow`` / ``poll_record_batch`` + for the same subscription session; overlapping use is not supported. + + Requires a batch-based scanner (created with new_scan().create_record_batch_log_scanner()). + Reads from currently subscribed buckets until reaching their latest offsets. + + You must call subscribe(), subscribe_buckets(), subscribe_partition(), or subscribe_partition_buckets() first. + """ + ... + + def __repr__(self) -> str: ... + def __aiter__(self) -> AsyncIterator[Union[ScanRecord, RecordBatch]]: ... + +class Schema: + def __init__( + self, schema: pa.Schema, primary_keys: Optional[List[str]] = None + ) -> None: ... + def get_column_names(self) -> List[str]: ... + def get_column_types(self) -> List[str]: ... + def get_columns(self) -> List[Tuple[str, str]]: ... + def get_primary_keys(self) -> List[str]: ... + def __str__(self) -> str: ... + +class TableDescriptor: + def __init__( + self, + schema: Schema, + *, + partition_keys: Optional[List[str]] = None, + bucket_count: Optional[int] = None, + bucket_keys: Optional[List[str]] = None, + comment: Optional[str] = None, + log_format: Optional[str] = None, + kv_format: Optional[str] = None, + properties: Optional[Dict[str, str]] = None, + custom_properties: Optional[Dict[str, str]] = None, + ) -> None: ... + def get_schema(self) -> Schema: ... + +class TablePath: + def __init__(self, database: str, table: str) -> None: ... + @property + def database_name(self) -> str: ... + @property + def table_name(self) -> str: ... + def table_path_str(self) -> str: ... + def __str__(self) -> str: ... + def __repr__(self) -> str: ... + def __hash__(self) -> int: ... + def __eq__(self, other: object) -> bool: ... + +class TableInfo: + @property + def table_id(self) -> int: ... + @property + def schema_id(self) -> int: ... 
+ @property + def created_time(self) -> int: ... + @property + def modified_time(self) -> int: ... + @property + def table_path(self) -> TablePath: ... + @property + def num_buckets(self) -> int: ... + @property + def comment(self) -> Optional[str]: ... + def get_primary_keys(self) -> List[str]: ... + def get_bucket_keys(self) -> List[str]: ... + def get_partition_keys(self) -> List[str]: ... + def has_primary_key(self) -> bool: ... + def is_partitioned(self) -> bool: ... + def get_properties(self) -> Dict[str, str]: ... + def get_custom_properties(self) -> Dict[str, str]: ... + def get_schema(self) -> Schema: ... + def get_column_names(self) -> List[str]: ... + def get_column_count(self) -> int: ... + +class FlussError(Exception): + message: str + error_code: int + def __init__(self, message: str, error_code: int = -2) -> None: ... + def __str__(self) -> str: ... + @property + def is_retriable(self) -> bool: ... + +class LakeSnapshot: + def __init__(self, snapshot_id: int) -> None: ... + @property + def snapshot_id(self) -> int: ... + @property + def table_buckets_offset(self) -> Dict[TableBucket, int]: ... + def get_bucket_offset(self, bucket: TableBucket) -> Optional[int]: ... + def get_table_buckets(self) -> List[TableBucket]: ... + def __str__(self) -> str: ... + def __repr__(self) -> str: ... + +class TableBucket: + def __init__(self, table_id: int, bucket: int) -> None: ... + @staticmethod + def with_partition( + table_id: int, partition_id: int, bucket: int + ) -> TableBucket: ... + @property + def table_id(self) -> int: ... + @property + def bucket_id(self) -> int: ... + @property + def partition_id(self) -> Optional[int]: ... + def __hash__(self) -> int: ... + def __eq__(self, other: object) -> bool: ... + def __str__(self) -> str: ... + def __repr__(self) -> str: ... + +class PartitionInfo: + """Information about a partition.""" + + @property + def partition_id(self) -> int: + """Get the partition ID (globally unique in the cluster).""" + ... 
+ @property + def partition_name(self) -> str: + """Get the partition name.""" + ... + def __repr__(self) -> str: ... + +class ErrorCode: + """Named constants for Fluss API error codes. + + Server API errors have error_code > 0 or == -1. + Client-side errors have error_code == CLIENT_ERROR (-2). + These constants are convenience names — new server error codes work + automatically since error_code is a raw int, not a closed enum. + """ + + CLIENT_ERROR: int + NONE: int + UNKNOWN_SERVER_ERROR: int + NETWORK_EXCEPTION: int + UNSUPPORTED_VERSION: int + CORRUPT_MESSAGE: int + DATABASE_NOT_EXIST: int + DATABASE_NOT_EMPTY: int + DATABASE_ALREADY_EXIST: int + TABLE_NOT_EXIST: int + TABLE_ALREADY_EXIST: int + SCHEMA_NOT_EXIST: int + LOG_STORAGE_EXCEPTION: int + KV_STORAGE_EXCEPTION: int + NOT_LEADER_OR_FOLLOWER: int + RECORD_TOO_LARGE_EXCEPTION: int + CORRUPT_RECORD_EXCEPTION: int + INVALID_TABLE_EXCEPTION: int + INVALID_DATABASE_EXCEPTION: int + INVALID_REPLICATION_FACTOR: int + INVALID_REQUIRED_ACKS: int + LOG_OFFSET_OUT_OF_RANGE_EXCEPTION: int + NON_PRIMARY_KEY_TABLE_EXCEPTION: int + UNKNOWN_TABLE_OR_BUCKET_EXCEPTION: int + INVALID_UPDATE_VERSION_EXCEPTION: int + INVALID_COORDINATOR_EXCEPTION: int + FENCED_LEADER_EPOCH_EXCEPTION: int + REQUEST_TIME_OUT: int + STORAGE_EXCEPTION: int + OPERATION_NOT_ATTEMPTED_EXCEPTION: int + NOT_ENOUGH_REPLICAS_AFTER_APPEND_EXCEPTION: int + NOT_ENOUGH_REPLICAS_EXCEPTION: int + SECURITY_TOKEN_EXCEPTION: int + OUT_OF_ORDER_SEQUENCE_EXCEPTION: int + DUPLICATE_SEQUENCE_EXCEPTION: int + UNKNOWN_WRITER_ID_EXCEPTION: int + INVALID_COLUMN_PROJECTION: int + INVALID_TARGET_COLUMN: int + PARTITION_NOT_EXISTS: int + TABLE_NOT_PARTITIONED_EXCEPTION: int + INVALID_TIMESTAMP_EXCEPTION: int + INVALID_CONFIG_EXCEPTION: int + LAKE_STORAGE_NOT_CONFIGURED_EXCEPTION: int + KV_SNAPSHOT_NOT_EXIST: int + PARTITION_ALREADY_EXISTS: int + PARTITION_SPEC_INVALID_EXCEPTION: int + LEADER_NOT_AVAILABLE_EXCEPTION: int + PARTITION_MAX_NUM_EXCEPTION: int + 
AUTHENTICATE_EXCEPTION: int + SECURITY_DISABLED_EXCEPTION: int + AUTHORIZATION_EXCEPTION: int + BUCKET_MAX_NUM_EXCEPTION: int + FENCED_TIERING_EPOCH_EXCEPTION: int + RETRIABLE_AUTHENTICATE_EXCEPTION: int + INVALID_SERVER_RACK_INFO_EXCEPTION: int + LAKE_SNAPSHOT_NOT_EXIST: int + LAKE_TABLE_ALREADY_EXIST: int + INELIGIBLE_REPLICA_EXCEPTION: int + INVALID_ALTER_TABLE_EXCEPTION: int + DELETION_DISABLED_EXCEPTION: int + +class OffsetSpec: + """Offset specification for list_offsets(), matching Java's OffsetSpec. + + Use factory methods to create instances: + OffsetSpec.earliest() + OffsetSpec.latest() + OffsetSpec.timestamp(ts) + """ + + @staticmethod + def earliest() -> "OffsetSpec": + """Create an OffsetSpec for the earliest available offset.""" + ... + @staticmethod + def latest() -> "OffsetSpec": + """Create an OffsetSpec for the latest available offset.""" + ... + @staticmethod + def timestamp(ts: int) -> "OffsetSpec": + """Create an OffsetSpec for the offset at or after the given timestamp.""" + ... + def __repr__(self) -> str: ... + +# Constant for earliest offset (-2) +EARLIEST_OFFSET: int + +__version__: str diff --git a/fluss-rust/bindings/python/fluss/py.typed b/fluss-rust/bindings/python/fluss/py.typed new file mode 100644 index 0000000000..e69de29bb2 diff --git a/fluss-rust/bindings/python/pyproject.toml b/fluss-rust/bindings/python/pyproject.toml new file mode 100644 index 0000000000..56a059c9d4 --- /dev/null +++ b/fluss-rust/bindings/python/pyproject.toml @@ -0,0 +1,105 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +[build-system] +requires = ["maturin>=1.0,<2.0"] +build-backend = "maturin" + +[project] +name = "pyfluss" +description = "Apache Fluss (incubating) Python client" +authors = [{name = "Apache Fluss", email = "dev@fluss.apache.org"}] +license = {text = "Apache-2.0"} +readme = "PYPI_README.md" +requires-python = ">=3.9" +classifiers = [ + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", +] + +dynamic = ["version"] + +dependencies = [ + "pandas>=2.3.1", + "pyarrow>=10.0.0", +] + +[project.urls] +Homepage = "https://clients.fluss.apache.org/user-guide/python/installation/" +Repository = "https://github.com/apache/fluss-rust" + +[project.optional-dependencies] +dev = [ + "mypy>=1.17.1", + "pytest>=8.3.5", + "pytest-asyncio>=0.25.3", + "pytest-xdist>=3.5.0", + "pytest-timeout>=2.3.1", + "filelock>=3.0", + "ruff>=0.9.10", + "maturin>=1.8.2", +] +docs = [ + "pdoc>=15.0.4", +] + +[tool.maturin] +python-source = "." 
+module-name = "fluss._fluss" +features = ["pyo3/extension-module"] + +[tool.uv] +cache-keys = [ + { file = "pyproject.toml" }, + { file = "Cargo.toml" }, + { file = "src/**/*.rs" }, + { file = "../../crates/**/*.rs" }, +] + +[tool.ruff] +line-length = 88 +fix = true + +[tool.ruff.lint] +ignore = ["E402", "F403", "F405"] +select = ["E", "F", "I"] + +[tool.ruff.lint.pycodestyle] +max-doc-length = 88 + +[tool.ruff.lint.pydocstyle] +convention = "numpy" + +[tool.ruff.format] +docstring-code-format = true + +[tool.ruff.lint.isort] +known-first-party = ["fluss"] + +[tool.pytest.ini_options] +asyncio_mode = "auto" +asyncio_default_fixture_loop_scope = "session" +timeout = 120 + +[tool.mypy] +python_version = "3.9" +warn_return_any = true +warn_unused_configs = true +ignore_missing_imports = true diff --git a/fluss-rust/bindings/python/src/admin.rs b/fluss-rust/bindings/python/src/admin.rs new file mode 100644 index 0000000000..5f4e45d5b9 --- /dev/null +++ b/fluss-rust/bindings/python/src/admin.rs @@ -0,0 +1,633 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::*; +use pyo3::conversion::IntoPyObject; +use pyo3_async_runtimes::tokio::future_into_py; +use std::sync::Arc; + +/// Administrative client for managing Fluss tables +#[pyclass] +pub struct FlussAdmin { + __admin: Arc, +} + +/// Validate bucket IDs are non-negative +fn validate_bucket_ids(bucket_ids: &[i32]) -> PyResult<()> { + for &bucket_id in bucket_ids { + if bucket_id < 0 { + return Err(FlussError::new_err(format!( + "Invalid bucket_id: {bucket_id}. Bucket IDs must be non-negative" + ))); + } + } + Ok(()) +} + +#[pymethods] +impl FlussAdmin { + /// Create a database. + /// + /// Args: + /// database_name: Name of the database + /// ignore_if_exists: If True, don't raise error if database already exists + /// database_descriptor: Optional descriptor (comment, custom_properties) + /// + /// Returns: + /// None + #[pyo3(signature = (database_name, database_descriptor=None, ignore_if_exists=false))] + pub fn create_database<'py>( + &self, + py: Python<'py>, + database_name: &str, + database_descriptor: Option<&DatabaseDescriptor>, + ignore_if_exists: bool, + ) -> PyResult> { + let admin = self.__admin.clone(); + let name = database_name.to_string(); + let descriptor = database_descriptor.map(|d| d.to_core().clone()); + + future_into_py(py, async move { + admin + .create_database(&name, descriptor.as_ref(), ignore_if_exists) + .await + .map_err(|e| FlussError::from_core_error(&e))?; + + Python::attach(|py| Ok(py.None())) + }) + } + + /// Drop a database. 
+ /// + /// Args: + /// database_name: Name of the database + /// ignore_if_not_exists: If True, don't raise error if database does not exist + /// cascade: If True, drop tables in the database first + /// + /// Returns: + /// None + #[pyo3(signature = (database_name, ignore_if_not_exists=false, cascade=true))] + pub fn drop_database<'py>( + &self, + py: Python<'py>, + database_name: &str, + ignore_if_not_exists: bool, + cascade: bool, + ) -> PyResult> { + let admin = self.__admin.clone(); + let name = database_name.to_string(); + + future_into_py(py, async move { + admin + .drop_database(&name, ignore_if_not_exists, cascade) + .await + .map_err(|e| FlussError::from_core_error(&e))?; + + Python::attach(|py| Ok(py.None())) + }) + } + + /// List all databases. + /// + /// Returns: + /// List[str]: Names of all databases + pub fn list_databases<'py>(&self, py: Python<'py>) -> PyResult> { + let admin = self.__admin.clone(); + + future_into_py(py, async move { + let names = admin + .list_databases() + .await + .map_err(|e| FlussError::from_core_error(&e))?; + + Python::attach(|py| { + let py_list = pyo3::types::PyList::empty(py); + for name in names { + py_list.append(name)?; + } + Ok(py_list.unbind()) + }) + }) + } + + /// Check if a database exists. + /// + /// Args: + /// database_name: Name of the database + /// + /// Returns: + /// bool: True if the database exists + pub fn database_exists<'py>( + &self, + py: Python<'py>, + database_name: &str, + ) -> PyResult> { + let admin = self.__admin.clone(); + let name = database_name.to_string(); + + future_into_py(py, async move { + let exists = admin + .database_exists(&name) + .await + .map_err(|e| FlussError::from_core_error(&e))?; + + Python::attach(|py| Ok(exists.into_pyobject(py)?.to_owned().into_any().unbind())) + }) + } + + /// Get database information. 
+ /// + /// Args: + /// database_name: Name of the database + /// + /// Returns: + /// DatabaseInfo: Database metadata + pub fn get_database_info<'py>( + &self, + py: Python<'py>, + database_name: &str, + ) -> PyResult> { + let admin = self.__admin.clone(); + let name = database_name.to_string(); + + future_into_py(py, async move { + let info = admin + .get_database_info(&name) + .await + .map_err(|e| FlussError::from_core_error(&e))?; + + Python::attach(|py| Py::new(py, DatabaseInfo::from_core(info))) + }) + } + + /// List all tables in a database. + /// + /// Args: + /// database_name: Name of the database + /// + /// Returns: + /// List[str]: Names of all tables in the database + pub fn list_tables<'py>( + &self, + py: Python<'py>, + database_name: &str, + ) -> PyResult> { + let admin = self.__admin.clone(); + let name = database_name.to_string(); + + future_into_py(py, async move { + let names = admin + .list_tables(&name) + .await + .map_err(|e| FlussError::from_core_error(&e))?; + + Python::attach(|py| { + let py_list = pyo3::types::PyList::empty(py); + for name in names { + py_list.append(name)?; + } + Ok(py_list.unbind()) + }) + }) + } + + /// Check if a table exists. + /// + /// Args: + /// table_path: Path to the table (database, table) + /// + /// Returns: + /// bool: True if the table exists + pub fn table_exists<'py>( + &self, + py: Python<'py>, + table_path: &TablePath, + ) -> PyResult> { + let core_table_path = table_path.to_core(); + let admin = self.__admin.clone(); + + future_into_py(py, async move { + let exists = admin + .table_exists(&core_table_path) + .await + .map_err(|e| FlussError::from_core_error(&e))?; + + Python::attach(|py| Ok(exists.into_pyobject(py)?.to_owned().into_any().unbind())) + }) + } + + /// Drop a partition from a partitioned table. 
+ /// + /// Args: + /// table_path: Path to the table + /// partition_spec: Dict mapping partition column name to value (e.g., {"region": "US"}) + /// ignore_if_not_exists: If True, don't raise error if partition does not exist + /// + /// Returns: + /// None + #[pyo3(signature = (table_path, partition_spec, ignore_if_not_exists=false))] + pub fn drop_partition<'py>( + &self, + py: Python<'py>, + table_path: &TablePath, + partition_spec: std::collections::HashMap, + ignore_if_not_exists: bool, + ) -> PyResult> { + let core_table_path = table_path.to_core(); + let admin = self.__admin.clone(); + let core_partition_spec = fcore::metadata::PartitionSpec::new(partition_spec); + + future_into_py(py, async move { + admin + .drop_partition(&core_table_path, &core_partition_spec, ignore_if_not_exists) + .await + .map_err(|e| FlussError::from_core_error(&e))?; + + Python::attach(|py| Ok(py.None())) + }) + } + + /// Create a table with the given schema + #[pyo3(signature = (table_path, table_descriptor, ignore_if_exists=None))] + pub fn create_table<'py>( + &self, + py: Python<'py>, + table_path: &TablePath, + table_descriptor: &TableDescriptor, + ignore_if_exists: Option, + ) -> PyResult> { + let ignore = ignore_if_exists.unwrap_or(false); + + let core_table_path = table_path.to_core(); + let core_descriptor = table_descriptor.to_core().clone(); + let admin = self.__admin.clone(); + + future_into_py(py, async move { + admin + .create_table(&core_table_path, &core_descriptor, ignore) + .await + .map_err(|e| FlussError::from_core_error(&e))?; + + Python::attach(|py| Ok(py.None())) + }) + } + + /// Get table information + pub fn get_table_info<'py>( + &self, + py: Python<'py>, + table_path: &TablePath, + ) -> PyResult> { + let core_table_path = table_path.to_core(); + let admin = self.__admin.clone(); + + future_into_py(py, async move { + let core_table_info = admin + .get_table_info(&core_table_path) + .await + .map_err(|e| FlussError::from_core_error(&e))?; + + 
Python::attach(|py| { + let table_info = TableInfo::from_core(core_table_info); + Py::new(py, table_info) + }) + }) + } + + /// Get the latest lake snapshot for a table + pub fn get_latest_lake_snapshot<'py>( + &self, + py: Python<'py>, + table_path: &TablePath, + ) -> PyResult> { + let core_table_path = table_path.to_core(); + let admin = self.__admin.clone(); + + future_into_py(py, async move { + let core_lake_snapshot = admin + .get_latest_lake_snapshot(&core_table_path) + .await + .map_err(|e| FlussError::from_core_error(&e))?; + + Python::attach(|py| { + let lake_snapshot = LakeSnapshot::from_core(core_lake_snapshot); + Py::new(py, lake_snapshot) + }) + }) + } + + /// Drop a table + #[pyo3(signature = (table_path, ignore_if_not_exists=false))] + pub fn drop_table<'py>( + &self, + py: Python<'py>, + table_path: &TablePath, + ignore_if_not_exists: bool, + ) -> PyResult> { + let core_table_path = table_path.to_core(); + let admin = self.__admin.clone(); + + future_into_py(py, async move { + admin + .drop_table(&core_table_path, ignore_if_not_exists) + .await + .map_err(|e| FlussError::from_core_error(&e))?; + + Python::attach(|py| Ok(py.None())) + }) + } + + /// List offsets for buckets (non-partitioned tables only). 
+ /// + /// Args: + /// table_path: Path to the table + /// bucket_ids: List of bucket IDs to query + /// offset_spec: Offset specification (OffsetSpec.earliest(), OffsetSpec.latest(), + /// or OffsetSpec.timestamp(ts)) + /// + /// Returns: + /// dict[int, int]: Mapping of bucket_id -> offset + pub fn list_offsets<'py>( + &self, + py: Python<'py>, + table_path: &TablePath, + bucket_ids: Vec, + offset_spec: &OffsetSpec, + ) -> PyResult> { + validate_bucket_ids(&bucket_ids)?; + let offset_spec = offset_spec.inner.clone(); + + let core_table_path = table_path.to_core(); + let admin = self.__admin.clone(); + + future_into_py(py, async move { + let offsets = admin + .list_offsets(&core_table_path, &bucket_ids, offset_spec) + .await + .map_err(|e| FlussError::from_core_error(&e))?; + + Python::attach(|py| { + let dict = pyo3::types::PyDict::new(py); + for (bucket_id, offset) in offsets { + dict.set_item(bucket_id, offset)?; + } + Ok(dict.unbind()) + }) + }) + } + + /// List offsets for buckets in a specific partition of a partitioned table. 
+ /// + /// Args: + /// table_path: Path to the table + /// partition_name: Partition value (e.g., "US" not "region=US") + /// bucket_ids: List of bucket IDs to query + /// offset_spec: Offset specification (OffsetSpec.earliest(), OffsetSpec.latest(), + /// or OffsetSpec.timestamp(ts)) + /// + /// Returns: + /// dict[int, int]: Mapping of bucket_id -> offset + pub fn list_partition_offsets<'py>( + &self, + py: Python<'py>, + table_path: &TablePath, + partition_name: &str, + bucket_ids: Vec, + offset_spec: &OffsetSpec, + ) -> PyResult> { + validate_bucket_ids(&bucket_ids)?; + let offset_spec = offset_spec.inner.clone(); + + let core_table_path = table_path.to_core(); + let admin = self.__admin.clone(); + let partition_name = partition_name.to_string(); + + future_into_py(py, async move { + let offsets = admin + .list_partition_offsets(&core_table_path, &partition_name, &bucket_ids, offset_spec) + .await + .map_err(|e| FlussError::from_core_error(&e))?; + + Python::attach(|py| { + let dict = pyo3::types::PyDict::new(py); + for (bucket_id, offset) in offsets { + dict.set_item(bucket_id, offset)?; + } + Ok(dict.unbind()) + }) + }) + } + + /// Create a partition for a partitioned table. 
+ /// + /// Args: + /// table_path: Path to the table + /// partition_spec: Dict mapping partition column name to value (e.g., {"region": "US"}) + /// ignore_if_exists: If True, don't raise error if partition already exists + /// + /// Returns: + /// None + #[pyo3(signature = (table_path, partition_spec, ignore_if_exists=false))] + pub fn create_partition<'py>( + &self, + py: Python<'py>, + table_path: &TablePath, + partition_spec: std::collections::HashMap, + ignore_if_exists: bool, + ) -> PyResult> { + let core_table_path = table_path.to_core(); + let admin = self.__admin.clone(); + let core_partition_spec = fcore::metadata::PartitionSpec::new(partition_spec); + + future_into_py(py, async move { + admin + .create_partition(&core_table_path, &core_partition_spec, ignore_if_exists) + .await + .map_err(|e| FlussError::from_core_error(&e))?; + + Python::attach(|py| Ok(py.None())) + }) + } + + /// List partitions for a partitioned table. + /// + /// Args: + /// table_path: Path to the table + /// partition_spec: Optional partial partition spec to filter results. + /// Dict mapping partition column name to value (e.g., {"region": "US"}). + /// If None, returns all partitions. 
+ /// + /// Returns: + /// List[PartitionInfo]: List of partition info objects + #[pyo3(signature = (table_path, partition_spec=None))] + pub fn list_partition_infos<'py>( + &self, + py: Python<'py>, + table_path: &TablePath, + partition_spec: Option>, + ) -> PyResult> { + let core_table_path = table_path.to_core(); + let admin = self.__admin.clone(); + let core_partition_spec = partition_spec.map(fcore::metadata::PartitionSpec::new); + + future_into_py(py, async move { + let partition_infos = admin + .list_partition_infos_with_spec(&core_table_path, core_partition_spec.as_ref()) + .await + .map_err(|e| FlussError::from_core_error(&e))?; + + Python::attach(|py| { + let py_list = pyo3::types::PyList::empty(py); + for info in partition_infos { + let py_info = PartitionInfo::from_core(info); + py_list.append(Py::new(py, py_info)?)?; + } + Ok(py_list.unbind()) + }) + }) + } + + /// Get all alive server nodes in the cluster. + /// + /// Returns: + /// List[ServerNode]: List of server nodes (coordinator and tablet servers) + pub fn get_server_nodes<'py>(&self, py: Python<'py>) -> PyResult> { + let admin = self.__admin.clone(); + + future_into_py(py, async move { + let nodes = admin + .get_server_nodes() + .await + .map_err(|e| FlussError::from_core_error(&e))?; + + Python::attach(|py| { + let py_list = pyo3::types::PyList::empty(py); + for node in nodes { + let py_node = ServerNode::from_core(node); + py_list.append(Py::new(py, py_node)?)?; + } + Ok(py_list.unbind()) + }) + }) + } + + fn __repr__(&self) -> String { + "FlussAdmin()".to_string() + } +} + +impl FlussAdmin { + // Internal method to create FlussAdmin from core admin + pub fn from_core(admin: Arc) -> Self { + Self { __admin: admin } + } +} + +/// Information about a partition +#[pyclass] +pub struct PartitionInfo { + partition_id: i64, + partition_name: String, +} + +#[pymethods] +impl PartitionInfo { + /// Get the partition ID (globally unique in the cluster) + #[getter] + fn partition_id(&self) -> i64 { + 
self.partition_id + } + + /// Get the partition name (e.g., "US" for a table partitioned by region) + #[getter] + fn partition_name(&self) -> &str { + &self.partition_name + } + + fn __repr__(&self) -> String { + format!( + "PartitionInfo(partition_id={}, partition_name='{}')", + self.partition_id, self.partition_name + ) + } +} + +impl PartitionInfo { + pub fn from_core(info: fcore::metadata::PartitionInfo) -> Self { + Self { + partition_id: info.get_partition_id(), + partition_name: info.get_partition_name(), + } + } +} + +/// Information about a server node in the Fluss cluster +#[pyclass] +pub struct ServerNode { + id: i32, + host: String, + port: u32, + server_type: String, + uid: String, +} + +#[pymethods] +impl ServerNode { + #[getter] + fn id(&self) -> i32 { + self.id + } + + #[getter] + fn host(&self) -> &str { + &self.host + } + + #[getter] + fn port(&self) -> u32 { + self.port + } + + #[getter] + fn server_type(&self) -> &str { + &self.server_type + } + + #[getter] + fn uid(&self) -> &str { + &self.uid + } + + fn __repr__(&self) -> String { + format!( + "ServerNode(id={}, host='{}', port={}, server_type='{}')", + self.id, self.host, self.port, self.server_type + ) + } +} + +impl ServerNode { + pub fn from_core(node: fcore::ServerNode) -> Self { + Self { + id: node.id(), + host: node.host().to_string(), + port: node.port(), + server_type: node.server_type().to_string(), + uid: node.uid().to_string(), + } + } +} diff --git a/fluss-rust/bindings/python/src/config.rs b/fluss-rust/bindings/python/src/config.rs new file mode 100644 index 0000000000..11188bf3c6 --- /dev/null +++ b/fluss-rust/bindings/python/src/config.rs @@ -0,0 +1,535 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::*; +use pyo3::types::PyDict; + +/// Configuration for Fluss client +#[pyclass] +#[derive(Clone)] +pub struct Config { + inner: fcore::config::Config, +} + +#[pymethods] +impl Config { + /// Create a new Config with optional properties from a dictionary + #[new] + #[pyo3(signature = (properties = None))] + fn new(properties: Option<&Bound<'_, PyDict>>) -> PyResult { + let mut config = fcore::config::Config::default(); + + if let Some(props) = properties { + for item in props.iter() { + let key: String = item.0.extract()?; + let value: String = item.1.extract()?; + + match key.as_str() { + "bootstrap.servers" => { + config.bootstrap_servers = value; + } + "writer.request-max-size" => { + config.writer_request_max_size = value.parse::().map_err(|e| { + FlussError::new_err(format!("Invalid value '{value}' for '{key}': {e}")) + })?; + } + "writer.acks" => { + config.writer_acks = value; + } + "writer.retries" => { + config.writer_retries = value.parse::().map_err(|e| { + FlussError::new_err(format!("Invalid value '{value}' for '{key}': {e}")) + })?; + } + "writer.batch-size" => { + config.writer_batch_size = value.parse::().map_err(|e| { + FlussError::new_err(format!("Invalid value '{value}' for '{key}': {e}")) + })?; + } + "writer.dynamic-batch-size.enabled" => { + config.writer_dynamic_batch_size_enabled = match value.as_str() { + "true" => true, + "false" => false, + other 
=> { + return Err(FlussError::new_err(format!( + "Invalid value '{other}' for '{key}', expected 'true' or 'false'" + ))); + } + }; + } + "writer.dynamic-batch-size-min" => { + config.writer_dynamic_batch_size_min = + value.parse::().map_err(|e| { + FlussError::new_err(format!( + "Invalid value '{value}' for '{key}': {e}" + )) + })?; + } + "writer.batch-timeout-ms" => { + config.writer_batch_timeout_ms = value.parse::().map_err(|e| { + FlussError::new_err(format!("Invalid value '{value}' for '{key}': {e}")) + })?; + } + "scanner.remote-log.prefetch-num" => { + config.scanner_remote_log_prefetch_num = + value.parse::().map_err(|e| { + FlussError::new_err(format!( + "Invalid value '{value}' for '{key}': {e}" + )) + })?; + } + "remote-file.download-thread-num" => { + config.remote_file_download_thread_num = + value.parse::().map_err(|e| { + FlussError::new_err(format!( + "Invalid value '{value}' for '{key}': {e}" + )) + })?; + } + "scanner.remote-log.read-concurrency" => { + config.scanner_remote_log_read_concurrency = + value.parse::().map_err(|e| { + FlussError::new_err(format!( + "Invalid value '{value}' for '{key}': {e}" + )) + })?; + } + "scanner.log.max-poll-records" => { + config.scanner_log_max_poll_records = + value.parse::().map_err(|e| { + FlussError::new_err(format!( + "Invalid value '{value}' for '{key}': {e}" + )) + })?; + } + "scanner.log.fetch.max-bytes" => { + config.scanner_log_fetch_max_bytes = value.parse::().map_err(|e| { + FlussError::new_err(format!("Invalid value '{value}' for '{key}': {e}")) + })?; + } + "scanner.log.fetch.min-bytes" => { + config.scanner_log_fetch_min_bytes = value.parse::().map_err(|e| { + FlussError::new_err(format!("Invalid value '{value}' for '{key}': {e}")) + })?; + } + "scanner.log.fetch.wait-max-time-ms" => { + config.scanner_log_fetch_wait_max_time_ms = + value.parse::().map_err(|e| { + FlussError::new_err(format!( + "Invalid value '{value}' for '{key}': {e}" + )) + })?; + } + "scanner.log.fetch.max-bytes-for-bucket" 
=> { + config.scanner_log_fetch_max_bytes_for_bucket = + value.parse::().map_err(|e| { + FlussError::new_err(format!( + "Invalid value '{value}' for '{key}': {e}" + )) + })?; + } + "writer.enable-idempotence" => { + config.writer_enable_idempotence = match value.as_str() { + "true" => true, + "false" => false, + other => { + return Err(FlussError::new_err(format!( + "Invalid value '{other}' for '{key}', expected 'true' or 'false'" + ))); + } + }; + } + "writer.max-inflight-requests-per-bucket" => { + config.writer_max_inflight_requests_per_bucket = + value.parse::().map_err(|e| { + FlussError::new_err(format!( + "Invalid value '{value}' for '{key}': {e}" + )) + })?; + } + "writer.buffer.memory-size" => { + config.writer_buffer_memory_size = value.parse::().map_err(|e| { + FlussError::new_err(format!("Invalid value '{value}' for '{key}': {e}")) + })?; + } + "writer.buffer.wait-timeout-ms" => { + config.writer_buffer_wait_timeout_ms = + value.parse::().map_err(|e| { + FlussError::new_err(format!( + "Invalid value '{value}' for '{key}': {e}" + )) + })?; + } + "writer.bucket.no-key-assigner" => { + config.writer_bucket_no_key_assigner = + value.parse::().map_err(|e| { + FlussError::new_err(format!( + "Invalid value '{value}' for '{key}': {e}" + )) + })?; + } + "connect-timeout" => { + config.connect_timeout_ms = value.parse::().map_err(|e| { + FlussError::new_err(format!("Invalid value '{value}' for '{key}': {e}")) + })?; + } + "security.protocol" => { + config.security_protocol = value; + } + "security.sasl.mechanism" => { + config.security_sasl_mechanism = value; + } + "security.sasl.username" => { + config.security_sasl_username = value; + } + "security.sasl.password" => { + config.security_sasl_password = value; + } + _ => { + return Err(FlussError::new_err(format!("Unknown property: {key}"))); + } + } + } + } + + Ok(Self { inner: config }) + } + + /// Get the bootstrap servers + #[getter] + fn bootstrap_servers(&self) -> String { + 
self.inner.bootstrap_servers.clone() + } + + /// Set the bootstrap servers + #[setter] + fn set_bootstrap_servers(&mut self, server: String) { + self.inner.bootstrap_servers = server; + } + + /// Get the writer request max size + #[getter] + fn writer_request_max_size(&self) -> i32 { + self.inner.writer_request_max_size + } + + /// Set the writer request max size + #[setter] + fn set_writer_request_max_size(&mut self, size: i32) { + self.inner.writer_request_max_size = size; + } + + /// Get the writer acks + #[getter] + fn writer_acks(&self) -> String { + self.inner.writer_acks.clone() + } + + /// Set the writer acks + #[setter] + fn set_writer_acks(&mut self, acks: String) { + self.inner.writer_acks = acks; + } + + /// Get the writer retries + #[getter] + fn writer_retries(&self) -> i32 { + self.inner.writer_retries + } + + /// Set the writer retries + #[setter] + fn set_writer_retries(&mut self, retries: i32) { + self.inner.writer_retries = retries; + } + + /// Get the writer batch size + #[getter] + fn writer_batch_size(&self) -> i32 { + self.inner.writer_batch_size + } + + /// Set the writer batch size + #[setter] + fn set_writer_batch_size(&mut self, size: i32) { + self.inner.writer_batch_size = size; + } + + /// Get whether the per-table dynamic batch size estimator is enabled + #[getter] + fn writer_dynamic_batch_size_enabled(&self) -> bool { + self.inner.writer_dynamic_batch_size_enabled + } + + /// Set whether the per-table dynamic batch size estimator is enabled + #[setter] + fn set_writer_dynamic_batch_size_enabled(&mut self, enabled: bool) { + self.inner.writer_dynamic_batch_size_enabled = enabled; + } + + /// Get the lower bound used by the dynamic batch size estimator + #[getter] + fn writer_dynamic_batch_size_min(&self) -> i32 { + self.inner.writer_dynamic_batch_size_min + } + + /// Set the lower bound used by the dynamic batch size estimator + #[setter] + fn set_writer_dynamic_batch_size_min(&mut self, size: i32) { + 
self.inner.writer_dynamic_batch_size_min = size; + } + + /// Get the scanner remote log prefetch num + #[getter] + fn scanner_remote_log_prefetch_num(&self) -> usize { + self.inner.scanner_remote_log_prefetch_num + } + + /// Set the scanner remote log prefetch num + #[setter] + fn set_scanner_remote_log_prefetch_num(&mut self, num: usize) { + self.inner.scanner_remote_log_prefetch_num = num; + } + + /// Get the remote file download thread num + #[getter] + fn remote_file_download_thread_num(&self) -> usize { + self.inner.remote_file_download_thread_num + } + + /// Set the remote file download thread num + #[setter] + fn set_remote_file_download_thread_num(&mut self, num: usize) { + self.inner.remote_file_download_thread_num = num; + } + + /// Get the scanner remote log read concurrency + #[getter] + fn scanner_remote_log_read_concurrency(&self) -> usize { + self.inner.scanner_remote_log_read_concurrency + } + + /// Set the scanner remote log read concurrency + #[setter] + fn set_scanner_remote_log_read_concurrency(&mut self, num: usize) { + self.inner.scanner_remote_log_read_concurrency = num; + } + + /// Get the scanner log max poll records + #[getter] + fn scanner_log_max_poll_records(&self) -> usize { + self.inner.scanner_log_max_poll_records + } + + /// Set the scanner log max poll records + #[setter] + fn set_scanner_log_max_poll_records(&mut self, num: usize) { + self.inner.scanner_log_max_poll_records = num; + } + + /// Get the writer batch timeout in milliseconds + #[getter] + fn writer_batch_timeout_ms(&self) -> i64 { + self.inner.writer_batch_timeout_ms + } + + /// Set the writer batch timeout in milliseconds + #[setter] + fn set_writer_batch_timeout_ms(&mut self, timeout: i64) { + self.inner.writer_batch_timeout_ms = timeout; + } + + /// Get the bucket assignment strategy for tables without bucket keys + #[getter] + fn writer_bucket_no_key_assigner(&self) -> String { + self.inner.writer_bucket_no_key_assigner.to_string() + } + + /// Set the bucket 
assignment strategy for tables without bucket keys + #[setter] + fn set_writer_bucket_no_key_assigner(&mut self, value: String) -> PyResult<()> { + self.inner.writer_bucket_no_key_assigner = + value.parse::().map_err(|e| { + FlussError::new_err(format!( + "Invalid value '{value}' for 'writer.bucket.no-key-assigner': {e}" + )) + })?; + Ok(()) + } + + /// Get whether idempotent writes are enabled + #[getter] + fn writer_enable_idempotence(&self) -> bool { + self.inner.writer_enable_idempotence + } + + /// Set whether idempotent writes are enabled + #[setter] + fn set_writer_enable_idempotence(&mut self, enabled: bool) { + self.inner.writer_enable_idempotence = enabled; + } + + /// Get the max in-flight requests per bucket + #[getter] + fn writer_max_inflight_requests_per_bucket(&self) -> usize { + self.inner.writer_max_inflight_requests_per_bucket + } + + /// Set the max in-flight requests per bucket + #[setter] + fn set_writer_max_inflight_requests_per_bucket(&mut self, num: usize) { + self.inner.writer_max_inflight_requests_per_bucket = num; + } + + /// Get the writer buffer memory size + #[getter] + fn writer_buffer_memory_size(&self) -> usize { + self.inner.writer_buffer_memory_size + } + + /// Set the writer buffer memory size + #[setter] + fn set_writer_buffer_memory_size(&mut self, size: usize) { + self.inner.writer_buffer_memory_size = size; + } + + /// Get the writer buffer wait timeout in milliseconds + #[getter] + fn writer_buffer_wait_timeout_ms(&self) -> u64 { + self.inner.writer_buffer_wait_timeout_ms + } + + /// Set the writer buffer wait timeout in milliseconds + #[setter] + fn set_writer_buffer_wait_timeout_ms(&mut self, timeout: u64) { + self.inner.writer_buffer_wait_timeout_ms = timeout; + } + + /// Get the connect timeout in milliseconds + #[getter] + fn connect_timeout_ms(&self) -> u64 { + self.inner.connect_timeout_ms + } + + /// Set the connect timeout in milliseconds + #[setter] + fn set_connect_timeout_ms(&mut self, timeout: u64) { + 
self.inner.connect_timeout_ms = timeout; + } + + /// Get the security protocol + #[getter] + fn security_protocol(&self) -> String { + self.inner.security_protocol.clone() + } + + /// Set the security protocol + #[setter] + fn set_security_protocol(&mut self, protocol: String) { + self.inner.security_protocol = protocol; + } + + /// Get the SASL mechanism + #[getter] + fn security_sasl_mechanism(&self) -> String { + self.inner.security_sasl_mechanism.clone() + } + + /// Set the SASL mechanism + #[setter] + fn set_security_sasl_mechanism(&mut self, mechanism: String) { + self.inner.security_sasl_mechanism = mechanism; + } + + /// Get the SASL username + #[getter] + fn security_sasl_username(&self) -> String { + self.inner.security_sasl_username.clone() + } + + /// Set the SASL username + #[setter] + fn set_security_sasl_username(&mut self, username: String) { + self.inner.security_sasl_username = username; + } + + /// Get the SASL password + #[getter] + fn security_sasl_password(&self) -> String { + self.inner.security_sasl_password.clone() + } + + /// Set the SASL password + #[setter] + fn set_security_sasl_password(&mut self, password: String) { + self.inner.security_sasl_password = password; + } + + /// Get the maximum bytes per fetch response for LogScanner + #[getter] + fn scanner_log_fetch_max_bytes(&self) -> i32 { + self.inner.scanner_log_fetch_max_bytes + } + + /// Set the maximum bytes per fetch response for LogScanner + #[setter] + fn set_scanner_log_fetch_max_bytes(&mut self, bytes: i32) { + self.inner.scanner_log_fetch_max_bytes = bytes; + } + + /// Get the minimum bytes to accumulate before returning a fetch response + #[getter] + fn scanner_log_fetch_min_bytes(&self) -> i32 { + self.inner.scanner_log_fetch_min_bytes + } + + /// Set the minimum bytes to accumulate before returning a fetch response + #[setter] + fn set_scanner_log_fetch_min_bytes(&mut self, bytes: i32) { + self.inner.scanner_log_fetch_min_bytes = bytes; + } + + /// Get the maximum time 
(ms) the server may wait to satisfy min-bytes + #[getter] + fn scanner_log_fetch_wait_max_time_ms(&self) -> i32 { + self.inner.scanner_log_fetch_wait_max_time_ms + } + + /// Set the maximum time (ms) the server may wait to satisfy min-bytes + #[setter] + fn set_scanner_log_fetch_wait_max_time_ms(&mut self, ms: i32) { + self.inner.scanner_log_fetch_wait_max_time_ms = ms; + } + + /// Get the maximum bytes per fetch response per bucket for LogScanner + #[getter] + fn scanner_log_fetch_max_bytes_for_bucket(&self) -> i32 { + self.inner.scanner_log_fetch_max_bytes_for_bucket + } + + /// Set the maximum bytes per fetch response per bucket for LogScanner + #[setter] + fn set_scanner_log_fetch_max_bytes_for_bucket(&mut self, bytes: i32) { + self.inner.scanner_log_fetch_max_bytes_for_bucket = bytes; + } +} + +impl Config { + pub fn get_core_config(&self) -> fcore::config::Config { + self.inner.clone() + } +} diff --git a/fluss-rust/bindings/python/src/connection.rs b/fluss-rust/bindings/python/src/connection.rs new file mode 100644 index 0000000000..3853896ce4 --- /dev/null +++ b/fluss-rust/bindings/python/src/connection.rs @@ -0,0 +1,150 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::*; +use pyo3_async_runtimes::tokio::future_into_py; +use std::sync::Arc; +use std::time::Duration; + +/// Connection to a Fluss cluster +#[pyclass] +pub struct FlussConnection { + inner: Arc, +} + +#[pymethods] +impl FlussConnection { + /// Create a new FlussConnection (async) + #[staticmethod] + fn create<'py>(py: Python<'py>, config: &Config) -> PyResult> { + let rust_config = config.get_core_config(); + + future_into_py(py, async move { + let connection = fcore::client::FlussConnection::new(rust_config) + .await + .map_err(|e| FlussError::from_core_error(&e))?; + + let py_connection = FlussConnection { + inner: Arc::new(connection), + }; + + Python::attach(|py| Py::new(py, py_connection)) + }) + } + + /// Get admin interface + fn get_admin(&self, py: Python<'_>) -> PyResult> { + let admin = self + .inner + .get_admin() + .map_err(|e| FlussError::from_core_error(&e))?; + + Py::new(py, FlussAdmin::from_core(admin)) + } + + /// Get a table + fn get_table<'py>( + &self, + py: Python<'py>, + table_path: &TablePath, + ) -> PyResult> { + let client = self.inner.clone(); + let core_path = table_path.to_core().clone(); + + future_into_py(py, async move { + let core_table = client + .get_table(&core_path) + .await + .map_err(|e| FlussError::from_core_error(&e))?; + + let py_table = FlussTable::new_table( + client.clone(), + core_table.metadata().clone(), + core_table.get_table_info().clone(), + core_table.table_path().clone(), + core_table.has_primary_key(), + ); + + Python::attach(|py| Py::new(py, py_table)) + }) + } + + /// Close the connection (async). + /// + /// Gracefully shuts down the connection by draining any pending write batches. + /// This method is awaitable. 
+ fn close<'py>(&self, py: Python<'py>) -> PyResult> { + let inner = self.inner.clone(); + + future_into_py(py, async move { + inner + .close(Duration::MAX) + .await + .map_err(|e| FlussError::from_core_error(&e)) + }) + } + + // Enter the runtime context (for 'with' statement) + fn __enter__(slf: PyRef) -> PyRef { + slf + } + + // Exit the runtime context (for 'with' statement) + #[pyo3(signature = (_exc_type=None, _exc_value=None, _traceback=None))] + fn __exit__( + &mut self, + _exc_type: Option>, + _exc_value: Option>, + _traceback: Option>, + ) -> PyResult { + // Sync exit cannot await the graceful drain, so it's a no-op here. + // Users should use 'async with' for graceful shutdown. + Ok(false) + } + + // Enter the async runtime context (for 'async with' statement) + fn __aenter__<'py>(slf: PyRef<'py, Self>, py: Python<'py>) -> PyResult> { + let py_slf = slf.into_pyobject(py)?.unbind(); + future_into_py(py, async move { Ok(py_slf) }) + } + + // Exit the async runtime context (for 'async with' statement) + #[pyo3(signature = (exc_type=None, _exc_value=None, _traceback=None))] + fn __aexit__<'py>( + &self, + py: Python<'py>, + exc_type: Option>, + _exc_value: Option>, + _traceback: Option>, + ) -> PyResult> { + let inner = self.inner.clone(); + let is_exc_none = exc_type.as_ref().is_none_or(|e| e.is_none()); + future_into_py(py, async move { + let res = inner.close(Duration::MAX).await; + if let Err(e) = res { + if is_exc_none { + return Err(FlussError::from_core_error(&e)); + } + } + Ok(false) + }) + } + + fn __repr__(&self) -> String { + "FlussConnection()".to_string() + } +} diff --git a/fluss-rust/bindings/python/src/error.rs b/fluss-rust/bindings/python/src/error.rs new file mode 100644 index 0000000000..9d718aa66e --- /dev/null +++ b/fluss-rust/bindings/python/src/error.rs @@ -0,0 +1,276 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use fluss::error::Error; +use fluss::rpc::FlussError as CoreFlussError; +use pyo3::exceptions::PyException; +use pyo3::prelude::*; + +/// Error code for client-side errors that did not originate from the server API protocol. +/// The value -2 is outside the server API error code range (-1 .. 57+), so it will never +/// collide with current or future API codes. Consistent with the CPP binding. +const CLIENT_ERROR_CODE: i32 = -2; + +/// Fluss errors +#[pyclass(extends=PyException)] +#[derive(Debug, Clone)] +pub struct FlussError { + #[pyo3(get)] + pub message: String, + #[pyo3(get)] + pub error_code: i32, +} + +#[pymethods] +impl FlussError { + #[new] + #[pyo3(signature = (message, error_code=-2))] + fn new(message: String, error_code: i32) -> Self { + Self { + message, + error_code, + } + } + + fn __str__(&self) -> String { + if self.error_code != CLIENT_ERROR_CODE { + format!("FlussError(code={}): {}", self.error_code, self.message) + } else { + format!("FlussError: {}", self.message) + } + } + + /// Returns ``True`` if retrying the request may succeed. Client-side errors always return ``False``. 
+ #[getter] + fn is_retriable(&self) -> bool { + if self.error_code == CLIENT_ERROR_CODE { + return false; + } + CoreFlussError::for_code(self.error_code).is_retriable() + } +} + +impl FlussError { + pub fn new_err(message: impl ToString) -> PyErr { + PyErr::new::((message.to_string(), CLIENT_ERROR_CODE)) + } + + pub fn from_core_error(error: &Error) -> PyErr { + // Transport failures map to `NetworkException` (Java parity, + // retriable). + let (msg, code) = match error { + Error::FlussAPIError { api_error } => (api_error.message.clone(), api_error.code), + Error::RpcError { .. } => (error.to_string(), CoreFlussError::NetworkException.code()), + _ => (error.to_string(), CLIENT_ERROR_CODE), + }; + PyErr::new::((msg, code)) + } +} + +/// Named constants for Fluss API error codes. +/// +/// Server API errors have error_code > 0 or == -1. +/// Client-side errors have error_code == CLIENT_ERROR (-2). +/// These constants match the Rust core FlussError enum and are stable across protocol versions. +/// New server error codes work automatically (error_code is a raw int, not a closed enum) — +/// these constants are convenience names, not an exhaustive list. +#[pyclass] +pub struct ErrorCode; + +#[pymethods] +impl ErrorCode { + /// Client-side error (not from server API protocol). Check the error message for details. + #[classattr] + const CLIENT_ERROR: i32 = -2; + /// No error. + #[classattr] + const NONE: i32 = 0; + /// The server experienced an unexpected error when processing the request. + #[classattr] + const UNKNOWN_SERVER_ERROR: i32 = -1; + /// The server disconnected before a response was received. + #[classattr] + const NETWORK_EXCEPTION: i32 = 1; + /// The version of API is not supported. + #[classattr] + const UNSUPPORTED_VERSION: i32 = 2; + /// This message has failed its CRC checksum, exceeds the valid size, or is otherwise corrupt. + #[classattr] + const CORRUPT_MESSAGE: i32 = 3; + /// The database does not exist. 
+ #[classattr] + const DATABASE_NOT_EXIST: i32 = 4; + /// The database is not empty. + #[classattr] + const DATABASE_NOT_EMPTY: i32 = 5; + /// The database already exists. + #[classattr] + const DATABASE_ALREADY_EXIST: i32 = 6; + /// The table does not exist. + #[classattr] + const TABLE_NOT_EXIST: i32 = 7; + /// The table already exists. + #[classattr] + const TABLE_ALREADY_EXIST: i32 = 8; + /// The schema does not exist. + #[classattr] + const SCHEMA_NOT_EXIST: i32 = 9; + /// Exception occurred while storing data for log in server. + #[classattr] + const LOG_STORAGE_EXCEPTION: i32 = 10; + /// Exception occurred while storing data for kv in server. + #[classattr] + const KV_STORAGE_EXCEPTION: i32 = 11; + /// Not leader or follower. + #[classattr] + const NOT_LEADER_OR_FOLLOWER: i32 = 12; + /// The record is too large. + #[classattr] + const RECORD_TOO_LARGE_EXCEPTION: i32 = 13; + /// The record is corrupt. + #[classattr] + const CORRUPT_RECORD_EXCEPTION: i32 = 14; + /// The client has attempted to perform an operation on an invalid table. + #[classattr] + const INVALID_TABLE_EXCEPTION: i32 = 15; + /// The client has attempted to perform an operation on an invalid database. + #[classattr] + const INVALID_DATABASE_EXCEPTION: i32 = 16; + /// The replication factor is larger than the number of available tablet servers. + #[classattr] + const INVALID_REPLICATION_FACTOR: i32 = 17; + /// Produce request specified an invalid value for required acks. + #[classattr] + const INVALID_REQUIRED_ACKS: i32 = 18; + /// The log offset is out of range. + #[classattr] + const LOG_OFFSET_OUT_OF_RANGE_EXCEPTION: i32 = 19; + /// The table is not a primary key table. + #[classattr] + const NON_PRIMARY_KEY_TABLE_EXCEPTION: i32 = 20; + /// The table or bucket does not exist. + #[classattr] + const UNKNOWN_TABLE_OR_BUCKET_EXCEPTION: i32 = 21; + /// The update version is invalid. + #[classattr] + const INVALID_UPDATE_VERSION_EXCEPTION: i32 = 22; + /// The coordinator is invalid. 
+ #[classattr] + const INVALID_COORDINATOR_EXCEPTION: i32 = 23; + /// The leader epoch is invalid. + #[classattr] + const FENCED_LEADER_EPOCH_EXCEPTION: i32 = 24; + /// The request timed out. + #[classattr] + const REQUEST_TIME_OUT: i32 = 25; + /// The general storage exception. + #[classattr] + const STORAGE_EXCEPTION: i32 = 26; + /// The server did not attempt to execute this operation. + #[classattr] + const OPERATION_NOT_ATTEMPTED_EXCEPTION: i32 = 27; + /// Records are written to the server already, but to fewer in-sync replicas than required. + #[classattr] + const NOT_ENOUGH_REPLICAS_AFTER_APPEND_EXCEPTION: i32 = 28; + /// Messages are rejected since there are fewer in-sync replicas than required. + #[classattr] + const NOT_ENOUGH_REPLICAS_EXCEPTION: i32 = 29; + /// Get file access security token exception. + #[classattr] + const SECURITY_TOKEN_EXCEPTION: i32 = 30; + /// The tablet server received an out of order sequence batch. + #[classattr] + const OUT_OF_ORDER_SEQUENCE_EXCEPTION: i32 = 31; + /// The tablet server received a duplicate sequence batch. + #[classattr] + const DUPLICATE_SEQUENCE_EXCEPTION: i32 = 32; + /// The tablet server could not locate the writer metadata. + #[classattr] + const UNKNOWN_WRITER_ID_EXCEPTION: i32 = 33; + /// The requested column projection is invalid. + #[classattr] + const INVALID_COLUMN_PROJECTION: i32 = 34; + /// The requested target column to write is invalid. + #[classattr] + const INVALID_TARGET_COLUMN: i32 = 35; + /// The partition does not exist. + #[classattr] + const PARTITION_NOT_EXISTS: i32 = 36; + /// The table is not partitioned. + #[classattr] + const TABLE_NOT_PARTITIONED_EXCEPTION: i32 = 37; + /// The timestamp is invalid. + #[classattr] + const INVALID_TIMESTAMP_EXCEPTION: i32 = 38; + /// The config is invalid. + #[classattr] + const INVALID_CONFIG_EXCEPTION: i32 = 39; + /// The lake storage is not configured. 
+ #[classattr] + const LAKE_STORAGE_NOT_CONFIGURED_EXCEPTION: i32 = 40; + /// The kv snapshot does not exist. + #[classattr] + const KV_SNAPSHOT_NOT_EXIST: i32 = 41; + /// The partition already exists. + #[classattr] + const PARTITION_ALREADY_EXISTS: i32 = 42; + /// The partition spec is invalid. + #[classattr] + const PARTITION_SPEC_INVALID_EXCEPTION: i32 = 43; + /// There is no currently available leader for the given partition. + #[classattr] + const LEADER_NOT_AVAILABLE_EXCEPTION: i32 = 44; + /// Exceed the maximum number of partitions. + #[classattr] + const PARTITION_MAX_NUM_EXCEPTION: i32 = 45; + /// Authentication failed. + #[classattr] + const AUTHENTICATE_EXCEPTION: i32 = 46; + /// Security is disabled. + #[classattr] + const SECURITY_DISABLED_EXCEPTION: i32 = 47; + /// Authorization failed. + #[classattr] + const AUTHORIZATION_EXCEPTION: i32 = 48; + /// Exceed the maximum number of buckets. + #[classattr] + const BUCKET_MAX_NUM_EXCEPTION: i32 = 49; + /// The tiering epoch is invalid. + #[classattr] + const FENCED_TIERING_EPOCH_EXCEPTION: i32 = 50; + /// Authentication failed with retriable exception. + #[classattr] + const RETRIABLE_AUTHENTICATE_EXCEPTION: i32 = 51; + /// The server rack info is invalid. + #[classattr] + const INVALID_SERVER_RACK_INFO_EXCEPTION: i32 = 52; + /// The lake snapshot does not exist. + #[classattr] + const LAKE_SNAPSHOT_NOT_EXIST: i32 = 53; + /// The lake table already exists. + #[classattr] + const LAKE_TABLE_ALREADY_EXIST: i32 = 54; + /// The new ISR contains at least one ineligible replica. + #[classattr] + const INELIGIBLE_REPLICA_EXCEPTION: i32 = 55; + /// The alter table is invalid. + #[classattr] + const INVALID_ALTER_TABLE_EXCEPTION: i32 = 56; + /// Deletion operations are disabled on this table. 
+ #[classattr] + const DELETION_DISABLED_EXCEPTION: i32 = 57; +} diff --git a/fluss-rust/bindings/python/src/lib.rs b/fluss-rust/bindings/python/src/lib.rs new file mode 100644 index 0000000000..2d71491a7a --- /dev/null +++ b/fluss-rust/bindings/python/src/lib.rs @@ -0,0 +1,144 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::sync::LazyLock; + +pub use ::fluss as fcore; +use pyo3::prelude::*; +use tokio::runtime::Runtime; + +mod admin; +mod config; +mod connection; +mod error; +mod lookup; +mod metadata; +mod table; +mod upsert; +mod utils; +mod write_handle; + +pub use admin::*; +pub use config::*; +pub use connection::*; +pub use error::*; +pub use lookup::*; +pub use metadata::*; +pub use table::*; +pub use upsert::*; +pub use utils::*; +pub use write_handle::*; + +static TOKIO_RUNTIME: LazyLock = LazyLock::new(|| { + tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build() + .expect("Failed to create Tokio runtime") +}); + +/// Offset specification for list_offsets(), matching Java's OffsetSpec. 
+/// +/// Use factory methods to create instances: +/// OffsetSpec.earliest() +/// OffsetSpec.latest() +/// OffsetSpec.timestamp(ts) +#[pyclass] +#[derive(Clone)] +pub struct OffsetSpec { + pub(crate) inner: fcore::rpc::message::OffsetSpec, +} + +#[pymethods] +impl OffsetSpec { + /// Create an OffsetSpec for the earliest available offset. + #[staticmethod] + fn earliest() -> Self { + Self { + inner: fcore::rpc::message::OffsetSpec::Earliest, + } + } + + /// Create an OffsetSpec for the latest available offset. + #[staticmethod] + fn latest() -> Self { + Self { + inner: fcore::rpc::message::OffsetSpec::Latest, + } + } + + /// Create an OffsetSpec for the offset at or after the given timestamp. + #[staticmethod] + fn timestamp(ts: i64) -> Self { + Self { + inner: fcore::rpc::message::OffsetSpec::Timestamp(ts), + } + } + + fn __repr__(&self) -> String { + match &self.inner { + fcore::rpc::message::OffsetSpec::Earliest => "OffsetSpec.earliest()".to_string(), + fcore::rpc::message::OffsetSpec::Latest => "OffsetSpec.latest()".to_string(), + fcore::rpc::message::OffsetSpec::Timestamp(ts) => { + format!("OffsetSpec.timestamp({ts})") + } + } + } +} + +#[pymodule] +fn _fluss(m: &Bound<'_, PyModule>) -> PyResult<()> { + // Register all classes + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + + // Register constants + m.add("EARLIEST_OFFSET", fcore::client::EARLIEST_OFFSET)?; + + // Register exception types and error codes + 
m.add_class::()?; + m.add_class::()?; + + Ok(()) +} diff --git a/fluss-rust/bindings/python/src/lookup.rs b/fluss-rust/bindings/python/src/lookup.rs new file mode 100644 index 0000000000..196faa1e81 --- /dev/null +++ b/fluss-rust/bindings/python/src/lookup.rs @@ -0,0 +1,214 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::table::{internal_row_to_dict, python_to_dense_generic_row}; +use crate::*; +use pyo3_async_runtimes::tokio::future_into_py; +use std::sync::Arc; +use tokio::sync::Mutex; + +/// Lookuper for performing primary key lookups on a Fluss table. +/// +/// The Lookuper caches key encoders and bucketing functions, making +/// repeated lookups efficient. Create once and reuse for multiple lookups. +/// +/// # Example: +/// lookuper = table.new_lookup().create_lookuper() +/// result = await lookuper.lookup({"user_id": 1}) +/// result2 = await lookuper.lookup({"user_id": 2}) # Reuses cached encoders +#[pyclass] +pub struct Lookuper { + inner: Arc>, + table_info: Arc, +} + +#[pymethods] +impl Lookuper { + /// Lookup a row by its primary key. + /// + /// Args: + /// pk: A dict, list, or tuple containing only the primary key values. + /// For dict: keys are PK column names. 
+ /// For list/tuple: values in PK column order. + /// + /// Returns: + /// A dict containing the row data if found, None otherwise. + pub fn lookup<'py>( + &self, + py: Python<'py>, + pk: &Bound<'_, PyAny>, + ) -> PyResult> { + let pk_indices = self.table_info.get_schema().primary_key_indexes(); + let generic_row = python_to_dense_generic_row(pk, &self.table_info, &pk_indices)?; + let inner = self.inner.clone(); + let table_info = self.table_info.clone(); + + future_into_py(py, async move { + // Perform async lookup + let result = { + let mut lookuper = inner.lock().await; + lookuper + .lookup(&generic_row) + .await + .map_err(|e| FlussError::from_core_error(&e))? + }; + + // Extract row data + let row_opt = result + .get_single_row() + .map_err(|e| FlussError::from_core_error(&e))?; + + // Convert to Python with GIL + Python::attach(|py| match row_opt { + Some(row) => internal_row_to_dict(py, &row, &table_info), + None => Ok(py.None()), + }) + }) + } + + fn __repr__(&self) -> String { + "Lookuper()".to_string() + } +} + +impl Lookuper { + /// Create a Lookuper from connection components. + /// + /// This creates the core Lookuper which caches encoders and bucketing functions. + pub fn new( + connection: &Arc, + metadata: Arc, + table_info: fcore::metadata::TableInfo, + ) -> PyResult { + // Run inside tokio runtime context because new_lookup() + // spawns a background task via tokio::spawn() in LookupClient::new(). + let lookuper = TOKIO_RUNTIME.block_on(async { + let fluss_table = + fcore::client::FlussTable::new(connection, metadata, table_info.clone()); + let table_lookup = fluss_table + .new_lookup() + .map_err(|e| FlussError::from_core_error(&e))?; + table_lookup + .create_lookuper() + .map_err(|e| FlussError::from_core_error(&e)) + })?; + + Ok(Self { + inner: Arc::new(Mutex::new(lookuper)), + table_info: Arc::new(table_info), + }) + } +} + +/// Lookuper for performing prefix key lookups on a Fluss table. 
+/// +/// Returns all rows whose primary key starts with the given prefix. +/// Create once via `table.new_lookup().lookup_by(columns).create_lookuper()` +/// and reuse for multiple lookups. +#[pyclass] +pub struct PrefixLookuper { + inner: Arc>, + table_info: Arc, + lookup_column_indices: Vec, +} + +#[pymethods] +impl PrefixLookuper { + /// Lookup all rows matching a prefix key. + /// + /// Args: + /// prefix: A dict, list, or tuple containing only the prefix key values + /// (the columns specified in lookup_by()). + /// For dict: keys are prefix column names. + /// For list/tuple: values in prefix column order. + /// + /// Returns: + /// A list of dicts, each containing the full row data. Empty list if no matches. + pub fn lookup<'py>( + &self, + py: Python<'py>, + prefix: &Bound<'_, PyAny>, + ) -> PyResult> { + let generic_row = + python_to_dense_generic_row(prefix, &self.table_info, &self.lookup_column_indices)?; + let inner = self.inner.clone(); + let table_info = self.table_info.clone(); + + future_into_py(py, async move { + let result = { + let mut lookuper = inner.lock().await; + lookuper + .lookup(&generic_row) + .await + .map_err(|e| FlussError::from_core_error(&e))? 
+ }; + + let rows = result + .get_rows() + .map_err(|e| FlussError::from_core_error(&e))?; + + Python::attach(|py| { + let py_rows: Vec> = rows + .iter() + .map(|row| internal_row_to_dict(py, row, &table_info)) + .collect::>()?; + Ok(py_rows) + }) + }) + } + + fn __repr__(&self) -> String { + "PrefixLookuper()".to_string() + } +} + +impl PrefixLookuper { + pub fn new( + connection: &Arc, + metadata: Arc, + table_info: fcore::metadata::TableInfo, + lookup_column_names: Vec, + ) -> PyResult { + let row_type = table_info.row_type(); + let lookup_column_indices: Vec = lookup_column_names + .iter() + .map(|name| { + row_type.get_field_index(name).ok_or_else(|| { + FlussError::new_err(format!("Unknown column name '{name}' for prefix lookup")) + }) + }) + .collect::>()?; + + let lookuper = TOKIO_RUNTIME.block_on(async { + let fluss_table = + fcore::client::FlussTable::new(connection, metadata, table_info.clone()); + let table_lookup = fluss_table + .new_lookup() + .map_err(|e| FlussError::from_core_error(&e))?; + table_lookup + .lookup_by(lookup_column_names) + .create_lookuper() + .map_err(|e| FlussError::from_core_error(&e)) + })?; + + Ok(Self { + inner: Arc::new(Mutex::new(lookuper)), + table_info: Arc::new(table_info), + lookup_column_indices, + }) + } +} diff --git a/fluss-rust/bindings/python/src/metadata.rs b/fluss-rust/bindings/python/src/metadata.rs new file mode 100644 index 0000000000..7b6129a489 --- /dev/null +++ b/fluss-rust/bindings/python/src/metadata.rs @@ -0,0 +1,767 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::*; +use pyo3::types::PyDict; +use std::collections::HashMap; + +/// Represents the type of change for a record in a log +#[pyclass(eq, eq_int)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum ChangeType { + /// Append-only operation + AppendOnly = 0, + /// Insert operation + Insert = 1, + /// Update operation containing the previous content of the updated row + UpdateBefore = 2, + /// Update operation containing the new content of the updated row + UpdateAfter = 3, + /// Delete operation + Delete = 4, +} + +#[pymethods] +impl ChangeType { + /// Returns a short string representation of this ChangeType + pub fn short_string(&self) -> &'static str { + match self { + ChangeType::AppendOnly => "+A", + ChangeType::Insert => "+I", + ChangeType::UpdateBefore => "-U", + ChangeType::UpdateAfter => "+U", + ChangeType::Delete => "-D", + } + } + + fn __str__(&self) -> &'static str { + self.short_string() + } + + fn __repr__(&self) -> String { + format!("ChangeType.{self:?}") + } +} + +impl ChangeType { + /// Convert from core ChangeType + pub fn from_core(change_type: fcore::record::ChangeType) -> Self { + match change_type { + fcore::record::ChangeType::AppendOnly => ChangeType::AppendOnly, + fcore::record::ChangeType::Insert => ChangeType::Insert, + fcore::record::ChangeType::UpdateBefore => ChangeType::UpdateBefore, + fcore::record::ChangeType::UpdateAfter => ChangeType::UpdateAfter, + fcore::record::ChangeType::Delete => ChangeType::Delete, + } + } +} + +/// Represents a table path with database and table name +#[pyclass] 
+#[derive(Clone)] +pub struct TablePath { + database_name: String, + table_name: String, +} + +#[pymethods] +impl TablePath { + /// Create a new TablePath + #[new] + pub fn new(database_name: String, table_name: String) -> Self { + Self { + database_name, + table_name, + } + } + + /// Get the database name + #[getter] + pub fn database_name(&self) -> String { + self.database_name.clone() + } + + /// Get the table name + #[getter] + pub fn table_name(&self) -> String { + self.table_name.clone() + } + + /// Get table path as string + pub fn table_path_str(&self) -> String { + format!("{}.{}", self.database_name, self.table_name) + } + + pub fn __str__(&self) -> String { + self.table_path_str() + } + + fn __repr__(&self) -> String { + format!("TablePath('{}', '{}')", self.database_name, self.table_name) + } + + /// Hash implementation for Python + pub fn __hash__(&self) -> u64 { + use std::collections::hash_map::DefaultHasher; + use std::hash::{Hash, Hasher}; + + let mut hasher = DefaultHasher::new(); + self.database_name.hash(&mut hasher); + self.table_name.hash(&mut hasher); + hasher.finish() + } + + /// Equality implementation for Python + pub fn __eq__(&self, other: &TablePath) -> bool { + self.database_name == other.database_name && self.table_name == other.table_name + } +} + +impl TablePath { + /// Convert to core TablePath + pub fn to_core(&self) -> fcore::metadata::TablePath { + fcore::metadata::TablePath::new(self.database_name.clone(), self.table_name.clone()) + } + + pub fn from_core(core_path: fcore::metadata::TablePath) -> Self { + Self { + database_name: core_path.database().to_string(), + table_name: core_path.table().to_string(), + } + } +} + +/// Schema wrapper for Fluss table schema +#[pyclass] +pub struct Schema { + __schema: fcore::metadata::Schema, +} + +#[pymethods] +impl Schema { + /// Create a new Schema from PyArrow schema with optional primary keys + #[new] + #[pyo3(signature = (schema, primary_keys=None))] + pub fn new( + schema: Py, // 
PyArrow schema + primary_keys: Option>, + ) -> PyResult { + let arrow_schema = crate::utils::Utils::pyarrow_to_arrow_schema(&schema)?; + + let mut builder = fcore::metadata::Schema::builder(); + + for field in arrow_schema.fields() { + let fluss_data_type = crate::utils::Utils::arrow_field_to_fluss_type(field)?; + builder = builder.column(field.name(), fluss_data_type); + + if let Some(comment) = field.metadata().get("comment") { + builder = builder.with_comment(comment); + } + } + + if let Some(pk_columns) = primary_keys { + if !pk_columns.is_empty() { + builder = builder.primary_key(pk_columns); + } + } + + let fluss_schema = builder + .build() + .map_err(|e| FlussError::new_err(format!("Failed to build schema: {e}")))?; + + Ok(Self { + __schema: fluss_schema, + }) + } + + /// Get column names + fn get_column_names(&self) -> Vec { + self.__schema + .columns() + .iter() + .map(|col| col.name().to_string()) + .collect() + } + + /// Get column types + fn get_column_types(&self) -> Vec { + self.__schema + .columns() + .iter() + .map(|col| Utils::datatype_to_string(col.data_type())) + .collect() + } + + /// Get columns as (name, type) pairs + fn get_columns(&self) -> Vec<(String, String)> { + self.__schema + .columns() + .iter() + .map(|col| { + ( + col.name().to_string(), + Utils::datatype_to_string(col.data_type()), + ) + }) + .collect() + } + + /// Get primary key column names, returns empty list if no primary key is defined + fn get_primary_keys(&self) -> Vec { + self.__schema + .primary_key() + .map(|pk| pk.column_names().to_vec()) + .unwrap_or_default() + } + + fn __str__(&self) -> String { + format!("Schema: columns={:?}", self.get_columns()) + } +} + +impl Schema { + /// Convert to core Schema + pub fn to_core(&self) -> &fcore::metadata::Schema { + &self.__schema + } +} + +/// Table distribution configuration +#[pyclass] +pub struct TableDistribution { + inner: fcore::metadata::TableDistribution, +} + +#[pymethods] +impl TableDistribution { + /// Get bucket 
keys + fn bucket_keys(&self) -> Vec { + self.inner.bucket_keys().to_vec() + } + + /// Get bucket count + fn bucket_count(&self) -> Option { + self.inner.bucket_count() + } +} + +/// Table descriptor containing schema and metadata +#[pyclass] +#[derive(Clone)] +pub struct TableDescriptor { + __tbl_desc: fcore::metadata::TableDescriptor, +} + +#[pymethods] +impl TableDescriptor { + /// Create a new TableDescriptor + #[new] + #[pyo3(signature = (schema, **kwargs))] + pub fn new( + schema: &Schema, // fluss schema + kwargs: Option<&Bound<'_, PyDict>>, + ) -> PyResult { + let mut partition_keys: Vec = Vec::new(); + let mut bucket_count = None; + let mut bucket_keys = Vec::new(); + let mut properties: HashMap = HashMap::new(); + let mut custom_properties: HashMap = HashMap::new(); + let mut comment: Option = None; + let mut log_format = None; + let mut kv_format = None; + + if let Some(kwargs) = kwargs { + if let Ok(Some(pkeys)) = kwargs.get_item("partition_keys") { + partition_keys = pkeys.extract()?; + } + if let Ok(Some(bcount)) = kwargs.get_item("bucket_count") { + bucket_count = Some(bcount.extract()?); + } + if let Ok(Some(bkeys)) = kwargs.get_item("bucket_keys") { + bucket_keys = bkeys.extract()?; + } + if let Ok(Some(props)) = kwargs.get_item("properties") { + properties = props.extract()?; + } + if let Ok(Some(cprops)) = kwargs.get_item("custom_properties") { + custom_properties = cprops.extract()?; + } + if let Ok(Some(comm)) = kwargs.get_item("comment") { + comment = Some(comm.extract()?); + } + if let Ok(Some(lformat)) = kwargs.get_item("log_format") { + let format_str: String = lformat.extract()?; + log_format = Some( + fcore::metadata::LogFormat::parse(&format_str) + .map_err(|e| FlussError::new_err(e.to_string()))?, + ); + } + if let Ok(Some(kformat)) = kwargs.get_item("kv_format") { + let format_str: String = kformat.extract()?; + kv_format = Some( + fcore::metadata::KvFormat::parse(&format_str) + .map_err(|e| FlussError::new_err(e.to_string()))?, + ); + 
} + } + + let fluss_schema = schema.to_core().clone(); + + let mut builder = fcore::metadata::TableDescriptor::builder() + .schema(fluss_schema) + .properties(properties) + .custom_properties(custom_properties) + .partitioned_by(partition_keys) + .distributed_by(bucket_count, bucket_keys); + + if let Some(comment) = comment { + builder = builder.comment(&comment); + } + if let Some(log_format) = log_format { + builder = builder.log_format(log_format); + } + if let Some(kv_format) = kv_format { + builder = builder.kv_format(kv_format); + } + + let core_descriptor = builder + .build() + .map_err(|e| FlussError::new_err(format!("Failed to build TableDescriptor: {e}")))?; + + Ok(Self { + __tbl_desc: core_descriptor, + }) + } + + /// Get the schema of this table descriptor + pub fn get_schema(&self) -> PyResult { + Ok(Schema { + __schema: self.__tbl_desc.schema().clone(), + }) + } +} + +impl TableDescriptor { + /// Convert to core TableDescriptor + pub fn to_core(&self) -> &fcore::metadata::TableDescriptor { + &self.__tbl_desc + } +} + +/// Information about a Fluss table +#[pyclass] +#[derive(Clone)] +pub struct TableInfo { + __table_info: fcore::metadata::TableInfo, +} + +#[pymethods] +impl TableInfo { + /// Get the table ID + #[getter] + pub fn table_id(&self) -> i64 { + self.__table_info.get_table_id() + } + + /// Get the schema ID + #[getter] + pub fn schema_id(&self) -> i32 { + self.__table_info.get_schema_id() + } + + /// Get the table path + #[getter] + pub fn table_path(&self) -> TablePath { + TablePath::from_core(self.__table_info.get_table_path().clone()) + } + + /// Get the created time + #[getter] + pub fn created_time(&self) -> i64 { + self.__table_info.get_created_time() + } + + /// Get the modified time + #[getter] + pub fn modified_time(&self) -> i64 { + self.__table_info.get_modified_time() + } + + /// Get the primary keys + pub fn get_primary_keys(&self) -> Vec { + self.__table_info.get_primary_keys().clone() + } + + /// Get the bucket keys + pub fn 
get_bucket_keys(&self) -> Vec { + self.__table_info.get_bucket_keys().to_vec() + } + + /// Get the partition keys + pub fn get_partition_keys(&self) -> Vec { + self.__table_info.get_partition_keys().to_vec() + } + + /// Get number of buckets + #[getter] + pub fn num_buckets(&self) -> i32 { + self.__table_info.get_num_buckets() + } + + /// Check if table has primary key + pub fn has_primary_key(&self) -> bool { + self.__table_info.has_primary_key() + } + + /// Check if table is partitioned + pub fn is_partitioned(&self) -> bool { + self.__table_info.is_partitioned() + } + + /// Get properties + pub fn get_properties(&self) -> std::collections::HashMap { + self.__table_info.get_properties().clone() + } + + /// Get custom properties + pub fn get_custom_properties(&self) -> std::collections::HashMap { + self.__table_info.get_custom_properties().clone() + } + + /// Get comment + #[getter] + pub fn comment(&self) -> Option { + self.__table_info.get_comment().map(|s| s.to_string()) + } + + /// Get the Schema + pub fn get_schema(&self) -> Schema { + Schema { + __schema: self.__table_info.get_schema().clone(), + } + } + + /// Get column names + pub fn get_column_names(&self) -> Vec { + self.__table_info + .get_schema() + .columns() + .iter() + .map(|col| col.name().to_string()) + .collect() + } + + /// Get column count + pub fn get_column_count(&self) -> usize { + self.__table_info.get_schema().columns().len() + } +} + +impl TableInfo { + /// Create from core TableInfo (internal use) + pub fn from_core(info: fcore::metadata::TableInfo) -> Self { + Self { __table_info: info } + } +} + +/// Represents a lake snapshot with snapshot ID and table bucket offsets +#[pyclass] +#[derive(Clone)] +pub struct LakeSnapshot { + snapshot_id: i64, + table_buckets_offset: HashMap, +} + +/// Represents a table bucket with table ID, partition ID, and bucket ID +#[pyclass] +#[derive(Eq, Hash, PartialEq, Clone)] +pub struct TableBucket { + table_id: i64, + partition_id: Option, + bucket: i32, 
+} + +#[pymethods] +impl TableBucket { + /// Create a new TableBucket + #[new] + pub fn new(table_id: i64, bucket: i32) -> Self { + Self { + table_id, + partition_id: None, + bucket, + } + } + + /// Create a new TableBucket with partition + #[staticmethod] + pub fn with_partition(table_id: i64, partition_id: i64, bucket: i32) -> Self { + Self { + table_id, + partition_id: Some(partition_id), + bucket, + } + } + + /// Get table ID + #[getter] + pub fn table_id(&self) -> i64 { + self.table_id + } + + /// Get bucket ID + #[getter] + pub fn bucket_id(&self) -> i32 { + self.bucket + } + + /// Get partition ID + #[getter] + pub fn partition_id(&self) -> Option { + self.partition_id + } + + /// String representation + pub fn __str__(&self) -> String { + if let Some(partition_id) = self.partition_id { + format!( + "TableBucket(table_id={}, partition_id={}, bucket={})", + self.table_id, partition_id, self.bucket + ) + } else { + format!( + "TableBucket(table_id={}, bucket={})", + self.table_id, self.bucket + ) + } + } + + /// String representation + pub fn __repr__(&self) -> String { + self.__str__() + } + + /// Hash implementation for Python + pub fn __hash__(&self) -> u64 { + use std::collections::hash_map::DefaultHasher; + use std::hash::{Hash, Hasher}; + + let mut hasher = DefaultHasher::new(); + self.table_id.hash(&mut hasher); + self.partition_id.hash(&mut hasher); + self.bucket.hash(&mut hasher); + hasher.finish() + } + + /// Equality implementation for Python + pub fn __eq__(&self, other: &TableBucket) -> bool { + self.table_id == other.table_id + && self.partition_id == other.partition_id + && self.bucket == other.bucket + } +} + +impl TableBucket { + /// Create from core TableBucket (internal use) + pub fn from_core(bucket: fcore::metadata::TableBucket) -> Self { + Self { + table_id: bucket.table_id(), + partition_id: bucket.partition_id(), + bucket: bucket.bucket_id(), + } + } + + /// Convert to core TableBucket (internal use) + pub fn to_core(&self) -> 
fcore::metadata::TableBucket { + fcore::metadata::TableBucket::new_with_partition( + self.table_id, + self.partition_id, + self.bucket, + ) + } +} + +#[pymethods] +impl LakeSnapshot { + /// Create a new LakeSnapshot + #[new] + pub fn new(snapshot_id: i64) -> Self { + Self { + snapshot_id, + table_buckets_offset: HashMap::new(), + } + } + + /// Get snapshot ID + #[getter] + pub fn snapshot_id(&self) -> i64 { + self.snapshot_id + } + + /// Get table bucket offsets as a Python dictionary with TableBucket keys + #[getter] + pub fn table_buckets_offset(&self, py: Python) -> PyResult> { + let dict = PyDict::new(py); + for (bucket, offset) in &self.table_buckets_offset { + let py_bucket = TableBucket::from_core(bucket.clone()); + dict.set_item(Py::new(py, py_bucket)?, *offset)?; + } + Ok(dict.into()) + } + + /// Get offset for a specific table bucket + pub fn get_bucket_offset(&self, bucket: &TableBucket) -> Option { + let core_bucket = bucket.to_core(); + self.table_buckets_offset.get(&core_bucket).copied() + } + + /// Get all table buckets + pub fn get_table_buckets(&self, py: Python) -> PyResult>> { + let mut buckets = Vec::new(); + for bucket in self.table_buckets_offset.keys() { + let py_bucket = TableBucket::from_core(bucket.clone()); + buckets.push(Py::new(py, py_bucket)?.into()); + } + Ok(buckets) + } + + /// String representation + pub fn __str__(&self) -> String { + format!( + "LakeSnapshot(snapshot_id={}, buckets_count={})", + self.snapshot_id, + self.table_buckets_offset.len() + ) + } + + /// String representation + pub fn __repr__(&self) -> String { + self.__str__() + } +} + +impl LakeSnapshot { + /// Create from core LakeSnapshot (internal use) + pub fn from_core(snapshot: fcore::metadata::LakeSnapshot) -> Self { + Self { + snapshot_id: snapshot.snapshot_id, + table_buckets_offset: snapshot.table_buckets_offset, + } + } +} + +/// Descriptor for a Fluss database (comment and custom properties) +#[pyclass] +#[derive(Clone)] +pub struct DatabaseDescriptor { + 
__descriptor: fcore::metadata::DatabaseDescriptor, +} + +#[pymethods] +impl DatabaseDescriptor { + /// Create a new DatabaseDescriptor + #[new] + #[pyo3(signature = (comment=None, custom_properties=None))] + pub fn new( + comment: Option, + custom_properties: Option>, + ) -> PyResult { + let mut builder = fcore::metadata::DatabaseDescriptor::builder(); + if let Some(c) = comment { + builder = builder.comment(&c); + } + if let Some(props) = custom_properties { + builder = builder.custom_properties(props); + } + let __descriptor = builder.build(); + Ok(Self { __descriptor }) + } + + /// Get comment if set + #[getter] + pub fn comment(&self) -> Option { + self.__descriptor.comment().map(|s| s.to_string()) + } + + /// Get custom properties + pub fn get_custom_properties(&self) -> HashMap { + self.__descriptor.custom_properties().clone() + } + + fn __repr__(&self) -> String { + format!( + "DatabaseDescriptor(comment={:?}, custom_properties={:?})", + self.comment(), + self.get_custom_properties() + ) + } +} + +impl DatabaseDescriptor { + pub fn to_core(&self) -> &fcore::metadata::DatabaseDescriptor { + &self.__descriptor + } +} + +/// Information about a Fluss database +#[pyclass] +pub struct DatabaseInfo { + __info: fcore::metadata::DatabaseInfo, +} + +#[pymethods] +impl DatabaseInfo { + /// Get the database name + #[getter] + pub fn database_name(&self) -> String { + self.__info.database_name().to_string() + } + + /// Get the database descriptor + pub fn get_database_descriptor(&self) -> DatabaseDescriptor { + DatabaseDescriptor { + __descriptor: self.__info.database_descriptor().clone(), + } + } + + /// Get created time + #[getter] + pub fn created_time(&self) -> i64 { + self.__info.created_time() + } + + /// Get modified time + #[getter] + pub fn modified_time(&self) -> i64 { + self.__info.modified_time() + } + + fn __repr__(&self) -> String { + format!( + "DatabaseInfo(database_name='{}', created_time={}, modified_time={})", + self.database_name(), + 
self.created_time(), + self.modified_time() + ) + } +} + +impl DatabaseInfo { + pub fn from_core(info: fcore::metadata::DatabaseInfo) -> Self { + Self { __info: info } + } +} diff --git a/fluss-rust/bindings/python/src/table.rs b/fluss-rust/bindings/python/src/table.rs new file mode 100644 index 0000000000..b30baeb5ca --- /dev/null +++ b/fluss-rust/bindings/python/src/table.rs @@ -0,0 +1,2599 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::TOKIO_RUNTIME; +use crate::*; +use arrow::array::RecordBatch as ArrowRecordBatch; +use arrow::record_batch::RecordBatchReader as _; +use arrow_pyarrow::{FromPyArrow, ToPyArrow}; +use arrow_schema::SchemaRef; +use fluss::record::to_arrow_schema; +use indexmap::IndexMap; +use pyo3::IntoPyObjectExt; +use pyo3::exceptions::{PyIndexError, PyRuntimeError, PyTypeError}; +use pyo3::sync::PyOnceLock; +use pyo3::types::{ + IntoPyDict, PyBool, PyByteArray, PyBytes, PyDate, PyDateAccess, PyDateTime, PyDelta, + PyDeltaAccess, PyDict, PyList, PySequence, PySlice, PyString, PyTime, PyTimeAccess, PyTuple, + PyType, PyTzInfo, +}; +use pyo3_async_runtimes::tokio::future_into_py; +use std::collections::HashMap; +use std::sync::Arc; +use std::time::Duration; + +// Time conversion constants +const MILLIS_PER_SECOND: i64 = 1_000; +const MILLIS_PER_MINUTE: i64 = 60_000; +const MILLIS_PER_HOUR: i64 = 3_600_000; +const MICROS_PER_MILLI: i64 = 1_000; +const MICROS_PER_SECOND: i64 = 1_000_000; +const MICROS_PER_DAY: i64 = 86_400_000_000; +const NANOS_PER_MILLI: i64 = 1_000_000; +const NANOS_PER_MICRO: i64 = 1_000; +const DEFAULT_POLL_INTERVAL_MS: i64 = 1000; + +/// Represents a single scan record with metadata. +/// +/// Matches Rust/Java: offset, timestamp, change_type, row. +/// The bucket is the key in ScanRecords, not on the individual record. 
+#[pyclass] +pub struct ScanRecord { + #[pyo3(get)] + offset: i64, + #[pyo3(get)] + timestamp: i64, + #[pyo3(get)] + change_type: ChangeType, + /// Store row as a Python dict directly + row_dict: Py, +} + +#[pymethods] +impl ScanRecord { + /// Get the row data as a dictionary + #[getter] + pub fn row(&self, py: Python) -> Py { + self.row_dict.clone_ref(py) + } + + fn __str__(&self) -> String { + format!( + "ScanRecord(offset={}, timestamp={}, change_type={})", + self.offset, + self.timestamp, + self.change_type.short_string() + ) + } + + fn __repr__(&self) -> String { + self.__str__() + } +} + +impl ScanRecord { + /// Create a ScanRecord from core types + pub fn from_core( + py: Python, + record: &fcore::record::ScanRecord, + row_type: &fcore::metadata::RowType, + ) -> PyResult { + let fields = row_type.fields(); + let row = record.row(); + let dict = PyDict::new(py); + + for (pos, field) in fields.iter().enumerate() { + let value = datum_to_python_value(py, row, pos, field.data_type())?; + dict.set_item(field.name(), value)?; + } + + Ok(ScanRecord { + offset: record.offset(), + timestamp: record.timestamp(), + change_type: ChangeType::from_core(*record.change_type()), + row_dict: dict.unbind(), + }) + } +} + +/// Represents a batch of records with metadata +#[pyclass] +pub struct RecordBatch { + batch: Arc, + #[pyo3(get)] + bucket: TableBucket, + #[pyo3(get)] + base_offset: i64, + #[pyo3(get)] + last_offset: i64, +} + +#[pymethods] +impl RecordBatch { + /// Get the Arrow RecordBatch as PyArrow RecordBatch + #[getter] + pub fn batch(&self, py: Python) -> PyResult> { + let pyarrow_batch = self + .batch + .as_ref() + .to_pyarrow(py) + .map_err(|e| FlussError::new_err(format!("Failed to convert batch: {e}")))?; + Ok(pyarrow_batch.unbind()) + } + + fn __str__(&self) -> String { + format!( + "RecordBatch(bucket={}, base_offset={}, last_offset={}, rows={})", + self.bucket.__str__(), + self.base_offset, + self.last_offset, + self.batch.num_rows() + ) + } + + fn 
__repr__(&self) -> String { + self.__str__() + } +} + +impl RecordBatch { + /// Create a RecordBatch from core ScanBatch + pub fn from_scan_batch(scan_batch: fcore::record::ScanBatch) -> Self { + RecordBatch { + bucket: TableBucket::from_core(scan_batch.bucket().clone()), + base_offset: scan_batch.base_offset(), + last_offset: scan_batch.last_offset(), + batch: Arc::new(scan_batch.into_batch()), + } + } +} + +/// A collection of scan records grouped by bucket. +/// +/// Returned by `LogScanner.poll()`. Records are grouped by `TableBucket`. +#[pyclass] +pub struct ScanRecords { + records_by_bucket: IndexMap>>, + total_count: usize, +} + +#[pymethods] +impl ScanRecords { + /// List of distinct buckets that have records in this result. + pub fn buckets(&self) -> Vec { + self.records_by_bucket.keys().cloned().collect() + } + + /// Get records for a specific bucket. + /// + /// Returns an empty list if the bucket is not present (matches Rust/Java behavior). + pub fn records(&self, py: Python, bucket: &TableBucket) -> Vec> { + self.records_by_bucket + .get(bucket) + .map(|recs| recs.iter().map(|r| r.clone_ref(py)).collect()) + .unwrap_or_default() + } + + /// Total number of records across all buckets. + pub fn count(&self) -> usize { + self.total_count + } + + /// Whether the result set is empty. 
+ pub fn is_empty(&self) -> bool { + self.total_count == 0 + } + + fn __len__(&self) -> usize { + self.total_count + } + + /// Type-dispatched indexing: + /// records[0] → ScanRecord (flat index) + /// records[-1] → ScanRecord (negative index) + /// records[1:3] → list[ScanRecord] (slice) + /// records[bucket] → list[ScanRecord] (by bucket) + fn __getitem__(&self, py: Python, key: &Bound<'_, PyAny>) -> PyResult> { + // Try integer index first + if let Ok(mut idx) = key.extract::() { + let len = self.total_count as isize; + if idx < 0 { + idx += len; + } + if idx < 0 || idx >= len { + return Err(PyIndexError::new_err(format!( + "index {idx} out of range for ScanRecords of size {len}" + ))); + } + let idx = idx as usize; + let mut offset = 0; + for recs in self.records_by_bucket.values() { + if idx < offset + recs.len() { + return Ok(recs[idx - offset].clone_ref(py).into_any()); + } + offset += recs.len(); + } + return Err(PyRuntimeError::new_err( + "internal error: total_count out of sync with records", + )); + } + // Try slice + if let Ok(slice) = key.downcast::() { + let indices = slice.indices(self.total_count as isize)?; + let mut result: Vec> = Vec::new(); + let mut i = indices.start; + while (indices.step > 0 && i < indices.stop) || (indices.step < 0 && i > indices.stop) { + let idx = i as usize; + let mut offset = 0; + for recs in self.records_by_bucket.values() { + if idx < offset + recs.len() { + result.push(recs[idx - offset].clone_ref(py)); + break; + } + offset += recs.len(); + } + i += indices.step; + } + return Ok(result.into_pyobject(py).unwrap().into_any().unbind()); + } + // Try TableBucket + if let Ok(bucket) = key.extract::() { + let recs = self.records(py, &bucket); + return Ok(recs.into_pyobject(py).unwrap().into_any().unbind()); + } + Err(PyTypeError::new_err( + "index must be int, slice, or TableBucket", + )) + } + + /// Support `bucket in records`. 
+ fn __contains__(&self, bucket: &TableBucket) -> bool { + self.records_by_bucket.contains_key(bucket) + } + + /// Mapping protocol: alias for `buckets()`. + pub fn keys(&self) -> Vec { + self.buckets() + } + + /// Mapping protocol: lazy iterator over record lists, one per bucket. + pub fn values(slf: Bound<'_, Self>) -> ScanRecordsBucketIter { + let this = slf.borrow(); + let bucket_keys: Vec = this.records_by_bucket.keys().cloned().collect(); + drop(this); + ScanRecordsBucketIter { + owner: slf.unbind(), + bucket_keys, + bucket_idx: 0, + with_keys: false, + } + } + + /// Mapping protocol: lazy iterator over `(TableBucket, list[ScanRecord])` pairs. + pub fn items(slf: Bound<'_, Self>) -> ScanRecordsBucketIter { + let this = slf.borrow(); + let bucket_keys: Vec = this.records_by_bucket.keys().cloned().collect(); + drop(this); + ScanRecordsBucketIter { + owner: slf.unbind(), + bucket_keys, + bucket_idx: 0, + with_keys: true, + } + } + + fn __str__(&self) -> String { + format!( + "ScanRecords(records={}, buckets={})", + self.total_count, + self.records_by_bucket.len() + ) + } + + fn __repr__(&self) -> String { + self.__str__() + } + + /// Flat iterator over all records across all buckets (matches Java/Rust). 
+ fn __iter__(slf: Bound<'_, Self>) -> ScanRecordsIter { + let this = slf.borrow(); + let bucket_keys: Vec = this.records_by_bucket.keys().cloned().collect(); + drop(this); + ScanRecordsIter { + owner: slf.unbind(), + bucket_keys, + bucket_idx: 0, + rec_idx: 0, + } + } +} + +#[pyclass] +struct ScanRecordsIter { + owner: Py, + bucket_keys: Vec, + bucket_idx: usize, + rec_idx: usize, +} + +#[pymethods] +impl ScanRecordsIter { + fn __iter__(slf: PyRef<'_, Self>) -> PyRef<'_, Self> { + slf + } + + fn __next__(&mut self, py: Python) -> Option> { + let owner = self.owner.borrow(py); + loop { + if self.bucket_idx >= self.bucket_keys.len() { + return None; + } + let bucket = &self.bucket_keys[self.bucket_idx]; + if let Some(recs) = owner.records_by_bucket.get(bucket) { + if self.rec_idx < recs.len() { + let rec = recs[self.rec_idx].clone_ref(py); + self.rec_idx += 1; + return Some(rec); + } + } + self.bucket_idx += 1; + self.rec_idx = 0; + } + } +} + +/// Lazy iterator for `ScanRecords.items()` and `ScanRecords.values()`. +/// +/// Yields one bucket at a time: `(TableBucket, list[ScanRecord])` for items, +/// or `list[ScanRecord]` for values. Only materializes records for the +/// current bucket on each `__next__` call. 
+#[pyclass] +pub struct ScanRecordsBucketIter { + owner: Py, + bucket_keys: Vec, + bucket_idx: usize, + with_keys: bool, +} + +#[pymethods] +impl ScanRecordsBucketIter { + fn __iter__(slf: PyRef<'_, Self>) -> PyRef<'_, Self> { + slf + } + + fn __next__(&mut self, py: Python) -> Option> { + if self.bucket_idx >= self.bucket_keys.len() { + return None; + } + let bucket = &self.bucket_keys[self.bucket_idx]; + let owner = self.owner.borrow(py); + let recs = owner + .records_by_bucket + .get(bucket) + .map(|recs| recs.iter().map(|r| r.clone_ref(py)).collect::>()) + .unwrap_or_default(); + let bucket = bucket.clone(); + self.bucket_idx += 1; + + if self.with_keys { + Some( + (bucket, recs) + .into_pyobject(py) + .unwrap() + .into_any() + .unbind(), + ) + } else { + Some(recs.into_pyobject(py).unwrap().into_any().unbind()) + } + } +} + +/// Represents a Fluss table for data operations +#[pyclass] +pub struct FlussTable { + connection: Arc, + metadata: Arc, + table_info: fcore::metadata::TableInfo, + table_path: fcore::metadata::TablePath, + has_primary_key: bool, +} + +/// Builder for creating log scanners with flexible configuration. +/// +/// Use this builder to configure projection, and in the future, filters +/// before creating a log scanner. +#[pyclass] +pub struct TableScan { + connection: Arc, + metadata: Arc, + table_info: fcore::metadata::TableInfo, + projection: Option, +} + +/// Scanner type for internal use +enum ScannerType { + Record, + Batch, +} + +#[pymethods] +impl TableScan { + /// Project to specific columns by their indices. + /// + /// Args: + /// indices: List of column indices (0-based) to include in the scan. + /// + /// Returns: + /// Self for method chaining. + pub fn project(mut slf: PyRefMut<'_, Self>, indices: Vec) -> PyRefMut<'_, Self> { + slf.projection = Some(ProjectionType::Indices(indices)); + slf + } + + /// Project to specific columns by their names. + /// + /// Args: + /// names: List of column names to include in the scan. 
+ /// + /// Returns: + /// Self for method chaining. + pub fn project_by_name(mut slf: PyRefMut<'_, Self>, names: Vec) -> PyRefMut<'_, Self> { + slf.projection = Some(ProjectionType::Names(names)); + slf + } + + /// Create a record-based log scanner. + /// + /// Use this scanner with `poll()` to get individual records with metadata + /// (offset, timestamp, change_type). + /// + /// Returns: + /// LogScanner for record-by-record scanning with `poll()` + pub fn create_log_scanner<'py>(&self, py: Python<'py>) -> PyResult> { + self.create_scanner_internal(py, ScannerType::Record) + } + + /// Create a batch-based log scanner. + /// + /// Use this scanner with `poll_arrow()` to get Arrow Tables, or with + /// `poll_record_batch()` to get individual batches with metadata. + /// + /// Returns: + /// LogScanner for batch-based scanning with `poll_arrow()` or `poll_record_batch()` + pub fn create_record_batch_log_scanner<'py>( + &self, + py: Python<'py>, + ) -> PyResult> { + self.create_scanner_internal(py, ScannerType::Batch) + } + + fn __repr__(&self) -> String { + format!( + "TableScan(table={}.{})", + self.table_info.table_path.database(), + self.table_info.table_path.table() + ) + } +} + +impl TableScan { + fn create_scanner_internal<'py>( + &self, + py: Python<'py>, + scanner_type: ScannerType, + ) -> PyResult> { + let conn = self.connection.clone(); + let metadata = self.metadata.clone(); + let table_info = self.table_info.clone(); + let projection = self.projection.clone(); + + future_into_py(py, async move { + let fluss_table = fcore::client::FlussTable::new(&conn, metadata, table_info.clone()); + + let projection_indices = resolve_projection_indices(&projection, &table_info)?; + let table_scan = apply_projection(fluss_table.new_scan(), projection)?; + + let admin = conn + .get_admin() + .map_err(|e| FlussError::from_core_error(&e))?; + + let (projected_schema, projected_row_type) = + calculate_projected_types(&table_info, projection_indices)?; + + let scanner_kind 
= match scanner_type { + ScannerType::Record => { + let s = table_scan + .create_log_scanner() + .map_err(|e| FlussError::from_core_error(&e))?; + ScannerKind::Record(s) + } + ScannerType::Batch => { + let s = table_scan + .create_record_batch_log_scanner() + .map_err(|e| FlussError::from_core_error(&e))?; + ScannerKind::Batch(s) + } + }; + + let py_scanner = LogScanner::new( + scanner_kind, + admin, + table_info, + projected_schema, + Arc::new(projected_row_type), + ); + + Python::attach(|py| Py::new(py, py_scanner)) + }) + } +} + +/// Internal enum to represent different projection types +#[derive(Clone)] +enum ProjectionType { + Indices(Vec), + Names(Vec), +} + +/// Resolve projection to column indices +fn resolve_projection_indices( + projection: &Option, + table_info: &fcore::metadata::TableInfo, +) -> PyResult>> { + match projection { + Some(ProjectionType::Indices(indices)) => Ok(Some(indices.clone())), + Some(ProjectionType::Names(names)) => { + let schema = table_info.get_schema(); + let columns = schema.columns(); + let mut indices = Vec::with_capacity(names.len()); + for name in names { + let idx = columns + .iter() + .position(|c| c.name() == name) + .ok_or_else(|| FlussError::new_err(format!("Column '{name}' not found")))?; + indices.push(idx); + } + Ok(Some(indices)) + } + None => Ok(None), + } +} + +/// Apply projection to table scan +fn apply_projection( + table_scan: fcore::client::TableScan, + projection: Option, +) -> PyResult { + match projection { + Some(ProjectionType::Indices(indices)) => table_scan + .project(&indices) + .map_err(|e| FlussError::from_core_error(&e)), + Some(ProjectionType::Names(names)) => { + let column_name_refs: Vec<&str> = names.iter().map(|s| s.as_str()).collect(); + table_scan + .project_by_name(&column_name_refs) + .map_err(|e| FlussError::from_core_error(&e)) + } + None => Ok(table_scan), + } +} + +/// Calculate projected schema and row type from projection indices +fn calculate_projected_types( + table_info: 
&fcore::metadata::TableInfo, + projection_indices: Option>, +) -> PyResult<(SchemaRef, fcore::metadata::RowType)> { + let full_schema = + to_arrow_schema(table_info.get_row_type()).map_err(|e| FlussError::from_core_error(&e))?; + let full_row_type = table_info.get_row_type(); + + match projection_indices { + Some(indices) => { + let arrow_fields: Vec<_> = indices + .iter() + .map(|&i| full_schema.field(i).clone()) + .collect(); + let row_fields: Vec<_> = indices + .iter() + .map(|&i| full_row_type.fields()[i].clone()) + .collect(); + Ok(( + Arc::new(arrow_schema::Schema::new(arrow_fields)), + fcore::metadata::RowType::new(row_fields), + )) + } + None => Ok((full_schema, full_row_type.clone())), + } +} + +#[pymethods] +impl FlussTable { + /// Create a new table scan builder for configuring and creating log scanners. + /// + /// Use this method to create scanners with the builder pattern: + /// Returns: + /// TableScan builder for configuring the scanner. + pub fn new_scan(&self) -> TableScan { + TableScan { + connection: self.connection.clone(), + metadata: self.metadata.clone(), + table_info: self.table_info.clone(), + projection: None, + } + } + + /// Create a new TableAppend builder for the table. + /// + /// Returns: + /// TableAppend builder. Call `create_writer()` to get an AppendWriter. 
+ fn new_append(&self) -> PyResult { + let _guard = TOKIO_RUNTIME.enter(); + let fluss_table = fcore::client::FlussTable::new( + &self.connection, + self.metadata.clone(), + self.table_info.clone(), + ); + + let table_append = fluss_table + .new_append() + .map_err(|e| FlussError::from_core_error(&e))?; + + Ok(TableAppend { + inner: table_append, + table_info: self.table_info.clone(), + }) + } + + /// Get table information + pub fn get_table_info(&self) -> TableInfo { + TableInfo::from_core(self.table_info.clone()) + } + + /// Get table path + pub fn get_table_path(&self) -> TablePath { + TablePath::from_core(self.table_path.clone()) + } + + /// Check if table has primary key + pub fn has_primary_key(&self) -> bool { + self.has_primary_key + } + + /// Create a new TableLookup builder for primary key lookups. + /// + /// This is only available for tables with a primary key. + /// + /// Returns: + /// TableLookup builder. Call `create_lookuper()` to get a Lookuper. + pub fn new_lookup(&self) -> PyResult { + if !self.has_primary_key { + return Err(FlussError::new_err( + "Lookup is only supported for primary key tables", + )); + } + + Ok(TableLookup { + connection: self.connection.clone(), + metadata: self.metadata.clone(), + table_info: self.table_info.clone(), + }) + } + + /// Create a new TableUpsert builder for the table. + /// + /// This is only available for tables with a primary key. + /// + /// Returns: + /// TableUpsert builder. Call `create_writer()` to get an UpsertWriter, + /// or use `partial_update_by_name()` / `partial_update_by_index()` first. 
+ pub fn new_upsert(&self) -> PyResult { + if !self.has_primary_key { + return Err(FlussError::new_err( + "Upsert is only supported for primary key tables", + )); + } + + let _guard = TOKIO_RUNTIME.enter(); + let fluss_table = fcore::client::FlussTable::new( + &self.connection, + self.metadata.clone(), + self.table_info.clone(), + ); + + let table_upsert = fluss_table + .new_upsert() + .map_err(|e| FlussError::from_core_error(&e))?; + + Ok(TableUpsert { + inner: table_upsert, + table_info: self.table_info.clone(), + target_columns: None, + }) + } + + fn __repr__(&self) -> String { + format!( + "FlussTable(path={}.{})", + self.table_path.database(), + self.table_path.table() + ) + } +} + +impl FlussTable { + /// Create a FlussTable + pub fn new_table( + connection: Arc, + metadata: Arc, + table_info: fcore::metadata::TableInfo, + table_path: fcore::metadata::TablePath, + has_primary_key: bool, + ) -> Self { + Self { + connection, + metadata, + table_info, + table_path, + has_primary_key, + } + } +} + +/// Builder for creating an AppendWriter. +/// +/// Obtain via `FlussTable.new_append()`, then call `create_writer()`. +#[pyclass] +pub struct TableAppend { + inner: fcore::client::TableAppend, + table_info: fcore::metadata::TableInfo, +} + +#[pymethods] +impl TableAppend { + /// Create an AppendWriter from this builder. + pub fn create_writer(&self) -> PyResult { + let rust_writer = self + .inner + .create_writer() + .map_err(|e| FlussError::from_core_error(&e))?; + Ok(AppendWriter::from_core( + rust_writer, + self.table_info.clone(), + )) + } + + fn __repr__(&self) -> String { + "TableAppend()".to_string() + } +} + +/// Builder for creating an UpsertWriter, with optional partial update configuration. +/// +/// Obtain via `FlussTable.new_upsert()`, then optionally call +/// `partial_update_by_name()` or `partial_update_by_index()`, +/// then call `create_writer()`. 
+#[pyclass] +pub struct TableUpsert { + inner: fcore::client::TableUpsert, + table_info: fcore::metadata::TableInfo, + /// Column indices for partial updates, tracked for Python's dict→GenericRow conversion. + target_columns: Option>, +} + +#[pymethods] +impl TableUpsert { + /// Configure partial update by column names. + /// + /// Only the specified columns will be updated on upsert. + /// + /// Args: + /// columns: List of column names to update. + /// + /// Returns: + /// A new TableUpsert configured for partial update. + pub fn partial_update_by_name(&self, columns: Vec) -> PyResult { + let col_refs: Vec<&str> = columns.iter().map(|s| s.as_str()).collect(); + // Core validates and resolves names → indices internally + let updated = self + .inner + .partial_update_with_column_names(&col_refs) + .map_err(|e| FlussError::from_core_error(&e))?; + // Resolve indices for Python's row conversion layer (core validated names above) + let row_type = self.table_info.row_type(); + let indices: Vec = columns + .iter() + .map(|name| { + row_type.get_field_index(name).ok_or_else(|| { + FlussError::new_err(format!("Unknown column name '{name}' for partial update")) + }) + }) + .collect::>>()?; + Ok(TableUpsert { + inner: updated, + table_info: self.table_info.clone(), + target_columns: Some(indices), + }) + } + + /// Configure partial update by column indices. + /// + /// Only the specified columns will be updated on upsert. + /// + /// Args: + /// column_indices: List of column indices (0-based) to update. + /// + /// Returns: + /// A new TableUpsert configured for partial update. 
+ pub fn partial_update_by_index(&self, column_indices: Vec) -> PyResult { + let target = column_indices.clone(); + // Core validates indices internally + let updated = self + .inner + .partial_update(Some(column_indices)) + .map_err(|e| FlussError::from_core_error(&e))?; + Ok(TableUpsert { + inner: updated, + table_info: self.table_info.clone(), + target_columns: Some(target), + }) + } + + /// Create an UpsertWriter from this builder. + pub fn create_writer(&self) -> PyResult { + crate::UpsertWriter::new( + &self.inner, + self.table_info.clone(), + self.target_columns.clone(), + ) + } + + fn __repr__(&self) -> String { + "TableUpsert()".to_string() + } +} + +/// Builder for creating a Lookuper. +/// +/// Obtain via `FlussTable.new_lookup()`, then call `create_lookuper()`. +#[pyclass] +pub struct TableLookup { + connection: Arc, + metadata: Arc, + table_info: fcore::metadata::TableInfo, +} + +#[pymethods] +impl TableLookup { + /// Create a Lookuper from this builder. + pub fn create_lookuper(&self) -> PyResult { + crate::Lookuper::new( + &self.connection, + self.metadata.clone(), + self.table_info.clone(), + ) + } + + /// Switch to prefix-scan mode for the given lookup columns. + /// + /// The columns must be the table's partition keys (if any) plus the + /// bucket keys, in that order. + /// + /// Args: + /// column_names: List of column names forming the prefix key. + /// + /// Returns: + /// TablePrefixLookup builder. Call `create_lookuper()` to get a PrefixLookuper. + pub fn lookup_by(&self, column_names: Vec) -> TablePrefixLookup { + TablePrefixLookup { + connection: self.connection.clone(), + metadata: self.metadata.clone(), + table_info: self.table_info.clone(), + lookup_column_names: column_names, + } + } + + fn __repr__(&self) -> String { + "TableLookup()".to_string() + } +} + +/// Builder for creating a PrefixLookuper. +/// +/// Obtain via `TableLookup.lookup_by(columns)`, then call `create_lookuper()`. 
+#[pyclass] +pub struct TablePrefixLookup { + connection: Arc, + metadata: Arc, + table_info: fcore::metadata::TableInfo, + lookup_column_names: Vec, +} + +#[pymethods] +impl TablePrefixLookup { + /// Create a PrefixLookuper from this builder. + pub fn create_lookuper(&self) -> PyResult { + crate::PrefixLookuper::new( + &self.connection, + self.metadata.clone(), + self.table_info.clone(), + self.lookup_column_names.clone(), + ) + } + + fn __repr__(&self) -> String { + "TablePrefixLookup()".to_string() + } +} + +/// Writer for appending data to a Fluss table +#[pyclass] +pub struct AppendWriter { + inner: Arc, + table_info: fcore::metadata::TableInfo, +} + +#[pymethods] +impl AppendWriter { + /// Write Arrow table data (fire-and-forget, use flush() to ensure delivery) + pub fn write_arrow(&self, py: Python, table: Py) -> PyResult<()> { + // Convert Arrow Table to batches and write each batch + let batches = table.call_method0(py, "to_batches")?; + let batch_list: Vec> = batches.extract(py)?; + + for batch in batch_list { + // Drop the handle — fire-and-forget for bulk writes + drop(self.write_arrow_batch(py, batch)?); + } + Ok(()) + } + + /// Write Arrow batch data. + /// + /// Returns: + /// WriteResultHandle that can be ignored (fire-and-forget) or + /// awaited via `handle.wait()` for server acknowledgment. + pub fn write_arrow_batch(&self, py: Python, batch: Py) -> PyResult { + // This shares the underlying Arrow buffers without copying data + let batch_bound = batch.bind(py); + let rust_batch: ArrowRecordBatch = FromPyArrow::from_pyarrow_bound(batch_bound) + .map_err(|e| FlussError::new_err(format!("Failed to convert RecordBatch: {e}")))?; + + let result_future = self + .inner + .append_arrow_batch(rust_batch) + .map_err(|e| FlussError::from_core_error(&e))?; + Ok(WriteResultHandle::new(result_future)) + } + + /// Append a single row to the table. 
+ /// + /// Returns: + /// WriteResultHandle that can be ignored (fire-and-forget) or + /// awaited via `handle.wait()` for server acknowledgment. + pub fn append(&self, row: &Bound<'_, PyAny>) -> PyResult { + let generic_row = python_to_generic_row(row, &self.table_info)?; + + let result_future = self + .inner + .append(&generic_row) + .map_err(|e| FlussError::from_core_error(&e))?; + Ok(WriteResultHandle::new(result_future)) + } + + /// Write Pandas DataFrame data + pub fn write_pandas(&self, py: Python, df: Py) -> PyResult<()> { + // Get the expected Arrow schema from the Fluss table + let row_type = self.table_info.get_row_type(); + let expected_schema = fcore::record::to_arrow_schema(row_type) + .map_err(|e| FlussError::from_core_error(&e))?; + + // Convert Arrow schema to PyArrow schema + let py_schema = expected_schema + .as_ref() + .to_pyarrow(py) + .map_err(|e| FlussError::new_err(format!("Failed to convert schema: {e}")))?; + + // Import pyarrow module + let pyarrow = py.import("pyarrow")?; + + // Get the Table class from pyarrow module + let table_class = pyarrow.getattr("Table")?; + + // Call Table.from_pandas(df, schema=expected_schema) to ensure proper type casting + let pa_table = table_class.call_method( + "from_pandas", + (df,), + Some(&[("schema", py_schema)].into_py_dict(py)?), + )?; + + // Then call write_arrow with the converted table + self.write_arrow(py, pa_table.into()) + } + + /// Flush any pending data + pub fn flush<'py>(&self, py: Python<'py>) -> PyResult> { + let inner = self.inner.clone(); + future_into_py(py, async move { + inner + .flush() + .await + .map_err(|e| FlussError::from_core_error(&e)) + }) + } + + // Enter the async runtime context (for 'async with' statement) + fn __aenter__<'py>(slf: PyRef<'py, Self>, py: Python<'py>) -> PyResult> { + let py_slf = slf.into_pyobject(py)?.unbind(); + future_into_py(py, async move { Ok(py_slf) }) + } + + // Exit the async runtime context (for 'async with' statement) + /// On exit, the 
writer is automatically flushed. + #[pyo3(signature = (exc_type=None, _exc_value=None, _traceback=None))] + fn __aexit__<'py>( + &self, + py: Python<'py>, + exc_type: Option>, + _exc_value: Option>, + _traceback: Option>, + ) -> PyResult> { + let inner = self.inner.clone(); + let is_exc_none = exc_type.as_ref().is_none_or(|e| e.is_none()); + future_into_py(py, async move { + let res = inner.flush().await; + if let Err(e) = res { + if is_exc_none { + return Err(FlussError::from_core_error(&e)); + } + } + Ok(false) + }) + } + + fn __repr__(&self) -> String { + "AppendWriter()".to_string() + } +} + +impl AppendWriter { + /// Create a AppendWriter from a core append writer + pub fn from_core( + append: fcore::client::AppendWriter, + table_info: fcore::metadata::TableInfo, + ) -> Self { + Self { + inner: Arc::new(append), + table_info, + } + } +} + +/// Represents different input shapes for a row +#[derive(FromPyObject)] +enum RowInput<'py> { + Dict(Bound<'py, PyDict>), + Tuple(Bound<'py, PyTuple>), + List(Bound<'py, PyList>), +} + +/// Convert Python row (dict/list/tuple) to GenericRow requiring all schema columns. +pub fn python_to_generic_row( + row: &Bound, + table_info: &fcore::metadata::TableInfo, +) -> PyResult> { + let all_indices: Vec = (0..table_info.row_type().fields().len()).collect(); + python_to_sparse_generic_row(row, table_info, &all_indices) +} + +/// Process a Python sequence (list or tuple) into datums at the target column positions. +fn process_sequence( + seq: &Bound, + target_indices: &[usize], + fields: &[fcore::metadata::DataField], + datums: &mut [fcore::row::Datum<'static>], + sparse: bool, +) -> PyResult<()> { + if seq.len()? != target_indices.len() { + return Err(FlussError::new_err(format!( + "Expected {} elements, got {}", + target_indices.len(), + seq.len()? 
+ ))); + } + for (i, &col_idx) in target_indices.iter().enumerate() { + let field = &fields[col_idx]; + let value = seq.get_item(i)?; + let dest = if sparse { col_idx } else { i }; + datums[dest] = python_value_to_datum(&value, field.data_type()) + .map_err(|e| FlussError::new_err(format!("Field '{}': {}", field.name(), e)))?; + } + Ok(()) +} + +/// Build a full-width GenericRow filling only the specified column +/// indices from user input; all other columns are set to Null. +pub fn python_to_sparse_generic_row( + row: &Bound, + table_info: &fcore::metadata::TableInfo, + target_indices: &[usize], +) -> PyResult> { + python_to_generic_row_inner(row, table_info, target_indices, true) +} + +/// Build a dense GenericRow with exactly `target_indices.len()` fields, +/// containing only the target column values in order. +pub fn python_to_dense_generic_row( + row: &Bound, + table_info: &fcore::metadata::TableInfo, + target_indices: &[usize], +) -> PyResult> { + python_to_generic_row_inner(row, table_info, target_indices, false) +} + +/// Build a GenericRow from user input. 
When `sparse` is true, the row is full width and padded with nulls +fn python_to_generic_row_inner( + row: &Bound, + table_info: &fcore::metadata::TableInfo, + target_indices: &[usize], + sparse: bool, +) -> PyResult> { + let row_type = table_info.row_type(); + let fields = row_type.fields(); + let target_names: Vec<&str> = target_indices.iter().map(|&i| fields[i].name()).collect(); + + let num_fields = if sparse { + fields.len() + } else { + target_indices.len() + }; + let mut datums: Vec> = vec![fcore::row::Datum::Null; num_fields]; + + let row_input: RowInput = row.extract().map_err(|_| { + let type_name = row + .get_type() + .name() + .map(|n| n.to_string()) + .unwrap_or_else(|_| "unknown".to_string()); + FlussError::new_err(format!( + "Row must be a dict, list, or tuple; got {type_name}" + )) + })?; + + match row_input { + RowInput::Dict(dict) => { + for (k, _) in dict.iter() { + let key_str = k.extract::<&str>().map_err(|_| { + let key_type = k + .get_type() + .name() + .map(|n| n.to_string()) + .unwrap_or_else(|_| "unknown".to_string()); + FlussError::new_err(format!("Dict keys must be strings; got {key_type}")) + })?; + if !target_names.contains(&key_str) { + return Err(FlussError::new_err(format!( + "Unknown field '{}'. Expected: {}", + key_str, + target_names.join(", ") + ))); + } + } + for (i, &col_idx) in target_indices.iter().enumerate() { + let name = target_names[i]; + let field = &fields[col_idx]; + let value = dict + .get_item(name)? 
+ .ok_or_else(|| FlussError::new_err(format!("Missing field: {name}")))?; + let dest = if sparse { col_idx } else { i }; + datums[dest] = python_value_to_datum(&value, field.data_type()) + .map_err(|e| FlussError::new_err(format!("Field '{name}': {e}")))?; + } + } + + RowInput::List(list) => { + process_sequence( + list.as_sequence(), + target_indices, + fields, + &mut datums, + sparse, + )?; + } + + RowInput::Tuple(tuple) => { + process_sequence( + tuple.as_sequence(), + target_indices, + fields, + &mut datums, + sparse, + )?; + } + } + + Ok(fcore::row::GenericRow { values: datums }) +} + +/// Convert Python value to Datum based on data type +fn python_value_to_datum( + value: &Bound, + data_type: &fcore::metadata::DataType, +) -> PyResult> { + use fcore::row::{Datum, F32, F64}; + + if value.is_none() { + return Ok(Datum::Null); + } + + match data_type { + fcore::metadata::DataType::Boolean(_) => { + let v: bool = value.extract()?; + Ok(Datum::Bool(v)) + } + fcore::metadata::DataType::TinyInt(_) => { + // Strict type checking: reject bool for int columns + if value.is_instance_of::() { + return Err(FlussError::new_err( + "Expected int for TinyInt column, got bool. Use 0 or 1 explicitly.".to_string(), + )); + } + let v: i8 = value.extract()?; + Ok(Datum::Int8(v)) + } + fcore::metadata::DataType::SmallInt(_) => { + if value.is_instance_of::() { + return Err(FlussError::new_err( + "Expected int for SmallInt column, got bool. Use 0 or 1 explicitly." + .to_string(), + )); + } + let v: i16 = value.extract()?; + Ok(Datum::Int16(v)) + } + fcore::metadata::DataType::Int(_) => { + if value.is_instance_of::() { + return Err(FlussError::new_err( + "Expected int for Int column, got bool. Use 0 or 1 explicitly.".to_string(), + )); + } + let v: i32 = value.extract()?; + Ok(Datum::Int32(v)) + } + fcore::metadata::DataType::BigInt(_) => { + if value.is_instance_of::() { + return Err(FlussError::new_err( + "Expected int for BigInt column, got bool. 
Use 0 or 1 explicitly.".to_string(), + )); + } + let v: i64 = value.extract()?; + Ok(Datum::Int64(v)) + } + fcore::metadata::DataType::Float(_) => { + let v: f32 = value.extract()?; + Ok(Datum::Float32(F32::from(v))) + } + fcore::metadata::DataType::Double(_) => { + let v: f64 = value.extract()?; + Ok(Datum::Float64(F64::from(v))) + } + fcore::metadata::DataType::String(_) | fcore::metadata::DataType::Char(_) => { + let v: String = value.extract()?; + Ok(v.into()) + } + fcore::metadata::DataType::Bytes(_) | fcore::metadata::DataType::Binary(_) => { + // Efficient extraction: downcast to specific type and use bulk copy. + // PyBytes::as_bytes() and PyByteArray::to_vec() are O(n) bulk copies of the underlying data. + if let Ok(bytes) = value.downcast::() { + Ok(bytes.as_bytes().to_vec().into()) + } else if let Ok(bytearray) = value.downcast::() { + Ok(bytearray.to_vec().into()) + } else { + Err(FlussError::new_err(format!( + "Expected bytes or bytearray, got {}", + value.get_type().name()? 
+ ))) + } + } + fcore::metadata::DataType::Decimal(decimal_type) => { + python_decimal_to_datum(value, decimal_type.precision(), decimal_type.scale()) + } + fcore::metadata::DataType::Date(_) => python_date_to_datum(value), + fcore::metadata::DataType::Time(_) => python_time_to_datum(value), + fcore::metadata::DataType::Timestamp(_) => python_datetime_to_timestamp_ntz(value), + fcore::metadata::DataType::TimestampLTz(_) => python_datetime_to_timestamp_ltz(value), + fcore::metadata::DataType::Array(array_type) => { + let element_type = array_type.get_element_type(); + if value.is_instance_of::() { + return Err(FlussError::new_err(format!( + "Expected sequence for Array column, got {}", + get_type_name(value) + ))); + } + let seq = value.downcast::().map_err(|_| { + FlussError::new_err(format!( + "Expected sequence for Array column, got {}", + get_type_name(value) + )) + })?; + + let len = seq.len()?; + let mut writer = fcore::row::binary_array::FlussArrayWriter::new(len, element_type); + + for i in 0..len { + let item = seq.get_item(i)?; + if item.is_none() { + writer.set_null_at(i); + } else { + let val_datum = python_value_to_datum(&item, element_type)?; + match val_datum { + Datum::Null => writer.set_null_at(i), + Datum::Bool(v) => writer.write_boolean(i, v), + Datum::Int8(v) => writer.write_byte(i, v), + Datum::Int16(v) => writer.write_short(i, v), + Datum::Int32(v) => writer.write_int(i, v), + Datum::Int64(v) => writer.write_long(i, v), + Datum::Float32(v) => writer.write_float(i, v.into_inner()), + Datum::Float64(v) => writer.write_double(i, v.into_inner()), + Datum::String(v) => writer.write_string(i, &v), + Datum::Blob(v) => writer.write_binary_bytes(i, v.as_ref()), + Datum::Decimal(v) => { + if let fcore::metadata::DataType::Decimal(dt) = element_type { + writer.write_decimal(i, &v, dt.precision()); + } + } + Datum::Date(v) => writer.write_date(i, v), + Datum::Time(v) => writer.write_time(i, v), + Datum::TimestampNtz(v) => { + if let 
fcore::metadata::DataType::Timestamp(dt) = element_type { + writer.write_timestamp_ntz(i, &v, dt.precision()); + } + } + Datum::TimestampLtz(v) => { + if let fcore::metadata::DataType::TimestampLTz(dt) = element_type { + writer.write_timestamp_ltz(i, &v, dt.precision()); + } + } + Datum::Array(v) => writer.write_array(i, &v), + Datum::Map(v) => writer.write_map(i, &v), + Datum::Row(_) => { + return Err(FlussError::new_err( + "Row datum is not supported as an array element", + )); + } + } + } + } + + let array = writer + .complete() + .map_err(|e| FlussError::from_core_error(&e))?; + Ok(Datum::Array(array)) + } + _ => Err(FlussError::new_err(format!( + "Unsupported data type for row-level operations: {data_type}" + ))), + } +} + +/// Convert Rust Datum to Python value based on data type. +/// This is the reverse of python_value_to_datum. +pub fn datum_to_python_value( + py: Python, + row: &dyn fcore::row::InternalRow, + pos: usize, + data_type: &fcore::metadata::DataType, +) -> PyResult> { + use fcore::metadata::DataType; + + // Check for null first + if row + .is_null_at(pos) + .map_err(|e| FlussError::from_core_error(&e))? + { + return Ok(py.None()); + } + + match data_type { + DataType::Boolean(_) => Ok(row + .get_boolean(pos) + .map_err(|e| FlussError::from_core_error(&e))? + .into_pyobject(py)? + .to_owned() + .into_any() + .unbind()), + DataType::TinyInt(_) => Ok(row + .get_byte(pos) + .map_err(|e| FlussError::from_core_error(&e))? + .into_pyobject(py)? + .to_owned() + .into_any() + .unbind()), + DataType::SmallInt(_) => Ok(row + .get_short(pos) + .map_err(|e| FlussError::from_core_error(&e))? + .into_pyobject(py)? + .to_owned() + .into_any() + .unbind()), + DataType::Int(_) => Ok(row + .get_int(pos) + .map_err(|e| FlussError::from_core_error(&e))? + .into_pyobject(py)? + .to_owned() + .into_any() + .unbind()), + DataType::BigInt(_) => Ok(row + .get_long(pos) + .map_err(|e| FlussError::from_core_error(&e))? + .into_pyobject(py)? 
+ .to_owned() + .into_any() + .unbind()), + DataType::Float(_) => Ok(row + .get_float(pos) + .map_err(|e| FlussError::from_core_error(&e))? + .into_pyobject(py)? + .to_owned() + .into_any() + .unbind()), + DataType::Double(_) => Ok(row + .get_double(pos) + .map_err(|e| FlussError::from_core_error(&e))? + .into_pyobject(py)? + .to_owned() + .into_any() + .unbind()), + DataType::String(_) => { + let s = row + .get_string(pos) + .map_err(|e| FlussError::from_core_error(&e))?; + Ok(s.into_pyobject(py)?.into_any().unbind()) + } + DataType::Char(char_type) => { + let s = row + .get_char(pos, char_type.length() as usize) + .map_err(|e| FlussError::from_core_error(&e))?; + Ok(s.into_pyobject(py)?.into_any().unbind()) + } + DataType::Bytes(_) => { + let b = row + .get_bytes(pos) + .map_err(|e| FlussError::from_core_error(&e))?; + Ok(PyBytes::new(py, b).into_any().unbind()) + } + DataType::Binary(binary_type) => { + let b = row + .get_binary(pos, binary_type.length()) + .map_err(|e| FlussError::from_core_error(&e))?; + Ok(PyBytes::new(py, b).into_any().unbind()) + } + DataType::Decimal(decimal_type) => { + let decimal = row + .get_decimal( + pos, + decimal_type.precision() as usize, + decimal_type.scale() as usize, + ) + .map_err(|e| FlussError::from_core_error(&e))?; + rust_decimal_to_python(py, &decimal) + } + DataType::Date(_) => { + let date = row + .get_date(pos) + .map_err(|e| FlussError::from_core_error(&e))?; + rust_date_to_python(py, date) + } + DataType::Time(_) => { + let time = row + .get_time(pos) + .map_err(|e| FlussError::from_core_error(&e))?; + rust_time_to_python(py, time) + } + DataType::Timestamp(ts_type) => { + let ts = row + .get_timestamp_ntz(pos, ts_type.precision()) + .map_err(|e| FlussError::from_core_error(&e))?; + rust_timestamp_ntz_to_python(py, ts) + } + DataType::TimestampLTz(ts_type) => { + let ts = row + .get_timestamp_ltz(pos, ts_type.precision()) + .map_err(|e| FlussError::from_core_error(&e))?; + rust_timestamp_ltz_to_python(py, ts) + } + 
DataType::Array(array_type) => { + let array_data = row + .get_array(pos) + .map_err(|e| FlussError::from_core_error(&e))?; + + let element_type = array_type.get_element_type(); + let py_list = pyo3::types::PyList::empty(py); + + for i in 0..array_data.size() { + let py_val = datum_to_python_value(py, &array_data, i, element_type)?; + py_list.append(py_val)?; + } + Ok(py_list.into_any().unbind()) + } + _ => Err(FlussError::new_err(format!( + "Unsupported data type for conversion to Python: {data_type}" + ))), + } +} + +/// Convert Rust Decimal to Python decimal.Decimal +fn rust_decimal_to_python(py: Python, decimal: &fcore::row::Decimal) -> PyResult> { + let decimal_ty = get_decimal_type(py)?; + let decimal_str = decimal.to_string(); + let py_decimal = decimal_ty.call1((decimal_str,))?; + Ok(py_decimal.into_any().unbind()) +} + +/// Convert Rust Date (days since epoch) to Python datetime.date +fn rust_date_to_python(py: Python, date: fcore::row::Date) -> PyResult> { + let days_since_epoch = date.get_inner(); + let epoch = jiff::civil::date(1970, 1, 1); + let civil_date = epoch + jiff::Span::new().days(days_since_epoch as i64); + + let py_date = PyDate::new( + py, + civil_date.year() as i32, + civil_date.month() as u8, + civil_date.day() as u8, + )?; + Ok(py_date.into_any().unbind()) +} + +/// Convert Rust Time (millis since midnight) to Python datetime.time +fn rust_time_to_python(py: Python, time: fcore::row::Time) -> PyResult> { + let millis = time.get_inner() as i64; + let hours = millis / MILLIS_PER_HOUR; + let minutes = (millis % MILLIS_PER_HOUR) / MILLIS_PER_MINUTE; + let seconds = (millis % MILLIS_PER_MINUTE) / MILLIS_PER_SECOND; + let microseconds = (millis % MILLIS_PER_SECOND) * MICROS_PER_MILLI; + + let py_time = PyTime::new( + py, + hours as u8, + minutes as u8, + seconds as u8, + microseconds as u32, + None, + )?; + Ok(py_time.into_any().unbind()) +} + +/// Convert Rust TimestampNtz to Python naive datetime +fn rust_timestamp_ntz_to_python(py: Python, 
ts: fcore::row::TimestampNtz) -> PyResult> { + let millis = ts.get_millisecond(); + let nanos = ts.get_nano_of_millisecond(); + let total_micros = millis * MICROS_PER_MILLI + (nanos as i64 / NANOS_PER_MICRO); + + // Convert to civil datetime via jiff + let timestamp = jiff::Timestamp::from_microsecond(total_micros) + .map_err(|e| FlussError::new_err(format!("Invalid timestamp: {e}")))?; + let civil_dt = timestamp.to_zoned(jiff::tz::TimeZone::UTC).datetime(); + + let py_dt = PyDateTime::new( + py, + civil_dt.year() as i32, + civil_dt.month() as u8, + civil_dt.day() as u8, + civil_dt.hour() as u8, + civil_dt.minute() as u8, + civil_dt.second() as u8, + (civil_dt.subsec_nanosecond() / 1000) as u32, // microseconds + None, + )?; + Ok(py_dt.into_any().unbind()) +} + +/// Convert Rust TimestampLtz to Python timezone-aware datetime (UTC) +fn rust_timestamp_ltz_to_python(py: Python, ts: fcore::row::TimestampLtz) -> PyResult> { + let millis = ts.get_epoch_millisecond(); + let nanos = ts.get_nano_of_millisecond(); + let total_micros = millis * MICROS_PER_MILLI + (nanos as i64 / NANOS_PER_MICRO); + + // Convert to civil datetime via jiff + let timestamp = jiff::Timestamp::from_microsecond(total_micros) + .map_err(|e| FlussError::new_err(format!("Invalid timestamp: {e}")))?; + let civil_dt = timestamp.to_zoned(jiff::tz::TimeZone::UTC).datetime(); + + let utc = get_utc_timezone(py)?; + let py_dt = PyDateTime::new( + py, + civil_dt.year() as i32, + civil_dt.month() as u8, + civil_dt.day() as u8, + civil_dt.hour() as u8, + civil_dt.minute() as u8, + civil_dt.second() as u8, + (civil_dt.subsec_nanosecond() / 1000) as u32, // microseconds + Some(&utc), + )?; + Ok(py_dt.into_any().unbind()) +} + +/// Convert an InternalRow to a Python dictionary +pub fn internal_row_to_dict( + py: Python, + row: &dyn fcore::row::InternalRow, + table_info: &fcore::metadata::TableInfo, +) -> PyResult> { + let row_type = table_info.row_type(); + let fields = row_type.fields(); + let dict = 
PyDict::new(py); + + for (pos, field) in fields.iter().enumerate() { + let value = datum_to_python_value(py, row, pos, field.data_type())?; + dict.set_item(field.name(), value)?; + } + + Ok(dict.into_any().unbind()) +} + +/// Cached decimal.Decimal type +/// Uses PyOnceLock for thread-safety and subinterpreter compatibility. +static DECIMAL_TYPE: PyOnceLock> = PyOnceLock::new(); + +/// Cached UTC timezone +static UTC_TIMEZONE: PyOnceLock> = PyOnceLock::new(); + +/// Cached UTC epoch type +static UTC_EPOCH: PyOnceLock> = PyOnceLock::new(); + +/// Get the cached decimal.Decimal type, importing it once per interpreter. +fn get_decimal_type(py: Python) -> PyResult> { + let ty = DECIMAL_TYPE.get_or_try_init(py, || -> PyResult<_> { + let decimal_mod = py.import("decimal")?; + let decimal_ty = decimal_mod.getattr("Decimal")?.downcast_into::()?; + Ok(decimal_ty.unbind()) + })?; + Ok(ty.bind(py).clone()) +} + +/// Get the cached UTC timezone (datetime.timezone.utc), creating it once per interpreter. +fn get_utc_timezone(py: Python) -> PyResult> { + let tz = UTC_TIMEZONE.get_or_try_init(py, || -> PyResult<_> { + let datetime_mod = py.import("datetime")?; + let timezone = datetime_mod.getattr("timezone")?; + let utc = timezone.getattr("utc")?; + Ok(utc.unbind()) + })?; + // Downcast to PyTzInfo for use with PyDateTime::new() + Ok(tz.bind(py).clone().downcast_into::()?) +} + +/// Get the cached UTC epoch datetime, creating it once per interpreter. +fn get_utc_epoch(py: Python) -> PyResult> { + let epoch = UTC_EPOCH.get_or_try_init(py, || -> PyResult<_> { + let datetime_mod = py.import("datetime")?; + let timezone = datetime_mod.getattr("timezone")?; + let utc = timezone.getattr("utc")?; + let epoch = datetime_mod + .getattr("datetime")? + .call1((1970, 1, 1, 0, 0, 0, 0, &utc))?; + Ok(epoch.unbind()) + })?; + Ok(epoch.bind(py).clone()) +} + +/// Validate that value is a decimal.Decimal instance. 
+fn ensure_is_decimal(value: &Bound) -> PyResult<()> { + let decimal_ty = get_decimal_type(value.py())?; + if !value.is_instance(&decimal_ty.into_any())? { + return Err(FlussError::new_err(format!( + "Expected decimal.Decimal, got {}", + get_type_name(value) + ))); + } + Ok(()) +} + +/// Convert Python decimal.Decimal to Datum::Decimal. +/// Only accepts decimal.Decimal +fn python_decimal_to_datum( + value: &Bound, + precision: u32, + scale: u32, +) -> PyResult> { + use std::str::FromStr; + + ensure_is_decimal(value)?; + + let decimal_str: String = value.str()?.extract()?; + let bd = bigdecimal::BigDecimal::from_str(&decimal_str).map_err(|e| { + FlussError::new_err(format!("Failed to parse decimal '{decimal_str}': {e}")) + })?; + + let decimal = fcore::row::Decimal::from_big_decimal(bd, precision, scale).map_err(|e| { + FlussError::new_err(format!( + "Failed to convert decimal '{decimal_str}' to DECIMAL({precision}, {scale}): {e}" + )) + })?; + + Ok(fcore::row::Datum::Decimal(decimal)) +} + +/// Convert Python datetime.date to Datum::Date. +fn python_date_to_datum(value: &Bound) -> PyResult> { + // Reject datetime.datetime (subclass of date) - use timestamp columns for those + if value.downcast::().is_ok() { + return Err(FlussError::new_err( + "Expected datetime.date, got datetime.datetime. Use a TIMESTAMP column for datetime values.", + )); + } + + let date = value.downcast::().map_err(|_| { + FlussError::new_err(format!( + "Expected datetime.date, got {}", + get_type_name(value) + )) + })?; + + let year = date.get_year(); + let month = date.get_month(); + let day = date.get_day(); + + // Calculate days since Unix epoch (1970-01-01) + let civil_date = jiff::civil::date(year as i16, month as i8, day as i8); + let epoch = jiff::civil::date(1970, 1, 1); + let days_since_epoch = (civil_date - epoch).get_days(); + + Ok(fcore::row::Datum::Date(fcore::row::Date::new( + days_since_epoch, + ))) +} + +/// Convert Python datetime.time to Datum::Time. 
+/// Uses PyO3's native PyTime type for efficient access. +/// +/// Note: Fluss TIME is always stored as milliseconds since midnight (i32) regardless +/// of the schema's precision setting. This matches the Java Fluss wire protocol. +/// Sub-millisecond precision (microseconds not divisible by 1000) will raise an error +/// to prevent silent data loss and ensure fail-fast behavior. +fn python_time_to_datum(value: &Bound) -> PyResult> { + let time = value.downcast::().map_err(|_| { + FlussError::new_err(format!( + "Expected datetime.time, got {}", + get_type_name(value) + )) + })?; + + let hour = time.get_hour() as i32; + let minute = time.get_minute() as i32; + let second = time.get_second() as i32; + let microsecond = time.get_microsecond() as i32; + + // Strict validation: reject sub-millisecond precision + if microsecond % MICROS_PER_MILLI as i32 != 0 { + return Err(FlussError::new_err(format!( + "TIME values with sub-millisecond precision are not supported. \ + Got time with {microsecond} microseconds (not divisible by 1000). \ + Fluss stores TIME as milliseconds since midnight. \ + Please round to milliseconds before insertion." + ))); + } + + // Convert to milliseconds since midnight + let millis = hour * MILLIS_PER_HOUR as i32 + + minute * MILLIS_PER_MINUTE as i32 + + second * MILLIS_PER_SECOND as i32 + + microsecond / MICROS_PER_MILLI as i32; + + Ok(fcore::row::Datum::Time(fcore::row::Time::new(millis))) +} + +/// Convert Python datetime-like object to Datum::TimestampNtz. 
+/// Supports: datetime.datetime (naive preferred), pd.Timestamp, np.datetime64 +fn python_datetime_to_timestamp_ntz(value: &Bound) -> PyResult> { + let (epoch_millis, nano_of_milli) = extract_datetime_components_ntz(value)?; + + let ts = fcore::row::TimestampNtz::from_millis_nanos(epoch_millis, nano_of_milli) + .map_err(|e| FlussError::new_err(format!("Failed to create TimestampNtz: {e}")))?; + + Ok(fcore::row::Datum::TimestampNtz(ts)) +} + +/// Convert Python datetime-like object to Datum::TimestampLtz. +/// For naive datetimes, assumes UTC. For aware datetimes, converts to UTC. +/// Supports: datetime.datetime, pd.Timestamp, np.datetime64 +fn python_datetime_to_timestamp_ltz(value: &Bound) -> PyResult> { + let (epoch_millis, nano_of_milli) = extract_datetime_components_ltz(value)?; + + let ts = fcore::row::TimestampLtz::from_millis_nanos(epoch_millis, nano_of_milli) + .map_err(|e| FlussError::new_err(format!("Failed to create TimestampLtz: {e}")))?; + + Ok(fcore::row::Datum::TimestampLtz(ts)) +} + +/// Extract epoch milliseconds for TimestampNtz (wall-clock time, no timezone conversion). +/// Uses integer arithmetic to avoid float precision issues. +/// For clarity, tz-aware datetimes are rejected - use TimestampLtz for those. +fn extract_datetime_components_ntz(value: &Bound) -> PyResult<(i64, i32)> { + // Try PyDateTime first + if let Ok(dt) = value.downcast::() { + // Reject tz-aware datetime for NTZ - it's ambiguous what the user wants + let tzinfo = dt.getattr("tzinfo")?; + if !tzinfo.is_none() { + return Err(FlussError::new_err( + "TIMESTAMP (without timezone) requires a naive datetime. \ + Got timezone-aware datetime. 
Either remove tzinfo or use TIMESTAMP_LTZ column.", + )); + } + return datetime_to_epoch_millis_as_utc(dt); + } + + // Check for pandas Timestamp by verifying module name + if is_pandas_timestamp(value) { + // For NTZ, reject tz-aware pandas Timestamps for consistency with datetime behavior + if let Ok(tz) = value.getattr("tz") { + if !tz.is_none() { + return Err(FlussError::new_err( + "TIMESTAMP (without timezone) requires a naive pd.Timestamp. \ + Got timezone-aware Timestamp. Either use tz_localize(None) or use TIMESTAMP_LTZ column.", + )); + } + } + // Naive pandas Timestamp: .value is nanoseconds since epoch (wall-clock as UTC) + let nanos: i64 = value.getattr("value")?.extract()?; + return Ok(nanos_to_millis_and_submillis(nanos)); + } + + // Try to_pydatetime() for objects that support it + if let Ok(py_dt) = value.call_method0("to_pydatetime") { + if let Ok(dt) = py_dt.downcast::() { + let tzinfo = dt.getattr("tzinfo")?; + if !tzinfo.is_none() { + return Err(FlussError::new_err( + "TIMESTAMP (without timezone) requires a naive datetime. \ + Got timezone-aware value. Use TIMESTAMP_LTZ column instead.", + )); + } + return datetime_to_epoch_millis_as_utc(dt); + } + } + + Err(FlussError::new_err(format!( + "Expected naive datetime.datetime or pd.Timestamp, got {}", + get_type_name(value) + ))) +} + +/// Extract epoch milliseconds for TimestampLtz (instant in time, UTC-based). +/// For naive datetimes, assumes UTC. For aware datetimes, converts to UTC. 
+fn extract_datetime_components_ltz(value: &Bound) -> PyResult<(i64, i32)> { + // Try PyDateTime first + if let Ok(dt) = value.downcast::() { + // Check if timezone-aware + let tzinfo = dt.getattr("tzinfo")?; + if tzinfo.is_none() { + // Naive datetime: assume UTC (treat components as UTC time) + return datetime_to_epoch_millis_as_utc(dt); + } else { + // Aware datetime: use timedelta from epoch to get correct UTC instant + return datetime_to_epoch_millis_utc_aware(dt); + } + } + + // Check for pandas Timestamp + if is_pandas_timestamp(value) { + // pandas Timestamp.value is always nanoseconds since UTC epoch + let nanos: i64 = value.getattr("value")?.extract()?; + return Ok(nanos_to_millis_and_submillis(nanos)); + } + + // Try to_pydatetime() + if let Ok(py_dt) = value.call_method0("to_pydatetime") { + if let Ok(dt) = py_dt.downcast::() { + let tzinfo = dt.getattr("tzinfo")?; + if tzinfo.is_none() { + return datetime_to_epoch_millis_as_utc(dt); + } else { + return datetime_to_epoch_millis_utc_aware(dt); + } + } + } + + Err(FlussError::new_err(format!( + "Expected datetime.datetime or pd.Timestamp, got {}", + get_type_name(value) + ))) +} + +/// Convert datetime components to epoch milliseconds treating them as UTC +fn datetime_to_epoch_millis_as_utc(dt: &Bound<'_, PyDateTime>) -> PyResult<(i64, i32)> { + let year = dt.get_year(); + let month = dt.get_month(); + let day = dt.get_day(); + let hour = dt.get_hour(); + let minute = dt.get_minute(); + let second = dt.get_second(); + let microsecond = dt.get_microsecond(); + + // Create jiff civil datetime and convert to UTC timestamp + // Safe casts: hour (0-23), minute (0-59), second (0-59) all fit in i8 + let civil_dt = jiff::civil::date(year as i16, month as i8, day as i8).at( + hour as i8, + minute as i8, + second as i8, + microsecond as i32 * 1000, + ); + + let timestamp = jiff::tz::Offset::UTC + .to_timestamp(civil_dt) + .map_err(|e| FlussError::new_err(format!("Invalid datetime: {e}")))?; + + let millis = 
timestamp.as_millisecond(); + let nano_of_milli = (timestamp.subsec_nanosecond() % NANOS_PER_MILLI as i32) as i32; + + Ok((millis, nano_of_milli)) +} + +/// Convert timezone-aware datetime to epoch milliseconds using Python's timedelta. +/// This correctly handles timezone conversions by computing (dt - UTC_EPOCH). +/// The UTC epoch is cached for performance. +fn datetime_to_epoch_millis_utc_aware(dt: &Bound<'_, PyDateTime>) -> PyResult<(i64, i32)> { + let py = dt.py(); + let epoch = get_utc_epoch(py)?; + + // Compute delta = dt - epoch (this handles timezone conversion correctly) + let delta = dt.call_method1("__sub__", (epoch,))?; + let delta = delta.downcast::()?; + + // Extract components using integer arithmetic + let days = delta.get_days() as i64; + let seconds = delta.get_seconds() as i64; + let microseconds = delta.get_microseconds() as i64; + + // Total milliseconds (note: days can be negative for dates before epoch) + let total_micros = days * MICROS_PER_DAY + seconds * MICROS_PER_SECOND + microseconds; + let millis = total_micros / MICROS_PER_MILLI; + let nano_of_milli = ((total_micros % MICROS_PER_MILLI) * MICROS_PER_MILLI) as i32; + + // Handle negative microseconds remainder + let (millis, nano_of_milli) = if nano_of_milli < 0 { + (millis - 1, nano_of_milli + NANOS_PER_MILLI as i32) + } else { + (millis, nano_of_milli) + }; + + Ok((millis, nano_of_milli)) +} + +/// Convert nanoseconds to (milliseconds, nano_of_millisecond) +fn nanos_to_millis_and_submillis(nanos: i64) -> (i64, i32) { + let millis = nanos / NANOS_PER_MILLI; + let nano_of_milli = (nanos % NANOS_PER_MILLI) as i32; + + // Handle negative nanoseconds correctly (Euclidean remainder) + if nano_of_milli < 0 { + (millis - 1, nano_of_milli + NANOS_PER_MILLI as i32) + } else { + (millis, nano_of_milli) + } +} + +/// Check if value is a pandas Timestamp by examining its type. 
+fn is_pandas_timestamp(value: &Bound) -> bool { + // Check module and class name to avoid importing pandas + if let Ok(cls) = value.get_type().getattr("__module__") { + if let Ok(module) = cls.extract::<&str>() { + if module.starts_with("pandas") { + if let Ok(name) = value.get_type().getattr("__name__") { + if let Ok(name_str) = name.extract::<&str>() { + return name_str == "Timestamp"; + } + } + } + } + } + false +} + +/// Get type name +fn get_type_name(value: &Bound) -> String { + value + .get_type() + .name() + .map(|s| s.to_string()) + .unwrap_or_else(|_| "unknown".to_string()) +} + +/// Thin Python iterator over [`fcore::client::SyncRecordBatchLogReader`]. +/// Used internally as the backing iterator for +/// ``pa.RecordBatchReader.from_batches()``; not registered on the module. +#[pyclass] +struct PyRecordBatchLogReader { + sync_reader: fcore::client::SyncRecordBatchLogReader, +} + +#[pymethods] +impl PyRecordBatchLogReader { + fn __iter__(slf: PyRef<'_, Self>) -> PyRef<'_, Self> { + slf + } + + fn __next__(&mut self, py: Python) -> PyResult>> { + let result = py.detach(|| self.sync_reader.next().transpose()); + + match result { + Ok(Some(batch)) => { + let py_batch = batch + .to_pyarrow(py) + .map_err(|e| FlussError::new_err(format!("Failed to convert batch: {e}")))?; + Ok(Some(py_batch.unbind())) + } + Ok(None) => Ok(None), + Err(arrow_err) => Err(FlussError::new_err(format!( + "Error reading batch: {arrow_err}" + ))), + } + } +} + +/// Wraps the two scanner variants so we never have an impossible state +/// (both None or both Some). +enum ScannerKind { + Record(fcore::client::LogScanner), + Batch(fcore::client::RecordBatchLogScanner), +} + +impl ScannerKind { + fn as_record(&self) -> PyResult<&fcore::client::LogScanner> { + match self { + Self::Record(s) => Ok(s), + Self::Batch(_) => Err(FlussError::new_err( + "poll() requires a record-based scanner. 
Use new_scan().create_log_scanner().", + )), + } + } + + fn as_batch(&self) -> PyResult<&fcore::client::RecordBatchLogScanner> { + match self { + Self::Batch(s) => Ok(s), + Self::Record(_) => Err(FlussError::new_err( + "This method requires a batch-based scanner. Use new_scan().create_record_batch_log_scanner().", + )), + } + } +} + +/// Dispatch a method call to whichever scanner variant is active. +/// Both `LogScanner` and `RecordBatchLogScanner` share the same subscribe interface. +macro_rules! with_scanner { + ($scanner:expr, $method:ident($($arg:expr),*)) => { + match $scanner.as_ref() { + ScannerKind::Record(s) => s.$method($($arg),*).await, + ScannerKind::Batch(s) => s.$method($($arg),*).await, + } + }; +} + +/// Scanner for reading log data from a Fluss table. +/// +/// This scanner supports two modes: +/// - Record-based scanning via `poll()` - returns individual records with metadata +/// - Batch-based scanning via `poll_arrow()` / `poll_record_batch()` - returns Arrow batches +#[pyclass] +pub struct LogScanner { + kind: Arc, + admin: Arc, + table_info: fcore::metadata::TableInfo, + /// The projected Arrow schema to use for empty table creation + projected_schema: SchemaRef, + /// The projected row type to use for record-based scanning + projected_row_type: Arc, +} + +#[pymethods] +impl LogScanner { + /// Subscribe to a single bucket at a specific offset (non-partitioned tables). + /// + /// Args: + /// bucket_id: The bucket ID to subscribe to + /// start_offset: The offset to start reading from (use EARLIEST_OFFSET for beginning) + fn subscribe(&self, py: Python, bucket_id: i32, start_offset: i64) -> PyResult<()> { + py.detach(|| { + TOKIO_RUNTIME.block_on(async { + with_scanner!(&self.kind, subscribe(bucket_id, start_offset)) + .map_err(|e| FlussError::from_core_error(&e)) + }) + }) + } + + /// Subscribe to multiple buckets at specified offsets (non-partitioned tables). 
+ /// + /// Args: + /// bucket_offsets: A dict mapping bucket_id -> start_offset + fn subscribe_buckets(&self, py: Python, bucket_offsets: HashMap) -> PyResult<()> { + py.detach(|| { + TOKIO_RUNTIME.block_on(async { + with_scanner!(&self.kind, subscribe_buckets(&bucket_offsets)) + .map_err(|e| FlussError::from_core_error(&e)) + }) + }) + } + + /// Subscribe to a bucket within a specific partition (partitioned tables only). + /// + /// Args: + /// partition_id: The partition ID (from PartitionInfo.partition_id) + /// bucket_id: The bucket ID within the partition + /// start_offset: The offset to start reading from (use EARLIEST_OFFSET for beginning) + fn subscribe_partition( + &self, + py: Python, + partition_id: i64, + bucket_id: i32, + start_offset: i64, + ) -> PyResult<()> { + py.detach(|| { + TOKIO_RUNTIME.block_on(async { + with_scanner!( + &self.kind, + subscribe_partition(partition_id, bucket_id, start_offset) + ) + .map_err(|e| FlussError::from_core_error(&e)) + }) + }) + } + + /// Subscribe to multiple partition+bucket combinations at once (partitioned tables only). + /// + /// Args: + /// partition_bucket_offsets: A dict mapping (partition_id, bucket_id) tuples to start_offsets + fn subscribe_partition_buckets( + &self, + py: Python, + partition_bucket_offsets: HashMap<(i64, i32), i64>, + ) -> PyResult<()> { + py.detach(|| { + TOKIO_RUNTIME.block_on(async { + with_scanner!( + &self.kind, + subscribe_partition_buckets(&partition_bucket_offsets) + ) + .map_err(|e| FlussError::from_core_error(&e)) + }) + }) + } + + /// Unsubscribe from a specific bucket (non-partitioned tables only). + /// + /// Args: + /// bucket_id: The bucket ID to unsubscribe from + fn unsubscribe(&self, py: Python, bucket_id: i32) -> PyResult<()> { + py.detach(|| { + TOKIO_RUNTIME.block_on(async { + with_scanner!(&self.kind, unsubscribe(bucket_id)) + .map_err(|e| FlussError::from_core_error(&e)) + }) + }) + } + + /// Unsubscribe from a specific partition bucket (partitioned tables only). 
+ /// + /// Args: + /// partition_id: The partition ID to unsubscribe from + /// bucket_id: The bucket ID within the partition + fn unsubscribe_partition(&self, py: Python, partition_id: i64, bucket_id: i32) -> PyResult<()> { + py.detach(|| { + TOKIO_RUNTIME.block_on(async { + with_scanner!(&self.kind, unsubscribe_partition(partition_id, bucket_id)) + .map_err(|e| FlussError::from_core_error(&e)) + }) + }) + } + + /// Poll for individual records with metadata. + /// + /// Args: + /// timeout_ms: Timeout in milliseconds to wait for records + /// + /// Returns: + /// ScanRecords grouped by bucket. Supports flat iteration + /// (`for rec in records`) and per-bucket access (`records.buckets()`, + /// `records.records(bucket)`, `records[bucket]`). + /// + /// Note: + /// - Requires a record-based scanner (created with new_scan().create_log_scanner()) + /// - Returns an empty ScanRecords if no records are available + /// - When timeout expires, returns an empty ScanRecords (NOT an error) + fn poll<'py>(&self, py: Python<'py>, timeout_ms: i64) -> PyResult> { + if timeout_ms < 0 { + return Err(FlussError::new_err(format!( + "timeout_ms must be non-negative, got: {timeout_ms}" + ))); + } + + let timeout = Duration::from_millis(timeout_ms as u64); + let scanner = Arc::clone(&self.kind); + let projected_row_type = self.projected_row_type.clone(); + + future_into_py(py, async move { + let scan_records = scanner + .as_record()? 
+ .poll(timeout) + .await + .map_err(|e| FlussError::from_core_error(&e))?; + + Python::attach(|py| { + let mut records_by_bucket = IndexMap::new(); + let mut total_count = 0usize; + + for (bucket, records) in scan_records.into_records_by_buckets() { + let py_bucket = TableBucket::from_core(bucket); + let mut py_records = Vec::with_capacity(records.len()); + for record in &records { + let scan_record = ScanRecord::from_core(py, record, &projected_row_type)?; + py_records.push(Py::new(py, scan_record)?); + total_count += 1; + } + records_by_bucket.insert(py_bucket, py_records); + } + + Ok(ScanRecords { + records_by_bucket, + total_count, + }) + }) + }) + } + + /// Poll for batches with metadata. + /// + /// Args: + /// timeout_ms: Timeout in milliseconds to wait for batches + /// + /// Returns: + /// List of RecordBatch objects, each containing the Arrow batch along with + /// bucket, base_offset, and last_offset metadata. + /// + /// Note: + /// - Requires a batch-based scanner (created with new_scan().create_record_batch_log_scanner()) + /// - Returns an empty list if no batches are available + /// - When timeout expires, returns an empty list (NOT an error) + fn poll_record_batch<'py>( + &self, + py: Python<'py>, + timeout_ms: i64, + ) -> PyResult> { + if timeout_ms < 0 { + return Err(FlussError::new_err(format!( + "timeout_ms must be non-negative, got: {timeout_ms}" + ))); + } + + let timeout = Duration::from_millis(timeout_ms as u64); + let scanner = Arc::clone(&self.kind); + + future_into_py(py, async move { + let scan_batches = scanner + .as_batch()? + .poll(timeout) + .await + .map_err(|e| FlussError::from_core_error(&e))?; + + Python::attach(|py| { + scan_batches + .into_iter() + .map(|sb| Py::new(py, RecordBatch::from_scan_batch(sb))) + .collect::>>() + }) + }) + } + + /// Poll for new records as an Arrow Table. 
+ /// + /// Args: + /// timeout_ms: Timeout in milliseconds to wait for records + /// + /// Returns: + /// PyArrow Table containing the polled records (batches merged) + /// + /// Note: + /// - Requires a batch-based scanner (created with new_scan().create_record_batch_log_scanner()) + /// - Returns an empty table (with correct schema) if no records are available + /// - When timeout expires, returns an empty table (NOT an error) + fn poll_arrow<'py>(&self, py: Python<'py>, timeout_ms: i64) -> PyResult> { + if timeout_ms < 0 { + return Err(FlussError::new_err(format!( + "timeout_ms must be non-negative, got: {timeout_ms}" + ))); + } + + let timeout = Duration::from_millis(timeout_ms as u64); + let scanner = Arc::clone(&self.kind); + let projected_schema = self.projected_schema.clone(); + + future_into_py(py, async move { + let scan_batches = scanner + .as_batch()? + .poll(timeout) + .await + .map_err(|e| FlussError::from_core_error(&e))?; + + let arrow_batches = scan_batches + .into_iter() + .map(|sb| Arc::new(sb.into_batch())) + .collect(); + + Python::attach(|py| Self::batches_to_arrow_table(py, arrow_batches, &projected_schema)) + }) + } + + /// Create a lazy Arrow RecordBatchReader that reads until latest offsets. + /// + /// This is a **blocking / synchronous** API: construction queries the + /// server for latest offsets (via ``block_on``), and each + /// ``RecordBatchReader.__next__()`` call blocks the calling thread until + /// the next batch is available. It is suitable for Arrow interop + /// (feeding into DuckDB, Polars, etc.) but should not be used + /// from ``asyncio`` coroutines -- see issue #545 for a planned + /// asyncio-native streaming alternative. + /// TODO(#545): Add asyncio-native streaming counterpart. + /// + /// Returns a PyArrow RecordBatchReader that lazily polls batches one at a + /// time. This is more memory-efficient than ``to_arrow()`` which loads all + /// data into a single table. 
+ /// + /// **Concurrency:** While this reader is alive, ``subscribe*`` and + /// ``unsubscribe*`` calls on the scanner are rejected with an error. + /// You should also avoid calling ``poll_arrow`` / ``poll_record_batch`` + /// on the same scanner — these are not blocked by the guard, but they + /// share the underlying fetch buffer with the reader and would + /// interleave batches between both consumers. Drop the reader before + /// resuming any of these operations. + /// + /// You must call subscribe(), subscribe_buckets(), subscribe_partition(), + /// or subscribe_partition_buckets() first. + /// + /// Returns: + /// ``pyarrow.RecordBatchReader`` yielding ``RecordBatch`` objects + fn to_arrow_batch_reader(&self, py: Python) -> PyResult> { + let scanner = self.kind.as_batch()?; + + let sync_reader = py + .detach(|| { + TOKIO_RUNTIME.block_on(async { + let reader = fcore::client::RecordBatchLogReader::new_until_latest( + scanner.new_shared_handle(), + &self.admin, + ) + .await?; + Ok::<_, fcore::error::Error>( + reader.to_record_batch_reader(TOKIO_RUNTIME.handle().clone()), + ) + }) + }) + .map_err(|e| FlussError::from_core_error(&e))?; + + let py_schema = sync_reader + .schema() + .to_pyarrow(py) + .map_err(|e| FlussError::new_err(format!("Failed to convert schema: {e}")))?; + + let py_iter = Py::new(py, PyRecordBatchLogReader { sync_reader })?; + + let pyarrow = py.import("pyarrow")?; + let batch_reader = pyarrow + .getattr("RecordBatchReader")? + .call_method1("from_batches", (py_schema, py_iter))?; + + Ok(batch_reader.into()) + } + + /// Convert all data to Arrow Table. + /// + /// Reads from currently subscribed buckets until reaching their latest offsets. + /// Works for both partitioned and non-partitioned tables. + /// + /// Materializes batches in Rust (``RecordBatchLogReader::collect_all_batches``) + /// then builds one PyArrow table, avoiding per-batch Python iteration. 
+ /// + /// You must call subscribe(), subscribe_buckets(), subscribe_partition(), or subscribe_partition_buckets() first. + /// + /// Returns: + /// PyArrow Table containing all data from subscribed buckets + fn to_arrow<'py>(&self, py: Python<'py>) -> PyResult> { + let kind = Arc::clone(&self.kind); + let admin = Arc::clone(&self.admin); + let projected_schema = self.projected_schema.clone(); + + future_into_py(py, async move { + let scanner = kind.as_batch()?; + + let mut reader = fcore::client::RecordBatchLogReader::new_until_latest( + scanner.new_shared_handle(), + &admin, + ) + .await + .map_err(|e| FlussError::from_core_error(&e))?; + + let scan_batches = reader + .collect_all_batches() + .await + .map_err(|e| FlussError::from_core_error(&e))?; + + let batches: Vec> = scan_batches + .into_iter() + .map(|sb| Arc::new(sb.into_batch())) + .collect(); + + Python::attach(|py| Self::batches_to_arrow_table(py, batches, &projected_schema)) + }) + } + + /// Convert all data to Pandas DataFrame. + /// + /// Reads from currently subscribed buckets until reaching their latest offsets. + /// Works for both partitioned and non-partitioned tables. + /// + /// You must call subscribe(), subscribe_buckets(), subscribe_partition(), or subscribe_partition_buckets() first. 
+ /// + /// Returns: + /// Pandas DataFrame containing all data from subscribed buckets + fn to_pandas<'py>(&self, py: Python<'py>) -> PyResult> { + let kind = Arc::clone(&self.kind); + let admin = Arc::clone(&self.admin); + let projected_schema = self.projected_schema.clone(); + + future_into_py(py, async move { + let scanner = kind.as_batch()?; + + let mut reader = fcore::client::RecordBatchLogReader::new_until_latest( + scanner.new_shared_handle(), + &admin, + ) + .await + .map_err(|e| FlussError::from_core_error(&e))?; + + let scan_batches = reader + .collect_all_batches() + .await + .map_err(|e| FlussError::from_core_error(&e))?; + + let batches: Vec> = scan_batches + .into_iter() + .map(|sb| Arc::new(sb.into_batch())) + .collect(); + + Python::attach(|py| { + let arrow_table = Self::batches_to_arrow_table(py, batches, &projected_schema)?; + arrow_table.call_method0(py, "to_pandas") + }) + }) + } + + fn __aiter__<'py>(slf: PyRef<'py, Self>) -> PyResult> { + let py = slf.py(); + + // Single lock for the generic async generator + static ASYNC_GEN_FN: PyOnceLock> = PyOnceLock::new(); + + let gen_fn = ASYNC_GEN_FN.get_or_init(py, || { + let code = pyo3::ffi::c_str!( + r#" +async def _async_scan_generic(scanner, method_name, timeout_ms): + poll_method = getattr(scanner, method_name) + while True: + for item in await poll_method(timeout_ms): + yield item +"# + ); + let globals = pyo3::types::PyDict::new(py); + py.run(code, Some(&globals), None).unwrap(); + globals + .get_item("_async_scan_generic") + .unwrap() + .unwrap() + .unbind() + }); + + let method_name = match slf.kind.as_ref() { + ScannerKind::Record(_) => "poll", + ScannerKind::Batch(_) => "poll_record_batch", + }; + + gen_fn.bind(py).call1(( + slf.into_bound_py_any(py)?, + method_name, + DEFAULT_POLL_INTERVAL_MS, + )) + } + + fn __repr__(&self) -> String { + format!("LogScanner(table={})", self.table_info.table_path) + } +} + +impl LogScanner { + fn new( + scanner: ScannerKind, + admin: Arc, + table_info: 
fcore::metadata::TableInfo, + projected_schema: SchemaRef, + projected_row_type: Arc, + ) -> Self { + Self { + kind: Arc::new(scanner), + admin, + table_info, + projected_schema, + projected_row_type, + } + } + + /// Convert Arrow record batches to a PyArrow Table (or empty table if no batches). + fn batches_to_arrow_table( + py: Python<'_>, + batches: Vec>, + projected_schema: &SchemaRef, + ) -> PyResult> { + if batches.is_empty() { + let py_schema = projected_schema + .as_ref() + .to_pyarrow(py) + .map_err(|e| FlussError::new_err(format!("Failed to convert schema: {e}")))?; + let pyarrow = py.import("pyarrow")?; + let empty_table = pyarrow + .getattr("Table")? + .call_method1("from_batches", (vec![] as Vec>, py_schema))?; + Ok(empty_table.into()) + } else { + Utils::combine_batches_to_table(py, batches) + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_nanos_to_millis_and_submillis() { + // Simple positive case + assert_eq!(nanos_to_millis_and_submillis(1_500_000), (1, 500_000)); + + // Exact millisecond boundary + assert_eq!(nanos_to_millis_and_submillis(2_000_000), (2, 0)); + + // Zero + assert_eq!(nanos_to_millis_and_submillis(0), (0, 0)); + + // Large value + assert_eq!( + nanos_to_millis_and_submillis(86_400_000_000_000), // 1 day in nanos + (86_400_000, 0) + ); + + // Negative: -1.5 milliseconds should be (-2 millis, +500_000 nanos) + // Because -1_500_000 nanos = -2ms + 500_000ns + assert_eq!(nanos_to_millis_and_submillis(-1_500_000), (-2, 500_000)); + + // Negative exact boundary + assert_eq!(nanos_to_millis_and_submillis(-2_000_000), (-2, 0)); + + // Small negative + assert_eq!(nanos_to_millis_and_submillis(-1), (-1, 999_999)); + + // Negative with sub-millisecond part + assert_eq!(nanos_to_millis_and_submillis(-500_000), (-1, 500_000)); + } +} diff --git a/fluss-rust/bindings/python/src/upsert.rs b/fluss-rust/bindings/python/src/upsert.rs new file mode 100644 index 0000000000..45244225f6 --- /dev/null +++ 
b/fluss-rust/bindings/python/src/upsert.rs @@ -0,0 +1,161 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::table::{python_to_generic_row, python_to_sparse_generic_row}; +use crate::*; +use pyo3_async_runtimes::tokio::future_into_py; +use std::sync::Arc; + +/// Writer for upserting and deleting data in a Fluss primary key table. +/// +/// Each upsert/delete operation synchronously queues the write. Call `flush()` +/// to ensure all queued writes are delivered to the server. +/// +/// # Example: +/// writer = table.new_upsert().create_writer() +/// +/// # Fire-and-forget — ignore the returned handle +/// writer.upsert(row1) +/// writer.upsert(row2) +/// await writer.flush() +/// +/// # Per-record ack — call wait() on the handle +/// handle = writer.upsert(critical_row) +/// await handle.wait() +#[pyclass] +pub struct UpsertWriter { + writer: Arc, + table_info: fcore::metadata::TableInfo, + /// Column indices for partial updates (None = full row) + target_columns: Option>, +} + +#[pymethods] +impl UpsertWriter { + /// Upsert a row into the table. + /// + /// If a row with the same primary key exists, it will be updated. + /// Otherwise, a new row will be inserted. 
+ /// + /// The write is queued synchronously. Call `flush()` to ensure delivery. + /// + /// Args: + /// row: A dict, list, or tuple containing the row data. + /// For dict: keys are column names, values are column values. + /// For list/tuple: values must be in schema order. + pub fn upsert(&self, row: &Bound<'_, PyAny>) -> PyResult { + let generic_row = if let Some(target_cols) = &self.target_columns { + python_to_sparse_generic_row(row, &self.table_info, target_cols)? + } else { + python_to_generic_row(row, &self.table_info)? + }; + + let result_future = self + .writer + .upsert(&generic_row) + .map_err(|e| FlussError::from_core_error(&e))?; + Ok(WriteResultHandle::new(result_future)) + } + + /// Delete a row from the table by primary key. + /// + /// The delete is queued synchronously. Call `flush()` to ensure delivery. + /// + /// Args: + /// pk: A dict, list, or tuple containing only the primary key values. + /// For dict: keys are PK column names. + /// For list/tuple: values in PK column order. + pub fn delete(&self, pk: &Bound<'_, PyAny>) -> PyResult { + let pk_indices = self.table_info.get_schema().primary_key_indexes(); + let generic_row = python_to_sparse_generic_row(pk, &self.table_info, &pk_indices)?; + + let result_future = self + .writer + .delete(&generic_row) + .map_err(|e| FlussError::from_core_error(&e))?; + Ok(WriteResultHandle::new(result_future)) + } + + /// Flush all pending upsert/delete operations to the server. + /// + /// This method sends all buffered operations and waits until they are + /// acknowledged according to the writer's ack configuration. 
+ /// + /// Returns: + /// None on success + pub fn flush<'py>(&self, py: Python<'py>) -> PyResult> { + let writer = self.writer.clone(); + + future_into_py(py, async move { + writer + .flush() + .await + .map_err(|e| FlussError::from_core_error(&e)) + }) + } + + // Enter the async runtime context (for 'async with' statement) + fn __aenter__<'py>(slf: PyRef<'py, Self>, py: Python<'py>) -> PyResult> { + let py_slf = slf.into_pyobject(py)?.unbind(); + future_into_py(py, async move { Ok(py_slf) }) + } + + // Exit the async runtime context (for 'async with' statement) + /// On exit, the writer is automatically flushed. + #[pyo3(signature = (exc_type=None, _exc_value=None, _traceback=None))] + fn __aexit__<'py>( + &self, + py: Python<'py>, + exc_type: Option>, + _exc_value: Option>, + _traceback: Option>, + ) -> PyResult> { + let writer = self.writer.clone(); + let is_exc_none = exc_type.as_ref().is_none_or(|e| e.is_none()); + future_into_py(py, async move { + let res = writer.flush().await; + if let Err(e) = res { + if is_exc_none { + return Err(FlussError::from_core_error(&e)); + } + } + Ok(false) + }) + } + + fn __repr__(&self) -> String { + "UpsertWriter()".to_string() + } +} + +impl UpsertWriter { + /// Create an UpsertWriter by eagerly creating the core writer from a TableUpsert. + pub fn new( + table_upsert: &fcore::client::TableUpsert, + table_info: fcore::metadata::TableInfo, + target_columns: Option>, + ) -> PyResult { + let writer = table_upsert + .create_writer() + .map_err(|e| FlussError::from_core_error(&e))?; + Ok(Self { + writer: Arc::new(writer), + table_info, + target_columns, + }) + } +} diff --git a/fluss-rust/bindings/python/src/utils.rs b/fluss-rust/bindings/python/src/utils.rs new file mode 100644 index 0000000000..e07713976e --- /dev/null +++ b/fluss-rust/bindings/python/src/utils.rs @@ -0,0 +1,246 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::*; +use arrow_pyarrow::{FromPyArrow, ToPyArrow}; +use arrow_schema::SchemaRef; +use std::sync::Arc; + +/// Utilities for schema conversion between PyArrow, Arrow, and Fluss +pub struct Utils; + +impl Utils { + /// Convert PyArrow schema to Rust Arrow schema + pub fn pyarrow_to_arrow_schema(py_schema: &Py) -> PyResult { + Python::attach(|py| { + let schema_bound = py_schema.bind(py); + let schema: arrow_schema::Schema = FromPyArrow::from_pyarrow_bound(schema_bound) + .map_err(|e| { + FlussError::new_err(format!("Failed to convert PyArrow schema: {e}")) + })?; + Ok(Arc::new(schema)) + }) + } + + /// Convert an Arrow Field to a Fluss DataType, preserving nullability. 
+ pub fn arrow_field_to_fluss_type( + field: &arrow::datatypes::Field, + ) -> PyResult { + use arrow::datatypes::DataType as ArrowDataType; + use fcore::metadata::DataTypes; + + let fluss_type = match field.data_type() { + ArrowDataType::Boolean => DataTypes::boolean(), + ArrowDataType::Int8 => DataTypes::tinyint(), + ArrowDataType::Int16 => DataTypes::smallint(), + ArrowDataType::Int32 => DataTypes::int(), + ArrowDataType::Int64 => DataTypes::bigint(), + ArrowDataType::UInt8 => DataTypes::tinyint(), + ArrowDataType::UInt16 => DataTypes::smallint(), + ArrowDataType::UInt32 => DataTypes::int(), + ArrowDataType::UInt64 => DataTypes::bigint(), + ArrowDataType::Float32 => DataTypes::float(), + ArrowDataType::Float64 => DataTypes::double(), + ArrowDataType::Utf8 | ArrowDataType::LargeUtf8 => DataTypes::string(), + ArrowDataType::Binary | ArrowDataType::LargeBinary => DataTypes::bytes(), + ArrowDataType::FixedSizeBinary(n) => DataTypes::binary(*n as usize), + ArrowDataType::Date32 => DataTypes::date(), + ArrowDataType::Date64 => DataTypes::date(), + ArrowDataType::Time32(unit) => match unit { + arrow_schema::TimeUnit::Second => DataTypes::time_with_precision(0), + arrow_schema::TimeUnit::Millisecond => DataTypes::time_with_precision(3), + _ => { + return Err(FlussError::new_err(format!( + "Unsupported Time32 unit: {unit:?}" + ))); + } + }, + ArrowDataType::Time64(unit) => match unit { + arrow_schema::TimeUnit::Microsecond => DataTypes::time_with_precision(6), + arrow_schema::TimeUnit::Nanosecond => DataTypes::time_with_precision(9), + _ => { + return Err(FlussError::new_err(format!( + "Unsupported Time64 unit: {unit:?}" + ))); + } + }, + ArrowDataType::Timestamp(unit, tz) => { + let precision = match unit { + arrow_schema::TimeUnit::Second => 0, + arrow_schema::TimeUnit::Millisecond => 3, + arrow_schema::TimeUnit::Microsecond => 6, + arrow_schema::TimeUnit::Nanosecond => 9, + }; + // Arrow Timestamp with timezone -> Fluss TimestampLtz + // Arrow Timestamp without 
timezone -> Fluss Timestamp (NTZ) + if tz.is_some() { + DataTypes::timestamp_ltz_with_precision(precision) + } else { + DataTypes::timestamp_with_precision(precision) + } + } + ArrowDataType::Decimal128(precision, scale) => { + DataTypes::decimal(*precision as u32, *scale as u32) + } + ArrowDataType::List(element_field) => { + let element_type = Utils::arrow_field_to_fluss_type(element_field)?; + DataTypes::array(element_type) + } + other => { + return Err(FlussError::new_err(format!( + "Unsupported Arrow data type: {other:?}" + ))); + } + }; + + if field.is_nullable() { + Ok(fluss_type) + } else { + Ok(fluss_type.as_non_nullable()) + } + } + + /// Convert Fluss DataType to string representation, appending " NOT NULL" + /// for non-nullable types (matches Java's `withNullability` and Rust core's + /// `Display` impl). + pub fn datatype_to_string(data_type: &fcore::metadata::DataType) -> String { + let type_str = match data_type { + fcore::metadata::DataType::Boolean(_) => "boolean".to_string(), + fcore::metadata::DataType::TinyInt(_) => "tinyint".to_string(), + fcore::metadata::DataType::SmallInt(_) => "smallint".to_string(), + fcore::metadata::DataType::Int(_) => "int".to_string(), + fcore::metadata::DataType::BigInt(_) => "bigint".to_string(), + fcore::metadata::DataType::Float(_) => "float".to_string(), + fcore::metadata::DataType::Double(_) => "double".to_string(), + fcore::metadata::DataType::String(_) => "string".to_string(), + fcore::metadata::DataType::Bytes(_) => "bytes".to_string(), + fcore::metadata::DataType::Date(_) => "date".to_string(), + fcore::metadata::DataType::Time(t) => { + if t.precision() == 0 { + "time".to_string() + } else { + format!("time({})", t.precision()) + } + } + fcore::metadata::DataType::Timestamp(t) => { + if t.precision() == 6 { + "timestamp".to_string() + } else { + format!("timestamp({})", t.precision()) + } + } + fcore::metadata::DataType::TimestampLTz(t) => { + if t.precision() == 6 { + "timestamp_ltz".to_string() + } else { 
+ format!("timestamp_ltz({})", t.precision()) + } + } + fcore::metadata::DataType::Char(c) => format!("char({})", c.length()), + fcore::metadata::DataType::Decimal(d) => { + format!("decimal({},{})", d.precision(), d.scale()) + } + fcore::metadata::DataType::Binary(b) => format!("binary({})", b.length()), + fcore::metadata::DataType::Array(arr) => format!( + "array<{}>", + Utils::datatype_to_string(arr.get_element_type()) + ), + fcore::metadata::DataType::Map(map) => format!( + "map<{},{}>", + Utils::datatype_to_string(map.key_type()), + Utils::datatype_to_string(map.value_type()) + ), + fcore::metadata::DataType::Row(row) => { + let fields: Vec = row + .fields() + .iter() + .map(|field| { + format!( + "{}: {}", + field.name(), + Utils::datatype_to_string(field.data_type()) + ) + }) + .collect(); + format!("row<{}>", fields.join(", ")) + } + }; + + if data_type.is_nullable() { + type_str + } else { + format!("{type_str} NOT NULL") + } + } + + /// Parse log format string to LogFormat enum + pub fn parse_log_format(format_str: &str) -> PyResult { + fcore::metadata::LogFormat::parse(format_str) + .map_err(|e| FlussError::new_err(format!("Invalid log format '{format_str}': {e}"))) + } + + /// Parse kv format string to KvFormat enum + pub fn parse_kv_format(format_str: &str) -> PyResult { + fcore::metadata::KvFormat::parse(format_str) + .map_err(|e| FlussError::new_err(format!("Invalid kv format '{format_str}': {e}"))) + } + + /// Convert Vec to Arrow RecordBatch + pub fn convert_scan_records_to_arrow( + _scan_records: Vec, + ) -> Vec> { + let mut result = Vec::new(); + for record in _scan_records { + let columnar_row = record.row(); + let row_id = columnar_row.get_row_id(); + if row_id == 0 { + let record_batch = columnar_row.get_record_batch(); + result.push(Arc::new(record_batch.clone())); + } + } + result + } + + /// Combine multiple Arrow batches into a single Table + pub fn combine_batches_to_table( + py: Python, + batches: Vec>, + ) -> PyResult> { + let 
py_batches: Result>, _> = batches + .iter() + .map(|batch| { + // Just dereference the Arc - no need to recreate the batch + batch + .as_ref() + .to_pyarrow(py) + .map(|x| x.into()) + .map_err(|e| FlussError::new_err(format!("Failed to convert to PyObject: {e}"))) + }) + .collect(); + + let py_batches = py_batches?; + + let pyarrow = py.import("pyarrow")?; + + // Use pyarrow.Table.from_batches to combine batches + let table = pyarrow + .getattr("Table")? + .call_method1("from_batches", (py_batches,))?; + + Ok(table.into()) + } +} diff --git a/fluss-rust/bindings/python/src/write_handle.rs b/fluss-rust/bindings/python/src/write_handle.rs new file mode 100644 index 0000000000..83cbeccadc --- /dev/null +++ b/fluss-rust/bindings/python/src/write_handle.rs @@ -0,0 +1,78 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::*; +use pyo3_async_runtimes::tokio::future_into_py; +use std::sync::Mutex; + +/// Handle for a pending write operation. +/// +/// Returned by `upsert()`, `delete()`, `append()`, etc. +/// Can be safely ignored for fire-and-forget semantics, +/// or awaited via `wait()` for per-record acknowledgment. 
+/// +/// # Example: +/// # Fire-and-forget — just ignore the handle +/// writer.upsert(row1) +/// writer.upsert(row2) +/// await writer.flush() +/// +/// # Per-record ack — call wait() +/// handle = writer.upsert(critical_row) +/// await handle.wait() +#[pyclass] +pub struct WriteResultHandle { + inner: Mutex<Option<fcore::client::WriteResultFuture>>, // None once wait() has taken the future +} + +impl WriteResultHandle { + pub fn new(future: fcore::client::WriteResultFuture) -> Self { + Self { + inner: Mutex::new(Some(future)), + } + } +} + +#[pymethods] +impl WriteResultHandle { + /// Wait for server acknowledgment of this specific write. + /// + /// Returns: + /// None on success, raises FlussError on failure or if this handle was already waited on. + pub fn wait<'py>(&self, py: Python<'py>) -> PyResult<Bound<'py, PyAny>> { + let future = self + .inner + .lock() + .map_err(|e| FlussError::new_err(format!("Lock poisoned: {e}")))? + .take() // move the future out so a second wait() fails instead of awaiting twice + .ok_or_else(|| FlussError::new_err("WriteResultHandle already consumed"))?; + + future_into_py(py, async move { + future.await.map_err(|e| FlussError::from_core_error(&e))?; + Ok(()) + }) + } + + fn __repr__(&self) -> String { + let consumed = self.inner.lock().map(|g| g.is_none()).unwrap_or(false); // a poisoned lock is reported as pending + if consumed { + "WriteResultHandle(consumed)".to_string() + } else { + "WriteResultHandle(pending)".to_string() + } + } +} diff --git a/fluss-rust/bindings/python/test/conftest.py b/fluss-rust/bindings/python/test/conftest.py new file mode 100644 index 0000000000..8b2bc732b9 --- /dev/null +++ b/fluss-rust/bindings/python/test/conftest.py @@ -0,0 +1,182 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License.
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import asyncio +import json +import os +import subprocess +import tempfile +import time +from pathlib import Path + +import pytest +import pytest_asyncio +from filelock import FileLock + +import fluss + +CLUSTER_NAME = "shared-test" + + +def _find_cli_binary(): + env_bin = os.environ.get("FLUSS_TEST_CLUSTER_BIN") + if env_bin: + if os.path.isfile(env_bin): + return env_bin + raise FileNotFoundError(f"FLUSS_TEST_CLUSTER_BIN={env_bin!r} does not exist") + result = subprocess.run( + ["cargo", "locate-project", "--workspace", "--message-format", "plain"], + capture_output=True, + text=True, + ) + if result.returncode == 0: + root = Path(result.stdout.strip()).parent + for profile in ("debug", "release"): + bin_path = root / "target" / profile / "fluss-test-cluster" + if bin_path.is_file(): + return str(bin_path) + raise FileNotFoundError( + "fluss-test-cluster not found. 
Run: cargo build -p fluss-test-cluster" + ) + + +def _start_cluster(): + lock = Path(tempfile.gettempdir()) / f"fluss-{CLUSTER_NAME}.lock" + with FileLock(lock): + cli = _find_cli_binary() + result = subprocess.run( + [cli, "start", "--sasl", "--name", CLUSTER_NAME], + capture_output=True, + text=True, + ) + if result.returncode != 0: + raise RuntimeError( + f"fluss-test-cluster start failed:\n{result.stderr}\n{result.stdout}" + ) + prefix = "CLUSTER_JSON: " + for line in result.stdout.strip().split("\n"): + if line.startswith(prefix): + info = json.loads(line[len(prefix) :]) + return info["bootstrap_servers"], info.get("sasl_bootstrap_servers") + raise RuntimeError( + f"No CLUSTER_JSON token in output:\n{result.stdout}\n{result.stderr}" + ) + + +def _stop_cluster(): + try: + cli = _find_cli_binary() + except FileNotFoundError: + return + subprocess.run([cli, "stop", "--name", CLUSTER_NAME], capture_output=True) + + +async def _connect(bootstrap_servers): + config = fluss.Config({"bootstrap.servers": bootstrap_servers}) + start = time.time() + last_err = None + while time.time() - start < 60: + try: + conn = await fluss.FlussConnection.create(config) + admin = conn.get_admin() + nodes = await admin.get_server_nodes() + if any(n.server_type == "TabletServer" for n in nodes): + return conn + await conn.close() + last_err = RuntimeError("No TabletServer available yet") + except Exception as e: + last_err = e + await asyncio.sleep(1) + raise RuntimeError(f"Could not connect after 60s: {last_err}") + + +def pytest_unconfigure(config): + if os.environ.get("FLUSS_BOOTSTRAP_SERVERS"): + return + if hasattr(config, "workerinput"): + return + if os.environ.get("FLUSS_SKIP_CLUSTER_TEARDOWN"): + return + _stop_cluster() + + +@pytest.fixture(scope="session") +def fluss_cluster(): + env = os.environ.get("FLUSS_BOOTSTRAP_SERVERS") + if env: + sasl_env = os.environ.get("FLUSS_SASL_BOOTSTRAP_SERVERS", env) + yield (env, sasl_env) + return + + plaintext_addr, sasl_addr = 
_start_cluster() + yield (plaintext_addr, sasl_addr or plaintext_addr) + + +@pytest_asyncio.fixture(scope="session") +async def connection(fluss_cluster): + plaintext_addr, _sasl_addr = fluss_cluster + conn = await _connect(plaintext_addr) + yield conn + conn.close() + + +@pytest.fixture(scope="session") +def sasl_bootstrap_servers(fluss_cluster): + _plaintext_addr, sasl_addr = fluss_cluster + return sasl_addr + + +@pytest.fixture(scope="session") +def plaintext_bootstrap_servers(fluss_cluster): + plaintext_addr, _sasl_addr = fluss_cluster + return plaintext_addr + + +@pytest_asyncio.fixture(scope="session") +async def admin(connection): + return connection.get_admin() + + +@pytest_asyncio.fixture +async def wait_for_table_ready(admin): + """ + Fixture that returns a helper function to wait for a table or partition to be ready. + """ + async def _wait(table_path, timeout=15, partition_name=None): + start_time = time.monotonic() + while time.monotonic() - start_time < timeout: + try: + if partition_name: + await admin.list_partition_offsets( + table_path, partition_name, [0], fluss.OffsetSpec.earliest() + ) + else: + await admin.list_offsets(table_path, [0], fluss.OffsetSpec.earliest()) + return + except (fluss.FlussError, Exception) as e: + # Catch "No leader found" or other errors that indicate the table/partition is still initializing + err_msg = str(e) + if any(msg in err_msg for msg in ["No leader found", "Table not ready", "Metadata not ready", "not leader or follower"]): + await asyncio.sleep(1) + continue + raise + raise TimeoutError( + f"Table/Partition {table_path} ({partition_name or 'standard'}) " + f"did not become ready within {timeout}s" + ) + + return _wait diff --git a/fluss-rust/bindings/python/test/test_admin.py b/fluss-rust/bindings/python/test/test_admin.py new file mode 100644 index 0000000000..646248d8d4 --- /dev/null +++ b/fluss-rust/bindings/python/test/test_admin.py @@ -0,0 +1,319 @@ +# Licensed to the Apache Software Foundation (ASF) under 
one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Integration tests for FlussAdmin operations. + +Mirrors the Rust integration tests in crates/fluss/tests/integration/admin.rs. +""" + +import pyarrow as pa +import pytest + +import fluss + + +async def test_create_database(admin): + """Test database create, exists, get_info, and drop lifecycle.""" + db_name = "py_test_create_database" + + # Cleanup in case of prior failed run + await admin.drop_database(db_name, ignore_if_not_exists=True, cascade=True) + + assert not await admin.database_exists(db_name) + + db_descriptor = fluss.DatabaseDescriptor( + comment="test_db", + custom_properties={"k1": "v1", "k2": "v2"}, + ) + await admin.create_database(db_name, db_descriptor, ignore_if_exists=False) + + assert await admin.database_exists(db_name) + + db_info = await admin.get_database_info(db_name) + assert db_info.database_name == db_name + + descriptor = db_info.get_database_descriptor() + assert descriptor.comment == "test_db" + assert descriptor.get_custom_properties() == {"k1": "v1", "k2": "v2"} + + await admin.drop_database(db_name, ignore_if_not_exists=False, cascade=True) + + assert not await admin.database_exists(db_name) + + +async def test_create_table(admin): + """Test table create, exists, get_info, list, and drop lifecycle.""" 
+ db_name = "py_test_create_table_db" + + await admin.drop_database(db_name, ignore_if_not_exists=True, cascade=True) + + assert not await admin.database_exists(db_name) + await admin.create_database( + db_name, + fluss.DatabaseDescriptor(comment="Database for test_create_table"), + ignore_if_exists=False, + ) + + table_name = "test_user_table" + table_path = fluss.TablePath(db_name, table_name) + + schema = fluss.Schema( + pa.schema( + [ + pa.field("id", pa.int32()), + pa.field("name", pa.string()), + pa.field("age", pa.int32()), + pa.field("email", pa.string()), + ] + ), + primary_keys=["id"], + ) + assert schema.get_primary_keys() == ["id"] + + table_descriptor = fluss.TableDescriptor( + schema, + bucket_count=3, + bucket_keys=["id"], + comment="Test table for user data (id, name, age, email)", + log_format="arrow", + kv_format="indexed", + properties={"table.replication.factor": "1"}, + ) + + await admin.create_table(table_path, table_descriptor, ignore_if_exists=False) + + assert await admin.table_exists(table_path) + + tables = await admin.list_tables(db_name) + assert len(tables) == 1 + assert table_name in tables + + table_info = await admin.get_table_info(table_path) + + assert table_info.comment == "Test table for user data (id, name, age, email)" + assert table_info.get_primary_keys() == ["id"] + assert table_info.num_buckets == 3 + assert table_info.get_bucket_keys() == ["id"] + assert table_info.get_column_names() == ["id", "name", "age", "email"] + + await admin.drop_table(table_path, ignore_if_not_exists=False) + assert not await admin.table_exists(table_path) + + await admin.drop_database(db_name, ignore_if_not_exists=False, cascade=True) + assert not await admin.database_exists(db_name) + + +async def test_partition_apis(admin): + """Test partition create, list, and drop lifecycle.""" + db_name = "py_test_partition_apis_db" + + await admin.drop_database(db_name, ignore_if_not_exists=True, cascade=True) + await admin.create_database( + db_name, + 
fluss.DatabaseDescriptor(comment="Database for test_partition_apis"), + ignore_if_exists=True, + ) + + table_path = fluss.TablePath(db_name, "partitioned_table") + + schema = fluss.Schema( + pa.schema( + [ + pa.field("id", pa.int32()), + pa.field("name", pa.string()), + pa.field("dt", pa.string()), + pa.field("region", pa.string()), + ] + ), + primary_keys=["id", "dt", "region"], + ) + + table_descriptor = fluss.TableDescriptor( + schema, + partition_keys=["dt", "region"], + bucket_count=3, + bucket_keys=["id"], + log_format="arrow", + kv_format="compacted", + properties={"table.replication.factor": "1"}, + ) + + await admin.create_table(table_path, table_descriptor, ignore_if_exists=True) + + # Initially no partitions + partitions = await admin.list_partition_infos(table_path) + assert len(partitions) == 0 + + # Create a partition + await admin.create_partition( + table_path, + {"dt": "2024-01-15", "region": "EMEA"}, + ignore_if_exists=False, + ) + + partitions = await admin.list_partition_infos(table_path) + assert len(partitions) == 1 + assert partitions[0].partition_name == "2024-01-15$EMEA" + + # Drop the partition + await admin.drop_partition( + table_path, + {"dt": "2024-01-15", "region": "EMEA"}, + ignore_if_not_exists=False, + ) + + partitions = await admin.list_partition_infos(table_path) + assert len(partitions) == 0 + + await admin.drop_table(table_path, ignore_if_not_exists=True) + await admin.drop_database(db_name, ignore_if_not_exists=True, cascade=True) + + +async def test_fluss_error_response(admin): + """Test that API errors are raised as FlussError with correct error codes.""" + table_path = fluss.TablePath("fluss", "py_not_exist") + + with pytest.raises(fluss.FlussError) as exc_info: + await admin.get_table_info(table_path) + + assert exc_info.value.error_code == fluss.ErrorCode.TABLE_NOT_EXIST + + +async def test_error_database_not_exist(admin): + """Test error handling for non-existent database operations.""" + # get_database_info + with 
pytest.raises(fluss.FlussError) as exc_info: + await admin.get_database_info("py_no_such_db") + assert exc_info.value.error_code == fluss.ErrorCode.DATABASE_NOT_EXIST + + # drop_database without ignore flag + with pytest.raises(fluss.FlussError) as exc_info: + await admin.drop_database("py_no_such_db", ignore_if_not_exists=False) + assert exc_info.value.error_code == fluss.ErrorCode.DATABASE_NOT_EXIST + + # list_tables for non-existent database + with pytest.raises(fluss.FlussError) as exc_info: + await admin.list_tables("py_no_such_db") + assert exc_info.value.error_code == fluss.ErrorCode.DATABASE_NOT_EXIST + + +async def test_error_database_already_exist(admin): + """Test error when creating a database that already exists.""" + db_name = "py_test_error_db_already_exist" + + await admin.drop_database(db_name, ignore_if_not_exists=True, cascade=True) + await admin.create_database(db_name, ignore_if_exists=False) + + # Create same database again without ignore flag + with pytest.raises(fluss.FlussError) as exc_info: + await admin.create_database(db_name, ignore_if_exists=False) + assert exc_info.value.error_code == fluss.ErrorCode.DATABASE_ALREADY_EXIST + + # With ignore flag should succeed + await admin.create_database(db_name, ignore_if_exists=True) + + await admin.drop_database(db_name, ignore_if_not_exists=True, cascade=True) + + +async def test_error_table_already_exist(admin): + """Test error when creating a table that already exists.""" + db_name = "py_test_error_tbl_already_exist_db" + + await admin.drop_database(db_name, ignore_if_not_exists=True, cascade=True) + await admin.create_database(db_name, ignore_if_exists=True) + + table_path = fluss.TablePath(db_name, "my_table") + schema = fluss.Schema( + pa.schema([pa.field("id", pa.int32()), pa.field("name", pa.string())]) + ) + table_descriptor = fluss.TableDescriptor( + schema, + bucket_count=1, + properties={"table.replication.factor": "1"}, + ) + + await admin.create_table(table_path, table_descriptor, 
ignore_if_exists=False) + + # Create same table again without ignore flag + with pytest.raises(fluss.FlussError) as exc_info: + await admin.create_table(table_path, table_descriptor, ignore_if_exists=False) + assert exc_info.value.error_code == fluss.ErrorCode.TABLE_ALREADY_EXIST + + # With ignore flag should succeed + await admin.create_table(table_path, table_descriptor, ignore_if_exists=True) + + await admin.drop_table(table_path, ignore_if_not_exists=True) + await admin.drop_database(db_name, ignore_if_not_exists=True, cascade=True) + + +async def test_error_table_not_exist(admin): + """Test error handling for non-existent table operations.""" + table_path = fluss.TablePath("fluss", "py_no_such_table") + + # drop without ignore flag + with pytest.raises(fluss.FlussError) as exc_info: + await admin.drop_table(table_path, ignore_if_not_exists=False) + assert exc_info.value.error_code == fluss.ErrorCode.TABLE_NOT_EXIST + + # drop with ignore flag should succeed + await admin.drop_table(table_path, ignore_if_not_exists=True) + + +async def test_get_server_nodes(admin): + """Test get_server_nodes returns coordinator and tablet servers.""" + nodes = await admin.get_server_nodes() + + assert len(nodes) > 0, "Expected at least one server node" + + server_types = [n.server_type for n in nodes] + assert "CoordinatorServer" in server_types, "Expected a coordinator server" + assert "TabletServer" in server_types, "Expected at least one tablet server" + + for node in nodes: + assert node.host, "Server node host should not be empty" + assert node.port > 0, "Server node port should be > 0" + assert node.uid, "Server node uid should not be empty" + assert repr(node).startswith("ServerNode(") + + +async def test_error_table_not_partitioned(admin): + """Test error when calling partition operations on non-partitioned table.""" + db_name = "py_test_error_not_partitioned_db" + + await admin.drop_database(db_name, ignore_if_not_exists=True, cascade=True) + await 
admin.create_database(db_name, ignore_if_exists=True) + + table_path = fluss.TablePath(db_name, "non_partitioned_table") + schema = fluss.Schema( + pa.schema([pa.field("id", pa.int32()), pa.field("name", pa.string())]) + ) + table_descriptor = fluss.TableDescriptor( + schema, + bucket_count=1, + properties={"table.replication.factor": "1"}, + ) + + await admin.create_table(table_path, table_descriptor, ignore_if_exists=False) + + with pytest.raises(fluss.FlussError) as exc_info: + await admin.list_partition_infos(table_path) + assert ( + exc_info.value.error_code == fluss.ErrorCode.TABLE_NOT_PARTITIONED_EXCEPTION + ) + + await admin.drop_table(table_path, ignore_if_not_exists=True) + await admin.drop_database(db_name, ignore_if_not_exists=True, cascade=True) diff --git a/fluss-rust/bindings/python/test/test_context_manager.py b/fluss-rust/bindings/python/test/test_context_manager.py new file mode 100644 index 0000000000..5dcb5a4c31 --- /dev/null +++ b/fluss-rust/bindings/python/test/test_context_manager.py @@ -0,0 +1,116 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import pytest +import pyarrow as pa +import time +import fluss + +async def _poll_records(scanner, expected_count, timeout_s=10): + """Poll a record-based scanner until expected_count records are collected.""" + collected = [] + deadline = time.monotonic() + timeout_s + while len(collected) < expected_count and time.monotonic() < deadline: + records = await scanner.poll(5000) + collected.extend(records) + return collected + +@pytest.mark.asyncio +async def test_connection_context_manager(plaintext_bootstrap_servers): + config = fluss.Config({"bootstrap.servers": plaintext_bootstrap_servers}) + async with await fluss.FlussConnection.create(config) as conn: + admin = conn.get_admin() + nodes = await admin.get_server_nodes() + assert len(nodes) > 0 + + +@pytest.mark.asyncio +async def test_append_writer_success_flush(connection, admin): + table_path = fluss.TablePath("fluss", "test_append_ctx_success") + await admin.drop_table(table_path, ignore_if_not_exists=True) + + schema = fluss.Schema(pa.schema([pa.field("a", pa.int32())])) + await admin.create_table(table_path, fluss.TableDescriptor(schema)) + + table = await connection.get_table(table_path) + + async with table.new_append().create_writer() as writer: + writer.append({"a": 1}) + writer.append({"a": 2}) + # No explicit flush here + + # After context exit, data should be flushed + scanner = await table.new_scan().create_log_scanner() + scanner.subscribe(0, fluss.EARLIEST_OFFSET) + records = await _poll_records(scanner, expected_count=2) + assert len(records) == 2 + assert sorted([r.row["a"] for r in records]) == [1, 2] + +@pytest.mark.asyncio +async def test_connection_drain_on_close(plaintext_bootstrap_servers, admin): + table_path = fluss.TablePath("fluss", "test_conn_drain") + await admin.drop_table(table_path, ignore_if_not_exists=True) + schema = fluss.Schema(pa.schema([pa.field("a", pa.int32())])) + await admin.create_table(table_path, fluss.TableDescriptor(schema)) + + config = 
fluss.Config({"bootstrap.servers": plaintext_bootstrap_servers}) + async with await fluss.FlussConnection.create(config) as conn: + table = await conn.get_table(table_path) + writer = table.new_append().create_writer() + writer.append({"a": 123}) + # No explicit flush, no writer context exit. + # Rely on connection.__aexit__ -> close() to drain. + + # Re-connect with a new connection to verify data arrived + async with await fluss.FlussConnection.create(config) as conn2: + table2 = await conn2.get_table(table_path) + scanner = await table2.new_scan().create_log_scanner() + scanner.subscribe(0, fluss.EARLIEST_OFFSET) + records = await _poll_records(scanner, expected_count=1) + assert len(records) == 1 + assert records[0].row["a"] == 123 + +@pytest.mark.asyncio +async def test_upsert_writer_context_manager(connection, admin): + table_path = fluss.TablePath("fluss", "test_upsert_ctx") + await admin.drop_table(table_path, ignore_if_not_exists=True) + + schema = fluss.Schema(pa.schema([pa.field("id", pa.int32()), pa.field("v", pa.string())]), primary_keys=["id"]) + await admin.create_table(table_path, fluss.TableDescriptor(schema)) + + table = await connection.get_table(table_path) + + # Success path: verify it flushes + async with table.new_upsert().create_writer() as writer: + writer.upsert({"id": 1, "v": "a"}) + + lookuper = table.new_lookup().create_lookuper() + res = await lookuper.lookup({"id": 1}) + assert res is not None + assert res["v"] == "a" + +@pytest.mark.asyncio +async def test_connection_context_manager_exception(plaintext_bootstrap_servers): + config = fluss.Config({"bootstrap.servers": plaintext_bootstrap_servers}) + class TestException(Exception): pass + + try: + async with await fluss.FlussConnection.create(config) as conn: + raise TestException("connection error") + except TestException: + pass + # If we reach here without hanging, the connection __aexit__ gracefully handled the error \ No newline at end of file diff --git 
a/fluss-rust/bindings/python/test/test_kv_table.py b/fluss-rust/bindings/python/test/test_kv_table.py new file mode 100644 index 0000000000..f3cddf8c3d --- /dev/null +++ b/fluss-rust/bindings/python/test/test_kv_table.py @@ -0,0 +1,720 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Integration tests for KV (primary key) table operations. + +Mirrors the Rust integration tests in crates/fluss/tests/integration/kv_table.rs. 
+""" + +import math +from datetime import date, datetime, timezone +from datetime import time as dt_time +from decimal import Decimal + +import pyarrow as pa +import pytest + +import fluss + + +async def _upsert_and_wait(writer, row): + handle = writer.upsert(row) + await handle.wait() + + +def _assert_float_specials(values): + assert math.isnan(values[0]) + assert math.isinf(values[1]) and values[1] > 0 + assert math.isinf(values[2]) and values[2] < 0 + + +async def test_upsert_delete_and_lookup(connection, admin): + """Test upsert, lookup, update, delete, and non-existent key lookup.""" + table_path = fluss.TablePath("fluss", "py_test_upsert_and_lookup") + await admin.drop_table(table_path, ignore_if_not_exists=True) + + schema = fluss.Schema( + pa.schema( + [ + pa.field("id", pa.int32()), + pa.field("name", pa.string()), + pa.field("age", pa.int64()), + ] + ), + primary_keys=["id"], + ) + table_descriptor = fluss.TableDescriptor(schema) + await admin.create_table(table_path, table_descriptor, ignore_if_exists=False) + + table = await connection.get_table(table_path) + upsert_writer = table.new_upsert().create_writer() + + test_data = [(1, "Verso", 32), (2, "Noco", 25), (3, "Esquie", 35)] + + # Upsert rows (fire-and-forget, then flush) + for id_, name, age in test_data: + upsert_writer.upsert({"id": id_, "name": name, "age": age}) + await upsert_writer.flush() + + # Lookup and verify + lookuper = table.new_lookup().create_lookuper() + + for id_, expected_name, expected_age in test_data: + result = await lookuper.lookup({"id": id_}) + assert result is not None, f"Row with id={id_} should exist" + assert result["id"] == id_ + assert result["name"] == expected_name + assert result["age"] == expected_age + + # Update record with id=1 (await acknowledgment) + handle = upsert_writer.upsert({"id": 1, "name": "Verso", "age": 33}) + await handle.wait() + + result = await lookuper.lookup({"id": 1}) + assert result is not None + assert result["age"] == 33 + assert 
async def test_composite_primary_keys(connection, admin):
    """Exercise a three-column primary key: point lookups, an update, and prefix lookups.

    The table is bucketed on (region, user_id); the same two columns are used
    as the prefix passed to ``lookup_by``.
    """
    table_path = fluss.TablePath("fluss", "py_test_composite_pk")
    await admin.drop_table(table_path, ignore_if_not_exists=True)

    # The non-key "score" column sits between the key columns on purpose, so
    # the primary key positions in the schema are not contiguous.
    arrow_fields = [
        pa.field("region", pa.string()),
        pa.field("score", pa.int64()),
        pa.field("user_id", pa.int32()),
        pa.field("event_id", pa.int64()),
    ]
    schema = fluss.Schema(
        pa.schema(arrow_fields),
        primary_keys=["region", "user_id", "event_id"],
    )
    descriptor = fluss.TableDescriptor(
        schema, bucket_count=3, bucket_keys=["region", "user_id"]
    )
    await admin.create_table(table_path, descriptor, ignore_if_exists=False)

    table = await connection.get_table(table_path)
    writer = table.new_upsert().create_writer()

    # Each seed tuple is laid out in this column order.
    column_order = ("region", "user_id", "event_id", "score")
    seed_rows = [
        ("US", 1, 1, 100),
        ("US", 1, 2, 200),
        ("US", 2, 1, 300),
        ("EU", 1, 1, 150),
        ("EU", 2, 1, 250),
    ]
    for values in seed_rows:
        writer.upsert(dict(zip(column_order, values)))
    await writer.flush()

    lookuper = table.new_lookup().create_lookuper()

    async def score_of(region, user_id, event_id):
        """Look up one full key, require a hit, and return its score."""
        row = await lookuper.lookup(
            {"region": region, "user_id": user_id, "event_id": event_id}
        )
        assert row is not None
        return row["score"]

    # Point lookups on the full composite key.
    assert await score_of("US", 1, 1) == 100
    assert await score_of("EU", 2, 1) == 250

    # Overwrite (US, 1, 1) and wait for the write to be acknowledged before
    # reading it back.
    ack = writer.upsert({"region": "US", "user_id": 1, "event_id": 1, "score": 500})
    await ack.wait()
    assert await score_of("US", 1, 1) == 500

    prefix_lookuper = (
        table.new_lookup().lookup_by(["region", "user_id"]).create_lookuper()
    )

    # Prefix (US, 1) covers event_id 1 and 2.
    matches = await prefix_lookuper.lookup({"region": "US", "user_id": 1})
    assert len(matches) == 2
    assert sorted([match["event_id"] for match in matches]) == [1, 2]

    # The prefix may also be given positionally, as a list or a tuple.
    matches = await prefix_lookuper.lookup(["US", 1])
    assert len(matches) == 2
    matches = await prefix_lookuper.lookup(("EU", 2))
    assert len(matches) == 1
    assert matches[0]["event_id"] == 1

    # A well-formed prefix with no matching rows yields an empty list.
    matches = await prefix_lookuper.lookup({"region": "APAC", "user_id": 999})
    assert matches == []

    await admin.drop_table(table_path, ignore_if_not_exists=False)
async def test_partial_update_by_index(connection, admin):
    """Update a subset of columns, selected by position, via partial_update_by_index."""
    table_path = fluss.TablePath("fluss", "py_test_partial_update_by_index")
    await admin.drop_table(table_path, ignore_if_not_exists=True)

    column_types = [
        ("id", pa.int32()),
        ("name", pa.string()),
        ("age", pa.int64()),
        ("score", pa.int64()),
    ]
    schema = fluss.Schema(
        pa.schema([pa.field(name, dtype) for name, dtype in column_types]),
        primary_keys=["id"],
    )
    descriptor = fluss.TableDescriptor(schema)
    await admin.create_table(table_path, descriptor, ignore_if_exists=False)

    table = await connection.get_table(table_path)

    # Seed a complete row with a regular (all-column) writer.
    full_writer = table.new_upsert().create_writer()
    ack = full_writer.upsert({"id": 1, "name": "Verso", "age": 32, "score": 6942})
    await ack.wait()

    # Target only column 0 (id, the primary key) and column 1 (name); the row
    # is then supplied positionally in that same order.
    index_writer = table.new_upsert().partial_update_by_index([0, 1]).create_writer()
    ack = index_writer.upsert([1, "Verso Renamed"])
    await ack.wait()

    lookuper = table.new_lookup().create_lookuper()
    row = await lookuper.lookup({"id": 1})
    assert row is not None
    assert row["name"] == "Verso Renamed", "name should be updated"
    assert row["score"] == 6942, "score should remain unchanged"

    await admin.drop_table(table_path, ignore_if_not_exists=False)
fluss.TableDescriptor( + schema, + partition_keys=["region"], + ) + await admin.create_table(table_path, table_descriptor, ignore_if_exists=False) + + # Create partitions + for region in ["US", "EU", "APAC"]: + await admin.create_partition( + table_path, {"region": region}, ignore_if_exists=True + ) + + table = await connection.get_table(table_path) + upsert_writer = table.new_upsert().create_writer() + + test_data = [ + ("US", 1, "Gustave", 100), + ("US", 2, "Lune", 200), + ("EU", 1, "Sciel", 150), + ("EU", 2, "Maelle", 250), + ("APAC", 1, "Noco", 300), + ] + + for region, user_id, name, score in test_data: + upsert_writer.upsert( + {"region": region, "user_id": user_id, "name": name, "score": score} + ) + await upsert_writer.flush() + + lookuper = table.new_lookup().create_lookuper() + + # Verify all rows across partitions + for region, user_id, expected_name, expected_score in test_data: + result = await lookuper.lookup({"region": region, "user_id": user_id}) + assert result is not None, f"Row ({region}, {user_id}) should exist" + assert result["region"] == region + assert result["user_id"] == user_id + assert result["name"] == expected_name + assert result["score"] == expected_score + + # Update within a partition (await acknowledgment) + handle = upsert_writer.upsert( + {"region": "US", "user_id": 1, "name": "Gustave Updated", "score": 999} + ) + await handle.wait() + + result = await lookuper.lookup({"region": "US", "user_id": 1}) + assert result is not None + assert result["name"] == "Gustave Updated" + assert result["score"] == 999 + + # Lookup in non-existent partition should return None + result = await lookuper.lookup({"region": "UNKNOWN_REGION", "user_id": 1}) + assert result is None, "Lookup in non-existent partition should return None" + + # Delete within a partition (await acknowledgment) + handle = upsert_writer.delete({"region": "EU", "user_id": 1}) + await handle.wait() + + result = await lookuper.lookup({"region": "EU", "user_id": 1}) + assert 
async def test_upsert_and_lookup_with_array(connection, admin):
    """Round-trip flat, nested, empty and null array values through a KV table."""
    table_path = fluss.TablePath("fluss", "py_test_kv_arrays")
    await admin.drop_table(table_path, ignore_if_not_exists=True)

    arrow_fields = [
        pa.field("id", pa.int32()),
        pa.field("tags", pa.list_(pa.string())),
        pa.field("scores", pa.list_(pa.int32())),
        pa.field("matrix", pa.list_(pa.list_(pa.int32()))),
    ]
    schema = fluss.Schema(pa.schema(arrow_fields), primary_keys=["id"])
    descriptor = fluss.TableDescriptor(schema)
    await admin.create_table(table_path, descriptor, ignore_if_exists=False)

    table = await connection.get_table(table_path)
    writer = table.new_upsert().create_writer()

    # Each row mixes a different combination of populated arrays, empty
    # arrays, null arrays, and null elements (including inside nested lists).
    rows = [
        {
            "id": 1,
            "tags": ["hello", "world"],
            "scores": [10, 20, 30],
            "matrix": [[1, 2], [3, 4]],
        },
        {"id": 2, "tags": [None], "scores": [], "matrix": None},
        {"id": 3, "tags": None, "scores": [42], "matrix": [[], [5], [6, 7, 8]]},
        {"id": 4, "tags": None, "scores": None, "matrix": [[1, None], None, []]},
    ]
    for row in rows:
        await _upsert_and_wait(writer, row)

    lookuper = table.new_lookup().create_lookuper()

    # Read every row back and compare the three array columns with what was
    # written; null columns are checked by identity, everything else by value.
    for written in rows:
        stored = await lookuper.lookup({"id": written["id"]})
        assert stored is not None
        for column in ("tags", "scores", "matrix"):
            if written[column] is None:
                assert stored[column] is None
            else:
                assert stored[column] == written[column]

    await admin.drop_table(table_path, ignore_if_not_exists=False)
+ pa.field("arr_binary", pa.list_(pa.binary())), + ] + ), + primary_keys=["id"], + ) + table_descriptor = fluss.TableDescriptor(schema) + await admin.create_table(table_path, table_descriptor, ignore_if_exists=False) + + table = await connection.get_table(table_path) + upsert_writer = table.new_upsert().create_writer() + + await _upsert_and_wait( + upsert_writer, + { + "id": 1, + "arr_bytes": [b"\x10\x20\x30", None], + "arr_date": [date(2026, 1, 23), None], + "arr_time": [dt_time(10, 13, 47, 123000), None], + "arr_ts_ntz": [datetime(2026, 1, 23, 10, 13, 47, 123000)], + "arr_ts_ltz": [ + datetime(2026, 1, 23, 10, 13, 47, 123000, tzinfo=timezone.utc) + ], + "arr_decimal": [Decimal("123.45"), None], + "arr_long_str": [ + "abcdefgh", + "this is a much longer string that definitely exceeds inline", + ], + "arr_big_decimal": [ + Decimal("12345678901234567.12345"), + Decimal("-99999999999999999.99999"), + ], + "arr_ts_nano": [datetime(2026, 1, 23, 10, 13, 47, 123456)], + "arr_float": [float("nan"), float("inf"), float("-inf")], + "arr_double": [float("nan"), float("inf"), float("-inf")], + "arr_binary": [b"\xde\xad\xbe\xef", b"\x00\x01\x02\x03"], + }, + ) + + lookuper = table.new_lookup().create_lookuper() + result = await lookuper.lookup({"id": 1}) + assert result is not None + + assert result["arr_bytes"] == [b"\x10\x20\x30", None] + assert result["arr_date"] == [date(2026, 1, 23), None] + assert result["arr_time"] == [dt_time(10, 13, 47, 123000), None] + assert result["arr_ts_ntz"] == [datetime(2026, 1, 23, 10, 13, 47, 123000)] + assert result["arr_ts_ltz"] == [ + datetime(2026, 1, 23, 10, 13, 47, 123000, tzinfo=timezone.utc) + ] + assert result["arr_decimal"] == [Decimal("123.45"), None] + assert result["arr_long_str"] == [ + "abcdefgh", + "this is a much longer string that definitely exceeds inline", + ] + assert result["arr_big_decimal"] == [ + Decimal("12345678901234567.12345"), + Decimal("-99999999999999999.99999"), + ] + assert result["arr_ts_nano"] == 
async def test_all_supported_datatypes(connection, admin):
    """Round-trip one fully populated row and one all-null row through a KV table.

    The table has one column per data type listed below. The second row sets
    every column except the primary key to None.
    """
    table_path = fluss.TablePath("fluss", "py_test_kv_all_datatypes")
    await admin.drop_table(table_path, ignore_if_not_exists=True)

    column_types = [
        ("pk_int", pa.int32()),
        ("col_boolean", pa.bool_()),
        ("col_tinyint", pa.int8()),
        ("col_smallint", pa.int16()),
        ("col_int", pa.int32()),
        ("col_bigint", pa.int64()),
        ("col_float", pa.float32()),
        ("col_double", pa.float64()),
        ("col_string", pa.string()),
        ("col_decimal", pa.decimal128(10, 2)),
        ("col_date", pa.date32()),
        ("col_time", pa.time32("ms")),
        ("col_timestamp_ntz", pa.timestamp("us")),
        ("col_timestamp_ltz", pa.timestamp("us", tz="UTC")),
        ("col_bytes", pa.binary()),
        ("col_array", pa.list_(pa.string())),
        ("col_binary", pa.binary(16)),
    ]
    schema = fluss.Schema(
        pa.schema([pa.field(name, dtype) for name, dtype in column_types]),
        primary_keys=["pk_int"],
    )
    descriptor = fluss.TableDescriptor(schema)
    await admin.create_table(table_path, descriptor, ignore_if_exists=False)

    table = await connection.get_table(table_path)
    writer = table.new_upsert().create_writer()

    # Both timestamp columns are written with the same naive datetime; the
    # time value carries millisecond precision.
    moment = datetime(2026, 1, 23, 10, 13, 47, 123000)
    populated = {
        "pk_int": 1,
        "col_boolean": True,
        "col_tinyint": 127,
        "col_smallint": 32767,
        "col_int": 2147483647,
        "col_bigint": 9223372036854775807,
        "col_float": 3.14,
        "col_double": 2.718281828459045,
        "col_string": "world of fluss python client",
        "col_decimal": Decimal("123.45"),
        "col_date": date(2026, 1, 23),
        "col_time": dt_time(10, 13, 47, 123000),
        "col_timestamp_ntz": moment,
        "col_timestamp_ltz": moment,
        "col_bytes": b"binary data",
        "col_array": ["fluss", "python"],
        "col_binary": b"binary_data_0123",
    }
    await _upsert_and_wait(writer, populated)

    lookuper = table.new_lookup().create_lookuper()
    stored = await lookuper.lookup({"pk_int": 1})
    assert stored is not None, "Row should exist"

    assert stored["pk_int"] == 1
    assert stored["col_boolean"] is True
    for column, wanted in (
        ("col_tinyint", 127),
        ("col_smallint", 32767),
        ("col_int", 2147483647),
        ("col_bigint", 9223372036854775807),
    ):
        assert stored[column] == wanted
    # Floating-point columns are compared with a tolerance rather than exactly.
    assert math.isclose(stored["col_float"], 3.14, rel_tol=1e-6)
    assert math.isclose(stored["col_double"], 2.718281828459045, rel_tol=1e-15)
    for column, wanted in (
        ("col_string", "world of fluss python client"),
        ("col_decimal", Decimal("123.45")),
        ("col_date", date(2026, 1, 23)),
        ("col_time", dt_time(10, 13, 47, 123000)),
        ("col_timestamp_ntz", moment),
        # The ltz column is expected back as a UTC-aware datetime.
        ("col_timestamp_ltz", moment.replace(tzinfo=timezone.utc)),
        ("col_bytes", b"binary data"),
        ("col_array", ["fluss", "python"]),
        ("col_binary", b"binary_data_0123"),
    ):
        assert stored[column] == wanted

    # Second row: same columns, but everything except the key is null.
    nullable_columns = [name for name in populated if name != "pk_int"]
    all_nulls = {"pk_int": 2, **dict.fromkeys(nullable_columns)}
    await _upsert_and_wait(writer, all_nulls)

    stored = await lookuper.lookup({"pk_int": 2})
    assert stored is not None, "Row with nulls should exist"
    assert stored["pk_int"] == 2
    for column in nullable_columns:
        assert stored[column] is None, f"{column} should be null"

    await admin.drop_table(table_path, ignore_if_not_exists=False)
"""Test that prefix lookup raises errors for invalid column configurations.""" + table_path = fluss.TablePath("fluss", "py_test_prefix_lookup_validation") + await admin.drop_table(table_path, ignore_if_not_exists=True) + + schema = fluss.Schema( + pa.schema( + [ + pa.field("a", pa.int32()), + pa.field("b", pa.string()), + pa.field("c", pa.int64()), + ] + ), + primary_keys=["a", "b", "c"], + ) + table_descriptor = fluss.TableDescriptor( + schema, bucket_count=3, bucket_keys=["a", "b"] + ) + await admin.create_table(table_path, table_descriptor, ignore_if_exists=False) + + table = await connection.get_table(table_path) + + # lookup_by with columns equal to full PK should error + with pytest.raises(fluss.FlussError, match="prefix lookup"): + table.new_lookup().lookup_by(["a", "b", "c"]).create_lookuper() + + # lookup_by with wrong column names should error + with pytest.raises(fluss.FlussError, match="bucket keys"): + table.new_lookup().lookup_by(["a", "c"]).create_lookuper() + + # lookup_by with unknown column should error + with pytest.raises(fluss.FlussError, match="Unknown column name"): + table.new_lookup().lookup_by(["a", "missing_col"]).create_lookuper() + + await admin.drop_table(table_path, ignore_if_not_exists=False) + + # Partitioned table: lookup columns must include partition keys first, + # followed by bucket keys. 
+ partitioned_table_path = fluss.TablePath("fluss", "py_test_prefix_lookup_validation_pt") + await admin.drop_table(partitioned_table_path, ignore_if_not_exists=True) + + partitioned_schema = fluss.Schema( + pa.schema( + [ + pa.field("region", pa.string()), + pa.field("user_id", pa.int32()), + pa.field("event_id", pa.int64()), + ] + ), + primary_keys=["region", "user_id", "event_id"], + ) + partitioned_table_descriptor = fluss.TableDescriptor( + partitioned_schema, + partition_keys=["region"], + bucket_count=3, + bucket_keys=["user_id"], + ) + await admin.create_table( + partitioned_table_path, partitioned_table_descriptor, ignore_if_exists=False + ) + + partitioned_table = await connection.get_table(partitioned_table_path) + + # Missing partition key in lookup columns. + with pytest.raises(fluss.FlussError, match="partition fields"): + partitioned_table.new_lookup().lookup_by(["user_id"]).create_lookuper() + + # A non-existent partition returns empty list. + partitioned_prefix_lookuper = ( + partitioned_table.new_lookup().lookup_by(["region", "user_id"]).create_lookuper() + ) + rows = await partitioned_prefix_lookuper.lookup({"region": "UNKNOWN_REGION", "user_id": 1}) + assert rows == [] + + # After partition keys, remaining columns must equal bucket keys. + with pytest.raises(fluss.FlussError, match="bucket keys"): + partitioned_table.new_lookup().lookup_by(["region", "event_id"]).create_lookuper() + + await admin.drop_table(partitioned_table_path, ignore_if_not_exists=False) diff --git a/fluss-rust/bindings/python/test/test_log_table.py b/fluss-rust/bindings/python/test/test_log_table.py new file mode 100644 index 0000000000..50b9078bcb --- /dev/null +++ b/fluss-rust/bindings/python/test/test_log_table.py @@ -0,0 +1,1452 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Integration tests for log (append-only) table operations. + +Mirrors the Rust integration tests in crates/fluss/tests/integration/log_table.rs. +""" + +import asyncio +import time + +import pyarrow as pa +import pytest + +import fluss + + +async def test_append_and_scan(connection, admin): + """Test appending record batches and scanning with a record-based scanner.""" + table_path = fluss.TablePath("fluss", "py_test_append_and_scan") + await admin.drop_table(table_path, ignore_if_not_exists=True) + + schema = fluss.Schema( + pa.schema([pa.field("c1", pa.int32()), pa.field("c2", pa.string())]) + ) + table_descriptor = fluss.TableDescriptor( + schema, bucket_count=3, bucket_keys=["c1"] + ) + await admin.create_table(table_path, table_descriptor, ignore_if_exists=False) + + table = await connection.get_table(table_path) + append_writer = table.new_append().create_writer() + + batch1 = pa.RecordBatch.from_arrays( + [pa.array([1, 2, 3], type=pa.int32()), pa.array(["a1", "a2", "a3"])], + schema=pa.schema([pa.field("c1", pa.int32()), pa.field("c2", pa.string())]), + ) + append_writer.write_arrow_batch(batch1) + + batch2 = pa.RecordBatch.from_arrays( + [pa.array([4, 5, 6], type=pa.int32()), pa.array(["a4", "a5", "a6"])], + schema=pa.schema([pa.field("c1", pa.int32()), pa.field("c2", pa.string())]), + ) + append_writer.write_arrow_batch(batch2) + + await append_writer.flush() + + # Scan with 
async def test_append_dict_rows(connection, admin):
    """Append rows given as dicts and as a positional list, then scan them back."""
    table_path = fluss.TablePath("fluss", "py_test_append_dict_rows")
    await admin.drop_table(table_path, ignore_if_not_exists=True)

    arrow_fields = [pa.field("id", pa.int32()), pa.field("name", pa.string())]
    schema = fluss.Schema(pa.schema(arrow_fields))
    descriptor = fluss.TableDescriptor(schema)
    await admin.create_table(table_path, descriptor, ignore_if_exists=False)

    table = await connection.get_table(table_path)
    writer = table.new_append().create_writer()

    # The first two rows are passed as dicts, the last one as a plain list.
    payloads = [
        {"id": 1, "name": "Alice"},
        {"id": 2, "name": "Bob"},
        [3, "Charlie"],
    ]
    for payload in payloads:
        writer.append(payload)
    await writer.flush()

    scanner = await table.new_scan().create_log_scanner()
    table_info = await admin.get_table_info(table_path)
    # Read every bucket from its earliest offset.
    scanner.subscribe_buckets(
        dict.fromkeys(range(table_info.num_buckets), fluss.EARLIEST_OFFSET)
    )

    records = await _poll_records(scanner, expected_count=3)
    assert len(records) == 3

    # Sort by id so the comparison does not depend on arrival order.
    rows = sorted((record.row for record in records), key=lambda row: row["id"])
    expected_rows = [
        {"id": 1, "name": "Alice"},
        {"id": 2, "name": "Bob"},
        {"id": 3, "name": "Charlie"},
    ]
    for actual, wanted in zip(rows, expected_rows):
        assert actual == wanted

    await admin.drop_table(table_path, ignore_if_not_exists=False)
async def test_project(connection, admin):
    """Scan with a column projection, once by column name and once by index."""
    table_path = fluss.TablePath("fluss", "py_test_project")
    await admin.drop_table(table_path, ignore_if_not_exists=True)

    def three_column_schema():
        """Build the (col_a, col_b, col_c) Arrow schema used by table and batch."""
        return pa.schema(
            [
                pa.field("col_a", pa.int32()),
                pa.field("col_b", pa.string()),
                pa.field("col_c", pa.int32()),
            ]
        )

    schema = fluss.Schema(three_column_schema())
    descriptor = fluss.TableDescriptor(schema)
    await admin.create_table(table_path, descriptor, ignore_if_exists=False)

    table = await connection.get_table(table_path)
    writer = table.new_append().create_writer()

    values_a = [1, 2, 3]
    values_b = ["x", "y", "z"]
    values_c = [10, 20, 30]
    writer.write_arrow_batch(
        pa.RecordBatch.from_arrays(
            [
                pa.array([1, 2, 3], type=pa.int32()),
                pa.array(["x", "y", "z"]),
                pa.array([10, 20, 30], type=pa.int32()),
            ],
            schema=three_column_schema(),
        )
    )
    await writer.flush()

    # Projection by name: keep col_b and col_c, drop col_a.
    by_name_scanner = await (
        table.new_scan().project_by_name(["col_b", "col_c"]).create_log_scanner()
    )
    by_name_scanner.subscribe_buckets({0: 0})

    by_name = await _poll_records(by_name_scanner, expected_count=3)
    assert len(by_name) == 3

    by_name.sort(key=lambda rec: rec.row["col_c"])
    for rec, want_b, want_c in zip(by_name, values_b, values_c):
        assert rec.row["col_b"] == want_b
        assert rec.row["col_c"] == want_c
        # The unprojected column must be absent from the row.
        assert "col_a" not in rec.row

    # Projection by index: [1, 0] selects (col_b, col_a), dropping col_c.
    by_index_scanner = await table.new_scan().project([1, 0]).create_log_scanner()
    by_index_scanner.subscribe_buckets({0: 0})

    by_index = await _poll_records(by_index_scanner, expected_count=3)
    assert len(by_index) == 3

    by_index.sort(key=lambda rec: rec.row["col_a"])
    for rec, want_b, want_a in zip(by_index, values_b, values_a):
        assert rec.row["col_b"] == want_b
        assert rec.row["col_a"] == want_a
        assert "col_c" not in rec.row

    await admin.drop_table(table_path, ignore_if_not_exists=False)
async def test_to_arrow_and_to_pandas(connection, admin):
    """Check the to_arrow() and to_pandas() helpers of the record-batch scanner."""
    table_path = fluss.TablePath("fluss", "py_test_to_arrow_pandas")
    await admin.drop_table(table_path, ignore_if_not_exists=True)

    schema = fluss.Schema(
        pa.schema([pa.field("id", pa.int32()), pa.field("name", pa.string())])
    )
    descriptor = fluss.TableDescriptor(schema)
    await admin.create_table(table_path, descriptor, ignore_if_exists=False)

    table = await connection.get_table(table_path)
    writer = table.new_append().create_writer()

    # Write a single three-row batch.
    pa_schema = pa.schema([pa.field("id", pa.int32()), pa.field("name", pa.string())])
    three_rows = pa.RecordBatch.from_arrays(
        [pa.array([1, 2, 3], type=pa.int32()), pa.array(["a", "b", "c"])],
        schema=pa_schema,
    )
    writer.write_arrow_batch(three_rows)
    await writer.flush()

    bucket_count = (await admin.get_table_info(table_path)).num_buckets

    async def scanner_from_earliest():
        """Create a new batch scanner subscribed to all buckets from the start."""
        fresh = await table.new_scan().create_record_batch_log_scanner()
        fresh.subscribe_buckets(
            dict.fromkeys(range(bucket_count), fluss.EARLIEST_OFFSET)
        )
        return fresh

    expected_columns = ["id", "name"]

    # to_arrow(): a table holding all three rows.
    arrow_table = await (await scanner_from_earliest()).to_arrow()
    assert arrow_table.num_rows == 3
    assert arrow_table.schema.names == expected_columns

    # to_pandas(): the same data via a separate scanner.
    frame = await (await scanner_from_earliest()).to_pandas()
    assert len(frame) == 3
    assert list(frame.columns) == expected_columns

    await admin.drop_table(table_path, ignore_if_not_exists=False)
scanner.subscribe_buckets({i: fluss.EARLIEST_OFFSET for i in range(num_buckets)}) + + # to_arrow_batch_reader() is a blocking/sync API; run in a thread to + # avoid starving the asyncio event loop (see docstring warning). + def _read_all(): + reader = scanner.to_arrow_batch_reader() + assert isinstance(reader, pa.RecordBatchReader) + assert reader.schema == pa_schema + + batches = list(reader) + total_rows = sum(b.num_rows for b in batches) + assert total_rows == 3 + + result_table = pa.Table.from_batches(batches, schema=pa_schema) + assert result_table.column("id").to_pylist() == [10, 20, 30] + assert result_table.column("name").to_pylist() == ["x", "y", "z"] + + await asyncio.to_thread(_read_all) + + await admin.drop_table(table_path, ignore_if_not_exists=False) + + +async def test_to_arrow_batch_reader_drop_and_guard(connection, admin): + """Test reader-active guard and Drop cleanup on mid-iteration drop.""" + table_path = fluss.TablePath("fluss", "py_test_batch_reader_drop_guard") + await admin.drop_table(table_path, ignore_if_not_exists=True) + + schema = fluss.Schema( + pa.schema([pa.field("id", pa.int32()), pa.field("name", pa.string())]) + ) + table_descriptor = fluss.TableDescriptor(schema) + await admin.create_table(table_path, table_descriptor, ignore_if_exists=False) + + table = await connection.get_table(table_path) + writer = table.new_append().create_writer() + + pa_schema = pa.schema([pa.field("id", pa.int32()), pa.field("name", pa.string())]) + # Write multiple separate flushes so the server stores multiple log + # batches per bucket. This makes it likely that the reader's first poll + # only drains a subset, leaving real work for the Drop cleanup loop. 
+ num_flushes = 10 + rows_per_flush = 200 + total_rows = num_flushes * rows_per_flush + for f in range(num_flushes): + start = f * rows_per_flush + writer.write_arrow_batch( + pa.RecordBatch.from_arrays( + [ + pa.array( + list(range(start, start + rows_per_flush)), type=pa.int32() + ), + pa.array( + [f"row_{i}" for i in range(start, start + rows_per_flush)] + ), + ], + schema=pa_schema, + ) + ) + await writer.flush() + + num_buckets = (await admin.get_table_info(table_path)).num_buckets + + scanner = await table.new_scan().create_record_batch_log_scanner() + scanner.subscribe_buckets({i: fluss.EARLIEST_OFFSET for i in range(num_buckets)}) + + # to_arrow_batch_reader() is a blocking/sync API; run all blocking + # interactions in a thread to avoid starving the asyncio event loop. + def _test_guard_and_drop(): + # --- Guard blocks subscribe / unsubscribe while reader is active --- + reader = scanner.to_arrow_batch_reader() + with pytest.raises(fluss.FlussError, match="RecordBatchLogReader is active"): + scanner.subscribe_buckets({0: fluss.EARLIEST_OFFSET}) + with pytest.raises(fluss.FlussError, match="RecordBatchLogReader is active"): + scanner.unsubscribe(0) + + # --- Drop mid-iteration: read one batch, then discard --- + first_batch = next(reader) + assert first_batch.num_rows > 0 + del reader + + # --- Drop unsubscribed leftover buckets: creating a reader without + # re-subscribing must fail with "No buckets subscribed" --- + with pytest.raises(fluss.FlussError, match="No buckets subscribed"): + scanner.to_arrow_batch_reader() + + # --- Guard cleared after drop: scanner is reusable from a fresh subscribe --- + scanner.subscribe_buckets( + {i: fluss.EARLIEST_OFFSET for i in range(num_buckets)} + ) + reader2 = scanner.to_arrow_batch_reader() + batches = list(reader2) + assert sum(b.num_rows for b in batches) == total_rows + + await asyncio.to_thread(_test_guard_and_drop) + + await admin.drop_table(table_path, ignore_if_not_exists=False) + + +async def 
test_partitioned_table_append_scan(connection, admin, wait_for_table_ready): + """Test append and scan on a partitioned log table.""" + table_path = fluss.TablePath("fluss", "py_test_partitioned_log_append") + await admin.drop_table(table_path, ignore_if_not_exists=True) + + schema = fluss.Schema( + pa.schema( + [ + pa.field("id", pa.int32()), + pa.field("region", pa.string()), + pa.field("value", pa.int64()), + ] + ) + ) + table_descriptor = fluss.TableDescriptor( + schema, + partition_keys=["region"], + ) + await admin.create_table(table_path, table_descriptor, ignore_if_exists=False) + + # Create partitions + for region in ["US", "EU"]: + await admin.create_partition( + table_path, {"region": region}, ignore_if_exists=True + ) + await wait_for_table_ready(table_path, partition_name=region) + table = await connection.get_table(table_path) + append_writer = table.new_append().create_writer() + + # Append rows + test_data = [ + (1, "US", 100), + (2, "US", 200), + (3, "EU", 300), + (4, "EU", 400), + ] + for id_, region, value in test_data: + append_writer.append({"id": id_, "region": region, "value": value}) + await append_writer.flush() + + # Append arrow batches per partition + pa_schema = pa.schema( + [ + pa.field("id", pa.int32()), + pa.field("region", pa.string()), + pa.field("value", pa.int64()), + ] + ) + us_batch = pa.RecordBatch.from_arrays( + [ + pa.array([5, 6], type=pa.int32()), + pa.array(["US", "US"]), + pa.array([500, 600], type=pa.int64()), + ], + schema=pa_schema, + ) + append_writer.write_arrow_batch(us_batch) + + eu_batch = pa.RecordBatch.from_arrays( + [ + pa.array([7, 8], type=pa.int32()), + pa.array(["EU", "EU"]), + pa.array([700, 800], type=pa.int64()), + ], + schema=pa_schema, + ) + append_writer.write_arrow_batch(eu_batch) + await append_writer.flush() + + # Verify partition offsets + us_offsets = await admin.list_partition_offsets( + table_path, + partition_name="US", + bucket_ids=[0], + offset_spec=fluss.OffsetSpec.latest(), + ) + assert 
us_offsets[0] == 4, "US partition should have 4 records" + + eu_offsets = await admin.list_partition_offsets( + table_path, + partition_name="EU", + bucket_ids=[0], + offset_spec=fluss.OffsetSpec.latest(), + ) + assert eu_offsets[0] == 4, "EU partition should have 4 records" + + # Scan all partitions + scanner = await table.new_scan().create_log_scanner() + partition_infos = await admin.list_partition_infos(table_path) + for p in partition_infos: + scanner.subscribe_partition( + partition_id=p.partition_id, bucket_id=0, start_offset=0 + ) + + expected = [ + (1, "US", 100), + (2, "US", 200), + (3, "EU", 300), + (4, "EU", 400), + (5, "US", 500), + (6, "US", 600), + (7, "EU", 700), + (8, "EU", 800), + ] + + # Poll and verify per-bucket grouping + all_records = [] + deadline = time.monotonic() + 10 + while len(all_records) < 8 and time.monotonic() < deadline: + scan_records = await scanner.poll(5000) + for bucket, bucket_records in scan_records.items(): + assert bucket.partition_id is not None, "Partitioned table should have partition_id" + # All records in a bucket should belong to the same partition + regions = {r.row["region"] for r in bucket_records} + assert len(regions) == 1, f"Bucket has mixed regions: {regions}" + all_records.extend(bucket_records) + + assert len(all_records) == 8 + + collected = sorted( + [(r.row["id"], r.row["region"], r.row["value"]) for r in all_records], + key=lambda x: x[0], + ) + assert collected == expected + + # Test unsubscribe_partition: unsubscribe from EU, only US data should remain + unsub_scanner = await table.new_scan().create_log_scanner() + eu_partition_id = next( + p.partition_id for p in partition_infos if p.partition_name == "EU" + ) + for p in partition_infos: + unsub_scanner.subscribe_partition(p.partition_id, 0, 0) + unsub_scanner.unsubscribe_partition(eu_partition_id, 0) + + remaining = await _poll_records(unsub_scanner, expected_count=4, timeout_s=5) + assert len(remaining) == 4 + assert all(r.row["region"] == "US" for 
r in remaining) + + # Test subscribe_partition_buckets (batch subscribe) + batch_scanner = await table.new_scan().create_log_scanner() + partition_bucket_offsets = { + (p.partition_id, 0): fluss.EARLIEST_OFFSET for p in partition_infos + } + batch_scanner.subscribe_partition_buckets(partition_bucket_offsets) + + batch_records = await _poll_records(batch_scanner, expected_count=8) + assert len(batch_records) == 8 + batch_collected = sorted( + [(r.row["id"], r.row["region"], r.row["value"]) for r in batch_records], + key=lambda x: x[0], + ) + assert batch_collected == expected + + await admin.drop_table(table_path, ignore_if_not_exists=False) + + +async def test_write_arrow(connection, admin): + """Test writing a full PyArrow Table via write_arrow().""" + table_path = fluss.TablePath("fluss", "py_test_write_arrow") + await admin.drop_table(table_path, ignore_if_not_exists=True) + + schema = fluss.Schema( + pa.schema([pa.field("id", pa.int32()), pa.field("name", pa.string())]) + ) + table_descriptor = fluss.TableDescriptor(schema) + await admin.create_table(table_path, table_descriptor, ignore_if_exists=False) + + table = await connection.get_table(table_path) + writer = table.new_append().create_writer() + + pa_schema = pa.schema([pa.field("id", pa.int32()), pa.field("name", pa.string())]) + arrow_table = pa.table( + { + "id": pa.array([1, 2, 3, 4, 5], type=pa.int32()), + "name": pa.array(["alice", "bob", "charlie", "dave", "eve"]), + }, + schema=pa_schema, + ) + writer.write_arrow(arrow_table) + await writer.flush() + + num_buckets = (await admin.get_table_info(table_path)).num_buckets + scanner = await table.new_scan().create_record_batch_log_scanner() + scanner.subscribe_buckets({i: fluss.EARLIEST_OFFSET for i in range(num_buckets)}) + + result = await scanner.to_arrow() + assert result.num_rows == 5 + + ids = sorted(result.column("id").to_pylist()) + names = [ + n + for _, n in sorted( + zip(result.column("id").to_pylist(), result.column("name").to_pylist()) + ) 
+ ] + assert ids == [1, 2, 3, 4, 5] + assert names == ["alice", "bob", "charlie", "dave", "eve"] + + await admin.drop_table(table_path, ignore_if_not_exists=False) + + +async def test_write_pandas(connection, admin): + """Test writing a Pandas DataFrame via write_pandas().""" + import pandas as pd + + table_path = fluss.TablePath("fluss", "py_test_write_pandas") + await admin.drop_table(table_path, ignore_if_not_exists=True) + + schema = fluss.Schema( + pa.schema([pa.field("id", pa.int32()), pa.field("name", pa.string())]) + ) + table_descriptor = fluss.TableDescriptor(schema) + await admin.create_table(table_path, table_descriptor, ignore_if_exists=False) + + table = await connection.get_table(table_path) + writer = table.new_append().create_writer() + + df = pd.DataFrame({"id": [10, 20, 30], "name": ["x", "y", "z"]}) + writer.write_pandas(df) + await writer.flush() + + num_buckets = (await admin.get_table_info(table_path)).num_buckets + scanner = await table.new_scan().create_record_batch_log_scanner() + scanner.subscribe_buckets({i: fluss.EARLIEST_OFFSET for i in range(num_buckets)}) + + result = await scanner.to_pandas() + assert len(result) == 3 + + result_sorted = result.sort_values("id").reset_index(drop=True) + assert result_sorted["id"].tolist() == [10, 20, 30] + assert result_sorted["name"].tolist() == ["x", "y", "z"] + + await admin.drop_table(table_path, ignore_if_not_exists=False) + + +async def test_partitioned_table_to_arrow(connection, admin, wait_for_table_ready): + """Test to_arrow() on partitioned tables.""" + table_path = fluss.TablePath("fluss", "py_test_partitioned_to_arrow") + await admin.drop_table(table_path, ignore_if_not_exists=True) + + schema = fluss.Schema( + pa.schema( + [ + pa.field("id", pa.int32()), + pa.field("region", pa.string()), + pa.field("value", pa.int64()), + ] + ) + ) + table_descriptor = fluss.TableDescriptor(schema, partition_keys=["region"]) + await admin.create_table(table_path, table_descriptor, 
ignore_if_exists=False) + + for region in ["US", "EU"]: + await admin.create_partition( + table_path, {"region": region}, ignore_if_exists=True + ) + await wait_for_table_ready(table_path, partition_name=region) + + table = await connection.get_table(table_path) + writer = table.new_append().create_writer() + writer.append({"id": 1, "region": "US", "value": 100}) + writer.append({"id": 2, "region": "EU", "value": 200}) + await writer.flush() + + scanner = await table.new_scan().create_record_batch_log_scanner() + partition_infos = await admin.list_partition_infos(table_path) + for p in partition_infos: + scanner.subscribe_partition(p.partition_id, 0, fluss.EARLIEST_OFFSET) + + arrow_table = await scanner.to_arrow() + assert arrow_table.num_rows == 2 + + await admin.drop_table(table_path, ignore_if_not_exists=False) + + +async def test_scan_records_indexing_and_slicing(connection, admin): + """Test ScanRecords indexing, slicing (incl. negative steps), and iteration consistency.""" + table_path = fluss.TablePath("fluss", "py_test_scan_records_indexing") + await admin.drop_table(table_path, ignore_if_not_exists=True) + + schema = fluss.Schema( + pa.schema([pa.field("id", pa.int32()), pa.field("val", pa.string())]) + ) + await admin.create_table(table_path, fluss.TableDescriptor(schema)) + + table = await connection.get_table(table_path) + writer = table.new_append().create_writer() + writer.write_arrow_batch( + pa.RecordBatch.from_arrays( + [pa.array(list(range(1, 9)), type=pa.int32()), + pa.array([f"v{i}" for i in range(1, 9)])], + schema=pa.schema([pa.field("id", pa.int32()), pa.field("val", pa.string())]), + ) + ) + await writer.flush() + + scanner = await table.new_scan().create_log_scanner() + num_buckets = (await admin.get_table_info(table_path)).num_buckets + scanner.subscribe_buckets({i: fluss.EARLIEST_OFFSET for i in range(num_buckets)}) + + # Poll until we get a non-empty ScanRecords (need ≥2 records for slice tests) + sr = None + deadline = time.monotonic() 
+ 10 + while time.monotonic() < deadline: + sr = await scanner.poll(5000) + if len(sr) >= 2: + break + assert sr is not None and len(sr) >= 2, "Expected at least 2 records" + n = len(sr) + offsets = [sr[i].offset for i in range(n)] + + # Iteration and indexing must produce the same order + assert [r.offset for r in sr] == offsets + + # Negative indexing + assert sr[-1].offset == offsets[-1] + assert sr[-n].offset == offsets[0] + + # Verify slices match the same operation on the offsets reference list + test_slices = [ + slice(1, n - 1), # forward subrange + slice(None, None, -1), # [::-1] full reverse + slice(n - 2, 0, -1), # reverse with bounds + slice(n - 1, 0, -2), # reverse with step + slice(None, None, 2), # [::2] + slice(1, None, 3), # [1::3] + slice(2, 2), # empty + ] + for s in test_slices: + result = [r.offset for r in sr[s]] + assert result == offsets[s], f"slice {s}: got {result}, expected {offsets[s]}" + + # Bucket-based indexing + for bucket in sr.buckets(): + assert len(sr[bucket]) > 0 + + await admin.drop_table(table_path, ignore_if_not_exists=False) + + +async def test_async_iterator(connection, admin): + """Test the Python asynchronous iterator loop (`async for`) on LogScanner.""" + table_path = fluss.TablePath("fluss", "py_test_async_iterator") + await admin.drop_table(table_path, ignore_if_not_exists=True) + + schema = fluss.Schema( + pa.schema([pa.field("id", pa.int32()), pa.field("val", pa.string())]) + ) + await admin.create_table(table_path, fluss.TableDescriptor(schema)) + + table = await connection.get_table(table_path) + writer = table.new_append().create_writer() + + # Write 5 records + writer.write_arrow_batch( + pa.RecordBatch.from_arrays( + [pa.array(list(range(1, 6)), type=pa.int32()), + pa.array([f"async{i}" for i in range(1, 6)])], + schema=pa.schema([pa.field("id", pa.int32()), pa.field("val", pa.string())]), + ) + ) + await writer.flush() + + scanner = await table.new_scan().create_log_scanner() + num_buckets = (await 
admin.get_table_info(table_path)).num_buckets + scanner.subscribe_buckets({i: fluss.EARLIEST_OFFSET for i in range(num_buckets)}) + + collected = [] + + # Here is the magical Issue #424 async iterator logic at work: + async def consume_scanner(): + async for record in scanner: + collected.append(record) + if len(collected) == 5: + break + + await consume_scanner() + + assert len(collected) == 5, f"Expected 5 records, got {len(collected)}" + + collected.sort(key=lambda r: r.row["id"]) + for i, record in enumerate(collected): + assert record.row["id"] == i + 1 + assert record.row["val"] == f"async{i + 1}" + + await admin.drop_table(table_path, ignore_if_not_exists=False) + + +async def test_async_iterator_break_no_leak(connection, admin): + """Verify that breaking out of `async for` does not leak resources. + + After breaking, the scanner must still be usable for synchronous + `poll()` calls. If the old implementation's tokio::spawn'd task + were still alive, it would hold the Mutex and cause `poll()` to + deadlock or error. 
+ """ + table_path = fluss.TablePath("fluss", "py_test_async_break_leak") + await admin.drop_table(table_path, ignore_if_not_exists=True) + + schema = fluss.Schema( + pa.schema([pa.field("id", pa.int32()), pa.field("val", pa.string())]) + ) + await admin.create_table(table_path, fluss.TableDescriptor(schema)) + + table = await connection.get_table(table_path) + writer = table.new_append().create_writer() + writer.write_arrow_batch( + pa.RecordBatch.from_arrays( + [ + pa.array(list(range(1, 11)), type=pa.int32()), + pa.array([f"v{i}" for i in range(1, 11)]), + ], + schema=pa.schema( + [pa.field("id", pa.int32()), pa.field("val", pa.string())] + ), + ) + ) + await writer.flush() + + scanner = await table.new_scan().create_log_scanner() + num_buckets = (await admin.get_table_info(table_path)).num_buckets + scanner.subscribe_buckets( + {i: fluss.EARLIEST_OFFSET for i in range(num_buckets)} + ) + + # Phase 1: async for with early break (collect only 3 of 10) + collected_async = [] + + async def consume_and_break(): + async for record in scanner: + collected_async.append(record) + if len(collected_async) >= 3: + break + + await consume_and_break() + assert len(collected_async) == 3, ( + f"Expected 3 records from async for, got {len(collected_async)}" + ) + + # Phase 2: sync poll() must still work — proves no leaked task / lock. + # With small data and few buckets, _async_poll may have fetched all + # records in one batch. After break, the un-yielded records from that + # batch are lost. So sync poll may return 0 records — the key assertion + # is that poll() completes without deadlock (returns within timeout). 
+ remaining = await scanner.poll(2000) + assert remaining is not None, "poll() should return (not deadlock)" + + # If we got records, verify no duplicates + async_ids = {r.row["id"] for r in collected_async} + sync_ids = {r.row["id"] for r in remaining} + assert async_ids.isdisjoint(sync_ids), ( + f"Duplicate IDs between async and sync: {async_ids & sync_ids}" + ) + + # All IDs must be from the original 1-10 range + all_ids = async_ids | sync_ids + assert all_ids.issubset(set(range(1, 11))), ( + f"Unexpected IDs: {all_ids - set(range(1, 11))}" + ) + + await admin.drop_table(table_path, ignore_if_not_exists=False) + + +async def test_async_iterator_multiple_batches(connection, admin): + """Verify async iteration works across multiple network poll cycles. + + _async_poll does a single bounded poll per call. Writing 20 records + to multiple buckets ensures the Python generator must loop through + several _async_poll calls to collect them all. + """ + table_path = fluss.TablePath("fluss", "py_test_async_multi_batch") + await admin.drop_table(table_path, ignore_if_not_exists=True) + + schema = fluss.Schema( + pa.schema([pa.field("id", pa.int32()), pa.field("val", pa.string())]) + ) + table_descriptor = fluss.TableDescriptor( + schema, bucket_count=3, bucket_keys=["id"] + ) + await admin.create_table( + table_path, table_descriptor, ignore_if_exists=False + ) + + table = await connection.get_table(table_path) + writer = table.new_append().create_writer() + + num_records = 20 + writer.write_arrow_batch( + pa.RecordBatch.from_arrays( + [ + pa.array(list(range(1, num_records + 1)), type=pa.int32()), + pa.array([f"multi{i}" for i in range(1, num_records + 1)]), + ], + schema=pa.schema( + [pa.field("id", pa.int32()), pa.field("val", pa.string())] + ), + ) + ) + await writer.flush() + + scanner = await table.new_scan().create_log_scanner() + num_buckets = (await admin.get_table_info(table_path)).num_buckets + scanner.subscribe_buckets( + {i: fluss.EARLIEST_OFFSET for i in 
range(num_buckets)} + ) + + collected = [] + + async def consume_all(): + async for record in scanner: + collected.append(record) + if len(collected) >= num_records: + break + + await consume_all() + assert len(collected) == num_records, ( + f"Expected {num_records} records, got {len(collected)}" + ) + + # Verify all IDs are present (order may vary due to bucketing) + ids = sorted(r.row["id"] for r in collected) + assert ids == list(range(1, num_records + 1)) + + await admin.drop_table(table_path, ignore_if_not_exists=False) + + +async def test_batch_async_iterator(connection, admin): + """Test the Python asynchronous iterator loop (`async for`) on a batch LogScanner. + + With our __aiter__ dispatch, a batch-based scanner should yield RecordBatch + objects (not ScanRecord). Each yielded item has .batch (PyArrow RecordBatch), + .bucket, .base_offset, .last_offset. + """ + table_path = fluss.TablePath("fluss", "py_test_batch_async_iter") + await admin.drop_table(table_path, ignore_if_not_exists=True) + + schema = fluss.Schema( + pa.schema([pa.field("id", pa.int32()), pa.field("val", pa.string())]) + ) + await admin.create_table(table_path, fluss.TableDescriptor(schema)) + + table = await connection.get_table(table_path) + writer = table.new_append().create_writer() + writer.write_arrow_batch( + pa.RecordBatch.from_arrays( + [ + pa.array(list(range(1, 7)), type=pa.int32()), + pa.array([f"bv{i}" for i in range(1, 7)]), + ], + schema=pa.schema( + [pa.field("id", pa.int32()), pa.field("val", pa.string())] + ), + ) + ) + await writer.flush() + + batch_scanner = await table.new_scan().create_record_batch_log_scanner() + num_buckets = (await admin.get_table_info(table_path)).num_buckets + batch_scanner.subscribe_buckets( + {i: fluss.EARLIEST_OFFSET for i in range(num_buckets)} + ) + + collected_batches = [] + total_rows = 0 + + async def consume_batches(): + nonlocal total_rows + async for rb in batch_scanner: + collected_batches.append(rb) + total_rows += rb.batch.num_rows 
+ if total_rows >= 6: + break + + await consume_batches() + + assert total_rows >= 6, f"Expected >=6 total rows, got {total_rows}" + assert len(collected_batches) > 0 + + # Verify each yielded item is a RecordBatch with expected attributes + for rb in collected_batches: + assert hasattr(rb, "batch"), "RecordBatch should have .batch" + assert hasattr(rb, "bucket"), "RecordBatch should have .bucket" + assert hasattr(rb, "base_offset"), "RecordBatch should have .base_offset" + assert hasattr(rb, "last_offset"), "RecordBatch should have .last_offset" + # .batch should be a PyArrow RecordBatch + arrow_batch = rb.batch + assert isinstance(arrow_batch, pa.RecordBatch), ( + f"Expected PyArrow RecordBatch, got {type(arrow_batch).__name__}" + ) + assert arrow_batch.num_columns == 2 + assert set(arrow_batch.schema.names) == {"id", "val"} + + # Verify all 6 IDs are present + all_ids = [] + for rb in collected_batches: + all_ids.extend(rb.batch.column("id").to_pylist()) + assert sorted(all_ids[:6]) == [1, 2, 3, 4, 5, 6] + + await admin.drop_table(table_path, ignore_if_not_exists=False) + + +async def test_batch_async_iterator_break_no_leak(connection, admin): + """Verify that breaking out of batch `async for` does not leak resources. + + After breaking, the scanner must still be usable for synchronous + poll_record_batch() calls, proving no leaked task or lock. 
+ """ + table_path = fluss.TablePath("fluss", "py_test_batch_async_break") + await admin.drop_table(table_path, ignore_if_not_exists=True) + + schema = fluss.Schema( + pa.schema([pa.field("id", pa.int32()), pa.field("val", pa.string())]) + ) + await admin.create_table(table_path, fluss.TableDescriptor(schema)) + + table = await connection.get_table(table_path) + writer = table.new_append().create_writer() + writer.write_arrow_batch( + pa.RecordBatch.from_arrays( + [ + pa.array(list(range(1, 11)), type=pa.int32()), + pa.array([f"bl{i}" for i in range(1, 11)]), + ], + schema=pa.schema( + [pa.field("id", pa.int32()), pa.field("val", pa.string())] + ), + ) + ) + await writer.flush() + + batch_scanner = await table.new_scan().create_record_batch_log_scanner() + num_buckets = (await admin.get_table_info(table_path)).num_buckets + batch_scanner.subscribe_buckets( + {i: fluss.EARLIEST_OFFSET for i in range(num_buckets)} + ) + + # Phase 1: async for with early break (collect just 1 batch) + first_batch = None + + async def consume_and_break(): + nonlocal first_batch + async for rb in batch_scanner: + first_batch = rb + break + + await consume_and_break() + assert first_batch is not None, "Should have received at least 1 batch" + assert first_batch.batch.num_rows > 0 + + # Phase 2: sync poll_record_batch() must still work — proves no leak + remaining = await batch_scanner.poll_record_batch(2000) + assert remaining is not None, "poll_record_batch() should return (not deadlock)" + + await admin.drop_table(table_path, ignore_if_not_exists=False) + + +async def test_batch_async_iterator_multiple_batches(connection, admin): + """Verify batch async iteration works across multiple network poll cycles. + + Writing 20 records to 3 buckets ensures the generator must loop through + several _async_poll_batches calls to collect them all. 
+ """ + table_path = fluss.TablePath("fluss", "py_test_batch_async_multi") + await admin.drop_table(table_path, ignore_if_not_exists=True) + + schema = fluss.Schema( + pa.schema([pa.field("id", pa.int32()), pa.field("val", pa.string())]) + ) + table_descriptor = fluss.TableDescriptor( + schema, bucket_count=3, bucket_keys=["id"] + ) + await admin.create_table( + table_path, table_descriptor, ignore_if_exists=False + ) + + table = await connection.get_table(table_path) + writer = table.new_append().create_writer() + + num_records = 20 + writer.write_arrow_batch( + pa.RecordBatch.from_arrays( + [ + pa.array(list(range(1, num_records + 1)), type=pa.int32()), + pa.array([f"bm{i}" for i in range(1, num_records + 1)]), + ], + schema=pa.schema( + [pa.field("id", pa.int32()), pa.field("val", pa.string())] + ), + ) + ) + await writer.flush() + + batch_scanner = await table.new_scan().create_record_batch_log_scanner() + num_buckets = (await admin.get_table_info(table_path)).num_buckets + batch_scanner.subscribe_buckets( + {i: fluss.EARLIEST_OFFSET for i in range(num_buckets)} + ) + + all_ids = [] + + async def consume_all(): + async for rb in batch_scanner: + all_ids.extend(rb.batch.column("id").to_pylist()) + if len(all_ids) >= num_records: + break + + await consume_all() + assert len(all_ids) >= num_records, ( + f"Expected >={num_records} IDs, got {len(all_ids)}" + ) + assert sorted(all_ids[:num_records]) == list(range(1, num_records + 1)) + + await admin.drop_table(table_path, ignore_if_not_exists=False) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +async def _poll_records(scanner, expected_count, timeout_s=10): + """Poll a record-based scanner until expected_count records are collected.""" + collected = [] + deadline = time.monotonic() + timeout_s + while len(collected) < expected_count and time.monotonic() < deadline: + records = await 
scanner.poll(5000) + collected.extend(records) + return collected + + +async def _poll_arrow_ids(scanner, expected_count, timeout_s=10): + """Poll a batch scanner and extract 'id' column values.""" + all_ids = [] + deadline = time.monotonic() + timeout_s + while len(all_ids) < expected_count and time.monotonic() < deadline: + arrow_table = await scanner.poll_arrow(5000) + if arrow_table.num_rows > 0: + all_ids.extend(arrow_table.column("id").to_pylist()) + return all_ids + + +async def test_append_and_scan_with_array(connection, admin): + """Test appending and scanning with array columns.""" + table_path = fluss.TablePath("fluss", "py_test_append_and_scan_with_array") + await admin.drop_table(table_path, ignore_if_not_exists=True) + + pa_schema = pa.schema( + [ + pa.field("id", pa.int32()), + pa.field("tags", pa.list_(pa.string())), + pa.field("scores", pa.list_(pa.int32())), + ] + ) + schema = fluss.Schema(pa_schema) + table_descriptor = fluss.TableDescriptor(schema) + await admin.create_table(table_path, table_descriptor, ignore_if_exists=False) + + table = await connection.get_table(table_path) + append_writer = table.new_append().create_writer() + + # Batch 1: Testing standard lists + batch1 = pa.RecordBatch.from_arrays( + [ + pa.array([1, 2], type=pa.int32()), + pa.array([["a", "b"], ["c"]], type=pa.list_(pa.string())), + pa.array([[10, 20], [30]], type=pa.list_(pa.int32())), + ], + schema=pa_schema, + ) + append_writer.write_arrow_batch(batch1) + + # Batch 2: Testing null values inside arrays and null arrays + batch2 = pa.RecordBatch.from_arrays( + [ + pa.array([3, 4, 5, 6], type=pa.int32()), + pa.array([["d", None], None, [], [None]], type=pa.list_(pa.string())), + pa.array([[40, 50], [60], None, []], type=pa.list_(pa.int32())), + ], + schema=pa_schema, + ) + append_writer.write_arrow_batch(batch2) + await append_writer.flush() + + # Verify via LogScanner (record-by-record) + scanner = await table.new_scan().create_log_scanner() + 
scanner.subscribe_buckets({0: fluss.EARLIEST_OFFSET}) + records = await _poll_records(scanner, expected_count=6) + + assert len(records) == 6 + records.sort(key=lambda r: r.row["id"]) + + # Verify Batch 1 + assert records[0].row["tags"] == ["a", "b"] + assert records[0].row["scores"] == [10, 20] + assert records[1].row["tags"] == ["c"] + assert records[1].row["scores"] == [30] + + # Verify Batch 2 + assert records[2].row["tags"] == ["d", None] + assert records[2].row["scores"] == [40, 50] + assert records[3].row["tags"] is None + assert records[3].row["scores"] == [60] + assert records[4].row["tags"] == [] + assert records[4].row["scores"] is None + assert records[5].row["tags"] == [None] + assert records[5].row["scores"] == [] + + # Verify via to_arrow (batch-based) + scanner2 = await table.new_scan().create_record_batch_log_scanner() + scanner2.subscribe_buckets({0: fluss.EARLIEST_OFFSET}) + result_table = await scanner2.to_arrow() + + assert result_table.num_rows == 6 + assert result_table.column("tags").to_pylist() == [ + ["a", "b"], + ["c"], + ["d", None], + None, + [], + [None], + ] + assert result_table.column("scores").to_pylist() == [ + [10, 20], + [30], + [40, 50], + [60], + None, + [], + ] + + + + +async def test_append_rows_with_array(connection, admin): + """Test appending rows with array data as Python lists and scanning.""" + table_path = fluss.TablePath("fluss", "py_test_append_rows_with_array") + await admin.drop_table(table_path, ignore_if_not_exists=True) + + pa_schema = pa.schema( + [ + pa.field("id", pa.int32()), + pa.field("tags", pa.list_(pa.string())), + pa.field("scores", pa.list_(pa.int32())), + ] + ) + schema = fluss.Schema(pa_schema) + table_descriptor = fluss.TableDescriptor(schema) + await admin.create_table(table_path, table_descriptor, ignore_if_exists=False) + + table = await connection.get_table(table_path) + append_writer = table.new_append().create_writer() + + # Append rows using dicts with lists + append_writer.append({"id": 1, 
"tags": ["a", "b"], "scores": [10, 20]}) + append_writer.append({"id": 2, "tags": ["c"], "scores": [30]}) + # Append row using list with nested list (null handling) + append_writer.append([3, None, [40, None, 60]]) + + await append_writer.flush() + + scanner = await table.new_scan().create_log_scanner() + num_buckets = (await admin.get_table_info(table_path)).num_buckets + scanner.subscribe_buckets({i: fluss.EARLIEST_OFFSET for i in range(num_buckets)}) + + records = await _poll_records(scanner, expected_count=3) + assert len(records) == 3 + + rows = sorted([r.row for r in records], key=lambda r: r["id"]) + assert rows[0] == {"id": 1, "tags": ["a", "b"], "scores": [10, 20]} + assert rows[1] == {"id": 2, "tags": ["c"], "scores": [30]} + # Note: records[2].row["tags"] will be None, records[2].row["scores"] will be [40, None, 60] + assert rows[2]["id"] == 3 + assert rows[2]["tags"] is None + assert rows[2]["scores"] == [40, None, 60] + + await admin.drop_table(table_path, ignore_if_not_exists=False) + + +async def test_append_rows_with_nested_array(connection, admin): + """Test appending rows with nested array data (ARRAY>) and scanning.""" + table_path = fluss.TablePath("fluss", "py_test_append_rows_with_nested_array") + await admin.drop_table(table_path, ignore_if_not_exists=True) + + pa_schema = pa.schema([ + pa.field("id", pa.int32()), + pa.field("matrix", pa.list_(pa.list_(pa.int32()))), + ]) + schema = fluss.Schema(pa_schema) + await admin.create_table(table_path, fluss.TableDescriptor(schema), ignore_if_exists=False) + + table = await connection.get_table(table_path) + append_writer = table.new_append().create_writer() + + # Append nested lists + append_writer.append({"id": 1, "matrix": [[1, 2], [3, 4]]}) + append_writer.append({"id": 2, "matrix": [[], [5], [6, 7, 8]]}) + append_writer.append({"id": 3, "matrix": None}) + append_writer.append({"id": 4, "matrix": [[1, None], None, []]}) + append_writer.append({"id": 5, "matrix": [[None, None]]}) + + await 
append_writer.flush() + + scanner = await table.new_scan().create_log_scanner() + num_buckets = (await admin.get_table_info(table_path)).num_buckets + scanner.subscribe_buckets({i: fluss.EARLIEST_OFFSET for i in range(num_buckets)}) + + records = await _poll_records(scanner, expected_count=5) + assert len(records) == 5 + + rows = sorted([r.row for r in records], key=lambda r: r["id"]) + assert rows[0] == {"id": 1, "matrix": [[1, 2], [3, 4]]} + assert rows[1] == {"id": 2, "matrix": [[], [5], [6, 7, 8]]} + assert rows[2] == {"id": 3, "matrix": None} + assert rows[3] == {"id": 4, "matrix": [[1, None], None, []]} + assert rows[4] == {"id": 5, "matrix": [[None, None]]} + + await admin.drop_table(table_path, ignore_if_not_exists=False) + + +async def test_append_rows_with_invalid_array(connection, admin): + """Test that appending invalid data to an array column raises an error.""" + table_path = fluss.TablePath("fluss", "py_test_append_rows_with_invalid_array") + await admin.drop_table(table_path, ignore_if_not_exists=True) + + pa_schema = pa.schema([ + pa.field("id", pa.int32()), + pa.field("tags", pa.list_(pa.string())), + ]) + schema = fluss.Schema(pa_schema) + await admin.create_table(table_path, fluss.TableDescriptor(schema), ignore_if_exists=False) + + table = await connection.get_table(table_path) + append_writer = table.new_append().create_writer() + + # Appending a string instead of a list should raise an error + with pytest.raises(Exception, match="Expected sequence for Array column"): + append_writer.append({"id": 4, "tags": "not_a_list"}) + + await admin.drop_table(table_path, ignore_if_not_exists=False) diff --git a/fluss-rust/bindings/python/test/test_sasl_auth.py b/fluss-rust/bindings/python/test/test_sasl_auth.py new file mode 100644 index 0000000000..6889f1ab67 --- /dev/null +++ b/fluss-rust/bindings/python/test/test_sasl_auth.py @@ -0,0 +1,108 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Integration tests for SASL/PLAIN authentication. + +Mirrors the Rust integration tests in crates/fluss/tests/integration/sasl_auth.rs. +""" + +import pytest + +import fluss + + +async def test_sasl_connect_with_valid_credentials(sasl_bootstrap_servers): + """Verify that a client with correct SASL credentials can connect and perform operations.""" + config = fluss.Config({ + "bootstrap.servers": sasl_bootstrap_servers, + "security.protocol": "sasl", + "security.sasl.mechanism": "PLAIN", + "security.sasl.username": "admin", + "security.sasl.password": "admin-secret", + }) + conn = await fluss.FlussConnection.create(config) + admin = conn.get_admin() + + db_name = "py_sasl_test_valid_db" + db_descriptor = fluss.DatabaseDescriptor(comment="created via SASL auth") + await admin.create_database(db_name, db_descriptor, ignore_if_exists=True) + + assert await admin.database_exists(db_name) + + # Cleanup + await admin.drop_database(db_name, ignore_if_not_exists=True, cascade=True) + await conn.close() + + +async def test_sasl_connect_with_second_user(sasl_bootstrap_servers): + """Verify that a second user can also authenticate successfully.""" + config = fluss.Config({ + "bootstrap.servers": sasl_bootstrap_servers, + "security.protocol": "sasl", + "security.sasl.mechanism": "PLAIN", + 
"security.sasl.username": "alice", + "security.sasl.password": "alice-secret", + }) + conn = await fluss.FlussConnection.create(config) + admin = conn.get_admin() + + # Basic operation to confirm functional connection + assert not await admin.database_exists("some_nonexistent_db_alice") + await conn.close() + + +async def test_sasl_connect_with_wrong_password(sasl_bootstrap_servers): + """Verify that wrong credentials are rejected with AUTHENTICATE_EXCEPTION.""" + config = fluss.Config({ + "bootstrap.servers": sasl_bootstrap_servers, + "security.protocol": "sasl", + "security.sasl.mechanism": "PLAIN", + "security.sasl.username": "admin", + "security.sasl.password": "wrong-password", + }) + with pytest.raises(fluss.FlussError) as exc_info: + await fluss.FlussConnection.create(config) + + assert exc_info.value.error_code == fluss.ErrorCode.AUTHENTICATE_EXCEPTION + + +async def test_sasl_connect_with_unknown_user(sasl_bootstrap_servers): + """Verify that a nonexistent user is rejected with AUTHENTICATE_EXCEPTION.""" + config = fluss.Config({ + "bootstrap.servers": sasl_bootstrap_servers, + "security.protocol": "sasl", + "security.sasl.mechanism": "PLAIN", + "security.sasl.username": "nonexistent_user", + "security.sasl.password": "some-password", + }) + with pytest.raises(fluss.FlussError) as exc_info: + await fluss.FlussConnection.create(config) + + assert exc_info.value.error_code == fluss.ErrorCode.AUTHENTICATE_EXCEPTION + + +async def test_sasl_client_to_plaintext_server(plaintext_bootstrap_servers): + """Verify that a SASL-configured client fails when connecting to a plaintext server.""" + config = fluss.Config({ + "bootstrap.servers": plaintext_bootstrap_servers, + "security.protocol": "sasl", + "security.sasl.mechanism": "PLAIN", + "security.sasl.username": "admin", + "security.sasl.password": "admin-secret", + }) + with pytest.raises(fluss.FlussError): + await fluss.FlussConnection.create(config) diff --git a/fluss-rust/bindings/python/test/test_schema.py 
b/fluss-rust/bindings/python/test/test_schema.py new file mode 100644 index 0000000000..dfd9cf5619 --- /dev/null +++ b/fluss-rust/bindings/python/test/test_schema.py @@ -0,0 +1,97 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Unit tests for Schema (no cluster required).""" + +import pyarrow as pa + +import fluss + + +def test_get_primary_keys(): + fields = pa.schema([ + pa.field("id", pa.int32()), + pa.field("name", pa.string()), + ]) + + schema_with_pk = fluss.Schema(fields, primary_keys=["id"]) + assert schema_with_pk.get_primary_keys() == ["id"] + + schema_without_pk = fluss.Schema(fields) + assert schema_without_pk.get_primary_keys() == [] + + +def test_schema_with_array(): + # Test that a schema can be constructed from a pyarrow schema containing a list + fields = pa.schema( + [ + pa.field("id", pa.int32()), + pa.field("tags", pa.list_(pa.string())), + ] + ) + schema = fluss.Schema(fields) + assert schema.get_column_names() == ["id", "tags"] + assert schema.get_column_types() == ["int", "array"] + + +def test_nullable_fields(): + fields = pa.schema( + [ + pa.field("id", pa.int32(), nullable=False), + pa.field("name", pa.string()), + ] + ) + schema = fluss.Schema(fields) + assert schema.get_column_types() == ["int NOT 
NULL", "string"] + assert schema.get_columns() == [("id", "int NOT NULL"), ("name", "string")] + + +def test_pk_forces_non_nullable(): + fields = pa.schema( + [ + pa.field("id", pa.int32()), + pa.field("name", pa.string()), + ] + ) + schema = fluss.Schema(fields, primary_keys=["id"]) + types = schema.get_column_types() + assert types[0] == "int NOT NULL" + assert types[1] == "string" + + +def test_nested_list_nullability(): + fields = pa.schema( + [ + pa.field( + "tags", + pa.list_(pa.field("item", pa.string(), nullable=False)), + ), + pa.field("ids", pa.list_(pa.int32()), nullable=False), + pa.field( + "strict_ids", + pa.list_(pa.field("item", pa.int32(), nullable=False)), + nullable=False, + ), + ] + ) + schema = fluss.Schema(fields) + types = schema.get_column_types() + assert types[0] == "array" + assert types[1] == "array NOT NULL" + assert types[2] == "array NOT NULL" + + diff --git a/fluss-rust/copyright.txt b/fluss-rust/copyright.txt new file mode 100644 index 0000000000..d5519133ed --- /dev/null +++ b/fluss-rust/copyright.txt @@ -0,0 +1,17 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ \ No newline at end of file diff --git a/fluss-rust/crates/examples/Cargo.toml b/fluss-rust/crates/examples/Cargo.toml new file mode 100644 index 0000000000..45f029ee8c --- /dev/null +++ b/fluss-rust/crates/examples/Cargo.toml @@ -0,0 +1,52 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +[package] +edition = { workspace = true } +license = { workspace = true } +name = "fluss-examples" +rust-version = { workspace = true } +version = { workspace = true } + + +[dependencies] +fluss = { workspace = true, features = ["storage-all"] } +tokio = { workspace = true } +clap = { workspace = true } + +[target.'cfg(not(target_env = "msvc"))'.dependencies] +tikv-jemallocator = "0.6" + +[[example]] +name = "example-table" +path = "src/example_table.rs" + +[[example]] +name = "example-upsert-lookup" +path = "src/example_kv_table.rs" + +[[example]] +name = "example-partitioned-upsert-lookup" +path = "src/example_partitioned_kv_table.rs" + +[[example]] +name = "example-prefix-lookup" +path = "src/example_prefix_lookup.rs" + +[[example]] +name = "example-partitioned-prefix-lookup" +path = "src/example_partitioned_prefix_lookup.rs" diff --git a/fluss-rust/crates/examples/DEPENDENCIES.rust.tsv b/fluss-rust/crates/examples/DEPENDENCIES.rust.tsv new file mode 100644 index 0000000000..5af4754d0c --- /dev/null +++ b/fluss-rust/crates/examples/DEPENDENCIES.rust.tsv @@ -0,0 +1,300 @@ +crate Apache-2.0 Apache-2.0 WITH LLVM-exception BSD-2-Clause BSD-3-Clause BSL-1.0 CC0-1.0 CDLA-Permissive-2.0 ISC LGPL-2.1-or-later MIT Unicode-3.0 Unlicense Zlib +ahash@0.8.12 X X +aho-corasick@1.1.4 X X +android_system_properties@0.1.5 X X +anstream@1.0.0 X X +anstyle@1.0.14 X X +anstyle-parse@1.0.0 X X +anstyle-query@1.1.5 X X +anstyle-wincon@3.0.11 X X +anyhow@1.0.102 X X +arrow@57.3.0 X +arrow-arith@57.3.0 X +arrow-array@57.3.0 X +arrow-buffer@57.3.0 X +arrow-cast@57.3.0 X +arrow-csv@57.3.0 X +arrow-data@57.3.0 X +arrow-ipc@57.3.0 X +arrow-json@57.3.0 X +arrow-ord@57.3.0 X +arrow-row@57.3.0 X +arrow-schema@57.3.0 X +arrow-select@57.3.0 X +arrow-string@57.3.0 X +async-trait@0.1.89 X X +atoi@2.0.0 X +atomic-waker@1.1.2 X X +autocfg@1.5.0 X X +backon@1.6.0 X +base64@0.22.1 X X +bigdecimal@0.4.10 X X +bitflags@2.11.0 X X +bitvec@1.0.1 X +block-buffer@0.10.4 X X +bumpalo@3.20.2 X X 
+byteorder@1.5.0 X X +bytes@1.11.1 X +cc@1.2.57 X X +cfg-if@1.0.4 X X +chrono@0.4.44 X X +clap@4.6.0 X X +clap_builder@4.6.0 X X +clap_derive@4.6.0 X X +clap_lex@1.1.0 X X +colorchoice@1.0.5 X X +const-oid@0.9.6 X X +const-random@0.1.18 X X +const-random-macro@0.1.16 X X +core-foundation-sys@0.8.7 X X +cpufeatures@0.2.17 X X +crc32c@0.6.8 X X +crossbeam-utils@0.8.21 X X +crunchy@0.2.4 X +crypto-common@0.1.7 X X +csv@1.4.0 X X +csv-core@0.1.13 X X +dashmap@6.1.0 X +delegate@0.13.5 X X +digest@0.10.7 X X +displaydoc@0.2.5 X X +either@1.15.0 X X +equivalent@1.0.2 X X +errno@0.3.14 X X +fastrand@2.3.0 X X +find-msvc-tools@0.1.9 X X +fixedbitset@0.5.7 X X +flatbuffers@25.12.19 X +fluss-examples@0.1.0 X +fluss-rs@0.1.0 X +fnv@1.0.7 X X +foldhash@0.1.5 X +form_urlencoded@1.2.2 X X +funty@2.0.0 X +futures@0.3.32 X X +futures-channel@0.3.32 X X +futures-core@0.3.32 X X +futures-executor@0.3.32 X X +futures-io@0.3.32 X X +futures-macro@0.3.32 X X +futures-sink@0.3.32 X X +futures-task@0.3.32 X X +futures-util@0.3.32 X X +generic-array@0.14.7 X +getrandom@0.2.17 X X +getrandom@0.3.4 X X +getrandom@0.4.2 X X +gloo-timers@0.3.0 X X +h2@0.4.13 X +half@2.7.1 X X +hashbrown@0.14.5 X X +hashbrown@0.15.5 X X +hashbrown@0.16.1 X X +heck@0.5.0 X X +hex@0.4.3 X X +hmac@0.12.1 X X +home@0.5.12 X X +http@1.4.0 X X +http-body@1.0.1 X +http-body-util@0.1.3 X +httparse@1.10.1 X X +httpdate@1.0.3 X X +hyper@1.8.1 X +hyper-rustls@0.27.7 X X X +hyper-util@0.1.20 X +iana-time-zone@0.1.65 X X +iana-time-zone-haiku@0.1.2 X X +icu_collections@2.1.1 X +icu_locale_core@2.1.1 X +icu_normalizer@2.1.1 X +icu_normalizer_data@2.1.1 X +icu_properties@2.1.2 X +icu_properties_data@2.1.2 X +icu_provider@2.1.1 X +idna@1.1.0 X X +idna_adapter@1.2.1 X X +indexmap@2.13.0 X X +ipnet@2.12.0 X X +iri-string@0.7.11 X X +is_terminal_polyfill@1.70.2 X X +itertools@0.14.0 X X +itoa@1.0.18 X X +jiff@0.2.23 X X +jiff-tzdb@0.1.6 X X +jiff-tzdb-platform@0.1.3 X X +jobserver@0.1.34 X X +js-sys@0.3.91 X X +lexical-core@1.0.6 
X X +lexical-parse-float@1.0.6 X X +lexical-parse-integer@1.0.6 X X +lexical-util@1.0.7 X X +lexical-write-float@1.0.6 X X +lexical-write-integer@1.0.6 X X +libc@0.2.183 X X +libm@0.2.16 X +linked-hash-map@0.5.6 X X +linux-raw-sys@0.12.1 X X X +litemap@0.8.1 X +lock_api@0.4.14 X X +log@0.4.29 X X +lz4_flex@0.12.1 X +md-5@0.10.6 X X +memchr@2.8.0 X X +mio@1.1.1 X +multimap@0.10.1 X X +num-bigint@0.4.6 X X +num-complex@0.4.6 X X +num-integer@0.1.46 X X +num-traits@0.2.19 X X +once_cell@1.21.4 X X +once_cell_polyfill@1.70.2 X X +opendal@0.55.0 X +ordered-float@5.1.0 X +parking_lot@0.12.5 X X +parking_lot_core@0.9.12 X X +parse-display@0.10.0 X X +parse-display-derive@0.10.0 X X +percent-encoding@2.3.2 X X +petgraph@0.8.3 X X +pin-project-lite@0.2.17 X X +pin-utils@0.1.0 X X +pkg-config@0.3.32 X X +portable-atomic@1.13.1 X X +portable-atomic-util@0.2.6 X X +potential_utf@0.1.4 X +ppv-lite86@0.2.21 X X +prettyplease@0.2.37 X X +proc-macro2@1.0.106 X X +prost@0.14.3 X +prost-build@0.14.3 X +prost-derive@0.14.3 X +prost-types@0.14.3 X +quick-xml@0.37.5 X +quick-xml@0.38.4 X +quote@1.0.45 X X +r-efi@5.3.0 X X X +r-efi@6.0.0 X X X +radium@0.7.0 X +rand@0.8.5 X X +rand@0.9.2 X X +rand_chacha@0.3.1 X X +rand_chacha@0.9.0 X X +rand_core@0.6.4 X X +rand_core@0.9.5 X X +redox_syscall@0.5.18 X +regex@1.12.3 X X +regex-automata@0.4.14 X X +regex-syntax@0.8.10 X X +reqsign@0.16.5 X +reqwest@0.12.28 X X +ring@0.17.14 X X +rustc_version@0.4.1 X X +rustix@1.1.4 X X X +rustls@0.23.37 X X X +rustls-pki-types@1.14.0 X X +rustls-webpki@0.103.10 X +rustversion@1.0.22 X X +ryu@1.0.23 X X +scopeguard@1.2.0 X X +semver@1.0.27 X X +serde@1.0.228 X X +serde_core@1.0.228 X X +serde_derive@1.0.228 X X +serde_json@1.0.149 X X +serde_urlencoded@0.7.1 X X +sha1@0.10.6 X X +sha2@0.10.9 X X +shlex@1.3.0 X X +signal-hook-registry@1.4.8 X X +simdutf8@0.1.5 X X +slab@0.4.12 X +smallvec@1.15.1 X X +snafu@0.8.9 X X +snafu-derive@0.8.9 X X +socket2@0.6.3 X X +stable_deref_trait@1.2.1 X X +strsim@0.11.1 X 
+structmeta@0.3.0 X X +structmeta-derive@0.3.0 X X +strum@0.26.3 X +strum_macros@0.26.4 X +subtle@2.6.1 X +syn@2.0.117 X X +sync_wrapper@1.0.2 X +synstructure@0.13.2 X +tap@1.0.1 X +tempfile@3.27.0 X X +thiserror@1.0.69 X X +thiserror-impl@1.0.69 X X +tikv-jemalloc-sys@0.6.1+5.3.0-1-ge13ca993e8ccb9ba9847cc330696e02839f328f7 X X +tikv-jemallocator@0.6.1 X X +tiny-keccak@2.0.2 X +tinystr@0.8.2 X +tokio@1.50.0 X +tokio-macros@2.6.1 X +tokio-rustls@0.26.4 X X +tokio-util@0.7.18 X +tower@0.5.3 X +tower-http@0.6.8 X +tower-layer@0.3.3 X +tower-service@0.3.3 X +tracing@0.1.44 X +tracing-attributes@0.1.31 X +tracing-core@0.1.36 X +try-lock@0.2.5 X +twox-hash@2.1.2 X +typenum@1.19.0 X X +unicode-ident@1.0.24 X X X +untrusted@0.9.0 X +url@2.5.8 X X +utf8_iter@1.0.4 X X +utf8parse@0.2.2 X X +uuid@1.22.0 X X +value-bag@1.12.0 X X +version_check@0.9.5 X X +want@0.3.1 X +wasi@0.11.1+wasi-snapshot-preview1 X X X +wasip2@1.0.2+wasi-0.2.9 X X X +wasip3@0.4.0+wasi-0.3.0-rc-2026-01-06 X X X +wasm-bindgen@0.2.114 X X +wasm-bindgen-futures@0.4.64 X X +wasm-bindgen-macro@0.2.114 X X +wasm-bindgen-macro-support@0.2.114 X X +wasm-bindgen-shared@0.2.114 X X +wasm-streams@0.4.2 X X +web-sys@0.3.91 X X +webpki-roots@1.0.6 X +windows-core@0.62.2 X X +windows-implement@0.60.2 X X +windows-interface@0.59.3 X X +windows-link@0.2.1 X X +windows-result@0.4.1 X X +windows-strings@0.5.1 X X +windows-sys@0.52.0 X X +windows-sys@0.61.2 X X +windows-targets@0.52.6 X X +windows_aarch64_gnullvm@0.52.6 X X +windows_aarch64_msvc@0.52.6 X X +windows_i686_gnu@0.52.6 X X +windows_i686_gnullvm@0.52.6 X X +windows_i686_msvc@0.52.6 X X +windows_x86_64_gnu@0.52.6 X X +windows_x86_64_gnullvm@0.52.6 X X +windows_x86_64_msvc@0.52.6 X X +wit-bindgen@0.51.0 X X X +writeable@0.6.2 X +wyz@0.5.1 X +yoke@0.8.1 X +yoke-derive@0.8.1 X +zerocopy@0.8.47 X X X +zerocopy-derive@0.8.47 X X X +zerofrom@0.1.6 X +zerofrom-derive@0.1.6 X +zeroize@1.8.2 X X +zerotrie@0.2.3 X +zerovec@0.11.5 X +zerovec-derive@0.11.2 X +zmij@1.0.21 X 
+zstd@0.13.3 X +zstd-safe@7.2.4 X X +zstd-sys@2.0.16+zstd.1.5.7 X X diff --git a/fluss-rust/crates/examples/src/example_kv_table.rs b/fluss-rust/crates/examples/src/example_kv_table.rs new file mode 100644 index 0000000000..ad12ed79cf --- /dev/null +++ b/fluss-rust/crates/examples/src/example_kv_table.rs @@ -0,0 +1,118 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use clap::Parser; +use fluss::client::FlussConnection; +use fluss::config::Config; +use fluss::error::Result; +use fluss::metadata::{DataTypes, Schema, TableDescriptor, TablePath}; +use fluss::row::{GenericRow, InternalRow}; + +#[tokio::main] +#[allow(dead_code)] +pub async fn main() -> Result<()> { + let mut config = Config::parse(); + config.bootstrap_servers = "127.0.0.1:9123".to_string(); + + let conn = FlussConnection::new(config).await?; + + let table_descriptor = TableDescriptor::builder() + .schema( + Schema::builder() + .column("id", DataTypes::int()) + .column("name", DataTypes::string()) + .column("age", DataTypes::bigint()) + .primary_key(vec!["id"]) + .build()?, + ) + .build()?; + + let table_path = TablePath::new("fluss", "rust_upsert_lookup_example"); + + let admin = conn.get_admin()?; + admin + .create_table(&table_path, &table_descriptor, true) + .await?; + println!( + "Created KV Table:\n {}\n", + admin.get_table_info(&table_path).await? + ); + + let table = conn.get_table(&table_path).await?; + let table_upsert = table.new_upsert()?; + let upsert_writer = table_upsert.create_writer()?; + + println!("\n=== Upserting ==="); + for (id, name, age) in [(1, "Verso", 32i64), (2, "Noco", 25), (3, "Esquie", 35)] { + let mut row = GenericRow::new(3); + row.set_field(0, id); + row.set_field(1, name); + row.set_field(2, age); + upsert_writer.upsert(&row)?; + println!("Upserted: {row:?}"); + } + upsert_writer.flush().await?; + + println!("\n=== Looking up ==="); + let mut lookuper = table.new_lookup()?.create_lookuper()?; + + for id in 1..=3 { + let result = lookuper.lookup(&make_key(id)).await?; + let row = result.get_single_row()?.unwrap(); + println!( + "Found id={id}: name={}, age={}", + row.get_string(1)?, + row.get_long(2)? 
+ ); + } + + println!("\n=== Updating ==="); + let mut row = GenericRow::new(3); + row.set_field(0, 1); + row.set_field(1, "Verso"); + row.set_field(2, 33i64); + upsert_writer.upsert(&row)?.await?; + println!("Updated: {row:?}"); + + let result = lookuper.lookup(&make_key(1)).await?; + let row = result.get_single_row()?.unwrap(); + println!( + "Verified update: name={}, age={}", + row.get_string(1)?, + row.get_long(2)? + ); + + println!("\n=== Deleting ==="); + // For delete, only primary key field needs to be set; other fields can remain null + let mut row = GenericRow::new(3); + row.set_field(0, 2); + upsert_writer.delete(&row)?.await?; + println!("Deleted row with id=2"); + + let result = lookuper.lookup(&make_key(2)).await?; + if result.get_single_row()?.is_none() { + println!("Verified deletion"); + } + + Ok(()) +} + +fn make_key(id: i32) -> GenericRow<'static> { + let mut row = GenericRow::new(1); + row.set_field(0, id); + row +} diff --git a/fluss-rust/crates/examples/src/example_partitioned_kv_table.rs b/fluss-rust/crates/examples/src/example_partitioned_kv_table.rs new file mode 100644 index 0000000000..944d8d4962 --- /dev/null +++ b/fluss-rust/crates/examples/src/example_partitioned_kv_table.rs @@ -0,0 +1,150 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +use clap::Parser; +use fluss::client::{FlussAdmin, FlussConnection}; +use fluss::config::Config; +use fluss::error::Result; +use fluss::metadata::{DataTypes, PartitionSpec, Schema, TableDescriptor, TablePath}; +use fluss::row::{GenericRow, InternalRow}; +use std::collections::HashMap; + +#[tokio::main] +#[allow(dead_code)] +pub async fn main() -> Result<()> { + let mut config = Config::parse(); + config.bootstrap_servers = "127.0.0.1:9123".to_string(); + + let conn = FlussConnection::new(config).await?; + + let table_descriptor = TableDescriptor::builder() + .schema( + Schema::builder() + .column("id", DataTypes::int()) + .column("region", DataTypes::string()) + .column("zone", DataTypes::bigint()) + .column("score", DataTypes::bigint()) + .primary_key(vec!["id", "region", "zone"]) + .build()?, + ) + .partitioned_by(vec!["region", "zone"]) + .build()?; + + let table_path = TablePath::new("fluss", "partitioned_kv_example"); + + let admin = conn.get_admin()?; + admin + .create_table(&table_path, &table_descriptor, true) + .await?; + println!( + "Created KV Table:\n {}\n", + admin.get_table_info(&table_path).await? 
+ ); + + create_partition(&table_path, &admin, "APAC", 1).await; + create_partition(&table_path, &admin, "EMEA", 2).await; + create_partition(&table_path, &admin, "US", 3).await; + + let table = conn.get_table(&table_path).await?; + let table_upsert = table.new_upsert()?; + let upsert_writer = table_upsert.create_writer()?; + + println!("\n=== Upserting ==="); + for (id, region, zone, score) in [ + (1001, "APAC", 1i64, 1234i64), + (1002, "EMEA", 2, 2234), + (1003, "US", 3, 3234), + ] { + let mut row = GenericRow::new(4); + row.set_field(0, id); + row.set_field(1, region); + row.set_field(2, zone); + row.set_field(3, score); + upsert_writer.upsert(&row)?; + println!("Upserted: {row:?}"); + } + upsert_writer.flush().await?; + + println!("\n=== Looking up ==="); + let mut lookuper = table.new_lookup()?.create_lookuper()?; + + for (id, region, zone) in [(1001, "APAC", 1i64), (1002, "EMEA", 2), (1003, "US", 3)] { + let result = lookuper + .lookup(&make_key(id, region, zone)) + .await + .expect("lookup"); + let row = result.get_single_row()?.unwrap(); + println!( + "Found id={id}: region={}, zone={}, score={}", + row.get_string(1)?, + row.get_long(2)?, + row.get_long(3)? + ); + } + + println!("\n=== Updating ==="); + let mut row = GenericRow::new(4); + row.set_field(0, 1001); + row.set_field(1, "APAC"); + row.set_field(2, 1i64); + row.set_field(3, 4321i64); + upsert_writer.upsert(&row)?.await?; + println!("Updated: {row:?}"); + + let result = lookuper.lookup(&make_key(1001, "APAC", 1)).await?; + let row = result.get_single_row()?.unwrap(); + println!( + "Verified update: region={}, zone={}", + row.get_string(1)?, + row.get_long(2)? 
+ ); + + println!("\n=== Deleting ==="); + let mut row = GenericRow::new(4); + row.set_field(0, 1002); + row.set_field(1, "EMEA"); + row.set_field(2, 2i64); + upsert_writer.delete(&row)?.await?; + println!("Deleted: {row:?}"); + + let result = lookuper.lookup(&make_key(1002, "EMEA", 2)).await?; + if result.get_single_row()?.is_none() { + println!("Verified deletion"); + } + + Ok(()) +} + +async fn create_partition(table_path: &TablePath, admin: &FlussAdmin, region: &str, zone: i64) { + let mut partition_values = HashMap::new(); + partition_values.insert("region".to_string(), region.to_string()); + partition_values.insert("zone".to_string(), zone.to_string()); + let partition_spec = PartitionSpec::new(partition_values); + + admin + .create_partition(table_path, &partition_spec, true) + .await + .unwrap(); +} + +fn make_key(id: i32, region: &str, zone: i64) -> GenericRow<'static> { + let mut row = GenericRow::new(4); + row.set_field(0, id); + row.set_field(1, region.to_string()); + row.set_field(2, zone); + row +} diff --git a/fluss-rust/crates/examples/src/example_partitioned_prefix_lookup.rs b/fluss-rust/crates/examples/src/example_partitioned_prefix_lookup.rs new file mode 100644 index 0000000000..b212b0fd45 --- /dev/null +++ b/fluss-rust/crates/examples/src/example_partitioned_prefix_lookup.rs @@ -0,0 +1,144 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use clap::Parser; +use fluss::client::{FlussAdmin, FlussConnection}; +use fluss::config::Config; +use fluss::error::Result; +use fluss::metadata::{DataTypes, PartitionSpec, Schema, TableDescriptor, TablePath}; +use fluss::row::{GenericRow, InternalRow}; +use std::collections::HashMap; + +#[tokio::main] +#[allow(dead_code)] +pub async fn main() -> Result<()> { + let mut config = Config::parse(); + config.bootstrap_servers = "127.0.0.1:9123".to_string(); + + let conn = FlussConnection::new(config).await?; + + // Partitioned schema: pk is (region, user_id, session_id, event_seq), + // `region` is the partition key, and the bucket key (user_id, session_id) + // is a prefix of the *non-partition* portion of the primary key — which is + // the condition for prefix lookup on a partitioned table. The lookup + // key must include the partition column(s) in addition to the bucket + // prefix, so we look up by (region, user_id, session_id). 
+ let table_descriptor = TableDescriptor::builder() + .schema( + Schema::builder() + .column("region", DataTypes::string()) + .column("user_id", DataTypes::int()) + .column("session_id", DataTypes::string()) + .column("event_seq", DataTypes::bigint()) + .column("event_data", DataTypes::string()) + .primary_key(vec!["region", "user_id", "session_id", "event_seq"]) + .build()?, + ) + .partitioned_by(vec!["region"]) + .distributed_by( + Some(3), + vec!["user_id".to_string(), "session_id".to_string()], + ) + .build()?; + + let table_path = TablePath::new("fluss", "rust_partitioned_prefix_lookup_example"); + + let admin = conn.get_admin()?; + admin + .create_table(&table_path, &table_descriptor, true) + .await?; + println!( + "Created partitioned KV Table:\n {}\n", + admin.get_table_info(&table_path).await? + ); + + create_partition(&table_path, &admin, "US").await; + create_partition(&table_path, &admin, "EU").await; + + let table = conn.get_table(&table_path).await?; + let table_upsert = table.new_upsert()?; + let upsert_writer = table_upsert.create_writer()?; + + println!("\n=== Upserting session events ==="); + for (region, user_id, session_id, event_seq, event_data) in [ + ("US", 1, "sess-a", 1i64, "open"), + ("US", 1, "sess-a", 2, "click"), + ("US", 1, "sess-a", 3, "close"), + ("US", 2, "sess-b", 1, "open"), + ("EU", 1, "sess-a", 1, "open"), + ] { + let mut row = GenericRow::new(5); + row.set_field(0, region); + row.set_field(1, user_id); + row.set_field(2, session_id); + row.set_field(3, event_seq); + row.set_field(4, event_data); + upsert_writer.upsert(&row)?; + println!("Upserted: {row:?}"); + } + upsert_writer.flush().await?; + + println!("\n=== Prefix lookup by (region, user_id, session_id) ==="); + let mut prefix_lookuper = table + .new_lookup()? 
+ .lookup_by(vec![ + "region".to_string(), + "user_id".to_string(), + "session_id".to_string(), + ]) + .create_lookuper()?; + + for (region, user_id, session_id) in [ + ("US", 1, "sess-a"), + ("US", 2, "sess-b"), + ("EU", 1, "sess-a"), + ("EU", 1, "sess-missing"), + ] { + let result = prefix_lookuper + .lookup(&make_prefix(region, user_id, session_id)) + .await?; + let rows = result.get_rows()?; + println!( + "region={region}, user_id={user_id}, session_id={session_id}: {} event(s)", + rows.len() + ); + for row in &rows { + println!(" seq={}, data={}", row.get_long(3)?, row.get_string(4)?); + } + } + + Ok(()) +} + +async fn create_partition(table_path: &TablePath, admin: &FlussAdmin, region: &str) { + let mut partition_values = HashMap::new(); + partition_values.insert("region".to_string(), region.to_string()); + let partition_spec = PartitionSpec::new(partition_values); + + admin + .create_partition(table_path, &partition_spec, true) + .await + .unwrap(); +} + +fn make_prefix(region: &str, user_id: i32, session_id: &str) -> GenericRow<'static> { + let mut row = GenericRow::new(3); + row.set_field(0, region.to_string()); + row.set_field(1, user_id); + row.set_field(2, session_id.to_string()); + row +} diff --git a/fluss-rust/crates/examples/src/example_prefix_lookup.rs b/fluss-rust/crates/examples/src/example_prefix_lookup.rs new file mode 100644 index 0000000000..12fc76dc13 --- /dev/null +++ b/fluss-rust/crates/examples/src/example_prefix_lookup.rs @@ -0,0 +1,120 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use clap::Parser; +use fluss::client::FlussConnection; +use fluss::config::Config; +use fluss::error::Result; +use fluss::metadata::{DataTypes, Schema, TableDescriptor, TablePath}; +use fluss::row::{GenericRow, InternalRow}; + +#[tokio::main] +#[allow(dead_code)] +pub async fn main() -> Result<()> { + let mut config = Config::parse(); + config.bootstrap_servers = "127.0.0.1:9123".to_string(); + + let conn = FlussConnection::new(config).await?; + + // Schema: primary key is (user_id, session_id, event_seq); the bucket key + // (user_id, session_id) is a strict prefix of the primary key, which is + // what enables prefix lookup. + let table_descriptor = TableDescriptor::builder() + .schema( + Schema::builder() + .column("user_id", DataTypes::int()) + .column("session_id", DataTypes::string()) + .column("event_seq", DataTypes::bigint()) + .column("event_data", DataTypes::string()) + .primary_key(vec!["user_id", "session_id", "event_seq"]) + .build()?, + ) + .distributed_by( + Some(3), + vec!["user_id".to_string(), "session_id".to_string()], + ) + .build()?; + + let table_path = TablePath::new("fluss", "rust_prefix_lookup_example"); + + let admin = conn.get_admin()?; + admin + .create_table(&table_path, &table_descriptor, true) + .await?; + println!( + "Created KV Table:\n {}\n", + admin.get_table_info(&table_path).await? 
+ ); + + let table = conn.get_table(&table_path).await?; + let table_upsert = table.new_upsert()?; + let upsert_writer = table_upsert.create_writer()?; + + println!("\n=== Upserting session events ==="); + for (user_id, session_id, event_seq, event_data) in [ + (1, "sess-a", 1i64, "open"), + (1, "sess-a", 2, "click"), + (1, "sess-a", 3, "close"), + (1, "sess-b", 1, "open"), + (2, "sess-c", 1, "open"), + ] { + let mut row = GenericRow::new(4); + row.set_field(0, user_id); + row.set_field(1, session_id); + row.set_field(2, event_seq); + row.set_field(3, event_data); + upsert_writer.upsert(&row)?; + println!("Upserted: {row:?}"); + } + upsert_writer.flush().await?; + + println!("\n=== Prefix lookup by (user_id, session_id) ==="); + // `lookup_by` names the prefix columns. The resulting lookuper returns all + // rows whose primary key starts with the given prefix. + let mut prefix_lookuper = table + .new_lookup()? + .lookup_by(vec!["user_id".to_string(), "session_id".to_string()]) + .create_lookuper()?; + + for (user_id, session_id) in [ + (1, "sess-a"), + (1, "sess-b"), + (2, "sess-c"), + (2, "sess-missing"), + ] { + let result = prefix_lookuper + .lookup(&make_prefix(user_id, session_id)) + .await?; + let rows = result.get_rows()?; + println!( + "user_id={user_id}, session_id={session_id}: {} event(s)", + rows.len() + ); + for row in &rows { + println!(" seq={}, data={}", row.get_long(2)?, row.get_string(3)?); + } + } + + Ok(()) +} + +fn make_prefix(user_id: i32, session_id: &str) -> GenericRow<'static> { + let mut row = GenericRow::new(2); + row.set_field(0, user_id); + row.set_field(1, session_id.to_string()); + row +} diff --git a/fluss-rust/crates/examples/src/example_table.rs b/fluss-rust/crates/examples/src/example_table.rs new file mode 100644 index 0000000000..1f751f3c98 --- /dev/null +++ b/fluss-rust/crates/examples/src/example_table.rs @@ -0,0 +1,97 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#[cfg(not(target_env = "msvc"))] +#[global_allocator] +static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; + +mod example_kv_table; +mod example_partitioned_kv_table; + +use clap::Parser; +use fluss::client::FlussConnection; +use fluss::config::Config; +use fluss::error::Result; +use fluss::metadata::{DataTypes, Schema, TableDescriptor, TablePath}; +use fluss::row::{GenericRow, InternalRow}; +use std::time::Duration; + +#[tokio::main] +pub async fn main() -> Result<()> { + let mut config = Config::parse(); + config.bootstrap_servers = "127.0.0.1:9123".to_string(); + + let conn = FlussConnection::new(config).await?; + + let table_descriptor = TableDescriptor::builder() + .schema( + Schema::builder() + .column("c1", DataTypes::int()) + .column("c2", DataTypes::string()) + .column("c3", DataTypes::bigint()) + .build()?, + ) + .build()?; + + let table_path = TablePath::new("fluss", "rust_test_long"); + + let admin = conn.get_admin()?; + + admin + .create_table(&table_path, &table_descriptor, true) + .await?; + + // 2: get the table + let table_info = admin.get_table_info(&table_path).await?; + print!("Get created table:\n {table_info}\n"); + + // write row + let mut row = GenericRow::new(3); + row.set_field(0, 22222); + row.set_field(1, "t2t"); + 
row.set_field(2, 123_456_789_123i64); + + let table = conn.get_table(&table_path).await?; + let append_writer = table.new_append()?.create_writer()?; + // Fire-and-forget: queue writes then flush + append_writer.append(&row)?; + let mut row = GenericRow::new(3); + row.set_field(0, 233333); + row.set_field(1, "tt44"); + row.set_field(2, 987_654_321_987i64); + append_writer.append(&row)?; + append_writer.flush().await?; + + // scan rows + let log_scanner = table.new_scan().create_log_scanner()?; + log_scanner.subscribe(0, 0).await?; + + loop { + let scan_records = log_scanner.poll(Duration::from_secs(10)).await?; + println!("Start to poll records......"); + for record in scan_records { + let row = record.row(); + println!( + "{{{}, {}, {}}}@{}", + row.get_int(0)?, + row.get_string(1)?, + row.get_long(2)?, + record.offset() + ); + } + } +} diff --git a/fluss-rust/crates/fluss-test-cluster/Cargo.toml b/fluss-rust/crates/fluss-test-cluster/Cargo.toml new file mode 100644 index 0000000000..977df307b8 --- /dev/null +++ b/fluss-rust/crates/fluss-test-cluster/Cargo.toml @@ -0,0 +1,36 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +[package] +name = "fluss-test-cluster" +edition.workspace = true +version.workspace = true +license.workspace = true +rust-version.workspace = true +publish = false + +[[bin]] +name = "fluss-test-cluster" +path = "src/main.rs" + +[dependencies] +fluss = { workspace = true } +testcontainers = "0.27.2" +tokio = { workspace = true } +clap = { workspace = true } +serde = { workspace = true } +serde_json = { workspace = true } diff --git a/fluss-rust/crates/fluss-test-cluster/build.rs b/fluss-rust/crates/fluss-test-cluster/build.rs new file mode 100644 index 0000000000..0145196bc3 --- /dev/null +++ b/fluss-rust/crates/fluss-test-cluster/build.rs @@ -0,0 +1,32 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
/// Classifies one line of `test-images.env`.
///
/// Returns:
/// * `Ok(None)` for blank lines and `#` comments,
/// * `Ok(Some((key, value)))` for a `KEY=VALUE` entry, with both sides
///   trimmed; only the first `=` splits, so the value may contain `=`,
/// * `Err(line)` carrying the trimmed line when it has no `=` or its key is
///   empty.
fn parse_env_line(line: &str) -> Result<Option<(&str, &str)>, &str> {
    let line = line.trim();
    if line.is_empty() || line.starts_with('#') {
        return Ok(None);
    }
    match line.split_once('=') {
        Some((key, value)) if !key.trim().is_empty() => Ok(Some((key.trim(), value.trim()))),
        _ => Err(line),
    }
}

/// Build script: exports every `KEY=VALUE` pair of `test-images.env` to the
/// crate as a compile-time environment variable.
///
/// Malformed lines are reported through `cargo:warning` instead of being
/// dropped silently, so a typo in the file is visible at build time.
///
/// # Panics
///
/// Panics if `test-images.env` cannot be read; the message includes the
/// underlying I/O error.
fn main() {
    println!("cargo:rerun-if-changed=test-images.env");
    let contents = std::fs::read_to_string("test-images.env")
        .unwrap_or_else(|e| panic!("failed to read test-images.env: {e}"));

    for (index, line) in contents.lines().enumerate() {
        match parse_env_line(line) {
            Ok(Some((key, value))) => println!("cargo:rustc-env={}={}", key, value),
            Ok(None) => {}
            Err(bad) => println!(
                "cargo:warning=test-images.env:{}: ignoring malformed line `{}`",
                index + 1,
                bad
            ),
        }
    }
}
+ +use fluss::client::FlussConnection; +use fluss::config::Config; +use std::collections::HashMap; +use std::mem::ManuallyDrop; +use std::sync::Arc; +use std::time::Duration; +use testcontainers::core::ContainerPort; +use testcontainers::runners::AsyncRunner; +use testcontainers::{ContainerAsync, GenericImage, ImageExt}; + +pub const FLUSS_IMAGE: &str = env!("FLUSS_IMAGE"); +pub const FLUSS_VERSION: &str = env!("FLUSS_VERSION"); +pub const ZOOKEEPER_IMAGE: &str = env!("ZOOKEEPER_IMAGE"); +pub const ZOOKEEPER_VERSION: &str = env!("ZOOKEEPER_VERSION"); + +#[derive(serde::Serialize, serde::Deserialize, Debug)] +pub struct ClusterInfo { + pub bootstrap_servers: String, + pub sasl_bootstrap_servers: Option, +} + +pub struct FlussTestingClusterBuilder { + number_of_tablet_servers: u16, + network: &'static str, + cluster_conf: HashMap, + testing_name: String, + remote_data_dir: Option, + sasl_enabled: bool, + sasl_users: Vec<(String, String)>, + coordinator_host_port: u16, + plain_client_port: Option, + image: String, + image_tag: String, +} + +impl FlussTestingClusterBuilder { + pub fn new(testing_name: impl Into) -> Self { + Self::new_with_cluster_conf(testing_name.into(), &HashMap::default()) + } + + pub fn with_remote_data_dir(mut self, dir: std::path::PathBuf) -> Self { + std::fs::create_dir_all(&dir).expect("Failed to create remote data directory"); + self.remote_data_dir = Some(dir); + self + } + + pub fn with_sasl(mut self, users: Vec<(String, String)>) -> Self { + self.sasl_enabled = true; + self.sasl_users = users; + self.plain_client_port = Some(self.coordinator_host_port + 100); + self + } + + pub fn with_port(mut self, port: u16) -> Self { + self.coordinator_host_port = port; + // Re-derive SASL port if SASL was already enabled. 
+ if self.sasl_enabled { + self.plain_client_port = Some(port + 100); + } + self + } + + pub fn new_with_cluster_conf( + testing_name: impl Into, + conf: &HashMap, + ) -> Self { + let mut cluster_conf = conf.clone(); + cluster_conf.insert( + "netty.server.num-network-threads".to_string(), + "1".to_string(), + ); + cluster_conf.insert( + "netty.server.num-worker-threads".to_string(), + "3".to_string(), + ); + + FlussTestingClusterBuilder { + number_of_tablet_servers: 1, + cluster_conf, + network: "fluss-cluster-network", + testing_name: testing_name.into(), + remote_data_dir: None, + sasl_enabled: false, + sasl_users: Vec::new(), + coordinator_host_port: 9123, + plain_client_port: None, + // runtime env overrides the compile-time default (server-compat CI lane) + image: std::env::var("FLUSS_IMAGE").unwrap_or_else(|_| FLUSS_IMAGE.to_string()), + image_tag: std::env::var("FLUSS_VERSION").unwrap_or_else(|_| FLUSS_VERSION.to_string()), + } + } + + fn tablet_server_container_name(&self, server_id: u16) -> String { + format!("tablet-server-{}-{}", self.testing_name, server_id) + } + + fn coordinator_server_container_name(&self) -> String { + format!("coordinator-server-{}", self.testing_name) + } + + fn zookeeper_container_name(&self) -> String { + format!("zookeeper-{}", self.testing_name) + } + + fn container_names(&self) -> Vec { + std::iter::once(self.zookeeper_container_name()) + .chain(std::iter::once(self.coordinator_server_container_name())) + .chain( + (0..self.number_of_tablet_servers).map(|id| self.tablet_server_container_name(id)), + ) + .collect() + } + + fn inject_sasl_conf(&mut self) { + if self.sasl_enabled + && !self.sasl_users.is_empty() + && !self.cluster_conf.contains_key("security.protocol.map") + { + self.cluster_conf.insert( + "security.protocol.map".to_string(), + "CLIENT:sasl".to_string(), + ); + self.cluster_conf.insert( + "security.sasl.enabled.mechanisms".to_string(), + "plain".to_string(), + ); + let user_entries: Vec = self + .sasl_users + 
.iter() + .map(|(u, p)| format!("user_{}=\"{}\"", u, p)) + .collect(); + let jaas_config = format!( + "org.apache.fluss.security.auth.sasl.plain.PlainLoginModule required {};", + user_entries.join(" ") + ); + self.cluster_conf + .insert("security.sasl.plain.jaas.config".to_string(), jaas_config); + } + } + + fn bootstrap_addresses(&self) -> (String, Option) { + if let Some(plain_port) = self.plain_client_port { + ( + format!("127.0.0.1:{}", plain_port), + Some(format!("127.0.0.1:{}", self.coordinator_host_port)), + ) + } else { + (format!("127.0.0.1:{}", self.coordinator_host_port), None) + } + } + + fn all_containers_exist(&self) -> bool { + self.container_names().iter().all(|name| { + std::process::Command::new("docker") + .args(["ps", "-q", "--filter", &format!("name=^{}$", name)]) + .output() + .map(|o| !String::from_utf8_lossy(&o.stdout).trim().is_empty()) + .unwrap_or(false) + }) + } + + async fn start_all_containers(&mut self) -> Vec> { + for name in &self.container_names() { + let _ = std::process::Command::new("docker") + .args(["rm", "-f", name]) + .output(); + } + self.inject_sasl_conf(); + + let mut containers = Vec::new(); + containers.push(self.start_zookeeper().await); + containers.push(self.start_coordinator_server().await); + for server_id in 0..self.number_of_tablet_servers { + containers.push(self.start_tablet_server(server_id).await); + } + containers + } + + /// Containers stop when the returned struct is dropped. 
+ pub async fn build(&mut self) -> FlussTestingCluster { + let container_names = self.container_names(); + let containers = self.start_all_containers().await; + + let mut iter = containers.into_iter(); + let zookeeper = Arc::new(iter.next().unwrap()); + let coordinator_server = Arc::new(iter.next().unwrap()); + let mut tablet_servers = HashMap::new(); + for server_id in 0..self.number_of_tablet_servers { + tablet_servers.insert(server_id, Arc::new(iter.next().unwrap())); + } + + let (bootstrap_servers, sasl_bootstrap_servers) = self.bootstrap_addresses(); + + FlussTestingCluster { + zookeeper, + coordinator_server, + tablet_servers, + bootstrap_servers, + sasl_bootstrap_servers, + remote_data_dir: self.remote_data_dir.clone(), + sasl_users: self.sasl_users.clone(), + container_names, + } + } + + /// Containers outlive the process. Clean up via `stop_cluster()`. + /// Idempotent: if the cluster is already running, returns its info. + pub async fn build_detached(&mut self) -> ClusterInfo { + if !self.all_containers_exist() { + let containers = self.start_all_containers().await; + let _ = ManuallyDrop::new(containers); + } + + let (bootstrap_servers, sasl_bootstrap_servers) = self.bootstrap_addresses(); + ClusterInfo { + bootstrap_servers, + sasl_bootstrap_servers, + } + } + + async fn start_zookeeper(&self) -> ContainerAsync { + GenericImage::new(ZOOKEEPER_IMAGE, ZOOKEEPER_VERSION) + .with_network(self.network) + .with_container_name(self.zookeeper_container_name()) + .start() + .await + .unwrap() + } + + async fn start_coordinator_server(&mut self) -> ContainerAsync { + let port = self.coordinator_host_port; + let container_name = self.coordinator_server_container_name(); + let mut coordinator_confs = HashMap::new(); + coordinator_confs.insert( + "zookeeper.address", + format!("{}:2181", self.zookeeper_container_name()), + ); + + if let Some(plain_port) = self.plain_client_port { + coordinator_confs.insert( + "bind.listeners", + format!( + "INTERNAL://{}:0, 
CLIENT://{}:{}, PLAIN_CLIENT://{}:{}", + container_name, container_name, port, container_name, plain_port + ), + ); + coordinator_confs.insert( + "advertised.listeners", + format!( + "CLIENT://localhost:{}, PLAIN_CLIENT://localhost:{}", + port, plain_port + ), + ); + } else { + coordinator_confs.insert( + "bind.listeners", + format!( + "INTERNAL://{}:0, CLIENT://{}:{}", + container_name, container_name, port + ), + ); + coordinator_confs.insert( + "advertised.listeners", + format!("CLIENT://localhost:{}", port), + ); + } + + coordinator_confs.insert("internal.listener.name", "INTERNAL".to_string()); + + let mut image = GenericImage::new(&self.image, &self.image_tag) + .with_container_name(self.coordinator_server_container_name()) + .with_mapped_port(port, ContainerPort::Tcp(port)) + .with_network(self.network) + .with_cmd(vec!["coordinatorServer"]) + .with_env_var( + "FLUSS_PROPERTIES", + self.to_fluss_properties_with(coordinator_confs), + ); + + if let Some(plain_port) = self.plain_client_port { + image = image.with_mapped_port(plain_port, ContainerPort::Tcp(plain_port)); + } + + image.start().await.unwrap() + } + + async fn start_tablet_server(&self, server_id: u16) -> ContainerAsync { + let port = self.coordinator_host_port; + let container_name = self.tablet_server_container_name(server_id); + let mut tablet_server_confs = HashMap::new(); + let expose_host_port = port + 1 + server_id; + let tablet_server_id = format!("{}", server_id); + + if let Some(plain_port) = self.plain_client_port { + let bind_listeners = format!( + "INTERNAL://{}:0, CLIENT://{}:{}, PLAIN_CLIENT://{}:{}", + container_name, container_name, port, container_name, plain_port, + ); + let plain_expose_host_port = plain_port + 1 + server_id; + let advertised_listeners = format!( + "CLIENT://localhost:{}, PLAIN_CLIENT://localhost:{}", + expose_host_port, plain_expose_host_port + ); + tablet_server_confs.insert("bind.listeners", bind_listeners); + tablet_server_confs.insert("advertised.listeners", 
advertised_listeners); + } else { + let bind_listeners = format!( + "INTERNAL://{}:0, CLIENT://{}:{}", + container_name, container_name, port, + ); + let advertised_listeners = format!("CLIENT://localhost:{}", expose_host_port); + tablet_server_confs.insert("bind.listeners", bind_listeners); + tablet_server_confs.insert("advertised.listeners", advertised_listeners); + } + + tablet_server_confs.insert( + "zookeeper.address", + format!("{}:2181", self.zookeeper_container_name()), + ); + tablet_server_confs.insert("internal.listener.name", "INTERNAL".to_string()); + tablet_server_confs.insert("tablet-server.id", tablet_server_id); + + if let Some(remote_data_dir) = &self.remote_data_dir { + tablet_server_confs.insert( + "remote.data.dir", + remote_data_dir.to_string_lossy().to_string(), + ); + } + let mut image = GenericImage::new(&self.image, &self.image_tag) + .with_cmd(vec!["tabletServer"]) + .with_mapped_port(expose_host_port, ContainerPort::Tcp(port)) + .with_network(self.network) + .with_container_name(self.tablet_server_container_name(server_id)) + .with_env_var( + "FLUSS_PROPERTIES", + self.to_fluss_properties_with(tablet_server_confs), + ); + + if let Some(plain_port) = self.plain_client_port { + let plain_expose_host_port = plain_port + 1 + server_id; + image = image.with_mapped_port(plain_expose_host_port, ContainerPort::Tcp(plain_port)); + } + + if let Some(ref remote_data_dir) = self.remote_data_dir { + use testcontainers::core::Mount; + std::fs::create_dir_all(remote_data_dir) + .expect("Failed to create remote data directory for mount"); + let host_path = remote_data_dir.to_string_lossy().to_string(); + let container_path = remote_data_dir.to_string_lossy().to_string(); + image = image.with_mount(Mount::bind_mount(host_path, container_path)); + } + + image.start().await.unwrap() + } + + fn to_fluss_properties_with(&self, extra_properties: HashMap<&str, String>) -> String { + let mut fluss_properties = Vec::new(); + for (k, v) in self.cluster_conf.iter() 
{ + fluss_properties.push(format!("{}: {}", k, v)); + } + for (k, v) in extra_properties.iter() { + fluss_properties.push(format!("{}: {}", k, v)); + } + fluss_properties.join("\n") + } +} + +#[derive(Clone)] +#[allow(dead_code)] // Fields held for RAII. +pub struct FlussTestingCluster { + zookeeper: Arc>, + coordinator_server: Arc>, + tablet_servers: HashMap>>, + bootstrap_servers: String, + sasl_bootstrap_servers: Option, + remote_data_dir: Option, + sasl_users: Vec<(String, String)>, + container_names: Vec, +} + +impl FlussTestingCluster { + pub fn stop(&self) { + for name in &self.container_names { + let _ = std::process::Command::new("docker") + .args(["rm", "-f", name]) + .output(); + } + if let Some(ref dir) = self.remote_data_dir { + let _ = std::fs::remove_dir_all(dir); + } + } + + pub fn sasl_users(&self) -> &[(String, String)] { + &self.sasl_users + } + + pub fn plaintext_bootstrap_servers(&self) -> &str { + &self.bootstrap_servers + } + + pub async fn get_fluss_connection(&self) -> FlussConnection { + let config = Config { + writer_acks: "all".to_string(), + bootstrap_servers: self.bootstrap_servers.clone(), + ..Default::default() + }; + + self.connect_with_retry(config).await + } + + pub async fn get_fluss_connection_with_sasl( + &self, + username: &str, + password: &str, + ) -> FlussConnection { + let bootstrap = self + .sasl_bootstrap_servers + .clone() + .unwrap_or_else(|| self.bootstrap_servers.clone()); + let config = Config { + writer_acks: "all".to_string(), + bootstrap_servers: bootstrap, + security_protocol: "sasl".to_string(), + security_sasl_mechanism: "PLAIN".to_string(), + security_sasl_username: username.to_string(), + security_sasl_password: password.to_string(), + ..Default::default() + }; + + self.connect_with_retry(config).await + } + + pub async fn try_fluss_connection_with_sasl( + &self, + username: &str, + password: &str, + ) -> fluss::error::Result { + let bootstrap = self + .sasl_bootstrap_servers + .clone() + .unwrap_or_else(|| 
self.bootstrap_servers.clone()); + let config = Config { + writer_acks: "all".to_string(), + bootstrap_servers: bootstrap, + security_protocol: "sasl".to_string(), + security_sasl_mechanism: "PLAIN".to_string(), + security_sasl_username: username.to_string(), + security_sasl_password: password.to_string(), + ..Default::default() + }; + + FlussConnection::new(config).await + } + + async fn connect_with_retry(&self, config: Config) -> FlussConnection { + let max_retries = 60; + let retry_interval = Duration::from_secs(1); + + for attempt in 1..=max_retries { + match FlussConnection::new(config.clone()).await { + Ok(connection) => { + return connection; + } + Err(e) => { + if attempt == max_retries { + panic!( + "Failed to connect to Fluss cluster after {} attempts: {}", + max_retries, e + ); + } + tokio::time::sleep(retry_interval).await; + } + } + } + unreachable!() + } +} + +pub fn stop_cluster(name: &str) { + let prefixes = [ + format!("zookeeper-{}", name), + format!("coordinator-server-{}", name), + format!("tablet-server-{}-", name), + ]; + for prefix in &prefixes { + if let Ok(output) = std::process::Command::new("docker") + .args(["ps", "-aq", "--filter", &format!("name={}", prefix)]) + .output() + { + let ids = String::from_utf8_lossy(&output.stdout); + for id in ids.split_whitespace() { + let _ = std::process::Command::new("docker") + .args(["rm", "-f", id]) + .output(); + } + } + } +} diff --git a/fluss-rust/crates/fluss-test-cluster/src/main.rs b/fluss-rust/crates/fluss-test-cluster/src/main.rs new file mode 100644 index 0000000000..fc3a19f60e --- /dev/null +++ b/fluss-rust/crates/fluss-test-cluster/src/main.rs @@ -0,0 +1,103 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use clap::{Parser, Subcommand}; +use fluss::ServerType; +use fluss::config::Config; +use fluss_test_cluster::FlussTestingClusterBuilder; +use std::time::Duration; + +#[derive(Parser)] +#[command(about = "Manage a Fluss test cluster via testcontainers")] +struct Cli { + #[command(subcommand)] + command: Command, +} + +#[derive(Subcommand)] +enum Command { + /// Start a Fluss test cluster (idempotent). Prints cluster info as JSON to stdout. + Start { + #[arg(long, default_value = "shared-test")] + name: String, + #[arg(long)] + sasl: bool, + #[arg(long, default_value_t = 9123)] + port: u16, + }, + /// Stop and remove all containers for a cluster. + Stop { + #[arg(long, default_value = "shared-test")] + name: String, + }, +} + +#[tokio::main] +async fn main() { + let cli = Cli::parse(); + + match cli.command { + Command::Start { name, sasl, port } => { + eprintln!("Starting Fluss test cluster '{}'...", name); + + let mut builder = FlussTestingClusterBuilder::new(&name).with_port(port); + + if sasl { + builder = builder.with_sasl(vec![ + ("admin".to_string(), "admin-secret".to_string()), + ("alice".to_string(), "alice-secret".to_string()), + ]); + } + + let info = builder.build_detached().await; + let start = std::time::Instant::now(); + + // Check plaintext endpoint only — can't verify SASL without credentials. 
+ eprintln!("Waiting for cluster to be ready..."); + loop { + let config = Config { + bootstrap_servers: info.bootstrap_servers.clone(), + ..Default::default() + }; + if let Ok(conn) = fluss::client::FlussConnection::new(config).await { + if let Ok(admin) = conn.get_admin() { + if let Ok(nodes) = admin.get_server_nodes().await { + if nodes + .iter() + .any(|n| *n.server_type() == ServerType::TabletServer) + { + break; + } + } + } + } + if start.elapsed() >= Duration::from_secs(60) { + eprintln!("TIMEOUT: cluster did not become ready within 60s"); + std::process::exit(1); + } + tokio::time::sleep(Duration::from_secs(1)).await; + } + eprintln!("Cluster ready."); + println!("CLUSTER_JSON: {}", serde_json::to_string(&info).unwrap()); + } + Command::Stop { name } => { + eprintln!("Stopping Fluss test cluster '{}'...", name); + fluss_test_cluster::stop_cluster(&name); + eprintln!("Cluster stopped."); + } + } +} diff --git a/fluss-rust/crates/fluss-test-cluster/test-images.env b/fluss-rust/crates/fluss-test-cluster/test-images.env new file mode 100644 index 0000000000..5cd914172c --- /dev/null +++ b/fluss-rust/crates/fluss-test-cluster/test-images.env @@ -0,0 +1,4 @@ +FLUSS_IMAGE=apache/fluss +FLUSS_VERSION=0.9.1-incubating +ZOOKEEPER_IMAGE=zookeeper +ZOOKEEPER_VERSION=3.9.2 diff --git a/fluss-rust/crates/fluss/Cargo.toml b/fluss-rust/crates/fluss/Cargo.toml new file mode 100644 index 0000000000..feac8309f1 --- /dev/null +++ b/fluss-rust/crates/fluss/Cargo.toml @@ -0,0 +1,87 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +[package] +edition = { workspace = true } +license.workspace = true +rust-version = { workspace = true } +version = { workspace = true } +name = "fluss-rs" +authors = { workspace = true } +description = "The official rust client of Apache Fluss (Incubating)" +homepage = "https://clients.fluss.apache.org/user-guide/rust/installation/" +repository = { workspace = true } +keywords = { workspace = true } +categories = { workspace = true } +documentation = "https://docs.rs/fluss-rs" + +[lib] +name = "fluss" + +[features] +default = ["storage-memory", "storage-fs"] +storage-all = ["storage-memory", "storage-fs", "storage-s3", "storage-oss"] + +storage-memory = ["opendal/services-memory"] +storage-fs = ["opendal/services-fs"] +storage-s3 = ["opendal/services-s3"] +storage-oss = ["opendal/services-oss"] +integration_tests = [] + +[dependencies] +arrow = { workspace = true } +arrow-schema = "57.0.0" +bitvec = "1" +byteorder = "1.5" +futures = "0.3" +clap = { workspace = true } +crc32c = "0.6.8" +linked-hash-map = "0.5.6" +prost = "0.14" +rand = "0.9.3" +serde = { workspace = true, features = ["rc"] } +serde_json = { workspace = true } +thiserror = "1.0" +log = { version = "0.4", features = ["kv_std"] } +metrics = { workspace = true } +tokio = { workspace = true } +parking_lot = "0.12" +bytes = "1.10.1" +dashmap = "6.1.0" +bigdecimal = { workspace = true, features = ["serde"] } +ordered-float = { version = "5", features = ["serde"] } +parse-display = "0.10" +jiff = { workspace = true } +opendal = "0.55.0" +url = "2.5.7" +uuid = { version = "1.10", features = 
["v4"] } +tempfile = "3.23.0" +snafu = "0.8.3" +scopeguard = "1.2.0" +delegate = "0.13.5" +strum = "0.26" +strum_macros = "0.26" + +[target.'cfg(target_arch = "wasm32")'.dependencies] +jiff = { workspace = true, features = ["js"] } + +[dev-dependencies] +metrics-util = "0.20" +fluss-test-cluster = { path = "../fluss-test-cluster" } + +[build-dependencies] +prost-build = "0.14" diff --git a/fluss-rust/crates/fluss/DEPENDENCIES.rust.tsv b/fluss-rust/crates/fluss/DEPENDENCIES.rust.tsv new file mode 100644 index 0000000000..85a865852a --- /dev/null +++ b/fluss-rust/crates/fluss/DEPENDENCIES.rust.tsv @@ -0,0 +1,297 @@ +crate Apache-2.0 Apache-2.0 WITH LLVM-exception BSD-2-Clause BSD-3-Clause BSL-1.0 CC0-1.0 CDLA-Permissive-2.0 ISC LGPL-2.1-or-later MIT Unicode-3.0 Unlicense Zlib +ahash@0.8.12 X X +aho-corasick@1.1.4 X X +android_system_properties@0.1.5 X X +anstream@1.0.0 X X +anstyle@1.0.14 X X +anstyle-parse@1.0.0 X X +anstyle-query@1.1.5 X X +anstyle-wincon@3.0.11 X X +anyhow@1.0.102 X X +arrow@57.3.0 X +arrow-arith@57.3.0 X +arrow-array@57.3.0 X +arrow-buffer@57.3.0 X +arrow-cast@57.3.0 X +arrow-csv@57.3.0 X +arrow-data@57.3.0 X +arrow-ipc@57.3.0 X +arrow-json@57.3.0 X +arrow-ord@57.3.0 X +arrow-row@57.3.0 X +arrow-schema@57.3.0 X +arrow-select@57.3.0 X +arrow-string@57.3.0 X +async-trait@0.1.89 X X +atoi@2.0.0 X +atomic-waker@1.1.2 X X +autocfg@1.5.0 X X +backon@1.6.0 X +base64@0.22.1 X X +bigdecimal@0.4.10 X X +bitflags@2.11.0 X X +bitvec@1.0.1 X +block-buffer@0.10.4 X X +bumpalo@3.20.2 X X +byteorder@1.5.0 X X +bytes@1.11.1 X +cc@1.2.57 X X +cfg-if@1.0.4 X X +chrono@0.4.44 X X +clap@4.6.0 X X +clap_builder@4.6.0 X X +clap_derive@4.6.0 X X +clap_lex@1.1.0 X X +colorchoice@1.0.5 X X +const-oid@0.9.6 X X +const-random@0.1.18 X X +const-random-macro@0.1.16 X X +core-foundation-sys@0.8.7 X X +cpufeatures@0.2.17 X X +crc32c@0.6.8 X X +crossbeam-utils@0.8.21 X X +crunchy@0.2.4 X +crypto-common@0.1.7 X X +csv@1.4.0 X X +csv-core@0.1.13 X X +dashmap@6.1.0 X 
+delegate@0.13.5 X X +digest@0.10.7 X X +displaydoc@0.2.5 X X +either@1.15.0 X X +equivalent@1.0.2 X X +errno@0.3.14 X X +fastrand@2.3.0 X X +find-msvc-tools@0.1.9 X X +fixedbitset@0.5.7 X X +flatbuffers@25.12.19 X +fluss-rs@0.1.0 X +fnv@1.0.7 X X +foldhash@0.1.5 X +form_urlencoded@1.2.2 X X +funty@2.0.0 X +futures@0.3.32 X X +futures-channel@0.3.32 X X +futures-core@0.3.32 X X +futures-executor@0.3.32 X X +futures-io@0.3.32 X X +futures-macro@0.3.32 X X +futures-sink@0.3.32 X X +futures-task@0.3.32 X X +futures-util@0.3.32 X X +generic-array@0.14.7 X +getrandom@0.2.17 X X +getrandom@0.3.4 X X +getrandom@0.4.2 X X +gloo-timers@0.3.0 X X +h2@0.4.13 X +half@2.7.1 X X +hashbrown@0.14.5 X X +hashbrown@0.15.5 X X +hashbrown@0.16.1 X X +heck@0.5.0 X X +hex@0.4.3 X X +hmac@0.12.1 X X +home@0.5.12 X X +http@1.4.0 X X +http-body@1.0.1 X +http-body-util@0.1.3 X +httparse@1.10.1 X X +httpdate@1.0.3 X X +hyper@1.8.1 X +hyper-rustls@0.27.7 X X X +hyper-util@0.1.20 X +iana-time-zone@0.1.65 X X +iana-time-zone-haiku@0.1.2 X X +icu_collections@2.1.1 X +icu_locale_core@2.1.1 X +icu_normalizer@2.1.1 X +icu_normalizer_data@2.1.1 X +icu_properties@2.1.2 X +icu_properties_data@2.1.2 X +icu_provider@2.1.1 X +idna@1.1.0 X X +idna_adapter@1.2.1 X X +indexmap@2.13.0 X X +ipnet@2.12.0 X X +iri-string@0.7.11 X X +is_terminal_polyfill@1.70.2 X X +itertools@0.14.0 X X +itoa@1.0.18 X X +jiff@0.2.23 X X +jiff-tzdb@0.1.6 X X +jiff-tzdb-platform@0.1.3 X X +jobserver@0.1.34 X X +js-sys@0.3.91 X X +lexical-core@1.0.6 X X +lexical-parse-float@1.0.6 X X +lexical-parse-integer@1.0.6 X X +lexical-util@1.0.7 X X +lexical-write-float@1.0.6 X X +lexical-write-integer@1.0.6 X X +libc@0.2.183 X X +libm@0.2.16 X +linked-hash-map@0.5.6 X X +linux-raw-sys@0.12.1 X X X +litemap@0.8.1 X +lock_api@0.4.14 X X +log@0.4.29 X X +lz4_flex@0.12.1 X +md-5@0.10.6 X X +memchr@2.8.0 X X +mio@1.1.1 X +multimap@0.10.1 X X +num-bigint@0.4.6 X X +num-complex@0.4.6 X X +num-integer@0.1.46 X X +num-traits@0.2.19 X X 
+once_cell@1.21.4 X X +once_cell_polyfill@1.70.2 X X +opendal@0.55.0 X +ordered-float@5.1.0 X +parking_lot@0.12.5 X X +parking_lot_core@0.9.12 X X +parse-display@0.10.0 X X +parse-display-derive@0.10.0 X X +percent-encoding@2.3.2 X X +petgraph@0.8.3 X X +pin-project-lite@0.2.17 X X +pin-utils@0.1.0 X X +pkg-config@0.3.32 X X +portable-atomic@1.13.1 X X +portable-atomic-util@0.2.6 X X +potential_utf@0.1.4 X +ppv-lite86@0.2.21 X X +prettyplease@0.2.37 X X +proc-macro2@1.0.106 X X +prost@0.14.3 X +prost-build@0.14.3 X +prost-derive@0.14.3 X +prost-types@0.14.3 X +quick-xml@0.37.5 X +quick-xml@0.38.4 X +quote@1.0.45 X X +r-efi@5.3.0 X X X +r-efi@6.0.0 X X X +radium@0.7.0 X +rand@0.8.5 X X +rand@0.9.2 X X +rand_chacha@0.3.1 X X +rand_chacha@0.9.0 X X +rand_core@0.6.4 X X +rand_core@0.9.5 X X +redox_syscall@0.5.18 X +regex@1.12.3 X X +regex-automata@0.4.14 X X +regex-syntax@0.8.10 X X +reqsign@0.16.5 X +reqwest@0.12.28 X X +ring@0.17.14 X X +rustc_version@0.4.1 X X +rustix@1.1.4 X X X +rustls@0.23.37 X X X +rustls-pki-types@1.14.0 X X +rustls-webpki@0.103.10 X +rustversion@1.0.22 X X +ryu@1.0.23 X X +scopeguard@1.2.0 X X +semver@1.0.27 X X +serde@1.0.228 X X +serde_core@1.0.228 X X +serde_derive@1.0.228 X X +serde_json@1.0.149 X X +serde_urlencoded@0.7.1 X X +sha1@0.10.6 X X +sha2@0.10.9 X X +shlex@1.3.0 X X +signal-hook-registry@1.4.8 X X +simdutf8@0.1.5 X X +slab@0.4.12 X +smallvec@1.15.1 X X +snafu@0.8.9 X X +snafu-derive@0.8.9 X X +socket2@0.6.3 X X +stable_deref_trait@1.2.1 X X +strsim@0.11.1 X +structmeta@0.3.0 X X +structmeta-derive@0.3.0 X X +strum@0.26.3 X +strum_macros@0.26.4 X +subtle@2.6.1 X +syn@2.0.117 X X +sync_wrapper@1.0.2 X +synstructure@0.13.2 X +tap@1.0.1 X +tempfile@3.27.0 X X +thiserror@1.0.69 X X +thiserror-impl@1.0.69 X X +tiny-keccak@2.0.2 X +tinystr@0.8.2 X +tokio@1.50.0 X +tokio-macros@2.6.1 X +tokio-rustls@0.26.4 X X +tokio-util@0.7.18 X +tower@0.5.3 X +tower-http@0.6.8 X +tower-layer@0.3.3 X +tower-service@0.3.3 X +tracing@0.1.44 X 
+tracing-attributes@0.1.31 X +tracing-core@0.1.36 X +try-lock@0.2.5 X +twox-hash@2.1.2 X +typenum@1.19.0 X X +unicode-ident@1.0.24 X X X +untrusted@0.9.0 X +url@2.5.8 X X +utf8_iter@1.0.4 X X +utf8parse@0.2.2 X X +uuid@1.22.0 X X +value-bag@1.12.0 X X +version_check@0.9.5 X X +want@0.3.1 X +wasi@0.11.1+wasi-snapshot-preview1 X X X +wasip2@1.0.2+wasi-0.2.9 X X X +wasip3@0.4.0+wasi-0.3.0-rc-2026-01-06 X X X +wasm-bindgen@0.2.114 X X +wasm-bindgen-futures@0.4.64 X X +wasm-bindgen-macro@0.2.114 X X +wasm-bindgen-macro-support@0.2.114 X X +wasm-bindgen-shared@0.2.114 X X +wasm-streams@0.4.2 X X +web-sys@0.3.91 X X +webpki-roots@1.0.6 X +windows-core@0.62.2 X X +windows-implement@0.60.2 X X +windows-interface@0.59.3 X X +windows-link@0.2.1 X X +windows-result@0.4.1 X X +windows-strings@0.5.1 X X +windows-sys@0.52.0 X X +windows-sys@0.61.2 X X +windows-targets@0.52.6 X X +windows_aarch64_gnullvm@0.52.6 X X +windows_aarch64_msvc@0.52.6 X X +windows_i686_gnu@0.52.6 X X +windows_i686_gnullvm@0.52.6 X X +windows_i686_msvc@0.52.6 X X +windows_x86_64_gnu@0.52.6 X X +windows_x86_64_gnullvm@0.52.6 X X +windows_x86_64_msvc@0.52.6 X X +wit-bindgen@0.51.0 X X X +writeable@0.6.2 X +wyz@0.5.1 X +yoke@0.8.1 X +yoke-derive@0.8.1 X +zerocopy@0.8.47 X X X +zerocopy-derive@0.8.47 X X X +zerofrom@0.1.6 X +zerofrom-derive@0.1.6 X +zeroize@1.8.2 X X +zerotrie@0.2.3 X +zerovec@0.11.5 X +zerovec-derive@0.11.2 X +zmij@1.0.21 X +zstd@0.13.3 X +zstd-safe@7.2.4 X X +zstd-sys@2.0.16+zstd.1.5.7 X X diff --git a/fluss-rust/crates/fluss/README.md b/fluss-rust/crates/fluss/README.md new file mode 100644 index 0000000000..76dc0ec293 --- /dev/null +++ b/fluss-rust/crates/fluss/README.md @@ -0,0 +1,105 @@ +# Apache Fluss (Incubating) Official Rust Client + +Official Rust client library for [Apache Fluss (Incubating)](https://fluss.apache.org/). 
+ +[![crates.io](https://img.shields.io/crates/v/fluss-rs.svg)](https://crates.io/crates/fluss-rs) +[![docs.rs](https://img.shields.io/docsrs/fluss-rs)](https://docs.rs/fluss-rs/) + +## Usage + +The following example shows both **primary key (KV) tables** and **log tables** in one flow: connect, create a KV table (upsert + lookup), then create a log table (append + scan). + +```rust +use fluss::client::EARLIEST_OFFSET; +use fluss::client::FlussConnection; +use fluss::config::Config; +use fluss::error::Result; +use fluss::metadata::{DataTypes, Schema, TableDescriptor, TablePath}; +use fluss::row::{GenericRow, InternalRow}; +use std::time::Duration; + +#[tokio::main] +async fn main() -> Result<()> { + let mut config = Config::default(); + config.bootstrap_servers = "127.0.0.1:9123".to_string(); + let connection = FlussConnection::new(config).await?; + let admin = connection.get_admin()?; + + // ---- Primary key (KV) table: upsert and lookup ---- + let kv_path = TablePath::new("fluss", "users"); + let mut kv_schema = Schema::builder() + .column("id", DataTypes::int()) + .column("name", DataTypes::string()) + .column("age", DataTypes::bigint()) + .primary_key(vec!["id"]); + let kv_descriptor = TableDescriptor::builder() + .schema(kv_schema.build()?) + .build()?; + admin.create_table(&kv_path, &kv_descriptor, false).await?; + + let kv_table = connection.get_table(&kv_path).await?; + let upsert_writer = kv_table.new_upsert()?.create_writer()?; + let mut row = GenericRow::new(3); + row.set_field(0, 1i32); + row.set_field(1, "Alice"); + row.set_field(2, 30i64); + upsert_writer.upsert(&row)?; + upsert_writer.flush().await?; + + let mut lookuper = kv_table.new_lookup()?.create_lookuper()?; + let mut key = GenericRow::new(1); + key.set_field(0, 1i32); + let result = lookuper.lookup(&key).await?; + if let Some(r) = result.get_single_row()? 
{ + println!("KV lookup: id={}, name={}, age={}", + r.get_int(0)?, r.get_string(1)?, r.get_long(2)?); + } + + // ---- Log table: append and scan ---- + let log_path = TablePath::new("fluss", "events"); + let log_schema = Schema::builder() + .column("ts", DataTypes::bigint()) + .column("message", DataTypes::string()) + .build()?; + let log_descriptor = TableDescriptor::builder() + .schema(log_schema) + .build()?; + admin.create_table(&log_path, &log_descriptor, false).await?; + + let log_table = connection.get_table(&log_path).await?; + let append_writer = log_table.new_append()?.create_writer()?; + let mut event = GenericRow::new(2); + event.set_field(0, 1700000000i64); + event.set_field(1, "hello"); + append_writer.append(&event)?; + append_writer.flush().await?; + + let scanner = log_table.new_scan().create_log_scanner()?; + scanner.subscribe(0, EARLIEST_OFFSET).await?; + let scan_records = scanner.poll(Duration::from_secs(1)).await?; + for record in scan_records { + let r = record.row(); + println!("Log scan: ts={}, message={}", r.get_long(0)?, r.get_string(1)?); + } + + Ok(()) +} +``` + +## Storage Support + +The Fluss client reads remote data by accessing Fluss’s **remote files** (e.g. log segments and snapshots) directly. The following **remote file systems** are supported; enable the matching feature(s) for your deployment: + +| Storage Backend | Feature Flag | Status | Description | +|----------------|--------------|--------|-------------| +| Local Filesystem | `storage-fs` | ✅ Stable | Local filesystem storage | +| Amazon S3 | `storage-s3` | ✅ Stable | Amazon S3 storage | +| Alibaba Cloud OSS | `storage-oss` | ✅ Stable | Alibaba Cloud Object Storage Service | + +You can enable all storage backends at once using the `storage-all` feature flag. 
+ +Example usage in Cargo.toml: +```toml +[dependencies] +fluss-rs = { version = "0.x.x", features = ["storage-s3", "storage-fs"] } +``` diff --git a/fluss-rust/crates/fluss/build.rs b/fluss-rust/crates/fluss/build.rs new file mode 100644 index 0000000000..65d58e3592 --- /dev/null +++ b/fluss-rust/crates/fluss/build.rs @@ -0,0 +1,41 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::io::Result; +use std::path::Path; + +fn main() -> Result<()> { + let mut config = prost_build::Config::new(); + config.bytes([ + ".fluss.PbProduceLogReqForBucket.records", + ".fluss.PbPutKvReqForBucket.records", + ".fluss.PbLookupReqForBucket.keys", + ".fluss.PbPrefixLookupReqForBucket.keys", + ]); + // Published crates vendor the proto under proto/ (scripts/vendor-proto.sh); + // monorepo builds read the canonical proto directly from fluss-rpc. 
+ let (proto, include_dir) = if Path::new("proto/FlussApi.proto").exists() { + ("proto/FlussApi.proto", "proto") + } else { + ( + "../../../fluss-rpc/src/main/proto/FlussApi.proto", + "../../../fluss-rpc/src/main/proto", + ) + }; + config.compile_protos(&[proto], &[include_dir])?; + Ok(()) +} diff --git a/fluss-rust/crates/fluss/src/bucketing/mod.rs b/fluss-rust/crates/fluss/src/bucketing/mod.rs new file mode 100644 index 0000000000..1b43d12a23 --- /dev/null +++ b/fluss-rust/crates/fluss/src/bucketing/mod.rs @@ -0,0 +1,265 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::error::Error::IllegalArgument; +use crate::error::Result; +use crate::metadata::DataLakeFormat; +use crate::util::murmur_hash; + +pub trait BucketingFunction: Sync + Send { + fn bucketing(&self, bucket_key: &[u8], num_buckets: i32) -> Result; +} + +impl dyn BucketingFunction { + /// Provides the bucketing function for a given [DataLakeFormat] + /// + /// # Arguments + /// * `lake_format` - Data lake format or none + /// + /// # Returns + /// * BucketingFunction + pub fn of(lake_format: Option<&DataLakeFormat>) -> Box { + match lake_format { + None => Box::new(FlussBucketingFunction), + Some(DataLakeFormat::Paimon) => Box::new(PaimonBucketingFunction), + Some(DataLakeFormat::Lance) => Box::new(FlussBucketingFunction), + Some(DataLakeFormat::Iceberg) => Box::new(IcebergBucketingFunction), + } + } +} + +struct FlussBucketingFunction; +impl BucketingFunction for FlussBucketingFunction { + fn bucketing(&self, bucket_key: &[u8], num_buckets: i32) -> Result { + if bucket_key.is_empty() { + return Err(IllegalArgument { + message: "bucket_key must not be empty!".to_string(), + }); + } + + if num_buckets <= 0 { + return Err(IllegalArgument { + message: "num_buckets must be positive!".to_string(), + }); + } + + let key_hash = murmur_hash::fluss_hash_bytes(bucket_key)?; + + Ok(murmur_hash::fluss_hash_i32(key_hash) % num_buckets) + } +} + +struct PaimonBucketingFunction; +impl BucketingFunction for PaimonBucketingFunction { + fn bucketing(&self, bucket_key: &[u8], num_buckets: i32) -> Result { + if bucket_key.is_empty() { + return Err(IllegalArgument { + message: "bucket_key must not be empty!".to_string(), + }); + } + + if num_buckets <= 0 { + return Err(IllegalArgument { + message: "num_buckets must be positive!".to_string(), + }); + } + + let key_hash = murmur_hash::fluss_hash_bytes(bucket_key)?; + + Ok((key_hash % num_buckets).abs()) + } +} + +struct IcebergBucketingFunction; +impl BucketingFunction for IcebergBucketingFunction { + fn bucketing(&self, 
bucket_key: &[u8], num_buckets: i32) -> Result { + if bucket_key.is_empty() { + return Err(IllegalArgument { + message: "bucket_key must not be empty!".to_string(), + }); + } + + if num_buckets <= 0 { + return Err(IllegalArgument { + message: "num_buckets must be positive!".to_string(), + }); + }; + + Ok((murmur_hash::hash_bytes(bucket_key) as i32 & i32::MAX) % num_buckets) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_default_bucketing() { + let default_bucketing = ::of(None); + + let expected = 1; + let actual = default_bucketing.bucketing(&[00u8, 10u8], 7).unwrap(); + assert_eq!( + expected, actual, + "Expecting bucket to be {expected} but got {actual}" + ); + + let expected = 0; + let actual = default_bucketing + .bucketing(&[00u8, 10u8, 10u8, 10u8], 12) + .unwrap(); + assert_eq!( + expected, actual, + "Expecting bucket to be {expected} but got {actual}" + ); + + let expected = 6; + let actual = default_bucketing + .bucketing("2bb87d68-baf9-4e64-90f9-f80910419fa6".as_bytes(), 16) + .unwrap(); + assert_eq!( + expected, actual, + "Expecting bucket to be {expected} but got {actual}" + ); + + let expected = 6; + let actual = default_bucketing + .bucketing("The quick brown fox jumps over the lazy dog".as_bytes(), 8) + .unwrap(); + assert_eq!( + expected, actual, + "Expecting bucket to be {expected} but got {actual}" + ); + } + + #[test] + fn test_paimon_bucketing() { + let paimon_bucketing = ::of(Some(&DataLakeFormat::Paimon)); + + let expected = 1; + let actual = paimon_bucketing.bucketing(&[00u8, 10u8], 7).unwrap(); + assert_eq!( + expected, actual, + "Expecting bucket to be {expected} but got {actual}" + ); + + let expected = 11; + let actual = paimon_bucketing + .bucketing(&[00u8, 10u8, 10u8, 10u8], 12) + .unwrap(); + assert_eq!( + expected, actual, + "Expecting bucket to be {expected} but got {actual}" + ); + + let expected = 12; + let actual = paimon_bucketing + .bucketing("2bb87d68-baf9-4e64-90f9-f80910419fa6".as_bytes(), 16) + 
.unwrap(); + assert_eq!( + expected, actual, + "Expecting bucket to be {expected} but got {actual}" + ); + + let expected = 0; + let actual = paimon_bucketing + .bucketing("The quick brown fox jumps over the lazy dog".as_bytes(), 8) + .unwrap(); + assert_eq!( + expected, actual, + "Expecting bucket to be {expected} but got {actual}" + ); + } + + #[test] + fn test_lance_bucketing() { + let lance_bucketing = ::of(Some(&DataLakeFormat::Lance)); + + let expected = 1; + let actual = lance_bucketing.bucketing(&[00u8, 10u8], 7).unwrap(); + assert_eq!( + expected, actual, + "Expecting bucket to be {expected} but got {actual}" + ); + + let expected = 0; + let actual = lance_bucketing + .bucketing(&[00u8, 10u8, 10u8, 10u8], 12) + .unwrap(); + assert_eq!( + expected, actual, + "Expecting bucket to be {expected} but got {actual}" + ); + + let expected = 6; + let actual = lance_bucketing + .bucketing("2bb87d68-baf9-4e64-90f9-f80910419fa6".as_bytes(), 16) + .unwrap(); + assert_eq!( + expected, actual, + "Expecting bucket to be {expected} but got {actual}" + ); + + let expected = 6; + let actual = lance_bucketing + .bucketing("The quick brown fox jumps over the lazy dog".as_bytes(), 8) + .unwrap(); + assert_eq!( + expected, actual, + "Expecting bucket to be {expected} but got {actual}" + ); + } + + #[test] + fn test_iceberg_bucketing() { + let iceberg_bucketing = ::of(Some(&DataLakeFormat::Iceberg)); + + let expected = 3; + let actual = iceberg_bucketing.bucketing(&[00u8, 10u8], 7).unwrap(); + assert_eq!( + expected, actual, + "Expecting bucket to be {expected} but got {actual}" + ); + + let expected = 4; + let actual = iceberg_bucketing + .bucketing(&[00u8, 10u8, 10u8, 10u8], 12) + .unwrap(); + assert_eq!( + expected, actual, + "Expecting bucket to be {expected} but got {actual}" + ); + + let expected = 12; + let actual = iceberg_bucketing + .bucketing("2bb87d68-baf9-4e64-90f9-f80910419fa6".as_bytes(), 16) + .unwrap(); + assert_eq!( + expected, actual, + "Expecting bucket to be 
{expected} but got {actual}" + ); + + let expected = 3; + let actual = iceberg_bucketing + .bucketing("The quick brown fox jumps over the lazy dog".as_bytes(), 8) + .unwrap(); + assert_eq!( + expected, actual, + "Expecting bucket to be {expected} but got {actual}" + ); + } +} diff --git a/fluss-rust/crates/fluss/src/client/admin.rs b/fluss-rust/crates/fluss/src/client/admin.rs new file mode 100644 index 0000000000..1eb2f80bb0 --- /dev/null +++ b/fluss-rust/crates/fluss/src/client/admin.rs @@ -0,0 +1,486 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::client::metadata::Metadata; +use crate::cluster::ServerNode; +use crate::metadata::{ + DatabaseDescriptor, DatabaseInfo, JsonSerde, LakeSnapshot, PartitionInfo, PartitionSpec, + PhysicalTablePath, Schema, SchemaInfo, TableBucket, TableDescriptor, TableInfo, TablePath, +}; +use crate::rpc::message::{ + CreateDatabaseRequest, CreatePartitionRequest, CreateTableRequest, DatabaseExistsRequest, + DropDatabaseRequest, DropPartitionRequest, DropTableRequest, GetDatabaseInfoRequest, + GetLatestLakeSnapshotRequest, GetTableRequest, GetTableSchemaRequestMsg, ListDatabasesRequest, + ListPartitionInfosRequest, ListTablesRequest, TableExistsRequest, +}; +use crate::rpc::message::{ListOffsetsRequest, OffsetSpec}; +use crate::rpc::{RpcClient, ServerConnection}; + +use crate::error::{Error, Result}; +use crate::proto::GetTableInfoResponse; +use crate::{BucketId, PartitionId, TableId}; +use std::collections::{HashMap, HashSet}; +use std::sync::Arc; +use tokio::task::JoinHandle; + +pub struct FlussAdmin { + metadata: Arc, + rpc_client: Arc, +} + +impl FlussAdmin { + pub fn new(connections: Arc, metadata: Arc) -> Self { + FlussAdmin { + metadata, + rpc_client: connections, + } + } + + async fn admin_gateway(&self) -> Result { + let cluster = self.metadata.get_cluster(); + let coordinator = + cluster + .get_coordinator_server() + .ok_or_else(|| Error::UnexpectedError { + message: "Coordinator server not found in cluster metadata".to_string(), + source: None, + })?; + self.rpc_client.get_connection(coordinator).await + } + + pub async fn create_database( + &self, + database_name: &str, + database_descriptor: Option<&DatabaseDescriptor>, + ignore_if_exists: bool, + ) -> Result<()> { + let _response = self + .admin_gateway() + .await? + .request(CreateDatabaseRequest::new( + database_name, + database_descriptor, + ignore_if_exists, + )?) 
+ .await?; + Ok(()) + } + + pub async fn create_table( + &self, + table_path: &TablePath, + table_descriptor: &TableDescriptor, + ignore_if_exists: bool, + ) -> Result<()> { + let _response = self + .admin_gateway() + .await? + .request(CreateTableRequest::new( + table_path, + table_descriptor, + ignore_if_exists, + )?) + .await?; + Ok(()) + } + + pub async fn drop_table( + &self, + table_path: &TablePath, + ignore_if_not_exists: bool, + ) -> Result<()> { + let _response = self + .admin_gateway() + .await? + .request(DropTableRequest::new(table_path, ignore_if_not_exists)) + .await?; + Ok(()) + } + + /// Fetch the schema for `table_path` at the given `schema_id`. Pass + /// `None` to request the latest. + pub async fn get_table_schema( + &self, + table_path: &TablePath, + schema_id: Option, + ) -> Result { + let response = self + .admin_gateway() + .await? + .request(GetTableSchemaRequestMsg::new(table_path, schema_id)) + .await?; + + let schema_node: serde_json::Value = serde_json::from_slice(&response.schema_json) + .map_err(|e| Error::JsonSerdeError { + message: format!("Failed to parse schema_json: {e}"), + })?; + let schema = Schema::deserialize_json(&schema_node)?; + Ok(SchemaInfo::new(schema, response.schema_id)) + } + + pub async fn get_table_info(&self, table_path: &TablePath) -> Result { + let response = self + .admin_gateway() + .await? 
+ .request(GetTableRequest::new(table_path)) + .await?; + + // force update to avoid stale data in cache + self.metadata + .update_tables_metadata(&HashSet::from([table_path]), &HashSet::new(), vec![]) + .await?; + + let GetTableInfoResponse { + table_id, + schema_id, + table_json, + created_time, + modified_time, + remote_data_dir: _, + } = response; + let v: &[u8] = &table_json[..]; + let table_descriptor = + TableDescriptor::deserialize_json(&serde_json::from_slice(v).unwrap())?; + Ok(TableInfo::of( + table_path.clone(), + table_id, + schema_id, + table_descriptor, + created_time, + modified_time, + )) + } + + /// List all tables in the given database + pub async fn list_tables(&self, database_name: &str) -> Result> { + let response = self + .admin_gateway() + .await? + .request(ListTablesRequest::new(database_name)) + .await?; + Ok(response.table_name) + } + + /// List all partitions in the given table. + pub async fn list_partition_infos(&self, table_path: &TablePath) -> Result> { + self.list_partition_infos_with_spec(table_path, None).await + } + + /// List partitions in the given table that match the partial partition spec. + pub async fn list_partition_infos_with_spec( + &self, + table_path: &TablePath, + partial_partition_spec: Option<&PartitionSpec>, + ) -> Result> { + let response = self + .admin_gateway() + .await? + .request(ListPartitionInfosRequest::new( + table_path, + partial_partition_spec, + )) + .await?; + Ok(response.get_partitions_info()) + } + + /// Create a new partition for a partitioned table. + pub async fn create_partition( + &self, + table_path: &TablePath, + partition_spec: &PartitionSpec, + ignore_if_exists: bool, + ) -> Result<()> { + let _response = self + .admin_gateway() + .await? + .request(CreatePartitionRequest::new( + table_path, + partition_spec, + ignore_if_exists, + )) + .await?; + Ok(()) + } + + /// Drop a partition from a partitioned table. 
+ pub async fn drop_partition( + &self, + table_path: &TablePath, + partition_spec: &PartitionSpec, + ignore_if_not_exists: bool, + ) -> Result<()> { + let _response = self + .admin_gateway() + .await? + .request(DropPartitionRequest::new( + table_path, + partition_spec, + ignore_if_not_exists, + )) + .await?; + Ok(()) + } + + /// Check if a table exists + pub async fn table_exists(&self, table_path: &TablePath) -> Result { + let response = self + .admin_gateway() + .await? + .request(TableExistsRequest::new(table_path)) + .await?; + Ok(response.exists) + } + + /// Drop a database + pub async fn drop_database( + &self, + database_name: &str, + ignore_if_not_exists: bool, + cascade: bool, + ) -> Result<()> { + let _response = self + .admin_gateway() + .await? + .request(DropDatabaseRequest::new( + database_name, + ignore_if_not_exists, + cascade, + )) + .await?; + Ok(()) + } + + /// List all databases + pub async fn list_databases(&self) -> Result> { + let response = self + .admin_gateway() + .await? + .request(ListDatabasesRequest::new()) + .await?; + Ok(response.database_name) + } + + /// Check if a database exists + pub async fn database_exists(&self, database_name: &str) -> Result { + let response = self + .admin_gateway() + .await? + .request(DatabaseExistsRequest::new(database_name)) + .await?; + Ok(response.exists) + } + + /// Get database information + pub async fn get_database_info(&self, database_name: &str) -> Result { + let request = GetDatabaseInfoRequest::new(database_name); + let response = self.admin_gateway().await?.request(request).await?; + + // Convert proto response to DatabaseInfo + let database_descriptor = DatabaseDescriptor::from_json_bytes(&response.database_json)?; + + Ok(DatabaseInfo::new( + database_name.to_string(), + database_descriptor, + response.created_time, + response.modified_time, + )) + } + + /// Get all alive server nodes in the cluster, including the coordinator + /// and all tablet servers. 
Refreshes cluster metadata before returning. + pub async fn get_server_nodes(&self) -> Result> { + self.metadata.reinit_cluster().await?; + Ok(self.metadata.get_cluster().get_server_nodes()) + } + + /// Get the latest lake snapshot for a table + pub async fn get_latest_lake_snapshot(&self, table_path: &TablePath) -> Result { + let response = self + .admin_gateway() + .await? + .request(GetLatestLakeSnapshotRequest::new(table_path)) + .await?; + + // Convert proto response to LakeSnapshot + let mut table_buckets_offset = HashMap::new(); + for bucket_snapshot in response.bucket_snapshots { + let table_bucket = TableBucket::new_with_partition( + response.table_id, + bucket_snapshot.partition_id, + bucket_snapshot.bucket_id, + ); + if let Some(log_offset) = bucket_snapshot.log_offset { + table_buckets_offset.insert(table_bucket, log_offset); + } + } + + Ok(LakeSnapshot::new( + response.snapshot_id, + table_buckets_offset, + )) + } + + /// List offset for the specified buckets. This operation enables to find the beginning offset, + /// end offset as well as the offset matching a timestamp in buckets. + pub async fn list_offsets( + &self, + table_path: &TablePath, + buckets_id: &[BucketId], + offset_spec: OffsetSpec, + ) -> Result> { + self.do_list_offsets(table_path, None, buckets_id, offset_spec) + .await + } + + /// List offset for the specified buckets in a partition. This operation enables to find + /// the beginning offset, end offset as well as the offset matching a timestamp in buckets. 
+ pub async fn list_partition_offsets( + &self, + table_path: &TablePath, + partition_name: &str, + buckets_id: &[BucketId], + offset_spec: OffsetSpec, + ) -> Result> { + self.do_list_offsets(table_path, Some(partition_name), buckets_id, offset_spec) + .await + } + + async fn do_list_offsets( + &self, + table_path: &TablePath, + partition_name: Option<&str>, + buckets_id: &[BucketId], + offset_spec: OffsetSpec, + ) -> Result> { + if buckets_id.is_empty() { + return Err(Error::IllegalArgument { + message: "Buckets are empty.".to_string(), + }); + } + + // force to update table metadata like java side + self.metadata.update_table_metadata(table_path).await?; + + let cluster = self.metadata.get_cluster(); + let table_id = cluster.get_table(table_path)?.table_id; + + // Resolve partition_id from partition_name if provided + let partition_id = if let Some(name) = partition_name { + let physical_table_path = Arc::new(PhysicalTablePath::of_partitioned( + Arc::new(table_path.clone()), + Some(name.to_string()), + )); + + // Update partition metadata like java side + self.metadata + .update_physical_table_metadata(std::slice::from_ref(&physical_table_path)) + .await?; + + let cluster = self.metadata.get_cluster(); + Some( + cluster + .get_partition_id(&physical_table_path) + .ok_or_else(|| { + Error::partition_not_exist(format!( + "Partition '{name}' not found for table '{table_path}'" + )) + })?, + ) + } else { + None + }; + + // Prepare requests + let requests_by_server = + self.prepare_list_offsets_requests(table_id, partition_id, buckets_id, offset_spec)?; + + // Send Requests + let response_futures = self.send_list_offsets_request(requests_by_server).await?; + + let mut results = HashMap::new(); + + for response_future in response_futures { + let offsets = response_future.await.map_err(|e| Error::UnexpectedError { + message: "Fail to get result for list offsets.".to_string(), + source: Some(Box::new(e)), + })?; + results.extend(offsets?); + } + Ok(results) + } + + fn 
prepare_list_offsets_requests( + &self, + table_id: TableId, + partition_id: Option, + buckets: &[BucketId], + offset_spec: OffsetSpec, + ) -> Result> { + let cluster = self.metadata.get_cluster(); + let mut node_for_bucket_list: HashMap> = HashMap::new(); + + for bucket_id in buckets { + let table_bucket = TableBucket::new_with_partition(table_id, partition_id, *bucket_id); + let leader = cluster.leader_for(&table_bucket).ok_or_else(|| { + // todo: consider retry? + Error::UnexpectedError { + message: format!("No leader found for table bucket: {table_bucket}."), + source: None, + } + })?; + + node_for_bucket_list + .entry(leader.id()) + .or_default() + .push(*bucket_id); + } + + let mut list_offsets_requests = HashMap::new(); + for (leader_id, bucket_ids) in node_for_bucket_list { + let request = + ListOffsetsRequest::new(table_id, partition_id, bucket_ids, offset_spec.clone()); + list_offsets_requests.insert(leader_id, request); + } + Ok(list_offsets_requests) + } + + async fn send_list_offsets_request( + &self, + request_map: HashMap, + ) -> Result>>>> { + let mut tasks = Vec::new(); + + for (leader_id, request) in request_map { + let rpc_client = self.rpc_client.clone(); + let metadata = self.metadata.clone(); + + let task = tokio::spawn(async move { + let cluster = metadata.get_cluster(); + let tablet_server = cluster.get_tablet_server(leader_id).ok_or_else(|| { + Error::leader_not_available(format!( + "Tablet server {leader_id} is not found in metadata cache." 
+ )) + })?; + let connection = rpc_client.get_connection(tablet_server).await?; + let list_offsets_response = connection.request(request).await?; + list_offsets_response.offsets() + }); + tasks.push(task); + } + Ok(tasks) + } +} diff --git a/fluss-rust/crates/fluss/src/client/connection.rs b/fluss-rust/crates/fluss/src/client/connection.rs new file mode 100644 index 0000000000..c31104c469 --- /dev/null +++ b/fluss-rust/crates/fluss/src/client/connection.rs @@ -0,0 +1,187 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::client::WriterClient; +use crate::client::admin::FlussAdmin; +use crate::client::lookup::LookupClient; +use crate::client::metadata::Metadata; +use crate::client::table::FlussTable; +use crate::config::Config; +use crate::error::{Error, FlussError, Result}; +use crate::metadata::TablePath; +use crate::rpc::RpcClient; +use parking_lot::RwLock; +use std::sync::Arc; +use std::time::Duration; + +pub struct FlussConnection { + metadata: Arc, + network_connects: Arc, + args: Config, + writer_client: RwLock>>, + admin_client: RwLock>>, + lookup_client: RwLock>>, +} + +impl FlussConnection { + pub async fn new(arg: Config) -> Result { + arg.validate_security() + .map_err(|msg| Error::IllegalArgument { message: msg })?; + arg.validate_scanner() + .map_err(|msg| Error::IllegalArgument { message: msg })?; + arg.validate_writer() + .map_err(|msg| Error::IllegalArgument { message: msg })?; + + let timeout = Duration::from_millis(arg.connect_timeout_ms); + // connect_timeout_ms: no lower-bound validation to match Java behavior. + // Java allows 0 — tracked in https://github.com/apache/fluss/issues/3068 + let connections = if arg.is_sasl_enabled() { + Arc::new( + RpcClient::new() + .with_sasl( + arg.security_sasl_username.clone(), + arg.security_sasl_password.clone(), + ) + .with_timeout(timeout), + ) + } else { + Arc::new(RpcClient::new().with_timeout(timeout)) + }; + let metadata = Metadata::new(arg.bootstrap_servers.as_str(), connections.clone()).await?; + + Ok(FlussConnection { + metadata: Arc::new(metadata), + network_connects: connections.clone(), + args: arg.clone(), + writer_client: Default::default(), + admin_client: RwLock::new(None), + lookup_client: Default::default(), + }) + } + + /// Gracefully shut down the connection, draining any pending write batches. + /// + /// If a writer client has been created, this method will signal it to drain + /// its buffers and wait for the background sender task to complete, bounded + /// by the provided timeout. 
+    pub async fn close(&self, timeout: Duration) -> Result<()> {
+        // Take the writer out of its slot in a separate statement so the lock
+        // guard is released before the `.await` below.
+        let writer_client = self.writer_client.write().take();
+        if let Some(client) = writer_client {
+            client.close(timeout).await?;
+        }
+        Ok(())
+    }
+
+    /// Returns a shared handle to the metadata held by this connection.
+    pub fn get_metadata(&self) -> Arc<Metadata> {
+        Arc::clone(&self.metadata)
+    }
+
+    /// Returns a shared handle to the RPC client used by this connection.
+    pub fn get_connections(&self) -> Arc<RpcClient> {
+        Arc::clone(&self.network_connects)
+    }
+
+    /// Returns the configuration this connection was created with.
+    pub fn config(&self) -> &Config {
+        &self.args
+    }
+
+    /// Returns the cached admin client, creating and caching it on first use.
+    pub fn get_admin(&self) -> Result<Arc<FlussAdmin>> {
+        // 1. Fast path: return cached instance if already initialized.
+        if let Some(admin) = self.admin_client.read().as_ref() {
+            return Ok(Arc::clone(admin));
+        }
+
+        // 2. Slow path: acquire write lock.
+        let mut admin_guard = self.admin_client.write();
+
+        // 3. Double-check: another thread may have initialized while we waited.
+        if let Some(admin) = admin_guard.as_ref() {
+            return Ok(Arc::clone(admin));
+        }
+
+        // 4. Initialize and cache.
+        let admin = Arc::new(FlussAdmin::new(
+            Arc::clone(&self.network_connects),
+            Arc::clone(&self.metadata),
+        ));
+        *admin_guard = Some(Arc::clone(&admin));
+        Ok(admin)
+    }
+
+    /// Returns the cached writer client, creating and caching it on first use.
+    ///
+    /// # Errors
+    /// Propagates the error returned by `WriterClient::new`; nothing is cached
+    /// in that case.
+    pub fn get_or_create_writer_client(&self) -> Result<Arc<WriterClient>> {
+        // 1. Fast path: Attempt to acquire a read lock to check if the client already exists.
+        if let Some(client) = self.writer_client.read().as_ref() {
+            return Ok(Arc::clone(client));
+        }
+
+        // 2. Slow path: Acquire the write lock.
+        let mut writer_guard = self.writer_client.write();
+
+        // 3. Double-check: Another thread might have initialized the client
+        // while this thread was waiting for the write lock.
+        if let Some(client) = writer_guard.as_ref() {
+            return Ok(Arc::clone(client));
+        }
+
+        // 4. Initialize the client since we are certain it doesn't exist yet.
+        let new_client = Arc::new(WriterClient::new(
+            self.args.clone(),
+            Arc::clone(&self.metadata),
+        )?);
+
+        // 5. Store and return the newly created client.
+        *writer_guard = Some(Arc::clone(&new_client));
+        Ok(new_client)
+    }
+
+    /// Gets or creates a lookup client for batched lookup operations.
+ pub fn get_or_create_lookup_client(&self) -> Result> { + // 1. Fast path: Attempt to acquire a read lock to check if the client already exists. + if let Some(client) = self.lookup_client.read().as_ref() { + return Ok(client.clone()); + } + + // 2. Slow path: Acquire the write lock. + let mut lookup_guard = self.lookup_client.write(); + + // 3. Double-check: Another thread might have initialized the client + // while this thread was waiting for the write lock. + if let Some(client) = lookup_guard.as_ref() { + return Ok(client.clone()); + } + + // 4. Initialize the client since we are certain it doesn't exist yet. + let new_client = Arc::new(LookupClient::new(&self.args, self.metadata.clone())); + + // 5. Store and return the newly created client. + *lookup_guard = Some(new_client.clone()); + Ok(new_client) + } + + pub async fn get_table(&self, table_path: &TablePath) -> Result> { + self.metadata.update_table_metadata(table_path).await?; + let table_info = self + .metadata + .get_cluster() + .get_table(table_path) + .map_err(|e| { + if e.api_error() == Some(FlussError::InvalidTableException) { + Error::table_not_exist(format!("Table not found: {table_path}")) + } else { + e + } + })? + .clone(); + Ok(FlussTable::new(self, self.metadata.clone(), table_info)) + } +} diff --git a/fluss-rust/crates/fluss/src/client/credentials.rs b/fluss-rust/crates/fluss/src/client/credentials.rs new file mode 100644 index 0000000000..a954e2a916 --- /dev/null +++ b/fluss-rust/crates/fluss/src/client/credentials.rs @@ -0,0 +1,437 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::client::metadata::Metadata; +use crate::error::{Error, Result}; +use crate::rpc::RpcClient; +use crate::rpc::message::GetSecurityTokenRequest; +use log::{debug, info, warn}; +use parking_lot::RwLock; +use serde::Deserialize; +use std::collections::HashMap; +use std::sync::Arc; +use std::time::{Duration, SystemTime, UNIX_EPOCH}; +use tokio::sync::{oneshot, watch}; +use tokio::task::JoinHandle; + +/// Default renewal time ratio - refresh at 80% of token lifetime +const DEFAULT_TOKEN_RENEWAL_RATIO: f64 = 0.8; +/// Default retry backoff when token fetch fails +const DEFAULT_RENEWAL_RETRY_BACKOFF: Duration = Duration::from_secs(30); +/// Minimum delay between refreshes +const MIN_RENEWAL_DELAY: Duration = Duration::from_secs(1); +/// Maximum delay between refreshes (7 days) - prevents overflow and ensures periodic refresh +const MAX_RENEWAL_DELAY: Duration = Duration::from_secs(7 * 24 * 60 * 60); +/// Default refresh interval for tokens without expiration (never expires) +const DEFAULT_NON_EXPIRING_REFRESH_INTERVAL: Duration = Duration::from_secs(7 * 24 * 60 * 60); // 7 day + +/// Type alias for credentials properties receiver +/// - `None` = not yet fetched, should wait +/// - `Some(HashMap)` = fetched (may be empty if no auth needed) +pub type CredentialsReceiver = watch::Receiver>>; + +#[derive(Debug, Deserialize)] +struct Credentials { + access_key_id: String, + access_key_secret: String, + security_token: Option, +} + +/// Returns (opendal_key, needs_inversion) +/// needs_inversion is true for path_style_access -> 
enable_virtual_host_style conversion +fn convert_hadoop_key_to_opendal(hadoop_key: &str) -> Option<(String, bool)> { + match hadoop_key { + // S3 specific configurations + "fs.s3a.endpoint" => Some(("endpoint".to_string(), false)), + "fs.s3a.endpoint.region" => Some(("region".to_string(), false)), + "fs.s3a.path.style.access" => Some(("enable_virtual_host_style".to_string(), true)), + "fs.s3a.connection.ssl.enabled" => None, + // OSS specific configurations + "fs.oss.endpoint" => Some(("endpoint".to_string(), false)), + "fs.oss.region" => Some(("region".to_string(), false)), + _ => None, + } +} + +/// Build remote filesystem props from credentials and additional info +fn build_remote_fs_props( + credentials: &Credentials, + addition_infos: &HashMap, +) -> HashMap { + let mut props = HashMap::new(); + + props.insert( + "access_key_id".to_string(), + credentials.access_key_id.clone(), + ); + + // S3 specific configurations + props.insert( + "secret_access_key".to_string(), + credentials.access_key_secret.clone(), + ); + + // OSS specific configurations, todo: consider refactor it + // to handle different conversion for different scheme in different method + props.insert( + "access_key_secret".to_string(), + credentials.access_key_secret.clone(), + ); + + if let Some(token) = &credentials.security_token { + props.insert("security_token".to_string(), token.clone()); + } + + for (key, value) in addition_infos { + if let Some((opendal_key, transform)) = convert_hadoop_key_to_opendal(key) { + let final_value = if transform { + // Invert boolean value (path_style_access -> enable_virtual_host_style) + if value == "true" { + "false".to_string() + } else { + "true".to_string() + } + } else { + value.clone() + }; + props.insert(opendal_key, final_value); + } + } + + props +} + +/// Manager for security tokens that refreshes tokens in a background task. 
+/// +/// This follows the pattern from Java's `DefaultSecurityTokenManager`, where +/// a background thread periodically refreshes tokens based on their expiration time. +/// +/// Uses `tokio::sync::watch` channel to broadcast token updates to consumers. +/// Consumers can subscribe by calling `subscribe()` to get a receiver. +/// +/// The channel value is `Option`: +/// - `None` = not yet fetched, consumers should wait +/// - `Some(HashMap)` = fetched (may be empty if no auth needed) +/// +/// # Example +/// ```ignore +/// let manager = SecurityTokenManager::new(rpc_client, metadata); +/// let credentials_rx = manager.subscribe(); +/// manager.start(); +/// +/// // Consumer can get latest credentials via: +/// let props = credentials_rx.borrow().clone(); +/// ``` +pub struct SecurityTokenManager { + rpc_client: Arc, + metadata: Arc, + token_renewal_ratio: f64, + renewal_retry_backoff: Duration, + /// Watch channel sender for broadcasting token updates + credentials_tx: watch::Sender>>, + /// Watch channel receiver (kept to allow cloning for new subscribers) + credentials_rx: watch::Receiver>>, + /// Handle to the background refresh task + task_handle: RwLock>>, + /// Sender to signal shutdown + shutdown_tx: RwLock>>, +} + +impl SecurityTokenManager { + pub fn new(rpc_client: Arc, metadata: Arc) -> Self { + let (credentials_tx, credentials_rx) = watch::channel(None); + Self { + rpc_client, + metadata, + token_renewal_ratio: DEFAULT_TOKEN_RENEWAL_RATIO, + renewal_retry_backoff: DEFAULT_RENEWAL_RETRY_BACKOFF, + credentials_tx, + credentials_rx, + task_handle: RwLock::new(None), + shutdown_tx: RwLock::new(None), + } + } + + /// Subscribe to credential updates. + /// Returns a receiver that always contains the latest credentials. + /// Consumers can call `receiver.borrow()` to get the current value. + pub fn subscribe(&self) -> CredentialsReceiver { + self.credentials_rx.clone() + } + + /// Start the background token refresh task. 
+    /// This should be called once after creating the manager.
+    ///
+    /// Calling it again while a task handle is stored only logs a warning.
+    pub fn start(&self) {
+        // Hold the write guard across the check and the store so that two
+        // concurrent callers cannot both pass the check and spawn two loops.
+        let mut task_handle = self.task_handle.write();
+        if task_handle.is_some() {
+            warn!("SecurityTokenManager is already started");
+            return;
+        }
+
+        let (shutdown_tx, shutdown_rx) = oneshot::channel();
+        *self.shutdown_tx.write() = Some(shutdown_tx);
+
+        let rpc_client = Arc::clone(&self.rpc_client);
+        let metadata = Arc::clone(&self.metadata);
+        let token_renewal_ratio = self.token_renewal_ratio;
+        let renewal_retry_backoff = self.renewal_retry_backoff;
+        let credentials_tx = self.credentials_tx.clone();
+
+        let handle = tokio::spawn(async move {
+            Self::token_refresh_loop(
+                rpc_client,
+                metadata,
+                token_renewal_ratio,
+                renewal_retry_backoff,
+                credentials_tx,
+                shutdown_rx,
+            )
+            .await;
+        });
+
+        *task_handle = Some(handle);
+        info!("SecurityTokenManager started");
+    }
+
+    /// Stop the background token refresh task.
+    ///
+    /// Sends the shutdown signal (if a sender is stored) and releases the
+    /// stored task handle. The task is not awaited or aborted here.
+    pub fn stop(&self) {
+        if let Some(tx) = self.shutdown_tx.write().take() {
+            // The receiver may already be gone; that is not an error here.
+            let _ = tx.send(());
+        }
+        // Take and drop the task handle so the task can finish gracefully
+        let _ = self.task_handle.write().take();
+        info!("SecurityTokenManager stopped");
+    }
+
+    /// Background task that periodically refreshes tokens.
+    async fn token_refresh_loop(
+        rpc_client: Arc<RpcClient>,
+        metadata: Arc<Metadata>,
+        token_renewal_ratio: f64,
+        renewal_retry_backoff: Duration,
+        credentials_tx: watch::Sender<Option<HashMap<String, String>>>,
+        mut shutdown_rx: oneshot::Receiver<()>,
+    ) {
+        info!("Starting token refresh loop");
+
+        loop {
+            // Fetch token and send to channel
+            let result = Self::fetch_token(&rpc_client, &metadata).await;
+
+            let next_delay = match result {
+                Ok((props, expiration_time)) => {
+                    // Send credentials via watch channel (Some indicates fetched)
+                    if let Err(e) = credentials_tx.send(Some(props)) {
+                        debug!("No active subscribers for credentials update: {e:?}");
+                    }
+
+                    // Calculate next renewal delay based on expiration time
+                    match expiration_time {
+                        Some(exp_time) => {
+                            Self::calculate_renewal_delay(exp_time, token_renewal_ratio)
+                        }
+                        None => {
+                            // No expiration time - token never expires, use long refresh interval
+                            info!(
+                                "Token has no expiration time (never expires), next refresh in {DEFAULT_NON_EXPIRING_REFRESH_INTERVAL:?}"
+                            );
+                            DEFAULT_NON_EXPIRING_REFRESH_INTERVAL
+                        }
+                    }
+                }
+                Err(e) => {
+                    // A failed fetch keeps whatever was last published and retries
+                    // after the fixed backoff.
+                    warn!(
+                        "Failed to obtain security token: {e:?}, will retry in {renewal_retry_backoff:?}"
+                    );
+                    renewal_retry_backoff
+                }
+            };
+
+            debug!("Next token refresh in {next_delay:?}");
+
+            // Wait for either the delay to elapse or shutdown signal
+            tokio::select! {
+                _ = tokio::time::sleep(next_delay) => {
+                    // Continue to next iteration to refresh
+                }
+                _ = &mut shutdown_rx => {
+                    info!("Token refresh loop received shutdown signal");
+                    break;
+                }
+            }
+        }
+    }
+
+    /// Fetch token from server.
+    /// Returns the props and expiration time if available.
+    async fn fetch_token(
+        rpc_client: &Arc<RpcClient>,
+        metadata: &Arc<Metadata>,
+    ) -> Result<(HashMap<String, String>, Option<i64>)> {
+        let cluster = metadata.get_cluster();
+        let server_node =
+            cluster
+                .get_one_available_server()
+                .ok_or_else(|| Error::UnexpectedError {
+                    message: "No tablet server available for token refresh".to_string(),
+                    source: None,
+                })?;
+
+        let conn = rpc_client.get_connection(server_node).await?;
+        let response = conn.request(GetSecurityTokenRequest::new()).await?;
+
+        // The token may be empty if remote filesystem doesn't require authentication
+        if response.token.is_empty() {
+            info!("Empty token received, remote filesystem may not require authentication");
+            return Ok((HashMap::new(), response.expiration_time));
+        }
+
+        let credentials: Credentials =
+            serde_json::from_slice(&response.token).map_err(|e| Error::JsonSerdeError {
+                message: format!("Error when parsing token from server: {e}"),
+            })?;
+
+        // Later entries overwrite earlier ones with the same key, exactly as
+        // repeated `insert` calls would.
+        let addition_infos: HashMap<String, String> = response
+            .addition_info
+            .iter()
+            .map(|kv| (kv.key.clone(), kv.value.clone()))
+            .collect();
+
+        let props = build_remote_fs_props(&credentials, &addition_infos);
+        debug!("Security token fetched successfully");
+
+        Ok((props, response.expiration_time))
+    }
+
+    /// Calculate the delay before next token renewal.
+    /// Uses the renewal ratio to refresh before actual expiration.
+    /// Caps the delay to MAX_RENEWAL_DELAY to prevent overflow and ensure periodic refresh.
+    fn calculate_renewal_delay(expiration_time: i64, renewal_ratio: f64) -> Duration {
+        let now = SystemTime::now()
+            .duration_since(UNIX_EPOCH)
+            .expect("system clock is set before the UNIX epoch")
+            .as_millis() as i64;
+
+        // Saturate instead of overflowing on extreme expiration values.
+        let time_until_expiry = expiration_time.saturating_sub(now);
+        if time_until_expiry <= 0 {
+            // Token already expired, refresh immediately
+            return MIN_RENEWAL_DELAY;
+        }
+
+        // Cap time_until_expiry to prevent overflow when casting to f64 and back
+        let max_delay_ms = MAX_RENEWAL_DELAY.as_millis() as i64;
+        let capped_time = time_until_expiry.min(max_delay_ms);
+
+        let delay_ms = (capped_time as f64 * renewal_ratio) as u64;
+        // Clamp before logging so the log shows the delay that is actually used.
+        let delay = Duration::from_millis(delay_ms).clamp(MIN_RENEWAL_DELAY, MAX_RENEWAL_DELAY);
+
+        debug!(
+            "Calculated renewal delay: {delay:?} (expiration: {expiration_time}, now: {now}, ratio: {renewal_ratio})"
+        );
+
+        delay
+    }
+}
+
+impl Drop for SecurityTokenManager {
+    /// Signals the background refresh task to shut down when the manager is dropped.
+    fn drop(&mut self) {
+        self.stop();
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn convert_hadoop_key_to_opendal_maps_known_keys() {
+        // S3 keys
+        let (key, invert) = convert_hadoop_key_to_opendal("fs.s3a.endpoint").expect("key");
+        assert_eq!(key, "endpoint");
+        assert!(!invert);
+
+        let (key, invert) = convert_hadoop_key_to_opendal("fs.s3a.path.style.access").expect("key");
+        assert_eq!(key, "enable_virtual_host_style");
+        assert!(invert);
+
+        assert!(convert_hadoop_key_to_opendal("fs.s3a.connection.ssl.enabled").is_none());
+
+        // OSS keys
+        let (key, invert) = convert_hadoop_key_to_opendal("fs.oss.endpoint").expect("key");
+        assert_eq!(key, "endpoint");
+        assert!(!invert);
+
+        let (key, invert) = convert_hadoop_key_to_opendal("fs.oss.region").expect("key");
+        assert_eq!(key, "region");
+        assert!(!invert);
+
+        // Unknown key
+        assert!(convert_hadoop_key_to_opendal("unknown.key").is_none());
+    }
+
+    #[test]
+    fn calculate_renewal_delay_returns_correct_delay() {
+        let now = SystemTime::now()
+            .duration_since(UNIX_EPOCH)
+            .unwrap()
+            .as_millis() as i64;
+
+        // Token expires in 1 hour
+        let expiration = now + 3600 * 1000;
+        let delay = SecurityTokenManager::calculate_renewal_delay(expiration, 0.8);
+
+        // Should be approximately 48 minutes (80% of 1 hour)
+        let expected_min = Duration::from_secs(2800); // ~46.7 minutes
+        let expected_max = Duration::from_secs(2900); // ~48.3 minutes
+        assert!(
+            delay >= expected_min && delay <= expected_max,
+            "Expected delay between {expected_min:?} and {expected_max:?}, got {delay:?}"
+        );
+    }
+
+    #[test]
+    fn calculate_renewal_delay_handles_expired_token() {
+        let now = SystemTime::now()
+            .duration_since(UNIX_EPOCH)
+            .unwrap()
+            .as_millis() as i64;
+
+        // Token already expired
+        let expiration = now - 1000;
+        let delay = SecurityTokenManager::calculate_renewal_delay(expiration, 0.8);
+
+        // Should return minimum delay
+        assert_eq!(delay, MIN_RENEWAL_DELAY);
+    }
+
+    #[test]
+    fn build_remote_fs_props_includes_all_fields() {
+        let credentials = Credentials {
+            access_key_id: "ak".to_string(),
+            access_key_secret: "sk".to_string(),
+            security_token: Some("token".to_string()),
+        };
+        let addition_infos =
+            HashMap::from([("fs.s3a.path.style.access".to_string(), "true".to_string())]);
+
+        let props = build_remote_fs_props(&credentials, &addition_infos);
+        assert_eq!(props.get("access_key_id"), Some(&"ak".to_string()));
+        // The secret is published under both the S3 and the OSS property name.
+        assert_eq!(props.get("secret_access_key"), Some(&"sk".to_string()));
+        assert_eq!(props.get("access_key_secret"), Some(&"sk".to_string()));
+        assert_eq!(props.get("security_token"), Some(&"token".to_string()));
+        assert_eq!(
+            props.get("enable_virtual_host_style"),
+            Some(&"false".to_string())
+        );
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/client/lookup/lookup_client.rs b/fluss-rust/crates/fluss/src/client/lookup/lookup_client.rs
new file mode 100644
index 0000000000..4d507aa9bf
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/client/lookup/lookup_client.rs
@@ -0,0 +1,250 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Lookup client that batches multiple lookups together for improved throughput. +//! +//! This client achieves parity with the Java client by: +//! - Queuing lookup operations instead of sending them immediately +//! - Batching multiple lookups to the same server/bucket +//! - Running a background sender task to process batches + +use super::{LookupQueue, PrefixLookupQuery, PrimaryLookupQuery, QueuedLookup}; +use crate::client::lookup::lookup_sender::LookupSender; +use crate::client::metadata::Metadata; +use crate::config::Config; +use crate::error::{Error, Result}; +use crate::metadata::{TableBucket, TablePath}; +use bytes::Bytes; +use log::{debug, error}; +use std::sync::Arc; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::time::Duration; +use tokio::sync::{mpsc, watch}; +use tokio::task::JoinHandle; + +/// A client that lookups values from the server with batching support. +/// +/// The lookup client uses a queue and background sender to batch multiple +/// lookup operations together, reducing network round trips and improving +/// throughput. 
+/// +/// # Example +/// +/// ```ignore +/// let lookup_client = LookupClient::new(config, metadata); +/// let result = lookup_client.lookup(table_path, table_bucket, key_bytes).await?; +/// ``` +pub struct LookupClient { + /// Channel to send lookup requests to the queue + lookup_tx: mpsc::Sender, + /// Handle to the sender task + sender_handle: Option>, + /// Watch channel for internal shutdown handling + shutdown_tx: watch::Sender, + /// Whether the client is closed + closed: AtomicBool, +} + +impl LookupClient { + /// Creates a new lookup client. + pub fn new(config: &Config, metadata: Arc) -> Self { + // Extract configuration values + let queue_size = config.lookup_queue_size; + let max_batch_size = config.lookup_max_batch_size; + let batch_timeout_ms = config.lookup_batch_timeout_ms; + let max_inflight = config.lookup_max_inflight_requests; + let max_retries = config.lookup_max_retries; + + // Create queue and channels + let cluster_rx = metadata.subscribe_cluster_changes(); + let (queue, lookup_tx, re_enqueue_tx) = + LookupQueue::new(queue_size, max_batch_size, batch_timeout_ms, cluster_rx); + + // Create shutdown channel + let (shutdown_tx, shutdown_rx) = watch::channel(false); + + // Create sender with shutdown receiver + let mut sender = LookupSender::new( + metadata, + queue, + re_enqueue_tx, + max_inflight, + max_retries, + shutdown_rx, + ); + + // Spawn sender task - sender handles shutdown internally + let sender_handle = tokio::spawn(async move { + sender.run().await; + debug!("Lookup sender completed"); + }); + + Self { + lookup_tx, + sender_handle: Some(sender_handle), + shutdown_tx, + closed: AtomicBool::new(false), + } + } + + /// Looks up a value by its primary key. + /// + /// This method queues the lookup operation and returns a future that will + /// complete when the server responds. Multiple lookups may be batched together + /// for improved throughput. 
+ /// + /// # Arguments + /// * `table_path` - The table path + /// * `table_bucket` - The table bucket + /// * `key_bytes` - The encoded primary key bytes + /// + /// # Returns + /// * `Ok(Some(bytes))` - The value bytes if found + /// * `Ok(None)` - If the key was not found + /// * `Err(Error)` - If the lookup fails + pub async fn lookup( + &self, + table_path: TablePath, + table_bucket: TableBucket, + key_bytes: Bytes, + ) -> Result>> { + if self.closed.load(Ordering::Acquire) { + return Err(Error::UnexpectedError { + message: "Lookup client is closed".to_string(), + source: None, + }); + } + + let (result_tx, result_rx) = tokio::sync::oneshot::channel(); + let query = QueuedLookup::Primary(PrimaryLookupQuery::new( + table_path, + table_bucket, + key_bytes, + result_tx, + )); + + self.enqueue(query).await?; + + result_rx.await.map_err(|_| Error::UnexpectedError { + message: "Lookup result channel closed".to_string(), + source: None, + })? + } + + /// Looks up all values matching a prefix key. + /// + /// The prefix key must be a prefix subset of the table's primary key + /// (specifically, the bucket keys). Returns every row whose primary key + /// starts with the supplied prefix. Queries are batched together with + /// other lookups going to the same server for improved throughput. 
+ /// + /// # Arguments + /// * `table_path` - The table path + /// * `table_bucket` - The table bucket computed from the bucket key part of the prefix + /// * `key_bytes` - The encoded prefix key bytes + /// + /// # Returns + /// * `Ok(rows)` - Every row matching the prefix (possibly empty) + /// * `Err(Error)` - If the lookup fails + pub async fn prefix_lookup( + &self, + table_path: TablePath, + table_bucket: TableBucket, + key_bytes: Bytes, + ) -> Result>> { + if self.closed.load(Ordering::Acquire) { + return Err(Error::UnexpectedError { + message: "Lookup client is closed".to_string(), + source: None, + }); + } + + let (result_tx, result_rx) = tokio::sync::oneshot::channel(); + let query = QueuedLookup::Prefix(PrefixLookupQuery::new( + table_path, + table_bucket, + key_bytes, + result_tx, + )); + + self.enqueue(query).await?; + + result_rx.await.map_err(|_| Error::UnexpectedError { + message: "Lookup result channel closed".to_string(), + source: None, + })? + } + + async fn enqueue(&self, query: QueuedLookup) -> Result<()> { + self.lookup_tx.send(query).await.map_err(|e| { + let failed_query = e.0; + error!( + "Failed to queue lookup: channel closed. table_path: {}, table_bucket: {:?}, key_len: {}", + failed_query.table_path(), + failed_query.table_bucket(), + failed_query.key().len() + ); + Error::UnexpectedError { + message: "Failed to queue lookup: channel closed".to_string(), + source: None, + } + }) + } + + /// Closes the lookup client gracefully. 
+ pub async fn close(mut self, timeout: Duration) { + debug!("Closing lookup client"); + + // Mark as closed to reject new lookups + self.closed.store(true, Ordering::Release); + + // Send shutdown signal via watch channel + let _ = self.shutdown_tx.send(true); + + // Wait for sender to complete with timeout + if let Some(handle) = self.sender_handle.take() { + debug!("Waiting for sender task to complete..."); + let abort_handle = handle.abort_handle(); + + match tokio::time::timeout(timeout, handle).await { + Ok(Ok(())) => { + debug!("Lookup sender task completed gracefully."); + } + Ok(Err(join_error)) => { + error!("Lookup sender task panicked: {:?}", join_error); + } + Err(_elapsed) => { + error!("Lookup sender task did not complete within timeout. Forcing shutdown."); + abort_handle.abort(); + } + } + } else { + debug!("Lookup client was already closed or never initialized properly."); + } + + debug!("Lookup client closed"); + } +} + +impl Drop for LookupClient { + fn drop(&mut self) { + // Abort the sender task on drop if it wasn't already consumed by close() + if let Some(handle) = self.sender_handle.take() { + handle.abort(); + } + } +} diff --git a/fluss-rust/crates/fluss/src/client/lookup/lookup_query.rs b/fluss-rust/crates/fluss/src/client/lookup/lookup_query.rs new file mode 100644 index 0000000000..19830aefa5 --- /dev/null +++ b/fluss-rust/crates/fluss/src/client/lookup/lookup_query.rs @@ -0,0 +1,130 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::error::{Error, Result}; +use crate::metadata::{TableBucket, TablePath}; +use bytes::Bytes; +use tokio::sync::oneshot; + +pub struct LookupQuery { + table_path: TablePath, + table_bucket: TableBucket, + key: Bytes, + retries: i32, + result_tx: Option>>, +} + +impl LookupQuery { + pub fn new( + table_path: TablePath, + table_bucket: TableBucket, + key: Bytes, + result_tx: oneshot::Sender>, + ) -> Self { + Self { + table_path, + table_bucket, + key, + retries: 0, + result_tx: Some(result_tx), + } + } + + pub fn table_path(&self) -> &TablePath { + &self.table_path + } + + pub fn table_bucket(&self) -> &TableBucket { + &self.table_bucket + } + + pub fn key(&self) -> &Bytes { + &self.key + } + + pub fn retries(&self) -> i32 { + self.retries + } + + pub fn increment_retries(&mut self) { + self.retries += 1; + } + + pub fn is_done(&self) -> bool { + self.result_tx.is_none() + } + + pub fn complete(&mut self, result: Result) { + if let Some(tx) = self.result_tx.take() { + let _ = tx.send(result); + } + } + + pub fn complete_with_error(&mut self, error: Error) { + self.complete(Err(error)); + } +} + +pub type PrimaryLookupQuery = LookupQuery>>; +pub type PrefixLookupQuery = LookupQuery>>; + +pub enum QueuedLookup { + Primary(PrimaryLookupQuery), + Prefix(PrefixLookupQuery), +} + +impl QueuedLookup { + pub fn table_path(&self) -> &TablePath { + match self { + Self::Primary(q) => q.table_path(), + Self::Prefix(q) => q.table_path(), + } + } + + pub fn table_bucket(&self) -> &TableBucket { + match self { + Self::Primary(q) => q.table_bucket(), + 
Self::Prefix(q) => q.table_bucket(), + } + } + + pub fn key(&self) -> &Bytes { + match self { + Self::Primary(q) => q.key(), + Self::Prefix(q) => q.key(), + } + } + + pub fn complete_with_error(&mut self, error: Error) { + match self { + Self::Primary(q) => q.complete_with_error(error), + Self::Prefix(q) => q.complete_with_error(error), + } + } +} + +impl From for QueuedLookup { + fn from(q: PrimaryLookupQuery) -> Self { + QueuedLookup::Primary(q) + } +} + +impl From for QueuedLookup { + fn from(q: PrefixLookupQuery) -> Self { + QueuedLookup::Prefix(q) + } +} diff --git a/fluss-rust/crates/fluss/src/client/lookup/lookup_queue.rs b/fluss-rust/crates/fluss/src/client/lookup/lookup_queue.rs new file mode 100644 index 0000000000..295ec93d8c --- /dev/null +++ b/fluss-rust/crates/fluss/src/client/lookup/lookup_queue.rs @@ -0,0 +1,148 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Lookup queue for buffering pending lookup operations. +//! +//! This queue buffers lookup operations and provides batched draining +//! to improve throughput by reducing network round trips. 
+ +use super::QueuedLookup; +use std::time::Duration; +use tokio::sync::{mpsc, watch}; + +/// A queue that buffers pending lookup operations and provides batched draining. +/// +/// The queue supports two types of entries: +/// - New lookups from client calls +/// - Re-enqueued lookups from retry logic +/// +/// Re-enqueued lookups are prioritized over new lookups to ensure fair processing. +pub struct LookupQueue { + /// Channel for receiving lookup requests + lookup_rx: mpsc::Receiver, + /// Channel for receiving re-enqueued lookups + re_enqueue_rx: mpsc::UnboundedReceiver, + /// Maximum batch size for draining + max_batch_size: usize, + /// Timeout for batch collection + batch_timeout: Duration, + /// Wakes `drain()` early when the cluster changes. + cluster_rx: watch::Receiver, +} + +impl LookupQueue { + pub fn new( + queue_size: usize, + max_batch_size: usize, + batch_timeout_ms: u64, + cluster_rx: watch::Receiver, + ) -> ( + Self, + mpsc::Sender, + mpsc::UnboundedSender, + ) { + let (lookup_tx, lookup_rx) = mpsc::channel(queue_size); + let (re_enqueue_tx, re_enqueue_rx) = mpsc::unbounded_channel(); + + let queue = Self { + lookup_rx, + re_enqueue_rx, + max_batch_size, + batch_timeout: Duration::from_millis(batch_timeout_ms), + cluster_rx, + }; + + (queue, lookup_tx, re_enqueue_tx) + } + + /// Drains a batch of lookup queries from the queue. + pub async fn drain(&mut self) -> Vec { + let mut lookups = Vec::with_capacity(self.max_batch_size); + let deadline = tokio::time::Instant::now() + self.batch_timeout; + + loop { + let remaining = deadline.saturating_duration_since(tokio::time::Instant::now()); + if remaining.is_zero() { + break; + } + + // Prioritize re-enqueued lookups. + while lookups.len() < self.max_batch_size { + match self.re_enqueue_rx.try_recv() { + Ok(lookup) => lookups.push(lookup), + Err(_) => break, + } + } + if lookups.len() >= self.max_batch_size { + break; + } + + let sleep = tokio::time::sleep(remaining); + tokio::select! 
{ + biased; + maybe = self.lookup_rx.recv() => { + match maybe { + Some(lookup) => { + lookups.push(lookup); + while lookups.len() < self.max_batch_size { + match self.lookup_rx.try_recv() { + Ok(lookup) => lookups.push(lookup), + Err(_) => break, + } + } + } + None => break, + } + } + _ = self.cluster_rx.changed() => { + if !lookups.is_empty() { + break; + } + } + _ = sleep => break, + } + + if lookups.len() >= self.max_batch_size { + break; + } + } + + lookups + } + + /// Drains all remaining lookups from the queue. + pub fn drain_all(&mut self) -> Vec { + let mut lookups = Vec::new(); + + // Drain re-enqueued lookups + while let Ok(lookup) = self.re_enqueue_rx.try_recv() { + lookups.push(lookup); + } + + // Drain main queue + while let Ok(lookup) = self.lookup_rx.try_recv() { + lookups.push(lookup); + } + + lookups + } + + /// Returns true if there are undrained lookups in the queue. + pub fn has_undrained(&self) -> bool { + !self.lookup_rx.is_empty() || !self.re_enqueue_rx.is_empty() + } +} diff --git a/fluss-rust/crates/fluss/src/client/lookup/lookup_sender.rs b/fluss-rust/crates/fluss/src/client/lookup/lookup_sender.rs new file mode 100644 index 0000000000..06014bfbb7 --- /dev/null +++ b/fluss-rust/crates/fluss/src/client/lookup/lookup_sender.rs @@ -0,0 +1,711 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +use super::{LookupQueue, QueuedLookup}; +use crate::client::lookup::lookup_query::LookupQuery; +use crate::client::metadata::Metadata; +use crate::error::{Error, FlussError, Result}; +use crate::metadata::{TableBucket, TablePath}; +use crate::proto::{LookupResponse, PrefixLookupResponse}; +use crate::rpc::ServerConnection; +use crate::rpc::message::{LookupRequest, PrefixLookupRequest, ReadType, RequestBody, WriteType}; +use crate::{BucketId, PartitionId, TableId}; +use bytes::Bytes; +use futures::stream::{FuturesUnordered, StreamExt}; +use log::{debug, error, warn}; +use std::collections::{HashMap, HashSet}; +use std::io::Cursor; +use std::sync::Arc; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::time::Duration; +use tokio::sync::{OwnedSemaphorePermit, Semaphore, mpsc, watch}; + +type ServerId = i32; + +type BatchesByLeader = HashMap>>; +type PrimaryBatches = BatchesByLeader>>; +type PrefixBatches = BatchesByLeader>>; + +struct BucketResponse { + partition_id: Option, + bucket_id: BucketId, + error_code: Option, + error_message: Option, + values: Vec, +} + +trait LookupProtocol { + type Request: RequestBody + Send + WriteType>; + type Response: ReadType>> + Send; + type Value: Send; + + const OP_NAME: &'static str; + + fn build_request( + table_id: TableId, + keys_by_bucket: Vec<(BucketId, Option, Vec)>, + ) -> Self::Request; + + fn decode_buckets( + response: Self::Response, + ) -> impl Iterator>; +} + +struct Primary; +impl LookupProtocol for Primary { + type Request = LookupRequest; + type Response = LookupResponse; + type Value = Option>; + + const OP_NAME: &'static str = "Lookup"; + + fn build_request( + table_id: TableId, + keys_by_bucket: Vec<(BucketId, Option, Vec)>, + ) -> Self::Request { + LookupRequest::new_batched(table_id, keys_by_bucket) + } + + fn decode_buckets( + response: Self::Response, + ) -> impl Iterator> { + 
response.buckets_resp.into_iter().map(|r| BucketResponse { + partition_id: r.partition_id, + bucket_id: r.bucket_id, + error_code: r.error_code, + error_message: r.error_message, + values: r.values.into_iter().map(|pb| pb.values).collect(), + }) + } +} + +struct Prefix; +impl LookupProtocol for Prefix { + type Request = PrefixLookupRequest; + type Response = PrefixLookupResponse; + type Value = Vec>; + + const OP_NAME: &'static str = "Prefix lookup"; + + fn build_request( + table_id: TableId, + keys_by_bucket: Vec<(BucketId, Option, Vec)>, + ) -> Self::Request { + PrefixLookupRequest::new_batched(table_id, keys_by_bucket) + } + + fn decode_buckets( + response: Self::Response, + ) -> impl Iterator> { + response.buckets_resp.into_iter().map(|r| BucketResponse { + partition_id: r.partition_id, + bucket_id: r.bucket_id, + error_code: r.error_code, + error_message: r.error_message, + values: r.value_lists.into_iter().map(|pb| pb.values).collect(), + }) + } +} + +struct GroupByLeaderResult { + primary: PrimaryBatches, + prefix: PrefixBatches, + unknown_leader_tables: HashSet, + unknown_leader_partition_ids: HashSet, +} + +impl GroupByLeaderResult { + fn is_empty(&self) -> bool { + self.primary.is_empty() && self.prefix.is_empty() + } + + /// Assumes no `(server, bucket)` overlap — safe because the second pass only + /// re-groups items unknown in the first. 
+ fn merge_batches(&mut self, other: GroupByLeaderResult) { + for (server, inner) in other.primary { + self.primary.entry(server).or_default().extend(inner); + } + for (server, inner) in other.prefix { + self.prefix.entry(server).or_default().extend(inner); + } + } +} + +struct GroupingResult { + groups: GroupByLeaderResult, + unknowns: Vec, +} + +pub struct LookupSender { + metadata: Arc, + queue: LookupQueue, + re_enqueue_tx: mpsc::UnboundedSender, + inflight_semaphore: Arc, + max_retries: i32, + running: AtomicBool, + force_close: AtomicBool, + shutdown_rx: watch::Receiver, +} + +struct LookupBatch { + table_bucket: TableBucket, + lookups: Vec>, + keys: Vec, +} + +impl LookupBatch { + fn new(table_bucket: TableBucket) -> Self { + Self { + table_bucket, + lookups: Vec::new(), + keys: Vec::new(), + } + } + + fn add_lookup(&mut self, lookup: LookupQuery) { + self.keys.push(lookup.key().clone()); + self.lookups.push(lookup); + } + + fn complete(&mut self, values: Vec) { + if values.len() != self.lookups.len() { + let err_msg = format!( + "The number of return values ({}) does not match the number of lookups ({})", + values.len(), + self.lookups.len() + ); + for lookup in &mut self.lookups { + lookup.complete_with_error(Error::UnexpectedError { + message: err_msg.clone(), + source: None, + }); + } + return; + } + + for (lookup, value) in self.lookups.iter_mut().zip(values) { + lookup.complete(Ok(value)); + } + } + + fn complete_all_with_error(&mut self, error_msg: &str) { + for lookup in &mut self.lookups { + lookup.complete_with_error(Error::UnexpectedError { + message: error_msg.to_string(), + source: None, + }); + } + } + + fn keys_tuple(&mut self) -> (BucketId, Option, Vec) { + ( + self.table_bucket.bucket_id(), + self.table_bucket.partition_id(), + std::mem::take(&mut self.keys), + ) + } +} + +impl LookupSender { + pub fn new( + metadata: Arc, + queue: LookupQueue, + re_enqueue_tx: mpsc::UnboundedSender, + max_inflight_requests: usize, + max_retries: i32, + 
shutdown_rx: watch::Receiver, + ) -> Self { + Self { + metadata, + queue, + re_enqueue_tx, + inflight_semaphore: Arc::new(Semaphore::new(max_inflight_requests)), + max_retries, + running: AtomicBool::new(true), + force_close: AtomicBool::new(false), + shutdown_rx, + } + } + + pub async fn run(&mut self) { + debug!("Starting Fluss lookup sender"); + + let mut shutdown_rx = self.shutdown_rx.clone(); + + while self.running.load(Ordering::Acquire) { + if *shutdown_rx.borrow() { + debug!("Lookup sender received shutdown signal"); + self.initiate_close(); + break; + } + + tokio::select! { + biased; + _ = shutdown_rx.changed() => { + if *shutdown_rx.borrow() { + debug!("Lookup sender received shutdown signal during select"); + self.initiate_close(); + } + } + result = self.run_once(false) => { + if let Err(e) = result { + error!("Error in lookup sender: {}", e); + } + } + } + } + + debug!("Beginning shutdown of lookup sender, sending remaining lookups"); + + // TODO: Check the in-flight request count in the accumulator. + if !self.force_close.load(Ordering::Acquire) && self.queue.has_undrained() { + if let Err(e) = self.run_once(true).await { + error!("Error during lookup sender shutdown: {}", e); + } + } + + // TODO: If force close failed, add logic to abort incomplete lookup requests. 
+ debug!("Lookup sender shutdown complete"); + } + + async fn run_once(&mut self, drain_all: bool) -> Result<()> { + let lookups = if drain_all { + self.queue.drain_all() + } else { + self.queue.drain().await + }; + + self.send_lookups(lookups).await + } + + async fn send_lookups(&self, lookups: Vec) -> Result<()> { + if lookups.is_empty() { + return Ok(()); + } + + let GroupingResult { + mut groups, + unknowns, + } = self.group_by_leader(lookups); + + if !unknowns.is_empty() { + let table_paths_refs: HashSet<&TablePath> = + groups.unknown_leader_tables.iter().collect(); + let partition_ids: Vec = groups + .unknown_leader_partition_ids + .iter() + .copied() + .collect(); + if let Err(e) = self + .metadata + .update_tables_metadata(&table_paths_refs, &HashSet::new(), partition_ids) + .await + { + warn!("Failed to update metadata for unknown leader tables: {}", e); + } else { + debug!( + "Updated metadata due to unknown leader tables during lookup: {:?}", + groups.unknown_leader_tables + ); + } + + // Re-group with fresh cluster state; dispatch what resolved, re-enqueue the rest. + let retry = self.group_by_leader(unknowns); + groups.merge_batches(retry.groups); + for item in retry.unknowns { + self.re_enqueue_lookup(item); + } + + // Nothing to dispatch even after refresh — back off to avoid a tight RPC loop. + if groups.is_empty() { + let mut cluster_rx = self.metadata.subscribe_cluster_changes(); + tokio::select! 
{ + _ = cluster_rx.changed() => {} + _ = tokio::time::sleep(Duration::from_millis(100)) => {} + } + return Ok(()); + } + } + + let primary_fut = async { + let mut pending = FuturesUnordered::new(); + for (server, batches) in groups.primary { + pending.push(self.send_request::(server, batches)); + } + while pending.next().await.is_some() {} + }; + let prefix_fut = async { + let mut pending = FuturesUnordered::new(); + for (server, batches) in groups.prefix { + pending.push(self.send_request::(server, batches)); + } + while pending.next().await.is_some() {} + }; + tokio::join!(primary_fut, prefix_fut); + + Ok(()) + } + + fn group_by_leader(&self, lookups: Vec) -> GroupingResult { + let cluster = self.metadata.get_cluster(); + let mut primary: PrimaryBatches = HashMap::new(); + let mut prefix: PrefixBatches = HashMap::new(); + let mut unknown_leader_tables: HashSet = HashSet::new(); + let mut unknown_leader_partition_ids: HashSet = HashSet::new(); + let mut unknowns: Vec = Vec::new(); + + for query in lookups { + let table_bucket = query.table_bucket().clone(); + + let leader = match cluster.leader_for(&table_bucket) { + Some(leader) => leader.id(), + None => { + warn!( + "No leader found for table bucket {} during lookup", + table_bucket + ); + unknown_leader_tables.insert(query.table_path().clone()); + if let Some(partition_id) = table_bucket.partition_id() { + unknown_leader_partition_ids.insert(partition_id); + } + unknowns.push(query); + continue; + } + }; + + match query { + QueuedLookup::Primary(q) => { + primary + .entry(leader) + .or_default() + .entry(table_bucket.clone()) + .or_insert_with(|| LookupBatch::new(table_bucket)) + .add_lookup(q); + } + QueuedLookup::Prefix(q) => { + prefix + .entry(leader) + .or_default() + .entry(table_bucket.clone()) + .or_insert_with(|| LookupBatch::new(table_bucket)) + .add_lookup(q); + } + } + } + + GroupingResult { + groups: GroupByLeaderResult { + primary, + prefix, + unknown_leader_tables, + unknown_leader_partition_ids, 
+ }, + unknowns, + } + } + + async fn send_request( + &self, + destination: ServerId, + batches_by_bucket: HashMap>, + ) where + LookupQuery: Into, + { + let mut batches_by_table = group_by_table(batches_by_bucket); + let connection = match self + .connect_or_fail(destination, &mut batches_by_table) + .await + { + Some(conn) => conn, + None => return, + }; + + let mut pending = FuturesUnordered::new(); + for (table_id, mut batches) in batches_by_table { + let keys_by_bucket: Vec<_> = batches.iter_mut().map(|b| b.keys_tuple()).collect(); + let request = P::build_request(table_id, keys_by_bucket); + pending.push(self.send_single_table_lookup::

( + table_id, + destination, + connection.clone(), + request, + batches, + )); + } + while pending.next().await.is_some() {} + } + + async fn connect_or_fail( + &self, + destination: ServerId, + batches_by_table: &mut HashMap>>, + ) -> Option + where + LookupQuery: Into, + { + let cluster = self.metadata.get_cluster(); + let tablet_server = match cluster.get_tablet_server(destination) { + Some(server) => server.clone(), + None => { + let err_msg = format!("Server {} is not found in metadata cache", destination); + self.fail_all_batches(&err_msg, true, batches_by_table); + return None; + } + }; + + match self.metadata.get_connection(&tablet_server).await { + Ok(conn) => Some(conn), + Err(e) => { + let err_msg = format!("Failed to get connection to server {}: {}", destination, e); + self.fail_all_batches(&err_msg, true, batches_by_table); + None + } + } + } + + fn fail_all_batches( + &self, + err_msg: &str, + is_retriable: bool, + batches_by_table: &mut HashMap>>, + ) where + LookupQuery: Into, + { + for batches in batches_by_table.values_mut() { + for batch in batches.iter_mut() { + self.handle_batch_error(err_msg, is_retriable, batch); + } + } + } + + async fn send_single_table_lookup( + &self, + table_id: TableId, + destination: ServerId, + connection: ServerConnection, + request: P::Request, + mut batches: Vec>, + ) where + LookupQuery: Into, + { + let _permit = match self.acquire_inflight_permit(&mut batches).await { + Some(p) => p, + None => return, + }; + + match connection.request(request).await { + Ok(response) => { + self.handle_response::

(table_id, destination, response, &mut batches); + } + Err(e) => { + let err_msg = format!("{} request failed: {}", P::OP_NAME, e); + let is_retriable = e.is_retriable(); + for batch in &mut batches { + self.handle_batch_error(&err_msg, is_retriable, batch); + } + } + } + } + + async fn acquire_inflight_permit( + &self, + batches: &mut [LookupBatch], + ) -> Option { + match self.inflight_semaphore.clone().acquire_owned().await { + Ok(p) => Some(p), + Err(_) => { + error!("Semaphore closed during lookup"); + for batch in batches.iter_mut() { + batch.complete_all_with_error("Lookup sender shutdown"); + } + None + } + } + } + + fn handle_response( + &self, + table_id: TableId, + destination: ServerId, + response: P::Response, + batches: &mut [LookupBatch], + ) where + LookupQuery: Into, + { + let bucket_to_index = build_bucket_index(batches); + let mut processed = vec![false; batches.len()]; + + for bucket_resp in P::decode_buckets(response) { + let table_bucket = TableBucket::new_with_partition( + table_id, + bucket_resp.partition_id, + bucket_resp.bucket_id, + ); + let Some(&idx) = bucket_to_index.get(&table_bucket) else { + error!( + "Received {} response for unknown bucket {} from server {}", + P::OP_NAME, + table_bucket, + destination + ); + continue; + }; + processed[idx] = true; + let batch = &mut batches[idx]; + + if let Some(err) = extract_bucket_error( + bucket_resp.error_code, + bucket_resp.error_message, + &table_bucket, + P::OP_NAME, + ) { + self.handle_batch_error(&err.message, err.is_retriable, batch); + continue; + } + + batch.complete(bucket_resp.values); + } + + self.fail_unprocessed_batches(&processed, batches, destination, P::OP_NAME); + } + + fn fail_unprocessed_batches( + &self, + processed: &[bool], + batches: &mut [LookupBatch], + destination: ServerId, + op_name: &'static str, + ) where + LookupQuery: Into, + { + for (idx, was_processed) in processed.iter().enumerate() { + if !was_processed { + let batch = &mut batches[idx]; + let err_msg = 
format!( + "Bucket {} {} response missing from server {}", + batch.table_bucket.bucket_id(), + op_name, + destination + ); + self.handle_batch_error(&err_msg, true, batch); + } + } + } + + fn handle_batch_error(&self, error_msg: &str, is_retriable: bool, batch: &mut LookupBatch) + where + LookupQuery: Into, + { + let mut retried = 0usize; + let mut failed = 0usize; + let table_bucket = batch.table_bucket.clone(); + + for mut lookup in batch.lookups.drain(..) { + if is_retriable && lookup.retries() < self.max_retries && !lookup.is_done() { + lookup.increment_retries(); + self.re_enqueue_lookup(lookup.into()); + retried += 1; + } else { + lookup.complete_with_error(Error::UnexpectedError { + message: error_msg.to_string(), + source: None, + }); + failed += 1; + } + } + + if retried > 0 { + warn!( + "Lookup error for bucket {}, retrying {} lookups: {}", + table_bucket, retried, error_msg + ); + } + if failed > 0 { + warn!( + "Lookup failed for bucket {} ({} lookups): {}", + table_bucket, failed, error_msg + ); + } + } + + fn re_enqueue_lookup(&self, lookup: QueuedLookup) { + if let Err(e) = self.re_enqueue_tx.send(lookup) { + error!("Failed to re-enqueue lookup: {}", e); + let mut failed_lookup = e.0; + failed_lookup.complete_with_error(Error::UnexpectedError { + message: "Failed to re-enqueue lookup: channel closed".to_string(), + source: None, + }); + } + } + + pub fn initiate_close(&mut self) { + self.running.store(false, Ordering::Release); + } + + #[allow(dead_code)] + pub fn force_close(&mut self) { + self.force_close.store(true, Ordering::Release); + self.initiate_close(); + } +} + +fn group_by_table( + batches_by_bucket: HashMap>, +) -> HashMap>> { + let mut out: HashMap>> = HashMap::new(); + for (table_bucket, batch) in batches_by_bucket { + out.entry(table_bucket.table_id()).or_default().push(batch); + } + out +} + +fn build_bucket_index(batches: &[LookupBatch]) -> HashMap { + batches + .iter() + .enumerate() + .map(|(idx, batch)| 
(batch.table_bucket.clone(), idx)) + .collect() +} + +struct BucketError { + message: String, + is_retriable: bool, +} + +fn extract_bucket_error( + error_code: Option, + error_message: Option, + table_bucket: &TableBucket, + op: &str, +) -> Option { + let code = error_code?; + let fluss_error = FlussError::for_code(code); + if fluss_error == FlussError::None { + return None; + } + Some(BucketError { + message: format!( + "{} error for bucket {}: code={}, message={}", + op, + table_bucket, + code, + error_message.unwrap_or_default() + ), + is_retriable: fluss_error.is_retriable(), + }) +} diff --git a/fluss-rust/crates/fluss/src/client/lookup/mod.rs b/fluss-rust/crates/fluss/src/client/lookup/mod.rs new file mode 100644 index 0000000000..ac2446a9e4 --- /dev/null +++ b/fluss-rust/crates/fluss/src/client/lookup/mod.rs @@ -0,0 +1,39 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Lookup client implementation with batching and queuing support. +//! +//! This module provides a high-throughput lookup client that batches multiple +//! lookup operations together to reduce network round trips, achieving parity +//! with the Java client implementation. +//! +//! # Example +//! +//! ```ignore +//! 
let lookup_client = LookupClient::new(config, metadata); +//! let future = lookup_client.lookup(table_path, table_bucket, key_bytes); +//! let result = future.await?; +//! ``` + +mod lookup_client; +mod lookup_query; +mod lookup_queue; +mod lookup_sender; + +pub use lookup_client::LookupClient; +pub(crate) use lookup_query::{PrefixLookupQuery, PrimaryLookupQuery, QueuedLookup}; +pub(crate) use lookup_queue::LookupQueue; diff --git a/fluss-rust/crates/fluss/src/client/metadata.rs b/fluss-rust/crates/fluss/src/client/metadata.rs new file mode 100644 index 0000000000..1e3ee7fe1c --- /dev/null +++ b/fluss-rust/crates/fluss/src/client/metadata.rs @@ -0,0 +1,367 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::PartitionId; +use crate::cluster::{Cluster, ServerNode, ServerType}; +use crate::error::{Error, FlussError, Result}; +use crate::metadata::{PhysicalTablePath, TableBucket, TablePath}; +use crate::proto::MetadataResponse; +use crate::rpc::message::UpdateMetadataRequest; +use crate::rpc::{RpcClient, ServerConnection}; +use log::info; +use parking_lot::RwLock; +use std::collections::HashSet; +use std::net::{SocketAddr, ToSocketAddrs}; +use std::sync::Arc; +use tokio::sync::watch; + +pub struct Metadata { + cluster: RwLock>, + connections: Arc, + bootstrap: Arc, + cluster_version_tx: watch::Sender, +} + +impl Metadata { + pub async fn new(bootstrap: &str, connections: Arc) -> Result { + let cluster = Self::init_cluster(bootstrap, connections.clone()).await?; + let (cluster_version_tx, _) = watch::channel(0); + Ok(Metadata { + cluster: RwLock::new(Arc::new(cluster)), + connections, + bootstrap: bootstrap.into(), + cluster_version_tx, + }) + } + + pub fn subscribe_cluster_changes(&self) -> watch::Receiver { + self.cluster_version_tx.subscribe() + } + + fn notify_cluster_changed(&self) { + self.cluster_version_tx + .send_modify(|v| *v = v.wrapping_add(1)); + } + + fn parse_bootstrap(boot_strap: &str) -> Result { + // Resolve all socket addresses and deterministically choose one. + let addrs = boot_strap + .to_socket_addrs() + .map_err(|e| Error::IllegalArgument { + message: format!("Invalid bootstrap address '{boot_strap}': {e}"), + })?; + + // Prefer IPv4 addresses; if none are available, fall back to the first IPv6. 
+ let mut ipv6_candidate: Option = None; + for addr in addrs { + if addr.is_ipv4() { + return Ok(addr); + } + if ipv6_candidate.is_none() { + ipv6_candidate = Some(addr); + } + } + + let addr = ipv6_candidate.ok_or_else(|| Error::IllegalArgument { + message: format!("Unable to resolve bootstrap address '{boot_strap}'"), + })?; + Ok(addr) + } + + async fn init_cluster(boot_strap: &str, connections: Arc) -> Result { + let socket_address = Self::parse_bootstrap(boot_strap)?; + let server_node = ServerNode::new( + -1, + socket_address.ip().to_string(), + socket_address.port() as u32, + ServerType::CoordinatorServer, + ); + let con = connections.get_connection(&server_node).await?; + + let response = con + .request(UpdateMetadataRequest::new( + &HashSet::default(), + &HashSet::new(), + vec![], + )) + .await?; + Cluster::from_metadata_response(response, None) + } + + pub(crate) async fn reinit_cluster(&self) -> Result<()> { + let cluster = Self::init_cluster(&self.bootstrap, self.connections.clone()).await?; + *self.cluster.write() = cluster.into(); + self.notify_cluster_changed(); + Ok(()) + } + + pub fn invalidate_server(&self, server_id: &i32, table_ids: Vec) { + { + let mut cluster_guard = self.cluster.write(); + let updated_cluster = cluster_guard.invalidate_server(server_id, table_ids); + *cluster_guard = Arc::new(updated_cluster); + } + self.notify_cluster_changed(); + } + + pub fn invalidate_physical_table_meta( + &self, + physical_tables_to_invalid: &HashSet, + ) { + { + let mut cluster_guard = self.cluster.write(); + let updated_cluster = + cluster_guard.invalidate_physical_table_meta(physical_tables_to_invalid); + *cluster_guard = Arc::new(updated_cluster); + } + self.notify_cluster_changed(); + } + + pub async fn update(&self, metadata_response: MetadataResponse) -> Result<()> { + let origin_cluster = self.cluster.read().clone(); + let new_cluster = + Cluster::from_metadata_response(metadata_response, Some(&origin_cluster))?; + { + let mut cluster = 
self.cluster.write(); + *cluster = Arc::new(new_cluster); + } + self.notify_cluster_changed(); + Ok(()) + } + + pub async fn update_tables_metadata( + &self, + table_paths: &HashSet<&TablePath>, + physical_table_paths: &HashSet<&Arc>, + partition_ids: Vec, + ) -> Result<()> { + let maybe_server = { + let guard = self.cluster.read(); + guard.get_one_available_server().cloned() + }; + + let server = match maybe_server { + Some(s) => s, + None => { + info!( + "No available tablet server to update metadata, attempting to re-initialize cluster using bootstrap server." + ); + self.reinit_cluster().await?; + return Ok(()); + } + }; + + let conn = self.connections.get_connection(&server).await?; + + let response = conn + .request(UpdateMetadataRequest::new( + table_paths, + physical_table_paths, + partition_ids, + )) + .await?; + self.update(response).await?; + Ok(()) + } + + pub async fn update_table_metadata(&self, table_path: &TablePath) -> Result<()> { + self.update_tables_metadata(&HashSet::from([table_path]), &HashSet::new(), vec![]) + .await + } + + pub async fn update_physical_table_metadata( + &self, + physical_table_paths: &[Arc], + ) -> Result<()> { + let mut update_table_paths = HashSet::new(); + let mut update_partition_paths = HashSet::new(); + for physical_table_path in physical_table_paths { + match physical_table_path.get_partition_name() { + Some(_) => { + update_partition_paths.insert(physical_table_path); + } + None => { + update_table_paths.insert(physical_table_path.get_table_path()); + } + } + } + + self.update_tables_metadata(&update_table_paths, &update_partition_paths, vec![]) + .await + } + + pub async fn check_and_update_table_metadata(&self, table_paths: &[TablePath]) -> Result<()> { + let cluster_binding = self.cluster.read().clone(); + let need_update_table_paths: HashSet<&TablePath> = table_paths + .iter() + .filter(|table_path| cluster_binding.opt_get_table(table_path).is_none()) + .collect(); + + if !need_update_table_paths.is_empty() { + 
self.update_tables_metadata(&need_update_table_paths, &HashSet::new(), vec![]) + .await?; + } + Ok(()) + } + + /// Resolves the partition id, refreshing metadata once if not cached. + /// Returns `None` when the partition does not exist — `PartitionNotExists` + /// server errors are swallowed so callers can short-circuit to an empty result. + pub async fn check_and_update_partition_metadata( + &self, + physical_table_path: &PhysicalTablePath, + ) -> Result> { + if let Some(id) = self.get_cluster().get_partition_id(physical_table_path) { + return Ok(Some(id)); + } + let path = Arc::new(physical_table_path.clone()); + match self.update_physical_table_metadata(&[path]).await { + Ok(()) => {} + Err(e) if matches!(e.api_error(), Some(FlussError::PartitionNotExists)) => { + return Ok(None); + } + Err(e) => return Err(e), + } + Ok(self.get_cluster().get_partition_id(physical_table_path)) + } + + pub async fn get_connection(&self, server_node: &ServerNode) -> Result { + let result = self.connections.get_connection(server_node).await?; + Ok(result) + } + + pub fn get_cluster(&self) -> Arc { + let guard = self.cluster.read(); + guard.clone() + } + + const MAX_RETRY_TIMES: u8 = 3; + + pub async fn leader_for( + &self, + table_path: &TablePath, + table_bucket: &TableBucket, + ) -> Result> { + let leader = self.get_leader_for(table_bucket); + + if leader.is_some() { + Ok(leader) + } else { + for _ in 0..Self::MAX_RETRY_TIMES { + if let Some(partition_id) = table_bucket.partition_id() { + self.update_tables_metadata( + &HashSet::from([table_path]), + &HashSet::new(), + vec![partition_id], + ) + .await?; + } else { + self.update_tables_metadata( + &HashSet::from([table_path]), + &HashSet::new(), + vec![], + ) + .await?; + } + + let cluster = self.cluster.read(); + let leader = cluster.leader_for(table_bucket); + + if leader.is_some() { + return Ok(leader.cloned()); + } + } + + Ok(None) + } + } + + fn get_leader_for(&self, table_bucket: &TableBucket) -> Option { + let cluster = 
self.cluster.read(); + cluster.leader_for(table_bucket).cloned() + } +} + +#[cfg(test)] +impl Metadata { + pub(crate) fn new_for_test(cluster: Arc) -> Self { + let (cluster_version_tx, _) = watch::channel(0); + Metadata { + cluster: RwLock::new(cluster), + connections: Arc::new(RpcClient::new()), + bootstrap: Arc::from(""), + cluster_version_tx, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::metadata::{TableBucket, TablePath}; + use crate::test_utils::build_cluster_arc; + + #[tokio::test] + async fn leader_for_returns_server() { + let table_path = TablePath::new("db".to_string(), "tbl".to_string()); + let cluster = build_cluster_arc(&table_path, 1, 1); + let metadata = Metadata::new_for_test(cluster); + let leader = metadata + .leader_for(&table_path, &TableBucket::new(1, 0)) + .await + .unwrap() + .expect("leader"); + assert_eq!(leader.id(), 1); + } + + #[test] + fn invalidate_server_removes_leader() { + let table_path = TablePath::new("db".to_string(), "tbl".to_string()); + let cluster = build_cluster_arc(&table_path, 1, 1); + let metadata = Metadata::new_for_test(cluster); + metadata.invalidate_server(&1, vec![1]); + let cluster = metadata.get_cluster(); + assert!(cluster.get_tablet_server(1).is_none()); + } + + #[test] + fn parse_bootstrap_variants() { + // valid IP + let addr = Metadata::parse_bootstrap("127.0.0.1:8080").unwrap(); + assert_eq!(addr.port(), 8080); + + // valid hostname + let addr = Metadata::parse_bootstrap("localhost:9090").unwrap(); + assert_eq!(addr.port(), 9090); + + // valid IPv6 address + let addr = Metadata::parse_bootstrap("[::1]:8080").unwrap(); + assert_eq!(addr.port(), 8080); + + // invalid input: missing port + assert!(Metadata::parse_bootstrap("localhost").is_err()); + + // invalid input: out-of-range port + assert!(Metadata::parse_bootstrap("localhost:99999").is_err()); + + // invalid input: empty string + assert!(Metadata::parse_bootstrap("").is_err()); + + // invalid input: nonsensical address + 
assert!(Metadata::parse_bootstrap("invalid_address").is_err()); + } +} diff --git a/fluss-rust/crates/fluss/src/client/mod.rs b/fluss-rust/crates/fluss/src/client/mod.rs new file mode 100644 index 0000000000..f8027948ae --- /dev/null +++ b/fluss-rust/crates/fluss/src/client/mod.rs @@ -0,0 +1,34 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +mod admin; +mod connection; +mod credentials; +pub mod lookup; +mod metadata; +mod schema_getter; +mod table; +mod write; + +pub use admin::*; +pub use connection::*; +pub use credentials::*; +pub use lookup::LookupClient; +pub use metadata::*; +pub(crate) use schema_getter::ClientSchemaGetter; +pub use table::*; +pub use write::*; diff --git a/fluss-rust/crates/fluss/src/client/schema_getter.rs b/fluss-rust/crates/fluss/src/client/schema_getter.rs new file mode 100644 index 0000000000..4b643c0bec --- /dev/null +++ b/fluss-rust/crates/fluss/src/client/schema_getter.rs @@ -0,0 +1,76 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Per-table schema cache that lazily fetches missing schema versions +//! from the coordinator. Used by the lookup path to decode rows that +//! predate the table's current schema. + +use crate::client::admin::FlussAdmin; +use crate::error::{Error, Result}; +use crate::metadata::{Schema, SchemaInfo, TablePath}; +use parking_lot::RwLock; +use std::collections::HashMap; +use std::sync::Arc; + +pub(crate) struct ClientSchemaGetter { + table_path: TablePath, + admin: Arc, + /// Pre-seeded with the table's current schema so the dominant case + /// (every row written under the latest schema) needs zero RPCs. + cache: RwLock>>, +} + +impl ClientSchemaGetter { + pub fn new(table_path: TablePath, admin: Arc, latest: SchemaInfo) -> Self { + let mut map = HashMap::new(); + let (schema, schema_id) = latest.into_parts(); + map.insert(schema_id, Arc::new(schema)); + Self { + table_path, + admin, + cache: RwLock::new(map), + } + } + + /// Concurrent fetches for the same id are not deduplicated; we + /// accept one redundant RPC in exchange for staying off + /// `tokio::sync` machinery. Schemas are immutable per id, so + /// last-write-wins on the cache insert is correct. 
+ pub async fn get_schema(&self, schema_id: i32) -> Result> { + if let Some(schema) = self.cache.read().get(&schema_id).cloned() { + return Ok(schema); + } + + let info = self + .admin + .get_table_schema(&self.table_path, Some(schema_id)) + .await?; + let (schema, fetched_id) = info.into_parts(); + if fetched_id != schema_id { + return Err(Error::UnexpectedError { + message: format!( + "Requested schema id {schema_id}, but server returned schema id {fetched_id}" + ), + source: None, + }); + } + let schema = Arc::new(schema); + + self.cache.write().insert(schema_id, Arc::clone(&schema)); + Ok(schema) + } +} diff --git a/fluss-rust/crates/fluss/src/client/table/append.rs b/fluss-rust/crates/fluss/src/client/table/append.rs new file mode 100644 index 0000000000..562e8ea7e7 --- /dev/null +++ b/fluss-rust/crates/fluss/src/client/table/append.rs @@ -0,0 +1,157 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::client::table::partition_getter::{PartitionGetter, get_physical_path}; +use crate::client::{WriteRecord, WriteResultFuture, WriterClient}; +use crate::error::Error::IllegalArgument; +use crate::error::Result; +use crate::metadata::{PhysicalTablePath, TableInfo, TablePath}; +use crate::row::{ColumnarRow, InternalRow}; +use arrow::array::RecordBatch; +use std::sync::Arc; + +pub struct TableAppend { + table_path: Arc, + table_info: Arc, + writer_client: Arc, +} + +impl TableAppend { + pub(super) fn new( + table_path: TablePath, + table_info: Arc, + writer_client: Arc, + ) -> Self { + Self { + table_path: Arc::new(table_path), + table_info, + writer_client, + } + } + + pub fn create_writer(&self) -> Result { + let partition_getter = if self.table_info.is_partitioned() { + Some(PartitionGetter::new( + self.table_info.row_type(), + Arc::clone(self.table_info.get_partition_keys()), + )?) + } else { + None + }; + + Ok(AppendWriter { + table_path: Arc::clone(&self.table_path), + partition_getter, + writer_client: self.writer_client.clone(), + table_info: Arc::clone(&self.table_info), + }) + } +} + +pub struct AppendWriter { + table_path: Arc, + partition_getter: Option, + writer_client: Arc, + table_info: Arc, +} + +impl AppendWriter { + fn check_field_count(&self, row: &R) -> Result<()> { + let expected = self.table_info.get_row_type().fields().len(); + if row.get_field_count() != expected { + return Err(IllegalArgument { + message: format!( + "The field count of the row does not match the table schema. \ + Expected: {}, Actual: {}", + expected, + row.get_field_count() + ), + }); + } + Ok(()) + } + + /// Appends a row to the table. + /// + /// This method returns a [`WriteResultFuture`] immediately after queueing the write, + /// enabling fire-and-forget semantics for efficient batching. + /// + /// # Arguments + /// * row - the row to append. 
+ /// + /// # Returns + /// A [`WriteResultFuture`] that can be awaited to wait for server acknowledgment, + /// or dropped for fire-and-forget behavior (use `flush()` to ensure delivery). + pub fn append(&self, row: &R) -> Result { + self.check_field_count(row)?; + let physical_table_path = Arc::new(get_physical_path( + &self.table_path, + self.partition_getter.as_ref(), + row, + )?); + let record = WriteRecord::for_append( + Arc::clone(&self.table_info), + physical_table_path, + self.table_info.schema_id, + row, + ); + let result_handle = self.writer_client.send(&record)?; + Ok(WriteResultFuture::new(result_handle)) + } + + /// Appends an Arrow RecordBatch to the table. + /// + /// This method returns a [`WriteResultFuture`] immediately after queueing the write, + /// enabling fire-and-forget semantics for efficient batching. + /// + /// For partitioned tables, the partition is derived from the **first row** of the batch. + /// Callers must ensure all rows in the batch belong to the same partition. + /// + /// # Returns + /// A [`WriteResultFuture`] that can be awaited to wait for server acknowledgment, + /// or dropped for fire-and-forget behavior (use `flush()` to ensure delivery). + pub fn append_arrow_batch(&self, batch: RecordBatch) -> Result { + let physical_table_path = if self.partition_getter.is_some() && batch.num_rows() > 0 { + let first_row = ColumnarRow::new( + Arc::new(batch.clone()), + Arc::new(self.table_info.row_type.clone()), + 0, + None, + ); + Arc::new(get_physical_path( + &self.table_path, + self.partition_getter.as_ref(), + &first_row, + )?) 
+ } else { + Arc::new(PhysicalTablePath::of(Arc::clone(&self.table_path))) + }; + + let record = WriteRecord::for_append_record_batch( + Arc::clone(&self.table_info), + physical_table_path, + self.table_info.schema_id, + batch, + ); + let result_handle = self.writer_client.send(&record)?; + Ok(WriteResultFuture::new(result_handle)) + } + + pub async fn flush(&self) -> Result<()> { + self.writer_client.flush().await + } +} diff --git a/fluss-rust/crates/fluss/src/client/table/batch_scanner.rs b/fluss-rust/crates/fluss/src/client/table/batch_scanner.rs new file mode 100644 index 0000000000..cc0585f30e --- /dev/null +++ b/fluss-rust/crates/fluss/src/client/table/batch_scanner.rs @@ -0,0 +1,767 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Bounded batch scanner backed by a single `LimitScanRequest`, polled with +//! `next_batch` until it returns `None` (like `RecordBatchLogReader`). +//! +//! The KV branch decodes a [`ValueRecordBatch`], decoding each record against +//! its own schema id via [`FixedSchemaDecoder`] so older records are projected +//! onto the current schema (the same path as lookup). 
+ +use crate::client::ClientSchemaGetter; +use crate::client::metadata::Metadata; +use crate::error::{ApiError, Error, FlussError, Result}; +use crate::metadata::{KvFormat, RowType, Schema, TableBucket, TableInfo}; +use crate::proto::ErrorResponse; +use crate::record::kv::{SCHEMA_ID_LENGTH, ValueRecordBatch}; +use crate::record::{ + LogRecordsBatches, ReadContext as ArrowReadContext, RowAppendRecordBatchBuilder, ScanBatch, + to_arrow_schema, +}; +use crate::row::FixedSchemaDecoder; +use crate::rpc::RpcClient; +use crate::rpc::message::LimitScanRequest; +use arrow::array::RecordBatch; +use arrow::compute::concat_batches; +use arrow_schema::SchemaRef; +use byteorder::{ByteOrder, LittleEndian}; +use bytes::Bytes; +use std::collections::HashMap; +use std::ops::Range; +use std::sync::Arc; + +/// One-shot bounded scanner: a single `LimitScanRequest` yielded as one +/// [`ScanBatch`]. Creation is cheap; the request runs on the first +/// [`next_batch`](Self::next_batch), which returns the batch once, then `None`. +pub struct LimitBatchScanner { + bucket: TableBucket, + /// Taken on the first `next_batch` to run the scan; `None` afterward. + pending: Option, +} + +/// Request inputs captured at creation, consumed by the first `next_batch`. +struct PendingScan { + rpc_client: Arc, + metadata: Arc, + table_info: TableInfo, + schema_getter: Arc, + projected_fields: Option>, + limit: i32, +} + +impl LimitBatchScanner { + pub(super) fn new( + rpc_client: Arc, + metadata: Arc, + table_info: TableInfo, + schema_getter: Arc, + projected_fields: Option>, + bucket: TableBucket, + limit: i32, + ) -> Self { + Self { + bucket, + pending: Some(PendingScan { + rpc_client, + metadata, + table_info, + schema_getter, + projected_fields, + limit, + }), + } + } + + /// Runs the scan on the first call and returns its batch, then `None`. Not + /// retried — an error leaves the scanner spent; create a new one to retry. 
+ pub async fn next_batch(&mut self) -> Result> { + let Some(pending) = self.pending.take() else { + return Ok(None); + }; + run_limit_scan(&pending, &self.bucket).await.map(Some) + } + + /// Drains the scanner into all of its batches. + pub async fn collect_all_batches(&mut self) -> Result> { + let mut batches = Vec::new(); + while let Some(batch) = self.next_batch().await? { + batches.push(batch); + } + Ok(batches) + } + + /// The bucket scanned by this `LimitBatchScanner`. + pub fn bucket(&self) -> &TableBucket { + &self.bucket + } +} + +/// Resolves the leader, sends the `LimitScanRequest`, and decodes the response +/// into one [`ScanBatch`]. +async fn run_limit_scan(pending: &PendingScan, bucket: &TableBucket) -> Result { + let leader = pending + .metadata + .leader_for(&pending.table_info.table_path, bucket) + .await? + .ok_or_else(|| { + Error::leader_not_available(format!("No leader found for table bucket: {bucket}")) + })?; + let connection = pending.rpc_client.get_connection(&leader).await?; + + let request = LimitScanRequest::new( + pending.table_info.table_id, + bucket.partition_id(), + bucket.bucket_id(), + pending.limit, + ); + let response = connection.request(request).await?; + + if let Some(error_code) = response.error_code + && error_code != FlussError::None.code() + { + let err: ApiError = ErrorResponse { + error_code, + error_message: response.error_message.clone(), + } + .into(); + return Err(Error::FlussAPIError { api_error: err }); + } + + let raw = response.records.unwrap_or_default(); + // `limit` is validated positive by `TableScan::limit`. + let limit = pending.limit.max(0) as usize; + let projected = pending.projected_fields.as_deref(); + + // Choose the payload format from table metadata, not the response's advisory + // `is_log_table` flag. + let (batch, base_offset) = if !pending.table_info.has_primary_key() { + decode_log_batch(&pending.table_info, projected, raw, limit)? 
+ } else { + // KV (primary-key) limit scan: no log offset, so base_offset is 0. + let batch = decode_kv_batch( + &pending.table_info, + &pending.schema_getter, + projected, + raw, + limit, + ) + .await?; + (batch, 0) + }; + + Ok(ScanBatch::new(bucket.clone(), batch, base_offset)) +} + +/// Decode the log payload into a single Arrow `RecordBatch`, concatenating any +/// inner batches. If more than `limit` rows are returned, the last `limit` are +/// kept and `base_offset` is advanced by the number dropped. +fn decode_log_batch( + table_info: &TableInfo, + projected_fields: Option<&[usize]>, + raw: Vec, + limit: usize, +) -> Result<(RecordBatch, i64)> { + let row_type = Arc::new(table_info.get_row_type().clone()); + let full_schema = to_arrow_schema(table_info.get_row_type())?; + let read_context = match projected_fields { + None => ArrowReadContext::new(full_schema.clone(), row_type.clone(), false), + Some(fields) => ArrowReadContext::with_projection_pushdown( + full_schema.clone(), + row_type.clone(), + fields.to_vec(), + false, + )?, + }; + + let target_schema: SchemaRef = match projected_fields { + None => full_schema, + Some(fields) => { + ArrowReadContext::project_schema(to_arrow_schema(table_info.get_row_type())?, fields)? 
+ } + }; + + if raw.is_empty() { + return Ok((RecordBatch::new_empty(target_schema), 0)); + } + + let mut batches: Vec = Vec::new(); + let mut base_offset: Option = None; + for log_batch in LogRecordsBatches::new(raw) { + let log_batch = log_batch?; + if base_offset.is_none() { + base_offset = Some(log_batch.base_log_offset()); + } + let rb = log_batch.record_batch(&read_context)?; + batches.push(rb); + } + + let base_offset = base_offset.unwrap_or(0); + let merged = if batches.is_empty() { + RecordBatch::new_empty(target_schema) + } else if batches.len() == 1 { + batches.into_iter().next().unwrap() + } else { + concat_batches(&target_schema, batches.iter()).map_err(|e| Error::UnexpectedError { + message: format!("Failed to concatenate log record batches: {e}"), + source: None, + })? + }; + + Ok(take_last_rows(merged, base_offset, limit)) +} + +/// Decode a KV limit-scan [`ValueRecordBatch`] into a single Arrow +/// `RecordBatch`, decoding each record by its own schema id and projecting onto +/// the current schema. +async fn decode_kv_batch( + table_info: &TableInfo, + schema_getter: &ClientSchemaGetter, + projected_fields: Option<&[usize]>, + raw: Vec, + limit: usize, +) -> Result { + // No records: return an empty (projected) batch. + if raw.is_empty() { + return empty_record_batch(table_info.get_row_type(), projected_fields); + } + + let kv_format = table_info.table_config.get_kv_format()?; + let target_schema = table_info.get_schema(); + let target_schema_id = + i16::try_from(table_info.get_schema_id()).map_err(|_| Error::UnexpectedError { + message: format!( + "Schema id {} does not fit in 16 bits — wire format violated", + table_info.get_schema_id() + ), + source: None, + })?; + + let batch = ValueRecordBatch::new(Bytes::from(raw)); + let ranges = batch.value_ranges()?; + + // Collect the distinct schema ids present, then build one decoder per id + // (fetching older schemas via the coordinator as needed). 
+ let mut schema_ids: Vec = Vec::new(); + for range in &ranges { + let id = read_schema_id(&batch.data()[range.clone()])?; + if !schema_ids.contains(&id) { + schema_ids.push(id); + } + } + let decoders = build_kv_decoders( + schema_getter, + target_schema, + target_schema_id, + kv_format, + &schema_ids, + ) + .await?; + + value_records_to_record_batch( + &batch, + &ranges, + &decoders, + table_info.get_row_type(), + projected_fields, + limit, + ) +} + +/// Build one [`FixedSchemaDecoder`] per distinct schema id. The current schema +/// decodes without projection; older schemas are fetched and projected onto the +/// current schema. +async fn build_kv_decoders( + schema_getter: &ClientSchemaGetter, + target_schema: &Schema, + target_schema_id: i16, + kv_format: KvFormat, + schema_ids: &[i16], +) -> Result> { + let mut decoders = HashMap::with_capacity(schema_ids.len()); + for &id in schema_ids { + if decoders.contains_key(&id) { + continue; + } + let decoder = if id == target_schema_id { + FixedSchemaDecoder::new_no_projection(kv_format, target_schema)? + } else { + let source = schema_getter.get_schema(id as i32).await?; + FixedSchemaDecoder::new(kv_format, source.as_ref(), target_schema)? + }; + decoders.insert(id, decoder); + } + Ok(decoders) +} + +/// Decode every value record into a row shaped by `target_row_type`, build a +/// single Arrow batch, keep the last `limit` rows, then apply column projection. 
+fn value_records_to_record_batch( + batch: &ValueRecordBatch, + ranges: &[Range], + decoders: &HashMap, + target_row_type: &RowType, + projected_fields: Option<&[usize]>, + limit: usize, +) -> Result { + let mut builder = RowAppendRecordBatchBuilder::new(target_row_type)?; + for range in ranges { + let payload = &batch.data()[range.clone()]; + let schema_id = read_schema_id(payload)?; + let decoder = decoders + .get(&schema_id) + .ok_or_else(|| Error::UnexpectedError { + message: format!("No decoder built for schema id {schema_id}"), + source: None, + })?; + let row = decoder.decode(payload)?; + builder.append(&row)?; + } + + let full = Arc::unwrap_or_clone(builder.build_arrow_record_batch()?); + let (full, _) = take_last_rows(full, 0, limit); + project_batch(full, target_row_type, projected_fields) +} + +/// Read the leading little-endian schema id from a `[schema_id | row]` payload. +fn read_schema_id(payload: &[u8]) -> Result { + if payload.len() < SCHEMA_ID_LENGTH { + return Err(Error::UnexpectedError { + message: format!( + "Value record payload too short: {} bytes, need {} for schema id", + payload.len(), + SCHEMA_ID_LENGTH + ), + source: None, + }); + } + let schema_id = LittleEndian::read_i16(&payload[..SCHEMA_ID_LENGTH]); + if schema_id < 0 { + return Err(Error::UnexpectedError { + message: format!("Invalid negative schema id {schema_id}; payload is corrupt"), + source: None, + }); + } + Ok(schema_id) +} + +/// Keep the last `limit` rows of `batch`, advancing `base_offset` by the number +/// of dropped leading rows. A `batch` at or under the limit is returned as-is. +fn take_last_rows(batch: RecordBatch, base_offset: i64, limit: usize) -> (RecordBatch, i64) { + let rows = batch.num_rows(); + if rows > limit { + let dropped = rows - limit; + (batch.slice(dropped, limit), base_offset + dropped as i64) + } else { + (batch, base_offset) + } +} + +/// An empty `RecordBatch` with the (optionally projected) target schema. 
+fn empty_record_batch( + target_row_type: &RowType, + projected_fields: Option<&[usize]>, +) -> Result { + let empty = RecordBatch::new_empty(to_arrow_schema(target_row_type)?); + project_batch(empty, target_row_type, projected_fields) +} + +/// Project `batch` (shaped by `target_row_type`) onto the requested columns. +fn project_batch( + batch: RecordBatch, + target_row_type: &RowType, + projected_fields: Option<&[usize]>, +) -> Result { + match projected_fields { + None => Ok(batch), + Some(fields) => { + let projected_schema = + ArrowReadContext::project_schema(to_arrow_schema(target_row_type)?, fields)?; + let columns: Vec<_> = fields + .iter() + .map(|&idx| batch.column(idx).clone()) + .collect(); + Ok(RecordBatch::try_new(projected_schema, columns)?) + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::client::WriteRecord; + use crate::compression::{ + ArrowCompressionInfo, ArrowCompressionRatioEstimator, ArrowCompressionType, + DEFAULT_NON_ZSTD_COMPRESSION_LEVEL, + }; + use crate::metadata::{ + Column, DataField, DataType, DataTypes, PhysicalTablePath, Schema, TableDescriptor, + TableInfo, TablePath, + }; + use crate::record::MemoryLogRecordsArrowBuilder; + use crate::row::GenericRow; + use crate::row::binary::BinaryWriter; + use crate::row::compacted::CompactedRowWriter; + use arrow::array::{Array, Int32Array, Int64Array}; + + fn build_two_col_table_info() -> TableInfo { + let row_type = DataTypes::row(vec![ + DataField::new("id", DataTypes::int(), None), + DataField::new("name", DataTypes::string(), None), + ]); + let schema = Schema::builder() + .with_row_type(&row_type) + .build() + .expect("schema build"); + let descriptor = TableDescriptor::builder() + .schema(schema) + .distributed_by(Some(1), vec![]) + .build() + .expect("descriptor build"); + TableInfo::of( + TablePath::new("db".to_string(), "tbl".to_string()), + 42, + 1, + descriptor, + 0, + 0, + ) + } + + fn build_log_records( + table_info: &TableInfo, + base_offset: i64, + rows: 
&[(i32, &str)], + ) -> Vec { + let row_type = table_info.get_row_type(); + let table_path = table_info.table_path.clone(); + let table_info_arc = Arc::new(table_info.clone()); + let physical = Arc::new(PhysicalTablePath::of(Arc::new(table_path))); + let mut builder = MemoryLogRecordsArrowBuilder::new( + 1, + row_type, + false, + ArrowCompressionInfo { + compression_type: ArrowCompressionType::None, + compression_level: DEFAULT_NON_ZSTD_COMPRESSION_LEVEL, + }, + usize::MAX, + Arc::new(ArrowCompressionRatioEstimator::default()), + ) + .expect("builder"); + + for (i, (id, name)) in rows.iter().enumerate() { + let mut row = GenericRow::new(2); + row.set_field(0, *id); + row.set_field(1, *name); + let record = WriteRecord::for_append( + Arc::clone(&table_info_arc), + physical.clone(), + (i + 1) as i32, + &row, + ); + builder.append(&record).expect("append"); + } + let mut data = builder.build().expect("build log batch"); + // Builder always writes base_log_offset=0; patch it so tests can verify + // BatchScanner faithfully propagates whatever offset the server returned. 
+ let bytes = base_offset.to_le_bytes(); + data[..bytes.len()].copy_from_slice(&bytes); + data + } + + // ---- log path ---------------------------------------------------------- + + #[test] + fn decode_log_batch_empty_returns_empty_record_batch() { + let table_info = build_two_col_table_info(); + let (batch, base_offset) = + decode_log_batch(&table_info, None, Vec::new(), usize::MAX).expect("decode empty"); + assert_eq!(batch.num_rows(), 0); + assert_eq!(batch.num_columns(), 2); + assert_eq!(base_offset, 0); + } + + #[test] + fn decode_log_batch_empty_with_projection() { + let table_info = build_two_col_table_info(); + let (batch, base_offset) = + decode_log_batch(&table_info, Some(&[1usize]), Vec::new(), usize::MAX) + .expect("decode empty"); + assert_eq!(batch.num_rows(), 0); + assert_eq!(batch.num_columns(), 1); + assert_eq!(batch.schema().field(0).name(), "name"); + assert_eq!(base_offset, 0); + } + + #[test] + fn decode_log_batch_extracts_base_offset_and_rows() { + let table_info = build_two_col_table_info(); + let raw = build_log_records(&table_info, 17, &[(1, "alice"), (2, "bob"), (3, "carol")]); + + let (batch, base_offset) = + decode_log_batch(&table_info, None, raw, usize::MAX).expect("decode populated"); + assert_eq!(batch.num_rows(), 3); + assert_eq!(batch.num_columns(), 2); + assert_eq!(base_offset, 17); + } + + #[test] + fn decode_log_batch_projection_keeps_requested_columns() { + let table_info = build_two_col_table_info(); + let raw = build_log_records(&table_info, 0, &[(7, "x"), (8, "y")]); + + let (batch, _) = decode_log_batch(&table_info, Some(&[0usize]), raw, usize::MAX) + .expect("decode projected"); + assert_eq!(batch.num_rows(), 2); + assert_eq!(batch.num_columns(), 1); + assert_eq!(batch.schema().field(0).name(), "id"); + } + + #[test] + fn decode_log_batch_truncates_to_last_limit_rows() { + let table_info = build_two_col_table_info(); + // Server returned 4 rows starting at offset 100, but limit is 2. 
+ let raw = build_log_records(&table_info, 100, &[(1, "a"), (2, "b"), (3, "c"), (4, "d")]); + + let (batch, base_offset) = decode_log_batch(&table_info, None, raw, 2).expect("decode"); + assert_eq!(batch.num_rows(), 2); + // The last two rows are kept, so the base offset advances by 2. + assert_eq!(base_offset, 102); + let ids = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(ids.value(0), 3); + assert_eq!(ids.value(1), 4); + } + + // ---- KV path ----------------------------------------------------------- + + fn schema_with_ids(columns: &[(i32, &str, DataType)]) -> Schema { + let cols: Vec = columns + .iter() + .map(|(id, name, dt)| Column::new(*name, dt.clone()).with_id(*id)) + .collect(); + Schema::builder().with_columns(cols).build().unwrap() + } + + /// Encode a value-record batch from `(schema_id, compacted-row-bytes)` + /// pairs, matching the Java `DefaultValueRecordBatch` wire layout. + fn value_batch(records: &[(i16, Vec)]) -> ValueRecordBatch { + let mut body = Vec::new(); + for (schema_id, row) in records { + let rec_len = (SCHEMA_ID_LENGTH + row.len()) as i32; + body.extend_from_slice(&rec_len.to_le_bytes()); + body.extend_from_slice(&schema_id.to_le_bytes()); + body.extend_from_slice(row); + } + let mut out = Vec::new(); + out.extend_from_slice(&((1 + 4 + body.len()) as i32).to_le_bytes()); // Length + out.push(0); // Magic + out.extend_from_slice(&(records.len() as i32).to_le_bytes()); // RecordCount + out.extend_from_slice(&body); + ValueRecordBatch::new(Bytes::from(out)) + } + + fn compacted(field_count: usize, write: impl FnOnce(&mut CompactedRowWriter)) -> Vec { + let mut w = CompactedRowWriter::new(field_count); + write(&mut w); + w.to_bytes().as_ref().to_vec() + } + + fn id_name_schema() -> Schema { + schema_with_ids(&[ + (0, "id", DataTypes::int()), + (1, "name", DataTypes::string()), + ]) + } + + #[test] + fn value_records_empty_returns_empty_batch() { + let schema = id_name_schema(); + let batch = 
value_batch(&[]); + let ranges = batch.value_ranges().unwrap(); + let rb = value_records_to_record_batch( + &batch, + &ranges, + &HashMap::new(), + schema.row_type(), + None, + usize::MAX, + ) + .expect("decode empty kv"); + assert_eq!(rb.num_rows(), 0); + assert_eq!(rb.num_columns(), 2); + } + + #[test] + fn empty_kv_payload_returns_empty_batch() { + let schema = id_name_schema(); + // Full schema. + let rb = empty_record_batch(schema.row_type(), None).expect("empty"); + assert_eq!(rb.num_rows(), 0); + assert_eq!(rb.num_columns(), 2); + // Projected. + let rb = empty_record_batch(schema.row_type(), Some(&[1usize])).expect("empty projected"); + assert_eq!(rb.num_rows(), 0); + assert_eq!(rb.num_columns(), 1); + assert_eq!(rb.schema().field(0).name(), "name"); + } + + #[test] + fn value_records_decode_rows() { + let schema = id_name_schema(); + let decoder = FixedSchemaDecoder::new_no_projection(KvFormat::COMPACTED, &schema).unwrap(); + let mut decoders = HashMap::new(); + decoders.insert(0i16, decoder); + + let r0 = compacted(2, |w| { + w.write_int(1); + w.write_string("alice"); + }); + let r1 = compacted(2, |w| { + w.write_int(2); + w.write_string("bob"); + }); + let batch = value_batch(&[(0, r0), (0, r1)]); + let ranges = batch.value_ranges().unwrap(); + + let rb = value_records_to_record_batch( + &batch, + &ranges, + &decoders, + schema.row_type(), + None, + usize::MAX, + ) + .expect("decode kv rows"); + assert_eq!(rb.num_rows(), 2); + let ids = rb.column(0).as_any().downcast_ref::().unwrap(); + assert_eq!(ids.value(0), 1); + assert_eq!(ids.value(1), 2); + } + + #[test] + fn value_records_limit_keeps_last_rows() { + let schema = id_name_schema(); + let decoder = FixedSchemaDecoder::new_no_projection(KvFormat::COMPACTED, &schema).unwrap(); + let mut decoders = HashMap::new(); + decoders.insert(0i16, decoder); + + let records: Vec<(i16, Vec)> = (1..=5) + .map(|i| { + ( + 0i16, + compacted(2, |w| { + w.write_int(i); + w.write_string("x"); + }), + ) + }) + 
.collect(); + let batch = value_batch(&records); + let ranges = batch.value_ranges().unwrap(); + + let rb = + value_records_to_record_batch(&batch, &ranges, &decoders, schema.row_type(), None, 3) + .expect("decode kv rows"); + assert_eq!(rb.num_rows(), 3); + let ids = rb.column(0).as_any().downcast_ref::().unwrap(); + // Last 3 of [1,2,3,4,5]. + assert_eq!(ids.values(), &[3, 4, 5]); + } + + #[test] + fn value_records_projection_keeps_requested_columns() { + let schema = id_name_schema(); + let decoder = FixedSchemaDecoder::new_no_projection(KvFormat::COMPACTED, &schema).unwrap(); + let mut decoders = HashMap::new(); + decoders.insert(0i16, decoder); + + let r0 = compacted(2, |w| { + w.write_int(9); + w.write_string("nine"); + }); + let batch = value_batch(&[(0, r0)]); + let ranges = batch.value_ranges().unwrap(); + + let rb = value_records_to_record_batch( + &batch, + &ranges, + &decoders, + schema.row_type(), + Some(&[1usize]), + usize::MAX, + ) + .expect("decode projected kv"); + assert_eq!(rb.num_columns(), 1); + assert_eq!(rb.schema().field(0).name(), "name"); + } + + #[test] + fn value_records_decode_across_schema_evolution() { + // Source schema (older): [id, name]. Target (current): added `age`. + let source = id_name_schema(); + let target = schema_with_ids(&[ + (0, "id", DataTypes::int()), + (1, "name", DataTypes::string()), + (2, "age", DataTypes::bigint()), + ]); + + let mut decoders = HashMap::new(); + // Records with schema id 0 were written under the old schema. + decoders.insert( + 0i16, + FixedSchemaDecoder::new(KvFormat::COMPACTED, &source, &target).unwrap(), + ); + // Records with schema id 1 carry the current schema. 
+ decoders.insert( + 1i16, + FixedSchemaDecoder::new_no_projection(KvFormat::COMPACTED, &target).unwrap(), + ); + + let old_row = compacted(2, |w| { + w.write_int(1); + w.write_string("alice"); + }); + let new_row = compacted(3, |w| { + w.write_int(2); + w.write_string("bob"); + w.write_long(30); + }); + let batch = value_batch(&[(0, old_row), (1, new_row)]); + let ranges = batch.value_ranges().unwrap(); + + let rb = value_records_to_record_batch( + &batch, + &ranges, + &decoders, + target.row_type(), + None, + usize::MAX, + ) + .expect("decode mixed-schema kv"); + + assert_eq!(rb.num_rows(), 2); + assert_eq!(rb.num_columns(), 3); + let age = rb.column(2).as_any().downcast_ref::().unwrap(); + // Old record has no `age` column -> null; new record carries 30. + assert!(age.is_null(0), "old-schema record must read age as null"); + assert_eq!(age.value(1), 30); + } +} diff --git a/fluss-rust/crates/fluss/src/client/table/log_fetch_buffer.rs b/fluss-rust/crates/fluss/src/client/table/log_fetch_buffer.rs new file mode 100644 index 0000000000..9d45abad29 --- /dev/null +++ b/fluss-rust/crates/fluss/src/client/table/log_fetch_buffer.rs @@ -0,0 +1,947 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use arrow::array::RecordBatch; +use parking_lot::Mutex; + +use crate::client::table::remote_log::{ + PrefetchPermit, RemoteLogDownloadFuture, RemoteLogFile, RemoteLogSegment, +}; +use crate::error::{ApiError, Error, Result}; +use crate::metadata::TableBucket; +use crate::record::{ + LogRecordBatch, LogRecordIterator, LogRecordsBatches, ReadContext, ScanRecord, +}; +use std::{ + collections::{HashMap, VecDeque}, + sync::{ + Arc, + atomic::{AtomicBool, Ordering}, + }, + time::{Duration, Instant}, +}; +use tokio::sync::Notify; + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub(crate) enum FetchErrorAction { + Ignore, + LogOffsetOutOfRange, + Authorization, + CorruptMessage, + Unexpected, +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub(crate) enum FetchErrorLogLevel { + Debug, + Warn, +} + +#[derive(Clone, Debug)] +pub(crate) struct FetchErrorContext { + pub(crate) action: FetchErrorAction, + pub(crate) log_level: FetchErrorLogLevel, + pub(crate) log_message: String, +} + +/// Represents a completed fetch that can be consumed +pub trait CompletedFetch: Send + Sync { + fn table_bucket(&self) -> &TableBucket; + fn api_error(&self) -> Option<&ApiError>; + fn fetch_error_context(&self) -> Option<&FetchErrorContext>; + fn take_error(&mut self) -> Option; + fn fetch_records(&mut self, max_records: usize) -> Result>; + fn fetch_batches(&mut self, max_batches: usize) -> Result>; + fn is_consumed(&self) -> bool; + fn records_read(&self) -> usize; + fn drain(&mut self); + fn size_in_bytes(&self) -> usize; + fn high_watermark(&self) -> i64; + fn is_initialized(&self) -> bool; + fn set_initialized(&mut self); + fn next_fetch_offset(&self) -> i64; +} + +/// Represents a pending fetch that is waiting to be completed +pub trait PendingFetch: Send + Sync { + fn table_bucket(&self) -> &TableBucket; + fn is_completed(&self) -> bool; + fn to_completed_fetch(self: Box) -> Result>; +} + +/// Thread-safe buffer for completed fetches +pub struct LogFetchBuffer { + read_context: 
ReadContext, + completed_fetches: Mutex>>, + pending_fetches: Mutex>>>, + next_in_line_fetch: Mutex>>, + not_empty_notify: Notify, + woken_up: Arc, +} + +impl LogFetchBuffer { + pub fn new(read_context: ReadContext) -> Self { + Self { + read_context, + completed_fetches: Mutex::new(VecDeque::new()), + pending_fetches: Mutex::new(HashMap::new()), + next_in_line_fetch: Mutex::new(None), + not_empty_notify: Notify::new(), + woken_up: Arc::new(AtomicBool::new(false)), + } + } + + /// Check if the buffer is empty + pub fn is_empty(&self) -> bool { + self.completed_fetches.lock().is_empty() + } + + /// Wait for the buffer to become non-empty, with timeout. + /// Returns true if data became available, false if timeout. + pub async fn await_not_empty(&self, timeout: Duration) -> Result { + let deadline = Instant::now() + timeout; + + loop { + // Check if buffer is not empty + if !self.is_empty() { + return Ok(true); + } + + // Check if woken up + if self.woken_up.swap(false, Ordering::Acquire) { + return Err(Error::WakeupError { + message: "The await operation was interrupted by wakeup.".to_string(), + }); + } + + // Check if timeout + let now = Instant::now(); + if now >= deadline { + return Ok(false); + } + + // Wait for notification with remaining time + let remaining = deadline - now; + let notified = self.not_empty_notify.notified(); + tokio::select! 
{ + _ = tokio::time::sleep(remaining) => { + return Ok(false); // Timeout + } + _ = notified => { + // Got notification, check again + continue; + } + } + } + } + + #[allow(dead_code)] + /// Wake up any waiting threads + pub fn wakeup(&self) { + self.woken_up.store(true, Ordering::Release); + self.not_empty_notify.notify_waiters(); + } + + pub(crate) fn add_api_error( + &self, + table_bucket: TableBucket, + api_error: ApiError, + fetch_error_context: FetchErrorContext, + fetch_offset: i64, + ) { + let error_fetch = DefaultCompletedFetch::from_api_error( + table_bucket, + api_error, + fetch_error_context, + fetch_offset, + self.read_context.clone(), + ); + self.completed_fetches + .lock() + .push_back(Box::new(error_fetch)); + self.not_empty_notify.notify_waiters(); + } + + /// Add a pending fetch to the buffer + pub fn pend(&self, pending_fetch: Box) { + let table_bucket = pending_fetch.table_bucket().clone(); + self.pending_fetches + .lock() + .entry(table_bucket) + .or_default() + .push_back(pending_fetch); + } + + /// Try to complete pending fetches in order, converting them to completed fetches + pub fn try_complete(&self, table_bucket: &TableBucket) { + // Collect completed fetches while holding the pending_fetches lock, + // then push them to completed_fetches after releasing it to avoid + // holding both locks simultaneously. 
+ let mut completed_to_push: Vec> = Vec::new(); + let mut has_completed = false; + let mut pending_error: Option = None; + { + let mut pending_map = self.pending_fetches.lock(); + if let Some(pendings) = pending_map.get_mut(table_bucket) { + while let Some(front) = pendings.front() { + if front.is_completed() { + let pending = pendings.pop_front().unwrap(); + match pending.to_completed_fetch() { + Ok(completed) => { + completed_to_push.push(completed); + has_completed = true; + } + Err(e) => { + pending_error = Some(e); + has_completed = true; + break; + } + } + } else { + break; + } + } + if has_completed && pendings.is_empty() { + pending_map.remove(table_bucket); + } + } + } + + if let Some(error) = pending_error { + let error_fetch = DefaultCompletedFetch::from_error( + table_bucket.clone(), + error, + -1, + self.read_context.clone(), + ); + completed_to_push.push(Box::new(error_fetch)); + } + + if !completed_to_push.is_empty() { + let mut completed_queue = self.completed_fetches.lock(); + for completed in completed_to_push { + completed_queue.push_back(completed); + } + has_completed = true; + } + + if has_completed { + // Signal that buffer is not empty + self.not_empty_notify.notify_waiters(); + } + } + + /// Add a completed fetch to the buffer + pub fn add(&self, completed_fetch: Box) { + let table_bucket = completed_fetch.table_bucket(); + let mut pending_map = self.pending_fetches.lock(); + + if let Some(pendings) = pending_map.get_mut(table_bucket) + && !pendings.is_empty() + { + pendings.push_back(Box::new(CompletedPendingFetch::new(completed_fetch))); + return; + } + // If there's no pending fetch for this table_bucket, + // directly add to completed_fetches + self.completed_fetches.lock().push_back(completed_fetch); + self.not_empty_notify.notify_waiters(); + } + + /// Poll the next completed fetch + pub fn poll(&self) -> Option> { + self.completed_fetches.lock().pop_front() + } + + /// Get the next in line fetch + pub fn next_in_line_fetch(&self) -> 
Option> { + self.next_in_line_fetch.lock().take() + } + + /// Set the next in line fetch + pub fn set_next_in_line_fetch(&self, fetch: Option>) { + *self.next_in_line_fetch.lock() = fetch; + } + + /// Get the set of buckets that have buffered data + pub fn buffered_buckets(&self) -> Vec { + let mut buckets = Vec::new(); + + // Avoid holding multiple locks at once to prevent lock-order inversion. + { + let next_in_line_fetch = self.next_in_line_fetch.lock(); + if let Some(complete_fetch) = next_in_line_fetch.as_ref() { + if !complete_fetch.is_consumed() { + buckets.push(complete_fetch.table_bucket().clone()); + } + } + } + + { + let completed = self.completed_fetches.lock(); + for fetch in completed.iter() { + buckets.push(fetch.table_bucket().clone()); + } + } + + { + let pending = self.pending_fetches.lock(); + buckets.extend(pending.keys().cloned()); + } + buckets + } +} + +/// A wrapper that makes a completed fetch look like a pending fetch +struct CompletedPendingFetch { + completed_fetch: Box, +} + +impl CompletedPendingFetch { + fn new(completed_fetch: Box) -> Self { + Self { completed_fetch } + } +} + +impl PendingFetch for CompletedPendingFetch { + fn table_bucket(&self) -> &TableBucket { + self.completed_fetch.table_bucket() + } + + fn is_completed(&self) -> bool { + true + } + + fn to_completed_fetch(self: Box) -> Result> { + Ok(self.completed_fetch) + } +} + +/// Default implementation of CompletedFetch for in-memory log records +/// Used for local fetches from tablet server +pub struct DefaultCompletedFetch { + table_bucket: TableBucket, + api_error: Option, + fetch_error_context: Option, + error: Option, + log_record_batch: LogRecordsBatches, + read_context: ReadContext, + next_fetch_offset: i64, + high_watermark: i64, + size_in_bytes: usize, + consumed: bool, + initialized: bool, + records_read: usize, + current_record_iterator: Option, + current_record_batch: Option, + last_record: Option, + cached_record_error: Option, + corrupt_last_record: bool, 
+} + +impl DefaultCompletedFetch { + pub fn new( + table_bucket: TableBucket, + log_record_batch: LogRecordsBatches, + size_in_bytes: usize, + read_context: ReadContext, + fetch_offset: i64, + high_watermark: i64, + ) -> Self { + Self { + table_bucket, + api_error: None, + fetch_error_context: None, + error: None, + log_record_batch, + read_context, + next_fetch_offset: fetch_offset, + high_watermark, + size_in_bytes, + consumed: false, + initialized: false, + records_read: 0, + current_record_iterator: None, + current_record_batch: None, + last_record: None, + cached_record_error: None, + corrupt_last_record: false, + } + } + + pub(crate) fn from_error( + table_bucket: TableBucket, + error: Error, + fetch_offset: i64, + read_context: ReadContext, + ) -> Self { + Self { + table_bucket, + api_error: None, + fetch_error_context: None, + error: Some(error), + log_record_batch: LogRecordsBatches::new(Vec::new()), + read_context, + next_fetch_offset: fetch_offset, + high_watermark: -1, + size_in_bytes: 0, + consumed: false, + initialized: false, + records_read: 0, + current_record_iterator: None, + current_record_batch: None, + last_record: None, + cached_record_error: None, + corrupt_last_record: false, + } + } + + pub(crate) fn from_api_error( + table_bucket: TableBucket, + api_error: ApiError, + fetch_error_context: FetchErrorContext, + fetch_offset: i64, + read_context: ReadContext, + ) -> Self { + Self { + table_bucket, + api_error: Some(api_error), + fetch_error_context: Some(fetch_error_context), + error: None, + log_record_batch: LogRecordsBatches::new(Vec::new()), + read_context, + next_fetch_offset: fetch_offset, + high_watermark: -1, + size_in_bytes: 0, + consumed: false, + initialized: false, + records_read: 0, + current_record_iterator: None, + current_record_batch: None, + last_record: None, + cached_record_error: None, + corrupt_last_record: false, + } + } + + /// Get the next fetched record, handling batch iteration and record skipping + fn 
next_fetched_record(&mut self) -> Result> { + loop { + if let Some(record) = self + .current_record_iterator + .as_mut() + .and_then(Iterator::next) + { + if record.offset() >= self.next_fetch_offset { + return Ok(Some(record)); + } + } else if let Some(batch_result) = self.log_record_batch.next() { + let batch = batch_result?; + self.current_record_iterator = Some(batch.records(&self.read_context)?); + self.current_record_batch = Some(batch); + } else { + if let Some(batch) = self.current_record_batch.take() { + self.next_fetch_offset = batch.next_log_offset(); + } + self.drain(); + return Ok(None); + } + } + } + + fn fetch_error(&self) -> Error { + let mut message = format!( + "Received exception when fetching the next record from {table_bucket}. If needed, please back to past the record to continue scanning.", + table_bucket = self.table_bucket + ); + if let Some(cause) = self.cached_record_error.as_deref() { + message.push_str(&format!(" Cause: {cause}")); + } + Error::UnexpectedError { + message, + source: None, + } + } + /// Get the next batch with its base offset. + /// Returns (RecordBatch, base_offset) where base_offset is the offset of the first record. 
+ fn next_fetched_batch(&mut self) -> Result> { + loop { + let Some(log_batch_result) = self.log_record_batch.next() else { + self.drain(); + return Ok(None); + }; + + let log_batch = log_batch_result?; + let mut record_batch = log_batch.record_batch(&self.read_context)?; + + // Skip empty batches + if record_batch.num_rows() == 0 { + continue; + } + + // Calculate the effective base offset for this batch + let log_base_offset = log_batch.base_log_offset(); + let effective_base_offset = if self.next_fetch_offset > log_base_offset { + let skip_count = (self.next_fetch_offset - log_base_offset) as usize; + if skip_count >= record_batch.num_rows() { + continue; + } + // Slice the batch to skip the first skip_count rows + record_batch = record_batch.slice(skip_count, record_batch.num_rows() - skip_count); + self.next_fetch_offset + } else { + log_base_offset + }; + + self.next_fetch_offset = log_batch.next_log_offset(); + self.records_read += record_batch.num_rows(); + return Ok(Some((record_batch, effective_base_offset))); + } + } +} + +impl CompletedFetch for DefaultCompletedFetch { + fn table_bucket(&self) -> &TableBucket { + &self.table_bucket + } + + fn api_error(&self) -> Option<&ApiError> { + self.api_error.as_ref() + } + + fn fetch_error_context(&self) -> Option<&FetchErrorContext> { + self.fetch_error_context.as_ref() + } + + fn take_error(&mut self) -> Option { + self.error.take() + } + + fn fetch_records(&mut self, max_records: usize) -> Result> { + if let Some(error) = self.error.take() { + return Err(error); + } + + if let Some(api_error) = self.api_error.as_ref() { + return Err(Error::FlussAPIError { + api_error: ApiError { + code: api_error.code, + message: api_error.message.clone(), + }, + }); + } + + if self.corrupt_last_record { + return Err(self.fetch_error()); + } + + if self.consumed { + return Ok(Vec::new()); + } + + let mut scan_records = Vec::new(); + + for _ in 0..max_records { + if self.cached_record_error.is_none() { + 
self.corrupt_last_record = true; + match self.next_fetched_record() { + Ok(Some(record)) => { + self.corrupt_last_record = false; + self.last_record = Some(record); + } + Ok(None) => { + self.corrupt_last_record = false; + self.last_record = None; + } + Err(e) => { + self.cached_record_error = Some(e.to_string()); + } + } + } + + let Some(record) = self.last_record.take() else { + break; + }; + + self.next_fetch_offset = record.offset() + 1; + self.records_read += 1; + scan_records.push(record); + } + + if self.cached_record_error.is_some() && scan_records.is_empty() { + return Err(self.fetch_error()); + } + + Ok(scan_records) + } + + fn fetch_batches(&mut self, max_batches: usize) -> Result> { + if let Some(error) = self.error.take() { + return Err(error); + } + + if let Some(api_error) = self.api_error.as_ref() { + return Err(Error::FlussAPIError { + api_error: ApiError { + code: api_error.code, + message: api_error.message.clone(), + }, + }); + } + + if self.consumed { + return Ok(Vec::new()); + } + + let mut batches = Vec::with_capacity(max_batches.min(16)); + + for _ in 0..max_batches { + match self.next_fetched_batch()? 
{ + Some(batch_with_offset) => batches.push(batch_with_offset), + None => break, + } + } + + Ok(batches) + } + + fn is_consumed(&self) -> bool { + self.consumed + } + + fn records_read(&self) -> usize { + self.records_read + } + + fn drain(&mut self) { + self.consumed = true; + self.api_error = None; + self.fetch_error_context = None; + self.error = None; + self.cached_record_error = None; + self.corrupt_last_record = false; + self.last_record = None; + } + + fn size_in_bytes(&self) -> usize { + self.size_in_bytes + } + + fn high_watermark(&self) -> i64 { + self.high_watermark + } + + fn is_initialized(&self) -> bool { + self.initialized + } + + fn set_initialized(&mut self) { + self.initialized = true; + } + + fn next_fetch_offset(&self) -> i64 { + self.next_fetch_offset + } +} + +/// Completed fetch for remote log segments +/// Matches Java's RemoteCompletedFetch design - separate class for remote vs local +/// Holds RAII permit until consumed (data is in inner) +pub struct RemoteCompletedFetch { + inner: DefaultCompletedFetch, + permit: Option, +} + +impl RemoteCompletedFetch { + pub fn new(inner: DefaultCompletedFetch, permit: PrefetchPermit) -> Self { + Self { + inner, + permit: Some(permit), + } + } +} + +impl CompletedFetch for RemoteCompletedFetch { + fn table_bucket(&self) -> &TableBucket { + self.inner.table_bucket() + } + + fn api_error(&self) -> Option<&ApiError> { + self.inner.api_error() + } + + fn fetch_error_context(&self) -> Option<&FetchErrorContext> { + self.inner.fetch_error_context() + } + + fn take_error(&mut self) -> Option { + self.inner.take_error() + } + + fn fetch_records(&mut self, max_records: usize) -> Result> { + self.inner.fetch_records(max_records) + } + + fn fetch_batches(&mut self, max_batches: usize) -> Result> { + self.inner.fetch_batches(max_batches) + } + + fn is_consumed(&self) -> bool { + self.inner.is_consumed() + } + + fn records_read(&self) -> usize { + self.inner.records_read() + } + + fn drain(&mut self) { + 
self.inner.drain(); + // Release permit immediately (don't wait for struct drop) + // Critical: allows prefetch to continue even if Box kept around + self.permit.take(); // drops permit here, triggers recycle notification + } + + fn size_in_bytes(&self) -> usize { + self.inner.size_in_bytes() + } + + fn high_watermark(&self) -> i64 { + self.inner.high_watermark() + } + + fn is_initialized(&self) -> bool { + self.inner.is_initialized() + } + + fn set_initialized(&mut self) { + self.inner.set_initialized() + } + + fn next_fetch_offset(&self) -> i64 { + self.inner.next_fetch_offset() + } +} +// Permit released explicitly in drain() or automatically when struct drops + +/// Pending fetch that waits for remote log file to be downloaded +pub struct RemotePendingFetch { + segment: RemoteLogSegment, + download_future: RemoteLogDownloadFuture, + pos_in_log_segment: i32, + fetch_offset: i64, + high_watermark: i64, + read_context: ReadContext, +} + +impl RemotePendingFetch { + pub fn new( + segment: RemoteLogSegment, + download_future: RemoteLogDownloadFuture, + pos_in_log_segment: i32, + fetch_offset: i64, + high_watermark: i64, + read_context: ReadContext, + ) -> Self { + Self { + segment, + download_future, + pos_in_log_segment, + fetch_offset, + high_watermark, + read_context, + } + } +} + +impl PendingFetch for RemotePendingFetch { + fn table_bucket(&self) -> &TableBucket { + &self.segment.table_bucket + } + + fn is_completed(&self) -> bool { + self.download_future.is_done() + } + + fn to_completed_fetch(self: Box) -> Result> { + // Take the RemoteLogFile and destructure + let remote_log_file = self.download_future.take_remote_log_file()?; + let RemoteLogFile { + file_path, + file_size: _, + permit, + } = remote_log_file; + + // Open file for streaming (no memory allocation for entire file) + let file = std::fs::File::open(&file_path)?; + let file_size = file.metadata()?.len() as usize; + + // Create file-backed LogRecordsBatches with cleanup (streaming!) 
+ // Data will be read batch-by-batch on-demand, not all at once + // FileSource will delete the file when dropped (after file is closed) + let log_record_batch = + LogRecordsBatches::from_file(file, self.pos_in_log_segment as usize, file_path)?; + + // Calculate size based on position offset + let size_in_bytes = if self.pos_in_log_segment > 0 { + let pos = self.pos_in_log_segment as usize; + if pos >= file_size { + return Err(Error::UnexpectedError { + message: format!("Position {pos} exceeds file size {file_size}"), + source: None, + }); + } + file_size - pos + } else { + file_size + }; + + // Create DefaultCompletedFetch + let inner_fetch = DefaultCompletedFetch::new( + self.segment.table_bucket.clone(), + log_record_batch, + size_in_bytes, + self.read_context, + self.fetch_offset, + self.high_watermark, + ); + + // Wrap it with RemoteCompletedFetch to hold the permit + // Permit manages the prefetch slot (releases semaphore and notifies coordinator) when dropped; + // file deletion is handled by FileCleanupGuard in the file-backed source created via from_file + Ok(Box::new(RemoteCompletedFetch::new(inner_fetch, permit))) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::client::WriteRecord; + use crate::compression::{ + ArrowCompressionInfo, ArrowCompressionRatioEstimator, ArrowCompressionType, + DEFAULT_NON_ZSTD_COMPRESSION_LEVEL, + }; + use crate::metadata::{DataField, DataTypes, PhysicalTablePath, RowType, TablePath}; + use crate::record::{MemoryLogRecordsArrowBuilder, ReadContext, to_arrow_schema}; + use crate::row::GenericRow; + use crate::test_utils::build_table_info; + use std::sync::Arc; + + fn test_read_context() -> Result { + let row_type = RowType::new(vec![DataField::new("id", DataTypes::int(), None)]); + Ok(ReadContext::new( + to_arrow_schema(&row_type)?, + Arc::new(row_type), + false, + )) + } + + struct ErrorPendingFetch { + table_bucket: TableBucket, + } + + impl PendingFetch for ErrorPendingFetch { + fn table_bucket(&self) -> 
&TableBucket { + &self.table_bucket + } + + fn is_completed(&self) -> bool { + true + } + + fn to_completed_fetch(self: Box) -> Result> { + Err(Error::UnexpectedError { + message: "pending fetch failure".to_string(), + source: None, + }) + } + } + + #[tokio::test] + async fn await_not_empty_returns_wakeup_error() { + let buffer = LogFetchBuffer::new(test_read_context().unwrap()); + buffer.wakeup(); + + let result = buffer.await_not_empty(Duration::from_millis(10)).await; + assert!(matches!(result, Err(Error::WakeupError { .. }))); + } + + #[tokio::test] + async fn await_not_empty_returns_pending_error() { + let buffer = LogFetchBuffer::new(test_read_context().unwrap()); + let table_bucket = TableBucket::new(1, 0); + buffer.pend(Box::new(ErrorPendingFetch { + table_bucket: table_bucket.clone(), + })); + buffer.try_complete(&table_bucket); + + let result = buffer.await_not_empty(Duration::from_millis(10)).await; + assert!(matches!(result, Ok(true))); + + let mut completed = buffer.poll().expect("completed fetch"); + assert!(completed.take_error().is_some()); + } + + #[test] + fn default_completed_fetch_reads_records() -> Result<()> { + let row_type = RowType::new(vec![ + DataField::new("id", DataTypes::int(), None), + DataField::new("name", DataTypes::string(), None), + ]); + let table_path = TablePath::new("db".to_string(), "tbl".to_string()); + let table_info = Arc::new(build_table_info(table_path.clone(), 1, 1)); + let physical_table_path = Arc::new(PhysicalTablePath::of(Arc::new(table_path))); + + let mut builder = MemoryLogRecordsArrowBuilder::new( + 1, + &row_type, + false, + ArrowCompressionInfo { + compression_type: ArrowCompressionType::None, + compression_level: DEFAULT_NON_ZSTD_COMPRESSION_LEVEL, + }, + usize::MAX, + Arc::new(ArrowCompressionRatioEstimator::default()), + )?; + + let mut row = GenericRow::new(2); + row.set_field(0, 1_i32); + row.set_field(1, "alice"); + let record = WriteRecord::for_append(table_info, physical_table_path, 1, &row); + 
builder.append(&record)?; + + let data = builder.build()?; + let log_records = LogRecordsBatches::new(data.clone()); + let read_context = ReadContext::new(to_arrow_schema(&row_type)?, Arc::new(row_type), false); + let mut fetch = DefaultCompletedFetch::new( + TableBucket::new(1, 0), + log_records, + data.len(), + read_context, + 0, + 0, + ); + + let records = fetch.fetch_records(10)?; + assert_eq!(records.len(), 1); + assert_eq!(records[0].offset(), 0); + + let empty = fetch.fetch_records(10)?; + assert!(empty.is_empty()); + + Ok(()) + } +} diff --git a/fluss-rust/crates/fluss/src/client/table/lookup.rs b/fluss-rust/crates/fluss/src/client/table/lookup.rs new file mode 100644 index 0000000000..51a0a0714d --- /dev/null +++ b/fluss-rust/crates/fluss/src/client/table/lookup.rs @@ -0,0 +1,774 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::bucketing::BucketingFunction; +use crate::client::ClientSchemaGetter; +use crate::client::lookup::LookupClient; +use crate::client::metadata::Metadata; +use crate::client::table::partition_getter::PartitionGetter; +use crate::error::{Error, Result}; +use crate::metadata::{ + KvFormat, PhysicalTablePath, RowType, Schema, TableBucket, TableInfo, TablePath, +}; +use crate::record::RowAppendRecordBatchBuilder; +use crate::record::kv::SCHEMA_ID_LENGTH; +use crate::row::encode::{KeyEncoder, KeyEncoderFactory}; +use crate::row::{FixedSchemaDecoder, InternalRow, LookupRow}; +use arrow::array::RecordBatch; +use byteorder::{ByteOrder, LittleEndian}; +use futures::future::try_join_all; +use parking_lot::RwLock; +use std::collections::HashMap; +use std::sync::Arc; + +/// Per-Lookuper decoder cache. The target-schema decoder is held +/// directly so the dominant decode path is a single field access; older +/// schemas are populated lazily on first observation. +struct DecoderCache { + target_id: i16, + target_decoder: Arc, + others: RwLock>>, +} + +impl DecoderCache { + fn new(target_id: i16, target_decoder: Arc) -> Self { + Self { + target_id, + target_decoder, + others: RwLock::new(HashMap::new()), + } + } + + fn decode<'a>(&self, schema_id: i16, bytes: &'a [u8]) -> Result> { + if schema_id == self.target_id { + return self.target_decoder.decode(bytes); + } + let decoder = + self.others + .read() + .get(&schema_id) + .cloned() + .ok_or_else(|| Error::RowConvertError { + message: format!("No decoder available for schema id {schema_id}"), + })?; + decoder.decode(bytes) + } + + fn contains(&self, schema_id: i16) -> bool { + schema_id == self.target_id || self.others.read().contains_key(&schema_id) + } + + fn insert(&self, schema_id: i16, decoder: Arc) { + self.others.write().insert(schema_id, decoder); + } + + #[cfg(test)] + fn get(&self, schema_id: i16) -> Option> { + if schema_id == self.target_id { + return Some(Arc::clone(&self.target_decoder)); + } + 
self.others.read().get(&schema_id).cloned() + } +} + +/// Rows returned from a lookup. Primary-key lookups produce at most one +/// row; prefix-key lookups may produce many. Rows written under older +/// schemas are decoded with their original schema and projected to the +/// schema captured when the `Lookuper` was created — schema evolutions +/// that land after that point are not picked up by an existing +/// `Lookuper`; create a new one to see them. +pub struct LookupResult { + rows: Vec>, + target_row_type: Arc, + decoders: Arc, +} + +impl LookupResult { + fn new(rows: Vec>, target_row_type: Arc, decoders: Arc) -> Self { + Self { + rows, + target_row_type, + decoders, + } + } + + fn read_schema_id(bytes: &[u8]) -> Result { + if bytes.len() < SCHEMA_ID_LENGTH { + return Err(Error::RowConvertError { + message: format!( + "Row payload too short: {} bytes, need at least {} for schema id", + bytes.len(), + SCHEMA_ID_LENGTH + ), + }); + } + let schema_id = LittleEndian::read_i16(&bytes[..SCHEMA_ID_LENGTH]); + if schema_id < 0 { + return Err(Error::RowConvertError { + message: format!("Invalid negative schema id {schema_id}; row prefix is corrupt"), + }); + } + Ok(schema_id) + } + + fn decode<'a>(&self, bytes: &'a [u8]) -> Result> { + let schema_id = Self::read_schema_id(bytes)?; + self.decoders.decode(schema_id, bytes) + } + + /// Returns the single row when exactly one is present, `None` for + /// empty, or an error if the result holds more than one row. 
+ pub fn get_single_row(&self) -> Result>> { + match self.rows.len() { + 0 => Ok(None), + 1 => Ok(Some(self.decode(&self.rows[0])?)), + _ => Err(Error::UnexpectedError { + message: "LookupResult contains multiple rows, use get_rows() instead".to_string(), + source: None, + }), + } + } + + pub fn get_rows(&self) -> Result>> { + self.rows.iter().map(|bytes| self.decode(bytes)).collect() + } + + pub fn to_record_batch(&self) -> Result { + let mut builder = RowAppendRecordBatchBuilder::new(&self.target_row_type)?; + for bytes in &self.rows { + let row = self.decode(bytes)?; + builder.append(&row)?; + } + builder.build_arrow_record_batch().map(Arc::unwrap_or_clone) + } +} + +struct LookupSchemaCtx { + target_schema: Arc, + target_row_type: Arc, + kv_format: KvFormat, + schema_getter: Arc, + decoders: Arc, +} + +impl LookupSchemaCtx { + fn new(table_info: &TableInfo, schema_getter: Arc) -> Result { + let target_schema_i32 = table_info.get_schema_id(); + if !(0..=i16::MAX as i32).contains(&target_schema_i32) { + return Err(Error::UnexpectedError { + message: format!( + "Schema id {target_schema_i32} does not fit in 16 bits — wire format violated" + ), + source: None, + }); + } + let target_schema = Arc::new(table_info.get_schema().clone()); + let target_row_type = Arc::new(table_info.row_type().clone()); + let kv_format = table_info.get_table_config().get_kv_format()?; + let target_decoder = Arc::new(FixedSchemaDecoder::new_no_projection( + kv_format, + target_schema.as_ref(), + )?); + let decoders = Arc::new(DecoderCache::new(target_schema_i32 as i16, target_decoder)); + Ok(Self { + target_schema, + target_row_type, + kv_format, + schema_getter, + decoders, + }) + } + + async fn ensure_decoders(&self, rows: &[Vec]) -> Result<()> { + let mut missing: Vec = Vec::new(); + for bytes in rows { + let schema_id = LookupResult::read_schema_id(bytes)?; + if !self.decoders.contains(schema_id) && !missing.contains(&schema_id) { + missing.push(schema_id); + } + } + if 
missing.is_empty() { + return Ok(()); + } + + let fetches = missing.into_iter().map(|schema_id| { + let cache = Arc::clone(&self.decoders); + let schema_getter = Arc::clone(&self.schema_getter); + let target_schema = Arc::clone(&self.target_schema); + let kv_format = self.kv_format; + async move { + let source = schema_getter.get_schema(schema_id as i32).await?; + let decoder = + FixedSchemaDecoder::new(kv_format, source.as_ref(), target_schema.as_ref())?; + cache.insert(schema_id, Arc::new(decoder)); + Ok::<_, Error>(()) + } + }); + try_join_all(fetches).await?; + Ok(()) + } + + async fn build_result(&self, rows: Vec>) -> Result { + if !rows.is_empty() { + self.ensure_decoders(&rows).await?; + } + Ok(LookupResult::new( + rows, + Arc::clone(&self.target_row_type), + Arc::clone(&self.decoders), + )) + } + + fn empty_result(&self) -> LookupResult { + LookupResult::new( + Vec::new(), + Arc::clone(&self.target_row_type), + Arc::clone(&self.decoders), + ) + } +} + +/// Builder for lookup operations. `create_lookuper()` builds a primary-key +/// `Lookuper`; `lookup_by(columns).create_lookuper()` builds a +/// `PrefixKeyLookuper` for prefix scans. +// TODO: Add create_typed_lookuper() for typed lookups with POJO mapping +pub struct TableLookup { + lookup_client: Arc, + table_info: TableInfo, + metadata: Arc, + schema_getter: Arc, +} + +impl TableLookup { + pub(super) fn new( + lookup_client: Arc, + table_info: TableInfo, + metadata: Arc, + schema_getter: Arc, + ) -> Self { + Self { + lookup_client, + table_info, + metadata, + schema_getter, + } + } + + /// Switches the builder into prefix-scan mode. `lookup_column_names` + /// must list the table's partition keys (if any) plus the bucket keys, + /// in that order — i.e. this is a **bucket-key prefix** scan, not an + /// arbitrary primary-key prefix. Validation is deferred to + /// `create_lookuper()`. 
+ pub fn lookup_by(self, lookup_column_names: Vec) -> TablePrefixLookup { + TablePrefixLookup { + lookup_client: self.lookup_client, + table_info: self.table_info, + metadata: self.metadata, + schema_getter: self.schema_getter, + lookup_column_names, + } + } + + /// Creates a `Lookuper` for performing key-based lookups. + /// + /// The lookuper will automatically encode the key and compute the bucket + /// for each lookup using the appropriate bucketing function. + /// + /// The lookuper uses a shared `LookupClient` that batches multiple lookup + /// operations together to reduce network round trips. This achieves parity + /// with the Java client implementation for improved throughput. + pub fn create_lookuper(self) -> Result { + let num_buckets = self.table_info.get_num_buckets(); + + // Get data lake format from table config for bucketing function + let data_lake_format = self.table_info.get_table_config().get_datalake_format()?; + let bucketing_function = ::of(data_lake_format.as_ref()); + + let row_type = self.table_info.row_type(); + let primary_keys = self.table_info.get_primary_keys(); + let lookup_row_type = row_type.project_with_field_names(primary_keys)?; + + let physical_primary_keys = self.table_info.get_physical_primary_keys().to_vec(); + let primary_key_encoder = + KeyEncoderFactory::of(&lookup_row_type, &physical_primary_keys, &data_lake_format)?; + + let bucket_key_encoder = if self.table_info.is_default_bucket_key() { + None + } else { + let bucket_keys = self.table_info.get_bucket_keys().to_vec(); + Some(KeyEncoderFactory::of( + &lookup_row_type, + &bucket_keys, + &data_lake_format, + )?) + }; + + let partition_getter = if self.table_info.is_partitioned() { + Some(PartitionGetter::new( + &lookup_row_type, + Arc::clone(self.table_info.get_partition_keys()), + )?) 
+ } else { + None + }; + + let schema_ctx = LookupSchemaCtx::new(&self.table_info, self.schema_getter)?; + + Ok(Lookuper { + table_path: Arc::new(self.table_info.table_path.clone()), + table_info: self.table_info, + metadata: self.metadata, + lookup_client: self.lookup_client, + bucketing_function, + primary_key_encoder, + bucket_key_encoder, + partition_getter, + num_buckets, + schema_ctx, + }) + } +} + +/// Performs key-based lookups against a primary key table. +/// +/// The `Lookuper` automatically encodes the lookup key, computes the target +/// bucket, and retrieves the value using the batched `LookupClient`. +/// +/// # Example +/// ```ignore +/// let mut lookuper = table.new_lookup()?.create_lookuper()?; +/// let row = GenericRow::new(vec![Datum::Int32(42)]); // lookup key +/// let result = lookuper.lookup(&row).await?; +/// ``` +pub struct Lookuper { + table_path: Arc, + table_info: TableInfo, + metadata: Arc, + lookup_client: Arc, + bucketing_function: Box, + primary_key_encoder: Box, + bucket_key_encoder: Option>, + partition_getter: Option, + num_buckets: i32, + schema_ctx: LookupSchemaCtx, +} + +impl Lookuper { + /// Looks up a value by its primary key. + /// + /// The key is encoded and the bucket is automatically computed using + /// the table's bucketing function. The lookup is queued and batched + /// with other lookups for improved throughput.
+ /// + /// # Arguments + /// * `row` - The row containing the primary key field values + /// + /// # Returns + /// * `Ok(LookupResult)` - The lookup result (may be empty if key not found) + /// * `Err(Error)` - If the lookup fails + pub async fn lookup(&mut self, row: &dyn InternalRow) -> Result { + let pk_bytes = self.primary_key_encoder.encode_key(row)?; + let bk_bytes = match &mut self.bucket_key_encoder { + Some(encoder) => encoder.encode_key(row)?, + None => pk_bytes.clone(), + }; + + let partition_id = if let Some(ref partition_getter) = self.partition_getter { + let partition_name = partition_getter.get_partition(row)?; + let physical_table_path = PhysicalTablePath::of_partitioned( + Arc::clone(&self.table_path), + Some(partition_name), + ); + match self + .metadata + .check_and_update_partition_metadata(&physical_table_path) + .await? + { + Some(id) => Some(id), + None => return Ok(self.schema_ctx.empty_result()), + } + } else { + None + }; + + let bucket_id = self + .bucketing_function + .bucketing(&bk_bytes, self.num_buckets)?; + + let table_id = self.table_info.get_table_id(); + let table_bucket = TableBucket::new_with_partition(table_id, partition_id, bucket_id); + + // Use the batched lookup client + let result = self + .lookup_client + .lookup(self.table_path.as_ref().clone(), table_bucket, pk_bytes) + .await?; + + let rows = match result { + Some(value_bytes) => vec![value_bytes], + None => Vec::new(), + }; + self.schema_ctx.build_result(rows).await + } + + /// Returns a reference to the table info. 
+ pub fn table_info(&self) -> &TableInfo { + &self.table_info + } +} + +pub struct TablePrefixLookup { + lookup_client: Arc, + table_info: TableInfo, + metadata: Arc, + schema_getter: Arc, + lookup_column_names: Vec, +} + +impl TablePrefixLookup { + pub fn create_lookuper(self) -> Result { + validate_prefix_lookup(&self.table_info, &self.lookup_column_names)?; + + let num_buckets = self.table_info.get_num_buckets(); + let data_lake_format = self.table_info.get_table_config().get_datalake_format()?; + let bucketing_function = ::of(data_lake_format.as_ref()); + + let row_type = self.table_info.row_type(); + let lookup_row_type = row_type.project_with_field_names(&self.lookup_column_names)?; + + let bucket_keys = self.table_info.get_bucket_keys().to_vec(); + let prefix_key_encoder = + KeyEncoderFactory::of(&lookup_row_type, &bucket_keys, &data_lake_format)?; + + let partition_getter = if self.table_info.is_partitioned() { + Some(PartitionGetter::new( + &lookup_row_type, + Arc::clone(self.table_info.get_partition_keys()), + )?) 
+ } else { + None + }; + + let schema_ctx = LookupSchemaCtx::new(&self.table_info, self.schema_getter)?; + + Ok(PrefixKeyLookuper { + table_path: Arc::new(self.table_info.table_path.clone()), + table_info: self.table_info, + metadata: self.metadata, + lookup_client: self.lookup_client, + bucketing_function, + prefix_key_encoder, + partition_getter, + num_buckets, + schema_ctx, + }) + } +} + +fn validate_prefix_lookup(table_info: &TableInfo, lookup_columns: &[String]) -> Result<()> { + if !table_info.has_primary_key() { + return Err(Error::IllegalArgument { + message: format!( + "Log table {} doesn't support prefix lookup", + table_info.get_table_path() + ), + }); + } + + let physical_primary_keys = table_info.get_physical_primary_keys(); + let bucket_keys = table_info.get_bucket_keys(); + + if bucket_keys.is_empty() { + return Err(Error::IllegalArgument { + message: format!( + "Can not perform prefix lookup on table '{}', because it has no bucket keys.", + table_info.get_table_path() + ), + }); + } + + if !physical_primary_keys.starts_with(bucket_keys) { + return Err(Error::IllegalArgument { + message: format!( + "Can not perform prefix lookup on table '{}', because the bucket keys {:?} \ + is not a prefix subset of the physical primary keys {:?} \ + (excluded partition fields if present).", + table_info.get_table_path(), + bucket_keys, + physical_primary_keys, + ), + }); + } + + let partition_keys: &[String] = table_info.get_partition_keys(); + if table_info.is_partitioned() { + for pk in partition_keys { + if !lookup_columns.iter().any(|c| c == pk) { + return Err(Error::IllegalArgument { + message: format!( + "Can not perform prefix lookup on table '{}', because the lookup columns \ + {:?} must contain all partition fields {:?}.", + table_info.get_table_path(), + lookup_columns, + partition_keys, + ), + }); + } + } + } + + let physical_lookup_columns: Vec<&String> = lookup_columns + .iter() + .filter(|c| !partition_keys.iter().any(|p| p == *c)) + .collect(); + if 
physical_lookup_columns.len() != bucket_keys.len() + || !physical_lookup_columns + .iter() + .zip(bucket_keys.iter()) + .all(|(a, b)| *a == b) + { + return Err(Error::IllegalArgument { + message: format!( + "Can not perform prefix lookup on table '{}', because the lookup columns {:?} \ + must contain all bucket keys {:?} in order.", + table_info.get_table_path(), + lookup_columns, + bucket_keys, + ), + }); + } + + if bucket_keys == physical_primary_keys { + return Err(Error::IllegalArgument { + message: format!( + "Can not perform prefix lookup on table '{}', because the lookup columns {:?} \ + equals the physical primary keys {:?}. \ + Please use primary key lookup (Lookuper without lookup_by) instead.", + table_info.get_table_path(), + lookup_columns, + physical_primary_keys, + ), + }); + } + + Ok(()) +} + +pub struct PrefixKeyLookuper { + table_path: Arc, + table_info: TableInfo, + metadata: Arc, + lookup_client: Arc, + bucketing_function: Box, + prefix_key_encoder: Box, + partition_getter: Option, + num_buckets: i32, + schema_ctx: LookupSchemaCtx, +} + +impl PrefixKeyLookuper { + pub async fn lookup(&mut self, row: &dyn InternalRow) -> Result { + let prefix_bytes = self.prefix_key_encoder.encode_key(row)?; + + let partition_id = if let Some(ref partition_getter) = self.partition_getter { + let partition_name = partition_getter.get_partition(row)?; + let physical_table_path = PhysicalTablePath::of_partitioned( + Arc::clone(&self.table_path), + Some(partition_name), + ); + match self + .metadata + .check_and_update_partition_metadata(&physical_table_path) + .await? 
+ { + Some(id) => Some(id), + None => return Ok(self.schema_ctx.empty_result()), + } + } else { + None + }; + + let bucket_id = self + .bucketing_function + .bucketing(&prefix_bytes, self.num_buckets)?; + + let table_id = self.table_info.get_table_id(); + let table_bucket = TableBucket::new_with_partition(table_id, partition_id, bucket_id); + + let rows = self + .lookup_client + .prefix_lookup(self.table_path.as_ref().clone(), table_bucket, prefix_bytes) + .await?; + + self.schema_ctx.build_result(rows).await + } + + pub fn table_info(&self) -> &TableInfo { + &self.table_info + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::metadata::{Column, DataTypes, Schema}; + use crate::row::binary::BinaryWriter; + use crate::row::compacted::CompactedRowWriter; + use arrow::array::Int32Array; + + fn make_row_bytes(schema_id: i16, row_data: &[u8]) -> Vec { + let mut bytes = Vec::with_capacity(SCHEMA_ID_LENGTH + row_data.len()); + bytes.extend_from_slice(&schema_id.to_le_bytes()); + bytes.extend_from_slice(row_data); + bytes + } + + fn schema_with_ids(columns: &[(i32, &str, crate::metadata::DataType)]) -> Schema { + let cols: Vec = columns + .iter() + .map(|(id, name, dt)| Column::new(*name, dt.clone()).with_id(*id)) + .collect(); + Schema::builder().with_columns(cols).build().unwrap() + } + + fn cache_with( + target_id: i16, + target_decoder: FixedSchemaDecoder, + others: Vec<(i16, FixedSchemaDecoder)>, + ) -> Arc { + let cache = DecoderCache::new(target_id, Arc::new(target_decoder)); + for (id, decoder) in others { + cache.insert(id, Arc::new(decoder)); + } + Arc::new(cache) + } + + fn lookup_result_from( + rows: Vec>, + target_schema: &Schema, + decoders: Arc, + ) -> LookupResult { + LookupResult::new(rows, Arc::new(target_schema.row_type().clone()), decoders) + } + + #[test] + fn test_to_record_batch_empty() { + let target = schema_with_ids(&[(0, "id", DataTypes::int())]); + let decoder = FixedSchemaDecoder::new_no_projection(KvFormat::COMPACTED, 
&target).unwrap(); + let result = lookup_result_from(Vec::new(), &target, cache_with(0, decoder, vec![])); + let batch = result.to_record_batch().unwrap(); + assert_eq!(batch.num_rows(), 0); + assert_eq!(batch.num_columns(), 1); + } + + #[test] + fn test_to_record_batch_with_row_at_target_schema() { + let target = schema_with_ids(&[(0, "id", DataTypes::int())]); + + let mut writer = CompactedRowWriter::new(1); + writer.write_int(42); + let row_bytes = make_row_bytes(0, writer.buffer()); + + let decoder = FixedSchemaDecoder::new_no_projection(KvFormat::COMPACTED, &target).unwrap(); + let result = lookup_result_from(vec![row_bytes], &target, cache_with(0, decoder, vec![])); + + let batch = result.to_record_batch().unwrap(); + assert_eq!(batch.num_rows(), 1); + let col = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(col.value(0), 42); + } + + #[test] + fn test_get_rows_decodes_per_row_schema_id_with_projection() { + let source = schema_with_ids(&[(0, "a", DataTypes::int())]); + let target = schema_with_ids(&[(0, "a", DataTypes::int()), (1, "b", DataTypes::string())]); + + let mut w = CompactedRowWriter::new(1); + w.write_int(7); + let old_row = make_row_bytes(3, w.buffer()); + + let mut w = CompactedRowWriter::new(2); + w.write_int(8); + w.write_string("eight"); + let new_row = make_row_bytes(7, w.buffer()); + + let target_decoder = + FixedSchemaDecoder::new_no_projection(KvFormat::COMPACTED, &target).unwrap(); + let projection_decoder = + FixedSchemaDecoder::new(KvFormat::COMPACTED, &source, &target).unwrap(); + let cache = cache_with(7, target_decoder, vec![(3, projection_decoder)]); + let result = lookup_result_from(vec![old_row, new_row], &target, cache); + + let rows = result.get_rows().unwrap(); + assert_eq!(rows.len(), 2); + assert_eq!(rows[0].get_int(0).unwrap(), 7); + assert!(rows[0].is_null_at(1).unwrap()); + assert_eq!(rows[1].get_int(0).unwrap(), 8); + assert_eq!(rows[1].get_string(1).unwrap(), "eight"); + } + + #[test] + fn 
test_to_record_batch_payload_too_short() { + let target = schema_with_ids(&[(0, "id", DataTypes::int())]); + let decoder = FixedSchemaDecoder::new_no_projection(KvFormat::COMPACTED, &target).unwrap(); + let result = lookup_result_from(vec![vec![0u8]], &target, cache_with(0, decoder, vec![])); + assert!(result.to_record_batch().is_err()); + } + + #[test] + fn test_get_rows_errors_when_no_decoder_for_schema_id() { + let target = schema_with_ids(&[(0, "id", DataTypes::int())]); + let decoder = FixedSchemaDecoder::new_no_projection(KvFormat::COMPACTED, &target).unwrap(); + let mut w = CompactedRowWriter::new(1); + w.write_int(1); + let row = make_row_bytes(99, w.buffer()); + let result = lookup_result_from(vec![row], &target, cache_with(0, decoder, vec![])); + + let err = result + .get_rows() + .map(|_| ()) + .map_err(|e| e.to_string()) + .unwrap_err(); + assert!(err.contains("schema id 99"), "{err}"); + } + + #[test] + fn test_read_schema_id_rejects_negative() { + let bytes = [0xFFu8, 0xFFu8, 0u8]; + let err = LookupResult::read_schema_id(&bytes).unwrap_err(); + assert!( + err.to_string().contains("Invalid negative schema id"), + "{err}" + ); + } + + #[test] + fn test_decoder_cache_target_lookup_skips_lock() { + let target = schema_with_ids(&[(0, "a", DataTypes::int())]); + let target_decoder = + Arc::new(FixedSchemaDecoder::new_no_projection(KvFormat::COMPACTED, &target).unwrap()); + let cache = DecoderCache::new(7, Arc::clone(&target_decoder)); + + let returned = cache.get(7).expect("target id must hit the cache"); + assert!(Arc::ptr_eq(&returned, &target_decoder)); + assert!(cache.get(99).is_none()); + } +} diff --git a/fluss-rust/crates/fluss/src/client/table/mod.rs b/fluss-rust/crates/fluss/src/client/table/mod.rs new file mode 100644 index 0000000000..657a44bfe8 --- /dev/null +++ b/fluss-rust/crates/fluss/src/client/table/mod.rs @@ -0,0 +1,167 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::client::connection::FlussConnection; +use crate::client::metadata::Metadata; +use crate::client::schema_getter::ClientSchemaGetter; +use crate::error::{Error, Result}; +use crate::metadata::{SchemaInfo, TableInfo, TablePath}; +use std::sync::Arc; + +pub const EARLIEST_OFFSET: i64 = -2; + +mod append; +mod batch_scanner; +mod lookup; + +mod log_fetch_buffer; +mod partition_getter; +mod reader; +mod remote_log; +mod scanner; +mod upsert; + +pub use append::{AppendWriter, TableAppend}; +pub use batch_scanner::LimitBatchScanner; +pub use lookup::{LookupResult, Lookuper, PrefixKeyLookuper, TableLookup, TablePrefixLookup}; +pub use reader::{RecordBatchLogReader, SyncRecordBatchLogReader}; +pub use remote_log::{ + DEFAULT_REMOTE_FILE_DOWNLOAD_THREAD_NUM, DEFAULT_SCANNER_REMOTE_LOG_PREFETCH_NUM, +}; +pub use scanner::{LogScanner, RecordBatchLogScanner, TableScan}; +pub use upsert::{TableUpsert, UpsertWriter}; + +#[allow(dead_code)] +pub struct FlussTable<'a> { + conn: &'a FlussConnection, + metadata: Arc, + table_info: TableInfo, + table_path: TablePath, + has_primary_key: bool, +} + +impl<'a> FlussTable<'a> { + pub fn new(conn: &'a FlussConnection, metadata: Arc, table_info: TableInfo) -> Self { + FlussTable { + conn, + table_path: table_info.table_path.clone(), + 
has_primary_key: table_info.has_primary_key(), + table_info, + metadata, + } + } + + pub fn new_append(&self) -> Result { + if self.has_primary_key { + return Err(Error::UnsupportedOperation { + message: "Append is only supported for log tables (without primary key)" + .to_string(), + }); + } + Ok(TableAppend::new( + self.table_path.clone(), + Arc::new(self.table_info.clone()), + self.conn.get_or_create_writer_client()?, + )) + } + + pub fn new_scan(&self) -> TableScan<'_> { + TableScan::new(self.conn, self.table_info.clone(), self.metadata.clone()) + } + + pub fn metadata(&self) -> &Arc { + &self.metadata + } + + pub fn get_table_info(&self) -> &TableInfo { + &self.table_info + } + + pub fn table_path(&self) -> &TablePath { + &self.table_path + } + + pub fn has_primary_key(&self) -> bool { + self.has_primary_key + } + + /// Creates a new `TableLookup` for configuring lookup operations. + /// + /// This follows the same pattern as `new_scan()` and `new_append()`, + /// returning a configuration object that can be used to create a `Lookuper`. + /// + /// The table must have a primary key (be a primary key table). + /// + /// # Returns + /// * `Ok(TableLookup)` - A lookup configuration object + /// * `Err(Error)` - If the table doesn't have a primary key + /// + /// # Example + /// ```ignore + /// let table = conn.get_table(&table_path).await?; + /// let mut lookuper = table.new_lookup()?.create_lookuper()?; + /// let key = GenericRow::from_data(vec![Datum::Int32(42)]); // primary key column values + /// let result = lookuper.lookup(&key).await?; // empty when the key is not found + /// let rows = result.get_rows()?; + /// println!("Found {} row(s)", rows.len()); + /// ``` + pub fn new_lookup(&self) -> Result { + if !self.has_primary_key { + return Err(Error::UnsupportedOperation { + message: "Lookup is only supported for primary key tables".to_string(), + }); + } + let lookup_client = self.conn.get_or_create_lookup_client()?; + // Pre-seed the schema getter with the table's current schema — + // rows written under it (the dominant case) never trigger an RPC.
+ let latest = SchemaInfo::new( + self.table_info.get_schema().clone(), + self.table_info.get_schema_id(), + ); + let schema_getter = Arc::new(ClientSchemaGetter::new( + self.table_path.clone(), + self.conn.get_admin()?, + latest, + )); + Ok(TableLookup::new( + lookup_client, + self.table_info.clone(), + self.metadata.clone(), + schema_getter, + )) + } + + pub fn new_upsert(&self) -> Result { + if !self.has_primary_key { + return Err(Error::UnsupportedOperation { + message: "Upsert is only supported for primary key tables".to_string(), + }); + } + + Ok(TableUpsert::new( + self.table_path.clone(), + self.table_info.clone(), + self.conn.get_or_create_writer_client()?, + )) + } +} + +impl<'a> Drop for FlussTable<'a> { + fn drop(&mut self) { + // do-nothing now + } +} diff --git a/fluss-rust/crates/fluss/src/client/table/partition_getter.rs b/fluss-rust/crates/fluss/src/client/table/partition_getter.rs new file mode 100644 index 0000000000..1115ded3bd --- /dev/null +++ b/fluss-rust/crates/fluss/src/client/table/partition_getter.rs @@ -0,0 +1,199 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::error::Error::IllegalArgument; +use crate::error::Result; +use crate::metadata::{DataType, PhysicalTablePath, ResolvedPartitionSpec, RowType, TablePath}; +use crate::row::InternalRow; +use crate::row::field_getter::FieldGetter; +use crate::util::partition; +use std::sync::Arc; + +/// Get the physical table path for a row, handling partitioned vs non-partitioned tables. +pub fn get_physical_path( + table_path: &Arc, + partition_getter: Option<&PartitionGetter>, + row: &R, +) -> Result { + if let Some(getter) = partition_getter { + let partition = getter.get_partition(row)?; + Ok(PhysicalTablePath::of_partitioned( + Arc::clone(table_path), + Some(partition), + )) + } else { + Ok(PhysicalTablePath::of(Arc::clone(table_path))) + } +} + +/// A getter to get partition name from a row. +#[allow(dead_code)] +pub struct PartitionGetter { + partition_keys: Arc<[String]>, + partitions: Vec<(DataType, FieldGetter)>, +} + +#[allow(dead_code)] +impl PartitionGetter { + pub fn new(row_type: &RowType, partition_keys: Arc<[String]>) -> Result { + let mut partitions = Vec::with_capacity(partition_keys.len()); + + for partition_key in partition_keys.iter() { + if let Some(partition_col_index) = row_type.get_field_index(partition_key.as_str()) { + let data_type = row_type + .fields() + .get(partition_col_index) + .unwrap() + .data_type + .clone(); + let field_getter = FieldGetter::create(&data_type, partition_col_index); + + partitions.push((data_type, field_getter)); + } else { + return Err(IllegalArgument { + message: format!( + "The partition column {partition_key} is not in the row {row_type}." 
+ ), + }); + }; + } + + Ok(Self { + partition_keys, + partitions, + }) + } + + pub fn get_partition(&self, row: &dyn InternalRow) -> Result { + self.get_partition_spec(row) + .map(|ps| ps.get_partition_name()) + } + + pub fn get_partition_spec(&self, row: &dyn InternalRow) -> Result { + let mut partition_values = Vec::with_capacity(self.partitions.len()); + + for (data_type, field_getter) in &self.partitions { + let value = field_getter.get_field(row)?; + if value.is_null() { + return Err(IllegalArgument { + message: "Partition value shouldn't be null.".to_string(), + }); + } + partition_values.push(partition::convert_value_of_type(&value, data_type)?); + } + + ResolvedPartitionSpec::new(Arc::clone(&self.partition_keys), partition_values) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::metadata::{DataField, IntType, StringType}; + use crate::row::{Datum, GenericRow}; + + #[test] + fn test_partition_getter_single_key() { + let row_type = RowType::new(vec![ + DataField::new("id", DataType::Int(IntType::new()), None), + DataField::new("region", DataType::String(StringType::new()), None), + ]); + + let getter = PartitionGetter::new(&row_type, Arc::from(["region".to_string()])) + .expect("should succeed"); + + let row = GenericRow::from_data(vec![Datum::Int32(42), Datum::from("US")]); + let partition_name = getter.get_partition(&row).expect("should succeed"); + assert_eq!(partition_name, "US"); + } + + #[test] + fn test_partition_getter_multiple_keys() { + let row_type = RowType::new(vec![ + DataField::new("id", DataType::Int(IntType::new()), None), + DataField::new("date", DataType::String(StringType::new()), None), + DataField::new("region", DataType::String(StringType::new()), None), + ]); + + let getter = PartitionGetter::new( + &row_type, + Arc::from(["date".to_string(), "region".to_string()]), + ) + .expect("should succeed"); + + let row = GenericRow::from_data(vec![ + Datum::Int32(42), + Datum::from("2024-01-15"), + Datum::from("US"), + ]); + let 
partition_name = getter.get_partition(&row).expect("should succeed"); + assert_eq!(partition_name, "2024-01-15$US"); + } + + #[test] + fn test_partition_getter_invalid_column() { + let row_type = RowType::new(vec![DataField::new( + "id", + DataType::Int(IntType::new()), + None, + )]); + + let result = PartitionGetter::new(&row_type, Arc::from(["nonexistent".to_string()])); + assert!(result.is_err()); + } + + #[test] + fn test_partition_getter_null_value() { + let row_type = RowType::new(vec![ + DataField::new("id", DataType::Int(IntType::new()), None), + DataField::new("region", DataType::String(StringType::new()), None), + ]); + + let getter = PartitionGetter::new(&row_type, Arc::from(["region".to_string()])) + .expect("should succeed"); + + let row = GenericRow::from_data(vec![Datum::Int32(42), Datum::Null]); + let result = getter.get_partition(&row); + assert!(result.is_err()); + } + + #[test] + fn test_get_partition_spec() { + let row_type = RowType::new(vec![ + DataField::new("id", DataType::Int(IntType::new()), None), + DataField::new("date", DataType::String(StringType::new()), None), + DataField::new("region", DataType::String(StringType::new()), None), + ]); + + let getter = PartitionGetter::new( + &row_type, + Arc::from(["date".to_string(), "region".to_string()]), + ) + .expect("should succeed"); + + let row = GenericRow::from_data(vec![ + Datum::Int32(42), + Datum::from("2024-01-15"), + Datum::from("US"), + ]); + let spec = getter.get_partition_spec(&row).expect("should succeed"); + + assert_eq!(spec.get_partition_keys(), &["date", "region"]); + assert_eq!(spec.get_partition_values(), &["2024-01-15", "US"]); + assert_eq!(spec.get_partition_name(), "2024-01-15$US"); + } +} diff --git a/fluss-rust/crates/fluss/src/client/table/reader.rs b/fluss-rust/crates/fluss/src/client/table/reader.rs new file mode 100644 index 0000000000..518c68a222 --- /dev/null +++ b/fluss-rust/crates/fluss/src/client/table/reader.rs @@ -0,0 +1,701 @@ +// Licensed to the Apache 
Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Bounded log reader that polls until stopping offsets, then terminates. +//! +//! Unlike [`RecordBatchLogScanner`] which is unbounded (continuous streaming), +//! [`RecordBatchLogReader`] reads log data up to a finite set of stopping +//! offsets and then signals completion. This enables "snapshot-style" reads +//! from a streaming log: capture the latest offsets, then consume all data +//! up to those offsets. +//! +//! The reader **takes ownership** of the scanner (move, not clone). Once the +//! scanner is moved into a reader, the compiler prevents concurrent polls. +//! +//! The reader also provides a synchronous [`arrow::record_batch::RecordBatchReader`] +//! adapter via [`RecordBatchLogReader::to_record_batch_reader`] for Arrow +//! ecosystem interop and FFI consumers (Python, C++). 
+ +use crate::client::admin::FlussAdmin; +use crate::client::table::RecordBatchLogScanner; +use crate::error::{Error, Result}; +use crate::metadata::TableBucket; +use crate::record::ScanBatch; +use crate::rpc::message::OffsetSpec; +use arrow::record_batch::RecordBatch; +use arrow_schema::SchemaRef; +use log::warn; +use std::collections::{HashMap, VecDeque}; +use std::time::Duration; + +const DEFAULT_POLL_TIMEOUT: Duration = Duration::from_millis(500); + +/// Bounded log reader that consumes log data up to specified stopping offsets. +/// +/// This type wraps a [`RecordBatchLogScanner`] and adds stopping semantics: +/// it polls batches from the scanner, filters/slices them against per-bucket +/// stopping offsets, and signals completion when all buckets are caught up. +/// +/// The reader takes **ownership** of the scanner. Once moved in, no other code +/// can poll the same scanner concurrently. +/// +/// # Construction +/// +/// Use [`RecordBatchLogReader::new_until_latest`] for the common case of +/// reading all currently-available data, or [`RecordBatchLogReader::new_until_offsets`] +/// for custom stopping offsets. +/// +/// # Async iteration +/// +/// Call [`next_batch`](RecordBatchLogReader::next_batch) repeatedly to get +/// [`ScanBatch`]es lazily, one at a time. Returns `None` when all buckets +/// have reached their stopping offsets. +/// +/// # Sync adapter +/// +/// Call [`to_record_batch_reader`](RecordBatchLogReader::to_record_batch_reader) +/// to get a synchronous [`arrow::record_batch::RecordBatchReader`] suitable +/// for Arrow FFI consumers. +pub struct RecordBatchLogReader { + scanner: RecordBatchLogScanner, + stopping_offsets: HashMap, + buffer: VecDeque, + schema: SchemaRef, +} + +impl RecordBatchLogReader { + /// Create a reader that reads until the latest offsets at the time of creation. + /// + /// Queries the server for the current latest offset of each subscribed + /// bucket, then reads until those offsets are reached. 
Buckets whose + /// subscribed offset already meets or exceeds the latest offset are + /// excluded (nothing to read). + /// + /// Partition metadata is fetched once during construction; no caching + /// is needed since each reader is typically short-lived. + pub async fn new_until_latest( + scanner: RecordBatchLogScanner, + admin: &FlussAdmin, + ) -> Result { + // Acquire the guard first so no concurrent unsubscribe can mutate + // state between reading subscriptions and using them. + scanner.try_set_reader_active()?; + + let subscribed = scanner.get_subscribed_buckets(); + if subscribed.is_empty() { + scanner.clear_reader_active(); + return Err(Error::IllegalArgument { + message: "No buckets subscribed. Call subscribe() before creating a reader." + .to_string(), + }); + } + + let stopping_offsets = match query_latest_offsets(admin, &scanner, &subscribed).await { + Ok(o) => o, + Err(e) => { + scanner.clear_reader_active(); + return Err(e); + } + }; + let schema = scanner.schema(); + + Ok(Self { + scanner, + stopping_offsets, + buffer: VecDeque::new(), + schema, + }) + } + + /// Create a reader with explicit stopping offsets per bucket. + /// + /// **Note:** every key in `stopping_offsets` **must** correspond to a bucket that is + /// currently subscribed on the `scanner`. If a stopping offset refers to a + /// bucket that will never appear in polled batches, the reader will loop + /// indefinitely waiting for data that never arrives. + /// + /// Use [`new_until_latest`](Self::new_until_latest) for the common case; + /// it queries the server and builds a validated stopping-offset map + /// automatically. + pub fn new_until_offsets( + scanner: RecordBatchLogScanner, + stopping_offsets: HashMap, + ) -> Result { + scanner.try_set_reader_active()?; + let schema = scanner.schema(); + Ok(Self { + scanner, + stopping_offsets, + buffer: VecDeque::new(), + schema, + }) + } + + /// Returns the Arrow schema for batches produced by this reader.
pub fn schema(&self) -> SchemaRef {
    SchemaRef::clone(&self.schema)
}

/// Read until every stopping offset is satisfied and return all batches.
///
/// Convenience for callers (e.g. bindings building a single Arrow table)
/// that want the whole result materialized in Rust instead of iterating
/// batch by batch.
pub async fn collect_all_batches(&mut self) -> Result<Vec<ScanBatch>> {
    let mut collected = Vec::new();
    while let Some(scan_batch) = self.next_batch().await? {
        collected.push(scan_batch);
    }
    Ok(collected)
}

/// Fetch the next [`ScanBatch`], or `None` once all buckets are caught up.
///
/// A single call may poll the scanner several times, buffer what it gets,
/// and hand batches back one at a time. A batch that crosses a stopping
/// offset is sliced so records at or beyond the stop point are excluded.
///
/// Buckets that reach their stopping offset are unsubscribed from the
/// scanner so no further network traffic is spent on data the reader
/// would discard.
pub async fn next_batch(&mut self) -> Result<Option<ScanBatch>> {
    loop {
        // Serve already-filtered batches before polling again.
        if let Some(ready) = self.buffer.pop_front() {
            return Ok(Some(ready));
        }

        // Nothing buffered and no bucket left to wait for: we are done.
        if self.stopping_offsets.is_empty() {
            return Ok(None);
        }

        let polled = self.scanner.poll(DEFAULT_POLL_TIMEOUT).await?;
        if polled.is_empty() {
            continue;
        }

        let finished = filter_batches(polled, &mut self.stopping_offsets, &mut self.buffer);

        // Use the `_sync` unsubscribe variants here: the active-reader
        // guard rejects calls to the async `unsubscribe*` methods, but
        // the reader is allowed to clean up its own completed buckets.
        // The sync variants do the same map removal without the guard
        // check, and the partitioned/non-partitioned mismatch they
        // silently ignore is unreachable since the reader inherits the
        // scanner's partition mode.
        for bucket in finished {
            match bucket.partition_id() {
                Some(partition_id) => {
                    self.scanner
                        .unsubscribe_partition_sync(partition_id, bucket.bucket_id());
                }
                None => {
                    self.scanner.unsubscribe_sync(bucket.bucket_id());
                }
            }
        }
    }
}

/// Convert this async reader into a synchronous [`arrow::record_batch::RecordBatchReader`].
///
/// The returned adapter calls [`tokio::runtime::Handle::block_on`] on each
/// iterator step. **Do not** call this from inside a Tokio worker thread
/// while the same runtime is driving async work (nested `block_on` can
/// panic or deadlock). Prefer [`next_batch`](RecordBatchLogReader::next_batch)
/// in async Rust code. This is intended for sync/FFI boundaries (C++, some
/// Python call paths).
pub fn to_record_batch_reader(
    self,
    handle: tokio::runtime::Handle,
) -> SyncRecordBatchLogReader {
    SyncRecordBatchLogReader {
        reader: self,
        handle,
    }
}
}

/// Best-effort cleanup when the reader is dropped before all buckets reach
/// their stopping offsets (early `break`, an exception in the consumer, etc.).
///
/// Why this matters even though we own the scanner:
///
/// In pure Rust, dropping the reader drops the owned `RecordBatchLogScanner`,
/// which decrements its `Arc` to zero and frees the inner state.
/// Subscriptions die with it, so this `Drop` is a no-op in that path.
///
/// In the binding layer (Python today, C++/Elixir later), the binding holds
/// its own `Arc` and uses [`RecordBatchLogScanner::new_shared_handle`] to
/// obtain a second handle for the reader. When the reader is dropped
/// mid-iteration the inner state stays alive — and any buckets the reader
/// hadn't yet completed remain in `LogScannerStatus.bucket_status_map`. The
/// user's next operations on the original `LogScanner` would then see
/// "ghost" subscriptions (extra buckets being polled, stale offsets, etc.).
+/// +/// The `next_batch` loop already calls `unsubscribe` on each completed bucket, +/// so `stopping_offsets` accurately reflects the still-active set when `Drop` +/// runs. We unsubscribe each remaining bucket synchronously via the +/// `_sync` escape hatches (the underlying `LogScannerStatus` ops don't await), +/// so this is safe to call from any context — sync, async, a Tokio worker, or +/// a Python thread holding the GIL. +/// +/// After cleanup, the `reader_active` guard is cleared so that the original +/// scanner (held by the binding layer) can accept new subscriptions again. +/// +/// Caveats: +/// - Batches already buffered in `LogFetcher.log_fetch_buffer` for an +/// unsubscribed bucket are not drained here. They'll either be filtered out +/// by the next `RecordBatchLogReader` (via the "bucket not in +/// stopping_offsets" branch) or surface to a direct `poll_arrow` caller, who +/// was sharing scanner state in the first place. +/// - `Drop` cannot return errors. The `_sync` variants no-op on +/// partitioned/non-partitioned mismatch, but that mismatch is unreachable +/// here because the reader was constructed from this scanner and inherited +/// its partition mode. +impl Drop for RecordBatchLogReader { + fn drop(&mut self) { + for (tb, _) in self.stopping_offsets.drain() { + if let Some(partition_id) = tb.partition_id() { + self.scanner + .unsubscribe_partition_sync(partition_id, tb.bucket_id()); + } else { + self.scanner.unsubscribe_sync(tb.bucket_id()); + } + } + self.scanner.clear_reader_active(); + } +} + +/// Synchronous adapter that implements [`arrow::record_batch::RecordBatchReader`]. +/// +/// Created via [`RecordBatchLogReader::to_record_batch_reader`]. +/// Blocks the current thread on each `next()` call using the provided +/// Tokio runtime handle. +/// +/// The iterator yields plain [`RecordBatch`]es (bucket/offset metadata from +/// [`ScanBatch`] is stripped to satisfy the Arrow trait contract). 
+pub struct SyncRecordBatchLogReader { + reader: RecordBatchLogReader, + handle: tokio::runtime::Handle, +} + +impl Iterator for SyncRecordBatchLogReader { + type Item = std::result::Result; + + fn next(&mut self) -> Option { + match self.handle.block_on(self.reader.next_batch()) { + Ok(Some(scan_batch)) => Some(Ok(scan_batch.into_batch())), + Ok(None) => None, + Err(e) => Some(Err(arrow::error::ArrowError::ExternalError(Box::new(e)))), + } + } +} + +impl arrow::record_batch::RecordBatchReader for SyncRecordBatchLogReader { + fn schema(&self) -> SchemaRef { + self.reader.schema() + } +} + +/// Query latest offsets for all subscribed buckets, handling both partitioned +/// and non-partitioned tables. +/// +/// Buckets whose subscribed offset already meets or exceeds the latest offset +/// are excluded from the result (there is nothing to read). A `latest_offset` +/// of `0` means the bucket is empty and is silently skipped; a negative value +/// is unexpected from the server and is logged as a warning before being +/// skipped. +async fn query_latest_offsets( + admin: &FlussAdmin, + scanner: &RecordBatchLogScanner, + subscribed: &[(TableBucket, i64)], +) -> Result> { + let table_path = scanner.table_path(); + + if !scanner.is_partitioned() { + let bucket_ids: Vec = subscribed.iter().map(|(tb, _)| tb.bucket_id()).collect(); + + let offsets = admin + .list_offsets(table_path, &bucket_ids, OffsetSpec::Latest) + .await?; + + let subscribed_offset_by_bucket: HashMap = subscribed + .iter() + .map(|(tb, off)| (tb.bucket_id(), *off)) + .collect(); + + let table_id = scanner.table_id(); + Ok(offsets + .into_iter() + .filter(|(bucket_id, latest_offset)| { + if *latest_offset < 0 { + warn!( + "Server returned negative latest offset {latest_offset} for bucket {bucket_id} of table {table_id}; skipping bucket." 
+ ); + return false; + } + if *latest_offset == 0 { + return false; + } + let Some(&subscribed_offset) = subscribed_offset_by_bucket.get(bucket_id) + else { + return false; + }; + subscribed_offset < *latest_offset + }) + .map(|(bucket_id, offset)| (TableBucket::new(table_id, bucket_id), offset)) + .collect()) + } else { + query_partitioned_offsets(admin, scanner, subscribed).await + } +} + +/// Query offsets for partitioned table subscriptions. +/// +/// Partition metadata is fetched once per reader construction (not cached), +/// since each [`RecordBatchLogReader`] is typically short-lived and consumed. +async fn query_partitioned_offsets( + admin: &FlussAdmin, + scanner: &RecordBatchLogScanner, + subscribed: &[(TableBucket, i64)], +) -> Result> { + let table_path = scanner.table_path(); + let table_id = scanner.table_id(); + + let partition_infos = admin.list_partition_infos(table_path).await?; + let partition_id_to_name: HashMap = partition_infos + .into_iter() + .map(|info| (info.get_partition_id(), info.get_partition_name())) + .collect(); + + let subscribed_offset_map: HashMap = subscribed.iter().cloned().collect(); + + let mut by_partition: HashMap> = HashMap::new(); + for (tb, _) in subscribed { + if let Some(partition_id) = tb.partition_id() { + by_partition + .entry(partition_id) + .or_default() + .push(tb.bucket_id()); + } + } + + let mut result: HashMap = HashMap::new(); + + for (partition_id, bucket_ids) in by_partition { + let partition_name = + partition_id_to_name + .get(&partition_id) + .ok_or_else(|| Error::UnexpectedError { + message: format!("Unknown partition_id: {partition_id}"), + source: None, + })?; + + let offsets = admin + .list_partition_offsets(table_path, partition_name, &bucket_ids, OffsetSpec::Latest) + .await?; + + for (bucket_id, latest_offset) in offsets { + if latest_offset < 0 { + warn!( + "Server returned negative latest offset {latest_offset} for bucket {bucket_id} of partition {partition_id} (table {table_id}); skipping 
bucket." + ); + continue; + } + if latest_offset == 0 { + continue; + } + let tb = TableBucket::new_with_partition(table_id, Some(partition_id), bucket_id); + let Some(&subscribed_offset) = subscribed_offset_map.get(&tb) else { + continue; + }; + if subscribed_offset < latest_offset { + result.insert(tb, latest_offset); + } + } + } + + Ok(result) +} + +/// Filter and slice scan batches against per-bucket stopping offsets. +/// +/// For each batch: +/// - If the batch's bucket is not in `stopping_offsets`, skip it. +/// - If `base_offset >= stop_at`, the bucket is exhausted; remove from map. +/// - If `last_offset >= stop_at`, slice to keep only records before stop_at. +/// - Otherwise, keep the full batch. +/// +/// Accepted batches with at least one row are pushed to `buffer`; empty +/// batches (e.g. a server-emitted batch containing no rows, or a slice that +/// reduces to zero rows) are dropped so consumers never observe an empty +/// `ScanBatch`. Returns the list of buckets that completed (were removed +/// from `stopping_offsets`). 
+fn filter_batches( + scan_batches: Vec, + stopping_offsets: &mut HashMap, + buffer: &mut VecDeque, +) -> Vec { + let mut completed = Vec::new(); + + for scan_batch in scan_batches { + let bucket = scan_batch.bucket().clone(); + let Some(&stop_at) = stopping_offsets.get(&bucket) else { + continue; + }; + + let base_offset = scan_batch.base_offset(); + let last_offset = scan_batch.last_offset(); + + if base_offset >= stop_at { + stopping_offsets.remove(&bucket); + completed.push(bucket); + continue; + } + + let kept_batch = if last_offset >= stop_at { + let num_to_keep = (stop_at - base_offset) as usize; + let b = scan_batch.into_batch(); + let limit = num_to_keep.min(b.num_rows()); + ScanBatch::new(bucket.clone(), b.slice(0, limit), base_offset) + } else { + scan_batch + }; + + if kept_batch.batch().num_rows() > 0 { + buffer.push_back(kept_batch); + } + + if last_offset >= stop_at - 1 { + stopping_offsets.remove(&bucket); + completed.push(bucket); + } + } + + completed +} + +// Rust-level end-to-end coverage for `new_until_latest`, partitioned tables, +// and `new_until_offsets` stopping semantics lives in +// `crates/fluss/tests/integration/record_batch_log_reader.rs`. Drop cleanup and the +// reader-active guard remain covered by the Python integration test +// `test_to_arrow_batch_reader_drop_and_guard`. 
+#[cfg(test)] +mod tests { + use super::*; + use arrow::array::Int32Array; + use arrow_schema::{DataType, Field, Schema}; + use std::sync::Arc; + + fn test_schema() -> SchemaRef { + Arc::new(Schema::new(vec![Field::new("v", DataType::Int32, false)])) + } + + fn make_batch(values: &[i32]) -> RecordBatch { + RecordBatch::try_new( + test_schema(), + vec![Arc::new(Int32Array::from(values.to_vec()))], + ) + .unwrap() + } + + fn make_scan_batch(bucket: TableBucket, base_offset: i64, values: &[i32]) -> ScanBatch { + ScanBatch::new(bucket, make_batch(values), base_offset) + } + + fn bucket(id: i32) -> TableBucket { + TableBucket::new(1, id) + } + + #[test] + fn filter_batch_entirely_before_stop() { + let mut offsets = HashMap::from([(bucket(0), 100)]); + let mut buffer = VecDeque::new(); + + let batches = vec![make_scan_batch(bucket(0), 10, &[1, 2, 3])]; + let completed = filter_batches(batches, &mut offsets, &mut buffer); + + assert_eq!(buffer.len(), 1); + assert_eq!(buffer[0].batch().num_rows(), 3); + assert!(offsets.contains_key(&bucket(0))); + assert!(completed.is_empty()); + } + + #[test] + fn filter_batch_crossing_stop_offset_is_sliced() { + let mut offsets = HashMap::from([(bucket(0), 12)]); + let mut buffer = VecDeque::new(); + + // base_offset=10, 5 rows -> offsets 10,11,12,13,14; stop_at=12 -> keep 2 + let batches = vec![make_scan_batch(bucket(0), 10, &[1, 2, 3, 4, 5])]; + let completed = filter_batches(batches, &mut offsets, &mut buffer); + + assert_eq!(buffer.len(), 1); + assert_eq!(buffer[0].batch().num_rows(), 2); + assert!(!offsets.contains_key(&bucket(0))); + assert_eq!(completed, vec![bucket(0)]); + } + + #[test] + fn filter_batch_at_or_after_stop_offset_is_skipped() { + let mut offsets = HashMap::from([(bucket(0), 10)]); + let mut buffer = VecDeque::new(); + + // base_offset=10, stop_at=10 -> base >= stop, skip entirely + let batches = vec![make_scan_batch(bucket(0), 10, &[1, 2, 3])]; + let completed = filter_batches(batches, &mut offsets, &mut buffer); + 
+ assert!(buffer.is_empty()); + assert!(!offsets.contains_key(&bucket(0))); + assert_eq!(completed, vec![bucket(0)]); + } + + #[test] + fn filter_batch_ending_exactly_at_stop_minus_one() { + let mut offsets = HashMap::from([(bucket(0), 13)]); + let mut buffer = VecDeque::new(); + + // base_offset=10, 3 rows -> offsets 10,11,12; last_offset=12, stop_at=13 + // last_offset (12) >= stop_at - 1 (12) => bucket done + let batches = vec![make_scan_batch(bucket(0), 10, &[1, 2, 3])]; + let completed = filter_batches(batches, &mut offsets, &mut buffer); + + assert_eq!(buffer.len(), 1); + assert_eq!(buffer[0].batch().num_rows(), 3); + assert!(!offsets.contains_key(&bucket(0))); + assert_eq!(completed, vec![bucket(0)]); + } + + #[test] + fn filter_unknown_bucket_is_ignored() { + let mut offsets = HashMap::from([(bucket(0), 100)]); + let mut buffer = VecDeque::new(); + + let batches = vec![make_scan_batch(bucket(99), 0, &[1, 2])]; + let completed = filter_batches(batches, &mut offsets, &mut buffer); + + assert!(buffer.is_empty()); + assert!(offsets.contains_key(&bucket(0))); + assert!(completed.is_empty()); + } + + #[test] + fn filter_multiple_buckets_independent_tracking() { + let mut offsets = HashMap::from([(bucket(0), 12), (bucket(1), 5)]); + let mut buffer = VecDeque::new(); + + let batches = vec![ + make_scan_batch(bucket(0), 10, &[1, 2, 3]), // last=12, stop=12 -> keep 2, done + make_scan_batch(bucket(1), 0, &[10, 20, 30]), // last=2, stop=5 -> keep all, not done + ]; + let completed = filter_batches(batches, &mut offsets, &mut buffer); + + assert_eq!(buffer.len(), 2); + assert_eq!(buffer[0].batch().num_rows(), 2); // bucket 0: sliced + assert_eq!(buffer[1].batch().num_rows(), 3); // bucket 1: full + assert!(!offsets.contains_key(&bucket(0))); // bucket 0: done + assert!(offsets.contains_key(&bucket(1))); // bucket 1: still tracking + assert_eq!(completed, vec![bucket(0)]); + } + + #[test] + fn filter_empty_batch_at_stop() { + let mut offsets = HashMap::from([(bucket(0), 
5)]); + let mut buffer = VecDeque::new(); + + // empty batch: base_offset=5, 0 rows -> last_offset = base-1 = 4 + // base_offset (5) >= stop_at (5) -> skip, remove + let batches = vec![make_scan_batch(bucket(0), 5, &[])]; + let completed = filter_batches(batches, &mut offsets, &mut buffer); + + assert!(buffer.is_empty()); + assert!(!offsets.contains_key(&bucket(0))); + assert_eq!(completed, vec![bucket(0)]); + } + + #[test] + fn filter_drops_empty_batch_before_stop() { + // Empty batch well below the stop offset: base=5, 0 rows -> last=4, stop=100. + // base_offset (5) < stop_at (100) and last_offset (4) < stop_at (100), + // so it falls into the "keep full batch" branch but must not surface to + // the consumer because it has zero rows. + let mut offsets = HashMap::from([(bucket(0), 100)]); + let mut buffer = VecDeque::new(); + + let batches = vec![make_scan_batch(bucket(0), 5, &[])]; + let completed = filter_batches(batches, &mut offsets, &mut buffer); + + assert!(buffer.is_empty()); + assert!(offsets.contains_key(&bucket(0))); + assert!(completed.is_empty()); + } + + #[test] + fn filter_single_row_batch_before_stop() { + let mut offsets = HashMap::from([(bucket(0), 10)]); + let mut buffer = VecDeque::new(); + + let batches = vec![make_scan_batch(bucket(0), 5, &[42])]; + let completed = filter_batches(batches, &mut offsets, &mut buffer); + + assert_eq!(buffer.len(), 1); + assert_eq!(buffer[0].batch().num_rows(), 1); + assert!(offsets.contains_key(&bucket(0))); + assert!(completed.is_empty()); + } + + #[test] + fn filter_single_row_batch_at_stop_boundary() { + let mut offsets = HashMap::from([(bucket(0), 5)]); + let mut buffer = VecDeque::new(); + + // base_offset=4, 1 row -> last_offset=4, stop=5 + // last < stop -> keep all; last (4) >= stop-1 (4) -> done + let batches = vec![make_scan_batch(bucket(0), 4, &[42])]; + let completed = filter_batches(batches, &mut offsets, &mut buffer); + + assert_eq!(buffer.len(), 1); + assert_eq!(buffer[0].batch().num_rows(), 1); 
+ assert!(!offsets.contains_key(&bucket(0))); + assert_eq!(completed, vec![bucket(0)]); + } + + #[test] + fn filter_preserves_scan_batch_metadata() { + let mut offsets = HashMap::from([(bucket(3), 100)]); + let mut buffer = VecDeque::new(); + + let batches = vec![make_scan_batch(bucket(3), 42, &[1, 2])]; + filter_batches(batches, &mut offsets, &mut buffer); + + let sb = &buffer[0]; + assert_eq!(*sb.bucket(), bucket(3)); + assert_eq!(sb.base_offset(), 42); + } + + #[test] + fn filter_sliced_batch_preserves_base_offset() { + let mut offsets = HashMap::from([(bucket(0), 12)]); + let mut buffer = VecDeque::new(); + + let batches = vec![make_scan_batch(bucket(0), 10, &[1, 2, 3, 4, 5])]; + filter_batches(batches, &mut offsets, &mut buffer); + + let sb = &buffer[0]; + assert_eq!(sb.base_offset(), 10); + assert_eq!(*sb.bucket(), bucket(0)); + } +} diff --git a/fluss-rust/crates/fluss/src/client/table/remote_log.rs b/fluss-rust/crates/fluss/src/client/table/remote_log.rs new file mode 100644 index 0000000000..c48bdccabf --- /dev/null +++ b/fluss-rust/crates/fluss/src/client/table/remote_log.rs @@ -0,0 +1,1343 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use crate::client::credentials::CredentialsReceiver; +use crate::error::{Error, Result}; +use crate::io::{FileIO, Storage}; +use crate::metadata::TableBucket; +use crate::metrics::ScannerMetrics; +use crate::proto::{PbRemoteLogFetchInfo, PbRemoteLogSegment}; +use futures::TryStreamExt; +use parking_lot::Mutex; +use std::{ + cmp::{Ordering, Reverse}, + collections::{BinaryHeap, HashMap}, + future::Future, + io, mem, + path::{Path, PathBuf}, + pin::Pin, + sync::Arc, + time::Duration, +}; + +#[cfg(test)] +use std::{ + env, + time::{SystemTime, UNIX_EPOCH}, +}; +use tempfile::TempDir; +use tokio::io::AsyncWriteExt; +use tokio::sync::{Notify, OwnedSemaphorePermit, Semaphore, mpsc, oneshot}; +use tokio::task::JoinSet; + +/// Default maximum number of remote log segments to prefetch +/// Matches Java's CLIENT_SCANNER_REMOTE_LOG_PREFETCH_NUM (default: 4) +pub const DEFAULT_SCANNER_REMOTE_LOG_PREFETCH_NUM: usize = 4; + +/// Default maximum concurrent remote log downloads +/// Matches Java's REMOTE_FILE_DOWNLOAD_THREAD_NUM (default: 3) +pub const DEFAULT_REMOTE_FILE_DOWNLOAD_THREAD_NUM: usize = 3; + +/// Initial retry backoff delay (milliseconds) +/// Prevents hot-spin retry loops on persistent failures +const RETRY_BACKOFF_BASE_MS: u64 = 100; + +/// Maximum retry backoff delay (milliseconds) +/// Caps exponential backoff to avoid excessive delays +const RETRY_BACKOFF_MAX_MS: u64 = 5_000; + +/// Maximum number of retries before giving up +/// After this many retries, the download will fail permanently +const MAX_RETRY_COUNT: u32 = 10; + +/// Calculate exponential backoff delay with jitter for retries +fn calculate_backoff_delay(retry_count: u32) -> tokio::time::Duration { + use rand::Rng; + + // Exponential backoff: base * 2^retry_count + let exponential_ms = RETRY_BACKOFF_BASE_MS.saturating_mul(1 << retry_count.min(10)); // Cap exponent to prevent overflow + + // Cap at maximum + let capped_ms = exponential_ms.min(RETRY_BACKOFF_MAX_MS); + + // Add jitter (±25% 
randomness) to avoid thundering herd + let mut rng = rand::rng(); + let jitter = rng.random_range(0.75..=1.25); + let final_ms = ((capped_ms as f64) * jitter) as u64; + + tokio::time::Duration::from_millis(final_ms) +} + +/// Result of a fetch operation containing file path and size +#[derive(Debug)] +pub struct FetchResult { + pub file_path: PathBuf, + pub file_size: usize, +} + +/// Trait for fetching remote log segments (allows dependency injection for testing) +pub trait RemoteLogFetcher: Send + Sync { + fn fetch( + &self, + request: &RemoteLogDownloadRequest, + ) -> Pin> + Send>>; +} + +/// Represents a remote log segment that needs to be downloaded +#[derive(Debug, Clone)] +pub struct RemoteLogSegment { + pub segment_id: String, + pub start_offset: i64, + #[allow(dead_code)] + pub end_offset: i64, + #[allow(dead_code)] + pub size_in_bytes: i32, + pub table_bucket: TableBucket, + pub max_timestamp: i64, +} + +impl RemoteLogSegment { + pub fn from_proto(segment: &PbRemoteLogSegment, table_bucket: TableBucket) -> Self { + Self { + segment_id: segment.remote_log_segment_id.clone(), + start_offset: segment.remote_log_start_offset, + end_offset: segment.remote_log_end_offset, + size_in_bytes: segment.segment_size_in_bytes, + table_bucket, + // Match Java's behavior: use -1 for missing timestamp + // (Java: CommonRpcMessageUtils.java:171-174) + max_timestamp: segment.max_timestamp.unwrap_or(-1), + } + } + + /// Get the local file name for this remote log segment + pub fn local_file_name(&self) -> String { + // Format: ${remote_segment_id}_${offset_prefix}.log + let offset_prefix = format!("{:020}", self.start_offset); + format!("{}_{}.log", self.segment_id, offset_prefix) + } +} + +/// Represents remote log fetch information +#[derive(Debug, Clone)] +pub struct RemoteLogFetchInfo { + pub remote_log_tablet_dir: String, + #[allow(dead_code)] + pub partition_name: Option, + pub remote_log_segments: Vec, + pub first_start_pos: i32, +} + +impl RemoteLogFetchInfo { + pub 
fn from_proto(info: &PbRemoteLogFetchInfo, table_bucket: TableBucket) -> Self { + let segments = info + .remote_log_segments + .iter() + .map(|s| RemoteLogSegment::from_proto(s, table_bucket.clone())) + .collect(); + + Self { + remote_log_tablet_dir: info.remote_log_tablet_dir.clone(), + partition_name: info.partition_name.clone(), + remote_log_segments: segments, + first_start_pos: info.first_start_pos.unwrap_or(0), + } + } +} + +/// RAII guard for prefetch permit that notifies coordinator on drop +/// +/// NOTE: File deletion is now handled by FileSource::drop(), not here. +/// This ensures the file is closed before deletion +#[derive(Debug)] +pub struct PrefetchPermit { + permit: Option, + recycle_notify: Arc, +} + +impl PrefetchPermit { + fn new(permit: OwnedSemaphorePermit, recycle_notify: Arc) -> Self { + Self { + permit: Some(permit), + recycle_notify, + } + } +} + +impl Drop for PrefetchPermit { + fn drop(&mut self) { + // Release capacity (critical: permit must be dropped before notify) + let _ = self.permit.take(); // drops permit here + + // Then wake coordinator so it can acquire the now-available permit + self.recycle_notify.notify_one(); + } +} + +/// Downloaded remote log file with prefetch permit +/// File remains on disk for memory efficiency; file deletion is handled by FileCleanupGuard in FileSource +#[derive(Debug)] +pub struct RemoteLogFile { + /// Path to the downloaded file on local disk + pub file_path: PathBuf, + /// Size of the file in bytes + /// Currently unused but kept for potential future use (logging, metrics, etc.) 
+ #[allow(dead_code)] + pub file_size: usize, + /// RAII permit that releases prefetch semaphore slot and notifies coordinator when dropped + pub permit: PrefetchPermit, +} + +/// Represents a request to download a remote log segment with priority ordering +#[derive(Debug)] +pub struct RemoteLogDownloadRequest { + segment: RemoteLogSegment, + remote_log_tablet_dir: String, + result_sender: oneshot::Sender>, + retry_count: u32, + next_retry_at: Option, +} + +impl RemoteLogDownloadRequest { + /// Get the segment (used by test fetcher implementations) + #[cfg(test)] + pub fn segment(&self) -> &RemoteLogSegment { + &self.segment + } +} + +// Total ordering for priority queue (Rust requirement: cmp==Equal implies Eq) +// Primary: Java semantics (timestamp cross-bucket, offset within-bucket) +// Tie-breakers: table_bucket fields (table_id, partition_id, bucket_id), then segment_id +impl Ord for RemoteLogDownloadRequest { + fn cmp(&self, other: &Self) -> Ordering { + if self.segment.table_bucket == other.segment.table_bucket { + // Same bucket: order by start_offset (ascending - earlier segments first) + self.segment + .start_offset + .cmp(&other.segment.start_offset) + .then_with(|| self.segment.segment_id.cmp(&other.segment.segment_id)) + } else { + // Different buckets: order by max_timestamp (ascending - older segments first) + // Then by table_bucket fields for true total ordering + self.segment + .max_timestamp + .cmp(&other.segment.max_timestamp) + .then_with(|| { + self.segment + .table_bucket + .table_id() + .cmp(&other.segment.table_bucket.table_id()) + }) + .then_with(|| { + self.segment + .table_bucket + .partition_id() + .cmp(&other.segment.table_bucket.partition_id()) + }) + .then_with(|| { + self.segment + .table_bucket + .bucket_id() + .cmp(&other.segment.table_bucket.bucket_id()) + }) + .then_with(|| self.segment.segment_id.cmp(&other.segment.segment_id)) + } + } +} + +impl PartialOrd for RemoteLogDownloadRequest { + fn partial_cmp(&self, other: &Self) -> 
Option { + Some(self.cmp(other)) + } +} + +impl PartialEq for RemoteLogDownloadRequest { + fn eq(&self, other: &Self) -> bool { + self.cmp(other) == Ordering::Equal + } +} + +impl Eq for RemoteLogDownloadRequest {} + +/// Result of a download task +enum DownloadResult { + /// Successful download - deliver result to future + Success { + result: RemoteLogFile, + result_sender: oneshot::Sender>, + }, + /// Download failed - re-queue request for retry (Java pattern) + FailedRetry { request: RemoteLogDownloadRequest }, + /// Download failed permanently after max retries - fail the future + FailedPermanently { + error: Error, + result_sender: oneshot::Sender>, + }, + /// Cancelled - don't deliver, don't re-queue + Cancelled, +} + +/// Production implementation of RemoteLogFetcher that downloads from actual storage +struct ProductionFetcher { + credentials_rx: CredentialsReceiver, + local_log_dir: Arc, + remote_log_read_concurrency: usize, +} + +impl RemoteLogFetcher for ProductionFetcher { + fn fetch( + &self, + request: &RemoteLogDownloadRequest, + ) -> Pin> + Send>> { + let mut credentials_rx = self.credentials_rx.clone(); + let local_log_dir = self.local_log_dir.clone(); + let remote_log_read_concurrency = self.remote_log_read_concurrency; + + // Clone data needed for async operation to avoid lifetime issues + let segment = request.segment.clone(); + let remote_log_tablet_dir = request.remote_log_tablet_dir.to_string(); + + Box::pin(async move { + let local_file_name = segment.local_file_name(); + let local_file_path = local_log_dir.path().join(&local_file_name); + + // Build remote path + let offset_prefix = format!("{:020}", segment.start_offset); + let remote_path = format!( + "{}/{}/{}.log", + remote_log_tablet_dir, segment.segment_id, offset_prefix + ); + + // Get credentials from watch channel, waiting if not yet fetched + // - None = not yet fetched, wait + // - Some(props) = fetched (may be empty if no auth needed) + let remote_fs_props = { + let maybe_props = 
credentials_rx.borrow().clone(); + match maybe_props { + Some(props) => props, + None => { + // Credentials not yet fetched, wait for first update + log::info!("Waiting for credentials to be available..."); + // If the sender side has been dropped (e.g. during shutdown), + // this will return an error. Surface that as a proper error + // instead of silently falling back to empty credentials. + if let Err(e) = credentials_rx.changed().await { + let io_err = io::Error::new( + io::ErrorKind::BrokenPipe, + format!( + "credentials manager shut down before credentials were obtained: {e}" + ), + ); + return Err(io_err.into()); + } + // After a successful change notification, credentials should be set. + // If they are still missing, treat this as an error instead of + // defaulting to an empty map (which could break auth flows). + credentials_rx + .borrow() + .clone() + .ok_or_else(|| Error::UnexpectedError { + message: "credentials not available after watch notification" + .to_string(), + source: None, + })? + } + } + }; + + // Download file to disk (streaming, no memory spike) + let file_path = RemoteLogDownloader::download_file( + &remote_log_tablet_dir, + &remote_path, + &local_file_path, + &remote_fs_props, + remote_log_read_concurrency, + ) + .await?; + + // Get file size + let metadata = tokio::fs::metadata(&file_path).await?; + let file_size = metadata.len() as usize; + + // Return file path - file stays on disk until PrefetchPermit is dropped + Ok(FetchResult { + file_path, + file_size, + }) + }) + } +} + +/// Coordinator that owns all download state and orchestrates downloads +struct DownloadCoordinator { + download_queue: BinaryHeap>, + active_downloads: JoinSet, + in_flight: usize, + prefetch_semaphore: Arc, + max_concurrent_downloads: usize, + recycle_notify: Arc, + fetcher: Arc, + /// Per-table scanner metric handles cloned by every spawned download + /// task to attribute remote-fetch metrics to the owning scanner's + /// `(database, table)`. 
+ metrics: Arc, +} + +impl DownloadCoordinator { + /// Check if we should wait for recycle notification + /// Only wait if we're blocked on permits AND have pending work + fn should_wait_for_recycle(&self) -> bool { + !self.download_queue.is_empty() + && self.in_flight < self.max_concurrent_downloads + && self.prefetch_semaphore.available_permits() == 0 + } + + /// Find the earliest retry deadline among pending requests + fn next_retry_deadline(&self) -> Option { + self.download_queue + .iter() + .filter_map(|Reverse(req)| req.next_retry_at) + .min() + } +} + +impl DownloadCoordinator { + /// Try to start as many downloads as possible (event-driven drain) + fn drain(&mut self) { + // Collect deferred requests (backoff not ready) to push back later + let mut deferred = Vec::new(); + // Scan entire queue once to find ready requests (prevents head-of-line blocking) + // Bound to reasonable max to avoid excessive work if queue is huge + let max_scan = self.download_queue.len().min(100); + let mut scanned = 0; + + while !self.download_queue.is_empty() + && self.in_flight < self.max_concurrent_downloads + && scanned < max_scan + { + // Try acquire prefetch permit (non-blocking) + let permit = match self.prefetch_semaphore.clone().try_acquire_owned() { + Ok(p) => p, + Err(_) => break, // No permits available + }; + + // Pop highest priority request + let Some(Reverse(request)) = self.download_queue.pop() else { + drop(permit); + break; + }; + + scanned += 1; + + // Retry backoff check: defer if retry time hasn't arrived yet + if let Some(next_retry_at) = request.next_retry_at { + let now = tokio::time::Instant::now(); + if next_retry_at > now { + // Not ready for retry yet - defer and continue looking for ready requests + drop(permit); + deferred.push(request); + continue; // Don't block - keep looking for ready requests + } + } + + // Cancellation check: skip if sender closed + if request.result_sender.is_closed() { + drop(permit); + continue; // Try next request + } + + 
// Clone data for the spawned task + let fetcher = self.fetcher.clone(); + let recycle_notify = self.recycle_notify.clone(); + let metrics = Arc::clone(&self.metrics); + + // Spawn download task + self.active_downloads.spawn(async move { + spawn_download_task(request, permit, fetcher, recycle_notify, metrics).await + }); + self.in_flight += 1; + } + + // Push deferred requests back to queue (maintains priority order) + if !deferred.is_empty() { + for req in deferred { + self.download_queue.push(Reverse(req)); + } + } + } +} + +/// Spawn a download task that attempts download once +/// Matches Java's RemoteLogDownloader.java +/// +/// Benefits over infinite in-place retry: +/// - Failed downloads don't block prefetch slots +/// - Other segments can make progress while one is failing +/// - Natural retry through coordinator re-picking from queue +async fn spawn_download_task( + request: RemoteLogDownloadRequest, + permit: tokio::sync::OwnedSemaphorePermit, + fetcher: Arc, + recycle_notify: Arc, + metrics: Arc, +) -> DownloadResult { + // Check if receiver still alive (early cancellation check) + if request.result_sender.is_closed() { + drop(permit); + return DownloadResult::Cancelled; + } + + // Java reference: RemoteLogDownloader.java increments `remoteFetchRequestCount` + // immediately before initiating the download. Each retry of the same segment + // counts as a separate request (matches Java behavior). 
+ metrics.record_remote_fetch_request(); + + // Try download ONCE + let download_result = fetcher.fetch(&request).await; + + match download_result { + Ok(fetch_result) => { + // Success - permit will be released on drop (FileSource handles file deletion) + metrics.record_remote_fetch_bytes(fetch_result.file_size as u64); + DownloadResult::Success { + result: RemoteLogFile { + file_path: fetch_result.file_path, + file_size: fetch_result.file_size, + permit: PrefetchPermit::new(permit, recycle_notify.clone()), + }, + result_sender: request.result_sender, + } + } + Err(_e) if request.result_sender.is_closed() => { + // Receiver dropped (cancelled) - release permit, don't re-queue + drop(permit); + DownloadResult::Cancelled + } + Err(e) => { + // Download failed - check if we should retry or give up + // Counted per attempt, so retries each contribute one error. + metrics.record_remote_fetch_error(); + let retry_count = request.retry_count + 1; + + if retry_count > MAX_RETRY_COUNT { + // Too many retries - give up and fail the future + log::error!( + "Failed to download remote log segment {} after {} retries: {}. Giving up.", + request.segment.segment_id, + retry_count, + e + ); + drop(permit); // Release immediately + + DownloadResult::FailedPermanently { + error: Error::UnexpectedError { + message: format!( + "Failed to download remote log segment after {retry_count} retries: {e}" + ), + source: Some(Box::new(e)), + }, + result_sender: request.result_sender, + } + } else { + // Retry with exponential backoff + let backoff_delay = calculate_backoff_delay(retry_count); + let next_retry_at = tokio::time::Instant::now() + backoff_delay; + + log::warn!( + "Failed to download remote log segment {}: {}. Retry {}/{} after {:?}", + request.segment.segment_id, + e, + retry_count, + MAX_RETRY_COUNT, + backoff_delay + ); + drop(permit); // Release immediately - critical! 
+ + // Update retry state + let mut retry_request = request; + retry_request.retry_count = retry_count; + retry_request.next_retry_at = Some(next_retry_at); + + // Re-queue request to same priority queue + // Future stays with request, NOT completed - will complete on successful retry + DownloadResult::FailedRetry { + request: retry_request, + } + } + } + } +} + +/// Coordinator event loop - owns all download state and reacts to events +async fn coordinator_loop( + mut coordinator: DownloadCoordinator, + mut request_receiver: mpsc::UnboundedReceiver, +) { + loop { + // Drain once at start of iteration to process ready work + coordinator.drain(); + + // Calculate sleep duration until next retry (if any deferred requests) + let next_retry_sleep = coordinator.next_retry_deadline().map(|deadline| { + let now = tokio::time::Instant::now(); + if deadline > now { + deadline - now + } else { + tokio::time::Duration::from_millis(0) // Ready now + } + }); + + tokio::select! { + // Event 1: NewRequest + Some(request) = request_receiver.recv() => { + coordinator.download_queue.push(Reverse(request)); + // Immediately try to start this download + continue; + } + + // Event 2: DownloadFinished + Some(result) = coordinator.active_downloads.join_next() => { + coordinator.in_flight -= 1; + + match result { + Ok(DownloadResult::Success { result, result_sender }) => { + // Success - deliver result to future + if !result_sender.is_closed() { + let _ = result_sender.send(Ok(result)); + } + // Permit held in RemoteLogFile until consumed + } + Ok(DownloadResult::FailedRetry { request }) => { + // Re-queue immediately (don't block coordinator with sleep) + // The retry time will be checked in drain() before processing + // (Java line 177: segmentsToFetch.add(request)) + // Permit already released (Java line 174) + coordinator.download_queue.push(Reverse(request)); + } + Ok(DownloadResult::FailedPermanently { error, result_sender }) => { + // Permanent failure - deliver error to future + 
if !result_sender.is_closed() { + let _ = result_sender.send(Err(error)); + } + // Permit already released + } + Ok(DownloadResult::Cancelled) => { + // Cancelled - permit already released, nothing to do + } + Err(e) => { + log::error!("Download task panicked: {e:?}"); + // Permit already released via RAII + } + } + // Immediately try to start another download + continue; + } + + // Event 3: Recycled (only wait when blocked on permits with pending work) + _ = coordinator.recycle_notify.notified(), + if coordinator.should_wait_for_recycle() => { + // Wake up to try draining + continue; + } + + // Event 4: Retry timer - wake up when next retry is ready + _ = tokio::time::sleep(next_retry_sleep.unwrap_or(tokio::time::Duration::from_secs(3600))), + if next_retry_sleep.is_some() => { + // Wake up to retry deferred requests + continue; + } + + else => break, // All channels closed AND no work pending + } + } +} + +type CompletionCallback = Box; + +/// Future for a remote log download request +pub struct RemoteLogDownloadFuture { + result: Arc>>>, + completion_callbacks: Arc>>, +} + +impl RemoteLogDownloadFuture { + pub fn new(receiver: oneshot::Receiver>) -> Self { + let result = Arc::new(Mutex::new(None)); + let result_clone = Arc::clone(&result); + let completion_callbacks: Arc>> = + Arc::new(Mutex::new(Vec::new())); + let callbacks_clone = Arc::clone(&completion_callbacks); + + // Spawn a task to wait for the download and update result, then call callbacks + tokio::spawn(async move { + let download_result = match receiver.await { + Ok(Ok(path)) => Ok(path), + Ok(Err(e)) => Err(e), + Err(e) => Err(Error::UnexpectedError { + message: format!("Download & Read future cancelled: {e:?}"), + source: None, + }), + }; + + *result_clone.lock() = Some(download_result); + + // Call all registered callbacks + // We need to take the callbacks to avoid holding the lock while calling them + // This also ensures that any callbacks registered after this point will be called immediately 
+ let callbacks: Vec = { + let mut callbacks_guard = callbacks_clone.lock(); + mem::take(&mut *callbacks_guard) + }; + for callback in callbacks { + callback(); + } + + // After calling callbacks, any new callbacks registered will see is_done() == true + // and will be called immediately in on_complete() + }); + + Self { + result, + completion_callbacks, + } + } + + /// Register a callback to be called when download completes (similar to Java's onComplete) + pub fn on_complete(&self, callback: F) + where + F: Fn() + Send + Sync + 'static, + { + // Acquire callbacks lock first to ensure atomicity of the check-and-register operation + let mut callbacks_guard = self.completion_callbacks.lock(); + + // Check completion status while holding the callbacks lock. + // This ensures that: + // 1. If the task completes between checking is_done() and registering the callback, + // we'll see the completion state correctly + // 2. The background task cannot clear the callbacks list while we're checking/registering + let is_done = self.result.lock().is_some(); + + if is_done { + // If already completed, call immediately (drop lock first to avoid deadlock) + drop(callbacks_guard); + callback(); + } else { + // Register the callback while holding the callbacks lock. + // This ensures that even if the background task completes right after we check + // is_done(), it will wait for us to release the lock before taking callbacks. + // When it does take callbacks, it will see our callback in the list and execute it. 
+ callbacks_guard.push(Box::new(callback)); + // Lock is automatically released here + } + } + + pub fn is_done(&self) -> bool { + self.result.lock().is_some() + } + + /// Take the RemoteLogFile (including the permit) from this future + /// This should only be called when the download is complete + /// This is the correct way to consume the download - it transfers permit ownership + pub fn take_remote_log_file(&self) -> Result { + let mut guard = self.result.lock(); + match guard.take() { + Some(Ok(remote_log_file)) => Ok(remote_log_file), + Some(Err(e)) => { + let error_msg = format!("{e}"); + Err(Error::IoUnexpectedError { + message: format!("Fail to get remote log file: {error_msg}"), + source: io::Error::other(error_msg), + }) + } + None => Err(Error::IoUnexpectedError { + message: "Remote log file already taken or not ready".to_string(), + source: io::Error::other("Remote log file already taken or not ready"), + }), + } + } +} + +/// Downloader for remote log segment files. +/// +/// # Shutdown behavior +/// +/// When the downloader is dropped, the request channel closes, signaling the coordinator +/// to stop accepting new work. The coordinator will finish any in-flight downloads but +/// won't wait for completion. Pending futures will fail. +pub struct RemoteLogDownloader { + request_sender: Option>, +} + +impl RemoteLogDownloader { + pub(crate) fn new( + local_log_dir: TempDir, + max_prefetch_segments: usize, + max_concurrent_downloads: usize, + remote_log_read_concurrency: usize, + credentials_rx: CredentialsReceiver, + metrics: Arc, + ) -> Result { + let fetcher = Arc::new(ProductionFetcher { + credentials_rx, + local_log_dir: Arc::new(local_log_dir), + remote_log_read_concurrency, + }); + + Self::new_with_fetcher( + fetcher, + max_prefetch_segments, + max_concurrent_downloads, + metrics, + ) + } + + /// Create a RemoteLogDownloader with a custom fetcher (for testing). 
+ pub(crate) fn new_with_fetcher( + fetcher: Arc, + max_prefetch_segments: usize, + max_concurrent_downloads: usize, + metrics: Arc, + ) -> Result { + let (request_sender, request_receiver) = mpsc::unbounded_channel(); + + let coordinator = DownloadCoordinator { + download_queue: BinaryHeap::new(), + active_downloads: JoinSet::new(), + in_flight: 0, + prefetch_semaphore: Arc::new(Semaphore::new(max_prefetch_segments)), + max_concurrent_downloads, + recycle_notify: Arc::new(Notify::new()), + fetcher, + metrics, + }; + + // Spawn coordinator task - it will exit when request_sender is dropped + tokio::spawn(coordinator_loop(coordinator, request_receiver)); + + Ok(Self { + request_sender: Some(request_sender), + }) + } + + /// Request to fetch a remote log segment to local. This method is non-blocking. + pub fn request_remote_log( + &self, + remote_log_tablet_dir: &str, + segment: &RemoteLogSegment, + ) -> RemoteLogDownloadFuture { + let (result_sender, result_receiver) = oneshot::channel(); + + let request = RemoteLogDownloadRequest { + segment: segment.clone(), + remote_log_tablet_dir: remote_log_tablet_dir.to_string(), + result_sender, + retry_count: 0, + next_retry_at: None, + }; + + // Send to coordinator (non-blocking) + if let Some(ref sender) = self.request_sender { + if sender.send(request).is_err() { + // Coordinator is gone - immediately fail the future + let (error_sender, error_receiver) = oneshot::channel(); + let _ = error_sender.send(Err(Error::UnexpectedError { + message: "RemoteLogDownloader coordinator has shut down".to_string(), + source: None, + })); + return RemoteLogDownloadFuture::new(error_receiver); + } + } + + RemoteLogDownloadFuture::new(result_receiver) + } +} + +impl Drop for RemoteLogDownloader { + fn drop(&mut self) { + // Drop the request sender to signal coordinator shutdown. + // This causes request_receiver.recv() to return None, allowing the + // coordinator to exit gracefully after processing pending work. 
+ // The coordinator task will finish on its own when it sees the channel closed. + drop(self.request_sender.take()); + } +} + +impl RemoteLogDownloader { + /// Download a file from remote storage to local using streaming read/write. + async fn download_file( + remote_log_tablet_dir: &str, + remote_path: &str, + local_path: &Path, + remote_fs_props: &HashMap, + remote_log_read_concurrency: usize, + ) -> Result { + // Handle both URL (e.g., "s3://bucket/path") and local file paths + // If the path doesn't contain "://", treat it as a local file path + let remote_log_tablet_dir_url = if remote_log_tablet_dir.contains("://") { + remote_log_tablet_dir.to_string() + } else { + format!("file://{remote_log_tablet_dir}") + }; + + // Create FileIO from the remote log tablet dir URL to get the storage + let file_io_builder = FileIO::from_url(&remote_log_tablet_dir_url)?; + + // For S3/S3A URLs, inject S3 credentials from props + let file_io_builder = if remote_log_tablet_dir.starts_with("s3://") + || remote_log_tablet_dir.starts_with("s3a://") + || remote_log_tablet_dir.starts_with("oss://") + { + file_io_builder.with_props( + remote_fs_props + .iter() + .map(|(k, v)| (k.as_str(), v.as_str())), + ) + } else { + file_io_builder + }; + + // Build storage and create operator directly + let storage = Storage::build(file_io_builder)?; + let (op, relative_path) = storage.create(remote_path)?; + + // Timeout for remote storage operations (30 seconds) + const REMOTE_OP_TIMEOUT: Duration = Duration::from_secs(30); + const CHUNK_SIZE: usize = 8 * 1024 * 1024; // 8MiB + + Self::download_file_streaming( + &op, + relative_path, + remote_path, + local_path, + CHUNK_SIZE, + remote_log_read_concurrency, + REMOTE_OP_TIMEOUT, + ) + .await?; + + Ok(local_path.to_path_buf()) + } + + async fn download_file_streaming( + op: &opendal::Operator, + relative_path: &str, + remote_path: &str, + local_path: &Path, + chunk_size: usize, + streaming_read_concurrency: usize, + remote_op_timeout: Duration, + 
) -> Result<()> { + let mut local_file = tokio::fs::File::create(local_path).await?; + + let reader_future = op + .reader_with(relative_path) + .chunk(chunk_size) + .concurrent(streaming_read_concurrency); + let reader = tokio::time::timeout(remote_op_timeout, reader_future) + .await + .map_err(|e| Error::IoUnexpectedError { + message: format!("Timeout creating streaming reader for {remote_path}: {e}."), + source: io::ErrorKind::TimedOut.into(), + })??; + + let mut stream = tokio::time::timeout(remote_op_timeout, reader.into_bytes_stream(..)) + .await + .map_err(|e| Error::IoUnexpectedError { + message: format!("Timeout creating streaming bytes stream for {remote_path}: {e}."), + source: io::ErrorKind::TimedOut.into(), + })??; + + let mut chunk_count = 0u64; + while let Some(chunk) = tokio::time::timeout(remote_op_timeout, stream.try_next()) + .await + .map_err(|e| Error::IoUnexpectedError { + message: format!( + "Timeout streaming chunk from remote storage: {remote_path}, exception: {e}." + ), + source: io::ErrorKind::TimedOut.into(), + })?? + { + chunk_count += 1; + if chunk_count <= 3 || chunk_count % 10 == 0 { + log::debug!("Remote log streaming download: chunk #{chunk_count} ({remote_path})"); + } + local_file.write_all(&chunk).await?; + } + + local_file.sync_all().await?; + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::metadata::TablePath; + use crate::test_utils::test_scanner_metrics; + use std::sync::atomic::{AtomicUsize, Ordering}; + + /// Helper function to create a TableBucket for testing + fn create_table_bucket(table_id: i64, bucket_id: i32) -> TableBucket { + TableBucket::new(table_id, bucket_id) + } + + /// `ScannerMetrics` instance shared across the local test fixtures. The + /// labels are arbitrary because none of the tests in this module install + /// a metrics recorder; the metrics just need to exist for the API + /// surface. 
+ fn metrics() -> Arc { + test_scanner_metrics(&TablePath::new("db", "tbl")) + } + + /// Simplified fake fetcher for testing + struct FakeFetcher { + completion_gate: Arc, + in_flight: Arc, + max_seen_in_flight: Arc, + fail_count: Arc>, + auto_complete: bool, + } + + impl FakeFetcher { + fn new(fail_count: usize, auto_complete: bool) -> Self { + Self { + completion_gate: Arc::new(Notify::new()), + in_flight: Arc::new(AtomicUsize::new(0)), + max_seen_in_flight: Arc::new(AtomicUsize::new(0)), + fail_count: Arc::new(Mutex::new(fail_count)), + auto_complete, + } + } + + fn max_seen_in_flight(&self) -> usize { + self.max_seen_in_flight.load(Ordering::SeqCst) + } + + fn in_flight(&self) -> usize { + self.in_flight.load(Ordering::SeqCst) + } + + fn release_one(&self) { + self.completion_gate.notify_one(); + } + + fn release_all(&self) { + self.completion_gate.notify_waiters(); + } + } + + impl RemoteLogFetcher for FakeFetcher { + fn fetch( + &self, + request: &RemoteLogDownloadRequest, + ) -> Pin> + Send>> { + let gate = self.completion_gate.clone(); + let in_flight = self.in_flight.clone(); + let max_seen = self.max_seen_in_flight.clone(); + let fail_count = self.fail_count.clone(); + let segment_id = request.segment().segment_id.clone(); + let auto_complete = self.auto_complete; + + Box::pin(async move { + // Track in-flight + let current = in_flight.fetch_add(1, Ordering::SeqCst) + 1; + max_seen.fetch_max(current, Ordering::SeqCst); + + // Wait for gate (or auto-complete) + if !auto_complete { + gate.notified().await; + } else { + tokio::task::yield_now().await; + } + + // Check if should fail + let should_fail = { + let mut count = fail_count.lock(); + if *count > 0 { + *count -= 1; + true + } else { + false + } + }; + + in_flight.fetch_sub(1, Ordering::SeqCst); + + if should_fail { + Err(Error::UnexpectedError { + message: format!("Fake fetch failed for {segment_id}"), + source: None, + }) + } else { + let fake_data = vec![1, 2, 3, 4]; + let temp_dir = 
env::temp_dir(); + let timestamp = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_nanos(); + let file_path = + temp_dir.join(format!("fake_segment_{segment_id}_{timestamp}.log")); + tokio::fs::write(&file_path, &fake_data).await?; + + Ok(FetchResult { + file_path, + file_size: fake_data.len(), + }) + } + }) + } + } + + /// Helper function to create a RemoteLogSegment for testing + fn create_segment( + segment_id: &str, + start_offset: i64, + max_timestamp: i64, + table_bucket: TableBucket, + ) -> RemoteLogSegment { + RemoteLogSegment { + segment_id: segment_id.to_string(), + start_offset, + end_offset: start_offset + 1000, + size_in_bytes: 1024, + table_bucket, + max_timestamp, + } + } + + /// Helper function to create a RemoteLogDownloadRequest for testing + fn create_request(segment: RemoteLogSegment) -> RemoteLogDownloadRequest { + let (result_sender, _) = oneshot::channel(); + RemoteLogDownloadRequest { + remote_log_tablet_dir: "test_dir".to_string(), + segment, + result_sender, + retry_count: 0, + next_retry_at: None, + } + } + + #[test] + fn test_priority_ordering_matching_java_test_case() { + // Test priority ordering: timestamp across buckets, offset within bucket + // Does NOT test tie-breakers (segment_id) - those are implementation details + + let bucket1 = create_table_bucket(1, 0); + let bucket2 = create_table_bucket(1, 1); + let bucket3 = create_table_bucket(1, 2); + let bucket4 = create_table_bucket(1, 3); + + // Create segments with distinct timestamps/offsets (no ties) + let seg_negative = create_segment("seg_neg", 0, -1, bucket1.clone()); + let seg_zero = create_segment("seg_zero", 0, 0, bucket2.clone()); + let seg_1000 = create_segment("seg_1000", 0, 1000, bucket3.clone()); + let seg_2000 = create_segment("seg_2000", 0, 2000, bucket4.clone()); + let seg_same_bucket_100 = create_segment("seg_sb_100", 100, 5000, bucket1.clone()); + let seg_same_bucket_50 = create_segment("seg_sb_50", 50, 5000, bucket1.clone()); + + let mut heap = 
BinaryHeap::new(); + heap.push(Reverse(create_request(seg_2000))); + heap.push(Reverse(create_request(seg_same_bucket_100))); + heap.push(Reverse(create_request(seg_1000))); + heap.push(Reverse(create_request(seg_zero))); + heap.push(Reverse(create_request(seg_negative))); + heap.push(Reverse(create_request(seg_same_bucket_50))); + + // Verify ordering by timestamp/offset, not segment_id + let first = heap.pop().unwrap().0; + assert_eq!(first.segment.max_timestamp, -1, "Lowest timestamp first"); + + let second = heap.pop().unwrap().0; + assert_eq!(second.segment.max_timestamp, 0); + + let third = heap.pop().unwrap().0; + assert_eq!(third.segment.max_timestamp, 1000); + + let fourth = heap.pop().unwrap().0; + assert_eq!(fourth.segment.max_timestamp, 2000); + + // Last two are same bucket (ts=5000), ordered by offset + let fifth = heap.pop().unwrap().0; + assert_eq!(fifth.segment.max_timestamp, 5000); + assert_eq!( + fifth.segment.start_offset, 50, + "Lower offset first within bucket" + ); + + let sixth = heap.pop().unwrap().0; + assert_eq!(sixth.segment.max_timestamp, 5000); + assert_eq!(sixth.segment.start_offset, 100); + } + + #[tokio::test] + async fn test_concurrency_and_priority() { + // Test concurrency limiting and priority-based scheduling together + let fake_fetcher = Arc::new(FakeFetcher::new(0, false)); // Manual control + + let downloader = RemoteLogDownloader::new_with_fetcher( + fake_fetcher.clone(), + 10, // High prefetch limit + 2, // Max concurrent downloads = 2 + metrics(), + ) + .unwrap(); + + let bucket = create_table_bucket(1, 0); + + // Request 4 segments with same priority (to isolate concurrency limiting from priority) + let segs: Vec<_> = (0..4) + .map(|i| create_segment(&format!("seg{i}"), i * 100, 1000, bucket.clone())) + .collect(); + + let _futures: Vec<_> = segs + .iter() + .map(|seg| downloader.request_remote_log("dir", seg)) + .collect(); + + // Wait for exactly 2 to start + tokio::time::sleep(Duration::from_millis(50)).await; + 
assert_eq!( + fake_fetcher.in_flight(), + 2, + "Concurrency limit: exactly 2 should be in-flight" + ); + + // Release one + fake_fetcher.release_one(); + tokio::time::sleep(Duration::from_millis(50)).await; + + // Max should never exceed 2 + assert_eq!( + fake_fetcher.max_seen_in_flight(), + 2, + "Max concurrent should not exceed 2" + ); + + // Release all + fake_fetcher.release_all(); + } + + #[tokio::test] + async fn test_prefetch_limit() { + // Test that prefetch semaphore limits outstanding downloads + let fake_fetcher = Arc::new(FakeFetcher::new(0, true)); // Auto-complete + + let downloader = RemoteLogDownloader::new_with_fetcher( + fake_fetcher, + 2, // Max prefetch = 2 + 10, // High concurrent limit + metrics(), + ) + .unwrap(); + + let bucket = create_table_bucket(1, 0); + + // Request 4 downloads + let segs: Vec<_> = (0..4) + .map(|i| create_segment(&format!("seg{i}"), i * 100, 1000, bucket.clone())) + .collect(); + + let mut futures: Vec<_> = segs + .iter() + .map(|seg| downloader.request_remote_log("dir", seg)) + .collect(); + + // Wait for first 2 to complete + let deadline = tokio::time::Instant::now() + Duration::from_secs(2); + loop { + if futures.iter().filter(|f| f.is_done()).count() >= 2 { + break; + } + if tokio::time::Instant::now() > deadline { + panic!("Timeout waiting for first 2 downloads"); + } + tokio::time::sleep(Duration::from_millis(10)).await; + } + + // Verify 3rd and 4th are blocked (prefetch limit) + tokio::time::sleep(Duration::from_millis(50)).await; + assert_eq!( + futures.iter().filter(|f| f.is_done()).count(), + 2, + "Prefetch limit: only 2 should complete" + ); + + // Drop first 2 (releases permits) + let f4 = futures.pop().unwrap(); + let f3 = futures.pop().unwrap(); + drop(futures); + + // 3rd and 4th should now complete + let deadline = tokio::time::Instant::now() + Duration::from_secs(2); + loop { + if f3.is_done() && f4.is_done() { + break; + } + if tokio::time::Instant::now() > deadline { + panic!("Timeout after permit 
release"); + } + tokio::time::sleep(Duration::from_millis(10)).await; + } + } + + #[tokio::test] + async fn test_retry_and_cancellation() { + // Test retry with exponential backoff + let fake_fetcher = Arc::new(FakeFetcher::new(2, true)); // Fail twice, succeed third time + + let downloader = + RemoteLogDownloader::new_with_fetcher(fake_fetcher.clone(), 10, 1, metrics()).unwrap(); + + let bucket = create_table_bucket(1, 0); + let seg = create_segment("seg1", 0, 1000, bucket); + + let future = downloader.request_remote_log("dir", &seg); + + // Should succeed after retries + let deadline = tokio::time::Instant::now() + Duration::from_secs(5); + loop { + if future.is_done() { + break; + } + if tokio::time::Instant::now() > deadline { + panic!("Timeout waiting for retry to succeed"); + } + tokio::time::sleep(Duration::from_millis(50)).await; + } + + assert!(future.is_done(), "Should succeed after retries"); + + // Test cancellation + let seg2 = create_segment("seg2", 100, 1000, create_table_bucket(1, 0)); + let fake_fetcher2 = Arc::new(FakeFetcher::new(100, true)); // Fail forever + let downloader2 = + RemoteLogDownloader::new_with_fetcher(fake_fetcher2.clone(), 10, 1, metrics()).unwrap(); + + let future2 = downloader2.request_remote_log("dir", &seg2); + tokio::time::sleep(Duration::from_millis(50)).await; + + // Drop to cancel + drop(future2); + tokio::time::sleep(Duration::from_millis(50)).await; + + assert_eq!( + fake_fetcher2.in_flight(), + 0, + "Cancellation should release resources" + ); + } +} diff --git a/fluss-rust/crates/fluss/src/client/table/scanner.rs b/fluss-rust/crates/fluss/src/client/table/scanner.rs new file mode 100644 index 0000000000..f0cb320171 --- /dev/null +++ b/fluss-rust/crates/fluss/src/client/table/scanner.rs @@ -0,0 +1,2763 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::client::ClientSchemaGetter; +use crate::client::connection::FlussConnection; +use crate::client::credentials::SecurityTokenManager; +use crate::client::metadata::Metadata; +use crate::client::table::batch_scanner::LimitBatchScanner; +use crate::client::table::log_fetch_buffer::{ + CompletedFetch, DefaultCompletedFetch, FetchErrorAction, FetchErrorContext, FetchErrorLogLevel, + LogFetchBuffer, RemotePendingFetch, +}; +use crate::client::table::remote_log::{RemoteLogDownloader, RemoteLogFetchInfo}; +use crate::config::Config; +use crate::error::Error::UnsupportedOperation; +use crate::error::{ApiError, Error, FlussError, Result}; +use crate::metadata::{ + LogFormat, PhysicalTablePath, RowType, SchemaInfo, TableBucket, TableInfo, TablePath, +}; +use crate::metrics::ScannerMetrics; +use crate::proto::{ + ErrorResponse, FetchLogRequest, FetchLogResponse, PbFetchLogReqForBucket, PbFetchLogReqForTable, +}; +use crate::record::{ + LogRecordsBatches, ReadContext, ScanBatch, ScanRecord, ScanRecords, to_arrow_schema, +}; +use crate::rpc::{RpcClient, RpcError, message}; +use crate::util::FairBucketStatusMap; +use crate::{PartitionId, TableId}; +use arrow_schema::SchemaRef; +use log::{debug, warn}; +use parking_lot::{Mutex, RwLock}; +use prost::Message; +use std::{ + 
collections::{HashMap, HashSet}, + slice::from_ref, + sync::Arc, + time::{Duration, Instant}, +}; +use tempfile::TempDir; + +pub struct TableScan<'a> { + conn: &'a FlussConnection, + table_info: TableInfo, + metadata: Arc, + /// Column indices to project. None means all columns, Some(vec) means only the specified columns (non-empty). + projected_fields: Option>, + /// Optional row limit. When set, callers may construct a [`BatchScanner`] for a one-shot bounded scan. + limit: Option, +} + +impl<'a> TableScan<'a> { + pub fn new(conn: &'a FlussConnection, table_info: TableInfo, metadata: Arc) -> Self { + Self { + conn, + table_info, + metadata, + projected_fields: None, + limit: None, + } + } + + /// Sets a row limit for the scan, enabling [`Self::create_bucket_batch_scanner`]. + /// + /// The limit must be positive. A limit is incompatible with the log + /// scanners, which reject it. + pub fn limit(mut self, n: i32) -> Result { + if n <= 0 { + return Err(Error::IllegalArgument { + message: format!("Scan limit must be positive, got {n}"), + }); + } + self.limit = Some(n); + Ok(self) + } + + /// Log scanners don't support limit pushdown; reject a configured limit + /// rather than silently ignoring it. + fn reject_limit(&self, scanner: &str) -> Result<()> { + if let Some(limit) = self.limit { + return Err(Error::UnsupportedOperation { + message: format!( + "{scanner} doesn't support limit pushdown. Table: {}, requested limit: {limit}", + self.table_info.table_path + ), + }); + } + Ok(()) + } + + /// Creates a one-shot bounded scan of `table_bucket`. + /// + /// Requires a previously-configured limit via [`Self::limit`]. Creation is + /// cheap; the `LimitScanRequest` runs on the first + /// [`LimitBatchScanner::next_batch`]. 
+ pub fn create_bucket_batch_scanner( + self, + table_bucket: TableBucket, + ) -> Result { + let limit = self.limit.ok_or_else(|| Error::IllegalArgument { + message: "create_bucket_batch_scanner requires a limit configured via .limit(n)" + .to_string(), + })?; + if table_bucket.table_id() != self.table_info.table_id { + return Err(Error::IllegalArgument { + message: format!( + "Bucket table_id {} does not match scan table_id {}", + table_bucket.table_id(), + self.table_info.table_id + ), + }); + } + let num_buckets = self.table_info.get_num_buckets(); + if table_bucket.bucket_id() < 0 || table_bucket.bucket_id() >= num_buckets { + return Err(Error::IllegalArgument { + message: format!( + "Bucket id {} out of range for table with {num_buckets} buckets", + table_bucket.bucket_id() + ), + }); + } + // Log tables decode as Arrow IPC, so only ARROW format is supported (KV + // tables use the value-record path and are exempt). + if !self.table_info.has_primary_key() { + validate_scan_support(&self.table_info.table_path, &self.table_info)?; + } + // Pre-seed the current schema; older versions are fetched lazily during + // KV decode. Mirrors `Table::new_lookup`. + let latest = SchemaInfo::new( + self.table_info.get_schema().clone(), + self.table_info.get_schema_id(), + ); + let schema_getter = Arc::new(ClientSchemaGetter::new( + self.table_info.table_path.clone(), + self.conn.get_admin()?, + latest, + )); + Ok(LimitBatchScanner::new( + self.conn.get_connections(), + self.metadata.clone(), + self.table_info, + schema_getter, + self.projected_fields, + table_bucket, + limit, + )) + } + + /// Projects the scan to only include specified columns by their indices. + /// + /// # Arguments + /// * `column_indices` - Zero-based indices of columns to include in the scan + /// + /// # Errors + /// Returns an error if `column_indices` is empty or if any column index is out of range. 
+ /// + /// # Example + /// ``` + /// # use fluss::client::FlussConnection; + /// # use fluss::config::Config; + /// # use fluss::error::Result; + /// # use fluss::metadata::{DataTypes, Schema, TableDescriptor, TablePath}; + /// # use fluss::row::InternalRow; + /// # use std::time::Duration; + /// + /// # pub async fn example() -> Result<()> { + /// let mut config = Config::default(); + /// config.bootstrap_servers = "127.0.0.1:9123".to_string(); + /// let conn = FlussConnection::new(config).await?; + /// + /// let table_descriptor = TableDescriptor::builder() + /// .schema( + /// Schema::builder() + /// .column("col1", DataTypes::int()) + /// .column("col2", DataTypes::string()) + /// .column("col3", DataTypes::string()) + /// .column("col4", DataTypes::string()) + /// .build()?, + /// ).build()?; + /// let table_path = TablePath::new("fluss".to_owned(), "rust_test_long".to_owned()); + /// let admin = conn.get_admin()?; + /// admin.create_table(&table_path, &table_descriptor, true) + /// .await?; + /// let table_info = admin.get_table_info(&table_path).await?; + /// let table = conn.get_table(&table_path).await?; + /// + /// // Project columns by indices + /// let scanner = table.new_scan().project(&[0, 2, 3])?.create_log_scanner()?; + /// let scan_records = scanner.poll(Duration::from_secs(10)).await?; + /// for record in scan_records { + /// let row = record.row(); + /// println!( + /// "{{{}, {}, {}}}@{}", + /// row.get_int(0)?, + /// row.get_string(2)?, + /// row.get_string(3)?, + /// record.offset() + /// ); + /// } + /// # Ok(()) + /// # } + /// ``` + pub fn project(mut self, column_indices: &[usize]) -> Result { + if column_indices.is_empty() { + return Err(Error::IllegalArgument { + message: "Column indices cannot be empty".to_string(), + }); + } + let field_count = self.table_info.row_type().fields().len(); + for &idx in column_indices { + if idx >= field_count { + return Err(Error::IllegalArgument { + message: format!( + "Column index {} out of range 
(max: {})", + idx, + field_count - 1 + ), + }); + } + } + self.projected_fields = Some(column_indices.to_vec()); + Ok(self) + } + + /// Projects the scan to only include specified columns by their names. + /// + /// # Arguments + /// * `column_names` - Names of columns to include in the scan + /// + /// # Errors + /// Returns an error if `column_names` is empty or if any column name is not found in the table schema. + /// + /// # Example + /// ``` + /// # use fluss::client::FlussConnection; + /// # use fluss::config::Config; + /// # use fluss::error::Result; + /// # use fluss::metadata::{DataTypes, Schema, TableDescriptor, TablePath}; + /// # use fluss::row::InternalRow; + /// # use std::time::Duration; + /// + /// # pub async fn example() -> Result<()> { + /// let mut config = Config::default(); + /// config.bootstrap_servers = "127.0.0.1:9123".to_string(); + /// let conn = FlussConnection::new(config).await?; + /// + /// let table_descriptor = TableDescriptor::builder() + /// .schema( + /// Schema::builder() + /// .column("col1", DataTypes::int()) + /// .column("col2", DataTypes::string()) + /// .column("col3", DataTypes::string()) + /// .build()?, + /// ).build()?; + /// let table_path = TablePath::new("fluss".to_owned(), "rust_test_long".to_owned()); + /// let admin = conn.get_admin()?; + /// admin.create_table(&table_path, &table_descriptor, true) + /// .await?; + /// let table = conn.get_table(&table_path).await?; + /// + /// // Project columns by column names + /// let scanner = table.new_scan().project_by_name(&["col1", "col3"])?.create_log_scanner()?; + /// let scan_records = scanner.poll(Duration::from_secs(10)).await?; + /// for record in scan_records { + /// let row = record.row(); + /// println!( + /// "{{{}, {}}}@{}", + /// row.get_int(0)?, + /// row.get_string(1)?, + /// record.offset() + /// ); + /// } + /// # Ok(()) + /// # } + /// ``` + pub fn project_by_name(mut self, column_names: &[&str]) -> Result { + if column_names.is_empty() { + return 
Err(Error::IllegalArgument { + message: "Column names cannot be empty".to_string(), + }); + } + let row_type = self.table_info.row_type(); + let mut indices = Vec::new(); + + for name in column_names { + let idx = row_type + .fields() + .iter() + .position(|f| f.name() == *name) + .ok_or_else(|| Error::IllegalArgument { + message: format!("Column '{name}' not found"), + })?; + indices.push(idx); + } + + self.projected_fields = Some(indices); + Ok(self) + } + + pub fn create_log_scanner(self) -> Result { + self.reject_limit("LogScanner")?; + validate_scan_support(&self.table_info.table_path, &self.table_info)?; + let inner = LogScannerInner::new( + &self.table_info, + self.metadata.clone(), + self.conn.get_connections(), + self.conn.config(), + self.projected_fields, + )?; + Ok(LogScanner { + inner: Arc::new(inner), + }) + } + + pub fn create_record_batch_log_scanner(self) -> Result { + self.reject_limit("RecordBatchLogScanner")?; + validate_scan_support(&self.table_info.table_path, &self.table_info)?; + let inner = LogScannerInner::new( + &self.table_info, + self.metadata.clone(), + self.conn.get_connections(), + self.conn.config(), + self.projected_fields, + )?; + Ok(RecordBatchLogScanner { + inner: Arc::new(inner), + }) + } +} + +/// Scanner for reading log records one at a time with per-record metadata. +/// +/// Use this scanner when you need access to individual record offsets and timestamps. +/// For batch-level access, use [`RecordBatchLogScanner`] instead. +pub struct LogScanner { + inner: Arc, +} + +/// Scanner for reading log data as Arrow RecordBatches. +/// +/// More efficient than [`LogScanner`] for batch-level analytics where per-record +/// metadata (offsets, timestamps) is not needed. +/// +/// This type is intentionally **not** `Clone`. To perform a bounded read, move +/// the scanner into a [`crate::client::RecordBatchLogReader`] — the compiler +/// then prevents concurrent polls by construction. 
+pub struct RecordBatchLogScanner { + inner: Arc, +} + +/// Private shared implementation for both scanner types +struct LogScannerInner { + table_path: TablePath, + table_id: TableId, + metadata: Arc, + log_scanner_status: Arc, + log_fetcher: LogFetcher, + is_partitioned_table: bool, + arrow_schema: SchemaRef, + /// Guards against subscription changes while a + /// [`crate::client::RecordBatchLogReader`] is iterating. + reader_active: std::sync::atomic::AtomicBool, + /// Holds the snapshot fields used by [`PollGuard`] to derive the + /// scanner poll-timing metrics. The mutex makes the state updates + /// in `record_poll_start` / `record_poll_end` atomic; metric + /// emission and `log::warn!` calls happen after the lock is + /// released. The start↔end pairing depends on the single-consumer + /// contract documented on [`LogScanner::poll`] and + /// [`RecordBatchLogScanner::poll`] (mirrors Java's + /// `LogScannerImpl.acquire()`). Overlapping polls on the same + /// scanner trip a `debug_assert!` in `record_poll_start` (debug + /// builds) or emit a `log::warn!` (release builds). + poll_state: Mutex, + /// Per-table scanner metric handles, pre-bound with `database`/`table` + /// labels. + metrics: Arc, +} + +/// Snapshot state used to derive the scanner poll-timing metrics. +/// +/// The mutex makes the state updates in `record_poll_start` / +/// `record_poll_end` atomic with respect to themselves; metric +/// emission (`metrics::gauge!(...).set(...)`) and `log::warn!` calls +/// happen after the lock is released so a user-installed recorder or +/// logger cannot stall the critical section. The mutex does **not** by +/// itself preserve start↔end pairing across overlapping `poll()` calls +/// — that invariant relies on the single-consumer contract that +/// mirrors Java's `LogScannerImpl.acquire()`. 
Concurrent polls on the +/// same scanner are detected by a `debug_assert!` in +/// `record_poll_start` (panics in debug / tests) and a `log::warn!` on +/// both anomalous paths (`record_poll_start` sees a stale `Some`; +/// `record_poll_end` sees `None`) for release-build observability. +#[derive(Default, Debug)] +struct PollState { + /// Instant captured at the most recent `record_poll_start()`. `None` + /// before the first poll. + last_poll_at: Option, + /// Instant captured at the start of the in-flight poll. `None` after + /// the last `record_poll_end()`. + poll_start_at: Option, + /// Cached ms between the two most recent poll starts, used to compute + /// `poll_idle_ratio` in `record_poll_end`. + time_between_poll_ms: f64, +} + +/// Pairs `record_poll_start` with `record_poll_end`. Created +/// at the top of `poll_records` / `poll_batches`; `record_poll_end` runs on +/// drop, including the cancellation path (caller drops the future). +struct PollGuard<'a> { + inner: &'a LogScannerInner, +} + +impl<'a> PollGuard<'a> { + fn new(inner: &'a LogScannerInner) -> Self { + inner.record_poll_start(); + Self { inner } + } +} + +impl Drop for PollGuard<'_> { + fn drop(&mut self) { + self.inner.record_poll_end(); + } +} + +impl LogScannerInner { + fn new( + table_info: &TableInfo, + metadata: Arc, + connections: Arc, + config: &Config, + projected_fields: Option>, + ) -> Result { + let log_scanner_status = Arc::new(LogScannerStatus::new()); + + let full_row_type = table_info.get_row_type(); + let arrow_schema = match &projected_fields { + Some(indices) => { + let projected_fields_vec: Vec<_> = indices + .iter() + .map(|&i| full_row_type.fields()[i].clone()) + .collect(); + let projected_row_type = crate::metadata::RowType::new(projected_fields_vec); + to_arrow_schema(&projected_row_type)? 
+ } + None => to_arrow_schema(full_row_type)?, + }; + + let metrics = Arc::new(ScannerMetrics::new(&table_info.table_path)); + Ok(Self { + table_path: table_info.table_path.clone(), + table_id: table_info.table_id, + is_partitioned_table: table_info.is_partitioned(), + metadata: metadata.clone(), + log_scanner_status: log_scanner_status.clone(), + log_fetcher: LogFetcher::new( + table_info.clone(), + connections, + metadata, + log_scanner_status.clone(), + config, + projected_fields, + Arc::clone(&metrics), + )?, + arrow_schema, + reader_active: std::sync::atomic::AtomicBool::new(false), + poll_state: Mutex::new(PollState::default()), + metrics, + }) + } + + fn check_no_active_reader(&self) -> Result<()> { + if self + .reader_active + .load(std::sync::atomic::Ordering::Acquire) + { + return Err(Error::IllegalArgument { + message: "Cannot modify subscriptions while a RecordBatchLogReader is active. \ + Drop the reader first." + .to_string(), + }); + } + Ok(()) + } + + async fn poll_records(&self, timeout: Duration) -> Result { + // Pairs record_poll_start (now) with record_poll_end + // (drop). Runs on every exit, including the cancellation path + // where the caller drops this future. 
+ let _poll_guard = PollGuard::new(self); + let start = Instant::now(); + let deadline = start + timeout; + + loop { + // Try to collect fetches + let fetch_result = self.poll_for_fetches().await?; + + if !fetch_result.is_empty() { + // We have data, send next round of fetches and return + // This enables pipelining while user processes the data + self.log_fetcher.send_fetches().await?; + return Ok(ScanRecords::new(fetch_result)); + } + + // No data available, check if we should wait + let now = Instant::now(); + if now >= deadline { + // Timeout reached, return empty result + return Ok(ScanRecords::new(HashMap::new())); + } + + // Wait for buffer to become non-empty with remaining time + let remaining = deadline - now; + let has_data = self + .log_fetcher + .log_fetch_buffer + .await_not_empty(remaining) + .await?; + + if !has_data { + // Timeout while waiting + return Ok(ScanRecords::new(HashMap::new())); + } + + // Buffer became non-empty, try again + } + } + + /// Records the start of a `poll()` call and emits + /// `SCANNER_TIME_BETWEEN_POLL_MS`. The first poll emits `0.0`, + /// matching Java's `ScannerMetricGroup.recordPollStart` + /// (`timeMsBetweenPoll = lastPollMs != 0L ? pollStartMs - lastPollMs : 0L`). + /// + /// Single-consumer contract: a previous poll must have recorded its + /// end before the next start. Java enforces this with + /// `LogScannerImpl.acquire()` (throws `ConcurrentModificationException`). + /// Rust surfaces violations as: + /// - debug builds: `debug_assert!` panics (caught by tests), + /// - release builds: `log::warn!` + the in-flight `poll_start_at` is + /// overwritten so the metric series keeps moving; the resulting + /// `time_between_poll_ms` / `poll_idle_ratio` values for the + /// overlapping polls are not meaningful until the overlap clears. 
+ fn record_poll_start(&self) { + let now = Instant::now(); + // Compute under the lock; emit the metric outside the critical + // section so a user-installed recorder cannot stall the next poll. + let (between_ms, overlap) = { + let mut state = self.poll_state.lock(); + let overlap = state.poll_start_at.is_some(); + debug_assert!( + !overlap, + "concurrent poll() detected on the same scanner; \ + LogScanner / RecordBatchLogScanner are single-consumer \ + (see LogScannerImpl.acquire() for Java parity)" + ); + let between_ms = match state.last_poll_at { + Some(prev) => now.duration_since(prev).as_secs_f64() * 1000.0, + None => 0.0, + }; + state.time_between_poll_ms = between_ms; + state.last_poll_at = Some(now); + state.poll_start_at = Some(now); + (between_ms, overlap) + }; + if overlap { + warn!( + "concurrent poll() detected on scanner; single-consumer \ + contract violated, poll-timing metrics will be inaccurate \ + until the overlap clears" + ); + } + self.metrics.record_time_between_poll_ms(between_ms); + } + + /// Computes `poll_idle_ratio = poll_time / (poll_time + between_time)`. + /// On the first poll, `between_time` is 0 so the ratio is 1.0 + /// (poll-bound). + /// + /// Orphan call: if no matching `record_poll_start` is in flight, + /// emits a `log::warn!` (single-consumer contract may have been + /// violated, e.g. in release builds where the start-side + /// `debug_assert!` is compiled out) and skips the metric update. + fn record_poll_end(&self) { + let now = Instant::now(); + // Compute under the lock; emit metric / warn outside the critical + // section so neither the user-installed recorder nor the logger + // can stall the next poll. 
+ let (orphan, ratio) = { + let mut state = self.poll_state.lock(); + match state.poll_start_at.take() { + None => (true, None), + Some(start) => { + let poll_time_ms = now.duration_since(start).as_secs_f64() * 1000.0; + let total = poll_time_ms + state.time_between_poll_ms; + let r = (total > 0.0).then_some(poll_time_ms / total); + (false, r) + } + } + }; + if orphan { + warn!( + "record_poll_end called without a matching record_poll_start; \ + single-consumer contract may have been violated, idle ratio \ + for this poll is not emitted" + ); + return; + } + if let Some(r) = ratio { + self.metrics.record_poll_idle_ratio(r); + } + } + + async fn subscribe(&self, bucket: i32, offset: i64) -> Result<()> { + self.check_no_active_reader()?; + if self.is_partitioned_table { + return Err(Error::UnsupportedOperation { + message: "The table is a partitioned table, please use \"subscribe_partition\" to \ + subscribe a partitioned bucket instead." + .to_string(), + }); + } + let table_bucket = TableBucket::new(self.table_id, bucket); + self.metadata + .check_and_update_table_metadata(from_ref(&self.table_path)) + .await?; + self.log_scanner_status + .assign_scan_bucket(table_bucket, offset); + Ok(()) + } + + async fn subscribe_buckets(&self, bucket_offsets: &HashMap) -> Result<()> { + self.check_no_active_reader()?; + if self.is_partitioned_table { + return Err(Error::UnsupportedOperation { + message: + "The table is a partitioned table, please use \"subscribe_partition_buckets\" instead." 
+ .to_string(), + }); + } + + let mut scan_bucket_offsets = HashMap::new(); + for (bucket_id, offset) in bucket_offsets { + let table_bucket = TableBucket::new(self.table_id, *bucket_id); + scan_bucket_offsets.insert(table_bucket, *offset); + } + self.do_subscribe_buckets(scan_bucket_offsets).await + } + + async fn subscribe_partition( + &self, + partition_id: PartitionId, + bucket: i32, + offset: i64, + ) -> Result<()> { + self.check_no_active_reader()?; + if !self.is_partitioned_table { + return Err(Error::UnsupportedOperation { + message: "The table is not a partitioned table, please use \"subscribe\" to \ + subscribe a non-partitioned bucket instead." + .to_string(), + }); + } + let table_bucket = + TableBucket::new_with_partition(self.table_id, Some(partition_id), bucket); + self.metadata + .check_and_update_table_metadata(from_ref(&self.table_path)) + .await?; + self.log_scanner_status + .assign_scan_bucket(table_bucket, offset); + Ok(()) + } + + async fn subscribe_partition_buckets( + &self, + partition_bucket_offsets: &HashMap<(PartitionId, i32), i64>, + ) -> Result<()> { + self.check_no_active_reader()?; + if !self.is_partitioned_table { + return Err(UnsupportedOperation { + message: "The table is not a partitioned table, please use \"subscribe_buckets\" \ + to subscribe to non-partitioned buckets instead." 
+ .to_string(), + }); + } + + let mut scan_bucket_offsets = HashMap::new(); + for (&(partition_id, bucket_id), &offset) in partition_bucket_offsets { + let table_bucket = + TableBucket::new_with_partition(self.table_id, Some(partition_id), bucket_id); + scan_bucket_offsets.insert(table_bucket, offset); + } + self.do_subscribe_buckets(scan_bucket_offsets).await + } + + async fn do_subscribe_buckets(&self, bucket_offsets: HashMap) -> Result<()> { + if bucket_offsets.is_empty() { + return Err(Error::UnexpectedError { + message: "Bucket offsets are empty.".to_string(), + source: None, + }); + } + + self.metadata + .check_and_update_table_metadata(from_ref(&self.table_path)) + .await?; + + self.log_scanner_status.assign_scan_buckets(bucket_offsets); + Ok(()) + } + + async fn unsubscribe(&self, bucket: i32) -> Result<()> { + self.check_no_active_reader()?; + if self.is_partitioned_table { + return Err(Error::UnsupportedOperation { + message: + "The table is a partitioned table, please use \"unsubscribe_partition\" to \ + unsubscribe a partitioned bucket instead." + .to_string(), + }); + } + let table_bucket = TableBucket::new(self.table_id, bucket); + self.log_scanner_status + .unassign_scan_buckets(from_ref(&table_bucket)); + Ok(()) + } + + async fn unsubscribe_partition(&self, partition_id: PartitionId, bucket: i32) -> Result<()> { + self.check_no_active_reader()?; + if !self.is_partitioned_table { + return Err(Error::UnsupportedOperation { + message: "Can't unsubscribe a partition for a non-partitioned table.".to_string(), + }); + } + let table_bucket = + TableBucket::new_with_partition(self.table_id, Some(partition_id), bucket); + self.log_scanner_status + .unassign_scan_buckets(from_ref(&table_bucket)); + Ok(()) + } + + async fn poll_for_fetches(&self) -> Result>> { + let result = self.log_fetcher.collect_fetches()?; + if !result.is_empty() { + return Ok(result); + } + + // send any new fetches (won't resend pending fetches). 
+ self.log_fetcher.send_fetches().await?; + + // Collect completed fetches from buffer + self.log_fetcher.collect_fetches() + } + + async fn poll_batches(&self, timeout: Duration) -> Result> { + let _poll_guard = PollGuard::new(self); + let start = Instant::now(); + let deadline = start + timeout; + + loop { + let batches = self.poll_for_batches().await?; + + if !batches.is_empty() { + self.log_fetcher.send_fetches().await?; + return Ok(batches); + } + + let now = Instant::now(); + if now >= deadline { + return Ok(Vec::new()); + } + + let remaining = deadline - now; + let has_data = self + .log_fetcher + .log_fetch_buffer + .await_not_empty(remaining) + .await?; + + if !has_data { + return Ok(Vec::new()); + } + } + } + + async fn poll_for_batches(&self) -> Result> { + let result = self.log_fetcher.collect_batches()?; + if !result.is_empty() { + return Ok(result); + } + + self.log_fetcher.send_fetches().await?; + self.log_fetcher.collect_batches() + } +} + +// Implementation for LogScanner (records mode) +impl LogScanner { + pub async fn poll(&self, timeout: Duration) -> Result { + self.inner.poll_records(timeout).await + } + + pub async fn subscribe(&self, bucket: i32, offset: i64) -> Result<()> { + self.inner.subscribe(bucket, offset).await + } + + pub async fn subscribe_buckets(&self, bucket_offsets: &HashMap) -> Result<()> { + self.inner.subscribe_buckets(bucket_offsets).await + } + + pub async fn subscribe_partition( + &self, + partition_id: PartitionId, + bucket: i32, + offset: i64, + ) -> Result<()> { + self.inner + .subscribe_partition(partition_id, bucket, offset) + .await + } + + pub async fn subscribe_partition_buckets( + &self, + partition_bucket_offsets: &HashMap<(PartitionId, i32), i64>, + ) -> Result<()> { + self.inner + .subscribe_partition_buckets(partition_bucket_offsets) + .await + } + + pub async fn unsubscribe(&self, bucket: i32) -> Result<()> { + self.inner.unsubscribe(bucket).await + } + + pub async fn unsubscribe_partition( + &self, + 
partition_id: PartitionId, + bucket: i32, + ) -> Result<()> { + self.inner.unsubscribe_partition(partition_id, bucket).await + } +} + +// Implementation for RecordBatchLogScanner (batches mode) +impl RecordBatchLogScanner { + /// Poll for batches with metadata (bucket and offset information). + pub async fn poll(&self, timeout: Duration) -> Result> { + self.inner.poll_batches(timeout).await + } + + pub async fn subscribe(&self, bucket: i32, offset: i64) -> Result<()> { + self.inner.subscribe(bucket, offset).await + } + + pub async fn subscribe_buckets(&self, bucket_offsets: &HashMap) -> Result<()> { + self.inner.subscribe_buckets(bucket_offsets).await + } + + pub async fn subscribe_partition( + &self, + partition_id: PartitionId, + bucket: i32, + offset: i64, + ) -> Result<()> { + self.inner + .subscribe_partition(partition_id, bucket, offset) + .await + } + + /// Returns whether the table is partitioned + pub fn is_partitioned(&self) -> bool { + self.inner.is_partitioned_table + } + + /// Returns all subscribed buckets with their current offsets + pub fn get_subscribed_buckets(&self) -> Vec<(TableBucket, i64)> { + self.inner.log_scanner_status.get_all_subscriptions() + } + + pub async fn subscribe_partition_buckets( + &self, + partition_bucket_offsets: &HashMap<(PartitionId, i32), i64>, + ) -> Result<()> { + self.inner + .subscribe_partition_buckets(partition_bucket_offsets) + .await + } + + pub async fn unsubscribe(&self, bucket: i32) -> Result<()> { + self.inner.unsubscribe(bucket).await + } + + pub async fn unsubscribe_partition( + &self, + partition_id: PartitionId, + bucket: i32, + ) -> Result<()> { + self.inner.unsubscribe_partition(partition_id, bucket).await + } + + /// Returns the Arrow schema for batches produced by this scanner. 
+ pub fn schema(&self) -> SchemaRef { + self.inner.arrow_schema.clone() + } + + pub fn table_path(&self) -> &TablePath { + &self.inner.table_path + } + + pub fn table_id(&self) -> TableId { + self.inner.table_id + } + + /// Creates a new handle to the same underlying scanner state. + /// + /// Binding layers (Python, C++) that hold the scanner behind shared + /// ownership (`Arc`) cannot move it into a [`crate::client::RecordBatchLogReader`]. + /// This method produces a second handle so the reader can take ownership + /// while the binding retains its reference for subscription management. + /// + /// **Not intended for general use** — prefer moving the scanner directly. + #[doc(hidden)] + pub fn new_shared_handle(&self) -> Self { + RecordBatchLogScanner { + inner: Arc::clone(&self.inner), + } + } + + /// Atomically marks the scanner as having an active reader. + /// + /// Returns `Err(IllegalArgument)` if another reader is already active on + /// this scanner — only one [`crate::client::RecordBatchLogReader`] may + /// iterate per scanner at a time. This mirrors Java's + /// `LogScannerImpl.acquire()` single-consumer guard. + pub(crate) fn try_set_reader_active(&self) -> Result<()> { + use std::sync::atomic::Ordering; + self.inner + .reader_active + .compare_exchange(false, true, Ordering::AcqRel, Ordering::Acquire) + .map(|_| ()) + .map_err(|_| Error::IllegalArgument { + message: "Another RecordBatchLogReader is already active on this scanner. \ + Drop the existing reader first." + .to_string(), + }) + } + + /// Clears the active-reader guard, re-enabling subscription changes. + pub(crate) fn clear_reader_active(&self) { + self.inner + .reader_active + .store(false, std::sync::atomic::Ordering::Release); + } + + /// Synchronous, infallible counterpart to [`unsubscribe`](Self::unsubscribe). + /// + /// Exists so [`crate::client::RecordBatchLogReader`]'s `Drop` impl can + /// release lingering subscriptions without `.await`. 
The async version is + /// also synchronous under the hood (it only acquires a lock and removes + /// from a map — no IO), so this exposes the same work without the + /// async wrapper. Silently no-ops on partitioned/non-partitioned mismatch + /// because `Drop` cannot return errors; callers must pick the correct + /// variant. + /// + /// **Not intended for general use** — prefer the async [`unsubscribe`]. + pub(crate) fn unsubscribe_sync(&self, bucket: i32) { + if self.inner.is_partitioned_table { + return; + } + let table_bucket = TableBucket::new(self.inner.table_id, bucket); + self.inner + .log_scanner_status + .unassign_scan_buckets(from_ref(&table_bucket)); + } + + /// Synchronous, infallible counterpart to + /// [`unsubscribe_partition`](Self::unsubscribe_partition). See + /// [`unsubscribe_sync`](Self::unsubscribe_sync) for rationale. + pub(crate) fn unsubscribe_partition_sync(&self, partition_id: PartitionId, bucket: i32) { + if !self.inner.is_partitioned_table { + return; + } + let table_bucket = + TableBucket::new_with_partition(self.inner.table_id, Some(partition_id), bucket); + self.inner + .log_scanner_status + .unassign_scan_buckets(from_ref(&table_bucket)); + } +} + +struct LogFetcher { + conns: Arc, + metadata: Arc, + table_path: TablePath, + is_partitioned: bool, + log_scanner_status: Arc, + read_context: ReadContext, + remote_read_context: ReadContext, + remote_log_downloader: Arc, + /// Background security token manager for remote filesystem access. + /// Kept alive to run the background refresh task; stopped on drop. + #[allow(dead_code)] + security_token_manager: Arc, + log_fetch_buffer: Arc, + nodes_with_pending_fetch_requests: Arc>>, + /// Per-table scanner metric handles shared with the owning + /// `LogScannerInner` and `RemoteLogDownloader`. 
+ metrics: Arc, + max_poll_records: usize, + fetch_max_bytes: i32, + fetch_min_bytes: i32, + fetch_wait_max_time_ms: i32, + fetch_max_bytes_for_bucket: i32, +} + +struct FetchResponseContext { + metadata: Arc, + log_fetch_buffer: Arc, + log_scanner_status: Arc, + read_context: ReadContext, + remote_read_context: ReadContext, + remote_log_downloader: Arc, + /// Per-table scanner metric handles for `scanner.fetch_*` recording. + metrics: Arc, + /// `Instant` captured immediately before the FetchLog RPC; used to compute + /// `scanner.fetch_latency_ms` on a successful response. + request_start_time: Instant, +} + +impl LogFetcher { + pub fn new( + table_info: TableInfo, + conns: Arc, + metadata: Arc, + log_scanner_status: Arc, + config: &Config, + projected_fields: Option>, + metrics: Arc, + ) -> Result { + let full_row_type = table_info.get_row_type(); + let full_arrow_schema = to_arrow_schema(full_row_type)?; + let projected_row_type = match &projected_fields { + None => Arc::new(full_row_type.clone()), + Some(fields) => Arc::new(RowType::new( + fields + .iter() + .map(|&i| full_row_type.fields()[i].clone()) + .collect(), + )), + }; + let read_context = Self::create_read_context( + full_arrow_schema.clone(), + projected_row_type.clone(), + projected_fields.clone(), + false, + )? + .with_fluss_row_type(projected_row_type.clone()); + let remote_read_context = Self::create_read_context( + full_arrow_schema, + projected_row_type.clone(), + projected_fields.clone(), + true, + )? 
+ .with_fluss_row_type(projected_row_type); + + let tmp_dir = TempDir::with_prefix("fluss-remote-logs")?; + let log_fetch_buffer = Arc::new(LogFetchBuffer::new(read_context.clone())); + + // Create security token manager for background token refresh + let security_token_manager = + Arc::new(SecurityTokenManager::new(conns.clone(), metadata.clone())); + + // Subscribe to credentials updates and pass to remote log downloader + let credentials_rx = security_token_manager.subscribe(); + + let remote_log_downloader = Arc::new(RemoteLogDownloader::new( + tmp_dir, + config.scanner_remote_log_prefetch_num, + config.remote_file_download_thread_num, + config.scanner_remote_log_read_concurrency, + credentials_rx, + Arc::clone(&metrics), + )?); + + // Start the background token refresh task + security_token_manager.start(); + + Ok(LogFetcher { + conns: conns.clone(), + metadata: metadata.clone(), + table_path: table_info.table_path.clone(), + is_partitioned: table_info.is_partitioned(), + log_scanner_status, + read_context, + remote_read_context, + remote_log_downloader, + security_token_manager, + log_fetch_buffer, + nodes_with_pending_fetch_requests: Arc::new(Mutex::new(HashSet::new())), + metrics, + max_poll_records: config.scanner_log_max_poll_records, + fetch_max_bytes: config.scanner_log_fetch_max_bytes, + fetch_min_bytes: config.scanner_log_fetch_min_bytes, + fetch_wait_max_time_ms: config.scanner_log_fetch_wait_max_time_ms, + fetch_max_bytes_for_bucket: config.scanner_log_fetch_max_bytes_for_bucket, + }) + } + + fn create_read_context( + full_arrow_schema: SchemaRef, + row_type: Arc, + projected_fields: Option>, + is_from_remote: bool, + ) -> Result { + match projected_fields { + None => Ok(ReadContext::new( + full_arrow_schema, + row_type, + is_from_remote, + )), + Some(fields) => ReadContext::with_projection_pushdown( + full_arrow_schema, + row_type, + fields, + is_from_remote, + ), + } + } + + fn describe_fetch_error( + error: FlussError, + table_bucket: 
&TableBucket, + fetch_offset: i64, + error_message: &str, + ) -> FetchErrorContext { + match error { + FlussError::NotLeaderOrFollower + | FlussError::LogStorageException + | FlussError::KvStorageException + | FlussError::StorageException + | FlussError::FencedLeaderEpochException + | FlussError::LeaderNotAvailableException => FetchErrorContext { + action: FetchErrorAction::Ignore, + log_level: FetchErrorLogLevel::Debug, + log_message: format!( + "Error in fetch for bucket {table_bucket}: {error:?}: {error_message}" + ), + }, + FlussError::UnknownTableOrBucketException => FetchErrorContext { + action: FetchErrorAction::Ignore, + log_level: FetchErrorLogLevel::Warn, + log_message: format!( + "Received unknown table or bucket error in fetch for bucket {table_bucket}" + ), + }, + FlussError::LogOffsetOutOfRangeException => FetchErrorContext { + action: FetchErrorAction::LogOffsetOutOfRange, + log_level: FetchErrorLogLevel::Debug, + log_message: format!( + "The fetching offset {fetch_offset} is out of range for bucket {table_bucket}: {error_message}" + ), + }, + FlussError::AuthorizationException => FetchErrorContext { + action: FetchErrorAction::Authorization, + log_level: FetchErrorLogLevel::Debug, + log_message: format!( + "Authorization error while fetching offset {fetch_offset} for bucket {table_bucket}: {error_message}" + ), + }, + FlussError::UnknownServerError => FetchErrorContext { + action: FetchErrorAction::Ignore, + log_level: FetchErrorLogLevel::Warn, + log_message: format!( + "Unknown server error while fetching offset {fetch_offset} for bucket {table_bucket}: {error_message}" + ), + }, + FlussError::CorruptMessage => FetchErrorContext { + action: FetchErrorAction::CorruptMessage, + log_level: FetchErrorLogLevel::Debug, + log_message: format!( + "Encountered corrupt message when fetching offset {fetch_offset} for bucket {table_bucket}: {error_message}" + ), + }, + _ => FetchErrorContext { + action: FetchErrorAction::Unexpected, + log_level: 
FetchErrorLogLevel::Debug, + log_message: format!( + "Unexpected error code {error:?} while fetching at offset {fetch_offset} from bucket {table_bucket}: {error_message}" + ), + }, + } + } + + fn should_invalidate_table_meta(error: FlussError) -> bool { + matches!( + error, + FlussError::NotLeaderOrFollower + | FlussError::LeaderNotAvailableException + | FlussError::FencedLeaderEpochException + | FlussError::UnknownTableOrBucketException + | FlussError::InvalidCoordinatorException + ) + } + + async fn check_and_update_metadata(&self, table_buckets: &[TableBucket]) -> Result<()> { + let mut partition_ids = Vec::new(); + let mut need_update = false; + + for tb in table_buckets { + if self.get_table_bucket_leader(tb).is_some() { + continue; + } + + if self.is_partitioned { + partition_ids.push(tb.partition_id().unwrap()); + } else { + need_update = true; + break; + } + } + + let update_result = if self.is_partitioned && !partition_ids.is_empty() { + self.metadata + .update_tables_metadata( + &HashSet::from([&self.table_path]), + &HashSet::new(), + partition_ids, + ) + .await + } else if need_update { + self.metadata.update_table_metadata(&self.table_path).await + } else { + Ok(()) + }; + + // TODO: Handle PartitionNotExist error like java side + update_result.or_else(|e| { + if let Error::RpcError { source, .. 
} = &e + && matches!(source, RpcError::ConnectionError(_) | RpcError::Poisoned(_)) + { + warn!("Retrying after encountering error while updating table metadata: {e}"); + Ok(()) + } else { + Err(e) + } + })?; + Ok(()) + } + + /// Send fetch requests asynchronously without waiting for responses + async fn send_fetches(&self) -> Result<()> { + self.check_and_update_metadata(self.fetchable_buckets().as_slice()) + .await?; + let fetch_request = self.prepare_fetch_log_requests().await; + + for (leader, fetch_request) in fetch_request { + debug!("Adding pending request for node id {leader}"); + // Check if we already have a pending request for this node + { + self.nodes_with_pending_fetch_requests.lock().insert(leader); + } + + let cluster = self.metadata.get_cluster().clone(); + + let conns = Arc::clone(&self.conns); + let log_fetch_buffer = self.log_fetch_buffer.clone(); + let log_scanner_status = self.log_scanner_status.clone(); + let read_context = self.read_context.clone(); + let remote_read_context = self.remote_read_context.clone(); + let remote_log_downloader = Arc::clone(&self.remote_log_downloader); + let nodes_with_pending = self.nodes_with_pending_fetch_requests.clone(); + let metadata = self.metadata.clone(); + let metrics = Arc::clone(&self.metrics); + // Spawn async task to handle the fetch request + // Note: These tasks are not explicitly tracked or cancelled when LogFetcher is dropped. + // This is acceptable because: + // 1. Tasks will naturally complete (network requests will return or timeout) + // 2. Tasks use Arc references, so resources are properly shared + // 3. When the program exits, tokio runtime will clean up all tasks + // 4. 
Tasks are short-lived (network I/O operations) + tokio::spawn(async move { + // make sure it will always remove leader from pending nodes + let _guard = scopeguard::guard((), |_| { + nodes_with_pending.lock().remove(&leader); + }); + + let server_node = match cluster.get_tablet_server(leader) { + Some(node) => node, + None => { + warn!("No server node found for leader {leader}, retrying"); + Self::handle_fetch_failure(metadata, &leader, &fetch_request).await; + return; + } + }; + + let con = match conns.get_connection(server_node).await { + Ok(con) => con, + Err(e) => { + warn!("Retrying after error getting connection to destination node: {e:?}"); + Self::handle_fetch_failure(metadata, &leader, &fetch_request).await; + return; + } + }; + + // Java increment the fetch counter and capture `requestStartTime` immediately + // before the RPC. Failed connection acquisition above is not counted. + let request_start_time = Instant::now(); + metrics.record_fetch_request(); + + let fetch_response = match con + .request(message::FetchLogRequest::new(fetch_request.clone())) + .await + { + Ok(resp) => resp, + Err(e) => { + warn!( + "Retrying after error fetching log from destination node {server_node:?}: {e:?}" + ); + Self::handle_fetch_failure(metadata, &leader, &fetch_request).await; + return; + } + }; + + // Build the context after the RPC so `request_start_time` measures only RPC wall-clock + // — not tablet-server lookup or connection acquisition, which is matching Java's bebaviour + // Building it here also skips the allocation on the early-return error paths above. 
+ let response_context = FetchResponseContext { + metadata: metadata.clone(), + log_fetch_buffer, + log_scanner_status, + read_context, + remote_read_context, + remote_log_downloader, + metrics, + request_start_time, + }; + Self::handle_fetch_response(fetch_response, response_context).await; + }); + } + + Ok(()) + } + + async fn handle_fetch_failure( + metadata: Arc, + server_id: &i32, + request: &FetchLogRequest, + ) { + let table_ids = request.tables_req.iter().map(|r| r.table_id).collect(); + metadata.invalidate_server(server_id, table_ids); + } + + /// Handle fetch response and add completed fetches to buffer + async fn handle_fetch_response( + fetch_response: FetchLogResponse, + context: FetchResponseContext, + ) { + let FetchResponseContext { + metadata, + log_fetch_buffer, + log_scanner_status, + read_context, + remote_read_context, + remote_log_downloader, + metrics, + request_start_time, + } = context; + + // `encoded_len()` mirrors Java's `fetchLogResponse.totalSize()`: + // both report the serialized API message body size, excluding protocol + // headers and framing. Recorded unconditionally (including zero-record + // responses) to match Java's histogram semantics. + metrics.record_fetch_latency_ms(request_start_time.elapsed().as_secs_f64() * 1000.0); + metrics.record_bytes_per_request(fetch_response.encoded_len() as f64); + + for pb_fetch_log_resp in fetch_response.tables_resp { + let table_id = pb_fetch_log_resp.table_id; + let fetch_log_for_buckets = pb_fetch_log_resp.buckets_resp; + + for fetch_log_for_bucket in fetch_log_for_buckets { + let bucket: i32 = fetch_log_for_bucket.bucket_id; + let table_bucket = TableBucket::new_with_partition( + table_id, + fetch_log_for_bucket.partition_id, + bucket, + ); + + // todo: check fetch result code for per-bucket + let Some(fetch_offset) = log_scanner_status.get_bucket_offset(&table_bucket) else { + debug!( + "Ignoring fetch log response for bucket {table_bucket} because the bucket has been unsubscribed." 
+ ); + continue; + }; + + if let Some(error_code) = fetch_log_for_bucket.error_code + && error_code != FlussError::None.code() + { + let api_error: ApiError = ErrorResponse { + error_code, + error_message: fetch_log_for_bucket.error_message.clone(), + } + .into(); + + let error = FlussError::for_code(error_code); + if Self::should_invalidate_table_meta(error) { + // TODO: Consider triggering table meta invalidation from sender/lookup paths. + let table_id = table_bucket.table_id(); + let cluster = metadata.get_cluster(); + if let Some(table_path) = cluster.get_table_path_by_id(table_id) { + let physical_tables = HashSet::from([PhysicalTablePath::of(Arc::new( + table_path.clone(), + ))]); + metadata.invalidate_physical_table_meta(&physical_tables); + } else { + warn!( + "Table id {table_id} is missing from table_path_by_id while invalidating table metadata" + ); + } + } + let error_context = Self::describe_fetch_error( + error, + &table_bucket, + fetch_offset, + api_error.message.as_str(), + ); + log_scanner_status.move_bucket_to_end(table_bucket.clone()); + match error_context.log_level { + FetchErrorLogLevel::Debug => { + debug!("{}", error_context.log_message); + } + FetchErrorLogLevel::Warn => { + warn!("{}", error_context.log_message); + } + } + log_fetch_buffer.add_api_error( + table_bucket.clone(), + api_error, + error_context, + fetch_offset, + ); + continue; + } + + // Check if this is a remote log fetch + if let Some(ref remote_log_fetch_info) = fetch_log_for_bucket.remote_log_fetch_info + { + // Remote fs props are already set by the background SecurityTokenManager + let remote_fetch_info = + RemoteLogFetchInfo::from_proto(remote_log_fetch_info, table_bucket.clone()); + + let high_watermark = fetch_log_for_bucket.high_watermark.unwrap_or(-1); + Self::pending_remote_fetches( + remote_log_downloader.clone(), + log_fetch_buffer.clone(), + remote_read_context.clone(), + &table_bucket, + remote_fetch_info, + fetch_offset, + high_watermark, + ); + } else if 
fetch_log_for_bucket.records.is_some() { + // Handle regular in-memory records - create completed fetch directly + let high_watermark = fetch_log_for_bucket.high_watermark.unwrap_or(-1); + let records = fetch_log_for_bucket.records.unwrap_or(vec![]); + let size_in_bytes = records.len(); + let log_record_batch = LogRecordsBatches::new(records); + + let completed_fetch = DefaultCompletedFetch::new( + table_bucket.clone(), + log_record_batch, + size_in_bytes, + read_context.clone(), + fetch_offset, + high_watermark, + ); + log_fetch_buffer.add(Box::new(completed_fetch)); + } + } + } + } + + fn pending_remote_fetches( + remote_log_downloader: Arc, + log_fetch_buffer: Arc, + read_context: ReadContext, + table_bucket: &TableBucket, + remote_fetch_info: RemoteLogFetchInfo, + fetch_offset: i64, + high_watermark: i64, + ) { + // Download and process remote log segments + let mut pos_in_log_segment = remote_fetch_info.first_start_pos; + let mut current_fetch_offset = fetch_offset; + for (i, segment) in remote_fetch_info.remote_log_segments.iter().enumerate() { + if i > 0 { + pos_in_log_segment = 0; + current_fetch_offset = segment.start_offset; + } + + // todo: + // 1: control the max threads to download remote segment + // 2: introduce priority queue to priority highest for earliest segment + let download_future = remote_log_downloader + .request_remote_log(&remote_fetch_info.remote_log_tablet_dir, segment); + + // Register callback to be called when download completes + // (similar to Java's downloadFuture.onComplete) + // This must be done before creating RemotePendingFetch to avoid move issues + let table_bucket = table_bucket.clone(); + let log_fetch_buffer_clone = log_fetch_buffer.clone(); + download_future.on_complete(move || { + log_fetch_buffer_clone.try_complete(&table_bucket); + }); + + let pending_fetch = RemotePendingFetch::new( + segment.clone(), + download_future, + pos_in_log_segment, + current_fetch_offset, + high_watermark, + read_context.clone(), + ); + // 
Add to pending fetches in buffer (similar to Java's logFetchBuffer.pend) + log_fetch_buffer.pend(Box::new(pending_fetch)); + } + } + + /// Collect completed fetches from buffer + /// Reference: LogFetchCollector.collectFetch in Java + fn collect_fetches(&self) -> Result>> { + let mut result: HashMap> = HashMap::new(); + let mut records_remaining = self.max_poll_records; + + let collect_result: Result<()> = { + while records_remaining > 0 { + // Get the next in line fetch, or get a new one from buffer + let next_in_line = self.log_fetch_buffer.next_in_line_fetch(); + + if next_in_line.is_none() || next_in_line.as_ref().unwrap().is_consumed() { + // Get a new fetch from buffer + if let Some(completed_fetch) = self.log_fetch_buffer.poll() { + // Initialize the fetch if not already initialized + if !completed_fetch.is_initialized() { + let size_in_bytes = completed_fetch.size_in_bytes(); + match self.initialize_fetch(completed_fetch) { + Ok(initialized) => { + self.log_fetch_buffer.set_next_in_line_fetch(initialized); + continue; + } + Err(e) => { + // Remove a completedFetch upon a parse with exception if + // (1) it contains no records, and + // (2) there are no fetched records with actual content preceding this + // exception. + if result.is_empty() && size_in_bytes == 0 { + // todo: do we need to consider it like java ? 
+ // self.log_fetch_buffer.poll(); + } + return Err(e); + } + } + } else { + self.log_fetch_buffer + .set_next_in_line_fetch(Some(completed_fetch)); + } + // Note: poll() already removed the fetch from buffer, so no need to call poll() + } else { + // No more fetches available + break; + } + } else { + // Fetch records from next_in_line + if let Some(mut next_fetch) = next_in_line { + let records = match self + .fetch_records_from_fetch(&mut next_fetch, records_remaining) + { + Ok(records) => records, + Err(e) => { + if !next_fetch.is_consumed() { + self.log_fetch_buffer + .set_next_in_line_fetch(Some(next_fetch)); + } + return Err(e); + } + }; + + if !records.is_empty() { + let table_bucket = next_fetch.table_bucket().clone(); + // Merge with existing records for this bucket + let existing = result.entry(table_bucket).or_default(); + let records_count = records.len(); + existing.extend(records); + + records_remaining = records_remaining.saturating_sub(records_count); + } + + // If the fetch is not fully consumed, put it back for the next round + if !next_fetch.is_consumed() { + self.log_fetch_buffer + .set_next_in_line_fetch(Some(next_fetch)); + } + // If consumed, next_fetch will be dropped here (which is correct) + } + } + } + Ok(()) + }; + + match collect_result { + Ok(()) => Ok(result), + Err(e) => { + if result.is_empty() { + Err(e) + } else { + Ok(result) + } + } + } + } + + /// Initialize a completed fetch, checking offset match and updating high watermark + fn initialize_fetch( + &self, + mut completed_fetch: Box, + ) -> Result>> { + if let Some(error) = completed_fetch.take_error() { + return Err(error); + } + + let table_bucket = completed_fetch.table_bucket().clone(); + let fetch_offset = completed_fetch.next_fetch_offset(); + + if let Some(api_error) = completed_fetch.api_error() { + let error = FlussError::for_code(api_error.code); + let error_message = api_error.message.as_str(); + self.log_scanner_status + .move_bucket_to_end(table_bucket.clone()); 
+ let action = completed_fetch + .fetch_error_context() + .map(|context| context.action) + .unwrap_or(FetchErrorAction::Unexpected); + match action { + FetchErrorAction::Ignore => { + return Ok(None); + } + FetchErrorAction::LogOffsetOutOfRange => { + return Err(Error::UnexpectedError { + message: format!( + "The fetching offset {fetch_offset} is out of range: {error_message}" + ), + source: None, + }); + } + FetchErrorAction::Authorization => { + return Err(Error::FlussAPIError { + api_error: ApiError { + code: api_error.code, + message: api_error.message.to_string(), + }, + }); + } + FetchErrorAction::CorruptMessage => { + return Err(Error::UnexpectedError { + message: format!( + "Encountered corrupt message when fetching offset {fetch_offset} for bucket {table_bucket}: {error_message}" + ), + source: None, + }); + } + FetchErrorAction::Unexpected => { + return Err(Error::UnexpectedError { + message: format!( + "Unexpected error code {error:?} while fetching at offset {fetch_offset} from bucket {table_bucket}: {error_message}" + ), + source: None, + }); + } + } + } + + // Check if bucket is still subscribed + let Some(current_offset) = self.log_scanner_status.get_bucket_offset(&table_bucket) else { + warn!( + "Discarding stale fetch response for bucket {table_bucket:?} since the bucket has been unsubscribed" + ); + return Ok(None); + }; + + // Check if offset matches + if fetch_offset != current_offset { + warn!( + "Discarding stale fetch response for bucket {table_bucket:?} since its offset {fetch_offset} does not match the expected offset {current_offset}" + ); + return Ok(None); + } + + // Update high watermark + let high_watermark = completed_fetch.high_watermark(); + if high_watermark >= 0 { + self.log_scanner_status + .update_high_watermark(&table_bucket, high_watermark); + } + + completed_fetch.set_initialized(); + Ok(Some(completed_fetch)) + } + + /// Fetch records from a completed fetch, checking offset match + fn fetch_records_from_fetch( + &self, + 
next_in_line_fetch: &mut Box, + max_records: usize, + ) -> Result> { + let table_bucket = next_in_line_fetch.table_bucket().clone(); + let current_offset = self.log_scanner_status.get_bucket_offset(&table_bucket); + + if current_offset.is_none() { + warn!( + "Ignoring fetched records for {table_bucket:?} since the bucket has been unsubscribed" + ); + next_in_line_fetch.drain(); + return Ok(Vec::new()); + } + + let current_offset = current_offset.unwrap(); + let fetch_offset = next_in_line_fetch.next_fetch_offset(); + + // Check if this fetch is next in line + if fetch_offset == current_offset { + let records = next_in_line_fetch.fetch_records(max_records)?; + let next_fetch_offset = next_in_line_fetch.next_fetch_offset(); + + if next_fetch_offset > current_offset { + self.log_scanner_status + .update_offset(&table_bucket, next_fetch_offset); + } + + if next_in_line_fetch.is_consumed() && next_in_line_fetch.records_read() > 0 { + self.log_scanner_status + .move_bucket_to_end(table_bucket.clone()); + } + + Ok(records) + } else { + // These records aren't next in line, ignore them + warn!( + "Ignoring fetched records for {table_bucket:?} at offset {fetch_offset} since the current offset is {current_offset}" + ); + next_in_line_fetch.drain(); + Ok(Vec::new()) + } + } + + /// Collect completed fetches as ScanBatches (with bucket and offset metadata) + fn collect_batches(&self) -> Result> { + // Limit memory usage with both batch count and byte size constraints. + // Max 100 batches per poll, but also check total bytes (soft cap ~64MB). 
+ const MAX_BATCHES: usize = 100; + const MAX_BYTES: usize = 64 * 1024 * 1024; // 64MB soft cap + let mut result: Vec = Vec::new(); + let mut batches_remaining = MAX_BATCHES; + let mut bytes_consumed: usize = 0; + + let collect_result: Result<()> = { + while batches_remaining > 0 && bytes_consumed < MAX_BYTES { + let next_in_line = self.log_fetch_buffer.next_in_line_fetch(); + + match next_in_line { + Some(mut next_fetch) if !next_fetch.is_consumed() => { + let scan_batches = + self.fetch_batches_from_fetch(&mut next_fetch, batches_remaining)?; + let batch_count = scan_batches.len(); + + if !scan_batches.is_empty() { + // Track bytes consumed (soft cap - may exceed by one fetch) + let batch_bytes: usize = scan_batches + .iter() + .map(|sb| sb.batch().get_array_memory_size()) + .sum(); + bytes_consumed += batch_bytes; + + result.extend(scan_batches); + batches_remaining = batches_remaining.saturating_sub(batch_count); + } + + if !next_fetch.is_consumed() { + self.log_fetch_buffer + .set_next_in_line_fetch(Some(next_fetch)); + } + } + _ => { + if let Some(completed_fetch) = self.log_fetch_buffer.poll() { + if !completed_fetch.is_initialized() { + let size_in_bytes = completed_fetch.size_in_bytes(); + match self.initialize_fetch(completed_fetch) { + Ok(initialized) => { + self.log_fetch_buffer.set_next_in_line_fetch(initialized); + continue; + } + Err(e) => { + if result.is_empty() && size_in_bytes == 0 { + continue; + } + return Err(e); + } + } + } else { + self.log_fetch_buffer + .set_next_in_line_fetch(Some(completed_fetch)); + } + } else { + break; + } + } + } + } + Ok(()) + }; + + match collect_result { + Ok(()) => Ok(result), + Err(e) => { + if result.is_empty() { + Err(e) + } else { + Ok(result) + } + } + } + } + + fn fetch_batches_from_fetch( + &self, + next_in_line_fetch: &mut Box, + max_batches: usize, + ) -> Result> { + let table_bucket = next_in_line_fetch.table_bucket().clone(); + let current_offset = 
self.log_scanner_status.get_bucket_offset(&table_bucket); + + if current_offset.is_none() { + warn!( + "Ignoring fetched batches for {table_bucket:?} since the bucket has been unsubscribed" + ); + next_in_line_fetch.drain(); + return Ok(Vec::new()); + } + + let current_offset = current_offset.unwrap(); + let fetch_offset = next_in_line_fetch.next_fetch_offset(); + + if fetch_offset == current_offset { + let batches_with_offsets = next_in_line_fetch.fetch_batches(max_batches)?; + let next_fetch_offset = next_in_line_fetch.next_fetch_offset(); + + if next_fetch_offset > current_offset { + self.log_scanner_status + .update_offset(&table_bucket, next_fetch_offset); + } + + // Convert to ScanBatch with bucket info + Ok(batches_with_offsets + .into_iter() + .map(|(batch, base_offset)| { + ScanBatch::new(table_bucket.clone(), batch, base_offset) + }) + .collect()) + } else { + warn!( + "Ignoring fetched batches for {table_bucket:?} at offset {fetch_offset} since the current offset is {current_offset}" + ); + next_in_line_fetch.drain(); + Ok(Vec::new()) + } + } + + async fn prepare_fetch_log_requests(&self) -> HashMap { + let mut fetch_log_req_for_buckets = HashMap::new(); + let mut table_id = None; + let mut ready_for_fetch_count = 0; + for bucket in self.fetchable_buckets() { + if table_id.is_none() { + table_id = Some(bucket.table_id()); + } + + let offset = match self.log_scanner_status.get_bucket_offset(&bucket) { + Some(offset) => offset, + None => { + debug!( + "Skipping fetch request for bucket {bucket} because the bucket has been unsubscribed." + ); + continue; + } + }; + + match self.get_table_bucket_leader(&bucket) { + None => { + log::trace!( + "Skipping fetch request for bucket {bucket} because leader is not available." + ) + } + Some(leader) => { + if self + .nodes_with_pending_fetch_requests + .lock() + .contains(&leader) + { + log::trace!( + "Skipping fetch request for bucket {bucket} because previous request to server {leader} has not been processed." 
+ ) + } else { + let fetch_log_req_for_bucket = PbFetchLogReqForBucket { + partition_id: bucket.partition_id(), + bucket_id: bucket.bucket_id(), + fetch_offset: offset, + max_fetch_bytes: self.fetch_max_bytes_for_bucket, + }; + + fetch_log_req_for_buckets + .entry(leader) + .or_insert_with(Vec::new) + .push(fetch_log_req_for_bucket); + ready_for_fetch_count += 1; + } + } + } + } + + if ready_for_fetch_count == 0 { + HashMap::new() + } else { + let (projection_enabled, projected_fields) = + match self.read_context.project_fields_in_order() { + None => (false, vec![]), + Some(fields) => (true, fields.iter().map(|&i| i as i32).collect()), + }; + + fetch_log_req_for_buckets + .into_iter() + .map(|(leader_id, feq_for_buckets)| { + let req_for_table = PbFetchLogReqForTable { + table_id: table_id.unwrap(), + projection_pushdown_enabled: projection_enabled, + projected_fields: projected_fields.clone(), + buckets_req: feq_for_buckets, + filter_predicate: None, + filter_schema_id: None, + }; + + let fetch_log_request = FetchLogRequest { + follower_server_id: -1, + max_bytes: self.fetch_max_bytes, + tables_req: vec![req_for_table], + max_wait_ms: Some(self.fetch_wait_max_time_ms), + min_bytes: Some(self.fetch_min_bytes), + }; + (leader_id, fetch_log_request) + }) + .collect() + } + } + + fn fetchable_buckets(&self) -> Vec { + // Get buckets that are not already in the buffer + let buffered = self.log_fetch_buffer.buffered_buckets(); + let buffered_set: HashSet = buffered.into_iter().collect(); + self.log_scanner_status + .fetchable_buckets(|tb| !buffered_set.contains(tb)) + } + + fn get_table_bucket_leader(&self, tb: &TableBucket) -> Option { + let cluster = self.metadata.get_cluster(); + cluster.leader_for(tb).map(|leader| leader.id()) + } +} + +pub struct LogScannerStatus { + bucket_status_map: Arc>>, +} + +#[allow(dead_code)] +impl LogScannerStatus { + pub fn new() -> Self { + Self { + bucket_status_map: Arc::new(RwLock::new(FairBucketStatusMap::new())), + } + } + + pub fn 
prepare_to_poll(&self) -> bool { + let map = self.bucket_status_map.read(); + map.size() > 0 + } + + pub fn move_bucket_to_end(&self, table_bucket: TableBucket) { + let mut map = self.bucket_status_map.write(); + map.move_to_end(table_bucket); + } + + /// Gets the offset of a bucket if it exists + pub fn get_bucket_offset(&self, table_bucket: &TableBucket) -> Option { + let map = self.bucket_status_map.read(); + map.status_value(table_bucket).map(|status| status.offset()) + } + + pub fn update_high_watermark(&self, table_bucket: &TableBucket, high_watermark: i64) { + if let Some(status) = self.get_status(table_bucket) { + status.set_high_watermark(high_watermark); + } + } + + pub fn update_offset(&self, table_bucket: &TableBucket, offset: i64) { + if let Some(status) = self.get_status(table_bucket) { + status.set_offset(offset); + } + } + + pub fn assign_scan_buckets(&self, scan_bucket_offsets: HashMap) { + let mut map = self.bucket_status_map.write(); + for (bucket, offset) in scan_bucket_offsets { + let status = map + .status_value(&bucket) + .cloned() + .unwrap_or_else(|| Arc::new(BucketScanStatus::new(offset))); + status.set_offset(offset); + map.update(bucket, status); + } + } + + pub fn assign_scan_bucket(&self, table_bucket: TableBucket, offset: i64) { + let status = Arc::new(BucketScanStatus::new(offset)); + self.bucket_status_map.write().update(table_bucket, status); + } + + /// Unassigns scan buckets + pub fn unassign_scan_buckets(&self, buckets: &[TableBucket]) { + let mut map = self.bucket_status_map.write(); + for bucket in buckets { + map.remove(bucket); + } + } + + /// Gets fetchable buckets based on availability predicate + pub fn fetchable_buckets(&self, is_available: F) -> Vec + where + F: Fn(&TableBucket) -> bool, + { + let map = self.bucket_status_map.read(); + let mut result = Vec::new(); + map.for_each(|bucket, _| { + if is_available(bucket) { + result.push(bucket.clone()); + } + }); + result + } + + /// Returns all subscribed buckets with 
their current offsets + pub fn get_all_subscriptions(&self) -> Vec<(TableBucket, i64)> { + let map = self.bucket_status_map.read(); + let mut result = Vec::new(); + map.for_each(|bucket, status| { + result.push((bucket.clone(), status.offset())); + }); + result + } + + /// Helper to get bucket status + fn get_status(&self, table_bucket: &TableBucket) -> Option> { + let map = self.bucket_status_map.read(); + map.status_value(table_bucket).cloned() + } +} + +impl Default for LogScannerStatus { + fn default() -> Self { + Self::new() + } +} + +#[derive(Debug)] +#[allow(dead_code)] +pub struct BucketScanStatus { + offset: RwLock, + high_watermark: RwLock, +} + +#[allow(dead_code)] +impl BucketScanStatus { + pub fn new(offset: i64) -> Self { + Self { + offset: RwLock::new(offset), + high_watermark: RwLock::new(0), + } + } + + pub fn offset(&self) -> i64 { + *self.offset.read() + } + + pub fn set_offset(&self, offset: i64) { + *self.offset.write() = offset + } + + pub fn high_watermark(&self) -> i64 { + *self.high_watermark.read() + } + + pub fn set_high_watermark(&self, high_watermark: i64) { + *self.high_watermark.write() = high_watermark + } +} + +fn validate_scan_support(table_path: &TablePath, table_info: &TableInfo) -> Result<()> { + if table_info.schema.primary_key().is_some() { + return Err(UnsupportedOperation { + message: format!("Table {table_path} is not a Log Table and doesn't support scan."), + }); + } + + let log_format = table_info.table_config.get_log_format()?; + if LogFormat::ARROW != log_format { + return Err(UnsupportedOperation { + message: format!( + "Scan is only supported for ARROW format and table {table_path} uses {log_format} format" + ), + }); + } + + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::client::WriteRecord; + use crate::client::metadata::Metadata; + use crate::compression::{ + ArrowCompressionInfo, ArrowCompressionRatioEstimator, ArrowCompressionType, + DEFAULT_NON_ZSTD_COMPRESSION_LEVEL, + }; + use 
crate::metadata::{DataTypes, PhysicalTablePath, Schema, TableInfo, TablePath}; + use crate::proto::{PbFetchLogRespForBucket, PbFetchLogRespForTable}; + use crate::record::MemoryLogRecordsArrowBuilder; + use crate::row::{Datum, GenericRow}; + use crate::rpc::FlussError; + use crate::test_utils::{ + assert_scanner_entries_labeled, build_cluster_arc, build_table_info, test_scanner_metrics, + }; + + fn build_records(table_info: &TableInfo, table_path: Arc) -> Result> { + let mut builder = MemoryLogRecordsArrowBuilder::new( + 1, + table_info.get_row_type(), + false, + ArrowCompressionInfo { + compression_type: ArrowCompressionType::None, + compression_level: DEFAULT_NON_ZSTD_COMPRESSION_LEVEL, + }, + usize::MAX, + Arc::new(ArrowCompressionRatioEstimator::default()), + )?; + let physical_table_path = Arc::new(PhysicalTablePath::of(table_path)); + let row = GenericRow { + values: vec![Datum::Int32(1)], + }; + let record = + WriteRecord::for_append(Arc::new(table_info.clone()), physical_table_path, 1, &row); + builder.append(&record)?; + builder.build() + } + + #[tokio::test] + async fn collect_fetches_updates_offset() -> Result<()> { + let table_path = TablePath::new("db".to_string(), "tbl".to_string()); + let table_info = build_table_info(table_path.clone(), 1, 1); + let cluster = build_cluster_arc(&table_path, 1, 1); + let metadata = Arc::new(Metadata::new_for_test(cluster)); + let status = Arc::new(LogScannerStatus::new()); + let fetcher = LogFetcher::new( + table_info.clone(), + Arc::new(RpcClient::new()), + metadata, + status.clone(), + &Config::default(), + None, + test_scanner_metrics(&table_path), + )?; + + let bucket = TableBucket::new(1, 0); + status.assign_scan_bucket(bucket.clone(), 0); + + let data = build_records(&table_info, Arc::new(table_path))?; + let log_records = LogRecordsBatches::new(data.clone()); + let row_type = Arc::new(table_info.get_row_type().clone()); + let read_context = ReadContext::new(to_arrow_schema(&row_type)?, row_type, false); + let 
completed = + DefaultCompletedFetch::new(bucket.clone(), log_records, data.len(), read_context, 0, 0); + fetcher.log_fetch_buffer.add(Box::new(completed)); + + let fetched = fetcher.collect_fetches()?; + assert_eq!(fetched.get(&bucket).unwrap().len(), 1); + assert_eq!(status.get_bucket_offset(&bucket), Some(1)); + Ok(()) + } + + #[tokio::test] + async fn fetch_records_from_fetch_drains_unassigned_bucket() -> Result<()> { + let table_path = TablePath::new("db".to_string(), "tbl".to_string()); + let table_info = build_table_info(table_path.clone(), 1, 1); + let cluster = build_cluster_arc(&table_path, 1, 1); + let metadata = Arc::new(Metadata::new_for_test(cluster)); + let status = Arc::new(LogScannerStatus::new()); + let fetcher = LogFetcher::new( + table_info.clone(), + Arc::new(RpcClient::new()), + metadata, + status, + &Config::default(), + None, + test_scanner_metrics(&table_path), + )?; + + let bucket = TableBucket::new(1, 0); + let data = build_records(&table_info, Arc::new(table_path))?; + let log_records = LogRecordsBatches::new(data.clone()); + let row_type = Arc::new(table_info.get_row_type().clone()); + let read_context = ReadContext::new(to_arrow_schema(&row_type)?, row_type, false); + let mut completed: Box = Box::new(DefaultCompletedFetch::new( + bucket, + log_records, + data.len(), + read_context, + 0, + 0, + )); + + let records = fetcher.fetch_records_from_fetch(&mut completed, 10)?; + assert!(records.is_empty()); + assert!(completed.is_consumed()); + Ok(()) + } + + #[tokio::test] + async fn prepare_fetch_log_requests_skips_pending() -> Result<()> { + let table_path = TablePath::new("db".to_string(), "tbl".to_string()); + let table_info = build_table_info(table_path.clone(), 1, 1); + let cluster = build_cluster_arc(&table_path, 1, 1); + let metadata = Arc::new(Metadata::new_for_test(cluster)); + let status = Arc::new(LogScannerStatus::new()); + status.assign_scan_bucket(TableBucket::new(1, 0), 0); + let fetcher = LogFetcher::new( + table_info, + 
Arc::new(RpcClient::new()), + metadata, + status, + &Config::default(), + None, + test_scanner_metrics(&table_path), + )?; + + fetcher.nodes_with_pending_fetch_requests.lock().insert(1); + + let requests = fetcher.prepare_fetch_log_requests().await; + assert!(requests.is_empty()); + Ok(()) + } + + #[tokio::test] + async fn handle_fetch_response_sets_error() -> Result<()> { + let table_path = TablePath::new("db".to_string(), "tbl".to_string()); + let table_info = build_table_info(table_path.clone(), 1, 1); + let cluster = build_cluster_arc(&table_path, 1, 1); + let metadata = Arc::new(Metadata::new_for_test(cluster)); + let status = Arc::new(LogScannerStatus::new()); + status.assign_scan_bucket(TableBucket::new(1, 0), 5); + let fetcher = LogFetcher::new( + table_info.clone(), + Arc::new(RpcClient::new()), + metadata.clone(), + status.clone(), + &Config::default(), + None, + test_scanner_metrics(&table_path), + )?; + + let response = FetchLogResponse { + tables_resp: vec![PbFetchLogRespForTable { + table_id: 1, + buckets_resp: vec![PbFetchLogRespForBucket { + partition_id: None, + bucket_id: 0, + error_code: Some(FlussError::AuthorizationException.code()), + error_message: Some("denied".to_string()), + high_watermark: None, + log_start_offset: None, + remote_log_fetch_info: None, + records: None, + filtered_end_offset: None, + }], + }], + }; + + let response_context = FetchResponseContext { + metadata: metadata.clone(), + log_fetch_buffer: fetcher.log_fetch_buffer.clone(), + log_scanner_status: fetcher.log_scanner_status.clone(), + read_context: fetcher.read_context.clone(), + remote_read_context: fetcher.remote_read_context.clone(), + remote_log_downloader: fetcher.remote_log_downloader.clone(), + metrics: Arc::clone(&fetcher.metrics), + request_start_time: Instant::now(), + }; + + LogFetcher::handle_fetch_response(response, response_context).await; + + let completed = fetcher.log_fetch_buffer.poll().expect("completed fetch"); + let api_error = 
completed.api_error().expect("api error"); + assert_eq!(api_error.code, FlussError::AuthorizationException.code()); + Ok(()) + } + + #[tokio::test] + async fn handle_fetch_response_invalidates_table_meta() -> Result<()> { + let table_path = TablePath::new("db".to_string(), "tbl".to_string()); + let table_info = build_table_info(table_path.clone(), 1, 1); + let cluster = build_cluster_arc(&table_path, 1, 1); + let metadata = Arc::new(Metadata::new_for_test(cluster.clone())); + let status = Arc::new(LogScannerStatus::new()); + status.assign_scan_bucket(TableBucket::new(1, 0), 5); + let fetcher = LogFetcher::new( + table_info.clone(), + Arc::new(RpcClient::new()), + metadata.clone(), + status.clone(), + &Config::default(), + None, + test_scanner_metrics(&table_path), + )?; + + let bucket = TableBucket::new(1, 0); + assert!(metadata.leader_for(&table_path, &bucket).await?.is_some()); + + let response = FetchLogResponse { + tables_resp: vec![PbFetchLogRespForTable { + table_id: 1, + buckets_resp: vec![PbFetchLogRespForBucket { + partition_id: None, + bucket_id: 0, + error_code: Some(FlussError::NotLeaderOrFollower.code()), + error_message: Some("not leader".to_string()), + high_watermark: None, + log_start_offset: None, + remote_log_fetch_info: None, + records: None, + filtered_end_offset: None, + }], + }], + }; + + let response_context = FetchResponseContext { + metadata: metadata.clone(), + log_fetch_buffer: fetcher.log_fetch_buffer.clone(), + log_scanner_status: fetcher.log_scanner_status.clone(), + read_context: fetcher.read_context.clone(), + remote_read_context: fetcher.remote_read_context.clone(), + remote_log_downloader: fetcher.remote_log_downloader.clone(), + metrics: Arc::clone(&fetcher.metrics), + request_start_time: Instant::now(), + }; + + LogFetcher::handle_fetch_response(response, response_context).await; + + assert!(metadata.get_cluster().leader_for(&bucket).is_none()); + Ok(()) + } + + fn create_test_table_info( + has_primary_key: bool, + log_format: 
Option<&str>, + ) -> (TableInfo, TablePath) { + let mut schema_builder = Schema::builder() + .column("id", DataTypes::int()) + .column("name", DataTypes::string()); + + if has_primary_key { + schema_builder = schema_builder.primary_key(vec!["id"]); + } + + let schema = schema_builder.build().unwrap(); + let table_path = TablePath::new("test_db", "test_table"); + + let mut properties = HashMap::new(); + if let Some(format) = log_format { + properties.insert("table.log.format".to_string(), format.to_string()); + } + + let table_info = TableInfo::new( + table_path.clone(), + 1, + 1, + schema, + vec![], + Arc::from(vec![]), + 1, + properties, + HashMap::new(), + None, + 0, + 0, + ); + + (table_info, table_path) + } + + #[test] + fn test_validate_scan_support() { + // Primary key table + let (table_info, table_path) = create_test_table_info(true, Some("ARROW")); + let result = validate_scan_support(&table_path, &table_info); + + assert!(result.is_err()); + let err = result.unwrap_err(); + assert!(matches!(err, UnsupportedOperation { .. })); + assert!(err.to_string().contains( + format!("Table {table_path} is not a Log Table and doesn't support scan.").as_str() + )); + + // Indexed format + let (table_info, table_path) = create_test_table_info(false, Some("INDEXED")); + let result = validate_scan_support(&table_path, &table_info); + + assert!(result.is_err()); + let err = result.unwrap_err(); + assert!(matches!(err, UnsupportedOperation { .. 
})); + assert!(err.to_string().contains(format!("Scan is only supported for ARROW format and table {table_path} uses INDEXED format").as_str())); + + // Default format + let (table_info, table_path) = create_test_table_info(false, None); + let result = validate_scan_support(&table_path, &table_info); + assert!(result.is_ok()); + + // Arrow format + let (table_info, table_path) = create_test_table_info(false, Some("ARROW")); + let result = validate_scan_support(&table_path, &table_info); + assert!(result.is_ok()); + } + + #[tokio::test] + async fn prepare_fetch_log_requests_uses_configured_fetch_params() -> Result<()> { + let table_path = TablePath::new("db".to_string(), "tbl".to_string()); + let table_info = build_table_info(table_path.clone(), 1, 1); + let cluster = build_cluster_arc(&table_path, 1, 1); + let metadata = Arc::new(Metadata::new_for_test(cluster)); + let status = Arc::new(LogScannerStatus::new()); + status.assign_scan_bucket(TableBucket::new(1, 0), 0); + + let config = Config { + scanner_log_fetch_max_bytes: 1234, + scanner_log_fetch_min_bytes: 7, + scanner_log_fetch_wait_max_time_ms: 89, + scanner_log_fetch_max_bytes_for_bucket: 512, + ..Config::default() + }; + + let fetcher = LogFetcher::new( + table_info, + Arc::new(RpcClient::new()), + metadata, + status, + &config, + None, + test_scanner_metrics(&table_path), + )?; + + let requests = fetcher.prepare_fetch_log_requests().await; + // In this test cluster, leader id should exist; but even if it changes, + // assert over all built requests. 
+ assert!(!requests.is_empty()); + for req in requests.values() { + assert_eq!(req.max_bytes, 1234); + assert_eq!(req.min_bytes, Some(7)); + assert_eq!(req.max_wait_ms, Some(89)); + + for table_req in &req.tables_req { + for bucket_req in &table_req.buckets_req { + assert_eq!(bucket_req.max_fetch_bytes, 512); + } + } + } + Ok(()) + } + + /// Builds a self-contained `LogScannerInner` for poll-timing tests + /// inside a `current_thread` runtime so callers can drive `PollGuard` + /// lifecycles synchronously. + fn with_test_log_scanner_inner(body: F) { + let rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .expect("build current_thread runtime"); + rt.block_on(async { + let table_path = TablePath::new("db".to_string(), "tbl".to_string()); + let table_info = build_table_info(table_path.clone(), 1, 1); + let cluster = build_cluster_arc(&table_path, 1, 1); + let metadata = Arc::new(Metadata::new_for_test(cluster)); + let inner = LogScannerInner::new( + &table_info, + metadata, + Arc::new(RpcClient::new()), + &Config::default(), + None, + ) + .expect("build LogScannerInner"); + body(&inner); + }); + } + + fn snapshot_gauge( + snapshotter: &metrics_util::debugging::Snapshotter, + name: &str, + ) -> Option { + use metrics_util::debugging::DebugValue; + snapshotter + .snapshot() + .into_vec() + .into_iter() + .find_map(|(key, _, _, val)| { + if key.key().name() == name { + if let DebugValue::Gauge(g) = val { + return Some(g.into_inner()); + } + } + None + }) + } + + /// Exercises the `PollGuard` lifecycle across two consecutive + /// `record_poll_start` calls. Asserts both poll-timing gauges are + /// emitted at the right moments and `record_poll_end` runs on guard + /// drop (also the cancellation-safety path, since dropping the + /// `poll()` future drops the guard). 
+ #[test] + fn poll_guard_emits_time_between_poll_and_idle_ratio() { + use crate::metrics::{SCANNER_POLL_IDLE_RATIO, SCANNER_TIME_BETWEEN_POLL_MS}; + use metrics_util::debugging::DebuggingRecorder; + + let recorder = DebuggingRecorder::new(); + let snapshotter = recorder.snapshotter(); + + metrics::with_local_recorder(&recorder, || { + with_test_log_scanner_inner(|inner| { + // First poll: emits time_between_poll_ms=0 (Java parity: + // ScannerMetricGroup.recordPollStart emits 0 when there is + // no previous poll). Idle ratio is also emitted as 1.0 + // on drop (poll_time / (poll_time + 0) = 1.0). + { + let _g = PollGuard::new(inner); + std::thread::sleep(std::time::Duration::from_millis(5)); + } + + // Brief gap so time_between_poll_ms is observably > 0. + std::thread::sleep(std::time::Duration::from_millis(5)); + + // Second poll: refreshes both time_between_poll_ms (>0) + // and a fresh idle ratio. + { + let _g = PollGuard::new(inner); + std::thread::sleep(std::time::Duration::from_millis(5)); + } + }); + }); + + let between = snapshot_gauge(&snapshotter, SCANNER_TIME_BETWEEN_POLL_MS) + .expect("time_between_poll_ms must be emitted on every poll"); + assert!( + between > 0.0, + "second-poll time_between_poll_ms must be positive, got {between}" + ); + + let ratio = snapshot_gauge(&snapshotter, SCANNER_POLL_IDLE_RATIO) + .expect("poll_idle_ratio must be emitted on poll end"); + assert!( + (0.0..=1.0).contains(&ratio), + "poll_idle_ratio must be in [0, 1], got {ratio}" + ); + + // Both gauges must carry `database=db` / `table=tbl` (the fixture + // values from `with_test_log_scanner_inner`). + assert_scanner_entries_labeled(&snapshotter.snapshot().into_vec(), "db", "tbl"); + } + + /// Java parity: `ScannerMetricGroup.recordPollStart` emits + /// `timeMsBetweenPoll = 0` on the very first poll. The Rust gauge + /// must do the same so dashboards see the metric series from poll #1. 
+ #[test] + fn time_between_poll_ms_emits_zero_on_first_poll() { + use crate::metrics::SCANNER_TIME_BETWEEN_POLL_MS; + use metrics_util::debugging::DebuggingRecorder; + + let recorder = DebuggingRecorder::new(); + let snapshotter = recorder.snapshotter(); + + metrics::with_local_recorder(&recorder, || { + with_test_log_scanner_inner(|inner| { + let _g = PollGuard::new(inner); + // Drop at end of scope completes the poll; the value of + // SCANNER_TIME_BETWEEN_POLL_MS was emitted at start, not end. + }); + }); + + let between = snapshot_gauge(&snapshotter, SCANNER_TIME_BETWEEN_POLL_MS) + .expect("time_between_poll_ms must be emitted on the first poll"); + assert_eq!( + between, 0.0, + "first-poll time_between_poll_ms must be 0.0 (Java parity), got {between}" + ); + assert_scanner_entries_labeled(&snapshotter.snapshot().into_vec(), "db", "tbl"); + } + + /// Pins the single-consumer contract: overlapping `PollGuard`s on the + /// same scanner trip the `debug_assert!` in `record_poll_start`. + /// Release builds skip the check, so the test is gated on + /// `debug_assertions`. + #[cfg(debug_assertions)] + #[test] + #[should_panic(expected = "concurrent poll() detected")] + fn overlapping_polls_panic_in_debug_builds() { + with_test_log_scanner_inner(|inner| { + let _g1 = PollGuard::new(inner); + // _g1 has not been dropped → poll_start_at is still Some, + // so the second start must panic. + let _g2 = PollGuard::new(inner); + }); + } + + /// Drives `handle_fetch_response` against a local metrics recorder and + /// asserts that latency + bytes-per-request histograms are emitted with + /// values that mirror what Java would record. This complements the unit + /// tests in `metrics.rs` (which only verify the facade) by exercising + /// the actual instrumented call path. 
+ /// + /// Note: uses a `current_thread` runtime inside `with_local_recorder` + /// (rather than `#[tokio::test]`) because the metrics facade installs a + /// thread-local recorder; running the async work on the same thread is + /// the only way to observe the emitted metrics in the snapshot. Both + /// the fetcher construction and the `handle_fetch_response` call run + /// inside the runtime (the security-token manager and remote-log + /// downloader require a Tokio reactor). + #[test] + fn handle_fetch_response_emits_latency_and_bytes_metrics() { + use crate::metrics::{SCANNER_BYTES_PER_REQUEST, SCANNER_FETCH_LATENCY_MS}; + use metrics_util::debugging::{DebugValue, DebuggingRecorder}; + + let recorder = DebuggingRecorder::new(); + let snapshotter = recorder.snapshotter(); + + let expected_bytes = metrics::with_local_recorder(&recorder, || { + let rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .expect("build current_thread runtime"); + + rt.block_on(async { + let table_path = TablePath::new("db".to_string(), "tbl".to_string()); + let table_info = build_table_info(table_path.clone(), 1, 1); + let cluster = build_cluster_arc(&table_path, 1, 1); + let metadata = Arc::new(Metadata::new_for_test(cluster)); + let status = Arc::new(LogScannerStatus::new()); + status.assign_scan_bucket(TableBucket::new(1, 0), 5); + let fetcher = LogFetcher::new( + table_info, + Arc::new(RpcClient::new()), + metadata.clone(), + status, + &Config::default(), + None, + test_scanner_metrics(&table_path), + ) + .expect("build LogFetcher"); + + let response = FetchLogResponse { + tables_resp: vec![PbFetchLogRespForTable { + table_id: 1, + buckets_resp: vec![PbFetchLogRespForBucket { + partition_id: None, + bucket_id: 0, + error_code: Some(FlussError::None.code()), + error_message: None, + high_watermark: Some(7), + log_start_offset: Some(0), + remote_log_fetch_info: None, + records: None, + filtered_end_offset: None, + }], + }], + }; + let expected_bytes = 
response.encoded_len() as f64; + let response_context = FetchResponseContext { + metadata: metadata.clone(), + log_fetch_buffer: fetcher.log_fetch_buffer.clone(), + log_scanner_status: fetcher.log_scanner_status.clone(), + read_context: fetcher.read_context.clone(), + remote_read_context: fetcher.remote_read_context.clone(), + remote_log_downloader: fetcher.remote_log_downloader.clone(), + metrics: Arc::clone(&fetcher.metrics), + request_start_time: Instant::now(), + }; + + LogFetcher::handle_fetch_response(response, response_context).await; + expected_bytes + }) + }); + + let entries: Vec<_> = snapshotter.snapshot().into_vec(); + let find_histogram = |name: &str| -> Vec { + entries + .iter() + .find_map(|(key, _, _, val)| { + if key.key().name() == name { + if let DebugValue::Histogram(v) = val { + return Some(v.iter().map(|f| f.into_inner()).collect()); + } + } + None + }) + .unwrap_or_default() + }; + + let latency_samples = find_histogram(SCANNER_FETCH_LATENCY_MS); + assert_eq!(latency_samples.len(), 1, "expected one latency sample"); + assert!( + latency_samples[0] >= 0.0, + "latency must be non-negative, got {}", + latency_samples[0] + ); + + let bytes_samples = find_histogram(SCANNER_BYTES_PER_REQUEST); + assert_eq!( + bytes_samples, + vec![expected_bytes], + "bytes histogram must record encoded_len() for parity with Java fetchLogResponse.totalSize()", + ); + + // Every emitted scanner metric must carry both `database` and `table` + // labels — that's the whole point of `ScannerMetrics`. If a future + // contributor adds a new `metrics::*!` macro inline (bypassing + // `ScannerMetrics`), this assertion catches it. 
+ assert_scanner_entries_labeled(&entries, "db", "tbl"); + } +} diff --git a/fluss-rust/crates/fluss/src/client/table/upsert.rs b/fluss-rust/crates/fluss/src/client/table/upsert.rs new file mode 100644 index 0000000000..52ec37b37b --- /dev/null +++ b/fluss-rust/crates/fluss/src/client/table/upsert.rs @@ -0,0 +1,560 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::client::{RowBytes, WriteFormat, WriteRecord, WriteResultFuture, WriterClient}; +use crate::error::Error::{IllegalArgument, UnexpectedError}; +use crate::error::Result; +use crate::metadata::{RowType, TableInfo, TablePath}; +use crate::row::InternalRow; +use crate::row::encode::{KeyEncoder, KeyEncoderFactory, RowEncoder, RowEncoderFactory}; +use crate::row::field_getter::FieldGetter; +use std::sync::{Arc, Mutex}; + +use crate::client::table::partition_getter::{PartitionGetter, get_physical_path}; +use bitvec::prelude::bitvec; +use bytes::Bytes; + +#[allow(dead_code)] +pub struct TableUpsert { + table_path: TablePath, + table_info: TableInfo, + writer_client: Arc, + target_columns: Option>>, +} + +#[allow(dead_code)] +impl TableUpsert { + pub fn new( + table_path: TablePath, + table_info: TableInfo, + writer_client: Arc, + ) -> Self { + Self { + table_path, + table_info, + writer_client, + target_columns: None, + } + } + + pub fn partial_update(&self, target_columns: Option>) -> Result { + if let Some(columns) = &target_columns { + let num_columns = self.table_info.row_type().fields().len(); + + if let Some(&invalid_column) = columns.iter().find(|&&col| col >= num_columns) { + return Err(IllegalArgument { + message: format!( + "Invalid target column index: {invalid_column} for table {}. 
The table only has {num_columns} columns.", + self.table_path + ), + }); + } + } + + Ok(Self { + table_path: self.table_path.clone(), + table_info: self.table_info.clone(), + writer_client: self.writer_client.clone(), + target_columns: target_columns.map(Arc::new), + }) + } + + pub fn partial_update_with_column_names(&self, target_column_names: &[&str]) -> Result { + let row_type = self.table_info.row_type(); + let col_indices: Vec<(&str, Option)> = target_column_names + .iter() + .map(|col_name| (*col_name, row_type.get_field_index(col_name))) + .collect(); + + if let Some((missing_name, _)) = col_indices.iter().find(|(_, ix)| ix.is_none()) { + return Err(IllegalArgument { + message: format!( + "Cannot find target column `{}` for table {}.", + missing_name, self.table_path + ), + }); + } + + let valid_col_indices: Vec = col_indices + .into_iter() + .map(|(_, index)| index.unwrap()) + .collect(); + + self.partial_update(Some(valid_col_indices)) + } + + pub fn create_writer(&self) -> Result { + UpsertWriterFactory::create( + Arc::new(self.table_path.clone()), + Arc::new(self.table_info.clone()), + self.target_columns.clone(), + Arc::clone(&self.writer_client), + ) + } +} + +pub struct UpsertWriter { + table_path: Arc, + writer_client: Arc, + partition_field_getter: Option, + primary_key_encoder: Mutex>, + target_columns: Option>>, + // Use primary key encoder as bucket key encoder when None + bucket_key_encoder: Option>>, + write_format: WriteFormat, + row_encoder: Mutex>, + field_getters: Box<[FieldGetter]>, + table_info: Arc, +} + +struct UpsertWriterFactory; + +impl UpsertWriterFactory { + pub fn create( + table_path: Arc, + table_info: Arc, + partial_update_columns: Option>>, + writer_client: Arc, + ) -> Result { + let data_lake_format = &table_info.table_config.get_datalake_format()?; + let row_type = table_info.row_type(); + let physical_pks = table_info.get_physical_primary_keys(); + + let names = table_info.get_schema().auto_increment_col_names(); + + 
Self::sanity_check( + row_type, + &table_info.primary_keys, + names, + &partial_update_columns, + )?; + + let primary_key_encoder = KeyEncoderFactory::of(row_type, physical_pks, data_lake_format)?; + let bucket_key_encoder = if !table_info.is_default_bucket_key() { + Some(KeyEncoderFactory::of( + row_type, + table_info.get_bucket_keys(), + data_lake_format, + )?) + } else { + // Defaults to using primary key encoder when None for bucket key + None + }; + + let kv_format = table_info.get_table_config().get_kv_format()?; + let write_format = WriteFormat::from_kv_format(&kv_format)?; + + let field_getters = FieldGetter::create_field_getters(row_type); + + let partition_field_getter = if table_info.is_partitioned() { + Some(PartitionGetter::new( + row_type, + Arc::clone(table_info.get_partition_keys()), + )?) + } else { + None + }; + + Ok(UpsertWriter { + table_path, + partition_field_getter, + writer_client, + primary_key_encoder: Mutex::new(primary_key_encoder), + target_columns: partial_update_columns, + bucket_key_encoder: bucket_key_encoder.map(Mutex::new), + write_format, + row_encoder: Mutex::new(Box::new(RowEncoderFactory::create( + kv_format, + row_type.clone(), + )?)), + field_getters, + table_info: table_info.clone(), + }) + } + + #[allow(dead_code)] + fn sanity_check( + row_type: &RowType, + primary_keys: &Vec, + auto_increment_col_names: &Vec, + target_columns: &Option>>, + ) -> Result<()> { + if target_columns.is_none() { + if !auto_increment_col_names.is_empty() { + return Err(IllegalArgument { + message: format!( + "This table has auto increment column {}. Explicitly specifying values for an auto increment column is not allowed. 
Please Specify non-auto-increment columns as target columns using partialUpdate first.", + auto_increment_col_names.join(", ") + ), + }); + } + return Ok(()); + } + + let field_count = row_type.fields().len(); + + let mut target_column_set = bitvec![0; field_count]; + + let columns = target_columns.as_ref().unwrap().as_ref(); + + for &target_index in columns { + target_column_set.set(target_index, true); + } + + let mut pk_column_set = bitvec![0; field_count]; + + // check the target columns contains the primary key + for primary_key in primary_keys { + let pk_index = row_type.get_field_index(primary_key.as_str()); + match pk_index { + Some(pk_index) => { + if !target_column_set[pk_index] { + return Err(IllegalArgument { + message: format!( + "The target write columns {} must contain the primary key columns {}", + row_type.project(columns)?.get_field_names().join(", "), + primary_keys.join(", ") + ), + }); + } + pk_column_set.set(pk_index, true); + } + None => { + return Err(IllegalArgument { + message: format!( + "The specified primary key {primary_key} is not in row type {row_type}" + ), + }); + } + } + } + + let mut auto_increment_column_set = bitvec![0; field_count]; + // explicitly specifying values for an auto increment column is not allowed + for auto_increment_col_name in auto_increment_col_names { + let auto_increment_field_index = + row_type.get_field_index(auto_increment_col_name.as_str()); + + if let Some(index) = auto_increment_field_index { + if target_column_set[index] { + return Err(IllegalArgument { + message: format!( + "Explicitly specifying values for the auto increment column {auto_increment_col_name} is not allowed." 
+ ), + }); + } + + auto_increment_column_set.set(index, true); + } + } + + // check the columns not in targetColumns should be nullable + for i in 0..field_count { + // column not in primary key and not in auto increment column + if !pk_column_set[i] && !auto_increment_column_set[i] { + // the column should be nullable + if !row_type.fields().get(i).unwrap().data_type.is_nullable() { + return Err(IllegalArgument { + message: format!( + "Partial Update requires all columns except primary key to be nullable, but column {} is NOT NULL.", + row_type.fields().get(i).unwrap().name() + ), + }); + } + } + } + + Ok(()) + } +} + +impl UpsertWriter { + fn check_field_count(&self, row: &R) -> Result<()> { + let expected = self.table_info.get_row_type().fields().len(); + if row.get_field_count() != expected { + return Err(IllegalArgument { + message: format!( + "The field count of the row does not match the table schema. Expected: {}, Actual: {}", + expected, + row.get_field_count() + ), + }); + } + Ok(()) + } + + fn get_keys(&self, row: &dyn InternalRow) -> Result<(Bytes, Option)> { + let key = self + .primary_key_encoder + .lock() + .map_err(|e| UnexpectedError { + message: format!("primary_key_encoder lock poisoned: {e}"), + source: None, + })? + .encode_key(row)?; + let bucket_key = match &self.bucket_key_encoder { + Some(encoder) => Some( + encoder + .lock() + .map_err(|e| UnexpectedError { + message: format!("bucket_key_encoder lock poisoned: {e}"), + source: None, + })? 
+ .encode_key(row)?, + ), + None => Some(key.clone()), + }; + Ok((key, bucket_key)) + } + + fn encode_row(&self, row: &R) -> Result { + let mut encoder = self.row_encoder.lock().map_err(|e| UnexpectedError { + message: format!("row_encoder lock poisoned: {e}"), + source: None, + })?; + encoder.start_new_row()?; + for (pos, field_getter) in self.field_getters.iter().enumerate() { + let datum = field_getter.get_field(row)?; + encoder.encode_field(pos, datum)?; + } + encoder.finish_row() + } + + /// Flush data written that have not yet been sent to the server, forcing the client to send the + /// requests to server and blocks on the completion of the requests associated with these + /// records. A request is considered completed when it is successfully acknowledged according to + /// the CLIENT_WRITER_ACKS configuration option you have specified or else it + /// results in an error. + pub async fn flush(&self) -> Result<()> { + self.writer_client.flush().await + } + + /// Inserts row into Fluss table if they do not already exist, or updates them if they do exist. + /// + /// This method returns a [`WriteResultFuture`] immediately after queueing the write, + /// enabling fire-and-forget semantics for efficient batching. + /// + /// # Arguments + /// * row - the row to upsert. + /// + /// # Returns + /// A [`WriteResultFuture`] that can be awaited to wait for server acknowledgment, + /// or dropped for fire-and-forget behavior (use `flush()` to ensure delivery). 
+ pub fn upsert(&self, row: &R) -> Result { + self.check_field_count(row)?; + + let (key, bucket_key) = self.get_keys(row)?; + + let row_bytes: RowBytes<'_> = match row.as_encoded_bytes(self.write_format) { + Some(bytes) => RowBytes::Borrowed(bytes), + None => RowBytes::Owned(self.encode_row(row)?), + }; + + let write_record = WriteRecord::for_upsert( + Arc::clone(&self.table_info), + Arc::new(get_physical_path( + &self.table_path, + self.partition_field_getter.as_ref(), + row, + )?), + self.table_info.schema_id, + key, + bucket_key, + self.write_format, + self.target_columns.clone(), + Some(row_bytes), + ); + + let result_handle = self.writer_client.send(&write_record)?; + Ok(WriteResultFuture::new(result_handle)) + } + + /// Delete certain row by the input row in Fluss table, the input row must contain the primary + /// key. + /// + /// This method returns a [`WriteResultFuture`] immediately after queueing the delete, + /// enabling fire-and-forget semantics for efficient batching. + /// + /// # Arguments + /// * row - the row to delete (must contain the primary key fields). + /// + /// # Returns + /// A [`WriteResultFuture`] that can be awaited to wait for server acknowledgment, + /// or dropped for fire-and-forget behavior (use `flush()` to ensure delivery). 
+ pub fn delete(&self, row: &R) -> Result { + self.check_field_count(row)?; + + let (key, bucket_key) = self.get_keys(row)?; + + let write_record = WriteRecord::for_upsert( + Arc::clone(&self.table_info), + Arc::new(get_physical_path( + &self.table_path, + self.partition_field_getter.as_ref(), + row, + )?), + self.table_info.schema_id, + key, + bucket_key, + self.write_format, + self.target_columns.clone(), + None, + ); + + let result_handle = self.writer_client.send(&write_record)?; + Ok(WriteResultFuture::new(result_handle)) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::metadata::{DataField, DataTypes}; + + #[test] + fn sanity_check() { + // No target columns specified but table has auto-increment column + let fields = vec![ + DataField::new("id", DataTypes::int().as_non_nullable(), None), + DataField::new("name", DataTypes::string(), None), + ]; + let row_type = RowType::new(fields); + let primary_keys = vec!["id".to_string()]; + let auto_increment_col_names = vec!["id".to_string()]; + let target_columns = None; + + let result = UpsertWriterFactory::sanity_check( + &row_type, + &primary_keys, + &auto_increment_col_names, + &target_columns, + ); + + assert!(result.unwrap_err().to_string().contains( + "This table has auto increment column id. Explicitly specifying values for an auto increment column is not allowed. Please Specify non-auto-increment columns as target columns using partialUpdate first." 
+ )); + + // Target columns do not contain primary key + let fields = vec![ + DataField::new("id", DataTypes::int().as_non_nullable(), None), + DataField::new("name", DataTypes::string(), None), + DataField::new("value", DataTypes::int(), None), + ]; + let row_type = RowType::new(fields); + let primary_keys = vec!["id".to_string()]; + let auto_increment_col_names = vec![]; + let target_columns = Some(Arc::new(vec![1usize])); + + let result = UpsertWriterFactory::sanity_check( + &row_type, + &primary_keys, + &auto_increment_col_names, + &target_columns, + ); + + assert!( + result + .unwrap_err() + .to_string() + .contains("The target write columns name must contain the primary key columns id") + ); + + // Primary key column not found in row type + let fields = vec![ + DataField::new("id", DataTypes::int().as_non_nullable(), None), + DataField::new("name", DataTypes::string(), None), + ]; + let row_type = RowType::new(fields); + let primary_keys = vec!["nonexistent_pk".to_string()]; + let auto_increment_col_names = vec![]; + let target_columns = Some(Arc::new(vec![0usize, 1])); + + let result = UpsertWriterFactory::sanity_check( + &row_type, + &primary_keys, + &auto_increment_col_names, + &target_columns, + ); + + assert!( + result + .unwrap_err() + .to_string() + .contains("The specified primary key nonexistent_pk is not in row type") + ); + + // Target columns include auto-increment column + let fields = vec![ + DataField::new("id", DataTypes::int().as_non_nullable(), None), + DataField::new("seq", DataTypes::bigint().as_non_nullable(), None), + DataField::new("name", DataTypes::string(), None), + ]; + let row_type = RowType::new(fields); + let primary_keys = vec!["id".to_string()]; + let auto_increment_col_names = vec!["seq".to_string()]; + let target_columns = Some(Arc::new(vec![0usize, 1, 2])); + + let result = UpsertWriterFactory::sanity_check( + &row_type, + &primary_keys, + &auto_increment_col_names, + &target_columns, + ); + + 
assert!(result.unwrap_err().to_string().contains( + "Explicitly specifying values for the auto increment column seq is not allowed." + )); + + // Non-nullable column not in target columns (partial update requires nullable) + let fields = vec![ + DataField::new("id", DataTypes::int().as_non_nullable(), None), + DataField::new( + "required_field", + DataTypes::string().as_non_nullable(), + None, + ), + DataField::new("optional_field", DataTypes::int(), None), + ]; + let row_type = RowType::new(fields); + let primary_keys = vec!["id".to_string()]; + let auto_increment_col_names = vec![]; + let target_columns = Some(Arc::new(vec![0usize])); + + let result = UpsertWriterFactory::sanity_check( + &row_type, + &primary_keys, + &auto_increment_col_names, + &target_columns, + ); + + assert!(result.unwrap_err().to_string().contains( + "Partial Update requires all columns except primary key to be nullable, but column required_field is NOT NULL." + )); + } +} + +/// The result of upserting a record +/// Currently this is an empty struct to allow for compatible evolution in the future +#[derive(Default)] +#[allow(dead_code)] +pub struct UpsertResult; + +/// The result of deleting a record +/// Currently this is an empty struct to allow for compatible evolution in the future +#[derive(Default)] +#[allow(dead_code)] +pub struct DeleteResult; diff --git a/fluss-rust/crates/fluss/src/client/write/accumulator.rs b/fluss-rust/crates/fluss/src/client/write/accumulator.rs new file mode 100644 index 0000000000..244edf7399 --- /dev/null +++ b/fluss-rust/crates/fluss/src/client/write/accumulator.rs @@ -0,0 +1,1759 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::client::broadcast; +use crate::client::write::IdempotenceManager; +use crate::client::write::batch::WriteBatch::{ArrowLog, Kv}; +use crate::client::write::batch::{ArrowLogWriteBatch, KvWriteBatch, WriteBatch}; +use crate::client::write::dynamic_batch_size::DynamicWriteBatchSizeEstimator; +use crate::client::{LogWriteRecord, Record, ResultHandle, WriteRecord}; +use crate::cluster::{BucketLocation, Cluster, ServerNode}; +use crate::compression::ArrowCompressionRatioEstimator; +use crate::config::Config; +use crate::error::{Error, Result}; +use crate::metadata::{PhysicalTablePath, TableBucket}; +use crate::record::{NO_BATCH_SEQUENCE, NO_WRITER_ID}; +use crate::util::current_time_ms; +use crate::{BucketId, PartitionId, TableId}; +use dashmap::DashMap; +use parking_lot::{Condvar, Mutex, RwLock}; +use std::collections::{HashMap, HashSet, VecDeque}; +use std::sync::Arc; +use std::sync::atomic::{AtomicBool, AtomicI32, AtomicI64, AtomicUsize, Ordering}; +use std::time::{Duration, Instant}; +use tokio::sync::Notify; + +/// Byte-counting semaphore that blocks producers when total buffered memory +/// exceeds the configured limit. Matches Java's `LazyMemorySegmentPool` behavior. +/// +/// TODO: Replace `notify_all()` with per-waiter FIFO signaling (Java uses per-request +/// Condition objects in a Deque) to avoid thundering herd under high contention. 
+/// +/// TODO: Track actual batch memory usage instead of reserving a fixed `writer_batch_size` +/// per batch. This over-counts when batches don't fill completely, reducing effective +/// throughput. Requires tighter coupling with batch internals. +pub(crate) struct MemoryLimiter { + state: Mutex, + cond: Condvar, + max_memory: usize, + wait_timeout: Duration, + closed: AtomicBool, + waiting_count: AtomicUsize, +} + +impl MemoryLimiter { + pub fn new(max_memory: usize, wait_timeout: Duration) -> Self { + Self { + state: Mutex::new(0), + cond: Condvar::new(), + max_memory, + wait_timeout, + closed: AtomicBool::new(false), + waiting_count: AtomicUsize::new(0), + } + } + + /// Try to acquire `size` bytes. Blocks until memory is available, + /// the timeout expires, or the limiter is closed. + /// Returns a `MemoryPermit` on success. + pub fn acquire(self: &Arc, size: usize) -> Result { + if self.closed.load(Ordering::Acquire) { + return Err(Error::WriterClosed { + message: "Memory limiter is closed".to_string(), + }); + } + + if size > self.max_memory { + return Err(Error::IllegalArgument { + message: format!( + "Batch size {} exceeds total buffer memory limit {}", + size, self.max_memory + ), + }); + } + + let mut used = self.state.lock(); + let deadline = Instant::now() + self.wait_timeout; + while *used + size > self.max_memory { + self.waiting_count.fetch_add(1, Ordering::Relaxed); + let result = self.cond.wait_until(&mut used, deadline); + self.waiting_count.fetch_sub(1, Ordering::Relaxed); + + if self.closed.load(Ordering::Acquire) { + return Err(Error::WriterClosed { + message: "Memory limiter is closed".to_string(), + }); + } + if result.timed_out() && *used + size > self.max_memory { + return Err(Error::BufferExhausted { + message: format!( + "Failed to allocate {} bytes for write batch within {}ms. 
\ + {} of {} bytes in use, {} threads waiting.", + size, + self.wait_timeout.as_millis(), + *used, + self.max_memory, + self.waiting_count.load(Ordering::Relaxed), + ), + }); + } + } + + *used += size; + Ok(MemoryPermit { + limiter: Arc::clone(self), + size, + }) + } + + fn release(&self, size: usize) { + let mut used = self.state.lock(); + *used = used.saturating_sub(size); + self.cond.notify_all(); + } + + /// Returns true if any producers are currently blocked waiting for memory. + /// Used by `ready()` to mark all batches as immediately sendable when + /// memory is exhausted (matching Java's `exhausted` flag). + pub fn has_waiters(&self) -> bool { + self.waiting_count.load(Ordering::Relaxed) > 0 + } + + /// Mark the limiter as closed and wake all blocked producers. + fn close(&self) { + self.closed.store(true, Ordering::Release); + self.cond.notify_all(); + } +} + +/// RAII guard that releases memory back to the `MemoryLimiter` on drop. +pub(crate) struct MemoryPermit { + limiter: Arc, + size: usize, +} + +impl std::fmt::Debug for MemoryPermit { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("MemoryPermit") + .field("size", &self.size) + .finish_non_exhaustive() + } +} + +impl Drop for MemoryPermit { + fn drop(&mut self) { + if self.size > 0 { + self.limiter.release(self.size); + } + } +} + +// Type alias to simplify complex nested types +type BucketBatches = Vec<(BucketId, Arc>>)>; + +#[allow(dead_code)] +pub struct RecordAccumulator { + config: Config, + write_batches: DashMap, BucketAndWriteBatches>, + // batch_id -> (complete callback, memory permit) + incomplete_batches: RwLock>, + batch_timeout_ms: i64, + closed: AtomicBool, + flushes_in_progress: AtomicI32, + appends_in_progress: i32, + nodes_drain_index: Mutex>, + batch_id: AtomicI64, + idempotence_manager: Arc, + memory_limiter: Arc, + /// Wakes the sender task when new batches are created or existing batches + /// become full, so the sender can drain them 
immediately instead of waiting + /// for its next poll cycle. This is the Rust equivalent of Java's + /// `Sender.wakeup()` / Kafka's `RecordAccumulator.wakeup()`. + sender_wakeup: Notify, +} + +impl RecordAccumulator { + pub fn new(config: Config, idempotence_manager: Arc) -> Self { + let batch_timeout_ms = config.writer_batch_timeout_ms; + let memory_limiter = Arc::new(MemoryLimiter::new( + config.writer_buffer_memory_size, + Duration::from_millis(config.writer_buffer_wait_timeout_ms), + )); + RecordAccumulator { + config, + write_batches: Default::default(), + incomplete_batches: Default::default(), + batch_timeout_ms, + closed: Default::default(), + flushes_in_progress: Default::default(), + appends_in_progress: Default::default(), + nodes_drain_index: Default::default(), + batch_id: Default::default(), + idempotence_manager, + memory_limiter, + sender_wakeup: Notify::new(), + } + } + + fn try_append( + &self, + record: &WriteRecord, + dq: &mut VecDeque, + ) -> Result> { + let dq_size = dq.len(); + if let Some(last_batch) = dq.back_mut() { + return if let Some(result_handle) = last_batch.try_append(record)? 
{ + Ok(Some(RecordAppendResult::new( + result_handle, + dq_size > 1 || last_batch.is_closed(), + false, + false, + ))) + } else { + Ok(None) + }; + } + Ok(None) + } + + fn append_new_batch( + &self, + cluster: &Cluster, + record: &WriteRecord, + dq: &mut VecDeque, + permit: MemoryPermit, + alloc_size: usize, + compression_ratio_estimator: Arc, + ) -> Result { + let physical_table_path = &record.physical_table_path; + let table_path = physical_table_path.get_table_path(); + let table_info = cluster.get_table(table_path)?; + let arrow_compression_info = table_info.get_table_config().get_arrow_compression_info()?; + let row_type = &table_info.row_type; + + let schema_id = table_info.schema_id; + + let mut batch: WriteBatch = match record.record() { + Record::Log(_) => ArrowLog(ArrowLogWriteBatch::new( + self.batch_id.fetch_add(1, Ordering::Relaxed), + Arc::clone(physical_table_path), + schema_id, + arrow_compression_info, + row_type, + current_time_ms(), + matches!(&record.record, Record::Log(LogWriteRecord::RecordBatch(_))), + alloc_size, + compression_ratio_estimator, + )?), + Record::Kv(kv_record) => Kv(KvWriteBatch::new( + self.batch_id.fetch_add(1, Ordering::Relaxed), + Arc::clone(physical_table_path), + schema_id, + alloc_size, + record.write_format.to_kv_format()?, + kv_record.target_columns.clone(), + current_time_ms(), + )), + }; + + let batch_id = batch.batch_id(); + + let result_handle = batch + .try_append(record)? 
+ .expect("must append to a new batch"); + + let batch_is_closed = batch.is_closed(); + dq.push_back(batch); + + self.incomplete_batches + .write() + .insert(batch_id, (result_handle.clone(), permit)); + Ok(RecordAppendResult::new( + result_handle, + dq.len() > 1 || batch_is_closed, + true, + false, + )) + } + + pub fn append( + &self, + record: &WriteRecord<'_>, + bucket_id: BucketId, + cluster: &Cluster, + abort_if_batch_full: bool, + ) -> Result { + let physical_table_path = &record.physical_table_path; + let table_path = physical_table_path.get_table_path(); + let table_info = cluster.get_table(table_path)?; + let is_partitioned_table = table_info.is_partitioned(); + + let partition_id = if is_partitioned_table { + cluster.get_partition_id(physical_table_path) + } else { + None + }; + + let (dq, compression_ratio_estimator, dynamic_target) = { + let mut binding = self + .write_batches + .entry(Arc::clone(physical_table_path)) + .or_insert_with(|| { + BucketAndWriteBatches::new( + table_info.table_id, + is_partitioned_table, + partition_id, + &self.config, + ) + }); + let bucket_and_batches = binding.value_mut(); + let dq = bucket_and_batches + .batches + .entry(bucket_id) + .or_insert_with(|| Arc::new(Mutex::new(VecDeque::new()))) + .clone(); + let dynamic_target = bucket_and_batches + .dynamic_batch_size + .as_ref() + .map(|est| est.current()); + ( + dq, + Arc::clone(&bucket_and_batches.compression_ratio_estimator), + dynamic_target, + ) + }; + + let mut dq_guard = dq.lock(); + if let Some(append_result) = self.try_append(record, &mut dq_guard)? { + return Ok(append_result); + } + + if abort_if_batch_full { + return Ok(RecordAppendResult::new_without_result_handle( + true, false, true, + )); + } + + // Drop dq lock before blocking on memory to prevent deadlock: + // producer holds dq + blocks on memory, while sender needs dq to drain. 
+ drop(dq_guard); + + let batch_size = dynamic_target.unwrap_or(self.config.writer_batch_size as usize); + let record_size = record.estimated_record_size(); + let alloc_size = batch_size.max(record_size); + let permit = self.memory_limiter.acquire(alloc_size)?; + + // Re-acquire dq lock after memory is available + let mut dq_guard = dq.lock(); + // Re-try: another thread may have created a batch while we waited + if let Some(append_result) = self.try_append(record, &mut dq_guard)? { + return Ok(append_result); // permit drops here, memory released + } + + self.append_new_batch( + cluster, + record, + &mut dq_guard, + permit, + alloc_size, + compression_ratio_estimator, + ) + } + + pub fn ready(&self, cluster: &Arc) -> Result { + // Snapshot just the Arcs we need, avoiding cloning the entire BucketAndWriteBatches struct + let entries: Vec<(Arc, Option, BucketBatches)> = self + .write_batches + .iter() + .map(|entry| { + let physical_table_path = Arc::clone(entry.key()); + let partition_id = entry.value().partition_id; + let bucket_batches: Vec<_> = entry + .value() + .batches + .iter() + .map(|(bucket_id, batch_arc)| (*bucket_id, batch_arc.clone())) + .collect(); + (physical_table_path, partition_id, bucket_batches) + }) + .collect(); + + let mut ready_nodes = HashSet::new(); + let mut next_ready_check_delay_ms = self.batch_timeout_ms; + let mut unknown_leader_tables = HashSet::new(); + let exhausted = self.memory_limiter.has_waiters(); + + for (physical_table_path, mut partition_id, bucket_batches) in entries { + next_ready_check_delay_ms = self.bucket_ready( + &physical_table_path, + physical_table_path.get_partition_name().is_some(), + &mut partition_id, + bucket_batches, + &mut ready_nodes, + &mut unknown_leader_tables, + cluster, + next_ready_check_delay_ms, + exhausted, + )? 
+ } + + Ok(ReadyCheckResult { + ready_nodes, + next_ready_check_delay_ms, + unknown_leader_tables, + }) + } + + #[allow(clippy::too_many_arguments)] + fn bucket_ready( + &self, + physical_table_path: &Arc, + is_partitioned_table: bool, + partition_id: &mut Option, + bucket_batches: BucketBatches, + ready_nodes: &mut HashSet, + unknown_leader_tables: &mut HashSet>, + cluster: &Cluster, + next_ready_check_delay_ms: i64, + exhausted: bool, + ) -> Result { + let mut next_delay = next_ready_check_delay_ms; + + // First check this table has partitionId. + if is_partitioned_table && partition_id.is_none() { + let partition_id = cluster.get_partition_id(physical_table_path); + + if partition_id.is_some() { + // Update the cached partition_id + if let Some(mut entry) = self.write_batches.get_mut(physical_table_path) { + entry.partition_id = partition_id; + } + } else { + log::debug!( + "Partition does not exist for {}, bucket will not be set to ready", + physical_table_path.as_ref() + ); + + // TODO: we shouldn't add unready partitions to unknownLeaderTables, + // because it causes PartitionNotExistException later + unknown_leader_tables.insert(Arc::clone(physical_table_path)); + return Ok(next_delay); + } + } + + for (bucket_id, batch) in bucket_batches { + let batch_guard = batch.lock(); + if batch_guard.is_empty() { + continue; + } + + let batch = batch_guard.front().unwrap(); + let waited_time_ms = batch.waited_time_ms(current_time_ms()); + let deque_size = batch_guard.len(); + let full = deque_size > 1 || batch.is_closed(); + let table_bucket = cluster.get_table_bucket(physical_table_path, bucket_id)?; + if let Some(leader) = cluster.leader_for(&table_bucket) { + next_delay = self.batch_ready( + leader, + waited_time_ms, + full, + exhausted, + ready_nodes, + next_delay, + ); + } else { + unknown_leader_tables.insert(Arc::clone(physical_table_path)); + } + } + Ok(next_delay) + } + + fn batch_ready( + &self, + leader: &ServerNode, + waited_time_ms: i64, + full: bool, +
exhausted: bool, + ready_nodes: &mut HashSet, + next_ready_check_delay_ms: i64, + ) -> i64 { + if !ready_nodes.contains(leader) { + let expired = waited_time_ms >= self.batch_timeout_ms; + let sendable = full + || expired + || exhausted + || self.closed.load(Ordering::Acquire) + || self.flush_in_progress(); + + if sendable { + ready_nodes.insert(leader.clone()); + } else { + let time_left_ms = self.batch_timeout_ms.saturating_sub(waited_time_ms); + return next_ready_check_delay_ms.min(time_left_ms); + } + } + next_ready_check_delay_ms + } + + pub fn drain( + &self, + cluster: Arc, + nodes: &HashSet, + max_size: i32, + ) -> Result>> { + if nodes.is_empty() { + return Ok(HashMap::new()); + } + let mut batches = HashMap::new(); + for node in nodes { + let ready = self.drain_batches_for_one_node(&cluster, node, max_size)?; + if !ready.is_empty() { + batches.insert(node.id(), ready); + } + } + + Ok(batches) + } + + /// Matches Java's `shouldStopDrainBatchesForBucket`. Returns true if + /// this bucket should be skipped during drain. + fn should_stop_drain_batches_for_bucket( + &self, + first: &WriteBatch, + table_bucket: &TableBucket, + ) -> bool { + if !self.idempotence_manager.is_enabled() { + return false; + } + if !self.idempotence_manager.is_writer_id_valid() { + return true; + } + + // Use batch_id comparison instead of sequence comparison. After + // handle_failed_batch adjusts InFlightBatch sequences, the WriteBatch's + // stored sequence may be stale (re_enqueue syncs it, but this is more + // robust). Java can compare sequences because resetWriterState mutates + // the batch directly; Rust uses lightweight InFlightBatch proxies. 
+ let is_first_in_flight = self.idempotence_manager.in_flight_count(table_bucket) == 0 + || (first.has_batch_sequence() + && self + .idempotence_manager + .is_first_in_flight_batch(table_bucket, first.batch_id())); + + if is_first_in_flight { + return false; + } + + if !first.has_batch_sequence() { + // Fresh batch: respect max in-flight limit + !self + .idempotence_manager + .can_send_more_requests(table_bucket) + } else { + // Re-enqueued batch that's NOT first in-flight: stop + true + } + } + + fn drain_batches_for_one_node( + &self, + cluster: &Cluster, + node: &ServerNode, + max_size: i32, + ) -> Result> { + let mut size: usize = 0; + let buckets = self.get_all_buckets_in_current_node(node, cluster); + let mut ready = Vec::new(); + + if buckets.is_empty() { + return Ok(ready); + } + + let start = { + let mut nodes_drain_index_guard = self.nodes_drain_index.lock(); + let drain_index = nodes_drain_index_guard.entry(node.id()).or_insert(0); + *drain_index % buckets.len() + }; + + let mut current_index = start; + let mut last_processed_index; + + loop { + let bucket = &buckets[current_index]; + let table_path = bucket.physical_table_path(); + let table_bucket = bucket.table_bucket.clone(); + last_processed_index = current_index; + current_index = (current_index + 1) % buckets.len(); + + let deque = self + .write_batches + .get(table_path) + .and_then(|bucket_and_write_batches| { + bucket_and_write_batches + .batches + .get(&table_bucket.bucket_id()) + .cloned() + }); + + if let Some(deque) = deque { + let mut maybe_batch = None; + { + let mut batch_lock = deque.lock(); + if !batch_lock.is_empty() { + let first_batch = batch_lock.front().unwrap(); + + if size + first_batch.estimated_size_in_bytes() > max_size as usize + && !ready.is_empty() + { + // there is a rare case that a single batch size is larger than the request size + // due to compression; in this case we will still eventually send this batch in + // a single request. 
+ break; + } + + // Improvement: `continue` instead of `break` to skip + // only this bucket, not all buckets for the node. + if self.should_stop_drain_batches_for_bucket(first_batch, &table_bucket) { + if current_index == start { + break; + } + continue; + } + + maybe_batch = Some(batch_lock.pop_front().unwrap()); + } + } + + if let Some(ref mut batch) = maybe_batch { + // Assign writer state to fresh batches (matching Java's drain loop) + let writer_id = if self.idempotence_manager.is_enabled() { + self.idempotence_manager.writer_id() + } else { + NO_WRITER_ID + }; + if writer_id != NO_WRITER_ID && !batch.has_batch_sequence() { + self.idempotence_manager + .maybe_update_writer_id(&table_bucket); + let seq = self + .idempotence_manager + .next_sequence_and_increment(&table_bucket); + batch.set_writer_state(writer_id, seq); + self.idempotence_manager.add_in_flight_batch( + &table_bucket, + seq, + batch.batch_id(), + ); + } + } + + if let Some(mut batch) = maybe_batch { + let current_batch_size = batch.estimated_size_in_bytes(); + size += current_batch_size; + + self.record_actual_batch_size(table_path, current_batch_size); + + // mark the batch as drained. 
+ batch.drained(current_time_ms()); + ready.push(ReadyWriteBatch { + table_bucket, + write_batch: batch, + }); + } + } + if current_index == start { + break; + } + } + + // Store the last processed index to maintain round-robin fairness + { + let mut nodes_drain_index_guard = self.nodes_drain_index.lock(); + nodes_drain_index_guard.insert(node.id(), last_processed_index); + } + + Ok(ready) + } + + pub fn remove_incomplete_batches(&self, batch_id: i64) { + self.incomplete_batches.write().remove(&batch_id); + } + + fn record_actual_batch_size(&self, table_path: &Arc, actual: usize) { + let Some(entry) = self.write_batches.get(table_path) else { + return; + }; + let Some(estimator) = entry.dynamic_batch_size.as_ref() else { + return; + }; + let prev = estimator.current(); + let next = estimator.update(actual); + if next != prev { + log::debug!( + "Set estimated batch size for {} from {} to {}", + table_path.as_ref(), + prev, + next + ); + } + } + + #[cfg(test)] + fn estimated_batch_size(&self, table_path: &Arc) -> Option { + self.write_batches + .get(table_path)? + .dynamic_batch_size + .as_ref() + .map(|est| est.current()) + } + + pub fn re_enqueue(&self, mut ready_write_batch: ReadyWriteBatch) { + ready_write_batch.write_batch.re_enqueued(); + + // Sync WriteBatch sequence with IdempotenceManager's adjusted sequence. + // When handle_failed_batch adjusts InFlightBatch sequences (after a prior + // batch fails), the WriteBatch is not updated (unlike Java which calls + // resetWriterState on the actual batch). We must sync here so that: + // 1. should_stop_drain_batches_for_bucket comparisons work correctly + // 2. 
build() produces bytes with the correct (adjusted) sequence + if self.idempotence_manager.is_enabled() + && ready_write_batch.write_batch.has_batch_sequence() + { + if let Some(adjusted_seq) = self.idempotence_manager.get_adjusted_sequence( + &ready_write_batch.table_bucket, + ready_write_batch.write_batch.batch_id(), + ) { + if adjusted_seq != ready_write_batch.write_batch.batch_sequence() { + let writer_id = ready_write_batch.write_batch.writer_id(); + ready_write_batch + .write_batch + .set_writer_state(writer_id, adjusted_seq); + } + } + } + + let dq = self.get_or_create_deque(&ready_write_batch); + let mut dq_guard = dq.lock(); + if self.idempotence_manager.is_enabled() { + self.insert_in_sequence_order(&mut dq_guard, ready_write_batch); + } else { + dq_guard.push_front(ready_write_batch.write_batch); + } + } + + /// Insert a re-enqueued batch in sequence order. Matches Java's + /// `insertInSequenceOrder`. If the batch is the next expected in-flight, + /// push to front; otherwise, find the correct sorted position. 
+ fn insert_in_sequence_order( + &self, + dq: &mut VecDeque, + ready_write_batch: ReadyWriteBatch, + ) { + debug_assert!( + ready_write_batch.write_batch.batch_sequence() != NO_BATCH_SEQUENCE, + "Re-enqueuing a batch without a sequence (batch_id={})", + ready_write_batch.write_batch.batch_id() + ); + debug_assert!( + self.idempotence_manager + .in_flight_count(&ready_write_batch.table_bucket) + > 0, + "Re-enqueuing a batch not tracked in in-flight (batch_id={}, bucket={})", + ready_write_batch.write_batch.batch_id(), + ready_write_batch.table_bucket + ); + + if dq.is_empty() { + dq.push_front(ready_write_batch.write_batch); + return; + } + + // If it's the first in-flight batch for its bucket, push to front + if self.idempotence_manager.is_first_in_flight_batch( + &ready_write_batch.table_bucket, + ready_write_batch.write_batch.batch_id(), + ) { + dq.push_front(ready_write_batch.write_batch); + return; + } + + // Find the correct position sorted by batch_sequence + let batch_seq = ready_write_batch.write_batch.batch_sequence(); + let mut insert_pos = dq.len(); + for (i, existing) in dq.iter().enumerate() { + if existing.has_batch_sequence() && existing.batch_sequence() > batch_seq { + insert_pos = i; + break; + } + } + dq.insert(insert_pos, ready_write_batch.write_batch); + } + + fn get_or_create_deque( + &self, + ready_write_batch: &ReadyWriteBatch, + ) -> Arc>> { + let physical_table_path = ready_write_batch.write_batch.physical_table_path(); + let bucket_id = ready_write_batch.table_bucket.bucket_id(); + let table_id = ready_write_batch.table_bucket.table_id(); + let partition_id = ready_write_batch.table_bucket.partition_id(); + let is_partitioned_table = partition_id.is_some(); + + let mut binding = self + .write_batches + .entry(Arc::clone(physical_table_path)) + .or_insert_with(|| { + BucketAndWriteBatches::new( + table_id, + is_partitioned_table, + partition_id, + &self.config, + ) + }); + let bucket_and_batches = binding.value_mut(); + bucket_and_batches + 
.batches + .entry(bucket_id) + .or_insert_with(|| Arc::new(Mutex::new(VecDeque::new()))) + .clone() + } + + /// Mark the accumulator as closed. All batches become immediately ready + /// (sendable) in `batch_ready`, triggering a full drain without waiting + /// for `batch_timeout_ms`. Matches Java's `RecordAccumulator.close()`. + pub fn close(&self) { + self.closed.store(true, Ordering::Release); + self.wakeup_sender(); + } + + pub fn is_closed(&self) -> bool { + self.closed.load(Ordering::Acquire) + } + + pub fn abort_batches(&self, error: broadcast::Error) { + self.memory_limiter.close(); + // Complete batches still in deques (not yet drained). + for mut entry in self.write_batches.iter_mut() { + for (_bucket_id, deque) in entry.value_mut().batches.iter_mut() { + let mut dq = deque.lock(); + while let Some(batch) = dq.pop_front() { + batch.complete(Err(error.clone())); + } + } + } + // Fail any remaining handles (including in-flight batches that were + // drained but not yet completed). This is a no-op for handles already + // completed above via WriteBatch::complete. + let mut incomplete = self.incomplete_batches.write(); + for (handle, _permit) in incomplete.values() { + handle.fail(error.clone()); + } + incomplete.clear(); + } + + pub fn has_incomplete(&self) -> bool { + !self.incomplete_batches.read().is_empty() + } + + /// Wake the sender task so it can drain ready batches immediately. + pub fn wakeup_sender(&self) { + self.sender_wakeup.notify_one(); + } + + /// Returns a future that completes when `wakeup_sender()` is called. 
+ pub fn notified(&self) -> tokio::sync::futures::Notified<'_> { + self.sender_wakeup.notified() + } + + fn get_all_buckets_in_current_node( + &self, + current: &ServerNode, + cluster: &Cluster, + ) -> Vec { + let mut buckets = vec![]; + for bucket_locations in cluster.get_bucket_locations_by_path().values() { + for bucket_location in bucket_locations { + if let Some(leader) = bucket_location.leader() { + if current.id() == leader.id() { + buckets.push(bucket_location.clone()); + } + } + } + } + buckets + } + + pub fn has_undrained(&self) -> bool { + for entry in self.write_batches.iter() { + for (_, batch_deque) in entry.value().batches.iter() { + if !batch_deque.lock().is_empty() { + return true; + } + } + } + false + } + + pub fn get_physical_table_paths_in_batches(&self) -> Vec> { + self.write_batches + .iter() + .map(|entry| Arc::clone(entry.key())) + .collect() + } + + fn flush_in_progress(&self) -> bool { + self.flushes_in_progress.load(Ordering::SeqCst) > 0 + } + + pub fn begin_flush(&self) { + self.flushes_in_progress.fetch_add(1, Ordering::SeqCst); + self.wakeup_sender(); + } + + #[allow(unused_must_use)] + pub async fn await_flush_completion(&self) -> Result<()> { + // Clone handles before awaiting to avoid holding RwLock read guard across await points + let handles: Vec<_> = self + .incomplete_batches + .read() + .values() + .map(|(h, _)| h.clone()) + .collect(); + + // Await on all handles + let result = async { + for result_handle in handles { + result_handle.wait().await?; + } + Ok(()) + } + .await; + + // Always decrement flushes_in_progress, even if an error occurred + // This mimics the Java finally block behavior + self.flushes_in_progress.fetch_sub(1, Ordering::SeqCst); + + result + } +} + +pub struct ReadyWriteBatch { + pub table_bucket: TableBucket, + pub write_batch: WriteBatch, +} + +impl ReadyWriteBatch { + pub fn write_batch(&self) -> &WriteBatch { + &self.write_batch + } +} + +#[allow(dead_code)] +struct BucketAndWriteBatches { + 
table_id: TableId, + is_partitioned_table: bool, + partition_id: Option, + batches: HashMap>>>, + /// Compression ratio estimator shared across Arrow log batches for this table. + compression_ratio_estimator: Arc, + /// `None` when `writer_dynamic_batch_size_enabled` is false. + dynamic_batch_size: Option, +} + +impl BucketAndWriteBatches { + fn new( + table_id: TableId, + is_partitioned_table: bool, + partition_id: Option, + config: &Config, + ) -> Self { + let dynamic_batch_size = config.writer_dynamic_batch_size_enabled.then(|| { + DynamicWriteBatchSizeEstimator::new( + config.writer_dynamic_batch_size_min as usize, + config.writer_batch_size as usize, + ) + }); + Self { + table_id, + is_partitioned_table, + partition_id, + batches: Default::default(), + compression_ratio_estimator: Arc::new(ArrowCompressionRatioEstimator::default()), + dynamic_batch_size, + } + } +} + +pub struct RecordAppendResult { + pub batch_is_full: bool, + pub new_batch_created: bool, + pub abort_record_for_new_batch: bool, + pub result_handle: Option, +} + +impl RecordAppendResult { + fn new( + result_handle: ResultHandle, + batch_is_full: bool, + new_batch_created: bool, + abort_record_for_new_batch: bool, + ) -> Self { + Self { + batch_is_full, + new_batch_created, + abort_record_for_new_batch, + result_handle: Some(result_handle), + } + } + + fn new_without_result_handle( + batch_is_full: bool, + new_batch_created: bool, + abort_record_for_new_batch: bool, + ) -> Self { + Self { + batch_is_full, + new_batch_created, + abort_record_for_new_batch, + result_handle: None, + } + } +} + +pub struct ReadyCheckResult { + pub ready_nodes: HashSet, + pub next_ready_check_delay_ms: i64, + pub unknown_leader_tables: HashSet>, +} + +impl ReadyCheckResult { + pub fn new( + ready_nodes: HashSet, + next_ready_check_delay_ms: i64, + unknown_leader_tables: HashSet>, + ) -> Self { + ReadyCheckResult { + ready_nodes, + next_ready_check_delay_ms, + unknown_leader_tables, + } + } +} + +#[cfg(test)] +mod 
tests { + use super::*; + use crate::client::write::write_format::WriteFormat; + use crate::client::write::{RowBytes, WriteRecord}; + use crate::metadata::TablePath; + use crate::row::{Datum, GenericRow}; + use crate::test_utils::{build_cluster, build_table_info}; + use bytes::Bytes; + use std::sync::Arc; + + fn disabled_idempotence() -> Arc { + Arc::new(IdempotenceManager::new(false, 5)) + } + + fn enabled_idempotence() -> Arc { + Arc::new(IdempotenceManager::new(true, 5)) + } + + #[tokio::test] + async fn re_enqueue_increments_attempts() -> Result<()> { + let config = Config::default(); + let accumulator = RecordAccumulator::new(config, disabled_idempotence()); + let table_path = TablePath::new("db".to_string(), "tbl".to_string()); + let physical_table_path = Arc::new(PhysicalTablePath::of(Arc::new(table_path.clone()))); + let table_info = Arc::new(build_table_info(table_path.clone(), 1, 1)); + let cluster = Arc::new(build_cluster(&table_path, 1, 1)); + let row = GenericRow { + values: vec![Datum::Int32(1)], + }; + let record = WriteRecord::for_append(table_info, physical_table_path, 1, &row); + + accumulator.append(&record, 0, &cluster, false)?; + + let server = cluster.get_tablet_server(1).expect("server"); + let nodes = HashSet::from([server.clone()]); + let mut batches = accumulator.drain(cluster.clone(), &nodes, 1024 * 1024)?; + let mut drained = batches.remove(&1).expect("drained batches"); + let batch = drained.pop().expect("batch"); + assert_eq!(batch.write_batch.attempts(), 0); + + accumulator.re_enqueue(batch); + + let mut batches = accumulator.drain(cluster, &nodes, 1024 * 1024)?; + let mut drained = batches.remove(&1).expect("drained batches"); + let batch = drained.pop().expect("batch"); + assert_eq!(batch.write_batch.attempts(), 1); + Ok(()) + } + + #[tokio::test] + async fn flush_counter_decremented_on_error() -> Result<()> { + use crate::client::write::broadcast::BroadcastOnce; + use std::sync::atomic::Ordering; + + let config = Config::default(); 
+ let accumulator = RecordAccumulator::new(config, disabled_idempotence()); + + accumulator.begin_flush(); + assert_eq!(accumulator.flushes_in_progress.load(Ordering::SeqCst), 1); + + // Create a failing batch by dropping the BroadcastOnce without broadcasting + { + let broadcast = BroadcastOnce::default(); + let receiver = broadcast.receiver(); + let handle = ResultHandle::new(receiver); + let permit = accumulator.memory_limiter.acquire(1024).unwrap(); + accumulator + .incomplete_batches + .write() + .insert(1, (handle, permit)); + // broadcast is dropped here, causing an error + } + + // Await flush completion should fail but still decrement counter + let result = accumulator.await_flush_completion().await; + assert!(result.is_err()); + + // Counter should still be decremented (this is the critical fix!) + assert_eq!(accumulator.flushes_in_progress.load(Ordering::SeqCst), 0); + assert!(!accumulator.flush_in_progress()); + + Ok(()) + } + + fn append_and_drain( + accumulator: &RecordAccumulator, + cluster: &Arc, + table_path: &TablePath, + bucket_id: i32, + ) -> Result { + let table_info = Arc::new(build_table_info(table_path.clone(), 1, 2)); + let physical_table_path = Arc::new(PhysicalTablePath::of(Arc::new(table_path.clone()))); + let row = GenericRow { + values: vec![Datum::Int32(1)], + }; + let record = WriteRecord::for_append(table_info, physical_table_path, 1, &row); + accumulator.append(&record, bucket_id, cluster, false)?; + let server = cluster.get_tablet_server(1).expect("server"); + let nodes = HashSet::from([server.clone()]); + let mut batches = accumulator.drain(cluster.clone(), &nodes, 1024 * 1024)?; + let mut drained = batches.remove(&1).expect("drained batches"); + Ok(drained.pop().expect("batch")) + } + + #[test] + fn test_should_stop_drain_for_fresh_batch_over_limit() { + let idempotence = Arc::new(IdempotenceManager::new(true, 2)); + idempotence.set_writer_id(42); + let config = Config::default(); + let accumulator = 
/// A re-enqueued batch that already carries a sequence must not be drained
/// while a batch with an earlier sequence is still the first in-flight one.
#[test]
fn test_should_stop_drain_for_retry_not_first_inflight() {
    let manager = enabled_idempotence();
    manager.set_writer_id(42);
    let accumulator = RecordAccumulator::new(Config::default(), Arc::clone(&manager));
    let table_path = TablePath::new("db".to_string(), "tbl".to_string());
    let cluster = Arc::new(build_cluster(&table_path, 1, 1));

    // Two append+drain rounds on bucket 0; the asserts pin their sequences to 0 and 1.
    let earlier = append_and_drain(&accumulator, &cluster, &table_path, 0).expect("drain batch0");
    let later = append_and_drain(&accumulator, &cluster, &table_path, 0).expect("drain batch1");
    assert_eq!(earlier.write_batch.batch_sequence(), 0);
    assert_eq!(later.write_batch.batch_sequence(), 1);

    let later_id = later.write_batch.batch_id();
    let bucket = earlier.table_bucket.clone();

    // Hand back only the later batch: this models the earlier one still being
    // in flight while the later one failed and needs a retry.
    accumulator.re_enqueue(later);

    let path_key = PhysicalTablePath::of(Arc::new(table_path));
    let table_entry = accumulator.write_batches.get(&path_key).unwrap();
    let queue = table_entry.batches.get(&0).unwrap().lock();
    let head = queue.front().unwrap();

    // The queue head is the retried batch (seq=1), which is not the first
    // in-flight batch (seq=0), so draining this bucket has to stop.
    assert!(head.has_batch_sequence());
    assert_eq!(head.batch_id(), later_id);
    assert!(accumulator.should_stop_drain_batches_for_bucket(head, &bucket));
}
assert_eq!(batch2.write_batch.batch_sequence(), 2); + + let batch0_id = batch0.write_batch.batch_id(); + let batch1_id = batch1.write_batch.batch_id(); + let batch2_id = batch2.write_batch.batch_id(); + let table_bucket = batch0.table_bucket.clone(); + + // Re-enqueue in reverse order: 2, 0, 1 + // insert_in_sequence_order should sort them as: 0, 1, 2 + accumulator.re_enqueue(batch2); + accumulator.re_enqueue(batch0); + accumulator.re_enqueue(batch1); + + // Verify the deque order directly + let entry = accumulator + .write_batches + .get(&PhysicalTablePath::of(Arc::new(table_path))) + .unwrap(); + let dq = entry.batches.get(&0).unwrap(); + let dq_guard = dq.lock(); + assert_eq!(dq_guard.len(), 3); + // batch0 (seq=0) is the first in-flight, so it should be at front + assert_eq!(dq_guard[0].batch_id(), batch0_id); + assert_eq!(dq_guard[0].batch_sequence(), 0); + assert_eq!(dq_guard[1].batch_id(), batch1_id); + assert_eq!(dq_guard[1].batch_sequence(), 1); + assert_eq!(dq_guard[2].batch_id(), batch2_id); + assert_eq!(dq_guard[2].batch_sequence(), 2); + drop(dq_guard); + + // Drain: first in-flight is seq=0, so batch0 passes should_stop check + let server = cluster.get_tablet_server(1).expect("server"); + let nodes = HashSet::from([server.clone()]); + let mut batches = accumulator.drain(cluster.clone(), &nodes, 1024 * 1024)?; + let drained = batches.remove(&1).expect("drained batches"); + assert_eq!(drained.len(), 1); + assert_eq!(drained[0].write_batch.batch_sequence(), 0); + + // Complete batch0 so batch1 becomes first in-flight + idempotence.handle_completed_batch(&table_bucket, batch0_id, 42); + + let mut batches = accumulator.drain(cluster.clone(), &nodes, 1024 * 1024)?; + let drained = batches.remove(&1).expect("drained"); + assert_eq!(drained[0].write_batch.batch_sequence(), 1); + + idempotence.handle_completed_batch(&table_bucket, batch1_id, 42); + + let mut batches = accumulator.drain(cluster, &nodes, 1024 * 1024)?; + let drained = 
batches.remove(&1).expect("drained"); + assert_eq!(drained[0].write_batch.batch_sequence(), 2); + + Ok(()) + } + + #[tokio::test] + async fn test_abort_batches() -> Result<()> { + let idempotence = disabled_idempotence(); + let config = Config::default(); + let accumulator = RecordAccumulator::new(config, Arc::clone(&idempotence)); + let table_path = TablePath::new("db".to_string(), "tbl".to_string()); + let physical_table_path = Arc::new(PhysicalTablePath::of(Arc::new(table_path.clone()))); + let table_info = Arc::new(build_table_info(table_path.clone(), 1, 1)); + let cluster = Arc::new(build_cluster(&table_path, 1, 1)); + let row = GenericRow { + values: vec![Datum::Int32(1)], + }; + let record = WriteRecord::for_append(table_info, physical_table_path, 1, &row); + + let result = accumulator.append(&record, 0, &cluster, false)?; + let handle = result.result_handle.expect("handle"); + assert!(accumulator.has_incomplete()); + + accumulator.abort_batches(broadcast::Error::Client { + message: "test abort".to_string(), + }); + + assert!(!accumulator.has_incomplete()); + assert!(!accumulator.has_undrained()); + + // The handle should receive the error + let batch_result = handle.wait().await?; + assert!(matches!( + batch_result, + Err(broadcast::Error::Client { message }) if message == "test abort" + )); + Ok(()) + } + + #[tokio::test] + async fn test_drain_skips_blocked_bucket_continues_others() -> Result<()> { + // Use max_in_flight=1 so that one in-flight batch blocks further draining + let idempotence = Arc::new(IdempotenceManager::new(true, 1)); + idempotence.set_writer_id(42); + let config = Config::default(); + let accumulator = RecordAccumulator::new(config, Arc::clone(&idempotence)); + let table_path = TablePath::new("db".to_string(), "tbl".to_string()); + let cluster = Arc::new(build_cluster(&table_path, 1, 2)); + + // Append to both buckets + let table_info = Arc::new(build_table_info(table_path.clone(), 1, 2)); + let physical_table_path = 
Arc::new(PhysicalTablePath::of(Arc::new(table_path.clone()))); + let row = GenericRow { + values: vec![Datum::Int32(1)], + }; + + // Append to bucket 0 + let record = + WriteRecord::for_append(table_info.clone(), physical_table_path.clone(), 1, &row); + accumulator.append(&record, 0, &cluster, false)?; + + // Append to bucket 1 + let record = + WriteRecord::for_append(table_info.clone(), physical_table_path.clone(), 1, &row); + accumulator.append(&record, 1, &cluster, false)?; + + // Drain once — both buckets get batches assigned with sequences + let server = cluster.get_tablet_server(1).expect("server"); + let nodes = HashSet::from([server.clone()]); + let batches = accumulator.drain(cluster.clone(), &nodes, 1024 * 1024)?; + let drained = batches.get(&1).expect("drained"); + // Both buckets should produce batches + assert_eq!(drained.len(), 2); + + // Now: both buckets have 1 in-flight each (added during drain). + // Append another record to each bucket. + let record = + WriteRecord::for_append(table_info.clone(), physical_table_path.clone(), 1, &row); + accumulator.append(&record, 0, &cluster, false)?; + let record = WriteRecord::for_append(table_info, physical_table_path, 1, &row); + accumulator.append(&record, 1, &cluster, false)?; + + // With max_in_flight=1, both buckets are at limit → should_stop returns true + // for fresh batches. The drain should skip both (continue, not break). 
/// Permits add to the limiter's used-bytes counter while held and give the
/// bytes back when dropped.
#[test]
fn test_memory_limiter_acquire_release() {
    let limiter = Arc::new(MemoryLimiter::new(1024, Duration::from_secs(1)));
    // Snapshot of the used-bytes counter; the lock is released before returning.
    let used = || *limiter.state.lock();

    let first = limiter.acquire(512).unwrap();
    let second = limiter.acquire(512).unwrap();

    // Both halves are reserved, so the limiter is exactly at capacity.
    assert_eq!(used(), 1024);

    // Dropping each permit returns its share.
    drop(first);
    assert_eq!(used(), 512);

    drop(second);
    assert_eq!(used(), 0);
}
/// An acquire that does not fit waits until a held permit is dropped, then
/// succeeds.
#[test]
fn test_memory_limiter_blocks_then_unblocks() {
    let limiter = Arc::new(MemoryLimiter::new(1024, Duration::from_secs(5)));
    // Snapshot of the used-bytes counter; the lock is released before returning.
    let used = || *limiter.state.lock();

    let held = limiter.acquire(1024).unwrap();
    assert_eq!(used(), 1024);

    // A second thread asks for memory that is not available yet.
    let waiter = {
        let shared = Arc::clone(&limiter);
        std::thread::spawn(move || shared.acquire(512))
    };

    // Leave the waiter some time to park; usage must still be at capacity.
    std::thread::sleep(Duration::from_millis(50));
    assert_eq!(used(), 1024);

    // Returning the full permit lets the waiter through.
    drop(held);

    let outcome = waiter.join().unwrap();
    assert!(outcome.is_ok());
    let _granted = outcome.unwrap();
    assert_eq!(used(), 512);
}
/// Closing the limiter wakes a thread that is waiting for memory and makes
/// its acquire fail with `WriterClosed` well before the 60s wait budget.
#[test]
fn test_memory_limiter_close_unblocks_waiting_threads() {
    let limiter = Arc::new(MemoryLimiter::new(1024, Duration::from_secs(60)));

    // Take the whole budget so the next acquire has to wait.
    let _held = limiter.acquire(1024).unwrap();

    // The waiter reports its outcome together with how long the call took.
    let waiter = {
        let shared = Arc::clone(&limiter);
        std::thread::spawn(move || {
            let began = Instant::now();
            let outcome = shared.acquire(512);
            (outcome, began.elapsed())
        })
    };

    // Leave the waiter some time to park, then check it is counted as waiting.
    std::thread::sleep(Duration::from_millis(50));
    assert_eq!(limiter.waiting_count.load(Ordering::Relaxed), 1);

    limiter.close();

    let (outcome, waited) = waiter.join().unwrap();
    assert!(matches!(outcome.unwrap_err(), Error::WriterClosed { .. }));
    // Must not have sat out the full 60s timeout.
    assert!(waited < Duration::from_secs(5));
}
/// When a single record is larger than the configured batch size, the memory
/// reserved for its batch equals the record's estimated size.
#[test]
fn test_memory_permit_accounts_for_oversized_record() {
    use crate::client::write::write_format::WriteFormat;
    use crate::client::write::{RowBytes, WriteRecord};
    use bytes::Bytes;

    // batch_size=64 is deliberately smaller than the record built below.
    let accumulator = RecordAccumulator::new(
        Config {
            writer_batch_size: 64,
            writer_buffer_memory_size: 1024 * 1024,
            ..Config::default()
        },
        disabled_idempotence(),
    );
    let table_path = TablePath::new("db".to_string(), "tbl".to_string());
    let table_info = Arc::new(build_table_info(table_path.clone(), 1, 1));
    let physical_table_path = Arc::new(PhysicalTablePath::of(Arc::new(table_path.clone())));
    let cluster = Arc::new(build_cluster(&table_path, 1, 1));

    // 32-byte key plus 256-byte value.
    let record = WriteRecord::for_upsert(
        table_info,
        physical_table_path,
        1,
        Bytes::from(vec![0u8; 32]),
        None,
        WriteFormat::CompactedKv,
        None,
        Some(RowBytes::Owned(Bytes::from(vec![0u8; 256]))),
    );

    // estimated_record_size includes batch header overhead
    let expected_alloc = record.estimated_record_size();
    assert!(expected_alloc > 64, "record should exceed batch_size=64");

    accumulator.append(&record, 0, &cluster, false).unwrap();

    // The limiter's used-bytes counter must match the larger of the two sizes.
    let used = *accumulator.memory_limiter.state.lock();
    assert_eq!(
        used, expected_alloc,
        "memory limiter should reserve max(batch_size, estimated_record_size)"
    );
}
/// With dynamic batch sizing enabled, the estimated batch size first shrinks
/// after a nearly empty batch is drained and then grows again after a batch
/// filled to about 90% of that estimate is drained.
#[test]
fn dynamic_batch_size_grows_after_full_drained_batch() {
    let max = 256 * 1024;
    let accumulator = RecordAccumulator::new(
        Config {
            writer_dynamic_batch_size_enabled: true,
            writer_batch_size: max,
            writer_dynamic_batch_size_min: 4 * 1024,
            writer_buffer_memory_size: 4 * 1024 * 1024,
            ..Config::default()
        },
        disabled_idempotence(),
    );
    let table_path = TablePath::new("db".to_string(), "tbl".to_string());
    let table_info = Arc::new(build_table_info(table_path.clone(), 1, 1));
    let physical_table_path = Arc::new(PhysicalTablePath::of(Arc::new(table_path.clone())));
    let cluster = Arc::new(build_cluster(&table_path, 1, 1));
    let nodes = HashSet::from([cluster.get_tablet_server(1).unwrap().clone()]);

    // Builds an upsert record with a 32-byte key and a `size`-byte value.
    let upsert_of = |size: usize| {
        WriteRecord::for_upsert(
            Arc::clone(&table_info),
            Arc::clone(&physical_table_path),
            1,
            Bytes::from(vec![0u8; 32]),
            None,
            WriteFormat::CompactedKv,
            None,
            Some(RowBytes::Owned(Bytes::from(vec![0u8; size]))),
        )
    };
    // Drains the single pending batch for server 1 and removes it from the
    // accumulator's incomplete set.
    let drain_and_release = || {
        let mut drained = accumulator.drain(cluster.clone(), &nodes, max).unwrap();
        let batch = drained.remove(&1).unwrap().pop().unwrap();
        accumulator.remove_incomplete_batches(batch.write_batch.batch_id());
    };
    // Current per-table batch size estimate.
    let current_estimate = || {
        accumulator
            .estimated_batch_size(&physical_table_path)
            .unwrap()
    };

    // Step 1: a 1-byte value leaves the batch almost empty.
    accumulator
        .append(&upsert_of(1), 0, &cluster, false)
        .unwrap();
    drain_and_release();
    let after_shrink = current_estimate();
    assert!(
        after_shrink < max as usize,
        "shrink failed: after_shrink={after_shrink} max={max}"
    );

    // Step 2: fill to 90% of the shrunken estimate.
    // 0.9 sits safely above GROW_THRESHOLD (0.8) to avoid f64 boundary noise.
    let nearly_full = upsert_of(after_shrink * 9 / 10);
    accumulator
        .append(&nearly_full, 0, &cluster, false)
        .unwrap();
    drain_and_release();
    let after_grow = current_estimate();
    assert!(
        after_grow > after_shrink,
        "grow failed: after_grow={after_grow} after_shrink={after_shrink}"
    );
}
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::client::broadcast::{BatchWriteResult, BroadcastOnce}; +use crate::client::{Record, ResultHandle, WriteRecord}; +use crate::compression::{ArrowCompressionInfo, ArrowCompressionRatioEstimator}; +use crate::error::{Error, Result}; +use crate::metadata::{KvFormat, PhysicalTablePath, RowType}; +use crate::record::MemoryLogRecordsArrowBuilder; +use crate::record::kv::KvRecordBatchBuilder; +use crate::record::{NO_BATCH_SEQUENCE, NO_WRITER_ID}; +use bytes::Bytes; +use std::cmp::max; +use std::sync::Arc; +use std::sync::atomic::{AtomicBool, AtomicI32, Ordering}; + +pub struct InnerWriteBatch { + batch_id: i64, + physical_table_path: Arc, + create_ms: i64, + results: BroadcastOnce, + completed: AtomicBool, + attempts: AtomicI32, + drained_ms: i64, + batch_sequence: i32, + writer_id: i64, +} + +impl InnerWriteBatch { + fn new(batch_id: i64, physical_table_path: Arc, create_ms: i64) -> Self { + InnerWriteBatch { + batch_id, + physical_table_path, + create_ms, + results: Default::default(), + completed: AtomicBool::new(false), + attempts: AtomicI32::new(0), + drained_ms: -1, + batch_sequence: NO_BATCH_SEQUENCE, + writer_id: NO_WRITER_ID, + } + } + + pub fn batch_sequence(&self) -> i32 { + self.batch_sequence + } + + pub fn writer_id(&self) -> i64 { + self.writer_id + } + + 
pub fn has_batch_sequence(&self) -> bool { + self.batch_sequence != NO_BATCH_SEQUENCE + } + + fn waited_time_ms(&self, now: i64) -> i64 { + max(0i64, now - self.create_ms) + } + + fn complete(&self, write_result: BatchWriteResult) -> bool { + if self + .completed + .compare_exchange(false, true, Ordering::AcqRel, Ordering::Acquire) + .is_err() + { + return false; + } + self.results.broadcast(write_result); + true + } + + fn drained(&mut self, now_ms: i64) { + self.drained_ms = max(self.drained_ms, now_ms); + } + + fn physical_table_path(&self) -> &Arc { + &self.physical_table_path + } + + fn attempts(&self) -> i32 { + self.attempts.load(Ordering::Acquire) + } + + fn re_enqueued(&self) { + self.attempts.fetch_add(1, Ordering::AcqRel); + } + + fn is_done(&self) -> bool { + self.completed.load(Ordering::Acquire) + } +} + +pub enum WriteBatch { + ArrowLog(ArrowLogWriteBatch), + Kv(KvWriteBatch), +} + +impl WriteBatch { + pub fn inner_batch(&self) -> &InnerWriteBatch { + match self { + WriteBatch::ArrowLog(batch) => &batch.write_batch, + WriteBatch::Kv(batch) => &batch.write_batch, + } + } + + pub fn inner_batch_mut(&mut self) -> &mut InnerWriteBatch { + match self { + WriteBatch::ArrowLog(batch) => &mut batch.write_batch, + WriteBatch::Kv(batch) => &mut batch.write_batch, + } + } + + pub fn try_append(&mut self, write_record: &WriteRecord) -> Result> { + match self { + WriteBatch::ArrowLog(batch) => batch.try_append(write_record), + WriteBatch::Kv(batch) => batch.try_append(write_record), + } + } + + pub fn waited_time_ms(&self, now: i64) -> i64 { + self.inner_batch().waited_time_ms(now) + } + + pub fn close(&mut self) -> Result<()> { + match self { + WriteBatch::ArrowLog(batch) => { + batch.close(); + Ok(()) + } + WriteBatch::Kv(batch) => batch.close(), + } + } + + pub fn estimated_size_in_bytes(&self) -> usize { + match self { + WriteBatch::ArrowLog(batch) => batch.estimated_size_in_bytes(), + WriteBatch::Kv(batch) => batch.estimated_size_in_bytes(), + } + } + + pub 
fn is_closed(&self) -> bool { + match self { + WriteBatch::ArrowLog(batch) => batch.is_closed(), + WriteBatch::Kv(batch) => batch.is_closed(), + } + } + + pub fn drained(&mut self, now_ms: i64) { + self.inner_batch_mut().drained(now_ms); + } + + pub fn build(&mut self) -> Result { + match self { + WriteBatch::ArrowLog(batch) => batch.build(), + WriteBatch::Kv(batch) => batch.build(), + } + } + + pub fn complete(&self, write_result: BatchWriteResult) -> bool { + self.inner_batch().complete(write_result) + } + + pub fn batch_id(&self) -> i64 { + self.inner_batch().batch_id + } + + pub fn physical_table_path(&self) -> &Arc { + self.inner_batch().physical_table_path() + } + + pub fn attempts(&self) -> i32 { + self.inner_batch().attempts() + } + + pub fn re_enqueued(&self) { + self.inner_batch().re_enqueued(); + } + + pub fn is_done(&self) -> bool { + self.inner_batch().is_done() + } + + pub fn batch_sequence(&self) -> i32 { + self.inner_batch().batch_sequence() + } + + pub fn writer_id(&self) -> i64 { + self.inner_batch().writer_id() + } + + pub fn has_batch_sequence(&self) -> bool { + self.inner_batch().has_batch_sequence() + } + + pub fn set_writer_state(&mut self, writer_id: i64, batch_base_sequence: i32) { + match self { + WriteBatch::ArrowLog(batch) => batch.set_writer_state(writer_id, batch_base_sequence), + WriteBatch::Kv(batch) => batch.set_writer_state(writer_id, batch_base_sequence), + } + } +} + +pub struct ArrowLogWriteBatch { + pub write_batch: InnerWriteBatch, + pub arrow_builder: MemoryLogRecordsArrowBuilder, + built_records: Option, +} + +impl ArrowLogWriteBatch { + #[allow(clippy::too_many_arguments)] + pub fn new( + batch_id: i64, + physical_table_path: Arc, + schema_id: i32, + arrow_compression_info: ArrowCompressionInfo, + row_type: &RowType, + create_ms: i64, + to_append_record_batch: bool, + write_limit: usize, + compression_ratio_estimator: Arc, + ) -> Result { + let base = InnerWriteBatch::new(batch_id, physical_table_path, create_ms); + Ok(Self 
{ + write_batch: base, + arrow_builder: MemoryLogRecordsArrowBuilder::new( + schema_id, + row_type, + to_append_record_batch, + arrow_compression_info, + write_limit, + compression_ratio_estimator, + )?, + built_records: None, + }) + } + + pub fn batch_id(&self) -> i64 { + self.write_batch.batch_id + } + + pub fn try_append(&mut self, write_record: &WriteRecord) -> Result> { + if self.arrow_builder.is_closed() || self.arrow_builder.is_full() { + Ok(None) + } else { + // append successfully + if self.arrow_builder.append(write_record)? { + Ok(Some(ResultHandle::new(self.write_batch.results.receiver()))) + } else { + // append fail + Ok(None) + } + } + } + + pub fn set_writer_state(&mut self, writer_id: i64, batch_base_sequence: i32) { + self.arrow_builder + .set_writer_state(writer_id, batch_base_sequence); + self.write_batch.batch_sequence = batch_base_sequence; + self.write_batch.writer_id = writer_id; + self.built_records = None; + } + + pub fn build(&mut self) -> Result { + if let Some(bytes) = &self.built_records { + return Ok(bytes.clone()); + } + let bytes = Bytes::from(self.arrow_builder.build()?); + self.built_records = Some(bytes.clone()); + Ok(bytes) + } + + pub fn is_closed(&self) -> bool { + self.arrow_builder.is_closed() + } + + pub fn close(&mut self) { + self.arrow_builder.close() + } + + /// Get an estimate of the number of bytes written to the underlying buffer. + /// The returned value is exactly correct if the batch has been built. 
+ pub fn estimated_size_in_bytes(&self) -> usize { + if let Some(ref bytes) = self.built_records { + // Return actual size if already built + bytes.len() + } else { + // Delegate to arrow builder for estimated size + self.arrow_builder.estimated_size_in_bytes() + } + } +} + +pub struct KvWriteBatch { + write_batch: InnerWriteBatch, + kv_batch_builder: KvRecordBatchBuilder, + target_columns: Option>>, + schema_id: i32, +} + +impl KvWriteBatch { + #[allow(clippy::too_many_arguments)] + pub fn new( + batch_id: i64, + physical_table_path: Arc, + schema_id: i32, + write_limit: usize, + kv_format: KvFormat, + target_columns: Option>>, + create_ms: i64, + ) -> Self { + let base = InnerWriteBatch::new(batch_id, physical_table_path, create_ms); + Self { + write_batch: base, + kv_batch_builder: KvRecordBatchBuilder::new(schema_id, write_limit, kv_format), + target_columns, + schema_id, + } + } + + pub fn try_append(&mut self, write_record: &WriteRecord) -> Result> { + let kv_write_record = match &write_record.record { + Record::Kv(record) => record, + _ => { + return Err(Error::UnsupportedOperation { + message: "Only KvRecord to append to KvWriteBatch ".to_string(), + }); + } + }; + + let key = kv_write_record.key.as_ref(); + + if self.schema_id != write_record.schema_id { + return Err(Error::UnexpectedError { + message: format!( + "schema id {} of the write record to append is not the same as the current schema id {} in the batch.", + write_record.schema_id, self.schema_id + ), + source: None, + }); + }; + + if self.target_columns != kv_write_record.target_columns { + return Err(Error::UnexpectedError { + message: format!( + "target columns {:?} of the write record to append are not the same as the current target columns {:?} in the batch.", + kv_write_record.target_columns, + self.target_columns.as_deref() + ), + source: None, + }); + } + + let row_bytes = kv_write_record.row_bytes(); + + if self.is_closed() || !self.kv_batch_builder.has_room_for_row(key, row_bytes) { + 
Ok(None) + } else { + // append successfully + self.kv_batch_builder + .append_row(key, row_bytes) + .map_err(|e| Error::UnexpectedError { + message: "Failed to append row to KvWriteBatch".to_string(), + source: Some(Box::new(e)), + })?; + Ok(Some(ResultHandle::new(self.write_batch.results.receiver()))) + } + } + + pub fn build(&mut self) -> Result { + self.kv_batch_builder.build() + } + + pub fn is_closed(&self) -> bool { + self.kv_batch_builder.is_closed() + } + + pub fn close(&mut self) -> Result<()> { + self.kv_batch_builder.close() + } + + pub fn set_writer_state(&mut self, writer_id: i64, batch_base_sequence: i32) { + self.kv_batch_builder + .set_writer_state(writer_id, batch_base_sequence); + self.write_batch.batch_sequence = batch_base_sequence; + self.write_batch.writer_id = writer_id; + } + + pub fn target_columns(&self) -> Option<&Arc>> { + self.target_columns.as_ref() + } + + /// Get an estimate of the number of bytes written to the underlying buffer. + /// This returns the current size including header and all appended records. 
+ pub fn estimated_size_in_bytes(&self) -> usize { + self.kv_batch_builder.get_size_in_bytes() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::client::{RowBytes, WriteFormat}; + use crate::metadata::TablePath; + use crate::test_utils::build_table_info; + + #[test] + fn complete_only_once() { + let table_path = TablePath::new("db".to_string(), "tbl".to_string()); + let physical_path = PhysicalTablePath::of(Arc::new(table_path)); + let batch = InnerWriteBatch::new(1, Arc::new(physical_path), 0); + assert!(batch.complete(Ok(()))); + assert!(!batch.complete(Err(crate::client::broadcast::Error::Dropped))); + } + + #[test] + fn attempts_increment_on_reenqueue() { + let table_path = TablePath::new("db".to_string(), "tbl".to_string()); + let physical_path = PhysicalTablePath::of(Arc::new(table_path)); + let batch = InnerWriteBatch::new(1, Arc::new(physical_path), 0); + assert_eq!(batch.attempts(), 0); + batch.re_enqueued(); + assert_eq!(batch.attempts(), 1); + } + + #[test] + fn test_arrow_log_write_batch_estimated_size() { + use crate::client::WriteRecord; + use crate::compression::{ + ArrowCompressionInfo, ArrowCompressionType, DEFAULT_NON_ZSTD_COMPRESSION_LEVEL, + }; + use crate::metadata::{DataField, DataTypes, RowType}; + use crate::row::GenericRow; + use arrow::array::{Int32Array, RecordBatch, StringArray}; + use std::sync::Arc; + + let row_type = RowType::new(vec![ + DataField::new("id".to_string(), DataTypes::int(), None), + DataField::new("name".to_string(), DataTypes::string(), None), + ]); + let table_path = TablePath::new("db".to_string(), "tbl".to_string()); + let table_info = Arc::new(build_table_info(table_path.clone(), 1, 1)); + let physical_table_path = Arc::new(PhysicalTablePath::of(Arc::new(table_path))); + + // Test 1: RowAppendRecordBatchBuilder (to_append_record_batch=false) + { + let mut batch = ArrowLogWriteBatch::new( + 1, + Arc::clone(&physical_table_path), + 1, + ArrowCompressionInfo { + compression_type: 
ArrowCompressionType::None, + compression_level: DEFAULT_NON_ZSTD_COMPRESSION_LEVEL, + }, + &row_type, + 0, + false, + 2 * 1024 * 1024, + Arc::new(ArrowCompressionRatioEstimator::default()), + ) + .unwrap(); + + // Append rows + for _ in 0..200 { + let mut row = GenericRow::new(2); + row.set_field(0, 1_i32); + row.set_field(1, "hello"); + let record = WriteRecord::for_append( + Arc::clone(&table_info), + Arc::clone(&physical_table_path), + 1, + &row, + ); + batch.try_append(&record).unwrap(); + } + + let estimated_size = batch.estimated_size_in_bytes(); + assert!(estimated_size > 0); + + let built_data = batch.build().unwrap(); + let actual_size = built_data.len(); + + let diff = actual_size.abs_diff(estimated_size); + let threshold = actual_size / 10; // 10% tolerance + assert!( + diff <= threshold, + "RowAppend: estimated_size {estimated_size} and actual_size {actual_size} differ by more than 10%" + ); + } + + // Test 2: PrebuiltRecordBatchBuilder (to_append_record_batch=true) + { + let mut batch = ArrowLogWriteBatch::new( + 1, + physical_table_path.clone(), + 1, + ArrowCompressionInfo { + compression_type: ArrowCompressionType::None, + compression_level: DEFAULT_NON_ZSTD_COMPRESSION_LEVEL, + }, + &row_type, + 0, + true, + 2 * 1024 * 1024, + Arc::new(ArrowCompressionRatioEstimator::default()), + ) + .unwrap(); + + // Create a pre-built RecordBatch + let schema = crate::record::to_arrow_schema(&row_type).unwrap(); + let ids: Vec = (0..200).collect(); + let names: Vec<&str> = (0..200).map(|_| "hello").collect(); + let record_batch = RecordBatch::try_new( + schema, + vec![ + Arc::new(Int32Array::from(ids)), + Arc::new(StringArray::from(names)), + ], + ) + .unwrap(); + + let record = WriteRecord::for_append_record_batch( + Arc::clone(&table_info), + Arc::clone(&physical_table_path), + 1, + record_batch, + ); + batch.try_append(&record).unwrap(); + + let estimated_size = batch.estimated_size_in_bytes(); + assert!(estimated_size > 0); + + let built_data = 
batch.build().unwrap(); + let actual_size = built_data.len(); + + let diff = actual_size.abs_diff(estimated_size); + let threshold = actual_size / 10; // 10% tolerance + assert!( + diff <= threshold, + "Prebuilt: estimated_size {estimated_size} and actual_size {actual_size} differ by more than 10%" + ); + } + } + + #[test] + fn test_kv_write_batch_estimated_size() { + use crate::metadata::KvFormat; + + let table_path = TablePath::new("db".to_string(), "tbl".to_string()); + let table_info = Arc::new(build_table_info(table_path.clone(), 1, 1)); + let physical_path = Arc::new(PhysicalTablePath::of(Arc::new(table_path))); + + let mut batch = KvWriteBatch::new( + 1, + Arc::clone(&physical_path), + 1, + 256, + KvFormat::COMPACTED, + None, + 0, + ); + + for _ in 0..200 { + let record = WriteRecord::for_upsert( + Arc::clone(&table_info), + Arc::clone(&physical_path), + 1, + Bytes::from(vec![1_u8, 2_u8, 3_u8]), + None, + WriteFormat::CompactedKv, + None, + Some(RowBytes::Owned(Bytes::from(vec![1_u8, 2_u8, 3_u8]))), + ); + batch.try_append(&record).unwrap(); + } + + let estimated_size = batch.estimated_size_in_bytes(); + let actual_size = batch.build().unwrap().len(); + + assert_eq!( + actual_size, estimated_size, + "estimated size {estimated_size} is not equal to actual size" + ); + } + + /// Verifies byte-size-based fullness: + /// 1. Actual built size stays within the configured limit (no compression). + /// 2. Old 256-record cap is gone — large batches accept >256 small rows. + /// 3. Compression feedback loop: shared estimator updates after build(), + /// second batch with same estimator accepts more records. 
+ #[test] + fn test_arrow_batch_byte_size_fullness() { + use crate::client::WriteRecord; + use crate::compression::{ + ArrowCompressionInfo, ArrowCompressionRatioEstimator, ArrowCompressionType, + DEFAULT_NON_ZSTD_COMPRESSION_LEVEL, + }; + use crate::metadata::{DataField, DataTypes, RowType}; + use crate::row::GenericRow; + use std::sync::Arc; + + let row_type = RowType::new(vec![ + DataField::new("id".to_string(), DataTypes::int(), None), + DataField::new("name".to_string(), DataTypes::string(), None), + ]); + let table_path = TablePath::new("db".to_string(), "tbl".to_string()); + let table_info = Arc::new(build_table_info(table_path.clone(), 1, 1)); + let physical_table_path = Arc::new(PhysicalTablePath::of(Arc::new(table_path))); + + // --- Part 1: actual built size stays within limit (uncompressed) --- + let write_limit: usize = 16 * 1024; + let mut batch = ArrowLogWriteBatch::new( + 1, + Arc::clone(&physical_table_path), + 1, + ArrowCompressionInfo { + compression_type: ArrowCompressionType::None, + compression_level: DEFAULT_NON_ZSTD_COMPRESSION_LEVEL, + }, + &row_type, + 0, + false, + write_limit, + Arc::new(ArrowCompressionRatioEstimator::default()), + ) + .unwrap(); + + let mut appended = 0; + for i in 0..100_000 { + let mut row = GenericRow::new(2); + row.set_field(0, i); + row.set_field(1, "hello_world"); + let record = WriteRecord::for_append( + Arc::clone(&table_info), + Arc::clone(&physical_table_path), + 1, + &row, + ); + match batch.try_append(&record).unwrap() { + Some(_) => appended += 1, + None => break, + } + } + + assert!( + appended > 0 && appended < 100_000, + "batch should have filled, appended: {appended}" + ); + let built = batch.build().unwrap(); + assert!( + built.len() <= write_limit * 120 / 100, + "actual size {} exceeds write_limit {write_limit} by more than 20%", + built.len() + ); + + // --- Part 2: old 256-record cap is gone --- + let row_type_small = RowType::new(vec![DataField::new( + "id".to_string(), + DataTypes::int(), + None, 
+ )]); + let mut batch = ArrowLogWriteBatch::new( + 2, + Arc::clone(&physical_table_path), + 1, + ArrowCompressionInfo { + compression_type: ArrowCompressionType::None, + compression_level: DEFAULT_NON_ZSTD_COMPRESSION_LEVEL, + }, + &row_type_small, + 0, + false, + 2 * 1024 * 1024, + Arc::new(ArrowCompressionRatioEstimator::default()), + ) + .unwrap(); + + let mut appended = 0; + for i in 0..1000 { + let mut row = GenericRow::new(1); + row.set_field(0, i); + let record = WriteRecord::for_append( + Arc::clone(&table_info), + Arc::clone(&physical_table_path), + 1, + &row, + ); + match batch.try_append(&record).unwrap() { + Some(_) => appended += 1, + None => break, + } + } + assert_eq!(appended, 1000, "2MB batch should fit 1000 tiny rows"); + + // --- Part 3: compression feedback loop --- + let estimator = Arc::new(ArrowCompressionRatioEstimator::default()); + assert_eq!(estimator.estimation(), 1.0); + + let write_limit = 64 * 1024; + let compression = ArrowCompressionInfo { + compression_type: ArrowCompressionType::Zstd, + compression_level: 3, + }; + + // First batch: fill and build with ZSTD. + let mut batch1 = ArrowLogWriteBatch::new( + 3, + Arc::clone(&physical_table_path), + 1, + compression.clone(), + &row_type, + 0, + false, + write_limit, + Arc::clone(&estimator), + ) + .unwrap(); + + for i in 0..500 { + let mut row = GenericRow::new(2); + row.set_field(0, i); + row.set_field(1, "aaaaaaaaaaaaaaaa"); + let record = WriteRecord::for_append( + Arc::clone(&table_info), + Arc::clone(&physical_table_path), + 1, + &row, + ); + if batch1.try_append(&record).unwrap().is_none() { + break; + } + } + batch1.build().unwrap(); + + // Estimator should have decreased (ZSTD compresses repeated data well). + assert!( + estimator.estimation() < 1.0, + "ratio should decrease after compressed build, got: {}", + estimator.estimation() + ); + + // Second batch: same estimator → knows data compresses well → accepts more rows. 
+ let mut batch2 = ArrowLogWriteBatch::new( + 4, + Arc::clone(&physical_table_path), + 1, + compression, + &row_type, + 0, + false, + write_limit, + Arc::clone(&estimator), + ) + .unwrap(); + + let mut appended2 = 0; + for i in 0..10_000 { + let mut row = GenericRow::new(2); + row.set_field(0, i); + row.set_field(1, "aaaaaaaaaaaaaaaa"); + let record = WriteRecord::for_append( + Arc::clone(&table_info), + Arc::clone(&physical_table_path), + 1, + &row, + ); + match batch2.try_append(&record).unwrap() { + Some(_) => appended2 += 1, + None => break, + } + } + assert!( + appended2 > 500, + "second batch should accept more records with updated ratio, got: {appended2}" + ); + } +} diff --git a/fluss-rust/crates/fluss/src/client/write/broadcast.rs b/fluss-rust/crates/fluss/src/client/write/broadcast.rs new file mode 100644 index 0000000000..9e00403586 --- /dev/null +++ b/fluss-rust/crates/fluss/src/client/write/broadcast.rs @@ -0,0 +1,133 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use parking_lot::RwLock; +use std::sync::Arc; +use thiserror::Error; +use tokio::sync::Notify; + +pub type Result = std::result::Result; + +pub type BatchWriteResult = Result<(), Error>; + +#[derive(Debug, Error, Clone, PartialEq, Eq)] +pub enum Error { + #[error("BroadcastOnce dropped")] + Dropped, + #[error("Write failed: {message} (code {code})")] + WriteFailed { code: i32, message: String }, + #[error("Write failed before request was sent: {message}")] + Client { message: String }, +} + +#[derive(Debug, Clone)] +pub struct BroadcastOnceReceiver { + shared: Arc>, +} + +impl BroadcastOnceReceiver { + /// Returns `Some(_)` if data has been produced + pub fn peek(&self) -> Option> { + self.shared.data.read().clone() + } + + /// Waits for [`BroadcastOnce::broadcast`] to be called or returns an error + /// if the [`BroadcastOnce`] is dropped without a value being published + pub async fn receive(&self) -> Result { + let notified = self.shared.notify.notified(); + + if let Some(v) = self.peek() { + return v; + } + + notified.await; + + self.peek().expect("just got notified") + } + + /// Force-complete with an error if not already completed. + /// Used by `abort_batches` to fail in-flight handles that can't be + /// reached through `WriteBatch::complete`. 
+ pub(crate) fn fail(&self, error: Error) { + let mut data = self.shared.data.write(); + if data.is_none() { + *data = Some(Err(error)); + self.shared.notify.notify_waiters(); + } + } +} + +#[derive(Debug)] +struct Shared { + data: RwLock>>, + notify: Notify, +} + +#[derive(Debug)] +pub struct BroadcastOnce +where + T: Send + Sync, +{ + shared: Arc>, +} + +impl Default for BroadcastOnce +where + T: Send + Sync, +{ + fn default() -> Self { + Self { + shared: Arc::new(Shared { + data: Default::default(), + notify: Default::default(), + }), + } + } +} + +impl BroadcastOnce { + /// Returns a [`BroadcastOnceReceiver`] that can be used to wait on + /// a call to [`BroadcastOnce::broadcast`] on this instance + pub fn receiver(&self) -> BroadcastOnceReceiver { + BroadcastOnceReceiver { + shared: Arc::clone(&self.shared), + } + } + + /// Broadcast a value to all [`BroadcastOnceReceiver`] handles + pub fn broadcast(&self, r: T) { + let mut locked = self.shared.data.write(); + assert!(locked.is_none(), "double publish"); + + *locked = Some(Ok(r)); + self.shared.notify.notify_waiters(); + } +} + +impl Drop for BroadcastOnce +where + T: Send + Sync, +{ + fn drop(&mut self) { + let mut data = self.shared.data.write(); + if data.is_none() { + log::warn!("BroadcastOnce dropped without producing"); + *data = Some(Err(Error::Dropped)); + self.shared.notify.notify_waiters(); + } + } +} diff --git a/fluss-rust/crates/fluss/src/client/write/bucket_assigner.rs b/fluss-rust/crates/fluss/src/client/write/bucket_assigner.rs new file mode 100644 index 0000000000..8ad38e3d42 --- /dev/null +++ b/fluss-rust/crates/fluss/src/client/write/bucket_assigner.rs @@ -0,0 +1,259 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::bucketing::BucketingFunction; +use crate::cluster::Cluster; +use crate::error::Error::IllegalArgument; +use crate::error::Result; +use crate::metadata::PhysicalTablePath; +use bytes::Bytes; +use rand::Rng; +use std::sync::Arc; +use std::sync::atomic::{AtomicI32, Ordering}; + +pub trait BucketAssigner: Sync + Send { + fn abort_if_batch_full(&self) -> bool; + + fn on_new_batch(&self, cluster: &Cluster, prev_bucket_id: i32); + + fn assign_bucket(&self, bucket_key: Option<&Bytes>, cluster: &Cluster) -> Result; +} + +#[derive(Debug)] +pub struct StickyBucketAssigner { + table_path: Arc, + current_bucket_id: AtomicI32, +} + +impl StickyBucketAssigner { + pub fn new(table_path: Arc) -> Self { + Self { + table_path, + current_bucket_id: AtomicI32::new(-1), + } + } + + fn next_bucket(&self, cluster: &Cluster, prev_bucket_id: i32) -> i32 { + let old_bucket = self.current_bucket_id.load(Ordering::Relaxed); + let mut new_bucket = old_bucket; + if old_bucket < 0 || old_bucket == prev_bucket_id { + let available_buckets = cluster.get_available_buckets_for_table_path(&self.table_path); + if available_buckets.is_empty() { + let mut rng = rand::rng(); + let mut random: i32 = rng.random(); + random &= i32::MAX; + new_bucket = random % cluster.get_bucket_count(self.table_path.get_table_path()); + } else if available_buckets.len() == 1 { + new_bucket = 
available_buckets[0].table_bucket.bucket_id(); + } else { + let mut rng = rand::rng(); + while new_bucket < 0 || new_bucket == old_bucket { + let mut random: i32 = rng.random(); + random &= i32::MAX; + new_bucket = available_buckets + [(random % available_buckets.len() as i32) as usize] + .bucket_id(); + } + } + } + + if old_bucket < 0 { + self.current_bucket_id.store(new_bucket, Ordering::Relaxed); + } else { + self.current_bucket_id + .compare_exchange( + prev_bucket_id, + new_bucket, + Ordering::Relaxed, + Ordering::Relaxed, + ) + .ok(); + } + self.current_bucket_id.load(Ordering::Relaxed) + } +} + +impl BucketAssigner for StickyBucketAssigner { + fn abort_if_batch_full(&self) -> bool { + true + } + + fn on_new_batch(&self, cluster: &Cluster, prev_bucket_id: i32) { + self.next_bucket(cluster, prev_bucket_id); + } + + fn assign_bucket(&self, _bucket_key: Option<&Bytes>, cluster: &Cluster) -> Result { + let bucket_id = self.current_bucket_id.load(Ordering::Relaxed); + if bucket_id < 0 { + Ok(self.next_bucket(cluster, bucket_id)) + } else { + Ok(bucket_id) + } + } +} + +/// Unlike [StickyBucketAssigner], each record is assigned to the next bucket +/// in a rotating sequence, providing even data distribution across all buckets. 
+pub struct RoundRobinBucketAssigner { + table_path: Arc, + num_buckets: i32, + counter: AtomicI32, +} + +impl RoundRobinBucketAssigner { + pub fn new(table_path: Arc, num_buckets: i32) -> Self { + let mut rng = rand::rng(); + Self { + table_path, + num_buckets, + counter: AtomicI32::new(rng.random()), + } + } +} + +impl BucketAssigner for RoundRobinBucketAssigner { + fn abort_if_batch_full(&self) -> bool { + false + } + + fn on_new_batch(&self, _cluster: &Cluster, _prev_bucket_id: i32) {} + + fn assign_bucket(&self, _bucket_key: Option<&Bytes>, cluster: &Cluster) -> Result { + let next_value = self.counter.fetch_add(1, Ordering::Relaxed); + let available_buckets = cluster.get_available_buckets_for_table_path(&self.table_path); + if available_buckets.is_empty() { + Ok((next_value & i32::MAX) % self.num_buckets) + } else { + let idx = (next_value & i32::MAX) % available_buckets.len() as i32; + Ok(available_buckets[idx as usize].bucket_id()) + } + } +} + +/// A [BucketAssigner] which assigns based on a modulo hashing function +pub struct HashBucketAssigner { + num_buckets: i32, + bucketing_function: Box, +} + +#[allow(dead_code)] +impl HashBucketAssigner { + /// Creates a new [HashBucketAssigner] based on the given [BucketingFunction]. + /// See [BucketingFunction.of(Option<&DataLakeFormat>)] for bucketing functions. 
+ /// + /// + /// # Arguments + /// * `num_buckets` - The number of buckets + /// * `bucketing_function` - The bucketing function + /// + /// # Returns + /// * [HashBucketAssigner] - The hash bucket assigner + pub fn new(num_buckets: i32, bucketing_function: Box) -> Self { + HashBucketAssigner { + num_buckets, + bucketing_function, + } + } +} + +impl BucketAssigner for HashBucketAssigner { + fn abort_if_batch_full(&self) -> bool { + false + } + + fn on_new_batch(&self, _: &Cluster, _: i32) { + // do nothing + } + + fn assign_bucket(&self, bucket_key: Option<&Bytes>, _: &Cluster) -> Result { + let key = bucket_key.ok_or_else(|| IllegalArgument { + message: "no bucket key provided".to_string(), + })?; + self.bucketing_function.bucketing(key, self.num_buckets) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::bucketing::BucketingFunction; + use crate::cluster::Cluster; + use crate::metadata::TablePath; + use crate::test_utils::build_cluster; + use std::sync::Arc; + + #[test] + fn sticky_bucket_assigner_picks_available_bucket() { + let table_path = TablePath::new("db".to_string(), "tbl".to_string()); + let cluster = build_cluster(&table_path, 1, 2); + let assigner = StickyBucketAssigner::new(Arc::new(PhysicalTablePath::of(Arc::new( + table_path.clone(), + )))); + let bucket = assigner.assign_bucket(None, &cluster).expect("bucket"); + assert!((0..2).contains(&bucket)); + + assigner.on_new_batch(&cluster, bucket); + let next_bucket = assigner.assign_bucket(None, &cluster).expect("bucket"); + assert!((0..2).contains(&next_bucket)); + } + + #[test] + fn round_robin_assigner_cycles_through_buckets() { + let table_path = TablePath::new("db".to_string(), "tbl".to_string()); + let num_buckets = 3; + let cluster = build_cluster(&table_path, 1, num_buckets); + let physical = Arc::new(PhysicalTablePath::of(Arc::new(table_path))); + let assigner = RoundRobinBucketAssigner::new(physical, num_buckets); + + let mut seen = Vec::new(); + for _ in 0..(num_buckets * 2) { + 
let bucket = assigner.assign_bucket(None, &cluster).expect("bucket"); + assert!((0..num_buckets).contains(&bucket)); + seen.push(bucket); + } + + assert_eq!(seen[0], seen[3]); + assert_eq!(seen[1], seen[4]); + assert_eq!(seen[2], seen[5]); + } + + #[test] + fn round_robin_assigner_does_not_abort_on_batch_full() { + let table_path = TablePath::new("db".to_string(), "tbl".to_string()); + let physical = Arc::new(PhysicalTablePath::of(Arc::new(table_path))); + let assigner = RoundRobinBucketAssigner::new(physical, 3); + assert!(!assigner.abort_if_batch_full()); + } + + #[test] + fn hash_bucket_assigner_requires_key() { + let assigner = HashBucketAssigner::new(3, ::of(None)); + let cluster = Cluster::default(); + let err = assigner.assign_bucket(None, &cluster).unwrap_err(); + assert!(matches!(err, IllegalArgument { .. })); + } + + #[test] + fn hash_bucket_assigner_hashes_key() { + let assigner = HashBucketAssigner::new(4, ::of(None)); + let cluster = Cluster::default(); + let bucket = assigner + .assign_bucket(Some(&Bytes::from_static(b"key")), &cluster) + .expect("bucket"); + assert!((0..4).contains(&bucket)); + } +} diff --git a/fluss-rust/crates/fluss/src/client/write/dynamic_batch_size.rs b/fluss-rust/crates/fluss/src/client/write/dynamic_batch_size.rs new file mode 100644 index 0000000000..408263ee5f --- /dev/null +++ b/fluss-rust/crates/fluss/src/client/write/dynamic_batch_size.rs @@ -0,0 +1,147 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

//! Per-table batch size estimator. Mirrors Java's `DynamicWriteBatchSizeEstimator`:
//! grow 10% above 80% fill, shrink 5% below 50%, clamped to `[min, max]`.

use std::sync::atomic::{AtomicUsize, Ordering};

const GROW_THRESHOLD: f64 = 0.8;
const SHRINK_THRESHOLD: f64 = 0.5;
const GROW_FACTOR: f64 = 1.1;
const SHRINK_FACTOR: f64 = 0.95;

/// Tracks a target batch size that adapts to how full recent batches were.
#[derive(Debug)]
pub(crate) struct DynamicWriteBatchSizeEstimator {
    /// Current estimate in bytes; always within `[min, max]`.
    current: AtomicUsize,
    min: usize,
    max: usize,
}

impl DynamicWriteBatchSizeEstimator {
    /// Creates an estimator that starts at `max_size`.
    ///
    /// A `min_size` larger than `max_size` is lowered to `max_size`, so the
    /// clamp range is never inverted.
    pub fn new(min_size: usize, max_size: usize) -> Self {
        Self {
            current: AtomicUsize::new(max_size),
            min: min_size.min(max_size),
            max: max_size,
        }
    }

    /// Returns the current estimate.
    pub fn current(&self) -> usize {
        self.current.load(Ordering::Relaxed)
    }

    /// Feeds the size of a finished batch and returns the new estimate.
    ///
    /// * `actual` above 80% of the estimate grows it by 10%, and by at least
    ///   one byte: truncating `cur * 1.1` alone is a no-op below 10, which
    ///   would pin an estimator that had shrunk to a tiny minimum.
    /// * `actual` below 50% of the estimate shrinks it by 5%.
    /// * Anything in between leaves it unchanged.
    ///
    /// The result is clamped to `[min, max]`.
    ///
    /// Last-write-wins on races, matching Java's `ConcurrentHashMap.put`.
    pub fn update(&self, actual: usize) -> usize {
        let prev = self.current.load(Ordering::Relaxed);
        let cur = prev as f64;
        let actual = actual as f64;
        let next = if actual > cur * GROW_THRESHOLD {
            ((cur * GROW_FACTOR) as usize).max(prev.saturating_add(1))
        } else if actual < cur * SHRINK_THRESHOLD {
            (cur * SHRINK_FACTOR) as usize
        } else {
            prev
        };
        let clamped = next.clamp(self.min, self.max);
        if clamped != prev {
            self.current.store(clamped, Ordering::Relaxed);
        }
        clamped
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    const MIN: usize = 256 * 1024;
    const MAX: usize = 2 * 1024 * 1024;
    /// ~41 shrink steps, ~22 grow steps; 50 covers both with margin.
    const CONVERGENCE_STEPS: usize = 50;

    #[test]
    fn starts_at_max() {
        let est = DynamicWriteBatchSizeEstimator::new(MIN, MAX);
        assert_eq!(est.current(), MAX);
    }

    #[test]
    fn min_clamped_to_max_when_misconfigured() {
        let est = DynamicWriteBatchSizeEstimator::new(MAX * 2, MAX);
        assert_eq!(est.current(), MAX);
        assert_eq!(est.update(0), MAX);
    }

    #[test]
    fn grows_when_above_grow_threshold() {
        let est = DynamicWriteBatchSizeEstimator::new(MIN, MAX);
        for _ in 0..CONVERGENCE_STEPS {
            est.update(0);
        }
        assert_eq!(est.current(), MIN);

        // 0.9 sits safely past the 0.8 threshold and avoids f64 boundary noise.
        let next = est.update((MIN as f64 * 0.9) as usize);
        assert_eq!(next, ((MIN as f64) * GROW_FACTOR) as usize);
    }

    #[test]
    fn shrinks_when_below_shrink_threshold() {
        let est = DynamicWriteBatchSizeEstimator::new(MIN, MAX);
        // 0.4 sits safely below the strict 0.5 threshold.
        let next = est.update((MAX as f64 * 0.4) as usize);
        assert_eq!(next, ((MAX as f64) * SHRINK_FACTOR) as usize);
    }

    #[test]
    fn shrink_clamps_to_min() {
        let est = DynamicWriteBatchSizeEstimator::new(MIN, MAX);
        for _ in 0..CONVERGENCE_STEPS {
            est.update(0);
        }
        assert_eq!(est.current(), MIN);
    }

    #[test]
    fn grow_clamps_to_max() {
        let est = DynamicWriteBatchSizeEstimator::new(MIN, MAX);
        for _ in 0..CONVERGENCE_STEPS {
            est.update(0);
        }
        for _ in 0..CONVERGENCE_STEPS {
            est.update(est.current());
        }
        assert_eq!(est.current(), MAX);
    }

    #[test]
    fn oversized_actual_clamps_at_max() {
        let est = DynamicWriteBatchSizeEstimator::new(MIN, MAX);
        assert_eq!(est.update(MAX * 4), MAX);
    }

    #[test]
    fn dead_zone_is_a_fixed_point() {
        let est = DynamicWriteBatchSizeEstimator::new(MIN, MAX);
        let initial = est.current();
        for _ in 0..20 {
            est.update((est.current() as f64 * 0.65) as usize);
        }
        assert_eq!(est.current(), initial);
    }
}

// --- patch metadata for the next file in the diff, kept verbatim as comments ---
// diff --git a/fluss-rust/crates/fluss/src/client/write/idempotence.rs b/fluss-rust/crates/fluss/src/client/write/idempotence.rs
// new file mode 100644
// index 0000000000..eeec8761b2
// --- /dev/null
// +++ b/fluss-rust/crates/fluss/src/client/write/idempotence.rs
// @@ -0,0 +1,767 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
+ +use crate::metadata::TableBucket; +use crate::record::{NO_BATCH_SEQUENCE, NO_WRITER_ID}; +use crate::rpc::FlussError; +use log::debug; +use parking_lot::Mutex; +use std::collections::{HashMap, HashSet}; +use std::sync::atomic::{AtomicI64, Ordering}; + +struct InFlightBatch { + batch_sequence: i32, + batch_id: i64, +} + +struct BucketEntry { + writer_id: i64, + next_sequence: i32, + last_acked_sequence: i32, + in_flight: Vec, + reset_batch_ids: HashSet, +} + +impl BucketEntry { + fn new() -> Self { + Self { + writer_id: NO_WRITER_ID, + next_sequence: 0, + last_acked_sequence: -1, + in_flight: Vec::new(), + reset_batch_ids: HashSet::new(), + } + } +} + +pub struct IdempotenceManager { + writer_id: AtomicI64, + bucket_entries: Mutex>, + enabled: bool, + max_in_flight_requests_per_bucket: usize, +} + +impl IdempotenceManager { + pub fn new(enabled: bool, max_in_flight_requests_per_bucket: usize) -> Self { + Self { + writer_id: AtomicI64::new(NO_WRITER_ID), + bucket_entries: Mutex::new(HashMap::new()), + enabled, + max_in_flight_requests_per_bucket, + } + } + + pub fn is_enabled(&self) -> bool { + self.enabled + } + + pub fn writer_id(&self) -> i64 { + self.writer_id.load(Ordering::Acquire) + } + + pub fn has_writer_id(&self) -> bool { + self.writer_id() != NO_WRITER_ID + } + + pub fn is_writer_id_valid(&self) -> bool { + self.has_writer_id() + } + + pub fn in_flight_count(&self, bucket: &TableBucket) -> usize { + let entries = self.bucket_entries.lock(); + entries.get(bucket).map_or(0, |e| e.in_flight.len()) + } + + pub fn can_send_more_requests(&self, bucket: &TableBucket) -> bool { + self.in_flight_count(bucket) < self.max_in_flight_requests_per_bucket + } + + pub fn set_writer_id(&self, id: i64) { + self.writer_id.store(id, Ordering::Release); + } + + pub fn reset_writer_id(&self) { + self.writer_id.store(NO_WRITER_ID, Ordering::Release); + self.bucket_entries.lock().clear(); + } + + pub fn next_sequence_and_increment(&self, bucket: &TableBucket) -> i32 { + let 
mut entries = self.bucket_entries.lock(); + let entry = entries + .entry(bucket.clone()) + .or_insert_with(BucketEntry::new); + let seq = entry.next_sequence; + entry.next_sequence += 1; + seq + } + + pub fn add_in_flight_batch(&self, bucket: &TableBucket, batch_sequence: i32, batch_id: i64) { + debug_assert!( + batch_sequence != NO_BATCH_SEQUENCE, + "Can't track batch for bucket {bucket} when batch sequence is not set" + ); + let mut entries = self.bucket_entries.lock(); + let entry = entries + .entry(bucket.clone()) + .or_insert_with(BucketEntry::new); + // Insert sorted by batch_sequence + let pos = entry + .in_flight + .binary_search_by_key(&batch_sequence, |b| b.batch_sequence) + .unwrap_or_else(|e| e); + entry.in_flight.insert( + pos, + InFlightBatch { + batch_sequence, + batch_id, + }, + ); + } + + pub fn handle_completed_batch( + &self, + bucket: &TableBucket, + batch_id: i64, + batch_writer_id: i64, + ) { + if batch_writer_id != self.writer_id() { + debug!( + "Ignoring completed batch for bucket {bucket} with stale writer_id {batch_writer_id} (current: {})", + self.writer_id() + ); + return; + } + let mut entries = self.bucket_entries.lock(); + if let Some(entry) = entries.get_mut(bucket) { + // Find by batch_id to handle the case where the in-flight entry's sequence + // was adjusted by a prior handle_failed_batch call. + if let Some(pos) = entry.in_flight.iter().position(|b| b.batch_id == batch_id) { + let adjusted_seq = entry.in_flight[pos].batch_sequence; + entry.in_flight.remove(pos); + entry.reset_batch_ids.remove(&batch_id); + if adjusted_seq > entry.last_acked_sequence { + entry.last_acked_sequence = adjusted_seq; + } + } + } + } + + /// Handle a failed batch. Matches Java's `IdempotenceManager.handleFailedBatch`. + /// + /// For `OutOfOrderSequenceException` or `UnknownWriterIdException`, resets ALL + /// writer state (matching Java: "we cannot make any guarantees about the previously + /// committed message"). 
+ /// + /// For other errors, removes the specific in-flight entry by `batch_id` and + /// optionally adjusts downstream sequences. `adjust_sequences` should only be true + /// when the batch has NOT exhausted its retries. + pub fn handle_failed_batch( + &self, + bucket: &TableBucket, + batch_id: i64, + batch_writer_id: i64, + error: Option, + adjust_sequences: bool, + ) { + if batch_writer_id != self.writer_id() { + debug!( + "Ignoring failed batch for bucket {bucket} with stale writer_id {batch_writer_id} (current: {})", + self.writer_id() + ); + return; + } + + let mut entries = self.bucket_entries.lock(); + + // Matches Java: OutOfOrderSequence or UnknownWriterId → reset all writer state. + // Java's synchronized handleFailedBatch can call synchronized resetWriterId + // because Java monitors are reentrant. We inline the reset here to stay in + // the same lock scope. + if let Some(e) = error { + if e == FlussError::OutOfOrderSequenceException + || e == FlussError::UnknownWriterIdException + { + debug!( + "Resetting writer ID due to {e:?} for bucket {bucket} \ + (writer_id={batch_writer_id}, batch_id={batch_id})" + ); + self.writer_id.store(NO_WRITER_ID, Ordering::Release); + entries.clear(); + return; + } + } + if let Some(entry) = entries.get_mut(bucket) { + // Find and remove by batch_id, capturing the (possibly adjusted) sequence + let failed_sequence = entry + .in_flight + .iter() + .position(|b| b.batch_id == batch_id) + .map(|pos| { + let seq = entry.in_flight[pos].batch_sequence; + entry.in_flight.remove(pos); + seq + }); + entry.reset_batch_ids.remove(&batch_id); + if adjust_sequences { + if let Some(failed_seq) = failed_sequence { + // Decrement sequences of in-flight batches that have higher sequences + for b in &mut entry.in_flight { + if b.batch_sequence > failed_seq { + b.batch_sequence -= 1; + debug_assert!( + b.batch_sequence >= 0, + "Batch sequence for batch_id={} went negative: {}", + b.batch_id, + b.batch_sequence + ); + 
entry.reset_batch_ids.insert(b.batch_id); + } + } + // Roll back next_sequence + if entry.next_sequence > failed_seq { + entry.next_sequence -= 1; + debug_assert!( + entry.next_sequence >= 0, + "Next sequence went negative: {}", + entry.next_sequence + ); + } + } + } + } + } + + #[cfg(test)] + pub fn remove_in_flight_batch(&self, bucket: &TableBucket, batch_id: i64) { + let mut entries = self.bucket_entries.lock(); + if let Some(entry) = entries.get_mut(bucket) { + entry.in_flight.retain(|b| b.batch_id != batch_id); + } + } + + /// If the bucket's stored writer_id doesn't match the current writer_id + /// and there are no in-flight batches, reset the bucket entry to start + /// sequences from 0. Matches Java's `IdempotenceManager.maybeUpdateWriterId`. + pub fn maybe_update_writer_id(&self, bucket: &TableBucket) { + let current_writer_id = self.writer_id(); + let mut entries = self.bucket_entries.lock(); + let entry = entries + .entry(bucket.clone()) + .or_insert_with(BucketEntry::new); + if entry.writer_id != current_writer_id && entry.in_flight.is_empty() { + entry.writer_id = current_writer_id; + entry.next_sequence = 0; + entry.last_acked_sequence = -1; + debug!( + "Writer id of bucket {bucket} set to {current_writer_id}. Reinitialize batch sequence at beginning." + ); + } + } + + /// Returns true if the given batch (identified by `batch_id`) is the first + /// in-flight batch for its bucket. Uses batch_id rather than batch_sequence + /// because sequence adjustment (`handle_failed_batch` with `adjust_sequences`) + /// modifies InFlightBatch sequences without updating the actual WriteBatch, + /// so batch_sequence on the WriteBatch may be stale. + pub fn is_first_in_flight_batch(&self, bucket: &TableBucket, batch_id: i64) -> bool { + let entries = self.bucket_entries.lock(); + entries + .get(bucket) + .and_then(|e| e.in_flight.first()) + .is_some_and(|b| b.batch_id == batch_id) + } + + /// Returns the current (possibly adjusted) in-flight sequence for a batch. 
+ /// Used by `re_enqueue` to sync the WriteBatch's sequence with the adjusted + /// InFlightBatch sequence. + /// + /// Does NOT clear `reset_batch_ids` — the reset marker must survive + /// re-enqueue so that `can_retry_for_error` can still see it on subsequent + /// retries. It is cleared only on terminal events: `handle_completed_batch` + /// or `handle_failed_batch`. This matches Java where `reopened` persists + /// across retries and is only cleared in `close()` (resource cleanup). + pub fn get_adjusted_sequence(&self, bucket: &TableBucket, batch_id: i64) -> Option { + let entries = self.bucket_entries.lock(); + let entry = entries.get(bucket)?; + entry + .in_flight + .iter() + .find(|b| b.batch_id == batch_id) + .map(|b| b.batch_sequence) + } + + pub fn is_next_sequence(&self, bucket: &TableBucket, batch_sequence: i32) -> bool { + let entries = self.bucket_entries.lock(); + if let Some(entry) = entries.get(bucket) { + entry.last_acked_sequence + 1 == batch_sequence + } else { + // No entry means sequence 0 is expected (last_acked = -1, so -1 + 1 = 0) + batch_sequence == 0 + } + } + + /// Returns true if the batch has already been committed on the server. + /// + /// If the batch's sequence is less than or equal to `last_acked_sequence`, it means + /// a higher-sequence batch has already been acknowledged. This implies the current batch + /// was also successfully written on the server (otherwise the higher-sequence batch could + /// not have been committed). 
+ pub fn is_already_committed(&self, bucket: &TableBucket, batch_sequence: i32) -> bool { + let entries = self.bucket_entries.lock(); + entries + .get(bucket) + .is_some_and(|e| e.last_acked_sequence >= 0 && batch_sequence <= e.last_acked_sequence) + } + + pub fn can_retry_for_error( + &self, + bucket: &TableBucket, + batch_sequence: i32, + batch_id: i64, + error: FlussError, + ) -> bool { + if !self.has_writer_id() { + return false; + } + let entries = self.bucket_entries.lock(); + let entry = entries.get(bucket); + let is_reset = entry.is_some_and(|e| e.reset_batch_ids.contains(&batch_id)); + + if error == FlussError::OutOfOrderSequenceException { + // Inline is_next_sequence logic to avoid double-locking + let is_next = entry.map_or(batch_sequence == 0, |e| { + e.last_acked_sequence + 1 == batch_sequence + }); + return is_reset || !is_next; + } + if error == FlussError::UnknownWriterIdException { + return is_reset; + } + false + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn test_bucket(bucket_id: i32) -> TableBucket { + TableBucket::new(1, bucket_id) + } + + /// Setup: 3 in-flight batches (seq 0,1,2 / batch_id 100,101,102) for bucket 0. 
+ fn setup_three_in_flight() -> (IdempotenceManager, TableBucket) { + let mgr = IdempotenceManager::new(true, 5); + mgr.set_writer_id(42); + let b0 = test_bucket(0); + let _ = mgr.next_sequence_and_increment(&b0); // 0 + let _ = mgr.next_sequence_and_increment(&b0); // 1 + let _ = mgr.next_sequence_and_increment(&b0); // 2 + mgr.add_in_flight_batch(&b0, 0, 100); + mgr.add_in_flight_batch(&b0, 1, 101); + mgr.add_in_flight_batch(&b0, 2, 102); + (mgr, b0) + } + + #[test] + fn test_handle_completed_batch() { + let (mgr, b0) = setup_three_in_flight(); + + // Basic: complete middle batch, verify removal and last_acked update + mgr.handle_completed_batch(&b0, 101, 42); + { + let entries = mgr.bucket_entries.lock(); + let entry = entries.get(&b0).unwrap(); + assert_eq!(entry.last_acked_sequence, 1); + assert_eq!(entry.in_flight.len(), 2); + assert_eq!(entry.in_flight[0].batch_sequence, 0); + assert_eq!(entry.in_flight[1].batch_sequence, 2); + } + + // Adjusted: fail batch_id=101 (seq=1) with adjustment, then complete + // batch_id=102 whose seq was adjusted from 2→1. last_acked should use + // the adjusted sequence.
+ let (mgr, b0) = setup_three_in_flight(); + mgr.handle_failed_batch(&b0, 101, 42, None, true); + mgr.handle_completed_batch(&b0, 102, 42); + { + let entries = mgr.bucket_entries.lock(); + let entry = entries.get(&b0).unwrap(); + assert_eq!(entry.last_acked_sequence, 1); // adjusted, not original 2 + assert_eq!(entry.in_flight.len(), 1); + assert_eq!(entry.in_flight[0].batch_id, 100); + } + } + + #[test] + fn test_handle_failed_batch() { + // With sequence adjustment + let (mgr, b0) = setup_three_in_flight(); + mgr.handle_failed_batch(&b0, 101, 42, None, true); + { + let entries = mgr.bucket_entries.lock(); + let entry = entries.get(&b0).unwrap(); + assert_eq!(entry.in_flight.len(), 2); + assert_eq!(entry.in_flight[0].batch_sequence, 0); + assert_eq!(entry.in_flight[1].batch_sequence, 1); // was 2, decremented + assert_eq!(entry.next_sequence, 2); // was 3, decremented + } + + // Without sequence adjustment (retries exhausted) + let (mgr, b0) = setup_three_in_flight(); + mgr.handle_failed_batch(&b0, 101, 42, None, false); + { + let entries = mgr.bucket_entries.lock(); + let entry = entries.get(&b0).unwrap(); + assert_eq!(entry.in_flight.len(), 2); + assert_eq!(entry.in_flight[0].batch_sequence, 0); + assert_eq!(entry.in_flight[1].batch_sequence, 2); // NOT decremented + assert_eq!(entry.next_sequence, 3); // NOT decremented + } + + // OOS / UnknownWriterId errors reset all writer state + for error in [ + FlussError::OutOfOrderSequenceException, + FlussError::UnknownWriterIdException, + ] { + let (mgr, b0) = setup_three_in_flight(); + mgr.handle_failed_batch(&b0, 100, 42, Some(error), true); + assert!(!mgr.has_writer_id()); + assert!(mgr.bucket_entries.lock().is_empty()); + } + } + + #[test] + fn test_can_retry_out_of_order() { + let mgr = IdempotenceManager::new(true, 5); + let b0 = test_bucket(0); + + // No writer_id → never retriable + assert!(!mgr.can_retry_for_error(&b0, 0, 100, FlussError::OutOfOrderSequenceException)); + + mgr.set_writer_id(42); + 
mgr.add_in_flight_batch(&b0, 0, 100); + mgr.add_in_flight_batch(&b0, 1, 101); + + // seq=0 IS next expected (last_acked=-1+1=0) → genuine violation, NOT retriable + assert!(!mgr.can_retry_for_error(&b0, 0, 100, FlussError::OutOfOrderSequenceException)); + // seq=1 is NOT next expected → retriable + assert!(mgr.can_retry_for_error(&b0, 1, 101, FlussError::OutOfOrderSequenceException)); + } + + #[test] + fn test_can_retry_after_sequence_reset() { + // OOS: batch whose seq was adjusted to match last_acked+1 is still retriable + let (mgr, b0) = setup_three_in_flight(); + mgr.handle_completed_batch(&b0, 100, 42); // last_acked=0 + mgr.handle_failed_batch(&b0, 101, 42, None, true); // batch_id=102 adjusted to seq=1 + + // seq=1 == last_acked(0)+1, but batch was reset → retriable + assert!(mgr.can_retry_for_error(&b0, 1, 102, FlussError::OutOfOrderSequenceException)); + + // UnknownWriterId: non-reset → NOT retriable, reset → retriable + let (mgr, b0) = setup_three_in_flight(); + assert!(!mgr.can_retry_for_error(&b0, 0, 100, FlussError::UnknownWriterIdException)); + mgr.handle_failed_batch(&b0, 101, 42, None, true); // batch_id=102 is reset + assert!(mgr.can_retry_for_error(&b0, 1, 102, FlussError::UnknownWriterIdException)); + } + + #[test] + fn test_maybe_update_writer_id() { + let mgr = IdempotenceManager::new(true, 5); + mgr.set_writer_id(42); + let b0 = test_bucket(0); + + mgr.maybe_update_writer_id(&b0); + let seq = mgr.next_sequence_and_increment(&b0); + mgr.add_in_flight_batch(&b0, seq, 100); + + // With in-flight batches: rotation is deferred + mgr.set_writer_id(99); + mgr.maybe_update_writer_id(&b0); + { + let entries = mgr.bucket_entries.lock(); + let entry = entries.get(&b0).unwrap(); + assert_eq!(entry.writer_id, 42); // unchanged + assert_eq!(entry.next_sequence, 1); + } + + // Complete must use the writer_id that was active when batch was sent + mgr.handle_completed_batch(&b0, 100, 99); + mgr.maybe_update_writer_id(&b0); + { + let entries = 
mgr.bucket_entries.lock(); + let entry = entries.get(&b0).unwrap(); + assert_eq!(entry.writer_id, 99); + assert_eq!(entry.next_sequence, 0); + assert_eq!(entry.last_acked_sequence, -1); + } + } + + #[test] + fn test_is_first_in_flight_batch() { + let (mgr, b0) = setup_three_in_flight(); + + assert!(mgr.is_first_in_flight_batch(&b0, 100)); + assert!(!mgr.is_first_in_flight_batch(&b0, 101)); + + // After adjustment + completion, batch_id still identifies first correctly + mgr.handle_failed_batch(&b0, 101, 42, None, true); + mgr.handle_completed_batch(&b0, 100, 42); + assert!(mgr.is_first_in_flight_batch(&b0, 102)); + assert!(!mgr.is_first_in_flight_batch(&b0, 100)); + } + + #[test] + fn test_can_send_more_requests() { + let mgr = IdempotenceManager::new(true, 2); + let b0 = test_bucket(0); + + assert!(mgr.can_send_more_requests(&b0)); + + mgr.add_in_flight_batch(&b0, 0, 100); + assert!(mgr.can_send_more_requests(&b0)); + + mgr.add_in_flight_batch(&b0, 1, 101); + assert!(!mgr.can_send_more_requests(&b0)); // at limit + + mgr.remove_in_flight_batch(&b0, 100); + assert!(mgr.can_send_more_requests(&b0)); // under limit again + } + + #[test] + fn test_is_already_committed() { + let mgr = IdempotenceManager::new(true, 5); + let b0 = test_bucket(0); + mgr.set_writer_id(42); + + // No entry yet → not committed + assert!(!mgr.is_already_committed(&b0, 0)); + + // Initialize bucket and ack seq=0 + let _ = mgr.next_sequence_and_increment(&b0); // 0 + mgr.add_in_flight_batch(&b0, 0, 100); + mgr.handle_completed_batch(&b0, 100, 42); // last_acked=0 + + // seq=0 <= last_acked(0) → committed + assert!(mgr.is_already_committed(&b0, 0)); + // seq=1 > last_acked(0) → not committed + assert!(!mgr.is_already_committed(&b0, 1)); + + // Ack up to seq=4, then check seq=0 still committed + for seq in 1..=4 { + let _ = mgr.next_sequence_and_increment(&b0); + mgr.add_in_flight_batch(&b0, seq, 100 + seq as i64); + mgr.handle_completed_batch(&b0, 100 + seq as i64, 42); + } + 
assert!(mgr.is_already_committed(&b0, 0)); // seq=0 <= last_acked(4) + assert!(mgr.is_already_committed(&b0, 4)); // seq=4 <= last_acked(4) + assert!(!mgr.is_already_committed(&b0, 5)); // seq=5 > last_acked(4) + } + + #[test] + fn test_reset_batch_ids_cleaned_on_complete() { + let (mgr, b0) = setup_three_in_flight(); + + // Fail batch_id=100 → batch_id=101 and 102 marked as reset + mgr.handle_failed_batch(&b0, 100, 42, None, true); + { + let entries = mgr.bucket_entries.lock(); + let entry = entries.get(&b0).unwrap(); + assert!(entry.reset_batch_ids.contains(&101)); + assert!(entry.reset_batch_ids.contains(&102)); + } + + // Complete batch_id=101 → cleaned from reset set + mgr.handle_completed_batch(&b0, 101, 42); + { + let entries = mgr.bucket_entries.lock(); + let entry = entries.get(&b0).unwrap(); + assert!(!entry.reset_batch_ids.contains(&101)); + assert!(entry.reset_batch_ids.contains(&102)); // still there + } + } + + #[test] + fn test_get_adjusted_sequence() { + let (mgr, b0) = setup_three_in_flight(); + + // No entry for unknown bucket + assert_eq!(mgr.get_adjusted_sequence(&test_bucket(9), 100), None); + + // Before adjustment: returns original sequences + assert_eq!(mgr.get_adjusted_sequence(&b0, 101), Some(1)); + assert_eq!(mgr.get_adjusted_sequence(&b0, 999), None); + + // After adjustment: returns adjusted sequences + mgr.handle_failed_batch(&b0, 100, 42, None, true); + assert_eq!(mgr.get_adjusted_sequence(&b0, 100), None); // removed + assert_eq!(mgr.get_adjusted_sequence(&b0, 101), Some(0)); // was 1 + assert_eq!(mgr.get_adjusted_sequence(&b0, 102), Some(1)); // was 2 + + // Reset flag survives get_adjusted_sequence (unlike the old take_ variant). + // This matches Java where `reopened` persists across retries. 
+ { + let entries = mgr.bucket_entries.lock(); + let entry = entries.get(&b0).unwrap(); + assert!(entry.reset_batch_ids.contains(&101)); + assert!(entry.reset_batch_ids.contains(&102)); + } + } + + // --- Scenario tests --- + // Simulate Sender-level orchestration on IdempotenceManager. + // Each test mirrors a Java SenderTest integration test, exercising the same + // state transitions that Sender.handle_write_batch_error / complete_batch make. + // + // Convention: retriable failures make NO IdempotenceManager call (batch stays + // in-flight, Sender re-enqueues via accumulator). Non-retriable failures call + // handle_failed_batch. Successes call handle_completed_batch. + + #[test] + fn scenario_multiple_inflight_retried_in_order() { + // Java: testIdempotenceWithMultipleInflightBatchesRetriedInOrder + // 3 batches in-flight, batch 0 times out, batches 1+2 get OOS. + // All are retriable and must be retried one-at-a-time in sequence order. + let (mgr, b0) = setup_three_in_flight(); + + // Batch 0 (seq=0) times out → retriable, stays in in-flight + // Batch 1 (seq=1) OOS → retriable (not next expected seq) + assert!(mgr.can_retry_for_error(&b0, 1, 101, FlussError::OutOfOrderSequenceException)); + // Batch 2 (seq=2) OOS → retriable + assert!(mgr.can_retry_for_error(&b0, 2, 102, FlussError::OutOfOrderSequenceException)); + + // Retry phase: only first-in-flight batch should be drained + assert!(mgr.is_first_in_flight_batch(&b0, 100)); + assert!(!mgr.is_first_in_flight_batch(&b0, 101)); + + // Retry batch 0 succeeds → last_acked=0 + mgr.handle_completed_batch(&b0, 100, 42); + assert_eq!(last_acked(&mgr, &b0), 0); + + // Batch 1 is now first, retry succeeds → last_acked=1 + assert!(mgr.is_first_in_flight_batch(&b0, 101)); + mgr.handle_completed_batch(&b0, 101, 42); + assert_eq!(last_acked(&mgr, &b0), 1); + + // Batch 2 is now first, retry succeeds → last_acked=2 + assert!(mgr.is_first_in_flight_batch(&b0, 102)); + mgr.handle_completed_batch(&b0, 102, 42); + 
assert_eq!(last_acked(&mgr, &b0), 2); + } + + #[test] + fn scenario_out_of_order_responses() { + // Java: testCorrectHandlingOfOutOfOrderResponses + // Server responds to batch 1 (OOS) before batch 0 (timeout). + // Both re-enqueued, retried in order. + let mgr = IdempotenceManager::new(true, 5); + mgr.set_writer_id(42); + let b0 = test_bucket(0); + let _ = mgr.next_sequence_and_increment(&b0); + let _ = mgr.next_sequence_and_increment(&b0); + mgr.add_in_flight_batch(&b0, 0, 100); + mgr.add_in_flight_batch(&b0, 1, 101); + + // Batch 1 response arrives first: OOS → retriable (seq 1 ≠ next expected 0) + assert!(mgr.can_retry_for_error(&b0, 1, 101, FlussError::OutOfOrderSequenceException)); + // Batch 0 response: timeout → retriable (no IdempotenceManager call) + + // Retry: batch 0 must go first + assert!(mgr.is_first_in_flight_batch(&b0, 100)); + mgr.handle_completed_batch(&b0, 100, 42); + assert_eq!(last_acked(&mgr, &b0), 0); + + // Then batch 1 + assert!(mgr.is_first_in_flight_batch(&b0, 101)); + mgr.handle_completed_batch(&b0, 101, 42); + assert_eq!(last_acked(&mgr, &b0), 1); + } + + #[test] + fn scenario_second_batch_succeeds_first() { + // Java: testCorrectHandlingOfOutOfOrderResponsesWhenSecondSucceeds + // + testCorrectHandlingOfDuplicateSequenceError (same at this level) + // Batch 1 succeeds before batch 0. last_acked jumps ahead, then batch 0 + // completes without regressing last_acked. 
+ let mgr = IdempotenceManager::new(true, 5); + mgr.set_writer_id(42); + let b0 = test_bucket(0); + let _ = mgr.next_sequence_and_increment(&b0); + let _ = mgr.next_sequence_and_increment(&b0); + mgr.add_in_flight_batch(&b0, 0, 100); + mgr.add_in_flight_batch(&b0, 1, 101); + + // Batch 1 succeeds first → last_acked jumps to 1 + mgr.handle_completed_batch(&b0, 101, 42); + assert_eq!(last_acked(&mgr, &b0), 1); + + // Batch 0 timeout → retriable → re-enqueued → retry succeeds + mgr.handle_completed_batch(&b0, 100, 42); + // last_acked stays 1 (0 < 1, higher wins) + assert_eq!(last_acked(&mgr, &b0), 1); + assert!( + mgr.bucket_entries + .lock() + .get(&b0) + .unwrap() + .in_flight + .is_empty() + ); + } + + #[test] + fn scenario_unknown_writer_id_resets_and_restarts() { + // Java: testRetryAfterResettingInFlightBatchSequence + // Batch 0 times out (retriable), batch 1 gets UnknownWriterId (non-retriable). + // UnknownWriterId resets all state. After new writer ID, sequences restart at 0. + let mgr = IdempotenceManager::new(true, 5); + mgr.set_writer_id(42); + let b0 = test_bucket(0); + let _ = mgr.next_sequence_and_increment(&b0); + let _ = mgr.next_sequence_and_increment(&b0); + mgr.add_in_flight_batch(&b0, 0, 100); + mgr.add_in_flight_batch(&b0, 1, 101); + + // Batch 0 times out → retriable (stays in in-flight) + // Batch 1 UnknownWriterId → NOT retriable (non-reset batch) + assert!(!mgr.can_retry_for_error(&b0, 1, 101, FlussError::UnknownWriterIdException)); + + // Sender calls fail_batch → handle_failed_batch with error → full reset + mgr.handle_failed_batch( + &b0, + 101, + 42, + Some(FlussError::UnknownWriterIdException), + true, + ); + assert!(!mgr.has_writer_id()); + assert!(mgr.bucket_entries.lock().is_empty()); + + // New writer ID allocated, sequences restart at 0 + mgr.set_writer_id(99); + assert_eq!(mgr.next_sequence_and_increment(&b0), 0); + } + + fn last_acked(mgr: &IdempotenceManager, bucket: &TableBucket) -> i32 { + mgr.bucket_entries + .lock() + 
.get(bucket) + .unwrap() + .last_acked_sequence + } +} diff --git a/fluss-rust/crates/fluss/src/client/write/mod.rs b/fluss-rust/crates/fluss/src/client/write/mod.rs new file mode 100644 index 0000000000..a65b5d5af1 --- /dev/null +++ b/fluss-rust/crates/fluss/src/client/write/mod.rs @@ -0,0 +1,265 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +mod accumulator; +mod batch; +mod dynamic_batch_size; +mod idempotence; + +use crate::client::broadcast::{self as client_broadcast, BatchWriteResult, BroadcastOnceReceiver}; +use crate::error::Error; +use crate::metadata::{PhysicalTablePath, TableInfo}; + +use crate::row::InternalRow; +pub use accumulator::*; +use arrow::array::RecordBatch; +use bytes::Bytes; +use std::future::Future; +use std::pin::Pin; +use std::sync::Arc; +use std::task::{Context, Poll}; + +pub(crate) mod broadcast; +mod bucket_assigner; + +mod sender; +mod write_format; +mod writer_client; + +pub(crate) use idempotence::IdempotenceManager; +pub use write_format::WriteFormat; +pub(crate) use writer_client::WriterClient; + +#[allow(dead_code)] +pub struct WriteRecord<'a> { + record: Record<'a>, + physical_table_path: Arc, + bucket_key: Option, + schema_id: i32, + write_format: WriteFormat, + table_info: Arc, +} + +impl<'a> WriteRecord<'a> { + pub fn record(&self) -> &Record<'a> { + &self.record + } + + pub fn physical_table_path(&self) -> &Arc { + &self.physical_table_path + } + + /// Minimum batch capacity needed to fit this record, including batch header + /// overhead. Used to size memory reservations and KV write limits so that + /// oversized records don't panic on append. 
+ pub fn estimated_record_size(&self) -> usize { + match &self.record { + Record::Kv(kv) => { + let record_size = crate::record::kv::KvRecord::size_of( + &kv.key, + kv.row_bytes.as_ref().map(|rb| rb.as_slice()), + ); + crate::record::kv::RECORD_BATCH_HEADER_SIZE + record_size + } + Record::Log(_) => 0, // Arrow batches use record count, not byte size + } + } +} + +pub enum Record<'a> { + Log(LogWriteRecord<'a>), + Kv(KvWriteRecord<'a>), +} + +pub enum LogWriteRecord<'a> { + InternalRow(&'a dyn InternalRow), + RecordBatch(Arc), +} + +#[derive(Clone)] +pub enum RowBytes<'a> { + Borrowed(&'a [u8]), + Owned(Bytes), +} + +impl<'a> RowBytes<'a> { + pub fn as_slice(&self) -> &[u8] { + match self { + RowBytes::Borrowed(slice) => slice, + RowBytes::Owned(bytes) => bytes.as_ref(), + } + } +} + +pub struct KvWriteRecord<'a> { + key: Bytes, + target_columns: Option>>, + row_bytes: Option>, +} + +impl<'a> KvWriteRecord<'a> { + fn new( + key: Bytes, + target_columns: Option>>, + row_bytes: Option>, + ) -> Self { + KvWriteRecord { + key, + target_columns, + row_bytes, + } + } + + pub fn row_bytes(&self) -> Option<&[u8]> { + self.row_bytes.as_ref().map(|rb| rb.as_slice()) + } +} + +impl<'a> WriteRecord<'a> { + pub fn for_append( + table_info: Arc, + physical_table_path: Arc, + schema_id: i32, + row: &'a dyn InternalRow, + ) -> Self { + Self { + table_info, + record: Record::Log(LogWriteRecord::InternalRow(row)), + physical_table_path, + bucket_key: None, + schema_id, + write_format: WriteFormat::ArrowLog, + } + } + + pub fn for_append_record_batch( + table_info: Arc, + physical_table_path: Arc, + schema_id: i32, + row: RecordBatch, + ) -> Self { + Self { + table_info, + record: Record::Log(LogWriteRecord::RecordBatch(Arc::new(row))), + physical_table_path, + bucket_key: None, + schema_id, + write_format: WriteFormat::ArrowLog, + } + } + + #[allow(clippy::too_many_arguments)] + pub fn for_upsert( + table_info: Arc, + physical_table_path: Arc, + schema_id: i32, + key: Bytes, + 
bucket_key: Option, + write_format: WriteFormat, + target_columns: Option>>, + row_bytes: Option>, + ) -> Self { + Self { + table_info, + record: Record::Kv(KvWriteRecord::new(key, target_columns, row_bytes)), + physical_table_path, + bucket_key, + schema_id, + write_format, + } + } +} + +#[derive(Debug, Clone)] +pub struct ResultHandle { + receiver: BroadcastOnceReceiver, +} + +impl ResultHandle { + pub fn new(receiver: BroadcastOnceReceiver) -> Self { + ResultHandle { receiver } + } + + /// Force-complete with an error if not already completed. + pub(crate) fn fail(&self, error: client_broadcast::Error) { + self.receiver.fail(error); + } + + pub async fn wait(&self) -> Result { + self.receiver + .receive() + .await + .map_err(|e| Error::UnexpectedError { + message: format!("Fail to wait write result {e:?}"), + source: None, + }) + } + + pub fn result(&self, batch_result: BatchWriteResult) -> Result<(), Error> { + batch_result.map_err(|e| match e { + client_broadcast::Error::WriteFailed { code, message } => Error::FlussAPIError { + api_error: crate::rpc::ApiError { code, message }, + }, + client_broadcast::Error::Client { message } => Error::UnexpectedError { + message, + source: None, + }, + client_broadcast::Error::Dropped => Error::UnexpectedError { + message: "Fail to get write result because broadcast was dropped.".to_string(), + source: None, + }, + }) + } +} + +/// A future that represents a pending write operation. +/// +/// This type implements [`Future`], allowing users to either: +/// 1. Await immediately to block on acknowledgment: `writer.upsert(&row)?.await?` +/// 2. Fire-and-forget with later flush: `writer.upsert(&row)?; writer.flush().await?` +/// +/// This pattern is similar to rdkafka's `DeliveryFuture` and allows for efficient batching +/// when users don't need immediate per-record acknowledgment. 
+pub struct WriteResultFuture { + inner: Pin> + Send>>, +} + +impl std::fmt::Debug for WriteResultFuture { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("WriteResultFuture").finish_non_exhaustive() + } +} + +impl WriteResultFuture { + /// Create a new WriteResultFuture from a ResultHandle. + pub fn new(result_handle: ResultHandle) -> Self { + Self { + inner: Box::pin(async move { + let result = result_handle.wait().await?; + result_handle.result(result) + }), + } + } +} + +impl Future for WriteResultFuture { + type Output = Result<(), Error>; + + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + self.inner.as_mut().poll(cx) + } +} diff --git a/fluss-rust/crates/fluss/src/client/write/sender.rs b/fluss-rust/crates/fluss/src/client/write/sender.rs new file mode 100644 index 0000000000..8e738d0dc5 --- /dev/null +++ b/fluss-rust/crates/fluss/src/client/write/sender.rs @@ -0,0 +1,1398 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::client::broadcast; +use crate::client::metadata::Metadata; +use crate::client::write::IdempotenceManager; +use crate::client::write::batch::WriteBatch; +use crate::client::{ReadyWriteBatch, RecordAccumulator}; +use crate::error::Error::UnexpectedError; +use crate::error::{FlussError, Result}; +use crate::metadata::{PhysicalTablePath, TableBucket, TablePath}; +use crate::proto::{ + PbProduceLogRespForBucket, PbPutKvRespForBucket, PbTablePath, ProduceLogResponse, PutKvResponse, +}; +use crate::record::{NO_BATCH_SEQUENCE, NO_WRITER_ID}; +use crate::rpc::ServerConnection; +use crate::rpc::message::{InitWriterRequest, ProduceLogRequest, PutKvRequest}; +use crate::{PartitionId, TableId}; +use futures::StreamExt; +use futures::stream::FuturesUnordered; +use log::{debug, warn}; +use parking_lot::Mutex; +use std::collections::{HashMap, HashSet}; +use std::future::Future; +use std::pin::Pin; +use std::sync::Arc; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::time::Duration; +use tokio::sync::mpsc; + +type SendFuture<'a> = Pin> + Send + 'a>>; + +/// Result of a synchronous drain: send futures, optional delay, and unknown leader tables. 
type DrainResult<'a> = (
    Vec<SendFuture<'a>>,
    Option<u64>,
    HashSet<Arc<PhysicalTablePath>>,
);

/// Drains ready batches from the [`RecordAccumulator`] and sends them to their
/// leader tablet servers. When idempotence is enabled, a writer ID is allocated
/// before anything is drained.
#[allow(dead_code)]
pub struct Sender {
    // NOTE(review): initialised to `true` in `new`; presumably cleared on shutdown — confirm.
    running: AtomicBool,
    metadata: Arc<Metadata>,
    accumulator: Arc<RecordAccumulator>,
    // Batch ids of drained batches, grouped by the bucket they were drained for.
    in_flight_batches: Mutex<HashMap<TableBucket, Vec<i64>>>,
    max_request_size: i32,
    ack: i16,
    max_request_timeout_ms: i32,
    retries: i32,
    idempotence_manager: Arc<IdempotenceManager>,
}

impl Sender {
    /// Creates a sender in the running state with no in-flight batches recorded.
    pub fn new(
        metadata: Arc<Metadata>,
        accumulator: Arc<RecordAccumulator>,
        max_request_size: i32,
        max_request_timeout_ms: i32,
        ack: i16,
        retries: i32,
        idempotence_manager: Arc<IdempotenceManager>,
    ) -> Self {
        Self {
            running: AtomicBool::new(true),
            metadata,
            accumulator,
            in_flight_batches: Default::default(),
            max_request_size,
            ack,
            max_request_timeout_ms,
            retries,
            idempotence_manager,
        }
    }

    /// Number of retries after the first failed writer ID allocation.
    const WRITER_ID_RETRY_TIMES: u32 = 3;
    /// Base interval for the exponential backoff between allocation attempts.
    const WRITER_ID_RETRY_INTERVAL_MS: u64 = 100;

    /// Allocates a writer ID if idempotence is enabled and none is held yet.
    ///
    /// Failed attempts are retried with exponential backoff
    /// (`WRITER_ID_RETRY_INTERVAL_MS * 2^attempt`). Authorization errors and an
    /// exhausted retry budget return the last error to the caller.
    async fn maybe_wait_for_writer_id(&self) -> Result<()> {
        if !self.idempotence_manager.is_enabled() || self.idempotence_manager.has_writer_id() {
            return Ok(());
        }
        let mut retry_count = 0u32;
        loop {
            let Err(e) = self.try_init_writer_id().await else {
                return Ok(());
            };

            // Authorization errors are not transient — fail immediately.
            // Otherwise give up once the retry budget is spent.
            if e.api_error() == Some(FlussError::AuthorizationException)
                || retry_count >= Self::WRITER_ID_RETRY_TIMES
            {
                return Err(e);
            }

            // Stale metadata: refresh it before the next attempt (best effort).
            if e.api_error().is_some_and(Self::is_invalid_metadata_error) {
                let physical_paths = self.accumulator.get_physical_table_paths_in_batches();
                let physical_refs: HashSet<&Arc<PhysicalTablePath>> =
                    physical_paths.iter().collect();
                if let Err(meta_err) = self
                    .metadata
                    .update_tables_metadata(&HashSet::new(), &physical_refs, vec![])
                    .await
                {
                    warn!("Failed to refresh metadata after writer ID error: {meta_err}");
                }
            }

            retry_count += 1;
            let delay_ms = Self::WRITER_ID_RETRY_INTERVAL_MS * 2u64.pow(retry_count);
            warn!(
                "Failed to allocate writer ID (attempt {retry_count}/{}), retrying in {delay_ms}ms: {e}",
                Self::WRITER_ID_RETRY_TIMES,
            );
            tokio::time::sleep(Duration::from_millis(delay_ms)).await;
        }
    }

    /// Sends one `InitWriterRequest` covering every table that currently has
    /// batches and stores the returned writer ID.
    ///
    /// Returns `Ok(())` without allocating when no batches are pending.
    async fn try_init_writer_id(&self) -> Result<()> {
        // Deduplicate by (database, table) since multiple physical paths (partitions)
        // may share the same table. Matches Java's Set dedup.
        let mut seen = HashSet::new();
        let table_paths: Vec<PbTablePath> = self
            .accumulator
            .get_physical_table_paths_in_batches()
            .iter()
            .filter_map(|path| {
                let key = (
                    path.get_database_name().to_string(),
                    path.get_table_name().to_string(),
                );
                seen.insert(key.clone()).then(|| PbTablePath {
                    database_name: key.0,
                    table_name: key.1,
                })
            })
            .collect();
        if table_paths.is_empty() {
            debug!("No table paths in batches, skipping writer ID allocation");
            return Ok(());
        }

        let cluster = self.metadata.get_cluster();
        // `ok_or_else` so the error message is only allocated when no server is available.
        let server = cluster
            .get_one_available_server()
            .ok_or_else(|| UnexpectedError {
                message: "No tablet server available to allocate writer ID".to_string(),
                source: None,
            })?;
        let connection = self.metadata.get_connection(server).await?;
        let response = connection
            .request(InitWriterRequest::new(table_paths))
            .await?;
        self.idempotence_manager.set_writer_id(response.writer_id);
        debug!(
            "Allocated writer ID {} for idempotent writes",
            response.writer_id
        );
        Ok(())
    }

    /// Aborts every incomplete batch in the accumulator with a client error
    /// that embeds `error`. Does nothing when no batch is incomplete.
    fn maybe_abort_batches(&self, error: &crate::error::Error) {
        if !self.accumulator.has_incomplete() {
            return;
        }
        warn!("Aborting write batches due to fatal error: {error}");
        self.accumulator.abort_batches(broadcast::Error::Client {
            message: format!("Writer ID allocation failed: {error}"),
        });
    }

    /// Sequential init + drain + metadata refresh. Used by `run_once` (shutdown)
    /// where blocking is acceptable.
+ async fn prepare_sends(&self) -> Result<(Vec>, Option)> { + if let Err(e) = self.maybe_wait_for_writer_id().await { + warn!("Failed to allocate writer ID after retries: {e}"); + self.maybe_abort_batches(&e); + return Ok((vec![], None)); + } + let (futures, delay, unknown_leaders) = self.drain_ready_sends()?; + if !unknown_leaders.is_empty() { + if let Err(e) = self.refresh_unknown_leaders(&unknown_leaders).await { + warn!("Metadata refresh for unknown leaders failed: {e}"); + } + } + Ok((futures, delay)) + } + + /// Fully synchronous drain: `ready()` → `drain()` → build send futures. + /// No async work — safe to call on the hot path without starving + /// `pending.next()`. Returns unknown leader tables so the caller can + /// schedule a concurrent metadata refresh. + fn drain_ready_sends(&self) -> Result> { + let cluster = self.metadata.get_cluster(); + let ready_check_result = self.accumulator.ready(&cluster)?; + + let unknown_leaders = ready_check_result.unknown_leader_tables; + + if ready_check_result.ready_nodes.is_empty() { + return Ok(( + vec![], + Some(ready_check_result.next_ready_check_delay_ms as u64), + unknown_leaders, + )); + } + + let batches = self.accumulator.drain( + cluster.clone(), + &ready_check_result.ready_nodes, + self.max_request_size, + )?; + + let mut futures = Vec::new(); + if !batches.is_empty() { + self.add_to_inflight_batches(&batches); + for (leader_id, leader_batches) in batches { + futures.push( + Box::pin(self.send_write_request(leader_id, self.ack, leader_batches)) + as SendFuture<'_>, + ); + } + } + + Ok((futures, None, unknown_leaders)) + } + + /// Refresh metadata for buckets with unknown leaders. Runs as a concurrent + /// maintenance task so it never blocks the response-processing hot path. 
+ async fn refresh_unknown_leaders( + &self, + unknown_leaders: &HashSet>, + ) -> Result<()> { + let mut table_paths: HashSet<&TablePath> = HashSet::new(); + let mut physical_table_paths: HashSet<&Arc> = HashSet::new(); + + for path in unknown_leaders { + if path.get_partition_name().is_some() { + physical_table_paths.insert(path); + } else { + table_paths.insert(path.get_table_path()); + } + } + + if let Err(e) = self + .metadata + .update_tables_metadata(&table_paths, &physical_table_paths, vec![]) + .await + { + match e.api_error() { + Some(FlussError::PartitionNotExists) => { + warn!("Partition does not exist during metadata update, continuing: {e}"); + } + _ => return Err(e), + } + } + + debug!("Updated metadata for unknown leader tables: {unknown_leaders:?}"); + Ok(()) + } + + /// Blocking version of drain + send, used during shutdown drain. + async fn run_once(&self) -> Result<()> { + let (futures, delay) = self.prepare_sends().await?; + if let Some(ms) = delay { + tokio::time::sleep(Duration::from_millis(ms)).await; + return Ok(()); + } + for result in futures::future::join_all(futures).await { + result?; + } + Ok(()) + } + + fn add_to_inflight_batches(&self, batches: &HashMap>) { + let mut in_flight = self.in_flight_batches.lock(); + for batch_list in batches.values() { + for batch in batch_list { + in_flight + .entry(batch.table_bucket.clone()) + .or_default() + .push(batch.write_batch.batch_id()); + } + } + } + + async fn send_write_request( + &self, + destination: i32, + acks: i16, + batches: Vec, + ) -> Result<()> { + if batches.is_empty() { + return Ok(()); + } + let mut records_by_bucket = HashMap::new(); + let mut write_batch_by_table: HashMap> = HashMap::new(); + + for batch in batches { + let table_bucket = batch.table_bucket.clone(); + write_batch_by_table + .entry(table_bucket.table_id()) + .or_default() + .push(table_bucket.clone()); + records_by_bucket.insert(table_bucket, batch); + } + + let cluster = self.metadata.get_cluster(); + + let 
destination_node = match cluster.get_tablet_server(destination) { + Some(node) => node, + None => { + self.handle_batches_with_error( + records_by_bucket.into_values().collect(), + FlussError::LeaderNotAvailableException, + format!("Destination node not found in metadata cache {destination}."), + ) + .await?; + return Ok(()); + } + }; + let connection = match self.metadata.get_connection(destination_node).await { + Ok(connection) => connection, + Err(e) => { + self.handle_batches_with_error( + records_by_bucket.into_values().collect(), + FlussError::NetworkException, + format!("Failed to connect destination node {destination}: {e}"), + ) + .await?; + return Ok(()); + } + }; + + for (table_id, table_buckets) in write_batch_by_table { + let mut request_batches: Vec = table_buckets + .iter() + .filter_map(|bucket| records_by_bucket.remove(bucket)) + .collect(); + + if request_batches.is_empty() { + continue; + } + + let write_request = match Self::build_write_request( + table_id, + acks, + self.max_request_timeout_ms, + &mut request_batches, + ) { + Ok(req) => req, + Err(e) => { + self.handle_batches_with_local_error( + request_batches, + format!("Failed to build write request: {e}"), + )?; + continue; + } + }; + + // let's put in back into records_by_bucket + // since response handle will use it. 
+ for request_batch in request_batches { + records_by_bucket.insert(request_batch.table_bucket.clone(), request_batch); + } + + self.send_and_handle_response( + &connection, + write_request, + table_id, + &table_buckets, + &mut records_by_bucket, + ) + .await?; + } + + Ok(()) + } + + fn build_write_request( + table_id: i64, + acks: i16, + timeout_ms: i32, + request_batches: &mut [ReadyWriteBatch], + ) -> Result { + let first_batch = &request_batches.first().unwrap().write_batch; + + let request = match first_batch { + WriteBatch::ArrowLog(_) => { + let req = ProduceLogRequest::new(table_id, acks, timeout_ms, request_batches)?; + WriteRequest::ProduceLog(req) + } + WriteBatch::Kv(kv_write_batch) => { + let target_columns = kv_write_batch.target_columns(); + for batch in request_batches.iter().skip(1) { + match &batch.write_batch { + WriteBatch::ArrowLog(_) => { + return Err(UnexpectedError { + message: "Expecting KvWriteBatch but found ArrowLogWriteBatch" + .to_string(), + source: None, + }); + } + WriteBatch::Kv(kvb) => { + if target_columns != kvb.target_columns() { + return Err(UnexpectedError { + message: format!( + "All the write batches to make put kv request should have the same target columns, but got {:?} and {:?}.", + target_columns, + kvb.target_columns() + ), + source: None, + }); + } + } + } + } + let cols = target_columns + .map(|arc| arc.iter().map(|&c| c as i32).collect()) + .unwrap_or_default(); + let req = PutKvRequest::new(table_id, acks, timeout_ms, cols, request_batches)?; + WriteRequest::PutKv(req) + } + }; + + Ok(request) + } + + async fn send_and_handle_response( + &self, + connection: &ServerConnection, + write_request: WriteRequest, + table_id: i64, + table_buckets: &[TableBucket], + records_by_bucket: &mut HashMap, + ) -> Result<()> { + macro_rules! 
send { + ($request:expr) => { + match connection.request($request).await { + Ok(response) => { + self.handle_write_response( + table_id, + table_buckets, + records_by_bucket, + response, + ) + .await + } + Err(e) => { + self.handle_batches_with_error( + table_buckets + .iter() + .filter_map(|b| records_by_bucket.remove(b)) + .collect(), + FlussError::NetworkException, + format!("Failed to send write request: {e}"), + ) + .await + } + } + }; + } + + match write_request { + WriteRequest::ProduceLog(req) => send!(req), + WriteRequest::PutKv(req) => send!(req), + } + } + + async fn handle_write_response( + &self, + table_id: i64, + request_buckets: &[TableBucket], + records_by_bucket: &mut HashMap, + response: R, + ) -> Result<()> { + let mut invalid_metadata_tables: HashSet = HashSet::new(); + let mut invalid_physical_table_paths: HashSet> = HashSet::new(); + let mut pending_buckets: HashSet = request_buckets.iter().cloned().collect(); + + for bucket_resp in response.buckets_resp() { + let tb = TableBucket::new_with_partition( + table_id, + bucket_resp.partition_id(), + bucket_resp.bucket_id(), + ); + let Some(ready_batch) = records_by_bucket.remove(&tb) else { + panic!("Missing ready batch for table bucket {tb}"); + }; + pending_buckets.remove(&tb); + + match bucket_resp.error_code() { + Some(code) if code != FlussError::None.code() => { + let error = FlussError::for_code(code); + let message = bucket_resp + .error_message() + .cloned() + .unwrap_or_else(|| error.message().to_string()); + if let Some(physical_table_path) = + self.handle_write_batch_error(ready_batch, error, message)? 
+ { + invalid_metadata_tables + .insert(physical_table_path.get_table_path().clone()); + invalid_physical_table_paths.insert(physical_table_path); + } + } + _ => self.complete_batch(ready_batch), + } + } + + for bucket in pending_buckets { + if let Some(ready_batch) = records_by_bucket.remove(&bucket) { + if let Some(physical_table_path) = self.handle_write_batch_error( + ready_batch, + FlussError::UnknownServerError, + format!("Missing response for table bucket {bucket}"), + )? { + invalid_metadata_tables.insert(physical_table_path.get_table_path().clone()); + invalid_physical_table_paths.insert(physical_table_path); + } + } + } + + self.update_metadata_if_needed(invalid_metadata_tables, invalid_physical_table_paths) + .await; + Ok(()) + } + + // TODO: Java has a second overload `completeBatch(batch, bucket, logEndOffset)` used for + // KV responses. When callers need write offset info, change BatchWriteResult to carry + // optional offset metadata and plumb it through BroadcastOnce → ResultHandle → WriteResultFuture. 
+ fn complete_batch(&self, ready_write_batch: ReadyWriteBatch) { + if self.idempotence_manager.is_enabled() + && ready_write_batch.write_batch.batch_sequence() != NO_BATCH_SEQUENCE + { + self.idempotence_manager.handle_completed_batch( + &ready_write_batch.table_bucket, + ready_write_batch.write_batch.batch_id(), + ready_write_batch.write_batch.writer_id(), + ); + } + self.finish_batch(ready_write_batch, Ok(())); + } + + fn fail_batch( + &self, + ready_write_batch: ReadyWriteBatch, + error: broadcast::Error, + fluss_error: Option, + adjust_sequences: bool, + ) { + if self.idempotence_manager.is_enabled() + && ready_write_batch.write_batch.batch_sequence() != NO_BATCH_SEQUENCE + { + self.idempotence_manager.handle_failed_batch( + &ready_write_batch.table_bucket, + ready_write_batch.write_batch.batch_id(), + ready_write_batch.write_batch.writer_id(), + fluss_error, + adjust_sequences, + ); + } + self.finish_batch(ready_write_batch, Err(error)); + } + + fn finish_batch(&self, ready_write_batch: ReadyWriteBatch, result: broadcast::Result<()>) { + if ready_write_batch.write_batch.complete(result) { + self.remove_from_inflight_batches(&ready_write_batch); + // remove from incomplete batches + self.accumulator + .remove_incomplete_batches(ready_write_batch.write_batch.batch_id()) + } + } + + async fn handle_batches_with_error( + &self, + batches: Vec, + error: FlussError, + message: String, + ) -> Result<()> { + let mut invalid_metadata_tables: HashSet = HashSet::new(); + let mut invalid_physical_table_paths: HashSet> = HashSet::new(); + + for batch in batches { + if let Some(physical_table_path) = + self.handle_write_batch_error(batch, error, message.clone())? 
+ { + invalid_metadata_tables.insert(physical_table_path.get_table_path().clone()); + invalid_physical_table_paths.insert(physical_table_path); + } + } + self.update_metadata_if_needed(invalid_metadata_tables, invalid_physical_table_paths) + .await; + Ok(()) + } + + fn handle_batches_with_local_error( + &self, + batches: Vec, + message: String, + ) -> Result<()> { + for batch in batches { + // Local errors (e.g. build failure) — server never saw the batch, + // so it's always safe to adjust sequences. + self.fail_batch( + batch, + broadcast::Error::Client { + message: message.clone(), + }, + None, + true, + ); + } + Ok(()) + } + + fn handle_write_batch_error( + &self, + ready_write_batch: ReadyWriteBatch, + error: FlussError, + message: String, + ) -> Result>> { + let physical_table_path = Arc::clone(ready_write_batch.write_batch.physical_table_path()); + + if error == FlussError::DuplicateSequenceException { + warn!( + "Duplicate sequence for {} on bucket {}: {message}", + physical_table_path.as_ref(), + ready_write_batch.table_bucket.bucket_id() + ); + self.complete_batch(ready_write_batch); + return Ok(None); + } + + if error == FlussError::OutOfOrderSequenceException + && self.idempotence_manager.is_enabled() + && self.idempotence_manager.is_already_committed( + &ready_write_batch.table_bucket, + ready_write_batch.write_batch.batch_sequence(), + ) + { + warn!( + "Batch for {} on bucket {} with sequence {} received OutOfOrderSequenceException \ + but has already been committed. 
Treating as success due to lost response.", + physical_table_path.as_ref(), + ready_write_batch.table_bucket.bucket_id(), + ready_write_batch.write_batch.batch_sequence(), + ); + self.complete_batch(ready_write_batch); + return Ok(None); + } + + if self.can_retry(&ready_write_batch, error) { + warn!( + "Retrying write batch for {} on bucket {} after error {error:?}: {message}", + physical_table_path.as_ref(), + ready_write_batch.table_bucket.bucket_id() + ); + + // If idempotence is enabled, only retry if the current writer ID still matches + // the batch's writer ID. If the writer ID was reset (e.g., by another bucket's + // error), fail the batch instead of retrying with stale state. + if self.idempotence_manager.is_enabled() { + let batch_writer_id = ready_write_batch.write_batch.writer_id(); + if batch_writer_id != NO_WRITER_ID + && self.idempotence_manager.writer_id() != batch_writer_id + { + warn!( + "Writer ID changed from {} to {} since batch was sent, failing instead of retrying", + batch_writer_id, + self.idempotence_manager.writer_id() + ); + self.fail_batch( + ready_write_batch, + broadcast::Error::WriteFailed { + code: FlussError::UnknownWriterIdException.code(), + message: format!( + "Attempted to retry sending a batch but the writer id has changed from {} to {}. This batch will be dropped.", + batch_writer_id, + self.idempotence_manager.writer_id() + ), + }, + Some(FlussError::UnknownWriterIdException), + false, + ); + return Ok( + Self::is_invalid_metadata_error(error).then_some(physical_table_path) + ); + } + } + + self.re_enqueue_batch(ready_write_batch); + return Ok(Self::is_invalid_metadata_error(error).then_some(physical_table_path)); + } + + // Generic error path. handle_failed_batch will detect remaining + // OutOfOrderSequence (not already committed) / UnknownWriterId cases and + // reset all writer state internally (matching Java). + // For other errors, only adjust sequences if the batch didn't exhaust its retries. 
+ let can_adjust = ready_write_batch.write_batch.attempts() < self.retries; + self.fail_batch( + ready_write_batch, + broadcast::Error::WriteFailed { + code: error.code(), + message, + }, + Some(error), + can_adjust, + ); + Ok(Self::is_invalid_metadata_error(error).then_some(physical_table_path)) + } + + fn re_enqueue_batch(&self, ready_write_batch: ReadyWriteBatch) { + self.remove_from_inflight_batches(&ready_write_batch); + // TODO: add retry metrics (Java: writerMetricGroup.recordsRetryTotal().inc(recordCount)) + self.accumulator.re_enqueue(ready_write_batch); + } + + fn remove_from_inflight_batches(&self, ready_write_batch: &ReadyWriteBatch) { + let batch_id = ready_write_batch.write_batch.batch_id(); + let mut in_flight_guard = self.in_flight_batches.lock(); + if let Some(in_flight) = in_flight_guard.get_mut(&ready_write_batch.table_bucket) { + in_flight.retain(|id| *id != batch_id); + if in_flight.is_empty() { + in_flight_guard.remove(&ready_write_batch.table_bucket); + } + } + } + + fn can_retry(&self, ready_write_batch: &ReadyWriteBatch, error: FlussError) -> bool { + if ready_write_batch.write_batch.attempts() >= self.retries + || ready_write_batch.write_batch.is_done() + { + return false; + } + if Self::is_retriable_error(error) { + return true; + } + // Idempotent-specific retry logic + let seq = ready_write_batch.write_batch.batch_sequence(); + if self.idempotence_manager.is_enabled() && seq != NO_BATCH_SEQUENCE { + return self.idempotence_manager.can_retry_for_error( + &ready_write_batch.table_bucket, + seq, + ready_write_batch.write_batch.batch_id(), + error, + ); + } + false + } + + async fn update_metadata_if_needed( + &self, + table_paths: HashSet, + physical_table_path: HashSet>, + ) { + if table_paths.is_empty() { + return; + } + let table_path_refs: HashSet<&TablePath> = table_paths.iter().collect(); + let physical_table_path_refs: HashSet<&Arc> = + physical_table_path.iter().collect(); + if let Err(e) = self + .metadata + 
.update_tables_metadata(&table_path_refs, &physical_table_path_refs, vec![]) + .await + { + warn!("Failed to update metadata after write error: {e:?}"); + } + } + + fn is_invalid_metadata_error(error: FlussError) -> bool { + matches!( + error, + FlussError::NotLeaderOrFollower + | FlussError::UnknownTableOrBucketException + | FlussError::LeaderNotAvailableException + | FlussError::NetworkException + ) + } + + fn is_retriable_error(error: FlussError) -> bool { + matches!( + error, + FlussError::NetworkException + | FlussError::NotLeaderOrFollower + | FlussError::UnknownTableOrBucketException + | FlussError::LeaderNotAvailableException + | FlussError::LogStorageException + | FlussError::KvStorageException + | FlussError::StorageException + | FlussError::RequestTimeOut + | FlussError::NotEnoughReplicasAfterAppendException + | FlussError::NotEnoughReplicasException + | FlussError::CorruptMessage + | FlussError::CorruptRecordException + ) + } + + /// Event-loop sender: drain batches and fire RPCs into a `FuturesUnordered`, + /// then process responses as they arrive. This interleaves drain cycles with + /// response handling — when a fast leader responds, we immediately drain and + /// send more batches for its buckets while slow leaders are still in-flight. + /// + /// Slow work (writer-ID init with retry backoff, metadata refresh for + /// unknown leaders) runs as concurrent maintenance tasks so it never blocks + /// `pending.next()`. The drain path (`drain_ready_sends`) is fully + /// synchronous — no `.await` on the hot path. Without this separation, + /// backoff sleeps during writer-ID init could stall response processing + /// and cause severe backpressure when the accumulator memory budget is full + /// (responses not polled → memory not freed → writers block). + /// Single-select event loop with `need_drain` tick. + /// + /// Invariants: + /// - `need_drain` is a one-shot "try a drain tick ASAP" flag. 
+ /// - Each iteration either performs a sync drain tick (if flagged) or blocks + /// in a single `tokio::select!`. + /// - `accumulator.notified()` is always listened to (producer wakeups). + /// - The idle timer is only armed when truly idle (no futures in any pool). + /// - When writer_id isn't ready, a drain tick is a no-op but the loop stays + /// responsive (notified/init/meta can still wake it). + pub async fn run_with_shutdown(&self, mut shutdown_rx: mpsc::Receiver<()>) -> Result<()> { + let mut pending: FuturesUnordered> = FuturesUnordered::new(); + let mut init_futs: FuturesUnordered> = FuturesUnordered::new(); + let mut meta_futs: FuturesUnordered> = FuturesUnordered::new(); + let mut pending_unknown: HashSet> = HashSet::new(); + + let mut need_drain = true; // drain on first iteration to pick up any pre-existing batches + let mut next_delay_ms: u64 = 1; + + loop { + // Spawn writer-ID init task if needed and not already running. + if init_futs.is_empty() + && self.idempotence_manager.is_enabled() + && !self.idempotence_manager.has_writer_id() + && self.accumulator.has_undrained() + { + init_futs.push(Box::pin(self.maybe_wait_for_writer_id())); + } + + // Spawn metadata refresh if we have accumulated unknown leaders + // and no refresh is currently running. + if !pending_unknown.is_empty() && meta_futs.is_empty() { + let leaders = std::mem::take(&mut pending_unknown); + meta_futs.push(Box::pin(async move { + self.refresh_unknown_leaders(&leaders).await + })); + } + + // Drain tick: synchronous, never blocks response processing. + // Clear unconditionally — "need_drain" means "try", not "must succeed". 
+ if need_drain { + need_drain = false; + + if !self.idempotence_manager.is_enabled() + || self.idempotence_manager.has_writer_id() + { + match self.drain_ready_sends() { + Ok((futures, delay, unknown_leaders)) => { + if let Some(d) = delay { + next_delay_ms = d; + } + pending_unknown.extend(unknown_leaders); + for f in futures { + pending.push(f); + } + } + Err(e) => { + warn!("Error in drain cycle: {e}"); + } + } + } + } + + let truly_idle = pending.is_empty() && init_futs.is_empty() && meta_futs.is_empty(); + debug_assert!(next_delay_ms >= 1); + + // One select to rule them all. + tokio::select! { + _ = shutdown_rx.recv() => break, + + // Always listen for producer wakeups. + _ = self.accumulator.notified() => { + need_drain = true; + } + + // Process in-flight send responses. + Some(result) = pending.next(), if !pending.is_empty() => { + if let Err(e) = result { + warn!("Uncaught error in send request, continuing: {e}"); + } + need_drain = true; + } + + // Writer-ID init completed. + Some(result) = init_futs.next(), if !init_futs.is_empty() => { + match result { + Ok(()) => need_drain = true, + Err(e) => { + warn!("Failed to allocate writer ID after retries: {e}"); + self.maybe_abort_batches(&e); + } + } + } + + // Metadata refresh completed — new leaders may now be known. + Some(result) = meta_futs.next(), if !meta_futs.is_empty() => { + if let Err(e) = result { + warn!("Metadata refresh for unknown leaders failed: {e}"); + } + need_drain = true; + } + + // Idle timer: batch timeout / linger expiry. + _ = tokio::time::sleep(Duration::from_millis(next_delay_ms)), if truly_idle => { + need_drain = true; + } + } + } + + // Graceful shutdown: drain remaining batches, then wait for all + // in-flight sends to complete. 
+ while self.accumulator.has_undrained() { + if let Err(e) = self.run_once().await { + warn!("Error during shutdown drain, continuing: {e}"); + } + } + while let Some(result) = pending.next().await { + if let Err(e) = result { + warn!("Error in send during shutdown, continuing: {e}"); + } + } + self.close(); + Ok(()) + } + + pub fn close(&self) { + self.running.store(false, Ordering::Relaxed); + } +} + +enum WriteRequest { + ProduceLog(ProduceLogRequest), + PutKv(PutKvRequest), +} + +trait BucketResponse { + fn bucket_id(&self) -> i32; + fn error_code(&self) -> Option; + fn error_message(&self) -> Option<&String>; + + fn partition_id(&self) -> Option; +} + +impl BucketResponse for PbProduceLogRespForBucket { + fn bucket_id(&self) -> i32 { + self.bucket_id + } + fn error_code(&self) -> Option { + self.error_code + } + fn error_message(&self) -> Option<&String> { + self.error_message.as_ref() + } + + fn partition_id(&self) -> Option { + self.partition_id + } +} + +impl BucketResponse for PbPutKvRespForBucket { + fn bucket_id(&self) -> i32 { + self.bucket_id + } + fn error_code(&self) -> Option { + self.error_code + } + fn error_message(&self) -> Option<&String> { + self.error_message.as_ref() + } + + fn partition_id(&self) -> Option { + self.partition_id + } +} + +trait WriteResponse { + type BucketResp: BucketResponse; + fn buckets_resp(&self) -> &[Self::BucketResp]; +} + +impl WriteResponse for ProduceLogResponse { + type BucketResp = PbProduceLogRespForBucket; + fn buckets_resp(&self) -> &[Self::BucketResp] { + &self.buckets_resp + } +} + +impl WriteResponse for PutKvResponse { + type BucketResp = PbPutKvRespForBucket; + fn buckets_resp(&self) -> &[Self::BucketResp] { + &self.buckets_resp + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::client::WriteRecord; + use crate::cluster::Cluster; + use crate::config::Config; + use crate::metadata::{PhysicalTablePath, TablePath}; + use crate::proto::{PbProduceLogRespForBucket, ProduceLogResponse}; + use 
crate::row::{Datum, GenericRow}; + use crate::rpc::FlussError; + use crate::test_utils::{build_cluster_arc, build_table_info}; + use std::collections::{HashMap, HashSet}; + + fn disabled_idempotence() -> Arc { + Arc::new(IdempotenceManager::new(false, 5)) + } + + fn enabled_idempotence() -> Arc { + Arc::new(IdempotenceManager::new(true, 5)) + } + + fn build_ready_batch( + accumulator: &RecordAccumulator, + cluster: Arc, + table_path: Arc, + ) -> Result<(ReadyWriteBatch, crate::client::ResultHandle)> { + let table_info = Arc::new(build_table_info(table_path.as_ref().clone(), 1, 1)); + let physical_table_path = Arc::new(PhysicalTablePath::of(table_path)); + let row = GenericRow { + values: vec![Datum::Int32(1)], + }; + let record = WriteRecord::for_append(table_info, physical_table_path, 1, &row); + let result = accumulator.append(&record, 0, &cluster, false)?; + let result_handle = result.result_handle.expect("result handle"); + let server = cluster.get_tablet_server(1).expect("server"); + let nodes = HashSet::from([server.clone()]); + let mut batches = accumulator.drain(cluster, &nodes, 1024 * 1024)?; + let mut drained = batches.remove(&1).expect("drained batches"); + let batch = drained.pop().expect("batch"); + Ok((batch, result_handle)) + } + + #[tokio::test] + async fn handle_write_batch_error_retries() -> Result<()> { + let table_path = Arc::new(TablePath::new("db".to_string(), "tbl".to_string())); + let cluster = build_cluster_arc(table_path.as_ref(), 1, 1); + let metadata = Arc::new(Metadata::new_for_test(cluster.clone())); + let idempotence = disabled_idempotence(); + let accumulator = Arc::new(RecordAccumulator::new( + Config::default(), + Arc::clone(&idempotence), + )); + let sender = Sender::new( + metadata, + accumulator.clone(), + 1024 * 1024, + 1000, + 1, + 1, + idempotence, + ); + + let (batch, _handle) = + build_ready_batch(accumulator.as_ref(), cluster.clone(), table_path.clone())?; + let mut inflight = HashMap::new(); + inflight.insert(1, 
vec![batch]); + sender.add_to_inflight_batches(&inflight); + let batch = inflight.remove(&1).unwrap().pop().unwrap(); + + sender.handle_write_batch_error( + batch, + FlussError::RequestTimeOut, + "timeout".to_string(), + )?; + + let server = cluster.get_tablet_server(1).expect("server"); + let nodes = HashSet::from([server.clone()]); + let mut batches = accumulator.drain(cluster, &nodes, 1024 * 1024)?; + let mut drained = batches.remove(&1).expect("drained batches"); + let batch = drained.pop().expect("batch"); + assert_eq!(batch.write_batch.attempts(), 1); + Ok(()) + } + + #[tokio::test] + async fn handle_write_batch_error_fails() -> Result<()> { + let table_path = Arc::new(TablePath::new("db".to_string(), "tbl".to_string())); + let cluster = build_cluster_arc(table_path.as_ref(), 1, 1); + let metadata = Arc::new(Metadata::new_for_test(cluster.clone())); + let idempotence = disabled_idempotence(); + let accumulator = Arc::new(RecordAccumulator::new( + Config::default(), + Arc::clone(&idempotence), + )); + let sender = Sender::new( + metadata, + accumulator.clone(), + 1024 * 1024, + 1000, + 1, + 0, + idempotence, + ); + + let (batch, handle) = build_ready_batch(accumulator.as_ref(), cluster.clone(), table_path)?; + sender.handle_write_batch_error( + batch, + FlussError::InvalidTableException, + "invalid".to_string(), + )?; + + let batch_result = handle.wait().await?; + assert!(matches!( + batch_result, + Err(broadcast::Error::WriteFailed { code, .. 
}) + if code == FlussError::InvalidTableException.code() + )); + Ok(()) + } + + #[tokio::test] + async fn handle_produce_response_duplicate_sequence_completes() -> Result<()> { + let table_path = Arc::new(TablePath::new("db".to_string(), "tbl".to_string())); + let cluster = build_cluster_arc(table_path.as_ref(), 1, 1); + let metadata = Arc::new(Metadata::new_for_test(cluster.clone())); + let idempotence = disabled_idempotence(); + let accumulator = Arc::new(RecordAccumulator::new( + Config::default(), + Arc::clone(&idempotence), + )); + let sender = Sender::new( + metadata, + accumulator.clone(), + 1024 * 1024, + 1000, + 1, + 0, + idempotence, + ); + + let (batch, handle) = build_ready_batch(accumulator.as_ref(), cluster, table_path)?; + let request_buckets = vec![batch.table_bucket.clone()]; + let mut records_by_bucket = HashMap::new(); + records_by_bucket.insert(batch.table_bucket.clone(), batch); + + let response = ProduceLogResponse { + buckets_resp: vec![PbProduceLogRespForBucket { + bucket_id: 0, + error_code: Some(FlussError::DuplicateSequenceException.code()), + error_message: Some("dup".to_string()), + ..Default::default() + }], + }; + + sender + .handle_write_response(1, &request_buckets, &mut records_by_bucket, response) + .await?; + + let batch_result = handle.wait().await?; + assert!(matches!(batch_result, Ok(()))); + Ok(()) + } + + #[tokio::test] + async fn test_unknown_writer_id_resets() -> Result<()> { + let table_path = Arc::new(TablePath::new("db".to_string(), "tbl".to_string())); + let cluster = build_cluster_arc(table_path.as_ref(), 1, 1); + let metadata = Arc::new(Metadata::new_for_test(cluster.clone())); + let idempotence = enabled_idempotence(); + let accumulator = Arc::new(RecordAccumulator::new( + Config::default(), + Arc::clone(&idempotence), + )); + idempotence.set_writer_id(42); + let sender = Sender::new( + metadata, + accumulator.clone(), + 1024 * 1024, + 1000, + -1, + i32::MAX, + Arc::clone(&idempotence), + ); + + // build_ready_batch 
drains the batch, which assigns seq=0 and adds in-flight + let (batch, handle) = build_ready_batch(accumulator.as_ref(), cluster.clone(), table_path)?; + assert_eq!(batch.write_batch.batch_sequence(), 0); + assert_eq!(batch.write_batch.writer_id(), 42); + + sender.handle_write_batch_error( + batch, + FlussError::UnknownWriterIdException, + "unknown writer".to_string(), + )?; + + // Writer ID should be reset + assert!(!idempotence.has_writer_id()); + + // Batch should be failed (not retried) + let batch_result = handle.wait().await?; + assert!(matches!( + batch_result, + Err(broadcast::Error::WriteFailed { code, .. }) + if code == FlussError::UnknownWriterIdException.code() + )); + Ok(()) + } + + #[tokio::test] + async fn test_out_of_order_sequence_non_retriable_resets() -> Result<()> { + let table_path = Arc::new(TablePath::new("db".to_string(), "tbl".to_string())); + let cluster = build_cluster_arc(table_path.as_ref(), 1, 1); + let metadata = Arc::new(Metadata::new_for_test(cluster.clone())); + let idempotence = enabled_idempotence(); + let accumulator = Arc::new(RecordAccumulator::new( + Config::default(), + Arc::clone(&idempotence), + )); + idempotence.set_writer_id(42); + // retries=0 means can_retry returns false immediately (attempts >= retries) + let sender = Sender::new( + metadata, + accumulator.clone(), + 1024 * 1024, + 1000, + -1, + 0, + Arc::clone(&idempotence), + ); + + // build_ready_batch drains the batch, which assigns seq=0 and adds in-flight + let (batch, handle) = build_ready_batch(accumulator.as_ref(), cluster.clone(), table_path)?; + assert_eq!(batch.write_batch.batch_sequence(), 0); + + // OutOfOrderSequence with retries exhausted → non-retriable → resets writer ID + sender.handle_write_batch_error( + batch, + FlussError::OutOfOrderSequenceException, + "out of order".to_string(), + )?; + + // Writer ID should be reset (matching Java behavior) + assert!(!idempotence.has_writer_id()); + + // Batch should be failed + let batch_result = 
handle.wait().await?; + assert!(matches!( + batch_result, + Err(broadcast::Error::WriteFailed { code, .. }) + if code == FlussError::OutOfOrderSequenceException.code() + )); + Ok(()) + } + + #[tokio::test] + async fn test_stale_writer_id_prevents_retry() -> Result<()> { + let table_path = Arc::new(TablePath::new("db".to_string(), "tbl".to_string())); + let cluster = build_cluster_arc(table_path.as_ref(), 1, 1); + let metadata = Arc::new(Metadata::new_for_test(cluster.clone())); + let idempotence = enabled_idempotence(); + let accumulator = Arc::new(RecordAccumulator::new( + Config::default(), + Arc::clone(&idempotence), + )); + idempotence.set_writer_id(42); + let sender = Sender::new( + metadata, + accumulator.clone(), + 1024 * 1024, + 1000, + -1, + i32::MAX, + Arc::clone(&idempotence), + ); + + // build_ready_batch drains the batch, which assigns seq=0 and adds in-flight + let (batch, handle) = build_ready_batch(accumulator.as_ref(), cluster.clone(), table_path)?; + assert_eq!(batch.write_batch.writer_id(), 42); + let mut inflight = HashMap::new(); + inflight.insert(1, vec![batch]); + sender.add_to_inflight_batches(&inflight); + let batch = inflight.remove(&1).unwrap().pop().unwrap(); + + // Simulate writer ID reset (e.g., another bucket got UnknownWriterIdException) + idempotence.reset_writer_id(); + idempotence.set_writer_id(99); // new writer ID allocated + + // NetworkException is normally retriable, but writer ID changed + sender.handle_write_batch_error( + batch, + FlussError::NetworkException, + "connection reset".to_string(), + )?; + + // Batch should be failed (not retried) because writer ID is stale + let batch_result = handle.wait().await?; + assert!(matches!( + batch_result, + Err(broadcast::Error::WriteFailed { code, .. 
}) + if code == FlussError::UnknownWriterIdException.code() + )); + Ok(()) + } + + #[tokio::test] + async fn test_writer_state_assigned_on_drain() -> Result<()> { + let table_path = Arc::new(TablePath::new("db".to_string(), "tbl".to_string())); + let cluster = build_cluster_arc(table_path.as_ref(), 1, 1); + let idempotence = enabled_idempotence(); + let accumulator = Arc::new(RecordAccumulator::new( + Config::default(), + Arc::clone(&idempotence), + )); + idempotence.set_writer_id(99); + + // Append a record to the accumulator + let table_info = Arc::new(build_table_info(table_path.as_ref().clone(), 1, 1)); + let physical_table_path = Arc::new(PhysicalTablePath::of(table_path)); + let row = GenericRow { + values: vec![Datum::Int32(42)], + }; + let record = WriteRecord::for_append(table_info, physical_table_path, 1, &row); + accumulator.append(&record, 0, &cluster, false)?; + + // Drain the batches — accumulator now assigns writer state during drain + let server = cluster.get_tablet_server(1).expect("server"); + let nodes = HashSet::from([server.clone()]); + let batches = accumulator.drain(cluster, &nodes, 1024 * 1024)?; + + // Verify the batch got writer state assigned by the accumulator + let batch_list = batches.values().next().unwrap(); + let batch = &batch_list[0]; + assert_eq!(batch.write_batch.batch_sequence(), 0); + assert_eq!(batch.write_batch.writer_id(), 99); + Ok(()) + } + + #[tokio::test] + async fn test_reenqueued_batch_keeps_sequence_on_redrain() -> Result<()> { + let table_path = Arc::new(TablePath::new("db".to_string(), "tbl".to_string())); + let cluster = build_cluster_arc(table_path.as_ref(), 1, 1); + let idempotence = enabled_idempotence(); + let accumulator = Arc::new(RecordAccumulator::new( + Config::default(), + Arc::clone(&idempotence), + )); + idempotence.set_writer_id(99); + + // build_ready_batch drains the batch, which now assigns writer state + // (seq=0) during drain since idempotence is enabled. 
+ let (batch, _handle) = + build_ready_batch(accumulator.as_ref(), cluster.clone(), table_path)?; + + let writer_id = idempotence.writer_id(); + assert_eq!(batch.write_batch.batch_sequence(), 0); + assert!(batch.write_batch.has_batch_sequence()); + assert_eq!(batch.write_batch.writer_id(), writer_id); + + // Re-enqueue the batch (simulating a retriable error) + accumulator.re_enqueue(batch); + + // Drain again + let server = cluster.get_tablet_server(1).expect("server"); + let nodes = HashSet::from([server.clone()]); + let mut batches = accumulator.drain(cluster, &nodes, 1024 * 1024)?; + let batch_list = batches.values_mut().next().unwrap(); + let ready_batch = &mut batch_list[0]; + + // Re-enqueued batch keeps its original sequence + assert!(ready_batch.write_batch.has_batch_sequence()); + assert_eq!(ready_batch.write_batch.writer_id(), writer_id); + assert_eq!(ready_batch.write_batch.batch_sequence(), 0); + // Only one sequence was allocated (during the first drain) + assert_eq!( + idempotence.next_sequence_and_increment(&ready_batch.table_bucket), + 1 + ); + Ok(()) + } +} diff --git a/fluss-rust/crates/fluss/src/client/write/write_format.rs b/fluss-rust/crates/fluss/src/client/write/write_format.rs new file mode 100644 index 0000000000..147152cae4 --- /dev/null +++ b/fluss-rust/crates/fluss/src/client/write/write_format.rs @@ -0,0 +1,66 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::error::Error::IllegalArgument; +use crate::error::Result; +use crate::metadata::KvFormat; +use std::fmt::Display; + +#[derive(Copy, Clone)] +pub enum WriteFormat { + ArrowLog, + CompactedLog, + CompactedKv, +} + +impl WriteFormat { + pub const fn is_log(&self) -> bool { + matches!(self, Self::ArrowLog | Self::CompactedLog) + } + + pub fn is_kv(&self) -> bool { + !self.is_log() + } + + pub fn to_kv_format(&self) -> Result { + match self { + WriteFormat::CompactedKv => Ok(KvFormat::COMPACTED), + other => Err(IllegalArgument { + message: format!("WriteFormat `{other}` is not a KvFormat"), + }), + } + } + + pub fn from_kv_format(kv_format: &KvFormat) -> Result { + match kv_format { + KvFormat::COMPACTED => Ok(WriteFormat::CompactedKv), + other => Err(IllegalArgument { + message: format!("Unknown KvFormat: `{other}`"), + }), + } + } +} + +impl Display for WriteFormat { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + WriteFormat::ArrowLog => f.write_str("ArrowLog"), + WriteFormat::CompactedLog => f.write_str("CompactedLog"), + WriteFormat::CompactedKv => f.write_str("CompactedKv"), + } + } +} diff --git a/fluss-rust/crates/fluss/src/client/write/writer_client.rs b/fluss-rust/crates/fluss/src/client/write/writer_client.rs new file mode 100644 index 0000000000..ffdf96b1df --- /dev/null +++ b/fluss-rust/crates/fluss/src/client/write/writer_client.rs @@ -0,0 +1,241 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::BucketId; +use crate::bucketing::BucketingFunction; +use crate::client::metadata::Metadata; +use crate::client::write::IdempotenceManager; +use crate::client::write::broadcast; +use crate::client::write::bucket_assigner::{ + BucketAssigner, HashBucketAssigner, RoundRobinBucketAssigner, StickyBucketAssigner, +}; +use crate::client::write::sender::Sender; +use crate::client::{RecordAccumulator, ResultHandle, WriteRecord}; +use crate::config::Config; +use crate::config::NoKeyAssigner; +use crate::error::{Error, Result}; +use crate::metadata::{PhysicalTablePath, TableInfo}; +use bytes::Bytes; +use dashmap::DashMap; +use log::warn; +use parking_lot::Mutex; +use std::sync::Arc; +use std::time::Duration; +use tokio::sync::mpsc; +use tokio::task::JoinHandle; + +#[allow(dead_code)] +pub struct WriterClient { + config: Config, + max_request_size: i32, + accumulate: Arc, + shutdown_tx: Mutex>>, + sender_join_handle: Mutex>>, + metadata: Arc, + bucket_assigners: DashMap, Arc>, + idempotence_manager: Arc, +} + +impl WriterClient { + pub fn new(config: Config, metadata: Arc) -> Result { + let ack = Self::get_ack(&config)?; + + let idempotence_manager = Arc::new(IdempotenceManager::new( + config.writer_enable_idempotence, + config.writer_max_inflight_requests_per_bucket, + 
)); + + let (shutdown_tx, shutdown_rx) = mpsc::channel(1); + + let accumulator = Arc::new(RecordAccumulator::new( + config.clone(), + Arc::clone(&idempotence_manager), + )); + + let sender = Arc::new(Sender::new( + metadata.clone(), + accumulator.clone(), + config.writer_request_max_size, + 30_000, + ack, + config.writer_retries, + Arc::clone(&idempotence_manager), + )); + + let join_handle = tokio::spawn(async move { + if let Err(e) = sender.run_with_shutdown(shutdown_rx).await { + warn!("Sender loop exited with error: {e}"); + } + }); + + Ok(Self { + max_request_size: config.writer_request_max_size, + config, + shutdown_tx: Mutex::new(Some(shutdown_tx)), + sender_join_handle: Mutex::new(Some(join_handle)), + accumulate: accumulator, + metadata, + bucket_assigners: Default::default(), + idempotence_manager, + }) + } + + fn get_ack(config: &Config) -> Result { + let acks = config.writer_acks.as_str(); + if acks.eq_ignore_ascii_case("all") { + Ok(-1) + } else { + acks.parse::().map_err(|e| Error::IllegalArgument { + message: format!("invalid writer ack '{acks}': {e}"), + }) + } + } + + pub fn send(&self, record: &WriteRecord<'_>) -> Result { + if self.accumulate.is_closed() { + return Err(Error::WriterClosed { + message: "Cannot send: writer is closed".to_string(), + }); + } + let physical_table_path = &record.physical_table_path; + let cluster = self.metadata.get_cluster(); + let bucket_key = record.bucket_key.as_ref(); + + let (bucket_assigner, bucket_id) = + self.assign_bucket(&record.table_info, bucket_key, physical_table_path)?; + + let mut result = self.accumulate.append( + record, + bucket_id, + &cluster, + bucket_assigner.abort_if_batch_full(), + )?; + + if result.abort_record_for_new_batch { + let prev_bucket_id = bucket_id; + bucket_assigner.on_new_batch(&cluster, prev_bucket_id); + let bucket_id = bucket_assigner.assign_bucket(bucket_key, &cluster)?; + result = self.accumulate.append(record, bucket_id, &cluster, false)?; + } + + if result.batch_is_full || 
result.new_batch_created { + self.accumulate.wakeup_sender(); + } + + Ok(result.result_handle.expect("result_handle should exist")) + } + fn assign_bucket( + &self, + table_info: &Arc, + bucket_key: Option<&Bytes>, + table_path: &Arc, + ) -> Result<(Arc, BucketId)> { + let cluster = self.metadata.get_cluster(); + let bucket_assigner = { + if let Some(assigner) = self.bucket_assigners.get(table_path) { + assigner.clone() + } else { + let assigner = Self::create_bucket_assigner( + table_info, + Arc::clone(table_path), + bucket_key, + &self.config, + )?; + self.bucket_assigners + .insert(Arc::clone(table_path), Arc::clone(&assigner)); + assigner + } + }; + let bucket_id = bucket_assigner.assign_bucket(bucket_key, &cluster)?; + Ok((bucket_assigner, bucket_id)) + } + + /// Close the writer with a timeout. Matches Java's two-phase shutdown: + /// + /// 1. **Graceful**: Signal the sender to drain all remaining batches. + /// `accumulator.close()` makes all batches immediately ready (no need + /// to wait for `batch_timeout_ms`). + /// 2. **Force** (if timeout exceeded): Abort the sender task and fail + /// all remaining batches with an error. + /// + /// Idempotent: calling `close` a second time returns `Ok(())` immediately. + pub async fn close(&self, timeout: Duration) -> Result<()> { + // Take shutdown_tx and join_handle out of their Mutexes. + // Second call sees None and returns early. + let shutdown_tx = self.shutdown_tx.lock().take(); + let join_handle = self.sender_join_handle.lock().take(); + + let Some(mut join_handle) = join_handle else { + return Ok(()); + }; + + // Phase 1: Signal graceful shutdown. + // Mark accumulator closed so all batches become immediately sendable. + self.accumulate.close(); + // Drop the shutdown sender — recv() returns None, breaking the sender loop. + drop(shutdown_tx); + + // Phase 2: Wait for graceful drain, bounded by timeout. + tokio::select! 
{ + result = &mut join_handle => { + if let Err(e) = result { + warn!("Sender task panicked during shutdown: {e}"); + } + } + _ = tokio::time::sleep(timeout) => { + // Phase 3: Force close — timeout exceeded. + warn!("Graceful shutdown timed out after {timeout:?}, force closing"); + join_handle.abort(); + let _ = join_handle.await; // Wait for cancellation to complete + self.accumulate.abort_batches(broadcast::Error::Client { + message: "Writer force closed (shutdown timeout exceeded)".to_string(), + }); + } + } + Ok(()) + } + + pub async fn flush(&self) -> Result<()> { + self.accumulate.begin_flush(); + self.accumulate.await_flush_completion().await?; + Ok(()) + } + + pub fn create_bucket_assigner( + table_info: &Arc, + table_path: Arc, + bucket_key: Option<&Bytes>, + config: &Config, + ) -> Result> { + if bucket_key.is_some() { + let datalake_format = table_info.get_table_config().get_datalake_format()?; + let function = ::of(datalake_format.as_ref()); + Ok(Arc::new(HashBucketAssigner::new( + table_info.num_buckets, + function, + ))) + } else { + match config.writer_bucket_no_key_assigner { + NoKeyAssigner::Sticky => Ok(Arc::new(StickyBucketAssigner::new(table_path))), + NoKeyAssigner::RoundRobin => Ok(Arc::new(RoundRobinBucketAssigner::new( + table_path, + table_info.num_buckets, + ))), + } + } + } +} diff --git a/fluss-rust/crates/fluss/src/cluster/cluster.rs b/fluss-rust/crates/fluss/src/cluster/cluster.rs new file mode 100644 index 0000000000..d5518709ec --- /dev/null +++ b/fluss-rust/crates/fluss/src/cluster/cluster.rs @@ -0,0 +1,541 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::cluster::{BucketLocation, ServerNode, ServerType}; +use crate::error::{Error, Result}; +use crate::metadata::{ + JsonSerde, PhysicalTablePath, TableBucket, TableDescriptor, TableInfo, TablePath, +}; +use crate::proto::{MetadataResponse, PbBucketMetadata}; +use crate::rpc::{from_pb_server_node, from_pb_table_path}; +use crate::{BucketId, PartitionId, TableId}; +use rand::random_range; +use std::collections::{HashMap, HashSet}; +use std::sync::Arc; + +static EMPTY: Vec = Vec::new(); + +#[derive(Default)] +pub struct Cluster { + coordinator_server: Option, + alive_tablet_servers_by_id: HashMap, + alive_tablet_servers: Vec, + available_locations_by_path: HashMap, Vec>, + available_locations_by_bucket: HashMap, + table_id_by_path: HashMap, + table_path_by_id: HashMap, + table_info_by_path: HashMap, + partitions_id_by_path: HashMap, PartitionId>, + partition_name_by_id: HashMap, +} + +impl Cluster { + #[allow(clippy::too_many_arguments)] + pub fn new( + coordinator_server: Option, + alive_tablet_servers_by_id: HashMap, + available_locations_by_path: HashMap, Vec>, + available_locations_by_bucket: HashMap, + table_id_by_path: HashMap, + table_info_by_path: HashMap, + partitions_id_by_path: HashMap, PartitionId>, + ) -> Self { + let alive_tablet_servers = alive_tablet_servers_by_id.values().cloned().collect(); + let table_path_by_id = table_id_by_path + .iter() + .map(|(path, table_id)| (*table_id, path.clone())) + .collect(); + let partition_name_by_id = partitions_id_by_path + .iter() + .filter_map(|(path, id)| 
path.get_partition_name().map(|name| (*id, name.clone()))) + .collect(); + Cluster { + coordinator_server, + alive_tablet_servers_by_id, + alive_tablet_servers, + available_locations_by_path, + available_locations_by_bucket, + table_id_by_path, + table_path_by_id, + table_info_by_path, + partitions_id_by_path, + partition_name_by_id, + } + } + + pub fn invalidate_server(&self, server_id: &i32, table_ids: Vec) -> Self { + let alive_tablet_servers_by_id = self + .alive_tablet_servers_by_id + .iter() + .filter(|&(id, _)| id != server_id) + .map(|(id, ts)| (*id, ts.clone())) + .collect(); + + let table_paths: HashSet<&TablePath> = table_ids + .iter() + .filter_map(|id| self.table_path_by_id.get(id)) + .collect(); + + let (available_locations_by_path, available_locations_by_bucket) = + self.filter_bucket_locations_by_path(&table_paths); + + Cluster::new( + self.coordinator_server.clone(), + alive_tablet_servers_by_id, + available_locations_by_path, + available_locations_by_bucket, + self.table_id_by_path.clone(), + self.table_info_by_path.clone(), + self.partitions_id_by_path.clone(), + ) + } + + pub fn invalidate_physical_table_meta( + &self, + physical_tables_to_invalid: &HashSet, + ) -> Self { + let table_paths: HashSet<&TablePath> = physical_tables_to_invalid + .iter() + .map(|path| path.get_table_path()) + .collect(); + let (available_locations_by_path, available_locations_by_bucket) = + self.filter_bucket_locations_by_path(&table_paths); + + Cluster::new( + self.coordinator_server.clone(), + self.alive_tablet_servers_by_id.clone(), + available_locations_by_path, + available_locations_by_bucket, + self.table_id_by_path.clone(), + self.table_info_by_path.clone(), + self.partitions_id_by_path.clone(), + ) + } + + pub fn update(&mut self, cluster: Cluster) { + let Cluster { + coordinator_server, + alive_tablet_servers_by_id, + alive_tablet_servers, + available_locations_by_path, + available_locations_by_bucket, + table_id_by_path, + table_path_by_id, + 
table_info_by_path, + partitions_id_by_path, + partition_name_by_id, + } = cluster; + self.coordinator_server = coordinator_server; + self.alive_tablet_servers_by_id = alive_tablet_servers_by_id; + self.alive_tablet_servers = alive_tablet_servers; + self.available_locations_by_path = available_locations_by_path; + self.available_locations_by_bucket = available_locations_by_bucket; + self.table_id_by_path = table_id_by_path; + self.table_path_by_id = table_path_by_id; + self.table_info_by_path = table_info_by_path; + self.partitions_id_by_path = partitions_id_by_path; + self.partition_name_by_id = partition_name_by_id; + } + + fn filter_bucket_locations_by_path( + &self, + table_paths: &HashSet<&TablePath>, + ) -> ( + HashMap, Vec>, + HashMap, + ) { + let available_locations_by_path = self + .available_locations_by_path + .iter() + .filter(|&(path, _)| !table_paths.contains(path.get_table_path())) + .map(|(path, locations)| (path.clone(), locations.clone())) + .collect(); + + let available_locations_by_bucket = self + .available_locations_by_bucket + .iter() + .filter(|&(_bucket, location)| { + !table_paths.contains(&location.physical_table_path.get_table_path()) + }) + .map(|(bucket, location)| (bucket.clone(), location.clone())) + .collect(); + + (available_locations_by_path, available_locations_by_bucket) + } + + pub fn from_metadata_response( + metadata_response: MetadataResponse, + origin_cluster: Option<&Cluster>, + ) -> Result { + let mut servers = HashMap::with_capacity(metadata_response.tablet_servers.len()); + for pb_server in metadata_response.tablet_servers { + let server_id = pb_server.node_id; + let server_node = from_pb_server_node(pb_server, ServerType::TabletServer); + servers.insert(server_id, server_node); + } + + let coordinator_server = metadata_response + .coordinator_server + .map(|node| from_pb_server_node(node, ServerType::CoordinatorServer)); + + let mut table_id_by_path = HashMap::new(); + let mut table_info_by_path = HashMap::new(); + let 
mut partitions_id_by_path = HashMap::new(); + let mut tmp_available_locations_by_path = HashMap::new(); + let mut tmp_available_location_by_bucket = HashMap::new(); + + if let Some(origin) = origin_cluster { + table_info_by_path.extend(origin.get_table_info_by_path().clone()); + table_id_by_path.extend(origin.get_table_id_by_path().clone()); + partitions_id_by_path.extend(origin.partitions_id_by_path.clone()); + tmp_available_locations_by_path.extend(origin.available_locations_by_path.clone()); + tmp_available_location_by_bucket.extend(origin.available_locations_by_bucket.clone()); + } + + // iterate all table metadata + for table_metadata in metadata_response.table_metadata { + let table_id = table_metadata.table_id; + let table_path = from_pb_table_path(&table_metadata.table_path); + let table_descriptor = TableDescriptor::deserialize_json( + &serde_json::from_slice(table_metadata.table_json.as_slice()).map_err(|e| { + Error::JsonSerdeError { + message: format!( + "Error deserializing table_json into TableDescriptor for table_id {table_id} and table_path {table_path}: {e}" + ) + } + })?, + )?; + let table_info = TableInfo::of( + table_path.clone(), + table_id, + table_metadata.schema_id, + table_descriptor, + table_metadata.created_time, + table_metadata.modified_time, + ); + table_info_by_path.insert(table_path.clone(), table_info); + table_id_by_path.insert(table_path.clone(), table_id); + + let bucket_metadata = table_metadata.bucket_metadata; + let physical_table_path = Arc::new(PhysicalTablePath::of(Arc::new(table_path.clone()))); + + let bucket_locations = get_bucket_locations( + &mut servers, + bucket_metadata.as_slice(), + table_id, + None, + &physical_table_path, + ); + tmp_available_locations_by_path.insert(physical_table_path, bucket_locations); + } + + // iterate all partition metadata + for partition_metadata in metadata_response.partition_metadata { + let table_id = partition_metadata.table_id; + + if let Some(cluster) = origin_cluster { + let 
partition_name = partition_metadata.partition_name; + let table_path = cluster.get_table_path_by_id(table_id).unwrap(); + let partition_id = partition_metadata.partition_id; + + let physical_table_path = Arc::new(PhysicalTablePath::of_partitioned( + Arc::new(table_path.clone()), + Some(partition_name), + )); + + partitions_id_by_path.insert(Arc::clone(&physical_table_path), partition_id); + + let bucket_locations = get_bucket_locations( + &mut servers, + partition_metadata.bucket_metadata.as_slice(), + table_id, + Some(partition_id), + &physical_table_path, + ); + + tmp_available_locations_by_path.insert(physical_table_path, bucket_locations); + } + } + + for bucket_locations in &mut tmp_available_locations_by_path.values() { + for location in bucket_locations { + if location.leader().is_some() { + tmp_available_location_by_bucket + .insert(location.table_bucket.clone(), location.clone()); + } + } + } + + Ok(Cluster::new( + coordinator_server, + servers, + tmp_available_locations_by_path, + tmp_available_location_by_bucket, + table_id_by_path, + table_info_by_path, + partitions_id_by_path, + )) + } + + pub fn get_coordinator_server(&self) -> Option<&ServerNode> { + self.coordinator_server.as_ref() + } + + pub fn leader_for(&self, table_bucket: &TableBucket) -> Option<&ServerNode> { + let location = self.available_locations_by_bucket.get(table_bucket); + if let Some(location) = location { + location.leader().as_ref() + } else { + None + } + } + + pub fn get_tablet_server(&self, id: i32) -> Option<&ServerNode> { + self.alive_tablet_servers_by_id.get(&id) + } + + pub fn get_table_bucket( + &self, + physical_table_path: &PhysicalTablePath, + bucket_id: BucketId, + ) -> Result { + let table_info = self.get_table(physical_table_path.get_table_path())?; + let partition_id = self.get_partition_id(physical_table_path); + + if physical_table_path.get_partition_name().is_some() && partition_id.is_none() { + return Err(Error::partition_not_exist(format!( + "The partition {} is 
not found in cluster", + physical_table_path.get_partition_name().unwrap() + ))); + } + + Ok(TableBucket::new_with_partition( + table_info.table_id, + partition_id, + bucket_id, + )) + } + + pub fn get_partition_id(&self, physical_table_path: &PhysicalTablePath) -> Option { + self.partitions_id_by_path.get(physical_table_path).copied() + } + + pub fn get_partition_name(&self, partition_id: PartitionId) -> Option<&String> { + self.partition_name_by_id.get(&partition_id) + } + + pub fn get_table_id(&self, table_path: &TablePath) -> Option { + self.table_id_by_path.get(table_path).copied() + } + + pub fn get_bucket_locations_by_path( + &self, + ) -> &HashMap, Vec> { + &self.available_locations_by_path + } + + pub fn get_table_info_by_path(&self) -> &HashMap { + &self.table_info_by_path + } + + pub fn get_table_id_by_path(&self) -> &HashMap { + &self.table_id_by_path + } + + pub fn get_table_path_by_id(&self, table_id: TableId) -> Option<&TablePath> { + self.table_path_by_id.get(&table_id) + } + + pub fn get_available_buckets_for_table_path( + &self, + table_path: &PhysicalTablePath, + ) -> &Vec { + self.available_locations_by_path + .get(table_path) + .unwrap_or(&EMPTY) + } + + pub fn get_server_nodes(&self) -> Vec { + let mut nodes = Vec::new(); + if let Some(coordinator) = &self.coordinator_server { + nodes.push(coordinator.clone()); + } + nodes.extend(self.alive_tablet_servers.iter().cloned()); + nodes + } + + pub fn get_one_available_server(&self) -> Option<&ServerNode> { + if self.alive_tablet_servers.is_empty() { + return None; + } + let offset = random_range(0..self.alive_tablet_servers.len()); + self.alive_tablet_servers.get(offset) + } + + pub fn get_bucket_count(&self, table_path: &TablePath) -> i32 { + self.table_info_by_path + .get(table_path) + .unwrap_or_else(|| panic!("can't not table info by path {table_path}")) + .num_buckets + } + + pub fn get_table(&self, table_path: &TablePath) -> Result<&TableInfo> { + self.table_info_by_path + .get(table_path) + 
.ok_or_else(|| Error::invalid_table(format!("Table info not found for {table_path}"))) + } + + pub fn opt_get_table(&self, table_path: &TablePath) -> Option<&TableInfo> { + self.table_info_by_path.get(table_path) + } + + pub fn get_partition_id_by_path(&self) -> &HashMap, PartitionId> { + &self.partitions_id_by_path + } +} + +fn get_bucket_locations( + servers: &mut HashMap, + bucket_metadata: &[PbBucketMetadata], + table_id: i64, + partition_id: Option, + physical_table_path: &Arc, +) -> Vec { + let mut bucket_locations = Vec::new(); + for metadata in bucket_metadata { + let bucket_id = metadata.bucket_id; + let bucket = TableBucket::new_with_partition(table_id, partition_id, bucket_id); + + let server = if let Some(leader_id) = metadata.leader_id + && let Some(server_node) = servers.get(&leader_id) + { + Some(server_node.clone()) + } else { + None + }; + + bucket_locations.push(BucketLocation::new( + bucket.clone(), + server, + Arc::clone(physical_table_path), + )); + } + bucket_locations +} + +#[cfg(test)] +mod tests { + use super::*; + + fn make_coordinator() -> ServerNode { + ServerNode::new( + 0, + "coord-host".to_string(), + 9123, + ServerType::CoordinatorServer, + ) + } + + fn make_tablet_servers() -> HashMap { + let mut servers = HashMap::new(); + servers.insert( + 1, + ServerNode::new(1, "ts1-host".to_string(), 9124, ServerType::TabletServer), + ); + servers.insert( + 2, + ServerNode::new(2, "ts2-host".to_string(), 9125, ServerType::TabletServer), + ); + servers + } + + #[test] + fn test_server_node_getters() { + let node = ServerNode::new(5, "myhost".to_string(), 8080, ServerType::TabletServer); + assert_eq!(node.id(), 5); + assert_eq!(node.host(), "myhost"); + assert_eq!(node.port(), 8080); + assert_eq!(node.server_type(), &ServerType::TabletServer); + assert_eq!(node.uid(), "ts-5"); + assert_eq!(node.url(), "myhost:8080"); + } + + #[test] + fn test_server_type_display() { + assert_eq!(ServerType::TabletServer.to_string(), "TabletServer"); + assert_eq!( 
+ ServerType::CoordinatorServer.to_string(), + "CoordinatorServer" + ); + } + + #[test] + fn test_get_server_nodes_with_coordinator_and_tablets() { + let cluster = Cluster::new( + Some(make_coordinator()), + make_tablet_servers(), + HashMap::new(), + HashMap::new(), + HashMap::new(), + HashMap::new(), + HashMap::new(), + ); + + let nodes = cluster.get_server_nodes(); + assert_eq!(nodes.len(), 3); + + let coordinator_count = nodes + .iter() + .filter(|n| *n.server_type() == ServerType::CoordinatorServer) + .count(); + assert_eq!(coordinator_count, 1); + + let tablet_count = nodes + .iter() + .filter(|n| *n.server_type() == ServerType::TabletServer) + .count(); + assert_eq!(tablet_count, 2); + } + + #[test] + fn test_get_server_nodes_no_coordinator() { + let cluster = Cluster::new( + None, + make_tablet_servers(), + HashMap::new(), + HashMap::new(), + HashMap::new(), + HashMap::new(), + HashMap::new(), + ); + + let nodes = cluster.get_server_nodes(); + assert_eq!(nodes.len(), 2); + assert!( + nodes + .iter() + .all(|n| *n.server_type() == ServerType::TabletServer) + ); + } + + #[test] + fn test_get_server_nodes_empty_cluster() { + let cluster = Cluster::default(); + let nodes = cluster.get_server_nodes(); + assert!(nodes.is_empty()); + } +} diff --git a/fluss-rust/crates/fluss/src/cluster/mod.rs b/fluss-rust/crates/fluss/src/cluster/mod.rs new file mode 100644 index 0000000000..863f8ed509 --- /dev/null +++ b/fluss-rust/crates/fluss/src/cluster/mod.rs @@ -0,0 +1,143 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::BucketId; +use crate::metadata::{PhysicalTablePath, TableBucket}; +use std::fmt; +use std::sync::Arc; + +#[allow(clippy::module_inception)] +mod cluster; + +pub use cluster::Cluster; + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct ServerNode { + id: i32, + uid: String, + host: String, + port: u32, + server_type: ServerType, +} + +impl ServerNode { + pub fn new(id: i32, host: String, port: u32, server_type: ServerType) -> ServerNode { + ServerNode { + id, + uid: match server_type { + ServerType::CoordinatorServer => format!("cs-{id}"), + ServerType::TabletServer => format!("ts-{id}"), + }, + host, + port, + server_type, + } + } + + pub fn uid(&self) -> &str { + &self.uid + } + + pub fn url(&self) -> String { + format!("{}:{}", self.host, self.port) + } + + pub fn id(&self) -> i32 { + self.id + } + + pub fn host(&self) -> &str { + &self.host + } + + pub fn port(&self) -> u32 { + self.port + } + + pub fn server_type(&self) -> &ServerType { + &self.server_type + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub enum ServerType { + TabletServer, + CoordinatorServer, +} + +impl ServerType { + pub fn to_type_id(&self) -> i32 { + match self { + ServerType::CoordinatorServer => 1, + ServerType::TabletServer => 2, + } + } + + pub fn from_type_id(type_id: i32) -> Option { + match type_id { + 1 => Some(ServerType::CoordinatorServer), + 2 => Some(ServerType::TabletServer), + _ => None, + } + } +} + +impl fmt::Display for ServerType { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + 
ServerType::TabletServer => write!(f, "TabletServer"), + ServerType::CoordinatorServer => write!(f, "CoordinatorServer"), + } + } +} + +#[derive(Debug, Clone)] +pub struct BucketLocation { + pub table_bucket: TableBucket, + leader: Option, + physical_table_path: Arc, +} + +impl BucketLocation { + pub fn new( + table_bucket: TableBucket, + leader: Option, + physical_table_path: Arc, + ) -> BucketLocation { + BucketLocation { + table_bucket, + leader, + physical_table_path, + } + } + + pub fn leader(&self) -> &Option { + &self.leader + } + + pub fn table_bucket(&self) -> &TableBucket { + &self.table_bucket + } + + pub fn bucket_id(&self) -> BucketId { + self.table_bucket.bucket_id() + } + + pub fn physical_table_path(&self) -> &Arc { + &self.physical_table_path + } +} diff --git a/fluss-rust/crates/fluss/src/compression/arrow_compression.rs b/fluss-rust/crates/fluss/src/compression/arrow_compression.rs new file mode 100644 index 0000000000..8121a512b1 --- /dev/null +++ b/fluss-rust/crates/fluss/src/compression/arrow_compression.rs @@ -0,0 +1,264 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+
+use crate::error::{Error, Result};
+use arrow::ipc::CompressionType;
+use arrow_schema::ArrowError;
+use std::collections::HashMap;
+
+/// Property key under which the ZSTD compression level is looked up.
+pub const TABLE_LOG_ARROW_COMPRESSION_ZSTD_LEVEL: &str = "table.log.arrow.compression.zstd.level";
+/// Property key under which the compression codec name is looked up.
+pub const TABLE_LOG_ARROW_COMPRESSION_TYPE: &str = "table.log.arrow.compression.type";
+/// Level stored in [`ArrowCompressionInfo`] whenever the codec is not ZSTD.
+pub const DEFAULT_NON_ZSTD_COMPRESSION_LEVEL: i32 = -1;
+/// ZSTD level used when no level property is present.
+pub const DEFAULT_ZSTD_COMPRESSION_LEVEL: i32 = 3;
+
+/// Compression codec selected through [`TABLE_LOG_ARROW_COMPRESSION_TYPE`].
+#[derive(Clone, Debug, PartialEq)]
+pub enum ArrowCompressionType {
+    None,
+    Lz4Frame,
+    Zstd,
+}
+
+impl ArrowCompressionType {
+    /// Reads the codec from `properties`.
+    ///
+    /// Accepts exactly `"NONE"`, `"LZ4_FRAME"` and `"ZSTD"` (case-sensitive);
+    /// a missing key selects [`ArrowCompressionType::Zstd`].
+    ///
+    /// # Errors
+    /// Returns [`Error::IllegalArgument`] for any other value.
+    fn from_conf(properties: &HashMap<String, String>) -> Result<Self> {
+        match properties
+            .get(TABLE_LOG_ARROW_COMPRESSION_TYPE)
+            .map(|s| s.as_str())
+        {
+            Some("NONE") => Ok(Self::None),
+            Some("LZ4_FRAME") => Ok(Self::Lz4Frame),
+            Some("ZSTD") => Ok(Self::Zstd),
+            Some(other) => Err(Error::IllegalArgument {
+                message: format!("Unsupported compression type: {other}"),
+            }),
+            None => Ok(Self::Zstd),
+        }
+    }
+}
+
+/// Codec plus compression level resolved from a property map.
+#[derive(Clone, Debug)]
+pub struct ArrowCompressionInfo {
+    pub compression_type: ArrowCompressionType,
+    pub compression_level: i32,
+}
+
+impl ArrowCompressionInfo {
+    /// Resolves codec and level from `properties`.
+    ///
+    /// Non-ZSTD codecs always get [`DEFAULT_NON_ZSTD_COMPRESSION_LEVEL`]; the
+    /// level property is not read for them. For ZSTD the level defaults to
+    /// [`DEFAULT_ZSTD_COMPRESSION_LEVEL`] when the property is absent.
+    ///
+    /// # Errors
+    /// * [`Error::IllegalArgument`] for an unknown codec, a level that does not
+    ///   parse as `i32`, or a level outside `1..=22`.
+    /// * [`Error::ArrowError`] for an in-range level other than the default,
+    ///   which is not implemented yet (see the TODO below).
+    pub fn from_conf(properties: &HashMap<String, String>) -> Result<Self> {
+        let compression_type = ArrowCompressionType::from_conf(properties)?;
+
+        if compression_type != ArrowCompressionType::Zstd {
+            return Ok(Self {
+                compression_type,
+                compression_level: DEFAULT_NON_ZSTD_COMPRESSION_LEVEL,
+            });
+        }
+
+        match properties
+            .get(TABLE_LOG_ARROW_COMPRESSION_ZSTD_LEVEL)
+            .map(|s| s.as_str().parse::<i32>())
+        {
+            // The range guard must come before the generic `Some(Ok(_))` arm.
+            Some(Ok(level)) if !(1..=22).contains(&level) => Err(Error::IllegalArgument {
+                message: format!(
+                    "Invalid ZSTD compression level: {level}. Expected a value between 1 and 22."
+                ),
+            }),
+            Some(Err(e)) => Err(Error::IllegalArgument {
+                message: format!(
+                    "Invalid ZSTD compression level. Expected a value between 1 and 22. {e}"
+                ),
+            }),
+            Some(Ok(level)) => {
+                // TODO Remove once non-default ZSTD compression level is implemented https://github.com/apache/fluss-rust/issues/109
+                if level != DEFAULT_ZSTD_COMPRESSION_LEVEL {
+                    return Err(Error::ArrowError {
+                        message: format!(
+                            "Rust client currently only implements default ZSTD compression level {DEFAULT_ZSTD_COMPRESSION_LEVEL}. Got: {level}."
+                        ),
+                        source: ArrowError::NotYetImplemented(format!(
+                            "zstd compression level {level}."
+                        )),
+                    });
+                }
+                Ok(Self {
+                    compression_type,
+                    compression_level: level,
+                })
+            }
+            None => Ok(Self {
+                compression_type,
+                compression_level: DEFAULT_ZSTD_COMPRESSION_LEVEL,
+            }),
+        }
+    }
+
+    /// Test-only constructor that bypasses all validation.
+    #[cfg(test)]
+    fn new(compression_type: ArrowCompressionType, compression_level: i32) -> ArrowCompressionInfo {
+        Self {
+            compression_type,
+            compression_level,
+        }
+    }
+
+    /// Maps the codec to Arrow IPC's [`CompressionType`]; `None` means uncompressed.
+    pub fn get_compression_type(&self) -> Option<CompressionType> {
+        match self.compression_type {
+            ArrowCompressionType::Zstd => Some(CompressionType::ZSTD),
+            ArrowCompressionType::Lz4Frame => Some(CompressionType::LZ4_FRAME),
+            ArrowCompressionType::None => None,
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::collections::HashMap;
+
+    #[test]
+    fn test_from_conf() {
+        assert_eq!(
+            ArrowCompressionType::from_conf(&HashMap::new()).unwrap(),
+            ArrowCompressionType::Zstd
+        );
+
+        assert_eq!(
+            ArrowCompressionType::from_conf(&mk_map(&[(
+                "table.log.arrow.compression.type",
+                "NONE"
+            )]))
+            .unwrap(),
+            ArrowCompressionType::None
+        );
+
+        assert_eq!(
+            ArrowCompressionType::from_conf(&mk_map(&[(
+                "table.log.arrow.compression.type",
+                "LZ4_FRAME"
+            )]))
+            .unwrap(),
+            ArrowCompressionType::Lz4Frame
+        );
+
+        assert_eq!(
+            ArrowCompressionType::from_conf(&mk_map(&[(
+                "table.log.arrow.compression.type",
+                "ZSTD"
+            )]))
+            .unwrap(),
+            ArrowCompressionType::Zstd
+        );
+    }
+
+    #[test]
+    fn test_from_conf_invalid_compression_type() {
+        let props = mk_map(&[("table.log.arrow.compression.type", "FOO")]);
+
+        assert!(
+            ArrowCompressionInfo::from_conf(&props)
+                .unwrap_err()
+                .to_string()
+                .contains(
+                    "Fluss hitting illegal argument error Unsupported compression type: FOO."
+                )
+        );
+    }
+
+    #[test]
+    fn test_from_conf_zstd_compression_level() {
+        let compression_info = ArrowCompressionInfo::from_conf(&mk_map(&[(
+            "table.log.arrow.compression.type",
+            "ZSTD",
+        )]));
+        assert_eq!(compression_info.unwrap().compression_level, 3);
+    }
+
+    // TODO Remove once non-default ZSTD compression level is implemented https://github.com/apache/fluss-rust/issues/109
+    #[test]
+    fn test_from_conf_zstd_compression_level_error_when_non_default() {
+        let result = ArrowCompressionInfo::from_conf(&mk_map(&[
+            ("table.log.arrow.compression.type", "ZSTD"),
+            ("table.log.arrow.compression.zstd.level", "1"),
+        ]));
+        assert!(result.is_err());
+        assert!(result.unwrap_err().to_string().contains(
+            "Rust client currently only implements default ZSTD compression level 3. Got: 1."
+        ));
+    }
+
+    #[test]
+    fn test_from_conf_compression_level_out_of_range() {
+        let props = mk_map(&[
+            ("table.log.arrow.compression.type", "ZSTD"),
+            ("table.log.arrow.compression.zstd.level", "0"),
+        ]);
+
+        assert!(
+            ArrowCompressionInfo::from_conf(&props)
+                .unwrap_err()
+                .to_string()
+                .contains("Expected a value between 1 and 22.")
+        );
+
+        let props = mk_map(&[
+            ("table.log.arrow.compression.type", "ZSTD"),
+            ("table.log.arrow.compression.zstd.level", "23"),
+        ]);
+
+        assert!(
+            ArrowCompressionInfo::from_conf(&props)
+                .unwrap_err()
+                .to_string()
+                .contains("Expected a value between 1 and 22.")
+        );
+    }
+
+    #[test]
+    fn test_from_conf_compression_level_parse_error() {
+        let props = mk_map(&[
+            ("table.log.arrow.compression.type", "ZSTD"),
+            ("table.log.arrow.compression.zstd.level", "not-a-number"),
+        ]);
+
+        assert!(
+            ArrowCompressionInfo::from_conf(&props)
+                .unwrap_err()
+                .to_string()
+                .contains("Expected a value between 1 and 22.")
+        );
+    }
+
+    #[test]
+    fn get_compression_type_maps_correctly() {
+        assert_eq!(
+            ArrowCompressionInfo::new(ArrowCompressionType::None, -1).get_compression_type(),
+            None
+        );
+        assert_eq!(
+            ArrowCompressionInfo::new(ArrowCompressionType::Lz4Frame, -1).get_compression_type(),
+            Some(CompressionType::LZ4_FRAME)
+        );
+        assert_eq!(
+            ArrowCompressionInfo::new(ArrowCompressionType::Zstd, -1).get_compression_type(),
+            Some(CompressionType::ZSTD)
+        );
+    }
+
+    /// Builds an owned property map from borrowed key/value pairs.
+    fn mk_map(pairs: &[(&str, &str)]) -> HashMap<String, String> {
+        pairs
+            .iter()
+            .map(|(k, v)| (k.to_string(), v.to_string()))
+            .collect()
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/compression/arrow_compression_ratio_estimator.rs b/fluss-rust/crates/fluss/src/compression/arrow_compression_ratio_estimator.rs
new file mode 100644
index 0000000000..08b8048aa4
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/compression/arrow_compression_ratio_estimator.rs
@@ -0,0 +1,105 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::sync::atomic::{AtomicU32, Ordering};
+
+/// Adaptive estimator for Arrow compression ratios.
+///
+/// Tracks the ratio between compressed and uncompressed Arrow body sizes.
+/// The estimate adjusts asymmetrically: it increases quickly when compression
+/// worsens (to avoid underestimating batch sizes) and decreases slowly when
+/// compression improves (conservative).
+///
+/// Thread-safe: uses atomic f32 (stored as u32 bits) matching Java's `volatile float`.
+///
+/// Matching Java's `ArrowCompressionRatioEstimator`.
+pub struct ArrowCompressionRatioEstimator {
+    /// Stored as `f32::to_bits()` for atomic access.
+    ratio_bits: AtomicU32,
+}
+
+const COMPRESSION_RATIO_IMPROVING_STEP: f32 = 0.005;
+const COMPRESSION_RATIO_DETERIORATE_STEP: f32 = 0.05;
+const DEFAULT_COMPRESSION_RATIO: f32 = 1.0;
+
+impl ArrowCompressionRatioEstimator {
+    /// Creates an estimator whose initial estimate is `1.0`.
+    pub fn new() -> Self {
+        Self {
+            ratio_bits: AtomicU32::new(DEFAULT_COMPRESSION_RATIO.to_bits()),
+        }
+    }
+
+    /// Returns the current estimate (relaxed atomic load).
+    pub fn estimation(&self) -> f32 {
+        f32::from_bits(self.ratio_bits.load(Ordering::Relaxed))
+    }
+
+    /// Folds one observed ratio into the estimate.
+    ///
+    /// * Observation above the estimate: new value is
+    ///   `max(current + 0.05, observed_ratio)`.
+    /// * Observation below the estimate: new value is
+    ///   `max(current - 0.005, observed_ratio)`.
+    /// * Otherwise (equal, or unordered such as NaN): nothing is stored.
+    ///
+    /// The load and the store are two separate relaxed operations, not one
+    /// atomic read-modify-write, so of two racing updates one may be lost.
+    pub fn update_estimation(&self, observed_ratio: f32) {
+        let current = self.estimation();
+        let new_ratio = if observed_ratio > current {
+            (current + COMPRESSION_RATIO_DETERIORATE_STEP).max(observed_ratio)
+        } else if observed_ratio < current {
+            (current - COMPRESSION_RATIO_IMPROVING_STEP).max(observed_ratio)
+        } else {
+            return;
+        };
+        self.ratio_bits
+            .store(new_ratio.to_bits(), Ordering::Relaxed);
+    }
+}
+
+impl Default for ArrowCompressionRatioEstimator {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_default_ratio_is_one() {
+        let e = ArrowCompressionRatioEstimator::new();
+        assert_eq!(e.estimation(), 1.0);
+    }
+
+    #[test]
+    fn test_deterioration_jumps_quickly() {
+        let e = ArrowCompressionRatioEstimator::new();
+        // Observed ratio worse than estimate: jump by at least DETERIORATE_STEP
+        e.update_estimation(1.1);
+        assert!(e.estimation() >= 1.05);
+    }
+
+    #[test]
+    fn test_improvement_moves_slowly() {
+        let e = ArrowCompressionRatioEstimator::new();
+        // Observed ratio better than estimate: move down by at most IMPROVING_STEP
+        e.update_estimation(0.5);
+        assert!((e.estimation() - 0.995).abs() < 0.001);
+    }
+
+    #[test]
+    fn test_converges_to_observed() {
+        let e = ArrowCompressionRatioEstimator::new();
+        // After many updates with same ratio, should converge
+        for _ in 0..1000 {
+            e.update_estimation(0.7);
+        }
+        assert!((e.estimation() - 0.7).abs() < 0.01);
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/compression/mod.rs b/fluss-rust/crates/fluss/src/compression/mod.rs
new file mode 100644
index 0000000000..29923c0a84
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/compression/mod.rs
@@ -0,0 +1,22 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+mod arrow_compression;
+mod arrow_compression_ratio_estimator;
+
+pub use arrow_compression::*;
+pub use arrow_compression_ratio_estimator::*;
diff --git a/fluss-rust/crates/fluss/src/config.rs b/fluss-rust/crates/fluss/src/config.rs
new file mode 100644
index 0000000000..cad8d9cb55
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/config.rs
@@ -0,0 +1,683 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use clap::{Parser, ValueEnum};
+use serde::{Deserialize, Serialize};
+use strum_macros::{Display, EnumString};
+
+const DEFAULT_BOOTSTRAP_SERVER: &str = "127.0.0.1:9123";
+const DEFAULT_REQUEST_MAX_SIZE: i32 = 10 * 1024 * 1024;
+const DEFAULT_WRITER_BATCH_SIZE: i32 = 2 * 1024 * 1024;
+// Mirrors Java's `2 * pageSize` floor with default pageSize = 128 KB.
+const DEFAULT_WRITER_DYNAMIC_BATCH_SIZE_MIN: i32 = 256 * 1024;
+const DEFAULT_WRITER_DYNAMIC_BATCH_SIZE_ENABLED: bool = true;
+const DEFAULT_RETRIES: i32 = i32::MAX;
+const DEFAULT_PREFETCH_NUM: usize = 4;
+const DEFAULT_DOWNLOAD_THREADS: usize = 3;
+const DEFAULT_SCANNER_REMOTE_LOG_READ_CONCURRENCY: usize = 4;
+const DEFAULT_MAX_POLL_RECORDS: usize = 500;
+const DEFAULT_SCANNER_LOG_FETCH_MAX_BYTES: i32 = 16 * 1024 * 1024;
+const DEFAULT_SCANNER_LOG_FETCH_MIN_BYTES: i32 = 1;
+const DEFAULT_SCANNER_LOG_FETCH_WAIT_MAX_TIME_MS: i32 = 500;
+const DEFAULT_WRITER_BATCH_TIMEOUT_MS: i64 = 100;
+const DEFAULT_SCANNER_LOG_FETCH_MAX_BYTES_FOR_BUCKET: i32 = 1024 * 1024;
+const DEFAULT_WRITER_MAX_INFLIGHT_REQUESTS_PER_BUCKET: usize = 5;
+const DEFAULT_WRITER_BUFFER_MEMORY_SIZE: usize = 64 * 1024 * 1024; // 64MB, matching Java
+const DEFAULT_WRITER_BUFFER_WAIT_TIMEOUT_MS: u64 = u64::MAX;
+
+const MAX_IN_FLIGHT_REQUESTS_PER_BUCKET_FOR_IDEMPOTENCE: usize = 5;
+const DEFAULT_ACKS: &str = "all";
+const DEFAULT_CONNECT_TIMEOUT_MS: u64 = 120_000;
+const DEFAULT_SECURITY_PROTOCOL: &str = "PLAINTEXT";
+const DEFAULT_SASL_MECHANISM: &str = "PLAIN";
+
+/// Bucket assigner strategy for tables without bucket keys.
+/// Matches Java `client.writer.bucket.no-key-assigner`.
+#[derive(
+    Debug, Clone, Copy, PartialEq, Eq, ValueEnum, Deserialize, Serialize, EnumString, Display,
+)]
+#[serde(rename_all = "snake_case")]
+#[strum(ascii_case_insensitive)]
+pub enum NoKeyAssigner {
+    /// Sticks to one bucket until the batch is full, then switches.
+    #[strum(serialize = "sticky")]
+    Sticky,
+    /// Assigns each record to the next bucket in a rotating sequence.
+    #[strum(serialize = "round_robin")]
+    RoundRobin,
+}
+
+// Client configuration, usable as a clap command line (`Parser`) and through
+// serde. `Debug` is hand-written below so the SASL password is never printed.
+// Kept as a plain comment: clap derives help text from `///` doc comments.
+#[derive(Parser, Clone, Deserialize, Serialize)]
+#[command(author, version, about, long_about = None)]
+pub struct Config {
+    /// Bootstrap server address.
+    /// Default: 127.0.0.1:9123
+    #[arg(long, default_value_t = String::from(DEFAULT_BOOTSTRAP_SERVER))]
+    pub bootstrap_servers: String,
+
+    /// Maximum size in bytes of a single write request.
+    /// Default: 10485760 (10MB)
+    #[arg(long, default_value_t = DEFAULT_REQUEST_MAX_SIZE)]
+    pub writer_request_max_size: i32,
+
+    /// Acknowledgement mode for writes.
+    /// Default: all. Idempotent writes require `all` (or `-1`).
+    #[arg(long, default_value_t = String::from(DEFAULT_ACKS))]
+    pub writer_acks: String,
+
+    /// Number of write retries.
+    /// Default: i32::MAX. Idempotent writes require a value > 0.
+    #[arg(long, default_value_t = DEFAULT_RETRIES)]
+    pub writer_retries: i32,
+
+    /// Writer batch size in bytes.
+    /// Default: 2097152 (2MB). Must not exceed `writer_request_max_size`
+    /// or `writer_buffer_memory_size`.
+    #[arg(long, default_value_t = DEFAULT_WRITER_BATCH_SIZE)]
+    pub writer_batch_size: i32,
+
+    /// Tune the per-table writer batch size from observed fill ratios.
+    /// Default: true (matching Java `client.writer.dynamic-batch-size.enabled`).
+    #[arg(long, default_value_t = DEFAULT_WRITER_DYNAMIC_BATCH_SIZE_ENABLED)]
+    pub writer_dynamic_batch_size_enabled: bool,
+
+    /// Lower bound for the dynamic batch size estimator.
+    /// Default: 262144 (256 KB), matching Java's `2 * pageSize` floor.
+    /// Ignored when `writer_dynamic_batch_size_enabled` is false.
+    #[arg(long, default_value_t = DEFAULT_WRITER_DYNAMIC_BATCH_SIZE_MIN)]
+    pub writer_dynamic_batch_size_min: i32,
+
+    /// Bucket assigner used for tables without bucket keys.
+    /// Default: sticky
+    #[arg(long, value_enum, default_value_t = NoKeyAssigner::Sticky)]
+    pub writer_bucket_no_key_assigner: NoKeyAssigner,
+
+    /// Maximum number of remote log segments to prefetch
+    /// Default: 4 (matching Java CLIENT_SCANNER_REMOTE_LOG_PREFETCH_NUM)
+    #[arg(long, default_value_t = DEFAULT_PREFETCH_NUM)]
+    pub scanner_remote_log_prefetch_num: usize,
+
+    /// Maximum concurrent remote log downloads
+    /// Default: 3 (matching Java REMOTE_FILE_DOWNLOAD_THREAD_NUM)
+    #[arg(long, default_value_t = DEFAULT_DOWNLOAD_THREADS)]
+    pub remote_file_download_thread_num: usize,
+
+    /// Intra-file remote log read concurrency for each remote segment download.
+    /// Download path always uses streaming reader.
+    #[arg(long, default_value_t = DEFAULT_SCANNER_REMOTE_LOG_READ_CONCURRENCY)]
+    pub scanner_remote_log_read_concurrency: usize,
+
+    /// Maximum number of records returned in a single call to poll() for LogScanner.
+    /// Default: 500 (matching Java CLIENT_SCANNER_LOG_MAX_POLL_RECORDS)
+    #[arg(long, default_value_t = DEFAULT_MAX_POLL_RECORDS)]
+    pub scanner_log_max_poll_records: usize,
+
+    /// Maximum bytes per fetch response for LogScanner.
+    /// Default: 16777216 (16MB)
+    #[arg(long, default_value_t = DEFAULT_SCANNER_LOG_FETCH_MAX_BYTES)]
+    pub scanner_log_fetch_max_bytes: i32,
+
+    /// Minimum bytes to accumulate before returning a fetch response.
+    /// Default: 1
+    #[arg(long, default_value_t = DEFAULT_SCANNER_LOG_FETCH_MIN_BYTES)]
+    pub scanner_log_fetch_min_bytes: i32,
+
+    /// Maximum time the server may wait (ms) to satisfy min-bytes.
+    /// Default: 500
+    #[arg(long, default_value_t = DEFAULT_SCANNER_LOG_FETCH_WAIT_MAX_TIME_MS)]
+    pub scanner_log_fetch_wait_max_time_ms: i32,
+
+    /// The maximum time to wait for a batch to be completed in milliseconds.
+    /// Default: 100 (matching Java CLIENT_WRITER_BATCH_TIMEOUT)
+    #[arg(long, default_value_t = DEFAULT_WRITER_BATCH_TIMEOUT_MS)]
+    pub writer_batch_timeout_ms: i64,
+
+    /// Maximum bytes per fetch response **per bucket** for LogScanner.
+    /// Default: 1048576 (1MB)
+    #[arg(long, default_value_t = DEFAULT_SCANNER_LOG_FETCH_MAX_BYTES_FOR_BUCKET)]
+    pub scanner_log_fetch_max_bytes_for_bucket: i32,
+
+    /// Whether to enable idempotent writes. When enabled, each batch is tagged with
+    /// a server-allocated writer ID and per-bucket sequence number so the server can
+    /// detect and deduplicate retried batches.
+    /// Default: true (matching Java CLIENT_WRITER_ENABLE_IDEMPOTENCE)
+    #[arg(long, default_value_t = true)]
+    pub writer_enable_idempotence: bool,
+
+    /// Maximum number of in-flight requests per bucket for idempotent writes.
+    /// Default: 5 (matching Java client.writer.max-inflight-requests-per-bucket)
+    #[arg(long, default_value_t = DEFAULT_WRITER_MAX_INFLIGHT_REQUESTS_PER_BUCKET)]
+    pub writer_max_inflight_requests_per_bucket: usize,
+
+    /// Total memory available for buffering write batches across all buckets.
+    /// When this limit is reached, `upsert()`/`append()` will block until
+    /// in-flight batches complete and free memory.
+    /// Default: 64MB (matching Java's LazyMemorySegmentPool: 512 pages x 128KB)
+    #[arg(long, default_value_t = DEFAULT_WRITER_BUFFER_MEMORY_SIZE)]
+    pub writer_buffer_memory_size: usize,
+
+    /// Maximum time in milliseconds to block waiting for buffer memory.
+    /// If the timeout is exceeded, the write call returns an error.
+    #[arg(long, default_value_t = DEFAULT_WRITER_BUFFER_WAIT_TIMEOUT_MS)]
+    pub writer_buffer_wait_timeout_ms: u64,
+
+    /// Connect timeout in milliseconds for TCP transport connect.
+    /// Default: 120000 (120 seconds).
+    #[arg(long, default_value_t = DEFAULT_CONNECT_TIMEOUT_MS)]
+    pub connect_timeout_ms: u64,
+
+    /// Security protocol; `sasl` (case-insensitive) enables SASL authentication.
+    /// Default: PLAINTEXT
+    #[arg(long, default_value_t = String::from(DEFAULT_SECURITY_PROTOCOL))]
+    pub security_protocol: String,
+
+    /// SASL mechanism; only `PLAIN` (case-insensitive) passes validation.
+    /// Default: PLAIN
+    #[arg(long, default_value_t = String::from(DEFAULT_SASL_MECHANISM))]
+    pub security_sasl_mechanism: String,
+
+    /// SASL username; must be non-empty when SASL is enabled.
+    #[arg(long, default_value_t = String::new())]
+    pub security_sasl_username: String,
+
+    /// SASL password; must be non-empty when SASL is enabled.
+    /// Never serialized and redacted in `Debug` output.
+    #[arg(long, default_value_t = String::new())]
+    #[serde(skip_serializing)]
+    pub security_sasl_password: String,
+
+    /// Maximum number of pending lookup operations
+    /// Default: 25600 (matching Java CLIENT_LOOKUP_QUEUE_SIZE)
+    #[arg(long, default_value_t = 25600)]
+    pub lookup_queue_size: usize,
+
+    /// Maximum batch size of merging lookup operations to one lookup request
+    /// Default: 128 (matching Java CLIENT_LOOKUP_MAX_BATCH_SIZE)
+    #[arg(long, default_value_t = 128)]
+    pub lookup_max_batch_size: usize,
+
+    /// Maximum time to wait for the lookup batch to fill (in milliseconds)
+    /// Default: 100 (matching Java CLIENT_LOOKUP_BATCH_TIMEOUT)
+    #[arg(long, default_value_t = 100)]
+    pub lookup_batch_timeout_ms: u64,
+
+    /// Maximum number of unacknowledged lookup requests
+    /// Default: 128 (matching Java CLIENT_LOOKUP_MAX_INFLIGHT_SIZE)
+    #[arg(long, default_value_t = 128)]
+    pub lookup_max_inflight_requests: usize,
+
+    /// Maximum number of lookup retries
+    /// Default: i32::MAX (matching Java CLIENT_LOOKUP_MAX_RETRIES)
+    #[arg(long, default_value_t = i32::MAX)]
+    pub lookup_max_retries: i32,
+}
+
+/// Hand-written so `security_sasl_password` is printed as `[REDACTED]`.
+/// Every other field of [`Config`] is listed; keep this in sync with the struct.
+impl std::fmt::Debug for Config {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("Config")
+            .field("bootstrap_servers", &self.bootstrap_servers)
+            .field("writer_request_max_size", &self.writer_request_max_size)
+            .field("writer_acks", &self.writer_acks)
+            .field("writer_retries", &self.writer_retries)
+            .field("writer_batch_size", &self.writer_batch_size)
+            .field(
+                "writer_dynamic_batch_size_enabled",
+                &self.writer_dynamic_batch_size_enabled,
+            )
+            .field(
+                "writer_dynamic_batch_size_min",
+                &self.writer_dynamic_batch_size_min,
+            )
+            .field(
+                "writer_bucket_no_key_assigner",
+                &self.writer_bucket_no_key_assigner,
+            )
+            .field(
+                "scanner_remote_log_prefetch_num",
+                &self.scanner_remote_log_prefetch_num,
+            )
+            .field(
+                "remote_file_download_thread_num",
+                &self.remote_file_download_thread_num,
+            )
+            // Previously missing from the output although it is a regular field.
+            .field(
+                "scanner_remote_log_read_concurrency",
+                &self.scanner_remote_log_read_concurrency,
+            )
+            .field(
+                "scanner_log_max_poll_records",
+                &self.scanner_log_max_poll_records,
+            )
+            .field(
+                "scanner_log_fetch_max_bytes",
+                &self.scanner_log_fetch_max_bytes,
+            )
+            .field(
+                "scanner_log_fetch_min_bytes",
+                &self.scanner_log_fetch_min_bytes,
+            )
+            .field(
+                "scanner_log_fetch_max_bytes_for_bucket",
+                &self.scanner_log_fetch_max_bytes_for_bucket,
+            )
+            .field(
+                "scanner_log_fetch_wait_max_time_ms",
+                &self.scanner_log_fetch_wait_max_time_ms,
+            )
+            .field("writer_batch_timeout_ms", &self.writer_batch_timeout_ms)
+            .field("writer_enable_idempotence", &self.writer_enable_idempotence)
+            .field(
+                "writer_max_inflight_requests_per_bucket",
+                &self.writer_max_inflight_requests_per_bucket,
+            )
+            .field("writer_buffer_memory_size", &self.writer_buffer_memory_size)
+            .field(
+                "writer_buffer_wait_timeout_ms",
+                &self.writer_buffer_wait_timeout_ms,
+            )
+            .field("connect_timeout_ms", &self.connect_timeout_ms)
+            .field("security_protocol", &self.security_protocol)
+            .field("security_sasl_mechanism", &self.security_sasl_mechanism)
+            .field("security_sasl_username", &self.security_sasl_username)
+            .field("security_sasl_password", &"[REDACTED]")
+            .field("lookup_queue_size", &self.lookup_queue_size)
+            .field("lookup_max_batch_size", &self.lookup_max_batch_size)
+            .field("lookup_batch_timeout_ms", &self.lookup_batch_timeout_ms)
+            .field(
+                "lookup_max_inflight_requests",
+                &self.lookup_max_inflight_requests,
+            )
+            .field("lookup_max_retries", &self.lookup_max_retries)
+            .finish()
+    }
+}
+
+/// Same values as the clap `default_value_t` attributes on [`Config`].
+impl Default for Config {
+    fn default() -> Self {
+        Self {
+            bootstrap_servers: String::from(DEFAULT_BOOTSTRAP_SERVER),
+            writer_request_max_size: DEFAULT_REQUEST_MAX_SIZE,
+            writer_acks: String::from(DEFAULT_ACKS),
+            // Same constant as the clap default, so the two cannot drift apart.
+            writer_retries: DEFAULT_RETRIES,
+            writer_batch_size: DEFAULT_WRITER_BATCH_SIZE,
+            writer_dynamic_batch_size_enabled: DEFAULT_WRITER_DYNAMIC_BATCH_SIZE_ENABLED,
+            writer_dynamic_batch_size_min: DEFAULT_WRITER_DYNAMIC_BATCH_SIZE_MIN,
+            writer_bucket_no_key_assigner: NoKeyAssigner::Sticky,
+            scanner_remote_log_prefetch_num: DEFAULT_PREFETCH_NUM,
+            remote_file_download_thread_num: DEFAULT_DOWNLOAD_THREADS,
+            scanner_remote_log_read_concurrency: DEFAULT_SCANNER_REMOTE_LOG_READ_CONCURRENCY,
+            scanner_log_max_poll_records: DEFAULT_MAX_POLL_RECORDS,
+            scanner_log_fetch_max_bytes: DEFAULT_SCANNER_LOG_FETCH_MAX_BYTES,
+            scanner_log_fetch_min_bytes: DEFAULT_SCANNER_LOG_FETCH_MIN_BYTES,
+            scanner_log_fetch_wait_max_time_ms: DEFAULT_SCANNER_LOG_FETCH_WAIT_MAX_TIME_MS,
+            scanner_log_fetch_max_bytes_for_bucket: DEFAULT_SCANNER_LOG_FETCH_MAX_BYTES_FOR_BUCKET,
+            writer_batch_timeout_ms: DEFAULT_WRITER_BATCH_TIMEOUT_MS,
+            writer_enable_idempotence: true,
+            writer_max_inflight_requests_per_bucket:
+                DEFAULT_WRITER_MAX_INFLIGHT_REQUESTS_PER_BUCKET,
+            writer_buffer_memory_size: DEFAULT_WRITER_BUFFER_MEMORY_SIZE,
+            writer_buffer_wait_timeout_ms: DEFAULT_WRITER_BUFFER_WAIT_TIMEOUT_MS,
+            connect_timeout_ms: DEFAULT_CONNECT_TIMEOUT_MS,
+            security_protocol: String::from(DEFAULT_SECURITY_PROTOCOL),
+            security_sasl_mechanism: String::from(DEFAULT_SASL_MECHANISM),
+            security_sasl_username: String::new(),
+            security_sasl_password: String::new(),
+            lookup_queue_size: 25600,
+            lookup_max_batch_size: 128,
+            lookup_batch_timeout_ms: 100,
+            lookup_max_inflight_requests: 128,
+            lookup_max_retries: i32::MAX,
+        }
+    }
+}
+
+impl Config {
+    /// Returns true when the security protocol indicates SASL authentication
+    /// should be performed. Matches Java's `SaslAuthenticationPlugin` which
+    /// registers as `"sasl"` (case-insensitive).
+    pub fn is_sasl_enabled(&self) -> bool {
+        self.security_protocol.eq_ignore_ascii_case("sasl")
+    }
+
+    /// Validates security configuration. Returns `Ok(())` when the config is
+    /// consistent, or an error message when SASL is enabled but the config is
+    /// incomplete or uses an unsupported mechanism.
+    pub fn validate_security(&self) -> Result<(), String> {
+        if !self.is_sasl_enabled() {
+            return Ok(());
+        }
+        if !self.security_sasl_mechanism.eq_ignore_ascii_case("PLAIN") {
+            return Err(format!(
+                "Unsupported SASL mechanism: '{}'. Only 'PLAIN' is supported.",
+                self.security_sasl_mechanism
+            ));
+        }
+        if self.security_sasl_username.is_empty() {
+            return Err(
+                "security_sasl_username must be set when security_protocol is 'sasl'".to_string(),
+            );
+        }
+        if self.security_sasl_password.is_empty() {
+            return Err(
+                "security_sasl_password must be set when security_protocol is 'sasl'".to_string(),
+            );
+        }
+        Ok(())
+    }
+
+    /// Validates scanner settings. Returns the message of the first violated
+    /// rule: the three concurrency/prefetch counts must be non-zero, the fetch
+    /// byte limits must be positive with `min <= max` and
+    /// `per-bucket max <= max`, and the fetch wait time must not be negative.
+    pub fn validate_scanner(&self) -> Result<(), String> {
+        if self.scanner_remote_log_prefetch_num == 0 {
+            return Err("scanner_remote_log_prefetch_num must be > 0".to_string());
+        }
+        if self.scanner_remote_log_read_concurrency == 0 {
+            return Err("scanner_remote_log_read_concurrency must be > 0".to_string());
+        }
+        if self.remote_file_download_thread_num == 0 {
+            return Err("remote_file_download_thread_num must be > 0".to_string());
+        }
+        // scanner_log_max_poll_records: validation intentionally omitted to match Java behavior.
+        // Java allows 0 — tracked in https://github.com/apache/fluss/issues/3068
+        if self.scanner_log_fetch_min_bytes <= 0 {
+            return Err("scanner_log_fetch_min_bytes must be > 0".to_string());
+        }
+        if self.scanner_log_fetch_max_bytes <= 0 {
+            return Err("scanner_log_fetch_max_bytes must be > 0".to_string());
+        }
+        if self.scanner_log_fetch_max_bytes < self.scanner_log_fetch_min_bytes {
+            return Err(
+                "scanner_log_fetch_max_bytes must be >= scanner_log_fetch_min_bytes".to_string(),
+            );
+        }
+        if self.scanner_log_fetch_wait_max_time_ms < 0 {
+            return Err("scanner_log_fetch_wait_max_time_ms must be >= 0".to_string());
+        }
+        if self.scanner_log_fetch_max_bytes_for_bucket <= 0 {
+            return Err("scanner_log_fetch_max_bytes_for_bucket must be > 0".to_string());
+        }
+        if self.scanner_log_fetch_max_bytes_for_bucket > self.scanner_log_fetch_max_bytes {
+            return Err(
+                "scanner_log_fetch_max_bytes_for_bucket must be <= scanner_log_fetch_max_bytes"
+                    .to_string(),
+            );
+        }
+        Ok(())
+    }
+
+    /// Validates writer settings. Returns the message of the first violated rule.
+    ///
+    /// Size and timeout rules always apply. The acks / retries / in-flight
+    /// rules are only checked when `writer_enable_idempotence` is true.
+    pub fn validate_writer(&self) -> Result<(), String> {
+        if self.writer_request_max_size <= 0 {
+            return Err("writer_request_max_size must be > 0".to_string());
+        }
+        if self.writer_batch_size <= 0 {
+            return Err("writer_batch_size must be > 0".to_string());
+        }
+        if self.writer_batch_timeout_ms < 0 {
+            return Err("writer_batch_timeout_ms must be >= 0".to_string());
+        }
+        if self.writer_max_inflight_requests_per_bucket == 0 {
+            return Err("writer_max_inflight_requests_per_bucket must be > 0".to_string());
+        }
+        if self.writer_buffer_memory_size == 0 {
+            return Err("writer_buffer_memory_size must be > 0".to_string());
+        }
+        if self.writer_batch_size > self.writer_request_max_size {
+            return Err("writer_batch_size must be <= writer_request_max_size".to_string());
+        }
+        // The cast cannot wrap: writer_batch_size was checked to be > 0 above.
+        if self.writer_batch_size as usize > self.writer_buffer_memory_size {
+            return Err("writer_batch_size must be <= writer_buffer_memory_size".to_string());
+        }
+        if self.writer_dynamic_batch_size_min <= 0 {
+            return Err("writer_dynamic_batch_size_min must be > 0".to_string());
+        }
+        if self.writer_dynamic_batch_size_min > self.writer_batch_size {
+            return Err("writer_dynamic_batch_size_min must be <= writer_batch_size".to_string());
+        }
+        // idempotence checks
+        if !self.writer_enable_idempotence {
+            return Ok(());
+        }
+        let acks_is_all = self.writer_acks.eq_ignore_ascii_case("all") || self.writer_acks == "-1";
+        if !acks_is_all {
+            return Err(format!(
+                "Idempotent writes require acks='all' (-1), but got acks='{}'",
+                self.writer_acks
+            ));
+        }
+        if self.writer_retries <= 0 {
+            return Err(format!(
+                "Idempotent writes require retries > 0, but got retries={}",
+                self.writer_retries
+            ));
+        }
+        if self.writer_max_inflight_requests_per_bucket
+            > MAX_IN_FLIGHT_REQUESTS_PER_BUCKET_FOR_IDEMPOTENCE
+        {
+            return Err(format!(
+                "Idempotent writes require max-inflight-requests-per-bucket <= {}, but got {}",
+                MAX_IN_FLIGHT_REQUESTS_PER_BUCKET_FOR_IDEMPOTENCE,
+                self.writer_max_inflight_requests_per_bucket
+            ));
+        }
+        Ok(())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_default_is_not_sasl() {
+        let config = Config::default();
+        assert!(!config.is_sasl_enabled());
+        assert!(config.validate_security().is_ok());
+    }
+
+    #[test]
+    fn test_debug_lists_read_concurrency_and_redacts_password() {
+        let config = Config {
+            scanner_remote_log_read_concurrency: 7,
+            security_sasl_password: "secret".to_string(),
+            ..Config::default()
+        };
+        let rendered = format!("{config:?}");
+        assert!(rendered.contains("scanner_remote_log_read_concurrency: 7"));
+        assert!(rendered.contains("[REDACTED]"));
+        assert!(!rendered.contains("secret"));
+    }
+
+    #[test]
+    fn test_sasl_enabled_valid() {
+        let config = Config {
+            security_protocol: "sasl".to_string(),
+            security_sasl_mechanism: "PLAIN".to_string(),
+            security_sasl_username: "admin".to_string(),
+            security_sasl_password: "secret".to_string(),
+            ..Config::default()
+        };
+        assert!(config.is_sasl_enabled());
+        assert!(config.validate_security().is_ok());
+    }
+
+    #[test]
+    fn test_sasl_enabled_case_insensitive() {
+        let config = Config {
+            security_protocol: "SASL".to_string(),
+            security_sasl_username: "admin".to_string(),
+            security_sasl_password: "secret".to_string(),
+            ..Config::default()
+        };
+        assert!(config.is_sasl_enabled());
+        assert!(config.validate_security().is_ok());
+    }
+
+    #[test]
+    fn test_sasl_missing_username() {
+        let config = Config {
+            security_protocol: "sasl".to_string(),
+            security_sasl_password: "secret".to_string(),
+            ..Config::default()
+        };
+        assert!(config.validate_security().is_err());
+    }
+
+    #[test]
+    fn test_sasl_missing_password() {
+        let config = Config {
+            security_protocol: "sasl".to_string(),
+            security_sasl_username: "admin".to_string(),
+            ..Config::default()
+        };
+        assert!(config.validate_security().is_err());
+    }
+
+    #[test]
+    fn test_sasl_unsupported_mechanism() {
+        let config = Config {
+            security_protocol: "sasl".to_string(),
+            security_sasl_mechanism: "SCRAM-SHA-256".to_string(),
+            security_sasl_username: "admin".to_string(),
+            security_sasl_password: "secret".to_string(),
+            ..Config::default()
+        };
+        assert!(config.validate_security().is_err());
+    }
+
+    #[test]
+    fn test_scanner_defaults_valid() {
+        let config = Config::default();
+        assert!(config.validate_scanner().is_ok());
+    }
+
+    #[test]
+    fn test_scanner_remote_log_prefetch_num_zero() {
+        let config = Config {
+            scanner_remote_log_prefetch_num: 0,
+            ..Config::default()
+        };
+        assert!(config.validate_scanner().is_err());
+    }
+
+    #[test]
+    fn test_scanner_remote_log_read_concurrency_zero() {
+        let config = Config {
+            scanner_remote_log_read_concurrency: 0,
+            ..Config::default()
+        };
+        assert!(config.validate_scanner().is_err());
+    }
+
+    #[test]
+    fn test_remote_file_download_thread_num_zero() {
+        let config = Config {
+            remote_file_download_thread_num: 0,
+            ..Config::default()
+        };
+        assert!(config.validate_scanner().is_err());
+    }
+
+    #[test]
+    fn test_scanner_fetch_invalid_ranges() {
+        let config = Config {
+            scanner_log_fetch_min_bytes: 2,
+            scanner_log_fetch_max_bytes: 1,
+            ..Config::default()
+        };
+        assert!(config.validate_scanner().is_err());
+    }
+
+    #[test]
+    fn test_scanner_fetch_negative_wait() {
+        let config = Config {
+            scanner_log_fetch_wait_max_time_ms: -1,
+            ..Config::default()
+        };
+        assert!(config.validate_scanner().is_err());
+    }
+
+    #[test]
+    fn test_writer_defaults_valid() {
+        let config = Config::default();
+        assert!(config.validate_writer().is_ok());
+    }
+
+    #[test]
+    fn test_writer_request_max_size_zero() {
+        let config = Config {
+            writer_request_max_size: 0,
+            ..Config::default()
+        };
+        assert!(config.validate_writer().is_err());
+    }
+
+    #[test]
+    fn test_writer_batch_size_zero() {
+        let config = Config {
+            writer_batch_size: 0,
+            ..Config::default()
+        };
+        assert!(config.validate_writer().is_err());
+    }
+
+    #[test]
+    fn test_writer_batch_timeout_negative() {
+        let config = Config {
+            writer_batch_timeout_ms: -1,
+            ..Config::default()
+        };
+        assert!(config.validate_writer().is_err());
+    }
+
+    #[test]
+    fn test_writer_max_inflight_requests_per_bucket_zero() {
+        let config = Config {
+            writer_max_inflight_requests_per_bucket: 0,
+            ..Config::default()
+        };
+        assert!(config.validate_writer().is_err());
+    }
+
+    #[test]
+    fn test_writer_buffer_memory_size_zero() {
+        let config = Config {
+            writer_buffer_memory_size: 0,
+            ..Config::default()
+        };
+        assert!(config.validate_writer().is_err());
+    }
+
+    #[test]
+    fn test_writer_batch_size_exceeds_request_max_size() {
+        let config = Config {
+            writer_batch_size: 20 * 1024 * 1024,
+            writer_request_max_size: 10 * 1024 * 1024,
+            ..Config::default()
+        };
+        assert!(config.validate_writer().is_err());
+    }
+
+    #[test]
+    fn test_writer_batch_size_exceeds_buffer_memory_size() {
+        let config = Config {
+            writer_batch_size: 128 * 1024 * 1024,
+            writer_buffer_memory_size: 64 * 1024 * 1024,
+            ..Config::default()
+        };
+        assert!(config.validate_writer().is_err());
+    }
+
+    #[test]
+    fn test_idempotence_disabled_skips_validation() {
+        let config = Config {
+            writer_enable_idempotence: false,
+            writer_acks: "0".to_string(),
+            writer_retries: 0,
+            writer_max_inflight_requests_per_bucket: 100,
+            ..Config::default()
+        };
+        assert!(config.validate_writer().is_ok());
+    }
+
+    #[test]
+    fn test_idempotence_requires_acks_all() {
+        let config = Config {
+            writer_enable_idempotence: true,
+            writer_acks: "1".to_string(),
+            ..Config::default()
+        };
+        assert!(config.validate_writer().is_err());
+    }
+
+    #[test]
+    fn test_idempotence_requires_retries() {
+        let config = Config {
+            writer_enable_idempotence: true,
+            writer_retries: 0,
+            ..Config::default()
+        };
+        assert!(config.validate_writer().is_err());
+    }
+
+    #[test]
+    fn test_idempotence_requires_bounded_inflight() {
+        let config = Config {
+            writer_enable_idempotence: true,
+            writer_max_inflight_requests_per_bucket: 10,
+            ..Config::default()
+        };
+        assert!(config.validate_writer().is_err());
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/error.rs b/fluss-rust/crates/fluss/src/error.rs
new file mode 100644
index 0000000000..4bd0690ead
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/error.rs
@@ -0,0 +1,250 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+ +pub use crate::rpc::RpcError; +pub use crate::rpc::{ApiError, FlussError}; + +use arrow_schema::ArrowError; +use snafu::Snafu; +use std::{io, result}; +use strum::ParseError; + +pub type Result = result::Result; + +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu( + whatever, + display("Fluss hitting unexpected error {}: {:?}", message, source) + )] + UnexpectedError { + message: String, + /// see + #[snafu(source(from(Box, Some)))] + source: Option>, + }, + + #[snafu( + visibility(pub(crate)), + display("Fluss hitting unexpected io error {}: {:?}", message, source) + )] + IoUnexpectedError { message: String, source: io::Error }, + + #[snafu( + visibility(pub(crate)), + display( + "Fluss hitting remote storage unexpected error {}: {:?}", + message, + source + ) + )] + RemoteStorageUnexpectedError { + message: String, + source: opendal::Error, + }, + + #[snafu( + visibility(pub(crate)), + display("Fluss hitting json serde error {}.", message) + )] + JsonSerdeError { message: String }, + + #[snafu( + visibility(pub(crate)), + display("Fluss hitting unexpected rpc error {}: {:?}", message, source) + )] + RpcError { message: String, source: RpcError }, + + #[snafu( + visibility(pub(crate)), + display("Fluss hitting row convert error {}.", message) + )] + RowConvertError { message: String }, + + #[snafu( + visibility(pub(crate)), + display("Fluss hitting Arrow error {}: {:?}.", message, source) + )] + ArrowError { message: String, source: ArrowError }, + + #[snafu( + visibility(pub(crate)), + display("Fluss hitting illegal argument error {}.", message) + )] + IllegalArgument { message: String }, + + #[snafu( + visibility(pub(crate)), + display("Fluss hitting IO not supported error {}.", message) + )] + IoUnsupported { message: String }, + + #[snafu( + visibility(pub(crate)), + display("Fluss hitting wakeup error {}.", message) + )] + WakeupError { message: String }, + #[snafu( + visibility(pub(crate)), + display("Fluss hitting unsupported operation error {}.", 
message) + )] + UnsupportedOperation { message: String }, + + #[snafu(visibility(pub(crate)), display("Fluss writer closed: {}.", message))] + WriterClosed { message: String }, + + #[snafu( + visibility(pub(crate)), + display("Fluss buffer exhausted: {}.", message) + )] + BufferExhausted { message: String }, + + #[snafu(visibility(pub(crate)), display("Fluss API Error: {}.", api_error))] + FlussAPIError { api_error: ApiError }, + + #[snafu( + visibility(pub(crate)), + display("Unsupported API version: {}.", message) + )] + UnsupportedVersion { message: String }, + + /// The server advertised a `server_type` that does not match the one expected + /// for the target `ServerNode` (e.g. connecting to a coordinator on a tablet + /// server address). + #[snafu(visibility(pub(crate)), display("Invalid server type: {}.", message))] + InvalidServerType { message: String }, +} + +/// Convenience constructors for API errors that may be raised client-side. +/// These create `FlussAPIError` with the correct protocol error code, +/// consistent with Java where e.g. `InvalidTableException` always carries code 15. 
+impl Error { + pub fn table_not_exist(message: impl Into) -> Self { + Error::FlussAPIError { + api_error: ApiError { + code: FlussError::TableNotExist.code(), + message: message.into(), + }, + } + } + + pub fn invalid_table(message: impl Into) -> Self { + Error::FlussAPIError { + api_error: ApiError { + code: FlussError::InvalidTableException.code(), + message: message.into(), + }, + } + } + + pub fn partition_not_exist(message: impl Into) -> Self { + Error::FlussAPIError { + api_error: ApiError { + code: FlussError::PartitionNotExists.code(), + message: message.into(), + }, + } + } + + pub fn invalid_partition(message: impl Into) -> Self { + Error::FlussAPIError { + api_error: ApiError { + code: FlussError::PartitionSpecInvalidException.code(), + message: message.into(), + }, + } + } + + pub fn leader_not_available(message: impl Into) -> Self { + Error::FlussAPIError { + api_error: ApiError { + code: FlussError::LeaderNotAvailableException.code(), + message: message.into(), + }, + } + } + + /// Returns the API error kind if this is an API error, for ergonomic pattern matching. + pub fn api_error(&self) -> Option { + if let Error::FlussAPIError { api_error } = self { + Some(FlussError::for_code(api_error.code)) + } else { + None + } + } + + /// Returns `true` if retrying the request may succeed. + /// [`Error::RpcError`] is always retriable; [`Error::FlussAPIError`] delegates to + /// [`ApiError::is_retriable`]; all other variants are not. + pub fn is_retriable(&self) -> bool { + match self { + Error::RpcError { .. 
} => true, + Error::FlussAPIError { api_error } => api_error.is_retriable(), + _ => false, + } + } +} + +impl From for Error { + fn from(value: ArrowError) -> Self { + Error::ArrowError { + message: format!("{value}"), + source: value, + } + } +} + +impl From for Error { + fn from(value: RpcError) -> Self { + Error::RpcError { + message: format!("{value}"), + source: value, + } + } +} + +impl From for Error { + fn from(value: io::Error) -> Self { + Error::IoUnexpectedError { + message: format!("{value}"), + source: value, + } + } +} + +impl From for Error { + fn from(value: opendal::Error) -> Self { + Error::RemoteStorageUnexpectedError { + message: format!("{value}"), + source: value, + } + } +} + +impl From for Error { + fn from(value: ApiError) -> Self { + Error::FlussAPIError { api_error: value } + } +} + +impl From for Error { + fn from(value: ParseError) -> Self { + Error::IllegalArgument { + message: value.to_string(), + } + } +} diff --git a/fluss-rust/crates/fluss/src/io/file_io.rs b/fluss-rust/crates/fluss/src/io/file_io.rs new file mode 100644 index 0000000000..adca333f6b --- /dev/null +++ b/fluss-rust/crates/fluss/src/io/file_io.rs @@ -0,0 +1,157 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +use crate::error::*; +use std::collections::HashMap; +use std::ops::Range; +use std::sync::Arc; + +use bytes::Bytes; +use jiff::Timestamp; +use opendal::Operator; + +use url::Url; + +use super::Storage; + +use crate::error::Result; + +#[derive(Clone, Debug)] +pub struct FileIO { + storage: Arc, +} + +impl FileIO { + /// Try to infer file io scheme from path. + pub fn from_url(path: &str) -> Result { + let url = Url::parse(path).map_err(|e| Error::IllegalArgument { + message: format!("Invalid URL '{path}': {e}"), + })?; + Ok(FileIOBuilder::new(url.scheme())) + } + + /// Create a new input file to read data. + pub fn new_input(&self, path: &str) -> Result { + let (op, relative_path) = self.storage.create(path)?; + let path = path.to_string(); + let relative_path_pos = path.len() - relative_path.len(); + Ok(InputFile { + op, + path, + relative_path_pos, + }) + } +} + +#[derive(Debug)] +pub struct FileIOBuilder { + scheme_str: Option, + props: HashMap, +} + +impl FileIOBuilder { + pub fn new(scheme_str: impl ToString) -> Self { + Self { + scheme_str: Some(scheme_str.to_string()), + props: HashMap::default(), + } + } + + pub(crate) fn into_parts(self) -> (String, HashMap) { + (self.scheme_str.unwrap_or_default(), self.props) + } + + pub fn with_prop(mut self, key: impl ToString, value: impl ToString) -> Self { + self.props.insert(key.to_string(), value.to_string()); + self + } + + pub fn with_props( + mut self, + args: impl IntoIterator, + ) -> Self { + self.props + .extend(args.into_iter().map(|e| (e.0.to_string(), e.1.to_string()))); + self + } + + pub fn build(self) -> Result { + let storage = Storage::build(self)?; + Ok(FileIO { + storage: Arc::new(storage), + }) + } +} + +pub trait FileRead: Send + Unpin + 'static { + fn read(&self, range: Range) -> impl Future> + Send; +} + +impl FileRead for opendal::Reader { + async fn read(&self, range: Range) -> Result { + Ok(opendal::Reader::read(self, range).await?.to_bytes()) + } +} + +#[derive(Debug)] +pub struct 
InputFile { + op: Operator, + path: String, + relative_path_pos: usize, +} + +impl InputFile { + pub fn location(&self) -> &str { + &self.path + } + + pub async fn exists(&self) -> Result { + Ok(self.op.exists(&self.path[self.relative_path_pos..]).await?) + } + + pub async fn metadata(&self) -> Result { + let meta = self.op.stat(&self.path[self.relative_path_pos..]).await?; + + Ok(FileStatus { + size: meta.content_length(), + is_dir: meta.is_dir(), + path: self.path.clone(), + last_modified: meta.last_modified().map(Into::into), + }) + } + + pub async fn read(&self) -> Result { + Ok(self + .op + .read(&self.path[self.relative_path_pos..]) + .await? + .to_bytes()) + } + + pub async fn reader(&self) -> Result { + Ok(self.op.reader(&self.path[self.relative_path_pos..]).await?) + } +} + +#[derive(Clone, Debug)] +pub struct FileStatus { + pub size: u64, + pub is_dir: bool, + pub path: String, + pub last_modified: Option, +} diff --git a/fluss-rust/crates/fluss/src/io/mod.rs b/fluss-rust/crates/fluss/src/io/mod.rs new file mode 100644 index 0000000000..74265017aa --- /dev/null +++ b/fluss-rust/crates/fluss/src/io/mod.rs @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +mod file_io; + +pub use file_io::*; + +mod storage; +pub use storage::*; + +#[cfg(feature = "storage-fs")] +mod storage_fs; +#[cfg(feature = "storage-fs")] +use storage_fs::*; + +#[cfg(feature = "storage-memory")] +mod storage_memory; +#[cfg(feature = "storage-memory")] +use storage_memory::*; + +#[cfg(feature = "storage-s3")] +mod storage_s3; +#[cfg(feature = "storage-s3")] +use storage_s3::*; + +#[cfg(feature = "storage-oss")] +mod storage_oss; +#[cfg(feature = "storage-oss")] +use storage_oss::*; diff --git a/fluss-rust/crates/fluss/src/io/storage.rs b/fluss-rust/crates/fluss/src/io/storage.rs new file mode 100644 index 0000000000..a57351783e --- /dev/null +++ b/fluss-rust/crates/fluss/src/io/storage.rs @@ -0,0 +1,108 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +use crate::error; +use crate::error::Result; +use crate::io::FileIOBuilder; +use opendal::{Operator, Scheme}; +#[cfg(any(feature = "storage-s3", feature = "storage-oss"))] +use std::collections::HashMap; + +/// The storage carries all supported storage services in fluss +#[derive(Debug)] +pub enum Storage { + #[cfg(feature = "storage-memory")] + Memory, + #[cfg(feature = "storage-fs")] + LocalFs, + #[cfg(feature = "storage-s3")] + S3 { props: HashMap }, + #[cfg(feature = "storage-oss")] + Oss { props: HashMap }, +} + +impl Storage { + #[allow(unused_variables)] + pub(crate) fn build(file_io_builder: FileIOBuilder) -> Result { + let (scheme_str, props) = file_io_builder.into_parts(); + let scheme = Self::parse_scheme(&scheme_str)?; + + match scheme { + #[cfg(feature = "storage-memory")] + Scheme::Memory => Ok(Self::Memory), + #[cfg(feature = "storage-fs")] + Scheme::Fs => Ok(Self::LocalFs), + #[cfg(feature = "storage-s3")] + Scheme::S3 => Ok(Self::S3 { props }), + #[cfg(feature = "storage-oss")] + Scheme::Oss => Ok(Self::Oss { props }), + _ => Err(error::Error::IoUnsupported { + message: format!("Unsupported storage feature {scheme_str}"), + }), + } + } + + pub(crate) fn create<'a>(&self, path: &'a str) -> Result<(Operator, &'a str)> { + match self { + #[cfg(feature = "storage-memory")] + Storage::Memory => { + let op = super::memory_config_build()?; + + if let Some(stripped) = path.strip_prefix("memory:/") { + Ok((op, stripped)) + } else { + Ok((op, &path[1..])) + } + } + #[cfg(feature = "storage-fs")] + Storage::LocalFs => { + let op = super::fs_config_build()?; + if let Some(stripped) = path.strip_prefix("file:/") { + Ok((op, stripped)) + } else { + Ok((op, &path[1..])) + } + } + #[cfg(feature = "storage-s3")] + Storage::S3 { props } => { + let (bucket, key) = super::parse_s3_path(path); + let mut s3_props = props.clone(); + s3_props.insert("bucket".to_string(), bucket.to_string()); + let op = super::s3_config_build(&s3_props)?; + Ok((op, key)) + } + 
#[cfg(feature = "storage-oss")] + Storage::Oss { props } => { + let (bucket, key) = super::parse_oss_path(path); + let mut oss_props = props.clone(); + oss_props.insert("bucket".to_string(), bucket.to_string()); + let op = super::oss_config_build(&oss_props)?; + Ok((op, key)) + } + } + } + + fn parse_scheme(scheme: &str) -> Result { + match scheme { + "memory" => Ok(Scheme::Memory), + "file" | "" => Ok(Scheme::Fs), + "s3" | "s3a" => Ok(Scheme::S3), + "oss" => Ok(Scheme::Oss), + s => Ok(s.parse::()?), + } + } +} diff --git a/fluss-rust/crates/fluss/src/io/storage_fs.rs b/fluss-rust/crates/fluss/src/io/storage_fs.rs new file mode 100644 index 0000000000..95ca6fa95f --- /dev/null +++ b/fluss-rust/crates/fluss/src/io/storage_fs.rs @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +use opendal::Operator; +use opendal::services::FsConfig; + +use crate::error::Result; + +/// Build new opendal operator from give path. 
+pub(crate) fn fs_config_build() -> Result { + let mut cfg = FsConfig::default(); + cfg.root = Some("/".to_string()); + + Ok(Operator::from_config(cfg)?.finish()) +} diff --git a/fluss-rust/crates/fluss/src/io/storage_memory.rs b/fluss-rust/crates/fluss/src/io/storage_memory.rs new file mode 100644 index 0000000000..af73a90174 --- /dev/null +++ b/fluss-rust/crates/fluss/src/io/storage_memory.rs @@ -0,0 +1,24 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +use crate::error::Result; +use opendal::Operator; +use opendal::services::MemoryConfig; + +pub(crate) fn memory_config_build() -> Result { + Ok(Operator::from_config(MemoryConfig::default())?.finish()) +} diff --git a/fluss-rust/crates/fluss/src/io/storage_oss.rs b/fluss-rust/crates/fluss/src/io/storage_oss.rs new file mode 100644 index 0000000000..3d5d05499a --- /dev/null +++ b/fluss-rust/crates/fluss/src/io/storage_oss.rs @@ -0,0 +1,45 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
/// Splits an OSS location into `(bucket, key)`.
///
/// An optional leading `oss://` is removed first. The bucket is everything
/// before the first `/`, the key everything after it; when there is no `/`
/// the whole remainder is the bucket and the key is empty.
pub(crate) fn parse_oss_path(path: &str) -> (&str, &str) {
    let without_scheme = match path.strip_prefix("oss://") {
        Some(rest) => rest,
        None => path,
    };

    without_scheme
        .split_once('/')
        .unwrap_or((without_scheme, ""))
}
/// Splits an S3 location into `(bucket, key)`.
///
/// An optional leading `s3a://` or `s3://` is removed first. The bucket is
/// everything before the first `/`, the key everything after it; when there
/// is no `/` the whole remainder is the bucket and the key is empty.
pub(crate) fn parse_s3_path(path: &str) -> (&str, &str) {
    let without_scheme = ["s3a://", "s3://"]
        .iter()
        .find_map(|scheme| path.strip_prefix(*scheme))
        .unwrap_or(path);

    without_scheme
        .split_once('/')
        .unwrap_or((without_scheme, ""))
}
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Apache Fluss (Incubating) Official Rust Client +//! +//! Official Rust client library for [Apache Fluss (Incubating)](https://fluss.apache.org/). +//! It supports **primary key (KV) tables** (upsert + lookup) and **log tables** (append + scan). +//! +//! # Examples +//! +//! ## Primary key table and log table +//! +//! Connect to a cluster, create a KV table (upsert and lookup), then a log table (append and scan): +//! +//! ```rust,no_run +//! use fluss::client::EARLIEST_OFFSET; +//! use fluss::client::FlussConnection; +//! use fluss::config::Config; +//! use fluss::error::Result; +//! use fluss::metadata::{DataTypes, Schema, TableDescriptor, TablePath}; +//! use fluss::row::{GenericRow, InternalRow}; +//! use std::time::Duration; +//! +//! #[tokio::main] +//! async fn main() -> Result<()> { +//! let mut config = Config::default(); +//! config.bootstrap_servers = "127.0.0.1:9123".to_string(); +//! let connection = FlussConnection::new(config).await?; +//! let admin = connection.get_admin()?; +//! +//! // ---- Primary key (KV) table: upsert and lookup ---- +//! let kv_path = TablePath::new("fluss", "users"); +//! let mut kv_schema = Schema::builder() +//! .column("id", DataTypes::int()) +//! .column("name", DataTypes::string()) +//! .column("age", DataTypes::bigint()) +//! .primary_key(vec!["id"]); +//! let kv_descriptor = TableDescriptor::builder() +//! 
.schema(kv_schema.build()?) +//! .build()?; +//! admin.create_table(&kv_path, &kv_descriptor, false).await?; +//! +//! let kv_table = connection.get_table(&kv_path).await?; +//! let upsert_writer = kv_table.new_upsert()?.create_writer()?; +//! let mut row = GenericRow::new(3); +//! row.set_field(0, 1i32); +//! row.set_field(1, "Alice"); +//! row.set_field(2, 30i64); +//! upsert_writer.upsert(&row)?; +//! upsert_writer.flush().await?; +//! +//! let mut lookuper = kv_table.new_lookup()?.create_lookuper()?; +//! let mut key = GenericRow::new(1); +//! key.set_field(0, 1i32); +//! let result = lookuper.lookup(&key).await?; +//! if let Some(r) = result.get_single_row()? { +//! println!("KV lookup: id={}, name={}, age={}", +//! r.get_int(0)?, r.get_string(1)?, r.get_long(2)?); +//! } +//! +//! // ---- Log table: append and scan ---- +//! let log_path = TablePath::new("fluss", "events"); +//! let mut log_schema_builder = Schema::builder() +//! .column("ts", DataTypes::bigint()) +//! .column("message", DataTypes::string()); +//! let log_descriptor = TableDescriptor::builder() +//! .schema(log_schema_builder.build()?) +//! .build()?; +//! admin.create_table(&log_path, &log_descriptor, false).await?; +//! +//! let log_table = connection.get_table(&log_path).await?; +//! let append_writer = log_table.new_append()?.create_writer()?; +//! let mut event = GenericRow::new(2); +//! event.set_field(0, 1700000000i64); +//! event.set_field(1, "hello"); +//! append_writer.append(&event)?; +//! append_writer.flush().await?; +//! +//! let scanner = log_table.new_scan().create_log_scanner()?; +//! scanner.subscribe(0, EARLIEST_OFFSET).await?; +//! let scan_records = scanner.poll(Duration::from_secs(1)).await?; +//! for record in scan_records { +//! let r = record.row(); +//! println!("Log scan: ts={}, message={}", r.get_long(0)?, r.get_string(1)?); +//! } +//! +//! Ok(()) +//! } +//! ``` +//! +//! # Performance +//! +//! For production deployments on Linux, we recommend using +//! 
[jemalloc](https://crates.io/crates/tikv-jemallocator) as the global allocator. +//! The default glibc allocator (ptmalloc2) can cause RSS bloat and fragmentation under +//! sustained write loads due to repeated same-size alloc/free cycles in Arrow batch building. +//! jemalloc's thread-local size-class bins handle this pattern efficiently. +//! +//! ```toml +//! [target.'cfg(not(target_env = "msvc"))'.dependencies] +//! tikv-jemallocator = "0.6" +//! ``` +//! +//! ```rust,ignore +//! #[cfg(not(target_env = "msvc"))] +//! #[global_allocator] +//! static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; +//! ``` + +pub mod client; +pub mod metadata; +pub mod record; +pub mod row; +pub mod rpc; + +mod cluster; +pub use cluster::{ServerNode, ServerType}; + +pub mod config; +pub mod error; +pub mod metrics; + +mod bucketing; +mod compression; +pub mod io; +mod util; + +#[cfg(test)] +mod test_utils; + +pub type TableId = i64; +pub type PartitionId = i64; +pub type BucketId = i32; + +pub mod proto { + // generated from the canonical proto; its doc comments aren't clippy-clean + #![allow(clippy::doc_lazy_continuation)] + include!(concat!(env!("OUT_DIR"), "/fluss.rs")); +} diff --git a/fluss-rust/crates/fluss/src/metadata/data_lake_format.rs b/fluss-rust/crates/fluss/src/metadata/data_lake_format.rs new file mode 100644 index 0000000000..77e5ad3c1c --- /dev/null +++ b/fluss-rust/crates/fluss/src/metadata/data_lake_format.rs @@ -0,0 +1,64 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use strum_macros::{Display, EnumString}; + +/// Identifies the logical format of a data lake table supported by Fluss. +/// +/// This enum is typically used in metadata and configuration to distinguish +/// between different table formats so that the appropriate integration and +/// semantics can be applied. +#[derive(Debug, EnumString, Display, PartialEq)] +#[strum(ascii_case_insensitive)] +pub enum DataLakeFormat { + #[strum(serialize = "paimon")] + Paimon, + + #[strum(serialize = "lance")] + Lance, + + #[strum(serialize = "iceberg")] + Iceberg, +} + +#[cfg(test)] +mod tests { + use crate::metadata::DataLakeFormat; + use crate::metadata::DataLakeFormat::{Iceberg, Lance, Paimon}; + + #[test] + fn test_parse() { + let cases = vec![ + ("paimon", Paimon), + ("Paimon", Paimon), + ("PAIMON", Paimon), + ("lance", Lance), + ("LANCE", Lance), + ("iceberg", Iceberg), + ("ICEBERG", Iceberg), + ]; + + for (raw, expected) in cases { + let parsed = raw.parse::().unwrap(); + assert_eq!(parsed, expected, "failed to parse: {raw}"); + } + + // negative cases + assert!("unknown".parse::().is_err()); + assert!("".parse::().is_err()); + } +} diff --git a/fluss-rust/crates/fluss/src/metadata/database.rs b/fluss-rust/crates/fluss/src/metadata/database.rs new file mode 100644 index 0000000000..15fefb5496 --- /dev/null +++ b/fluss-rust/crates/fluss/src/metadata/database.rs @@ -0,0 +1,239 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::error::Error::JsonSerdeError; +use crate::error::Result; +use crate::metadata::JsonSerde; +use serde::{Deserialize, Serialize}; +use serde_json::{Value, json}; +use std::collections::HashMap; + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub struct DatabaseDescriptor { + comment: Option, + custom_properties: HashMap, +} + +#[derive(Debug, Clone)] +pub struct DatabaseInfo { + database_name: String, + database_descriptor: DatabaseDescriptor, + created_time: i64, + modified_time: i64, +} + +impl DatabaseInfo { + pub fn new( + database_name: String, + database_descriptor: DatabaseDescriptor, + created_time: i64, + modified_time: i64, + ) -> Self { + Self { + database_name, + database_descriptor, + created_time, + modified_time, + } + } + + pub fn database_name(&self) -> &str { + &self.database_name + } + + pub fn database_descriptor(&self) -> &DatabaseDescriptor { + &self.database_descriptor + } + + pub fn created_time(&self) -> i64 { + self.created_time + } + + pub fn modified_time(&self) -> i64 { + self.modified_time + } +} + +#[derive(Debug, Default)] +pub struct DatabaseDescriptorBuilder { + comment: Option, + custom_properties: HashMap, +} + +impl DatabaseDescriptor { + pub fn builder() -> DatabaseDescriptorBuilder { + 
DatabaseDescriptorBuilder::default() + } + + pub fn comment(&self) -> Option<&str> { + self.comment.as_deref() + } + + pub fn custom_properties(&self) -> &HashMap { + &self.custom_properties + } +} + +impl DatabaseDescriptorBuilder { + pub fn comment>(mut self, comment: C) -> Self { + self.comment = Some(comment.into()); + self + } + + pub fn custom_properties, V: Into>( + mut self, + properties: HashMap, + ) -> Self { + for (k, v) in properties { + self.custom_properties.insert(k.into(), v.into()); + } + self + } + + pub fn custom_property, V: Into>(mut self, key: K, value: V) -> Self { + self.custom_properties.insert(key.into(), value.into()); + self + } + + pub fn build(self) -> DatabaseDescriptor { + DatabaseDescriptor { + comment: self.comment, + custom_properties: self.custom_properties, + } + } +} + +impl DatabaseDescriptor { + const CUSTOM_PROPERTIES_NAME: &'static str = "custom_properties"; + const COMMENT_NAME: &'static str = "comment"; + const VERSION_KEY: &'static str = "version"; + const VERSION: u32 = 1; +} + +impl JsonSerde for DatabaseDescriptor { + fn serialize_json(&self) -> Result { + let mut obj = serde_json::Map::new(); + + // Serialize version + obj.insert(Self::VERSION_KEY.to_string(), json!(Self::VERSION)); + + // Serialize comment if present + if let Some(comment) = self.comment() { + obj.insert(Self::COMMENT_NAME.to_string(), json!(comment)); + } + + // Serialize custom properties + obj.insert( + Self::CUSTOM_PROPERTIES_NAME.to_string(), + json!(self.custom_properties()), + ); + + Ok(Value::Object(obj)) + } + + fn deserialize_json(node: &Value) -> Result { + let mut builder = DatabaseDescriptor::builder(); + + // Deserialize comment if present + if let Some(comment_node) = node.get(Self::COMMENT_NAME) { + let comment = comment_node + .as_str() + .ok_or_else(|| JsonSerdeError { + message: format!("{} should be a string", Self::COMMENT_NAME), + })? 
+ .to_owned(); + builder = builder.comment(&comment); + } + + // Deserialize custom properties directly + let custom_properties = if let Some(props_node) = node.get(Self::CUSTOM_PROPERTIES_NAME) { + let obj = props_node.as_object().ok_or_else(|| JsonSerdeError { + message: "Custom properties should be an object".to_string(), + })?; + + let mut properties = HashMap::with_capacity(obj.len()); + for (key, value) in obj { + properties.insert( + key.clone(), + value + .as_str() + .ok_or_else(|| JsonSerdeError { + message: "Property value should be a string".to_string(), + })? + .to_owned(), + ); + } + properties + } else { + HashMap::new() + }; + builder = builder.custom_properties(custom_properties); + + Ok(builder.build()) + } +} + +impl DatabaseDescriptor { + /// Create DatabaseDescriptor from JSON bytes (equivalent to Java's fromJsonBytes) + pub fn from_json_bytes(bytes: &[u8]) -> Result { + let json_value: Value = serde_json::from_slice(bytes).map_err(|e| JsonSerdeError { + message: format!("Failed to parse JSON: {e}"), + })?; + Self::deserialize_json(&json_value) + } + + /// Convert DatabaseDescriptor to JSON bytes + pub fn to_json_bytes(&self) -> Result> { + let json_value = self.serialize_json()?; + serde_json::to_vec(&json_value).map_err(|e| JsonSerdeError { + message: format!("Failed to serialize to JSON: {e}"), + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_database_descriptor_json_serde() { + let mut custom_props = HashMap::new(); + custom_props.insert("key1".to_string(), "value1".to_string()); + custom_props.insert("key2".to_string(), "value2".to_string()); + + let descriptor = DatabaseDescriptor::builder() + .comment("Test database") + .custom_properties(custom_props) + .build(); + + // Test serialization + let json_bytes = descriptor.to_json_bytes().unwrap(); + println!("Serialized JSON: {}", String::from_utf8_lossy(&json_bytes)); + + // Test deserialization + let deserialized = 
DatabaseDescriptor::from_json_bytes(&json_bytes).unwrap(); + assert_eq!(descriptor, deserialized); + } + + #[test] + fn test_empty_database_descriptor() { + let descriptor = DatabaseDescriptor::builder().build(); + let json_bytes = descriptor.to_json_bytes().unwrap(); + let deserialized = DatabaseDescriptor::from_json_bytes(&json_bytes).unwrap(); + assert_eq!(descriptor, deserialized); + } +} diff --git a/fluss-rust/crates/fluss/src/metadata/datatype.rs b/fluss-rust/crates/fluss/src/metadata/datatype.rs new file mode 100644 index 0000000000..60a44ba718 --- /dev/null +++ b/fluss-rust/crates/fluss/src/metadata/datatype.rs @@ -0,0 +1,1992 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::error::Error::IllegalArgument; +use crate::error::Result; +use serde::{Deserialize, Serialize}; +use std::fmt::{Display, Formatter}; + +/// Data type for Fluss table. 
+/// Impl reference: +#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub enum DataType { + Boolean(BooleanType), + TinyInt(TinyIntType), + SmallInt(SmallIntType), + Int(IntType), + BigInt(BigIntType), + Float(FloatType), + Double(DoubleType), + Char(CharType), + String(StringType), + Decimal(DecimalType), + Date(DateType), + Time(TimeType), + Timestamp(TimestampType), + TimestampLTz(TimestampLTzType), + Bytes(BytesType), + Binary(BinaryType), + Array(ArrayType), + Map(MapType), + Row(RowType), +} + +impl DataType { + pub fn is_nullable(&self) -> bool { + match self { + DataType::Boolean(v) => v.nullable, + DataType::TinyInt(v) => v.nullable, + DataType::SmallInt(v) => v.nullable, + DataType::Int(v) => v.nullable, + DataType::BigInt(v) => v.nullable, + DataType::Decimal(v) => v.nullable, + DataType::Double(v) => v.nullable, + DataType::Float(v) => v.nullable, + DataType::Binary(v) => v.nullable, + DataType::Char(v) => v.nullable, + DataType::String(v) => v.nullable, + DataType::Date(v) => v.nullable, + DataType::TimestampLTz(v) => v.nullable, + DataType::Time(v) => v.nullable, + DataType::Timestamp(v) => v.nullable, + DataType::Array(v) => v.nullable, + DataType::Map(v) => v.nullable, + DataType::Row(v) => v.nullable, + DataType::Bytes(v) => v.nullable, + } + } + + pub fn as_non_nullable(&self) -> Self { + match self { + DataType::Boolean(v) => DataType::Boolean(v.as_non_nullable()), + DataType::TinyInt(v) => DataType::TinyInt(v.as_non_nullable()), + DataType::SmallInt(v) => DataType::SmallInt(v.as_non_nullable()), + DataType::Int(v) => DataType::Int(v.as_non_nullable()), + DataType::BigInt(v) => DataType::BigInt(v.as_non_nullable()), + DataType::Decimal(v) => DataType::Decimal(v.as_non_nullable()), + DataType::Double(v) => DataType::Double(v.as_non_nullable()), + DataType::Float(v) => DataType::Float(v.as_non_nullable()), + DataType::Binary(v) => DataType::Binary(v.as_non_nullable()), + DataType::Char(v) => 
DataType::Char(v.as_non_nullable()), + DataType::String(v) => DataType::String(v.as_non_nullable()), + DataType::Date(v) => DataType::Date(v.as_non_nullable()), + DataType::TimestampLTz(v) => DataType::TimestampLTz(v.as_non_nullable()), + DataType::Time(v) => DataType::Time(v.as_non_nullable()), + DataType::Timestamp(v) => DataType::Timestamp(v.as_non_nullable()), + DataType::Array(v) => DataType::Array(v.as_non_nullable()), + DataType::Map(v) => DataType::Map(v.as_non_nullable()), + DataType::Row(v) => DataType::Row(v.as_non_nullable()), + DataType::Bytes(v) => DataType::Bytes(v.as_non_nullable()), + } + } + + /// Structural equality ignoring the outermost nullability flag at + /// every level. Equivalent to comparing `as_non_nullable()` on both + /// sides but without the recursive clone. + pub(crate) fn eq_ignore_nullable(&self, other: &DataType) -> bool { + match self { + DataType::Boolean(_) => matches!(other, DataType::Boolean(_)), + DataType::TinyInt(_) => matches!(other, DataType::TinyInt(_)), + DataType::SmallInt(_) => matches!(other, DataType::SmallInt(_)), + DataType::Int(_) => matches!(other, DataType::Int(_)), + DataType::BigInt(_) => matches!(other, DataType::BigInt(_)), + DataType::Float(_) => matches!(other, DataType::Float(_)), + DataType::Double(_) => matches!(other, DataType::Double(_)), + DataType::Date(_) => matches!(other, DataType::Date(_)), + DataType::String(_) => matches!(other, DataType::String(_)), + DataType::Bytes(_) => matches!(other, DataType::Bytes(_)), + DataType::Char(a) => { + matches!(other, DataType::Char(b) if a.length() == b.length()) + } + DataType::Binary(a) => { + matches!(other, DataType::Binary(b) if a.length() == b.length()) + } + DataType::Decimal(a) => matches!( + other, + DataType::Decimal(b) if a.precision() == b.precision() && a.scale() == b.scale() + ), + DataType::Time(a) => { + matches!(other, DataType::Time(b) if a.precision() == b.precision()) + } + DataType::Timestamp(a) => { + matches!(other, 
DataType::Timestamp(b) if a.precision() == b.precision()) + } + DataType::TimestampLTz(a) => { + matches!(other, DataType::TimestampLTz(b) if a.precision() == b.precision()) + } + DataType::Array(a) => matches!( + other, + DataType::Array(b) if a.get_element_type().eq_ignore_nullable(b.get_element_type()) + ), + DataType::Map(a) => matches!( + other, + DataType::Map(b) + if a.key_type().eq_ignore_nullable(b.key_type()) + && a.value_type().eq_ignore_nullable(b.value_type()) + ), + DataType::Row(a) => matches!( + other, + DataType::Row(b) if a.fields().len() == b.fields().len() + && a.fields().iter().zip(b.fields().iter()).all(|(x, y)| { + x.name() == y.name() && x.data_type().eq_ignore_nullable(y.data_type()) + }) + ), + } + } +} + +impl Display for DataType { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + DataType::Boolean(v) => write!(f, "{v}"), + DataType::TinyInt(v) => write!(f, "{v}"), + DataType::SmallInt(v) => write!(f, "{v}"), + DataType::Int(v) => write!(f, "{v}"), + DataType::BigInt(v) => write!(f, "{v}"), + DataType::Float(v) => write!(f, "{v}"), + DataType::Double(v) => write!(f, "{v}"), + DataType::Char(v) => write!(f, "{v}"), + DataType::String(v) => write!(f, "{v}"), + DataType::Decimal(v) => write!(f, "{v}"), + DataType::Date(v) => write!(f, "{v}"), + DataType::Time(v) => write!(f, "{v}"), + DataType::Timestamp(v) => write!(f, "{v}"), + DataType::TimestampLTz(v) => write!(f, "{v}"), + DataType::Bytes(v) => write!(f, "{v}"), + DataType::Binary(v) => write!(f, "{v}"), + DataType::Array(v) => write!(f, "{v}"), + DataType::Map(v) => write!(f, "{v}"), + DataType::Row(v) => write!(f, "{v}"), + } + } +} + +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] +pub struct BooleanType { + nullable: bool, +} + +impl Default for BooleanType { + fn default() -> Self { + Self::new() + } +} + +impl BooleanType { + pub fn new() -> Self { + Self::with_nullable(true) + } + + pub fn with_nullable(nullable: 
bool) -> Self { + Self { nullable } + } + + pub fn as_non_nullable(&self) -> Self { + Self::with_nullable(false) + } +} + +impl Display for BooleanType { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "BOOLEAN")?; + if !self.nullable { + write!(f, " NOT NULL")?; + } + Ok(()) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] +pub struct TinyIntType { + nullable: bool, +} + +impl Default for TinyIntType { + fn default() -> Self { + Self::new() + } +} + +impl TinyIntType { + pub fn new() -> Self { + Self::with_nullable(true) + } + + pub fn with_nullable(nullable: bool) -> Self { + Self { nullable } + } + + pub fn as_non_nullable(&self) -> Self { + Self::with_nullable(false) + } +} + +impl Display for TinyIntType { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "TINYINT")?; + if !self.nullable { + write!(f, " NOT NULL")?; + } + Ok(()) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] +pub struct SmallIntType { + nullable: bool, +} + +impl Default for SmallIntType { + fn default() -> Self { + Self::new() + } +} + +impl SmallIntType { + pub fn new() -> Self { + Self::with_nullable(true) + } + + pub fn with_nullable(nullable: bool) -> Self { + Self { nullable } + } + + pub fn as_non_nullable(&self) -> Self { + Self::with_nullable(false) + } +} + +impl Display for SmallIntType { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "SMALLINT")?; + if !self.nullable { + write!(f, " NOT NULL")?; + } + Ok(()) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] +pub struct IntType { + nullable: bool, +} + +impl Default for IntType { + fn default() -> Self { + Self::new() + } +} + +impl IntType { + pub fn new() -> Self { + Self::with_nullable(true) + } + + pub fn with_nullable(nullable: bool) -> Self { + Self { nullable } + } + + pub fn as_non_nullable(&self) -> Self { + 
Self::with_nullable(false) + } +} + +impl Display for IntType { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "INT")?; + if !self.nullable { + write!(f, " NOT NULL")?; + } + Ok(()) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] +pub struct BigIntType { + nullable: bool, +} + +impl Default for BigIntType { + fn default() -> Self { + Self::new() + } +} + +impl BigIntType { + pub fn new() -> Self { + Self::with_nullable(true) + } + + pub fn with_nullable(nullable: bool) -> Self { + Self { nullable } + } + + pub fn as_non_nullable(&self) -> Self { + Self::with_nullable(false) + } +} + +impl Display for BigIntType { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "BIGINT")?; + if !self.nullable { + write!(f, " NOT NULL")?; + } + Ok(()) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] +pub struct FloatType { + nullable: bool, +} + +impl Default for FloatType { + fn default() -> Self { + Self::new() + } +} + +impl FloatType { + pub fn new() -> Self { + Self::with_nullable(true) + } + + pub fn with_nullable(nullable: bool) -> Self { + Self { nullable } + } + + pub fn as_non_nullable(&self) -> Self { + Self::with_nullable(false) + } +} + +impl Display for FloatType { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "FLOAT")?; + if !self.nullable { + write!(f, " NOT NULL")?; + } + Ok(()) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] +pub struct DoubleType { + nullable: bool, +} + +impl Default for DoubleType { + fn default() -> Self { + Self::new() + } +} + +impl DoubleType { + pub fn new() -> Self { + Self::with_nullable(true) + } + + pub fn with_nullable(nullable: bool) -> Self { + Self { nullable } + } + + pub fn as_non_nullable(&self) -> Self { + Self::with_nullable(false) + } +} + +impl Display for DoubleType { + fn fmt(&self, f: &mut Formatter<'_>) -> 
std::fmt::Result { + write!(f, "DOUBLE")?; + if !self.nullable { + write!(f, " NOT NULL")?; + } + Ok(()) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] +pub struct CharType { + nullable: bool, + length: u32, +} + +impl CharType { + pub fn new(length: u32) -> Self { + Self::with_nullable(length, true) + } + + pub fn with_nullable(length: u32, nullable: bool) -> Self { + Self { nullable, length } + } + + pub fn as_non_nullable(&self) -> Self { + Self::with_nullable(self.length, false) + } + + pub fn length(&self) -> u32 { + self.length + } +} + +impl Default for CharType { + fn default() -> Self { + Self::new(1) + } +} + +impl Display for CharType { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "CHAR({})", self.length)?; + if !self.nullable { + write!(f, " NOT NULL")?; + } + Ok(()) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] +pub struct StringType { + nullable: bool, +} + +impl Default for StringType { + fn default() -> Self { + Self::new() + } +} + +impl StringType { + pub fn new() -> Self { + Self::with_nullable(true) + } + + pub fn with_nullable(nullable: bool) -> Self { + Self { nullable } + } + + pub fn as_non_nullable(&self) -> Self { + Self::with_nullable(false) + } +} + +impl Display for StringType { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "STRING")?; + if !self.nullable { + write!(f, " NOT NULL")?; + } + Ok(()) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] +pub struct DecimalType { + nullable: bool, + precision: u32, + scale: u32, +} + +impl DecimalType { + pub const MIN_PRECISION: u32 = 1; + + pub const MAX_PRECISION: u32 = 38; + + pub const DEFAULT_PRECISION: u32 = 10; + + pub const MIN_SCALE: u32 = 0; + + pub const DEFAULT_SCALE: u32 = 0; + + pub fn new(precision: u32, scale: u32) -> Result { + Self::with_nullable(true, precision, scale) + } + + 
/// Create a DecimalType with validation, returning an error if parameters are invalid. + pub fn with_nullable(nullable: bool, precision: u32, scale: u32) -> Result { + // Validate precision + if !(Self::MIN_PRECISION..=Self::MAX_PRECISION).contains(&precision) { + return Err(IllegalArgument { + message: format!( + "Decimal precision must be between {} and {} (both inclusive), got: {}", + Self::MIN_PRECISION, + Self::MAX_PRECISION, + precision + ), + }); + } + // Validate scale + if scale > precision { + return Err(IllegalArgument { + message: format!( + "Decimal scale must be between {} and the precision {} (both inclusive), got: {}", + Self::MIN_SCALE, + precision, + scale + ), + }); + } + Ok(DecimalType { + nullable, + precision, + scale, + }) + } + + pub fn precision(&self) -> u32 { + self.precision + } + + pub fn scale(&self) -> u32 { + self.scale + } + + pub fn as_non_nullable(&self) -> Self { + Self::with_nullable(false, self.precision, self.scale) + .expect("Invalid decimal precision or scale") + } +} + +impl Default for DecimalType { + fn default() -> Self { + Self::new(Self::DEFAULT_PRECISION, Self::DEFAULT_SCALE) + .expect("Invalid default decimal precision or scale") + } +} + +impl Display for DecimalType { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "DECIMAL({}, {})", self.precision, self.scale)?; + if !self.nullable { + write!(f, " NOT NULL")?; + } + Ok(()) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] +pub struct DateType { + nullable: bool, +} + +impl Default for DateType { + fn default() -> Self { + Self::new() + } +} + +impl DateType { + pub fn new() -> Self { + Self::with_nullable(true) + } + + pub fn with_nullable(nullable: bool) -> Self { + Self { nullable } + } + + pub fn as_non_nullable(&self) -> Self { + Self::with_nullable(false) + } +} + +impl Display for DateType { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "DATE")?; + if 
!self.nullable { + write!(f, " NOT NULL")?; + } + Ok(()) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] +pub struct TimeType { + nullable: bool, + precision: u32, +} + +impl Default for TimeType { + fn default() -> Self { + Self::new(Self::DEFAULT_PRECISION).expect("Invalid default time precision") + } +} + +impl TimeType { + pub const MIN_PRECISION: u32 = 0; + + pub const MAX_PRECISION: u32 = 9; + + pub const DEFAULT_PRECISION: u32 = 0; + + pub fn new(precision: u32) -> Result { + Self::with_nullable(true, precision) + } + + /// Create a TimeType with validation, returning an error if precision is invalid. + pub fn with_nullable(nullable: bool, precision: u32) -> Result { + // Validate precision + if !(Self::MIN_PRECISION..=Self::MAX_PRECISION).contains(&precision) { + return Err(IllegalArgument { + message: format!( + "Time precision must be between {} and {} (both inclusive), got: {}", + Self::MIN_PRECISION, + Self::MAX_PRECISION, + precision + ), + }); + } + Ok(TimeType { + nullable, + precision, + }) + } + + pub fn precision(&self) -> u32 { + self.precision + } + + pub fn as_non_nullable(&self) -> Self { + Self::with_nullable(false, self.precision).expect("Invalid time precision") + } +} + +impl Display for TimeType { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "TIME({})", self.precision)?; + if !self.nullable { + write!(f, " NOT NULL")?; + } + Ok(()) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] +pub struct TimestampType { + nullable: bool, + precision: u32, +} + +impl Default for TimestampType { + fn default() -> Self { + Self::new(Self::DEFAULT_PRECISION).expect("Invalid default timestamp precision") + } +} + +impl TimestampType { + pub const MIN_PRECISION: u32 = 0; + + pub const MAX_PRECISION: u32 = 9; + + pub const DEFAULT_PRECISION: u32 = 6; + + pub fn new(precision: u32) -> Result { + Self::with_nullable(true, precision) + } + 
+ /// Create a TimestampType with validation, returning an error if precision is invalid. + pub fn with_nullable(nullable: bool, precision: u32) -> Result { + // Validate precision + if !(Self::MIN_PRECISION..=Self::MAX_PRECISION).contains(&precision) { + return Err(IllegalArgument { + message: format!( + "Timestamp precision must be between {} and {} (both inclusive), got: {}", + Self::MIN_PRECISION, + Self::MAX_PRECISION, + precision + ), + }); + } + Ok(TimestampType { + nullable, + precision, + }) + } + + pub fn precision(&self) -> u32 { + self.precision + } + + pub fn as_non_nullable(&self) -> Self { + Self::with_nullable(false, self.precision).expect("Invalid timestamp precision") + } +} + +impl Display for TimestampType { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "TIMESTAMP({})", self.precision)?; + if !self.nullable { + write!(f, " NOT NULL")?; + } + Ok(()) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] +pub struct TimestampLTzType { + nullable: bool, + precision: u32, +} + +impl Default for TimestampLTzType { + fn default() -> Self { + Self::new(Self::DEFAULT_PRECISION) + .expect("Invalid default timestamp with local time zone precision") + } +} + +impl TimestampLTzType { + pub const MIN_PRECISION: u32 = 0; + + pub const MAX_PRECISION: u32 = 9; + + pub const DEFAULT_PRECISION: u32 = 6; + + pub fn new(precision: u32) -> Result { + Self::with_nullable(true, precision) + } + + /// Create a TimestampLTzType with validation, returning an error if precision is invalid. 
+ pub fn with_nullable(nullable: bool, precision: u32) -> Result { + // Validate precision + if !(Self::MIN_PRECISION..=Self::MAX_PRECISION).contains(&precision) { + return Err(IllegalArgument { + message: format!( + "Timestamp with local time zone precision must be between {} and {} (both inclusive), got: {}", + Self::MIN_PRECISION, + Self::MAX_PRECISION, + precision + ), + }); + } + Ok(TimestampLTzType { + nullable, + precision, + }) + } + + pub fn precision(&self) -> u32 { + self.precision + } + + pub fn as_non_nullable(&self) -> Self { + Self::with_nullable(false, self.precision) + .expect("Invalid timestamp with local time zone precision") + } +} + +impl Display for TimestampLTzType { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "TIMESTAMP_LTZ({})", self.precision)?; + if !self.nullable { + write!(f, " NOT NULL")?; + } + Ok(()) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] +pub struct BytesType { + nullable: bool, +} + +impl Default for BytesType { + fn default() -> Self { + Self::new() + } +} + +impl BytesType { + pub const fn new() -> Self { + Self::with_nullable(true) + } + + pub const fn with_nullable(nullable: bool) -> Self { + Self { nullable } + } + + pub fn as_non_nullable(&self) -> Self { + Self::with_nullable(false) + } +} + +impl Display for BytesType { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "BYTES")?; + if !self.nullable { + write!(f, " NOT NULL")?; + } + Ok(()) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] +pub struct BinaryType { + nullable: bool, + length: usize, +} + +impl BinaryType { + pub const MIN_LENGTH: usize = 1; + + pub const MAX_LENGTH: usize = usize::MAX; + + pub const DEFAULT_LENGTH: usize = 1; + + pub fn new(length: usize) -> Self { + Self::with_nullable(true, length) + } + + pub fn with_nullable(nullable: bool, length: usize) -> Self { + Self { nullable, length } + } + + 
pub fn length(&self) -> usize { + self.length + } + + pub fn as_non_nullable(&self) -> Self { + Self::with_nullable(false, self.length) + } +} + +impl Default for BinaryType { + fn default() -> Self { + Self::new(Self::DEFAULT_LENGTH) + } +} + +impl Display for BinaryType { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "BINARY({})", self.length)?; + if !self.nullable { + write!(f, " NOT NULL")?; + } + Ok(()) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub struct ArrayType { + nullable: bool, + element_type: Box, +} + +impl ArrayType { + pub fn new(element_type: DataType) -> Self { + Self::with_nullable(true, element_type) + } + + pub fn with_nullable(nullable: bool, element_type: DataType) -> Self { + Self { + nullable, + element_type: Box::new(element_type), + } + } + + pub fn as_non_nullable(&self) -> Self { + Self { + nullable: false, + element_type: self.element_type.clone(), + } + } + + pub fn get_element_type(&self) -> &DataType { + &self.element_type + } +} + +impl Display for ArrayType { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "ARRAY<{}>", self.element_type)?; + if !self.nullable { + write!(f, " NOT NULL")?; + } + Ok(()) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Hash)] +pub struct MapType { + nullable: bool, + key_type: Box, + value_type: Box, +} + +// Route Deserialize through `with_nullable` so a Serde-built MapType +// collapses to the same canonical form as the constructor (otherwise +// equivalent maps disagree under `PartialEq`). 
+impl<'de> Deserialize<'de> for MapType { + fn deserialize(deserializer: D) -> std::result::Result + where + D: serde::Deserializer<'de>, + { + #[derive(Deserialize)] + struct Raw { + nullable: bool, + key_type: Box, + value_type: Box, + } + let raw = Raw::deserialize(deserializer)?; + Ok(MapType::with_nullable( + raw.nullable, + *raw.key_type, + *raw.value_type, + )) + } +} + +impl MapType { + pub fn new(key_type: DataType, value_type: DataType) -> Self { + Self::with_nullable(true, key_type, value_type) + } + + pub fn with_nullable(nullable: bool, key_type: DataType, value_type: DataType) -> Self { + Self { + nullable, + key_type: Box::new(key_type.as_non_nullable()), + value_type: Box::new(value_type), + } + } + + pub fn as_non_nullable(&self) -> Self { + Self { + nullable: false, + key_type: self.key_type.clone(), + value_type: self.value_type.clone(), + } + } + + pub fn key_type(&self) -> &DataType { + &self.key_type + } + + pub fn value_type(&self) -> &DataType { + &self.value_type + } +} + +impl Display for MapType { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "MAP<{}, {}>", self.key_type, self.value_type)?; + if !self.nullable { + write!(f, " NOT NULL")?; + } + Ok(()) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize, Hash)] +pub struct RowType { + nullable: bool, + fields: Vec, +} + +impl RowType { + pub const fn new(fields: Vec) -> Self { + Self::with_nullable(true, fields) + } + + pub const fn with_nullable(nullable: bool, fields: Vec) -> Self { + Self { nullable, fields } + } + + pub fn as_non_nullable(&self) -> Self { + Self::with_nullable(false, self.fields.clone()) + } + + pub fn fields(&self) -> &Vec { + &self.fields + } + + pub fn get_field_index(&self, field_name: &str) -> Option { + self.fields.iter().position(|f| f.name == field_name) + } + + pub fn field_types(&self) -> impl Iterator + '_ { + self.fields.iter().map(|f| &f.data_type) + } + + pub fn get_field_names(&self) -> Vec<&str> { + 
self.fields.iter().map(|f| f.name.as_str()).collect() + } + + pub fn project_with_field_names(&self, field_names: &[String]) -> Result { + let indices: Vec = field_names + .iter() + .map(|name| { + self.get_field_index(name).ok_or_else(|| IllegalArgument { + message: format!("Field '{name}' does not exist in the row type"), + }) + }) + .collect::>>()?; + + self.project(indices.as_slice()) + } + + pub fn project(&self, project_field_positions: &[usize]) -> Result { + Ok(RowType::with_nullable( + self.nullable, + project_field_positions + .iter() + .map(|pos| { + self.fields + .get(*pos) + .cloned() + .ok_or_else(|| IllegalArgument { + message: format!("invalid field position: {}", *pos), + }) + }) + .collect::>>()?, + )) + } + + #[cfg(test)] + pub fn with_data_types(data_types: Vec) -> Self { + let mut fields: Vec = Vec::new(); + data_types.iter().enumerate().for_each(|(idx, data_type)| { + fields.push(DataField::new(format!("f{idx}"), data_type.clone(), None)); + }); + + Self::with_nullable(true, fields) + } + + #[cfg(test)] + pub fn with_data_types_and_field_names( + data_types: Vec, + field_names: Vec<&str>, + ) -> Self { + let fields = data_types + .into_iter() + .zip(field_names) + .map(|(data_type, field_name)| { + DataField::new(field_name.to_string(), data_type.clone(), None) + }) + .collect::>(); + + Self::with_nullable(true, fields) + } +} + +impl Display for RowType { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "ROW<")?; + for (i, field) in self.fields.iter().enumerate() { + if i > 0 { + write!(f, ", ")?; + } + write!(f, "{field}")?; + } + write!(f, ">")?; + if !self.nullable { + write!(f, " NOT NULL")?; + } + Ok(()) + } +} + +pub struct DataTypes; + +impl DataTypes { + pub fn binary(length: usize) -> DataType { + DataType::Binary(BinaryType::new(length)) + } + + pub const fn bytes() -> DataType { + DataType::Bytes(BytesType::new()) + } + + pub fn boolean() -> DataType { + DataType::Boolean(BooleanType::new()) + } + + pub fn 
int() -> DataType { + DataType::Int(IntType::new()) + } + + /// Data type of a 1-byte signed integer with values from -128 to 127. + pub fn tinyint() -> DataType { + DataType::TinyInt(TinyIntType::new()) + } + + /// Data type of a 2-byte signed integer with values from -32,768 to 32,767. + pub fn smallint() -> DataType { + DataType::SmallInt(SmallIntType::new()) + } + + pub fn bigint() -> DataType { + DataType::BigInt(BigIntType::new()) + } + + /// Data type of a 4-byte single precision floating point number. + pub fn float() -> DataType { + DataType::Float(FloatType::new()) + } + + /// Data type of an 8-byte double precision floating point number. + pub fn double() -> DataType { + DataType::Double(DoubleType::new()) + } + + pub fn char(length: u32) -> DataType { + DataType::Char(CharType::new(length)) + } + + /// Data type of a variable-length character string. + pub fn string() -> DataType { + DataType::String(StringType::new()) + } + + /// Data type of a decimal number with fixed precision and scale `DECIMAL(p, s)` where + /// `p` is the number of digits in a number (=precision) and `s` is the number of + /// digits to the right of the decimal point in a number (=scale). `p` must have a value + /// between 1 and 38 (both inclusive). `s` must have a value between 0 and `p` (both inclusive). + pub fn decimal(precision: u32, scale: u32) -> DataType { + DataType::Decimal(DecimalType::new(precision, scale).expect("Invalid decimal parameters")) + } + + pub fn date() -> DataType { + DataType::Date(DateType::new()) + } + + /// Data type of a time WITHOUT time zone `TIME` with no fractional seconds by default. + pub fn time() -> DataType { + DataType::Time(TimeType::default()) + } + + /// Data type of a time WITHOUT time zone `TIME(p)` where `p` is the number of digits + /// of fractional seconds (=precision). `p` must have a value between 0 and 9 (both inclusive). 
+ pub fn time_with_precision(precision: u32) -> DataType { + DataType::Time(TimeType::new(precision).expect("Invalid time precision")) + } + + /// Data type of a timestamp WITHOUT time zone `TIMESTAMP` with 6 digits of fractional + /// seconds by default. + pub fn timestamp() -> DataType { + DataType::Timestamp(TimestampType::default()) + } + + /// Data type of a timestamp WITHOUT time zone `TIMESTAMP(p)` where `p` is the number + /// of digits of fractional seconds (=precision). `p` must have a value between 0 and 9 + /// (both inclusive). + pub fn timestamp_with_precision(precision: u32) -> DataType { + DataType::Timestamp(TimestampType::new(precision).expect("Invalid timestamp precision")) + } + + /// Data type of a timestamp WITH time zone `TIMESTAMP WITH TIME ZONE` with 6 digits of + /// fractional seconds by default. + pub fn timestamp_ltz() -> DataType { + DataType::TimestampLTz(TimestampLTzType::default()) + } + + /// Data type of a timestamp WITH time zone `TIMESTAMP WITH TIME ZONE(p)` where `p` is the number + /// of digits of fractional seconds (=precision). `p` must have a value between 0 and 9 (both inclusive). + pub fn timestamp_ltz_with_precision(precision: u32) -> DataType { + DataType::TimestampLTz( + TimestampLTzType::new(precision) + .expect("Invalid timestamp with local time zone precision"), + ) + } + + /// Data type of an array of elements with same subtype. + pub fn array(element: DataType) -> DataType { + DataType::Array(ArrayType::new(element)) + } + + /// Data type of an associative array that maps keys to values. + pub fn map(key_type: DataType, value_type: DataType) -> DataType { + DataType::Map(MapType::new(key_type, value_type)) + } + + /// Field definition with field name and data type. + pub fn field>(name: N, data_type: DataType) -> DataField { + DataField::new(name, data_type, None) + } + + /// Field definition with field name, data type, and a description. 
+ pub fn field_with_description>( + name: N, + data_type: DataType, + description: String, + ) -> DataField { + DataField::new(name, data_type, Some(description)) + } + + /// Data type of a sequence of fields. + pub fn row(fields: Vec) -> DataType { + DataType::Row(RowType::new(fields)) + } + + /// Data type of a sequence of fields with generated field names (f0, f1, f2, ...). + pub fn row_from_types(field_types: Vec) -> DataType { + let fields = field_types + .into_iter() + .enumerate() + .map(|(i, dt)| DataField::new(format!("f{i}"), dt, None)) + .collect(); + DataType::Row(RowType::new(fields)) + } +} + +pub const UNASSIGNED_FIELD_ID: i32 = -1; + +pub fn reassign_field_ids(data_type: &DataType, counter: &mut i32) -> DataType { + match data_type { + DataType::Array(at) => DataType::Array(ArrayType::with_nullable( + at.nullable, + reassign_field_ids(at.get_element_type(), counter), + )), + DataType::Map(mt) => DataType::Map(MapType::with_nullable( + mt.nullable, + reassign_field_ids(mt.key_type(), counter), + reassign_field_ids(mt.value_type(), counter), + )), + DataType::Row(rt) => { + let new_fields: Vec = rt + .fields() + .iter() + .map(|f| { + *counter += 1; + let id = *counter; + let new_inner = reassign_field_ids(&f.data_type, counter); + DataField::with_field_id(f.name.clone(), new_inner, f.description.clone(), id) + }) + .collect(); + DataType::Row(RowType::with_nullable(rt.nullable, new_fields)) + } + _ => data_type.clone(), + } +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct DataField { + pub name: String, + pub data_type: DataType, + pub description: Option, + pub field_id: i32, +} + +// field_id is excluded from PartialEq/Eq/Hash to match Java's DataField.equals/hashCode. 
impl PartialEq for DataField {
    /// Compares name, data type and description; `field_id` takes no part.
    fn eq(&self, other: &Self) -> bool {
        // Exhaustive destructuring keeps the omission of `field_id` explicit.
        let Self {
            name,
            data_type,
            description,
            field_id: _,
        } = self;
        *name == other.name && *data_type == other.data_type && *description == other.description
    }
}

impl Eq for DataField {}

impl std::hash::Hash for DataField {
    /// Hashes exactly the members that `eq` compares, in the same order.
    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
        let Self {
            name,
            data_type,
            description,
            field_id: _,
        } = self;
        name.hash(state);
        data_type.hash(state);
        description.hash(state);
    }
}

impl DataField {
    /// Creates a field whose id is still [`UNASSIGNED_FIELD_ID`].
    pub fn new<N: Into<String>>(
        name: N,
        data_type: DataType,
        description: Option<String>,
    ) -> DataField {
        Self::with_field_id(name, data_type, description, UNASSIGNED_FIELD_ID)
    }

    /// Creates a field with an explicit `field_id`.
    pub fn with_field_id<N: Into<String>>(
        name: N,
        data_type: DataType,
        description: Option<String>,
        field_id: i32,
    ) -> DataField {
        let name = name.into();
        Self {
            name,
            data_type,
            description,
            field_id,
        }
    }

    /// Name of the field.
    pub fn name(&self) -> &str {
        self.name.as_str()
    }

    /// Data type of the field.
    pub fn data_type(&self) -> &DataType {
        &self.data_type
    }

    /// Id of the field.
    pub fn field_id(&self) -> i32 {
        self.field_id
    }
}

impl Display for DataField {
    /// Renders the field as `<name> <data type>`; the description and id are not shown.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        let Self {
            name, data_type, ..
        } = self;
        write!(f, "{name} {data_type}")
    }
}

#[test]
fn test_primitive_types_display() {
    // Test simple primitive types with nullable and non-nullable variants
    assert_eq!(BooleanType::new().to_string(), "BOOLEAN");
    assert_eq!(
        BooleanType::with_nullable(false).to_string(),
        "BOOLEAN NOT NULL"
    );

    assert_eq!(TinyIntType::new().to_string(), "TINYINT");
    assert_eq!(
        TinyIntType::with_nullable(false).to_string(),
        "TINYINT NOT NULL"
    );

    assert_eq!(SmallIntType::new().to_string(), "SMALLINT");
    assert_eq!(
        SmallIntType::with_nullable(false).to_string(),
        "SMALLINT NOT NULL"
    );

    assert_eq!(IntType::new().to_string(), "INT");
    assert_eq!(IntType::with_nullable(false).to_string(), "INT NOT NULL");

    assert_eq!(BigIntType::new().to_string(), "BIGINT");
    assert_eq!(
        BigIntType::with_nullable(false).to_string(),
        "BIGINT NOT NULL"
    );

    // (this test body continues on the following source line)
assert_eq!(FloatType::new().to_string(), "FLOAT"); + assert_eq!( + FloatType::with_nullable(false).to_string(), + "FLOAT NOT NULL" + ); + + assert_eq!(DoubleType::new().to_string(), "DOUBLE"); + assert_eq!( + DoubleType::with_nullable(false).to_string(), + "DOUBLE NOT NULL" + ); + + assert_eq!(StringType::new().to_string(), "STRING"); + assert_eq!( + StringType::with_nullable(false).to_string(), + "STRING NOT NULL" + ); + + assert_eq!(DateType::new().to_string(), "DATE"); + assert_eq!(DateType::with_nullable(false).to_string(), "DATE NOT NULL"); + + assert_eq!(BytesType::new().to_string(), "BYTES"); + assert_eq!( + BytesType::with_nullable(false).to_string(), + "BYTES NOT NULL" + ); +} + +#[test] +fn test_parameterized_types_display() { + // Test types with parameters (length, precision, scale, etc.) + assert_eq!(CharType::new(10).to_string(), "CHAR(10)"); + assert_eq!( + CharType::with_nullable(20, false).to_string(), + "CHAR(20) NOT NULL" + ); + + assert_eq!(BinaryType::new(100).to_string(), "BINARY(100)"); + assert_eq!( + BinaryType::with_nullable(false, 256).to_string(), + "BINARY(256) NOT NULL" + ); + + assert_eq!( + DecimalType::new(10, 2).unwrap().to_string(), + "DECIMAL(10, 2)" + ); + assert_eq!( + DecimalType::with_nullable(false, 38, 10) + .unwrap() + .to_string(), + "DECIMAL(38, 10) NOT NULL" + ); + + assert_eq!(TimeType::new(0).unwrap().to_string(), "TIME(0)"); + assert_eq!(TimeType::new(3).unwrap().to_string(), "TIME(3)"); + assert_eq!( + TimeType::with_nullable(false, 9).unwrap().to_string(), + "TIME(9) NOT NULL" + ); + + assert_eq!(TimestampType::new(6).unwrap().to_string(), "TIMESTAMP(6)"); + assert_eq!(TimestampType::new(0).unwrap().to_string(), "TIMESTAMP(0)"); + assert_eq!( + TimestampType::with_nullable(false, 9).unwrap().to_string(), + "TIMESTAMP(9) NOT NULL" + ); + + assert_eq!( + TimestampLTzType::new(6).unwrap().to_string(), + "TIMESTAMP_LTZ(6)" + ); + assert_eq!( + TimestampLTzType::new(3).unwrap().to_string(), + "TIMESTAMP_LTZ(3)" + ); + 
assert_eq!( + TimestampLTzType::with_nullable(false, 9) + .unwrap() + .to_string(), + "TIMESTAMP_LTZ(9) NOT NULL" + ); +} + +#[test] +fn test_array_display() { + let array_type = ArrayType::new(DataTypes::int()); + assert_eq!(array_type.to_string(), "ARRAY"); + + let array_type_non_null = ArrayType::with_nullable(false, DataTypes::string()); + assert_eq!(array_type_non_null.to_string(), "ARRAY NOT NULL"); + + let nested_array = ArrayType::new(DataTypes::array(DataTypes::int())); + assert_eq!(nested_array.to_string(), "ARRAY>"); +} + +#[test] +fn test_map_display() { + let map_type = MapType::new(DataTypes::string(), DataTypes::int()); + assert_eq!(map_type.to_string(), "MAP"); + + let map_type_non_null = MapType::with_nullable(false, DataTypes::int(), DataTypes::string()); + assert_eq!( + map_type_non_null.to_string(), + "MAP NOT NULL" + ); + + let nested_map = MapType::new( + DataTypes::string(), + DataTypes::map(DataTypes::int(), DataTypes::boolean()), + ); + assert_eq!( + nested_map.to_string(), + "MAP>" + ); +} + +#[test] +fn test_map_deserialize_normalises_key_nullability() { + let json = r#"{ + "nullable": true, + "key_type": {"Int": {"nullable": true}}, + "value_type": {"String": {"nullable": true}} + }"#; + let from_json: MapType = serde_json::from_str(json).expect("deserialize"); + let from_ctor = MapType::new(DataTypes::int(), DataTypes::string()); + assert_eq!(from_json, from_ctor); + assert!(!from_json.key_type().is_nullable()); +} + +#[test] +fn test_map_deserialize_normalises_nested_map_keys() { + let json = r#"{ + "nullable": true, + "key_type": {"String": {"nullable": true}}, + "value_type": {"Map": { + "nullable": true, + "key_type": {"Int": {"nullable": true}}, + "value_type": {"Boolean": {"nullable": true}} + }} + }"#; + let from_json: MapType = serde_json::from_str(json).expect("deserialize"); + let from_ctor = MapType::new( + DataTypes::string(), + DataTypes::map(DataTypes::int(), DataTypes::boolean()), + ); + assert_eq!(from_json, from_ctor); 
+ assert!(!from_json.key_type().is_nullable()); + let inner = match from_json.value_type() { + DataType::Map(m) => m, + other => panic!("expected nested Map, got {other:?}"), + }; + assert!(!inner.key_type().is_nullable()); +} + +#[test] +fn test_row_display() { + let fields = vec![ + DataTypes::field("id", DataTypes::int()), + DataTypes::field("name", DataTypes::string()), + ]; + let row_type = RowType::new(fields); + assert_eq!(row_type.to_string(), "ROW"); + + let fields_non_null = vec![DataTypes::field("age", DataTypes::bigint())]; + let row_type_non_null = RowType::with_nullable(false, fields_non_null); + assert_eq!(row_type_non_null.to_string(), "ROW NOT NULL"); +} + +#[test] +fn test_datatype_display() { + assert_eq!(DataTypes::boolean().to_string(), "BOOLEAN"); + assert_eq!(DataTypes::int().to_string(), "INT"); + assert_eq!(DataTypes::string().to_string(), "STRING"); + assert_eq!(DataTypes::char(50).to_string(), "CHAR(50)"); + assert_eq!(DataTypes::decimal(10, 2).to_string(), "DECIMAL(10, 2)"); + assert_eq!(DataTypes::time_with_precision(3).to_string(), "TIME(3)"); + assert_eq!( + DataTypes::timestamp_with_precision(6).to_string(), + "TIMESTAMP(6)" + ); + assert_eq!( + DataTypes::timestamp_ltz_with_precision(9).to_string(), + "TIMESTAMP_LTZ(9)" + ); + assert_eq!(DataTypes::array(DataTypes::int()).to_string(), "ARRAY"); + assert_eq!( + DataTypes::map(DataTypes::string(), DataTypes::int()).to_string(), + "MAP" + ); +} + +#[test] +fn test_datafield_display() { + let field = DataTypes::field("user_id", DataTypes::bigint()); + assert_eq!(field.to_string(), "user_id BIGINT"); + + let field2 = DataTypes::field("email", DataTypes::string()); + assert_eq!(field2.to_string(), "email STRING"); + + let field3 = DataTypes::field("score", DataTypes::decimal(10, 2)); + assert_eq!(field3.to_string(), "score DECIMAL(10, 2)"); +} + +#[test] +fn test_complex_nested_display() { + let row_type = DataTypes::row(vec![ + DataTypes::field("id", DataTypes::int()), + 
DataTypes::field("tags", DataTypes::array(DataTypes::string())), + DataTypes::field( + "metadata", + DataTypes::map(DataTypes::string(), DataTypes::string()), + ), + ]); + assert_eq!( + row_type.to_string(), + "ROW, metadata MAP>" + ); +} + +#[test] +fn test_non_nullable_datatype() { + let nullable_int = DataTypes::int(); + assert_eq!(nullable_int.to_string(), "INT"); + + let non_nullable_int = nullable_int.as_non_nullable(); + assert_eq!(non_nullable_int.to_string(), "INT NOT NULL"); +} + +#[test] +fn test_deeply_nested_types() { + let nested = DataTypes::array(DataTypes::map( + DataTypes::string(), + DataTypes::row(vec![ + DataTypes::field("x", DataTypes::int()), + DataTypes::field("y", DataTypes::int()), + ]), + )); + assert_eq!( + nested.to_string(), + "ARRAY>>" + ); +} + +// ============================================================================ +// DecimalType validation tests +// ============================================================================ + +#[test] +fn test_decimal_invalid_precision() { + // DecimalType::with_nullable should return an error for invalid precision + let result = DecimalType::with_nullable(true, 50, 2); + assert!(result.is_err()); + assert!( + result + .unwrap_err() + .to_string() + .contains("Decimal precision must be between 1 and 38") + ); +} + +#[test] +fn test_decimal_invalid_scale() { + // DecimalType::with_nullable should return an error when scale > precision + let result = DecimalType::with_nullable(true, 10, 15); + assert!(result.is_err()); + assert!( + result + .unwrap_err() + .to_string() + .contains("Decimal scale must be between 0 and the precision 10") + ); +} + +// ============================================================================ +// DecimalType validation tests - edge cases +// ============================================================================ + +#[test] +fn test_decimal_valid_precision_and_scale() { + // Valid: precision=10, scale=2 + let result = DecimalType::with_nullable(true, 10, 
2); + assert!(result.is_ok()); + let decimal = result.unwrap(); + assert_eq!(decimal.precision(), 10); + assert_eq!(decimal.scale(), 2); + // Nullable: should NOT contain "NOT NULL" + assert!(!decimal.to_string().contains("NOT NULL")); + + // Valid: precision=38, scale=0 + let result = DecimalType::with_nullable(true, 38, 0); + assert!(result.is_ok()); + let decimal = result.unwrap(); + assert_eq!(decimal.precision(), 38); + assert_eq!(decimal.scale(), 0); + + // Valid: precision=1, scale=0 + let result = DecimalType::with_nullable(false, 1, 0); + assert!(result.is_ok()); + let decimal = result.unwrap(); + assert_eq!(decimal.precision(), 1); + assert_eq!(decimal.scale(), 0); + // Non-nullable: should contain "NOT NULL" + assert!(decimal.to_string().contains("NOT NULL")); +} + +#[test] +fn test_decimal_invalid_precision_zero() { + // Invalid: precision=0 (edge case not covered by existing tests) + let result = DecimalType::with_nullable(true, 0, 0); + assert!(result.is_err()); + assert!( + result + .unwrap_err() + .to_string() + .contains("Decimal precision must be between 1 and 38") + ); +} + +#[test] +fn test_decimal_scale_equals_precision_boundary() { + // Boundary: precision=10, scale=10 (scale == precision is valid) + let result = DecimalType::with_nullable(true, 10, 10); + assert!(result.is_ok()); + let decimal = result.unwrap(); + assert_eq!(decimal.precision(), 10); + assert_eq!(decimal.scale(), 10); +} + +// ============================================================================ +// TimeType validation tests +// ============================================================================ + +#[test] +fn test_time_valid_precision() { + // Test all valid precision values 0 through 9 + for precision in 0..=9 { + let result = TimeType::with_nullable(true, precision); + assert!(result.is_ok(), "precision {precision} should be valid"); + let time = result.unwrap(); + assert_eq!(time.precision(), precision); + } +} + +#[test] +fn test_time_invalid_precision() 
{ + // TimeType::with_nullable should return an error for invalid precision + let result = TimeType::with_nullable(true, 10); + assert!(result.is_err()); + assert!( + result + .unwrap_err() + .to_string() + .contains("Time precision must be between 0 and 9") + ); +} + +// ============================================================================ +// TimestampType validation tests +// ============================================================================ + +#[test] +fn test_timestamp_valid_precision() { + // Test all valid precision values 0 through 9 + for precision in 0..=9 { + let result = TimestampType::with_nullable(true, precision); + assert!(result.is_ok(), "precision {precision} should be valid"); + let timestamp_type = result.unwrap(); + assert_eq!(timestamp_type.precision(), precision); + } +} + +#[test] +fn test_timestamp_invalid_precision() { + // TimestampType::with_nullable should return an error for invalid precision + let result = TimestampType::with_nullable(true, 10); + assert!(result.is_err()); + assert!( + result + .unwrap_err() + .to_string() + .contains("Timestamp precision must be between 0 and 9") + ); +} + +#[test] +fn test_timestamp_ltz_invalid_precision() { + // TimestampLTzType::with_nullable should return an error for invalid precision + let result = TimestampLTzType::with_nullable(true, 10); + assert!(result.is_err()); + assert!( + result + .unwrap_err() + .to_string() + .contains("Timestamp with local time zone precision must be between 0 and 9") + ); +} + +// ============================================================================ +// RowType projection tests +// ============================================================================ + +#[test] +fn test_row_type_project_valid_indices() { + // Create a 3-column row type + let row_type = RowType::with_data_types_and_field_names( + vec![DataTypes::int(), DataTypes::string(), DataTypes::bigint()], + vec!["id", "name", "age"], + ); + + // Valid projection by indices: [0, 
2] + let projected = row_type.project(&[0, 2]).unwrap(); + assert_eq!(projected.fields().len(), 2); + assert_eq!(projected.fields()[0].name, "id"); + assert_eq!(projected.fields()[1].name, "age"); +} + +#[test] +fn test_row_type_project_empty_indices() { + // Create a 3-column row type + let row_type = RowType::with_data_types_and_field_names( + vec![DataTypes::int(), DataTypes::string(), DataTypes::bigint()], + vec!["id", "name", "age"], + ); + + // Projection with an empty indices array should yield an empty RowType + let projected = row_type.project(&[]).unwrap(); + assert_eq!(projected.fields().len(), 0); +} + +#[test] +fn test_row_type_project_with_field_names_valid() { + // Create a 3-column row type + let row_type = RowType::with_data_types_and_field_names( + vec![DataTypes::int(), DataTypes::string(), DataTypes::bigint()], + vec!["id", "name", "age"], + ); + + // Valid projection by names: ["id", "name"] + let projected = row_type + .project_with_field_names(&["id".to_string(), "name".to_string()]) + .unwrap(); + assert_eq!(projected.fields().len(), 2); + assert_eq!(projected.fields()[0].name, "id"); + assert_eq!(projected.fields()[1].name, "name"); +} + +#[test] +fn test_row_type_project_index_out_of_bounds() { + // Create a 3-column row type + let row_type = RowType::with_data_types_and_field_names( + vec![DataTypes::int(), DataTypes::string(), DataTypes::bigint()], + vec!["id", "name", "age"], + ); + + // Error: index out of bounds + let result = row_type.project(&[0, 5]); + assert!(result.is_err()); + assert!( + result + .unwrap_err() + .to_string() + .contains("invalid field position: 5") + ); +} + +#[test] +fn test_row_type_project_with_field_names_nonexistent() { + // Create a 3-column row type + let row_type = RowType::with_data_types_and_field_names( + vec![DataTypes::int(), DataTypes::string(), DataTypes::bigint()], + vec!["id", "name", "age"], + ); + + // Error: non-existent field name should throw exception + let result = 
row_type.project_with_field_names(&["nonexistent".to_string()]); + assert!(result.is_err()); + assert!( + result + .unwrap_err() + .to_string() + .contains("Field 'nonexistent' does not exist in the row type") + ); + + // Mixed existing and non-existing: should also error on the first non-existent field + let result = row_type.project_with_field_names(&["id".to_string(), "nonexistent".to_string()]); + assert!(result.is_err()); + assert!( + result + .unwrap_err() + .to_string() + .contains("Field 'nonexistent' does not exist in the row type") + ); +} + +#[test] +fn test_row_type_project_duplicate_indices() { + // Create a 3-column row type + let row_type = RowType::with_data_types_and_field_names( + vec![DataTypes::int(), DataTypes::string(), DataTypes::bigint()], + vec!["id", "name", "age"], + ); + + // Projection with duplicate indices: [0, 0, 1] + // This documents the expected behavior - duplicates are allowed + let projected = row_type.project(&[0, 0, 1]).unwrap(); + assert_eq!(projected.fields().len(), 3); + assert_eq!(projected.fields()[0].name, "id"); + assert_eq!(projected.fields()[1].name, "id"); + assert_eq!(projected.fields()[2].name, "name"); +} + +#[cfg(test)] +mod eq_ignore_nullable_tests { + use super::*; + + #[test] + fn ignores_nullability_at_top_level() { + let nullable = DataType::Int(IntType::new()); + let non_nullable = DataType::Int(IntType::with_nullable(false)); + assert_ne!(nullable, non_nullable, "PartialEq still distinguishes"); + assert!(nullable.eq_ignore_nullable(&non_nullable)); + assert!(non_nullable.eq_ignore_nullable(&nullable)); + } + + #[test] + fn rejects_different_kinds() { + assert!( + !DataType::Int(IntType::new()).eq_ignore_nullable(&DataType::BigInt(BigIntType::new())) + ); + } + + #[test] + fn compares_parameterized_types() { + // Char length must match. 
+ assert!( + DataType::Char(CharType::with_nullable(10, true)) + .eq_ignore_nullable(&DataType::Char(CharType::with_nullable(10, false))) + ); + assert!( + !DataType::Char(CharType::with_nullable(10, true)) + .eq_ignore_nullable(&DataType::Char(CharType::with_nullable(11, true))) + ); + + // Decimal precision + scale must match. + let a = DataType::Decimal(DecimalType::with_nullable(true, 10, 2).unwrap()); + let b = DataType::Decimal(DecimalType::with_nullable(false, 10, 2).unwrap()); + let c = DataType::Decimal(DecimalType::with_nullable(true, 10, 3).unwrap()); + assert!(a.eq_ignore_nullable(&b)); + assert!(!a.eq_ignore_nullable(&c)); + } + + #[test] + fn recurses_into_array_and_map() { + // Array ~ Array + let a = DataType::Array(ArrayType::with_nullable( + true, + DataType::Int(IntType::new()), + )); + let b = DataType::Array(ArrayType::with_nullable( + false, + DataType::Int(IntType::with_nullable(false)), + )); + assert!(a.eq_ignore_nullable(&b)); + + // Map on both sides, mixed nullability. + let m1 = DataType::Map(MapType::with_nullable( + true, + DataType::String(StringType::new()), + DataType::Int(IntType::new()), + )); + let m2 = DataType::Map(MapType::with_nullable( + false, + DataType::String(StringType::with_nullable(false)), + DataType::Int(IntType::with_nullable(false)), + )); + assert!(m1.eq_ignore_nullable(&m2)); + + // Map element-type mismatch is still caught. 
+ let m3 = DataType::Map(MapType::with_nullable( + true, + DataType::String(StringType::new()), + DataType::BigInt(BigIntType::new()), + )); + assert!(!m1.eq_ignore_nullable(&m3)); + } + + #[test] + fn recurses_into_row_fields() { + let r1 = DataType::Row(RowType::new(vec![ + DataField::new("a", DataType::Int(IntType::new()), None), + DataField::new("b", DataType::String(StringType::new()), None), + ])); + let r2 = DataType::Row(RowType::with_nullable( + false, + vec![ + DataField::new("a", DataType::Int(IntType::with_nullable(false)), None), + DataField::new( + "b", + DataType::String(StringType::with_nullable(false)), + None, + ), + ], + )); + assert!(r1.eq_ignore_nullable(&r2)); + + // Field name mismatch must fail. + let r3 = DataType::Row(RowType::new(vec![ + DataField::new("renamed_a", DataType::Int(IntType::new()), None), + DataField::new("b", DataType::String(StringType::new()), None), + ])); + assert!(!r1.eq_ignore_nullable(&r3)); + } +} diff --git a/fluss-rust/crates/fluss/src/metadata/json_serde.rs b/fluss-rust/crates/fluss/src/metadata/json_serde.rs new file mode 100644 index 0000000000..b08159ae66 --- /dev/null +++ b/fluss-rust/crates/fluss/src/metadata/json_serde.rs @@ -0,0 +1,1154 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::error::Error::JsonSerdeError; +use crate::error::{Error, Result}; +use crate::metadata::datatype::{ + DataField, DataType, DataTypes, DecimalType, TimeType, TimestampLTzType, TimestampType, + UNASSIGNED_FIELD_ID, +}; +use crate::metadata::table::{Column, Schema, TableDescriptor}; +use serde_json::{Value, json}; +use std::collections::HashMap; + +pub trait JsonSerde: Sized { + fn serialize_json(&self) -> Result; + + fn deserialize_json(node: &Value) -> Result; +} + +impl DataType { + pub fn to_type_root(&self) -> &str { + match &self { + DataType::Boolean(_) => "BOOLEAN", + DataType::TinyInt(_) => "TINYINT", + DataType::SmallInt(_) => "SMALLINT", + DataType::Int(_) => "INTEGER", + DataType::BigInt(_) => "BIGINT", + DataType::Float(_) => "FLOAT", + DataType::Double(_) => "DOUBLE", + DataType::Char(_) => "CHAR", + DataType::String(_) => "STRING", + DataType::Decimal(_) => "DECIMAL", + DataType::Date(_) => "DATE", + DataType::Time(_) => "TIME_WITHOUT_TIME_ZONE", + DataType::Timestamp(_) => "TIMESTAMP_WITHOUT_TIME_ZONE", + DataType::TimestampLTz(_) => "TIMESTAMP_WITH_LOCAL_TIME_ZONE", + DataType::Bytes(_) => "BYTES", + DataType::Binary(_) => "BINARY", + DataType::Array(_) => "ARRAY", + DataType::Map(_) => "MAP", + DataType::Row(_) => "ROW", + } + } +} + +impl DataType { + const FIELD_NAME_TYPE_NAME: &'static str = "type"; + const FIELD_NAME_NULLABLE: &'static str = "nullable"; + const FIELD_NAME_LENGTH: &'static str = "length"; + const FIELD_NAME_PRECISION: &'static str = "precision"; + const FIELD_NAME_SCALE: &'static str = "scale"; + #[allow(dead_code)] + const FIELD_NAME_ELEMENT_TYPE: &'static str = "element_type"; + #[allow(dead_code)] + const FIELD_NAME_KEY_TYPE: &'static str = "key_type"; + #[allow(dead_code)] + const FIELD_NAME_VALUE_TYPE: &'static str = "value_type"; + #[allow(dead_code)] + const FIELD_NAME_FIELDS: &'static str = "fields"; + 
#[allow(dead_code)] + const FIELD_NAME_FIELD_NAME: &'static str = "name"; + // ROW + #[allow(dead_code)] + const FIELD_NAME_FIELD_TYPE: &'static str = "field_type"; + #[allow(dead_code)] + const FIELD_NAME_FIELD_DESCRIPTION: &'static str = "description"; +} + +impl JsonSerde for DataType { + fn serialize_json(&self) -> Result { + let mut obj = serde_json::Map::new(); + + obj.insert( + Self::FIELD_NAME_TYPE_NAME.to_string(), + json!(Self::to_type_root(self)), + ); + if !self.is_nullable() { + obj.insert(Self::FIELD_NAME_NULLABLE.to_string(), json!(false)); + } + + match &self { + DataType::Boolean(_) + | DataType::TinyInt(_) + | DataType::SmallInt(_) + | DataType::Int(_) + | DataType::BigInt(_) + | DataType::Float(_) + | DataType::Double(_) + | DataType::String(_) + | DataType::Bytes(_) + | DataType::Date(_) => { + // do nothing + } + DataType::Char(_type) => { + obj.insert(Self::FIELD_NAME_LENGTH.to_string(), json!(_type.length())); + } + DataType::Binary(_type) => { + obj.insert(Self::FIELD_NAME_LENGTH.to_string(), json!(_type.length())); + } + DataType::Decimal(_type) => { + obj.insert( + Self::FIELD_NAME_PRECISION.to_string(), + json!(_type.precision()), + ); + obj.insert(Self::FIELD_NAME_SCALE.to_string(), json!(_type.scale())); + } + DataType::Time(_type) => { + obj.insert( + Self::FIELD_NAME_PRECISION.to_string(), + json!(_type.precision()), + ); + } + DataType::Timestamp(_type) => { + obj.insert( + Self::FIELD_NAME_PRECISION.to_string(), + json!(_type.precision()), + ); + } + DataType::TimestampLTz(_type) => { + obj.insert( + Self::FIELD_NAME_PRECISION.to_string(), + json!(_type.precision()), + ); + } + DataType::Array(_type) => { + obj.insert( + Self::FIELD_NAME_ELEMENT_TYPE.to_string(), + _type.get_element_type().serialize_json()?, + ); + } + DataType::Map(_type) => { + obj.insert( + Self::FIELD_NAME_KEY_TYPE.to_string(), + _type.key_type().serialize_json()?, + ); + obj.insert( + Self::FIELD_NAME_VALUE_TYPE.to_string(), + 
_type.value_type().serialize_json()?, + ); + } + DataType::Row(_type) => { + let fields: Vec = _type + .fields() + .iter() + .map(|field| field.serialize_json()) + .collect::>()?; + obj.insert(Self::FIELD_NAME_FIELDS.to_string(), json!(fields)); + } + } + Ok(Value::Object(obj)) + } + + fn deserialize_json(node: &Value) -> Result { + let mut _is_nullable = true; + let type_root = node + .get(Self::FIELD_NAME_TYPE_NAME) + .and_then(|v| v.as_str()) + .ok_or_else(|| Error::JsonSerdeError { + message: format!( + "Couldn't find field {} while deserializing datatype.", + Self::FIELD_NAME_TYPE_NAME + ), + })?; + + let mut data_type = match type_root { + "BOOLEAN" => DataTypes::boolean(), + "TINYINT" => DataTypes::tinyint(), + "SMALLINT" => DataTypes::smallint(), + "INTEGER" => DataTypes::int(), + "BIGINT" => DataTypes::bigint(), + "FLOAT" => DataTypes::float(), + "DOUBLE" => DataTypes::double(), + "CHAR" => { + let length = node + .get(Self::FIELD_NAME_LENGTH) + .and_then(|v| v.as_u64()) + .ok_or_else(|| Error::JsonSerdeError { + message: format!("Missing required field: {}", Self::FIELD_NAME_LENGTH), + })? as u32; + DataTypes::char(length) + } + "STRING" => DataTypes::string(), + "DECIMAL" => { + let precision = node + .get(Self::FIELD_NAME_PRECISION) + .and_then(|v| v.as_u64()) + .ok_or_else(|| Error::JsonSerdeError { + message: format!("Missing required field: {}", Self::FIELD_NAME_PRECISION), + })? as u32; + let scale = node + .get(Self::FIELD_NAME_SCALE) + .and_then(|v| v.as_u64()) + .unwrap_or(0) as u32; + DataType::Decimal(DecimalType::with_nullable(true, precision, scale).map_err( + |e| Error::JsonSerdeError { + message: format!("Invalid DECIMAL parameters: {e}"), + }, + )?) 
+ } + "DATE" => DataTypes::date(), + "TIME_WITHOUT_TIME_ZONE" => { + let precision = node + .get(Self::FIELD_NAME_PRECISION) + .and_then(|v| v.as_u64()) + .unwrap_or(0) as u32; + DataType::Time(TimeType::with_nullable(true, precision).map_err(|e| { + Error::JsonSerdeError { + message: format!("Invalid TIME_WITHOUT_TIME_ZONE precision: {e}"), + } + })?) + } + "TIMESTAMP_WITHOUT_TIME_ZONE" => { + let precision = node + .get(Self::FIELD_NAME_PRECISION) + .and_then(|v| v.as_u64()) + .unwrap_or(6) as u32; + DataType::Timestamp(TimestampType::with_nullable(true, precision).map_err(|e| { + Error::JsonSerdeError { + message: format!("Invalid TIMESTAMP_WITHOUT_TIME_ZONE precision: {e}"), + } + })?) + } + "TIMESTAMP_WITH_LOCAL_TIME_ZONE" => { + let precision = node + .get(Self::FIELD_NAME_PRECISION) + .and_then(|v| v.as_u64()) + .unwrap_or(6) as u32; + DataType::TimestampLTz(TimestampLTzType::with_nullable(true, precision).map_err( + |e| Error::JsonSerdeError { + message: format!("Invalid TIMESTAMP_WITH_LOCAL_TIME_ZONE precision: {e}"), + }, + )?) 
+ } + "BYTES" => DataTypes::bytes(), + "BINARY" => { + let length = node + .get(Self::FIELD_NAME_LENGTH) + .and_then(|v| v.as_u64()) + .unwrap_or(1) as usize; + DataTypes::binary(length) + } + "ARRAY" => { + let element_type_node = + node.get(Self::FIELD_NAME_ELEMENT_TYPE).ok_or_else(|| { + Error::JsonSerdeError { + message: format!( + "Missing required field: {}", + Self::FIELD_NAME_ELEMENT_TYPE + ), + } + })?; + let element_type = DataType::deserialize_json(element_type_node)?; + DataTypes::array(element_type) + } + "MAP" => { + let key_type_node = + node.get(Self::FIELD_NAME_KEY_TYPE) + .ok_or_else(|| Error::JsonSerdeError { + message: format!( + "Missing required field: {}", + Self::FIELD_NAME_KEY_TYPE + ), + })?; + let key_type = DataType::deserialize_json(key_type_node)?; + let value_type_node = + node.get(Self::FIELD_NAME_VALUE_TYPE) + .ok_or_else(|| Error::JsonSerdeError { + message: format!( + "Missing required field: {}", + Self::FIELD_NAME_VALUE_TYPE + ), + })?; + let value_type = DataType::deserialize_json(value_type_node)?; + DataTypes::map(key_type, value_type) + } + "ROW" => { + let fields_node = node + .get(Self::FIELD_NAME_FIELDS) + .ok_or_else(|| Error::JsonSerdeError { + message: format!("Missing required field: {}", Self::FIELD_NAME_FIELDS), + })? 
+ .as_array() + .ok_or_else(|| Error::JsonSerdeError { + message: format!("{} must be an array", Self::FIELD_NAME_FIELDS), + })?; + let mut fields = Vec::with_capacity(fields_node.len()); + for field_node in fields_node { + fields.push(DataField::deserialize_json(field_node)?); + } + DataTypes::row(fields) + } + _ => { + return Err(Error::JsonSerdeError { + message: format!("Unknown type root: {type_root}"), + }); + } + }; + + if let Some(nullable) = node.get(Self::FIELD_NAME_NULLABLE) { + let nullable_value = nullable.as_bool().unwrap_or(true); + if !nullable_value { + data_type = data_type.as_non_nullable(); + } + } + Ok(data_type) + } +} + +impl DataField { + const NAME: &'static str = "name"; + const FIELD_TYPE: &'static str = "field_type"; + const DESCRIPTION: &'static str = "description"; + const FIELD_ID: &'static str = "field_id"; +} + +impl JsonSerde for DataField { + fn serialize_json(&self) -> Result { + let mut obj = serde_json::Map::new(); + + obj.insert(Self::NAME.to_string(), json!(self.name())); + obj.insert( + Self::FIELD_TYPE.to_string(), + self.data_type.serialize_json()?, + ); + + if let Some(description) = &self.description { + obj.insert(Self::DESCRIPTION.to_string(), json!(description)); + } + + obj.insert(Self::FIELD_ID.to_string(), json!(self.field_id())); + + Ok(Value::Object(obj)) + } + + fn deserialize_json(node: &Value) -> Result { + let name = node + .get(Self::NAME) + .and_then(|v| v.as_str()) + .ok_or_else(|| Error::JsonSerdeError { + message: format!("Missing required field: {}", Self::NAME), + })? 
+ .to_string(); + + let field_type_node = node + .get(Self::FIELD_TYPE) + .ok_or_else(|| Error::JsonSerdeError { + message: format!("Missing required field: {}", Self::FIELD_TYPE), + })?; + + let data_type = DataType::deserialize_json(field_type_node)?; + + let description = node + .get(Self::DESCRIPTION) + .and_then(|v| v.as_str()) + .map(|s| s.to_string()); + + let field_id = node + .get(Self::FIELD_ID) + .and_then(|v| v.as_i64()) + .map(|v| v as i32) + .unwrap_or(UNASSIGNED_FIELD_ID); + + Ok(DataField::with_field_id( + name, + data_type, + description, + field_id, + )) + } +} + +impl Column { + const NAME: &'static str = "name"; + const DATA_TYPE: &'static str = "data_type"; + const COMMENT: &'static str = "comment"; + const ID: &'static str = "id"; +} + +impl JsonSerde for Column { + fn serialize_json(&self) -> Result { + let mut obj = serde_json::Map::new(); + + // Common fields + obj.insert(Self::NAME.to_string(), json!(self.name())); + obj.insert( + Self::DATA_TYPE.to_string(), + self.data_type().serialize_json()?, + ); + + if let Some(comment) = &self.comment() { + obj.insert(Self::COMMENT.to_string(), json!(comment)); + } + + // The Java client requires `id` on input. 
+ obj.insert(Self::ID.to_string(), json!(self.id())); + + Ok(Value::Object(obj)) + } + + fn deserialize_json(node: &Value) -> Result { + let name = node + .get(Self::NAME) + .and_then(|v| v.as_str()) + .ok_or_else(|| Error::JsonSerdeError { + message: format!("Missing required field: {}", Self::NAME), + })?; + + let data_type_node = node + .get(Self::DATA_TYPE) + .ok_or_else(|| Error::JsonSerdeError { + message: format!("Missing required field: {}", Self::DATA_TYPE), + })?; + + let data_type = DataType::deserialize_json(data_type_node)?; + + let mut column = Column::new(name, data_type); + + if let Some(comment) = node.get(Self::COMMENT).and_then(|v| v.as_str()) { + column = column.with_comment(comment); + } + + // Pre-id JSON is treated as unassigned; SchemaBuilder will + // auto-assign on build. + if let Some(id) = node.get(Self::ID).and_then(|v| v.as_i64()) { + let id = i32::try_from(id).map_err(|_| Error::JsonSerdeError { + message: format!("Column id {id} does not fit in i32"), + })?; + column = column.with_id(id); + } + + Ok(column) + } +} + +impl Schema { + const COLUMNS_NAME: &'static str = "columns"; + const PRIMARY_KEY_NAME: &'static str = "primary_key"; + const HIGHEST_FIELD_ID: &'static str = "highest_field_id"; + const VERSION_KEY: &'static str = "version"; + const VERSION: u32 = 1; +} + +impl JsonSerde for Schema { + fn serialize_json(&self) -> Result { + let mut obj = serde_json::Map::new(); + + // Serialize version + obj.insert(Self::VERSION_KEY.to_string(), json!(Self::VERSION)); + + // Serialize columns + let columns: Vec = self + .columns() + .iter() + .map(|col| col.serialize_json()) + .collect::>()?; + obj.insert(Self::COLUMNS_NAME.to_string(), json!(columns)); + + // Serialize primary key if present + if let Some(primary_key) = &self.primary_key() { + let pk_values: Vec = primary_key + .column_names() + .iter() + .map(|name| json!(name)) + .collect(); + obj.insert(Self::PRIMARY_KEY_NAME.to_string(), json!(pk_values)); + } + + obj.insert( + 
Self::HIGHEST_FIELD_ID.to_string(), + json!(self.highest_field_id()), + ); + + Ok(Value::Object(obj)) + } + + fn deserialize_json(node: &Value) -> Result { + let columns_node = node + .get(Self::COLUMNS_NAME) + .ok_or_else(|| Error::JsonSerdeError { + message: format!("Missing required field: {}", Self::COLUMNS_NAME), + })? + .as_array() + .ok_or_else(|| Error::JsonSerdeError { + message: format!("{} must be an array", Self::COLUMNS_NAME), + })?; + + let mut columns = Vec::with_capacity(columns_node.len()); + for col_node in columns_node { + columns.push(Column::deserialize_json(col_node)?); + } + + let mut schema_builder = Schema::builder().with_columns(columns); + + if let Some(pk_node) = node.get(Self::PRIMARY_KEY_NAME) { + let pk_array = pk_node + .as_array() + .ok_or_else(|| Error::invalid_table("Primary key must be an array"))?; + + let mut primary_keys = Vec::with_capacity(pk_array.len()); + for name_node in pk_array { + primary_keys.push( + name_node.as_str().ok_or_else(|| { + Error::invalid_table("Primary key element must be a string") + })?, + ); + } + + schema_builder = schema_builder.primary_key(primary_keys); + } + + schema_builder.build() + } +} + +impl TableDescriptor { + const SCHEMA_NAME: &'static str = "schema"; + const COMMENT_NAME: &'static str = "comment"; + const PARTITION_KEY_NAME: &'static str = "partition_key"; + const BUCKET_KEY_NAME: &'static str = "bucket_key"; + const BUCKET_COUNT_NAME: &'static str = "bucket_count"; + const PROPERTIES_NAME: &'static str = "properties"; + const CUSTOM_PROPERTIES_NAME: &'static str = "custom_properties"; + const VERSION_KEY: &'static str = "version"; + const VERSION: u32 = 1; + + fn deserialize_properties(node: &Value) -> Result> { + let obj = node.as_object().ok_or_else(|| Error::JsonSerdeError { + message: "Properties must be an object".to_string(), + })?; + + let mut properties = HashMap::with_capacity(obj.len()); + for (key, value) in obj { + properties.insert( + key.clone(), + value + .as_str() + 
.ok_or_else(|| Error::JsonSerdeError { + message: "Property value must be a string".to_string(), + })? + .to_owned(), + ); + } + + Ok(properties) + } +} + +impl JsonSerde for TableDescriptor { + fn serialize_json(&self) -> Result { + let mut obj = serde_json::Map::new(); + + // Serialize version + obj.insert(Self::VERSION_KEY.to_string(), json!(Self::VERSION)); + + // Serialize schema + obj.insert( + Self::SCHEMA_NAME.to_string(), + self.schema().serialize_json()?, + ); + + // Serialize comment if present + if let Some(comment) = &self.comment() { + obj.insert(Self::COMMENT_NAME.to_string(), json!(comment)); + } + + // Serialize partition keys + let partition_keys: Vec = + self.partition_keys().iter().map(|key| json!(key)).collect(); + obj.insert(Self::PARTITION_KEY_NAME.to_string(), json!(partition_keys)); + + // Serialize table distribution if present + if let Some(dist) = &self.table_distribution() { + let bucket_keys: Vec = dist.bucket_keys().iter().map(|key| json!(key)).collect(); + obj.insert(Self::BUCKET_KEY_NAME.to_string(), json!(bucket_keys)); + + if let Some(count) = dist.bucket_count() { + obj.insert(Self::BUCKET_COUNT_NAME.to_string(), json!(count)); + } + } + + // Serialize properties + obj.insert(Self::PROPERTIES_NAME.to_string(), json!(self.properties())); + + obj.insert( + Self::CUSTOM_PROPERTIES_NAME.to_string(), + json!(self.custom_properties()), + ); + + Ok(Value::Object(obj)) + } + + fn deserialize_json(node: &Value) -> Result { + let mut builder = TableDescriptor::builder(); + + // Deserialize schema + let schema_node = node.get(Self::SCHEMA_NAME).ok_or_else(|| JsonSerdeError { + message: format!("Missing required field: {}", Self::SCHEMA_NAME), + })?; + let schema = Schema::deserialize_json(schema_node)?; + builder = builder.schema(schema); + + // Deserialize comment if present + if let Some(comment_node) = node.get(Self::COMMENT_NAME) { + let comment = comment_node + .as_str() + .ok_or_else(|| JsonSerdeError { + message: format!("{} must be 
a string", Self::COMMENT_NAME), + })? + .to_owned(); + builder = builder.comment(comment.as_str()); + } + + let partition_node = node + .get(Self::PARTITION_KEY_NAME) + .ok_or_else(|| JsonSerdeError { + message: format!("Missing required field: {}", Self::PARTITION_KEY_NAME), + })? + .as_array() + .ok_or_else(|| JsonSerdeError { + message: format!("{} must be an array", Self::PARTITION_KEY_NAME), + })?; + + let mut partition_keys = Vec::with_capacity(partition_node.len()); + for key_node in partition_node { + partition_keys.push( + key_node + .as_str() + .ok_or_else(|| JsonSerdeError { + message: format!("{} element must be a string", Self::PARTITION_KEY_NAME), + })? + .to_owned(), + ); + } + builder = builder.partitioned_by(partition_keys); + + let mut bucket_count = None; + let mut bucket_keys = vec![]; + if let Some(bucket_key_node) = node.get(Self::BUCKET_KEY_NAME) { + let bucket_key_node = bucket_key_node.as_array().ok_or_else(|| JsonSerdeError { + message: format!("{} must be an array", Self::BUCKET_KEY_NAME), + })?; + + for key_node in bucket_key_node { + bucket_keys.push( + key_node + .as_str() + .ok_or_else(|| JsonSerdeError { + message: "Bucket key must be a string".to_string(), + })? 
+ .to_owned(), + ); + } + } + + if let Some(bucket_count_node) = node.get(Self::BUCKET_COUNT_NAME) { + bucket_count = bucket_count_node.as_u64().map(|n| n as i32); + } + + if bucket_count.is_some() || !bucket_keys.is_empty() { + builder = builder.distributed_by(bucket_count, bucket_keys); + } + + // Deserialize properties + let properties = + Self::deserialize_properties(node.get(Self::PROPERTIES_NAME).ok_or_else(|| { + JsonSerdeError { + message: format!("Missing required field: {}", Self::PROPERTIES_NAME), + } + })?)?; + builder = builder.properties(properties); + + // Deserialize custom properties + let custom_properties = Self::deserialize_properties( + node.get(Self::CUSTOM_PROPERTIES_NAME) + .ok_or_else(|| JsonSerdeError { + message: format!("Missing required field: {}", Self::CUSTOM_PROPERTIES_NAME), + })?, + )?; + builder = builder.custom_properties(custom_properties); + + builder.build() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::metadata::reassign_field_ids; + use crate::metadata::{ + Column, DataField, DataType, DataTypes as DT, DataTypes, MapType, Schema, + }; + + #[test] + fn column_id_round_trip_through_json() { + let col = Column::new("a", DataTypes::int()) + .with_id(7) + .with_comment("desc"); + let json = col.serialize_json().unwrap(); + assert_eq!(json.get("id").and_then(|v| v.as_i64()), Some(7)); + let round_tripped = Column::deserialize_json(&json).unwrap(); + assert_eq!(round_tripped, col); + } + + #[test] + fn schema_assigns_ids_when_absent_and_preserves_when_present() { + let auto = Schema::builder() + .column("a", DataTypes::int()) + .column("b", DataTypes::string()) + .build() + .unwrap(); + let ids: Vec = auto.columns().iter().map(|c| c.id()).collect(); + assert_eq!(ids, vec![0, 1]); + + let preserved = Schema::builder() + .with_columns(vec![ + Column::new("a", DataTypes::int()).with_id(3), + Column::new("b", DataTypes::string()).with_id(7), + ]) + .build() + .unwrap(); + let ids: Vec = 
preserved.columns().iter().map(|c| c.id()).collect(); + assert_eq!(ids, vec![3, 7]); + } + + #[test] + fn schema_rejects_duplicate_ids() { + let err = Schema::builder() + .with_columns(vec![ + Column::new("a", DataTypes::int()).with_id(7), + Column::new("b", DataTypes::string()).with_id(7), + ]) + .build() + .unwrap_err(); + assert!(err.to_string().contains("Duplicate field id 7"), "{err}"); + } + + #[test] + fn schema_rejects_negative_non_sentinel_ids() { + let err = Schema::builder() + .with_columns(vec![Column::new("a", DataTypes::int()).with_id(-7)]) + .build() + .unwrap_err(); + assert!(err.to_string().contains("invalid id -7"), "{err}"); + } + + #[test] + fn column_json_id_overflow_errors() { + let json = serde_json::json!({ + "name": "a", + "data_type": Column::new("a", DataTypes::int()).serialize_json().unwrap() + .get("data_type").unwrap(), + "id": (i32::MAX as i64) + 1, + }); + let err = Column::deserialize_json(&json).unwrap_err(); + assert!(err.to_string().contains("does not fit in i32"), "{err}"); + } + + #[test] + fn schema_rejects_partially_assigned_ids() { + let err = Schema::builder() + .with_columns(vec![ + Column::new("a", DataTypes::int()).with_id(0), + Column::new("b", DataTypes::string()), + ]) + .build() + .unwrap_err(); + assert!( + err.to_string().contains("All columns must have an id"), + "{err}" + ); + } + + #[test] + fn schema_assigns_nested_field_ids_in_java_dfs_order() { + let inner_row = DataTypes::row(vec![DataField::new("n", DataTypes::int(), None)]); + let nested_row = DataTypes::row(vec![ + DataField::new("x", DataTypes::int(), None), + DataField::new("label", DataTypes::string(), None), + ]); + let deep_row = DataTypes::row(vec![DataField::new("inner", inner_row, None)]); + + let schema = Schema::builder() + .column("id", DataTypes::int()) + .column("nested", nested_row) + .column("deep", deep_row) + .build() + .unwrap(); + + let top_ids: Vec = schema.columns().iter().map(|c| c.id()).collect(); + assert_eq!(top_ids, vec![0, 1, 
4]); + + fn nested_field(dt: &DataType, idx: usize) -> &DataField { + match dt { + DataType::Row(rt) => &rt.fields()[idx], + _ => panic!("not a Row"), + } + } + let nested_dt = schema.columns()[1].data_type(); + assert_eq!(nested_field(nested_dt, 0).field_id(), 2); // x + assert_eq!(nested_field(nested_dt, 1).field_id(), 3); // label + + let deep_dt = schema.columns()[2].data_type(); + let inner_field = nested_field(deep_dt, 0); // inner + assert_eq!(inner_field.field_id(), 5); + let n_field = nested_field(inner_field.data_type(), 0); // n + assert_eq!(n_field.field_id(), 6); + + assert_eq!(schema.highest_field_id(), 6); + + for c in schema.columns() { + assert_ne!(c.id(), UNASSIGNED_FIELD_ID); + } + } + + #[test] + fn schema_array_of_row_assigns_nested_ids() { + let elem = DataTypes::row(vec![ + DataField::new("seq", DataTypes::int(), None), + DataField::new("label", DataTypes::string(), None), + ]); + let schema = Schema::builder() + .column("id", DataTypes::int()) + .column("events", DataTypes::array(elem)) + .build() + .unwrap(); + assert_eq!(schema.highest_field_id(), 3); + let array_dt = schema.columns()[1].data_type(); + let elem_dt = match array_dt { + DataType::Array(at) => at.get_element_type(), + _ => unreachable!(), + }; + let fields = match elem_dt { + DataType::Row(rt) => rt.fields(), + _ => unreachable!(), + }; + assert_eq!(fields[0].field_id(), 2); + assert_eq!(fields[1].field_id(), 3); + } + + #[test] + fn schema_nested_row_round_trips_through_json() { + let nested = DataTypes::row(vec![ + DataField::new("x", DataTypes::int(), None), + DataField::new("label", DataTypes::string(), None), + ]); + let original = Schema::builder() + .column("id", DataTypes::int()) + .column("nested", nested) + .build() + .unwrap(); + + let json = original.serialize_json().unwrap(); + + assert_eq!( + json.get("highest_field_id").and_then(|v| v.as_i64()), + Some(3) + ); + + let round_tripped = Schema::deserialize_json(&json).unwrap(); + 
assert_eq!(round_tripped.highest_field_id(), 3); + assert_eq!( + round_tripped + .columns() + .iter() + .map(|c| c.id()) + .collect::>(), + vec![0, 1], + ); + assert_eq!(round_tripped, original); + } + + #[test] + fn schema_rejects_duplicate_nested_field_ids() { + let nested = DataTypes::row(vec![ + DataField::with_field_id("x", DT::int(), None, 0), + DataField::with_field_id("y", DT::int(), None, 2), + ]); + let err = Schema::builder() + .with_columns(vec![ + Column::new("a", DT::int()).with_id(0), + Column::new("b", nested).with_id(1), + ]) + .build() + .unwrap_err(); + assert!(err.to_string().contains("Duplicate field id 0"), "{err}"); + } + + #[test] + fn schema_rejects_partially_assigned_nested_field_ids() { + let nested = DataTypes::row(vec![DataField::new("x", DT::int(), None)]); + let err = Schema::builder() + .with_columns(vec![ + Column::new("a", DT::int()).with_id(0), + Column::new("b", nested).with_id(1), + ]) + .build() + .unwrap_err(); + assert!( + err.to_string() + .contains("nested DataField ids are unassigned"), + "{err}" + ); + } + + #[test] + fn schema_preserves_nested_ids_with_gaps() { + // n2.m1=11), f2=2 (nested n0=9, n1=10). 
+ let inner_for_n2 = DataTypes::row(vec![DataField::with_field_id( + "m1", + DataTypes::tinyint(), + None, + 11, + )]); + let f1_row = DataTypes::row(vec![ + DataField::with_field_id("n0", DataTypes::tinyint(), None, 6), + DataField::with_field_id("n1", DataTypes::string(), None, 7), + DataField::with_field_id("n2", inner_for_n2, None, 8), + ]); + let f2_row = DataTypes::row(vec![ + DataField::with_field_id("n0", DataTypes::tinyint(), None, 9), + DataField::with_field_id("n1", DataTypes::string(), None, 10), + ]); + + let schema = Schema::builder() + .with_columns(vec![ + Column::new("f0", DataTypes::string().as_non_nullable()).with_id(0), + Column::new("f1", f1_row).with_id(1), + Column::new("f2", f2_row).with_id(2), + ]) + .build() + .unwrap(); + + let top_ids: Vec = schema.columns().iter().map(|c| c.id()).collect(); + assert_eq!(top_ids, vec![0, 1, 2]); + + fn row_fields(dt: &DataType) -> &[DataField] { + match dt { + DataType::Row(rt) => rt.fields(), + _ => panic!("not a Row"), + } + } + let f1_fields = row_fields(schema.columns()[1].data_type()); + assert_eq!(f1_fields[0].field_id(), 6); // n0 + assert_eq!(f1_fields[1].field_id(), 7); // n1 + assert_eq!(f1_fields[2].field_id(), 8); // n2 + let n2_fields = row_fields(f1_fields[2].data_type()); + assert_eq!(n2_fields[0].field_id(), 11); // m1 — the "gap" + + let f2_fields = row_fields(schema.columns()[2].data_type()); + assert_eq!(f2_fields[0].field_id(), 9); + assert_eq!(f2_fields[1].field_id(), 10); + + assert_eq!(schema.highest_field_id(), 11); + } + + #[test] + fn schema_deserializes_legacy_json_without_column_ids() { + let legacy_json: Value = serde_json::from_str( + r#"{ + "version": 1, + "columns": [ + {"name": "a", "data_type": {"type": "INTEGER", "nullable": false}, "comment": "first"}, + {"name": "b", "data_type": {"type": "STRING"}, "comment": "second"}, + {"name": "c", "data_type": {"type": "CHAR", "nullable": false, "length": 10}, "comment": "third"} + ], + "primary_key": ["a", "c"] + }"#, + ) + 
.unwrap(); + + let schema = Schema::deserialize_json(&legacy_json).expect("legacy JSON must deserialize"); + let ids: Vec = schema.columns().iter().map(|c| c.id()).collect(); + assert_eq!(ids, vec![0, 1, 2], "missing IDs auto-assigned 0..N-1"); + assert_eq!(schema.highest_field_id(), 2); + assert!(schema.primary_key().is_some()); + } + + #[test] + fn empty_schema_has_minus_one_highest_field_id() { + let s = Schema::builder().build().unwrap(); + assert_eq!(s.highest_field_id(), -1); + let json = s.serialize_json().unwrap(); + assert_eq!( + json.get("highest_field_id").and_then(|v| v.as_i64()), + Some(-1) + ); + } + + #[test] + fn reassign_field_ids_walks_array_map_row() { + let dt = DataTypes::array(DataTypes::row(vec![ + DataField::new("a", DataTypes::int(), None), + DataField::new("b", DataTypes::string(), None), + ])); + let mut counter = -1_i32; + let assigned = reassign_field_ids(&dt, &mut counter); + match assigned { + DataType::Array(at) => match at.get_element_type() { + DataType::Row(rt) => { + assert_eq!(rt.fields()[0].field_id(), 0); + assert_eq!(rt.fields()[1].field_id(), 1); + } + _ => panic!("expected Row"), + }, + _ => panic!("expected Array"), + } + assert_eq!(counter, 1); + + let dt = DataType::Map(MapType::new( + DataTypes::int(), + DataTypes::row(vec![DataField::new("x", DataTypes::int(), None)]), + )); + let mut counter = -1_i32; + let assigned = reassign_field_ids(&dt, &mut counter); + let value_type = match &assigned { + DataType::Map(mt) => mt.value_type(), + _ => panic!("expected Map"), + }; + match value_type { + DataType::Row(rt) => assert_eq!(rt.fields()[0].field_id(), 0), + _ => panic!("expected Row"), + } + assert_eq!(counter, 0); + } + + #[test] + fn test_datatype_json_serde() { + let data_types = vec![ + DataTypes::boolean(), + DataTypes::tinyint(), + DataTypes::smallint(), + DataTypes::int().as_non_nullable(), + DataTypes::bigint(), + DataTypes::float(), + DataTypes::double(), + DataTypes::char(10), + DataTypes::string(), + 
DataTypes::decimal(10, 2), + DataTypes::date(), + DataTypes::time(), + DataTypes::timestamp(), + DataTypes::timestamp_ltz(), + DataTypes::bytes(), + DataTypes::binary(100), + DataTypes::array(DataTypes::int()), + DataTypes::map(DataTypes::string(), DataTypes::int()), + DataTypes::row(vec![ + DataField::new("f1".to_string(), DataTypes::int(), None), + DataField::new( + "f2".to_string(), + DataTypes::string(), + Some("desc".to_string()), + ), + ]), + ]; + + for dt in data_types { + let json = dt.serialize_json().unwrap(); + let deserialized = DataType::deserialize_json(&json).unwrap(); + assert_eq!(dt, deserialized); + } + } + + #[test] + fn test_invalid_datatype_validation() { + use serde_json::json; + + // Invalid DECIMAL precision (> 38) + let invalid_decimal = json!({ + "type": "DECIMAL", + "precision": 50, + "scale": 2 + }); + let result = DataType::deserialize_json(&invalid_decimal); + assert!(result.is_err()); + assert!( + result + .unwrap_err() + .to_string() + .contains("Invalid DECIMAL parameters") + ); + + // Invalid TIME precision (> 9) + let invalid_time = json!({ + "type": "TIME_WITHOUT_TIME_ZONE", + "precision": 15 + }); + let result = DataType::deserialize_json(&invalid_time); + assert!(result.is_err()); + assert!( + result + .unwrap_err() + .to_string() + .contains("Invalid TIME_WITHOUT_TIME_ZONE precision") + ); + + // Invalid TIMESTAMP precision (> 9) + let invalid_timestamp = json!({ + "type": "TIMESTAMP_WITHOUT_TIME_ZONE", + "precision": 20 + }); + let result = DataType::deserialize_json(&invalid_timestamp); + assert!(result.is_err()); + assert!( + result + .unwrap_err() + .to_string() + .contains("Invalid TIMESTAMP_WITHOUT_TIME_ZONE precision") + ); + + // Invalid TIMESTAMP_LTZ precision (> 9) + let invalid_timestamp_ltz = json!({ + "type": "TIMESTAMP_WITH_LOCAL_TIME_ZONE", + "precision": 10 + }); + let result = DataType::deserialize_json(&invalid_timestamp_ltz); + assert!(result.is_err()); + assert!( + result + .unwrap_err() + .to_string() + 
.contains("Invalid TIMESTAMP_WITH_LOCAL_TIME_ZONE precision") + ); + + // Invalid DECIMAL scale (> precision) + let invalid_decimal_scale = json!({ + "type": "DECIMAL", + "precision": 10, + "scale": 15 + }); + let result = DataType::deserialize_json(&invalid_decimal_scale); + assert!(result.is_err()); + assert!( + result + .unwrap_err() + .to_string() + .contains("Invalid DECIMAL parameters") + ); + } +} diff --git a/fluss-rust/crates/fluss/src/metadata/mod.rs b/fluss-rust/crates/fluss/src/metadata/mod.rs new file mode 100644 index 0000000000..c1d1b72c69 --- /dev/null +++ b/fluss-rust/crates/fluss/src/metadata/mod.rs @@ -0,0 +1,32 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +mod data_lake_format; +mod database; +mod datatype; +mod json_serde; +mod partition; +mod schema_util; +mod table; + +pub use data_lake_format::*; +pub use database::*; +pub use datatype::*; +pub use json_serde::*; +pub use partition::*; +pub(crate) use schema_util::{UNEXIST_MAPPING, index_mapping}; +pub use table::*; diff --git a/fluss-rust/crates/fluss/src/metadata/partition.rs b/fluss-rust/crates/fluss/src/metadata/partition.rs new file mode 100644 index 0000000000..c63fe296c5 --- /dev/null +++ b/fluss-rust/crates/fluss/src/metadata/partition.rs @@ -0,0 +1,476 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::error::{Error, Result}; +use crate::proto::{PbKeyValue, PbPartitionInfo, PbPartitionSpec}; +use crate::{PartitionId, TableId}; +use std::collections::HashMap; +use std::fmt::{Display, Formatter}; +use std::sync::Arc; + +/// Represents a partition spec in fluss. Partition columns and values are NOT of strict order, and +/// they need to be re-arranged to the correct order by comparing with a list of strictly ordered +/// partition keys. 
+#[derive(Debug, Clone, PartialEq, Eq)] +pub struct PartitionSpec { + partition_spec: HashMap, +} + +impl PartitionSpec { + pub fn new, V: Into>(partition_spec: HashMap) -> Self { + let mut new_map = HashMap::new(); + for (k, v) in partition_spec { + new_map.insert(k.into(), v.into()); + } + Self { + partition_spec: new_map, + } + } + + pub fn get_spec_map(&self) -> &HashMap { + &self.partition_spec + } + + pub fn to_pb(&self) -> PbPartitionSpec { + PbPartitionSpec { + partition_key_values: self + .partition_spec + .iter() + .map(|(k, v)| PbKeyValue { + key: k.clone(), + value: v.clone(), + }) + .collect(), + } + } + + pub fn from_pb(pb: &PbPartitionSpec) -> Self { + let partition_spec = pb + .partition_key_values + .iter() + .map(|kv| (kv.key.clone(), kv.value.clone())) + .collect(); + Self { partition_spec } + } +} + +impl Display for PartitionSpec { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "PartitionSpec{{{:?}}}", self.partition_spec) + } +} + +/// Represents a partition, which is the resolved version of PartitionSpec. The partition +/// spec is re-arranged into the correct order by comparing it with a list of strictly ordered +/// partition keys. +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct ResolvedPartitionSpec { + partition_keys: Arc<[String]>, + partition_values: Vec, +} + +pub const PARTITION_SPEC_SEPARATOR: &str = "$"; + +impl ResolvedPartitionSpec { + pub fn new(partition_keys: Arc<[String]>, partition_values: Vec) -> Result { + if partition_keys.len() != partition_values.len() { + return Err(Error::IllegalArgument { + message: "The number of partition keys and partition values should be the same." 
+ .to_string(), + }); + } + + Ok(Self { + partition_keys, + partition_values, + }) + } + + pub fn from_partition_spec( + partition_keys: Arc<[String]>, + partition_spec: &PartitionSpec, + ) -> Self { + let partition_values = + Self::get_reordered_partition_values(&partition_keys, partition_spec); + Self { + partition_keys, + partition_values, + } + } + + pub fn from_partition_name(partition_keys: Arc<[String]>, partition_name: &str) -> Self { + let partition_values: Vec = partition_name + .split(PARTITION_SPEC_SEPARATOR) + .map(|s| s.to_string()) + .collect(); + Self { + partition_keys, + partition_values, + } + } + + pub fn from_partition_qualified_name(qualified_partition_name: &str) -> Result { + let mut keys = Vec::new(); + let mut values = Vec::new(); + + for pair in qualified_partition_name.split('/') { + let parts: Vec<&str> = pair.splitn(2, '=').collect(); + if parts.len() != 2 { + return Err(Error::IllegalArgument { + message: format!( + "Invalid partition name format. Expected key=value, got: {pair}" + ), + }); + } + keys.push(parts[0].to_string()); + values.push(parts[1].to_string()); + } + + Ok(Self { + partition_keys: Arc::from(keys), + partition_values: values, + }) + } + + pub fn get_partition_keys(&self) -> &[String] { + &self.partition_keys + } + + pub fn get_partition_values(&self) -> &[String] { + &self.partition_values + } + + pub fn to_partition_spec(&self) -> PartitionSpec { + let mut spec_map = HashMap::new(); + for (i, key) in self.partition_keys.iter().enumerate() { + spec_map.insert(key.clone(), self.partition_values[i].clone()); + } + PartitionSpec::new(spec_map) + } + + /// Generate the partition name for a partition table with specified partition values. + /// + /// The partition name is in the following format: value1$value2$...$valueN + pub fn get_partition_name(&self) -> String { + self.partition_values.join(PARTITION_SPEC_SEPARATOR) + } + + /// Returns the qualified partition name for a partition spec. 
+ /// The format is: key1=value1/key2=value2/.../keyN=valueN + pub fn get_partition_qualified_name(&self) -> String { + let mut sb = String::new(); + for (i, key) in self.partition_keys.iter().enumerate() { + sb.push_str(key); + sb.push('='); + sb.push_str(&self.partition_values[i]); + if i != self.partition_keys.len() - 1 { + sb.push('/'); + } + } + sb + } + + pub fn contains(&self, other: &ResolvedPartitionSpec) -> Result { + let other_partition_keys = other.get_partition_keys(); + let other_partition_values = other.get_partition_values(); + + let mut expected_partition_values = Vec::new(); + for other_partition_key in other_partition_keys { + let key_index = self + .partition_keys + .iter() + .position(|k| k == other_partition_key); + match key_index { + Some(idx) => expected_partition_values.push(self.partition_values[idx].clone()), + None => { + return Err(Error::IllegalArgument { + message: format!( + "table does not contain partitionKey: {other_partition_key}" + ), + }); + } + } + } + + let expected_partition_name = expected_partition_values.join(PARTITION_SPEC_SEPARATOR); + let other_partition_name = other_partition_values.join(PARTITION_SPEC_SEPARATOR); + + Ok(expected_partition_name == other_partition_name) + } + + pub fn to_pb(&self) -> PbPartitionSpec { + PbPartitionSpec { + partition_key_values: self + .partition_keys + .iter() + .zip(self.partition_values.iter()) + .map(|(k, v)| PbKeyValue { + key: k.clone(), + value: v.clone(), + }) + .collect(), + } + } + + pub fn from_pb(pb: &PbPartitionSpec) -> Self { + let partition_keys = pb + .partition_key_values + .iter() + .map(|kv| kv.key.clone()) + .collect(); + let partition_values = pb + .partition_key_values + .iter() + .map(|kv| kv.value.clone()) + .collect(); + + Self { + partition_keys, + partition_values, + } + } + + fn get_reordered_partition_values( + partition_keys: &Arc<[String]>, + partition_spec: &PartitionSpec, + ) -> Vec { + let partition_spec_map = partition_spec.get_spec_map(); + 
partition_keys + .iter() + .map(|key| partition_spec_map.get(key).cloned().unwrap_or_default()) + .collect() + } +} + +impl Display for ResolvedPartitionSpec { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.get_partition_qualified_name()) + } +} + +/// Information of a partition metadata, includes the partition's name and the partition id that +/// represents the unique identifier of the partition. +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct PartitionInfo { + partition_id: PartitionId, + partition_spec: ResolvedPartitionSpec, +} + +impl PartitionInfo { + pub fn new(partition_id: PartitionId, partition_spec: ResolvedPartitionSpec) -> Self { + Self { + partition_id, + partition_spec, + } + } + + /// Get the partition id. The id is globally unique in the Fluss cluster. + pub fn get_partition_id(&self) -> PartitionId { + self.partition_id + } + + /// Get the partition name. + pub fn get_partition_name(&self) -> String { + self.partition_spec.get_partition_name() + } + + pub fn get_resolved_partition_spec(&self) -> &ResolvedPartitionSpec { + &self.partition_spec + } + + pub fn get_partition_spec(&self) -> PartitionSpec { + self.partition_spec.to_partition_spec() + } + + pub fn to_pb(&self) -> PbPartitionInfo { + PbPartitionInfo { + partition_id: self.partition_id, + partition_spec: self.partition_spec.to_pb(), + remote_data_dir: None, + } + } + + pub fn from_pb(pb: &PbPartitionInfo) -> Self { + Self { + partition_id: pb.partition_id, + partition_spec: ResolvedPartitionSpec::from_pb(&pb.partition_spec), + } + } +} + +impl Display for PartitionInfo { + fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { + write!( + f, + "Partition{{name='{}', id={}}}", + self.get_partition_name(), + self.partition_id + ) + } +} + +/// A class to identify a table partition, containing the table id and the partition id. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub struct TablePartition { + table_id: TableId, + partition_id: PartitionId, +} + +impl TablePartition { + pub fn new(table_id: TableId, partition_id: PartitionId) -> Self { + Self { + table_id, + partition_id, + } + } + + pub fn get_table_id(&self) -> i64 { + self.table_id + } + + pub fn get_partition_id(&self) -> PartitionId { + self.partition_id + } +} + +impl Display for TablePartition { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!( + f, + "TablePartition{{tableId={}, partitionId={}}}", + self.table_id, self.partition_id + ) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_resolved_partition_spec_name() { + let spec = ResolvedPartitionSpec::new( + Arc::from(["date".to_string(), "region".to_string()]), + vec!["2024-01-15".to_string(), "US".to_string()], + ) + .unwrap(); + + assert_eq!(spec.get_partition_name(), "2024-01-15$US"); + assert_eq!( + spec.get_partition_qualified_name(), + "date=2024-01-15/region=US" + ); + } + + #[test] + fn test_resolved_partition_spec_from_partition_name() { + let spec = ResolvedPartitionSpec::from_partition_name( + Arc::from(["date".to_string(), "region".to_string()]), + "2024-01-15$US", + ); + + assert_eq!(spec.get_partition_values(), &["2024-01-15", "US"]); + } + + #[test] + fn test_resolved_partition_spec_from_qualified_name() { + let spec = + ResolvedPartitionSpec::from_partition_qualified_name("date=2024-01-15/region=US") + .unwrap(); + + assert_eq!(spec.get_partition_keys(), &["date", "region"]); + assert_eq!(spec.get_partition_values(), &["2024-01-15", "US"]); + } + + #[test] + fn test_resolved_partition_spec_mismatched_lengths() { + let result = ResolvedPartitionSpec::new( + Arc::from(["date".to_string(), "region".to_string()]), + vec!["2024-01-15".to_string()], + ); + + assert!(result.is_err()); + } + + #[test] + fn test_partition_info() { + let spec = ResolvedPartitionSpec::new( + Arc::from(["date".to_string()]), + 
vec!["2024-01-15".to_string()], + ) + .unwrap(); + + let info = PartitionInfo::new(42, spec); + assert_eq!(info.get_partition_id(), 42); + assert_eq!(info.get_partition_name(), "2024-01-15"); + } + + #[test] + fn test_table_partition() { + let tp = TablePartition::new(100, 42); + assert_eq!(tp.get_table_id(), 100); + assert_eq!(tp.get_partition_id(), 42); + } + + #[test] + fn test_partition_spec_pb_roundtrip() { + let mut map = HashMap::new(); + map.insert("date".to_string(), "2024-01-15".to_string()); + let spec = PartitionSpec::new(map); + + let pb = spec.to_pb(); + let restored = PartitionSpec::from_pb(&pb); + + assert_eq!( + spec.get_spec_map().get("date"), + restored.get_spec_map().get("date") + ); + } + + #[test] + fn test_partition_info_pb_roundtrip() { + let spec = ResolvedPartitionSpec::new( + Arc::from(["date".to_string()]), + vec!["2024-01-15".to_string()], + ) + .unwrap(); + let info = PartitionInfo::new(42, spec); + + let pb = info.to_pb(); + let restored = PartitionInfo::from_pb(&pb); + + assert_eq!(info.get_partition_id(), restored.get_partition_id()); + assert_eq!(info.get_partition_name(), restored.get_partition_name()); + } + + #[test] + fn test_contains() { + let full_spec = ResolvedPartitionSpec::new( + Arc::from(["date".to_string(), "region".to_string()]), + vec!["2024-01-15".to_string(), "US".to_string()], + ) + .unwrap(); + + let partial_spec = ResolvedPartitionSpec::new( + Arc::from(["date".to_string()]), + vec!["2024-01-15".to_string()], + ) + .unwrap(); + + assert!(full_spec.contains(&partial_spec).unwrap()); + } +} diff --git a/fluss-rust/crates/fluss/src/metadata/schema_util.rs b/fluss-rust/crates/fluss/src/metadata/schema_util.rs new file mode 100644 index 0000000000..498a526eae --- /dev/null +++ b/fluss-rust/crates/fluss/src/metadata/schema_util.rs @@ -0,0 +1,204 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::error::{Error, Result}; +use crate::metadata::{Schema, UNKNOWN_COLUMN_ID}; +use std::collections::{HashMap, HashSet}; + +/// Sentinel for an expected column that does not exist in the origin +/// schema. Used by [`index_mapping`] and [`crate::row::ProjectedRow`]. +pub(crate) const UNEXIST_MAPPING: i32 = -1; + +/// For each column in `expected_schema`, return the index of the column +/// with the same id in `origin_schema`, or [`UNEXIST_MAPPING`] if absent. +/// Matching by id keeps mappings stable across `ALTER TABLE … RENAME`. 
+pub(crate) fn index_mapping(origin_schema: &Schema, expected_schema: &Schema) -> Result> { + let origin_columns = origin_schema.columns(); + let mut origin_id_to_index: HashMap = HashMap::with_capacity(origin_columns.len()); + for (i, col) in origin_columns.iter().enumerate() { + if col.id() == UNKNOWN_COLUMN_ID { + return Err(Error::RowConvertError { + message: format!( + "origin schema column '{}' has no assigned id; cannot build index mapping", + col.name() + ), + }); + } + if origin_id_to_index.insert(col.id(), i).is_some() { + return Err(Error::RowConvertError { + message: format!("duplicate column id {} in origin schema", col.id()), + }); + } + } + + let expected_columns = expected_schema.columns(); + let mut mapping = Vec::with_capacity(expected_columns.len()); + let mut expected_seen: HashSet = HashSet::with_capacity(expected_columns.len()); + + for expected in expected_columns { + if expected.id() == UNKNOWN_COLUMN_ID { + return Err(Error::RowConvertError { + message: format!( + "expected schema column '{}' has no assigned id; cannot build index mapping", + expected.name() + ), + }); + } + if !expected_seen.insert(expected.id()) { + return Err(Error::RowConvertError { + message: format!("duplicate column id {} in expected schema", expected.id()), + }); + } + match origin_id_to_index.get(&expected.id()) { + None => mapping.push(UNEXIST_MAPPING), + Some(&idx) => { + let origin = &origin_columns[idx]; + if !origin.data_type().eq_ignore_nullable(expected.data_type()) { + return Err(Error::RowConvertError { + message: format!( + "Expected datatype of column(id={},name={}) is [{}], while the actual datatype is [{}]", + expected.id(), + expected.name(), + expected.data_type(), + origin.data_type() + ), + }); + } + mapping.push(idx as i32); + } + } + } + + Ok(mapping) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::metadata::{Column, DataType, DataTypes}; + + fn schema_auto(columns: &[(&str, DataType)]) -> Schema { + let mut b = Schema::builder(); + 
for (name, dt) in columns { + b = b.column(*name, dt.clone()); + } + b.build().expect("schema build") + } + + fn schema_with_ids(columns: &[(i32, &str, DataType)]) -> Schema { + let cols: Vec = columns + .iter() + .map(|(id, name, dt)| Column::new(*name, dt.clone()).with_id(*id)) + .collect(); + Schema::builder() + .with_columns(cols) + .build() + .expect("schema build") + } + + #[test] + fn identity_mapping_when_schemas_equal() { + let s = schema_auto(&[ + ("a", DataTypes::bigint()), + ("b", DataTypes::string()), + ("c", DataTypes::int()), + ]); + assert_eq!(index_mapping(&s, &s).unwrap(), vec![0, 1, 2]); + } + + #[test] + fn projection_subset_in_order() { + let origin = schema_auto(&[ + ("a", DataTypes::bigint()), + ("b", DataTypes::string()), + ("c", DataTypes::int()), + ]); + let expected = + schema_with_ids(&[(0, "a", DataTypes::bigint()), (2, "c", DataTypes::int())]); + assert_eq!(index_mapping(&origin, &expected).unwrap(), vec![0, 2]); + } + + #[test] + fn reorder_mapping() { + let origin = schema_auto(&[ + ("a", DataTypes::bigint()), + ("b", DataTypes::string()), + ("c", DataTypes::int()), + ]); + let expected = schema_with_ids(&[ + (2, "c", DataTypes::int()), + (0, "a", DataTypes::bigint()), + (1, "b", DataTypes::string()), + ]); + assert_eq!(index_mapping(&origin, &expected).unwrap(), vec![2, 0, 1]); + } + + #[test] + fn missing_column_returns_sentinel() { + let origin = schema_auto(&[("a", DataTypes::bigint())]); + let expected = schema_with_ids(&[ + (0, "a", DataTypes::bigint()), + (1, "new_col", DataTypes::string()), + ]); + assert_eq!( + index_mapping(&origin, &expected).unwrap(), + vec![0, UNEXIST_MAPPING] + ); + } + + #[test] + fn rename_preserves_mapping_when_id_matches() { + let origin = schema_with_ids(&[(0, "old_name", DataTypes::int())]); + let expected = schema_with_ids(&[(0, "new_name", DataTypes::int())]); + assert_eq!(index_mapping(&origin, &expected).unwrap(), vec![0]); + } + + #[test] + fn drop_then_add_with_same_name_does_not_alias() { + 
let origin = schema_with_ids(&[(0, "a", DataTypes::int())]); + let expected = schema_with_ids(&[(5, "a", DataTypes::int())]); + assert_eq!( + index_mapping(&origin, &expected).unwrap(), + vec![UNEXIST_MAPPING] + ); + } + + #[test] + fn datatype_mismatch_returns_error() { + let origin = schema_auto(&[("a", DataTypes::bigint())]); + let expected = schema_with_ids(&[(0, "a", DataTypes::int())]); + let err = index_mapping(&origin, &expected).unwrap_err(); + let msg = err.to_string(); + assert!(msg.contains("id=0"), "{msg}"); + assert!(msg.contains("name=a"), "{msg}"); + assert!(msg.contains("INT"), "{msg}"); + assert!(msg.contains("BIGINT"), "{msg}"); + } + + #[test] + fn nullability_difference_does_not_error() { + // Primary-key normalization makes the origin non-nullable while + // the expected is nullable. + let origin = Schema::builder() + .column("a", DataTypes::int()) + .primary_key(["a"]) + .build() + .unwrap(); + let expected = schema_with_ids(&[(0, "a", DataTypes::int())]); + assert_eq!(index_mapping(&origin, &expected).unwrap(), vec![0]); + } +} diff --git a/fluss-rust/crates/fluss/src/metadata/table.rs b/fluss-rust/crates/fluss/src/metadata/table.rs new file mode 100644 index 0000000000..390bdbfcc9 --- /dev/null +++ b/fluss-rust/crates/fluss/src/metadata/table.rs @@ -0,0 +1,1646 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::compression::ArrowCompressionInfo; +use crate::error::Error::IllegalArgument; +use crate::error::{Error, Result}; +use crate::metadata::DataLakeFormat; +use crate::metadata::datatype::{ + DataField, DataType, RowType, UNASSIGNED_FIELD_ID, reassign_field_ids, +}; +use crate::{BucketId, PartitionId, TableId}; +use core::fmt; +use serde::{Deserialize, Serialize}; +use std::collections::{HashMap, HashSet}; +use std::fmt::{Display, Formatter}; +use std::sync::Arc; +use strum_macros::EnumString; + +/// Sentinel for a column whose stable id has not yet been assigned. +pub const UNKNOWN_COLUMN_ID: i32 = -1; + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub struct Column { + name: String, + data_type: DataType, + comment: Option, + id: i32, +} + +impl Column { + pub fn new>(name: N, data_type: DataType) -> Self { + Self { + name: name.into(), + data_type, + comment: None, + id: UNKNOWN_COLUMN_ID, + } + } + + pub fn with_comment>(mut self, comment: C) -> Self { + self.comment = Some(comment.into()); + self + } + + pub fn with_data_type(&self, data_type: DataType) -> Self { + Self { + name: self.name.clone(), + data_type: data_type.clone(), + comment: self.comment.clone(), + id: self.id, + } + } + + pub fn with_id(mut self, id: i32) -> Self { + self.id = id; + self + } + + // Getters... 
+ pub fn name(&self) -> &str { + &self.name + } + + pub fn data_type(&self) -> &DataType { + &self.data_type + } + + pub fn comment(&self) -> Option<&str> { + self.comment.as_deref() + } + + /// Returns the stable column id, or [`UNKNOWN_COLUMN_ID`] when the + /// id has not yet been assigned by a [`SchemaBuilder`]. + pub fn id(&self) -> i32 { + self.id + } +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub struct PrimaryKey { + constraint_name: String, + column_names: Vec, +} + +impl PrimaryKey { + pub fn new>(constraint_name: N, column_names: Vec) -> Self { + Self { + constraint_name: constraint_name.into(), + column_names, + } + } + + // Getters... + pub fn constraint_name(&self) -> &str { + &self.constraint_name + } + + pub fn column_names(&self) -> &[String] { + &self.column_names + } +} + +fn collect_field_id_state(data_type: &DataType, max_id: &mut i32, has_unassigned: &mut bool) { + match data_type { + DataType::Row(rt) => { + for f in rt.fields() { + if f.field_id == UNASSIGNED_FIELD_ID { + *has_unassigned = true; + } else { + *max_id = (*max_id).max(f.field_id); + } + collect_field_id_state(&f.data_type, max_id, has_unassigned); + } + } + DataType::Array(at) => { + collect_field_id_state(at.get_element_type(), max_id, has_unassigned); + } + DataType::Map(mt) => { + collect_field_id_state(mt.key_type(), max_id, has_unassigned); + collect_field_id_state(mt.value_type(), max_id, has_unassigned); + } + _ => {} + } +} + +fn collect_nested_field_ids(data_type: &DataType, ids: &mut Vec) { + match data_type { + DataType::Row(rt) => { + for f in rt.fields() { + if f.field_id != UNASSIGNED_FIELD_ID { + ids.push(f.field_id); + } + collect_nested_field_ids(&f.data_type, ids); + } + } + DataType::Array(at) => collect_nested_field_ids(at.get_element_type(), ids), + DataType::Map(mt) => { + collect_nested_field_ids(mt.key_type(), ids); + collect_nested_field_ids(mt.value_type(), ids); + } + _ => {} + } +} + +#[derive(Debug, Clone, Serialize, 
Deserialize, PartialEq, Eq)] +pub struct Schema { + columns: Vec, + primary_key: Option, + row_type: RowType, + auto_increment_col_names: Vec, + highest_field_id: i32, +} + +impl Schema { + pub fn empty() -> Result { + Self::builder().build() + } + + pub fn builder() -> SchemaBuilder { + SchemaBuilder::new() + } + + pub fn columns(&self) -> &[Column] { + &self.columns + } + + pub fn primary_key(&self) -> Option<&PrimaryKey> { + self.primary_key.as_ref() + } + + pub fn row_type(&self) -> &RowType { + &self.row_type + } + + pub fn primary_key_indexes(&self) -> Vec { + self.primary_key + .as_ref() + .map(|pk| { + pk.column_names + .iter() + .filter_map(|name| self.columns.iter().position(|c| &c.name == name)) + .collect() + }) + .unwrap_or_default() + } + + pub fn primary_key_column_names(&self) -> Vec<&str> { + self.primary_key + .as_ref() + .map(|pk| pk.column_names.iter().map(|s| s.as_str()).collect()) + .unwrap_or_default() + } + + pub fn column_names(&self) -> Vec<&str> { + self.columns.iter().map(|c| c.name.as_str()).collect() + } + + pub fn auto_increment_col_names(&self) -> &Vec { + &self.auto_increment_col_names + } + + pub fn highest_field_id(&self) -> i32 { + self.highest_field_id + } +} + +/// A schema together with its server-assigned version id. 
+#[derive(Debug, Clone, PartialEq, Eq)] +pub struct SchemaInfo { + schema: Schema, + schema_id: i32, +} + +impl SchemaInfo { + pub fn new(schema: Schema, schema_id: i32) -> Self { + Self { schema, schema_id } + } + + pub fn schema(&self) -> &Schema { + &self.schema + } + + pub fn schema_id(&self) -> i32 { + self.schema_id + } + + pub fn into_parts(self) -> (Schema, i32) { + (self.schema, self.schema_id) + } +} + +#[derive(Debug, Default)] +pub struct SchemaBuilder { + columns: Vec, + primary_key: Option, + auto_increment_col_names: Vec, +} + +impl SchemaBuilder { + pub fn new() -> Self { + Self::default() + } + + pub fn with_row_type(mut self, row_type: &DataType) -> Self { + match row_type { + DataType::Row(row) => { + for data_field in row.fields() { + self = self.column(&data_field.name, data_field.data_type.clone()) + } + self + } + _ => { + panic!("data type must be row type") + } + } + } + + pub fn column>(mut self, name: N, data_type: DataType) -> Self { + self.columns.push(Column::new(name.into(), data_type)); + self + } + + pub fn with_columns(mut self, columns: Vec) -> Self { + self.columns.extend_from_slice(columns.as_ref()); + self + } + + pub fn with_comment>(mut self, comment: C) -> Self { + if let Some(last) = self.columns.last_mut() { + *last = last.clone().with_comment(comment.into()); + } + self + } + + pub fn primary_key(self, column_names: I) -> Self + where + I: IntoIterator, + S: Into, + { + let names: Vec = column_names.into_iter().map(|s| s.into()).collect(); + + let constraint_name = format!("PK_{}", names.join("_")); + + self.primary_key_named(&constraint_name, names) + } + + pub fn primary_key_named, P: Into>( + mut self, + constraint_name: N, + column_names: Vec

, + ) -> Self { + self.primary_key = Some(PrimaryKey::new( + constraint_name.into(), + column_names.into_iter().map(|s| s.into()).collect(), + )); + self + } + + /// Declares a column to be auto-incremented. With an auto-increment column in the table, + /// whenever a new row is inserted into the table, the new row will be assigned with the next + /// available value from the auto-increment sequence. A table can have at most one auto + /// increment column. + pub fn enable_auto_increment>(mut self, column_name: N) -> Result { + if !self.auto_increment_col_names.is_empty() { + return Err(IllegalArgument { + message: "Multiple auto increment columns are not supported yet.".to_string(), + }); + } + + self.auto_increment_col_names.push(column_name.into()); + Ok(self) + } + + pub fn build(&self) -> Result { + let columns = Self::normalize_columns(&self.columns, self.primary_key.as_ref())?; + let (columns_with_ids, highest_field_id) = Self::assign_all_field_ids(columns)?; + + let column_names: HashSet<_> = columns_with_ids.iter().map(|c| &c.name).collect(); + for auto_inc_col in &self.auto_increment_col_names { + if !column_names.contains(auto_inc_col) { + return Err(IllegalArgument { + message: format!( + "Auto increment column '{auto_inc_col}' is not found in the schema columns." 
+ ), + }); + } + } + + let data_fields = columns_with_ids + .iter() + .map(|c| DataField { + name: c.name.clone(), + data_type: c.data_type.clone(), + description: c.comment.clone(), + field_id: c.id, + }) + .collect(); + + Ok(Schema { + columns: columns_with_ids, + primary_key: self.primary_key.clone(), + row_type: RowType::new(data_fields), + auto_increment_col_names: self.auto_increment_col_names.clone(), + highest_field_id, + }) + } + + fn assign_all_field_ids(columns: Vec) -> Result<(Vec, i32)> { + let with_top_id = columns.iter().filter(|c| c.id != UNKNOWN_COLUMN_ID).count(); + let none_set = with_top_id == 0; + let all_top_set = with_top_id == columns.len(); + + if !none_set && !all_top_set { + return Err(IllegalArgument { + message: "All columns must have an id assigned, or none of them must.".to_string(), + }); + } + + let mut max_nested_id = -1_i32; + let mut has_unassigned_nested = false; + for c in &columns { + collect_field_id_state(&c.data_type, &mut max_nested_id, &mut has_unassigned_nested); + } + + if all_top_set && !has_unassigned_nested { + let mut seen: HashSet = HashSet::new(); + let mut max_id = -1_i32; + for col in &columns { + if col.id < 0 { + return Err(IllegalArgument { + message: format!( + "Column '{}' has invalid id {}; ids must be non-negative", + col.name, col.id + ), + }); + } + if !seen.insert(col.id) { + return Err(IllegalArgument { + message: format!("Duplicate field id {} in schema", col.id), + }); + } + max_id = max_id.max(col.id); + + let mut nested_ids = Vec::new(); + collect_nested_field_ids(&col.data_type, &mut nested_ids); + for id in nested_ids { + if id < 0 { + return Err(IllegalArgument { + message: format!( + "Nested DataField in column '{}' has invalid id {}; ids must be non-negative", + col.name, id + ), + }); + } + if !seen.insert(id) { + return Err(IllegalArgument { + message: format!( + "Duplicate field id {} in schema (column '{}')", + id, col.name + ), + }); + } + } + } + max_id = max_id.max(max_nested_id); + 
return Ok((columns, max_id)); + } + + if all_top_set && has_unassigned_nested { + return Err(IllegalArgument { + message: "Top-level column ids are set but some nested DataField ids are unassigned; reassign all or none." + .to_string(), + }); + } + + let mut counter: i32 = -1; + let new_columns: Vec = columns + .into_iter() + .map(|c| { + counter += 1; + let id = counter; + let new_data_type = reassign_field_ids(&c.data_type, &mut counter); + Column { + name: c.name, + data_type: new_data_type, + comment: c.comment, + id, + } + }) + .collect(); + Ok((new_columns, counter)) + } + + /// All-or-none: preserve ids if every column has one, auto-assign + /// 0..N-1 if none do, error on mixed input. When preserving ids, + /// also reject duplicates and negative-but-not-sentinel values. + #[allow(dead_code)] + fn assign_column_ids(columns: Vec) -> Result> { + let with_id = columns.iter().filter(|c| c.id != UNKNOWN_COLUMN_ID).count(); + if with_id == 0 { + return Ok(columns + .into_iter() + .enumerate() + .map(|(i, c)| c.with_id(i as i32)) + .collect()); + } + if with_id != columns.len() { + return Err(IllegalArgument { + message: "All columns must have an id assigned, or none of them must.".to_string(), + }); + } + let mut seen: HashSet = HashSet::with_capacity(columns.len()); + for col in &columns { + if col.id < 0 { + return Err(IllegalArgument { + message: format!( + "Column '{}' has invalid id {}; ids must be non-negative", + col.name, col.id + ), + }); + } + if !seen.insert(col.id) { + return Err(IllegalArgument { + message: format!("Duplicate column id {} in schema", col.id), + }); + } + } + Ok(columns) + } + + fn normalize_columns( + columns: &[Column], + primary_key: Option<&PrimaryKey>, + ) -> Result> { + let names: Vec<_> = columns.iter().map(|c| &c.name).collect(); + if let Some(duplicates) = Self::find_duplicates(&names) { + return Err(Error::invalid_table(format!( + "Duplicate column names found: {duplicates:?}" + ))); + } + + let Some(pk) = primary_key else { 
+ return Ok(columns.to_vec()); + }; + + let pk_set: HashSet<_> = pk.column_names.iter().collect(); + let all_columns: HashSet<_> = columns.iter().map(|c| &c.name).collect(); + if !pk_set.is_subset(&all_columns) { + return Err(Error::invalid_table(format!( + "Primary key columns {pk_set:?} not found in schema" + ))); + } + + Ok(columns + .iter() + .map(|col| { + if pk_set.contains(&col.name) && col.data_type.is_nullable() { + col.with_data_type(col.data_type.as_non_nullable()) + } else { + col.clone() + } + }) + .collect()) + } + + fn find_duplicates<'a>(names: &'a [&String]) -> Option> { + let mut seen = HashSet::new(); + let mut duplicates = HashSet::new(); + + for name in names { + if !seen.insert(name) { + duplicates.insert(*name); + } + } + + if duplicates.is_empty() { + None + } else { + Some(duplicates) + } + } +} + +/// distribution of table +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct TableDistribution { + bucket_count: Option, + bucket_keys: Vec, +} + +impl TableDistribution { + pub fn bucket_keys(&self) -> &[String] { + &self.bucket_keys + } + + pub fn bucket_count(&self) -> Option { + self.bucket_count + } +} + +#[derive(Debug, Default)] +pub struct TableDescriptorBuilder { + schema: Option, + properties: HashMap, + custom_properties: HashMap, + partition_keys: Arc<[String]>, + comment: Option, + table_distribution: Option, +} + +impl TableDescriptorBuilder { + pub fn new() -> Self { + Self::default() + } + + pub fn schema(mut self, schema: Schema) -> Self { + self.schema = Some(schema); + self + } + + pub fn log_format(mut self, log_format: LogFormat) -> Self { + self.properties + .insert("table.log.format".to_string(), log_format.to_string()); + self + } + + pub fn kv_format(mut self, kv_format: KvFormat) -> Self { + self.properties + .insert("table.kv.format".to_string(), kv_format.to_string()); + self + } + + pub fn property, V: Into>(mut self, key: K, value: V) -> Self { + self.properties.insert(key.into(), value.into()); 
+ self + } + + pub fn properties, V: Into>( + mut self, + properties: HashMap, + ) -> Self { + for (k, v) in properties { + self.properties.insert(k.into(), v.into()); + } + self + } + + pub fn custom_property, V: Into>(mut self, key: K, value: V) -> Self { + self.custom_properties.insert(key.into(), value.into()); + self + } + + pub fn custom_properties, V: Into>( + mut self, + custom_properties: HashMap, + ) -> Self { + for (k, v) in custom_properties { + self.custom_properties.insert(k.into(), v.into()); + } + self + } + + pub fn partitioned_by>(mut self, partition_keys: Vec

) -> Self { + self.partition_keys = Arc::from( + partition_keys + .into_iter() + .map(|s| s.into()) + .collect::>(), + ); + self + } + + pub fn distributed_by(mut self, bucket_count: Option, bucket_keys: Vec) -> Self { + self.table_distribution = Some(TableDistribution { + bucket_count, + bucket_keys, + }); + self + } + + pub fn comment>(mut self, comment: S) -> Self { + self.comment = Some(comment.into()); + self + } + + pub fn build(self) -> Result { + let schema = self.schema.expect("Schema must be set"); + let table_distribution = TableDescriptor::normalize_distribution( + &schema, + &self.partition_keys, + self.table_distribution, + )?; + Ok(TableDescriptor { + schema, + comment: self.comment, + partition_keys: self.partition_keys, + table_distribution, + properties: self.properties, + custom_properties: self.custom_properties, + }) + } +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct TableDescriptor { + schema: Schema, + comment: Option, + partition_keys: Arc<[String]>, + table_distribution: Option, + properties: HashMap, + custom_properties: HashMap, +} + +impl TableDescriptor { + pub fn builder() -> TableDescriptorBuilder { + TableDescriptorBuilder::new() + } + + pub fn schema(&self) -> &Schema { + &self.schema + } + + pub fn bucket_keys(&self) -> Vec<&str> { + self.table_distribution + .as_ref() + .map(|td| td.bucket_keys.iter().map(|s| s.as_str()).collect()) + .unwrap_or_default() + } + + pub fn is_default_bucket_key(&self) -> Result { + if self.schema.primary_key().is_some() { + Ok(self.bucket_keys() + == Self::default_bucket_key_of_primary_key_table( + self.schema(), + &self.partition_keys, + )? 
+ .iter() + .map(|s| s.as_str()) + .collect::>()) + } else { + Ok(self.bucket_keys().is_empty()) + } + } + + pub fn is_partitioned(&self) -> bool { + !self.partition_keys.is_empty() + } + + pub fn has_primary_key(&self) -> bool { + self.schema.primary_key().is_some() + } + + pub fn partition_keys(&self) -> &[String] { + &self.partition_keys + } + + pub fn table_distribution(&self) -> Option<&TableDistribution> { + self.table_distribution.as_ref() + } + + pub fn properties(&self) -> &HashMap { + &self.properties + } + + pub fn custom_properties(&self) -> &HashMap { + &self.custom_properties + } + + pub fn replication_factor(&self) -> Result { + self.properties + .get("table.replication.factor") + .ok_or_else(|| Error::invalid_table("Replication factor is not set"))? + .parse() + .map_err(|_e| Error::invalid_table("Replication factor can't be converted to int")) + } + + pub fn with_properties, V: Into>( + &self, + new_properties: HashMap, + ) -> Self { + let mut properties = HashMap::new(); + for (k, v) in new_properties { + properties.insert(k.into(), v.into()); + } + Self { + properties, + ..self.clone() + } + } + + pub fn with_replication_factor(&self, new_replication_factor: i32) -> Self { + let mut properties = self.properties.clone(); + properties.insert( + "table.replication.factor".to_string(), + new_replication_factor.to_string(), + ); + self.with_properties(properties) + } + + pub fn with_bucket_count(&self, new_bucket_count: i32) -> Self { + Self { + table_distribution: Some(TableDistribution { + bucket_count: Some(new_bucket_count), + bucket_keys: self + .table_distribution + .as_ref() + .map(|td| td.bucket_keys.clone()) + .unwrap_or_default(), + }), + ..self.clone() + } + } + + pub fn comment(&self) -> Option<&str> { + self.comment.as_deref() + } + + fn default_bucket_key_of_primary_key_table( + schema: &Schema, + partition_keys: &[String], + ) -> Result> { + let mut bucket_keys = schema + .primary_key() + .expect("Primary key must be set") + 
.column_names() + .to_vec(); + + bucket_keys.retain(|k| !partition_keys.contains(k)); + + if bucket_keys.is_empty() { + return Err(Error::invalid_table(format!( + "Primary Key constraint {:?} should not be same with partition fields {:?}.", + schema.primary_key().unwrap().column_names(), + partition_keys + ))); + } + + Ok(bucket_keys) + } + + fn normalize_distribution( + schema: &Schema, + partition_keys: &[String], + origin_distribution: Option, + ) -> Result> { + if let Some(distribution) = origin_distribution { + if distribution + .bucket_keys + .iter() + .any(|k| partition_keys.contains(k)) + { + return Err(Error::invalid_table(format!( + "Bucket key {:?} shouldn't include any column in partition keys {:?}.", + distribution.bucket_keys, partition_keys + ))); + } + + return if let Some(pk) = schema.primary_key() { + if distribution.bucket_keys.is_empty() { + Ok(Some(TableDistribution { + bucket_count: distribution.bucket_count, + bucket_keys: Self::default_bucket_key_of_primary_key_table( + schema, + partition_keys, + )?, + })) + } else { + let pk_columns: HashSet<_> = pk.column_names().iter().collect(); + if !distribution + .bucket_keys + .iter() + .all(|k| pk_columns.contains(k)) + { + return Err(Error::invalid_table(format!( + "Bucket keys must be a subset of primary keys excluding partition keys for primary-key tables. 
\ + The primary keys are {:?}, the partition keys are {:?}, but the user-defined bucket keys are {:?}.", + pk.column_names(), + partition_keys, + distribution.bucket_keys + ))); + } + Ok(Some(distribution)) + } + } else { + Ok(Some(distribution)) + }; + } else if schema.primary_key().is_some() { + return Ok(Some(TableDistribution { + bucket_count: None, + bucket_keys: Self::default_bucket_key_of_primary_key_table(schema, partition_keys)?, + })); + } + + Ok(None) + } +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub enum LogFormat { + ARROW, + INDEXED, +} + +impl Display for LogFormat { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + match self { + LogFormat::ARROW => { + write!(f, "ARROW")?; + } + LogFormat::INDEXED => { + write!(f, "INDEXED")?; + } + } + Ok(()) + } +} + +impl LogFormat { + pub fn parse(s: &str) -> Result { + match s.to_uppercase().as_str() { + "ARROW" => Ok(LogFormat::ARROW), + "INDEXED" => Ok(LogFormat::INDEXED), + _ => Err(Error::invalid_table(format!("Unknown log format: {s}"))), + } + } +} + +#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, EnumString)] +pub enum KvFormat { + INDEXED, + COMPACTED, +} + +impl Display for KvFormat { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + match self { + KvFormat::COMPACTED => write!(f, "COMPACTED")?, + KvFormat::INDEXED => write!(f, "INDEXED")?, + } + Ok(()) + } +} + +impl KvFormat { + pub fn parse(s: &str) -> Result { + match s.to_uppercase().as_str() { + "INDEXED" => Ok(KvFormat::INDEXED), + "COMPACTED" => Ok(KvFormat::COMPACTED), + _ => Err(Error::invalid_table(format!("Unknown kv format: {s}"))), + } + } +} + +#[derive(Debug, PartialEq, Eq, Hash, Clone, Serialize, Deserialize)] +pub struct TablePath { + database: String, + table: String, +} + +impl Display for TablePath { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!(f, "{}.{}", self.database, self.table) + } +} + +const MAX_NAME_LENGTH: usize = 200; + +const 
INTERNAL_NAME_PREFIX: &str = "__"; + +impl TablePath { + pub fn new, T: Into>(db: D, tbl: T) -> Self { + TablePath { + database: db.into(), + table: tbl.into(), + } + } + + #[inline] + pub fn database(&self) -> &str { + &self.database + } + + #[inline] + pub fn table(&self) -> &str { + &self.table + } + + pub fn detect_invalid_name(identifier: &str) -> Option { + if identifier.is_empty() { + return Some("the empty string is not allowed".to_string()); + } + if identifier == "." { + return Some("'.' is not allowed".to_string()); + } + if identifier == ".." { + return Some("'..' is not allowed".to_string()); + } + if identifier.len() > MAX_NAME_LENGTH { + return Some(format!( + "the length of '{identifier}' is longer than the max allowed length {MAX_NAME_LENGTH}" + )); + } + if Self::contains_invalid_pattern(identifier) { + return Some(format!( + "'{identifier}' contains one or more characters other than ASCII alphanumerics, '_' and '-'" + )); + } + None + } + + pub fn validate_prefix(identifier: &str) -> Option { + if identifier.starts_with(INTERNAL_NAME_PREFIX) { + return Some(format!( + "'{INTERNAL_NAME_PREFIX}' is not allowed as prefix, since it is reserved for internal databases/internal tables/internal partitions in Fluss server" + )); + } + None + } + + // Valid characters for Fluss table names are the ASCII alphanumerics, '_' and '-'. + fn contains_invalid_pattern(identifier: &str) -> bool { + for c in identifier.chars() { + let valid_char = c.is_ascii_alphanumeric() || c == '_' || c == '-'; + if !valid_char { + return true; + } + } + false + } +} + +/// A database name, table name and partition name combo. It's used to represent the physical path of +/// a bucket. If the bucket belongs to a partition (i.e., the table is a partitioned table), +/// `partition_name` will be `Some(...)`; otherwise, it will be `None`. 
+#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct PhysicalTablePath { + table_path: Arc, + partition_name: Option, +} + +impl PhysicalTablePath { + pub fn of(table_path: Arc) -> Self { + Self { + table_path, + partition_name: None, + } + } + + pub fn of_partitioned(table_path: Arc, partition_name: Option) -> Self { + Self { + table_path, + partition_name, + } + } + + pub fn of_with_names, T: Into, P: Into>( + database_name: D, + table_name: T, + partition_name: Option

, + ) -> Self { + Self { + table_path: Arc::new(TablePath::new(database_name, table_name)), + partition_name: partition_name.map(|p| p.into()), + } + } + + pub fn get_table_path(&self) -> &TablePath { + &self.table_path + } + + pub fn get_database_name(&self) -> &str { + self.table_path.database() + } + + pub fn get_table_name(&self) -> &str { + self.table_path.table() + } + + pub fn get_partition_name(&self) -> Option<&String> { + self.partition_name.as_ref() + } +} + +impl Display for PhysicalTablePath { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + match &self.partition_name { + Some(partition) => write!(f, "{}(p={})", self.table_path, partition), + None => write!(f, "{}", self.table_path), + } + } +} + +#[derive(Debug, Clone)] +pub struct TableInfo { + pub table_path: TablePath, + pub table_id: TableId, + pub schema_id: i32, + pub schema: Schema, + pub row_type: RowType, + pub primary_keys: Vec, + pub physical_primary_keys: Vec, + pub bucket_keys: Vec, + pub partition_keys: Arc<[String]>, + pub num_buckets: i32, + pub properties: HashMap, + pub table_config: TableConfig, + pub custom_properties: HashMap, + pub comment: Option, + pub created_time: i64, + pub modified_time: i64, +} + +impl TableInfo { + pub fn row_type(&self) -> &RowType { + &self.row_type + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct AutoPartitionStrategy { + auto_partition_enabled: bool, + auto_partition_key: Option, + auto_partition_time_unit: String, + auto_partition_num_precreate: i32, + auto_partition_num_retention: i32, + auto_partition_timezone: String, +} + +impl AutoPartitionStrategy { + pub fn from(properties: &HashMap) -> Self { + Self { + auto_partition_enabled: properties + .get("table.auto-partition.enabled") + .and_then(|s| s.parse().ok()) + .unwrap_or(false), + auto_partition_key: properties + .get("table.auto-partition.key") + .map(|s| s.to_string()), + auto_partition_time_unit: properties + 
.get("table.auto-partition.time-unit") + .map(|s| s.to_string()) + .unwrap_or_else(|| "DAY".to_string()), + auto_partition_num_precreate: properties + .get("table.auto-partition.num-precreate") + .and_then(|s| s.parse().ok()) + .unwrap_or(2), + auto_partition_num_retention: properties + .get("table.auto-partition.num-retention") + .and_then(|s| s.parse().ok()) + .unwrap_or(7), + auto_partition_timezone: properties + .get("table.auto-partition.time-zone") + .map(|s| s.to_string()) + .unwrap_or_else(|| { + jiff::tz::TimeZone::system() + .iana_name() + .unwrap_or("UTC") + .to_string() + }), + } + } + + pub fn is_auto_partition_enabled(&self) -> bool { + self.auto_partition_enabled + } + + pub fn key(&self) -> Option<&str> { + self.auto_partition_key.as_deref() + } + + pub fn time_unit(&self) -> &str { + &self.auto_partition_time_unit + } + + pub fn num_precreate(&self) -> i32 { + self.auto_partition_num_precreate + } + + pub fn num_retention(&self) -> i32 { + self.auto_partition_num_retention + } + + pub fn timezone(&self) -> &str { + &self.auto_partition_timezone + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct TableConfig { + pub properties: HashMap, +} + +impl TableConfig { + pub fn from_properties(properties: HashMap) -> Self { + TableConfig { properties } + } + + pub fn get_arrow_compression_info(&self) -> Result { + ArrowCompressionInfo::from_conf(&self.properties) + } + + /// Returns the data lake format if configured, or None if not set. 
+ pub fn get_datalake_format(&self) -> Result> { + self.properties + .get("table.datalake.format") + .map(|f| f.parse().map_err(Error::from)) + .transpose() + } + + pub fn get_kv_format(&self) -> Result { + // TODO: Consolidate configurations logic, constants, defaults in a single place + const DEFAULT_KV_FORMAT: &str = "COMPACTED"; + let kv_format = self + .properties + .get("table.kv.format") + .map(String::as_str) + .unwrap_or(DEFAULT_KV_FORMAT); + kv_format.parse().map_err(Into::into) + } + + pub fn get_log_format(&self) -> Result { + // TODO: Consolidate configurations logic, constants, defaults in a single place + const DEFAULT_LOG_FORMAT: &str = "ARROW"; + let log_format = self + .properties + .get("table.log.format") + .map(String::as_str) + .unwrap_or(DEFAULT_LOG_FORMAT); + LogFormat::parse(log_format) + } + + pub fn get_auto_partition_strategy(&self) -> AutoPartitionStrategy { + AutoPartitionStrategy::from(&self.properties) + } +} + +impl TableInfo { + pub fn of( + table_path: TablePath, + table_id: i64, + schema_id: i32, + table_descriptor: TableDescriptor, + created_time: i64, + modified_time: i64, + ) -> TableInfo { + let TableDescriptor { + schema, + table_distribution, + comment, + partition_keys, + properties, + custom_properties, + } = table_descriptor; + let TableDistribution { + bucket_count, + bucket_keys, + } = table_distribution.unwrap(); + TableInfo::new( + table_path, + table_id, + schema_id, + schema, + bucket_keys, + partition_keys, + bucket_count.unwrap(), + properties, + custom_properties, + comment, + created_time, + modified_time, + ) + } + + #[allow(clippy::too_many_arguments)] + pub fn new( + table_path: TablePath, + table_id: TableId, + schema_id: i32, + schema: Schema, + bucket_keys: Vec, + partition_keys: Arc<[String]>, + num_buckets: i32, + properties: HashMap, + custom_properties: HashMap, + comment: Option, + created_time: i64, + modified_time: i64, + ) -> Self { + let row_type = schema.row_type.clone(); + let primary_keys: Vec 
= schema + .primary_key_column_names() + .iter() + .map(|col| (*col).to_string()) + .collect(); + let physical_primary_keys = + Self::generate_physical_primary_key(&primary_keys, &partition_keys); + let table_config = TableConfig::from_properties(properties.clone()); + + TableInfo { + table_path, + table_id, + schema_id, + schema, + row_type, + primary_keys, + physical_primary_keys, + bucket_keys, + partition_keys, + num_buckets, + properties, + table_config, + custom_properties, + comment, + created_time, + modified_time, + } + } + + pub fn get_table_path(&self) -> &TablePath { + &self.table_path + } + + pub fn get_table_id(&self) -> i64 { + self.table_id + } + + pub fn get_schema_id(&self) -> i32 { + self.schema_id + } + + pub fn get_schema(&self) -> &Schema { + &self.schema + } + + pub fn get_row_type(&self) -> &RowType { + &self.row_type + } + + pub fn has_primary_key(&self) -> bool { + !self.primary_keys.is_empty() + } + + pub fn get_primary_keys(&self) -> &Vec { + &self.primary_keys + } + + pub fn get_physical_primary_keys(&self) -> &[String] { + &self.physical_primary_keys + } + + pub fn has_bucket_key(&self) -> bool { + !self.bucket_keys.is_empty() + } + + pub fn is_default_bucket_key(&self) -> bool { + if self.has_primary_key() { + self.bucket_keys == self.physical_primary_keys + } else { + self.bucket_keys.is_empty() + } + } + + pub fn get_bucket_keys(&self) -> &[String] { + &self.bucket_keys + } + + pub fn is_partitioned(&self) -> bool { + !self.partition_keys.is_empty() + } + + pub fn is_auto_partitioned(&self) -> bool { + self.is_partitioned() + && self + .table_config + .get_auto_partition_strategy() + .is_auto_partition_enabled() + } + + pub fn get_partition_keys(&self) -> &Arc<[String]> { + &self.partition_keys + } + + pub fn get_num_buckets(&self) -> i32 { + self.num_buckets + } + + pub fn get_properties(&self) -> &HashMap { + &self.properties + } + + pub fn get_table_config(&self) -> &TableConfig { + &self.table_config + } + + pub fn 
get_custom_properties(&self) -> &HashMap { + &self.custom_properties + } + + pub fn get_comment(&self) -> Option<&str> { + self.comment.as_deref() + } + + pub fn get_created_time(&self) -> i64 { + self.created_time + } + + pub fn get_modified_time(&self) -> i64 { + self.modified_time + } + + pub fn to_table_descriptor(&self) -> Result { + let mut builder = TableDescriptor::builder() + .schema(self.schema.clone()) + .partitioned_by(self.partition_keys.to_vec()) + .distributed_by(Some(self.num_buckets), self.bucket_keys.clone()) + .properties(self.properties.clone()) + .custom_properties(self.custom_properties.clone()); + + if let Some(comment) = &self.comment { + builder = builder.comment(comment.clone()); + } + + builder.build() + } + + fn generate_physical_primary_key( + primary_keys: &[String], + partition_keys: &[String], + ) -> Vec { + primary_keys + .iter() + .filter(|pk| !partition_keys.contains(*pk)) + .cloned() + .collect() + } +} + +impl Display for TableInfo { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!( + f, + "TableInfo{{ table_path={:?}, table_id={}, schema_id={}, schema={:?}, physical_primary_keys={:?}, bucket_keys={:?}, partition_keys={:?}, num_buckets={}, properties={:?}, custom_properties={:?}, comment={:?}, created_time={}, modified_time={} }}", + self.table_path, + self.table_id, + self.schema_id, + self.schema, + self.physical_primary_keys, + self.bucket_keys, + self.partition_keys, + self.num_buckets, + self.properties, + self.custom_properties, + self.comment, + self.created_time, + self.modified_time + ) + } +} + +#[derive(Debug, Clone, Serialize, Deserialize, Hash, PartialEq, Eq)] +pub struct TableBucket { + table_id: TableId, + partition_id: Option, + bucket: BucketId, +} + +impl TableBucket { + pub fn new(table_id: TableId, bucket: BucketId) -> Self { + Self { + table_id, + partition_id: None, + bucket, + } + } + + pub fn new_with_partition( + table_id: TableId, + partition_id: Option, + bucket: BucketId, + ) -> Self { 
+ TableBucket { + table_id, + partition_id, + bucket, + } + } + + pub fn table_id(&self) -> TableId { + self.table_id + } + + pub fn bucket_id(&self) -> BucketId { + self.bucket + } + + pub fn partition_id(&self) -> Option { + self.partition_id + } +} + +impl Display for TableBucket { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + if let Some(partition_id) = self.partition_id { + write!( + f, + "TableBucket(table_id={}, partition_id={}, bucket={})", + self.table_id, partition_id, self.bucket + ) + } else { + write!( + f, + "TableBucket(table_id={}, bucket={})", + self.table_id, self.bucket + ) + } + } +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct LakeSnapshot { + pub snapshot_id: i64, + pub table_buckets_offset: HashMap, +} + +impl LakeSnapshot { + pub fn new(snapshot_id: i64, table_buckets_offset: HashMap) -> Self { + Self { + snapshot_id, + table_buckets_offset, + } + } + + pub fn snapshot_id(&self) -> i64 { + self.snapshot_id + } + + pub fn table_buckets_offset(&self) -> &HashMap { + &self.table_buckets_offset + } +} + +/// Tests for [`TablePath`]. 
+#[cfg(test)] +mod tests { + use super::*; + use crate::metadata::DataTypes; + + #[test] + fn test_validate() { + // assert valid name + let path = TablePath::new("db_2-abc3".to_string(), "table-1_abc_2".to_string()); + assert!(TablePath::detect_invalid_name(path.database()).is_none()); + assert!(TablePath::detect_invalid_name(path.table()).is_none()); + assert_eq!(path.to_string(), "db_2-abc3.table-1_abc_2"); + + // assert invalid name prefix + assert!( + TablePath::validate_prefix("__table-1") + .unwrap() + .contains("'__' is not allowed as prefix") + ); + + // check max length + let long_name = "a".repeat(200); + assert!(TablePath::detect_invalid_name(&long_name).is_none()); + + // assert invalid names + assert_invalid_name("*abc", "'*abc' contains one or more characters other than"); + assert_invalid_name( + "table.abc", + "'table.abc' contains one or more characters other than", + ); + assert_invalid_name("", "the empty string is not allowed"); + assert_invalid_name(" ", "' ' contains one or more characters other than"); + assert_invalid_name(".", "'.' is not allowed"); + assert_invalid_name("..", "'..' 
is not allowed"); + let invalid_long_name = "a".repeat(201); + assert_invalid_name( + &invalid_long_name, + &format!( + "the length of '{invalid_long_name}' is longer than the max allowed length {MAX_NAME_LENGTH}" + ), + ); + } + + fn assert_invalid_name(name: &str, expected_message: &str) { + let result = TablePath::detect_invalid_name(name); + assert!( + result.is_some(), + "Expected '{name}' to be invalid, but it was valid" + ); + assert!( + result.as_ref().unwrap().contains(expected_message), + "Expected message containing '{}', but got '{}'", + expected_message, + result.unwrap() + ); + } + + #[test] + fn test_is_auto_partitioned() { + let schema = Schema::builder() + .column("id", DataTypes::int()) + .column("name", DataTypes::string()) + .primary_key(vec!["id".to_string()]) + .build() + .unwrap(); + + let table_path = TablePath::new("db".to_string(), "tbl".to_string()); + + // 1. Not partitioned, auto partition disabled + let mut properties = HashMap::new(); + let table_info = TableInfo::new( + table_path.clone(), + 1, + 1, + schema.clone(), + vec!["id".to_string()], + Arc::from(vec![]), // No partition keys + 1, + properties.clone(), + HashMap::new(), + None, + 0, + 0, + ); + assert!(!table_info.is_auto_partitioned()); + + // 2. Not partitioned, auto partition enabled + properties.insert( + "table.auto-partition.enabled".to_string(), + "true".to_string(), + ); + let table_info = TableInfo::new( + table_path.clone(), + 1, + 1, + schema.clone(), + vec!["id".to_string()], + Arc::from(vec![]), // No partition keys + 1, + properties.clone(), + HashMap::new(), + None, + 0, + 0, + ); + assert!(!table_info.is_auto_partitioned()); + + // 3. 
Partitioned, auto partition disabled + properties.insert( + "table.auto-partition.enabled".to_string(), + "false".to_string(), + ); + let table_info = TableInfo::new( + table_path.clone(), + 1, + 1, + schema.clone(), + vec!["id".to_string()], + Arc::from(vec!["name".to_string()]), // Partition keys + 1, + properties.clone(), + HashMap::new(), + None, + 0, + 0, + ); + assert!(!table_info.is_auto_partitioned()); + + // 4. Partitioned, auto partition enabled + properties.insert( + "table.auto-partition.enabled".to_string(), + "true".to_string(), + ); + let table_info = TableInfo::new( + table_path.clone(), + 1, + 1, + schema.clone(), + vec!["id".to_string()], + Arc::from(vec!["name".to_string()]), // Partition keys + 1, + properties.clone(), + HashMap::new(), + None, + 0, + 0, + ); + assert!(table_info.is_auto_partitioned()); + } +} diff --git a/fluss-rust/crates/fluss/src/metrics.rs b/fluss-rust/crates/fluss/src/metrics.rs new file mode 100644 index 0000000000..7c62738c4e --- /dev/null +++ b/fluss-rust/crates/fluss/src/metrics.rs @@ -0,0 +1,617 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Metric name constants and helpers for fluss-rust client instrumentation. +//! +//! 
Uses the [`metrics`] crate facade pattern: library code emits metrics via +//! `counter!`/`gauge!`/`histogram!` macros, and the application installs a +//! recorder (e.g. `metrics-exporter-prometheus`) to collect them. When no +//! recorder is installed, all metric calls are no-ops with zero overhead. + +use crate::metadata::TablePath; +use crate::rpc::ApiKey; + +// --------------------------------------------------------------------------- +// Label keys +// --------------------------------------------------------------------------- + +pub const LABEL_API_KEY: &str = "api_key"; + +/// Identifies the database and table for per-table scanner metrics. +pub const LABEL_DATABASE: &str = "database"; +pub const LABEL_TABLE: &str = "table"; + +// --------------------------------------------------------------------------- +// Connection / RPC metrics +// +// Java reference: ConnectionMetrics.java, ClientMetricGroup.java, MetricNames.java +// +// Byte counting matches Java semantics: both sides count only the API message +// body, excluding the protocol header and framing. +// Java: rawRequest.totalSize() / response.totalSize() (see MessageCodec.java). +// Rust: buf.len() - REQUEST_HEADER_LENGTH for sent bytes, +// buffer.len() - cursor.position() for received bytes. 
+// --------------------------------------------------------------------------- + +pub const CLIENT_REQUESTS_TOTAL: &str = "fluss.client.requests.total"; +pub const CLIENT_RESPONSES_TOTAL: &str = "fluss.client.responses.total"; +pub const CLIENT_BYTES_SENT_TOTAL: &str = "fluss.client.bytes_sent.total"; +pub const CLIENT_BYTES_RECEIVED_TOTAL: &str = "fluss.client.bytes_received.total"; +pub const CLIENT_REQUEST_LATENCY_MS: &str = "fluss.client.request_latency_ms"; +pub const CLIENT_REQUESTS_IN_FLIGHT: &str = "fluss.client.requests_in_flight"; + +// --------------------------------------------------------------------------- +// Scanner poll-timing metrics +// +// Java reference: ScannerMetricGroup.java, LogScannerImpl.java +// +// These track consumer liveness and processing efficiency at the `poll()` +// boundary. Java records via `volatile long` fields read by gauge suppliers; +// Rust snapshots the values at poll start/end. +// +// Java's `lastPollSecondsAgo` gauge is intentionally NOT ported. Java +// implements it as a gauge supplier evaluated at scrape time, which the +// `metrics` crate facade has no equivalent for. A snapshot-at-poll-start +// port would just duplicate `time_between_poll_ms / 1000` and would not +// advance while a consumer is hung — defeating the metric's purpose +// (detecting a stuck consumer). Revisit if the `metrics` crate gains a +// supplier abstraction or we add a background liveness task. +// --------------------------------------------------------------------------- + +/// Gauge: milliseconds between the start of consecutive `poll()` calls. A +/// large value usually means the consumer's downstream processing is slow. +pub const SCANNER_TIME_BETWEEN_POLL_MS: &str = "fluss.client.scanner.time_between_poll_ms"; + +/// Gauge: fraction of wall-clock time spent inside `poll()` — +/// `poll_time_ms / (poll_time_ms + time_between_poll_ms)`. 
A value near 1.0 +/// means the scanner is starved for data; a low value means the consumer is +/// the bottleneck. +pub const SCANNER_POLL_IDLE_RATIO: &str = "fluss.client.scanner.poll_idle_ratio"; + +// --------------------------------------------------------------------------- +// Scanner fetch + remote download metrics +// +// Fetch metrics are recorded in the LogFetcher fetch loop on response +// completion. Remote metrics are recorded inside RemoteLogDownloader's +// download task. +// +// Java uses a volatile-long gauge for fetch latency and Counter+MeterView +// for rates. Rust uses a histogram for latency (richer percentile data) +// and counters for throughput; the recorder/exporter handles rate +// computation (e.g. Prometheus `rate()`). +// +// Java emits one `ScannerMetricGroup` per (database, table); Rust matches +// that by attaching `database` + `table` labels to every scanner metric +// (see `ScannerMetrics` below). +// --------------------------------------------------------------------------- + +/// Histogram: elapsed ms for each successful FetchLog RPC. +pub const SCANNER_FETCH_LATENCY_MS: &str = "fluss.client.scanner.fetch_latency_ms"; + +/// Counter: total FetchLog RPC requests attempted after connection acquisition. +pub const SCANNER_FETCH_REQUESTS_TOTAL: &str = "fluss.client.scanner.fetch_requests.total"; + +/// Histogram: serialized bytes per successful FetchLog response. +pub const SCANNER_BYTES_PER_REQUEST: &str = "fluss.client.scanner.bytes_per_request"; + +/// Counter: total remote log download attempts (includes per-segment retries). +pub const SCANNER_REMOTE_FETCH_REQUESTS_TOTAL: &str = + "fluss.client.scanner.remote_fetch_requests.total"; + +/// Counter: total bytes downloaded from remote log storage. +pub const SCANNER_REMOTE_FETCH_BYTES_TOTAL: &str = "fluss.client.scanner.remote_fetch_bytes.total"; + +/// Counter: total remote log download failures (each retry attempt counts). 
+pub const SCANNER_REMOTE_FETCH_ERRORS_TOTAL: &str = + "fluss.client.scanner.remote_fetch_errors.total"; + +// --------------------------------------------------------------------------- +// Per-table scanner metric handles +// --------------------------------------------------------------------------- + +/// Cached `(database, table)`-labeled scanner metric handles. +/// +/// Adding a new scanner metric: declare the constant above, add one +/// field plus an initializer line in [`Self::new`] using the matching +/// `scanner_{gauge,counter,histogram}` helper, and a `record_*` method. +/// The helpers are the single source of truth for the label set, so a +/// future label addition (e.g. `cluster_id`) is a one-line change. +/// +/// # Recorder binding +/// +/// `metrics::counter!(...)` / `gauge!(...)` / `histogram!(...)` resolve +/// the recorder at the macro callsite. Because this struct caches the +/// returned handles, every cached handle is bound to whichever recorder +/// is installed when [`Self::new`] runs. Construct the scanner *after* +/// installing the production recorder; in tests, construct it inside +/// the `metrics::with_local_recorder(...)` closure. With no recorder +/// installed, all `record_*` calls are zero-overhead no-ops. +pub(crate) struct ScannerMetrics { + time_between_poll_ms: metrics::Gauge, + poll_idle_ratio: metrics::Gauge, + fetch_requests_total: metrics::Counter, + fetch_latency_ms: metrics::Histogram, + bytes_per_request: metrics::Histogram, + remote_fetch_requests_total: metrics::Counter, + remote_fetch_bytes_total: metrics::Counter, + remote_fetch_errors_total: metrics::Counter, +} + +impl ScannerMetrics { + /// Build a fresh handle cache for `table_path`. Resolves the + /// currently installed recorder once per metric. 
+ pub(crate) fn new(table_path: &TablePath) -> Self { + let database = table_path.database(); + let table = table_path.table(); + Self { + time_between_poll_ms: scanner_gauge(SCANNER_TIME_BETWEEN_POLL_MS, database, table), + poll_idle_ratio: scanner_gauge(SCANNER_POLL_IDLE_RATIO, database, table), + fetch_requests_total: scanner_counter(SCANNER_FETCH_REQUESTS_TOTAL, database, table), + fetch_latency_ms: scanner_histogram(SCANNER_FETCH_LATENCY_MS, database, table), + bytes_per_request: scanner_histogram(SCANNER_BYTES_PER_REQUEST, database, table), + remote_fetch_requests_total: scanner_counter( + SCANNER_REMOTE_FETCH_REQUESTS_TOTAL, + database, + table, + ), + remote_fetch_bytes_total: scanner_counter( + SCANNER_REMOTE_FETCH_BYTES_TOTAL, + database, + table, + ), + remote_fetch_errors_total: scanner_counter( + SCANNER_REMOTE_FETCH_ERRORS_TOTAL, + database, + table, + ), + } + } + + pub(crate) fn record_time_between_poll_ms(&self, value: f64) { + self.time_between_poll_ms.set(value); + } + + pub(crate) fn record_poll_idle_ratio(&self, value: f64) { + self.poll_idle_ratio.set(value); + } + + pub(crate) fn record_fetch_request(&self) { + self.fetch_requests_total.increment(1); + } + + pub(crate) fn record_fetch_latency_ms(&self, value: f64) { + self.fetch_latency_ms.record(value); + } + + pub(crate) fn record_bytes_per_request(&self, value: f64) { + self.bytes_per_request.record(value); + } + + pub(crate) fn record_remote_fetch_request(&self) { + self.remote_fetch_requests_total.increment(1); + } + + pub(crate) fn record_remote_fetch_bytes(&self, bytes: u64) { + self.remote_fetch_bytes_total.increment(bytes); + } + + pub(crate) fn record_remote_fetch_error(&self) { + self.remote_fetch_errors_total.increment(1); + } +} + +// Per-table scanner handle factories. These centralize the +// `(database, table)` label set so a future schema change (renaming a +// label, adding `cluster_id`, etc.) 
is a one-line edit instead of +// touching every callsite in `ScannerMetrics::new`. + +fn scanner_gauge(name: &'static str, database: &str, table: &str) -> metrics::Gauge { + metrics::gauge!( + name, + LABEL_DATABASE => database.to_string(), + LABEL_TABLE => table.to_string(), + ) +} + +fn scanner_counter(name: &'static str, database: &str, table: &str) -> metrics::Counter { + metrics::counter!( + name, + LABEL_DATABASE => database.to_string(), + LABEL_TABLE => table.to_string(), + ) +} + +fn scanner_histogram(name: &'static str, database: &str, table: &str) -> metrics::Histogram { + metrics::histogram!( + name, + LABEL_DATABASE => database.to_string(), + LABEL_TABLE => table.to_string(), + ) +} + +/// Returns a label value for reportable API keys, matching Java's +/// `ConnectionMetrics.REPORT_API_KEYS` filter (`ProduceLog`, `FetchLog`, +/// `PutKv`, `Lookup`). Returns `None` for admin/metadata/auth calls to +/// avoid metric cardinality bloat. +pub(crate) fn api_key_label(api_key: ApiKey) -> Option<&'static str> { + match api_key { + ApiKey::ProduceLog => Some("produce_log"), + ApiKey::FetchLog => Some("fetch_log"), + ApiKey::PutKv => Some("put_kv"), + ApiKey::Lookup => Some("lookup"), + _ => None, + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::test_utils::assert_scanner_entries_labeled; + use metrics_util::debugging::DebuggingRecorder; + + macro_rules! find_counter { + ($entries:expr, $name:expr) => { + $entries.iter().find_map(|(key, _, _, val)| { + if key.key().name() == $name { + match val { + metrics_util::debugging::DebugValue::Counter(v) => Some(*v), + _ => None, + } + } else { + None + } + }) + }; + } + + macro_rules! 
find_histogram { + ($entries:expr, $name:expr) => { + $entries.iter().find_map(|(key, _, _, val)| { + if key.key().name() == $name { + match val { + metrics_util::debugging::DebugValue::Histogram(v) => { + Some(v.iter().map(|f| f.into_inner()).collect::>()) + } + _ => None, + } + } else { + None + } + }) + }; + } + + macro_rules! find_gauge { + ($entries:expr, $name:expr) => { + $entries.iter().find_map(|(key, _, _, val)| { + if key.key().name() == $name { + match val { + metrics_util::debugging::DebugValue::Gauge(g) => Some(g.into_inner()), + _ => None, + } + } else { + None + } + }) + }; + } + + #[test] + fn reportable_api_keys_return_label() { + assert_eq!(api_key_label(ApiKey::ProduceLog), Some("produce_log")); + assert_eq!(api_key_label(ApiKey::FetchLog), Some("fetch_log")); + assert_eq!(api_key_label(ApiKey::PutKv), Some("put_kv")); + assert_eq!(api_key_label(ApiKey::Lookup), Some("lookup")); + } + + #[test] + fn non_reportable_api_keys_return_none() { + assert_eq!(api_key_label(ApiKey::MetaData), None); + assert_eq!(api_key_label(ApiKey::CreateTable), None); + assert_eq!(api_key_label(ApiKey::Authenticate), None); + assert_eq!(api_key_label(ApiKey::ListDatabases), None); + assert_eq!(api_key_label(ApiKey::GetTable), None); + } + + #[test] + fn reportable_request_records_all_connection_metrics() { + let recorder = DebuggingRecorder::new(); + let snapshotter = recorder.snapshotter(); + + metrics::with_local_recorder(&recorder, || { + let label = api_key_label(ApiKey::ProduceLog).unwrap(); + + metrics::counter!(CLIENT_REQUESTS_TOTAL, LABEL_API_KEY => label).increment(1); + metrics::counter!(CLIENT_BYTES_SENT_TOTAL, LABEL_API_KEY => label).increment(256); + metrics::gauge!(CLIENT_REQUESTS_IN_FLIGHT, LABEL_API_KEY => label).increment(1.0); + + metrics::counter!(CLIENT_RESPONSES_TOTAL, LABEL_API_KEY => label).increment(1); + metrics::counter!(CLIENT_BYTES_RECEIVED_TOTAL, LABEL_API_KEY => label).increment(128); + metrics::histogram!(CLIENT_REQUEST_LATENCY_MS, 
LABEL_API_KEY => label).record(42.5); + metrics::gauge!(CLIENT_REQUESTS_IN_FLIGHT, LABEL_API_KEY => label).decrement(1.0); + }); + + let snapshot = snapshotter.snapshot(); + let entries: Vec<_> = snapshot.into_vec(); + + assert_eq!(find_counter!(entries, CLIENT_REQUESTS_TOTAL), Some(1)); + assert_eq!(find_counter!(entries, CLIENT_RESPONSES_TOTAL), Some(1)); + assert_eq!(find_counter!(entries, CLIENT_BYTES_SENT_TOTAL), Some(256)); + assert_eq!( + find_counter!(entries, CLIENT_BYTES_RECEIVED_TOTAL), + Some(128) + ); + assert_eq!( + find_histogram!(entries, CLIENT_REQUEST_LATENCY_MS), + Some(vec![42.5]) + ); + assert_eq!(find_gauge!(entries, CLIENT_REQUESTS_IN_FLIGHT), Some(0.0)); + + let has_label = entries.iter().all(|(key, _, _, _)| { + key.key() + .labels() + .any(|l| l.key() == LABEL_API_KEY && l.value() == "produce_log") + }); + assert!(has_label, "all metrics must carry the api_key label"); + } + + #[test] + fn non_reportable_request_records_no_metrics() { + let recorder = DebuggingRecorder::new(); + let snapshotter = recorder.snapshotter(); + + metrics::with_local_recorder(&recorder, || { + let label = api_key_label(ApiKey::MetaData); + assert!(label.is_none()); + // When label is None, no metrics calls are made (matching request() logic). 
+ }); + + let snapshot = snapshotter.snapshot(); + assert!( + snapshot.into_vec().is_empty(), + "non-reportable API keys must not produce metrics" + ); + } + + #[test] + fn inflight_gauge_nets_to_zero_after_balanced_calls() { + let recorder = DebuggingRecorder::new(); + let snapshotter = recorder.snapshotter(); + + metrics::with_local_recorder(&recorder, || { + let label = api_key_label(ApiKey::FetchLog).unwrap(); + + // Simulate 3 concurrent requests completing + for _ in 0..3 { + metrics::gauge!(CLIENT_REQUESTS_IN_FLIGHT, LABEL_API_KEY => label).increment(1.0); + } + for _ in 0..3 { + metrics::gauge!(CLIENT_REQUESTS_IN_FLIGHT, LABEL_API_KEY => label).decrement(1.0); + } + }); + + let snapshot = snapshotter.snapshot(); + let entries: Vec<_> = snapshot.into_vec(); + assert_eq!( + find_gauge!(entries, CLIENT_REQUESTS_IN_FLIGHT), + Some(0.0), + "in-flight gauge should be 0 after balanced inc/dec" + ); + } + + #[test] + fn different_api_keys_produce_separate_metric_series() { + use std::collections::HashMap; + + let recorder = DebuggingRecorder::new(); + let snapshotter = recorder.snapshotter(); + + metrics::with_local_recorder(&recorder, || { + let produce_label = api_key_label(ApiKey::ProduceLog).unwrap(); + let fetch_label = api_key_label(ApiKey::FetchLog).unwrap(); + + metrics::counter!(CLIENT_REQUESTS_TOTAL, LABEL_API_KEY => produce_label).increment(5); + metrics::counter!(CLIENT_REQUESTS_TOTAL, LABEL_API_KEY => fetch_label).increment(3); + }); + + let snapshot = snapshotter.snapshot(); + let entries: Vec<_> = snapshot.into_vec(); + + let request_entries: Vec<_> = entries + .iter() + .filter(|(key, _, _, _)| key.key().name() == CLIENT_REQUESTS_TOTAL) + .collect(); + + assert_eq!( + request_entries.len(), + 2, + "produce_log and fetch_log should be separate metric series" + ); + + let mut counter_by_api_key: HashMap = HashMap::new(); + for (key, _, _, val) in request_entries { + let api_key = key + .key() + .labels() + .find(|label| label.key() == LABEL_API_KEY) + 
.map(|label| label.value()) + .expect("requests total metric must include api_key label"); + + let counter_value = match val { + metrics_util::debugging::DebugValue::Counter(v) => *v, + other => panic!("expected Counter, got {other:?}"), + }; + + counter_by_api_key.insert(api_key.to_string(), counter_value); + } + + assert_eq!(counter_by_api_key.get("produce_log"), Some(&5)); + assert_eq!(counter_by_api_key.get("fetch_log"), Some(&3)); + } + + #[test] + fn scanner_poll_timing_metrics_emit_correctly() { + let recorder = DebuggingRecorder::new(); + let snapshotter = recorder.snapshotter(); + + metrics::with_local_recorder(&recorder, || { + let table_path = TablePath::new("db", "tbl"); + let m = ScannerMetrics::new(&table_path); + m.record_time_between_poll_ms(200.0); + m.record_poll_idle_ratio(0.8); + }); + + let snapshot = snapshotter.snapshot(); + let entries: Vec<_> = snapshot.into_vec(); + + assert_eq!( + find_gauge!(entries, SCANNER_TIME_BETWEEN_POLL_MS), + Some(200.0) + ); + assert_eq!(find_gauge!(entries, SCANNER_POLL_IDLE_RATIO), Some(0.8)); + assert_scanner_entries_labeled(&entries, "db", "tbl"); + } + + #[test] + fn scanner_fetch_metrics_emit_correctly() { + let recorder = DebuggingRecorder::new(); + let snapshotter = recorder.snapshotter(); + + metrics::with_local_recorder(&recorder, || { + let table_path = TablePath::new("db", "tbl"); + let m = ScannerMetrics::new(&table_path); + m.record_fetch_request(); + m.record_fetch_latency_ms(15.5); + m.record_bytes_per_request(4096.0); + }); + + let snapshot = snapshotter.snapshot(); + let entries: Vec<_> = snapshot.into_vec(); + + assert_eq!( + find_counter!(entries, SCANNER_FETCH_REQUESTS_TOTAL), + Some(1) + ); + assert_eq!( + find_histogram!(entries, SCANNER_FETCH_LATENCY_MS), + Some(vec![15.5]) + ); + assert_eq!( + find_histogram!(entries, SCANNER_BYTES_PER_REQUEST), + Some(vec![4096.0]) + ); + assert_scanner_entries_labeled(&entries, "db", "tbl"); + } + + #[test] + fn 
scanner_remote_fetch_metrics_emit_correctly() { + let recorder = DebuggingRecorder::new(); + let snapshotter = recorder.snapshotter(); + + metrics::with_local_recorder(&recorder, || { + let table_path = TablePath::new("db", "tbl"); + let m = ScannerMetrics::new(&table_path); + m.record_remote_fetch_request(); + m.record_remote_fetch_request(); + m.record_remote_fetch_request(); + m.record_remote_fetch_bytes(1024); + m.record_remote_fetch_error(); + }); + + let snapshot = snapshotter.snapshot(); + let entries: Vec<_> = snapshot.into_vec(); + + assert_eq!( + find_counter!(entries, SCANNER_REMOTE_FETCH_REQUESTS_TOTAL), + Some(3) + ); + assert_eq!( + find_counter!(entries, SCANNER_REMOTE_FETCH_BYTES_TOTAL), + Some(1024) + ); + assert_eq!( + find_counter!(entries, SCANNER_REMOTE_FETCH_ERRORS_TOTAL), + Some(1) + ); + assert_scanner_entries_labeled(&entries, "db", "tbl"); + } + + /// Two scanners on different tables must produce independent metric + /// series. + #[test] + fn different_table_paths_produce_separate_metric_series() { + use std::collections::HashMap; + + let recorder = DebuggingRecorder::new(); + let snapshotter = recorder.snapshotter(); + + metrics::with_local_recorder(&recorder, || { + let m1 = ScannerMetrics::new(&TablePath::new("db1", "t1")); + let m2 = ScannerMetrics::new(&TablePath::new("db2", "t2")); + + for _ in 0..5 { + m1.record_fetch_request(); + } + for _ in 0..3 { + m2.record_fetch_request(); + } + }); + + let snapshot = snapshotter.snapshot(); + let entries: Vec<_> = snapshot.into_vec(); + + let request_entries: Vec<_> = entries + .iter() + .filter(|(key, _, _, _)| key.key().name() == SCANNER_FETCH_REQUESTS_TOTAL) + .collect(); + + assert_eq!( + request_entries.len(), + 2, + "(db1,t1) and (db2,t2) must be separate metric series" + ); + + let mut counter_by_table: HashMap<(String, String), u64> = HashMap::new(); + for (key, _, _, val) in request_entries { + let mut database = None; + let mut table = None; + for label in key.key().labels() { + if 
label.key() == LABEL_DATABASE { + database = Some(label.value().to_string()); + } else if label.key() == LABEL_TABLE { + table = Some(label.value().to_string()); + } + } + let database = database.expect("scanner metric must include database label"); + let table = table.expect("scanner metric must include table label"); + let counter_value = match val { + metrics_util::debugging::DebugValue::Counter(v) => *v, + other => panic!("expected Counter, got {other:?}"), + }; + counter_by_table.insert((database, table), counter_value); + } + + assert_eq!( + counter_by_table.get(&("db1".to_string(), "t1".to_string())), + Some(&5), + ); + assert_eq!( + counter_by_table.get(&("db2".to_string(), "t2".to_string())), + Some(&3), + ); + } +} diff --git a/fluss-rust/crates/fluss/src/record/arrow.rs b/fluss-rust/crates/fluss/src/record/arrow.rs new file mode 100644 index 0000000000..b97fc120de --- /dev/null +++ b/fluss-rust/crates/fluss/src/record/arrow.rs @@ -0,0 +1,2320 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::client::{LogWriteRecord, Record, WriteRecord}; +use crate::compression::{ + ArrowCompressionInfo, ArrowCompressionRatioEstimator, ArrowCompressionType, +}; +use crate::error::{Error, Result}; +use crate::metadata::{DataField, DataType, RowType}; +use crate::record::{ChangeType, ScanRecord}; +use crate::row::column_writer::{ColumnWriter, round_up_to_8}; +use crate::row::{ColumnarRow, InternalRow, arrow_row_column_indices, fluss_row_column_indices}; +use arrow::array::{ArrayBuilder, ArrayRef}; +use arrow::{ + array::RecordBatch, + buffer::Buffer, + ipc::{ + CompressionType, + reader::{StreamReader, read_record_batch}, + root_as_message, + writer::StreamWriter, + }, +}; +use arrow_schema::ArrowError::ParseError; +use arrow_schema::SchemaRef; +use arrow_schema::{DataType as ArrowDataType, Field}; +use byteorder::WriteBytesExt; +use byteorder::{ByteOrder, LittleEndian}; +use bytes::Bytes; +use crc32c::crc32c; +use std::{ + cell::Cell, + collections::HashMap, + fs::File, + io::{Cursor, Read, Seek, SeekFrom, Write}, + path::PathBuf, + sync::Arc, +}; + +use crate::error::Error::IllegalArgument; +use arrow::ipc::writer::IpcWriteOptions; +/// const for record batch +pub const BASE_OFFSET_LENGTH: usize = 8; +pub const LENGTH_LENGTH: usize = 4; +pub const MAGIC_LENGTH: usize = 1; +pub const COMMIT_TIMESTAMP_LENGTH: usize = 8; +pub const CRC_LENGTH: usize = 4; +pub const SCHEMA_ID_LENGTH: usize = 2; +pub const ATTRIBUTE_LENGTH: usize = 1; +pub const LAST_OFFSET_DELTA_LENGTH: usize = 4; +pub const WRITE_CLIENT_ID_LENGTH: usize = 8; +pub const BATCH_SEQUENCE_LENGTH: usize = 4; +pub const RECORDS_COUNT_LENGTH: usize = 4; + +pub const BASE_OFFSET_OFFSET: usize = 0; +pub const LENGTH_OFFSET: usize = BASE_OFFSET_OFFSET + BASE_OFFSET_LENGTH; +pub const MAGIC_OFFSET: usize = LENGTH_OFFSET + LENGTH_LENGTH; +pub const COMMIT_TIMESTAMP_OFFSET: usize = MAGIC_OFFSET + MAGIC_LENGTH; +pub const CRC_OFFSET: usize = COMMIT_TIMESTAMP_OFFSET + COMMIT_TIMESTAMP_LENGTH; +pub const 
SCHEMA_ID_OFFSET: usize = CRC_OFFSET + CRC_LENGTH; +pub const ATTRIBUTES_OFFSET: usize = SCHEMA_ID_OFFSET + SCHEMA_ID_LENGTH; +pub const LAST_OFFSET_DELTA_OFFSET: usize = ATTRIBUTES_OFFSET + ATTRIBUTE_LENGTH; +pub const WRITE_CLIENT_ID_OFFSET: usize = LAST_OFFSET_DELTA_OFFSET + LAST_OFFSET_DELTA_LENGTH; +pub const BATCH_SEQUENCE_OFFSET: usize = WRITE_CLIENT_ID_OFFSET + WRITE_CLIENT_ID_LENGTH; +pub const RECORDS_COUNT_OFFSET: usize = BATCH_SEQUENCE_OFFSET + BATCH_SEQUENCE_LENGTH; +pub const RECORDS_OFFSET: usize = RECORDS_COUNT_OFFSET + RECORDS_COUNT_LENGTH; + +pub const RECORD_BATCH_HEADER_SIZE: usize = RECORDS_OFFSET; +pub const ARROW_CHANGETYPE_OFFSET: usize = RECORD_BATCH_HEADER_SIZE; +pub const LOG_OVERHEAD: usize = LENGTH_OFFSET + LENGTH_LENGTH; + +/// Maximum batch size matches Java's Integer.MAX_VALUE limit. +/// Java uses int type for batch size, so max value is 2^31 - 1 = 2,147,483,647 bytes (~2GB). +/// This is the implicit limit in FileLogRecords.java and other Java components. +pub const MAX_BATCH_SIZE: usize = i32::MAX as usize; // 2,147,483,647 bytes (~2GB) + +/// const for record +/// The "magic" values. +#[derive(Debug, Clone, Copy)] +pub enum LogMagicValue { + V0 = 0, +} + +/// Safely convert batch size from i32 to usize with validation. 
+/// +/// Validates that: +/// - batch_size_bytes is non-negative +/// - batch_size_bytes + LOG_OVERHEAD doesn't overflow +/// - Result is within reasonable bounds +fn validate_batch_size(batch_size_bytes: i32) -> Result { + // Check for negative size (corrupted data) + if batch_size_bytes < 0 { + return Err(Error::UnexpectedError { + message: format!("Invalid negative batch size: {batch_size_bytes}"), + source: None, + }); + } + + let batch_size_u = batch_size_bytes as usize; + + // Check for overflow when adding LOG_OVERHEAD + let total_size = + batch_size_u + .checked_add(LOG_OVERHEAD) + .ok_or_else(|| Error::UnexpectedError { + message: format!( + "Batch size {batch_size_u} + LOG_OVERHEAD {LOG_OVERHEAD} would overflow" + ), + source: None, + })?; + + // Sanity check: reject unreasonably large batches + if total_size > MAX_BATCH_SIZE { + return Err(Error::UnexpectedError { + message: format!( + "Batch size {total_size} exceeds maximum allowed size {MAX_BATCH_SIZE}" + ), + source: None, + }); + } + + Ok(total_size) +} + +// NOTE: Rust layout/offsets currently match Java only for V0. +// TODO: Add V1 layout/offsets to keep parity with Java's V1 format. +pub const CURRENT_LOG_MAGIC_VALUE: u8 = LogMagicValue::V0 as u8; + +/// Value used if writer ID is not available or non-idempotent. +pub const NO_WRITER_ID: i64 = -1; + +/// Value used if batch sequence is not available. +pub const NO_BATCH_SEQUENCE: i32 = -1; + +pub const BUILDER_DEFAULT_OFFSET: i64 = 0; + +/// Initial capacity for Arrow column vectors (pre-allocation hint, not a record cap). +/// Matching Java's `ArrowWriter.INITIAL_CAPACITY`. +const INITIAL_ROW_CAPACITY: usize = 1024; + +/// Fraction of the allocated buffer used as the effective write limit. +/// Matching Java's `ArrowWriter.BUFFER_USAGE_RATIO`. 
+const BUFFER_USAGE_RATIO: f32 = 0.95; + +pub struct MemoryLogRecordsArrowBuilder { + base_log_offset: i64, + schema_id: i32, + magic: u8, + writer_id: i64, + batch_sequence: i32, + arrow_record_batch_builder: Box, + is_closed: bool, + arrow_compression_info: ArrowCompressionInfo, + /// Effective write limit in bytes (after applying BUFFER_USAGE_RATIO). + write_limit: usize, + /// Pre-computed Arrow IPC overhead (metadata + body framing) for this schema. + /// Constant per schema+compression combination. + ipc_overhead: usize, + /// Estimated record count at which the next byte-size check should occur. + /// -1 means "unknown — check on the next append". Updated dynamically to + /// skip expensive `estimated_size_in_bytes()` calls on every append. + /// Matching Java's `ArrowWriter.estimatedMaxRecordsCount`. + estimated_max_records_count: Cell, + /// Compression ratio estimator shared across batches for the same table. + compression_ratio_estimator: Arc, + /// Snapshot of the compression ratio at batch creation time. + /// Matching Java's `ArrowWriter.estimatedCompressionRatio` which is + /// cached per batch and only refreshed on `reset()`. + estimated_compression_ratio: f32, +} + +pub trait ArrowRecordBatchInnerBuilder: Send { + fn build_arrow_record_batch(&mut self) -> Result>; + + fn append(&mut self, row: &dyn InternalRow) -> Result; + + fn append_batch(&mut self, record_batch: Arc) -> Result; + + fn schema(&self) -> SchemaRef; + + fn records_count(&self) -> i32; + + fn is_full(&self) -> bool; + + /// Get an estimate of the size in bytes of the arrow data. 
+ fn estimated_size_in_bytes(&self) -> usize; +} + +#[derive(Default)] +pub struct PrebuiltRecordBatchBuilder { + arrow_record_batch: Option>, + records_count: i32, +} + +impl ArrowRecordBatchInnerBuilder for PrebuiltRecordBatchBuilder { + fn build_arrow_record_batch(&mut self) -> Result> { + Ok(self.arrow_record_batch.as_ref().unwrap().clone()) + } + + fn append(&mut self, _row: &dyn InternalRow) -> Result { + // append one single row is not supported, return false directly + Ok(false) + } + + fn append_batch(&mut self, record_batch: Arc) -> Result { + if self.arrow_record_batch.is_some() { + return Ok(false); + } + self.records_count = record_batch.num_rows() as i32; + self.arrow_record_batch = Some(record_batch); + Ok(true) + } + + fn schema(&self) -> SchemaRef { + self.arrow_record_batch.as_ref().unwrap().schema() + } + + fn records_count(&self) -> i32 { + self.records_count + } + + fn is_full(&self) -> bool { + // full if has one record batch + self.arrow_record_batch.is_some() + } + + fn estimated_size_in_bytes(&self) -> usize { + self.arrow_record_batch + .as_ref() + .map(|batch| batch.get_array_memory_size()) + .unwrap_or(0) + } +} + +pub struct RowAppendRecordBatchBuilder { + table_schema: SchemaRef, + column_writers: Vec, + records_count: i32, +} + +impl RowAppendRecordBatchBuilder { + pub fn new(row_type: &RowType) -> Result { + let capacity = INITIAL_ROW_CAPACITY; + let schema_ref = to_arrow_schema(row_type)?; + let writers: Result> = row_type + .fields() + .iter() + .enumerate() + .map(|(pos, field)| { + let arrow_type = schema_ref.field(pos).data_type(); + ColumnWriter::create(field.data_type(), arrow_type, pos, capacity) + }) + .collect(); + Ok(Self { + table_schema: schema_ref.clone(), + column_writers: writers?, + records_count: 0, + }) + } + /// Appends a row to the builder. + pub fn append(&mut self, row: &dyn InternalRow) -> Result { + ArrowRecordBatchInnerBuilder::append(self, row) + } + + /// Builds the final Arrow RecordBatch. 
+ pub fn build_arrow_record_batch(&mut self) -> Result> { + ArrowRecordBatchInnerBuilder::build_arrow_record_batch(self) + } +} + +impl ArrowRecordBatchInnerBuilder for RowAppendRecordBatchBuilder { + fn build_arrow_record_batch(&mut self) -> Result> { + let arrays: Result> = self + .column_writers + .iter_mut() + .enumerate() + .map(|(idx, writer)| { + let array = writer.finish(); + let expected_type = self.table_schema.field(idx).data_type(); + + // Validate array type matches schema + if array.data_type() != expected_type { + return Err(Error::IllegalArgument { + message: format!( + "Builder type mismatch at column {}: expected {:?}, got {:?}", + idx, + expected_type, + array.data_type() + ), + }); + } + + Ok(array) + }) + .collect(); + + Ok(Arc::new(RecordBatch::try_new( + self.table_schema.clone(), + arrays?, + )?)) + } + + fn append(&mut self, row: &dyn InternalRow) -> Result { + for writer in &mut self.column_writers { + writer.write_field(row)?; + } + self.records_count += 1; + Ok(true) + } + + fn append_batch(&mut self, _record_batch: Arc) -> Result { + Ok(false) + } + + fn schema(&self) -> SchemaRef { + self.table_schema.clone() + } + + fn records_count(&self) -> i32 { + self.records_count + } + + fn is_full(&self) -> bool { + // Size-based fullness is handled by MemoryLogRecordsArrowBuilder, + // which accounts for metadata length and compression ratio. + false + } + + fn estimated_size_in_bytes(&self) -> usize { + // Returns the uncompressed Arrow IPC body size by reading buffer lengths + // directly from the builders — O(num_columns), zero allocation. + // Analogous to Java's `ArrowUtils.estimateArrowBodyLength()`. + // Java reads exact IPC buffer sizes from vectors; we read builder + // buffer lengths. The IPC framing overhead is accounted for + // separately by `ipc_overhead`. 
+ self.column_writers.iter().map(|w| w.buffer_size()).sum() + } +} + +// TODO: Pool and reuse MemoryLogRecordsArrowBuilder instances per table/schema like +// Java's ArrowWriterPool. Reused writers can seed `estimated_max_records_count` from +// the previous batch (recordsCount / 2) for a warm start, avoiding the first-record +// size check on every new batch. +impl MemoryLogRecordsArrowBuilder { + pub fn new( + schema_id: i32, + row_type: &RowType, + to_append_record_batch: bool, + arrow_compression_info: ArrowCompressionInfo, + write_limit: usize, + compression_ratio_estimator: Arc, + ) -> Result { + let arrow_batch_builder: Box = { + if to_append_record_batch { + Box::new(PrebuiltRecordBatchBuilder::default()) + } else { + Box::new(RowAppendRecordBatchBuilder::new(row_type)?) + } + }; + let schema = to_arrow_schema(row_type)?; + let ipc_overhead = + estimate_arrow_ipc_overhead(&schema, arrow_compression_info.get_compression_type())?; + let effective_limit = (write_limit as f32 * BUFFER_USAGE_RATIO) as usize; + let estimated_compression_ratio = compression_ratio_estimator.estimation(); + Ok(MemoryLogRecordsArrowBuilder { + base_log_offset: BUILDER_DEFAULT_OFFSET, + schema_id, + magic: CURRENT_LOG_MAGIC_VALUE, + writer_id: NO_WRITER_ID, + batch_sequence: NO_BATCH_SEQUENCE, + is_closed: false, + arrow_record_batch_builder: arrow_batch_builder, + arrow_compression_info, + write_limit: effective_limit, + ipc_overhead, + estimated_max_records_count: Cell::new(-1), + compression_ratio_estimator, + estimated_compression_ratio, + }) + } + + pub fn append(&mut self, record: &WriteRecord) -> Result { + match &record.record() { + Record::Log(log_write_record) => match log_write_record { + LogWriteRecord::InternalRow(row) => { + Ok(self.arrow_record_batch_builder.append(*row)?) 
+ } + LogWriteRecord::RecordBatch(record_batch) => Ok(self + .arrow_record_batch_builder + .append_batch(record_batch.clone())?), + }, + Record::Kv(_) => Err(Error::UnsupportedOperation { + message: "Only LogRecord is supported to append".to_string(), + }), + } + // todo: consider write other change type + } + + /// Check if the builder is full based on estimated serialized size. + /// + /// Uses a threshold-based optimization to skip expensive size checks: + /// only computes the actual estimated size when the record count reaches + /// the predicted threshold. Matching Java's `ArrowWriter.isFull()`. + pub fn is_full(&self) -> bool { + // Delegate to inner builder first (e.g. PrebuiltRecordBatchBuilder + // is always full after one batch, regardless of size). + if self.arrow_record_batch_builder.is_full() { + return true; + } + let records_count = self.arrow_record_batch_builder.records_count(); + let threshold = self.estimated_max_records_count.get(); + if records_count > 0 && records_count >= threshold { + let body_size = self.arrow_record_batch_builder.estimated_size_in_bytes(); + let estimated_body = self.estimated_compressed_size(body_size); + let current_size = self.ipc_overhead + estimated_body; + if current_size >= self.write_limit { + return true; + } + if estimated_body == 0 { + self.estimated_max_records_count.set(records_count + 1); + return false; + } + // Matching Java: subtract fixed metadata overhead from the limit, + // divide remaining body budget by per-record body cost. + let body_per_record = estimated_body as f64 / records_count as f64; + let next = ((self.write_limit.saturating_sub(self.ipc_overhead) as f64 + / body_per_record) + .ceil() as i32) + .max(records_count + 1); + self.estimated_max_records_count.set(next); + } + false + } + + /// Estimate the compressed body size using the ratio snapshot taken at batch creation. + /// Matching Java's `ArrowWriter.estimatedBytesWritten()`. 
+ fn estimated_compressed_size(&self, uncompressed_body: usize) -> usize { + if self.arrow_compression_info.compression_type == ArrowCompressionType::None { + uncompressed_body + } else { + (uncompressed_body as f64 * self.estimated_compression_ratio as f64) as usize + } + } + + pub fn is_closed(&self) -> bool { + self.is_closed + } + + pub fn close(&mut self) { + self.is_closed = true; + } + + pub fn build(&mut self) -> Result> { + // Capture uncompressed body size before serialization for compression ratio update. + let uncompressed_body_size = self.arrow_record_batch_builder.estimated_size_in_bytes(); + + // serialize arrow batch + let mut arrow_batch_bytes = vec![]; + let table_schema = self.arrow_record_batch_builder.schema(); + let compression_type = self.arrow_compression_info.get_compression_type(); + let write_option = + IpcWriteOptions::try_with_compression(IpcWriteOptions::default(), compression_type); + let mut writer = StreamWriter::try_new_with_options( + &mut arrow_batch_bytes, + &table_schema, + write_option?, + )?; + + // get header len + let header = writer.get_ref().len(); + let record_batch = self.arrow_record_batch_builder.build_arrow_record_batch()?; + writer.write(record_batch.as_ref())?; + // get real arrow batch bytes (metadata + body, potentially compressed) + let real_arrow_batch_bytes = &arrow_batch_bytes[header..]; + + // Update compression ratio estimator with actual ratio. + // The serialized bytes include metadata + compressed body. Subtract + // metadata to isolate the compressed body for an accurate ratio. 
+ if uncompressed_body_size > 0 + && self.arrow_compression_info.compression_type != ArrowCompressionType::None + { + let compressed_body_size = real_arrow_batch_bytes + .len() + .saturating_sub(self.ipc_overhead); + let actual_ratio = compressed_body_size as f32 / uncompressed_body_size as f32; + self.compression_ratio_estimator + .update_estimation(actual_ratio); + } + + // now, write batch header and arrow batch + let mut batch_bytes = vec![0u8; RECORD_BATCH_HEADER_SIZE + real_arrow_batch_bytes.len()]; + // write batch header + self.write_batch_header(&mut batch_bytes[..])?; + + // write arrow batch bytes + let mut cursor = Cursor::new(&mut batch_bytes[..]); + cursor.set_position(RECORD_BATCH_HEADER_SIZE as u64); + cursor.write_all(real_arrow_batch_bytes)?; + + let calcute_crc_bytes = &cursor.get_ref()[SCHEMA_ID_OFFSET..]; + // then update crc + let crc = crc32c(calcute_crc_bytes); + cursor.set_position(CRC_OFFSET as u64); + cursor.write_u32::(crc)?; + + Ok(batch_bytes.to_vec()) + } + + fn write_batch_header(&self, buffer: &mut [u8]) -> Result<()> { + let total_len = buffer.len(); + let mut cursor = Cursor::new(buffer); + cursor.write_i64::(self.base_log_offset)?; + cursor + .write_i32::((total_len - BASE_OFFSET_LENGTH - LENGTH_LENGTH) as i32)?; + cursor.write_u8(self.magic)?; + cursor.write_i64::(0)?; // timestamp placeholder + cursor.write_u32::(0)?; // crc placeholder + cursor.write_i16::(self.schema_id as i16)?; + + let record_count = self.arrow_record_batch_builder.records_count(); + // todo: curerntly, always is append only + let append_only = true; + cursor.write_u8(if append_only { 1 } else { 0 })?; + cursor.write_i32::(if record_count > 0 { + record_count - 1 + } else { + 0 + })?; + + cursor.write_i64::(self.writer_id)?; + cursor.write_i32::(self.batch_sequence)?; + cursor.write_i32::(record_count)?; + Ok(()) + } + + pub fn set_writer_state(&mut self, writer_id: i64, batch_base_sequence: i32) { + self.writer_id = writer_id; + self.batch_sequence = 
batch_base_sequence; + } + + /// Get an estimate of the number of bytes written to the underlying buffer. + /// Includes Fluss record batch header + Arrow IPC metadata + estimated + /// compressed body size. + pub fn estimated_size_in_bytes(&self) -> usize { + let body = self.arrow_record_batch_builder.estimated_size_in_bytes(); + let estimated_body = self.estimated_compressed_size(body); + RECORD_BATCH_HEADER_SIZE + self.ipc_overhead + estimated_body + } +} + +/// Estimate the Arrow IPC overhead (metadata + body framing) for a given schema. +/// +/// Serializes a 1-row RecordBatch with known data sizes, then subtracts the +/// raw data contribution to isolate the fixed overhead: IPC message header, +/// RecordBatch flatbuffer, and per-buffer alignment padding within the body. +/// This overhead is constant for a given schema+compression combination. +/// +/// Note: called once per batch creation. With writer pooling (see TODO above), +/// this would be computed once per pooled writer and reused across batches. +/// Analogous to Java's `ArrowUtils.estimateArrowMetadataLength()`. +fn estimate_arrow_ipc_overhead( + schema: &SchemaRef, + compression: Option, +) -> Result { + use arrow::array::new_null_array; + + // Create a 1-row batch of nulls. Null arrays have minimal, predictable + // data: no validity bitmap, no variable-length data, just fixed-width + // zero buffers. This lets us compute raw data size exactly. + let null_arrays: Vec = schema + .fields() + .iter() + .map(|field| new_null_array(field.data_type(), 1)) + .collect(); + let batch = RecordBatch::try_new(schema.clone(), null_arrays)?; + + // Sum the raw buffer sizes — this is what buffer_size() would report. 
+ let raw_data: usize = batch + .columns() + .iter() + .map(|col| { + col.to_data() + .buffers() + .iter() + .map(|buf| round_up_to_8(buf.len())) + .sum::() + // Validity buffer (null bitmap) + + col + .nulls() + .map_or(0, |n| round_up_to_8(n.buffer().len())) + }) + .sum(); + + // Serialize the batch via IPC and measure total output. + let mut buf = vec![]; + let write_option = + IpcWriteOptions::try_with_compression(IpcWriteOptions::default(), compression); + let mut writer = StreamWriter::try_new_with_options(&mut buf, schema, write_option?)?; + let header_len = writer.get_ref().len(); + writer.write(&batch)?; + let total_len = writer.get_ref().len(); + + // IPC overhead = total message size - raw data we put in. + let ipc_message_len = total_len - header_len; + Ok(ipc_message_len.saturating_sub(raw_data)) +} + +pub trait ToArrow { + fn append_to(&self, builder: &mut dyn ArrayBuilder) -> Result<()>; +} + +/// In-memory log record source. +/// Used for local tablet server fetches (existing path). 
+struct MemorySource { + data: Bytes, +} + +impl MemorySource { + fn new(data: Vec) -> Self { + Self { + data: Bytes::from(data), + } + } + + fn read_batch_header(&mut self, pos: usize) -> Result<(i64, usize)> { + if pos + LOG_OVERHEAD > self.data.len() { + return Err(Error::UnexpectedError { + message: format!( + "Position {} + LOG_OVERHEAD {} exceeds data size {}", + pos, + LOG_OVERHEAD, + self.data.len() + ), + source: None, + }); + } + + let base_offset = LittleEndian::read_i64(&self.data[pos + BASE_OFFSET_OFFSET..]); + let batch_size_bytes = LittleEndian::read_i32(&self.data[pos + LENGTH_OFFSET..]); + + // Validate batch size to prevent integer overflow and corruption + let batch_size = validate_batch_size(batch_size_bytes)?; + + Ok((base_offset, batch_size)) + } + + fn read_batch_data(&mut self, pos: usize, size: usize) -> Result { + if pos + size > self.data.len() { + return Err(Error::UnexpectedError { + message: format!( + "Read beyond data size: {} + {} > {}", + pos, + size, + self.data.len() + ), + source: None, + }); + } + // Zero-copy slice (Bytes is Arc-based) + Ok(self.data.slice(pos..pos + size)) + } + + fn total_size(&self) -> usize { + self.data.len() + } +} + +/// RAII guard that deletes a file when dropped. +/// Used to ensure file deletion happens AFTER the file handle is closed. +struct FileCleanupGuard { + file_path: PathBuf, +} + +impl Drop for FileCleanupGuard { + fn drop(&mut self) { + // File handle is already closed (this guard drops after the file field) + if let Err(e) = std::fs::remove_file(&self.file_path) { + log::warn!( + "Failed to delete remote log file {}: {}", + self.file_path.display(), + e + ); + } else { + log::debug!("Deleted remote log file: {}", self.file_path.display()); + } + } +} + +/// File-backed log record source. +/// Used for remote log segments downloaded to local disk. +/// Streams data on-demand instead of loading entire file into memory. +/// +/// Uses seek + read_exact for cross-platform compatibility. 
+/// Access pattern is sequential iteration (single consumer). +struct FileSource { + file: File, + file_size: usize, + base_offset: usize, + _cleanup: Option, // Drops AFTER file (field order matters!) +} + +impl FileSource { + /// Create a new FileSource. + /// + /// The file at `file_path` will be deleted when this FileSource is dropped. + fn new(file: File, base_offset: usize, file_path: PathBuf) -> Result { + let file_size = file.metadata()?.len() as usize; + + // Validate base_offset to prevent underflow in total_size() + if base_offset > file_size { + return Err(Error::UnexpectedError { + message: format!("base_offset ({base_offset}) exceeds file_size ({file_size})"), + source: None, + }); + } + + Ok(Self { + file, + file_size, + base_offset, + _cleanup: Some(FileCleanupGuard { file_path }), + }) + } + + /// Read data at a specific position using seek + read_exact. + /// This is cross-platform and adequate for sequential access patterns. + fn read_at(&mut self, pos: u64, buf: &mut [u8]) -> Result<()> { + self.file.seek(SeekFrom::Start(pos))?; + self.file.read_exact(buf)?; + Ok(()) + } + + fn read_batch_header(&mut self, pos: usize) -> Result<(i64, usize)> { + let actual_pos = self.base_offset + pos; + if actual_pos + LOG_OVERHEAD > self.file_size { + return Err(Error::UnexpectedError { + message: format!( + "Position {} exceeds file size {}", + actual_pos, self.file_size + ), + source: None, + }); + } + + // Read only the header to extract base_offset and batch_size + let mut header_buf = vec![0u8; LOG_OVERHEAD]; + self.read_at(actual_pos as u64, &mut header_buf)?; + + let base_offset = LittleEndian::read_i64(&header_buf[BASE_OFFSET_OFFSET..]); + let batch_size_bytes = LittleEndian::read_i32(&header_buf[LENGTH_OFFSET..]); + + // Validate batch size to prevent integer overflow and corruption + let batch_size = validate_batch_size(batch_size_bytes)?; + + Ok((base_offset, batch_size)) + } + + fn read_batch_data(&mut self, pos: usize, size: usize) -> Result { + 
let actual_pos = self.base_offset + pos; + if actual_pos + size > self.file_size { + return Err(Error::UnexpectedError { + message: format!( + "Read beyond file size: {} + {} > {}", + actual_pos, size, self.file_size + ), + source: None, + }); + } + + // Read the full batch data + let mut batch_buf = vec![0u8; size]; + self.read_at(actual_pos as u64, &mut batch_buf)?; + + Ok(Bytes::from(batch_buf)) + } + + fn total_size(&self) -> usize { + self.file_size - self.base_offset + } +} + +/// Enum for different log record sources. +enum LogRecordsSource { + Memory(MemorySource), + File(FileSource), +} + +impl LogRecordsSource { + fn read_batch_header(&mut self, pos: usize) -> Result<(i64, usize)> { + match self { + Self::Memory(s) => s.read_batch_header(pos), + Self::File(s) => s.read_batch_header(pos), + } + } + + fn read_batch_data(&mut self, pos: usize, size: usize) -> Result { + match self { + Self::Memory(s) => s.read_batch_data(pos, size), + Self::File(s) => s.read_batch_data(pos, size), + } + } + + fn total_size(&self) -> usize { + match self { + Self::Memory(s) => s.total_size(), + Self::File(s) => s.total_size(), + } + } +} + +pub struct LogRecordsBatches { + source: LogRecordsSource, + current_pos: usize, + remaining_bytes: usize, +} + +impl LogRecordsBatches { + /// Create from in-memory Vec (existing path - backward compatible). + pub fn new(data: Vec) -> Self { + let source = LogRecordsSource::Memory(MemorySource::new(data)); + let remaining_bytes = source.total_size(); + Self { + source, + current_pos: 0, + remaining_bytes, + } + } + + /// Create from file. + /// Enables streaming without loading entire file into memory. + /// + /// The file at `file_path` will be deleted when dropped. + /// This ensures the file is closed before deletion. 
+    pub fn from_file(file: File, base_offset: usize, file_path: PathBuf) -> Result<Self> {
+        let source = FileSource::new(file, base_offset, file_path)?;
+        let remaining_bytes = source.total_size();
+        Ok(Self {
+            source: LogRecordsSource::File(source),
+            current_pos: 0,
+            remaining_bytes,
+        })
+    }
+
+    /// Try to get the size of the next batch.
+    ///
+    /// Returns `Ok(None)` when fewer than `LOG_OVERHEAD` bytes remain, or when the
+    /// header announces a batch larger than the remaining bytes.
+    fn next_batch_size(&mut self) -> Result<Option<usize>> {
+        if self.remaining_bytes < LOG_OVERHEAD {
+            return Ok(None);
+        }
+
+        // Read only header to get size
+        let (_base_offset, batch_size) = self.source.read_batch_header(self.current_pos)?;
+        if batch_size > self.remaining_bytes {
+            Ok(None)
+        } else {
+            Ok(Some(batch_size))
+        }
+    }
+}
+
+impl Iterator for LogRecordsBatches {
+    type Item = Result<LogRecordBatch>;
+
+    /// Yields the next batch, reading its bytes on demand.
+    ///
+    /// After the first error the iterator is exhausted and returns `None`. The position
+    /// does not advance on failure, so without this a consumer that keeps polling
+    /// (`for`, `filter_map(Result::ok)`, ...) would receive errors for the same position
+    /// without end.
+    fn next(&mut self) -> Option<Self::Item> {
+        let batch_size = match self.next_batch_size() {
+            Ok(Some(size)) => size,
+            Ok(None) => return None,
+            Err(e) => {
+                // With zero bytes remaining, next_batch_size reports end of data.
+                self.remaining_bytes = 0;
+                return Some(Err(e));
+            }
+        };
+
+        // Read full batch data on-demand
+        match self.source.read_batch_data(self.current_pos, batch_size) {
+            Ok(data) => {
+                let record_batch = LogRecordBatch::new(data);
+                self.current_pos += batch_size;
+                self.remaining_bytes -= batch_size;
+                Some(Ok(record_batch))
+            }
+            Err(e) => {
+                self.remaining_bytes = 0;
+                Some(Err(e))
+            }
+        }
+    }
+}
+
+/// A single log record batch backed by its raw bytes.
+///
+/// The accessors below decode little-endian fields at fixed offsets and index the
+/// buffer directly, so they panic when the buffer is shorter than the field they read.
+pub struct LogRecordBatch {
+    data: Bytes,
+}
+
+#[allow(dead_code)]
+impl LogRecordBatch {
+    /// Wraps `data` without validating it.
+    pub fn new(data: Bytes) -> Self {
+        LogRecordBatch { data }
+    }
+
+    /// Byte stored at `MAGIC_OFFSET`.
+    pub fn magic(&self) -> u8 {
+        self.data[MAGIC_OFFSET]
+    }
+
+    /// i64 stored at `COMMIT_TIMESTAMP_OFFSET`.
+    pub fn commit_timestamp(&self) -> i64 {
+        let offset = COMMIT_TIMESTAMP_OFFSET;
+        LittleEndian::read_i64(&self.data[offset..offset + COMMIT_TIMESTAMP_LENGTH])
+    }
+
+    /// i64 stored at `WRITE_CLIENT_ID_OFFSET`.
+    pub fn writer_id(&self) -> i64 {
+        let offset = WRITE_CLIENT_ID_OFFSET;
+        LittleEndian::read_i64(&self.data[offset..offset + WRITE_CLIENT_ID_LENGTH])
+    }
+
+    /// i32 stored at `BATCH_SEQUENCE_OFFSET`.
+    pub fn batch_sequence(&self) -> i32 {
+        let offset = BATCH_SEQUENCE_OFFSET;
+        LittleEndian::read_i32(&self.data[offset..offset + BATCH_SEQUENCE_LENGTH])
+    }
+
+    /// Currently always succeeds; see the TODO.
+    pub fn ensure_valid(&self) -> Result<()> {
+        // TODO enable validation once checksum handling is corrected.
+        Ok(())
+    }
+
+    /// True when the encoded size is at least `RECORD_BATCH_HEADER_SIZE` and the stored
+    /// checksum equals the one computed over the data.
+    pub fn is_valid(&self) -> bool {
+        self.size_in_bytes() >= RECORD_BATCH_HEADER_SIZE
+            && self.checksum() == self.compute_checksum()
+    }
+
+    /// CRC32C over everything from `SCHEMA_ID_OFFSET` to the end of the buffer.
+    fn compute_checksum(&self) -> u32 {
+        let start = SCHEMA_ID_OFFSET;
+        crc32c(&self.data[start..])
+    }
+
+    /// Byte stored at `ATTRIBUTES_OFFSET`.
+    fn attributes(&self) -> u8 {
+        self.data[ATTRIBUTES_OFFSET]
+    }
+
+    /// Offset following the last record of this batch.
+    pub fn next_log_offset(&self) -> i64 {
+        self.last_log_offset() + 1
+    }
+
+    /// u32 stored at `CRC_OFFSET`.
+    pub fn checksum(&self) -> u32 {
+        let offset = CRC_OFFSET;
+        LittleEndian::read_u32(&self.data[offset..offset + CRC_LENGTH])
+    }
+
+    /// i16 stored at `SCHEMA_ID_OFFSET`.
+    pub fn schema_id(&self) -> i16 {
+        let offset = SCHEMA_ID_OFFSET;
+        LittleEndian::read_i16(&self.data[offset..offset + SCHEMA_ID_LENGTH])
+    }
+
+    /// i64 stored at `BASE_OFFSET_OFFSET`.
+    pub fn base_log_offset(&self) -> i64 {
+        let offset = BASE_OFFSET_OFFSET;
+        LittleEndian::read_i64(&self.data[offset..offset + BASE_OFFSET_LENGTH])
+    }
+
+    /// Base offset plus the last-offset delta.
+    pub fn last_log_offset(&self) -> i64 {
+        self.base_log_offset() + self.last_offset_delta() as i64
+    }
+
+    /// i32 stored at `LAST_OFFSET_DELTA_OFFSET`.
+    fn last_offset_delta(&self) -> i32 {
+        let offset = LAST_OFFSET_DELTA_OFFSET;
+        LittleEndian::read_i32(&self.data[offset..offset + LAST_OFFSET_DELTA_LENGTH])
+    }
+
+    /// The i32 length field at `LENGTH_OFFSET` plus `LOG_OVERHEAD`.
+    pub fn size_in_bytes(&self) -> usize {
+        let offset = LENGTH_OFFSET;
+        LittleEndian::read_i32(&self.data[offset..offset + LENGTH_LENGTH]) as usize + LOG_OVERHEAD
+    }
+
+    /// i32 stored at `RECORDS_COUNT_OFFSET`.
+    pub fn record_count(&self) -> i32 {
+        let offset = RECORDS_COUNT_OFFSET;
+        LittleEndian::read_i32(&self.data[offset..offset + RECORDS_COUNT_LENGTH])
+    }
+
+    /// Returns the bytes from `RECORDS_OFFSET` to the end of the batch.
+    ///
+    /// Shared by `records`, `records_for_remote_log` and `record_batch` so that all three
+    /// report a too-short buffer as an error instead of panicking on the slice.
+    fn records_payload(&self) -> Result<&[u8]> {
+        self.data
+            .get(RECORDS_OFFSET..)
+            .ok_or_else(|| Error::UnexpectedError {
+                message: format!(
+                    "Corrupt log record batch: data length {} is less than RECORDS_OFFSET {}",
+                    self.data.len(),
+                    RECORDS_OFFSET
+                ),
+                source: None,
+            })
+    }
+
+    /// Decodes the batch and returns a row-by-row iterator over it.
+    ///
+    /// An empty iterator is returned when the record count is zero. Every record is
+    /// tagged `ChangeType::AppendOnly` and carries this batch's base offset and commit
+    /// timestamp.
+    pub fn records(&self, read_context: &ReadContext) -> Result<LogRecordIterator> {
+        if self.record_count() == 0 {
+            return Ok(LogRecordIterator::empty());
+        }
+
+        let data = self.records_payload()?;
+
+        let record_batch = read_context.record_batch(data)?;
+        let arrow_reader = ArrowReader::new_with_fluss_row_type(
+            Arc::new(record_batch),
+            read_context.row_type.clone(),
+            read_context.fluss_row_type().cloned(),
+        );
+        let log_record_iterator = LogRecordIterator::Arrow(ArrowLogRecordIterator {
+            reader: arrow_reader,
+            base_offset: self.base_log_offset(),
+            timestamp: self.commit_timestamp(),
+            row_id: 0,
+            change_type: ChangeType::AppendOnly,
+        });
+
+        Ok(log_record_iterator)
+    }
+
+    /// Same as `records`, but decodes through `ReadContext::record_batch_for_remote_log`;
+    /// when that returns `None` the iterator is empty.
+    pub fn records_for_remote_log(&self, read_context: &ReadContext) -> Result<LogRecordIterator> {
+        if self.record_count() == 0 {
+            return Ok(LogRecordIterator::empty());
+        }
+
+        let data = self.records_payload()?;
+
+        let record_batch = read_context.record_batch_for_remote_log(data)?;
+        let log_record_iterator = match record_batch {
+            None => LogRecordIterator::empty(),
+            Some(record_batch) => {
+                let arrow_reader = ArrowReader::new_with_fluss_row_type(
+                    Arc::new(record_batch),
+                    read_context.row_type.clone(),
+                    read_context.fluss_row_type().cloned(),
+                );
+                LogRecordIterator::Arrow(ArrowLogRecordIterator {
+                    reader: arrow_reader,
+                    base_offset: self.base_log_offset(),
+                    timestamp: self.commit_timestamp(),
+                    row_id: 0,
+                    change_type: ChangeType::AppendOnly,
+                })
+            }
+        };
+        Ok(log_record_iterator)
+    }
+
+    /// Returns the record batch directly without creating an iterator.
+    /// This is more efficient when you need the entire batch rather than
+    /// iterating row-by-row.
+    pub fn record_batch(&self, read_context: &ReadContext) -> Result<RecordBatch> {
+        if self.record_count() == 0 {
+            // Return empty batch with correct schema
+            return Ok(RecordBatch::new_empty(read_context.target_schema.clone()));
+        }
+
+        let data = self.records_payload()?;
+        read_context.record_batch(data)
+    }
+}
+
+/// Parse an Arrow IPC message from a byte slice.
+///
+/// Server returns RecordBatch message (without Schema message) in the encapsulated message format.
+/// Format: [continuation: 4 bytes (0xFFFFFFFF)][metadata_size: 4 bytes][RecordBatch metadata][body] +/// +/// This format is documented at: +/// https://arrow.apache.org/docs/format/Columnar.html#encapsulated-message-format +/// +/// # Arguments +/// * `data` - The byte slice containing the IPC message. +/// +/// # Returns +/// Returns `Ok((batch_metadata, body_buffer, version))` on success: +/// - `batch_metadata`: The RecordBatch metadata from the IPC message. +/// - `body_buffer`: The buffer containing the record batch body data. +/// - `version`: The Arrow IPC metadata version. +/// +/// Returns `Err(arrow_error)` on errors +/// - `arrow_error`: Error details e.g. malformed, too short or bad continuation marker. +fn parse_ipc_message( + data: &[u8], +) -> Result<( + arrow::ipc::RecordBatch<'_>, + Buffer, + arrow::ipc::MetadataVersion, +)> { + const CONTINUATION_MARKER: u32 = 0xFFFFFFFF; + + if data.len() < 8 { + Err(ParseError(format!("Invalid data length: {}", data.len())))? + } + + let continuation = LittleEndian::read_u32(&data[0..4]); + let metadata_size = LittleEndian::read_u32(&data[4..8]) as usize; + + if continuation != CONTINUATION_MARKER { + Err(ParseError(format!( + "Invalid continuation marker: {continuation}" + )))? + } + + if data.len() < 8 + metadata_size { + Err(ParseError(format!( + "Invalid data length. Remaining data length {} is shorter than specified size {}", + data.len() - 8, + metadata_size + )))? 
+ } + + let metadata_bytes = &data[8..8 + metadata_size]; + let message = root_as_message(metadata_bytes).map_err(|err| ParseError(err.to_string()))?; + let batch_metadata = message + .header_as_record_batch() + .ok_or(ParseError(String::from("Not a record batch")))?; + + let metadata_padded_size = (metadata_size + 7) & !7; + let body_start = 8 + metadata_padded_size; + let body_data = &data[body_start..]; + let body_buffer = Buffer::from(body_data); + + Ok((batch_metadata, body_buffer, message.version())) +} + +pub fn to_arrow_schema(fluss_schema: &RowType) -> Result { + let fields: Result> = fluss_schema + .fields() + .iter() + .map(|f| { + Ok(Field::new( + f.name(), + to_arrow_type(f.data_type())?, + f.data_type().is_nullable(), + )) + }) + .collect(); + + Ok(SchemaRef::new(arrow_schema::Schema::new(fields?))) +} + +pub fn to_arrow_type(fluss_type: &DataType) -> Result { + Ok(match fluss_type { + DataType::Boolean(_) => ArrowDataType::Boolean, + DataType::TinyInt(_) => ArrowDataType::Int8, + DataType::SmallInt(_) => ArrowDataType::Int16, + DataType::BigInt(_) => ArrowDataType::Int64, + DataType::Int(_) => ArrowDataType::Int32, + DataType::Float(_) => ArrowDataType::Float32, + DataType::Double(_) => ArrowDataType::Float64, + DataType::Char(_) => ArrowDataType::Utf8, + DataType::String(_) => ArrowDataType::Utf8, + DataType::Decimal(decimal_type) => { + let precision = + decimal_type + .precision() + .try_into() + .map_err(|_| Error::IllegalArgument { + message: format!( + "Decimal precision {} exceeds Arrow's maximum (u8::MAX)", + decimal_type.precision() + ), + })?; + let scale = decimal_type + .scale() + .try_into() + .map_err(|_| Error::IllegalArgument { + message: format!( + "Decimal scale {} exceeds Arrow's maximum (i8::MAX)", + decimal_type.scale() + ), + })?; + ArrowDataType::Decimal128(precision, scale) + } + DataType::Date(_) => ArrowDataType::Date32, + DataType::Time(time_type) => match time_type.precision() { + 0 => 
ArrowDataType::Time32(arrow_schema::TimeUnit::Second), + 1..=3 => ArrowDataType::Time32(arrow_schema::TimeUnit::Millisecond), + 4..=6 => ArrowDataType::Time64(arrow_schema::TimeUnit::Microsecond), + 7..=9 => ArrowDataType::Time64(arrow_schema::TimeUnit::Nanosecond), + invalid => { + return Err(Error::IllegalArgument { + message: format!("Invalid precision {invalid} for TimeType (must be 0-9)"), + }); + } + }, + DataType::Timestamp(timestamp_type) => match timestamp_type.precision() { + 0 => ArrowDataType::Timestamp(arrow_schema::TimeUnit::Second, None), + 1..=3 => ArrowDataType::Timestamp(arrow_schema::TimeUnit::Millisecond, None), + 4..=6 => ArrowDataType::Timestamp(arrow_schema::TimeUnit::Microsecond, None), + 7..=9 => ArrowDataType::Timestamp(arrow_schema::TimeUnit::Nanosecond, None), + invalid => { + return Err(Error::IllegalArgument { + message: format!("Invalid precision {invalid} for TimestampType (must be 0-9)"), + }); + } + }, + DataType::TimestampLTz(timestamp_ltz_type) => match timestamp_ltz_type.precision() { + 0 => ArrowDataType::Timestamp(arrow_schema::TimeUnit::Second, None), + 1..=3 => ArrowDataType::Timestamp(arrow_schema::TimeUnit::Millisecond, None), + 4..=6 => ArrowDataType::Timestamp(arrow_schema::TimeUnit::Microsecond, None), + 7..=9 => ArrowDataType::Timestamp(arrow_schema::TimeUnit::Nanosecond, None), + invalid => { + return Err(Error::IllegalArgument { + message: format!( + "Invalid precision {invalid} for TimestampLTzType (must be 0-9)" + ), + }); + } + }, + DataType::Bytes(_) => ArrowDataType::Binary, + DataType::Binary(binary_type) => { + let length = binary_type + .length() + .try_into() + .map_err(|_| Error::IllegalArgument { + message: format!( + "Binary length {} exceeds Arrow's maximum (i32::MAX)", + binary_type.length() + ), + })?; + ArrowDataType::FixedSizeBinary(length) + } + DataType::Array(array_type) => ArrowDataType::List( + Field::new_list_field( + to_arrow_type(array_type.get_element_type())?, + 
array_type.get_element_type().is_nullable(), + ) + .into(), + ), + DataType::Map(map_type) => { + let key_type = to_arrow_type(map_type.key_type())?; + let value_type = to_arrow_type(map_type.value_type())?; + let entry_fields = vec![ + Field::new("key", key_type, map_type.key_type().is_nullable()), + Field::new("value", value_type, map_type.value_type().is_nullable()), + ]; + ArrowDataType::Map( + Arc::new(Field::new( + "entries", + ArrowDataType::Struct(arrow_schema::Fields::from(entry_fields)), + false, + )), + false, + ) + } + DataType::Row(row_type) => { + let fields: Result> = row_type + .fields() + .iter() + .map(|f| { + Ok(Field::new( + f.name(), + to_arrow_type(f.data_type())?, + f.data_type().is_nullable(), + )) + }) + .collect(); + ArrowDataType::Struct(arrow_schema::Fields::from(fields?)) + } + }) +} + +/// Like `from_arrow_type`, but also reads the Field's nullability — +/// Arrow stores it on the Field wrapper, not the leaf data type. +pub(crate) fn from_arrow_field(field: &arrow_schema::Field) -> Result { + let mut dt = from_arrow_type(field.data_type())?; + if !field.is_nullable() { + dt = dt.as_non_nullable(); + } + Ok(dt) +} + +/// Converts an Arrow data type back to a Fluss `DataType`. +/// Used for reading array elements from Arrow ListArray back into Fluss types. 
+pub(crate) fn from_arrow_type(arrow_type: &ArrowDataType) -> Result { + use crate::metadata::DataTypes; + + Ok(match arrow_type { + ArrowDataType::Boolean => DataTypes::boolean(), + ArrowDataType::Int8 => DataTypes::tinyint(), + ArrowDataType::Int16 => DataTypes::smallint(), + ArrowDataType::Int32 => DataTypes::int(), + ArrowDataType::Int64 => DataTypes::bigint(), + ArrowDataType::Float32 => DataTypes::float(), + ArrowDataType::Float64 => DataTypes::double(), + ArrowDataType::Utf8 => DataTypes::string(), + ArrowDataType::Binary => DataTypes::bytes(), + ArrowDataType::Date32 => DataTypes::date(), + ArrowDataType::FixedSizeBinary(len) => { + if *len < 0 { + return Err(Error::IllegalArgument { + message: format!("FixedSizeBinary length must be >= 0, got {len}"), + }); + } + DataTypes::binary(*len as usize) + } + ArrowDataType::Decimal128(p, s) => { + if *s < 0 { + return Err(Error::IllegalArgument { + message: format!("Decimal scale must be >= 0, got {s}"), + }); + } + DataTypes::decimal(*p as u32, *s as u32) + } + ArrowDataType::Time32(arrow_schema::TimeUnit::Second) => DataTypes::time_with_precision(0), + ArrowDataType::Time32(arrow_schema::TimeUnit::Millisecond) => { + DataTypes::time_with_precision(3) + } + ArrowDataType::Time64(arrow_schema::TimeUnit::Microsecond) => { + DataTypes::time_with_precision(6) + } + ArrowDataType::Time64(arrow_schema::TimeUnit::Nanosecond) => { + DataTypes::time_with_precision(9) + } + ArrowDataType::Timestamp(unit, tz) => { + let precision = match unit { + arrow_schema::TimeUnit::Second => 0, + arrow_schema::TimeUnit::Millisecond => 3, + arrow_schema::TimeUnit::Microsecond => 6, + arrow_schema::TimeUnit::Nanosecond => 9, + }; + + if tz.is_some() { + DataTypes::timestamp_ltz_with_precision(precision) + } else { + DataTypes::timestamp_with_precision(precision) + } + } + ArrowDataType::List(field) => DataTypes::array(from_arrow_field(field)?), + ArrowDataType::Map(entries_field, _sorted) => { + let fields = match 
entries_field.data_type() { + ArrowDataType::Struct(f) => f, + other => { + return Err(Error::IllegalArgument { + message: format!("Map entries must be Struct, got {other:?}"), + }); + } + }; + if fields.len() != 2 { + return Err(Error::IllegalArgument { + message: format!( + "Map entries Struct must have 2 fields (key, value), got {}", + fields.len() + ), + }); + } + DataTypes::map(from_arrow_field(&fields[0])?, from_arrow_field(&fields[1])?) + } + ArrowDataType::Struct(fields) => { + let row_fields: Result> = fields + .iter() + .map(|f| Ok(DataField::new(f.name(), from_arrow_field(f)?, None))) + .collect(); + DataTypes::row(row_fields?) + } + other => { + return Err(Error::IllegalArgument { + message: format!("Cannot convert Arrow type to Fluss type: {other:?}"), + }); + } + }) +} + +#[derive(Clone)] +pub struct ReadContext { + target_schema: SchemaRef, + full_schema: SchemaRef, + row_type: Arc, + projection: Option, + is_from_remote: bool, + fluss_row_type: Option>, +} + +#[derive(Clone)] +struct Projection { + ordered_schema: SchemaRef, + projected_fields: Vec, + ordered_fields: Vec, + + reordering_indexes: Vec, + reordering_needed: bool, +} + +impl ReadContext { + pub fn new( + arrow_schema: SchemaRef, + row_type: Arc, + is_from_remote: bool, + ) -> ReadContext { + ReadContext { + target_schema: arrow_schema.clone(), + full_schema: arrow_schema, + row_type, + projection: None, + is_from_remote, + fluss_row_type: None, + } + } + + pub fn with_fluss_row_type(mut self, fluss_row_type: Arc) -> ReadContext { + self.fluss_row_type = Some(fluss_row_type); + self + } + + pub fn fluss_row_type(&self) -> Option<&Arc> { + self.fluss_row_type.as_ref() + } + + pub fn with_projection_pushdown( + arrow_schema: SchemaRef, + row_type: Arc, + projected_fields: Vec, + is_from_remote: bool, + ) -> Result { + Self::validate_projection(&arrow_schema, projected_fields.as_slice())?; + let target_schema = + Self::project_schema(arrow_schema.clone(), projected_fields.as_slice())?; + // 
the logic is little bit of hard to understand, to refactor it to follow + // java side + let (need_do_reorder, sorted_fields) = { + // currently, for remote read, arrow log doesn't support projection pushdown, + // so, only need to do reordering when is not from remote + if !is_from_remote { + let mut sorted_fields = projected_fields.clone(); + sorted_fields.sort_unstable(); + (!sorted_fields.eq(&projected_fields), sorted_fields) + } else { + // sorted_fields won't be used when need_do_reorder is false, + // let's use an empty vec directly + (false, vec![]) + } + }; + + let project = { + if need_do_reorder { + // reordering is required + // Calculate reordering indexes to transform from sorted order to user-requested order + let mut reordering_indexes = Vec::with_capacity(projected_fields.len()); + for &original_idx in &projected_fields { + let pos = sorted_fields.binary_search(&original_idx).map_err(|_| { + IllegalArgument { + message: format!( + "Projection index {original_idx} is invalid for the current schema." 
+ ), + } + })?; + reordering_indexes.push(pos); + } + Projection { + ordered_schema: Self::project_schema( + arrow_schema.clone(), + sorted_fields.as_slice(), + )?, + projected_fields, + ordered_fields: sorted_fields, + reordering_indexes, + reordering_needed: true, + } + } else { + Projection { + ordered_schema: Self::project_schema( + arrow_schema.clone(), + projected_fields.as_slice(), + )?, + ordered_fields: projected_fields.clone(), + projected_fields, + reordering_indexes: vec![], + reordering_needed: false, + } + } + }; + + Ok(ReadContext { + target_schema, + full_schema: arrow_schema, + row_type, + projection: Some(project), + is_from_remote, + fluss_row_type: None, + }) + } + + fn validate_projection(schema: &SchemaRef, projected_fields: &[usize]) -> Result<()> { + let field_count = schema.fields().len(); + for &index in projected_fields { + if index >= field_count { + return Err(IllegalArgument { + message: format!( + "Projection index {index} is out of bounds for schema with {field_count} fields." 
+ ), + }); + } + } + Ok(()) + } + + pub fn project_schema(schema: SchemaRef, projected_fields: &[usize]) -> Result { + Ok(SchemaRef::new(schema.project(projected_fields).map_err( + |e| IllegalArgument { + message: format!("Invalid projection: {e}"), + }, + )?)) + } + + pub fn project_fields(&self) -> Option<&[usize]> { + self.projection + .as_ref() + .map(|p| p.projected_fields.as_slice()) + } + + pub fn project_fields_in_order(&self) -> Option<&[usize]> { + self.projection + .as_ref() + .map(|p| p.ordered_fields.as_slice()) + } + + pub fn record_batch(&self, data: &[u8]) -> Result { + let (batch_metadata, body_buffer, version) = parse_ipc_message(data)?; + + let resolve_schema = { + // if from remote, no projection, need to use full schema + if self.is_from_remote { + self.full_schema.clone() + } else { + // the record batch from server must be ordered by field pos, + // according to project to decide what arrow schema to use + // to parse the record batch + match self.projection { + Some(ref projection) => { + // projection, should use ordered schema by project field pos + projection.ordered_schema.clone() + } + None => { + // no projection, use target output schema + self.target_schema.clone() + } + } + } + }; + + let record_batch = read_record_batch( + &body_buffer, + batch_metadata, + resolve_schema, + &HashMap::new(), + None, + &version, + )?; + + let record_batch = match &self.projection { + Some(projection) => { + let reordered_columns = { + // need to do reorder + if self.is_from_remote { + Some(&projection.projected_fields) + } else if projection.reordering_needed { + Some(&projection.reordering_indexes) + } else { + None + } + }; + match reordered_columns { + Some(reordered_columns) => { + let arrow_columns = reordered_columns + .iter() + .map(|&idx| record_batch.column(idx).clone()) + .collect(); + RecordBatch::try_new(self.target_schema.clone(), arrow_columns)? 
+ } + _ => record_batch, + } + } + _ => record_batch, + }; + Ok(record_batch) + } + + pub fn record_batch_for_remote_log(&self, data: &[u8]) -> Result> { + let (batch_metadata, body_buffer, version) = parse_ipc_message(data)?; + + let record_batch = read_record_batch( + &body_buffer, + batch_metadata, + self.full_schema.clone(), + &HashMap::new(), + None, + &version, + )?; + + let record_batch = match &self.projection { + Some(projection) => { + let projected_columns: Vec<_> = projection + .projected_fields + .iter() + .map(|&idx| record_batch.column(idx).clone()) + .collect(); + RecordBatch::try_new(self.target_schema.clone(), projected_columns)? + } + None => record_batch, + }; + Ok(Some(record_batch)) + } +} + +pub enum LogRecordIterator { + Empty, + Arrow(ArrowLogRecordIterator), +} + +impl LogRecordIterator { + pub fn empty() -> Self { + LogRecordIterator::Empty + } +} + +impl Iterator for LogRecordIterator { + type Item = ScanRecord; + + fn next(&mut self) -> Option { + match self { + LogRecordIterator::Empty => None, + LogRecordIterator::Arrow(iter) => iter.next(), + } + } +} + +pub struct ArrowLogRecordIterator { + reader: ArrowReader, + base_offset: i64, + timestamp: i64, + row_id: usize, + change_type: ChangeType, +} + +#[allow(dead_code)] +impl ArrowLogRecordIterator { + fn new(reader: ArrowReader, base_offset: i64, timestamp: i64, change_type: ChangeType) -> Self { + Self { + reader, + base_offset, + timestamp, + row_id: 0, + change_type, + } + } +} + +impl Iterator for ArrowLogRecordIterator { + type Item = ScanRecord; + + fn next(&mut self) -> Option { + if self.row_id >= self.reader.row_count() { + return None; + } + + let columnar_row = self.reader.read(self.row_id); + let scan_record = ScanRecord::new( + columnar_row, + self.base_offset + self.row_id as i64, + self.timestamp, + self.change_type, + ); + self.row_id += 1; + Some(scan_record) + } +} + +pub struct ArrowReader { + record_batch: Arc, + row_type: Arc, + fluss_row_type: Option>, + 
row_column_indices: Arc<[usize]>, +} + +impl ArrowReader { + pub fn new(record_batch: Arc, row_type: Arc) -> Self { + let row_column_indices = arrow_row_column_indices(&record_batch); + ArrowReader { + record_batch, + row_type, + fluss_row_type: None, + row_column_indices, + } + } + + pub fn new_with_fluss_row_type( + record_batch: Arc, + row_type: Arc, + fluss_row_type: Option>, + ) -> Self { + let row_column_indices = match &fluss_row_type { + Some(rt) => fluss_row_column_indices(rt), + None => arrow_row_column_indices(&record_batch), + }; + ArrowReader { + record_batch, + row_type, + fluss_row_type, + row_column_indices, + } + } + + pub fn row_count(&self) -> usize { + self.record_batch.num_rows() + } + + pub fn read(&self, row_id: usize) -> ColumnarRow { + ColumnarRow::with_indices( + self.record_batch.clone(), + self.row_type.clone(), + row_id, + self.fluss_row_type.clone(), + self.row_column_indices.clone(), + ) + } +} +pub struct MyVec(pub StreamReader); + +#[cfg(test)] +mod tests { + use super::*; + use crate::metadata::{DataField, DataTypes, RowType}; + use crate::test_utils::build_table_info; + + #[test] + fn test_to_array_type() { + assert_eq!( + to_arrow_type(&DataTypes::boolean()).unwrap(), + ArrowDataType::Boolean + ); + assert_eq!( + to_arrow_type(&DataTypes::tinyint()).unwrap(), + ArrowDataType::Int8 + ); + assert_eq!( + to_arrow_type(&DataTypes::smallint()).unwrap(), + ArrowDataType::Int16 + ); + assert_eq!( + to_arrow_type(&DataTypes::bigint()).unwrap(), + ArrowDataType::Int64 + ); + assert_eq!( + to_arrow_type(&DataTypes::int()).unwrap(), + ArrowDataType::Int32 + ); + assert_eq!( + to_arrow_type(&DataTypes::float()).unwrap(), + ArrowDataType::Float32 + ); + assert_eq!( + to_arrow_type(&DataTypes::double()).unwrap(), + ArrowDataType::Float64 + ); + assert_eq!( + to_arrow_type(&DataTypes::char(16)).unwrap(), + ArrowDataType::Utf8 + ); + assert_eq!( + to_arrow_type(&DataTypes::string()).unwrap(), + ArrowDataType::Utf8 + ); + assert_eq!( + 
to_arrow_type(&DataTypes::decimal(10, 2)).unwrap(), + ArrowDataType::Decimal128(10, 2) + ); + assert_eq!( + to_arrow_type(&DataTypes::date()).unwrap(), + ArrowDataType::Date32 + ); + assert_eq!( + to_arrow_type(&DataTypes::time()).unwrap(), + ArrowDataType::Time32(arrow_schema::TimeUnit::Second) + ); + assert_eq!( + to_arrow_type(&DataTypes::time_with_precision(3)).unwrap(), + ArrowDataType::Time32(arrow_schema::TimeUnit::Millisecond) + ); + assert_eq!( + to_arrow_type(&DataTypes::time_with_precision(6)).unwrap(), + ArrowDataType::Time64(arrow_schema::TimeUnit::Microsecond) + ); + assert_eq!( + to_arrow_type(&DataTypes::time_with_precision(9)).unwrap(), + ArrowDataType::Time64(arrow_schema::TimeUnit::Nanosecond) + ); + assert_eq!( + to_arrow_type(&DataTypes::timestamp_with_precision(0)).unwrap(), + ArrowDataType::Timestamp(arrow_schema::TimeUnit::Second, None) + ); + assert_eq!( + to_arrow_type(&DataTypes::timestamp_with_precision(3)).unwrap(), + ArrowDataType::Timestamp(arrow_schema::TimeUnit::Millisecond, None) + ); + assert_eq!( + to_arrow_type(&DataTypes::timestamp_with_precision(6)).unwrap(), + ArrowDataType::Timestamp(arrow_schema::TimeUnit::Microsecond, None) + ); + assert_eq!( + to_arrow_type(&DataTypes::timestamp_with_precision(9)).unwrap(), + ArrowDataType::Timestamp(arrow_schema::TimeUnit::Nanosecond, None) + ); + assert_eq!( + to_arrow_type(&DataTypes::timestamp_ltz_with_precision(0)).unwrap(), + ArrowDataType::Timestamp(arrow_schema::TimeUnit::Second, None) + ); + assert_eq!( + to_arrow_type(&DataTypes::timestamp_ltz_with_precision(3)).unwrap(), + ArrowDataType::Timestamp(arrow_schema::TimeUnit::Millisecond, None) + ); + assert_eq!( + to_arrow_type(&DataTypes::timestamp_ltz_with_precision(6)).unwrap(), + ArrowDataType::Timestamp(arrow_schema::TimeUnit::Microsecond, None) + ); + assert_eq!( + to_arrow_type(&DataTypes::timestamp_ltz_with_precision(9)).unwrap(), + ArrowDataType::Timestamp(arrow_schema::TimeUnit::Nanosecond, None) + ); + assert_eq!( + 
to_arrow_type(&DataTypes::bytes()).unwrap(), + ArrowDataType::Binary + ); + assert_eq!( + to_arrow_type(&DataTypes::binary(16)).unwrap(), + ArrowDataType::FixedSizeBinary(16) + ); + + assert_eq!( + to_arrow_type(&DataTypes::array(DataTypes::int())).unwrap(), + ArrowDataType::List(Field::new_list_field(ArrowDataType::Int32, true).into()) + ); + + assert_eq!( + to_arrow_type(&DataTypes::map(DataTypes::string(), DataTypes::int())).unwrap(), + ArrowDataType::Map( + Arc::new(Field::new( + "entries", + ArrowDataType::Struct(arrow_schema::Fields::from(vec![ + Field::new("key", ArrowDataType::Utf8, false), + Field::new("value", ArrowDataType::Int32, true), + ])), + false, + )), + false, + ) + ); + + assert_eq!( + to_arrow_type(&DataTypes::row(vec![ + DataTypes::field("f1", DataTypes::int()), + DataTypes::field("f2", DataTypes::string()), + ])) + .unwrap(), + ArrowDataType::Struct(arrow_schema::Fields::from(vec![ + Field::new("f1", ArrowDataType::Int32, true), + Field::new("f2", ArrowDataType::Utf8, true), + ])) + ); + } + + #[test] + fn test_arrow_map_schema_strictness() { + let map_type = DataTypes::map(DataTypes::string(), DataTypes::int()); + let arrow_type = to_arrow_type(&map_type).unwrap(); + + if let ArrowDataType::Map(entries_field, _) = arrow_type { + assert!( + !entries_field.is_nullable(), + "Arrow Map 'entries' field must be strictly non-nullable" + ); + } else { + panic!("Expected ArrowDataType::Map, got {:?}", arrow_type); + } + } + + #[test] + fn test_from_arrow_type_preserves_container_field_nullability() { + let arrow_list = ArrowDataType::List(Arc::new(arrow_schema::Field::new( + "item", + ArrowDataType::Int32, + false, + ))); + match from_arrow_type(&arrow_list).unwrap() { + DataType::Array(at) => assert!(!at.get_element_type().is_nullable()), + other => panic!("expected Array, got {other:?}"), + } + + let entries_struct = ArrowDataType::Struct(arrow_schema::Fields::from(vec![ + arrow_schema::Field::new("key", ArrowDataType::Utf8, false), + 
arrow_schema::Field::new("value", ArrowDataType::Int32, false), + ])); + let entries_field = arrow_schema::Field::new("entries", entries_struct, false); + let arrow_map = ArrowDataType::Map(Arc::new(entries_field), false); + match from_arrow_type(&arrow_map).unwrap() { + DataType::Map(m) => { + assert!(!m.key_type().is_nullable()); + assert!(!m.value_type().is_nullable()); + } + other => panic!("expected Map, got {other:?}"), + } + } + + #[test] + fn test_parse_ipc_message() { + let empty_body: &[u8] = &le_bytes(&[0xFFFFFFFF, 0x00000000]); + let result = parse_ipc_message(empty_body); + assert_eq!( + result.unwrap_err().to_string(), + String::from( + "Fluss hitting Arrow error Parser error: Range [0, 4) is out of bounds.\n\n: ParseError(\"Range [0, 4) is out of bounds.\\n\\n\")." + ) + ); + + let invalid_data = &[]; + assert_eq!( + parse_ipc_message(invalid_data).unwrap_err().to_string(), + String::from( + "Fluss hitting Arrow error Parser error: Invalid data length: 0: ParseError(\"Invalid data length: 0\")." + ) + ); + + let data_with_invalid_continuation: &[u8] = &le_bytes(&[0x00000001, 0x00000000]); + assert_eq!( + parse_ipc_message(data_with_invalid_continuation) + .unwrap_err() + .to_string(), + String::from( + "Fluss hitting Arrow error Parser error: Invalid continuation marker: 1: ParseError(\"Invalid continuation marker: 1\")." + ) + ); + + let data_with_invalid_length: &[u8] = &le_bytes(&[0xFFFFFFFF, 0x00000001]); + assert_eq!( + parse_ipc_message(data_with_invalid_length) + .unwrap_err() + .to_string(), + String::from( + "Fluss hitting Arrow error Parser error: Invalid data length. Remaining data length 0 is shorter than specified size 1: ParseError(\"Invalid data length. Remaining data length 0 is shorter than specified size 1\")." 
+ ) + ); + + let data_with_invalid_length = &le_bytes(&[0xFFFFFFFF, 0x00000004, 0x00000000]); + assert_eq!( + parse_ipc_message(data_with_invalid_length) + .unwrap_err() + .to_string(), + String::from( + "Fluss hitting Arrow error Parser error: Not a record batch: ParseError(\"Not a record batch\")." + ) + ); + } + + #[test] + fn projection_rejects_out_of_bounds_index() { + let row_type = RowType::new(vec![ + DataField::new("id", DataTypes::int(), None), + DataField::new("name", DataTypes::string(), None), + ]); + let schema = to_arrow_schema(&row_type).unwrap(); + let result = + ReadContext::with_projection_pushdown(schema, Arc::new(row_type), vec![0, 2], false); + + assert!(matches!(result, Err(IllegalArgument { .. }))); + } + + #[test] + fn checksum_and_schema_id_read_minimum_header() { + // Header-only batches with record_count == 0 are valid; this covers the minimal bytes + // needed for checksum/schema_id access. + let mut data = vec![0u8; SCHEMA_ID_OFFSET + SCHEMA_ID_LENGTH]; + let crc = 0xA1B2C3D4u32; + let schema_id = 42i16; + LittleEndian::write_u32(&mut data[CRC_OFFSET..CRC_OFFSET + CRC_LENGTH], crc); + LittleEndian::write_i16( + &mut data[SCHEMA_ID_OFFSET..SCHEMA_ID_OFFSET + SCHEMA_ID_LENGTH], + schema_id, + ); + + let batch = LogRecordBatch::new(Bytes::from(data)); + assert_eq!(batch.checksum(), crc); + assert_eq!(batch.schema_id(), schema_id); + + let expected = crc32c(&batch.data[SCHEMA_ID_OFFSET..]); + assert_eq!(batch.compute_checksum(), expected); + } + + fn le_bytes(vals: &[u32]) -> Vec { + let mut out = Vec::with_capacity(vals.len() * 4); + for &v in vals { + out.extend_from_slice(&v.to_le_bytes()); + } + out + } + + #[test] + fn test_temporal_and_decimal_builder_validation() { + use crate::row::column_writer::ColumnWriter; + use arrow::array::Array; + + // Test valid builder creation with precision=10, scale=2 + let mut writer = ColumnWriter::create( + &DataTypes::decimal(10, 2), + &ArrowDataType::Decimal128(10, 2), + 0, + 256, + ) + .unwrap(); 
/// Checks decimal handling when rows are appended through `RowAppendRecordBatchBuilder`:
/// a value with a larger scale than the column is rounded to the column scale, and a value
/// that does not fit the column precision is rejected with a "precision overflow" error.
#[test]
fn test_decimal_rescaling_and_validation() -> Result<()> {
    use crate::row::{Datum, Decimal, GenericRow};
    use arrow::array::Decimal128Array;
    use bigdecimal::BigDecimal;
    use std::str::FromStr;

    // Test 1: Rescaling from scale 3 to scale 2
    let row_type = RowType::new(vec![DataField::new(
        "amount",
        DataTypes::decimal(10, 2),
        None,
    )]);
    let mut builder = RowAppendRecordBatchBuilder::new(&row_type)?;
    let decimal = Decimal::from_big_decimal(BigDecimal::from_str("123.456").unwrap(), 10, 3)?;
    let row = GenericRow {
        values: vec![Datum::Decimal(decimal)],
    };
    builder.append(&row)?;
    let batch = builder.build_arrow_record_batch()?;
    let array = batch
        .column(0)
        .as_any()
        .downcast_ref::<Decimal128Array>()
        .expect("decimal column should be backed by a Decimal128Array");
    assert_eq!(array.value(0), 12346); // 123.456 rounded to 2 decimal places
    assert_eq!(array.scale(), 2);

    // Test 2: Precision overflow (should error)
    let row_type = RowType::new(vec![DataField::new(
        "amount",
        DataTypes::decimal(5, 2),
        None,
    )]);
    let mut builder = RowAppendRecordBatchBuilder::new(&row_type)?;
    let decimal = Decimal::from_big_decimal(BigDecimal::from_str("123456.78").unwrap(), 10, 2)?;
    let row = GenericRow {
        values: vec![Datum::Decimal(decimal)],
    };
    let err = builder
        .append(&row)
        .expect_err("a value exceeding the column precision must be rejected");
    assert!(
        err.to_string().contains("precision overflow"),
        "unexpected error: {err}"
    );

    Ok(())
}
/// End-to-end conversion check: appends one row covering int, decimal, date, time
/// (millisecond and nanosecond precision) and timestamp (microsecond and nanosecond
/// precision) columns, builds the Arrow record batch, and asserts the value stored in
/// every column.
#[test]
fn test_all_types_end_to_end() -> Result<()> {
    use crate::row::{Date, Datum, Decimal, GenericRow, Time, TimestampLtz, TimestampNtz};
    use arrow::array::{
        Date32Array, Decimal128Array, Int32Array, Time32MillisecondArray,
        Time64NanosecondArray, TimestampMicrosecondArray, TimestampNanosecondArray,
    };
    use bigdecimal::BigDecimal;
    use std::str::FromStr;

    // Reads row 0 of column `$idx` after downcasting it to the concrete Arrow array `$array`.
    macro_rules! first_value {
        ($batch:expr, $idx:expr, $array:ty) => {
            $batch
                .column($idx)
                .as_any()
                .downcast_ref::<$array>()
                .expect("column should be backed by the expected Arrow array type")
                .value(0)
        };
    }

    // Schema with int, decimal, date, time (ms + ns), timestamps (μs + ns)
    let row_type = RowType::new(vec![
        DataField::new("id".to_string(), DataTypes::int(), None),
        DataField::new("amount".to_string(), DataTypes::decimal(10, 2), None),
        DataField::new("date".to_string(), DataTypes::date(), None),
        DataField::new(
            "time_ms".to_string(),
            DataTypes::time_with_precision(3),
            None,
        ),
        DataField::new(
            "time_ns".to_string(),
            DataTypes::time_with_precision(9),
            None,
        ),
        DataField::new(
            "ts_us".to_string(),
            DataTypes::timestamp_with_precision(6),
            None,
        ),
        DataField::new(
            "ts_ltz_ns".to_string(),
            DataTypes::timestamp_ltz_with_precision(9),
            None,
        ),
    ]);

    let mut builder = RowAppendRecordBatchBuilder::new(&row_type)?;

    // Append rows with various data types
    let row = GenericRow {
        values: vec![
            Datum::Int32(1),
            Datum::Decimal(Decimal::from_big_decimal(
                BigDecimal::from_str("123.456").unwrap(),
                10,
                3,
            )?),
            // 18000 days since epoch = 2019-04-14
            Datum::Date(Date::new(18000)),
            // 43200000 ms = 12:00:00.000 (noon)
            Datum::Time(Time::new(43200000)),
            // 12345 ms = 00:00:12.345
            Datum::Time(Time::new(12345)),
            // 1609459200000 ms = 2021-01-01 00:00:00 UTC, with 123456 additional nanoseconds
            Datum::TimestampNtz(TimestampNtz::from_millis_nanos(1609459200000, 123456)?),
            // 1609459200000 ms = 2021-01-01 00:00:00 UTC, with 987654 additional nanoseconds
            Datum::TimestampLtz(TimestampLtz::from_millis_nanos(1609459200000, 987654)?),
        ],
    };
    builder.append(&row)?;

    let batch = builder.build_arrow_record_batch()?;

    // Verify all conversions
    assert_eq!(first_value!(batch, 0, Int32Array), 1);

    // 123.456 rounded to 2 decimal places
    assert_eq!(first_value!(batch, 1, Decimal128Array), 12346);

    assert_eq!(first_value!(batch, 2, Date32Array), 18000);

    assert_eq!(first_value!(batch, 3, Time32MillisecondArray), 43200000);

    // 12345 ms expressed in nanoseconds
    assert_eq!(first_value!(batch, 4, Time64NanosecondArray), 12345000000);

    // Timestamp with sub-millisecond nanos preserved
    assert_eq!(
        first_value!(batch, 5, TimestampMicrosecondArray),
        1609459200000123
    );

    assert_eq!(
        first_value!(batch, 6, TimestampNanosecondArray),
        1609459200000987654
    );

    Ok(())
}
just like in-memory) + let batch = batches.next().expect("Should have at least one batch")?; + assert!(batch.size_in_bytes() > 0); + assert_eq!(batch.record_count(), 2); + + Ok(()) + } +} diff --git a/fluss-rust/crates/fluss/src/record/error.rs b/fluss-rust/crates/fluss/src/record/error.rs new file mode 100644 index 0000000000..22704a0cdf --- /dev/null +++ b/fluss-rust/crates/fluss/src/record/error.rs @@ -0,0 +1,27 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::io; +use thiserror::Error; + +#[derive(Error, Debug)] +#[non_exhaustive] +#[allow(dead_code)] +pub enum Error { + #[error(transparent)] + Io(#[from] io::Error), +} diff --git a/fluss-rust/crates/fluss/src/record/kv/kv_record.rs b/fluss-rust/crates/fluss/src/record/kv/kv_record.rs new file mode 100644 index 0000000000..ed67aa0e24 --- /dev/null +++ b/fluss-rust/crates/fluss/src/record/kv/kv_record.rs @@ -0,0 +1,351 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Key-Value record implementation. +//! +//! This module provides the KvRecord struct which represents an immutable key-value record. +//! The record format is: +//! - Length => Int32 +//! - KeyLength => Unsigned VarInt +//! - Key => bytes +//! - Row => BinaryRow (optional, if null then this is a deletion record) + +use bytes::{BufMut, Bytes, BytesMut}; +use std::io; + +use crate::row::RowDecoder; +use crate::row::compacted::CompactedRow; +use crate::util::varint::{ + read_unsigned_varint_bytes, size_of_unsigned_varint, write_unsigned_varint_buf, +}; + +/// Length field size in bytes +pub const LENGTH_LENGTH: usize = 4; + +/// A key-value record containing raw key and value bytes. +/// +/// The schema is: +/// - Length => Int32 +/// - KeyLength => Unsigned VarInt +/// - Key => bytes +/// - Value => bytes (BinaryRow, written directly without length prefix) +/// +/// When the value is None (deletion), no Value bytes are present. +/// +/// This struct stores only raw bytes. To decode the value into a typed row, +/// use the `row()` method with a RowDecoder (typically obtained from the iterator). +/// +/// Reference implementation: +/// +#[derive(Debug, Clone)] +pub struct KvRecord { + key: Bytes, + value_bytes: Option, + size_in_bytes: usize, +} + +impl KvRecord { + /// Get the key bytes. + pub fn key(&self) -> &Bytes { + &self.key + } + + /// Get the raw value bytes (for testing). 
+ #[cfg(test)] + pub(crate) fn value_bytes(&self) -> Option<&Bytes> { + self.value_bytes.as_ref() + } + + /// Decode the value bytes into a typed row using the provided decoder. + /// This creates a lightweight CompactedRow view over the raw bytes. + /// Actual field parsing is lazy (on first access). + pub fn row<'a>(&'a self, decoder: &dyn RowDecoder) -> Option> { + self.value_bytes.as_ref().map(|bytes| { + // Decode on-demand - CompactedRow<'a> lifetime tied to &'a self + decoder.decode(bytes.as_ref()) + }) + } + + /// Calculate the total size of the record when serialized (including length prefix). + pub fn size_of(key: &[u8], value: Option<&[u8]>) -> usize { + Self::size_without_length(key, value) + LENGTH_LENGTH + } + + /// Calculate the size without the length prefix. + fn size_without_length(key: &[u8], value: Option<&[u8]>) -> usize { + let key_len = key.len(); + let key_len_size = size_of_unsigned_varint(key_len as u32); + + match value { + Some(v) => key_len_size.saturating_add(key_len).saturating_add(v.len()), + None => { + // Deletion: no value bytes + key_len_size.saturating_add(key_len) + } + } + } + + /// Write a KV record to a buffer. + /// + /// Returns the number of bytes written. + pub fn write_to_buf(buf: &mut BytesMut, key: &[u8], value: Option<&[u8]>) -> io::Result { + let size_in_bytes = Self::size_without_length(key, value); + + let size_i32 = i32::try_from(size_in_bytes).map_err(|_| { + io::Error::new( + io::ErrorKind::InvalidInput, + format!("Record size {size_in_bytes} exceeds i32::MAX"), + ) + })?; + buf.put_i32_le(size_i32); + let key_len = key.len() as u32; + write_unsigned_varint_buf(key_len, buf); + + buf.put_slice(key); + + if let Some(v) = value { + buf.put_slice(v); + } + // For None (deletion), don't write any value bytes + + Ok(size_in_bytes + LENGTH_LENGTH) + } + + /// Read a KV record from bytes at the given position. + /// + /// Returns the KvRecord and the number of bytes consumed. 
+ /// The record contains only raw bytes; use `row()` with a RowDecoder to decode the value. + pub fn read_from(bytes: &Bytes, position: usize) -> io::Result<(Self, usize)> { + if bytes.len() < position.saturating_add(LENGTH_LENGTH) { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "Not enough bytes to read record length", + )); + } + + let size_in_bytes_i32 = i32::from_le_bytes([ + bytes[position], + bytes[position + 1], + bytes[position + 2], + bytes[position + 3], + ]); + + if size_in_bytes_i32 < 0 { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + format!("Invalid record length: {size_in_bytes_i32}"), + )); + } + + let size_in_bytes = size_in_bytes_i32 as usize; + + let total_size = size_in_bytes.checked_add(LENGTH_LENGTH).ok_or_else(|| { + io::Error::new( + io::ErrorKind::InvalidData, + format!("Record size overflow: {size_in_bytes} + {LENGTH_LENGTH}"), + ) + })?; + + let available = bytes.len().saturating_sub(position); + if available < total_size { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + format!( + "Not enough bytes to read record: expected {total_size}, available {available}" + ), + )); + } + + let mut current_offset = position + LENGTH_LENGTH; + let record_end = position + total_size; + + // Read key length as unsigned varint (bounded by record end) + let (key_len, varint_size) = + read_unsigned_varint_bytes(&bytes[current_offset..record_end])?; + current_offset += varint_size; + + // Read key bytes + let key_end = current_offset + key_len as usize; + if key_end > position + total_size { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "Key length exceeds record size", + )); + } + let key = bytes.slice(current_offset..key_end); + current_offset = key_end; + + // Read value bytes directly (don't decode yet - will decode on-demand) + let value_bytes = if current_offset < record_end { + // Value is present: all remaining bytes are the value + Some(bytes.slice(current_offset..record_end)) + } else { 
+ // No remaining bytes: this is a deletion record + None + }; + + Ok(( + Self { + key, + value_bytes, + size_in_bytes: total_size, + }, + total_size, + )) + } + + /// Get the total size in bytes of this record. + pub fn get_size_in_bytes(&self) -> usize { + self.size_in_bytes + } + + /// Check if this is a deletion record (no value). + pub fn is_deletion(&self) -> bool { + self.value_bytes.is_none() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_kv_record_basic_operations() { + let key = b"test_key"; + let value = b"test_value"; + + // Test size calculation with value + let size_with_value = KvRecord::size_of(key, Some(value)); + assert_eq!( + size_with_value, + LENGTH_LENGTH + size_of_unsigned_varint(key.len() as u32) + key.len() + value.len() + ); + + // Test size calculation without value (deletion) + let size_without_value = KvRecord::size_of(key, None); + assert_eq!( + size_without_value, + LENGTH_LENGTH + size_of_unsigned_varint(key.len() as u32) + key.len() + ); + + // Test write/read round trip with value + let mut buf = BytesMut::new(); + let written = KvRecord::write_to_buf(&mut buf, key, Some(value)).unwrap(); + + let bytes = buf.freeze(); + let (record, read_size) = KvRecord::read_from(&bytes, 0).unwrap(); + + assert_eq!(written, read_size); + assert_eq!(record.key().as_ref(), key); + assert_eq!(record.value_bytes().unwrap().as_ref(), value); + assert_eq!(record.get_size_in_bytes(), written); + assert!(!record.is_deletion()); + + // Test deletion record (no value) + let delete_key = b"delete_me"; + let mut buf = BytesMut::new(); + let written = KvRecord::write_to_buf(&mut buf, delete_key, None).unwrap(); + + let bytes = buf.freeze(); + let (record, read_size) = KvRecord::read_from(&bytes, 0).unwrap(); + + assert_eq!(written, read_size); + assert_eq!(record.key().as_ref(), delete_key); + assert!(record.is_deletion()); + assert!(record.value_bytes().is_none()); + } + + #[test] + fn test_kv_record_multiple_records() { + // Test 
/// Malformed length prefixes must be reported as errors with the right kind.
#[test]
fn test_invalid_record_lengths() {
    // Negative length
    let mut buf = BytesMut::new();
    buf.put_i32_le(-1);
    buf.put_u8(1); // Some dummy data
    buf.put_slice(b"key");
    let err = KvRecord::read_from(&buf.freeze(), 0).unwrap_err();
    assert_eq!(err.kind(), io::ErrorKind::InvalidData);

    // Largest representable length: valid as a number, but far beyond the buffer
    let mut buf = BytesMut::new();
    buf.put_i32_le(i32::MAX);
    buf.put_u8(1); // Some dummy data
    let err = KvRecord::read_from(&buf.freeze(), 0).unwrap_err();
    assert_eq!(err.kind(), io::ErrorKind::UnexpectedEof);

    // Impossibly large but non-negative length with no payload at all
    let mut buf = BytesMut::new();
    buf.put_i32_le(1_000_000);
    let err = KvRecord::read_from(&buf.freeze(), 0).unwrap_err();
    assert_eq!(err.kind(), io::ErrorKind::UnexpectedEof);
}
+ +use bytes::Bytes; +use std::io; +use std::sync::Arc; + +use crate::error::Result; +use crate::record::kv::{KvRecord, ReadContext}; +use crate::row::RowDecoder; + +// Field lengths in bytes +pub const LENGTH_LENGTH: usize = 4; +pub const MAGIC_LENGTH: usize = 1; +pub const CRC_LENGTH: usize = 4; +pub const SCHEMA_ID_LENGTH: usize = 2; +pub const ATTRIBUTE_LENGTH: usize = 1; +pub const WRITE_CLIENT_ID_LENGTH: usize = 8; +pub const BATCH_SEQUENCE_LENGTH: usize = 4; +pub const RECORDS_COUNT_LENGTH: usize = 4; + +// Field offsets +pub const LENGTH_OFFSET: usize = 0; +pub const MAGIC_OFFSET: usize = LENGTH_OFFSET + LENGTH_LENGTH; +pub const CRC_OFFSET: usize = MAGIC_OFFSET + MAGIC_LENGTH; +pub const SCHEMA_ID_OFFSET: usize = CRC_OFFSET + CRC_LENGTH; +pub const ATTRIBUTES_OFFSET: usize = SCHEMA_ID_OFFSET + SCHEMA_ID_LENGTH; +pub const WRITE_CLIENT_ID_OFFSET: usize = ATTRIBUTES_OFFSET + ATTRIBUTE_LENGTH; +pub const BATCH_SEQUENCE_OFFSET: usize = WRITE_CLIENT_ID_OFFSET + WRITE_CLIENT_ID_LENGTH; +pub const RECORDS_COUNT_OFFSET: usize = BATCH_SEQUENCE_OFFSET + BATCH_SEQUENCE_LENGTH; +pub const RECORDS_OFFSET: usize = RECORDS_COUNT_OFFSET + RECORDS_COUNT_LENGTH; + +/// Total header size +pub const RECORD_BATCH_HEADER_SIZE: usize = RECORDS_OFFSET; + +/// Overhead of the batch (length field) +pub const KV_OVERHEAD: usize = LENGTH_OFFSET + LENGTH_LENGTH; + +/// A KV record batch. +/// +/// This struct provides read access to a serialized KV record batch. +// Reference implementation: +// https://github.com/apache/fluss/blob/main/fluss-common/src/main/java/org/apache/fluss/record/KvRecordBatch.java +pub struct KvRecordBatch { + data: Bytes, + position: usize, +} + +impl KvRecordBatch { + /// Create a new KvRecordBatch pointing to the given data at the specified position. + pub fn new(data: Bytes, position: usize) -> Self { + Self { data, position } + } + + /// Get the size in bytes of this batch. 
+ pub fn size_in_bytes(&self) -> io::Result { + if self.data.len() < self.position.saturating_add(LENGTH_LENGTH) { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "Not enough bytes to read batch length", + )); + } + let length_i32 = i32::from_le_bytes([ + self.data[self.position], + self.data[self.position + 1], + self.data[self.position + 2], + self.data[self.position + 3], + ]); + + if length_i32 < 0 { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + format!("Invalid batch length: {length_i32}"), + )); + } + + let length = length_i32 as usize; + + Ok(length.saturating_add(KV_OVERHEAD)) + } + + /// Check if this batch is valid by verifying the checksum. + pub fn is_valid(&self) -> bool { + if !matches!(self.size_in_bytes(), Ok(s) if s >= RECORD_BATCH_HEADER_SIZE) { + return false; + } + + match (self.checksum(), self.compute_checksum()) { + (Ok(stored), Ok(computed)) => stored == computed, + _ => false, + } + } + + /// Get the magic byte. + pub fn magic(&self) -> io::Result { + if self.data.len() < self.position.saturating_add(MAGIC_OFFSET).saturating_add(1) { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "Not enough bytes to read magic byte", + )); + } + Ok(self.data[self.position + MAGIC_OFFSET]) + } + + /// Get the checksum. + pub fn checksum(&self) -> io::Result { + if self.data.len() < self.position.saturating_add(CRC_OFFSET).saturating_add(4) { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "Not enough bytes to read checksum", + )); + } + Ok(u32::from_le_bytes([ + self.data[self.position + CRC_OFFSET], + self.data[self.position + CRC_OFFSET + 1], + self.data[self.position + CRC_OFFSET + 2], + self.data[self.position + CRC_OFFSET + 3], + ])) + } + + /// Compute the checksum of this batch. 
+ pub fn compute_checksum(&self) -> io::Result { + let size = self.size_in_bytes()?; + if size < RECORD_BATCH_HEADER_SIZE { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + format!("Batch size {size} is less than header size {RECORD_BATCH_HEADER_SIZE}"), + )); + } + + let start = self.position.saturating_add(SCHEMA_ID_OFFSET); + let end = self.position.saturating_add(size); + + if end > self.data.len() || start >= end { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "Not enough bytes to compute checksum", + )); + } + + Ok(crc32c::crc32c(&self.data[start..end])) + } + + /// Get the schema ID. + pub fn schema_id(&self) -> io::Result { + if self.data.len() + < self + .position + .saturating_add(SCHEMA_ID_OFFSET) + .saturating_add(2) + { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "Not enough bytes to read schema ID", + )); + } + Ok(i16::from_le_bytes([ + self.data[self.position + SCHEMA_ID_OFFSET], + self.data[self.position + SCHEMA_ID_OFFSET + 1], + ])) + } + + /// Get the writer ID. + pub fn writer_id(&self) -> io::Result { + if self.data.len() + < self + .position + .saturating_add(WRITE_CLIENT_ID_OFFSET) + .saturating_add(8) + { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "Not enough bytes to read writer ID", + )); + } + Ok(i64::from_le_bytes([ + self.data[self.position + WRITE_CLIENT_ID_OFFSET], + self.data[self.position + WRITE_CLIENT_ID_OFFSET + 1], + self.data[self.position + WRITE_CLIENT_ID_OFFSET + 2], + self.data[self.position + WRITE_CLIENT_ID_OFFSET + 3], + self.data[self.position + WRITE_CLIENT_ID_OFFSET + 4], + self.data[self.position + WRITE_CLIENT_ID_OFFSET + 5], + self.data[self.position + WRITE_CLIENT_ID_OFFSET + 6], + self.data[self.position + WRITE_CLIENT_ID_OFFSET + 7], + ])) + } + + /// Get the batch sequence. 
+ pub fn batch_sequence(&self) -> io::Result { + if self.data.len() + < self + .position + .saturating_add(BATCH_SEQUENCE_OFFSET) + .saturating_add(4) + { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "Not enough bytes to read batch sequence", + )); + } + Ok(i32::from_le_bytes([ + self.data[self.position + BATCH_SEQUENCE_OFFSET], + self.data[self.position + BATCH_SEQUENCE_OFFSET + 1], + self.data[self.position + BATCH_SEQUENCE_OFFSET + 2], + self.data[self.position + BATCH_SEQUENCE_OFFSET + 3], + ])) + } + + /// Get the number of records in this batch. + pub fn record_count(&self) -> io::Result { + if self.data.len() + < self + .position + .saturating_add(RECORDS_COUNT_OFFSET) + .saturating_add(4) + { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "Not enough bytes to read record count", + )); + } + Ok(i32::from_le_bytes([ + self.data[self.position + RECORDS_COUNT_OFFSET], + self.data[self.position + RECORDS_COUNT_OFFSET + 1], + self.data[self.position + RECORDS_COUNT_OFFSET + 2], + self.data[self.position + RECORDS_COUNT_OFFSET + 3], + ])) + } + + /// Create an iterable collection of records in this batch. + /// + /// This validates the batch checksum before returning the records. + /// For trusted data paths, use `records_unchecked()` to skip validation. + /// + /// Mirrors: KvRecordBatch.records(ReadContext) + pub fn records(&self, read_context: &dyn ReadContext) -> Result { + if !self.is_valid() { + return Err(crate::error::Error::IoUnexpectedError { + message: "Invalid batch checksum".to_string(), + source: io::Error::new(io::ErrorKind::InvalidData, "Invalid batch checksum"), + }); + } + self.records_unchecked(read_context) + } + + /// Create an iterable collection of records in this batch without validating the checksum. 
+ pub fn records_unchecked(&self, read_context: &dyn ReadContext) -> Result { + let size = self.size_in_bytes()?; + let count = self.record_count()?; + let schema_id = self.schema_id()?; + + if count < 0 { + return Err(crate::error::Error::IoUnexpectedError { + message: format!("Invalid record count: {count}"), + source: io::Error::new(io::ErrorKind::InvalidData, "Invalid record count"), + }); + } + + // Get row decoder for this schema from context (cached) + let row_decoder = read_context.get_row_decoder(schema_id)?; + + Ok(KvRecords { + iter: KvRecordIterator { + data: self.data.clone(), + position: self.position + RECORDS_OFFSET, + end: self.position + size, + remaining_count: count, + }, + row_decoder, + }) + } +} + +/// Iterable collection of KV records with associated decoder. +/// +/// This wrapper provides both iteration capability and access to the row decoder +/// needed to decode record values into typed rows. +pub struct KvRecords { + iter: KvRecordIterator, + row_decoder: Arc, +} + +impl KvRecords { + /// Get a reference to the row decoder for decoding record values. + /// + /// Returns a reference tied to the lifetime of `&self`. + /// Use this when iterating by reference. + pub fn decoder(&self) -> &dyn RowDecoder { + &*self.row_decoder + } + + /// Get an owned Arc to the row decoder. + /// + /// Returns a cloned Arc that can outlive the KvRecords, + /// allowing you to grab it before consuming the iterator. + /// Useful if you must keep the decoder beyond the iterable’s lifetime(collect then decode style) + pub fn decoder_arc(&self) -> Arc { + Arc::clone(&self.row_decoder) + } +} + +impl IntoIterator for KvRecords { + type Item = io::Result; + type IntoIter = KvRecordIterator; + + fn into_iter(self) -> Self::IntoIter { + self.iter + } +} + +/// Iterator over records in a KV record batch. 
+pub struct KvRecordIterator { + data: Bytes, + position: usize, + end: usize, + remaining_count: i32, +} + +impl Iterator for KvRecordIterator { + type Item = io::Result; + + fn next(&mut self) -> Option { + if self.remaining_count <= 0 || self.position >= self.end { + return None; + } + + match KvRecord::read_from(&self.data, self.position) { + Ok((record, size)) => { + self.position += size; + self.remaining_count -= 1; + Some(Ok(record)) + } + Err(e) => { + self.remaining_count = 0; // Stop iteration on error + Some(Err(e)) + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::metadata::{DataTypes, KvFormat}; + use crate::record::kv::test_util::TestReadContext; + use crate::record::kv::{CURRENT_KV_MAGIC_VALUE, KvRecordBatchBuilder}; + use crate::row::InternalRow; + use crate::row::binary::BinaryWriter; + + use bytes::{BufMut, BytesMut}; + + #[test] + fn test_invalid_batch_lengths() { + // Test negative length + let mut buf = BytesMut::new(); + buf.put_i32_le(-1); + let bytes = buf.freeze(); + let batch = KvRecordBatch::new(bytes, 0); + assert!(batch.size_in_bytes().is_err()); // Should error for invalid + assert!(!batch.is_valid()); + + // Test overflow length + let mut buf = BytesMut::new(); + buf.put_i32_le(i32::MAX); + let bytes = buf.freeze(); + let batch = KvRecordBatch::new(bytes, 0); + assert!(!batch.is_valid()); + + // Test too-short buffer + let mut buf = BytesMut::new(); + buf.put_i32_le(100); // Claims 100 bytes but buffer is tiny + let bytes = buf.freeze(); + let batch = KvRecordBatch::new(bytes, 0); + assert!(!batch.is_valid()); + } + + #[test] + fn test_kv_record_batch_build_and_read() { + use crate::row::compacted::CompactedRowWriter; + + let schema_id = 42; + let write_limit = 4096; + + let mut builder = KvRecordBatchBuilder::new(schema_id, write_limit, KvFormat::COMPACTED); + builder.set_writer_state(100, 5); + + let key1 = b"key1"; + let mut value1_writer = CompactedRowWriter::new(1); + value1_writer.write_bytes(&[1, 2, 3, 
4, 5]); + + let row_bytes = value1_writer.buffer(); + builder.append_row(key1, Some(row_bytes)).unwrap(); + + let key2 = b"key2"; + builder.append_row(key2, None).unwrap(); + + let bytes = builder.build().unwrap(); + + let batch = KvRecordBatch::new(bytes.clone(), 0); + assert!(batch.is_valid()); + assert_eq!(batch.magic().unwrap(), CURRENT_KV_MAGIC_VALUE); + assert_eq!(batch.schema_id().unwrap(), schema_id as i16); + assert_eq!(batch.writer_id().unwrap(), 100); + assert_eq!(batch.batch_sequence().unwrap(), 5); + assert_eq!(batch.record_count().unwrap(), 2); + + // Create ReadContext for reading + let read_context = TestReadContext::compacted(vec![DataTypes::bytes()]); + + // Iterate and verify records using typed API + let records = batch.records(&read_context).unwrap(); + let decoder = records.decoder_arc(); // Get Arc before consuming + + let mut iter = records.into_iter(); + let record1 = iter.next().unwrap().unwrap(); + assert_eq!(record1.key().as_ref(), key1); + assert!(!record1.is_deletion()); + let row1 = record1.row(&*decoder).unwrap(); + assert_eq!(row1.get_bytes(0).unwrap(), &[1, 2, 3, 4, 5]); + + let record2 = iter.next().unwrap().unwrap(); + assert_eq!(record2.key().as_ref(), key2); + assert!(record2.is_deletion()); + + assert!(iter.next().is_none()); + } +} diff --git a/fluss-rust/crates/fluss/src/record/kv/kv_record_batch_builder.rs b/fluss-rust/crates/fluss/src/record/kv/kv_record_batch_builder.rs new file mode 100644 index 0000000000..0e806337fd --- /dev/null +++ b/fluss-rust/crates/fluss/src/record/kv/kv_record_batch_builder.rs @@ -0,0 +1,578 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! KV record batch builder implementation. +//! +//! This module provides the KvRecordBatchBuilder for building batches of KV records. + +use crate::error::{Error, Result}; +use crate::metadata::KvFormat; +use crate::record::kv::kv_record::KvRecord; +use crate::record::kv::kv_record_batch::{ + ATTRIBUTES_OFFSET, BATCH_SEQUENCE_OFFSET, CRC_OFFSET, LENGTH_LENGTH, LENGTH_OFFSET, + MAGIC_OFFSET, RECORD_BATCH_HEADER_SIZE, RECORDS_COUNT_OFFSET, SCHEMA_ID_OFFSET, + WRITE_CLIENT_ID_OFFSET, +}; +use crate::record::kv::{CURRENT_KV_MAGIC_VALUE, NO_BATCH_SEQUENCE, NO_WRITER_ID}; +use bytes::{Bytes, BytesMut}; +use log::warn; +use std::io; + +/// Builder for KvRecordBatch. +/// +/// This builder accumulates KV records and produces a serialized batch with proper +/// header information and checksums. +// Reference implementation: +// https://github.com/apache/fluss/blob/main/fluss-common/src/main/java/org/apache/fluss/record/KvRecordBatchBuilder.java +pub struct KvRecordBatchBuilder { + schema_id: i32, + magic: u8, + write_limit: usize, + buffer: BytesMut, + writer_id: i64, + batch_sequence: i32, + current_record_number: i32, + size_in_bytes: usize, + is_closed: bool, + kv_format: KvFormat, + aborted: bool, + built_buffer: Option, +} + +impl KvRecordBatchBuilder { + /// Create a new KvRecordBatchBuilder. 
+ /// + /// # Arguments + /// * `schema_id` - The schema ID for records in this batch (must fit in i16) + /// * `write_limit` - Maximum bytes that can be appended + /// * `kv_format` - The KV format (Compacted, Indexed, or Aligned) + pub fn new(schema_id: i32, write_limit: usize, kv_format: KvFormat) -> Self { + assert!( + schema_id <= i16::MAX as i32, + "schema_id shouldn't be greater than the max value of i16: {}", + i16::MAX + ); + + let mut buffer = BytesMut::with_capacity(write_limit.max(RECORD_BATCH_HEADER_SIZE)); + + // Reserve space for header (we'll write it at the end) + buffer.resize(RECORD_BATCH_HEADER_SIZE, 0); + + Self { + schema_id, + magic: CURRENT_KV_MAGIC_VALUE, + write_limit, + buffer, + writer_id: NO_WRITER_ID, + batch_sequence: NO_BATCH_SEQUENCE, + current_record_number: 0, + size_in_bytes: RECORD_BATCH_HEADER_SIZE, + is_closed: false, + kv_format, + aborted: false, + built_buffer: None, + } + } + + /// Check if there is room for a new record containing the given key and row bytes. + /// If no records have been appended, this always returns true. + pub fn has_room_for_row(&self, key: &[u8], row_bytes: Option<&[u8]>) -> bool { + self.size_in_bytes + KvRecord::size_of(key, row_bytes) <= self.write_limit + } + + /// Append a KV record with row bytes to the batch. 
+ /// + /// Returns an error if: + /// - The builder has been aborted + /// - The builder is closed + /// - Adding this record would exceed the write limit + /// - The maximum number of records is exceeded + /// - The KV format is not COMPACTED + pub fn append_row(&mut self, key: &[u8], row_bytes: Option<&[u8]>) -> io::Result<()> { + if self.kv_format != KvFormat::COMPACTED { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "append_row can only be used with KvFormat::COMPACTED", + )); + } + + if self.aborted { + return Err(io::Error::other( + "Tried to append a record, but KvRecordBatchBuilder has already been aborted", + )); + } + + if self.is_closed { + return Err(io::Error::other( + "Tried to append a record, but KvRecordBatchBuilder is closed for record appends", + )); + } + + // Check record count limit before mutation + if self.current_record_number == i32::MAX { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + format!( + "Maximum number of records per batch exceeded, max records: {}", + i32::MAX + ), + )); + } + + let record_size = KvRecord::size_of(key, row_bytes); + if self.size_in_bytes + record_size > self.write_limit { + return Err(io::Error::new( + io::ErrorKind::WriteZero, + format!( + "Adding record would exceed write limit: {} + {} > {}", + self.size_in_bytes, record_size, self.write_limit + ), + )); + } + + let record_byte_size = KvRecord::write_to_buf(&mut self.buffer, key, row_bytes)?; + debug_assert_eq!(record_byte_size, record_size, "Record size mismatch"); + + self.current_record_number += 1; + self.size_in_bytes += record_byte_size; + + // Invalidate cached buffer since we modified the batch + self.built_buffer = None; + + Ok(()) + } + + /// Set the writer state (writer ID and batch base sequence). + /// + /// This invalidates any cached buffer, ensuring the batch header will be rebuilt + /// on the next call to [`build`](Self::build). 
+ pub fn set_writer_state(&mut self, writer_id: i64, batch_base_sequence: i32) { + self.writer_id = writer_id; + self.batch_sequence = batch_base_sequence; + // Invalidate cached buffer since header fields changed + self.built_buffer = None; + } + + /// Build the batch and return the serialized bytes. + /// + /// This can be called multiple times as the batch is cached after the first build. + /// + /// # Caching and Mutations + /// + /// The builder caches the result after the first successful build. However, the cache + /// is invalidated (and the batch rebuilt) if any of the following occur after building: + /// - Calling [`append_row`](Self::append_row) to add records + /// - Calling [`set_writer_state`](Self::set_writer_state) to modify writer metadata + /// + /// This allows the builder to be reused with different writer states or to continue + /// appending records after an initial build, but callers should be aware that the + /// built bytes may change if mutations occur between builds. + /// + /// Note: [`close`](Self::close) prevents further appends but does not prevent writer state modifications. + pub fn build(&mut self) -> Result { + if self.aborted { + return Err(Error::UnexpectedError { + message: "Attempting to build an aborted record batch".to_string(), + source: None, + }); + } + + if let Some(ref cached) = self.built_buffer { + return Ok(cached.clone()); + } + + self.write_batch_header()?; + let bytes = self.buffer.clone().freeze(); + self.built_buffer = Some(bytes); + Ok(self.built_buffer.as_ref().unwrap().clone()) + } + + /// Get the writer ID. + pub fn writer_id(&self) -> i64 { + self.writer_id + } + + /// Get the batch sequence. + pub fn batch_sequence(&self) -> i32 { + self.batch_sequence + } + + /// Check if the builder is closed. + pub fn is_closed(&self) -> bool { + self.is_closed + } + + /// Abort the builder. + /// After aborting, no more records can be appended and the batch cannot be built. 
+ pub fn abort(&mut self) { + self.aborted = true; + } + + /// Close the builder. + /// After closing, no more records can be appended, but the batch can still be built. + pub fn close(&mut self) -> Result<()> { + if self.aborted { + return Err(Error::UnexpectedError { + message: "Cannot close KvRecordBatchBuilder as it has already been aborted" + .to_string(), + source: None, + }); + } + self.is_closed = true; + Ok(()) + } + + /// Get the current size in bytes of the batch. + pub fn get_size_in_bytes(&self) -> usize { + self.size_in_bytes + } + + // ----------------------- Internal methods ------------------------------- + + /// Write the batch header. + fn write_batch_header(&mut self) -> io::Result<()> { + let size_without_length = self.size_in_bytes - LENGTH_LENGTH; + let total_size = i32::try_from(size_without_length).map_err(|_| { + io::Error::new( + io::ErrorKind::InvalidInput, + format!("Batch size {size_without_length} exceeds i32::MAX"), + ) + })?; + + // Compute attributes before borrowing buffer mutably + let attributes = self.compute_attributes(); + + // Write to the beginning of the buffer + let header = &mut self.buffer[0..RECORD_BATCH_HEADER_SIZE]; + + // Write length + header[LENGTH_OFFSET..LENGTH_OFFSET + LENGTH_LENGTH] + .copy_from_slice(&total_size.to_le_bytes()); + + // Write magic + header[MAGIC_OFFSET] = self.magic; + + // Write empty CRC first (will update later) + header[CRC_OFFSET..CRC_OFFSET + 4].copy_from_slice(&0u32.to_le_bytes()); + + // Write schema ID + header[SCHEMA_ID_OFFSET..SCHEMA_ID_OFFSET + 2] + .copy_from_slice(&(self.schema_id as i16).to_le_bytes()); + + // Write attributes + header[ATTRIBUTES_OFFSET] = attributes; + + // Write writer ID + header[WRITE_CLIENT_ID_OFFSET..WRITE_CLIENT_ID_OFFSET + 8] + .copy_from_slice(&self.writer_id.to_le_bytes()); + + // Write batch sequence + header[BATCH_SEQUENCE_OFFSET..BATCH_SEQUENCE_OFFSET + 4] + .copy_from_slice(&self.batch_sequence.to_le_bytes()); + + // Write record count + 
header[RECORDS_COUNT_OFFSET..RECORDS_COUNT_OFFSET + 4] + .copy_from_slice(&self.current_record_number.to_le_bytes()); + + // Compute and update CRC + let crc = crc32c::crc32c(&self.buffer[SCHEMA_ID_OFFSET..self.size_in_bytes]); + self.buffer[CRC_OFFSET..CRC_OFFSET + 4].copy_from_slice(&crc.to_le_bytes()); + + Ok(()) + } + + /// Compute the attributes byte. + fn compute_attributes(&self) -> u8 { + // Currently no attributes are used + 0 + } +} + +impl Drop for KvRecordBatchBuilder { + fn drop(&mut self) { + // Warn if the builder has records but was never built or was aborted + if self.current_record_number > 0 && !self.aborted && self.built_buffer.is_none() { + warn!( + "Warning: KvRecordBatchBuilder dropped with {} record(s) that were never built. \ + Call build() to serialize the batch before dropping.", + self.current_record_number + ); + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::metadata::{DataTypes, RowType}; + use crate::row::binary::BinaryWriter; + use crate::row::compacted::{CompactedRow, CompactedRowWriter}; + use std::sync::LazyLock; + static TEST_ROW_TYPE: LazyLock = + LazyLock::new(|| RowType::with_data_types(vec![DataTypes::bytes()])); + + // Helper function to create a CompactedRowWriter with a single bytes field for testing + fn create_test_row(data: &[u8]) -> CompactedRow<'_> { + CompactedRow::from_bytes(&TEST_ROW_TYPE, data) + } + + #[test] + fn test_builder_basic_operations() { + // Test basic workflow: initial state, writer state, append, close, build + let schema_id = 42; + let write_limit = 4096; + let mut builder = KvRecordBatchBuilder::new(schema_id, write_limit, KvFormat::COMPACTED); + + assert!(!builder.is_closed()); + assert_eq!(builder.writer_id(), NO_WRITER_ID); + assert_eq!(builder.batch_sequence(), NO_BATCH_SEQUENCE); + + builder.set_writer_state(100, 5); + assert_eq!(builder.writer_id(), 100); + assert_eq!(builder.batch_sequence(), 5); + + let key1 = b"key1"; + let value1 = create_test_row(b"value1"); + 
assert!(builder.has_room_for_row(key1, Some(value1.as_bytes()))); + builder.append_row(key1, Some(value1.as_bytes())).unwrap(); + + let key2 = b"key2"; + assert!(builder.has_room_for_row(key2, None)); + builder.append_row(key2, None).unwrap(); + + builder.close().unwrap(); + assert!(builder.is_closed()); + + let bytes = builder.build().unwrap(); + assert!(bytes.len() > RECORD_BATCH_HEADER_SIZE); + + // Building again should return cached result + let bytes2 = builder.build().unwrap(); + assert_eq!(bytes.len(), bytes2.len()); + + // Test lifecycle: abort behavior + let mut builder = KvRecordBatchBuilder::new(1, 4096, KvFormat::COMPACTED); + let value = create_test_row(b"value"); + builder.append_row(b"key", Some(value.as_bytes())).unwrap(); + builder.abort(); + assert!(builder.append_row(b"key2", None).is_err()); + assert!(builder.build().is_err()); + assert!(builder.close().is_err()); + + // Test lifecycle: close behavior + let mut builder = KvRecordBatchBuilder::new(1, 4096, KvFormat::COMPACTED); + let value = create_test_row(b"value"); + builder.append_row(b"key", Some(value.as_bytes())).unwrap(); + builder.close().unwrap(); + assert!(builder.append_row(b"key2", None).is_err()); + assert!(builder.build().is_ok()); + + // Test KvFormat validation + let mut row_writer = CompactedRowWriter::new(1); + row_writer.write_int(42); + let row_bytes = row_writer.buffer(); + + // INDEXED format should reject append_row + let mut indexed_builder = KvRecordBatchBuilder::new(1, 4096, KvFormat::INDEXED); + let result = indexed_builder.append_row(b"key", Some(row_bytes)); + assert!(result.is_err()); + assert_eq!(result.unwrap_err().kind(), io::ErrorKind::InvalidInput); + + // COMPACTED format should accept append_row + let mut compacted_builder = KvRecordBatchBuilder::new(1, 4096, KvFormat::COMPACTED); + let result = compacted_builder.append_row(b"key", Some(row_bytes)); + assert!(result.is_ok()); + } + + #[test] + fn test_write_limit_enforcement() { + let write_limit = 100; // 
Very small limit + let mut builder = KvRecordBatchBuilder::new(1, write_limit, KvFormat::COMPACTED); + + // Test has_room_for_row helper + let large_key = vec![0u8; 1000]; + let large_value = vec![1u8; 1000]; + let large_row = create_test_row(&large_value); + assert!(!builder.has_room_for_row(&large_key, Some(large_row.as_bytes()))); + let small_value = create_test_row(b"value"); + assert!(builder.has_room_for_row(b"key", Some(small_value.as_bytes()))); + + // Test append enforcement - add small record first + builder + .append_row(b"key", Some(small_value.as_bytes())) + .unwrap(); + + // Try to add large record that exceeds limit (reuse large_row from above) + let result = builder.append_row(b"key2", Some(large_row.as_bytes())); + assert!(result.is_err()); + assert_eq!(result.unwrap_err().kind(), io::ErrorKind::WriteZero); + } + + #[test] + fn test_append_checks_record_count_limit() { + let mut builder = KvRecordBatchBuilder::new(1, 100000, KvFormat::COMPACTED); + builder.current_record_number = i32::MAX - 1; + + let value1 = create_test_row(b"value1"); + builder + .append_row(b"key1", Some(value1.as_bytes())) + .unwrap(); + + let value2 = create_test_row(b"value2"); + let result = builder.append_row(b"key2", Some(value2.as_bytes())); + assert!(result.is_err()); + assert_eq!(result.unwrap_err().kind(), io::ErrorKind::InvalidInput); + } + + #[test] + #[should_panic(expected = "schema_id shouldn't be greater than")] + fn test_builder_invalid_schema_id() { + KvRecordBatchBuilder::new(i16::MAX as i32 + 1, 4096, KvFormat::COMPACTED); + } + + #[test] + fn test_builder_cache_invalidation() { + use crate::record::kv::KvRecordBatch; + + // Test cache invalidation on append + let mut builder = KvRecordBatchBuilder::new(1, 4096, KvFormat::COMPACTED); + builder.set_writer_state(100, 5); + + let value1 = create_test_row(b"value1"); + builder + .append_row(b"key1", Some(value1.as_bytes())) + .unwrap(); + let bytes1 = builder.build().unwrap(); + let len1 = bytes1.len(); + + // 
Append another record - this should invalidate the cache + let value2 = create_test_row(b"value2"); + builder + .append_row(b"key2", Some(value2.as_bytes())) + .unwrap(); + let bytes2 = builder.build().unwrap(); + let len2 = bytes2.len(); + + // Verify the second build includes both records + assert!(len2 > len1); + let batch = KvRecordBatch::new(bytes2, 0); + assert!(batch.is_valid()); + assert_eq!(batch.record_count().unwrap(), 2); + + // Test cache invalidation on writer state change + let mut builder = KvRecordBatchBuilder::new(1, 4096, KvFormat::COMPACTED); + builder.set_writer_state(100, 5); + let value = create_test_row(b"value"); + builder.append_row(b"key", Some(value.as_bytes())).unwrap(); + let bytes1 = builder.build().unwrap(); + + // Change writer state - this should invalidate the cache + builder.set_writer_state(200, 10); + let bytes2 = builder.build().unwrap(); + + assert_ne!(bytes1, bytes2); + + let batch1 = KvRecordBatch::new(bytes1, 0); + let batch2 = KvRecordBatch::new(bytes2, 0); + + assert_eq!(batch1.writer_id().unwrap(), 100); + assert_eq!(batch1.batch_sequence().unwrap(), 5); + assert_eq!(batch2.writer_id().unwrap(), 200); + assert_eq!(batch2.batch_sequence().unwrap(), 10); + } + + #[test] + fn test_builder_with_compacted_row_writer() -> crate::error::Result<()> { + use crate::record::kv::KvRecordBatch; + use crate::row::InternalRow; + + let mut builder = KvRecordBatchBuilder::new(1, 100000, KvFormat::COMPACTED); + builder.set_writer_state(100, 5); + + // Create and append first record with CompactedRowWriter + let mut row_writer1 = CompactedRowWriter::new(2); + row_writer1.write_int(42); + row_writer1.write_string("hello"); + + let row_bytes1 = row_writer1.buffer(); + + let key1 = b"key1"; + assert!(builder.has_room_for_row(key1, Some(row_bytes1))); + builder.append_row(key1, Some(row_bytes1))?; + + // Create and append second record + let mut row_writer2 = CompactedRowWriter::new(2); + row_writer2.write_int(100); + 
row_writer2.write_string("world"); + + let row_bytes2 = row_writer2.buffer(); + + let key2 = b"key2"; + builder.append_row(key2, Some(row_bytes2))?; + + // Append a deletion record + let key3 = b"key3"; + builder.append_row(key3, None)?; + + // Build and verify + builder.close()?; + let bytes = builder.build()?; + + let batch = KvRecordBatch::new(bytes, 0); + assert!(batch.is_valid()); + assert_eq!(batch.record_count()?, 3); + assert_eq!(batch.writer_id()?, 100); + assert_eq!(batch.batch_sequence()?, 5); + + // Create ReadContext for reading typed rows + let types = vec![DataTypes::int(), DataTypes::string()]; + let read_context = crate::record::kv::test_util::TestReadContext::compacted(types); + + // Read back and verify records using idiomatic for-loop + let records = batch.records(&read_context)?; + let decoder = records.decoder_arc(); + let mut record_count = 0; + + for rec in records { + let rec = rec?; + record_count += 1; + + match record_count { + 1 => { + assert_eq!(rec.key().as_ref(), key1); + let row = rec.row(&*decoder).unwrap(); + assert_eq!(row.get_int(0)?, 42); + assert_eq!(row.get_string(1)?, "hello"); + } + 2 => { + assert_eq!(rec.key().as_ref(), key2); + let row = rec.row(&*decoder).unwrap(); + assert_eq!(row.get_int(0)?, 100); + assert_eq!(row.get_string(1)?, "world"); + } + 3 => { + assert_eq!(rec.key().as_ref(), key3); + assert!(rec.is_deletion()); + } + _ => panic!("Unexpected record count"), + } + } + + assert_eq!(record_count, 3); + Ok(()) + } +} diff --git a/fluss-rust/crates/fluss/src/record/kv/kv_record_read_context.rs b/fluss-rust/crates/fluss/src/record/kv/kv_record_read_context.rs new file mode 100644 index 0000000000..4200e044b3 --- /dev/null +++ b/fluss-rust/crates/fluss/src/record/kv/kv_record_read_context.rs @@ -0,0 +1,165 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Default implementation of ReadContext with decoder caching. + +use super::ReadContext; +use crate::error::Result; +use crate::metadata::{KvFormat, Schema}; +use crate::row::{RowDecoder, RowDecoderFactory}; +use std::collections::HashMap; +use std::sync::{Arc, Mutex}; + +/// Trait for fetching schemas by ID. +/// +/// This trait abstracts schema retrieval, allowing different implementations +/// (e.g., from metadata store, cache, or test mocks). +pub trait SchemaGetter: Send + Sync { + /// Get the schema for the given schema ID. + /// + /// # Arguments + /// * `schema_id` - The schema ID to fetch + /// + /// # Returns + /// An Arc-wrapped Schema for the specified ID, or an error if the schema + /// cannot be fetched (missing ID, network error, etc.) + fn get_schema(&self, schema_id: i16) -> Result>; +} + +/// Default implementation of ReadContext with decoder caching. +/// +/// This implementation caches RowDecoders by schema ID for performance, +/// avoiding repeated schema lookups and decoder creation. +/// +/// Reference: org.apache.fluss.record.KvRecordReadContext +pub struct KvRecordReadContext { + kv_format: KvFormat, + schema_getter: Arc, + row_decoder_cache: Mutex>>, +} + +impl KvRecordReadContext { + /// Create a new KvRecordReadContext. 
+ /// + /// # Arguments + /// * `kv_format` - The KV format (COMPACTED or INDEXED) + /// * `schema_getter` - The schema getter for fetching schemas by ID + /// + /// # Returns + /// A new KvRecordReadContext instance + pub fn new(kv_format: KvFormat, schema_getter: Arc) -> Self { + Self { + kv_format, + schema_getter, + row_decoder_cache: Mutex::new(HashMap::new()), + } + } +} + +impl ReadContext for KvRecordReadContext { + fn get_row_decoder(&self, schema_id: i16) -> Result> { + // First check: fast path + { + let cache = self + .row_decoder_cache + .lock() + .unwrap_or_else(|poisoned| poisoned.into_inner()); + if let Some(decoder) = cache.get(&schema_id) { + return Ok(Arc::clone(decoder)); + } + } // Release lock before expensive operations + + // Build decoder outside the lock to avoid blocking other threads + let schema = self.schema_getter.get_schema(schema_id)?; + let row_type = schema.row_type().clone(); + + // Create decoder outside lock + let decoder = RowDecoderFactory::create(self.kv_format, row_type)?; + + // Second check: insert only if another thread didn't beat us to it + { + let mut cache = self + .row_decoder_cache + .lock() + .unwrap_or_else(|poisoned| poisoned.into_inner()); + // Check again - another thread might have inserted while we were building + if let Some(existing) = cache.get(&schema_id) { + return Ok(Arc::clone(existing)); + } + cache.insert(schema_id, Arc::clone(&decoder)); + } + + Ok(decoder) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::metadata::{DataTypes, Schema}; + + struct MockSchemaGetter { + schema: Arc, + } + + impl MockSchemaGetter { + fn new(data_types: Vec) -> Self { + let mut builder = Schema::builder(); + for (i, dt) in data_types.iter().enumerate() { + builder = builder.column(format!("field{i}"), dt.clone()); + } + let schema = builder.build().expect("Failed to build schema"); + + Self { + schema: Arc::new(schema), + } + } + } + + impl SchemaGetter for MockSchemaGetter { + fn get_schema(&self, 
_schema_id: i16) -> Result> { + Ok(Arc::clone(&self.schema)) + } + } + + #[test] + fn test_kv_record_read_context() { + // Test decoder caching for same schema ID + let schema_getter = Arc::new(MockSchemaGetter::new(vec![ + DataTypes::int(), + DataTypes::string(), + ])); + let read_context = KvRecordReadContext::new(KvFormat::COMPACTED, schema_getter); + + // Get decoder twice - should return the same instance (cached) + let decoder1 = read_context.get_row_decoder(42).unwrap(); + let decoder2 = read_context.get_row_decoder(42).unwrap(); + + // Verify same instance (Arc pointer equality) + assert!(Arc::ptr_eq(&decoder1, &decoder2)); + + // Test different schema IDs get different decoders + let schema_getter = Arc::new(MockSchemaGetter::new(vec![DataTypes::int()])); + let read_context = KvRecordReadContext::new(KvFormat::COMPACTED, schema_getter); + + let decoder1 = read_context.get_row_decoder(10).unwrap(); + let decoder2 = read_context.get_row_decoder(20).unwrap(); + + // Should be different instances + assert!(!Arc::ptr_eq(&decoder1, &decoder2)); + } +} diff --git a/fluss-rust/crates/fluss/src/record/kv/mod.rs b/fluss-rust/crates/fluss/src/record/kv/mod.rs new file mode 100644 index 0000000000..4d0f894638 --- /dev/null +++ b/fluss-rust/crates/fluss/src/record/kv/mod.rs @@ -0,0 +1,44 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Key-Value record and batch implementations. + +mod kv_record; +mod kv_record_batch; +mod kv_record_batch_builder; +mod kv_record_read_context; +mod read_context; +mod value_record_batch; + +#[cfg(test)] +mod test_util; + +pub use kv_record::{KvRecord, LENGTH_LENGTH as KV_RECORD_LENGTH_LENGTH}; +pub use kv_record_batch::*; +pub use kv_record_batch_builder::*; +pub use kv_record_read_context::{KvRecordReadContext, SchemaGetter}; +pub use read_context::ReadContext; +pub(crate) use value_record_batch::ValueRecordBatch; + +/// Current KV magic value +pub const CURRENT_KV_MAGIC_VALUE: u8 = 0; + +/// No writer ID constant +pub const NO_WRITER_ID: i64 = -1; + +/// No batch sequence constant +pub const NO_BATCH_SEQUENCE: i32 = -1; diff --git a/fluss-rust/crates/fluss/src/record/kv/read_context.rs b/fluss-rust/crates/fluss/src/record/kv/read_context.rs new file mode 100644 index 0000000000..63502613d1 --- /dev/null +++ b/fluss-rust/crates/fluss/src/record/kv/read_context.rs @@ -0,0 +1,45 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Read context for KV record batches. +//! +//! Provides schema and decoder information needed for typed record reading. + +use crate::error::Result; +use crate::row::RowDecoder; +use std::sync::Arc; + +/// Context for reading KV records with type information. +/// +/// The ReadContext provides access to RowDecoders based on schema IDs, +/// enabling typed deserialization of KV record values. +/// +/// Reference: org.apache.fluss.record.KvRecordBatch.ReadContext +pub trait ReadContext: Send + Sync { + /// Get the row decoder for the given schema ID. + /// + /// The decoder is typically cached, so repeated calls with the same + /// schema ID should return the same decoder instance. + /// + /// # Arguments + /// * `schema_id` - The schema ID for which to get the decoder + /// + /// # Returns + /// An Arc-wrapped RowDecoder for the specified schema, or an error if + /// the schema is invalid or cannot be retrieved + fn get_row_decoder(&self, schema_id: i16) -> Result>; +} diff --git a/fluss-rust/crates/fluss/src/record/kv/test_util.rs b/fluss-rust/crates/fluss/src/record/kv/test_util.rs new file mode 100644 index 0000000000..54eaac8f3d --- /dev/null +++ b/fluss-rust/crates/fluss/src/record/kv/test_util.rs @@ -0,0 +1,50 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Test utilities for KV record reading. + +use super::ReadContext; +use crate::error::Result; +use crate::metadata::{DataType, KvFormat, RowType}; +use crate::row::{RowDecoder, RowDecoderFactory}; +use std::sync::Arc; + +/// Simple test-only ReadContext that creates decoders directly from data types. +/// +/// This bypasses the production Schema/SchemaGetter machinery for simpler tests. +pub(crate) struct TestReadContext { + kv_format: KvFormat, + data_types: Vec, +} + +impl TestReadContext { + /// Create a test context for COMPACTED format (most common case). + pub(crate) fn compacted(data_types: Vec) -> Self { + Self { + kv_format: KvFormat::COMPACTED, + data_types, + } + } +} + +impl ReadContext for TestReadContext { + fn get_row_decoder(&self, _schema_id: i16) -> Result> { + // Directly create decoder from data types - no Schema needed! + let row_type = RowType::with_data_types(self.data_types.clone()); + RowDecoderFactory::create(self.kv_format, row_type) + } +} diff --git a/fluss-rust/crates/fluss/src/record/kv/value_record_batch.rs b/fluss-rust/crates/fluss/src/record/kv/value_record_batch.rs new file mode 100644 index 0000000000..fdd6b0702c --- /dev/null +++ b/fluss-rust/crates/fluss/src/record/kv/value_record_batch.rs @@ -0,0 +1,188 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Reader for the value-record batch returned by a KV (primary-key) limit +//! scan. This is a distinct wire format from [`super::KvRecordBatch`]: it +//! carries value-only records (no keys, no CRC/writer-id header) and a schema +//! id *per record* rather than per batch. +//! +//! Batch layout (little-endian): +//! - Length => Int32 (size of everything after this field) +//! - Magic => Int8 +//! - RecordCount => Int32 +//! - Records => [ValueRecord] +//! +//! Each `ValueRecord`: +//! - Length => Int32 (size after this field: SchemaId + Value) +//! - SchemaId => Int16 +//! - Value => row bytes +//! +//! Reference: `org.apache.fluss.record.DefaultValueRecordBatch` and +//! `org.apache.fluss.record.DefaultValueRecord`. + +use crate::error::{Error, Result}; +use byteorder::{ByteOrder, LittleEndian}; +use bytes::Bytes; +use std::ops::Range; + +const LENGTH_LENGTH: usize = 4; +const MAGIC_LENGTH: usize = 1; +const RECORD_COUNT_LENGTH: usize = 4; +/// Offset of the record count within the batch header. +const RECORD_COUNT_OFFSET: usize = LENGTH_LENGTH + MAGIC_LENGTH; +/// Size of the batch header (`Length + Magic + RecordCount`). +const RECORD_BATCH_HEADER_SIZE: usize = LENGTH_LENGTH + MAGIC_LENGTH + RECORD_COUNT_LENGTH; +/// Size of a `ValueRecord`'s leading length field. 
+const RECORD_LENGTH_LENGTH: usize = 4; + +/// Read-only view over a serialized value-record batch. +pub(crate) struct ValueRecordBatch { + data: Bytes, +} + +impl ValueRecordBatch { + /// Wraps raw batch bytes. The batch is expected to start at offset 0. + pub(crate) fn new(data: Bytes) -> Self { + Self { data } + } + + /// Number of records declared in the batch header. + pub(crate) fn record_count(&self) -> Result { + if self.data.len() < RECORD_BATCH_HEADER_SIZE { + return Err(corrupt(format!( + "value-record batch too short: {} bytes, need {} for header", + self.data.len(), + RECORD_BATCH_HEADER_SIZE + ))); + } + Ok(LittleEndian::read_i32( + &self.data[RECORD_COUNT_OFFSET..RECORD_COUNT_OFFSET + RECORD_COUNT_LENGTH], + )) + } + + /// Returns one byte range per record, each spanning `[SchemaId | Value]`: + /// the payload [`crate::row::FixedSchemaDecoder::decode`] expects. Index + /// [`Self::data`] with a returned range to get it without copying. + pub(crate) fn value_ranges(&self) -> Result>> { + let count = self.record_count()?; + if count < 0 { + return Err(corrupt(format!("invalid record count {count}"))); + } + let mut ranges = Vec::with_capacity(count as usize); + let mut pos = RECORD_BATCH_HEADER_SIZE; + for i in 0..count as usize { + if pos + RECORD_LENGTH_LENGTH > self.data.len() { + return Err(corrupt(format!( + "truncated value-record batch: record {i} length field runs past end" + ))); + } + let rec_len = LittleEndian::read_i32(&self.data[pos..pos + RECORD_LENGTH_LENGTH]); + if rec_len < 0 { + return Err(corrupt(format!("record {i} has negative length {rec_len}"))); + } + let start = pos + RECORD_LENGTH_LENGTH; + let end = start + rec_len as usize; + if end > self.data.len() { + return Err(corrupt(format!( + "truncated value-record batch: record {i} payload runs past end" + ))); + } + ranges.push(start..end); + pos = end; + } + Ok(ranges) + } + + /// The underlying batch bytes. 
+ pub(crate) fn data(&self) -> &Bytes { + &self.data + } +} + +fn corrupt(message: String) -> Error { + Error::UnexpectedError { + message, + source: None, + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::record::kv::SCHEMA_ID_LENGTH; + + /// Build a value-record batch from `(schema_id, row_bytes)` pairs, mirroring + /// the Java `DefaultValueRecordBatch.Builder` wire layout. + fn build_batch(records: &[(i16, &[u8])]) -> Vec { + let mut body = Vec::new(); + for (schema_id, row) in records { + let rec_len = (SCHEMA_ID_LENGTH + row.len()) as i32; + body.extend_from_slice(&rec_len.to_le_bytes()); + body.extend_from_slice(&schema_id.to_le_bytes()); + body.extend_from_slice(row); + } + let mut out = Vec::new(); + // Length covers Magic + RecordCount + body. + let length = (MAGIC_LENGTH + RECORD_COUNT_LENGTH + body.len()) as i32; + out.extend_from_slice(&length.to_le_bytes()); + out.push(0); // magic + out.extend_from_slice(&(records.len() as i32).to_le_bytes()); + out.extend_from_slice(&body); + out + } + + #[test] + fn parses_record_count_and_ranges() { + let raw = build_batch(&[(7, &[1, 2, 3]), (7, &[4, 5])]); + let batch = ValueRecordBatch::new(Bytes::from(raw)); + assert_eq!(batch.record_count().unwrap(), 2); + + let ranges = batch.value_ranges().unwrap(); + assert_eq!(ranges.len(), 2); + // First record payload = [schema_id(2) | row(3)] = 5 bytes. + let r0 = &batch.data()[ranges[0].clone()]; + assert_eq!(r0.len(), 5); + assert_eq!(LittleEndian::read_i16(&r0[..2]), 7); + assert_eq!(&r0[2..], &[1, 2, 3]); + // Second record payload = [schema_id(2) | row(2)] = 4 bytes. 
+ let r1 = &batch.data()[ranges[1].clone()]; + assert_eq!(r1.len(), 4); + assert_eq!(&r1[2..], &[4, 5]); + } + + #[test] + fn empty_batch_has_no_ranges() { + let raw = build_batch(&[]); + let batch = ValueRecordBatch::new(Bytes::from(raw)); + assert_eq!(batch.record_count().unwrap(), 0); + assert!(batch.value_ranges().unwrap().is_empty()); + } + + #[test] + fn truncated_payload_errors() { + let mut raw = build_batch(&[(7, &[1, 2, 3])]); + raw.truncate(raw.len() - 2); // chop into the row payload + let batch = ValueRecordBatch::new(Bytes::from(raw)); + assert!(batch.value_ranges().is_err()); + } + + #[test] + fn short_header_errors() { + let batch = ValueRecordBatch::new(Bytes::from(vec![0u8, 1, 2])); + assert!(batch.record_count().is_err()); + } +} diff --git a/fluss-rust/crates/fluss/src/record/mod.rs b/fluss-rust/crates/fluss/src/record/mod.rs new file mode 100644 index 0000000000..462bdebbc1 --- /dev/null +++ b/fluss-rust/crates/fluss/src/record/mod.rs @@ -0,0 +1,327 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::metadata::TableBucket; +use crate::row::ColumnarRow; +use ::arrow::array::RecordBatch; +use core::fmt; +use std::collections::HashMap; + +mod arrow; +mod error; +pub mod kv; + +pub use arrow::*; + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum ChangeType { + /// Append-only operation + AppendOnly, + /// Insert operation + Insert, + /// Update operation containing the previous content of the updated row + UpdateBefore, + /// Update operation containing the new content of the updated row + UpdateAfter, + /// Delete operation + Delete, +} + +impl ChangeType { + /// Returns a short string representation of this ChangeType + pub fn short_string(&self) -> &'static str { + match self { + ChangeType::AppendOnly => "+A", + ChangeType::Insert => "+I", + ChangeType::UpdateBefore => "-U", + ChangeType::UpdateAfter => "+U", + ChangeType::Delete => "-D", + } + } + + /// Returns the byte value representation used for serialization + pub fn to_byte_value(&self) -> u8 { + match self { + ChangeType::AppendOnly => 0, + ChangeType::Insert => 1, + ChangeType::UpdateBefore => 2, + ChangeType::UpdateAfter => 3, + ChangeType::Delete => 4, + } + } + + /// Creates a ChangeType from its byte value representation + /// + /// # Errors + /// Returns an error if the byte value doesn't correspond to any ChangeType + pub fn from_byte_value(value: u8) -> Result { + match value { + 0 => Ok(ChangeType::AppendOnly), + 1 => Ok(ChangeType::Insert), + 2 => Ok(ChangeType::UpdateBefore), + 3 => Ok(ChangeType::UpdateAfter), + 4 => Ok(ChangeType::Delete), + _ => Err(format!("Unsupported byte value '{value}' for change type")), + } + } +} + +impl fmt::Display for ChangeType { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.short_string()) + } +} + +#[derive(Clone)] +pub struct ScanRecord { + pub row: ColumnarRow, + offset: i64, + timestamp: i64, + change_type: ChangeType, +} + +impl ScanRecord { + const INVALID: i64 = -1; + + pub fn 
new_default(row: ColumnarRow) -> Self { + ScanRecord { + row, + offset: Self::INVALID, + timestamp: Self::INVALID, + change_type: ChangeType::Insert, + } + } + + pub fn new(row: ColumnarRow, offset: i64, timestamp: i64, change_type: ChangeType) -> Self { + ScanRecord { + row, + offset, + timestamp, + change_type, + } + } + + pub fn row(&self) -> &ColumnarRow { + &self.row + } + + /// Returns the position in the log + pub fn offset(&self) -> i64 { + self.offset + } + + /// Returns the timestamp + pub fn timestamp(&self) -> i64 { + self.timestamp + } + + /// Returns the change type + pub fn change_type(&self) -> &ChangeType { + &self.change_type + } +} + +pub struct ScanRecords { + records: HashMap>, +} + +impl ScanRecords { + pub fn empty() -> Self { + Self { + records: HashMap::new(), + } + } + + pub fn new(records: HashMap>) -> Self { + Self { records } + } + + pub fn records(&self, scan_bucket: &TableBucket) -> &[ScanRecord] { + self.records.get(scan_bucket).map_or(&[], |records| records) + } + + pub fn count(&self) -> usize { + self.records.values().map(|v| v.len()).sum() + } + + pub fn is_empty(&self) -> bool { + self.records.is_empty() + } + + pub fn records_by_buckets(&self) -> &HashMap> { + &self.records + } + + pub fn into_records_by_buckets(self) -> HashMap> { + self.records + } +} + +/// A batch of records with metadata about bucket and offsets. +/// +/// This is the batch-level equivalent of [`ScanRecord`], providing efficient +/// access to Arrow RecordBatches while preserving the bucket and offset information +/// needed for tracking consumption progress. 
+#[derive(Debug, Clone)] +pub struct ScanBatch { + /// The bucket this batch belongs to + bucket: TableBucket, + /// The Arrow RecordBatch containing the data + batch: RecordBatch, + /// Offset of the first record in this batch + base_offset: i64, +} + +impl ScanBatch { + pub fn new(bucket: TableBucket, batch: RecordBatch, base_offset: i64) -> Self { + Self { + bucket, + batch, + base_offset, + } + } + + pub fn bucket(&self) -> &TableBucket { + &self.bucket + } + + pub fn batch(&self) -> &RecordBatch { + &self.batch + } + + pub fn into_batch(self) -> RecordBatch { + self.batch + } + + pub fn base_offset(&self) -> i64 { + self.base_offset + } + + pub fn num_records(&self) -> usize { + self.batch.num_rows() + } + + /// Returns the offset of the last record in this batch. + pub fn last_offset(&self) -> i64 { + if self.batch.num_rows() == 0 { + self.base_offset - 1 + } else { + self.base_offset + self.batch.num_rows() as i64 - 1 + } + } +} + +impl IntoIterator for ScanRecords { + type Item = ScanRecord; + type IntoIter = std::vec::IntoIter; + + fn into_iter(self) -> Self::IntoIter { + self.records + .into_values() + .flatten() + .collect::>() + .into_iter() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use ::arrow::array::{Int32Array, RecordBatch}; + use ::arrow::datatypes::{DataType, Field, Schema}; + use std::sync::Arc; + + fn make_row(values: Vec, row_id: usize) -> ColumnarRow { + use crate::metadata::RowType; + let schema = Arc::new(Schema::new(vec![Field::new("v", DataType::Int32, false)])); + let batch = RecordBatch::try_new(schema, vec![Arc::new(Int32Array::from(values))]) + .expect("record batch"); + let row_type = Arc::new(RowType::with_data_types(vec![ + crate::metadata::DataType::Int(crate::metadata::IntType::new()), + ])); + ColumnarRow::new(Arc::new(batch), row_type, row_id, None) + } + + #[test] + fn change_type_round_trip() { + let cases = [ + (ChangeType::AppendOnly, "+A", 0), + (ChangeType::Insert, "+I", 1), + (ChangeType::UpdateBefore, "-U", 
2), + (ChangeType::UpdateAfter, "+U", 3), + (ChangeType::Delete, "-D", 4), + ]; + + for (change_type, short, byte) in cases { + assert_eq!(change_type.short_string(), short); + assert_eq!(change_type.to_byte_value(), byte); + assert_eq!(ChangeType::from_byte_value(byte).unwrap(), change_type); + } + + let err = ChangeType::from_byte_value(9).unwrap_err(); + assert!(err.contains("Unsupported byte value")); + } + + #[test] + fn scan_records_counts_and_iterates() { + let bucket0 = TableBucket::new(1, 0); + let bucket1 = TableBucket::new(1, 1); + let record0 = ScanRecord::new(make_row(vec![10, 11], 0), 5, 7, ChangeType::Insert); + let record1 = ScanRecord::new(make_row(vec![10, 11], 1), 6, 8, ChangeType::Delete); + + let mut records = HashMap::new(); + records.insert(bucket0.clone(), vec![record0.clone(), record1.clone()]); + + let scan_records = ScanRecords::new(records); + assert_eq!(scan_records.records(&bucket0).len(), 2); + assert!(scan_records.records(&bucket1).is_empty()); + assert_eq!(scan_records.count(), 2); + + let collected: Vec<_> = scan_records.into_iter().collect(); + assert_eq!(collected.len(), 2); + } + + #[test] + fn scan_record_default_values() { + let record = ScanRecord::new_default(make_row(vec![1], 0)); + assert_eq!(record.offset(), -1); + assert_eq!(record.timestamp(), -1); + assert_eq!(record.change_type(), &ChangeType::Insert); + } + + #[test] + fn scan_batch_last_offset() { + let schema = Arc::new(Schema::new(vec![Field::new("v", DataType::Int32, false)])); + let bucket = TableBucket::new(1, 0); + + // Batch with 3 records starting at offset 100 -> last_offset = 102 + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(Int32Array::from(vec![1, 2, 3]))], + ) + .unwrap(); + let scan_batch = ScanBatch::new(bucket.clone(), batch, 100); + assert_eq!(scan_batch.num_records(), 3); + assert_eq!(scan_batch.last_offset(), 102); + + // Empty batch -> last_offset = base_offset - 1 + let empty_batch = RecordBatch::new_empty(schema); + let 
empty_scan_batch = ScanBatch::new(bucket, empty_batch, 100); + assert_eq!(empty_scan_batch.num_records(), 0); + assert_eq!(empty_scan_batch.last_offset(), 99); + } +} diff --git a/fluss-rust/crates/fluss/src/row/binary/binary_writer.rs b/fluss-rust/crates/fluss/src/row/binary/binary_writer.rs new file mode 100644 index 0000000000..3380629599 --- /dev/null +++ b/fluss-rust/crates/fluss/src/row/binary/binary_writer.rs @@ -0,0 +1,318 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::error::Error::IllegalArgument; +use crate::error::Result; +use crate::metadata::{DataType, RowType}; +use crate::row::Decimal; +use crate::row::binary::BinaryRowFormat; +use crate::row::datum::{TimestampLtz, TimestampNtz}; +use crate::row::{Datum, FlussArray, FlussMap}; + +/// Writer to write a composite data format, like row, array, +#[allow(dead_code)] +pub trait BinaryWriter { + /// Reset writer to prepare next write + fn reset(&mut self); + + /// Set null to this field + fn set_null_at(&mut self, pos: usize); + + fn write_boolean(&mut self, value: bool); + + fn write_byte(&mut self, value: u8); + + fn write_bytes(&mut self, value: &[u8]); + + fn write_char(&mut self, value: &str, length: usize); + + fn write_string(&mut self, value: &str); + + fn write_short(&mut self, value: i16); + + fn write_int(&mut self, value: i32); + + fn write_long(&mut self, value: i64); + + fn write_float(&mut self, value: f32); + + fn write_double(&mut self, value: f64); + + fn write_binary(&mut self, bytes: &[u8], length: usize); + + fn write_decimal(&mut self, value: &Decimal, precision: u32); + + /// Writes a TIME value. + /// + /// Note: TIME is physically stored as an i32 (milliseconds since midnight). + /// This method exists for type safety and semantic clarity, even though it's + /// currently equivalent to `write_int()`. The precision parameter is accepted + /// for API consistency with TIMESTAMP types, though TIME encoding doesn't + /// currently vary by precision. + fn write_time(&mut self, value: i32, precision: u32); + + fn write_timestamp_ntz(&mut self, value: &TimestampNtz, precision: u32); + + fn write_timestamp_ltz(&mut self, value: &TimestampLtz, precision: u32); + + fn write_array(&mut self, value: &FlussArray); + + fn write_map(&mut self, value: &FlussMap); + + // TODO Row serializer + // fn write_row(&mut self, pos: i32, value: &InternalRow); + + /// Finally, complete write to set real size to binary. 
+ fn complete(&mut self); +} + +pub enum ValueWriter { + Nullable(InnerValueWriter), + NonNullable(InnerValueWriter), +} + +impl ValueWriter { + pub fn create_value_writer( + element_type: &DataType, + binary_row_format: Option<&BinaryRowFormat>, + ) -> Result { + let value_writer = + InnerValueWriter::create_inner_value_writer(element_type, binary_row_format)?; + if element_type.is_nullable() { + Ok(Self::Nullable(value_writer)) + } else { + Ok(Self::NonNullable(value_writer)) + } + } + + pub fn write_value( + &self, + writer: &mut W, + pos: usize, + value: &Datum, + ) -> Result<()> { + match self { + Self::Nullable(inner_value_writer) => { + if let Datum::Null = value { + writer.set_null_at(pos); + Ok(()) + } else { + inner_value_writer.write_value(writer, pos, value) + } + } + Self::NonNullable(inner_value_writer) => { + inner_value_writer.write_value(writer, pos, value) + } + } + } +} + +#[derive(Debug)] +pub enum InnerValueWriter { + Char, + String, + Boolean, + Binary, + Bytes, + TinyInt, + SmallInt, + Int, + BigInt, + Float, + Double, + Decimal(u32, u32), // precision, scale + Date, + Time(u32), // precision (not used in wire format, but kept for consistency) + TimestampNtz(u32), // precision + TimestampLtz(u32), // precision + Array, + Map, + Row(NestedRowWriter), +} + +#[derive(Debug)] +pub struct NestedRowWriter { + field_writers: Vec, + field_nullable: Vec, +} + +impl NestedRowWriter { + fn from_row_type(row_type: &RowType) -> Result { + let fields = row_type.fields(); + let mut field_writers = Vec::with_capacity(fields.len()); + let mut field_nullable = Vec::with_capacity(fields.len()); + for field in fields { + field_writers.push(InnerValueWriter::create_inner_value_writer( + field.data_type(), + None, + )?); + field_nullable.push(field.data_type().is_nullable()); + } + Ok(Self { + field_writers, + field_nullable, + }) + } + + fn field_count(&self) -> usize { + self.field_writers.len() + } +} + +/// Accessor for writing the fields/elements of a binary 
writer during runtime, the +/// fields/elements must be written in the order. +impl InnerValueWriter { + pub fn create_inner_value_writer( + data_type: &DataType, + _: Option<&BinaryRowFormat>, + ) -> Result { + match data_type { + DataType::Char(_) => Ok(InnerValueWriter::Char), + DataType::String(_) => Ok(InnerValueWriter::String), + DataType::Boolean(_) => Ok(InnerValueWriter::Boolean), + DataType::Binary(_) => Ok(InnerValueWriter::Binary), + DataType::Bytes(_) => Ok(InnerValueWriter::Bytes), + DataType::TinyInt(_) => Ok(InnerValueWriter::TinyInt), + DataType::SmallInt(_) => Ok(InnerValueWriter::SmallInt), + DataType::Int(_) => Ok(InnerValueWriter::Int), + DataType::BigInt(_) => Ok(InnerValueWriter::BigInt), + DataType::Float(_) => Ok(InnerValueWriter::Float), + DataType::Double(_) => Ok(InnerValueWriter::Double), + DataType::Decimal(d) => { + // Validation is done at DecimalType construction time + Ok(InnerValueWriter::Decimal(d.precision(), d.scale())) + } + DataType::Date(_) => Ok(InnerValueWriter::Date), + DataType::Time(t) => { + // Validation is done at TimeType construction time + Ok(InnerValueWriter::Time(t.precision())) + } + DataType::Timestamp(t) => { + // Validation is done at TimestampType construction time + Ok(InnerValueWriter::TimestampNtz(t.precision())) + } + DataType::TimestampLTz(t) => { + // Validation is done at TimestampLTzType construction time + Ok(InnerValueWriter::TimestampLtz(t.precision())) + } + DataType::Array(_) => Ok(InnerValueWriter::Array), + DataType::Map(_) => Ok(InnerValueWriter::Map), + DataType::Row(row_type) => Ok(InnerValueWriter::Row(NestedRowWriter::from_row_type( + row_type, + )?)), + } + } + pub fn write_value( + &self, + writer: &mut W, + _pos: usize, + value: &Datum, + ) -> Result<()> { + match (self, value) { + (InnerValueWriter::Char, Datum::String(v)) => { + writer.write_char(v, v.len()); + } + (InnerValueWriter::String, Datum::String(v)) => { + writer.write_string(v); + } + (InnerValueWriter::Boolean, 
Datum::Bool(v)) => { + writer.write_boolean(*v); + } + (InnerValueWriter::Binary, Datum::Blob(v)) => { + let b = v.as_ref(); + writer.write_binary(b, b.len()); + } + (InnerValueWriter::Bytes, Datum::Blob(v)) => { + writer.write_bytes(v.as_ref()); + } + (InnerValueWriter::TinyInt, Datum::Int8(v)) => { + writer.write_byte(*v as u8); + } + (InnerValueWriter::SmallInt, Datum::Int16(v)) => { + writer.write_short(*v); + } + (InnerValueWriter::Int, Datum::Int32(v)) => { + writer.write_int(*v); + } + (InnerValueWriter::BigInt, Datum::Int64(v)) => { + writer.write_long(*v); + } + (InnerValueWriter::Float, Datum::Float32(v)) => { + writer.write_float(v.into_inner()); + } + (InnerValueWriter::Double, Datum::Float64(v)) => { + writer.write_double(v.into_inner()); + } + (InnerValueWriter::Decimal(p, _s), Datum::Decimal(v)) => { + writer.write_decimal(v, *p); + } + (InnerValueWriter::Date, Datum::Date(d)) => { + writer.write_int(d.get_inner()); + } + (InnerValueWriter::Time(p), Datum::Time(t)) => { + writer.write_time(t.get_inner(), *p); + } + (InnerValueWriter::TimestampNtz(p), Datum::TimestampNtz(ts)) => { + writer.write_timestamp_ntz(ts, *p); + } + (InnerValueWriter::TimestampLtz(p), Datum::TimestampLtz(ts)) => { + writer.write_timestamp_ltz(ts, *p); + } + (InnerValueWriter::Array, Datum::Array(arr)) => { + writer.write_array(arr); + } + (InnerValueWriter::Map, Datum::Map(map)) => { + writer.write_map(map); + } + (InnerValueWriter::Row(nested_writer), Datum::Row(inner_row)) => { + use crate::row::compacted::CompactedRowWriter; + let field_count = nested_writer.field_count(); + if inner_row.values.len() != field_count { + return Err(IllegalArgument { + message: format!( + "nested row arity mismatch: schema has {} fields, got {}", + field_count, + inner_row.values.len(), + ), + }); + } + let mut nested = CompactedRowWriter::new(field_count); + for (i, datum) in inner_row.values.iter().enumerate() { + if datum.is_null() { + if !nested_writer.field_nullable[i] { + return 
Err(IllegalArgument { + message: format!( + "nested row field {i} is non-nullable but received null", + ), + }); + } + nested.set_null_at(i); + } else { + nested_writer.field_writers[i].write_value(&mut nested, i, datum)?; + } + } + writer.write_bytes(nested.buffer()); + } + _ => { + return Err(IllegalArgument { + message: format!("{self:?} used to write value {value:?}"), + }); + } + } + Ok(()) + } +} diff --git a/fluss-rust/crates/fluss/src/row/binary/iceberg_binary_row_writer.rs b/fluss-rust/crates/fluss/src/row/binary/iceberg_binary_row_writer.rs new file mode 100644 index 0000000000..82a61928ae --- /dev/null +++ b/fluss-rust/crates/fluss/src/row/binary/iceberg_binary_row_writer.rs @@ -0,0 +1,564 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use bytes::{Bytes, BytesMut}; + +use crate::error::{Error, Result}; +use crate::metadata::DataType; +use crate::row::Decimal; +use crate::row::binary::{BinaryWriter, ValueWriter}; +use crate::row::binary_array::FlussArray; +use crate::row::binary_map::FlussMap; + +const MICROS_PER_MILLI: i64 = 1_000; + +/// Iceberg-specific binary writer for encoding key columns. 
+/// +/// Unlike [`CompactedRowWriter`] which uses varint encoding and length-prefixed +/// variable-length fields, this writer follows Iceberg's encoding conventions: +/// - Integers (int, date) are written as i64 (8 bytes, little-endian) +/// - Time values are converted from milliseconds to microseconds +/// - Timestamps are converted to microseconds +/// - Floats/doubles use fixed-width little-endian encoding +/// - Variable-length types (string, binary) are written without length prefixes +/// - Decimals are written as unscaled big-endian bytes without length prefixes +/// +/// The encoded bytes feed directly into `IcebergBucketingFunction`'s MurmurHash +/// for bucket assignment and must match the Java Fluss server's encoding exactly. +/// +/// [`CompactedRowWriter`]: crate::row::compacted::CompactedRowWriter +pub struct IcebergBinaryRowWriter { + position: usize, + buffer: BytesMut, +} + +impl Default for IcebergBinaryRowWriter { + fn default() -> Self { + Self::new() + } +} + +impl IcebergBinaryRowWriter { + pub fn new() -> Self { + let buffer = BytesMut::zeroed(64); + Self { + position: 0, + buffer, + } + } + + // Dependency order note: + // 1) Keep this PR scoped to writer-level Java parity. + // 2) Wire the writer through IcebergKeyEncoder in follow-up #308. + // TODO(#308): add end-to-end key-encoding tests via IcebergKeyEncoder + // (similar to CompactedKeyEncoder tests for CompactedKeyWriter). + pub fn create_value_writer(field_type: &DataType) -> Result { + match field_type { + // Match Java IcebergBinaryRowWriter.createFieldWriter() supported types exactly. 
+ DataType::Int(_) + | DataType::Date(_) + | DataType::Time(_) + | DataType::BigInt(_) + | DataType::Float(_) + | DataType::Double(_) + | DataType::Timestamp(_) + | DataType::Decimal(_) + | DataType::String(_) + | DataType::Char(_) + | DataType::Binary(_) + | DataType::Bytes(_) => ValueWriter::create_value_writer(field_type, None), + + // Keep Java's explicit scalar-only rejection messaging for ARRAY/MAP. + DataType::Array(_) => Err(Error::UnsupportedOperation { + message: + "Array types cannot be used as bucket keys. Bucket keys must be scalar types." + .to_string(), + }), + DataType::Map(_) => Err(Error::UnsupportedOperation { + message: + "Map types cannot be used as bucket keys. Bucket keys must be scalar types." + .to_string(), + }), + + // BOOLEAN, TINYINT, SMALLINT, TIMESTAMP_LTZ, ROW and any future types. + _ => Err(Error::UnsupportedOperation { + message: format!( + "Unsupported type for Iceberg binary row writer: {:?}", + field_type + ), + }), + } + } + + #[allow(dead_code)] + pub fn position(&self) -> usize { + self.position + } + + #[allow(dead_code)] + pub fn buffer(&self) -> &[u8] { + &self.buffer[..self.position] + } + + pub fn to_bytes(&self) -> Bytes { + Bytes::copy_from_slice(&self.buffer[..self.position]) + } + + fn ensure_capacity(&mut self, need_len: usize) { + if (self.buffer.len() - self.position) < need_len { + let new_len = std::cmp::max(self.buffer.len() * 2, self.buffer.len() + need_len); + self.buffer.resize(new_len, 0); + } + } + + fn write_raw(&mut self, src: &[u8]) { + let end = self.position + src.len(); + self.ensure_capacity(src.len()); + self.buffer[self.position..end].copy_from_slice(src); + self.position = end; + } +} + +impl BinaryWriter for IcebergBinaryRowWriter { + fn reset(&mut self) { + if self.position > 0 { + self.buffer[..self.position].fill(0); + } + self.position = 0; + } + + fn set_null_at(&mut self, _pos: usize) { + panic!("Iceberg key columns do not support null values"); + } + + fn write_boolean(&mut self, value: 
bool) {
        // Booleans are encoded as a single byte: 1 for true, 0 for false.
        self.write_raw(&[if value { 1u8 } else { 0u8 }]);
    }

    /// Writes a single raw byte.
    fn write_byte(&mut self, value: u8) {
        self.write_raw(&[value]);
    }

    /// Writes the bytes verbatim.
    fn write_bytes(&mut self, value: &[u8]) {
        // Iceberg: raw bytes, no length prefix
        self.write_raw(value);
    }

    /// Writes a CHAR value exactly like a string; `_length` is ignored.
    fn write_char(&mut self, value: &str, _length: usize) {
        // Iceberg: same as string — raw UTF-8, no length prefix
        self.write_string(value);
    }

    /// Writes the UTF-8 bytes of `value`.
    fn write_string(&mut self, value: &str) {
        // Iceberg: raw UTF-8 bytes, no length prefix
        self.write_raw(value.as_bytes());
    }

    /// Writes `value` as 2 little-endian bytes.
    fn write_short(&mut self, value: i16) {
        self.write_raw(&value.to_le_bytes());
    }

    /// Writes `value` widened to `i64`, as 8 little-endian bytes.
    fn write_int(&mut self, value: i32) {
        // Iceberg: promote i32 to i64, write as 8 bytes little-endian
        self.write_raw(&(value as i64).to_le_bytes());
    }

    /// Writes `value` as 8 little-endian bytes.
    fn write_long(&mut self, value: i64) {
        self.write_raw(&value.to_le_bytes());
    }

    /// Writes the little-endian byte representation of `value` (4 bytes).
    fn write_float(&mut self, value: f32) {
        self.write_raw(&value.to_le_bytes());
    }

    /// Writes the little-endian byte representation of `value` (8 bytes).
    fn write_double(&mut self, value: f64) {
        self.write_raw(&value.to_le_bytes());
    }

    /// Writes at most `length` bytes of `bytes`.
    ///
    /// `length` is clamped to `bytes.len()`, so a too-large `length` writes the
    /// whole slice instead of panicking.
    fn write_binary(&mut self, bytes: &[u8], length: usize) {
        // Iceberg: raw bytes, no length prefix
        self.write_raw(&bytes[..length.min(bytes.len())]);
    }

    /// Writes the bytes returned by `Decimal::to_unscaled_bytes`; `_precision` is ignored.
    fn write_decimal(&mut self, value: &Decimal, _precision: u32) {
        // Iceberg: unscaled big-endian bytes, no length prefix
        let unscaled_bytes = value.to_unscaled_bytes();
        self.write_raw(&unscaled_bytes);
    }

    /// Writes a time value (milliseconds) as microseconds, 8 little-endian bytes.
    fn write_time(&mut self, value: i32, _precision: u32) {
        // NOTE: wrapping multiplication mirrors Java's `long` arithmetic, which wraps on overflow.
        let micros = (value as i64).wrapping_mul(MICROS_PER_MILLI);
        self.write_raw(&micros.to_le_bytes());
    }

    /// Writes a timestamp without time zone as microseconds, 8 little-endian bytes.
    ///
    /// The nano-of-millisecond part is divided down to microseconds; any
    /// sub-microsecond remainder is dropped by the integer division.
    fn write_timestamp_ntz(&mut self, value: &crate::row::datum::TimestampNtz, _precision: u32) {
        // NOTE: wrapping arithmetic mirrors Java's `long` arithmetic, which wraps on overflow.
        let millis = value.get_millisecond();
        let nanos = value.get_nano_of_millisecond();
        let micros = millis
            .wrapping_mul(MICROS_PER_MILLI)
            .wrapping_add((nanos as i64) / MICROS_PER_MILLI);
        self.write_raw(&micros.to_le_bytes());
    }

    /// Writes a timestamp with local time zone as microseconds, 8 little-endian bytes.
    ///
    /// The nano-of-millisecond part is divided down to microseconds; any
    /// sub-microsecond remainder is dropped by the integer division.
    fn write_timestamp_ltz(&mut self, value: &crate::row::datum::TimestampLtz, _precision: u32) {
        // NOTE: wrapping arithmetic mirrors Java's `long` arithmetic, which wraps on overflow.
        let millis = value.get_epoch_millisecond();
        let nanos = value.get_nano_of_millisecond();
        let micros = millis
            .wrapping_mul(MICROS_PER_MILLI)
            .wrapping_add((nanos as i64) / MICROS_PER_MILLI);
        self.write_raw(&micros.to_le_bytes());
    }

    /// Never called: array types are rejected when the value writer is created.
    fn write_array(&mut self, _value: &FlussArray) {
        unreachable!("Array/Map types are rejected during value writer creation");
    }

    /// Never called: map types are rejected when the value writer is created.
    fn write_map(&mut self, _value: &FlussMap) {
        unreachable!("Array/Map types are rejected during value writer creation");
    }

    fn complete(&mut self) {
        // No finalization needed for Iceberg key encoding
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::metadata::{DataTypes, SmallIntType, TinyIntType};
    use crate::row::datum::{TimestampLtz, TimestampNtz};
    use bigdecimal::{BigDecimal, num_bigint::BigInt};

    /// Asserts that creating a value writer for `dt` fails with an error whose
    /// message contains `expected_fragment`.
    fn assert_unsupported_type(dt: DataType, expected_fragment: &str) {
        match IcebergBinaryRowWriter::create_value_writer(&dt) {
            Err(e) => assert!(
                e.to_string().contains(expected_fragment),
                "unexpected error for {dt:?}: {e}"
            ),
            Ok(_) => panic!("expected error for unsupported type {dt:?}, got Ok"),
        }
    }

    #[test]
    fn test_write_int_as_i64_le() {
        let mut w = IcebergBinaryRowWriter::new();
        w.write_int(42);
        assert_eq!(w.buffer(), &42i64.to_le_bytes());
    }

    #[test]
    fn test_write_int_negative() {
        let mut w = IcebergBinaryRowWriter::new();
        w.write_int(-1);
        assert_eq!(w.buffer(), &(-1i64).to_le_bytes());
    }

    #[test]
    fn test_write_long() {
        let mut w = IcebergBinaryRowWriter::new();
        w.write_long(123456789012345i64);
        assert_eq!(w.buffer(), &123456789012345i64.to_le_bytes());
    }

    #[test]
    fn test_write_float() {
        let mut w = IcebergBinaryRowWriter::new();
        let val = 1.23f32;
        w.write_float(val);
        assert_eq!(w.buffer(), &val.to_le_bytes());
    }

    #[test]
    fn test_write_double() {
        let mut w = IcebergBinaryRowWriter::new();
        let val = 9.876543210f64;
        w.write_double(val);
        assert_eq!(w.buffer(), &val.to_le_bytes());
    }

    #[test]
    fn test_write_string_no_length_prefix() {
        let mut w = IcebergBinaryRowWriter::new();
        w.write_string("hello");
        assert_eq!(w.buffer(), b"hello");
    }

    #[test]
    fn test_write_bytes_no_length_prefix() {
        let mut w = IcebergBinaryRowWriter::new();
        let data = &[0xDE, 0xAD, 0xBE, 0xEF];
        w.write_bytes(data);
        assert_eq!(w.buffer(), data);
    }

    #[test]
    fn test_write_binary_no_length_prefix() {
        let mut w = IcebergBinaryRowWriter::new();
        let data = &[1, 2, 3, 4, 5];
        w.write_binary(data, 3);
        assert_eq!(w.buffer(), &[1, 2, 3]);
    }

    #[test]
    fn test_write_time_millis_to_micros() {
        let mut w = IcebergBinaryRowWriter::new();
        // 1000 ms = 1_000_000 µs
        w.write_time(1000, 0);
        assert_eq!(w.buffer(), &1_000_000i64.to_le_bytes());
    }

    #[test]
    fn test_write_timestamp_ntz_compact() {
        let mut w = IcebergBinaryRowWriter::new();
        let ts = TimestampNtz::new(1672531200000); // 2023-01-01 00:00:00 UTC
        w.write_timestamp_ntz(&ts, 3);
        let expected_micros = 1672531200000i64 * 1000;
        assert_eq!(w.buffer(), &expected_micros.to_le_bytes());
    }

    #[test]
    fn test_write_timestamp_ntz_with_nanos() {
        let mut w = IcebergBinaryRowWriter::new();
        let ts = TimestampNtz::from_millis_nanos(1000, 500_000).unwrap();
        w.write_timestamp_ntz(&ts, 6);
        // 1000ms * 1000 + 500_000ns / 1000 = 1_000_000 + 500 = 1_000_500 µs
        assert_eq!(w.buffer(), &1_000_500i64.to_le_bytes());
    }

    #[test]
    fn test_write_timestamp_ltz() {
        let mut w = IcebergBinaryRowWriter::new();
        let ts = TimestampLtz::from_millis_nanos(2000, 300_000).unwrap();
        w.write_timestamp_ltz(&ts, 6);
        // 2000ms * 1000 + 300_000ns / 1000 = 2_000_000 + 300 = 2_000_300 µs
        assert_eq!(w.buffer(), &2_000_300i64.to_le_bytes());
    }

    #[test]
    fn test_write_timestamp_ntz_overflow_wraps_like_java() {
        let mut w = IcebergBinaryRowWriter::new();
        let ts = TimestampNtz::from_millis_nanos(i64::MAX, 999_999).unwrap();
        w.write_timestamp_ntz(&ts, 9);

        let expected = i64::MAX.wrapping_mul(MICROS_PER_MILLI).wrapping_add(999);
        assert_eq!(w.buffer(), &expected.to_le_bytes());
    }

    #[test]
    fn test_write_timestamp_ltz_overflow_wraps_like_java() {
        let mut w = IcebergBinaryRowWriter::new();
        let ts = TimestampLtz::from_millis_nanos(i64::MIN, 999_999).unwrap();
        w.write_timestamp_ltz(&ts, 9);

        let expected = i64::MIN.wrapping_mul(MICROS_PER_MILLI).wrapping_add(999);
        assert_eq!(w.buffer(), &expected.to_le_bytes());
    }

    #[test]
    fn test_write_decimal_compact() {
        let mut w = IcebergBinaryRowWriter::new();
        let bd = BigDecimal::new(BigInt::from(12345), 2); // 123.45
        let decimal = Decimal::from_big_decimal(bd, 10, 2).unwrap();
        w.write_decimal(&decimal, 10);

        let expected = BigInt::from(12345).to_signed_bytes_be();
        assert_eq!(w.buffer(), expected.as_slice());
    }

    #[test]
    fn test_write_decimal_non_compact() {
        let mut w = IcebergBinaryRowWriter::new();
        let bd = BigDecimal::new(BigInt::from(12345), 0);
        let decimal = Decimal::from_big_decimal(bd, 28, 0).unwrap();
        w.write_decimal(&decimal, 28);

        let expected = BigInt::from(12345).to_signed_bytes_be();
        assert_eq!(w.buffer(), expected.as_slice());
    }

    #[test]
    fn test_write_boolean() {
        let mut w = IcebergBinaryRowWriter::new();
        w.write_boolean(true);
        assert_eq!(w.buffer(), &[1u8]);

        w.reset();
        w.write_boolean(false);
        assert_eq!(w.buffer(), &[0u8]);
    }

    #[test]
    #[should_panic(expected = "Iceberg key columns do not support null values")]
    fn test_set_null_panics() {
        let mut w = IcebergBinaryRowWriter::new();
        w.set_null_at(0);
    }

    #[test]
    fn test_reset_clears_position() {
        let mut w = IcebergBinaryRowWriter::new();
        w.write_int(42);
        assert_eq!(w.position(), 8);
        w.reset();
        assert_eq!(w.position(), 0);
        assert_eq!(w.buffer().len(), 0);
    }

    #[test]
    fn test_to_bytes() {
        let mut w = IcebergBinaryRowWriter::new();
        w.write_string("test");
        let bytes = w.to_bytes();
        assert_eq!(bytes.as_ref(), b"test");
    }

    #[test]
    fn test_multiple_writes() {
        let mut w = IcebergBinaryRowWriter::new();
        w.write_int(1);
        w.write_string("ab");
        let buf = w.buffer().to_vec();
        // 8 bytes for int-as-i64 + 2 bytes for "ab"
        assert_eq!(buf.len(), 10);
        assert_eq!(&buf[..8], &1i64.to_le_bytes());
        assert_eq!(&buf[8..], b"ab");
    }

    #[test]
    fn test_buffer_growth() {
        let mut w = IcebergBinaryRowWriter::new();
        // Write more than 64 bytes to trigger buffer growth
        let large = vec![0xAAu8; 128];
        w.write_bytes(&large);
        assert_eq!(w.buffer(), large.as_slice());
    }

    #[test]
    fn test_create_value_writer_rejects_tinyint() {
        assert_unsupported_type(
            DataType::TinyInt(TinyIntType::new()),
            "Unsupported type for Iceberg binary row writer",
        );
    }

    #[test]
    fn test_create_value_writer_rejects_smallint() {
        assert_unsupported_type(
            DataType::SmallInt(SmallIntType::new()),
            "Unsupported type for Iceberg binary row writer",
        );
    }

    #[test]
    fn test_create_value_writer_rejects_boolean() {
        assert_unsupported_type(
            DataTypes::boolean(),
            "Unsupported type for Iceberg binary row writer",
        );
    }

    #[test]
    fn test_create_value_writer_rejects_timestamp_ltz() {
        assert_unsupported_type(
            DataTypes::timestamp_ltz(),
            "Unsupported type for Iceberg binary row writer",
        );
    }

    #[test]
    fn test_create_value_writer_rejects_array() {
        assert_unsupported_type(
            DataTypes::array(DataTypes::int()),
            "Array types cannot be used as bucket keys",
        );
    }

    #[test]
    fn test_create_value_writer_rejects_map() {
        assert_unsupported_type(
            DataTypes::map(DataTypes::string(), DataTypes::int()),
            "Map types cannot be used as bucket keys",
        );
    }

    #[test]
    fn test_create_value_writer_rejects_row() {
        assert_unsupported_type(
            DataTypes::row(vec![DataTypes::field("f0", DataTypes::int())]),
            "Unsupported type for Iceberg binary row writer",
        );
    }

    #[test]
    fn test_create_value_writer_accepts_java_supported_scalar_types() {
        let supported_types = vec![
            ("int", DataTypes::int()),
            ("date", DataTypes::date()),
            ("time", DataTypes::time()),
            ("bigint", DataTypes::bigint()),
            ("float", DataTypes::float()),
            ("double", DataTypes::double()),
            ("timestamp_ntz", DataTypes::timestamp()),
            ("decimal", DataTypes::decimal(10, 2)),
            ("string", DataTypes::string()),
            ("char", DataTypes::char(16)),
            ("binary", DataTypes::binary(8)),
            ("bytes", DataTypes::bytes()),
        ];

        for (name, data_type) in supported_types {
            let res = IcebergBinaryRowWriter::create_value_writer(&data_type);
            if let Err(e) = res {
                panic!("expected {name} to be supported, got error: {e}");
            }
        }
    }

    #[test]
    fn test_write_char_same_as_string() {
        let mut w1 = IcebergBinaryRowWriter::new();
        w1.write_char("hello", 10);

        let mut w2 = IcebergBinaryRowWriter::new();
        w2.write_string("hello");

        assert_eq!(w1.buffer(), w2.buffer());
    }

    #[test]
    fn test_write_date_as_int() {
        // Date encoding goes through write_int (via InnerValueWriter::Date)
        // which writes as i64 LE in Iceberg encoding
        let mut w = IcebergBinaryRowWriter::new();
        let days_since_epoch = 19000i32; // ~2022-01-06
        w.write_int(days_since_epoch);
assert_eq!(w.buffer(), &(days_since_epoch as i64).to_le_bytes()); + } +} diff --git a/fluss-rust/crates/fluss/src/row/binary/mod.rs b/fluss-rust/crates/fluss/src/row/binary/mod.rs new file mode 100644 index 0000000000..d6248dc515 --- /dev/null +++ b/fluss-rust/crates/fluss/src/row/binary/mod.rs @@ -0,0 +1,30 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +mod binary_writer; +mod iceberg_binary_row_writer; + +pub use binary_writer::*; +pub use iceberg_binary_row_writer::IcebergBinaryRowWriter; + +/// The binary row format types, it indicates the generated row type by the [`BinaryWriter`] +#[allow(dead_code)] +pub enum BinaryRowFormat { + Compacted, + Aligned, + Indexed, +} diff --git a/fluss-rust/crates/fluss/src/row/binary_array.rs b/fluss-rust/crates/fluss/src/row/binary_array.rs new file mode 100644 index 0000000000..b987cec8b7 --- /dev/null +++ b/fluss-rust/crates/fluss/src/row/binary_array.rs @@ -0,0 +1,1288 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Binary array format matching Java's `BinaryArray.java` layout. +//! +//! Binary layout: +//! ```text +//! [size(4B)] + [null bits (4-byte word aligned)] + [fixed-length part] + [variable-length part] +//! ``` +//! +//! Java reference: `BinaryArray.java`, `BinaryArrayWriter.java` + +use crate::error::Error::IllegalArgument; +use crate::error::Result; +use crate::metadata::{DataType, RowType}; +use crate::row::Decimal; +use crate::row::InternalRow; +use crate::row::binary::{BinaryRowFormat, ValueWriter}; +use crate::row::binary_map::FlussMap; +use crate::row::compacted::{CompactedRow, CompactedRowWriter, calculate_bit_set_width_in_bytes}; +use crate::row::datum::{Date, Time, TimestampLtz, TimestampNtz}; +use crate::row::field_getter::FieldGetter; +use bytes::Bytes; +use serde::Serialize; +use std::fmt; +use std::hash::{Hash, Hasher}; + +const MAX_FIX_PART_DATA_SIZE: usize = 7; +const HIGHEST_FIRST_BIT: u64 = 0x80_u64 << 56; +const HIGHEST_SECOND_TO_EIGHTH_BIT: u64 = 0x7F_u64 << 56; + +/// Calculates the header size in bytes: 4 (for element count) + null bits (4-byte word aligned). +/// Matches Java's `BinaryArray.calculateHeaderInBytes(numFields)`. +pub fn calculate_header_in_bytes(num_elements: usize) -> usize { + 4 + num_elements.div_ceil(32) * 4 +} + +/// Calculates the fixed-length part size per element for a given data type. 
+/// Matches Java's `BinaryArray.calculateFixLengthPartSize(DataType)`. +pub fn calculate_fix_length_part_size(element_type: &DataType) -> usize { + match element_type { + DataType::Boolean(_) | DataType::TinyInt(_) => 1, + DataType::SmallInt(_) => 2, + DataType::Int(_) | DataType::Float(_) | DataType::Date(_) | DataType::Time(_) => 4, + DataType::BigInt(_) + | DataType::Double(_) + | DataType::Char(_) + | DataType::String(_) + | DataType::Binary(_) + | DataType::Bytes(_) + | DataType::Decimal(_) + | DataType::Timestamp(_) + | DataType::TimestampLTz(_) + | DataType::Array(_) + | DataType::Map(_) + | DataType::Row(_) => 8, + } +} + +/// Rounds a byte count up to the nearest 8-byte word boundary. +/// Matches Java's `roundNumberOfBytesToNearestWord`. +fn round_to_nearest_word(num_bytes: usize) -> usize { + (num_bytes + 7) & !7 +} + +fn is_variable_length_type(dt: &DataType) -> bool { + match dt { + DataType::Char(_) + | DataType::String(_) + | DataType::Binary(_) + | DataType::Bytes(_) + | DataType::Array(_) + | DataType::Map(_) + | DataType::Row(_) => true, + DataType::Decimal(d) => !Decimal::is_compact_precision(d.precision()), + DataType::Timestamp(t) => !TimestampNtz::is_compact(t.precision()), + DataType::TimestampLTz(t) => !TimestampLtz::is_compact(t.precision()), + _ => false, + } +} + +/// A Fluss binary array, wire-compatible with Java's `BinaryArray`. +/// +/// Stores elements in a flat byte buffer with a header (element count + null bitmap) +/// followed by fixed-length slots and an optional variable-length section. +/// +/// Uses `Bytes` internally so cloning is O(1) reference-counted. +// TODO: FlussArray currently exposes only fallible getters. Infallible +// fast-path variants may be added later as non-breaking extensions. 
#[derive(Clone)]
pub struct FlussArray {
    /// Complete binary representation (header, fixed-length part, variable-length part).
    data: Bytes,
    /// Number of elements, decoded from the first 4 bytes of `data`.
    size: usize,
    /// Byte offset of the first fixed-length slot, i.e. the header size.
    element_offset: usize,
}

impl fmt::Debug for FlussArray {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.debug_struct("FlussArray")
            .field("size", &self.size)
            .field("data_len", &self.data.len())
            .finish()
    }
}

impl fmt::Display for FlussArray {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "FlussArray[size={}]", self.size)
    }
}

// Equality, ordering and hashing are all defined on the raw bytes only;
// `size` and `element_offset` are derived from those bytes by `validate`.
impl PartialEq for FlussArray {
    fn eq(&self, other: &Self) -> bool {
        self.data == other.data
    }
}

impl Eq for FlussArray {}

impl PartialOrd for FlussArray {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
}

impl Ord for FlussArray {
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        self.data.cmp(&other.data)
    }
}

impl Hash for FlussArray {
    fn hash<H: Hasher>(&self, state: &mut H) {
        self.data.hash(state);
    }
}

impl Serialize for FlussArray {
    /// Serializes the array as its raw byte representation.
    fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        serializer.serialize_bytes(&self.data)
    }
}

impl FlussArray {
    /// Validates the raw bytes and computes derived fields (size, element_offset).
    ///
    /// # Errors
    /// Returns `IllegalArgument` when `data` is shorter than 4 bytes, when the
    /// encoded element count is negative, or when the header implied by that
    /// count does not fit in `data`.
    fn validate(data: &[u8]) -> Result<(usize, usize)> {
        if data.len() < 4 {
            return Err(IllegalArgument {
                message: format!(
                    "FlussArray data too short: need at least 4 bytes, got {}",
                    data.len()
                ),
            });
        }
        // The length check above guarantees the 4-byte slice conversion succeeds.
        let raw_size = i32::from_le_bytes(data[0..4].try_into().unwrap());
        if raw_size < 0 {
            return Err(IllegalArgument {
                message: format!("FlussArray size must be non-negative, got {raw_size}"),
            });
        }
        let size = raw_size as usize;
        let element_offset = calculate_header_in_bytes(size);
        if element_offset > data.len() {
            return Err(IllegalArgument {
                message: format!(
                    "FlussArray header exceeds payload: header={}, payload={}",
                    element_offset,
                    data.len()
                ),
            });
        }
        Ok((size, element_offset))
    }

    /// Creates a FlussArray from a byte slice (copies data).
    pub fn from_bytes(data: &[u8]) -> Result<Self> {
        let (size, element_offset) = Self::validate(data)?;
        Ok(FlussArray {
            data: Bytes::copy_from_slice(data),
            size,
            element_offset,
        })
    }

    /// Creates a FlussArray from an owned `Vec<u8>` without copying.
    pub fn from_vec(data: Vec<u8>) -> Result<Self> {
        let (size, element_offset) = Self::validate(&data)?;
        Ok(FlussArray {
            data: Bytes::from(data),
            size,
            element_offset,
        })
    }

    /// Creates a FlussArray from owned bytes without copying.
    fn from_owned_bytes(data: Bytes) -> Result<Self> {
        let (size, element_offset) = Self::validate(&data)?;
        Ok(FlussArray {
            data,
            size,
            element_offset,
        })
    }

    /// Returns the number of elements.
    pub fn size(&self) -> usize {
        self.size
    }

    /// Returns the raw bytes of this array (the complete binary representation).
    pub fn as_bytes(&self) -> &[u8] {
        &self.data
    }

    /// Returns true if the element at position `pos` is null.
    ///
    /// The null bitmap starts at byte 4, one bit per element, least-significant
    /// bit first within each byte.
    ///
    /// # Panics
    /// `pos` is not checked against `size()`. The bitmap byte is indexed
    /// directly, so a `pos` whose bitmap byte lies beyond the end of the
    /// buffer panics; unlike the getters, this method does not return an error.
    pub fn is_null_at(&self, pos: usize) -> bool {
        let byte_index = pos >> 3;
        let bit = pos & 7;
        (self.data[4 + byte_index] & (1u8 << bit)) != 0
    }

    /// Returns the logically occupied bytes of this array, including the variable-length part.
+ /// This is used to detect trailing garbage in binary containers. + pub fn extent(&self, element_type: &DataType) -> Result { + let header_size = calculate_header_in_bytes(self.size); + let element_size = calculate_fix_length_part_size(element_type); + let fixed_part_size = round_to_nearest_word(header_size + self.size * element_size); + + if !is_variable_length_type(element_type) { + return Ok(fixed_part_size); + } + + let mut max_extent = fixed_part_size; + for i in 0..self.size { + if !self.is_null_at(i) { + let packed = self.read_i64(i, "extent calculation")? as u64; + let mark = packed & HIGHEST_FIRST_BIT; + if mark == 0 { + let offset = (packed >> 32) as usize; + let len = (packed & 0xFFFF_FFFF) as usize; + max_extent = max_extent.max(offset + len); + } + } + } + + Ok(round_to_nearest_word(max_extent)) + } + + fn checked_slice(&self, start: usize, len: usize, context: &str) -> Result<&[u8]> { + let end = start.checked_add(len).ok_or_else(|| IllegalArgument { + message: format!("Overflow while reading {context}: start={start}, len={len}"), + })?; + if end > self.data.len() { + return Err(IllegalArgument { + message: format!( + "Out-of-bounds while reading {context}: start={start}, len={len}, payload={}", + self.data.len() + ), + }); + } + Ok(&self.data[start..end]) + } + + fn checked_element_offset( + &self, + pos: usize, + element_size: usize, + context: &str, + ) -> Result { + if pos >= self.size { + return Err(IllegalArgument { + message: format!( + "Array element index out of bounds while reading {context}: pos={pos}, size={}", + self.size + ), + }); + } + let rel = pos.checked_mul(element_size).ok_or_else(|| IllegalArgument { + message: format!( + "Overflow while calculating array element offset for {context}: pos={pos}, element_size={element_size}" + ), + })?; + self.element_offset + .checked_add(rel) + .ok_or_else(|| IllegalArgument { + message: format!( + "Overflow while adding base offset for {context}: base={}, rel={rel}", + self.element_offset + 
), + }) + } + + fn read_fixed_bytes(&self, pos: usize, len: usize, context: &str) -> Result<&[u8]> { + let offset = self.checked_element_offset(pos, len, context)?; + self.checked_slice(offset, len, context) + } + + fn read_i16(&self, pos: usize, context: &str) -> Result { + let bytes = self.read_fixed_bytes(pos, 2, context)?; + Ok(i16::from_le_bytes([bytes[0], bytes[1]])) + } + + fn read_i32(&self, pos: usize, context: &str) -> Result { + let bytes = self.read_fixed_bytes(pos, 4, context)?; + Ok(i32::from_le_bytes([bytes[0], bytes[1], bytes[2], bytes[3]])) + } + + fn read_i64(&self, pos: usize, context: &str) -> Result { + let bytes = self.read_fixed_bytes(pos, 8, context)?; + let mut buf = [0_u8; 8]; + buf.copy_from_slice(bytes); + Ok(i64::from_le_bytes(buf)) + } + + fn read_i64_at_offset(&self, offset: usize, context: &str) -> Result { + let bytes = self.checked_slice(offset, 8, context)?; + let mut buf = [0_u8; 8]; + buf.copy_from_slice(bytes); + Ok(i64::from_le_bytes(buf)) + } + + fn read_var_len_span(&self, pos: usize) -> Result<(usize, usize)> { + let field_offset = self.checked_element_offset(pos, 8, "variable-length array element")?; + let packed = self.read_i64(pos, "variable-length array element")? as u64; + let mark = packed & HIGHEST_FIRST_BIT; + + if mark == 0 { + let offset = (packed >> 32) as usize; + let len = (packed & 0xFFFF_FFFF) as usize; + let _ = self.checked_slice(offset, len, "variable-length array element")?; + Ok((offset, len)) + } else { + let len = ((packed & HIGHEST_SECOND_TO_EIGHTH_BIT) >> 56) as usize; + if len > MAX_FIX_PART_DATA_SIZE { + return Err(IllegalArgument { + message: format!( + "Inline array element length must be <= {MAX_FIX_PART_DATA_SIZE}, got {len}" + ), + }); + } + // Java stores inline bytes in the 8-byte slot itself. + // On little-endian, bytes start at field_offset; on big-endian they start at +1. 
+ let start = if cfg!(target_endian = "little") { + field_offset + } else { + field_offset + 1 + }; + let _ = self.checked_slice(start, len, "inline array element")?; + Ok((start, len)) + } + } + + fn read_var_len_bytes(&self, pos: usize) -> Result<&[u8]> { + let (start, len) = self.read_var_len_span(pos)?; + Ok(&self.data[start..start + len]) + } + + pub fn get_boolean(&self, pos: usize) -> Result { + let bytes = self.read_fixed_bytes(pos, 1, "boolean array element")?; + Ok(bytes[0] != 0) + } + + pub fn get_byte(&self, pos: usize) -> Result { + let bytes = self.read_fixed_bytes(pos, 1, "byte array element")?; + Ok(bytes[0] as i8) + } + + pub fn get_short(&self, pos: usize) -> Result { + self.read_i16(pos, "short array element") + } + + pub fn get_int(&self, pos: usize) -> Result { + self.read_i32(pos, "int array element") + } + + pub fn get_long(&self, pos: usize) -> Result { + self.read_i64(pos, "long array element") + } + + pub fn get_float(&self, pos: usize) -> Result { + let bits = self.read_i32(pos, "float array element")? as u32; + Ok(f32::from_bits(bits)) + } + + pub fn get_double(&self, pos: usize) -> Result { + let bits = self.read_i64(pos, "double array element")? as u64; + Ok(f64::from_bits(bits)) + } + + /// Reads the offset_and_size packed long for variable-length elements. + fn get_offset_and_size(&self, pos: usize) -> Result<(usize, usize)> { + let packed = self.get_long(pos)? 
as u64; + let offset = (packed >> 32) as usize; + let size = (packed & 0xFFFF_FFFF) as usize; + Ok((offset, size)) + } + + pub fn get_string(&self, pos: usize) -> Result<&str> { + let bytes = self.read_var_len_bytes(pos)?; + std::str::from_utf8(bytes).map_err(|e| IllegalArgument { + message: format!("Invalid UTF-8 in array element at position {pos}: {e}"), + }) + } + + pub fn get_binary(&self, pos: usize) -> Result<&[u8]> { + self.read_var_len_bytes(pos) + } + + pub fn get_decimal(&self, pos: usize, precision: u32, scale: u32) -> Result { + if Decimal::is_compact_precision(precision) { + let unscaled = self.get_long(pos)?; + Decimal::from_unscaled_long(unscaled, precision, scale) + } else { + let (offset, size) = self.get_offset_and_size(pos)?; + let bytes = self.checked_slice(offset, size, "decimal bytes")?; + Decimal::from_unscaled_bytes(bytes, precision, scale) + } + } + + pub fn get_date(&self, pos: usize) -> Result { + Ok(Date::new(self.get_int(pos)?)) + } + + pub fn get_time(&self, pos: usize) -> Result