diff --git a/.github/actions/verify-tag-version/action.yml b/.github/actions/verify-tag-version/action.yml
new file mode 100644
index 0000000000..1b34bdba03
--- /dev/null
+++ b/.github/actions/verify-tag-version/action.yml
@@ -0,0 +1,41 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
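+# Verify that the pushed tag version matches the [workspace.package] version in Cargo.toml (the check itself is currently commented out at the end of this file).
+# Only base versions are compared (a trailing -rcN / -rc.N is stripped from both sides), so tags v0.2.0 and v0.2.0-rc1 both pass when Cargo.toml says 0.2.0.
+# Requires: checkout before this step; the script reads ./Cargo.toml (NOTE(review): the Rust workspace appears to live under fluss-rust/ in this repo - confirm the path when re-enabling). Use on tag push (GITHUB_REF like refs/tags/v0.1.0).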
+
+name: 'Verify tag matches crate version'
+description: 'Exits with error if GITHUB_REF tag base version does not match [workspace.package] version in Cargo.toml (strips -rc*).'
+
+runs:
+  using: 'composite'
+  steps:  # placeholder step only: the real tag-vs-Cargo.toml comparison is commented out below, so nothing is verified here
+    - run: |
+        echo "::notice::Tag/crate version check is currently disabled; skipping verification for ${GITHUB_REF#refs/tags/}"
+      shell: bash
+# Commented out for 0.1.0-incubating-rc0; restore this step (replacing the placeholder above) once the -incubating suffix is no longer needed.
+#    - run: |
+#        TAG_VERSION="${GITHUB_REF#refs/tags/v}"
+#        CRATE_VERSION=$(sed -n '/^\[workspace.package\]/,/^\[/p' Cargo.toml | grep '^\s*version\s*=' | head -1 | sed -E 's/.*"([^"]+)".*/\1/')
+#        base() { echo "$1" | sed -E 's/-rc(\.[0-9]+|[0-9]+)$//'; }
+#        if [ "$(base "$TAG_VERSION")" != "$(base "$CRATE_VERSION")" ]; then
+#          echo "::error::Tag version ($TAG_VERSION) does not match Cargo.toml version ($CRATE_VERSION). Run scripts/bump-version.sh before tagging, or tag the version that is in Cargo.toml."
+#          exit 1
+#        fi
+#        echo "Tag and crate version match: $TAG_VERSION"
+#      shell: bash
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
new file mode 100644
index 0000000000..714e644bd5
--- /dev/null
+++ b/.github/dependabot.yml
@@ -0,0 +1,30 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+version: 2
+updates:
+  # GitHub Actions used by the repository's workflows
+  - package-ecosystem: "github-actions"
+    directory: "/"
+    schedule:
+      interval: "monthly"
+
+  # Rust client workspace
+  - package-ecosystem: "cargo"
+    directory: "/fluss-rust"
+    schedule:
+      interval: "monthly"
diff --git a/.github/release.yml b/.github/release.yml
new file mode 100644
index 0000000000..3ca2be6277
--- /dev/null
+++ b/.github/release.yml
@@ -0,0 +1,42 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Configures "Generate release notes" on GitHub Releases.
+# https://docs.github.com/en/repositories/releasing-projects-on-github/automatically-generated-release-notes
+
+changelog:
+  categories:
+    - title: Added
+      labels:
+        - feat
+        - feature
+    - title: Changed
+      labels:
+        - refactor
+    - title: Fixed
+      labels:
+        - fix
+        - bugfix
+    - title: Docs
+      labels:
+        - docs
+        - documentation
+    - title: CI / Build
+      labels:
+        - ci
+        - build
+    - title: Chore
+      labels:
+        - chore
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 4ef7d372de..ee4a269d73 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -25,11 +25,13 @@ on:
     paths-ignore:
       - 'website/**'
       - 'helm/**'
+      - 'fluss-rust/**'
       - '**/*.md'
   pull_request:
     paths-ignore:
       - 'website/**'
       - 'helm/**'
+      - 'fluss-rust/**'
       - '**/*.md'
 
 concurrency:
diff --git a/.github/workflows/client-integration.yml b/.github/workflows/client-integration.yml
new file mode 100644
index 0000000000..6d77be877f
--- /dev/null
+++ b/.github/workflows/client-integration.yml
@@ -0,0 +1,366 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Client integration tests against a SAME-REVISION server (FIP-40 §3.2).
+# Builds the Fluss server image from this source tree ONCE, caches + saves it,
+# then fans out the Rust / Python / C++ / Elixir integration suites against that
+# fluss:dev image (build-once-fan-out, à la Temporal/PyFlink). The image build is
+# cached on server/proto hashes, so client-only PRs reuse it instead of rebuilding.
+
+name: Client Integration
+
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - 'fluss-rpc/src/main/proto/**'
+      - 'fluss-server/**'
+      - 'fluss-common/**'
+      - 'fluss-dist/**'
+      - 'docker/fluss/**'
+      - 'fluss-rust/crates/**'
+      - 'fluss-rust/bindings/**'
+      - 'fluss-rust/Cargo.toml'
+      - 'fluss-rust/Cargo.lock'
+      - '.github/workflows/client-integration.yml'
+  pull_request:
+    branches:
+      - main
+    paths:
+      - 'fluss-rpc/src/main/proto/**'
+      - 'fluss-server/**'
+      - 'fluss-common/**'
+      - 'fluss-dist/**'
+      - 'docker/fluss/**'
+      - 'fluss-rust/crates/**'
+      - 'fluss-rust/bindings/**'
+      - 'fluss-rust/Cargo.toml'
+      - 'fluss-rust/Cargo.lock'
+      - '.github/workflows/client-integration.yml'
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }}
+  cancel-in-progress: true
+
+jobs:
+  # Decide which client suites to run, mirroring the per-binding scoping the
+  # standalone fluss-rust repo had: a binding suite runs only when its own binding,
+  # the core fluss-rs crate, the server/proto, or this workflow file changed. On
+  # non-PR events (push to main, manual) everything runs.
+  detect-changes:
+    runs-on: ubuntu-latest
+    outputs:
+      rust: ${{ steps.filter.outputs.rust }}
+      python: ${{ steps.filter.outputs.python }}
+      cpp: ${{ steps.filter.outputs.cpp }}
+      elixir: ${{ steps.filter.outputs.elixir }}
+    steps:
+      - uses: actions/checkout@v6
+        with:
+          fetch-depth: 0  # full history so the PR base commit is available to `git diff`
+      - id: filter
+        run: |
+          if [ "${{ github.event_name }}" != "pull_request" ]; then
+            all=true; changed=""
+          else
+            all=false
+            changed=$(git diff --name-only "${{ github.event.pull_request.base.sha }}...HEAD")
+          fi
+          echo "Changed files:"; echo "$changed"
+          has() { echo "$changed" | grep -qE "$1"; }
+          protocol=false; core=false; py=false; cpp=false; ex=false
+          has '^(fluss-rpc/src/main/proto/|fluss-server/|fluss-common/|fluss-dist/|docker/fluss/|\.github/workflows/client-integration\.yml$)' && protocol=true || true
+          has '^(fluss-rust/crates/|fluss-rust/Cargo\.)' && core=true || true
+          has '^fluss-rust/bindings/python/' && py=true || true
+          has '^fluss-rust/bindings/cpp/' && cpp=true || true
+          has '^fluss-rust/bindings/elixir/' && ex=true || true
+          # a suite runs if: non-PR (all) OR core crate OR server/proto/this workflow OR its own binding changed
+          gate() { if [ "$all" = true ] || [ "$core" = true ] || [ "$protocol" = true ] || [ "$1" = true ]; then echo true; else echo false; fi; }
+          {
+            echo "rust=$(gate false)"
+            echo "python=$(gate $py)"
+            echo "cpp=$(gate $cpp)"
+            echo "elixir=$(gate $ex)"
+          } >> "$GITHUB_OUTPUT"
+
+  # Build the server image (fluss:dev) from THIS source tree once per run. The `docker save` tarball is cached
+  # under a key hashing the server/common/rpc/dist/docker sources plus pom.xml, so client-only PRs restore it
+  # instead of rebuilding. The tarball is then uploaded as an artifact and loaded by every client integration job.
+  build-server-image:
+    needs: detect-changes
+    if: needs.detect-changes.outputs.rust == 'true' || needs.detect-changes.outputs.python == 'true' || needs.detect-changes.outputs.cpp == 'true' || needs.detect-changes.outputs.elixir == 'true'
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v6
+
+      - name: Cache server image
+        id: image-cache  # later steps read steps.image-cache.outputs.cache-hit to skip the build
+        uses: actions/cache@v4  # NOTE(review): the key hashes only the paths listed below; confirm no other module built by `-pl fluss-dist -am` can change the image
+        with:
+          path: /tmp/fluss-dev.tar
+          key: fluss-dev-image-${{ hashFiles('fluss-server/**', 'fluss-common/**', 'fluss-rpc/**', 'fluss-dist/**', 'docker/fluss/**', 'pom.xml') }}
+
+      - name: Set up JDK 17  # skipped on a cache hit: Java is only needed to rebuild the image
+        if: steps.image-cache.outputs.cache-hit != 'true'
+        uses: actions/setup-java@v5
+        with:
+          java-version: '17'
+          distribution: 'temurin'
+          cache: maven
+
+      - name: Build server image (fluss:dev) from source  # Maven package -> copy build-target into the docker context -> docker build -> docker save
+        if: steps.image-cache.outputs.cache-hit != 'true'
+        run: |
+          ./mvnw -B --no-transfer-progress clean package -pl fluss-dist -am -DskipTests
+          rm -rf docker/fluss/build-target
+          mkdir -p docker/fluss/build-target
+          cp -r build-target/* docker/fluss/build-target/
+          docker build -t fluss:dev docker/fluss
+          docker save fluss:dev -o /tmp/fluss-dev.tar
+
+      - name: Upload server image  # unconditional: the tarball exists after either a cache restore or a fresh build
+        uses: actions/upload-artifact@v4
+        with:
+          name: fluss-dev-image  # the integration jobs download the artifact by this name
+          path: /tmp/fluss-dev.tar
+          retention-days: 1
+
+  rust-integration:
+    needs: [detect-changes, build-server-image]
+    if: needs.detect-changes.outputs.rust == 'true'
+    timeout-minutes: 60
+    runs-on: ubuntu-latest
+    defaults:
+      run:
+        working-directory: fluss-rust
+    env:
+      FLUSS_IMAGE: fluss
+      FLUSS_VERSION: dev
+    steps:
+      - uses: actions/checkout@v6
+      - uses: actions/download-artifact@v4
+        with:
+          name: fluss-dev-image
+          path: /tmp
+      - name: Load server image
+        run: docker load -i /tmp/fluss-dev.tar
+      - name: Install protoc
+        uses: arduino/setup-protoc@c65c819552d16ad3c9b72d9dfd5ba5237b9c906b # v3.0.0
+        with:
+          repo-token: ${{ secrets.GITHUB_TOKEN }}
+      - name: Rust Cache
+        uses: Swatinem/rust-cache@c19371144df3bb44fab255c43d04cbc2ab54d1c4 # v2.9.1
+        with:
+          workspaces: fluss-rust
+      - name: Integration tests
+        run: cargo test --features integration_tests --test test_fluss -p fluss-rs
+        env:
+          RUST_LOG: DEBUG
+          RUST_BACKTRACE: full
+
+  python-integration:
+    needs: [detect-changes, build-server-image]
+    if: needs.detect-changes.outputs.python == 'true'
+    timeout-minutes: 60
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python: ["3.9", "3.10", "3.11", "3.12"]
+    defaults:
+      run:
+        working-directory: fluss-rust
+    env:
+      FLUSS_TEST_CLUSTER_BIN: ${{ github.workspace }}/fluss-rust/target/debug/fluss-test-cluster
+      FLUSS_IMAGE: fluss
+      FLUSS_VERSION: dev
+    steps:
+      - uses: actions/checkout@v6
+      - uses: actions/download-artifact@v4
+        with:
+          name: fluss-dev-image
+          path: /tmp
+      - name: Load server image
+        run: docker load -i /tmp/fluss-dev.tar
+      - name: Set up Python
+        uses: actions/setup-python@v6
+        with:
+          python-version: ${{ matrix.python }}
+      - name: Install uv
+        uses: astral-sh/setup-uv@37802adc94f370d6bfd71619e3f0bf239e1f3b78
+      - name: Install protoc
+        uses: arduino/setup-protoc@c65c819552d16ad3c9b72d9dfd5ba5237b9c906b # v3.0.0
+        with:
+          repo-token: ${{ secrets.GITHUB_TOKEN }}
+      - name: Rust Cache
+        uses: Swatinem/rust-cache@c19371144df3bb44fab255c43d04cbc2ab54d1c4 # v2.9.1
+        with:
+          workspaces: fluss-rust
+      - name: Build fluss-test-cluster binary
+        run: cargo build -p fluss-test-cluster
+      - name: Build Python bindings
+        working-directory: fluss-rust/bindings/python
+        run: |
+          uv sync --extra dev --no-install-project
+          uv run --no-sync maturin develop --uv
+      - name: Run tests (parallel)
+        working-directory: fluss-rust/bindings/python
+        run: uv run --no-sync pytest test/ -v -n 2 --dist=loadfile
+        env:
+          RUST_LOG: DEBUG
+          RUST_BACKTRACE: full
+          FLUSS_SKIP_CLUSTER_TEARDOWN: "1"
+      - name: Dump fluss cluster container logs
+        if: always()
+        run: |
+          mkdir -p cluster-logs
+          for c in $(docker ps -a --filter "name=shared-test" --format '{{.Names}}'); do
+            docker logs "$c" > "cluster-logs/$c.log" 2>&1 || true
+          done
+      - uses: actions/upload-artifact@v4
+        if: always()
+        with:
+          name: cluster-logs-${{ matrix.python }}
+          path: fluss-rust/cluster-logs/
+          if-no-files-found: ignore
+          retention-days: 3
+
+  cpp-integration:
+    needs: [detect-changes, build-server-image]
+    if: needs.detect-changes.outputs.cpp == 'true'
+    timeout-minutes: 60
+    runs-on: ubuntu-latest
+    defaults:
+      run:
+        working-directory: fluss-rust
+    env:
+      FLUSS_TEST_CLUSTER_BIN: ${{ github.workspace }}/fluss-rust/target/debug/fluss-test-cluster
+      FLUSS_IMAGE: fluss
+      FLUSS_VERSION: dev
+    steps:
+      - uses: actions/checkout@v6
+      - uses: actions/download-artifact@v4
+        with:
+          name: fluss-dev-image
+          path: /tmp
+      - name: Load server image
+        run: docker load -i /tmp/fluss-dev.tar
+      - name: Install protoc
+        uses: arduino/setup-protoc@c65c819552d16ad3c9b72d9dfd5ba5237b9c906b # v3.0.0
+        with:
+          repo-token: ${{ secrets.GITHUB_TOKEN }}
+      - name: Install Apache Arrow C++  # registers the Apache Arrow APT source package, then installs libarrow-dev from it
+        run: |
+          sudo apt-get update && sudo apt-get install -y -V ca-certificates lsb-release wget  # refresh the index first: a stale runner package list can 404
+          wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb
+          sudo apt-get install -y -V ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb
+          sudo apt-get update
+          sudo apt-get install -y -V libarrow-dev
+      - name: Rust Cache
+        uses: Swatinem/rust-cache@c19371144df3bb44fab255c43d04cbc2ab54d1c4 # v2.9.1
+        with:
+          workspaces: fluss-rust
+      - name: Setup sccache
+        uses: mozilla-actions/sccache-action@7d986dd989559c6ecdb630a3fd2557667be217ad # v0.0.9
+      - name: Build fluss-test-cluster binary
+        run: cargo build -p fluss-test-cluster
+      - name: Build C++ bindings and tests
+        working-directory: fluss-rust/bindings/cpp
+        env:
+          SCCACHE_GHA_ENABLED: "true"
+        run: |
+          cmake -B build \
+            -DFLUSS_ENABLE_TESTING=ON \
+            -DCMAKE_BUILD_TYPE=Debug \
+            -DCMAKE_C_COMPILER_LAUNCHER=sccache \
+            -DCMAKE_CXX_COMPILER_LAUNCHER=sccache
+          cmake --build build --parallel
+          sccache --show-stats
+      - name: Run C++ integration tests (parallel)
+        working-directory: fluss-rust/bindings/cpp
+        run: cd build && ctest -j$(nproc) --output-on-failure --timeout 300
+        env:
+          RUST_LOG: DEBUG
+          RUST_BACKTRACE: full
+
+  elixir-integration:
+    needs: [detect-changes, build-server-image]
+    if: needs.detect-changes.outputs.elixir == 'true'
+    timeout-minutes: 60
+    runs-on: ubuntu-latest
+    defaults:
+      run:
+        working-directory: fluss-rust
+    env:
+      OTP_VERSION: "28.0.2"
+      ELIXIR_VERSION: "1.19.5"
+      FLUSS_TEST_CLUSTER_BIN: ${{ github.workspace }}/fluss-rust/target/debug/fluss-test-cluster
+      MIX_ENV: test
+      FLUSS_IMAGE: fluss
+      FLUSS_VERSION: dev
+    steps:
+      - uses: actions/checkout@v6
+      - uses: actions/download-artifact@v4
+        with:
+          name: fluss-dev-image
+          path: /tmp
+      - name: Load server image
+        run: docker load -i /tmp/fluss-dev.tar
+      - name: Set up BEAM
+        uses: erlef/setup-beam@fc68ffb90438ef2936bbb3251622353b3dcb2f93 # v1.24.0
+        with:
+          otp-version: ${{ env.OTP_VERSION }}
+          elixir-version: ${{ env.ELIXIR_VERSION }}
+      - name: Install protoc
+        run: sudo apt-get update && sudo apt-get install -y protobuf-compiler
+      - name: Rust Cache
+        uses: Swatinem/rust-cache@c19371144df3bb44fab255c43d04cbc2ab54d1c4 # v2.9.1
+        with:
+          workspaces: fluss-rust
+      - name: Cache Mix deps and build
+        uses: actions/cache@v4
+        with:
+          path: |
+            fluss-rust/bindings/elixir/deps
+            fluss-rust/bindings/elixir/_build
+          key: ${{ runner.os }}-mix-otp${{ env.OTP_VERSION }}-elixir${{ env.ELIXIR_VERSION }}-${{ hashFiles('fluss-rust/bindings/elixir/mix.lock') }}
+          restore-keys: |
+            ${{ runner.os }}-mix-otp${{ env.OTP_VERSION }}-elixir${{ env.ELIXIR_VERSION }}-
+      - name: Build fluss-test-cluster binary
+        run: cargo build -p fluss-test-cluster
+      - name: Fetch Elixir deps
+        working-directory: fluss-rust/bindings/elixir
+        run: mix deps.get
+      - name: Check formatting
+        working-directory: fluss-rust/bindings/elixir
+        run: mix format --check-formatted
+      - name: Compile (warnings as errors)
+        working-directory: fluss-rust/bindings/elixir
+        run: mix compile --warnings-as-errors
+      - name: Credo
+        working-directory: fluss-rust/bindings/elixir
+        run: mix credo
+      - name: Run unit tests
+        working-directory: fluss-rust/bindings/elixir
+        run: mix test
+      - name: Run integration tests
+        working-directory: fluss-rust/bindings/elixir
+        run: mix test --include integration --only integration
+        env:
+          RUST_LOG: DEBUG
+          RUST_BACKTRACE: full
diff --git a/.github/workflows/license-check.yml b/.github/workflows/license-check.yml
index 5f52ffa3d9..aa69703eec 100644
--- a/.github/workflows/license-check.yml
+++ b/.github/workflows/license-check.yml
@@ -17,7 +17,17 @@ name: Check License
 permissions:
   contents: read
 
-on: [push, pull_request]
+on:
+  push:
+    paths-ignore:
+      - 'fluss-rust/**'
+      - 'website/**'
+      - '**/*.md'
+  pull_request:
+    paths-ignore:
+      - 'fluss-rust/**'
+      - 'website/**'
+      - '**/*.md'
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.event.number || github.run_id }}
diff --git a/.github/workflows/python-release.yml b/.github/workflows/python-release.yml
new file mode 100644
index 0000000000..ddbc4f0cf9
--- /dev/null
+++ b/.github/workflows/python-release.yml
@@ -0,0 +1,180 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Publish the fluss Python binding to PyPI.
+# Trigger: push tag only (e.g. v0.1.0).
+# Pre-release tags (containing '-') publish to TestPyPI; release tags publish to PyPI.
+#
+# Token auth: add secrets PYPI_API_TOKEN / TEST_PYPI_API_TOKEN for publishing.
+
+name: Release Python
+
+on:
+  push:
+    tags:
+      - "v*"  # Only version-like tags (e.g. v0.1.0, v0.1.0-rc1); avoids running on arbitrary tags
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }}
+  cancel-in-progress: true
+
+permissions:
+  contents: read
+
+jobs:
+  version-check:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v6
+      - uses: ./.github/actions/verify-tag-version
+
+  sdist:
+    runs-on: ubuntu-latest
+    needs: [version-check]
+    steps:
+      - uses: actions/checkout@v6
+
+      - name: Install protoc
+        run: sudo apt-get update && sudo apt-get install -y protobuf-compiler
+
+      # Vendor the canonical proto so the sdist builds standalone from source.
+      - name: Vendor canonical proto into the crate
+        working-directory: fluss-rust
+        run: scripts/vendor-proto.sh
+
+      - uses: PyO3/maturin-action@v1
+        with:
+          working-directory: fluss-rust/bindings/python
+          command: sdist
+          args: -o dist
+
+      - name: Upload sdist
+        uses: actions/upload-artifact@v7
+        with:
+          name: wheels-sdist
+          path: fluss-rust/bindings/python/dist
+
+  wheels:
+    runs-on: ${{ matrix.os }}
+    needs: [version-check]
+    strategy:
+      matrix:
+        include:
+          - { os: windows-latest }
+          - { os: macos-15-intel, target: "x86_64-apple-darwin" }
+          - { os: macos-15, target: "aarch64-apple-darwin" }
+          - { os: ubuntu-latest, target: "x86_64" }
+          - { os: ubuntu-latest, target: "aarch64", manylinux: "manylinux_2_28" }
+    steps:
+      - uses: actions/checkout@v6
+
+      - name: Install protoc (Linux)
+        if: runner.os == 'Linux'
+        run: sudo apt-get update && sudo apt-get install -y protobuf-compiler
+
+      - name: Install protoc (macOS)
+        if: runner.os == 'macOS'
+        run: brew install protobuf
+
+      - name: Install protoc (Windows)
+        if: runner.os == 'Windows'
+        run: choco install protoc -y
+        shell: pwsh
+
+      # Install protoc in manylinux container (x86_64/aarch64); script shared via YAML anchor
+      - uses: PyO3/maturin-action@v1
+        with:
+          working-directory: fluss-rust/bindings/python
+          target: ${{ matrix.target }}
+          command: build
+          args: --release -o dist -i python3.9
+          manylinux: ${{ matrix.manylinux || 'auto' }}
+          before-script-linux: &protoc-install |
+            set -e
+            ARCH=$(uname -m)
+            case "$ARCH" in
+              x86_64)  ZIP=protoc-27.1-linux-x86_64.zip ;;
+              aarch64) ZIP=protoc-27.1-linux-aarch_64.zip ;;
+              *) echo "Unsupported arch $ARCH"; exit 1 ;;
+            esac
+            curl -sLO "https://github.com/protocolbuffers/protobuf/releases/download/v27.1/${ZIP}"
+            python3 -c "import zipfile; zipfile.ZipFile('${ZIP}').extractall('/tmp/protoc_install')"
+            chmod +x /tmp/protoc_install/bin/protoc
+            rm -f "${ZIP}"
+            export PATH="/tmp/protoc_install/bin:$PATH"
+            export PROTOC=/tmp/protoc_install/bin/protoc
+      - uses: PyO3/maturin-action@v1
+        with:
+          working-directory: fluss-rust/bindings/python
+          target: ${{ matrix.target }}
+          command: build
+          args: --release -o dist -i python3.10
+          manylinux: ${{ matrix.manylinux || 'auto' }}
+          before-script-linux: *protoc-install
+      - uses: PyO3/maturin-action@v1
+        with:
+          working-directory: fluss-rust/bindings/python
+          target: ${{ matrix.target }}
+          command: build
+          args: --release -o dist -i python3.11
+          manylinux: ${{ matrix.manylinux || 'auto' }}
+          before-script-linux: *protoc-install
+      - uses: PyO3/maturin-action@v1
+        with:
+          working-directory: fluss-rust/bindings/python
+          target: ${{ matrix.target }}
+          command: build
+          args: --release -o dist -i python3.12
+          manylinux: ${{ matrix.manylinux || 'auto' }}
+          before-script-linux: *protoc-install
+
+      - name: Upload wheels
+        uses: actions/upload-artifact@v7
+        with:
+          name: wheels-${{ matrix.os }}-${{ matrix.target || 'native' }}
+          path: fluss-rust/bindings/python/dist
+
+  release:  # final job: gathers the sdist and wheel artifacts and publishes them (TestPyPI for pre-release tags, PyPI otherwise)
+    name: Publish to PyPI
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+    needs: [version-check, sdist, wheels]
+    if: startsWith(github.ref, 'refs/tags/')  # extra guard; the workflow itself only triggers on "v*" tag pushes
+    steps:
+      - uses: actions/download-artifact@v8
+        with:
+          pattern: wheels-*  # matches wheels-sdist and every wheels-<os>-<target> artifact uploaded by the sdist and wheels jobs
+          merge-multiple: true
+          path: fluss-rust/bindings/python/dist
+
+      - name: Publish to TestPyPI  # pre-release path: taken when the tag ref contains '-' (e.g. v0.1.0-rc1)
+        if: contains(github.ref, '-')
+        uses: pypa/gh-action-pypi-publish@ed0c53931b1dc9bd32cbe73a98c7f6766f8a527e
+        with:
+          repository-url: https://test.pypi.org/legacy/
+          skip-existing: true
+          packages-dir: fluss-rust/bindings/python/dist
+          password: ${{ secrets.TEST_PYPI_API_TOKEN }}
+
+      - name: Publish to PyPI  # release path: taken when the tag ref contains no '-' (e.g. v0.1.0)
+        if: ${{ !contains(github.ref, '-') }}
+        uses: pypa/gh-action-pypi-publish@ed0c53931b1dc9bd32cbe73a98c7f6766f8a527e
+        with:
+          skip-existing: true
+          packages-dir: fluss-rust/bindings/python/dist
+          password: ${{ secrets.PYPI_API_TOKEN }}
diff --git a/.github/workflows/rust-build-and-test.yml b/.github/workflows/rust-build-and-test.yml
new file mode 100644
index 0000000000..d59fadce59
--- /dev/null
+++ b/.github/workflows/rust-build-and-test.yml
@@ -0,0 +1,82 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+name: Rust Build and Tests
+
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - 'fluss-rust/crates/**'
+      - 'fluss-rust/Cargo.toml'
+      - 'fluss-rust/Cargo.lock'
+      - 'fluss-rust/rust-toolchain.toml'
+      - 'fluss-rust/.cargo/**'
+      - 'fluss-rpc/src/main/proto/**'
+      - '.github/workflows/rust-build-and-test.yml'
+  pull_request:
+    branches:
+      - main
+    paths:
+      - 'fluss-rust/crates/**'
+      - 'fluss-rust/Cargo.toml'
+      - 'fluss-rust/Cargo.lock'
+      - 'fluss-rust/rust-toolchain.toml'
+      - 'fluss-rust/.cargo/**'
+      - 'fluss-rpc/src/main/proto/**'
+      - '.github/workflows/rust-build-and-test.yml'
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }}
+  cancel-in-progress: true
+
+defaults:
+  run:
+    working-directory: fluss-rust
+
+jobs:
+  build-and-unit-test:
+    timeout-minutes: 60
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os:
+          - ubuntu-latest
+          - macos-latest
+    steps:
+      - uses: actions/checkout@v6
+
+      - name: Install protoc
+        uses: arduino/setup-protoc@c65c819552d16ad3c9b72d9dfd5ba5237b9c906b # v3.0.0
+        with:
+          repo-token: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Rust Cache
+        uses: Swatinem/rust-cache@c19371144df3bb44fab255c43d04cbc2ab54d1c4 # v2.9.1
+        with:
+          workspaces: fluss-rust
+
+      - name: Build
+        run: cargo build --workspace --all-targets --exclude fluss_python --exclude fluss-cpp --exclude fluss_nif
+
+      - name: Unit Test
+        run: cargo test --all-targets --workspace --exclude fluss_python --exclude fluss-cpp --exclude fluss_nif
+        env:
+          RUST_LOG: DEBUG
+          RUST_BACKTRACE: full
diff --git a/.github/workflows/rust-docs-check.yml b/.github/workflows/rust-docs-check.yml
new file mode 100644
index 0000000000..e2e6e72059
--- /dev/null
+++ b/.github/workflows/rust-docs-check.yml
@@ -0,0 +1,51 @@
+################################################################################
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+# Checks for broken links in the fluss-rust client documentation.
+name: Rust Documentation Check
+permissions:
+  contents: read
+on:
+  pull_request:
+    branches: [main]
+    paths:
+      - 'fluss-rust/website/**'
+      - '.github/workflows/rust-docs-check.yml'
+  push:
+    branches: [main]
+    paths:
+      - 'fluss-rust/website/**'
+      - '.github/workflows/rust-docs-check.yml'
+
+jobs:
+  check-documentation:
+    runs-on: ubuntu-latest
+    defaults:
+      run:
+        working-directory: fluss-rust/website
+    steps:
+      - uses: actions/checkout@v6
+        with:
+          fetch-depth: 0
+      - uses: actions/setup-node@v6
+        with:
+          node-version: 24
+      - name: Install dependencies
+        run: npm install
+      - name: Test build website
+        run: npm run build -- --no-minify
diff --git a/.github/workflows/rust-license-and-format.yml b/.github/workflows/rust-license-and-format.yml
new file mode 100644
index 0000000000..2c2d4f6b41
--- /dev/null
+++ b/.github/workflows/rust-license-and-format.yml
@@ -0,0 +1,109 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Static checks for the Rust workspace: license headers, dependency licenses
+# (cargo-deny), formatting (rustfmt), lints (clippy) and rustdoc warnings.
+name: Rust License and Formatting Check
+
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - 'fluss-rust/crates/**'
+      - 'fluss-rust/bindings/**'
+      - 'fluss-rust/Cargo.toml'
+      - 'fluss-rust/Cargo.lock'
+      - 'fluss-rust/deny.toml'
+      - 'fluss-rust/.licenserc.yaml'
+      - 'fluss-rust/rustfmt.toml'
+      - 'fluss-rust/rust-toolchain.toml'
+      - 'fluss-rpc/src/main/proto/**'
+      - '.github/workflows/rust-license-and-format.yml'
+  pull_request:
+    branches:
+      - main
+    paths:
+      - 'fluss-rust/crates/**'
+      - 'fluss-rust/bindings/**'
+      - 'fluss-rust/Cargo.toml'
+      - 'fluss-rust/Cargo.lock'
+      - 'fluss-rust/deny.toml'
+      - 'fluss-rust/.licenserc.yaml'
+      - 'fluss-rust/rustfmt.toml'
+      - 'fluss-rust/rust-toolchain.toml'
+      - 'fluss-rpc/src/main/proto/**'
+      - '.github/workflows/rust-license-and-format.yml'
+  workflow_dispatch:
+
+# Least privilege, matching the other Rust workflows: no step here writes to the repository.
+# NOTE(review): if license-eye should comment on pull requests (`comment: on-failure` in
+# fluss-rust/.licenserc.yaml), this presumably also needs `pull-requests: write` - confirm.
+permissions:
+  contents: read
+
+# One run per workflow/ref/event; a newer run cancels the one still in progress.
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }}
+  cancel-in-progress: true
+
+defaults:
+  run:
+    working-directory: fluss-rust
+
+jobs:
+  check-license-and-formatting:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v6
+
+      - name: Check License Header
+        uses: apache/skywalking-eyes/header@61275cc80d0798a405cb070f7d3a8aaf7cf2c2c1 # v0.8.0
+        with:
+          config: fluss-rust/.licenserc.yaml
+
+      # NOTE(review): unlike the other third-party actions in this job, this one is
+      # referenced by a mutable tag; consider pinning it to a full commit SHA.
+      - name: Install cargo-deny
+        uses: taiki-e/install-action@v2
+        with:
+          tool: cargo-deny@0.14.22
+
+      - name: Check dependency licenses (Apache-compatible)
+        run: cargo deny check licenses
+
+      - name: Install protoc
+        uses: arduino/setup-protoc@c65c819552d16ad3c9b72d9dfd5ba5237b9c906b # v3.0.0
+        with:
+          repo-token: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Rust Cache
+        uses: Swatinem/rust-cache@c19371144df3bb44fab255c43d04cbc2ab54d1c4 # v2.9.1
+        with:
+          workspaces: fluss-rust
+
+      - name: Format
+        run: cargo fmt --all -- --check
+
+      - name: Clippy
+        run: cargo clippy --all-targets --workspace -- -D warnings
+
+      - name: Rustdoc
+        # fluss_python is excluded: its [lib] name = "fluss" collides with fluss-rs
+        run: cargo doc --workspace --no-deps --exclude fluss_python
+        env:
+          RUSTDOCFLAGS: -D warnings
diff --git a/.github/workflows/rust-release.yml b/.github/workflows/rust-release.yml
new file mode 100644
index 0000000000..d2f8901400
--- /dev/null
+++ b/.github/workflows/rust-release.yml
@@ -0,0 +1,62 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Release workflow for the fluss-rs crate (crates.io).
+# Runs only when a "v*" tag is pushed (e.g. v0.1.0).
+# Tags containing '-' (pre-releases) stop after the dry run; all other tags are published.
+#
+# Requires the repository secret CARGO_REGISTRY_TOKEN for crates.io authentication.
+
+name: Release Rust
+
+on:
+  push:
+    tags:
+      - "v*"  # version-like tags only (v0.1.0, v0.1.0-rc1, ...), never arbitrary tags
+
+defaults:
+  run:
+    working-directory: fluss-rust
+
+jobs:
+  publish:
+    permissions:
+      contents: read
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v6
+
+      - uses: ./.github/actions/verify-tag-version
+
+      - name: Install protoc
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y protobuf-compiler
+
+      # The canonical proto lives in the in-repo fluss-rpc module, which build.rs reads but
+      # which is not part of the published crate; copy it in so the crate builds standalone.
+      - name: Vendor canonical proto into the crate
+        run: scripts/vendor-proto.sh
+
+      - name: Dry run (crates/fluss)
+        run: cargo publish -p fluss-rs --dry-run --allow-dirty
+
+      - name: Publish fluss-rs to crates.io
+        if: startsWith(github.ref, 'refs/tags/') && !contains(github.ref, '-')
+        env:
+          CARGO_REGISTRY_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
+        run: cargo publish -p fluss-rs --allow-dirty
diff --git a/fluss-rust/.cargo/config.toml b/fluss-rust/.cargo/config.toml
new file mode 100644
index 0000000000..57efc7ff75
--- /dev/null
+++ b/fluss-rust/.cargo/config.toml
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Apple Silicon macOS only: pass `-undefined dynamic_lookup` to the linker so that
+# undefined symbols are left to be resolved when the library is loaded.
+# NOTE(review): presumably required by the PyO3-based Python bindings, whose
+# libpython symbols come from the host interpreter - confirm. Intel macOS
+# (x86_64-apple-darwin) is not configured here.
+[target.aarch64-apple-darwin]
+rustflags = [
+    "-C", "link-arg=-undefined",
+    "-C", "link-arg=dynamic_lookup",
+]
diff --git a/fluss-rust/.gitignore b/fluss-rust/.gitignore
new file mode 100644
index 0000000000..eb3a06e6b1
--- /dev/null
+++ b/fluss-rust/.gitignore
@@ -0,0 +1,54 @@
+.DS_Store
+# Generated by Cargo
+# will have compiled files and executables
+debug/
+target/
+
+# These are backup files generated by rustfmt
+**/*.rs.bk
+
+# MSVC Windows builds of rustc generate these, which store debugging information
+*.pdb
+
+# IDE / editor settings (RustRover and other JetBrains IDEs, VS Code)
+#  The whole .idea/ and .vscode/ folders are ignored below.
+#  A finer-grained JetBrains template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be merged into this file if selected .idea files ever need to be tracked.
+.idea/
+.vscode/
+
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+*.dylib
+*.dSYM/
+*.egg-info/
+dist/
+build/
+.venv/
+uv.lock
+
+# CPP
+*CMakeFiles/
+.cache/
+
+# Website (Docusaurus)
+website/node_modules
+website/build
+website/.docusaurus
+website/.cache-loader
+website/.env.local
+website/.env.development.local
+website/.env.test.local
+website/.env.production.local
+website/npm-debug.log*
+website/yarn-debug.log*
+website/yarn-error.log*
+website/package-lock.json
+website/versioned_docs
+website/versioned_sidebars
+website/versions.json
+website/pnpm-lock.yaml
diff --git a/fluss-rust/.licenserc.yaml b/fluss-rust/.licenserc.yaml
new file mode 100644
index 0000000000..a3647d7f27
--- /dev/null
+++ b/fluss-rust/.licenserc.yaml
@@ -0,0 +1,39 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+header:
+  license:
+    spdx-id: Apache-2.0
+    copyright-owner: Apache Software Foundation
+
+  paths:  # only files under fluss-rust/ are subject to the header check
+    - 'fluss-rust/**'
+
+  paths-ignore:
+    # bare (gitignore-style) patterns match the basename at any depth
+    - '.gitignore'
+    - 'Cargo.lock'  # generated by Cargo
+    - 'LICENSE'
+    - 'NOTICE'
+    - 'DISCLAIMER'
+    - 'fluss-rust/bindings/python/fluss/py.typed'
+    - 'fluss-rust/**/mix.lock'
+    - 'fluss-rust/website/**'
+    - '**/*.md'
+    - 'fluss-rust/**/DEPENDENCIES.*.tsv'
+    - 'fluss-rust/**/*.env'
+  comment: on-failure  # NOTE(review): presumably license-eye comments on the PR only when the check fails - confirm
diff --git a/fluss-rust/Cargo.lock b/fluss-rust/Cargo.lock
new file mode 100644
index 0000000000..4570d4d81c
--- /dev/null
+++ b/fluss-rust/Cargo.lock
@@ -0,0 +1,4743 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 4
+
+[[package]]
+name = "ahash"
+version = "0.8.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75"
+dependencies = [
+ "cfg-if",
+ "const-random",
+ "getrandom 0.3.4",
+ "once_cell",
+ "version_check",
+ "zerocopy",
+]
+
+[[package]]
+name = "aho-corasick"
+version = "1.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301"
+dependencies = [
+ "memchr",
+]
+
+[[package]]
+name = "android_system_properties"
+version = "0.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311"
+dependencies = [
+ "libc",
+]
+
+[[package]]
+name = "anstream"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "824a212faf96e9acacdbd09febd34438f8f711fb84e09a8916013cd7815ca28d"
+dependencies = [
+ "anstyle",
+ "anstyle-parse",
+ "anstyle-query",
+ "anstyle-wincon",
+ "colorchoice",
+ "is_terminal_polyfill",
+ "utf8parse",
+]
+
+[[package]]
+name = "anstyle"
+version = "1.0.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000"
+
+[[package]]
+name = "anstyle-parse"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "52ce7f38b242319f7cabaa6813055467063ecdc9d355bbb4ce0c68908cd8130e"
+dependencies = [
+ "utf8parse",
+]
+
+[[package]]
+name = "anstyle-query"
+version = "1.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc"
+dependencies = [
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "anstyle-wincon"
+version = "3.0.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d"
+dependencies = [
+ "anstyle",
+ "once_cell_polyfill",
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "anyhow"
+version = "1.0.102"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c"
+
+[[package]]
+name = "arrow"
+version = "57.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e4754a624e5ae42081f464514be454b39711daae0458906dacde5f4c632f33a8"
+dependencies = [
+ "arrow-arith",
+ "arrow-array",
+ "arrow-buffer",
+ "arrow-cast",
+ "arrow-csv",
+ "arrow-data",
+ "arrow-ipc",
+ "arrow-json",
+ "arrow-ord",
+ "arrow-row",
+ "arrow-schema",
+ "arrow-select",
+ "arrow-string",
+]
+
+[[package]]
+name = "arrow-arith"
+version = "57.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f7b3141e0ec5145a22d8694ea8b6d6f69305971c4fa1c1a13ef0195aef2d678b"
+dependencies = [
+ "arrow-array",
+ "arrow-buffer",
+ "arrow-data",
+ "arrow-schema",
+ "chrono",
+ "num-traits",
+]
+
+[[package]]
+name = "arrow-array"
+version = "57.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4c8955af33b25f3b175ee10af580577280b4bd01f7e823d94c7cdef7cf8c9aef"
+dependencies = [
+ "ahash",
+ "arrow-buffer",
+ "arrow-data",
+ "arrow-schema",
+ "chrono",
+ "half",
+ "hashbrown 0.16.1",
+ "num-complex",
+ "num-integer",
+ "num-traits",
+]
+
+[[package]]
+name = "arrow-buffer"
+version = "57.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c697ddca96183182f35b3a18e50b9110b11e916d7b7799cbfd4d34662f2c56c2"
+dependencies = [
+ "bytes",
+ "half",
+ "num-bigint",
+ "num-traits",
+]
+
+[[package]]
+name = "arrow-cast"
+version = "57.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "646bbb821e86fd57189c10b4fcdaa941deaf4181924917b0daa92735baa6ada5"
+dependencies = [
+ "arrow-array",
+ "arrow-buffer",
+ "arrow-data",
+ "arrow-ord",
+ "arrow-schema",
+ "arrow-select",
+ "atoi",
+ "base64 0.22.1",
+ "chrono",
+ "half",
+ "lexical-core",
+ "num-traits",
+ "ryu",
+]
+
+[[package]]
+name = "arrow-csv"
+version = "57.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8da746f4180004e3ce7b83c977daf6394d768332349d3d913998b10a120b790a"
+dependencies = [
+ "arrow-array",
+ "arrow-cast",
+ "arrow-schema",
+ "chrono",
+ "csv",
+ "csv-core",
+ "regex",
+]
+
+[[package]]
+name = "arrow-data"
+version = "57.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1fdd994a9d28e6365aa78e15da3f3950c0fdcea6b963a12fa1c391afb637b304"
+dependencies = [
+ "arrow-buffer",
+ "arrow-schema",
+ "half",
+ "num-integer",
+ "num-traits",
+]
+
+[[package]]
+name = "arrow-ipc"
+version = "57.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "abf7df950701ab528bf7c0cf7eeadc0445d03ef5d6ffc151eaae6b38a58feff1"
+dependencies = [
+ "arrow-array",
+ "arrow-buffer",
+ "arrow-data",
+ "arrow-schema",
+ "arrow-select",
+ "flatbuffers",
+ "lz4_flex",
+ "zstd",
+]
+
+[[package]]
+name = "arrow-json"
+version = "57.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0ff8357658bedc49792b13e2e862b80df908171275f8e6e075c460da5ee4bf86"
+dependencies = [
+ "arrow-array",
+ "arrow-buffer",
+ "arrow-cast",
+ "arrow-data",
+ "arrow-schema",
+ "chrono",
+ "half",
+ "indexmap 2.13.1",
+ "itoa",
+ "lexical-core",
+ "memchr",
+ "num-traits",
+ "ryu",
+ "serde_core",
+ "serde_json",
+ "simdutf8",
+]
+
+[[package]]
+name = "arrow-ord"
+version = "57.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f7d8f1870e03d4cbed632959498bcc84083b5a24bded52905ae1695bd29da45b"
+dependencies = [
+ "arrow-array",
+ "arrow-buffer",
+ "arrow-data",
+ "arrow-schema",
+ "arrow-select",
+]
+
+[[package]]
+name = "arrow-pyarrow"
+version = "57.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d18c442b4c266aaf3d7f7dd40fd7ae058cef7f113b00ff0cd8256e1e218ec544"
+dependencies = [
+ "arrow-array",
+ "arrow-data",
+ "arrow-schema",
+ "pyo3",
+]
+
+[[package]]
+name = "arrow-row"
+version = "57.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "18228633bad92bff92a95746bbeb16e5fc318e8382b75619dec26db79e4de4c0"
+dependencies = [
+ "arrow-array",
+ "arrow-buffer",
+ "arrow-data",
+ "arrow-schema",
+ "half",
+]
+
+[[package]]
+name = "arrow-schema"
+version = "57.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8c872d36b7bf2a6a6a2b40de9156265f0242910791db366a2c17476ba8330d68"
+dependencies = [
+ "bitflags",
+]
+
+[[package]]
+name = "arrow-select"
+version = "57.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "68bf3e3efbd1278f770d67e5dc410257300b161b93baedb3aae836144edcaf4b"
+dependencies = [
+ "ahash",
+ "arrow-array",
+ "arrow-buffer",
+ "arrow-data",
+ "arrow-schema",
+ "num-traits",
+]
+
+[[package]]
+name = "arrow-string"
+version = "57.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "85e968097061b3c0e9fe3079cf2e703e487890700546b5b0647f60fca1b5a8d8"
+dependencies = [
+ "arrow-array",
+ "arrow-buffer",
+ "arrow-data",
+ "arrow-schema",
+ "arrow-select",
+ "memchr",
+ "num-traits",
+ "regex",
+ "regex-syntax",
+]
+
+[[package]]
+name = "astral-tokio-tar"
+version = "0.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3c23f3af104b40a3430ccb90ed5f7bd877a8dc5c26fc92fde51a22b40890dcf9"
+dependencies = [
+ "filetime",
+ "futures-core",
+ "libc",
+ "portable-atomic",
+ "rustc-hash",
+ "tokio",
+ "tokio-stream",
+ "xattr",
+]
+
+[[package]]
+name = "async-stream"
+version = "0.3.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0b5a71a6f37880a80d1d7f19efd781e4b5de42c88f0722cc13bcb6cc2cfe8476"
+dependencies = [
+ "async-stream-impl",
+ "futures-core",
+ "pin-project-lite",
+]
+
+[[package]]
+name = "async-stream-impl"
+version = "0.3.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "async-trait"
+version = "0.1.89"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "atoi"
+version = "2.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f28d99ec8bfea296261ca1af174f24225171fea9664ba9003cbebee704810528"
+dependencies = [
+ "num-traits",
+]
+
+[[package]]
+name = "atomic-waker"
+version = "1.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0"
+
+[[package]]
+name = "autocfg"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
+
+[[package]]
+name = "axum"
+version = "0.8.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8b52af3cb4058c895d37317bb27508dccc8e5f2d39454016b297bf4a400597b8"
+dependencies = [
+ "axum-core",
+ "bytes",
+ "futures-util",
+ "http",
+ "http-body",
+ "http-body-util",
+ "itoa",
+ "matchit",
+ "memchr",
+ "mime",
+ "percent-encoding",
+ "pin-project-lite",
+ "serde_core",
+ "sync_wrapper",
+ "tower",
+ "tower-layer",
+ "tower-service",
+]
+
+[[package]]
+name = "axum-core"
+version = "0.5.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "08c78f31d7b1291f7ee735c1c6780ccde7785daae9a9206026862dab7d8792d1"
+dependencies = [
+ "bytes",
+ "futures-core",
+ "http",
+ "http-body",
+ "http-body-util",
+ "mime",
+ "pin-project-lite",
+ "sync_wrapper",
+ "tower-layer",
+ "tower-service",
+]
+
+[[package]]
+name = "backon"
+version = "1.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cffb0e931875b666fc4fcb20fee52e9bbd1ef836fd9e9e04ec21555f9f85f7ef"
+dependencies = [
+ "fastrand",
+ "gloo-timers",
+ "tokio",
+]
+
+[[package]]
+name = "base64"
+version = "0.21.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567"
+
+[[package]]
+name = "base64"
+version = "0.22.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6"
+
+[[package]]
+name = "bigdecimal"
+version = "0.4.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4d6867f1565b3aad85681f1015055b087fcfd840d6aeee6eee7f2da317603695"
+dependencies = [
+ "autocfg",
+ "libm",
+ "num-bigint",
+ "num-integer",
+ "num-traits",
+ "serde",
+]
+
+[[package]]
+name = "bitflags"
+version = "2.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "843867be96c8daad0d758b57df9392b6d8d271134fce549de6ce169ff98a92af"
+
+[[package]]
+name = "bitvec"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1bc2832c24239b0141d5674bb9174f9d68a8b5b3f2753311927c172ca46f7e9c"
+dependencies = [
+ "funty",
+ "radium",
+ "tap",
+ "wyz",
+]
+
+[[package]]
+name = "block-buffer"
+version = "0.10.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71"
+dependencies = [
+ "generic-array",
+]
+
+[[package]]
+name = "bollard"
+version = "0.20.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ee04c4c84f1f811b017f2fbb7dd8815c976e7ca98593de9c1e2afad0f636bff4"
+dependencies = [
+ "async-stream",
+ "base64 0.22.1",
+ "bitflags",
+ "bollard-buildkit-proto",
+ "bollard-stubs",
+ "bytes",
+ "futures-core",
+ "futures-util",
+ "hex",
+ "home",
+ "http",
+ "http-body-util",
+ "hyper",
+ "hyper-named-pipe",
+ "hyper-rustls",
+ "hyper-util",
+ "hyperlocal",
+ "log",
+ "num",
+ "pin-project-lite",
+ "rand 0.9.3",
+ "rustls",
+ "rustls-native-certs",
+ "rustls-pki-types",
+ "serde",
+ "serde_derive",
+ "serde_json",
+ "serde_urlencoded",
+ "thiserror 2.0.18",
+ "time",
+ "tokio",
+ "tokio-stream",
+ "tokio-util",
+ "tonic",
+ "tower-service",
+ "url",
+ "winapi",
+]
+
+[[package]]
+name = "bollard-buildkit-proto"
+version = "0.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "85a885520bf6249ab931a764ffdb87b0ceef48e6e7d807cfdb21b751e086e1ad"
+dependencies = [
+ "prost",
+ "prost-types",
+ "tonic",
+ "tonic-prost",
+ "ureq",
+]
+
+[[package]]
+name = "bollard-stubs"
+version = "1.52.1-rc.29.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0f0a8ca8799131c1837d1282c3f81f31e76ceb0ce426e04a7fe1ccee3287c066"
+dependencies = [
+ "base64 0.22.1",
+ "bollard-buildkit-proto",
+ "bytes",
+ "prost",
+ "serde",
+ "serde_json",
+ "serde_repr",
+ "time",
+]
+
+[[package]]
+name = "bumpalo"
+version = "3.20.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb"
+
+[[package]]
+name = "byteorder"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
+
+[[package]]
+name = "bytes"
+version = "1.11.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33"
+
+[[package]]
+name = "cc"
+version = "1.2.59"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b7a4d3ec6524d28a329fc53654bbadc9bdd7b0431f5d65f1a56ffb28a1ee5283"
+dependencies = [
+ "find-msvc-tools",
+ "jobserver",
+ "libc",
+ "shlex",
+]
+
+[[package]]
+name = "cfg-if"
+version = "1.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
+
+[[package]]
+name = "cfg_aliases"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724"
+
+[[package]]
+name = "chrono"
+version = "0.4.44"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c673075a2e0e5f4a1dde27ce9dee1ea4558c7ffe648f576438a20ca1d2acc4b0"
+dependencies = [
+ "iana-time-zone",
+ "js-sys",
+ "num-traits",
+ "serde",
+ "wasm-bindgen",
+ "windows-link",
+]
+
+[[package]]
+name = "clap"
+version = "4.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b193af5b67834b676abd72466a96c1024e6a6ad978a1f484bd90b85c94041351"
+dependencies = [
+ "clap_builder",
+ "clap_derive",
+]
+
+[[package]]
+name = "clap_builder"
+version = "4.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "714a53001bf66416adb0e2ef5ac857140e7dc3a0c48fb28b2f10762fc4b5069f"
+dependencies = [
+ "anstream",
+ "anstyle",
+ "clap_lex",
+ "strsim",
+]
+
+[[package]]
+name = "clap_derive"
+version = "4.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1110bd8a634a1ab8cb04345d8d878267d57c3cf1b38d91b71af6686408bbca6a"
+dependencies = [
+ "heck",
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "clap_lex"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9"
+
+[[package]]
+name = "codespan-reporting"
+version = "0.13.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "af491d569909a7e4dee0ad7db7f5341fef5c614d5b8ec8cf765732aba3cff681"
+dependencies = [
+ "serde",
+ "termcolor",
+ "unicode-width",
+]
+
+[[package]]
+name = "colorchoice"
+version = "1.0.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570"
+
+[[package]]
+name = "const-oid"
+version = "0.9.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c2459377285ad874054d797f3ccebf984978aa39129f6eafde5cdc8315b612f8"
+
+[[package]]
+name = "const-random"
+version = "0.1.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "87e00182fe74b066627d63b85fd550ac2998d4b0bd86bfed477a0ae4c7c71359"
+dependencies = [
+ "const-random-macro",
+]
+
+[[package]]
+name = "const-random-macro"
+version = "0.1.16"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e"
+dependencies = [
+ "getrandom 0.2.17",
+ "once_cell",
+ "tiny-keccak",
+]
+
+[[package]]
+name = "core-foundation"
+version = "0.10.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b2a6cd9ae233e7f62ba4e9353e81a88df7fc8a5987b8d445b4d90c879bd156f6"
+dependencies = [
+ "core-foundation-sys",
+ "libc",
+]
+
+[[package]]
+name = "core-foundation-sys"
+version = "0.8.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b"
+
+[[package]]
+name = "cpufeatures"
+version = "0.2.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280"
+dependencies = [
+ "libc",
+]
+
+[[package]]
+name = "crc32c"
+version = "0.6.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3a47af21622d091a8f0fb295b88bc886ac74efcc613efc19f5d0b21de5c89e47"
+dependencies = [
+ "rustc_version",
+]
+
+[[package]]
+name = "crossbeam-epoch"
+version = "0.9.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
+dependencies = [
+ "crossbeam-utils",
+]
+
+[[package]]
+name = "crossbeam-utils"
+version = "0.8.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
+
+[[package]]
+name = "crunchy"
+version = "0.2.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5"
+
+[[package]]
+name = "crypto-common"
+version = "0.1.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a"
+dependencies = [
+ "generic-array",
+ "typenum",
+]
+
+[[package]]
+name = "csv"
+version = "1.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "52cd9d68cf7efc6ddfaaee42e7288d3a99d613d4b50f76ce9827ae0c6e14f938"
+dependencies = [
+ "csv-core",
+ "itoa",
+ "ryu",
+ "serde_core",
+]
+
+[[package]]
+name = "csv-core"
+version = "0.1.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "704a3c26996a80471189265814dbc2c257598b96b8a7feae2d31ace646bb9782"
+dependencies = [
+ "memchr",
+]
+
+[[package]]
+name = "cxx"
+version = "1.0.194"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "747d8437319e3a2f43d93b341c137927ca70c0f5dabeea7a005a73665e247c7e"
+dependencies = [
+ "cc",
+ "cxx-build",
+ "cxxbridge-cmd",
+ "cxxbridge-flags",
+ "cxxbridge-macro",
+ "foldhash 0.2.0",
+ "link-cplusplus",
+]
+
+[[package]]
+name = "cxx-build"
+version = "1.0.194"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b0f4697d190a142477b16aef7da8a99bfdc41e7e8b1687583c0d23a79c7afc1e"
+dependencies = [
+ "cc",
+ "codespan-reporting",
+ "indexmap 2.13.1",
+ "proc-macro2",
+ "quote",
+ "scratch",
+ "syn",
+]
+
+[[package]]
+name = "cxxbridge-cmd"
+version = "1.0.194"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d0956799fa8678d4c50eed028f2de1c0552ae183c76e976cf7ca8c4e36a7c328"
+dependencies = [
+ "clap",
+ "codespan-reporting",
+ "indexmap 2.13.1",
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "cxxbridge-flags"
+version = "1.0.194"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "23384a836ab4f0ad98ace7e3955ad2de39de42378ab487dc28d3990392cb283a"
+
+[[package]]
+name = "cxxbridge-macro"
+version = "1.0.194"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e6acc6b5822b9526adfb4fc377b67128fdd60aac757cc4a741a6278603f763cf"
+dependencies = [
+ "indexmap 2.13.1",
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "darling"
+version = "0.23.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "25ae13da2f202d56bd7f91c25fba009e7717a1e4a1cc98a76d844b65ae912e9d"
+dependencies = [
+ "darling_core",
+ "darling_macro",
+]
+
+[[package]]
+name = "darling_core"
+version = "0.23.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9865a50f7c335f53564bb694ef660825eb8610e0a53d3e11bf1b0d3df31e03b0"
+dependencies = [
+ "ident_case",
+ "proc-macro2",
+ "quote",
+ "strsim",
+ "syn",
+]
+
+[[package]]
+name = "darling_macro"
+version = "0.23.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ac3984ec7bd6cfa798e62b4a642426a5be0e68f9401cfc2a01e3fa9ea2fcdb8d"
+dependencies = [
+ "darling_core",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "dashmap"
+version = "6.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf"
+dependencies = [
+ "cfg-if",
+ "crossbeam-utils",
+ "hashbrown 0.14.5",
+ "lock_api",
+ "once_cell",
+ "parking_lot_core",
+]
+
+[[package]]
+name = "delegate"
+version = "0.13.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "780eb241654bf097afb00fc5f054a09b687dad862e485fdcf8399bb056565370"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "deranged"
+version = "0.5.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7cd812cc2bc1d69d4764bd80df88b4317eaef9e773c75226407d9bc0876b211c"
+dependencies = [
+ "powerfmt",
+ "serde_core",
+]
+
+[[package]]
+name = "digest"
+version = "0.10.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292"
+dependencies = [
+ "block-buffer",
+ "const-oid",
+ "crypto-common",
+ "subtle",
+]
+
+[[package]]
+name = "displaydoc"
+version = "0.2.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "dlv-list"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "442039f5147480ba31067cb00ada1adae6892028e40e45fc5de7b7df6dcc1b5f"
+dependencies = [
+ "const-random",
+]
+
+[[package]]
+name = "docker_credential"
+version = "1.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1d89dfcba45b4afad7450a99b39e751590463e45c04728cf555d36bb66940de8"
+dependencies = [
+ "base64 0.21.7",
+ "serde",
+ "serde_json",
+]
+
+[[package]]
+name = "dyn-clone"
+version = "1.0.20"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d0881ea181b1df73ff77ffaaf9c7544ecc11e82fba9b5f27b262a3c73a332555"
+
+[[package]]
+name = "either"
+version = "1.15.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719"
+
+[[package]]
+name = "endian-type"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c34f04666d835ff5d62e058c3995147c06f42fe86ff053337632bca83e42702d"
+
+[[package]]
+name = "equivalent"
+version = "1.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f"
+
+[[package]]
+name = "erased-serde"
+version = "0.4.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d2add8a07dd6a8d93ff627029c51de145e12686fbc36ecb298ac22e74cf02dec"
+dependencies = [
+ "serde",
+ "serde_core",
+ "typeid",
+]
+
+[[package]]
+name = "errno"
+version = "0.3.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb"
+dependencies = [
+ "libc",
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "etcetera"
+version = "0.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "de48cc4d1c1d97a20fd819def54b890cadde72ed3ad0c614822a0a433361be96"
+dependencies = [
+ "cfg-if",
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "fastrand"
+version = "2.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a043dc74da1e37d6afe657061213aa6f425f855399a11d3463c6ecccc4dfda1f"
+
+[[package]]
+name = "ferroid"
+version = "0.8.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bb330bbd4cb7a5b9f559427f06f98a4f853a137c8298f3bd3f8ca57663e21986"
+dependencies = [
+ "portable-atomic",
+ "rand 0.9.3",
+ "web-time",
+]
+
+[[package]]
+name = "filetime"
+version = "0.2.27"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f98844151eee8917efc50bd9e8318cb963ae8b297431495d3f758616ea5c57db"
+dependencies = [
+ "cfg-if",
+ "libc",
+ "libredox",
+]
+
+[[package]]
+name = "find-msvc-tools"
+version = "0.1.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582"
+
+[[package]]
+name = "fixedbitset"
+version = "0.5.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99"
+
+[[package]]
+name = "flatbuffers"
+version = "25.12.19"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "35f6839d7b3b98adde531effaf34f0c2badc6f4735d26fe74709d8e513a96ef3"
+dependencies = [
+ "bitflags",
+ "rustc_version",
+]
+
+[[package]]
+name = "fluss-cpp"
+version = "1.0.0"
+dependencies = [
+ "anyhow",
+ "arrow",
+ "bigdecimal",
+ "cxx",
+ "cxx-build",
+ "fluss-rs",
+ "tokio",
+]
+
+[[package]]
+name = "fluss-examples"
+version = "1.0.0"
+dependencies = [
+ "clap",
+ "fluss-rs",
+ "tikv-jemallocator",
+ "tokio",
+]
+
+[[package]]
+name = "fluss-rs"
+version = "1.0.0"
+dependencies = [
+ "arrow",
+ "arrow-schema",
+ "bigdecimal",
+ "bitvec",
+ "byteorder",
+ "bytes",
+ "clap",
+ "crc32c",
+ "dashmap",
+ "delegate",
+ "fluss-test-cluster",
+ "futures",
+ "jiff",
+ "linked-hash-map",
+ "log",
+ "metrics",
+ "metrics-util",
+ "opendal",
+ "ordered-float",
+ "parking_lot",
+ "parse-display 0.10.0",
+ "prost",
+ "prost-build",
+ "rand 0.9.3",
+ "scopeguard",
+ "serde",
+ "serde_json",
+ "snafu",
+ "strum",
+ "strum_macros",
+ "tempfile",
+ "thiserror 1.0.69",
+ "tokio",
+ "url",
+ "uuid",
+]
+
+[[package]]
+name = "fluss-test-cluster"
+version = "1.0.0"
+dependencies = [
+ "clap",
+ "fluss-rs",
+ "serde",
+ "serde_json",
+ "testcontainers",
+ "tokio",
+]
+
+[[package]]
+name = "fluss_nif"
+version = "1.0.0"
+dependencies = [
+ "bigdecimal",
+ "fluss-rs",
+ "rustler",
+ "tokio",
+]
+
+[[package]]
+name = "fluss_python"
+version = "1.0.0"
+dependencies = [
+ "arrow",
+ "arrow-array",
+ "arrow-pyarrow",
+ "arrow-schema",
+ "bigdecimal",
+ "fluss-rs",
+ "indexmap 2.13.1",
+ "jiff",
+ "pyo3",
+ "pyo3-async-runtimes",
+ "tokio",
+]
+
+[[package]]
+name = "fnv"
+version = "1.0.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
+
+[[package]]
+name = "foldhash"
+version = "0.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2"
+
+[[package]]
+name = "foldhash"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb"
+
+[[package]]
+name = "form_urlencoded"
+version = "1.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cb4cb245038516f5f85277875cdaa4f7d2c9a0fa0468de06ed190163b1581fcf"
+dependencies = [
+ "percent-encoding",
+]
+
+[[package]]
+name = "funty"
+version = "2.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c"
+
+[[package]]
+name = "futures"
+version = "0.3.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8b147ee9d1f6d097cef9ce628cd2ee62288d963e16fb287bd9286455b241382d"
+dependencies = [
+ "futures-channel",
+ "futures-core",
+ "futures-executor",
+ "futures-io",
+ "futures-sink",
+ "futures-task",
+ "futures-util",
+]
+
+[[package]]
+name = "futures-channel"
+version = "0.3.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "07bbe89c50d7a535e539b8c17bc0b49bdb77747034daa8087407d655f3f7cc1d"
+dependencies = [
+ "futures-core",
+ "futures-sink",
+]
+
+[[package]]
+name = "futures-core"
+version = "0.3.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7e3450815272ef58cec6d564423f6e755e25379b217b0bc688e295ba24df6b1d"
+
+[[package]]
+name = "futures-executor"
+version = "0.3.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "baf29c38818342a3b26b5b923639e7b1f4a61fc5e76102d4b1981c6dc7a7579d"
+dependencies = [
+ "futures-core",
+ "futures-task",
+ "futures-util",
+]
+
+[[package]]
+name = "futures-io"
+version = "0.3.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cecba35d7ad927e23624b22ad55235f2239cfa44fd10428eecbeba6d6a717718"
+
+[[package]]
+name = "futures-macro"
+version = "0.3.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e835b70203e41293343137df5c0664546da5745f82ec9b84d40be8336958447b"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "futures-sink"
+version = "0.3.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c39754e157331b013978ec91992bde1ac089843443c49cbc7f46150b0fad0893"
+
+[[package]]
+name = "futures-task"
+version = "0.3.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "037711b3d59c33004d3856fbdc83b99d4ff37a24768fa1be9ce3538a1cde4393"
+
+[[package]]
+name = "futures-util"
+version = "0.3.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "389ca41296e6190b48053de0321d02a77f32f8a5d2461dd38762c0593805c6d6"
+dependencies = [
+ "futures-channel",
+ "futures-core",
+ "futures-io",
+ "futures-macro",
+ "futures-sink",
+ "futures-task",
+ "memchr",
+ "pin-project-lite",
+ "slab",
+]
+
+[[package]]
+name = "generic-array"
+version = "0.14.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a"
+dependencies = [
+ "typenum",
+ "version_check",
+]
+
+[[package]]
+name = "getrandom"
+version = "0.2.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0"
+dependencies = [
+ "cfg-if",
+ "js-sys",
+ "libc",
+ "wasi",
+ "wasm-bindgen",
+]
+
+[[package]]
+name = "getrandom"
+version = "0.3.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd"
+dependencies = [
+ "cfg-if",
+ "js-sys",
+ "libc",
+ "r-efi 5.3.0",
+ "wasip2",
+ "wasm-bindgen",
+]
+
+[[package]]
+name = "getrandom"
+version = "0.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555"
+dependencies = [
+ "cfg-if",
+ "libc",
+ "r-efi 6.0.0",
+ "wasip2",
+ "wasip3",
+]
+
+[[package]]
+name = "gloo-timers"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bbb143cf96099802033e0d4f4963b19fd2e0b728bcf076cd9cf7f6634f092994"
+dependencies = [
+ "futures-channel",
+ "futures-core",
+ "js-sys",
+ "wasm-bindgen",
+]
+
+[[package]]
+name = "h2"
+version = "0.4.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2f44da3a8150a6703ed5d34e164b875fd14c2cdab9af1252a9a1020bde2bdc54"
+dependencies = [
+ "atomic-waker",
+ "bytes",
+ "fnv",
+ "futures-core",
+ "futures-sink",
+ "http",
+ "indexmap 2.13.1",
+ "slab",
+ "tokio",
+ "tokio-util",
+ "tracing",
+]
+
+[[package]]
+name = "half"
+version = "2.7.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b"
+dependencies = [
+ "cfg-if",
+ "crunchy",
+ "num-traits",
+ "zerocopy",
+]
+
+[[package]]
+name = "hashbrown"
+version = "0.12.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888"
+
+[[package]]
+name = "hashbrown"
+version = "0.14.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1"
+
+[[package]]
+name = "hashbrown"
+version = "0.15.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1"
+dependencies = [
+ "foldhash 0.1.5",
+]
+
+[[package]]
+name = "hashbrown"
+version = "0.16.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100"
+dependencies = [
+ "foldhash 0.2.0",
+]
+
+[[package]]
+name = "heck"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
+
+[[package]]
+name = "hex"
+version = "0.4.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70"
+
+[[package]]
+name = "hmac"
+version = "0.12.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e"
+dependencies = [
+ "digest",
+]
+
+[[package]]
+name = "home"
+version = "0.5.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cc627f471c528ff0c4a49e1d5e60450c8f6461dd6d10ba9dcd3a61d3dff7728d"
+dependencies = [
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "http"
+version = "1.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e3ba2a386d7f85a81f119ad7498ebe444d2e22c2af0b86b069416ace48b3311a"
+dependencies = [
+ "bytes",
+ "itoa",
+]
+
+[[package]]
+name = "http-body"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184"
+dependencies = [
+ "bytes",
+ "http",
+]
+
+[[package]]
+name = "http-body-util"
+version = "0.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a"
+dependencies = [
+ "bytes",
+ "futures-core",
+ "http",
+ "http-body",
+ "pin-project-lite",
+]
+
+[[package]]
+name = "httparse"
+version = "1.10.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87"
+
+[[package]]
+name = "httpdate"
+version = "1.0.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9"
+
+[[package]]
+name = "hyper"
+version = "1.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6299f016b246a94207e63da54dbe807655bf9e00044f73ded42c3ac5305fbcca"
+dependencies = [
+ "atomic-waker",
+ "bytes",
+ "futures-channel",
+ "futures-core",
+ "h2",
+ "http",
+ "http-body",
+ "httparse",
+ "httpdate",
+ "itoa",
+ "pin-project-lite",
+ "smallvec",
+ "tokio",
+ "want",
+]
+
+[[package]]
+name = "hyper-named-pipe"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "73b7d8abf35697b81a825e386fc151e0d503e8cb5fcb93cc8669c376dfd6f278"
+dependencies = [
+ "hex",
+ "hyper",
+ "hyper-util",
+ "pin-project-lite",
+ "tokio",
+ "tower-service",
+ "winapi",
+]
+
+[[package]]
+name = "hyper-rustls"
+version = "0.27.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e3c93eb611681b207e1fe55d5a71ecf91572ec8a6705cdb6857f7d8d5242cf58"
+dependencies = [
+ "http",
+ "hyper",
+ "hyper-util",
+ "rustls",
+ "rustls-pki-types",
+ "tokio",
+ "tokio-rustls",
+ "tower-service",
+ "webpki-roots",
+]
+
+[[package]]
+name = "hyper-timeout"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2b90d566bffbce6a75bd8b09a05aa8c2cb1fabb6cb348f8840c9e4c90a0d83b0"
+dependencies = [
+ "hyper",
+ "hyper-util",
+ "pin-project-lite",
+ "tokio",
+ "tower-service",
+]
+
+[[package]]
+name = "hyper-util"
+version = "0.1.20"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "96547c2556ec9d12fb1578c4eaf448b04993e7fb79cbaad930a656880a6bdfa0"
+dependencies = [
+ "base64 0.22.1",
+ "bytes",
+ "futures-channel",
+ "futures-util",
+ "http",
+ "http-body",
+ "hyper",
+ "ipnet",
+ "libc",
+ "percent-encoding",
+ "pin-project-lite",
+ "socket2",
+ "tokio",
+ "tower-service",
+ "tracing",
+]
+
+[[package]]
+name = "hyperlocal"
+version = "0.9.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "986c5ce3b994526b3cd75578e62554abd09f0899d6206de48b3e96ab34ccc8c7"
+dependencies = [
+ "hex",
+ "http-body-util",
+ "hyper",
+ "hyper-util",
+ "pin-project-lite",
+ "tokio",
+ "tower-service",
+]
+
+[[package]]
+name = "iana-time-zone"
+version = "0.1.65"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e31bc9ad994ba00e440a8aa5c9ef0ec67d5cb5e5cb0cc7f8b744a35b389cc470"
+dependencies = [
+ "android_system_properties",
+ "core-foundation-sys",
+ "iana-time-zone-haiku",
+ "js-sys",
+ "log",
+ "wasm-bindgen",
+ "windows-core",
+]
+
+[[package]]
+name = "iana-time-zone-haiku"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f"
+dependencies = [
+ "cc",
+]
+
+[[package]]
+name = "icu_collections"
+version = "2.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2984d1cd16c883d7935b9e07e44071dca8d917fd52ecc02c04d5fa0b5a3f191c"
+dependencies = [
+ "displaydoc",
+ "potential_utf",
+ "utf8_iter",
+ "yoke",
+ "zerofrom",
+ "zerovec",
+]
+
+[[package]]
+name = "icu_locale_core"
+version = "2.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "92219b62b3e2b4d88ac5119f8904c10f8f61bf7e95b640d25ba3075e6cac2c29"
+dependencies = [
+ "displaydoc",
+ "litemap",
+ "tinystr",
+ "writeable",
+ "zerovec",
+]
+
+[[package]]
+name = "icu_normalizer"
+version = "2.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c56e5ee99d6e3d33bd91c5d85458b6005a22140021cc324cea84dd0e72cff3b4"
+dependencies = [
+ "icu_collections",
+ "icu_normalizer_data",
+ "icu_properties",
+ "icu_provider",
+ "smallvec",
+ "zerovec",
+]
+
+[[package]]
+name = "icu_normalizer_data"
+version = "2.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "da3be0ae77ea334f4da67c12f149704f19f81d1adf7c51cf482943e84a2bad38"
+
+[[package]]
+name = "icu_properties"
+version = "2.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bee3b67d0ea5c2cca5003417989af8996f8604e34fb9ddf96208a033901e70de"
+dependencies = [
+ "icu_collections",
+ "icu_locale_core",
+ "icu_properties_data",
+ "icu_provider",
+ "zerotrie",
+ "zerovec",
+]
+
+[[package]]
+name = "icu_properties_data"
+version = "2.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8e2bbb201e0c04f7b4b3e14382af113e17ba4f63e2c9d2ee626b720cbce54a14"
+
+[[package]]
+name = "icu_provider"
+version = "2.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "139c4cf31c8b5f33d7e199446eff9c1e02decfc2f0eec2c8d71f65befa45b421"
+dependencies = [
+ "displaydoc",
+ "icu_locale_core",
+ "writeable",
+ "yoke",
+ "zerofrom",
+ "zerotrie",
+ "zerovec",
+]
+
+[[package]]
+name = "id-arena"
+version = "2.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954"
+
+[[package]]
+name = "ident_case"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39"
+
+[[package]]
+name = "idna"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3b0875f23caa03898994f6ddc501886a45c7d3d62d04d2d90788d47be1b1e4de"
+dependencies = [
+ "idna_adapter",
+ "smallvec",
+ "utf8_iter",
+]
+
+[[package]]
+name = "idna_adapter"
+version = "1.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3acae9609540aa318d1bc588455225fb2085b9ed0c4f6bd0d9d5bcd86f1a0344"
+dependencies = [
+ "icu_normalizer",
+ "icu_properties",
+]
+
+[[package]]
+name = "indexmap"
+version = "1.9.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99"
+dependencies = [
+ "autocfg",
+ "hashbrown 0.12.3",
+ "serde",
+]
+
+[[package]]
+name = "indexmap"
+version = "2.13.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "45a8a2b9cb3e0b0c1803dbb0758ffac5de2f425b23c28f518faabd9d805342ff"
+dependencies = [
+ "equivalent",
+ "hashbrown 0.16.1",
+ "serde",
+ "serde_core",
+]
+
+[[package]]
+name = "indoc"
+version = "2.0.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "79cf5c93f93228cf8efb3ba362535fb11199ac548a09ce117c9b1adc3030d706"
+dependencies = [
+ "rustversion",
+]
+
+[[package]]
+name = "inventory"
+version = "0.3.24"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a4f0c30c76f2f4ccee3fe55a2435f691ca00c0e4bd87abe4f4a851b1d4dac39b"
+dependencies = [
+ "rustversion",
+]
+
+[[package]]
+name = "ipnet"
+version = "2.12.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d98f6fed1fde3f8c21bc40a1abb88dd75e67924f9cffc3ef95607bad8017f8e2"
+
+[[package]]
+name = "iri-string"
+version = "0.7.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "25e659a4bb38e810ebc252e53b5814ff908a8c58c2a9ce2fae1bbec24cbf4e20"
+dependencies = [
+ "memchr",
+ "serde",
+]
+
+[[package]]
+name = "is_terminal_polyfill"
+version = "1.70.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695"
+
+[[package]]
+name = "itertools"
+version = "0.14.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285"
+dependencies = [
+ "either",
+]
+
+[[package]]
+name = "itoa"
+version = "1.0.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682"
+
+[[package]]
+name = "jiff"
+version = "0.2.23"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1a3546dc96b6d42c5f24902af9e2538e82e39ad350b0c766eb3fbf2d8f3d8359"
+dependencies = [
+ "jiff-static",
+ "jiff-tzdb-platform",
+ "js-sys",
+ "log",
+ "portable-atomic",
+ "portable-atomic-util",
+ "serde_core",
+ "wasm-bindgen",
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "jiff-static"
+version = "0.2.23"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2a8c8b344124222efd714b73bb41f8b5120b27a7cc1c75593a6ff768d9d05aa4"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "jiff-tzdb"
+version = "0.1.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c900ef84826f1338a557697dc8fc601df9ca9af4ac137c7fb61d4c6f2dfd3076"
+
+[[package]]
+name = "jiff-tzdb-platform"
+version = "0.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "875a5a69ac2bab1a891711cf5eccbec1ce0341ea805560dcd90b7a2e925132e8"
+dependencies = [
+ "jiff-tzdb",
+]
+
+[[package]]
+name = "jobserver"
+version = "0.1.34"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33"
+dependencies = [
+ "getrandom 0.3.4",
+ "libc",
+]
+
+[[package]]
+name = "js-sys"
+version = "0.3.94"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2e04e2ef80ce82e13552136fabeef8a5ed1f985a96805761cbb9a2c34e7664d9"
+dependencies = [
+ "cfg-if",
+ "futures-util",
+ "once_cell",
+ "wasm-bindgen",
+]
+
+[[package]]
+name = "leb128fmt"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2"
+
+[[package]]
+name = "lexical-core"
+version = "1.0.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7d8d125a277f807e55a77304455eb7b1cb52f2b18c143b60e766c120bd64a594"
+dependencies = [
+ "lexical-parse-float",
+ "lexical-parse-integer",
+ "lexical-util",
+ "lexical-write-float",
+ "lexical-write-integer",
+]
+
+[[package]]
+name = "lexical-parse-float"
+version = "1.0.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "52a9f232fbd6f550bc0137dcb5f99ab674071ac2d690ac69704593cb4abbea56"
+dependencies = [
+ "lexical-parse-integer",
+ "lexical-util",
+]
+
+[[package]]
+name = "lexical-parse-integer"
+version = "1.0.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9a7a039f8fb9c19c996cd7b2fcce303c1b2874fe1aca544edc85c4a5f8489b34"
+dependencies = [
+ "lexical-util",
+]
+
+[[package]]
+name = "lexical-util"
+version = "1.0.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2604dd126bb14f13fb5d1bd6a66155079cb9fa655b37f875b3a742c705dbed17"
+
+[[package]]
+name = "lexical-write-float"
+version = "1.0.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "50c438c87c013188d415fbabbb1dceb44249ab81664efbd31b14ae55dabb6361"
+dependencies = [
+ "lexical-util",
+ "lexical-write-integer",
+]
+
+[[package]]
+name = "lexical-write-integer"
+version = "1.0.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "409851a618475d2d5796377cad353802345cba92c867d9fbcde9cf4eac4e14df"
+dependencies = [
+ "lexical-util",
+]
+
+[[package]]
+name = "libc"
+version = "0.2.184"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "48f5d2a454e16a5ea0f4ced81bd44e4cfc7bd3a507b61887c99fd3538b28e4af"
+
+[[package]]
+name = "libloading"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "754ca22de805bb5744484a5b151a9e1a8e837d5dc232c2d7d8c2e3492edc8b60"
+dependencies = [
+ "cfg-if",
+ "windows-link",
+]
+
+[[package]]
+name = "libm"
+version = "0.2.16"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981"
+
+[[package]]
+name = "libredox"
+version = "0.1.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7ddbf48fd451246b1f8c2610bd3b4ac0cc6e149d89832867093ab69a17194f08"
+dependencies = [
+ "bitflags",
+ "libc",
+ "plain",
+ "redox_syscall 0.7.3",
+]
+
+[[package]]
+name = "link-cplusplus"
+version = "1.0.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7f78c730aaa7d0b9336a299029ea49f9ee53b0ed06e9202e8cb7db9bae7b8c82"
+dependencies = [
+ "cc",
+]
+
+[[package]]
+name = "linked-hash-map"
+version = "0.5.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0717cef1bc8b636c6e1c1bbdefc09e6322da8a9321966e8928ef80d20f7f770f"
+
+[[package]]
+name = "linux-raw-sys"
+version = "0.12.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53"
+
+[[package]]
+name = "litemap"
+version = "0.8.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "92daf443525c4cce67b150400bc2316076100ce0b3686209eb8cf3c31612e6f0"
+
+[[package]]
+name = "lock_api"
+version = "0.4.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965"
+dependencies = [
+ "scopeguard",
+]
+
+[[package]]
+name = "log"
+version = "0.4.29"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897"
+dependencies = [
+ "value-bag",
+]
+
+[[package]]
+name = "lru-slab"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154"
+
+[[package]]
+name = "lz4_flex"
+version = "0.12.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "98c23545df7ecf1b16c303910a69b079e8e251d60f7dd2cc9b4177f2afaf1746"
+dependencies = [
+ "twox-hash",
+]
+
+[[package]]
+name = "matchit"
+version = "0.8.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "47e1ffaa40ddd1f3ed91f717a33c8c0ee23fff369e3aa8772b9605cc1d22f4c3"
+
+[[package]]
+name = "md-5"
+version = "0.10.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf"
+dependencies = [
+ "cfg-if",
+ "digest",
+]
+
+[[package]]
+name = "memchr"
+version = "2.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79"
+
+[[package]]
+name = "memoffset"
+version = "0.9.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a"
+dependencies = [
+ "autocfg",
+]
+
+[[package]]
+name = "metrics"
+version = "0.24.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ff56c2e7dce6bd462e3b8919986a617027481b1dcc703175b58cf9dd98a2f071"
+dependencies = [
+ "portable-atomic",
+ "rapidhash",
+]
+
+[[package]]
+name = "metrics-util"
+version = "0.20.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9e56997f084e57b045edf17c3ed8ba7f9f779c670df8206dfd1c736f4c02dc4a"
+dependencies = [
+ "aho-corasick",
+ "crossbeam-epoch",
+ "crossbeam-utils",
+ "hashbrown 0.16.1",
+ "indexmap 2.13.1",
+ "metrics",
+ "ordered-float",
+ "quanta",
+ "radix_trie",
+ "rand 0.9.3",
+ "rand_xoshiro",
+ "rapidhash",
+ "sketches-ddsketch",
+]
+
+[[package]]
+name = "mime"
+version = "0.3.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a"
+
+[[package]]
+name = "mio"
+version = "1.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "50b7e5b27aa02a74bac8c3f23f448f8d87ff11f92d3aac1a6ed369ee08cc56c1"
+dependencies = [
+ "libc",
+ "wasi",
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "multimap"
+version = "0.10.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1d87ecb2933e8aeadb3e3a02b828fed80a7528047e68b4f424523a0981a3a084"
+
+[[package]]
+name = "nibble_vec"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "77a5d83df9f36fe23f0c3648c6bbb8b0298bb5f1939c8f2704431371f4b84d43"
+dependencies = [
+ "smallvec",
+]
+
+[[package]]
+name = "num"
+version = "0.4.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23"
+dependencies = [
+ "num-bigint",
+ "num-complex",
+ "num-integer",
+ "num-iter",
+ "num-rational",
+ "num-traits",
+]
+
+[[package]]
+name = "num-bigint"
+version = "0.4.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9"
+dependencies = [
+ "num-integer",
+ "num-traits",
+]
+
+[[package]]
+name = "num-complex"
+version = "0.4.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495"
+dependencies = [
+ "num-traits",
+]
+
+[[package]]
+name = "num-conv"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c6673768db2d862beb9b39a78fdcb1a69439615d5794a1be50caa9bc92c81967"
+
+[[package]]
+name = "num-integer"
+version = "0.1.46"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f"
+dependencies = [
+ "num-traits",
+]
+
+[[package]]
+name = "num-iter"
+version = "0.1.45"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf"
+dependencies = [
+ "autocfg",
+ "num-integer",
+ "num-traits",
+]
+
+[[package]]
+name = "num-rational"
+version = "0.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824"
+dependencies = [
+ "num-bigint",
+ "num-integer",
+ "num-traits",
+]
+
+[[package]]
+name = "num-traits"
+version = "0.2.19"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
+dependencies = [
+ "autocfg",
+ "libm",
+]
+
+[[package]]
+name = "once_cell"
+version = "1.21.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50"
+
+[[package]]
+name = "once_cell_polyfill"
+version = "1.70.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe"
+
+[[package]]
+name = "opendal"
+version = "0.55.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d075ab8a203a6ab4bc1bce0a4b9fe486a72bf8b939037f4b78d95386384bc80a"
+dependencies = [
+ "anyhow",
+ "backon",
+ "base64 0.22.1",
+ "bytes",
+ "crc32c",
+ "futures",
+ "getrandom 0.2.17",
+ "http",
+ "http-body",
+ "jiff",
+ "log",
+ "md-5",
+ "percent-encoding",
+ "quick-xml 0.38.4",
+ "reqsign",
+ "reqwest",
+ "serde",
+ "serde_json",
+ "tokio",
+ "url",
+ "uuid",
+]
+
+[[package]]
+name = "openssl-probe"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe"
+
+[[package]]
+name = "ordered-float"
+version = "5.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b7d950ca161dc355eaf28f82b11345ed76c6e1f6eb1f4f4479e0323b9e2fbd0e"
+dependencies = [
+ "num-traits",
+ "rand 0.8.5",
+ "serde",
+]
+
+[[package]]
+name = "ordered-multimap"
+version = "0.7.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "49203cdcae0030493bad186b28da2fa25645fa276a51b6fec8010d281e02ef79"
+dependencies = [
+ "dlv-list",
+ "hashbrown 0.14.5",
+]
+
+[[package]]
+name = "parking_lot"
+version = "0.12.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "93857453250e3077bd71ff98b6a65ea6621a19bb0f559a85248955ac12c45a1a"
+dependencies = [
+ "lock_api",
+ "parking_lot_core",
+]
+
+[[package]]
+name = "parking_lot_core"
+version = "0.9.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1"
+dependencies = [
+ "cfg-if",
+ "libc",
+ "redox_syscall 0.5.18",
+ "smallvec",
+ "windows-link",
+]
+
+[[package]]
+name = "parse-display"
+version = "0.9.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "914a1c2265c98e2446911282c6ac86d8524f495792c38c5bd884f80499c7538a"
+dependencies = [
+ "parse-display-derive 0.9.1",
+ "regex",
+ "regex-syntax",
+]
+
+[[package]]
+name = "parse-display"
+version = "0.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "287d8d3ebdce117b8539f59411e4ed9ec226e0a4153c7f55495c6070d68e6f72"
+dependencies = [
+ "parse-display-derive 0.10.0",
+ "regex",
+ "regex-syntax",
+]
+
+[[package]]
+name = "parse-display-derive"
+version = "0.9.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2ae7800a4c974efd12df917266338e79a7a74415173caf7e70aa0a0707345281"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "regex",
+ "regex-syntax",
+ "structmeta",
+ "syn",
+]
+
+[[package]]
+name = "parse-display-derive"
+version = "0.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7fc048687be30d79502dea2f623d052f3a074012c6eac41726b7ab17213616b1"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "regex",
+ "regex-syntax",
+ "structmeta",
+ "syn",
+]
+
+[[package]]
+name = "percent-encoding"
+version = "2.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220"
+
+[[package]]
+name = "petgraph"
+version = "0.8.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8701b58ea97060d5e5b155d383a69952a60943f0e6dfe30b04c287beb0b27455"
+dependencies = [
+ "fixedbitset",
+ "hashbrown 0.15.5",
+ "indexmap 2.13.1",
+]
+
+[[package]]
+name = "pin-project"
+version = "1.1.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f1749c7ed4bcaf4c3d0a3efc28538844fb29bcdd7d2b67b2be7e20ba861ff517"
+dependencies = [
+ "pin-project-internal",
+]
+
+[[package]]
+name = "pin-project-internal"
+version = "1.1.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d9b20ed30f105399776b9c883e68e536ef602a16ae6f596d2c473591d6ad64c6"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "pin-project-lite"
+version = "0.2.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd"
+
+[[package]]
+name = "pkg-config"
+version = "0.3.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c"
+
+[[package]]
+name = "plain"
+version = "0.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b4596b6d070b27117e987119b4dac604f3c58cfb0b191112e24771b2faeac1a6"
+
+[[package]]
+name = "portable-atomic"
+version = "1.13.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49"
+
+[[package]]
+name = "portable-atomic-util"
+version = "0.2.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "091397be61a01d4be58e7841595bd4bfedb15f1cd54977d79b8271e94ed799a3"
+dependencies = [
+ "portable-atomic",
+]
+
+[[package]]
+name = "potential_utf"
+version = "0.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0103b1cef7ec0cf76490e969665504990193874ea05c85ff9bab8b911d0a0564"
+dependencies = [
+ "zerovec",
+]
+
+[[package]]
+name = "powerfmt"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391"
+
+[[package]]
+name = "ppv-lite86"
+version = "0.2.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9"
+dependencies = [
+ "zerocopy",
+]
+
+[[package]]
+name = "prettyplease"
+version = "0.2.37"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b"
+dependencies = [
+ "proc-macro2",
+ "syn",
+]
+
+[[package]]
+name = "proc-macro2"
+version = "1.0.106"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934"
+dependencies = [
+ "unicode-ident",
+]
+
+[[package]]
+name = "prost"
+version = "0.14.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d2ea70524a2f82d518bce41317d0fae74151505651af45faf1ffbd6fd33f0568"
+dependencies = [
+ "bytes",
+ "prost-derive",
+]
+
+[[package]]
+name = "prost-build"
+version = "0.14.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "343d3bd7056eda839b03204e68deff7d1b13aba7af2b2fd16890697274262ee7"
+dependencies = [
+ "heck",
+ "itertools",
+ "log",
+ "multimap",
+ "petgraph",
+ "prettyplease",
+ "prost",
+ "prost-types",
+ "regex",
+ "syn",
+ "tempfile",
+]
+
+[[package]]
+name = "prost-derive"
+version = "0.14.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "27c6023962132f4b30eb4c172c91ce92d933da334c59c23cddee82358ddafb0b"
+dependencies = [
+ "anyhow",
+ "itertools",
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "prost-types"
+version = "0.14.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8991c4cbdb8bc5b11f0b074ffe286c30e523de90fee5ba8132f1399f23cb3dd7"
+dependencies = [
+ "prost",
+]
+
+[[package]]
+name = "pyo3"
+version = "0.26.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7ba0117f4212101ee6544044dae45abe1083d30ce7b29c4b5cbdfa2354e07383"
+dependencies = [
+ "indoc",
+ "libc",
+ "memoffset",
+ "once_cell",
+ "portable-atomic",
+ "pyo3-build-config",
+ "pyo3-ffi",
+ "pyo3-macros",
+ "unindent",
+]
+
+[[package]]
+name = "pyo3-async-runtimes"
+version = "0.26.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e6ee6d4cb3e8d5b925f5cdb38da183e0ff18122eb2048d4041c9e7034d026e23"
+dependencies = [
+ "futures",
+ "once_cell",
+ "pin-project-lite",
+ "pyo3",
+ "tokio",
+]
+
+[[package]]
+name = "pyo3-build-config"
+version = "0.26.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4fc6ddaf24947d12a9aa31ac65431fb1b851b8f4365426e182901eabfb87df5f"
+dependencies = [
+ "python3-dll-a",
+ "target-lexicon",
+]
+
+[[package]]
+name = "pyo3-ffi"
+version = "0.26.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "025474d3928738efb38ac36d4744a74a400c901c7596199e20e45d98eb194105"
+dependencies = [
+ "libc",
+ "pyo3-build-config",
+]
+
+[[package]]
+name = "pyo3-macros"
+version = "0.26.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2e64eb489f22fe1c95911b77c44cc41e7c19f3082fc81cce90f657cdc42ffded"
+dependencies = [
+ "proc-macro2",
+ "pyo3-macros-backend",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "pyo3-macros-backend"
+version = "0.26.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "100246c0ecf400b475341b8455a9213344569af29a3c841d29270e53102e0fcf"
+dependencies = [
+ "heck",
+ "proc-macro2",
+ "pyo3-build-config",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "python3-dll-a"
+version = "0.2.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d381ef313ae70b4da5f95f8a4de773c6aa5cd28f73adec4b4a31df70b66780d8"
+dependencies = [
+ "cc",
+]
+
+[[package]]
+name = "quanta"
+version = "0.12.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f3ab5a9d756f0d97bdc89019bd2e4ea098cf9cde50ee7564dde6b81ccc8f06c7"
+dependencies = [
+ "crossbeam-utils",
+ "libc",
+ "once_cell",
+ "raw-cpuid",
+ "wasi",
+ "web-sys",
+ "winapi",
+]
+
+[[package]]
+name = "quick-xml"
+version = "0.37.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "331e97a1af0bf59823e6eadffe373d7b27f485be8748f71471c662c1f269b7fb"
+dependencies = [
+ "memchr",
+ "serde",
+]
+
+[[package]]
+name = "quick-xml"
+version = "0.38.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b66c2058c55a409d601666cffe35f04333cf1013010882cec174a7467cd4e21c"
+dependencies = [
+ "memchr",
+ "serde",
+]
+
+[[package]]
+name = "quinn"
+version = "0.11.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b9e20a958963c291dc322d98411f541009df2ced7b5a4f2bd52337638cfccf20"
+dependencies = [
+ "bytes",
+ "cfg_aliases",
+ "pin-project-lite",
+ "quinn-proto",
+ "quinn-udp",
+ "rustc-hash",
+ "rustls",
+ "socket2",
+ "thiserror 2.0.18",
+ "tokio",
+ "tracing",
+ "web-time",
+]
+
+[[package]]
+name = "quinn-proto"
+version = "0.11.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "434b42fec591c96ef50e21e886936e66d3cc3f737104fdb9b737c40ffb94c098"
+dependencies = [
+ "bytes",
+ "getrandom 0.3.4",
+ "lru-slab",
+ "rand 0.9.3",
+ "ring",
+ "rustc-hash",
+ "rustls",
+ "rustls-pki-types",
+ "slab",
+ "thiserror 2.0.18",
+ "tinyvec",
+ "tracing",
+ "web-time",
+]
+
+[[package]]
+name = "quinn-udp"
+version = "0.5.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "addec6a0dcad8a8d96a771f815f0eaf55f9d1805756410b39f5fa81332574cbd"
+dependencies = [
+ "cfg_aliases",
+ "libc",
+ "once_cell",
+ "socket2",
+ "tracing",
+ "windows-sys 0.60.2",
+]
+
+[[package]]
+name = "quote"
+version = "1.0.45"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924"
+dependencies = [
+ "proc-macro2",
+]
+
+[[package]]
+name = "r-efi"
+version = "5.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f"
+
+[[package]]
+name = "r-efi"
+version = "6.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf"
+
+[[package]]
+name = "radium"
+version = "0.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dc33ff2d4973d518d823d61aa239014831e521c75da58e3df4840d3f47749d09"
+
+[[package]]
+name = "radix_trie"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c069c179fcdc6a2fe24d8d18305cf085fdbd4f922c041943e203685d6a1c58fd"
+dependencies = [
+ "endian-type",
+ "nibble_vec",
+]
+
+[[package]]
+name = "rand"
+version = "0.8.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
+dependencies = [
+ "libc",
+ "rand_chacha 0.3.1",
+ "rand_core 0.6.4",
+ "serde",
+]
+
+[[package]]
+name = "rand"
+version = "0.9.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7ec095654a25171c2124e9e3393a930bddbffdc939556c914957a4c3e0a87166"
+dependencies = [
+ "rand_chacha 0.9.0",
+ "rand_core 0.9.5",
+]
+
+[[package]]
+name = "rand_chacha"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
+dependencies = [
+ "ppv-lite86",
+ "rand_core 0.6.4",
+]
+
+[[package]]
+name = "rand_chacha"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb"
+dependencies = [
+ "ppv-lite86",
+ "rand_core 0.9.5",
+]
+
+[[package]]
+name = "rand_core"
+version = "0.6.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
+dependencies = [
+ "getrandom 0.2.17",
+ "serde",
+]
+
+[[package]]
+name = "rand_core"
+version = "0.9.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c"
+dependencies = [
+ "getrandom 0.3.4",
+]
+
+[[package]]
+name = "rand_xoshiro"
+version = "0.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f703f4665700daf5512dcca5f43afa6af89f09db47fb56be587f80636bda2d41"
+dependencies = [
+ "rand_core 0.9.5",
+]
+
+[[package]]
+name = "rapidhash"
+version = "4.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b5e48930979c155e2f33aa36ab3119b5ee81332beb6482199a8ecd6029b80b59"
+dependencies = [
+ "rustversion",
+]
+
+[[package]]
+name = "raw-cpuid"
+version = "11.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "498cd0dc59d73224351ee52a95fee0f1a617a2eae0e7d9d720cc622c73a54186"
+dependencies = [
+ "bitflags",
+]
+
+[[package]]
+name = "redox_syscall"
+version = "0.5.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d"
+dependencies = [
+ "bitflags",
+]
+
+[[package]]
+name = "redox_syscall"
+version = "0.7.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6ce70a74e890531977d37e532c34d45e9055d2409ed08ddba14529471ed0be16"
+dependencies = [
+ "bitflags",
+]
+
+[[package]]
+name = "ref-cast"
+version = "1.0.25"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f354300ae66f76f1c85c5f84693f0ce81d747e2c3f21a45fef496d89c960bf7d"
+dependencies = [
+ "ref-cast-impl",
+]
+
+[[package]]
+name = "ref-cast-impl"
+version = "1.0.25"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b7186006dcb21920990093f30e3dea63b7d6e977bf1256be20c3563a5db070da"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "regex"
+version = "1.12.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-automata",
+ "regex-syntax",
+]
+
+[[package]]
+name = "regex-automata"
+version = "0.4.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-syntax",
+]
+
+[[package]]
+name = "regex-lite"
+version = "0.1.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cab834c73d247e67f4fae452806d17d3c7501756d98c8808d7c9c7aa7d18f973"
+
+[[package]]
+name = "regex-syntax"
+version = "0.8.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a"
+
+[[package]]
+name = "reqsign"
+version = "0.16.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "43451dbf3590a7590684c25fb8d12ecdcc90ed3ac123433e500447c7d77ed701"
+dependencies = [
+ "anyhow",
+ "async-trait",
+ "base64 0.22.1",
+ "chrono",
+ "form_urlencoded",
+ "getrandom 0.2.17",
+ "hex",
+ "hmac",
+ "home",
+ "http",
+ "log",
+ "once_cell",
+ "percent-encoding",
+ "quick-xml 0.37.5",
+ "rand 0.8.5",
+ "reqwest",
+ "rust-ini",
+ "serde",
+ "serde_json",
+ "sha1",
+ "sha2",
+ "tokio",
+]
+
+[[package]]
+name = "reqwest"
+version = "0.12.28"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "eddd3ca559203180a307f12d114c268abf583f59b03cb906fd0b3ff8646c1147"
+dependencies = [
+ "base64 0.22.1",
+ "bytes",
+ "futures-core",
+ "futures-util",
+ "http",
+ "http-body",
+ "http-body-util",
+ "hyper",
+ "hyper-rustls",
+ "hyper-util",
+ "js-sys",
+ "log",
+ "percent-encoding",
+ "pin-project-lite",
+ "quinn",
+ "rustls",
+ "rustls-pki-types",
+ "serde",
+ "serde_json",
+ "serde_urlencoded",
+ "sync_wrapper",
+ "tokio",
+ "tokio-rustls",
+ "tokio-util",
+ "tower",
+ "tower-http",
+ "tower-service",
+ "url",
+ "wasm-bindgen",
+ "wasm-bindgen-futures",
+ "wasm-streams",
+ "web-sys",
+ "webpki-roots",
+]
+
+[[package]]
+name = "ring"
+version = "0.17.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7"
+dependencies = [
+ "cc",
+ "cfg-if",
+ "getrandom 0.2.17",
+ "libc",
+ "untrusted",
+ "windows-sys 0.52.0",
+]
+
+[[package]]
+name = "rust-ini"
+version = "0.21.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "796e8d2b6696392a43bea58116b667fb4c29727dc5abd27d6acf338bb4f688c7"
+dependencies = [
+ "cfg-if",
+ "ordered-multimap",
+]
+
+[[package]]
+name = "rustc-hash"
+version = "2.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "94300abf3f1ae2e2b8ffb7b58043de3d399c73fa6f4b73826402a5c457614dbe"
+
+[[package]]
+name = "rustc_version"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92"
+dependencies = [
+ "semver",
+]
+
+[[package]]
+name = "rustix"
+version = "1.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190"
+dependencies = [
+ "bitflags",
+ "errno",
+ "libc",
+ "linux-raw-sys",
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "rustler"
+version = "0.37.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c779e2cbfa2987990205d0d8fc142163739e45a4c6592dc637896c77fec01280"
+dependencies = [
+ "inventory",
+ "libloading",
+ "regex-lite",
+ "rustler_codegen",
+]
+
+[[package]]
+name = "rustler_codegen"
+version = "0.37.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e6e120f8936c779b6c2e09992a2dfa9a4e8bcd0794c02bb654fde03e03ce8c31"
+dependencies = [
+ "heck",
+ "inventory",
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "rustls"
+version = "0.23.37"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "758025cb5fccfd3bc2fd74708fd4682be41d99e5dff73c377c0646c6012c73a4"
+dependencies = [
+ "log",
+ "once_cell",
+ "ring",
+ "rustls-pki-types",
+ "rustls-webpki",
+ "subtle",
+ "zeroize",
+]
+
+[[package]]
+name = "rustls-native-certs"
+version = "0.8.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "612460d5f7bea540c490b2b6395d8e34a953e52b491accd6c86c8164c5932a63"
+dependencies = [
+ "openssl-probe",
+ "rustls-pki-types",
+ "schannel",
+ "security-framework",
+]
+
+[[package]]
+name = "rustls-pki-types"
+version = "1.14.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "be040f8b0a225e40375822a563fa9524378b9d63112f53e19ffff34df5d33fdd"
+dependencies = [
+ "web-time",
+ "zeroize",
+]
+
+[[package]]
+name = "rustls-webpki"
+version = "0.103.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "61c429a8649f110dddef65e2a5ad240f747e85f7758a6bccc7e5777bd33f756e"
+dependencies = [
+ "ring",
+ "rustls-pki-types",
+ "untrusted",
+]
+
+[[package]]
+name = "rustversion"
+version = "1.0.22"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d"
+
+[[package]]
+name = "ryu"
+version = "1.0.23"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f"
+
+[[package]]
+name = "schannel"
+version = "0.1.29"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "91c1b7e4904c873ef0710c1f407dde2e6287de2bebc1bbbf7d430bb7cbffd939"
+dependencies = [
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "schemars"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4cd191f9397d57d581cddd31014772520aa448f65ef991055d7f61582c65165f"
+dependencies = [
+ "dyn-clone",
+ "ref-cast",
+ "serde",
+ "serde_json",
+]
+
+[[package]]
+name = "schemars"
+version = "1.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a2b42f36aa1cd011945615b92222f6bf73c599a102a300334cd7f8dbeec726cc"
+dependencies = [
+ "dyn-clone",
+ "ref-cast",
+ "serde",
+ "serde_json",
+]
+
+[[package]]
+name = "scopeguard"
+version = "1.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
+
+[[package]]
+name = "scratch"
+version = "1.0.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d68f2ec51b097e4c1a75b681a8bec621909b5e91f15bb7b840c4f2f7b01148b2"
+
+[[package]]
+name = "security-framework"
+version = "3.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b7f4bc775c73d9a02cde8bf7b2ec4c9d12743edf609006c7facc23998404cd1d"
+dependencies = [
+ "bitflags",
+ "core-foundation",
+ "core-foundation-sys",
+ "libc",
+ "security-framework-sys",
+]
+
+[[package]]
+name = "security-framework-sys"
+version = "2.17.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6ce2691df843ecc5d231c0b14ece2acc3efb62c0a398c7e1d875f3983ce020e3"
+dependencies = [
+ "core-foundation-sys",
+ "libc",
+]
+
+[[package]]
+name = "semver"
+version = "1.0.28"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8a7852d02fc848982e0c167ef163aaff9cd91dc640ba85e263cb1ce46fae51cd"
+
+[[package]]
+name = "serde"
+version = "1.0.228"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e"
+dependencies = [
+ "serde_core",
+ "serde_derive",
+]
+
+[[package]]
+name = "serde_core"
+version = "1.0.228"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad"
+dependencies = [
+ "serde_derive",
+]
+
+[[package]]
+name = "serde_derive"
+version = "1.0.228"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "serde_fmt"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6e497af288b3b95d067a23a4f749f2861121ffcb2f6d8379310dcda040c345ed"
+dependencies = [
+ "serde_core",
+]
+
+[[package]]
+name = "serde_json"
+version = "1.0.149"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86"
+dependencies = [
+ "itoa",
+ "memchr",
+ "serde",
+ "serde_core",
+ "zmij",
+]
+
+[[package]]
+name = "serde_repr"
+version = "0.1.20"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "175ee3e80ae9982737ca543e96133087cbd9a485eecc3bc4de9c1a37b47ea59c"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "serde_urlencoded"
+version = "0.7.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd"
+dependencies = [
+ "form_urlencoded",
+ "itoa",
+ "ryu",
+ "serde",
+]
+
+[[package]]
+name = "serde_with"
+version = "3.18.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dd5414fad8e6907dbdd5bc441a50ae8d6e26151a03b1de04d89a5576de61d01f"
+dependencies = [
+ "base64 0.22.1",
+ "chrono",
+ "hex",
+ "indexmap 1.9.3",
+ "indexmap 2.13.1",
+ "schemars 0.9.0",
+ "schemars 1.2.1",
+ "serde_core",
+ "serde_json",
+ "serde_with_macros",
+ "time",
+]
+
+[[package]]
+name = "serde_with_macros"
+version = "3.18.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d3db8978e608f1fe7357e211969fd9abdcae80bac1ba7a3369bb7eb6b404eb65"
+dependencies = [
+ "darling",
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "sha1"
+version = "0.10.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba"
+dependencies = [
+ "cfg-if",
+ "cpufeatures",
+ "digest",
+]
+
+[[package]]
+name = "sha2"
+version = "0.10.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283"
+dependencies = [
+ "cfg-if",
+ "cpufeatures",
+ "digest",
+]
+
+[[package]]
+name = "shlex"
+version = "1.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
+
+[[package]]
+name = "signal-hook-registry"
+version = "1.4.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c4db69cba1110affc0e9f7bcd48bbf87b3f4fc7c61fc9155afd4c469eb3d6c1b"
+dependencies = [
+ "errno",
+ "libc",
+]
+
+[[package]]
+name = "simdutf8"
+version = "0.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e"
+
+[[package]]
+name = "sketches-ddsketch"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0c6f73aeb92d671e0cc4dca167e59b2deb6387c375391bc99ee743f326994a2b"
+
+[[package]]
+name = "slab"
+version = "0.4.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5"
+
+[[package]]
+name = "smallvec"
+version = "1.15.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03"
+
+[[package]]
+name = "snafu"
+version = "0.8.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6e84b3f4eacbf3a1ce05eac6763b4d629d60cbc94d632e4092c54ade71f1e1a2"
+dependencies = [
+ "snafu-derive",
+]
+
+[[package]]
+name = "snafu-derive"
+version = "0.8.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c1c97747dbf44bb1ca44a561ece23508e99cb592e862f22222dcf42f51d1e451"
+dependencies = [
+ "heck",
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "socket2"
+version = "0.6.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3a766e1110788c36f4fa1c2b71b387a7815aa65f88ce0229841826633d93723e"
+dependencies = [
+ "libc",
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "stable_deref_trait"
+version = "1.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596"
+
+[[package]]
+name = "strsim"
+version = "0.11.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
+
+[[package]]
+name = "structmeta"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2e1575d8d40908d70f6fd05537266b90ae71b15dbbe7a8b7dffa2b759306d329"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "structmeta-derive",
+ "syn",
+]
+
+[[package]]
+name = "structmeta-derive"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "152a0b65a590ff6c3da95cabe2353ee04e6167c896b28e3b14478c2636c922fc"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "strum"
+version = "0.26.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06"
+
+[[package]]
+name = "strum_macros"
+version = "0.26.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be"
+dependencies = [
+ "heck",
+ "proc-macro2",
+ "quote",
+ "rustversion",
+ "syn",
+]
+
+[[package]]
+name = "subtle"
+version = "2.6.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292"
+
+[[package]]
+name = "sval"
+version = "2.18.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2eb9318255ebd817902d7e279d8f8e39b35b1b9954decd5eb9ea0e30e5fd2b6a"
+
+[[package]]
+name = "sval_buffer"
+version = "2.18.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "12571299185e653fdb0fbfe36cd7f6529d39d4e747a60b15a3f34574b7b97c61"
+dependencies = [
+ "sval",
+ "sval_ref",
+]
+
+[[package]]
+name = "sval_dynamic"
+version = "2.18.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "39526f24e997706c0de7f03fb7371f7f5638b66a504ded508e20ad173d0a3677"
+dependencies = [
+ "sval",
+]
+
+[[package]]
+name = "sval_fmt"
+version = "2.18.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "933dd3bb26965d682280fcc49400ac2a05036f4ee1e6dbd61bf8402d5a5c3a54"
+dependencies = [
+ "itoa",
+ "ryu",
+ "sval",
+]
+
+[[package]]
+name = "sval_json"
+version = "2.18.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a0cda08f6d5c9948024a6551077557b1fdcc3880ff2f20ae839667d2ec2d87ed"
+dependencies = [
+ "itoa",
+ "ryu",
+ "sval",
+]
+
+[[package]]
+name = "sval_nested"
+version = "2.18.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "88d49d5e6c1f9fd0e53515819b03a97ca4eb1bff5c8ee097c43391c09ecfb19f"
+dependencies = [
+ "sval",
+ "sval_buffer",
+ "sval_ref",
+]
+
+[[package]]
+name = "sval_ref"
+version = "2.18.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "14f876c5a78405375b4e19cbb9554407513b59c93dea12dc6a4af4e1d30899ca"
+dependencies = [
+ "sval",
+]
+
+[[package]]
+name = "sval_serde"
+version = "2.18.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5f9ccd3b7f7200239a655e517dd3fd48d960b9111ad24bd6a5e055bef17607c7"
+dependencies = [
+ "serde_core",
+ "sval",
+ "sval_nested",
+]
+
+[[package]]
+name = "syn"
+version = "2.0.117"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+
+[[package]]
+name = "sync_wrapper"
+version = "1.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0bf256ce5efdfa370213c1dabab5935a12e49f2c58d15e9eac2870d3b4f27263"
+dependencies = [
+ "futures-core",
+]
+
+[[package]]
+name = "synstructure"
+version = "0.13.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "tap"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369"
+
+[[package]]
+name = "target-lexicon"
+version = "0.13.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "adb6935a6f5c20170eeceb1a3835a49e12e19d792f6dd344ccc76a985ca5a6ca"
+
+[[package]]
+name = "tempfile"
+version = "3.27.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd"
+dependencies = [
+ "fastrand",
+ "getrandom 0.4.2",
+ "once_cell",
+ "rustix",
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "termcolor"
+version = "1.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "06794f8f6c5c898b3275aebefa6b8a1cb24cd2c6c79397ab15774837a0bc5755"
+dependencies = [
+ "winapi-util",
+]
+
+[[package]]
+name = "testcontainers"
+version = "0.27.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0bd36b06a2a6c0c3c81a83be1ab05fe86460d054d4d51bf513bc56b3e15bdc22"
+dependencies = [
+ "astral-tokio-tar",
+ "async-trait",
+ "bollard",
+ "bytes",
+ "docker_credential",
+ "either",
+ "etcetera",
+ "ferroid",
+ "futures",
+ "http",
+ "itertools",
+ "log",
+ "memchr",
+ "parse-display 0.9.1",
+ "pin-project-lite",
+ "serde",
+ "serde_json",
+ "serde_with",
+ "thiserror 2.0.18",
+ "tokio",
+ "tokio-stream",
+ "tokio-util",
+ "url",
+]
+
+[[package]]
+name = "thiserror"
+version = "1.0.69"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52"
+dependencies = [
+ "thiserror-impl 1.0.69",
+]
+
+[[package]]
+name = "thiserror"
+version = "2.0.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4"
+dependencies = [
+ "thiserror-impl 2.0.18",
+]
+
+[[package]]
+name = "thiserror-impl"
+version = "1.0.69"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "thiserror-impl"
+version = "2.0.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "tikv-jemalloc-sys"
+version = "0.6.1+5.3.0-1-ge13ca993e8ccb9ba9847cc330696e02839f328f7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cd8aa5b2ab86a2cefa406d889139c162cbb230092f7d1d7cbc1716405d852a3b"
+dependencies = [
+ "cc",
+ "libc",
+]
+
+[[package]]
+name = "tikv-jemallocator"
+version = "0.6.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0359b4327f954e0567e69fb191cf1436617748813819c94b8cd4a431422d053a"
+dependencies = [
+ "libc",
+ "tikv-jemalloc-sys",
+]
+
+[[package]]
+name = "time"
+version = "0.3.47"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "743bd48c283afc0388f9b8827b976905fb217ad9e647fae3a379a9283c4def2c"
+dependencies = [
+ "deranged",
+ "itoa",
+ "num-conv",
+ "powerfmt",
+ "serde_core",
+ "time-core",
+ "time-macros",
+]
+
+[[package]]
+name = "time-core"
+version = "0.1.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7694e1cfe791f8d31026952abf09c69ca6f6fa4e1a1229e18988f06a04a12dca"
+
+[[package]]
+name = "time-macros"
+version = "0.2.27"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2e70e4c5a0e0a8a4823ad65dfe1a6930e4f4d756dcd9dd7939022b5e8c501215"
+dependencies = [
+ "num-conv",
+ "time-core",
+]
+
+[[package]]
+name = "tiny-keccak"
+version = "2.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237"
+dependencies = [
+ "crunchy",
+]
+
+[[package]]
+name = "tinystr"
+version = "0.8.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c8323304221c2a851516f22236c5722a72eaa19749016521d6dff0824447d96d"
+dependencies = [
+ "displaydoc",
+ "zerovec",
+]
+
+[[package]]
+name = "tinyvec"
+version = "1.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3e61e67053d25a4e82c844e8424039d9745781b3fc4f32b8d55ed50f5f667ef3"
+dependencies = [
+ "tinyvec_macros",
+]
+
+[[package]]
+name = "tinyvec_macros"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
+
+[[package]]
+name = "tokio"
+version = "1.51.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2bd1c4c0fc4a7ab90fc15ef6daaa3ec3b893f004f915f2392557ed23237820cd"
+dependencies = [
+ "bytes",
+ "libc",
+ "mio",
+ "parking_lot",
+ "pin-project-lite",
+ "signal-hook-registry",
+ "socket2",
+ "tokio-macros",
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "tokio-macros"
+version = "2.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "385a6cb71ab9ab790c5fe8d67f1645e6c450a7ce006a33de03daa956cf70a496"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "tokio-rustls"
+version = "0.26.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1729aa945f29d91ba541258c8df89027d5792d85a8841fb65e8bf0f4ede4ef61"
+dependencies = [
+ "rustls",
+ "tokio",
+]
+
+[[package]]
+name = "tokio-stream"
+version = "0.1.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "32da49809aab5c3bc678af03902d4ccddea2a87d028d86392a4b1560c6906c70"
+dependencies = [
+ "futures-core",
+ "pin-project-lite",
+ "tokio",
+]
+
+[[package]]
+name = "tokio-util"
+version = "0.7.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9ae9cec805b01e8fc3fd2fe289f89149a9b66dd16786abd8b19cfa7b48cb0098"
+dependencies = [
+ "bytes",
+ "futures-core",
+ "futures-sink",
+ "pin-project-lite",
+ "tokio",
+]
+
+[[package]]
+name = "tonic"
+version = "0.14.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fec7c61a0695dc1887c1b53952990f3ad2e3a31453e1f49f10e75424943a93ec"
+dependencies = [
+ "async-trait",
+ "axum",
+ "base64 0.22.1",
+ "bytes",
+ "h2",
+ "http",
+ "http-body",
+ "http-body-util",
+ "hyper",
+ "hyper-timeout",
+ "hyper-util",
+ "percent-encoding",
+ "pin-project",
+ "socket2",
+ "sync_wrapper",
+ "tokio",
+ "tokio-stream",
+ "tower",
+ "tower-layer",
+ "tower-service",
+ "tracing",
+]
+
+[[package]]
+name = "tonic-prost"
+version = "0.14.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a55376a0bbaa4975a3f10d009ad763d8f4108f067c7c2e74f3001fb49778d309"
+dependencies = [
+ "bytes",
+ "prost",
+ "tonic",
+]
+
+[[package]]
+name = "tower"
+version = "0.5.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ebe5ef63511595f1344e2d5cfa636d973292adc0eec1f0ad45fae9f0851ab1d4"
+dependencies = [
+ "futures-core",
+ "futures-util",
+ "indexmap 2.13.1",
+ "pin-project-lite",
+ "slab",
+ "sync_wrapper",
+ "tokio",
+ "tokio-util",
+ "tower-layer",
+ "tower-service",
+ "tracing",
+]
+
+[[package]]
+name = "tower-http"
+version = "0.6.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d4e6559d53cc268e5031cd8429d05415bc4cb4aefc4aa5d6cc35fbf5b924a1f8"
+dependencies = [
+ "bitflags",
+ "bytes",
+ "futures-util",
+ "http",
+ "http-body",
+ "iri-string",
+ "pin-project-lite",
+ "tower",
+ "tower-layer",
+ "tower-service",
+]
+
+[[package]]
+name = "tower-layer"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e"
+
+[[package]]
+name = "tower-service"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3"
+
+[[package]]
+name = "tracing"
+version = "0.1.44"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "63e71662fa4b2a2c3a26f570f037eb95bb1f85397f3cd8076caed2f026a6d100"
+dependencies = [
+ "pin-project-lite",
+ "tracing-attributes",
+ "tracing-core",
+]
+
+[[package]]
+name = "tracing-attributes"
+version = "0.1.31"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "tracing-core"
+version = "0.1.36"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "db97caf9d906fbde555dd62fa95ddba9eecfd14cb388e4f491a66d74cd5fb79a"
+dependencies = [
+ "once_cell",
+]
+
+[[package]]
+name = "try-lock"
+version = "0.2.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b"
+
+[[package]]
+name = "twox-hash"
+version = "2.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9ea3136b675547379c4bd395ca6b938e5ad3c3d20fad76e7fe85f9e0d011419c"
+
+[[package]]
+name = "typeid"
+version = "1.0.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bc7d623258602320d5c55d1bc22793b57daff0ec7efc270ea7d55ce1d5f5471c"
+
+[[package]]
+name = "typenum"
+version = "1.19.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb"
+
+[[package]]
+name = "unicode-ident"
+version = "1.0.24"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75"
+
+[[package]]
+name = "unicode-width"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254"
+
+[[package]]
+name = "unicode-xid"
+version = "0.2.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853"
+
+[[package]]
+name = "unindent"
+version = "0.2.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3"
+
+[[package]]
+name = "untrusted"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1"
+
+[[package]]
+name = "ureq"
+version = "3.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dea7109cdcd5864d4eeb1b58a1648dc9bf520360d7af16ec26d0a9354bafcfc0"
+dependencies = [
+ "base64 0.22.1",
+ "log",
+ "percent-encoding",
+ "rustls",
+ "rustls-pki-types",
+ "ureq-proto",
+ "utf8-zero",
+]
+
+[[package]]
+name = "ureq-proto"
+version = "0.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e994ba84b0bd1b1b0cf92878b7ef898a5c1760108fe7b6010327e274917a808c"
+dependencies = [
+ "base64 0.22.1",
+ "http",
+ "httparse",
+ "log",
+]
+
+[[package]]
+name = "url"
+version = "2.5.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ff67a8a4397373c3ef660812acab3268222035010ab8680ec4215f38ba3d0eed"
+dependencies = [
+ "form_urlencoded",
+ "idna",
+ "percent-encoding",
+ "serde",
+ "serde_derive",
+]
+
+[[package]]
+name = "utf8-zero"
+version = "0.8.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b8c0a043c9540bae7c578c88f91dda8bd82e59ae27c21baca69c8b191aaf5a6e"
+
+[[package]]
+name = "utf8_iter"
+version = "1.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be"
+
+[[package]]
+name = "utf8parse"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
+
+[[package]]
+name = "uuid"
+version = "1.23.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5ac8b6f42ead25368cf5b098aeb3dc8a1a2c05a3eee8a9a1a68c640edbfc79d9"
+dependencies = [
+ "getrandom 0.4.2",
+ "js-sys",
+ "serde_core",
+ "wasm-bindgen",
+]
+
+[[package]]
+name = "value-bag"
+version = "1.12.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7ba6f5989077681266825251a52748b8c1d8a4ad098cc37e440103d0ea717fc0"
+dependencies = [
+ "value-bag-serde1",
+ "value-bag-sval2",
+]
+
+[[package]]
+name = "value-bag-serde1"
+version = "1.12.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "16530907bfe2999a1773ca5900a65101e092c70f642f25cc23ca0c43573262c5"
+dependencies = [
+ "erased-serde",
+ "serde_core",
+ "serde_fmt",
+]
+
+[[package]]
+name = "value-bag-sval2"
+version = "1.12.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d00ae130edd690eaa877e4f40605d534790d1cf1d651e7685bd6a144521b251f"
+dependencies = [
+ "sval",
+ "sval_buffer",
+ "sval_dynamic",
+ "sval_fmt",
+ "sval_json",
+ "sval_ref",
+ "sval_serde",
+]
+
+[[package]]
+name = "version_check"
+version = "0.9.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"
+
+[[package]]
+name = "want"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bfa7760aed19e106de2c7c0b581b509f2f25d3dacaf737cb82ac61bc6d760b0e"
+dependencies = [
+ "try-lock",
+]
+
+[[package]]
+name = "wasi"
+version = "0.11.1+wasi-snapshot-preview1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b"
+
+[[package]]
+name = "wasip2"
+version = "1.0.2+wasi-0.2.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9517f9239f02c069db75e65f174b3da828fe5f5b945c4dd26bd25d89c03ebcf5"
+dependencies = [
+ "wit-bindgen",
+]
+
+[[package]]
+name = "wasip3"
+version = "0.4.0+wasi-0.3.0-rc-2026-01-06"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5"
+dependencies = [
+ "wit-bindgen",
+]
+
+[[package]]
+name = "wasm-bindgen"
+version = "0.2.117"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0551fc1bb415591e3372d0bc4780db7e587d84e2a7e79da121051c5c4b89d0b0"
+dependencies = [
+ "cfg-if",
+ "once_cell",
+ "rustversion",
+ "wasm-bindgen-macro",
+ "wasm-bindgen-shared",
+]
+
+[[package]]
+name = "wasm-bindgen-futures"
+version = "0.4.67"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "03623de6905b7206edd0a75f69f747f134b7f0a2323392d664448bf2d3c5d87e"
+dependencies = [
+ "js-sys",
+ "wasm-bindgen",
+]
+
+[[package]]
+name = "wasm-bindgen-macro"
+version = "0.2.117"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7fbdf9a35adf44786aecd5ff89b4563a90325f9da0923236f6104e603c7e86be"
+dependencies = [
+ "quote",
+ "wasm-bindgen-macro-support",
+]
+
+[[package]]
+name = "wasm-bindgen-macro-support"
+version = "0.2.117"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dca9693ef2bab6d4e6707234500350d8dad079eb508dca05530c85dc3a529ff2"
+dependencies = [
+ "bumpalo",
+ "proc-macro2",
+ "quote",
+ "syn",
+ "wasm-bindgen-shared",
+]
+
+[[package]]
+name = "wasm-bindgen-shared"
+version = "0.2.117"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "39129a682a6d2d841b6c429d0c51e5cb0ed1a03829d8b3d1e69a011e62cb3d3b"
+dependencies = [
+ "unicode-ident",
+]
+
+[[package]]
+name = "wasm-encoder"
+version = "0.244.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319"
+dependencies = [
+ "leb128fmt",
+ "wasmparser",
+]
+
+[[package]]
+name = "wasm-metadata"
+version = "0.244.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909"
+dependencies = [
+ "anyhow",
+ "indexmap 2.13.1",
+ "wasm-encoder",
+ "wasmparser",
+]
+
+[[package]]
+name = "wasm-streams"
+version = "0.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "15053d8d85c7eccdbefef60f06769760a563c7f0a9d6902a13d35c7800b0ad65"
+dependencies = [
+ "futures-util",
+ "js-sys",
+ "wasm-bindgen",
+ "wasm-bindgen-futures",
+ "web-sys",
+]
+
+[[package]]
+name = "wasmparser"
+version = "0.244.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe"
+dependencies = [
+ "bitflags",
+ "hashbrown 0.15.5",
+ "indexmap 2.13.1",
+ "semver",
+]
+
+[[package]]
+name = "web-sys"
+version = "0.3.94"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cd70027e39b12f0849461e08ffc50b9cd7688d942c1c8e3c7b22273236b4dd0a"
+dependencies = [
+ "js-sys",
+ "wasm-bindgen",
+]
+
+[[package]]
+name = "web-time"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb"
+dependencies = [
+ "js-sys",
+ "wasm-bindgen",
+]
+
+[[package]]
+name = "webpki-roots"
+version = "1.0.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "22cfaf3c063993ff62e73cb4311efde4db1efb31ab78a3e5c457939ad5cc0bed"
+dependencies = [
+ "rustls-pki-types",
+]
+
+[[package]]
+name = "winapi"
+version = "0.3.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
+dependencies = [
+ "winapi-i686-pc-windows-gnu",
+ "winapi-x86_64-pc-windows-gnu",
+]
+
+[[package]]
+name = "winapi-i686-pc-windows-gnu"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
+
+[[package]]
+name = "winapi-util"
+version = "0.1.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22"
+dependencies = [
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "winapi-x86_64-pc-windows-gnu"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
+
+[[package]]
+name = "windows-core"
+version = "0.62.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b8e83a14d34d0623b51dce9581199302a221863196a1dde71a7663a4c2be9deb"
+dependencies = [
+ "windows-implement",
+ "windows-interface",
+ "windows-link",
+ "windows-result",
+ "windows-strings",
+]
+
+[[package]]
+name = "windows-implement"
+version = "0.60.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "windows-interface"
+version = "0.59.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "windows-link"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5"
+
+[[package]]
+name = "windows-result"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7781fa89eaf60850ac3d2da7af8e5242a5ea78d1a11c49bf2910bb5a73853eb5"
+dependencies = [
+ "windows-link",
+]
+
+[[package]]
+name = "windows-strings"
+version = "0.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7837d08f69c77cf6b07689544538e017c1bfcf57e34b4c0ff58e6c2cd3b37091"
+dependencies = [
+ "windows-link",
+]
+
+[[package]]
+name = "windows-sys"
+version = "0.52.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
+dependencies = [
+ "windows-targets 0.52.6",
+]
+
+[[package]]
+name = "windows-sys"
+version = "0.60.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb"
+dependencies = [
+ "windows-targets 0.53.5",
+]
+
+[[package]]
+name = "windows-sys"
+version = "0.61.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc"
+dependencies = [
+ "windows-link",
+]
+
+[[package]]
+name = "windows-targets"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
+dependencies = [
+ "windows_aarch64_gnullvm 0.52.6",
+ "windows_aarch64_msvc 0.52.6",
+ "windows_i686_gnu 0.52.6",
+ "windows_i686_gnullvm 0.52.6",
+ "windows_i686_msvc 0.52.6",
+ "windows_x86_64_gnu 0.52.6",
+ "windows_x86_64_gnullvm 0.52.6",
+ "windows_x86_64_msvc 0.52.6",
+]
+
+[[package]]
+name = "windows-targets"
+version = "0.53.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4945f9f551b88e0d65f3db0bc25c33b8acea4d9e41163edf90dcd0b19f9069f3"
+dependencies = [
+ "windows-link",
+ "windows_aarch64_gnullvm 0.53.1",
+ "windows_aarch64_msvc 0.53.1",
+ "windows_i686_gnu 0.53.1",
+ "windows_i686_gnullvm 0.53.1",
+ "windows_i686_msvc 0.53.1",
+ "windows_x86_64_gnu 0.53.1",
+ "windows_x86_64_gnullvm 0.53.1",
+ "windows_x86_64_msvc 0.53.1",
+]
+
+[[package]]
+name = "windows_aarch64_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
+
+[[package]]
+name = "windows_aarch64_gnullvm"
+version = "0.53.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53"
+
+[[package]]
+name = "windows_aarch64_msvc"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
+
+[[package]]
+name = "windows_aarch64_msvc"
+version = "0.53.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006"
+
+[[package]]
+name = "windows_i686_gnu"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
+
+[[package]]
+name = "windows_i686_gnu"
+version = "0.53.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "960e6da069d81e09becb0ca57a65220ddff016ff2d6af6a223cf372a506593a3"
+
+[[package]]
+name = "windows_i686_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
+
+[[package]]
+name = "windows_i686_gnullvm"
+version = "0.53.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c"
+
+[[package]]
+name = "windows_i686_msvc"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
+
+[[package]]
+name = "windows_i686_msvc"
+version = "0.53.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2"
+
+[[package]]
+name = "windows_x86_64_gnu"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
+
+[[package]]
+name = "windows_x86_64_gnu"
+version = "0.53.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499"
+
+[[package]]
+name = "windows_x86_64_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
+
+[[package]]
+name = "windows_x86_64_gnullvm"
+version = "0.53.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1"
+
+[[package]]
+name = "windows_x86_64_msvc"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
+
+[[package]]
+name = "windows_x86_64_msvc"
+version = "0.53.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650"
+
+[[package]]
+name = "wit-bindgen"
+version = "0.51.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5"
+dependencies = [
+ "wit-bindgen-rust-macro",
+]
+
+[[package]]
+name = "wit-bindgen-core"
+version = "0.51.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc"
+dependencies = [
+ "anyhow",
+ "heck",
+ "wit-parser",
+]
+
+[[package]]
+name = "wit-bindgen-rust"
+version = "0.51.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21"
+dependencies = [
+ "anyhow",
+ "heck",
+ "indexmap 2.13.1",
+ "prettyplease",
+ "syn",
+ "wasm-metadata",
+ "wit-bindgen-core",
+ "wit-component",
+]
+
+[[package]]
+name = "wit-bindgen-rust-macro"
+version = "0.51.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a"
+dependencies = [
+ "anyhow",
+ "prettyplease",
+ "proc-macro2",
+ "quote",
+ "syn",
+ "wit-bindgen-core",
+ "wit-bindgen-rust",
+]
+
+[[package]]
+name = "wit-component"
+version = "0.244.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2"
+dependencies = [
+ "anyhow",
+ "bitflags",
+ "indexmap 2.13.1",
+ "log",
+ "serde",
+ "serde_derive",
+ "serde_json",
+ "wasm-encoder",
+ "wasm-metadata",
+ "wasmparser",
+ "wit-parser",
+]
+
+[[package]]
+name = "wit-parser"
+version = "0.244.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736"
+dependencies = [
+ "anyhow",
+ "id-arena",
+ "indexmap 2.13.1",
+ "log",
+ "semver",
+ "serde",
+ "serde_derive",
+ "serde_json",
+ "unicode-xid",
+ "wasmparser",
+]
+
+[[package]]
+name = "writeable"
+version = "0.6.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1ffae5123b2d3fc086436f8834ae3ab053a283cfac8fe0a0b8eaae044768a4c4"
+
+[[package]]
+name = "wyz"
+version = "0.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "05f360fc0b24296329c78fda852a1e9ae82de9cf7b27dae4b7f62f118f77b9ed"
+dependencies = [
+ "tap",
+]
+
+[[package]]
+name = "xattr"
+version = "1.6.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "32e45ad4206f6d2479085147f02bc2ef834ac85886624a23575ae137c8aa8156"
+dependencies = [
+ "libc",
+ "rustix",
+]
+
+[[package]]
+name = "yoke"
+version = "0.8.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "abe8c5fda708d9ca3df187cae8bfb9ceda00dd96231bed36e445a1a48e66f9ca"
+dependencies = [
+ "stable_deref_trait",
+ "yoke-derive",
+ "zerofrom",
+]
+
+[[package]]
+name = "yoke-derive"
+version = "0.8.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "de844c262c8848816172cef550288e7dc6c7b7814b4ee56b3e1553f275f1858e"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+ "synstructure",
+]
+
+[[package]]
+name = "zerocopy"
+version = "0.8.48"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "eed437bf9d6692032087e337407a86f04cd8d6a16a37199ed57949d415bd68e9"
+dependencies = [
+ "zerocopy-derive",
+]
+
+[[package]]
+name = "zerocopy-derive"
+version = "0.8.48"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "70e3cd084b1788766f53af483dd21f93881ff30d7320490ec3ef7526d203bad4"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "zerofrom"
+version = "0.1.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "69faa1f2a1ea75661980b013019ed6687ed0e83d069bc1114e2cc74c6c04c4df"
+dependencies = [
+ "zerofrom-derive",
+]
+
+[[package]]
+name = "zerofrom-derive"
+version = "0.1.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "11532158c46691caf0f2593ea8358fed6bbf68a0315e80aae9bd41fbade684a1"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+ "synstructure",
+]
+
+[[package]]
+name = "zeroize"
+version = "1.8.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0"
+
+[[package]]
+name = "zerotrie"
+version = "0.2.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0f9152d31db0792fa83f70fb2f83148effb5c1f5b8c7686c3459e361d9bc20bf"
+dependencies = [
+ "displaydoc",
+ "yoke",
+ "zerofrom",
+]
+
+[[package]]
+name = "zerovec"
+version = "0.11.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "90f911cbc359ab6af17377d242225f4d75119aec87ea711a880987b18cd7b239"
+dependencies = [
+ "yoke",
+ "zerofrom",
+ "zerovec-derive",
+]
+
+[[package]]
+name = "zerovec-derive"
+version = "0.11.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "625dc425cab0dca6dc3c3319506e6593dcb08a9f387ea3b284dbd52a92c40555"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "zmij"
+version = "1.0.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa"
+
+[[package]]
+name = "zstd"
+version = "0.13.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e91ee311a569c327171651566e07972200e76fcfe2242a4fa446149a3881c08a"
+dependencies = [
+ "zstd-safe",
+]
+
+[[package]]
+name = "zstd-safe"
+version = "7.2.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8f49c4d5f0abb602a93fb8736af2a4f4dd9512e36f7f570d66e65ff867ed3b9d"
+dependencies = [
+ "zstd-sys",
+]
+
+[[package]]
+name = "zstd-sys"
+version = "2.0.16+zstd.1.5.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "91e19ebc2adc8f83e43039e79776e3fda8ca919132d68a1fed6a5faca2683748"
+dependencies = [
+ "cc",
+ "pkg-config",
+]
diff --git a/fluss-rust/Cargo.toml b/fluss-rust/Cargo.toml
new file mode 100644
index 0000000000..a555a9198a
--- /dev/null
+++ b/fluss-rust/Cargo.toml
@@ -0,0 +1,43 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+[workspace.package]  # package metadata declared once for the whole workspace; members inherit a key with `<key>.workspace = true`
+authors = ["Apache Fluss <dev@fluss.apache.org>"]
+categories = ["api-bindings", "database"]
+edition = "2024"  # the 2024 edition needs Rust 1.85 or newer, matching rust-version below
+homepage = "https://clients.fluss.apache.org/"
+license = "Apache-2.0"
+repository = "https://github.com/apache/fluss-rust"
+rust-version = "1.85"  # minimum supported Rust version
+version = "1.0.0"
+keywords = ["fluss", "streaming-storage", "datalake"]
+
+[workspace]  # virtual workspace: this manifest has no [package] table of its own
+resolver = "2"  # set explicitly because a virtual workspace has no package edition to derive a resolver from
+members = ["crates/fluss", "crates/fluss-test-cluster", "crates/examples", "bindings/python", "bindings/cpp", "bindings/elixir/native/fluss_nif"]
+
+[workspace.dependencies]  # dependency requirements declared once; members opt in with `<dep>.workspace = true`
+fluss = { package = "fluss-rs", version = "1.0.0", path = "crates/fluss", features = ["storage-all"] }  # key `fluss` refers to the local crate named fluss-rs
+tokio = { version = "1.44.2", features = ["full"] }
+clap = { version = "4.5.37", features = ["derive"] }
+arrow = { version = "57.0.0", features = ["ipc_compression", "ffi"] }
+bigdecimal = "0.4"
+serde = { version = "1.0", features = ["derive"] }
+serde_json = "1.0"
+metrics = "0.24"
+opendal = "0.53"  # NOTE(review): DEPENDENCIES.rust.tsv records opendal@0.55.0, which "0.53" does not admit - confirm members still use this entry
+jiff = { version = "0.2" }
diff --git a/fluss-rust/DEPENDENCIES.rust.tsv b/fluss-rust/DEPENDENCIES.rust.tsv
new file mode 100644
index 0000000000..b46eeac210
--- /dev/null
+++ b/fluss-rust/DEPENDENCIES.rust.tsv
@@ -0,0 +1,325 @@
+crate	Apache-2.0	Apache-2.0 WITH LLVM-exception	BSD-2-Clause	BSD-3-Clause	BSL-1.0	CC0-1.0	CDLA-Permissive-2.0	ISC	LGPL-2.1-or-later	MIT	Unicode-3.0	Unlicense	Zlib
+ahash@0.8.12	X									X			
+aho-corasick@1.1.4										X		X	
+android_system_properties@0.1.5	X									X			
+anstream@1.0.0	X									X			
+anstyle@1.0.14	X									X			
+anstyle-parse@1.0.0	X									X			
+anstyle-query@1.1.5	X									X			
+anstyle-wincon@3.0.11	X									X			
+anyhow@1.0.102	X									X			
+arrow@57.3.0	X												
+arrow-arith@57.3.0	X												
+arrow-array@57.3.0	X												
+arrow-buffer@57.3.0	X												
+arrow-cast@57.3.0	X												
+arrow-csv@57.3.0	X												
+arrow-data@57.3.0	X												
+arrow-ipc@57.3.0	X												
+arrow-json@57.3.0	X												
+arrow-ord@57.3.0	X												
+arrow-pyarrow@57.3.0	X												
+arrow-row@57.3.0	X												
+arrow-schema@57.3.0	X												
+arrow-select@57.3.0	X												
+arrow-string@57.3.0	X												
+async-trait@0.1.89	X									X			
+atoi@2.0.0										X			
+atomic-waker@1.1.2	X									X			
+autocfg@1.5.0	X									X			
+backon@1.6.0	X												
+base64@0.22.1	X									X			
+bigdecimal@0.4.10	X									X			
+bitflags@2.11.0	X									X			
+bitvec@1.0.1										X			
+block-buffer@0.10.4	X									X			
+bumpalo@3.20.2	X									X			
+byteorder@1.5.0										X		X	
+bytes@1.11.1										X			
+cc@1.2.57	X									X			
+cfg-if@1.0.4	X									X			
+chrono@0.4.44	X									X			
+clap@4.6.0	X									X			
+clap_builder@4.6.0	X									X			
+clap_derive@4.6.0	X									X			
+clap_lex@1.1.0	X									X			
+codespan-reporting@0.13.1	X												
+colorchoice@1.0.5	X									X			
+const-oid@0.9.6	X									X			
+const-random@0.1.18	X									X			
+const-random-macro@0.1.16	X									X			
+core-foundation-sys@0.8.7	X									X			
+cpufeatures@0.2.17	X									X			
+crc32c@0.6.8	X									X			
+crossbeam-utils@0.8.21	X									X			
+crunchy@0.2.4										X			
+crypto-common@0.1.7	X									X			
+csv@1.4.0										X		X	
+csv-core@0.1.13										X		X	
+cxx@1.0.194	X									X			
+cxx-build@1.0.194	X									X			
+cxxbridge-flags@1.0.194	X									X			
+cxxbridge-macro@1.0.194	X									X			
+dashmap@6.1.0										X			
+delegate@0.13.5	X									X			
+digest@0.10.7	X									X			
+displaydoc@0.2.5	X									X			
+either@1.15.0	X									X			
+equivalent@1.0.2	X									X			
+errno@0.3.14	X									X			
+fastrand@2.3.0	X									X			
+find-msvc-tools@0.1.9	X									X			
+fixedbitset@0.5.7	X									X			
+flatbuffers@25.12.19	X												
+fluss-cpp@0.1.0	X												
+fluss-examples@0.1.0	X												
+fluss-rs@0.1.0	X												
+fluss_python@0.1.0	X												
+fnv@1.0.7	X									X			
+foldhash@0.1.5													X
+foldhash@0.2.0													X
+form_urlencoded@1.2.2	X									X			
+funty@2.0.0										X			
+futures@0.3.32	X									X			
+futures-channel@0.3.32	X									X			
+futures-core@0.3.32	X									X			
+futures-executor@0.3.32	X									X			
+futures-io@0.3.32	X									X			
+futures-macro@0.3.32	X									X			
+futures-sink@0.3.32	X									X			
+futures-task@0.3.32	X									X			
+futures-util@0.3.32	X									X			
+generic-array@0.14.7										X			
+getrandom@0.2.17	X									X			
+getrandom@0.3.4	X									X			
+getrandom@0.4.2	X									X			
+gloo-timers@0.3.0	X									X			
+h2@0.4.13										X			
+half@2.7.1	X									X			
+hashbrown@0.14.5	X									X			
+hashbrown@0.15.5	X									X			
+hashbrown@0.16.1	X									X			
+heck@0.5.0	X									X			
+hex@0.4.3	X									X			
+hmac@0.12.1	X									X			
+home@0.5.12	X									X			
+http@1.4.0	X									X			
+http-body@1.0.1										X			
+http-body-util@0.1.3										X			
+httparse@1.10.1	X									X			
+httpdate@1.0.3	X									X			
+hyper@1.8.1										X			
+hyper-rustls@0.27.7	X							X		X			
+hyper-util@0.1.20										X			
+iana-time-zone@0.1.65	X									X			
+iana-time-zone-haiku@0.1.2	X									X			
+icu_collections@2.1.1											X		
+icu_locale_core@2.1.1											X		
+icu_normalizer@2.1.1											X		
+icu_normalizer_data@2.1.1											X		
+icu_properties@2.1.2											X		
+icu_properties_data@2.1.2											X		
+icu_provider@2.1.1											X		
+idna@1.1.0	X									X			
+idna_adapter@1.2.1	X									X			
+indexmap@2.13.0	X									X			
+indoc@2.0.7	X									X			
+ipnet@2.12.0	X									X			
+iri-string@0.7.11	X									X			
+is_terminal_polyfill@1.70.2	X									X			
+itertools@0.14.0	X									X			
+itoa@1.0.18	X									X			
+jiff@0.2.23										X		X	
+jiff-tzdb@0.1.6										X		X	
+jiff-tzdb-platform@0.1.3										X		X	
+jobserver@0.1.34	X									X			
+js-sys@0.3.91	X									X			
+lexical-core@1.0.6	X									X			
+lexical-parse-float@1.0.6	X									X			
+lexical-parse-integer@1.0.6	X									X			
+lexical-util@1.0.7	X									X			
+lexical-write-float@1.0.6	X									X			
+lexical-write-integer@1.0.6	X									X			
+libc@0.2.183	X									X			
+libm@0.2.16										X			
+link-cplusplus@1.0.12	X									X			
+linked-hash-map@0.5.6	X									X			
+linux-raw-sys@0.12.1	X	X								X			
+litemap@0.8.1											X		
+lock_api@0.4.14	X									X			
+log@0.4.29	X									X			
+lz4_flex@0.12.1										X			
+md-5@0.10.6	X									X			
+memchr@2.8.0										X		X	
+memoffset@0.9.1										X			
+mio@1.1.1										X			
+multimap@0.10.1	X									X			
+num-bigint@0.4.6	X									X			
+num-complex@0.4.6	X									X			
+num-integer@0.1.46	X									X			
+num-traits@0.2.19	X									X			
+once_cell@1.21.4	X									X			
+once_cell_polyfill@1.70.2	X									X			
+opendal@0.55.0	X												
+ordered-float@5.1.0										X			
+parking_lot@0.12.5	X									X			
+parking_lot_core@0.9.12	X									X			
+parse-display@0.10.0	X									X			
+parse-display-derive@0.10.0	X									X			
+percent-encoding@2.3.2	X									X			
+petgraph@0.8.3	X									X			
+pin-project-lite@0.2.17	X									X			
+pin-utils@0.1.0	X									X			
+pkg-config@0.3.32	X									X			
+portable-atomic@1.13.1	X									X			
+portable-atomic-util@0.2.6	X									X			
+potential_utf@0.1.4											X		
+ppv-lite86@0.2.21	X									X			
+prettyplease@0.2.37	X									X			
+proc-macro2@1.0.106	X									X			
+prost@0.14.3	X												
+prost-build@0.14.3	X												
+prost-derive@0.14.3	X												
+prost-types@0.14.3	X												
+pyo3@0.26.0	X									X			
+pyo3-async-runtimes@0.26.0	X												
+pyo3-build-config@0.26.0	X									X			
+pyo3-ffi@0.26.0	X									X			
+pyo3-macros@0.26.0	X									X			
+pyo3-macros-backend@0.26.0	X									X			
+python3-dll-a@0.2.14										X			
+quick-xml@0.37.5										X			
+quick-xml@0.38.4										X			
+quote@1.0.45	X									X			
+r-efi@5.3.0	X								X	X			
+r-efi@6.0.0	X								X	X			
+radium@0.7.0										X			
+rand@0.8.5	X									X			
+rand@0.9.2	X									X			
+rand_chacha@0.3.1	X									X			
+rand_chacha@0.9.0	X									X			
+rand_core@0.6.4	X									X			
+rand_core@0.9.5	X									X			
+redox_syscall@0.5.18										X			
+regex@1.12.3	X									X			
+regex-automata@0.4.14	X									X			
+regex-syntax@0.8.10	X									X			
+reqsign@0.16.5	X												
+reqwest@0.12.28	X									X			
+ring@0.17.14	X							X					
+rustc_version@0.4.1	X									X			
+rustix@1.1.4	X	X								X			
+rustls@0.23.37	X							X		X			
+rustls-pki-types@1.14.0	X									X			
+rustls-webpki@0.103.10								X					
+rustversion@1.0.22	X									X			
+ryu@1.0.23	X				X								
+scopeguard@1.2.0	X									X			
+scratch@1.0.9	X									X			
+semver@1.0.27	X									X			
+serde@1.0.228	X									X			
+serde_core@1.0.228	X									X			
+serde_derive@1.0.228	X									X			
+serde_json@1.0.149	X									X			
+serde_urlencoded@0.7.1	X									X			
+sha1@0.10.6	X									X			
+sha2@0.10.9	X									X			
+shlex@1.3.0	X									X			
+signal-hook-registry@1.4.8	X									X			
+simdutf8@0.1.5	X									X			
+slab@0.4.12										X			
+smallvec@1.15.1	X									X			
+snafu@0.8.9	X									X			
+snafu-derive@0.8.9	X									X			
+socket2@0.6.3	X									X			
+stable_deref_trait@1.2.1	X									X			
+strsim@0.11.1										X			
+structmeta@0.3.0	X									X			
+structmeta-derive@0.3.0	X									X			
+strum@0.26.3										X			
+strum_macros@0.26.4										X			
+subtle@2.6.1				X									
+syn@2.0.117	X									X			
+sync_wrapper@1.0.2	X												
+synstructure@0.13.2										X			
+tap@1.0.1										X			
+target-lexicon@0.13.5		X											
+tempfile@3.27.0	X									X			
+termcolor@1.4.1										X		X	
+thiserror@1.0.69	X									X			
+thiserror-impl@1.0.69	X									X			
+tikv-jemalloc-sys@0.6.1+5.3.0-1-ge13ca993e8ccb9ba9847cc330696e02839f328f7	X									X			
+tikv-jemallocator@0.6.1	X									X			
+tiny-keccak@2.0.2						X							
+tinystr@0.8.2											X		
+tokio@1.50.0										X			
+tokio-macros@2.6.1										X			
+tokio-rustls@0.26.4	X									X			
+tokio-util@0.7.18										X			
+tower@0.5.3										X			
+tower-http@0.6.8										X			
+tower-layer@0.3.3										X			
+tower-service@0.3.3										X			
+tracing@0.1.44										X			
+tracing-attributes@0.1.31										X			
+tracing-core@0.1.36										X			
+try-lock@0.2.5										X			
+twox-hash@2.1.2										X			
+typenum@1.19.0	X									X			
+unicode-ident@1.0.24	X									X	X		
+unicode-width@0.2.2	X									X			
+unindent@0.2.4	X									X			
+untrusted@0.9.0								X					
+url@2.5.8	X									X			
+utf8_iter@1.0.4	X									X			
+utf8parse@0.2.2	X									X			
+uuid@1.22.0	X									X			
+value-bag@1.12.0	X									X			
+version_check@0.9.5	X									X			
+want@0.3.1										X			
+wasi@0.11.1+wasi-snapshot-preview1	X	X								X			
+wasip2@1.0.2+wasi-0.2.9	X	X								X			
+wasip3@0.4.0+wasi-0.3.0-rc-2026-01-06	X	X								X			
+wasm-bindgen@0.2.114	X									X			
+wasm-bindgen-futures@0.4.64	X									X			
+wasm-bindgen-macro@0.2.114	X									X			
+wasm-bindgen-macro-support@0.2.114	X									X			
+wasm-bindgen-shared@0.2.114	X									X			
+wasm-streams@0.4.2	X									X			
+web-sys@0.3.91	X									X			
+webpki-roots@1.0.6							X						
+winapi-util@0.1.11										X		X	
+windows-core@0.62.2	X									X			
+windows-implement@0.60.2	X									X			
+windows-interface@0.59.3	X									X			
+windows-link@0.2.1	X									X			
+windows-result@0.4.1	X									X			
+windows-strings@0.5.1	X									X			
+windows-sys@0.52.0	X									X			
+windows-sys@0.61.2	X									X			
+windows-targets@0.52.6	X									X			
+windows_aarch64_gnullvm@0.52.6	X									X			
+windows_aarch64_msvc@0.52.6	X									X			
+windows_i686_gnu@0.52.6	X									X			
+windows_i686_gnullvm@0.52.6	X									X			
+windows_i686_msvc@0.52.6	X									X			
+windows_x86_64_gnu@0.52.6	X									X			
+windows_x86_64_gnullvm@0.52.6	X									X			
+windows_x86_64_msvc@0.52.6	X									X			
+wit-bindgen@0.51.0	X	X								X			
+writeable@0.6.2											X		
+wyz@0.5.1										X			
+yoke@0.8.1											X		
+yoke-derive@0.8.1											X		
+zerocopy@0.8.47	X		X							X			
+zerocopy-derive@0.8.47	X		X							X			
+zerofrom@0.1.6											X		
+zerofrom-derive@0.1.6											X		
+zeroize@1.8.2	X									X			
+zerotrie@0.2.3											X		
+zerovec@0.11.5											X		
+zerovec-derive@0.11.2											X		
+zmij@1.0.21										X			
+zstd@0.13.3										X			
+zstd-safe@7.2.4	X									X			
+zstd-sys@2.0.16+zstd.1.5.7	X									X			
diff --git a/fluss-rust/DEVELOPMENT.md b/fluss-rust/DEVELOPMENT.md
new file mode 100644
index 0000000000..a1180d6f6c
--- /dev/null
+++ b/fluss-rust/DEVELOPMENT.md
@@ -0,0 +1,106 @@
+# Development Guide
+
+Welcome to the development guide of `fluss-rust`! This project builds the `fluss-rust` client and its language-specific bindings.
+
+## Pre-requisites
+
+- protobuf
+- rust
+
+You can install these using your favourite package / version manager. Example installation using mise:
+
+```bash
+mise install protobuf
+mise install rust
+```
+
+## IDE Setup
+
+We recommend [RustRover](https://www.jetbrains.com/rust/) IDE to work with fluss-rust code base.
+
+### Importing fluss-rust
+
+1. On your terminal, clone fluss-rust project from GitHub
+   ```bash
+   git clone https://github.com/apache/fluss-rust.git
+   ```
+1. Open RustRover, on `Projects` tab, click `Open` and navigate to the root directory of fluss-rust
+1. Click `Open`
+
+### Copyright Profile
+
+Fluss and Fluss-rust are Apache projects, and as such every file needs to have an Apache license header. This can be automated in RustRover by adding a Copyright profile:
+
+1. Go to `Settings` -> `Editor` -> `Copyright` -> `Copyright Profiles`.
+1. Add a new profile and name it `Apache`.
+1. Add the following text as the license text:
+   ```
+   Licensed to the Apache Software Foundation (ASF) under one
+   or more contributor license agreements.  See the NOTICE file
+   distributed with this work for additional information
+   regarding copyright ownership.  The ASF licenses this file
+   to you under the Apache License, Version 2.0 (the
+   "License"); you may not use this file except in compliance
+   with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied.  See the License for the
+   specific language governing permissions and limitations
+   under the License.
+   ```
+1. Go to `Editor` -> `Copyright` and choose the `Apache` profile as the default profile for this project.
+1. Click `Apply`
+
+We also use line comment formatting for licence headers. 
+1. Go to `Editor` -> `Copyright` -> `Formatting` -> `Rust`
+1. Choose `Use custom formatting`  
+1. Choose `Use line comment`
+
+## Project directories
+
+Source files are organized in the following manner
+
+1. `crates/fluss` - fluss rust client crate source
+1. `crates/examples` - fluss rust client examples
+1. `bindings` - bindings to other languages e.g. C++ under `bindings/cpp` and Python under `bindings/python`
+
+
+## Building & Testing
+
+See [quickstart](README.md#quick-start) for steps to run example code.
+
+Running all unit tests for fluss rust client: 
+
+```bash
+cargo test --workspace
+```
+
+Running all integration test cases:
+
+```bash
+cargo test --features integration_tests --workspace
+```
+
+
+### License check (cargo-deny)
+
+We use [cargo-deny](https://embarkstudios.github.io/cargo-deny/) to ensure all dependency licenses are Apache-compatible. When present, configuration lives in a `deny.toml` file at the repo root and should enforce an Apache-compatible license policy.
+
+```bash
+cargo install cargo-deny --locked
+cargo deny check licenses
+```
+
+### Formatting and Clippy
+
+Our CI runs cargo formatting and clippy to help keep the code base styling tidy and readable. Run the following commands and address any errors or warnings to ensure that your PR can complete CI successfully.
+
+```bash
+cargo fmt --all
+cargo clippy --all-targets --fix --allow-dirty --allow-staged
+```
+
diff --git a/fluss-rust/MODULE.bazel b/fluss-rust/MODULE.bazel
new file mode 100644
index 0000000000..f0e6025073
--- /dev/null
+++ b/fluss-rust/MODULE.bazel
@@ -0,0 +1,65 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Required at repository root for root module mode (`bazel_dep(name = "fluss-cpp", ...)`).
+# Consumer examples use `local_path_override(..., path = "/path/to/fluss-rust")`, so
+# Bazel resolves the module from the repository root. This also matches the Rust
+# workspace layout used by `bindings/cpp` during cargo-based Bazel/CMake builds.
+# `0.0.0` is a local-development placeholder in this repository branch.
+# Consumers should depend on a published release version.
+module(  # declares this repository as the Bazel module `fluss-cpp`
+    name = "fluss-cpp",
+    version = "0.0.0",
+)
+
+bazel_dep(name = "rules_cc", version = "0.0.17")
+bazel_dep(name = "platforms", version = "0.0.10")
+bazel_dep(name = "rules_foreign_cc", version = "0.15.1")
+bazel_dep(name = "rules_python", version = "1.2.0")
+
+python = use_extension("@rules_python//python/extensions:python.bzl", "python")
+python.toolchain(python_version = "3.12")  # requests a Python 3.12 toolchain from rules_python
+use_repo(python, "python_3_12")  # imported so it can be registered as @python_3_12 below
+
+foreign_cc_tools = use_extension("@rules_foreign_cc//foreign_cc:extensions.bzl", "tools")
+use_repo(  # repos produced by the rules_foreign_cc `tools` extension; the *_toolchains ones are registered below
+    foreign_cc_tools,
+    "cmake_3.31.8_toolchains",
+    "cmake_src",
+    "ninja_1.13.0_toolchains",
+    "ninja_build_src",
+    "rules_foreign_cc_framework_toolchains",
+)
+
+register_toolchains(  # registers every toolchain target (`//:all`) in the repos imported above
+    "@rules_foreign_cc_framework_toolchains//:all",
+    "@cmake_3.31.8_toolchains//:all",
+    "@ninja_1.13.0_toolchains//:all",
+    "@python_3_12//:all",
+    "@rules_foreign_cc//toolchains:all",
+)
+
+cpp_sdk = use_extension("//bindings/cpp/bazel/cpp:deps.bzl", "cpp_sdk")  # extension defined inside this repository
+cpp_sdk.config(
+    mode = "build",  # NOTE(review): presumably builds the C++ dependencies from source - confirm in deps.bzl
+    arrow_cpp_version = "19.0.1",
+    protobuf_version = "3.25.5",
+    ep_cmake_ranlib = "/usr/bin/ranlib",  # absolute host paths: assumes ranlib/ar/nm are installed under /usr/bin
+    ep_cmake_ar = "/usr/bin/ar",
+    ep_cmake_nm = "/usr/bin/nm",
+)
+use_repo(cpp_sdk, "apache_arrow_cpp")  # exposes the extension's Arrow C++ repo as @apache_arrow_cpp
diff --git a/fluss-rust/README.md b/fluss-rust/README.md
new file mode 100644
index 0000000000..a88ec2f3ae
--- /dev/null
+++ b/fluss-rust/README.md
@@ -0,0 +1,125 @@
+# Apache Fluss™ Rust (Incubating)
+
+![Experimental](https://img.shields.io/badge/status-experimental-orange)
+
+Rust implementation of [Apache Fluss™](https://fluss.apache.org/).
+
+
+## Why Fluss?
+[Fluss](https://fluss.apache.org/) is a streaming storage built for real-time analytics which can serve as the real-time data layer for Lakehouse architectures.
+It bridges the gap between streaming data and the data Lakehouse by enabling low-latency, high-throughput data ingestion and processing while seamlessly integrating with popular compute engines.
+
+## Why Fluss Rust Client
+It's an official Rust client for interacting with Fluss. This client provides foundational capabilities for table management and log streaming operations, enabling developers to explore Fluss within Rust ecosystems.
+
+## Quick-Start
+
+### Step1 Start Fluss cluster
+#### Requirements
+Fluss runs on all UNIX-like environments, e.g. Linux, Mac OS X. Before you start to setup the system, make sure you have the following software installed on your test machine:
+
+Java 17 or higher (Java 8 and Java 11 are not recommended).
+If your cluster does not fulfill these software requirements you will need to install/upgrade it.
+
+Fluss requires the JAVA_HOME environment variable to be set on all nodes and point to the directory of your Java installation.
+
+#### Fluss Setup
+Go to the [downloads](https://fluss.apache.org/downloads/) page and download the latest Fluss release (currently 0.8.0). Make sure to pick the Fluss package matching your Java version. After downloading the latest release, extract it:
+```shell
+tar -xzf fluss-0.8.0-incubating-bin.tgz
+cd fluss-0.8.0-incubating/
+```
+You can start Fluss local cluster by running the following command:
+```shell
+./bin/local-cluster.sh start
+```
+After that, the Fluss local cluster is started.
+
+### Run Provided Example
+Only Linux and macOS are supported. You will need to [install Rust](https://www.rust-lang.org/tools/install) first.
+
+After that, go to the project directory, build it and run the example:
+```shell
+cargo build --example example-table --release
+cd target/release/examples
+./example-table
+```
+The example code is as follows:
+```rust
+#[tokio::main]
+pub async fn main() -> Result<()> {
+    // 1: connect to the local cluster and create the table
+    let mut args = Args::default();
+    args.bootstrap_servers = "127.0.0.1:9123".to_string();
+    let conn_config = ConnectionConfig::from_args(args);
+    let conn = FlussConnection::new(conn_config).await;
+
+    let admin = conn.get_admin();
+
+    let table_descriptor = TableDescriptor::builder()
+        .schema(
+            Schema::builder()
+                .column("c1", DataTypes::int())
+                .column("c2", DataTypes::string())
+                .build(),
+        )
+        .build();
+
+    let table_path = TablePath::new("fluss".to_owned(), "rust_test".to_owned());
+
+    admin
+        .create_table(&table_path, &table_descriptor, true) // `true`: presumably ignore-if-exists - confirm against the Admin API
+        .await
+        .unwrap();
+
+    // 2: read back the metadata of the table that was just created
+    let table_info = admin.get_table_info(&table_path).await.unwrap();
+    print!("Get created table:\n {}\n", table_info);
+
+    // wait 2 seconds for the leader to be ready; the async sleep yields instead of blocking the Tokio runtime thread
+    tokio::time::sleep(Duration::from_secs(2)).await;
+
+    // 3: append one batch of six rows to the table and flush it
+    let table = conn.get_table(&table_path).await;
+    let append_writer = table.new_append().create_writer();
+    let batch = record_batch!(("c1", Int32, [1, 2, 3, 4, 5, 6]), ("c2", Utf8, ["a1", "a2", "a3", "a4", "a5", "a6"])).unwrap();
+    append_writer.append(batch)?;
+    append_writer.flush().await?;
+    println!("Start to scan log records......");
+    // 4: scan the records back with a log scanner
+    let log_scanner = table.new_scan().create_log_scanner();
+    log_scanner.subscribe(0, 0).await; // presumably (bucket, start offset) - confirm against the LogScanner API
+
+    loop { // polls forever; stop the example with Ctrl-C
+        let scan_records = log_scanner.poll(Duration::from_secs(10)).await?;
+        println!("Start to poll records......");
+        for record in scan_records {
+            let row = record.row();
+            println!(
+                "{{{}, {}}}@{}",
+                row.get_int(0),
+                row.get_string(1),
+                record.offset()
+            );
+        }
+    }
+    Ok(()) // not reached: the poll loop above never exits
+}
+```
+
+You can change it according to your needs, have fun!
+
+#### Clear environment
+Then, stop your Fluss cluster. Go to your Fluss home, stop it via the following commands:
+```shell
+./bin/local-cluster.sh stop
+```
+
+## Documentation
+
+- [Development Guide](DEVELOPMENT.md) – Build, test, and contribute to fluss-rust.
+- [Release Guide](website/docs/release/create-release.md) – How to build, release, and sign official Fluss client packages (Rust, Python, C++).
+
+## License
+
+Licensed under the [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0)
\ No newline at end of file
diff --git a/fluss-rust/bindings/cpp/.bazelrc b/fluss-rust/bindings/cpp/.bazelrc
new file mode 100644
index 0000000000..ce7d81f82a
--- /dev/null
+++ b/fluss-rust/bindings/cpp/.bazelrc
@@ -0,0 +1,37 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Bazel configuration for fluss-rust C++ bindings
+
+# Enable BzlMod
+common --enable_bzlmod
+
+# Debug configuration (matches BUILD.bazel settings)
+build:debug --compilation_mode=dbg
+build:debug --copt=-g3
+build:debug --copt=-ggdb
+build:debug --copt=-O0
+build:debug --copt=-fno-omit-frame-pointer
+build:debug --copt=-DDEBUG
+build:debug --strip=never
+build:debug --linkopt=-g
+
+# Release configuration
+build:release --compilation_mode=opt
+build:release --copt=-O2
+build:release --copt=-DNDEBUG
+build:release --strip=always
diff --git a/fluss-rust/bindings/cpp/.clang-format b/fluss-rust/bindings/cpp/.clang-format
new file mode 100644
index 0000000000..1c31900ec4
--- /dev/null
+++ b/fluss-rust/bindings/cpp/.clang-format
@@ -0,0 +1,21 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+---
+BasedOnStyle: Google
+ColumnLimit: 100
+IndentWidth: 4
diff --git a/fluss-rust/bindings/cpp/.gitignore b/fluss-rust/bindings/cpp/.gitignore
new file mode 100644
index 0000000000..1f1632b95c
--- /dev/null
+++ b/fluss-rust/bindings/cpp/.gitignore
@@ -0,0 +1,27 @@
+build/
+cmake-build-*/
+CMakeFiles/
+.idea/
+*.o
+*.a
+*.so
+*.dylib
+
+# Bazel build outputs
+bazel-build/
+bazel-bin
+bazel-out
+bazel-testlogs
+bazel-cpp
+bazel-*
+MODULE.bazel.lock
+
+# Keep versioned Bazel consumer examples (name starts with bazel-).
+!examples/bazel-consumer/
+!examples/bazel-consumer/**
+# `build/` is ignored globally above; keep this fixture path visible.
+!examples/bazel-consumer/build/
+!examples/bazel-consumer/build/**
+examples/bazel-consumer/**/MODULE.bazel.lock
+examples/bazel-consumer/**/bazel-*
+examples/bazel-consumer/**/tmp.log
diff --git a/fluss-rust/bindings/cpp/BUILD.bazel b/fluss-rust/bindings/cpp/BUILD.bazel
new file mode 100644
index 0000000000..d247baf18c
--- /dev/null
+++ b/fluss-rust/bindings/cpp/BUILD.bazel
@@ -0,0 +1,436 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+licenses(["notice"])
+
+load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_import", "cc_library")
+
+# One config_setting per Bazel compilation mode (`-c dbg`, `-c fastbuild`,
+# `-c opt`). The select() expressions in this file key off these three labels.
+[
+    config_setting(
+        name = setting_name,
+        values = {"compilation_mode": mode},
+    )
+    for setting_name, mode in [
+        ("debug_mode", "dbg"),
+        ("fastbuild_mode", "fastbuild"),
+        ("release_mode", "opt"),
+    ]
+]
+
+# Shell prologue prepended to the `cmd` of the cargo genrules in this file.
+# It resolves the tools the Rust build needs and fails fast when one is missing:
+#   * cargo:  uses $CARGO when set (error if that path is not executable),
+#             otherwise looks `cargo` up on PATH; the result is CARGO_BIN.
+#   * protoc: uses $PROTOC when set (same executable check), otherwise looks
+#             `protoc` up on PATH; either way PROTOC ends up exported.
+# `$$` is the genrule escape for a literal `$` in the shell command.
+_PROTOC_SETUP_SNIPPET = """
+        set -e
+        if [ -n "$${CARGO:-}" ]; then
+            if [ ! -x "$$CARGO" ]; then
+                echo "Error: CARGO is set but not executable: $$CARGO" >&2
+                exit 1
+            fi
+            CARGO_BIN="$$CARGO"
+        else
+            CARGO_BIN=$$(command -v cargo || true)
+            if [ -z "$$CARGO_BIN" ]; then
+                echo "Error: cargo not found in PATH and CARGO is not set" >&2
+                exit 1
+            fi
+        fi
+        if [ -n "$${PROTOC:-}" ]; then
+            if [ ! -x "$$PROTOC" ]; then
+                echo "Error: PROTOC is set but not executable: $$PROTOC" >&2
+                exit 1
+            fi
+            export PROTOC
+        else
+            PROTOC_BIN=$$(command -v protoc || true)
+            if [ -z "$$PROTOC_BIN" ]; then
+                echo "Error: protoc not found in PATH and PROTOC is not set" >&2
+                exit 1
+            fi
+            export PROTOC="$$PROTOC_BIN"
+        fi
+"""
+
+# Builds the Rust static library and the cxx bridge sources with the cargo
+# debug profile, then copies the artifacts to this rule's outputs:
+#   rust_lib_debug.a            <- target/debug/libfluss_cpp.a
+#   rust_bridge_cc_debug.cc     <- target/cxxbridge/fluss-cpp/src/lib.rs.cc
+#   rust_bridge_h_debug.h       <- target/cxxbridge/fluss-cpp/src/lib.rs.h
+#   src/lib.rs_debug.h          <- the same lib.rs.h, second copy
+#   cxxbridge/rust/cxx_debug.h  <- target/cxxbridge/rust/cxx.h (symlink followed)
+# cargo runs in the real source tree (the Cargo.toml symlink is resolved), and
+# the Cargo workspace root is taken to be two directories above this package.
+# NOTE(review): `srcs` tracks only src/**/*.rs and Cargo.toml, so edits to other
+# cargo inputs (e.g. other workspace crates) will not re-run this action.
+genrule(
+    name = "cargo_build_debug",
+    srcs = glob([
+        "src/**/*.rs",
+        "Cargo.toml",
+    ]),
+    outs = [
+        "rust_lib_debug.a",
+        "rust_bridge_cc_debug.cc",
+        "rust_bridge_h_debug.h",
+        "src/lib.rs_debug.h",
+        "cxxbridge/rust/cxx_debug.h",
+    ],
+    cmd = _PROTOC_SETUP_SNIPPET + """
+        EXECROOT=$$(pwd)
+        OUTPUT_LIB=$(location rust_lib_debug.a)
+        OUTPUT_CC=$(location rust_bridge_cc_debug.cc)
+        OUTPUT_H=$(location rust_bridge_h_debug.h)
+        OUTPUT_SRC_H=$(location src/lib.rs_debug.h)
+        OUTPUT_CXX_H=$(location cxxbridge/rust/cxx_debug.h)
+        # Resolve real source path from sandbox symlink
+        SANDBOX_CARGO=$(location Cargo.toml)
+        REAL_CARGO=$$(readlink -f $$SANDBOX_CARGO 2>/dev/null || python3 -c "import os; print(os.path.realpath('$$SANDBOX_CARGO'))")
+        CARGO_DIR=$$(dirname $$REAL_CARGO)
+        # Find Cargo workspace root (fluss-rust directory, 2 levels up from bindings/cpp)
+        WORKSPACE_ROOT=$$(cd $$CARGO_DIR/../.. && pwd)
+        if [ ! -f $$WORKSPACE_ROOT/Cargo.toml ]; then
+            echo "Error: Cannot find workspace root Cargo.toml at $$WORKSPACE_ROOT" >&2
+            exit 1
+        fi
+        cd $$WORKSPACE_ROOT
+        "$$CARGO_BIN" build --manifest-path $$CARGO_DIR/Cargo.toml
+        CARGO_TARGET_DIR=$$WORKSPACE_ROOT/target
+        # cxxbridge uses the Cargo package name (with hyphen): fluss-cpp
+        RUST_BRIDGE_DIR=$$CARGO_TARGET_DIR/cxxbridge/fluss-cpp/src
+        # Cargo converts hyphens to underscores in library file names: libfluss_cpp.a
+        RUST_LIB=$$CARGO_TARGET_DIR/debug/libfluss_cpp.a
+        if [ ! -f $$RUST_LIB ]; then
+            echo "Error: Rust library not found at $$RUST_LIB" >&2
+            exit 1
+        fi
+        if [ ! -f $$RUST_BRIDGE_DIR/lib.rs.cc ]; then
+            echo "Error: cxxbridge CC file not found at $$RUST_BRIDGE_DIR/lib.rs.cc" >&2
+            exit 1
+        fi
+        if [ ! -f $$RUST_BRIDGE_DIR/lib.rs.h ]; then
+            echo "Error: cxxbridge header file not found at $$RUST_BRIDGE_DIR/lib.rs.h" >&2
+            exit 1
+        fi
+        cd $$EXECROOT
+        mkdir -p $$(dirname $$OUTPUT_SRC_H) $$(dirname $$OUTPUT_CXX_H)
+        cp $$RUST_LIB $$OUTPUT_LIB || (echo "Failed to copy $$RUST_LIB to $$OUTPUT_LIB" >&2; exit 1)
+        cp $$RUST_BRIDGE_DIR/lib.rs.cc $$OUTPUT_CC || (echo "Failed to copy $$RUST_BRIDGE_DIR/lib.rs.cc to $$OUTPUT_CC" >&2; exit 1)
+        cp $$RUST_BRIDGE_DIR/lib.rs.h $$OUTPUT_H || (echo "Failed to copy $$RUST_BRIDGE_DIR/lib.rs.h to $$OUTPUT_H" >&2; exit 1)
+        cp $$RUST_BRIDGE_DIR/lib.rs.h $$OUTPUT_SRC_H || (echo "Failed to copy $$RUST_BRIDGE_DIR/lib.rs.h to $$OUTPUT_SRC_H" >&2; exit 1)
+        CXX_H_SOURCE=$$CARGO_TARGET_DIR/cxxbridge/rust/cxx.h
+        if [ ! -f $$CXX_H_SOURCE ] && [ ! -L $$CXX_H_SOURCE ]; then
+            echo "Error: cxx.h not found at $$CXX_H_SOURCE" >&2
+            exit 1
+        fi
+        cp -L $$CXX_H_SOURCE $$OUTPUT_CXX_H || (echo "Failed to copy $$CXX_H_SOURCE to $$OUTPUT_CXX_H" >&2; exit 1)
+    """,
+    message = "Building Rust library (debug) with cargo...",
+    # Run with the local strategy (no sandbox): the command builds inside the
+    # real source tree and reads cargo's target/ directory there.
+    local = 1,
+)
+
+# Release-profile counterpart of the debug cargo genrule: runs
+# `cargo build --release` and copies the artifacts to this rule's outputs:
+#   rust_lib_release.a            <- target/release/libfluss_cpp.a
+#   rust_bridge_cc_release.cc     <- target/cxxbridge/fluss-cpp/src/lib.rs.cc
+#   rust_bridge_h_release.h       <- target/cxxbridge/fluss-cpp/src/lib.rs.h
+#   src/lib.rs_release.h          <- the same lib.rs.h, second copy
+#   cxxbridge/rust/cxx_release.h  <- target/cxxbridge/rust/cxx.h (symlink followed)
+# cargo runs in the real source tree (the Cargo.toml symlink is resolved), and
+# the Cargo workspace root is taken to be two directories above this package.
+# NOTE(review): `srcs` tracks only src/**/*.rs and Cargo.toml, so edits to other
+# cargo inputs (e.g. other workspace crates) will not re-run this action.
+genrule(
+    name = "cargo_build_release",
+    srcs = glob([
+        "src/**/*.rs",
+        "Cargo.toml",
+    ]),
+    outs = [
+        "rust_lib_release.a",
+        "rust_bridge_cc_release.cc",
+        "rust_bridge_h_release.h",
+        "src/lib.rs_release.h",
+        "cxxbridge/rust/cxx_release.h",
+    ],
+    cmd = _PROTOC_SETUP_SNIPPET + """
+        EXECROOT=$$(pwd)
+        OUTPUT_LIB=$(location rust_lib_release.a)
+        OUTPUT_CC=$(location rust_bridge_cc_release.cc)
+        OUTPUT_H=$(location rust_bridge_h_release.h)
+        OUTPUT_SRC_H=$(location src/lib.rs_release.h)
+        OUTPUT_CXX_H=$(location cxxbridge/rust/cxx_release.h)
+        # Resolve real source path from sandbox symlink
+        SANDBOX_CARGO=$(location Cargo.toml)
+        REAL_CARGO=$$(readlink -f $$SANDBOX_CARGO 2>/dev/null || python3 -c "import os; print(os.path.realpath('$$SANDBOX_CARGO'))")
+        CARGO_DIR=$$(dirname $$REAL_CARGO)
+        # Find Cargo workspace root (fluss-rust directory, 2 levels up from bindings/cpp)
+        WORKSPACE_ROOT=$$(cd $$CARGO_DIR/../.. && pwd)
+        if [ ! -f $$WORKSPACE_ROOT/Cargo.toml ]; then
+            echo "Error: Cannot find workspace root Cargo.toml at $$WORKSPACE_ROOT" >&2
+            exit 1
+        fi
+        cd $$WORKSPACE_ROOT
+        "$$CARGO_BIN" build --release --manifest-path $$CARGO_DIR/Cargo.toml
+        CARGO_TARGET_DIR=$$WORKSPACE_ROOT/target
+        # cxxbridge uses the Cargo package name (with hyphen): fluss-cpp
+        RUST_BRIDGE_DIR=$$CARGO_TARGET_DIR/cxxbridge/fluss-cpp/src
+        # Cargo converts hyphens to underscores in library file names: libfluss_cpp.a
+        RUST_LIB=$$CARGO_TARGET_DIR/release/libfluss_cpp.a
+        if [ ! -f $$RUST_LIB ]; then
+            echo "Error: Rust library not found at $$RUST_LIB" >&2
+            exit 1
+        fi
+        if [ ! -f $$RUST_BRIDGE_DIR/lib.rs.cc ]; then
+            echo "Error: cxxbridge CC file not found at $$RUST_BRIDGE_DIR/lib.rs.cc" >&2
+            exit 1
+        fi
+        if [ ! -f $$RUST_BRIDGE_DIR/lib.rs.h ]; then
+            echo "Error: cxxbridge header file not found at $$RUST_BRIDGE_DIR/lib.rs.h" >&2
+            exit 1
+        fi
+        cd $$EXECROOT
+        mkdir -p $$(dirname $$OUTPUT_SRC_H) $$(dirname $$OUTPUT_CXX_H)
+        cp $$RUST_LIB $$OUTPUT_LIB || (echo "Failed to copy $$RUST_LIB to $$OUTPUT_LIB" >&2; exit 1)
+        cp $$RUST_BRIDGE_DIR/lib.rs.cc $$OUTPUT_CC || (echo "Failed to copy $$RUST_BRIDGE_DIR/lib.rs.cc to $$OUTPUT_CC" >&2; exit 1)
+        cp $$RUST_BRIDGE_DIR/lib.rs.h $$OUTPUT_H || (echo "Failed to copy $$RUST_BRIDGE_DIR/lib.rs.h to $$OUTPUT_H" >&2; exit 1)
+        cp $$RUST_BRIDGE_DIR/lib.rs.h $$OUTPUT_SRC_H || (echo "Failed to copy $$RUST_BRIDGE_DIR/lib.rs.h to $$OUTPUT_SRC_H" >&2; exit 1)
+        CXX_H_SOURCE=$$CARGO_TARGET_DIR/cxxbridge/rust/cxx.h
+        if [ ! -f $$CXX_H_SOURCE ] && [ ! -L $$CXX_H_SOURCE ]; then
+            echo "Error: cxx.h not found at $$CXX_H_SOURCE" >&2
+            exit 1
+        fi
+        cp -L $$CXX_H_SOURCE $$OUTPUT_CXX_H || (echo "Failed to copy $$CXX_H_SOURCE to $$OUTPUT_CXX_H" >&2; exit 1)
+    """,
+    message = "Building Rust library (release) with cargo...",
+    # Run with the local strategy (no sandbox): the command builds inside the
+    # real source tree and reads cargo's target/ directory there.
+    local = 1,
+)
+
+# Picks the debug or release copy of lib.rs.h for the active compilation mode
+# (fastbuild shares the debug artifacts).
+filegroup(
+    name = "lib_rs_h_selected",
+    srcs = select({
+        ":debug_mode": [":src/lib.rs_debug.h"],
+        ":fastbuild_mode": [":src/lib.rs_debug.h"],
+        ":release_mode": [":src/lib.rs_release.h"],
+    }),
+)
+
+# Copies the selected header to the mode-independent path src/lib.rs.h.
+genrule(
+    name = "lib_rs_h_unified",
+    srcs = [":lib_rs_h_selected"],
+    outs = ["src/lib.rs.h"],
+    cmd = "cp $(location :lib_rs_h_selected) $(location src/lib.rs.h)",
+    message = "Unifying lib.rs.h for C++ includes",
+)
+
+# Picks the debug or release copy of the generated bridge source for the active
+# compilation mode (fastbuild shares the debug artifacts).
+filegroup(
+    name = "rust_bridge_cc_selected",
+    srcs = select({
+        ":debug_mode": [":rust_bridge_cc_debug.cc"],
+        ":fastbuild_mode": [":rust_bridge_cc_debug.cc"],
+        ":release_mode": [":rust_bridge_cc_release.cc"],
+    }),
+)
+
+# Copies the selected bridge source to the mode-independent name
+# rust_bridge_cc.cc.
+genrule(
+    name = "rust_bridge_cc_unified",
+    srcs = [":rust_bridge_cc_selected"],
+    outs = ["rust_bridge_cc.cc"],
+    cmd = "cp $(location :rust_bridge_cc_selected) $(location rust_bridge_cc.cc)",
+    message = "Unifying rust_bridge_cc.cc for C++ compilation",
+)
+
+# Picks the debug or release copy of the generated bridge header for the active
+# compilation mode (fastbuild shares the debug artifacts).
+filegroup(
+    name = "rust_bridge_h_selected",
+    srcs = select({
+        ":debug_mode": [":rust_bridge_h_debug.h"],
+        ":fastbuild_mode": [":rust_bridge_h_debug.h"],
+        ":release_mode": [":rust_bridge_h_release.h"],
+    }),
+)
+
+# Copies the selected bridge header to the mode-independent name
+# rust_bridge_h.h.
+genrule(
+    name = "rust_bridge_h_unified",
+    srcs = [":rust_bridge_h_selected"],
+    outs = ["rust_bridge_h.h"],
+    cmd = "cp $(location :rust_bridge_h_selected) $(location rust_bridge_h.h)",
+    message = "Unifying rust_bridge_h.h for C++ includes",
+)
+
+# Picks the debug or release copy of cxx.h for the active compilation mode
+# (fastbuild shares the debug artifacts).
+filegroup(
+    name = "cxx_h_selected",
+    srcs = select({
+        ":debug_mode": [":cxxbridge/rust/cxx_debug.h"],
+        ":fastbuild_mode": [":cxxbridge/rust/cxx_debug.h"],
+        ":release_mode": [":cxxbridge/rust/cxx_release.h"],
+    }),
+)
+
+# Copies the selected header to cxxbridge/rust/cxx.h, creating the output
+# directory first.
+genrule(
+    name = "cxx_h_unified",
+    srcs = [":cxx_h_selected"],
+    outs = ["cxxbridge/rust/cxx.h"],
+    cmd = "mkdir -p $$(dirname $(location cxxbridge/rust/cxx.h)) && cp $(location :cxx_h_selected) $(location cxxbridge/rust/cxx.h)",
+    message = "Unifying cxx.h for C++ includes",
+)
+
+# Wraps the cargo-built static archive as a C++ dependency; the archive is
+# chosen per compilation mode (fastbuild uses the debug archive).
+cc_import(
+    name = "rust_lib",
+    static_library = select({
+        ":debug_mode": ":rust_lib_debug.a",
+        ":fastbuild_mode": ":rust_lib_debug.a",
+        ":release_mode": ":rust_lib_release.a",
+    }),
+    # Link the whole archive into dependents, even objects nothing references.
+    alwayslink = True,
+)
+
+# C++ wrapper library around the Rust static library. The public API is
+# include/fluss.hpp (exposed without the `include/` prefix); the generated
+# bridge headers are textual headers that the sources pull in.
+cc_library(
+    name = "fluss_cpp",
+    srcs = [
+        "src/admin.cpp",
+        "src/connection.cpp",
+        "src/table.cpp",
+    ],
+    hdrs = [
+        "include/fluss.hpp",
+    ],
+    # Per-mode compiler flags on top of the common C++17 requirement.
+    copts = [
+        "-std=c++17",
+    ] + select({
+        ":debug_mode": [
+            "-g3",
+            "-O0",
+            "-ggdb",
+            "-fno-omit-frame-pointer",
+            "-DDEBUG",
+        ],
+        ":fastbuild_mode": [
+            "-g",
+            "-O0",
+        ],
+        ":release_mode": [
+            "-O2",
+            "-DNDEBUG",
+        ],
+    }),
+    # Extra include roots for the generated headers declared in this package
+    # (src/lib.rs.h and cxxbridge/rust/cxx.h).
+    includes = [
+        "src",
+        "cxxbridge",
+    ],
+    # System libraries, then per-mode debug info, then the macOS frameworks.
+    linkopts = [
+        "-ldl",
+        "-lpthread",
+    ] + select({
+        ":debug_mode": ["-g"],
+        ":fastbuild_mode": ["-g"],
+        ":release_mode": [],
+    }) + select({
+        "@platforms//os:macos": [
+            "-framework",
+            "CoreFoundation",
+            "-framework",
+            "Security",
+        ],
+        "//conditions:default": [],
+    }),
+    strip_include_prefix = "include",
+    textual_hdrs = [
+        "src/ffi_converter.hpp",
+        ":rust_bridge_h_unified",
+        ":lib_rs_h_unified",
+        ":cxx_h_unified",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":rust_lib",
+        "//bindings/cpp/bazel/cpp:arrow_cpp_dep",
+    ],
+)
+
+# Example program built from examples/example.cpp and linked against
+# :fluss_cpp; compiler and linker flags follow the compilation mode.
+cc_binary(
+    name = "fluss_cpp_example",
+    srcs = [
+        "examples/example.cpp",
+    ],
+    copts = [
+        "-std=c++17",
+    ] + select({
+        ":debug_mode": [
+            "-g3",
+            "-O0",
+            "-ggdb",
+            "-fno-omit-frame-pointer",
+            "-DDEBUG",
+        ],
+        ":fastbuild_mode": [
+            "-g",
+            "-O0",
+        ],
+        ":release_mode": [
+            "-O2",
+            "-DNDEBUG",
+        ],
+    }),
+    linkopts = select({
+        ":debug_mode": ["-g"],
+        ":fastbuild_mode": ["-g"],
+        ":release_mode": [],
+    }),
+    visibility = ["//visibility:public"],
+    deps = [":fluss_cpp"],
+)
+
+# Example program built from examples/admin_example.cpp and linked against
+# :fluss_cpp; compiler and linker flags follow the compilation mode.
+cc_binary(
+    name = "fluss_cpp_admin_example",
+    srcs = [
+        "examples/admin_example.cpp",
+    ],
+    copts = [
+        "-std=c++17",
+    ] + select({
+        ":debug_mode": [
+            "-g3",
+            "-O0",
+            "-ggdb",
+            "-fno-omit-frame-pointer",
+            "-DDEBUG",
+        ],
+        ":fastbuild_mode": [
+            "-g",
+            "-O0",
+        ],
+        ":release_mode": [
+            "-O2",
+            "-DNDEBUG",
+        ],
+    }),
+    linkopts = select({
+        ":debug_mode": ["-g"],
+        ":fastbuild_mode": ["-g"],
+        ":release_mode": [],
+    }),
+    visibility = ["//visibility:public"],
+    deps = [":fluss_cpp"],
+)
+
+# Example program built from examples/kv_example.cpp and linked against
+# :fluss_cpp; compiler and linker flags follow the compilation mode.
+cc_binary(
+    name = "fluss_cpp_kv_example",
+    srcs = [
+        "examples/kv_example.cpp",
+    ],
+    copts = [
+        "-std=c++17",
+    ] + select({
+        ":debug_mode": [
+            "-g3",
+            "-O0",
+            "-ggdb",
+            "-fno-omit-frame-pointer",
+            "-DDEBUG",
+        ],
+        ":fastbuild_mode": [
+            "-g",
+            "-O0",
+        ],
+        ":release_mode": [
+            "-O2",
+            "-DNDEBUG",
+        ],
+    }),
+    linkopts = select({
+        ":debug_mode": ["-g"],
+        ":fastbuild_mode": ["-g"],
+        ":release_mode": [],
+    }),
+    visibility = ["//visibility:public"],
+    deps = [":fluss_cpp"],
+)
diff --git a/fluss-rust/bindings/cpp/CMakeLists.txt b/fluss-rust/bindings/cpp/CMakeLists.txt
new file mode 100644
index 0000000000..44407ac860
--- /dev/null
+++ b/fluss-rust/bindings/cpp/CMakeLists.txt
@@ -0,0 +1,293 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+cmake_minimum_required(VERSION 3.22)
+
+# Opt in to the NEW behavior of policy CMP0135 when this CMake knows it.
+if (POLICY CMP0135)
+    cmake_policy(SET CMP0135 NEW)
+endif()
+
+# Only a C++ toolchain is enabled here; the Rust side is built by invoking cargo.
+project(fluss-cpp LANGUAGES CXX)
+
+# FetchContent provisions Arrow (build mode) and the test-only dependencies.
+include(FetchContent)
+set(FLUSS_GOOGLETEST_VERSION 1.15.2 CACHE STRING "version of GoogleTest")
+set(FLUSS_NLOHMANN_JSON_VERSION 3.12.0 CACHE STRING "version of nlohmann/json")
+# Emit compile_commands.json for tooling.
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+
+# "system" locates Arrow with find_package(); "build" fetches the Arrow source
+# archive configured below and builds it as part of this project.
+set(FLUSS_CPP_DEP_MODE "system" CACHE STRING "Dependency provisioning mode for fluss-cpp (system|build)")
+set_property(CACHE FLUSS_CPP_DEP_MODE PROPERTY STRINGS system build)
+# Baseline versions; a mismatch only produces a warning later in this file.
+set(FLUSS_CPP_ARROW_VERSION "19.0.1" CACHE STRING "Arrow C++ version baseline for fluss-cpp")
+set(FLUSS_CPP_PROTOBUF_VERSION "3.25.5" CACHE STRING "Protobuf/protoc version baseline for fluss-cpp")
+set(FLUSS_CPP_ARROW_SYSTEM_ROOT "" CACHE PATH "Optional Arrow installation prefix for system mode")
+# NOTE(review): the archive URL and checksum hard-code 19.0.1 rather than being
+# derived from FLUSS_CPP_ARROW_VERSION; keep all three in sync when bumping.
+set(FLUSS_CPP_ARROW_SOURCE_URL
+    "https://github.com/apache/arrow/archive/refs/tags/apache-arrow-19.0.1.tar.gz"
+    CACHE STRING
+    "Arrow source archive URL used in build mode")
+set(FLUSS_CPP_ARROW_SOURCE_SHA256
+    "4c898504958841cc86b6f8710ecb2919f96b5e10fa8989ac10ac4fca8362d86a"
+    CACHE STRING
+    "SHA256 for the Arrow source archive used in build mode")
+
+find_package(Threads REQUIRED)
+
+# Default to a Release build when the user did not pick a build type.
+if (NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE Release)
+endif()
+
+# C++17 is mandatory for every target in this project.
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+option(FLUSS_ENABLE_ADDRESS_SANITIZER "Enable address sanitizer" OFF)
+option(FLUSS_ENABLE_TESTING "Enable building test binary for fluss" OFF)
+option(FLUSS_DEV "Enable dev mode" OFF)
+
+# Dev mode is a shortcut that switches on both the sanitizers and the tests.
+if (FLUSS_DEV)
+    set(FLUSS_ENABLE_ADDRESS_SANITIZER ON)
+    set(FLUSS_ENABLE_TESTING ON)
+endif()
+
+# Reject anything other than the two supported dependency modes.
+if (NOT FLUSS_CPP_DEP_MODE STREQUAL "system" AND NOT FLUSS_CPP_DEP_MODE STREQUAL "build")
+    message(FATAL_ERROR "Unsupported FLUSS_CPP_DEP_MODE='${FLUSS_CPP_DEP_MODE}'. Expected 'system' or 'build'.")
+endif()
+
+# protoc is required: its path is handed to the cargo build through the PROTOC
+# environment variable.
+find_program(FLUSS_PROTOC_EXECUTABLE NAMES protoc)
+if (NOT FLUSS_PROTOC_EXECUTABLE)
+    message(FATAL_ERROR "protoc not found. Install protoc or set it in PATH. (Fluss baseline: ${FLUSS_CPP_PROTOBUF_VERSION})")
+endif()
+
+# Prefer the cargo binary named by $CARGO when that path exists; otherwise
+# search for `cargo`, using $CARGO's directory (if any) as a hint.
+if (DEFINED ENV{CARGO} AND NOT "$ENV{CARGO}" STREQUAL "" AND EXISTS "$ENV{CARGO}")
+    set(FLUSS_CARGO_EXECUTABLE "$ENV{CARGO}")
+else()
+    if (DEFINED ENV{CARGO} AND NOT "$ENV{CARGO}" STREQUAL "")
+        get_filename_component(_FLUSS_CARGO_HINT_DIR "$ENV{CARGO}" DIRECTORY)
+    endif()
+    find_program(FLUSS_CARGO_EXECUTABLE NAMES cargo HINTS "${_FLUSS_CARGO_HINT_DIR}")
+endif()
+if (NOT FLUSS_CARGO_EXECUTABLE)
+    message(FATAL_ERROR "cargo not found. Install Rust toolchain or set CARGO/PATH.")
+endif()
+
+# Compare the installed protoc against the baseline. The versions are compared
+# both as-is and with a leading "3." removed, so e.g. 3.25.5 and 25.5 count as
+# equal. A mismatch only warns; if no x.y.z version can be parsed from the
+# output, the check is skipped silently.
+execute_process(
+    COMMAND ${FLUSS_PROTOC_EXECUTABLE} --version
+    OUTPUT_VARIABLE FLUSS_PROTOC_VERSION_OUTPUT
+    OUTPUT_STRIP_TRAILING_WHITESPACE
+    ERROR_QUIET
+)
+string(REGEX MATCH "([0-9]+\\.[0-9]+\\.[0-9]+)" FLUSS_PROTOC_VERSION "${FLUSS_PROTOC_VERSION_OUTPUT}")
+set(FLUSS_PROTOC_VERSION_NORM "${FLUSS_PROTOC_VERSION}")
+set(FLUSS_CPP_PROTOBUF_VERSION_NORM "${FLUSS_CPP_PROTOBUF_VERSION}")
+string(REGEX REPLACE "^3\\." "" FLUSS_PROTOC_VERSION_NORM "${FLUSS_PROTOC_VERSION_NORM}")
+string(REGEX REPLACE "^3\\." "" FLUSS_CPP_PROTOBUF_VERSION_NORM "${FLUSS_CPP_PROTOBUF_VERSION_NORM}")
+if (FLUSS_PROTOC_VERSION AND
+    NOT FLUSS_PROTOC_VERSION VERSION_EQUAL FLUSS_CPP_PROTOBUF_VERSION AND
+    NOT FLUSS_PROTOC_VERSION_NORM VERSION_EQUAL FLUSS_CPP_PROTOBUF_VERSION_NORM)
+    message(WARNING
+        "protoc version (${FLUSS_PROTOC_VERSION}) does not match Fluss baseline "
+        "(${FLUSS_CPP_PROTOBUF_VERSION}). Build may still work, but this is outside the tested baseline.")
+endif()
+
+# Summarize the resolved configuration in the configure log.
+message(STATUS "Fluss C++ dependency mode: ${FLUSS_CPP_DEP_MODE}")
+message(STATUS "Fluss C++ protoc executable: ${FLUSS_PROTOC_EXECUTABLE} (${FLUSS_PROTOC_VERSION_OUTPUT})")
+message(STATUS "Fluss C++ cargo executable: ${FLUSS_CARGO_EXECUTABLE}")
+
+# Provide the Arrow::arrow_shared target, either from an installed Arrow
+# (system mode) or from a source build fetched at configure time (build mode).
+if (FLUSS_CPP_DEP_MODE STREQUAL "system")
+    # An optional user-supplied prefix is added to the package search locations.
+    if (FLUSS_CPP_ARROW_SYSTEM_ROOT)
+        list(APPEND CMAKE_PREFIX_PATH "${FLUSS_CPP_ARROW_SYSTEM_ROOT}")
+        set(Arrow_ROOT "${FLUSS_CPP_ARROW_SYSTEM_ROOT}")
+    endif()
+
+    find_package(Arrow REQUIRED)
+
+    # Version drift from the baseline is reported but tolerated.
+    if (DEFINED Arrow_VERSION AND Arrow_VERSION AND NOT Arrow_VERSION VERSION_EQUAL FLUSS_CPP_ARROW_VERSION)
+        message(WARNING
+            "Arrow version (${Arrow_VERSION}) does not match Fluss baseline "
+            "(${FLUSS_CPP_ARROW_VERSION}). Build may still work, but this is outside the tested baseline.")
+    endif()
+else()
+    # Build mode: provision Arrow C++ from source in-tree.
+    # Only the shared library with IPC support is enabled; every other
+    # component listed here is forced off.
+    set(ARROW_BUILD_SHARED ON CACHE BOOL "" FORCE)
+    set(ARROW_BUILD_STATIC OFF CACHE BOOL "" FORCE)
+    set(ARROW_BUILD_TESTS OFF CACHE BOOL "" FORCE)
+    set(ARROW_BUILD_EXAMPLES OFF CACHE BOOL "" FORCE)
+    set(ARROW_BUILD_BENCHMARKS OFF CACHE BOOL "" FORCE)
+    set(ARROW_BUILD_INTEGRATION OFF CACHE BOOL "" FORCE)
+    set(ARROW_BUILD_UTILITIES OFF CACHE BOOL "" FORCE)
+    set(ARROW_COMPUTE OFF CACHE BOOL "" FORCE)
+    set(ARROW_CSV OFF CACHE BOOL "" FORCE)
+    set(ARROW_DATASET OFF CACHE BOOL "" FORCE)
+    set(ARROW_FILESYSTEM OFF CACHE BOOL "" FORCE)
+    set(ARROW_JSON OFF CACHE BOOL "" FORCE)
+    set(ARROW_PARQUET OFF CACHE BOOL "" FORCE)
+    set(ARROW_IPC ON CACHE BOOL "" FORCE)
+    # Reduce third-party sub-build complexity in build mode.
+    set(ARROW_JEMALLOC OFF CACHE BOOL "" FORCE)
+    set(ARROW_MIMALLOC OFF CACHE BOOL "" FORCE)
+    set(ARROW_DEPENDENCY_SOURCE BUNDLED CACHE STRING "" FORCE)
+    set(ARROW_SIMD_LEVEL NONE CACHE STRING "" FORCE)
+    set(ARROW_RUNTIME_SIMD_LEVEL NONE CACHE STRING "" FORCE)
+
+    # Download the checksummed archive and add its cpp/ subdirectory to the build.
+    FetchContent_Declare(
+        apache_arrow_src
+        URL ${FLUSS_CPP_ARROW_SOURCE_URL}
+        URL_HASH SHA256=${FLUSS_CPP_ARROW_SOURCE_SHA256}
+        SOURCE_SUBDIR cpp
+    )
+    FetchContent_MakeAvailable(apache_arrow_src)
+    # Extra include roots inside the fetched source and build trees; they are
+    # added to fluss_cpp's include directories when set.
+    set(FLUSS_CPP_ARROW_EXTRA_INCLUDE_DIRS
+        "${apache_arrow_src_SOURCE_DIR}/cpp/src"
+        "${apache_arrow_src_BINARY_DIR}/src")
+
+    # Normalize the target name so the rest of the file can always link
+    # Arrow::arrow_shared.
+    if (TARGET arrow_shared AND NOT TARGET Arrow::arrow_shared)
+        add_library(Arrow::arrow_shared ALIAS arrow_shared)
+    endif()
+    if (NOT TARGET Arrow::arrow_shared)
+        message(FATAL_ERROR "Arrow build mode did not produce target Arrow::arrow_shared (or arrow_shared).")
+    endif()
+endif()
+
+# Get cargo target dir
+# `cargo locate-project --workspace` prints the path of the workspace
+# Cargo.toml; the target dir is taken to be <that directory>/target.
+# NOTE(review): a cargo target-dir override would not be reflected here.
+execute_process(COMMAND ${FLUSS_CARGO_EXECUTABLE} locate-project --workspace --message-format plain
+    OUTPUT_VARIABLE CARGO_MANIFEST_PATH
+    OUTPUT_STRIP_TRAILING_WHITESPACE
+    WORKING_DIRECTORY ${PROJECT_SOURCE_DIR})
+if (NOT CARGO_MANIFEST_PATH)
+    message(FATAL_ERROR
+        "Failed to resolve Cargo workspace target dir via '${FLUSS_CARGO_EXECUTABLE} locate-project'. "
+        "Check Rust toolchain installation and PATH/CARGO.")
+endif()
+get_filename_component(CARGO_WORKSPACE_DIR "${CARGO_MANIFEST_PATH}" DIRECTORY)
+set(CARGO_TARGET_DIR "${CARGO_WORKSPACE_DIR}/target")
+
+# This crate's manifest and the locations where the bridge source and header
+# are expected under target/cxxbridge after the cargo build.
+set(CARGO_MANIFEST ${PROJECT_SOURCE_DIR}/Cargo.toml)
+set(RUST_SOURCE_FILE ${PROJECT_SOURCE_DIR}/src/lib.rs)
+set(RUST_BRIDGE_CPP ${CARGO_TARGET_DIR}/cxxbridge/fluss-cpp/src/lib.rs.cc)
+set(RUST_HEADER_FILE ${CARGO_TARGET_DIR}/cxxbridge/fluss-cpp/src/lib.rs.h)
+
+# The cargo profile directory follows the CMake build type: Debug -> debug,
+# anything else -> release.
+if (CMAKE_BUILD_TYPE STREQUAL "Debug")
+    set(RUST_LIB ${CARGO_TARGET_DIR}/debug/${CMAKE_STATIC_LIBRARY_PREFIX}fluss_cpp${CMAKE_STATIC_LIBRARY_SUFFIX})
+else()
+    set(RUST_LIB ${CARGO_TARGET_DIR}/release/${CMAKE_STATIC_LIBRARY_PREFIX}fluss_cpp${CMAKE_STATIC_LIBRARY_SUFFIX})
+endif()
+
+# Include roots: public headers, private sources, and the generated bridge headers.
+set(CPP_INCLUDE_DIR ${PROJECT_SOURCE_DIR}/include
+                    ${PROJECT_SOURCE_DIR}/src
+                    ${CARGO_TARGET_DIR}/cxxbridge
+                    ${CARGO_TARGET_DIR}/cxxbridge/fluss-cpp/src)
+
+file(GLOB CPP_SOURCE_FILE "src/*.cpp")
+file(GLOB CPP_HEADER_FILE "include/*.hpp")
+
+# Keep cargo's profile in step with the RUST_LIB path chosen above.
+if (NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
+    list(APPEND CARGO_BUILD_FLAGS "--release")
+endif()
+
+# Runs `cargo build` for this crate with PROTOC set to the protoc found above.
+# BYPRODUCTS declares the files the command produces so they can be used as
+# sources and link inputs of fluss_cpp.
+add_custom_target(cargo_build
+    COMMAND ${CMAKE_COMMAND} -E env PROTOC=${FLUSS_PROTOC_EXECUTABLE} ${FLUSS_CARGO_EXECUTABLE} build --manifest-path ${CARGO_MANIFEST} ${CARGO_BUILD_FLAGS}
+    BYPRODUCTS ${RUST_BRIDGE_CPP} ${RUST_LIB} ${RUST_HEADER_FILE}
+    DEPENDS ${RUST_SOURCE_FILE}
+    USES_TERMINAL
+    COMMENT "Running cargo..."
+)
+
+# C++ wrapper library: the hand-written sources plus the generated bridge source.
+add_library(fluss_cpp STATIC ${CPP_SOURCE_FILE} ${RUST_BRIDGE_CPP})
+target_sources(fluss_cpp PUBLIC ${CPP_HEADER_FILE})
+target_sources(fluss_cpp PRIVATE ${RUST_HEADER_FILE})
+target_include_directories(fluss_cpp PUBLIC ${CPP_INCLUDE_DIR})
+# Only set in Arrow build mode.
+if (FLUSS_CPP_ARROW_EXTRA_INCLUDE_DIRS)
+    target_include_directories(fluss_cpp PUBLIC ${FLUSS_CPP_ARROW_EXTRA_INCLUDE_DIRS})
+endif()
+# Consumers link the Rust archive and Arrow transitively through fluss_cpp.
+target_link_libraries(fluss_cpp PUBLIC ${RUST_LIB})
+target_link_libraries(fluss_cpp PRIVATE ${CMAKE_DL_LIBS} Threads::Threads)
+target_link_libraries(fluss_cpp PUBLIC Arrow::arrow_shared)
+target_compile_definitions(fluss_cpp PRIVATE ARROW_FOUND)
+# On Apple platforms, also link the CoreFoundation and Security frameworks.
+if(APPLE)
+    target_link_libraries(fluss_cpp PUBLIC "-framework CoreFoundation" "-framework Security")
+endif()
+
+# The three example programs share one recipe: target fluss_cpp_<stem> is built
+# from examples/<stem>.cpp and linked against fluss_cpp and Arrow.
+foreach(example_stem IN ITEMS example admin_example kv_example)
+    add_executable(fluss_cpp_${example_stem} examples/${example_stem}.cpp)
+    target_link_libraries(fluss_cpp_${example_stem} PRIVATE fluss_cpp)
+    target_link_libraries(fluss_cpp_${example_stem} PRIVATE Arrow::arrow_shared)
+    target_compile_definitions(fluss_cpp_${example_stem} PRIVATE ARROW_FOUND)
+    target_include_directories(fluss_cpp_${example_stem} PUBLIC ${CPP_INCLUDE_DIR})
+endforeach()
+
+# Have the clean step remove the whole cargo target directory as well.
+if (CARGO_TARGET_DIR)
+    set_target_properties(fluss_cpp
+        PROPERTIES ADDITIONAL_CLEAN_FILES "${CARGO_TARGET_DIR}"
+    )
+endif()
+# Build the Rust artifacts before fluss_cpp is compiled and archived.
+add_dependencies(fluss_cpp cargo_build)
+
+# Sanitizer build (leak + address + undefined behavior).
+if (FLUSS_ENABLE_ADDRESS_SANITIZER)
+    # Instrument only fluss_cpp's own translation units.
+    target_compile_options(fluss_cpp PRIVATE -fsanitize=leak,address,undefined -fno-omit-frame-pointer -fno-common -O1)
+    # fluss_cpp is a STATIC library and is never linked itself, so PRIVATE link
+    # options would be dropped. PUBLIC places the flags in INTERFACE_LINK_OPTIONS
+    # so every executable that links fluss_cpp (examples, tests) also links the
+    # sanitizer runtimes.
+    target_link_options(fluss_cpp PUBLIC -fsanitize=leak,address,undefined)
+endif()
+
+# Test binary and its fetched dependencies; only configured when testing is on.
+if (FLUSS_ENABLE_TESTING)
+    # NOTE(review): unlike the nlohmann_json download below, this one has no
+    # URL_HASH, so the archive contents are not verified.
+    FetchContent_Declare(
+        googletest
+        URL https://github.com/google/googletest/archive/refs/tags/v${FLUSS_GOOGLETEST_VERSION}.tar.gz
+    )
+    set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
+
+    FetchContent_MakeAvailable(googletest)
+
+    # Fetch nlohmann_json only when no such target is already defined.
+    if (NOT TARGET nlohmann_json::nlohmann_json)
+        set(JSON_BuildTests OFF CACHE INTERNAL "")
+        # NOTE(review): the SHA256 is fixed while the version is a cache
+        # variable; changing FLUSS_NLOHMANN_JSON_VERSION requires a new hash.
+        FetchContent_Declare(
+            nlohmann_json
+            URL https://github.com/nlohmann/json/archive/refs/tags/v${FLUSS_NLOHMANN_JSON_VERSION}.tar.gz
+            URL_HASH SHA256=4b92eb0c06d10683f7447ce9406cb97cd4b453be18d7279320f7b2f025c10187
+        )
+        FetchContent_MakeAvailable(nlohmann_json)
+    endif()
+
+    enable_testing()
+    include(GoogleTest)
+
+    # Every test/*.cpp file is compiled into the single fluss_cpp_test binary.
+    file(GLOB TEST_SOURCE_FILES "test/*.cpp")
+    add_executable(fluss_cpp_test ${TEST_SOURCE_FILES})
+    target_link_libraries(fluss_cpp_test PRIVATE fluss_cpp GTest::gtest nlohmann_json::nlohmann_json)
+    target_link_libraries(fluss_cpp_test PRIVATE Arrow::arrow_shared)
+    target_compile_definitions(fluss_cpp_test PRIVATE ARROW_FOUND)
+    target_include_directories(fluss_cpp_test PRIVATE
+        ${CPP_INCLUDE_DIR}
+        ${PROJECT_SOURCE_DIR}/test
+    )
+
+    # Each discovered test gets a 120 s timeout and requires the fluss_cluster
+    # fixture.
+    gtest_discover_tests(fluss_cpp_test
+        PROPERTIES
+            TIMEOUT 120
+            FIXTURES_REQUIRED fluss_cluster
+    )
+
+    # Cleanup step of the fluss_cluster fixture: `fluss_cpp_test --cleanup` is
+    # scheduled after the tests that require the fixture.
+    add_test(NAME fluss_cluster_cleanup COMMAND fluss_cpp_test --cleanup)
+    set_tests_properties(fluss_cluster_cleanup PROPERTIES
+        FIXTURES_CLEANUP fluss_cluster
+    )
+endif()
diff --git a/fluss-rust/bindings/cpp/Cargo.toml b/fluss-rust/bindings/cpp/Cargo.toml
new file mode 100644
index 0000000000..26816522fe
--- /dev/null
+++ b/fluss-rust/bindings/cpp/Cargo.toml
@@ -0,0 +1,38 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+[package]
+name = "fluss-cpp"
+# Version, edition, license and minimum Rust version are inherited from the
+# workspace manifest.
+version.workspace = true
+edition.workspace = true
+license.workspace = true
+rust-version.workspace = true
+# Not published to a crate registry.
+publish = false
+
+[lib]
+# Built as a static library so the C++ build can link it.
+crate-type = ["staticlib"]
+
+[dependencies]
+anyhow = "1.0"
+arrow = { workspace = true, features = ["ffi"] }
+bigdecimal = { workspace = true }
+cxx = "1.0"
+fluss = { workspace = true, features = ["storage-all"] }
+tokio = { workspace = true, features = ["rt-multi-thread", "macros"] }
+
+[build-dependencies]
+# Build-script counterpart of the `cxx` dependency above.
+cxx-build = "1.0"
diff --git a/fluss-rust/bindings/cpp/DEPENDENCIES.rust.tsv b/fluss-rust/bindings/cpp/DEPENDENCIES.rust.tsv
new file mode 100644
index 0000000000..89dbf76539
--- /dev/null
+++ b/fluss-rust/bindings/cpp/DEPENDENCIES.rust.tsv
@@ -0,0 +1,309 @@
+crate	Apache-2.0	Apache-2.0 WITH LLVM-exception	BSD-2-Clause	BSD-3-Clause	BSL-1.0	CC0-1.0	CDLA-Permissive-2.0	ISC	LGPL-2.1-or-later	MIT	Unicode-3.0	Unlicense	Zlib
+ahash@0.8.12	X									X			
+aho-corasick@1.1.4										X		X	
+android_system_properties@0.1.5	X									X			
+anstream@1.0.0	X									X			
+anstyle@1.0.14	X									X			
+anstyle-parse@1.0.0	X									X			
+anstyle-query@1.1.5	X									X			
+anstyle-wincon@3.0.11	X									X			
+anyhow@1.0.102	X									X			
+arrow@57.3.0	X												
+arrow-arith@57.3.0	X												
+arrow-array@57.3.0	X												
+arrow-buffer@57.3.0	X												
+arrow-cast@57.3.0	X												
+arrow-csv@57.3.0	X												
+arrow-data@57.3.0	X												
+arrow-ipc@57.3.0	X												
+arrow-json@57.3.0	X												
+arrow-ord@57.3.0	X												
+arrow-row@57.3.0	X												
+arrow-schema@57.3.0	X												
+arrow-select@57.3.0	X												
+arrow-string@57.3.0	X												
+async-trait@0.1.89	X									X			
+atoi@2.0.0										X			
+atomic-waker@1.1.2	X									X			
+autocfg@1.5.0	X									X			
+backon@1.6.0	X												
+base64@0.22.1	X									X			
+bigdecimal@0.4.10	X									X			
+bitflags@2.11.0	X									X			
+bitvec@1.0.1										X			
+block-buffer@0.10.4	X									X			
+bumpalo@3.20.2	X									X			
+byteorder@1.5.0										X		X	
+bytes@1.11.1										X			
+cc@1.2.57	X									X			
+cfg-if@1.0.4	X									X			
+chrono@0.4.44	X									X			
+clap@4.6.0	X									X			
+clap_builder@4.6.0	X									X			
+clap_derive@4.6.0	X									X			
+clap_lex@1.1.0	X									X			
+codespan-reporting@0.13.1	X												
+colorchoice@1.0.5	X									X			
+const-oid@0.9.6	X									X			
+const-random@0.1.18	X									X			
+const-random-macro@0.1.16	X									X			
+core-foundation-sys@0.8.7	X									X			
+cpufeatures@0.2.17	X									X			
+crc32c@0.6.8	X									X			
+crossbeam-utils@0.8.21	X									X			
+crunchy@0.2.4										X			
+crypto-common@0.1.7	X									X			
+csv@1.4.0										X		X	
+csv-core@0.1.13										X		X	
+cxx@1.0.194	X									X			
+cxx-build@1.0.194	X									X			
+cxxbridge-flags@1.0.194	X									X			
+cxxbridge-macro@1.0.194	X									X			
+dashmap@6.1.0										X			
+delegate@0.13.5	X									X			
+digest@0.10.7	X									X			
+displaydoc@0.2.5	X									X			
+either@1.15.0	X									X			
+equivalent@1.0.2	X									X			
+errno@0.3.14	X									X			
+fastrand@2.3.0	X									X			
+find-msvc-tools@0.1.9	X									X			
+fixedbitset@0.5.7	X									X			
+flatbuffers@25.12.19	X												
+fluss-cpp@0.1.0	X												
+fluss-rs@0.1.0	X												
+fnv@1.0.7	X									X			
+foldhash@0.1.5													X
+foldhash@0.2.0													X
+form_urlencoded@1.2.2	X									X			
+funty@2.0.0										X			
+futures@0.3.32	X									X			
+futures-channel@0.3.32	X									X			
+futures-core@0.3.32	X									X			
+futures-executor@0.3.32	X									X			
+futures-io@0.3.32	X									X			
+futures-macro@0.3.32	X									X			
+futures-sink@0.3.32	X									X			
+futures-task@0.3.32	X									X			
+futures-util@0.3.32	X									X			
+generic-array@0.14.7										X			
+getrandom@0.2.17	X									X			
+getrandom@0.3.4	X									X			
+getrandom@0.4.2	X									X			
+gloo-timers@0.3.0	X									X			
+h2@0.4.13										X			
+half@2.7.1	X									X			
+hashbrown@0.14.5	X									X			
+hashbrown@0.15.5	X									X			
+hashbrown@0.16.1	X									X			
+heck@0.5.0	X									X			
+hex@0.4.3	X									X			
+hmac@0.12.1	X									X			
+home@0.5.12	X									X			
+http@1.4.0	X									X			
+http-body@1.0.1										X			
+http-body-util@0.1.3										X			
+httparse@1.10.1	X									X			
+httpdate@1.0.3	X									X			
+hyper@1.8.1										X			
+hyper-rustls@0.27.7	X							X		X			
+hyper-util@0.1.20										X			
+iana-time-zone@0.1.65	X									X			
+iana-time-zone-haiku@0.1.2	X									X			
+icu_collections@2.1.1											X		
+icu_locale_core@2.1.1											X		
+icu_normalizer@2.1.1											X		
+icu_normalizer_data@2.1.1											X		
+icu_properties@2.1.2											X		
+icu_properties_data@2.1.2											X		
+icu_provider@2.1.1											X		
+idna@1.1.0	X									X			
+idna_adapter@1.2.1	X									X			
+indexmap@2.13.0	X									X			
+ipnet@2.12.0	X									X			
+iri-string@0.7.11	X									X			
+is_terminal_polyfill@1.70.2	X									X			
+itertools@0.14.0	X									X			
+itoa@1.0.18	X									X			
+jiff@0.2.23										X		X	
+jiff-tzdb@0.1.6										X		X	
+jiff-tzdb-platform@0.1.3										X		X	
+jobserver@0.1.34	X									X			
+js-sys@0.3.91	X									X			
+lexical-core@1.0.6	X									X			
+lexical-parse-float@1.0.6	X									X			
+lexical-parse-integer@1.0.6	X									X			
+lexical-util@1.0.7	X									X			
+lexical-write-float@1.0.6	X									X			
+lexical-write-integer@1.0.6	X									X			
+libc@0.2.183	X									X			
+libm@0.2.16										X			
+link-cplusplus@1.0.12	X									X			
+linked-hash-map@0.5.6	X									X			
+linux-raw-sys@0.12.1	X	X								X			
+litemap@0.8.1											X		
+lock_api@0.4.14	X									X			
+log@0.4.29	X									X			
+lz4_flex@0.12.1										X			
+md-5@0.10.6	X									X			
+memchr@2.8.0										X		X	
+mio@1.1.1										X			
+multimap@0.10.1	X									X			
+num-bigint@0.4.6	X									X			
+num-complex@0.4.6	X									X			
+num-integer@0.1.46	X									X			
+num-traits@0.2.19	X									X			
+once_cell@1.21.4	X									X			
+once_cell_polyfill@1.70.2	X									X			
+opendal@0.55.0	X												
+ordered-float@5.1.0										X			
+parking_lot@0.12.5	X									X			
+parking_lot_core@0.9.12	X									X			
+parse-display@0.10.0	X									X			
+parse-display-derive@0.10.0	X									X			
+percent-encoding@2.3.2	X									X			
+petgraph@0.8.3	X									X			
+pin-project-lite@0.2.17	X									X			
+pin-utils@0.1.0	X									X			
+pkg-config@0.3.32	X									X			
+portable-atomic@1.13.1	X									X			
+portable-atomic-util@0.2.6	X									X			
+potential_utf@0.1.4											X		
+ppv-lite86@0.2.21	X									X			
+prettyplease@0.2.37	X									X			
+proc-macro2@1.0.106	X									X			
+prost@0.14.3	X												
+prost-build@0.14.3	X												
+prost-derive@0.14.3	X												
+prost-types@0.14.3	X												
+quick-xml@0.37.5										X			
+quick-xml@0.38.4										X			
+quote@1.0.45	X									X			
+r-efi@5.3.0	X								X	X			
+r-efi@6.0.0	X								X	X			
+radium@0.7.0										X			
+rand@0.8.5	X									X			
+rand@0.9.2	X									X			
+rand_chacha@0.3.1	X									X			
+rand_chacha@0.9.0	X									X			
+rand_core@0.6.4	X									X			
+rand_core@0.9.5	X									X			
+redox_syscall@0.5.18										X			
+regex@1.12.3	X									X			
+regex-automata@0.4.14	X									X			
+regex-syntax@0.8.10	X									X			
+reqsign@0.16.5	X												
+reqwest@0.12.28	X									X			
+ring@0.17.14	X							X					
+rustc_version@0.4.1	X									X			
+rustix@1.1.4	X	X								X			
+rustls@0.23.37	X							X		X			
+rustls-pki-types@1.14.0	X									X			
+rustls-webpki@0.103.10								X					
+rustversion@1.0.22	X									X			
+ryu@1.0.23	X				X								
+scopeguard@1.2.0	X									X			
+scratch@1.0.9	X									X			
+semver@1.0.27	X									X			
+serde@1.0.228	X									X			
+serde_core@1.0.228	X									X			
+serde_derive@1.0.228	X									X			
+serde_json@1.0.149	X									X			
+serde_urlencoded@0.7.1	X									X			
+sha1@0.10.6	X									X			
+sha2@0.10.9	X									X			
+shlex@1.3.0	X									X			
+signal-hook-registry@1.4.8	X									X			
+simdutf8@0.1.5	X									X			
+slab@0.4.12										X			
+smallvec@1.15.1	X									X			
+snafu@0.8.9	X									X			
+snafu-derive@0.8.9	X									X			
+socket2@0.6.3	X									X			
+stable_deref_trait@1.2.1	X									X			
+strsim@0.11.1										X			
+structmeta@0.3.0	X									X			
+structmeta-derive@0.3.0	X									X			
+strum@0.26.3										X			
+strum_macros@0.26.4										X			
+subtle@2.6.1				X									
+syn@2.0.117	X									X			
+sync_wrapper@1.0.2	X												
+synstructure@0.13.2										X			
+tap@1.0.1										X			
+tempfile@3.27.0	X									X			
+termcolor@1.4.1										X		X	
+thiserror@1.0.69	X									X			
+thiserror-impl@1.0.69	X									X			
+tiny-keccak@2.0.2						X							
+tinystr@0.8.2											X		
+tokio@1.50.0										X			
+tokio-macros@2.6.1										X			
+tokio-rustls@0.26.4	X									X			
+tokio-util@0.7.18										X			
+tower@0.5.3										X			
+tower-http@0.6.8										X			
+tower-layer@0.3.3										X			
+tower-service@0.3.3										X			
+tracing@0.1.44										X			
+tracing-attributes@0.1.31										X			
+tracing-core@0.1.36										X			
+try-lock@0.2.5										X			
+twox-hash@2.1.2										X			
+typenum@1.19.0	X									X			
+unicode-ident@1.0.24	X									X	X		
+unicode-width@0.2.2	X									X			
+untrusted@0.9.0								X					
+url@2.5.8	X									X			
+utf8_iter@1.0.4	X									X			
+utf8parse@0.2.2	X									X			
+uuid@1.22.0	X									X			
+value-bag@1.12.0	X									X			
+version_check@0.9.5	X									X			
+want@0.3.1										X			
+wasi@0.11.1+wasi-snapshot-preview1	X	X								X			
+wasip2@1.0.2+wasi-0.2.9	X	X								X			
+wasip3@0.4.0+wasi-0.3.0-rc-2026-01-06	X	X								X			
+wasm-bindgen@0.2.114	X									X			
+wasm-bindgen-futures@0.4.64	X									X			
+wasm-bindgen-macro@0.2.114	X									X			
+wasm-bindgen-macro-support@0.2.114	X									X			
+wasm-bindgen-shared@0.2.114	X									X			
+wasm-streams@0.4.2	X									X			
+web-sys@0.3.91	X									X			
+webpki-roots@1.0.6							X						
+winapi-util@0.1.11										X		X	
+windows-core@0.62.2	X									X			
+windows-implement@0.60.2	X									X			
+windows-interface@0.59.3	X									X			
+windows-link@0.2.1	X									X			
+windows-result@0.4.1	X									X			
+windows-strings@0.5.1	X									X			
+windows-sys@0.52.0	X									X			
+windows-sys@0.61.2	X									X			
+windows-targets@0.52.6	X									X			
+windows_aarch64_gnullvm@0.52.6	X									X			
+windows_aarch64_msvc@0.52.6	X									X			
+windows_i686_gnu@0.52.6	X									X			
+windows_i686_gnullvm@0.52.6	X									X			
+windows_i686_msvc@0.52.6	X									X			
+windows_x86_64_gnu@0.52.6	X									X			
+windows_x86_64_gnullvm@0.52.6	X									X			
+windows_x86_64_msvc@0.52.6	X									X			
+wit-bindgen@0.51.0	X	X								X			
+writeable@0.6.2											X		
+wyz@0.5.1										X			
+yoke@0.8.1											X		
+yoke-derive@0.8.1											X		
+zerocopy@0.8.47	X		X							X			
+zerocopy-derive@0.8.47	X		X							X			
+zerofrom@0.1.6											X		
+zerofrom-derive@0.1.6											X		
+zeroize@1.8.2	X									X			
+zerotrie@0.2.3											X		
+zerovec@0.11.5											X		
+zerovec-derive@0.11.2											X		
+zmij@1.0.21										X			
+zstd@0.13.3										X			
+zstd-safe@7.2.4	X									X			
+zstd-sys@2.0.16+zstd.1.5.7	X									X			
diff --git a/fluss-rust/bindings/cpp/README.md b/fluss-rust/bindings/cpp/README.md
new file mode 100644
index 0000000000..1a8d9f2f64
--- /dev/null
+++ b/fluss-rust/bindings/cpp/README.md
@@ -0,0 +1,42 @@
+# Apache Fluss™ C++ Bindings (Incubating)
+
+C++ bindings for Fluss, built on top of the [fluss-rust](../../crates/fluss) client. The API is exposed via a C++ header ([include/fluss.hpp](include/fluss.hpp)) and implemented with Rust FFI.
+
+## Requirements
+
+- Rust (see [rust-toolchain.toml](../../rust-toolchain.toml) at repo root)
+- C++17-capable compiler
+- CMake 3.18+ and/or Bazel
+- Apache Arrow (for Arrow-based APIs)
+
+## Build
+
+From the repository root (skip the `cd` step if you are already in `bindings/cpp`):
+
+**With CMake:**
+
+```bash
+cd bindings/cpp
+mkdir build && cd build
+cmake ..
+cmake --build .
+```
+
+By default, the CMake build uses the `Release` build type when `CMAKE_BUILD_TYPE` is not specified.
+
+**With Bazel:**
+
+```bash
+cd bindings/cpp
+bazel build //...
+```
+`ci.sh` defaults to optimized builds via `-c opt` (override with `BAZEL_BUILD_FLAGS` if needed).
+See [ci.sh](ci.sh) for the CI build sequence.
+
+
+## TODO
+
+- [ ] Document how to introduce fluss-cpp into your own project; the [OpenDAL C++ bindings README](https://github.com/apache/opendal/blob/main/bindings/cpp/README.md) is a good reference.
+- [ ] Add CMake/Bazel install and packaging instructions.
+- [ ] Document API usage and minimal example in this README.
+- [ ] Add more C++ examples (log scan, upsert, etc.).
diff --git a/fluss-rust/bindings/cpp/bazel/cpp/BUILD.bazel b/fluss-rust/bindings/cpp/bazel/cpp/BUILD.bazel
new file mode 100644
index 0000000000..e4b730dc9b
--- /dev/null
+++ b/fluss-rust/bindings/cpp/bazel/cpp/BUILD.bazel
@@ -0,0 +1,26 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+package(default_visibility = ["//visibility:public"])
+
+# Stable indirection target for the Arrow C++ dependency: depend on this alias
+# rather than on @apache_arrow_cpp directly, so the repo behind it can change
+# across modes (registry/build/system) without touching bindings/cpp/BUILD.bazel.
+alias(
+    name = "arrow_cpp_dep",
+    actual = "@apache_arrow_cpp//:arrow_cpp",
+)
diff --git a/fluss-rust/bindings/cpp/bazel/cpp/deps.bzl b/fluss-rust/bindings/cpp/bazel/cpp/deps.bzl
new file mode 100644
index 0000000000..6dd5e1b635
--- /dev/null
+++ b/fluss-rust/bindings/cpp/bazel/cpp/deps.bzl
@@ -0,0 +1,349 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""Bzlmod extension for fluss C++ SDK dependency provisioning."""
+
+load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
+
+_ARROW_BUILD_FILE_TEMPLATE = """
+load("@rules_foreign_cc//foreign_cc:defs.bzl", "cmake")
+
+package(default_visibility = ["//visibility:public"])
+
+filegroup(
+    name = "all_srcs",
+    srcs = glob(
+        ["**"],
+        exclude = [
+            "**/BUILD",
+            "**/BUILD.bazel",
+        ],
+    ),
+)
+
+cmake(
+    name = "arrow_cpp",
+    lib_source = ":all_srcs",
+    working_directory = "cpp",
+    generate_args = ["-GUnix Makefiles"],
+    cache_entries = {
+        "CMAKE_BUILD_TYPE": "Release",
+        "CMAKE_INSTALL_LIBDIR": "lib",
+        "CMAKE_POSITION_INDEPENDENT_CODE": "ON",
+        "ARROW_BUILD_SHARED": "ON",
+        "ARROW_BUILD_STATIC": "OFF",
+        "ARROW_BUILD_TESTS": "OFF",
+        "ARROW_BUILD_EXAMPLES": "OFF",
+        "ARROW_BUILD_BENCHMARKS": "OFF",
+        "ARROW_BUILD_INTEGRATION": "OFF",
+        "ARROW_BUILD_UTILITIES": "OFF",
+        "ARROW_COMPUTE": "OFF",
+        "ARROW_CSV": "OFF",
+        "ARROW_DATASET": "OFF",
+        "ARROW_FILESYSTEM": "OFF",
+        "ARROW_JSON": "OFF",
+        "ARROW_PARQUET": "OFF",
+        "ARROW_IPC": "ON",
+        "ARROW_JEMALLOC": "OFF",
+        "ARROW_MIMALLOC": "OFF",
+        "ARROW_SIMD_LEVEL": "NONE",
+        "ARROW_RUNTIME_SIMD_LEVEL": "NONE",
+        "ARROW_DEPENDENCY_SOURCE": "BUNDLED",
+        # Temporary workarounds for older images / Bazel sandbox toolchain detection.
+        "EP_CMAKE_RANLIB": "__EP_CMAKE_RANLIB__",
+        "EP_CMAKE_AR": "__EP_CMAKE_AR__",
+        "EP_CMAKE_NM": "__EP_CMAKE_NM__",
+    },
+    out_include_dir = "include",
+    out_lib_dir = "lib",
+    out_shared_libs = select({
+        "@platforms//os:macos": [
+            "libarrow.dylib",
+            "libarrow.1900.dylib",
+        ],
+        "//conditions:default": [
+            "libarrow.so",
+            "libarrow.so.1900",
+            "libarrow.so.1900.1.0",
+        ],
+    }),
+)
+"""
+
+_ARROW_PATCH_CMDS = [  # Blanks ARROW_CXX_COMPILER_FLAGS in config.h.cmake; presumably to keep compiler flags out of the generated header (TODO confirm).
+    "sed -i.bak 's|#define ARROW_CXX_COMPILER_FLAGS \"@CMAKE_CXX_FLAGS@\"|#define ARROW_CXX_COMPILER_FLAGS \"\"|' cpp/src/arrow/util/config.h.cmake && rm -f cpp/src/arrow/util/config.h.cmake.bak",
+]
+
+_SYSTEM_ARROW_BUILD_FILE_TEMPLATE = """
+load("@rules_cc//cc:defs.bzl", "cc_import", "cc_library")
+
+package(default_visibility = ["//visibility:public"])
+
+cc_import(
+    name = "arrow_shared_import",
+    shared_library = "__SYSTEM_ARROW_SHARED_LIBRARY__",
+)
+
+filegroup(
+    name = "arrow_runtime_libs",
+    srcs = [
+__SYSTEM_ARROW_RUNTIME_SRCS__
+    ],
+)
+
+cc_library(
+    name = "arrow_cpp",
+    hdrs = [
+__SYSTEM_ARROW_HDRS__
+    ],
+    includes = ["__SYSTEM_ARROW_INCLUDE_DIR__"],
+    data = [":arrow_runtime_libs"],
+    deps = [":arrow_shared_import"],
+)
+"""
+
+_ARROW_BUILD_VERSIONS = {  # Arrow source archives that "build" mode can fetch, keyed by arrow_cpp_version.
+    "19.0.1": {
+        "urls": ["https://github.com/apache/arrow/archive/refs/tags/apache-arrow-19.0.1.tar.gz"],
+        "strip_prefix": "arrow-apache-arrow-19.0.1",
+        "integrity": "sha256-TImFBJWIQcyGtvhxDsspGflrXhD6iYmsEKxPyoNi2Go=",
+    },
+}
+
+_config_tag = tag_class(attrs = {  # Attributes of the cpp_sdk.config tag.
+    "mode": attr.string(default = "build"),  # "build", "system" or "registry"; any other value fails
+    "arrow_cpp_version": attr.string(default = "19.0.1"),  # build mode: must be a key of _ARROW_BUILD_VERSIONS
+    "protobuf_version": attr.string(default = "3.25.5"),  # NOTE(review): never read in this file; confirm it is still needed
+    "ep_cmake_ranlib": attr.string(default = "ranlib"),  # build mode: value passed as EP_CMAKE_RANLIB
+    "ep_cmake_ar": attr.string(default = "ar"),  # build mode: value passed as EP_CMAKE_AR
+    "ep_cmake_nm": attr.string(default = "nm"),  # build mode: value passed as EP_CMAKE_NM
+    "system_arrow_prefix": attr.string(default = "/usr"),  # system mode: root of the Arrow installation
+    "system_arrow_include_dir": attr.string(default = "include"),  # system mode: relative to the prefix
+    "system_arrow_shared_library": attr.string(default = "lib/x86_64-linux-gnu/libarrow.so"),  # system mode: relative to the prefix
+    "system_arrow_runtime_glob": attr.string(default = "lib/x86_64-linux-gnu/libarrow.so*"),  # system mode: relative to the prefix
+})
+
+def _render_arrow_build_file(tag):
+    """Returns the build-mode Arrow BUILD file with the EP_CMAKE_* placeholders filled in.
+
+    Args:
+      tag: config tag providing ep_cmake_ranlib, ep_cmake_ar and ep_cmake_nm.
+    """
+    content = _ARROW_BUILD_FILE_TEMPLATE
+    content = content.replace("__EP_CMAKE_RANLIB__", tag.ep_cmake_ranlib)
+    content = content.replace("__EP_CMAKE_AR__", tag.ep_cmake_ar)
+    content = content.replace("__EP_CMAKE_NM__", tag.ep_cmake_nm)
+    return content
+
+def _render_system_arrow_build_file(tag, shared_library_override = None):
+    """Renders the system-mode BUILD template with sysroot-relative library and include paths."""
+
+    library = shared_library_override
+    if not library:
+        # `tag` is either a config tag (system_arrow_* fields) or repository attrs (unprefixed).
+        library = tag.system_arrow_shared_library if hasattr(tag, "system_arrow_shared_library") else tag.shared_library
+    headers = tag.system_arrow_include_dir if hasattr(tag, "system_arrow_include_dir") else tag.include_dir
+    rendered = _SYSTEM_ARROW_BUILD_FILE_TEMPLATE.replace("__SYSTEM_ARROW_SHARED_LIBRARY__", "sysroot/" + library)
+    return rendered.replace("__SYSTEM_ARROW_INCLUDE_DIR__", "sysroot/" + headers)
+
+def _starlark_string_list(items):
+    """Formats items as indented, double-quoted, comma-terminated list entries ("" if empty)."""
+    entries = ['        "%s",' % item for item in items] if items else []
+    return "\n".join(entries)
+
+def _list_files(repo_ctx, base_dir, suffixes):
+    """Lists regular files and symlinks under base_dir whose path ends with any given suffix.
+
+    Args:
+      repo_ctx: repository context used to run /usr/bin/find.
+      base_dir: directory handed to find as the search root.
+      suffixes: list of path suffixes to keep; "" matches every path.
+
+    Returns:
+      Sorted list of matching paths as printed by find; fails if find exits non-zero.
+    """
+    find_cmd = ["/usr/bin/find", base_dir, "(", "-type", "f", "-o", "-type", "l", ")"]
+    listing = repo_ctx.execute(find_cmd)
+    if listing.return_code != 0:
+        fail("failed to enumerate files under %s: %s" % (base_dir, listing.stderr))
+    matches = []
+    for path in listing.stdout.splitlines():
+        # Keep each path at most once, even if several suffixes match it.
+        if any([path.endswith(suffix) for suffix in suffixes]):
+            matches.append(path)
+    return sorted(matches)
+
+def _copy_file_to_sysroot(repo_ctx, prefix, rel_path):
+    """Copies prefix/rel_path to sysroot/rel_path, creating the parent directory first."""
+    if rel_path.startswith("/"):
+        fail("expected relative path under prefix, got absolute path: %s" % rel_path)
+    src, dst = prefix + "/" + rel_path, "sysroot/" + rel_path
+    dst_parent = "sysroot" if "/" not in dst else dst.rsplit("/", 1)[0]
+    made = repo_ctx.execute(["/bin/mkdir", "-p", dst_parent])
+    if made.return_code != 0:
+        fail("failed to create directory %s: %s" % (dst_parent, made.stderr))
+    # -L dereferences symlinks so the sysroot holds real files and stays self-contained.
+    copied = repo_ctx.execute(["/bin/cp", "-L", src, dst])
+    if copied.return_code != 0:
+        fail("failed to copy %s to %s: %s" % (src, dst, copied.stderr))
+
+def _system_arrow_repo_impl(repo_ctx):
+    """Mirrors the system Arrow headers and shared libraries into sysroot/ and writes BUILD.bazel.
+
+    include_dir, shared_library and runtime_glob are relative to prefix. shared_library and
+    runtime_glob may be bare file names (no "/") when the libraries sit directly under prefix.
+    """
+    prefix = repo_ctx.attr.prefix.rstrip("/")
+    include_dir = repo_ctx.attr.include_dir
+    shared_library = repo_ctx.attr.shared_library
+    runtime_glob = repo_ctx.attr.runtime_glob
+
+    mkdir_res = repo_ctx.execute(["/bin/mkdir", "-p", "sysroot"])
+    if mkdir_res.return_code != 0:
+        fail("failed to create sysroot directory: %s" % mkdir_res.stderr)
+
+    # Only <prefix>/<include_dir>/arrow is scanned; drop one trailing "/" to avoid "//".
+    include_dir_for_scan = include_dir[:-1] if include_dir.endswith("/") else include_dir
+    header_root = prefix + "/" + include_dir_for_scan + "/arrow"
+    header_srcs_rel = []
+    for h in _list_files(repo_ctx, header_root, [".h", ".hpp"]):
+        if not h.startswith(prefix + "/"):
+            fail("header path %s is outside prefix %s" % (h, prefix))
+        header_srcs_rel.append(h[len(prefix) + 1:])
+    header_srcs = ["sysroot/" + rel for rel in header_srcs_rel]
+
+    # rpartition (unlike rsplit("/", 1)[1]) does not fail when the glob has no directory part.
+    # "*" is simply dropped, so this is a basename-prefix match rather than a real glob.
+    runtime_dir, _, runtime_pattern = runtime_glob.rpartition("/")
+    runtime_prefix = runtime_pattern.replace("*", "")
+    runtime_root = prefix + "/" + runtime_dir if runtime_dir else prefix
+    runtime_srcs_rel = []
+    for f in _list_files(repo_ctx, runtime_root, [""]):
+        if not f.startswith(prefix + "/"):
+            continue
+        rel = f[len(prefix) + 1:]
+        in_runtime_dir = not runtime_dir or rel.startswith(runtime_dir + "/")
+        if in_runtime_dir and rel.rpartition("/")[2].startswith(runtime_prefix):
+            runtime_srcs_rel.append(rel)
+    runtime_srcs_rel = sorted(runtime_srcs_rel)
+    runtime_srcs = ["sysroot/" + rel for rel in runtime_srcs_rel]
+
+    # Prefer a versioned soname file as the imported shared library so Bazel
+    # runfiles contain the exact filename required by the runtime loader.
+    shared_import_rel = "sysroot/" + shared_library
+    shared_basename = shared_library.rpartition("/")[2]
+    soname_candidates = [
+        "sysroot/" + rel
+        for rel in runtime_srcs_rel
+        if rel.rpartition("/")[2].startswith(shared_basename + ".")
+    ]
+    if soname_candidates:
+        # Prefer shortest suffix first (e.g. libarrow.so.1900 before
+        # libarrow.so.1900.1.0) to match ELF SONAME naming when available.
+        shared_import_rel = sorted(soname_candidates, key = lambda s: (len(s), s))[0]
+
+    # Copy only required Arrow artifacts instead of mirroring the full system prefix.
+    copy_rel_paths = {}
+    for rel in header_srcs_rel + runtime_srcs_rel + [shared_library]:
+        copy_rel_paths[rel] = True
+    for rel in sorted(copy_rel_paths.keys()):
+        _copy_file_to_sysroot(repo_ctx, prefix, rel)
+
+    # Render BUILD.bazel against the sysroot copies made above.
+    build_file = _render_system_arrow_build_file(repo_ctx.attr, shared_library_override = shared_import_rel[len("sysroot/"):]).replace(
+        "__SYSTEM_ARROW_HDRS__",
+        _starlark_string_list(header_srcs),
+    ).replace(
+        "__SYSTEM_ARROW_RUNTIME_SRCS__",
+        _starlark_string_list(runtime_srcs),
+    )
+    repo_ctx.file("BUILD.bazel", build_file)
+
+_system_arrow_repository = repository_rule(  # Exposes a system-installed Arrow as an external repository.
+    implementation = _system_arrow_repo_impl,
+    attrs = {
+        "prefix": attr.string(mandatory = True),  # installation root; trailing "/" is stripped
+        "include_dir": attr.string(mandatory = True),  # relative to prefix
+        "shared_library": attr.string(mandatory = True),  # relative to prefix
+        "runtime_glob": attr.string(mandatory = True),  # relative to prefix; "*" is removed and the rest is used as a basename prefix
+    },
+    local = True,  # reads the host filesystem, so Bazel re-evaluates the rule on every fetch
+)
+
+def _select_config(ctx):
+    """Returns the root module's cpp_sdk.config tag, else the single dependency default, else None."""
+    root_tag, dep_tag, dep_owner, dep_conflict = None, None, None, False
+    for mod in ctx.modules:
+        from_root = hasattr(mod, "is_root") and mod.is_root
+        for tag in mod.tags.config:
+            if from_root and root_tag != None:
+                fail("cpp_sdk.config may only be declared once in the root module")
+            elif from_root:
+                root_tag = tag
+            elif dep_tag == None:
+                dep_tag, dep_owner = tag, mod.name
+            elif dep_owner != mod.name:
+                dep_conflict = True
+    if root_tag != None:
+        return root_tag
+
+    # Defaults from several dependency modules are ambiguous only when the root
+    # module does not override them, so the conflict is reported after the scan.
+    if dep_conflict:
+        fail("multiple dependency defaults for cpp_sdk.config without root override")
+    return dep_tag
+
+def _cpp_sdk_impl(ctx):
+    """Module extension implementation: provisions @apache_arrow_cpp for the selected mode.
+
+    - No config tag, or mode "registry": no repository is created here.
+    - Mode "system": wraps the Arrow installation described by the system_arrow_* attributes.
+    - Mode "build": fetches the pinned Arrow source archive with a CMake-based BUILD file.
+    Any other mode, or an arrow_cpp_version missing from _ARROW_BUILD_VERSIONS, fails.
+    """
+    tag = _select_config(ctx)
+    if tag == None or tag.mode == "registry":
+        return
+    if tag.mode == "system":
+        _system_arrow_repository(
+            name = "apache_arrow_cpp",
+            prefix = tag.system_arrow_prefix,
+            include_dir = tag.system_arrow_include_dir,
+            shared_library = tag.system_arrow_shared_library,
+            runtime_glob = tag.system_arrow_runtime_glob,
+        )
+    elif tag.mode == "build":
+        arrow_version = _ARROW_BUILD_VERSIONS.get(tag.arrow_cpp_version)
+        if arrow_version == None:
+            fail("unsupported arrow_cpp_version for build mode: %s" % tag.arrow_cpp_version)
+        http_archive(
+            name = "apache_arrow_cpp",
+            build_file_content = _render_arrow_build_file(tag),
+            integrity = arrow_version["integrity"],
+            patch_cmds = _ARROW_PATCH_CMDS,
+            strip_prefix = arrow_version["strip_prefix"],
+            urls = arrow_version["urls"],
+        )
+    else:
+        fail("unsupported cpp_sdk mode: %s" % tag.mode)
+
+cpp_sdk = module_extension(  # Bzlmod extension; configured through the cpp_sdk.config tag.
+    implementation = _cpp_sdk_impl,
+    tag_classes = {
+        "config": _config_tag,  # see _select_config for which module's tag takes effect
+    },
+)
diff --git a/fluss-rust/bindings/cpp/build.rs b/fluss-rust/bindings/cpp/build.rs
new file mode 100644
index 0000000000..ec75e24aeb
--- /dev/null
+++ b/fluss-rust/bindings/cpp/build.rs
@@ -0,0 +1,24 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+fn main() {
+    // Generate and compile the C++ side of the cxx bridge declared in src/lib.rs.
+    let mut bridge = cxx_build::bridge("src/lib.rs");
+    bridge.std("c++17").compile("fluss-cpp-bridge");
+    // Ask Cargo to re-run this build script when the bridge definition changes.
+    println!("cargo:rerun-if-changed=src/lib.rs");
+}
diff --git a/fluss-rust/bindings/cpp/ci.sh b/fluss-rust/bindings/cpp/ci.sh
new file mode 100755
index 0000000000..ebf5f09205
--- /dev/null
+++ b/fluss-rust/bindings/cpp/ci.sh
@@ -0,0 +1,101 @@
+#!/bin/bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -xe  # -x traces each command, -e aborts on the first failing command
+
+DIR="$(cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd)"  # absolute directory of this script
+BAZEL_BUILD_FLAGS="${BAZEL_BUILD_FLAGS:--c opt}"  # defaults to "-c opt" when unset or empty
+
+# Set Bazel output base to bazel-build directory
+# This ensures all Bazel outputs are in bazel-build/.bazel-output-base
+BAZEL_OUTPUT_BASE="$DIR/bazel-build/.bazel-output-base"
+
+# Create output base directory if it doesn't exist
+mkdir -p "$BAZEL_OUTPUT_BASE"
+
+# Shadow `bazel` so every call below gets --output_base; `command` skips this function and runs the real binary.
+bazel() {
+    command bazel --output_base="$BAZEL_OUTPUT_BASE" "$@"
+}
+
+compile() {  # Build the fluss_cpp library target.
+    bazel build ${BAZEL_BUILD_FLAGS} //:fluss_cpp  # unquoted on purpose: the flags must word-split
+}
+
+build_example() {  # Build the example binary target.
+    bazel build ${BAZEL_BUILD_FLAGS} //:fluss_cpp_example  # flags stay unquoted so "-c opt" splits into two arguments
+}
+
+run_example() {  # Build the example binary, then run it through Bazel.
+    build_example
+    bazel run ${BAZEL_BUILD_FLAGS} //:fluss_cpp_example  # same flags as the build step above
+}
+
+clean() {
+    bazel clean
+    # Remove bazel-* symlinks (Bazel automatically creates these)
+    rm -f "$DIR"/bazel-*
+    # Also remove the bazel-build directory if it exists
+    if [ -d "$DIR/bazel-build" ]; then
+        rm -rf "$DIR/bazel-build"
+    fi
+    echo "Cleaned all Bazel outputs and symlinks"
+}
+
+show_outputs() {  # Print the output files of both targets, followed by usage hints.
+    echo "=== Library outputs ==="
+    bazel cquery //:fluss_cpp --output=files 2>/dev/null || echo "Run 'bazel build //:fluss_cpp' first"  # cquery's stderr is discarded; on failure only the hint is printed
+    echo ""
+    echo "=== Example binary outputs ==="
+    bazel cquery //:fluss_cpp_example --output=files 2>/dev/null || echo "Run 'bazel build //:fluss_cpp_example' first"  # same fallback as above
+    echo ""
+    echo "=== To run the example ==="
+    echo "  bazel run //:fluss_cpp_example"
+    echo ""
+    echo "=== To find outputs manually ==="
+    echo "  bazel info bazel-bin"
+}
+
+case $1 in  # dispatch on the first command-line argument
+    compile )
+        compile
+        ;;
+    example )
+        build_example
+        ;;
+    run )
+        run_example
+        ;;
+    outputs )
+        show_outputs
+        ;;
+    clean )
+        clean
+        ;;
+    * )  # unknown or missing command: print the usage text and fail
+        echo "Usage: $0 {compile|example|run|outputs|clean}"
+        echo ""
+        echo "Commands:"
+        echo "  compile  - Build the fluss_cpp library"
+        echo "  example  - Build the example binary"
+        echo "  run      - Build and run the example binary"
+        echo "  outputs  - Show the location of build outputs"
+        echo "  clean    - Clean all Bazel outputs"
+        exit 1  # non-zero status so callers can detect the bad invocation
+        ;;
+esac
diff --git a/fluss-rust/bindings/cpp/examples/admin_example.cpp b/fluss-rust/bindings/cpp/examples/admin_example.cpp
new file mode 100644
index 0000000000..37683b9513
--- /dev/null
+++ b/fluss-rust/bindings/cpp/examples/admin_example.cpp
@@ -0,0 +1,122 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <iostream>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "fluss.hpp"
+
+// Returns quietly when `r` is OK; otherwise reports `step` on stderr and exits with status 1.
+static void check(const char* step, const fluss::Result& r) {
+    if (r.Ok()) return;
+    std::cerr << step << " failed: code=" << r.error_code << " msg=" << r.error_message
+              << std::endl;
+    std::exit(1);
+}
+
+int main() {  // Walks the Admin API step by step; check() exits with status 1 on the first failure.
+    const std::string db_name = "admin_example_db";
+    const std::string table_name = "admin_example_table";
+
+    // 1) Connect to the bootstrap server, then obtain an Admin handle from the connection
+    fluss::Configuration config;
+    config.bootstrap_servers = "127.0.0.1:9123";
+
+    fluss::Connection conn;
+    check("create", fluss::Connection::Create(config, conn));
+
+    fluss::Admin admin;
+    check("get_admin", conn.GetAdmin(admin));
+
+    // 2) Database operations: exists? -> create -> exists? -> describe -> list
+    std::cout << "--- Database operations ---" << std::endl;
+
+    bool exists = false;  // out-parameter reused by every DatabaseExists call below
+    check("database_exists (before create)", admin.DatabaseExists(db_name, exists));
+    std::cout << "Database " << db_name << " exists before create: " << (exists ? "yes" : "no")
+              << std::endl;
+
+    fluss::DatabaseDescriptor db_desc;  // comment + properties are passed to CreateDatabase
+    db_desc.comment = "Example database for Admin API";
+    db_desc.properties["owner"] = "admin_example";
+    check("create_database", admin.CreateDatabase(db_name, db_desc, true));
+
+    check("database_exists (after create)", admin.DatabaseExists(db_name, exists));
+    std::cout << "Database " << db_name << " exists after create: " << (exists ? "yes" : "no")
+              << std::endl;
+
+    fluss::DatabaseInfo db_info;
+    check("get_database_info", admin.GetDatabaseInfo(db_name, db_info));
+    std::cout << "Database info: name=" << db_info.database_name << " comment=" << db_info.comment
+              << " created_time=" << db_info.created_time << std::endl;
+
+    std::vector<std::string> databases;
+    check("list_databases", admin.ListDatabases(databases));
+    std::cout << "List databases (" << databases.size() << "): ";
+    for (size_t i = 0; i < databases.size(); ++i) {
+        if (i > 0) std::cout << ", ";  // separator before every entry except the first
+        std::cout << databases[i];
+    }
+    std::cout << std::endl;
+
+    // 3) Table operations in the new database: exists? -> create -> exists? -> list
+    std::cout << "--- Table operations ---" << std::endl;
+
+    fluss::TablePath table_path(db_name, table_name);
+
+    bool table_exists_flag = false;  // out-parameter reused by both TableExists calls
+    check("table_exists (before create)", admin.TableExists(table_path, table_exists_flag));
+    std::cout << "Table " << db_name << "." << table_name
+              << " exists before create: " << (table_exists_flag ? "yes" : "no") << std::endl;
+
+    auto schema = fluss::Schema::NewBuilder()
+                      .AddColumn("id", fluss::DataType::Int())
+                      .AddColumn("name", fluss::DataType::String())
+                      .Build();
+    auto descriptor = fluss::TableDescriptor::NewBuilder()
+                          .SetSchema(schema)
+                          .SetBucketCount(1)
+                          .SetComment("admin example table")
+                          .Build();
+
+    check("create_table", admin.CreateTable(table_path, descriptor, true));
+
+    check("table_exists (after create)", admin.TableExists(table_path, table_exists_flag));
+    std::cout << "Table exists after create: " << (table_exists_flag ? "yes" : "no") << std::endl;
+
+    std::vector<std::string> tables;
+    check("list_tables", admin.ListTables(db_name, tables));
+    std::cout << "List tables in " << db_name << " (" << tables.size() << "): ";
+    for (size_t i = 0; i < tables.size(); ++i) {
+        if (i > 0) std::cout << ", ";  // separator before every entry except the first
+        std::cout << tables[i];
+    }
+    std::cout << std::endl;
+
+    // 4) Cleanup: drop table, then database (bool args presumably ignore-missing/cascade; confirm)
+    std::cout << "--- Cleanup ---" << std::endl;
+    check("drop_table", admin.DropTable(table_path, true));
+    check("drop_database", admin.DropDatabase(db_name, true, true));
+
+    check("database_exists (after drop)", admin.DatabaseExists(db_name, exists));
+    std::cout << "Database exists after drop: " << (exists ? "yes" : "no") << std::endl;
+
+    std::cout << "Admin example completed successfully." << std::endl;
+    return 0;
+}
diff --git a/fluss-rust/bindings/cpp/examples/bazel-consumer/build/BUILD.bazel b/fluss-rust/bindings/cpp/examples/bazel-consumer/build/BUILD.bazel
new file mode 100644
index 0000000000..afd35edd7e
--- /dev/null
+++ b/fluss-rust/bindings/cpp/examples/bazel-consumer/build/BUILD.bazel
@@ -0,0 +1,25 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+load("@rules_cc//cc:defs.bzl", "cc_binary")
+
+cc_binary(
+    name = "consumer_build",  # example binary for the "build" dependency mode
+    srcs = ["main.cc"],
+    copts = ["-std=c++17"],  # compile main.cc as C++17
+    deps = ["@fluss-cpp//bindings/cpp:fluss_cpp"],  # fluss_cpp target from the fluss-cpp module
+)
diff --git a/fluss-rust/bindings/cpp/examples/bazel-consumer/build/MODULE.bazel b/fluss-rust/bindings/cpp/examples/bazel-consumer/build/MODULE.bazel
new file mode 100644
index 0000000000..f31165c1cd
--- /dev/null
+++ b/fluss-rust/bindings/cpp/examples/bazel-consumer/build/MODULE.bazel
@@ -0,0 +1,39 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module(name = "fluss_cpp_consumer_build")
+
+bazel_dep(name = "rules_cc", version = "0.2.14")  # provides cc_binary for BUILD.bazel
+bazel_dep(name = "fluss-cpp", version = "0.1.0")  # resolved through the local override below
+
+# Local override for repository-local validation only.
+local_path_override(
+    module_name = "fluss-cpp",
+    # Repository root path (the directory containing `bindings/cpp`).
+    path = "../../../../../",
+)
+
+fluss_cpp = use_extension("@fluss-cpp//bindings/cpp/bazel/cpp:deps.bzl", "cpp_sdk")
+fluss_cpp.config(
+    mode = "build",  # NOTE(review): presumably builds Arrow C++ from source; confirm in deps.bzl
+    protobuf_version = "3.25.5",
+    arrow_cpp_version = "19.0.1",
+    ep_cmake_ranlib = "/usr/bin/ranlib",  # absolute host tool paths; adjust if yours differ
+    ep_cmake_ar = "/usr/bin/ar",
+    ep_cmake_nm = "/usr/bin/nm",
+)
+use_repo(fluss_cpp, "apache_arrow_cpp")  # make the extension's Arrow repo visible to this module
diff --git a/fluss-rust/bindings/cpp/examples/bazel-consumer/build/main.cc b/fluss-rust/bindings/cpp/examples/bazel-consumer/build/main.cc
new file mode 100644
index 0000000000..87e5b6820f
--- /dev/null
+++ b/fluss-rust/bindings/cpp/examples/bazel-consumer/build/main.cc
@@ -0,0 +1,28 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "fluss.hpp"
+
+#include <iostream>
+
+int main() {
+    fluss::TablePath demo_path("demo_db", "demo_table");  // no connection is opened here
+    const auto rendered = demo_path.ToString();
+    std::cout << "Bazel build-mode dependency example ready: " << rendered << std::endl;
+    return 0;
+}
+
diff --git a/fluss-rust/bindings/cpp/examples/bazel-consumer/system/BUILD.bazel b/fluss-rust/bindings/cpp/examples/bazel-consumer/system/BUILD.bazel
new file mode 100644
index 0000000000..2f24e6dec7
--- /dev/null
+++ b/fluss-rust/bindings/cpp/examples/bazel-consumer/system/BUILD.bazel
@@ -0,0 +1,25 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+load("@rules_cc//cc:defs.bzl", "cc_binary")
+
+cc_binary(
+    name = "consumer_system",  # example binary for the "system" dependency mode
+    srcs = ["main.cc"],
+    copts = ["-std=c++17"],  # compile main.cc as C++17
+    deps = ["@fluss-cpp//bindings/cpp:fluss_cpp"],  # fluss_cpp target from the fluss-cpp module
+)
diff --git a/fluss-rust/bindings/cpp/examples/bazel-consumer/system/MODULE.bazel b/fluss-rust/bindings/cpp/examples/bazel-consumer/system/MODULE.bazel
new file mode 100644
index 0000000000..2a4d6a6584
--- /dev/null
+++ b/fluss-rust/bindings/cpp/examples/bazel-consumer/system/MODULE.bazel
@@ -0,0 +1,44 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+module(name = "fluss_cpp_consumer_system")
+
+bazel_dep(name = "rules_cc", version = "0.2.14")  # provides cc_binary for BUILD.bazel
+bazel_dep(name = "fluss-cpp", version = "0.1.0")  # resolved through the local override below
+
+# Repository-local example path (repository root containing `bindings/cpp`).
+# If you copy this example out of tree, replace this with an absolute path
+# (for example: /path/to/fluss-rust).
+local_path_override(
+    module_name = "fluss-cpp",
+    path = "../../../../../",
+)
+
+# Intended interface for preinstalled protoc + Arrow C++ environments.
+fluss_cpp = use_extension("@fluss-cpp//bindings/cpp/bazel/cpp:deps.bzl", "cpp_sdk")
+fluss_cpp.config(
+    mode = "system",
+    protobuf_version = "3.25.5",
+    arrow_cpp_version = "19.0.1",
+    # Adjust these paths for your environment.
+    # Ubuntu 22.04 (apt / custom package) commonly uses lib/x86_64-linux-gnu.
+    system_arrow_prefix = "/usr",
+    system_arrow_include_dir = "include",  # NOTE(review): presumably relative to the prefix; confirm
+    system_arrow_shared_library = "lib/x86_64-linux-gnu/libarrow.so",
+    system_arrow_runtime_glob = "lib/x86_64-linux-gnu/libarrow.so*",  # glob: also matches versioned .so names
+)
+use_repo(fluss_cpp, "apache_arrow_cpp")  # make the extension's Arrow repo visible to this module
diff --git a/fluss-rust/bindings/cpp/examples/bazel-consumer/system/main.cc b/fluss-rust/bindings/cpp/examples/bazel-consumer/system/main.cc
new file mode 100644
index 0000000000..b1f0b70b84
--- /dev/null
+++ b/fluss-rust/bindings/cpp/examples/bazel-consumer/system/main.cc
@@ -0,0 +1,27 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "fluss.hpp"
+
+#include <iostream>
+
+int main() {
+    fluss::TablePath demo_path("demo_db", "demo_table");  // no connection is opened here
+    const auto rendered = demo_path.ToString();
+    std::cout << "Bazel system-mode dependency example ready: " << rendered << std::endl;
+    return 0;
+}
diff --git a/fluss-rust/bindings/cpp/examples/example.cpp b/fluss-rust/bindings/cpp/examples/example.cpp
new file mode 100644
index 0000000000..d86ee5cda7
--- /dev/null
+++ b/fluss-rust/bindings/cpp/examples/example.cpp
@@ -0,0 +1,790 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <arrow/array/builder_binary.h>
+#include <arrow/array/builder_primitive.h>
+#include <arrow/record_batch.h>
+#include <arrow/type.h>
+
+#include <chrono>
+#include <iostream>
+#include <unordered_map>
+#include <vector>
+
+#include "fluss.hpp"
+
+static void check(const char* step, const fluss::Result& r) {
+    if (!r.Ok()) {
+        std::cerr << step << " failed: code=" << r.error_code << " msg=" << r.error_message
+                  << std::endl;
+        std::exit(1);
+    }
+}
+
+int main() {
+    // 1) Connect
+    fluss::Configuration config;
+    config.bootstrap_servers = "127.0.0.1:9123";
+
+    fluss::Connection conn;
+    check("create", fluss::Connection::Create(config, conn));
+
+    // 2) Admin
+    fluss::Admin admin;
+    check("get_admin", conn.GetAdmin(admin));
+
+    fluss::TablePath table_path("fluss", "sample_table_cpp_v1");
+
+    // 2.1) Drop table if exists
+    std::cout << "Dropping table if exists..." << std::endl;
+    auto drop_result = admin.DropTable(table_path, true);
+    if (drop_result.Ok()) {
+        std::cout << "Table dropped successfully" << std::endl;
+    } else {
+        std::cout << "Table drop result: " << drop_result.error_message << std::endl;
+    }
+
+    // 3) Schema with scalar and temporal columns
+    auto schema = fluss::Schema::NewBuilder()
+                      .AddColumn("id", fluss::DataType::Int())
+                      .AddColumn("name", fluss::DataType::String())
+                      .AddColumn("score", fluss::DataType::Float())
+                      .AddColumn("age", fluss::DataType::Int())
+                      .AddColumn("event_date", fluss::DataType::Date())
+                      .AddColumn("event_time", fluss::DataType::Time())
+                      .AddColumn("created_at", fluss::DataType::Timestamp())
+                      .AddColumn("updated_at", fluss::DataType::TimestampLtz())
+                      .Build();
+
+    auto descriptor = fluss::TableDescriptor::NewBuilder()
+                          .SetSchema(schema)
+                          .SetBucketCount(3)
+                          .SetComment("cpp example table with 3 buckets")
+                          .Build();
+
+    std::cout << "Creating table with 3 buckets..." << std::endl;
+    check("create_table", admin.CreateTable(table_path, descriptor, false));
+
+    // 4) Get table
+    fluss::Table table;
+    check("get_table", conn.GetTable(table_path, table));
+
+    // 5) Write rows with scalar and temporal values
+    fluss::AppendWriter writer;
+    check("new_append_writer", table.NewAppend().CreateWriter(writer));
+
+    struct RowData {
+        int id;
+        const char* name;
+        float score;
+        int age;
+        fluss::Date date;
+        fluss::Time time;
+        fluss::Timestamp ts_ntz;
+        fluss::Timestamp ts_ltz;
+    };
+
+    auto tp_now = std::chrono::system_clock::now();
+    std::vector<RowData> rows = {
+        {1, "Alice", 95.2f, 25, fluss::Date::FromYMD(2024, 6, 15), fluss::Time::FromHMS(14, 30, 45),
+         fluss::Timestamp::FromTimePoint(tp_now), fluss::Timestamp::FromMillis(1718467200000)},
+        {2, "Bob", 87.2f, 30, fluss::Date::FromYMD(2025, 1, 1), fluss::Time::FromHMS(0, 0, 0),
+         fluss::Timestamp::FromMillis(1735689600000),
+         fluss::Timestamp::FromMillisNanos(1735689600000, 500000)},
+        {3, "Charlie", 92.1f, 35, fluss::Date::FromYMD(1999, 12, 31),
+         fluss::Time::FromHMS(23, 59, 59), fluss::Timestamp::FromMillis(946684799999),
+         fluss::Timestamp::FromMillis(946684799999)},
+    };
+
+    // Fire-and-forget: queue rows, flush at end
+    for (const auto& r : rows) {
+        fluss::GenericRow row;
+        row.SetInt32(0, r.id);
+        row.SetString(1, r.name);
+        row.SetFloat32(2, r.score);
+        row.SetInt32(3, r.age);
+        row.SetDate(4, r.date);
+        row.SetTime(5, r.time);
+        row.SetTimestampNtz(6, r.ts_ntz);
+        row.SetTimestampLtz(7, r.ts_ltz);
+        check("append", writer.Append(row));
+    }
+    check("flush", writer.Flush());
+    std::cout << "Wrote " << rows.size() << " rows (fire-and-forget + flush)" << std::endl;
+
+    // Per-record acknowledgment
+    {
+        fluss::GenericRow row;
+        row.SetInt32(0, 100);
+        row.SetString(1, "AckTest");
+        row.SetFloat32(2, 99.9f);
+        row.SetInt32(3, 42);
+        row.SetDate(4, fluss::Date::FromYMD(2025, 3, 1));
+        row.SetTime(5, fluss::Time::FromHMS(12, 0, 0));
+        row.SetTimestampNtz(6, fluss::Timestamp::FromMillis(1740787200000));
+        row.SetTimestampLtz(7, fluss::Timestamp::FromMillis(1740787200000));
+        fluss::WriteResult wr;
+        check("append", writer.Append(row, wr));
+        check("wait", wr.Wait());
+        std::cout << "Row acknowledged by server" << std::endl;
+    }
+
+    // Append a row with all fields null (matches Rust log_table.rs all_supported_datatypes)
+    {
+        fluss::GenericRow row;
+        size_t field_count = 8;
+        for (size_t i = 0; i < field_count; ++i) {
+            row.SetNull(i);
+        }
+        check("append_null_row", writer.Append(row));
+    }
+    check("flush_null", writer.Flush());
+    std::cout << "Wrote row with all fields null" << std::endl;
+
+    // 6) Full scan — verify all column types including temporal
+    fluss::LogScanner scanner;
+    check("new_log_scanner", table.NewScan().CreateLogScanner(scanner));
+
+    auto info = table.GetTableInfo();
+    int buckets = info.num_buckets;
+    for (int b = 0; b < buckets; ++b) {
+        check("subscribe", scanner.Subscribe(b, 0));
+    }
+
+    fluss::ScanRecords records;
+    check("poll", scanner.Poll(5000, records));
+
+    // Flat iteration over all records (regardless of bucket)
+    std::cout << "Scanned records: " << records.Count() << " across " << records.BucketCount()
+              << " buckets" << std::endl;
+    for (const auto& rec : records) {
+        std::cout << "  offset=" << rec.offset << " timestamp=" << rec.timestamp << std::endl;
+    }
+
+    // Per-bucket access (with type verification)
+    bool scan_ok = true;
+    bool found_null_row = false;
+    for (const auto& tb : records.Buckets()) {
+        auto view = records.Records(tb);
+        std::cout << "  Bucket " << tb.bucket_id;
+        if (tb.partition_id.has_value()) {
+            std::cout << " (partition=" << *tb.partition_id << ")";
+        }
+        std::cout << ": " << view.Size() << " records" << std::endl;
+        for (const auto& rec : view) {
+            // Check if this is the all-null row
+            if (rec.row.IsNull(0)) {
+                found_null_row = true;
+                for (size_t i = 0; i < rec.row.FieldCount(); ++i) {
+                    if (!rec.row.IsNull(i)) {
+                        std::cerr << "ERROR: column " << i << " should be null" << std::endl;
+                        scan_ok = false;
+                    }
+                }
+                std::cout << "    [null row] all " << rec.row.FieldCount() << " fields are null"
+                          << std::endl;
+                continue;
+            }
+
+            // Non-null rows: verify types
+            if (rec.row.GetType(4) != fluss::TypeId::Date) {
+                std::cerr << "ERROR: field 4 expected Date, got "
+                          << static_cast<int>(rec.row.GetType(4)) << std::endl;
+                scan_ok = false;
+            }
+            if (rec.row.GetType(5) != fluss::TypeId::Time) {
+                std::cerr << "ERROR: field 5 expected Time, got "
+                          << static_cast<int>(rec.row.GetType(5)) << std::endl;
+                scan_ok = false;
+            }
+            if (rec.row.GetType(6) != fluss::TypeId::Timestamp) {
+                std::cerr << "ERROR: field 6 expected Timestamp, got "
+                          << static_cast<int>(rec.row.GetType(6)) << std::endl;
+                scan_ok = false;
+            }
+            if (rec.row.GetType(7) != fluss::TypeId::TimestampLtz) {
+                std::cerr << "ERROR: field 7 expected TimestampLtz, got "
+                          << static_cast<int>(rec.row.GetType(7)) << std::endl;
+                scan_ok = false;
+            }
+
+            // Name-based getters
+            auto date = rec.row.GetDate("event_date");
+            auto time = rec.row.GetTime("event_time");
+            auto ts_ntz = rec.row.GetTimestamp("created_at");
+            auto ts_ltz = rec.row.GetTimestamp("updated_at");
+
+            std::cout << "    id=" << rec.row.GetInt32("id")
+                      << " name=" << rec.row.GetString("name")
+                      << " score=" << rec.row.GetFloat32("score")
+                      << " age=" << rec.row.GetInt32("age") << " date=" << date.Year() << "-"
+                      << date.Month() << "-" << date.Day() << " time=" << time.Hour() << ":"
+                      << time.Minute() << ":" << time.Second() << " ts_ntz=" << ts_ntz.epoch_millis
+                      << " ts_ltz=" << ts_ltz.epoch_millis << "+" << ts_ltz.nano_of_millisecond
+                      << "ns" << std::endl;
+        }
+    }
+
+    if (!found_null_row) {
+        std::cerr << "ERROR: did not find the all-null row" << std::endl;
+        scan_ok = false;
+    }
+
+    if (!scan_ok) {
+        std::cerr << "Full scan type verification FAILED!" << std::endl;
+        std::exit(1);
+    }
+
+    // 7a) Projected scan by index — project [id, updated_at(TimestampLtz)] to verify
+    //     NTZ/LTZ disambiguation works with column index remapping
+    std::vector<size_t> projected_columns = {0, 7};
+    fluss::LogScanner projected_scanner;
+    check("new_log_scanner_with_projection",
+          table.NewScan().ProjectByIndex(projected_columns).CreateLogScanner(projected_scanner));
+
+    for (int b = 0; b < buckets; ++b) {
+        check("subscribe_projected", projected_scanner.Subscribe(b, 0));
+    }
+
+    fluss::ScanRecords projected_records;
+    check("poll_projected", projected_scanner.Poll(5000, projected_records));
+
+    std::cout << "Projected records: " << projected_records.Count() << std::endl;
+    for (const auto& tb : projected_records.Buckets()) {
+        for (const auto& rec : projected_records.Records(tb)) {
+            if (rec.row.FieldCount() != 2) {
+                std::cerr << "ERROR: expected 2 fields, got " << rec.row.FieldCount() << std::endl;
+                scan_ok = false;
+                continue;
+            }
+            // Skip the all-null row
+            if (rec.row.IsNull(0)) {
+                std::cout << "  [null row] skipped" << std::endl;
+                continue;
+            }
+            if (rec.row.GetType(0) != fluss::TypeId::Int) {
+                std::cerr << "ERROR: projected field 0 expected Int, got "
+                          << static_cast<int>(rec.row.GetType(0)) << std::endl;
+                scan_ok = false;
+            }
+            if (rec.row.GetType(1) != fluss::TypeId::TimestampLtz) {
+                std::cerr << "ERROR: projected field 1 expected TimestampLtz, got "
+                          << static_cast<int>(rec.row.GetType(1)) << std::endl;
+                scan_ok = false;
+            }
+
+            auto ts = rec.row.GetTimestamp(1);
+            std::cout << "  id=" << rec.row.GetInt32(0) << " updated_at=" << ts.epoch_millis << "+"
+                      << ts.nano_of_millisecond << "ns" << std::endl;
+        }
+    }
+
+    // 7b) Projected scan by column names — same columns as above but using names
+    fluss::LogScanner name_projected_scanner;
+    check("project_by_name_scanner", table.NewScan()
+                                         .ProjectByName({"id", "updated_at"})
+                                         .CreateLogScanner(name_projected_scanner));
+
+    for (int b = 0; b < buckets; ++b) {
+        check("subscribe_name_projected", name_projected_scanner.Subscribe(b, 0));
+    }
+
+    fluss::ScanRecords name_projected_records;
+    check("poll_name_projected", name_projected_scanner.Poll(5000, name_projected_records));
+
+    std::cout << "Name-projected records: " << name_projected_records.Count() << std::endl;
+    for (const auto& tb : name_projected_records.Buckets()) {
+        for (const auto& rec : name_projected_records.Records(tb)) {
+            if (rec.row.FieldCount() != 2) {
+                std::cerr << "ERROR: expected 2 fields, got " << rec.row.FieldCount() << std::endl;
+                scan_ok = false;
+                continue;
+            }
+            // Skip the all-null row
+            if (rec.row.IsNull(0)) {
+                std::cout << "  [null row] skipped" << std::endl;
+                continue;
+            }
+            if (rec.row.GetType(0) != fluss::TypeId::Int) {
+                std::cerr << "ERROR: name-projected field 0 expected Int, got "
+                          << static_cast<int>(rec.row.GetType(0)) << std::endl;
+                scan_ok = false;
+            }
+            if (rec.row.GetType(1) != fluss::TypeId::TimestampLtz) {
+                std::cerr << "ERROR: name-projected field 1 expected TimestampLtz, got "
+                          << static_cast<int>(rec.row.GetType(1)) << std::endl;
+                scan_ok = false;
+            }
+
+            auto ts = rec.row.GetTimestamp(1);
+            std::cout << "  id=" << rec.row.GetInt32(0) << " updated_at=" << ts.epoch_millis << "+"
+                      << ts.nano_of_millisecond << "ns" << std::endl;
+        }
+    }
+
+    if (scan_ok) {
+        std::cout << "Scan verification passed!" << std::endl;
+    } else {
+        std::cerr << "Scan verification FAILED!" << std::endl;
+        std::exit(1);
+    }
+
+    // 8) List offsets examples
+    std::cout << "\n=== List Offsets Examples ===" << std::endl;
+
+    std::vector<int32_t> all_bucket_ids;
+    all_bucket_ids.reserve(buckets);
+    for (int b = 0; b < buckets; ++b) {
+        all_bucket_ids.push_back(b);
+    }
+
+    std::unordered_map<int32_t, int64_t> earliest_offsets;
+    check("list_earliest_offsets",
+          admin.ListOffsets(table_path, all_bucket_ids, fluss::OffsetSpec::Earliest(),
+                            earliest_offsets));
+    std::cout << "Earliest offsets:" << std::endl;
+    for (const auto& [bucket_id, offset] : earliest_offsets) {
+        std::cout << "  Bucket " << bucket_id << ": offset=" << offset << std::endl;
+    }
+
+    std::unordered_map<int32_t, int64_t> latest_offsets;
+    check("list_latest_offsets", admin.ListOffsets(table_path, all_bucket_ids,
+                                                   fluss::OffsetSpec::Latest(), latest_offsets));
+    std::cout << "Latest offsets:" << std::endl;
+    for (const auto& [bucket_id, offset] : latest_offsets) {
+        std::cout << "  Bucket " << bucket_id << ": offset=" << offset << std::endl;
+    }
+
+    auto now = std::chrono::system_clock::now();
+    auto one_hour_ago = now - std::chrono::hours(1);
+    auto timestamp_ms =
+        std::chrono::duration_cast<std::chrono::milliseconds>(one_hour_ago.time_since_epoch())
+            .count();
+
+    std::unordered_map<int32_t, int64_t> timestamp_offsets;
+    check("list_timestamp_offsets",
+          admin.ListOffsets(table_path, all_bucket_ids, fluss::OffsetSpec::Timestamp(timestamp_ms),
+                            timestamp_offsets));
+    std::cout << "Offsets for timestamp " << timestamp_ms << " (1 hour ago):" << std::endl;
+    for (const auto& [bucket_id, offset] : timestamp_offsets) {
+        std::cout << "  Bucket " << bucket_id << ": offset=" << offset << std::endl;
+    }
+
+    // 9) Batch subscribe
+    std::cout << "\n=== Batch Subscribe Example ===" << std::endl;
+    fluss::LogScanner batch_scanner;
+    check("new_log_scanner_for_batch", table.NewScan().CreateLogScanner(batch_scanner));
+
+    std::vector<fluss::BucketSubscription> subscriptions;
+    for (const auto& [bucket_id, offset] : earliest_offsets) {
+        subscriptions.push_back({bucket_id, offset});
+        std::cout << "Preparing subscription: bucket=" << bucket_id << ", offset=" << offset
+                  << std::endl;
+    }
+
+    check("subscribe_buckets", batch_scanner.Subscribe(subscriptions));
+    std::cout << "Batch subscribed to " << subscriptions.size() << " buckets" << std::endl;
+
+    fluss::ScanRecords batch_records;
+    check("poll_batch", batch_scanner.Poll(5000, batch_records));
+
+    std::cout << "Scanned " << batch_records.Count() << " records from batch subscription"
+              << std::endl;
+    for (const auto& tb : batch_records.Buckets()) {
+        size_t shown = 0;
+        for (const auto& rec : batch_records.Records(tb)) {
+            if (shown < 5) {
+                std::cout << "  bucket_id=" << tb.bucket_id << ", offset=" << rec.offset
+                          << ", timestamp=" << rec.timestamp << std::endl;
+            }
+            ++shown;
+        }
+        if (shown > 5) {
+            std::cout << "  ... and " << (shown - 5) << " more records in bucket " << tb.bucket_id
+                      << std::endl;
+        }
+    }
+
+    // 9.1) Unsubscribe from a bucket
+    std::cout << "\n=== Unsubscribe Example ===" << std::endl;
+    check("unsubscribe", batch_scanner.Unsubscribe(subscriptions[0].bucket_id));
+    std::cout << "Unsubscribed from bucket " << subscriptions[0].bucket_id << std::endl;
+
+    // 10) Arrow record batch polling
+    std::cout << "\n=== Testing Arrow Record Batch Polling ===" << std::endl;
+
+    fluss::LogScanner arrow_scanner;
+    check("new_record_batch_log_scanner",
+          table.NewScan().CreateRecordBatchLogScanner(arrow_scanner));
+
+    for (int b = 0; b < buckets; ++b) {
+        check("subscribe_arrow", arrow_scanner.Subscribe(b, 0));
+    }
+
+    fluss::ArrowRecordBatches arrow_batches;
+    check("poll_record_batch", arrow_scanner.PollRecordBatch(5000, arrow_batches));
+
+    std::cout << "Polled " << arrow_batches.Size() << " Arrow record batches" << std::endl;
+    for (size_t i = 0; i < arrow_batches.Size(); ++i) {
+        const auto& batch = arrow_batches[i];
+        if (batch->Available()) {
+            std::cout << "  Batch " << i << ": " << batch->GetArrowRecordBatch()->num_rows()
+                      << " rows" << std::endl;
+        } else {
+            std::cout << "  Batch " << i << ": not available" << std::endl;
+        }
+    }
+
+    // 11) Arrow record batch polling with projection
+    std::cout << "\n=== Testing Arrow Record Batch Polling with Projection ===" << std::endl;
+
+    fluss::LogScanner projected_arrow_scanner;
+    check("new_record_batch_log_scanner_with_projection",
+          table.NewScan()
+              .ProjectByIndex(projected_columns)
+              .CreateRecordBatchLogScanner(projected_arrow_scanner));
+
+    for (int b = 0; b < buckets; ++b) {
+        check("subscribe_projected_arrow", projected_arrow_scanner.Subscribe(b, 0));
+    }
+
+    fluss::ArrowRecordBatches projected_arrow_batches;
+    check("poll_projected_record_batch",
+          projected_arrow_scanner.PollRecordBatch(5000, projected_arrow_batches));
+
+    std::cout << "Polled " << projected_arrow_batches.Size() << " projected Arrow record batches"
+              << std::endl;
+    for (size_t i = 0; i < projected_arrow_batches.Size(); ++i) {
+        const auto& batch = projected_arrow_batches[i];
+        if (batch->Available()) {
+            std::cout << "  Batch " << i << ": " << batch->GetArrowRecordBatch()->num_rows()
+                      << " rows" << std::endl;
+        } else {
+            std::cout << "  Batch " << i << ": not available" << std::endl;
+        }
+    }
+
+    // 12) AppendArrowBatch — write an Arrow RecordBatch directly
+    std::cout << "\n=== AppendArrowBatch Example ===" << std::endl;
+    {
+        // Build an Arrow RecordBatch matching sample_table_cpp_v1 schema:
+        //   id:INT, name:STRING, score:FLOAT, age:INT,
+        //   event_date:DATE, event_time:TIME, created_at:TIMESTAMP, updated_at:TIMESTAMP_LTZ
+        auto arrow_schema = arrow::schema({
+            arrow::field("id", arrow::int32()),
+            arrow::field("name", arrow::utf8()),
+            arrow::field("score", arrow::float32()),
+            arrow::field("age", arrow::int32()),
+            arrow::field("event_date", arrow::date32()),
+            arrow::field("event_time", arrow::time32(arrow::TimeUnit::MILLI)),
+            arrow::field("created_at", arrow::timestamp(arrow::TimeUnit::MICRO)),
+            arrow::field("updated_at", arrow::timestamp(arrow::TimeUnit::MICRO)),
+        });
+
+        arrow::Int32Builder id_builder;
+        arrow::StringBuilder name_builder;
+        arrow::FloatBuilder score_builder;
+        arrow::Int32Builder age_builder;
+        arrow::Date32Builder date_builder;
+        arrow::Time32Builder time_builder(arrow::time32(arrow::TimeUnit::MILLI),
+                                          arrow::default_memory_pool());
+        arrow::TimestampBuilder ts_ntz_builder(arrow::timestamp(arrow::TimeUnit::MICRO),
+                                               arrow::default_memory_pool());
+        arrow::TimestampBuilder ts_ltz_builder(arrow::timestamp(arrow::TimeUnit::MICRO),
+                                               arrow::default_memory_pool());
+
+        // Row 1
+        (void)id_builder.Append(200);
+        (void)name_builder.Append("ArrowAlice");
+        (void)score_builder.Append(88.5f);
+        (void)age_builder.Append(28);
+        (void)date_builder.Append(19888);               // days since epoch (2024-06-14)
+        (void)time_builder.Append(52245000);            // 14:30:45 in ms
+        (void)ts_ntz_builder.Append(1718467200000000);  // micros
+        (void)ts_ltz_builder.Append(1718467200000000);
+
+        // Row 2
+        (void)id_builder.Append(201);
+        (void)name_builder.Append("ArrowBob");
+        (void)score_builder.Append(91.3f);
+        (void)age_builder.Append(33);
+        (void)date_builder.Append(20089);    // 2025-01-01
+        (void)time_builder.Append(3600000);  // 01:00:00
+        (void)ts_ntz_builder.Append(1735689600000000);
+        (void)ts_ltz_builder.Append(1735689600000000);
+
+        auto batch_result = arrow::RecordBatch::Make(
+            arrow_schema, 2,
+            {id_builder.Finish().ValueOrDie(), name_builder.Finish().ValueOrDie(),
+             score_builder.Finish().ValueOrDie(), age_builder.Finish().ValueOrDie(),
+             date_builder.Finish().ValueOrDie(), time_builder.Finish().ValueOrDie(),
+             ts_ntz_builder.Finish().ValueOrDie(), ts_ltz_builder.Finish().ValueOrDie()});
+
+        check("append_arrow_batch", writer.AppendArrowBatch(batch_result));
+        check("flush_arrow", writer.Flush());
+        std::cout << "Wrote 2 rows via AppendArrowBatch" << std::endl;
+
+        // Verify by scanning from latest offsets
+        fluss::LogScanner arrow_write_scanner;
+        check("new_arrow_write_scanner", table.NewScan().CreateLogScanner(arrow_write_scanner));
+        for (const auto& [bid, off] : latest_offsets) {
+            check("subscribe_arrow_write", arrow_write_scanner.Subscribe(bid, off));
+        }
+
+        fluss::ScanRecords arrow_write_records;
+        check("poll_arrow_write", arrow_write_scanner.Poll(5000, arrow_write_records));
+        std::cout << "Scanned " << arrow_write_records.Count()
+                  << " records written via AppendArrowBatch:" << std::endl;
+        for (const auto& tb : arrow_write_records.Buckets()) {
+            for (const auto& rec : arrow_write_records.Records(tb)) {
+                std::cout << "  id=" << rec.row.GetInt32(0) << " name=" << rec.row.GetString(1)
+                          << " score=" << rec.row.GetFloat32(2) << std::endl;
+            }
+        }
+    }
+
+    // 13) Decimal support example
+    std::cout << "\n=== Decimal Support Example ===" << std::endl;
+
+    fluss::TablePath decimal_table_path("fluss", "decimal_table_cpp_v1");
+
+    // Drop table if exists
+    admin.DropTable(decimal_table_path, true);
+
+    // Create schema with decimal columns
+    auto decimal_schema = fluss::Schema::NewBuilder()
+                              .AddColumn("id", fluss::DataType::Int())
+                              .AddColumn("price", fluss::DataType::Decimal(10, 2))   // compact
+                              .AddColumn("amount", fluss::DataType::Decimal(28, 8))  // i128
+                              .Build();
+
+    auto decimal_descriptor = fluss::TableDescriptor::NewBuilder()
+                                  .SetSchema(decimal_schema)
+                                  .SetBucketCount(1)
+                                  .SetComment("cpp decimal example table")
+                                  .Build();
+
+    check("create_decimal_table", admin.CreateTable(decimal_table_path, decimal_descriptor, false));
+
+    // Get table and writer
+    fluss::Table decimal_table;
+    check("get_decimal_table", conn.GetTable(decimal_table_path, decimal_table));
+
+    fluss::AppendWriter decimal_writer;
+    check("new_decimal_writer", decimal_table.NewAppend().CreateWriter(decimal_writer));
+
+    // Just provide the value — Rust resolves (p,s) from schema
+    {
+        fluss::GenericRow row;
+        row.SetInt32(0, 1);
+        row.SetDecimal(1, "123.45");      // Rust knows DECIMAL(10,2)
+        row.SetDecimal(2, "1.00000000");  // Rust knows DECIMAL(28,8)
+        check("append_decimal", decimal_writer.Append(row));
+    }
+    {
+        fluss::GenericRow row;
+        row.SetInt32(0, 2);
+        row.SetDecimal(1, "-999.99");
+        row.SetDecimal(2, "3.14159265");
+        check("append_decimal", decimal_writer.Append(row));
+    }
+    {
+        fluss::GenericRow row;
+        row.SetInt32(0, 3);
+        row.SetDecimal(1, "500.00");
+        row.SetDecimal(2, "2.71828182");
+        check("append_decimal", decimal_writer.Append(row));
+    }
+    check("flush_decimal", decimal_writer.Flush());
+    std::cout << "Wrote 3 decimal rows" << std::endl;
+
+    // Scan and read back
+    fluss::LogScanner decimal_scanner;
+    check("new_decimal_scanner", decimal_table.NewScan().CreateLogScanner(decimal_scanner));
+    check("subscribe_decimal", decimal_scanner.Subscribe(0, 0));
+
+    fluss::ScanRecords decimal_records;
+    check("poll_decimal", decimal_scanner.Poll(5000, decimal_records));
+
+    std::cout << "Scanned decimal records: " << decimal_records.Count() << std::endl;
+    for (const auto& tb : decimal_records.Buckets()) {
+        for (const auto& rec : decimal_records.Records(tb)) {
+            std::cout << "  id=" << rec.row.GetInt32(0) << " price=" << rec.row.GetDecimalString(1)
+                      << " amount=" << rec.row.GetDecimalString(2)
+                      << " is_decimal=" << rec.row.IsDecimal(1) << std::endl;
+        }
+    }
+
+    // 14) Partitioned table example
+    std::cout << "\n=== Partitioned Table Example ===" << std::endl;
+
+    fluss::TablePath partitioned_table_path("fluss", "partitioned_table_cpp_v1");
+
+    // Drop if exists
+    check("drop_partitioned_table_if_exists", admin.DropTable(partitioned_table_path, true));
+
+    // Create a partitioned table with a "region" partition key
+    auto partitioned_schema = fluss::Schema::NewBuilder()
+                                  .AddColumn("id", fluss::DataType::Int())
+                                  .AddColumn("region", fluss::DataType::String())
+                                  .AddColumn("value", fluss::DataType::BigInt())
+                                  .Build();
+
+    auto partitioned_descriptor = fluss::TableDescriptor::NewBuilder()
+                                      .SetSchema(partitioned_schema)
+                                      .SetPartitionKeys({"region"})
+                                      .SetBucketCount(1)
+                                      .SetComment("cpp partitioned table example")
+                                      .Build();
+
+    check("create_partitioned_table",
+          admin.CreateTable(partitioned_table_path, partitioned_descriptor, false));
+    std::cout << "Created partitioned table" << std::endl;
+
+    // Create partitions
+    check("create_partition_US",
+          admin.CreatePartition(partitioned_table_path, {{"region", "US"}}, true));
+    check("create_partition_EU",
+          admin.CreatePartition(partitioned_table_path, {{"region", "EU"}}, true));
+    std::cout << "Created partitions: US, EU" << std::endl;
+
+    // List all partitions
+    std::vector<fluss::PartitionInfo> partition_infos;
+    check("list_partition_infos",
+          admin.ListPartitionInfos(partitioned_table_path, partition_infos));
+    for (const auto& pi : partition_infos) {
+        std::cout << "  Partition: " << pi.partition_name << " (id=" << pi.partition_id << ")"
+                  << std::endl;
+    }
+
+    // List partitions with partial spec filter
+    std::vector<fluss::PartitionInfo> us_partition_infos;
+    check("list_partition_infos_with_spec",
+          admin.ListPartitionInfos(partitioned_table_path, {{"region", "US"}}, us_partition_infos));
+    std::cout << "  Filtered (region=US): " << us_partition_infos.size() << " partition(s)"
+              << std::endl;
+
+    // Write data to partitioned table
+    fluss::Table partitioned_table;
+    check("get_partitioned_table", conn.GetTable(partitioned_table_path, partitioned_table));
+
+    fluss::AppendWriter partitioned_writer;
+    check("new_partitioned_writer", partitioned_table.NewAppend().CreateWriter(partitioned_writer));
+
+    struct PartitionedRow {
+        int id;
+        const char* region;
+        int64_t value;
+    };
+
+    std::vector<PartitionedRow> partitioned_rows = {
+        {1, "US", 100},
+        {2, "US", 200},
+        {3, "EU", 300},
+        {4, "EU", 400},
+    };
+
+    for (const auto& r : partitioned_rows) {
+        fluss::GenericRow row;
+        row.SetInt32(0, r.id);
+        row.SetString(1, r.region);
+        row.SetInt64(2, r.value);
+        check("append_partitioned", partitioned_writer.Append(row));
+    }
+    check("flush_partitioned", partitioned_writer.Flush());
+    std::cout << "Wrote " << partitioned_rows.size() << " rows to partitioned table" << std::endl;
+
+    // 14.1) subscribe_partition_buckets: subscribe to each partition individually
+    std::cout << "\n--- Testing SubscribePartitionBuckets ---" << std::endl;
+    fluss::LogScanner partition_scanner;
+    check("new_partition_scanner", partitioned_table.NewScan().CreateLogScanner(partition_scanner));
+
+    for (const auto& pi : partition_infos) {
+        check("subscribe_partition_buckets",
+              partition_scanner.SubscribePartitionBuckets(pi.partition_id, 0, 0));
+        std::cout << "Subscribed to partition " << pi.partition_name << std::endl;
+    }
+
+    fluss::ScanRecords partition_records;
+    check("poll_partitioned", partition_scanner.Poll(5000, partition_records));
+    std::cout << "Scanned " << partition_records.Count() << " records from partitioned table"
+              << std::endl;
+    for (const auto& tb : partition_records.Buckets()) {
+        for (const auto& rec : partition_records.Records(tb)) {
+            std::cout << "  partition_id="
+                      << (tb.partition_id.has_value() ? std::to_string(*tb.partition_id) : "none")
+                      << ", id=" << rec.row.GetInt32(0) << ", region=" << rec.row.GetString(1)
+                      << ", value=" << rec.row.GetInt64(2) << std::endl;
+        }
+    }
+
+    // 14.2) subscribe_partition_buckets: batch subscribe to all partitions at once
+    std::cout << "\n--- Testing SubscribePartitionBuckets (batch) ---" << std::endl;
+    fluss::LogScanner partition_batch_scanner;
+    check("new_partition_batch_scanner",
+          partitioned_table.NewScan().CreateLogScanner(partition_batch_scanner));
+
+    std::vector<fluss::PartitionBucketSubscription> partition_subs;
+    for (const auto& pi : partition_infos) {
+        partition_subs.push_back({pi.partition_id, 0, 0});
+    }
+    check("subscribe_partition_buckets",
+          partition_batch_scanner.SubscribePartitionBuckets(partition_subs));
+    std::cout << "Batch subscribed to " << partition_subs.size() << " partition+bucket combinations"
+              << std::endl;
+
+    fluss::ScanRecords partition_batch_records;
+    check("poll_partition_batch", partition_batch_scanner.Poll(5000, partition_batch_records));
+    std::cout << "Scanned " << partition_batch_records.Count()
+              << " records from batch partition subscription" << std::endl;
+    for (const auto& tb : partition_batch_records.Buckets()) {
+        for (const auto& rec : partition_batch_records.Records(tb)) {
+            std::cout << "  id=" << rec.row.GetInt32(0) << ", region=" << rec.row.GetString(1)
+                      << ", value=" << rec.row.GetInt64(2) << std::endl;
+        }
+    }
+
+    // 14.3) UnsubscribePartition: unsubscribe from one partition, verify remaining
+    std::cout << "\n--- Testing UnsubscribePartition ---" << std::endl;
+    fluss::LogScanner unsub_partition_scanner;
+    check("new_unsub_partition_scanner",
+          partitioned_table.NewScan().CreateLogScanner(unsub_partition_scanner));
+
+    for (const auto& pi : partition_infos) {
+        check("subscribe_for_unsub",
+              unsub_partition_scanner.SubscribePartitionBuckets(pi.partition_id, 0, 0));
+    }
+    // Unsubscribe from the first partition
+    check("unsubscribe_partition",
+          unsub_partition_scanner.UnsubscribePartition(partition_infos[0].partition_id, 0));
+    std::cout << "Unsubscribed from partition " << partition_infos[0].partition_name << std::endl;
+
+    fluss::ScanRecords unsub_records;
+    check("poll_after_unsub", unsub_partition_scanner.Poll(5000, unsub_records));
+    std::cout << "After unsubscribe, scanned " << unsub_records.Count() << " records" << std::endl;
+    for (const auto& tb : unsub_records.Buckets()) {
+        for (const auto& rec : unsub_records.Records(tb)) {
+            std::cout << "  id=" << rec.row.GetInt32(0) << ", region=" << rec.row.GetString(1)
+                      << ", value=" << rec.row.GetInt64(2) << std::endl;
+        }
+    }
+
+    // Cleanup
+    check("drop_partitioned_table", admin.DropTable(partitioned_table_path, true));
+    std::cout << "Dropped partitioned table" << std::endl;
+    return 0;
+}
diff --git a/fluss-rust/bindings/cpp/examples/kv_example.cpp b/fluss-rust/bindings/cpp/examples/kv_example.cpp
new file mode 100644
index 0000000000..46ed01f682
--- /dev/null
+++ b/fluss-rust/bindings/cpp/examples/kv_example.cpp
@@ -0,0 +1,537 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "fluss.hpp"
+
+static void check(const char* step, const fluss::Result& r) {
+    if (!r.Ok()) {
+        std::cerr << step << " failed: code=" << r.error_code << " msg=" << r.error_message
+                  << std::endl;
+        std::exit(1);
+    }
+}
+
+int main() {
+    // 1) Connect and get Admin
+    fluss::Configuration config;
+    config.bootstrap_servers = "127.0.0.1:9123";
+
+    fluss::Connection conn;
+    check("create", fluss::Connection::Create(config, conn));
+
+    fluss::Admin admin;
+    check("get_admin", conn.GetAdmin(admin));
+
+    fluss::TablePath kv_table_path("fluss", "kv_table_cpp_v1");
+
+    // Drop if exists
+    admin.DropTable(kv_table_path, true);
+
+    // 2) Create a KV table with primary key, including decimal and temporal types
+    auto kv_schema = fluss::Schema::NewBuilder()
+                         .AddColumn("user_id", fluss::DataType::Int())
+                         .AddColumn("name", fluss::DataType::String())
+                         .AddColumn("email", fluss::DataType::String())
+                         .AddColumn("score", fluss::DataType::Float())
+                         .AddColumn("balance", fluss::DataType::Decimal(10, 2))
+                         .AddColumn("birth_date", fluss::DataType::Date())
+                         .AddColumn("login_time", fluss::DataType::Time())
+                         .AddColumn("created_at", fluss::DataType::Timestamp())
+                         .AddColumn("last_seen", fluss::DataType::TimestampLtz())
+                         .SetPrimaryKeys({"user_id"})
+                         .Build();
+
+    auto kv_descriptor = fluss::TableDescriptor::NewBuilder()
+                             .SetSchema(kv_schema)
+                             .SetBucketCount(3)
+                             .SetComment("cpp kv table example")
+                             .Build();
+
+    check("create_kv_table", admin.CreateTable(kv_table_path, kv_descriptor, false));
+    std::cout << "Created KV table with primary key" << std::endl;
+
+    fluss::Table kv_table;
+    check("get_kv_table", conn.GetTable(kv_table_path, kv_table));
+
+    // 3) Upsert rows using name-based Set()
+    //    - Set("balance", "1234.56") auto-routes to SetDecimal (schema-aware)
+    //    - Set("created_at", ts) auto-routes to SetTimestampNtz (schema-aware)
+    //    - Set("last_seen", ts) auto-routes to SetTimestampLtz (schema-aware)
+    std::cout << "\n--- Upsert Rows ---" << std::endl;
+    fluss::UpsertWriter upsert_writer;
+    check("new_upsert_writer", kv_table.NewUpsert().CreateWriter(upsert_writer));
+
+    // Fire-and-forget upserts (flush at the end)
+    {
+        auto row = kv_table.NewRow();
+        row.Set("user_id", 1);
+        row.Set("name", "Alice");
+        row.Set("email", "alice@example.com");
+        row.Set("score", 95.5f);
+        row.Set("balance", "1234.56");
+        row.Set("birth_date", fluss::Date::FromYMD(1990, 3, 15));
+        row.Set("login_time", fluss::Time::FromHMS(9, 30, 0));
+        row.Set("created_at", fluss::Timestamp::FromMillis(1700000000000));
+        row.Set("last_seen", fluss::Timestamp::FromMillis(1700000060000));
+        check("upsert_1", upsert_writer.Upsert(row));
+    }
+    {
+        auto row = kv_table.NewRow();
+        row.Set("user_id", 2);
+        row.Set("name", "Bob");
+        row.Set("email", "bob@example.com");
+        row.Set("score", 87.3f);
+        row.Set("balance", "567.89");
+        row.Set("birth_date", fluss::Date::FromYMD(1985, 7, 22));
+        row.Set("login_time", fluss::Time::FromHMS(14, 15, 30));
+        row.Set("created_at", fluss::Timestamp::FromMillis(1700000100000));
+        row.Set("last_seen", fluss::Timestamp::FromMillis(1700000200000));
+        check("upsert_2", upsert_writer.Upsert(row));
+    }
+
+    // Per-record acknowledgment
+    {
+        auto row = kv_table.NewRow();
+        row.Set("user_id", 3);
+        row.Set("name", "Charlie");
+        row.Set("email", "charlie@example.com");
+        row.Set("score", 92.0f);
+        row.Set("balance", "99999.99");
+        row.Set("birth_date", fluss::Date::FromYMD(2000, 1, 1));
+        row.Set("login_time", fluss::Time::FromHMS(23, 59, 59));
+        row.Set("created_at", fluss::Timestamp::FromMillis(1700000300000));
+        row.Set("last_seen", fluss::Timestamp::FromMillis(1700000400000));
+        fluss::WriteResult wr;
+        check("upsert_3", upsert_writer.Upsert(row, wr));
+        check("upsert_3_wait", wr.Wait());
+        std::cout << "Upsert acknowledged by server" << std::endl;
+    }
+
+    check("upsert_flush", upsert_writer.Flush());
+    std::cout << "Upserted 3 rows" << std::endl;
+
+    // 4) Lookup by primary key — verify all types round-trip
+    std::cout << "\n--- Lookup by Primary Key ---" << std::endl;
+    fluss::Lookuper lookuper;
+    check("new_lookuper", kv_table.NewLookup().CreateLookuper(lookuper));
+
+    // Lookup existing key
+    {
+        auto pk_row = kv_table.NewRow();
+        pk_row.Set("user_id", 1);
+
+        fluss::LookupResult result;
+        check("lookup_1", lookuper.Lookup(pk_row, result));
+        if (result.Found()) {
+            // Name-based getters — same data as index-based but self-documenting
+            auto date = result.GetDate("birth_date");
+            auto time = result.GetTime("login_time");
+            auto created = result.GetTimestamp("created_at");
+            auto seen = result.GetTimestamp("last_seen");
+            std::cout << "Found user_id=1:"
+                      << "\n  name=" << result.GetString("name")
+                      << "\n  email=" << result.GetString("email")
+                      << "\n  score=" << result.GetFloat32("score")
+                      << "\n  balance=" << result.GetDecimalString("balance")
+                      << "\n  birth_date=" << date.Year() << "-" << date.Month() << "-"
+                      << date.Day() << "\n  login_time=" << time.Hour() << ":" << time.Minute()
+                      << ":" << time.Second() << "\n  created_at(ms)=" << created.epoch_millis
+                      << "\n  last_seen(ms)=" << seen.epoch_millis << std::endl;
+        } else {
+            std::cerr << "ERROR: Expected to find user_id=1" << std::endl;
+            std::exit(1);
+        }
+    }
+
+    // Lookup non-existing key
+    {
+        auto pk_row = kv_table.NewRow();
+        pk_row.Set("user_id", 999);
+
+        fluss::LookupResult result;
+        check("lookup_999", lookuper.Lookup(pk_row, result));
+        if (!result.Found()) {
+            std::cout << "user_id=999 not found (expected)" << std::endl;
+        } else {
+            std::cerr << "ERROR: Expected user_id=999 to not be found" << std::endl;
+            std::exit(1);
+        }
+    }
+
+    // 4b) Null row round-trip (matches Rust kv_table.rs all_supported_datatypes)
+    //     Upsert a row with all non-PK fields null, lookup, verify IsNull
+    std::cout << "\n--- Null Row Round-Trip ---" << std::endl;
+    {
+        auto row = kv_table.NewRow();
+        row.Set("user_id", 100);
+        row.SetNull(1);  // name
+        row.SetNull(2);  // email
+        row.SetNull(3);  // score
+        row.SetNull(4);  // balance
+        row.SetNull(5);  // birth_date
+        row.SetNull(6);  // login_time
+        row.SetNull(7);  // created_at
+        row.SetNull(8);  // last_seen
+        fluss::WriteResult wr;
+        check("upsert_null_row", upsert_writer.Upsert(row, wr));
+        check("upsert_null_row_wait", wr.Wait());
+    }
+    {
+        auto pk_row = kv_table.NewRow();
+        pk_row.Set("user_id", 100);
+
+        fluss::LookupResult result;
+        check("lookup_null_row", lookuper.Lookup(pk_row, result));
+        if (!result.Found()) {
+            std::cerr << "ERROR: Expected to find user_id=100 (null row)" << std::endl;
+            std::exit(1);
+        }
+
+        // Verify PK is not null
+        if (result.IsNull(0)) {
+            std::cerr << "ERROR: PK (user_id) should not be null" << std::endl;
+            std::exit(1);
+        }
+
+        // Verify all nullable columns are null (matches Rust is_null_at assertions)
+        bool null_ok = true;
+        for (size_t i = 1; i < result.FieldCount(); ++i) {
+            if (!result.IsNull(i)) {
+                std::cerr << "ERROR: column " << i << " should be null" << std::endl;
+                null_ok = false;
+            }
+        }
+        if (null_ok) {
+            std::cout << "Null row verified: all " << (result.FieldCount() - 1)
+                      << " nullable fields are null" << std::endl;
+        } else {
+            std::exit(1);
+        }
+    }
+
+    // 5) Update via upsert (overwrite existing key)
+    std::cout << "\n--- Update via Upsert ---" << std::endl;
+    {
+        auto row = kv_table.NewRow();
+        row.Set("user_id", 1);
+        row.Set("name", "Alice Updated");
+        row.Set("email", "alice.new@example.com");
+        row.Set("score", 99.0f);
+        row.Set("balance", "9999.00");
+        row.Set("birth_date", fluss::Date::FromYMD(1990, 3, 15));
+        row.Set("login_time", fluss::Time::FromHMS(10, 0, 0));
+        row.Set("created_at", fluss::Timestamp::FromMillis(1700000000000));
+        row.Set("last_seen", fluss::Timestamp::FromMillis(1700000500000));
+        fluss::WriteResult wr;
+        check("upsert_update", upsert_writer.Upsert(row, wr));
+        check("upsert_update_wait", wr.Wait());
+    }
+
+    // Verify update
+    {
+        auto pk_row = kv_table.NewRow();
+        pk_row.Set("user_id", 1);
+
+        fluss::LookupResult result;
+        check("lookup_updated", lookuper.Lookup(pk_row, result));
+        if (result.Found() && result.GetString(1) == "Alice Updated") {
+            std::cout << "Update verified: name=" << result.GetString(1)
+                      << " balance=" << result.GetDecimalString(4)
+                      << " last_seen(ms)=" << result.GetTimestamp(8).epoch_millis << std::endl;
+        } else {
+            std::cerr << "ERROR: Update verification failed" << std::endl;
+            std::exit(1);
+        }
+    }
+
+    // 6) Delete by primary key
+    std::cout << "\n--- Delete by Primary Key ---" << std::endl;
+    {
+        auto pk_row = kv_table.NewRow();
+        pk_row.Set("user_id", 2);
+        fluss::WriteResult wr;
+        check("delete_2", upsert_writer.Delete(pk_row, wr));
+        check("delete_2_wait", wr.Wait());
+        std::cout << "Deleted user_id=2" << std::endl;
+    }
+
+    // Verify deletion
+    {
+        auto pk_row = kv_table.NewRow();
+        pk_row.Set("user_id", 2);
+
+        fluss::LookupResult result;
+        check("lookup_deleted", lookuper.Lookup(pk_row, result));
+        if (!result.Found()) {
+            std::cout << "Delete verified: user_id=2 not found" << std::endl;
+        } else {
+            std::cerr << "ERROR: Expected user_id=2 to be deleted" << std::endl;
+            std::exit(1);
+        }
+    }
+
+    // 7) Partial update by column names
+    std::cout << "\n--- Partial Update by Column Names ---" << std::endl;
+    fluss::UpsertWriter partial_writer;
+    check("new_partial_upsert_writer", kv_table.NewUpsert()
+                                           .PartialUpdateByName({"user_id", "balance", "last_seen"})
+                                           .CreateWriter(partial_writer));
+
+    {
+        auto row = kv_table.NewRow();
+        row.Set("user_id", 3);
+        row.Set("balance", "50000.00");
+        row.Set("last_seen", fluss::Timestamp::FromMillis(1700000999000));
+        fluss::WriteResult wr;
+        check("partial_upsert", partial_writer.Upsert(row, wr));
+        check("partial_upsert_wait", wr.Wait());
+        std::cout << "Partial update: set balance=50000.00, last_seen for user_id=3" << std::endl;
+    }
+
+    // Verify partial update (other fields unchanged)
+    {
+        auto pk_row = kv_table.NewRow();
+        pk_row.Set("user_id", 3);
+
+        fluss::LookupResult result;
+        check("lookup_partial", lookuper.Lookup(pk_row, result));
+        if (result.Found()) {
+            std::cout << "Partial update verified:"
+                      << "\n  name=" << result.GetString(1) << " (unchanged)"
+                      << "\n  balance=" << result.GetDecimalString(4) << " (updated)"
+                      << "\n  last_seen(ms)=" << result.GetTimestamp(8).epoch_millis << " (updated)"
+                      << std::endl;
+        } else {
+            std::cerr << "ERROR: Expected to find user_id=3" << std::endl;
+            std::exit(1);
+        }
+    }
+
+    // 8) Partial update by column indices (using index-based setters for lower overhead)
+    std::cout << "\n--- Partial Update by Column Indices ---" << std::endl;
+    fluss::UpsertWriter partial_writer_idx;
+    // Columns: 0=user_id (PK), 1=name — update name only
+    check("new_partial_upsert_writer_idx",
+          kv_table.NewUpsert().PartialUpdateByIndex({0, 1}).CreateWriter(partial_writer_idx));
+
+    {
+        // Index-based setters: lighter than name-based, useful for hot paths
+        fluss::GenericRow row;
+        row.SetInt32(0, 3);                   // user_id (PK)
+        row.SetString(1, "Charlie Updated");  // name
+        fluss::WriteResult wr;
+        check("partial_upsert_idx", partial_writer_idx.Upsert(row, wr));
+        check("partial_upsert_idx_wait", wr.Wait());
+        std::cout << "Partial update by indices: set name='Charlie Updated' for user_id=3"
+                  << std::endl;
+    }
+
+    // Verify: name changed, balance/last_seen unchanged from previous partial update
+    {
+        auto pk_row = kv_table.NewRow();
+        pk_row.Set("user_id", 3);
+
+        fluss::LookupResult result;
+        check("lookup_partial_idx", lookuper.Lookup(pk_row, result));
+        if (result.Found()) {
+            std::cout << "Partial update by indices verified:"
+                      << "\n  name=" << result.GetString(1) << " (updated)"
+                      << "\n  balance=" << result.GetDecimalString(4) << " (unchanged)"
+                      << "\n  last_seen(ms)=" << result.GetTimestamp(8).epoch_millis
+                      << " (unchanged)" << std::endl;
+        } else {
+            std::cerr << "ERROR: Expected to find user_id=3" << std::endl;
+            std::exit(1);
+        }
+    }
+
+    // Cleanup
+    check("drop_kv_table", admin.DropTable(kv_table_path, true));
+
+    // 9) Partitioned KV table
+    std::cout << "\n--- Partitioned KV Table ---" << std::endl;
+    fluss::TablePath partitioned_kv_path("fluss", "partitioned_kv_cpp_v1");
+    admin.DropTable(partitioned_kv_path, true);
+
+    // PK columns intentionally interleaved with non-PK columns to verify
+    // that lookup correctly builds a dense PK-only row (not sparse full-width).
+    auto partitioned_kv_schema = fluss::Schema::NewBuilder()
+                                     .AddColumn("region", fluss::DataType::String())
+                                     .AddColumn("score", fluss::DataType::BigInt())
+                                     .AddColumn("user_id", fluss::DataType::Int())
+                                     .AddColumn("name", fluss::DataType::String())
+                                     .SetPrimaryKeys({"region", "user_id"})
+                                     .Build();
+
+    auto partitioned_kv_descriptor = fluss::TableDescriptor::NewBuilder()
+                                         .SetSchema(partitioned_kv_schema)
+                                         .SetPartitionKeys({"region"})
+                                         .SetComment("partitioned kv table example")
+                                         .Build();
+
+    check("create_partitioned_kv",
+          admin.CreateTable(partitioned_kv_path, partitioned_kv_descriptor, false));
+    std::cout << "Created partitioned KV table" << std::endl;
+
+    // Create partitions
+    check("create_US", admin.CreatePartition(partitioned_kv_path, {{"region", "US"}}));
+    check("create_EU", admin.CreatePartition(partitioned_kv_path, {{"region", "EU"}}));
+    check("create_APAC", admin.CreatePartition(partitioned_kv_path, {{"region", "APAC"}}));
+    std::cout << "Created partitions: US, EU, APAC" << std::endl;
+
+    fluss::Table partitioned_kv_table;
+    check("get_partitioned_kv_table", conn.GetTable(partitioned_kv_path, partitioned_kv_table));
+
+    fluss::UpsertWriter partitioned_writer;
+    check("new_partitioned_writer",
+          partitioned_kv_table.NewUpsert().CreateWriter(partitioned_writer));
+
+    // Upsert rows across partitions
+    // Column order: region(0), score(1), user_id(2), name(3)
+    struct TestRow {
+        const char* region;
+        int64_t score;
+        int32_t user_id;
+        const char* name;
+    };
+    TestRow test_data[] = {
+        {"US", 100, 1, "Gustave"}, {"US", 200, 2, "Lune"},   {"EU", 150, 1, "Sciel"},
+        {"EU", 250, 2, "Maelle"},  {"APAC", 300, 1, "Noco"},
+    };
+
+    for (const auto& td : test_data) {
+        auto row = partitioned_kv_table.NewRow();
+        row.Set("region", td.region);
+        row.Set("score", td.score);
+        row.Set("user_id", td.user_id);
+        row.Set("name", td.name);
+        check("partitioned_upsert", partitioned_writer.Upsert(row));
+    }
+    check("partitioned_flush", partitioned_writer.Flush());
+    std::cout << "Upserted 5 rows across 3 partitions" << std::endl;
+
+    // Lookup all rows
+    fluss::Lookuper partitioned_lookuper;
+    check("new_partitioned_lookuper",
+          partitioned_kv_table.NewLookup().CreateLookuper(partitioned_lookuper));
+
+    for (const auto& td : test_data) {
+        auto pk = partitioned_kv_table.NewRow();
+        pk.Set("region", td.region);
+        pk.Set("user_id", td.user_id);
+
+        fluss::LookupResult result;
+        check("partitioned_lookup", partitioned_lookuper.Lookup(pk, result));
+        if (!result.Found()) {
+            std::cerr << "ERROR: Expected to find region=" << td.region << " user_id=" << td.user_id
+                      << std::endl;
+            std::exit(1);
+        }
+        if (result.GetString(3) != td.name || result.GetInt64(1) != td.score) {
+            std::cerr << "ERROR: Data mismatch for region=" << td.region
+                      << " user_id=" << td.user_id << std::endl;
+            std::exit(1);
+        }
+    }
+    std::cout << "All 5 rows verified across partitions" << std::endl;
+
+    // Update within a partition
+    {
+        auto row = partitioned_kv_table.NewRow();
+        row.Set("region", "US");
+        row.Set("score", static_cast<int64_t>(999));
+        row.Set("user_id", 1);
+        row.Set("name", "Gustave Updated");
+        fluss::WriteResult wr;
+        check("partitioned_update", partitioned_writer.Upsert(row, wr));
+        check("partitioned_update_wait", wr.Wait());
+    }
+    {
+        auto pk = partitioned_kv_table.NewRow();
+        pk.Set("region", "US");
+        pk.Set("user_id", 1);
+        fluss::LookupResult result;
+        check("partitioned_lookup_updated", partitioned_lookuper.Lookup(pk, result));
+        if (!result.Found() || result.GetString(3) != "Gustave Updated" ||
+            result.GetInt64(1) != 999) {
+            std::cerr << "ERROR: Partition update verification failed" << std::endl;
+            std::exit(1);
+        }
+        std::cout << "Update verified: US/1 name=" << result.GetString(3)
+                  << " score=" << result.GetInt64(1) << std::endl;
+    }
+
+    // Lookup in non-existent partition
+    {
+        auto pk = partitioned_kv_table.NewRow();
+        pk.Set("region", "UNKNOWN");
+        pk.Set("user_id", 1);
+        fluss::LookupResult result;
+        check("partitioned_lookup_unknown", partitioned_lookuper.Lookup(pk, result));
+        if (result.Found()) {
+            std::cerr << "ERROR: Expected UNKNOWN partition lookup to return not found"
+                      << std::endl;
+            std::exit(1);
+        }
+        std::cout << "UNKNOWN partition lookup: not found (expected)" << std::endl;
+    }
+
+    // Delete within a partition
+    {
+        auto pk = partitioned_kv_table.NewRow();
+        pk.Set("region", "EU");
+        pk.Set("user_id", 1);
+        fluss::WriteResult wr;
+        check("partitioned_delete", partitioned_writer.Delete(pk, wr));
+        check("partitioned_delete_wait", wr.Wait());
+    }
+    {
+        auto pk = partitioned_kv_table.NewRow();
+        pk.Set("region", "EU");
+        pk.Set("user_id", 1);
+        fluss::LookupResult result;
+        check("partitioned_lookup_deleted", partitioned_lookuper.Lookup(pk, result));
+        if (result.Found()) {
+            std::cerr << "ERROR: Expected EU/1 to be deleted" << std::endl;
+            std::exit(1);
+        }
+        std::cout << "Delete verified: EU/1 not found" << std::endl;
+    }
+
+    // Verify other record in same partition still exists
+    {
+        auto pk = partitioned_kv_table.NewRow();
+        pk.Set("region", "EU");
+        pk.Set("user_id", 2);
+        fluss::LookupResult result;
+        check("partitioned_lookup_eu2", partitioned_lookuper.Lookup(pk, result));
+        if (!result.Found() || result.GetString(3) != "Maelle") {
+            std::cerr << "ERROR: Expected EU/2 (Maelle) to still exist" << std::endl;
+            std::exit(1);
+        }
+        std::cout << "EU/2 still exists: name=" << result.GetString(3) << std::endl;
+    }
+
+    check("drop_partitioned_kv", admin.DropTable(partitioned_kv_path, true));
+    std::cout << "\nKV table example completed successfully!" << std::endl;
+
+    return 0;
+}
diff --git a/fluss-rust/bindings/cpp/include/fluss.hpp b/fluss-rust/bindings/cpp/include/fluss.hpp
new file mode 100644
index 0000000000..d019b42787
--- /dev/null
+++ b/fluss-rust/bindings/cpp/include/fluss.hpp
@@ -0,0 +1,1633 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+#include <chrono>
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <optional>
+#include <stdexcept>
+#include <string>
+#include <string_view>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+// Forward-declare Arrow classes so this header does not have to include the
+// heavy Arrow headers.
+namespace arrow {
+class RecordBatch;
+}
+
+namespace fluss {
+
+// FFI handle types. They are only forward-declared here, so within this header
+// they are incomplete and can be referred to solely through pointers
+// (e.g. detail::ScanData::raw).
+// NOTE(review): presumably defined by the generated Rust/C++ bridge — confirm.
+namespace ffi {
+struct Connection;
+struct Admin;
+struct Table;
+struct AppendWriter;
+struct WriteResult;
+struct LogScanner;
+struct UpsertWriter;
+struct Lookuper;
+struct ScanResultInner;
+struct GenericRowInner;
+struct LookupResultInner;
+struct ArrayWriterInner;
+struct ArrayViewInner;
+}  // namespace ffi
+
+/// Named constants for Fluss API error codes.
+///
+/// Server API errors have error_code > 0 or == -1.
+/// Client-side errors have error_code == CLIENT_ERROR (-2).
+/// These constants match the Rust core FlussError enum and are stable across protocol versions.
+/// New server error codes work automatically (error_code is a raw int, not a closed enum) —
+/// these constants are convenience names, not an exhaustive list.
+struct ErrorCode {
+    /// Client-side error (not from server API protocol). Check error_message for details.
+    static constexpr int CLIENT_ERROR = -2;
+    /// No error.
+    static constexpr int NONE = 0;
+    /// The server experienced an unexpected error when processing the request.
+    static constexpr int UNKNOWN_SERVER_ERROR = -1;
+    /// The server disconnected before a response was received.
+    static constexpr int NETWORK_EXCEPTION = 1;
+    /// The version of API is not supported.
+    static constexpr int UNSUPPORTED_VERSION = 2;
+    /// This message has failed its CRC checksum, exceeds the valid size, or is otherwise corrupt.
+    static constexpr int CORRUPT_MESSAGE = 3;
+    /// The database does not exist.
+    static constexpr int DATABASE_NOT_EXIST = 4;
+    /// The database is not empty.
+    static constexpr int DATABASE_NOT_EMPTY = 5;
+    /// The database already exists.
+    static constexpr int DATABASE_ALREADY_EXIST = 6;
+    /// The table does not exist.
+    static constexpr int TABLE_NOT_EXIST = 7;
+    /// The table already exists.
+    static constexpr int TABLE_ALREADY_EXIST = 8;
+    /// The schema does not exist.
+    static constexpr int SCHEMA_NOT_EXIST = 9;
+    /// Exception occurred while storing data for log in server.
+    static constexpr int LOG_STORAGE_EXCEPTION = 10;
+    /// Exception occurred while storing data for kv in server.
+    static constexpr int KV_STORAGE_EXCEPTION = 11;
+    /// Not leader or follower.
+    static constexpr int NOT_LEADER_OR_FOLLOWER = 12;
+    /// The record is too large.
+    static constexpr int RECORD_TOO_LARGE_EXCEPTION = 13;
+    /// The record is corrupt.
+    static constexpr int CORRUPT_RECORD_EXCEPTION = 14;
+    /// The client has attempted to perform an operation on an invalid table.
+    static constexpr int INVALID_TABLE_EXCEPTION = 15;
+    /// The client has attempted to perform an operation on an invalid database.
+    static constexpr int INVALID_DATABASE_EXCEPTION = 16;
+    /// The replication factor is larger than the number of available tablet servers.
+    static constexpr int INVALID_REPLICATION_FACTOR = 17;
+    /// Produce request specified an invalid value for required acks.
+    static constexpr int INVALID_REQUIRED_ACKS = 18;
+    /// The log offset is out of range.
+    static constexpr int LOG_OFFSET_OUT_OF_RANGE_EXCEPTION = 19;
+    /// The table is not a primary key table.
+    static constexpr int NON_PRIMARY_KEY_TABLE_EXCEPTION = 20;
+    /// The table or bucket does not exist.
+    static constexpr int UNKNOWN_TABLE_OR_BUCKET_EXCEPTION = 21;
+    /// The update version is invalid.
+    static constexpr int INVALID_UPDATE_VERSION_EXCEPTION = 22;
+    /// The coordinator is invalid.
+    static constexpr int INVALID_COORDINATOR_EXCEPTION = 23;
+    /// The leader epoch is invalid.
+    static constexpr int FENCED_LEADER_EPOCH_EXCEPTION = 24;
+    /// The request timed out.
+    static constexpr int REQUEST_TIME_OUT = 25;
+    /// The general storage exception.
+    static constexpr int STORAGE_EXCEPTION = 26;
+    /// The server did not attempt to execute this operation.
+    static constexpr int OPERATION_NOT_ATTEMPTED_EXCEPTION = 27;
+    /// Records are written to the server already, but to fewer in-sync replicas than required.
+    static constexpr int NOT_ENOUGH_REPLICAS_AFTER_APPEND_EXCEPTION = 28;
+    /// Messages are rejected since there are fewer in-sync replicas than required.
+    static constexpr int NOT_ENOUGH_REPLICAS_EXCEPTION = 29;
+    /// Get file access security token exception.
+    static constexpr int SECURITY_TOKEN_EXCEPTION = 30;
+    /// The tablet server received an out of order sequence batch.
+    static constexpr int OUT_OF_ORDER_SEQUENCE_EXCEPTION = 31;
+    /// The tablet server received a duplicate sequence batch.
+    static constexpr int DUPLICATE_SEQUENCE_EXCEPTION = 32;
+    /// The tablet server could not locate the writer metadata.
+    static constexpr int UNKNOWN_WRITER_ID_EXCEPTION = 33;
+    /// The requested column projection is invalid.
+    static constexpr int INVALID_COLUMN_PROJECTION = 34;
+    /// The requested target column to write is invalid.
+    static constexpr int INVALID_TARGET_COLUMN = 35;
+    /// The partition does not exist.
+    static constexpr int PARTITION_NOT_EXISTS = 36;
+    /// The table is not partitioned.
+    static constexpr int TABLE_NOT_PARTITIONED_EXCEPTION = 37;
+    /// The timestamp is invalid.
+    static constexpr int INVALID_TIMESTAMP_EXCEPTION = 38;
+    /// The config is invalid.
+    static constexpr int INVALID_CONFIG_EXCEPTION = 39;
+    /// The lake storage is not configured.
+    static constexpr int LAKE_STORAGE_NOT_CONFIGURED_EXCEPTION = 40;
+    /// The kv snapshot does not exist.
+    static constexpr int KV_SNAPSHOT_NOT_EXIST = 41;
+    /// The partition already exists.
+    static constexpr int PARTITION_ALREADY_EXISTS = 42;
+    /// The partition spec is invalid.
+    static constexpr int PARTITION_SPEC_INVALID_EXCEPTION = 43;
+    /// There is no currently available leader for the given partition.
+    static constexpr int LEADER_NOT_AVAILABLE_EXCEPTION = 44;
+    /// Exceed the maximum number of partitions.
+    static constexpr int PARTITION_MAX_NUM_EXCEPTION = 45;
+    /// Authentication failed.
+    static constexpr int AUTHENTICATE_EXCEPTION = 46;
+    /// Security is disabled.
+    static constexpr int SECURITY_DISABLED_EXCEPTION = 47;
+    /// Authorization failed.
+    static constexpr int AUTHORIZATION_EXCEPTION = 48;
+    /// Exceed the maximum number of buckets.
+    static constexpr int BUCKET_MAX_NUM_EXCEPTION = 49;
+    /// The tiering epoch is invalid.
+    static constexpr int FENCED_TIERING_EPOCH_EXCEPTION = 50;
+    /// Authentication failed with retriable exception.
+    static constexpr int RETRIABLE_AUTHENTICATE_EXCEPTION = 51;
+    /// The server rack info is invalid.
+    static constexpr int INVALID_SERVER_RACK_INFO_EXCEPTION = 52;
+    /// The lake snapshot does not exist.
+    static constexpr int LAKE_SNAPSHOT_NOT_EXIST = 53;
+    /// The lake table already exists.
+    static constexpr int LAKE_TABLE_ALREADY_EXIST = 54;
+    /// The new ISR contains at least one ineligible replica.
+    static constexpr int INELIGIBLE_REPLICA_EXCEPTION = 55;
+    /// The alter table is invalid.
+    static constexpr int INVALID_ALTER_TABLE_EXCEPTION = 56;
+    /// Deletion operations are disabled on this table.
+    static constexpr int DELETION_DISABLED_EXCEPTION = 57;
+
+    /// Returns true if retrying the request may succeed. Mirrors Java's RetriableException hierarchy.
+    static constexpr bool IsRetriable(int32_t code) {
+        return code == NETWORK_EXCEPTION || code == CORRUPT_MESSAGE ||
+               code == SCHEMA_NOT_EXIST || code == LOG_STORAGE_EXCEPTION ||
+               code == KV_STORAGE_EXCEPTION || code == NOT_LEADER_OR_FOLLOWER ||
+               code == CORRUPT_RECORD_EXCEPTION ||
+               code == UNKNOWN_TABLE_OR_BUCKET_EXCEPTION || code == REQUEST_TIME_OUT ||
+               code == STORAGE_EXCEPTION ||
+               code == NOT_ENOUGH_REPLICAS_AFTER_APPEND_EXCEPTION ||
+               code == NOT_ENOUGH_REPLICAS_EXCEPTION || code == LEADER_NOT_AVAILABLE_EXCEPTION;
+    }
+};
+
+/// Calendar date stored as a signed day count relative to an epoch.
+/// NOTE(review): the epoch is presumably 1970-01-01 — confirm in the out-of-line definitions.
+struct Date {
+    int32_t days_since_epoch{0};
+
+    /// Wraps a raw day count; no validation is performed.
+    static Date FromDays(int32_t days) {
+        Date date;
+        date.days_since_epoch = days;
+        return date;
+    }
+    /// Builds a Date from calendar fields. Declared here, defined out of line.
+    static Date FromYMD(int year, int month, int day);
+
+    // Calendar field accessors; declared here, defined out of line.
+    int Year() const;
+    int Month() const;
+    int Day() const;
+};
+
+/// Time of day stored as milliseconds elapsed since midnight.
+struct Time {
+    static constexpr int32_t kMillisPerSecond = 1000;
+    static constexpr int32_t kMillisPerMinute = 60 * kMillisPerSecond;
+    static constexpr int32_t kMillisPerHour = 60 * kMillisPerMinute;
+
+    int32_t millis_since_midnight{0};
+
+    /// Wraps a raw millisecond count; no range check is performed.
+    static Time FromMillis(int32_t ms) {
+        Time time;
+        time.millis_since_midnight = ms;
+        return time;
+    }
+
+    /// Combines hour/minute/second/millisecond fields into a single millisecond count.
+    /// The fields are not range-checked.
+    static Time FromHMS(int hour, int minute, int second, int millis = 0) {
+        const int from_hours = hour * kMillisPerHour;
+        const int from_minutes = minute * kMillisPerMinute;
+        const int from_seconds = second * kMillisPerSecond;
+        return FromMillis(from_hours + from_minutes + from_seconds + millis);
+    }
+
+    /// Whole hours contained in the stored value.
+    int Hour() const { return millis_since_midnight / kMillisPerHour; }
+
+    /// Whole minutes left over after removing the whole hours.
+    int Minute() const {
+        const int32_t within_hour = millis_since_midnight % kMillisPerHour;
+        return within_hour / kMillisPerMinute;
+    }
+
+    /// Whole seconds left over after removing the whole minutes.
+    int Second() const {
+        const int32_t within_minute = millis_since_midnight % kMillisPerMinute;
+        return within_minute / kMillisPerSecond;
+    }
+
+    /// Milliseconds left over after removing the whole seconds.
+    int Millis() const { return millis_since_midnight % kMillisPerSecond; }
+};
+
+/// Point in time split into whole milliseconds since the epoch plus the
+/// nanoseconds that fall inside that millisecond.
+struct Timestamp {
+    static constexpr int32_t kMaxNanoOfMillisecond = 999999;
+    static constexpr int64_t kNanosPerMilli = 1000000;
+
+    int64_t epoch_millis{0};
+    int32_t nano_of_millisecond{0};
+
+    /// Millisecond-precision timestamp; the sub-millisecond part is zero.
+    static Timestamp FromMillis(int64_t ms) { return Timestamp{ms, 0}; }
+
+    /// Builds a timestamp from both parts. `nanos` is silently clamped into
+    /// [0, kMaxNanoOfMillisecond] rather than rejected.
+    static Timestamp FromMillisNanos(int64_t ms, int32_t nanos) {
+        int32_t bounded = nanos;
+        if (bounded > kMaxNanoOfMillisecond) {
+            bounded = kMaxNanoOfMillisecond;
+        } else if (bounded < 0) {
+            bounded = 0;
+        }
+        return Timestamp{ms, bounded};
+    }
+
+    /// Converts a system_clock time point, keeping nanosecond resolution.
+    static Timestamp FromTimePoint(std::chrono::system_clock::time_point tp) {
+        const auto total_ns =
+            std::chrono::duration_cast<std::chrono::nanoseconds>(tp.time_since_epoch()).count();
+        auto whole_ms = total_ns / kNanosPerMilli;
+        auto sub_ms = static_cast<int32_t>(total_ns % kNanosPerMilli);
+        // Integer division truncates toward zero, so a pre-epoch instant yields a
+        // negative remainder; borrow one millisecond to keep the nano part non-negative.
+        if (sub_ms < 0) {
+            whole_ms -= 1;
+            sub_ms += kNanosPerMilli;
+        }
+        return {whole_ms, sub_ms};
+    }
+};
+
+/// Kind of change a record represents.
+/// NOTE(review): the explicit numeric values presumably have to stay in sync
+/// with the Rust side of the bindings — confirm before renumbering.
+enum class ChangeType {
+    AppendOnly = 0,
+    Insert = 1,
+    UpdateBefore = 2,
+    UpdateAfter = 3,
+    Delete = 4,
+};
+
+/// Discriminator for the column types this header can describe; stored in
+/// DataType and in detail::ColumnInfo.
+/// NOTE(review): the explicit numeric values presumably have to stay in sync
+/// with the Rust side of the bindings — confirm before renumbering.
+enum class TypeId {
+    Unknown = 0,
+    Boolean = 1,
+    TinyInt = 2,
+    SmallInt = 3,
+    Int = 4,
+    BigInt = 5,
+    Float = 6,
+    Double = 7,
+    String = 8,
+    Bytes = 9,
+    Date = 10,
+    Time = 11,
+    Timestamp = 12,
+    TimestampLtz = 13,
+    Decimal = 14,
+    Char = 15,
+    Binary = 16,
+    Array = 17,
+};
+
+/// Describes a column type: a TypeId, optional precision/scale, a nullability
+/// flag and, for ARRAY, the element type.
+class DataType {
+   public:
+    /// @param id        type discriminator
+    /// @param p         precision (the Char/Binary factories store the length here)
+    /// @param s         scale
+    /// @param nullable  nullability flag, true unless stated otherwise
+    explicit DataType(TypeId id, int32_t p = 0, int32_t s = 0, bool nullable = true)
+        : id_{id}, precision_{p}, scale_{s}, nullable_{nullable} {}
+
+    // Factories for types that take no parameters.
+    static DataType Boolean() { return DataType{TypeId::Boolean}; }
+    static DataType TinyInt() { return DataType{TypeId::TinyInt}; }
+    static DataType SmallInt() { return DataType{TypeId::SmallInt}; }
+    static DataType Int() { return DataType{TypeId::Int}; }
+    static DataType BigInt() { return DataType{TypeId::BigInt}; }
+    static DataType Float() { return DataType{TypeId::Float}; }
+    static DataType Double() { return DataType{TypeId::Double}; }
+    static DataType String() { return DataType{TypeId::String}; }
+    static DataType Bytes() { return DataType{TypeId::Bytes}; }
+    static DataType Date() { return DataType{TypeId::Date}; }
+    static DataType Time() { return DataType{TypeId::Time}; }
+
+    // Factories for parameterized types; timestamp precision defaults to 6.
+    static DataType Timestamp(int32_t precision = 6) {
+        return DataType{TypeId::Timestamp, precision, 0};
+    }
+    static DataType TimestampLtz(int32_t precision = 6) {
+        return DataType{TypeId::TimestampLtz, precision, 0};
+    }
+    static DataType Decimal(int32_t precision, int32_t scale) {
+        return DataType{TypeId::Decimal, precision, scale};
+    }
+    static DataType Char(int32_t length) { return DataType{TypeId::Char, length, 0}; }
+    static DataType Binary(int32_t length) { return DataType{TypeId::Binary, length, 0}; }
+
+    /// Constructs an `ARRAY<element>` type. The element DataType (which may
+    /// itself be an array) is moved into a shared owning handle, so copying
+    /// the outer DataType only copies the handle and the element stays alive
+    /// for as long as any copy refers to it.
+    static DataType Array(DataType element) {
+        DataType array_type{TypeId::Array};
+        array_type.element_type_ = std::make_shared<DataType>(std::move(element));
+        return array_type;
+    }
+
+    TypeId id() const { return id_; }
+    int32_t precision() const { return precision_; }
+    int32_t scale() const { return scale_; }
+    bool nullable() const { return nullable_; }
+
+    /// Element type of an ARRAY, or `nullptr` when no element type is set
+    /// (every non-array factory leaves it unset). The pointer stays valid
+    /// while this DataType, or a copy sharing the same element, is alive.
+    const DataType* element_type() const { return element_type_.get(); }
+
+    /// Returns a copy of this DataType with nullable set to false; the element
+    /// handle is shared with the original.
+    DataType NotNull() const {
+        DataType non_null{id_, precision_, scale_, false};
+        non_null.element_type_ = element_type_;
+        return non_null;
+    }
+
+   private:
+    TypeId id_;
+    int32_t precision_{0};
+    int32_t scale_{0};
+    bool nullable_{true};
+    std::shared_ptr<DataType> element_type_;
+};
+
+// Sentinel offset value (-2).
+// NOTE(review): presumably requests reading from the earliest available offset — confirm at the call sites.
+constexpr int64_t EARLIEST_OFFSET = -2;
+
+/// Selects which kind of position an OffsetSpec describes.
+enum class OffsetType {
+    Earliest = 0,
+    Latest = 1,
+    Timestamp = 2,
+};
+
+/// Offset selector: an OffsetType plus a timestamp payload.
+/// Build instances with the factories below; only Timestamp() stores a
+/// non-zero payload.
+struct OffsetSpec {
+    // Explicit default so that `OffsetSpec s;` is well defined and agrees with
+    // `OffsetSpec{}` (which zero-initializes to Earliest) instead of leaving
+    // `type` indeterminate.
+    OffsetType type{OffsetType::Earliest};
+    // Only meaningful when type == OffsetType::Timestamp.
+    // NOTE(review): unit is not visible here (presumably epoch milliseconds) — confirm.
+    int64_t timestamp{0};
+
+    /// Selector of kind Earliest; timestamp is 0.
+    static OffsetSpec Earliest() { return {OffsetType::Earliest, 0}; }
+    /// Selector of kind Latest; timestamp is 0.
+    static OffsetSpec Latest() { return {OffsetType::Latest, 0}; }
+    /// Selector of kind Timestamp carrying `ts` as its payload.
+    static OffsetSpec Timestamp(int64_t ts) { return {OffsetType::Timestamp, ts}; }
+};
+
+struct Result {
+    int32_t error_code{0};
+    std::string error_message;
+
+    bool Ok() const { return error_code == 0; }
+
+    /// Returns true if retrying the request may succeed. Client-side errors always return false.
+    bool IsRetriable() const { return ErrorCode::IsRetriable(error_code); }
+};
+
+/// Identifies a table by database name and table name.
+struct TablePath {
+    std::string database_name;
+    std::string table_name;
+
+    /// Creates a path with both names empty.
+    TablePath() = default;
+
+    /// Takes ownership of both names.
+    TablePath(std::string db, std::string tbl)
+        : database_name{std::move(db)}, table_name{std::move(tbl)} {}
+
+    /// Renders the path as "<database_name>.<table_name>".
+    std::string ToString() const {
+        std::string qualified;
+        qualified.reserve(database_name.size() + 1 + table_name.size());
+        qualified += database_name;
+        qualified += ".";
+        qualified += table_name;
+        return qualified;
+    }
+};
+
+/// One column of a Schema: its name, data type and an optional comment.
+/// Plain aggregate; Schema::Builder::AddColumn brace-initializes it in member
+/// order, so the order of the fields below must not change.
+struct Column {
+    std::string name;
+    DataType data_type;
+    std::string comment;
+};
+
+/// Table schema: the ordered column list plus the names of the primary-key columns.
+struct Schema {
+    std::vector<Column> columns;
+    std::vector<std::string> primary_keys;
+
+    /// Fluent builder that accumulates columns and primary keys for a Schema.
+    class Builder {
+       public:
+        /// Appends a column; columns keep the order in which they are added.
+        Builder& AddColumn(std::string name, DataType type, std::string comment = "") {
+            Column column{std::move(name), std::move(type), std::move(comment)};
+            columns_.push_back(std::move(column));
+            return *this;
+        }
+
+        /// Replaces the primary-key list with `keys`.
+        Builder& SetPrimaryKeys(std::vector<std::string> keys) {
+            primary_keys_ = std::move(keys);
+            return *this;
+        }
+
+        /// Produces the Schema by moving the accumulated state out of the builder.
+        Schema Build() {
+            Schema built;
+            built.columns = std::move(columns_);
+            built.primary_keys = std::move(primary_keys_);
+            return built;
+        }
+
+       private:
+        std::vector<Column> columns_;
+        std::vector<std::string> primary_keys_;
+    };
+
+    /// Entry point of the fluent API: returns an empty Builder.
+    static Builder NewBuilder() {
+        Builder fresh;
+        return fresh;
+    }
+};
+
+/// Everything needed to describe a table: schema, partition and bucket
+/// layout, two property maps and a comment.
+struct TableDescriptor {
+    Schema schema;
+    std::vector<std::string> partition_keys;
+    int32_t bucket_count{0};
+    std::vector<std::string> bucket_keys;
+    std::unordered_map<std::string, std::string> properties;
+    std::unordered_map<std::string, std::string> custom_properties;
+    std::string comment;
+
+    /// Fluent builder for TableDescriptor. Every setter returns the builder so
+    /// calls can be chained.
+    class Builder {
+       public:
+        /// Replaces the schema.
+        Builder& SetSchema(Schema s) {
+            schema_ = std::move(s);
+            return *this;
+        }
+
+        /// Replaces the partition-key list.
+        Builder& SetPartitionKeys(std::vector<std::string> keys) {
+            partition_keys_ = std::move(keys);
+            return *this;
+        }
+
+        /// Sets the bucket count (0 unless set).
+        Builder& SetBucketCount(int32_t count) {
+            bucket_count_ = count;
+            return *this;
+        }
+
+        /// Replaces the bucket-key list.
+        Builder& SetBucketKeys(std::vector<std::string> keys) {
+            bucket_keys_ = std::move(keys);
+            return *this;
+        }
+
+        /// Adds a property, overwriting any existing value stored under `key`.
+        Builder& SetProperty(std::string key, std::string value) {
+            properties_.insert_or_assign(std::move(key), std::move(value));
+            return *this;
+        }
+
+        /// Adds a custom property, overwriting any existing value stored under `key`.
+        Builder& SetCustomProperty(std::string key, std::string value) {
+            custom_properties_.insert_or_assign(std::move(key), std::move(value));
+            return *this;
+        }
+
+        /// Shorthand for SetProperty("table.log.format", format).
+        Builder& SetLogFormat(std::string format) {
+            SetProperty("table.log.format", std::move(format));
+            return *this;
+        }
+
+        /// Shorthand for SetProperty("table.kv.format", format).
+        Builder& SetKvFormat(std::string format) {
+            SetProperty("table.kv.format", std::move(format));
+            return *this;
+        }
+
+        /// Replaces the comment.
+        Builder& SetComment(std::string comment) {
+            comment_ = std::move(comment);
+            return *this;
+        }
+
+        /// Produces the descriptor by moving the accumulated state out of the builder.
+        TableDescriptor Build() {
+            TableDescriptor built;
+            built.schema = std::move(schema_);
+            built.partition_keys = std::move(partition_keys_);
+            built.bucket_count = bucket_count_;
+            built.bucket_keys = std::move(bucket_keys_);
+            built.properties = std::move(properties_);
+            built.custom_properties = std::move(custom_properties_);
+            built.comment = std::move(comment_);
+            return built;
+        }
+
+       private:
+        Schema schema_;
+        std::vector<std::string> partition_keys_;
+        int32_t bucket_count_{0};
+        std::vector<std::string> bucket_keys_;
+        std::unordered_map<std::string, std::string> properties_;
+        std::unordered_map<std::string, std::string> custom_properties_;
+        std::string comment_;
+    };
+
+    /// Entry point of the fluent API: returns an empty Builder.
+    static Builder NewBuilder() {
+        Builder fresh;
+        return fresh;
+    }
+};
+
+/// Metadata describing a table: identifiers, path, key layout, bucket count,
+/// properties, comment and schema.
+///
+/// Every scalar member has a default initializer so that a default-constructed
+/// TableInfo never exposes indeterminate values (matching Result and
+/// TableDescriptor in this header). The struct remains an aggregate.
+struct TableInfo {
+    int64_t table_id{0};
+    int32_t schema_id{0};
+    TablePath table_path;
+    // NOTE(review): unit of the two times is not visible here (presumably epoch milliseconds) — confirm.
+    int64_t created_time{0};
+    int64_t modified_time{0};
+    std::vector<std::string> primary_keys;
+    std::vector<std::string> bucket_keys;
+    std::vector<std::string> partition_keys;
+    int32_t num_buckets{0};
+    bool has_primary_key{false};
+    bool is_partitioned{false};
+    std::unordered_map<std::string, std::string> properties;
+    std::unordered_map<std::string, std::string> custom_properties;
+    std::string comment;
+    Schema schema;
+};
+
+namespace detail {
+/// Per-column lookup entry: the column's index and its type id.
+struct ColumnInfo {
+    size_t index;
+    TypeId type_id;
+};
+/// Maps a column name to its ColumnInfo; consulted by ResolveColumn.
+using ColumnMap = std::unordered_map<std::string, ColumnInfo>;
+
+/// Looks up `name` in `map` and returns the index stored for it.
+/// @throws std::runtime_error if the map has no entry for `name`.
+inline size_t ResolveColumn(const ColumnMap& map, const std::string& name) {
+    const auto entry = map.find(name);
+    if (entry != map.end()) {
+        return entry->second.index;
+    }
+    throw std::runtime_error("Unknown column '" + name + "'");
+}
+
+// Forward declaration so NamedGetters can declare GetArrayView(...) even
+// though the concrete class is defined further down.
+}  // namespace detail
+class ArrayView;
+namespace detail {
+
+/// CRTP mixin that layers column-name overloads on top of a class's index-based getters.
+/// Derived must supply `size_t Resolve(const std::string&) const` together with
+/// every index-based getter forwarded to below (IsNull(idx), GetBool(idx), ...).
+/// Each overload resolves the name to an index and then calls the same-named
+/// index-based getter on Derived.
+template <typename Derived>
+struct NamedGetters {
+    // Scalar getters.
+    bool IsNull(const std::string& n) const { return Self().IsNull(IndexOf(n)); }
+    bool GetBool(const std::string& n) const { return Self().GetBool(IndexOf(n)); }
+    int32_t GetInt32(const std::string& n) const { return Self().GetInt32(IndexOf(n)); }
+    int64_t GetInt64(const std::string& n) const { return Self().GetInt64(IndexOf(n)); }
+    float GetFloat32(const std::string& n) const { return Self().GetFloat32(IndexOf(n)); }
+    double GetFloat64(const std::string& n) const { return Self().GetFloat64(IndexOf(n)); }
+    std::string_view GetString(const std::string& n) const {
+        return Self().GetString(IndexOf(n));
+    }
+    std::pair<const uint8_t*, size_t> GetBytes(const std::string& n) const {
+        return Self().GetBytes(IndexOf(n));
+    }
+    fluss::Date GetDate(const std::string& n) const { return Self().GetDate(IndexOf(n)); }
+    fluss::Time GetTime(const std::string& n) const { return Self().GetTime(IndexOf(n)); }
+    fluss::Timestamp GetTimestamp(const std::string& n) const {
+        return Self().GetTimestamp(IndexOf(n));
+    }
+    std::string GetDecimalString(const std::string& n) const {
+        return Self().GetDecimalString(IndexOf(n));
+    }
+
+    // Array getters: `n` names the array column, `element` is the position inside it.
+    size_t GetArraySize(const std::string& n) const { return Self().GetArraySize(IndexOf(n)); }
+    TypeId GetArrayElementType(const std::string& n) const {
+        return Self().GetArrayElementType(IndexOf(n));
+    }
+    bool IsArrayElementNull(const std::string& n, size_t element) const {
+        return Self().IsArrayElementNull(IndexOf(n), element);
+    }
+    bool GetArrayBool(const std::string& n, size_t element) const {
+        return Self().GetArrayBool(IndexOf(n), element);
+    }
+    int32_t GetArrayInt32(const std::string& n, size_t element) const {
+        return Self().GetArrayInt32(IndexOf(n), element);
+    }
+    int64_t GetArrayInt64(const std::string& n, size_t element) const {
+        return Self().GetArrayInt64(IndexOf(n), element);
+    }
+    float GetArrayFloat32(const std::string& n, size_t element) const {
+        return Self().GetArrayFloat32(IndexOf(n), element);
+    }
+    double GetArrayFloat64(const std::string& n, size_t element) const {
+        return Self().GetArrayFloat64(IndexOf(n), element);
+    }
+    std::string GetArrayString(const std::string& n, size_t element) const {
+        return Self().GetArrayString(IndexOf(n), element);
+    }
+    std::vector<uint8_t> GetArrayBytes(const std::string& n, size_t element) const {
+        return Self().GetArrayBytes(IndexOf(n), element);
+    }
+    fluss::Date GetArrayDate(const std::string& n, size_t element) const {
+        return Self().GetArrayDate(IndexOf(n), element);
+    }
+    fluss::Time GetArrayTime(const std::string& n, size_t element) const {
+        return Self().GetArrayTime(IndexOf(n), element);
+    }
+    fluss::Timestamp GetArrayTimestamp(const std::string& n, size_t element) const {
+        return Self().GetArrayTimestamp(IndexOf(n), element);
+    }
+    std::string GetArrayDecimalString(const std::string& n, size_t element) const {
+        return Self().GetArrayDecimalString(IndexOf(n), element);
+    }
+    // Only declared here: ArrayView is returned by value and is still an
+    // incomplete type at this point, so the body has to follow its definition.
+    ArrayView GetArrayView(const std::string& n) const;
+
+   private:
+    // Downcast to the CRTP-derived object.
+    const Derived& Self() const { return static_cast<const Derived&>(*this); }
+    // Translates a column name into an index via Derived::Resolve.
+    size_t IndexOf(const std::string& n) const { return Self().Resolve(n); }
+};
+
+// Bundles the raw FFI scan-result handle with the column map used for
+// name-based lookups. Non-copyable; RowView, BucketRecords and ScanRecords
+// hold it through std::shared_ptr<const ScanData>.
+// NOTE(review): the destructor is defined out of line — presumably it
+// releases `raw`; confirm in the implementation file.
+struct ScanData {
+    // Opaque handle to the scan result on the FFI side.
+    ffi::ScanResultInner* raw;
+    // Column metadata handed to detail::ResolveColumn for name lookups.
+    ColumnMap columns;
+
+    // Stores the handle as given and moves the column map into place.
+    ScanData(ffi::ScanResultInner* r, ColumnMap cols) : raw(r), columns(std::move(cols)) {}
+    ~ScanData();
+
+    // Copying is disabled.
+    ScanData(const ScanData&) = delete;
+    ScanData& operator=(const ScanData&) = delete;
+};
+}  // namespace detail
+
+/**
+ * @brief Read-only view over a FlussArray column value.
+ *
+ * Obtained from RowView::GetArrayView() / LookupResult::GetArrayView(), and
+ * recursively from ArrayView::GetArray() for nested `ARRAY<ARRAY<...>>`
+ * columns. Owns an opaque Rust handle (FlussArray + element DataType) that
+ * is released on destruction. Move-only.
+ */
+class ArrayView {
+   public:
+    ~ArrayView() noexcept;
+
+    // Move-only: copy operations are deleted, move operations are declared.
+    ArrayView(const ArrayView&) = delete;
+    ArrayView& operator=(const ArrayView&) = delete;
+    ArrayView(ArrayView&& other) noexcept;
+    ArrayView& operator=(ArrayView&& other) noexcept;
+
+    // Array-level metadata and a per-element null check.
+    size_t Size() const noexcept;
+    TypeId ElementType() const noexcept;
+    bool IsNull(size_t element) const;
+
+    // Typed element accessors; `element` selects the entry within the array.
+    // String and bytes values are returned as owning copies
+    // (std::string / std::vector<uint8_t>).
+    // NOTE(review): behaviour for an out-of-range `element` or a type mismatch
+    // is defined in the implementation file and is not visible here.
+    bool GetBool(size_t element) const;
+    int32_t GetInt32(size_t element) const;
+    int64_t GetInt64(size_t element) const;
+    float GetFloat32(size_t element) const;
+    double GetFloat64(size_t element) const;
+    std::string GetString(size_t element) const;
+    std::vector<uint8_t> GetBytes(size_t element) const;
+    fluss::Date GetDate(size_t element) const;
+    fluss::Time GetTime(size_t element) const;
+    fluss::Timestamp GetTimestampNtz(size_t element) const;
+    fluss::Timestamp GetTimestampLtz(size_t element) const;
+    std::string GetDecimalString(size_t element) const;
+    // Returns the nested array stored at `element` as its own ArrayView.
+    ArrayView GetArray(size_t element) const;
+
+   private:
+    // Wrapping a raw handle is restricted to ArrayView itself and these friends.
+    friend class RowView;
+    friend class LookupResult;
+    explicit ArrayView(ffi::ArrayViewInner* inner) : inner_(inner) {}
+    void Destroy() noexcept;
+    // Opaque FFI handle; null unless set by the private constructor.
+    ffi::ArrayViewInner* inner_{nullptr};
+};
+
+namespace detail {
+// Name-based GetArrayView: resolve the column name on the derived type, then
+// forward to the derived type's index-based overload.
+template <typename Derived>
+inline ArrayView NamedGetters<Derived>::GetArrayView(const std::string& n) const {
+    const Derived& self = Self();
+    const auto column_index = self.Resolve(n);
+    return self.GetArrayView(column_index);
+}
+}  // namespace detail
+
+// Builder for an array value with a fixed number of slots and a declared
+// element type. Move-only. A finished writer is handed over by rvalue
+// reference to GenericRow::SetArray or, for nested arrays, to
+// ArrayWriter::SetArray.
+// NOTE(review): validation of `idx` and of the value type against
+// `element_type` happens in the implementation file and is not visible here.
+class ArrayWriter {
+   public:
+    // `size` is the number of elements; `element_type` describes each element.
+    ArrayWriter(size_t size, DataType element_type);
+    ~ArrayWriter() noexcept;
+
+    // Move-only: copy operations are deleted.
+    ArrayWriter(const ArrayWriter&) = delete;
+    ArrayWriter& operator=(const ArrayWriter&) = delete;
+    ArrayWriter(ArrayWriter&& other) noexcept;
+    ArrayWriter& operator=(ArrayWriter&& other) noexcept;
+
+    // NOTE(review): presumably reports whether the writer still holds a live
+    // handle (e.g. false after being moved from) — confirm.
+    bool Available() const;
+    size_t Size() const noexcept;
+
+    // Per-slot setters; `idx` selects the element to write.
+    void SetNull(size_t idx);
+    void SetBool(size_t idx, bool v);
+    void SetInt32(size_t idx, int32_t v);
+    void SetInt64(size_t idx, int64_t v);
+    void SetFloat32(size_t idx, float v);
+    void SetFloat64(size_t idx, double v);
+    void SetString(size_t idx, const std::string& v);
+    void SetBytes(size_t idx, const std::vector<uint8_t>& v);
+    void SetDate(size_t idx, fluss::Date d);
+    void SetTime(size_t idx, fluss::Time t);
+    void SetTimestampNtz(size_t idx, fluss::Timestamp ts);
+    void SetTimestampLtz(size_t idx, fluss::Timestamp ts);
+    // Decimal values are supplied in string form.
+    void SetDecimal(size_t idx, const std::string& value);
+    // Stores another writer as the nested array at `idx`.
+    void SetArray(size_t idx, ArrayWriter&& nested);
+
+   private:
+    // GenericRow needs access to the private handle.
+    friend class GenericRow;
+    void Destroy() noexcept;
+    // Opaque FFI handle; null by default.
+    ffi::ArrayWriterInner* inner_{nullptr};
+    DataType element_type_;
+};
+
+// Mutable, move-only row whose fields are set one at a time.
+//
+// Fields can be addressed by index, or by column name when the row carries a
+// column map; without one, every name-based Set() throws std::runtime_error
+// (the message points at Table::NewRow()). Table, AppendWriter, UpsertWriter
+// and Lookuper are friends and can reach the private handle.
+class GenericRow {
+   public:
+    GenericRow();
+    explicit GenericRow(size_t field_count);
+    ~GenericRow() noexcept;
+
+    // Move-only: copy operations are deleted.
+    GenericRow(const GenericRow&) = delete;
+    GenericRow& operator=(const GenericRow&) = delete;
+    GenericRow(GenericRow&& other) noexcept;
+    GenericRow& operator=(GenericRow&& other) noexcept;
+
+    bool Available() const;
+    void Reset();
+
+    // ── Index-based setters ──────────────────────────────────────────
+    void SetNull(size_t idx);
+    void SetBool(size_t idx, bool v);
+    void SetInt32(size_t idx, int32_t v);
+    void SetInt64(size_t idx, int64_t v);
+    void SetFloat32(size_t idx, float v);
+    void SetFloat64(size_t idx, double v);
+    void SetString(size_t idx, std::string v);
+    void SetBytes(size_t idx, std::vector<uint8_t> v);
+    void SetDate(size_t idx, fluss::Date d);
+    void SetTime(size_t idx, fluss::Time t);
+    void SetTimestampNtz(size_t idx, fluss::Timestamp ts);
+    void SetTimestampLtz(size_t idx, fluss::Timestamp ts);
+    void SetDecimal(size_t idx, const std::string& value);
+    void SetArray(size_t idx, ArrayWriter&& writer);
+
+    // ── Name-based setters (require schema — see Table::NewRow()) ───
+    // Each overload resolves `name` through the column map and forwards to the
+    // matching index-based setter. Throws std::runtime_error when the row has
+    // no column map or the column name is unknown.
+    void Set(const std::string& name, std::nullptr_t) { SetNull(Resolve(name)); }
+    void Set(const std::string& name, bool v) { SetBool(Resolve(name), v); }
+    void Set(const std::string& name, int32_t v) { SetInt32(Resolve(name), v); }
+    void Set(const std::string& name, int64_t v) { SetInt64(Resolve(name), v); }
+    void Set(const std::string& name, float v) { SetFloat32(Resolve(name), v); }
+    void Set(const std::string& name, double v) { SetFloat64(Resolve(name), v); }
+    // const char* overload to prevent "string literal" -> bool conversion.
+    // A null pointer is rejected explicitly: building a std::string from it
+    // would be undefined behaviour. Non-null input is handled by the
+    // std::string overload, so both share one String/Decimal dispatch.
+    void Set(const std::string& name, const char* v) {
+        if (v == nullptr) {
+            throw std::runtime_error("GenericRow::Set: null string value for column '" + name +
+                                     "'");
+        }
+        Set(name, std::string(v));
+    }
+    // Text values go to SetDecimal for Decimal columns and to SetString for
+    // String columns; any other column type is an error.
+    void Set(const std::string& name, std::string v) {
+        auto [idx, type] = ResolveColumn(name);
+        if (type == TypeId::Decimal) {
+            SetDecimal(idx, v);
+        } else if (type == TypeId::String) {
+            SetString(idx, std::move(v));
+        } else {
+            throw std::runtime_error("GenericRow::Set: column '" + name +
+                                     "' is not a string or decimal column");
+        }
+    }
+    void Set(const std::string& name, std::vector<uint8_t> v) {
+        SetBytes(Resolve(name), std::move(v));
+    }
+    void Set(const std::string& name, fluss::Date d) { SetDate(Resolve(name), d); }
+    void Set(const std::string& name, fluss::Time t) { SetTime(Resolve(name), t); }
+    // The column type picks the timestamp flavour: TimestampLtz columns use
+    // SetTimestampLtz, Timestamp columns use SetTimestampNtz.
+    void Set(const std::string& name, fluss::Timestamp ts) {
+        auto [idx, type] = ResolveColumn(name);
+        if (type == TypeId::TimestampLtz) {
+            SetTimestampLtz(idx, ts);
+        } else if (type == TypeId::Timestamp) {
+            SetTimestampNtz(idx, ts);
+        } else {
+            throw std::runtime_error("GenericRow::Set: column '" + name +
+                                     "' is not a timestamp column");
+        }
+    }
+    void Set(const std::string& name, ArrayWriter&& writer) { SetArray(Resolve(name), std::move(writer)); }
+
+   private:
+    friend class Table;
+    friend class AppendWriter;
+    friend class UpsertWriter;
+    friend class Lookuper;
+
+    using ColumnInfo = detail::ColumnInfo;
+    using ColumnMap = detail::ColumnMap;
+
+    // Column name -> column index (see ResolveColumn for the failure modes).
+    size_t Resolve(const std::string& name) const { return ResolveColumn(name).index; }
+
+    // Looks `name` up in the column map.
+    // Throws std::runtime_error when no column map is attached or when the
+    // name is not present in it.
+    const ColumnInfo& ResolveColumn(const std::string& name) const {
+        if (!column_map_) {
+            throw std::runtime_error(
+                "GenericRow: name-based Set() requires a schema. "
+                "Use Table::NewRow() to create a schema-aware row.");
+        }
+        auto it = column_map_->find(name);
+        if (it == column_map_->end()) {
+            throw std::runtime_error("GenericRow: unknown column '" + name + "'");
+        }
+        return it->second;
+    }
+
+    void Destroy() noexcept;
+
+    // Opaque FFI handle; null by default.
+    ffi::GenericRowInner* inner_{nullptr};
+    // Column metadata for name-based setters; empty when the row has no schema.
+    std::shared_ptr<ColumnMap> column_map_;
+};
+
+/// Read-only row view for scan results. Zero-copy access to string and bytes data.
+///
+/// RowView shares ownership of the underlying scan data via reference counting,
+/// so it can safely outlive the ScanRecords that produced it.
+class RowView : public detail::NamedGetters<RowView> {
+    // The CRTP base calls the private Resolve() below.
+    friend struct detail::NamedGetters<RowView>;
+
+   public:
+    // Addresses one record: shared scan data plus a bucket index and a record
+    // index.
+    RowView(std::shared_ptr<const detail::ScanData> data, size_t bucket_idx, size_t rec_idx)
+        : data_(std::move(data)), bucket_idx_(bucket_idx), rec_idx_(rec_idx) {}
+
+    // ── Index-based getters ──────────────────────────────────────────
+    size_t FieldCount() const;
+    TypeId GetType(size_t idx) const;
+    bool IsNull(size_t idx) const;
+    bool GetBool(size_t idx) const;
+    int32_t GetInt32(size_t idx) const;
+    int64_t GetInt64(size_t idx) const;
+    float GetFloat32(size_t idx) const;
+    double GetFloat64(size_t idx) const;
+    std::string_view GetString(size_t idx) const;
+    std::pair<const uint8_t*, size_t> GetBytes(size_t idx) const;
+    fluss::Date GetDate(size_t idx) const;
+    fluss::Time GetTime(size_t idx) const;
+    fluss::Timestamp GetTimestamp(size_t idx) const;
+    bool IsDecimal(size_t idx) const;
+    std::string GetDecimalString(size_t idx) const;
+
+    // ── Array getters ────────────────────────────────────────────────
+    // `idx` selects the array column, `element` the entry inside that array.
+    size_t GetArraySize(size_t idx) const;
+    TypeId GetArrayElementType(size_t idx) const;
+    bool IsArrayElementNull(size_t idx, size_t element) const;
+    bool GetArrayBool(size_t idx, size_t element) const;
+    int32_t GetArrayInt32(size_t idx, size_t element) const;
+    int64_t GetArrayInt64(size_t idx, size_t element) const;
+    float GetArrayFloat32(size_t idx, size_t element) const;
+    double GetArrayFloat64(size_t idx, size_t element) const;
+    std::string GetArrayString(size_t idx, size_t element) const;
+    std::vector<uint8_t> GetArrayBytes(size_t idx, size_t element) const;
+    fluss::Date GetArrayDate(size_t idx, size_t element) const;
+    fluss::Time GetArrayTime(size_t idx, size_t element) const;
+    fluss::Timestamp GetArrayTimestamp(size_t idx, size_t element) const;
+    std::string GetArrayDecimalString(size_t idx, size_t element) const;
+    /// Owning ArrayView over the array column at `idx`. Nested arrays are
+    /// reachable through ArrayView::GetArray(), mirroring the recursive list
+    /// that Python's `row.get_array(i)` returns.
+    ArrayView GetArrayView(size_t idx) const;
+
+    // Re-export the name-based overloads from detail::NamedGetters<RowView>;
+    // the index-based declarations above would otherwise hide them.
+    using detail::NamedGetters<RowView>::IsNull;
+    using detail::NamedGetters<RowView>::GetBool;
+    using detail::NamedGetters<RowView>::GetInt32;
+    using detail::NamedGetters<RowView>::GetInt64;
+    using detail::NamedGetters<RowView>::GetFloat32;
+    using detail::NamedGetters<RowView>::GetFloat64;
+    using detail::NamedGetters<RowView>::GetString;
+    using detail::NamedGetters<RowView>::GetBytes;
+    using detail::NamedGetters<RowView>::GetDate;
+    using detail::NamedGetters<RowView>::GetTime;
+    using detail::NamedGetters<RowView>::GetTimestamp;
+    using detail::NamedGetters<RowView>::GetDecimalString;
+    using detail::NamedGetters<RowView>::GetArraySize;
+    using detail::NamedGetters<RowView>::GetArrayElementType;
+    using detail::NamedGetters<RowView>::IsArrayElementNull;
+    using detail::NamedGetters<RowView>::GetArrayBool;
+    using detail::NamedGetters<RowView>::GetArrayInt32;
+    using detail::NamedGetters<RowView>::GetArrayInt64;
+    using detail::NamedGetters<RowView>::GetArrayFloat32;
+    using detail::NamedGetters<RowView>::GetArrayFloat64;
+    using detail::NamedGetters<RowView>::GetArrayString;
+    using detail::NamedGetters<RowView>::GetArrayBytes;
+    using detail::NamedGetters<RowView>::GetArrayDate;
+    using detail::NamedGetters<RowView>::GetArrayTime;
+    using detail::NamedGetters<RowView>::GetArrayTimestamp;
+    using detail::NamedGetters<RowView>::GetArrayDecimalString;
+    using detail::NamedGetters<RowView>::GetArrayView;
+
+   private:
+    // Column name -> index through the scan data's column map.
+    // Throws std::runtime_error when this view holds no scan data.
+    size_t Resolve(const std::string& name) const {
+        if (data_) {
+            return detail::ResolveColumn(data_->columns, name);
+        }
+        throw std::runtime_error("RowView: name-based access not available");
+    }
+    std::shared_ptr<const detail::ScanData> data_;
+    size_t bucket_idx_;
+    size_t rec_idx_;
+};
+
+/// Identifies a specific bucket, optionally within a partition.
+struct TableBucket {
+    int64_t table_id;
+    int32_t bucket_id;
+    std::optional<int64_t> partition_id;
+
+    // Equal only when table, bucket and partition all agree; an absent
+    // partition_id matches nothing but another absent partition_id.
+    bool operator==(const TableBucket& other) const {
+        if (table_id != other.table_id) {
+            return false;
+        }
+        if (bucket_id != other.bucket_id) {
+            return false;
+        }
+        return partition_id == other.partition_id;
+    }
+
+    // Exact negation of operator==.
+    bool operator!=(const TableBucket& other) const { return !operator==(other); }
+};
+
+/// A single scan record. Contains metadata and a RowView for field access.
+///
+/// ScanRecord is a value type that can be freely copied, stored, and
+/// accumulated across multiple Poll() calls.
+struct ScanRecord {
+    // NOTE(review): presumably the record's offset in the bucket's log — confirm.
+    int64_t offset;
+    // NOTE(review): unit and epoch are not visible here — confirm.
+    int64_t timestamp;
+    // Kind of change this record represents (ChangeType is declared elsewhere).
+    ChangeType change_type;
+    // Field access for this record.
+    RowView row;
+};
+
+/// A bundle of scan records belonging to a single bucket.
+///
+/// BucketRecords is a value type — it shares ownership of the underlying scan data
+/// via reference counting, so it can safely outlive the ScanRecords that produced it.
+class BucketRecords {
+   public:
+    // Captures the shared scan data, the bucket identity, a bucket index and
+    // the number of records in the bucket.
+    BucketRecords(std::shared_ptr<const detail::ScanData> data, TableBucket bucket,
+                  size_t bucket_idx, size_t count)
+        : data_(std::move(data)),
+          bucket_(std::move(bucket)),
+          bucket_idx_(bucket_idx),
+          count_(count) {}
+
+    /// Bucket identity shared by every record in this bundle.
+    const TableBucket& Bucket() const { return bucket_; }
+
+    /// Record count, and whether that count is zero.
+    size_t Size() const { return count_; }
+    bool Empty() const { return Size() == 0; }
+
+    /// Record at position `idx` within this bucket (0-based).
+    ScanRecord operator[](size_t idx) const;
+
+    // Minimal iterator for range-for: dereference, pre-increment and
+    // inequality only. It stores a raw pointer to the BucketRecords that
+    // created it.
+    class Iterator {
+       public:
+        ScanRecord operator*() const;
+        Iterator& operator++() {
+            idx_ += 1;
+            return *this;
+        }
+        // Only the position is compared, not the owner.
+        bool operator!=(const Iterator& other) const { return !(idx_ == other.idx_); }
+
+       private:
+        friend class BucketRecords;
+        Iterator(const BucketRecords* owner, size_t idx) : owner_(owner), idx_(idx) {}
+        const BucketRecords* owner_;
+        size_t idx_;
+    };
+
+    // Half-open range [0, Size()) over this bucket's records.
+    Iterator begin() const { return Iterator(this, 0); }
+    Iterator end() const { return Iterator(this, Size()); }
+
+   private:
+    std::shared_ptr<const detail::ScanData> data_;
+    TableBucket bucket_;
+    size_t bucket_idx_;
+    size_t count_;
+};
+
+// Result of a scan poll: records grouped by bucket, backed by shared scan
+// data. Move-only; LogScanner is a friend and fills in the private state.
+class ScanRecords {
+   public:
+    ScanRecords() noexcept = default;
+    ~ScanRecords() noexcept = default;
+
+    // Move-only: copy operations are deleted.
+    ScanRecords(const ScanRecords&) = delete;
+    ScanRecords& operator=(const ScanRecords&) = delete;
+    ScanRecords(ScanRecords&&) noexcept = default;
+    ScanRecords& operator=(ScanRecords&&) noexcept = default;
+
+    /// Record total summed over all buckets.
+    size_t Count() const;
+    bool IsEmpty() const;
+
+    /// How many distinct buckets contributed records.
+    size_t BucketCount() const;
+
+    /// The distinct buckets that contributed records.
+    std::vector<TableBucket> Buckets() const;
+
+    /// Records of one specific bucket.
+    ///
+    /// An absent bucket yields an empty BucketRecords (matches Rust/Java).
+    /// Note: O(B) linear scan. To walk every bucket, prefer BucketAt(idx).
+    BucketRecords Records(const TableBucket& bucket) const;
+
+    /// Records of the bucket at position `idx` (0-based). O(1).
+    ///
+    /// Throws std::out_of_range if idx >= BucketCount().
+    BucketRecords BucketAt(size_t idx) const;
+
+    /// Flat iterator over every record of every bucket (matches Java
+    /// Iterable<ScanRecord>). It stores a raw pointer to the ScanRecords that
+    /// created it.
+    class Iterator {
+       public:
+        ScanRecord operator*() const;
+        Iterator& operator++();
+        // Positions differ unless both the bucket index and the record index
+        // agree; the owner is not compared.
+        bool operator!=(const Iterator& other) const {
+            return !(bucket_idx_ == other.bucket_idx_ && rec_idx_ == other.rec_idx_);
+        }
+
+       private:
+        friend class ScanRecords;
+        Iterator(const ScanRecords* owner, size_t bucket_idx, size_t rec_idx)
+            : owner_(owner), bucket_idx_(bucket_idx), rec_idx_(rec_idx) {}
+        const ScanRecords* owner_;
+        size_t bucket_idx_;
+        size_t rec_idx_;
+    };
+
+    Iterator begin() const;
+    // Past-the-end position: bucket index BucketCount(), record index 0.
+    Iterator end() const {
+        const size_t past_last_bucket = BucketCount();
+        return Iterator(this, past_last_bucket, 0);
+    }
+
+   private:
+    friend class LogScanner;
+    ScanRecord RecordAt(size_t bucket, size_t rec_idx) const;
+    std::shared_ptr<const detail::ScanData> data_;
+};
+
+// An Arrow record batch paired with scan metadata (table, partition, bucket,
+// base offset). Instances are created only by LogScanner: the constructor is
+// private and LogScanner is the sole friend.
+class ArrowRecordBatch {
+   public:
+    // Returns a shared_ptr copy of the wrapped batch (shared ownership).
+    std::shared_ptr<arrow::RecordBatch> GetArrowRecordBatch() const { return batch_; }
+
+    bool Available() const;
+
+    // Get number of rows in the batch
+    int64_t NumRows() const;
+
+    // Get ScanBatch metadata
+    // NOTE(review): the partition id value used for non-partitioned tables is
+    // not visible here — confirm.
+    int64_t GetTableId() const;
+    int64_t GetPartitionId() const;
+    int32_t GetBucketId() const;
+    int64_t GetBaseOffset() const;
+    // NOTE(review): no matching data member is stored; presumably derived
+    // from the base offset and row count — confirm in the implementation.
+    int64_t GetLastOffset() const;
+
+   private:
+    friend class LogScanner;
+    explicit ArrowRecordBatch(std::shared_ptr<arrow::RecordBatch> batch, int64_t table_id,
+                              int64_t partition_id, int32_t bucket_id,
+                              int64_t base_offset) noexcept;
+
+    std::shared_ptr<arrow::RecordBatch> batch_{nullptr};
+
+    // No default initialisers on these fields; the private constructor is
+    // the only constructor declared here.
+    int64_t table_id_;
+    int64_t partition_id_;
+    int32_t bucket_id_;
+    int64_t base_offset_;
+};
+
+// Owning collection of ArrowRecordBatch objects with vector-like read access.
+struct ArrowRecordBatches {
+    std::vector<std::unique_ptr<ArrowRecordBatch>> batches;
+
+    size_t Size() const { return batches.size(); }
+    bool Empty() const { return batches.empty(); }
+    // Unchecked access, same as std::vector::operator[].
+    const std::unique_ptr<ArrowRecordBatch>& operator[](size_t idx) const { return batches[idx]; }
+
+    // Const iteration over the owned batches (range-for support).
+    auto begin() const { return batches.cbegin(); }
+    auto end() const { return batches.cend(); }
+};
+
+// Plain aggregate: one offset together with the table, partition and bucket
+// ids it belongs to. Used as the element type of LakeSnapshot::bucket_offsets.
+// NOTE(review): the partition_id value for non-partitioned tables is not
+// visible here — confirm.
+struct BucketOffset {
+    int64_t table_id;
+    int64_t partition_id;
+    int32_t bucket_id;
+    int64_t offset;
+};
+
+// Plain aggregate pairing a bucket id with an offset.
+// NOTE(review): presumably the start offset for subscribing a scanner to
+// that bucket — the consumer is not visible here; confirm.
+struct BucketSubscription {
+    int32_t bucket_id;
+    int64_t offset;
+};
+
+// Plain aggregate: a bucket id and offset qualified by a partition id.
+// NOTE(review): presumably the partitioned-table counterpart of
+// BucketSubscription — the consumer is not visible here; confirm.
+struct PartitionBucketSubscription {
+    int64_t partition_id;
+    int32_t bucket_id;
+    int64_t offset;
+};
+
+// Output type of Admin::GetLatestLakeSnapshot: a snapshot id plus a list of
+// per-bucket offsets.
+struct LakeSnapshot {
+    int64_t snapshot_id;
+    std::vector<BucketOffset> bucket_offsets;
+};
+
+// Element type of the list filled in by Admin::ListPartitionInfos: a
+// partition's numeric id and its name.
+struct PartitionInfo {
+    int64_t partition_id;
+    std::string partition_name;
+};
+
+// Element type of the list filled in by Admin::GetServerNodes.
+// NOTE(review): the set of valid `server_type` strings and the meaning of
+// `uid` are not visible here — confirm.
+struct ServerNode {
+    int32_t id;
+    std::string host;
+    uint32_t port;
+    std::string server_type;
+    std::string uid;
+};
+
+/// Descriptor for create_database (optional). Leave comment and properties empty for default.
+struct DatabaseDescriptor {
+    // Free-form comment; empty by default.
+    std::string comment;
+    // String key/value properties; empty by default.
+    std::unordered_map<std::string, std::string> properties;
+};
+
+/// Metadata returned by GetDatabaseInfo.
+struct DatabaseInfo {
+    std::string database_name;
+    std::string comment;
+    std::unordered_map<std::string, std::string> properties;
+    // Both timestamps default to 0.
+    // NOTE(review): unit is not visible here (presumably epoch milliseconds)
+    // — confirm.
+    int64_t created_time{0};
+    int64_t modified_time{0};
+};
+
+/// Read-only result for lookup operations.
+class LookupResult : public detail::NamedGetters<LookupResult> {
+    // The CRTP base calls the private Resolve() below.
+    friend struct detail::NamedGetters<LookupResult>;
+
+   public:
+    LookupResult() noexcept;
+    ~LookupResult() noexcept;
+
+    // Move-only: copy operations are deleted.
+    LookupResult(const LookupResult&) = delete;
+    LookupResult& operator=(const LookupResult&) = delete;
+    LookupResult(LookupResult&& other) noexcept;
+    LookupResult& operator=(LookupResult&& other) noexcept;
+
+    // NOTE(review): presumably true when the lookup matched a row — confirm.
+    bool Found() const;
+    size_t FieldCount() const;
+
+    // ── Index-based getters ──────────────────────────────────────────
+    TypeId GetType(size_t idx) const;
+    bool IsNull(size_t idx) const;
+    bool GetBool(size_t idx) const;
+    int32_t GetInt32(size_t idx) const;
+    int64_t GetInt64(size_t idx) const;
+    float GetFloat32(size_t idx) const;
+    double GetFloat64(size_t idx) const;
+    std::string_view GetString(size_t idx) const;
+    std::pair<const uint8_t*, size_t> GetBytes(size_t idx) const;
+    fluss::Date GetDate(size_t idx) const;
+    fluss::Time GetTime(size_t idx) const;
+    fluss::Timestamp GetTimestamp(size_t idx) const;
+    bool IsDecimal(size_t idx) const;
+    std::string GetDecimalString(size_t idx) const;
+
+    // ── Array getters ────────────────────────────────────────────────
+    // `idx` selects the array column, `element` the entry inside that array.
+    size_t GetArraySize(size_t idx) const;
+    TypeId GetArrayElementType(size_t idx) const;
+    bool IsArrayElementNull(size_t idx, size_t element) const;
+    bool GetArrayBool(size_t idx, size_t element) const;
+    int32_t GetArrayInt32(size_t idx, size_t element) const;
+    int64_t GetArrayInt64(size_t idx, size_t element) const;
+    float GetArrayFloat32(size_t idx, size_t element) const;
+    double GetArrayFloat64(size_t idx, size_t element) const;
+    std::string GetArrayString(size_t idx, size_t element) const;
+    std::vector<uint8_t> GetArrayBytes(size_t idx, size_t element) const;
+    fluss::Date GetArrayDate(size_t idx, size_t element) const;
+    fluss::Time GetArrayTime(size_t idx, size_t element) const;
+    fluss::Timestamp GetArrayTimestamp(size_t idx, size_t element) const;
+    std::string GetArrayDecimalString(size_t idx, size_t element) const;
+    /// See RowView::GetArrayView for semantics. Supports nested arrays.
+    ArrayView GetArrayView(size_t idx) const;
+
+    // Name-based getters inherited from detail::NamedGetters<LookupResult>
+    using detail::NamedGetters<LookupResult>::IsNull;
+    using detail::NamedGetters<LookupResult>::GetBool;
+    using detail::NamedGetters<LookupResult>::GetInt32;
+    using detail::NamedGetters<LookupResult>::GetInt64;
+    using detail::NamedGetters<LookupResult>::GetFloat32;
+    using detail::NamedGetters<LookupResult>::GetFloat64;
+    using detail::NamedGetters<LookupResult>::GetString;
+    using detail::NamedGetters<LookupResult>::GetBytes;
+    using detail::NamedGetters<LookupResult>::GetDate;
+    using detail::NamedGetters<LookupResult>::GetTime;
+    using detail::NamedGetters<LookupResult>::GetTimestamp;
+    using detail::NamedGetters<LookupResult>::GetDecimalString;
+    using detail::NamedGetters<LookupResult>::GetArraySize;
+    using detail::NamedGetters<LookupResult>::GetArrayElementType;
+    using detail::NamedGetters<LookupResult>::IsArrayElementNull;
+    using detail::NamedGetters<LookupResult>::GetArrayBool;
+    using detail::NamedGetters<LookupResult>::GetArrayInt32;
+    using detail::NamedGetters<LookupResult>::GetArrayInt64;
+    using detail::NamedGetters<LookupResult>::GetArrayFloat32;
+    using detail::NamedGetters<LookupResult>::GetArrayFloat64;
+    using detail::NamedGetters<LookupResult>::GetArrayString;
+    using detail::NamedGetters<LookupResult>::GetArrayBytes;
+    using detail::NamedGetters<LookupResult>::GetArrayDate;
+    using detail::NamedGetters<LookupResult>::GetArrayTime;
+    using detail::NamedGetters<LookupResult>::GetArrayTimestamp;
+    using detail::NamedGetters<LookupResult>::GetArrayDecimalString;
+    using detail::NamedGetters<LookupResult>::GetArrayView;
+
+   private:
+    friend class Lookuper;
+    // Column name -> index. The column map is built on first use and cached
+    // in the mutable member below.
+    // Throws std::runtime_error if no column map is available after the build
+    // attempt, rather than dereferencing a null pointer (same guard as
+    // RowView::Resolve).
+    size_t Resolve(const std::string& name) const {
+        if (!column_map_) {
+            BuildColumnMap();
+        }
+        if (!column_map_) {
+            throw std::runtime_error("LookupResult: name-based access not available");
+        }
+        return detail::ResolveColumn(*column_map_, name);
+    }
+    void Destroy() noexcept;
+    void BuildColumnMap() const;
+    // Opaque FFI handle; null by default.
+    ffi::LookupResultInner* inner_{nullptr};
+    // Lazily populated from the const Resolve() above, hence `mutable`.
+    mutable std::shared_ptr<detail::ColumnMap> column_map_;
+};
+
+class AppendWriter;
+class UpsertWriter;
+class Lookuper;
+class WriteResult;
+class LogScanner;
+class Admin;
+class Table;
+class TableAppend;
+class TableUpsert;
+class TableLookup;
+class TableScan;
+
+// Client settings consumed by Connection::Create. Every field has an
+// in-class default, so a default-constructed Configuration is fully populated.
+struct Configuration {
+    // Coordinator server address
+    std::string bootstrap_servers{"127.0.0.1:9123"};
+    // Max request size in bytes (10 MB)
+    int32_t writer_request_max_size{10 * 1024 * 1024};
+    // Writer acknowledgment mode: "all", "0", "1", or "-1"
+    std::string writer_acks{"all"};
+    // Max number of writer retries
+    int32_t writer_retries{std::numeric_limits<int32_t>::max()};
+    // Writer batch size in bytes (2 MB), also the upper bound when dynamic sizing is on
+    int32_t writer_batch_size{2 * 1024 * 1024};
+    // Tune the per-table writer batch size from observed fill ratios
+    bool writer_dynamic_batch_size_enabled{true};
+    // Lower bound (256 KB) for the dynamic batch size estimator
+    int32_t writer_dynamic_batch_size_min{256 * 1024};
+    // Bucket assigner for tables without bucket keys: "sticky" or "round_robin"
+    std::string writer_bucket_no_key_assigner{"sticky"};
+    // Number of remote log batches to prefetch during scanning
+    size_t scanner_remote_log_prefetch_num{4};
+    // Number of threads for downloading remote log data
+    size_t remote_file_download_thread_num{3};
+    // Remote log read concurrency within one file (streaming read path)
+    size_t scanner_remote_log_read_concurrency{4};
+    // Maximum number of records returned in a single call to Poll() for LogScanner
+    size_t scanner_log_max_poll_records{500};
+    // Maximum bytes per fetch response for LogScanner (16 MB)
+    int32_t scanner_log_fetch_max_bytes{16 * 1024 * 1024};
+    // Minimum bytes to accumulate before server returns a fetch response
+    int32_t scanner_log_fetch_min_bytes{1};
+    // Maximum time (ms) the server may wait to satisfy min bytes
+    int32_t scanner_log_fetch_wait_max_time_ms{500};
+    // Maximum bytes per fetch response per bucket for LogScanner (1 MB)
+    int32_t scanner_log_fetch_max_bytes_for_bucket{1024 * 1024};
+    // Writer batch timeout in milliseconds (default 100)
+    // NOTE(review): presumably the longest a batch may wait before it is
+    // sent — confirm against the writer implementation.
+    int64_t writer_batch_timeout_ms{100};
+    // Whether to enable idempotent writes
+    bool writer_enable_idempotence{true};
+    // Maximum number of in-flight requests per bucket for idempotent writes
+    size_t writer_max_inflight_requests_per_bucket{5};
+    // Total memory available for buffering write batches (default 64MB)
+    size_t writer_buffer_memory_size{64 * 1024 * 1024};
+    // Maximum time in milliseconds to block waiting for buffer memory
+    uint64_t writer_buffer_wait_timeout_ms{std::numeric_limits<uint64_t>::max()};
+    // Connect timeout in milliseconds for TCP transport connect
+    uint64_t connect_timeout_ms{120000};
+    // Security protocol: "PLAINTEXT" (default, no auth) or "sasl" (SASL auth)
+    std::string security_protocol{"PLAINTEXT"};
+    // SASL mechanism (only "PLAIN" is supported)
+    std::string security_sasl_mechanism{"PLAIN"};
+    // SASL username (required when security_protocol is "sasl")
+    std::string security_sasl_username;
+    // SASL password (required when security_protocol is "sasl")
+    std::string security_sasl_password;
+    // Maximum number of pending lookup operations
+    size_t lookup_queue_size{25600};
+    // Maximum batch size of merging lookup operations to one lookup request
+    size_t lookup_max_batch_size{128};
+    // Maximum time to wait for the lookup batch to fill (in milliseconds)
+    uint64_t lookup_batch_timeout_ms{100};
+    // Maximum number of unacknowledged lookup requests
+    size_t lookup_max_inflight_requests{128};
+    // Maximum number of lookup retries
+    int32_t lookup_max_retries{std::numeric_limits<int32_t>::max()};
+};
+
+// Move-only handle wrapping an ffi::Connection pointer. A default-constructed
+// Connection holds a null handle; Create() fills in an existing object.
+class Connection {
+   public:
+    Connection() noexcept;
+    ~Connection() noexcept;
+
+    // Move-only: copy operations are deleted.
+    Connection(const Connection&) = delete;
+    Connection& operator=(const Connection&) = delete;
+    Connection(Connection&& other) noexcept;
+    Connection& operator=(Connection&& other) noexcept;
+
+    // Factory: reports the outcome through the returned Result and delivers
+    // the connection through `out`.
+    static Result Create(const Configuration& config, Connection& out);
+
+    // NOTE(review): presumably true when the underlying handle is non-null —
+    // confirm.
+    bool Available() const;
+
+    // Both accessors return a Result and deliver the object through `out`.
+    Result GetAdmin(Admin& out);
+    Result GetTable(const TablePath& table_path, Table& out);
+
+   private:
+    void Destroy() noexcept;
+    // Opaque FFI handle; null by default.
+    ffi::Connection* conn_{nullptr};
+};
+
+// Move-only handle for administrative operations on databases, tables,
+// partitions, offsets and server nodes. Every operation returns a Result;
+// query results are delivered through a trailing `out` reference parameter.
+// Connection is a friend and uses the private pointer constructor.
+class Admin {
+   public:
+    Admin() noexcept;
+    ~Admin() noexcept;
+
+    // Move-only: copy operations are deleted.
+    Admin(const Admin&) = delete;
+    Admin& operator=(const Admin&) = delete;
+    Admin(Admin&& other) noexcept;
+    Admin& operator=(Admin&& other) noexcept;
+
+    bool Available() const;
+
+    // ── Tables ───────────────────────────────────────────────────────
+    // The `ignore_if_*` flags default to false.
+    Result CreateTable(const TablePath& table_path, const TableDescriptor& descriptor,
+                       bool ignore_if_exists = false);
+
+    Result DropTable(const TablePath& table_path, bool ignore_if_not_exists = false);
+
+    Result GetTableInfo(const TablePath& table_path, TableInfo& out);
+
+    Result GetLatestLakeSnapshot(const TablePath& table_path, LakeSnapshot& out);
+
+    // ── Offsets ──────────────────────────────────────────────────────
+    // `out` is keyed by int32_t and holds int64_t values.
+    // NOTE(review): presumably bucket id -> offset — confirm.
+    Result ListOffsets(const TablePath& table_path, const std::vector<int32_t>& bucket_ids,
+                       const OffsetSpec& offset_spec, std::unordered_map<int32_t, int64_t>& out);
+
+    // Same shape as ListOffsets, scoped to the partition named `partition_name`.
+    Result ListPartitionOffsets(const TablePath& table_path, const std::string& partition_name,
+                                const std::vector<int32_t>& bucket_ids,
+                                const OffsetSpec& offset_spec,
+                                std::unordered_map<int32_t, int64_t>& out);
+
+    // ── Partitions ───────────────────────────────────────────────────
+    Result ListPartitionInfos(const TablePath& table_path, std::vector<PartitionInfo>& out);
+
+    // Overload taking a `partition_spec` map of string keys to string values.
+    // NOTE(review): presumably filters the listing by that spec — confirm.
+    Result ListPartitionInfos(const TablePath& table_path,
+                              const std::unordered_map<std::string, std::string>& partition_spec,
+                              std::vector<PartitionInfo>& out);
+
+    Result CreatePartition(const TablePath& table_path,
+                           const std::unordered_map<std::string, std::string>& partition_spec,
+                           bool ignore_if_exists = false);
+
+    Result DropPartition(const TablePath& table_path,
+                         const std::unordered_map<std::string, std::string>& partition_spec,
+                         bool ignore_if_not_exists = false);
+
+    // ── Databases ────────────────────────────────────────────────────
+    Result CreateDatabase(const std::string& database_name, const DatabaseDescriptor& descriptor,
+                          bool ignore_if_exists = false);
+
+    // Note that `cascade` defaults to true.
+    Result DropDatabase(const std::string& database_name, bool ignore_if_not_exists = false,
+                        bool cascade = true);
+
+    Result ListDatabases(std::vector<std::string>& out);
+
+    Result DatabaseExists(const std::string& database_name, bool& out);
+
+    Result GetDatabaseInfo(const std::string& database_name, DatabaseInfo& out);
+
+    Result ListTables(const std::string& database_name, std::vector<std::string>& out);
+
+    Result TableExists(const TablePath& table_path, bool& out);
+
+    // ── Cluster ──────────────────────────────────────────────────────
+    Result GetServerNodes(std::vector<ServerNode>& out);
+
+   private:
+    // Takes an optional partition name (null by default).
+    // NOTE(review): presumably the shared implementation behind ListOffsets
+    // and ListPartitionOffsets — confirm in the implementation file.
+    Result DoListOffsets(const TablePath& table_path, const std::vector<int32_t>& bucket_ids,
+                         const OffsetSpec& offset_spec, std::unordered_map<int32_t, int64_t>& out,
+                         const std::string* partition_name = nullptr);
+
+    friend class Connection;
+    Admin(ffi::Admin* admin) noexcept;
+
+    void Destroy() noexcept;
+    // Opaque FFI handle; null by default.
+    ffi::Admin* admin_{nullptr};
+};
+
+/// Move-only handle around a raw `ffi::Table*`.
+///
+/// `Connection::GetTable` fills one in through friend access. The accessors
+/// and `New*` factories declared here are implemented outside this header.
+class Table {
+   public:
+    Table() noexcept;
+    Table(Table&& other) noexcept;
+    Table& operator=(Table&& other) noexcept;
+    ~Table() noexcept;
+
+    // Copying is disabled; the handle can only be moved.
+    Table(const Table&) = delete;
+    Table& operator=(const Table&) = delete;
+
+    bool Available() const;
+
+    // Metadata accessors (return by value).
+    TableInfo GetTableInfo() const;
+    TablePath GetTablePath() const;
+    bool HasPrimaryKey() const;
+
+    GenericRow NewRow() const;
+
+    // Each factory hands back its builder object by value.
+    TableScan NewScan();
+    TableLookup NewLookup();
+    TableUpsert NewUpsert();
+    TableAppend NewAppend();
+
+   private:
+    // Friends that need the private constructor and/or the raw pointer.
+    friend class TableScan;
+    friend class TableLookup;
+    friend class TableUpsert;
+    friend class TableAppend;
+    friend class Connection;
+    Table(ffi::Table* table) noexcept;
+
+    const std::shared_ptr<GenericRow::ColumnMap>& GetColumnMap() const;
+    void Destroy() noexcept;
+
+    ffi::Table* table_ = nullptr;
+    // `mutable` so const members may assign it; presumably a lazily built
+    // cache -- TODO confirm against the implementation.
+    mutable std::shared_ptr<GenericRow::ColumnMap> column_map_;
+};
+
+/// Move-only builder whose `CreateWriter` fills in an `AppendWriter`.
+///
+/// Only `Table` (a friend) can construct it. The stored `ffi::Table*` is a raw
+/// pointer; presumably it is borrowed from the Table that created the builder
+/// and must not outlive it -- TODO confirm.
+class TableAppend {
+   public:
+    TableAppend(TableAppend&&) noexcept = default;
+    TableAppend& operator=(TableAppend&&) noexcept = default;
+    TableAppend(const TableAppend&) = delete;
+    TableAppend& operator=(const TableAppend&) = delete;
+
+    /// The writer is delivered through `out`; the Result reports the outcome.
+    Result CreateWriter(AppendWriter& out);
+
+   private:
+    explicit TableAppend(ffi::Table* table) noexcept;
+    friend class Table;
+
+    ffi::Table* table_ = nullptr;
+};
+
+/// Move-only builder whose `CreateWriter` fills in an `UpsertWriter`.
+///
+/// The `PartialUpdateBy*` setters return `*this` by reference so calls can be
+/// chained. Only `Table` (a friend) can construct the builder.
+class TableUpsert {
+   public:
+    TableUpsert(TableUpsert&&) noexcept = default;
+    TableUpsert& operator=(TableUpsert&&) noexcept = default;
+    TableUpsert(const TableUpsert&) = delete;
+    TableUpsert& operator=(const TableUpsert&) = delete;
+
+    // Column selection, either by name or by position (vectors taken by value).
+    TableUpsert& PartialUpdateByName(std::vector<std::string> column_names);
+    TableUpsert& PartialUpdateByIndex(std::vector<size_t> column_indices);
+
+    /// The writer is delivered through `out`; the Result reports the outcome.
+    Result CreateWriter(UpsertWriter& out);
+
+   private:
+    explicit TableUpsert(ffi::Table* table) noexcept;
+    friend class Table;
+
+    // Presumably maps the stored column names to indices -- TODO confirm.
+    std::vector<size_t> ResolveNameProjection() const;
+
+    // Raw pointer; presumably borrowed from the creating Table -- TODO confirm.
+    ffi::Table* table_ = nullptr;
+    std::vector<size_t> column_indices_;
+    std::vector<std::string> column_names_;
+};
+
+/// Move-only builder whose `CreateLookuper` fills in a `Lookuper`.
+///
+/// Only `Table` (a friend) can construct it. The stored `ffi::Table*` is a raw
+/// pointer; presumably borrowed from the creating Table -- TODO confirm.
+class TableLookup {
+   public:
+    TableLookup(TableLookup&&) noexcept = default;
+    TableLookup& operator=(TableLookup&&) noexcept = default;
+    TableLookup(const TableLookup&) = delete;
+    TableLookup& operator=(const TableLookup&) = delete;
+
+    /// The lookuper is delivered through `out`; the Result reports the outcome.
+    Result CreateLookuper(Lookuper& out);
+
+   private:
+    explicit TableLookup(ffi::Table* table) noexcept;
+    friend class Table;
+
+    ffi::Table* table_ = nullptr;
+};
+
+/// Move-only builder whose `Create*Scanner` methods fill in a `LogScanner`.
+///
+/// The `ProjectBy*` setters return `*this` by reference so calls can be
+/// chained. Only `Table` (a friend) can construct the builder.
+class TableScan {
+   public:
+    TableScan(TableScan&&) noexcept = default;
+    TableScan& operator=(TableScan&&) noexcept = default;
+    TableScan(const TableScan&) = delete;
+    TableScan& operator=(const TableScan&) = delete;
+
+    // Column projection, either by name or by position (vectors taken by value).
+    TableScan& ProjectByName(std::vector<std::string> column_names);
+    TableScan& ProjectByIndex(std::vector<size_t> column_indices);
+
+    // Both variants deliver the scanner through `out`.
+    Result CreateLogScanner(LogScanner& out);
+    Result CreateRecordBatchLogScanner(LogScanner& out);
+
+   private:
+    explicit TableScan(ffi::Table* table) noexcept;
+    friend class Table;
+
+    // Presumably the shared implementation behind both Create* methods,
+    // switched by `is_record_batch` -- TODO confirm.
+    Result DoCreateScanner(LogScanner& out, bool is_record_batch);
+    std::vector<size_t> ResolveNameProjection() const;
+
+    // Raw pointer; presumably borrowed from the creating Table -- TODO confirm.
+    ffi::Table* table_ = nullptr;
+    std::vector<size_t> projection_;
+    std::vector<std::string> name_projection_;
+};
+
+/// Move-only handle around a raw `ffi::WriteResult*`.
+///
+/// Only the writer classes (`AppendWriter`, `UpsertWriter`) are friends and can
+/// use the private pointer constructor.
+class WriteResult {
+   public:
+    WriteResult() noexcept;
+    WriteResult(WriteResult&& other) noexcept;
+    WriteResult& operator=(WriteResult&& other) noexcept;
+    ~WriteResult() noexcept;
+
+    // Copying is disabled; the handle can only be moved.
+    WriteResult(const WriteResult&) = delete;
+    WriteResult& operator=(const WriteResult&) = delete;
+
+    bool Available() const;
+
+    /// Wait for server acknowledgment of the write.
+    /// For fire-and-forget, simply let the WriteResult go out of scope.
+    Result Wait();
+
+   private:
+    friend class UpsertWriter;
+    friend class AppendWriter;
+    WriteResult(ffi::WriteResult* inner) noexcept;
+
+    void Destroy() noexcept;
+    ffi::WriteResult* inner_ = nullptr;
+};
+
+/// Move-only handle around a raw `ffi::AppendWriter*`.
+///
+/// Every write method has two overloads: one returning only a Result, and one
+/// that additionally fills a `WriteResult` out-parameter.
+class AppendWriter {
+   public:
+    AppendWriter() noexcept;
+    AppendWriter(AppendWriter&& other) noexcept;
+    AppendWriter& operator=(AppendWriter&& other) noexcept;
+    ~AppendWriter() noexcept;
+
+    // Copying is disabled; the handle can only be moved.
+    AppendWriter(const AppendWriter&) = delete;
+    AppendWriter& operator=(const AppendWriter&) = delete;
+
+    bool Available() const;
+
+    // Row-at-a-time appends.
+    Result Append(const GenericRow& row);
+    Result Append(const GenericRow& row, WriteResult& out);
+
+    // Appends taking an Arrow record batch.
+    Result AppendArrowBatch(const std::shared_ptr<arrow::RecordBatch>& batch);
+    Result AppendArrowBatch(const std::shared_ptr<arrow::RecordBatch>& batch, WriteResult& out);
+
+    Result Flush();
+
+   private:
+    friend class TableAppend;
+    friend class Table;
+    AppendWriter(ffi::AppendWriter* writer) noexcept;
+
+    void Destroy() noexcept;
+    ffi::AppendWriter* writer_ = nullptr;
+};
+
+/// Move-only handle around a raw `ffi::UpsertWriter*`.
+///
+/// `Upsert` and `Delete` each have two overloads: one returning only a Result,
+/// and one that additionally fills a `WriteResult` out-parameter.
+class UpsertWriter {
+   public:
+    UpsertWriter() noexcept;
+    UpsertWriter(UpsertWriter&& other) noexcept;
+    UpsertWriter& operator=(UpsertWriter&& other) noexcept;
+    ~UpsertWriter() noexcept;
+
+    // Copying is disabled; the handle can only be moved.
+    UpsertWriter(const UpsertWriter&) = delete;
+    UpsertWriter& operator=(const UpsertWriter&) = delete;
+
+    bool Available() const;
+
+    Result Upsert(const GenericRow& row);
+    Result Upsert(const GenericRow& row, WriteResult& out);
+
+    Result Delete(const GenericRow& row);
+    Result Delete(const GenericRow& row, WriteResult& out);
+
+    Result Flush();
+
+   private:
+    friend class TableUpsert;
+    friend class Table;
+    UpsertWriter(ffi::UpsertWriter* writer) noexcept;
+
+    void Destroy() noexcept;
+    ffi::UpsertWriter* writer_ = nullptr;
+};
+
+/// Move-only handle around a raw `ffi::Lookuper*`.
+///
+/// `Table` and `TableLookup` are friends and can use the private pointer
+/// constructor.
+class Lookuper {
+   public:
+    Lookuper() noexcept;
+    Lookuper(Lookuper&& other) noexcept;
+    Lookuper& operator=(Lookuper&& other) noexcept;
+    ~Lookuper() noexcept;
+
+    // Copying is disabled; the handle can only be moved.
+    Lookuper(const Lookuper&) = delete;
+    Lookuper& operator=(const Lookuper&) = delete;
+
+    bool Available() const;
+
+    /// Looks up by `pk_row` (presumably the primary-key columns -- TODO confirm)
+    /// and delivers the outcome through `out`.
+    Result Lookup(const GenericRow& pk_row, LookupResult& out);
+
+   private:
+    friend class TableLookup;
+    friend class Table;
+    Lookuper(ffi::Lookuper* lookuper) noexcept;
+
+    void Destroy() noexcept;
+    ffi::Lookuper* lookuper_ = nullptr;
+};
+
+/// Move-only handle around a raw `ffi::LogScanner*`.
+///
+/// `Table` and `TableScan` are friends and can use the private pointer
+/// constructor. Subscriptions exist in single-bucket and batch (vector)
+/// overloads, with separate variants that also take a partition id.
+class LogScanner {
+   public:
+    LogScanner() noexcept;
+    LogScanner(LogScanner&& other) noexcept;
+    LogScanner& operator=(LogScanner&& other) noexcept;
+    ~LogScanner() noexcept;
+
+    // Copying is disabled; the handle can only be moved.
+    LogScanner(const LogScanner&) = delete;
+    LogScanner& operator=(const LogScanner&) = delete;
+
+    bool Available() const;
+
+    // Bucket-level subscriptions.
+    Result Subscribe(int32_t bucket_id, int64_t start_offset);
+    Result Subscribe(const std::vector<BucketSubscription>& bucket_offsets);
+    Result Unsubscribe(int32_t bucket_id);
+
+    // Subscriptions addressed by (partition id, bucket id).
+    Result SubscribePartitionBuckets(int64_t partition_id, int32_t bucket_id, int64_t start_offset);
+    Result SubscribePartitionBuckets(const std::vector<PartitionBucketSubscription>& subscriptions);
+    Result UnsubscribePartition(int64_t partition_id, int32_t bucket_id);
+
+    // Polling; results are delivered through `out`. `timeout_ms` is presumably
+    // in milliseconds, as the name suggests -- TODO confirm.
+    Result Poll(int64_t timeout_ms, ScanRecords& out);
+    Result PollRecordBatch(int64_t timeout_ms, ArrowRecordBatches& out);
+
+   private:
+    friend class TableScan;
+    friend class Table;
+    LogScanner(ffi::LogScanner* scanner) noexcept;
+
+    void Destroy() noexcept;
+    ffi::LogScanner* scanner_ = nullptr;
+};
+
+}  // namespace fluss
diff --git a/fluss-rust/bindings/cpp/scripts/ensure_protoc.sh b/fluss-rust/bindings/cpp/scripts/ensure_protoc.sh
new file mode 100755
index 0000000000..3210bcc7a5
--- /dev/null
+++ b/fluss-rust/bindings/cpp/scripts/ensure_protoc.sh
@@ -0,0 +1,277 @@
+#!/bin/bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Abort on command errors, unset variables, and failed pipeline stages.
+set -euo pipefail
+
+# Protobuf version to provide; callers may override it via the environment.
+: "${PROTOBUF_BASELINE_VERSION:=3.25.5}"
+
+# Cache base precedence: $XDG_CACHE_HOME, then $HOME/.cache, then /tmp.
+_PROTOC_DEFAULT_CACHE_BASE="/tmp"
+if [[ -n "${HOME:-}" ]]; then
+  _PROTOC_DEFAULT_CACHE_BASE="${HOME}/.cache"
+fi
+if [[ -n "${XDG_CACHE_HOME:-}" ]]; then
+  _PROTOC_DEFAULT_CACHE_BASE="${XDG_CACHE_HOME}"
+fi
+
+# Map the lower-cased kernel name onto the OS token used in protoc archive names.
+_PROTOC_UNAME_S="$(uname -s | tr '[:upper:]' '[:lower:]')"
+if [[ "${_PROTOC_UNAME_S}" == linux* ]]; then
+  _PROTOC_DEFAULT_OS="linux"
+elif [[ "${_PROTOC_UNAME_S}" == darwin* ]]; then
+  _PROTOC_DEFAULT_OS="osx"
+else
+  echo "ERROR: unsupported host OS '${_PROTOC_UNAME_S}'. Please set PROTOC_OS explicitly." >&2
+  exit 1
+fi
+
+# Map the machine name onto the arch token used in protoc archive names.
+_PROTOC_UNAME_M="$(uname -m)"
+if [[ "${_PROTOC_UNAME_M}" == "x86_64" || "${_PROTOC_UNAME_M}" == "amd64" ]]; then
+  _PROTOC_DEFAULT_ARCH="x86_64"
+elif [[ "${_PROTOC_UNAME_M}" == "aarch64" || "${_PROTOC_UNAME_M}" == "arm64" ]]; then
+  _PROTOC_DEFAULT_ARCH="aarch_64"
+else
+  echo "ERROR: unsupported host arch '${_PROTOC_UNAME_M}'. Please set PROTOC_ARCH explicitly." >&2
+  exit 1
+fi
+
+# Remaining settings: keep a non-empty caller-provided value, else apply the default.
+: "${PROTOC_INSTALL_ROOT:=${_PROTOC_DEFAULT_CACHE_BASE}/fluss-cpp-tools}"
+: "${PROTOC_OS:=${_PROTOC_DEFAULT_OS}}"
+: "${PROTOC_ARCH:=${_PROTOC_DEFAULT_ARCH}}"
+: "${PROTOC_FORCE_INSTALL:=0}"
+: "${PROTOC_PRINT_PATH_ONLY:=0}"
+: "${PROTOC_ALLOW_INSECURE_DOWNLOAD:=0}"
+: "${PROTOC_SKIP_CHECKSUM_VERIFY:=0}"
+
+# Print the command-line help text to stdout.
+usage() {
+  # The heredoc delimiter is quoted, so the text is emitted verbatim: `$` needs
+  # no escaping here, and a backslash in front of it would be printed literally.
+  cat <<'EOF'
+Usage: bindings/cpp/scripts/ensure_protoc.sh [--print-path]
+
+Ensures a protoc binary matching the configured protobuf baseline is available.
+Installs into a local cache directory (default: $XDG_CACHE_HOME/fluss-cpp-tools or
+$HOME/.cache/fluss-cpp-tools) and prints
+the protoc path on stdout.
+
+Env vars:
+  PROTOBUF_BASELINE_VERSION  Baseline protobuf version (default: 3.25.5)
+  PROTOC_INSTALL_ROOT        Local cache root (default: XDG/HOME cache dir)
+  PROTOC_OS                 protoc package OS (default: auto-detect host: linux/osx)
+  PROTOC_ARCH               protoc package arch (default: auto-detect host: x86_64/aarch_64)
+  PROTOC_FORCE_INSTALL      1 to force re-download
+  PROTOC_ALLOW_INSECURE_DOWNLOAD
+                            1 to disable TLS verification (not recommended)
+  PROTOC_SKIP_CHECKSUM_VERIFY
+                            1 to skip pinned archive checksum verification
+  BAZEL_PROXY_URL           Optional proxy (sets curl/wget proxy envs if present)
+EOF
+}
+
+# Parse command-line flags; any unrecognised argument prints usage to stderr
+# and exits with status 1.
+for arg in "$@"; do
+  if [[ "$arg" == "--print-path" ]]; then
+    PROTOC_PRINT_PATH_ONLY=1
+  elif [[ "$arg" == "-h" || "$arg" == "--help" ]]; then
+    usage
+    exit 0
+  else
+    echo "Unknown argument: $arg" >&2
+    usage >&2
+    exit 1
+  fi
+done
+
+setup_proxy_env() {
+  if [[ -n "${BAZEL_PROXY_URL:-}" ]]; then
+    export http_proxy="${http_proxy:-$BAZEL_PROXY_URL}"
+    export https_proxy="${https_proxy:-$BAZEL_PROXY_URL}"
+    export HTTP_PROXY="${HTTP_PROXY:-$http_proxy}"
+    export HTTPS_PROXY="${HTTPS_PROXY:-$https_proxy}"
+  fi
+}
+
+normalize_version_for_protoc_release() {
+  local v="$1"
+  # Protobuf release packaging switched from v3.x.y to vX.Y for newer versions.
+  # For our current agreed baseline (3.25.5), the protoc archive/tag is 25.5.
+  if [[ "$v" =~ ^3\.([0-9]+\.[0-9]+)$ ]]; then
+    local stripped="${BASH_REMATCH[1]}"
+    local major="${stripped%%.*}"
+    if [[ "$major" -ge 21 ]]; then
+      echo "$stripped"
+      return 0
+    fi
+  fi
+  echo "$v"
+}
+
+version_matches_baseline() {
+  local actual="$1"
+  local baseline="$2"
+  local actual_norm baseline_norm
+  actual_norm="$(normalize_version_for_protoc_release "$actual")"
+  baseline_norm="$(normalize_version_for_protoc_release "$baseline")"
+  [[ "$actual" == "$baseline" || "$actual_norm" == "$baseline_norm" ]]
+}
+
+lookup_protoc_archive_sha256() {
+  local release_version="$1"
+  local os="$2"
+  local arch="$3"
+  case "${release_version}:${os}:${arch}" in
+    25.5:linux:aarch_64)
+      echo "dc715bb5aab2ebf9653d7d3efbe55e01a035e45c26f391ff6d9b7923e22914b7"
+      ;;
+    25.5:linux:x86_64)
+      echo "e1ed237a17b2e851cf9662cb5ad02b46e70ff8e060e05984725bc4b4228c6b28"
+      ;;
+    25.5:osx:aarch_64)
+      echo "781a6fc4c265034872cadc65e63dd3c0fc49245b70917821b60e2d457a6876ab"
+      ;;
+    25.5:osx:x86_64)
+      echo "c5447e4f0d5caffb18d9ff21eae7bc7faf2bb2000083d6f49e5b6000b30fceae"
+      ;;
+    *)
+      return 1
+      ;;
+  esac
+}
+
+verify_download_sha256() {
+  local file="$1"
+  local expected="$2"
+  local actual=""
+  if command -v sha256sum >/dev/null 2>&1; then
+    actual="$(sha256sum "$file" | awk '{print $1}')"
+  elif command -v shasum >/dev/null 2>&1; then
+    actual="$(shasum -a 256 "$file" | awk '{print $1}')"
+  else
+    echo "ERROR: neither sha256sum nor shasum is available for checksum verification." >&2
+    return 1
+  fi
+  if [[ "$actual" != "$expected" ]]; then
+    echo "ERROR: protoc archive checksum mismatch." >&2
+    echo "  expected: $expected" >&2
+    echo "  actual:   $actual" >&2
+    return 1
+  fi
+}
+
+download_file() {
+  local url="$1"
+  local out="$2"
+
+  if command -v curl >/dev/null 2>&1; then
+    local curl_args=(-fL)
+    if [[ "${PROTOC_ALLOW_INSECURE_DOWNLOAD}" == "1" ]]; then
+      curl_args+=(-k)
+    fi
+    curl "${curl_args[@]}" "$url" -o "$out"
+    return 0
+  fi
+
+  if command -v wget >/dev/null 2>&1; then
+    local wget_args=()
+    if [[ -n "${https_proxy:-}" || -n "${http_proxy:-}" ]]; then
+      wget_args+=(-e use_proxy=yes)
+      if [[ -n "${https_proxy:-}" ]]; then
+        wget_args+=(-e "https_proxy=${https_proxy}")
+      fi
+      if [[ -n "${http_proxy:-}" ]]; then
+        wget_args+=(-e "http_proxy=${http_proxy}")
+      fi
+    fi
+    if [[ "${PROTOC_ALLOW_INSECURE_DOWNLOAD}" == "1" ]]; then
+      wget_args+=(--no-check-certificate)
+    fi
+    wget "${wget_args[@]}" -O "$out" "$url"
+    return 0
+  fi
+
+  echo "ERROR: neither curl nor wget is available for downloading protoc." >&2
+  return 1
+}
+
+ensure_zip_tools() {
+  command -v unzip >/dev/null 2>&1 || {
+    echo "ERROR: unzip not found." >&2
+    exit 1
+  }
+}
+
+setup_proxy_env
+ensure_zip_tools
+
+if command -v protoc >/dev/null 2>&1; then
+  existing_out="$(protoc --version 2>/dev/null || true)"
+  if [[ "$existing_out" =~ ([0-9]+\.[0-9]+\.[0-9]+) ]]; then
+    existing_ver="${BASH_REMATCH[1]}"
+    if version_matches_baseline "$existing_ver" "$PROTOBUF_BASELINE_VERSION"; then
+      command -v protoc
+      exit 0
+    fi
+  fi
+fi
+
+PROTOC_RELEASE_VERSION="$(normalize_version_for_protoc_release "$PROTOBUF_BASELINE_VERSION")"
+PROTOC_ARCHIVE="protoc-${PROTOC_RELEASE_VERSION}-${PROTOC_OS}-${PROTOC_ARCH}.zip"
+PROTOC_URL="https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOC_RELEASE_VERSION}/${PROTOC_ARCHIVE}"
+PROTOC_PREFIX="${PROTOC_INSTALL_ROOT}/protoc-${PROTOC_RELEASE_VERSION}-${PROTOC_OS}-${PROTOC_ARCH}"
+PROTOC_BIN="${PROTOC_PREFIX}/bin/protoc"
+
+if [[ "${PROTOC_FORCE_INSTALL}" != "1" && -x "${PROTOC_BIN}" ]]; then
+  if [[ "${PROTOC_PRINT_PATH_ONLY}" == "1" ]]; then
+    echo "${PROTOC_BIN}"
+  else
+    echo "${PROTOC_BIN}"
+  fi
+  exit 0
+fi
+
+mkdir -p "${PROTOC_INSTALL_ROOT}"
+tmpdir="$(mktemp -d "${PROTOC_INSTALL_ROOT}/.protoc-download.XXXXXX")"
+trap 'rm -rf "${tmpdir}"' EXIT
+
+archive_path="${tmpdir}/${PROTOC_ARCHIVE}"
+download_file "${PROTOC_URL}" "${archive_path}"
+if [[ "${PROTOC_SKIP_CHECKSUM_VERIFY}" != "1" ]]; then
+  if expected_sha256="$(lookup_protoc_archive_sha256 "${PROTOC_RELEASE_VERSION}" "${PROTOC_OS}" "${PROTOC_ARCH}")"; then
+    verify_download_sha256 "${archive_path}" "${expected_sha256}"
+  else
+    echo "ERROR: no pinned checksum for protoc archive ${PROTOC_ARCHIVE}. Set PROTOC_SKIP_CHECKSUM_VERIFY=1 to bypass." >&2
+    exit 1
+  fi
+fi
+
+extract_dir="${tmpdir}/extract"
+mkdir -p "${extract_dir}"
+unzip -q "${archive_path}" -d "${extract_dir}"
+
+rm -rf "${PROTOC_PREFIX}"
+mkdir -p "${PROTOC_PREFIX}"
+cp -a "${extract_dir}/." "${PROTOC_PREFIX}/"
+chmod +x "${PROTOC_BIN}"
+
+echo "${PROTOC_BIN}"
diff --git a/fluss-rust/bindings/cpp/src/admin.cpp b/fluss-rust/bindings/cpp/src/admin.cpp
new file mode 100644
index 0000000000..a689c6143a
--- /dev/null
+++ b/fluss-rust/bindings/cpp/src/admin.cpp
@@ -0,0 +1,372 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "ffi_converter.hpp"
+#include "fluss.hpp"
+#include "lib.rs.h"
+#include "rust/cxx.h"
+#include <exception>
+
+namespace fluss {
+
+// Defaulted default constructor.
+Admin::Admin() noexcept = default;
+
+// Stores `admin`; Destroy() later hands it to ffi::delete_admin.
+Admin::Admin(ffi::Admin* admin) noexcept : admin_(admin) {}
+
+Admin::~Admin() noexcept { Destroy(); }
+
+// Frees the native handle, if any, and nulls the pointer so that a repeated
+// call does nothing.
+void Admin::Destroy() noexcept {
+    if (admin_ == nullptr) {
+        return;
+    }
+    ffi::delete_admin(admin_);
+    admin_ = nullptr;
+}
+
+// Move construction takes over the handle and leaves `other` empty.
+Admin::Admin(Admin&& other) noexcept : admin_(nullptr) {
+    admin_ = other.admin_;
+    other.admin_ = nullptr;
+}
+
+// Move assignment frees the current handle first; self-assignment is a no-op.
+Admin& Admin::operator=(Admin&& other) noexcept {
+    if (this == &other) {
+        return *this;
+    }
+    Destroy();
+    admin_ = other.admin_;
+    other.admin_ = nullptr;
+    return *this;
+}
+
+// True while a native handle is held.
+bool Admin::Available() const {
+    const bool has_handle = (admin_ != nullptr);
+    return has_handle;
+}
+
+// Forwards a create-table request to the native admin. `ignore_if_exists` is
+// passed through unchanged. Returns a client error when no handle is held.
+Result Admin::CreateTable(const TablePath& table_path, const TableDescriptor& descriptor,
+                          bool ignore_if_exists) {
+    if (admin_ == nullptr) {
+        return utils::make_client_error("Admin not available");
+    }
+
+    auto path = utils::to_ffi_table_path(table_path);
+    auto desc = utils::to_ffi_table_descriptor(descriptor);
+    auto status = admin_->create_table(path, desc, ignore_if_exists);
+    return utils::from_ffi_result(status);
+}
+
+// Forwards a drop-table request to the native admin. `ignore_if_not_exists` is
+// passed through unchanged. Returns a client error when no handle is held.
+Result Admin::DropTable(const TablePath& table_path, bool ignore_if_not_exists) {
+    if (admin_ == nullptr) {
+        return utils::make_client_error("Admin not available");
+    }
+
+    auto path = utils::to_ffi_table_path(table_path);
+    auto status = admin_->drop_table(path, ignore_if_not_exists);
+    return utils::from_ffi_result(status);
+}
+
+// Fetches table metadata into `out`. `out` is assigned only when the native
+// call succeeded; an exception thrown while converting the metadata is turned
+// into a client error instead of propagating.
+Result Admin::GetTableInfo(const TablePath& table_path, TableInfo& out) {
+    if (admin_ == nullptr) {
+        return utils::make_client_error("Admin not available");
+    }
+
+    auto path = utils::to_ffi_table_path(table_path);
+    auto response = admin_->get_table_info(path);
+
+    auto status = utils::from_ffi_result(response.result);
+    if (!status.Ok()) {
+        return status;
+    }
+
+    try {
+        out = utils::from_ffi_table_info(response.table_info);
+    } catch (const std::exception& e) {
+        return utils::make_client_error(std::string("Failed to parse table metadata: ") + e.what());
+    }
+    return status;
+}
+
+// Fetches the latest lake snapshot of a table into `out`. `out` is assigned
+// only when the native call succeeded.
+Result Admin::GetLatestLakeSnapshot(const TablePath& table_path, LakeSnapshot& out) {
+    if (admin_ == nullptr) {
+        return utils::make_client_error("Admin not available");
+    }
+
+    auto path = utils::to_ffi_table_path(table_path);
+    auto response = admin_->get_latest_lake_snapshot(path);
+
+    auto status = utils::from_ffi_result(response.result);
+    if (!status.Ok()) {
+        return status;
+    }
+
+    out = utils::from_ffi_lake_snapshot(response.lake_snapshot);
+    return status;
+}
+
+// Shared implementation behind ListOffsets and ListPartitionOffsets.
+//
+// A null `partition_name` selects the table-level native call; otherwise the
+// partition-level call is used with the pointed-to name. On success `out` is
+// cleared and refilled with one bucket-id -> offset entry per returned pair;
+// on failure `out` is left untouched.
+Result Admin::DoListOffsets(const TablePath& table_path, const std::vector<int32_t>& bucket_ids,
+                            const OffsetSpec& offset_spec,
+                            std::unordered_map<int32_t, int64_t>& out,
+                            const std::string* partition_name) {
+    if (admin_ == nullptr) {
+        return utils::make_client_error("Admin not available");
+    }
+
+    auto path = utils::to_ffi_table_path(table_path);
+
+    // Copy the requested bucket ids into a Rust-side vector.
+    rust::Vec<int32_t> ids;
+    for (size_t i = 0; i < bucket_ids.size(); ++i) {
+        ids.push_back(bucket_ids[i]);
+    }
+
+    ffi::FfiOffsetQuery query;
+    query.offset_type = static_cast<int32_t>(offset_spec.type);
+    query.timestamp = offset_spec.timestamp;
+
+    ffi::FfiListOffsetsResult response;
+    if (partition_name == nullptr) {
+        response = admin_->list_offsets(path, std::move(ids), query);
+    } else {
+        response = admin_->list_partition_offsets(path, rust::String(*partition_name),
+                                                  std::move(ids), query);
+    }
+
+    auto status = utils::from_ffi_result(response.result);
+    if (!status.Ok()) {
+        return status;
+    }
+
+    out.clear();
+    for (const auto& entry : response.bucket_offsets) {
+        out[entry.bucket_id] = entry.offset;
+    }
+    return status;
+}
+
+// Table-level offset listing: delegates to DoListOffsets with no partition name.
+Result Admin::ListOffsets(const TablePath& table_path, const std::vector<int32_t>& bucket_ids,
+                          const OffsetSpec& offset_spec,
+                          std::unordered_map<int32_t, int64_t>& out) {
+    const std::string* no_partition = nullptr;
+    return DoListOffsets(table_path, bucket_ids, offset_spec, out, no_partition);
+}
+
+// Partition-level offset listing: delegates to DoListOffsets, passing the
+// address of `partition_name`.
+Result Admin::ListPartitionOffsets(const TablePath& table_path, const std::string& partition_name,
+                                   const std::vector<int32_t>& bucket_ids,
+                                   const OffsetSpec& offset_spec,
+                                   std::unordered_map<int32_t, int64_t>& out) {
+    const std::string* partition = &partition_name;
+    return DoListOffsets(table_path, bucket_ids, offset_spec, out, partition);
+}
+
+namespace {
+
+// Converts a partition spec (column name -> value) into the key/value vector
+// the FFI layer accepts. Entries follow the map's iteration order.
+rust::Vec<ffi::FfiPartitionKeyValue> make_ffi_partition_spec(
+    const std::unordered_map<std::string, std::string>& partition_spec) {
+    rust::Vec<ffi::FfiPartitionKeyValue> rust_spec;
+    for (const auto& [key, value] : partition_spec) {
+        ffi::FfiPartitionKeyValue kv;
+        kv.key = rust::String(key);
+        kv.value = rust::String(value);
+        rust_spec.push_back(std::move(kv));
+    }
+    return rust_spec;
+}
+
+// Replaces the contents of `out` with the entries of an FFI partition list.
+// Templated on the container so it works for whatever type the generated
+// bindings use for `partition_infos`.
+template <typename FfiPartitionInfos>
+void assign_partition_infos(const FfiPartitionInfos& ffi_infos, std::vector<PartitionInfo>& out) {
+    out.clear();
+    out.reserve(ffi_infos.size());
+    for (const auto& pi : ffi_infos) {
+        out.push_back({pi.partition_id, std::string(pi.partition_name)});
+    }
+}
+
+}  // namespace
+
+// Lists all partitions of a table. `out` is replaced only on success.
+Result Admin::ListPartitionInfos(const TablePath& table_path, std::vector<PartitionInfo>& out) {
+    if (!Available()) {
+        return utils::make_client_error("Admin not available");
+    }
+
+    auto ffi_path = utils::to_ffi_table_path(table_path);
+    auto ffi_result = admin_->list_partition_infos(ffi_path);
+
+    auto result = utils::from_ffi_result(ffi_result.result);
+    if (result.Ok()) {
+        assign_partition_infos(ffi_result.partition_infos, out);
+    }
+
+    return result;
+}
+
+// Lists partitions using the spec-taking native call, forwarding
+// `partition_spec` to it. `out` is replaced only on success.
+Result Admin::ListPartitionInfos(const TablePath& table_path,
+                                 const std::unordered_map<std::string, std::string>& partition_spec,
+                                 std::vector<PartitionInfo>& out) {
+    if (!Available()) {
+        return utils::make_client_error("Admin not available");
+    }
+
+    auto ffi_path = utils::to_ffi_table_path(table_path);
+    auto ffi_result =
+        admin_->list_partition_infos_with_spec(ffi_path, make_ffi_partition_spec(partition_spec));
+
+    auto result = utils::from_ffi_result(ffi_result.result);
+    if (result.Ok()) {
+        assign_partition_infos(ffi_result.partition_infos, out);
+    }
+
+    return result;
+}
+
+// Forwards a create-partition request for `partition_spec` to the native admin.
+// `ignore_if_exists` is passed through unchanged.
+Result Admin::CreatePartition(const TablePath& table_path,
+                              const std::unordered_map<std::string, std::string>& partition_spec,
+                              bool ignore_if_exists) {
+    if (!Available()) {
+        return utils::make_client_error("Admin not available");
+    }
+
+    auto ffi_path = utils::to_ffi_table_path(table_path);
+    auto ffi_result = admin_->create_partition(ffi_path, make_ffi_partition_spec(partition_spec),
+                                               ignore_if_exists);
+    return utils::from_ffi_result(ffi_result);
+}
+
+// Forwards a drop-partition request for `partition_spec` to the native admin.
+// `ignore_if_not_exists` is passed through unchanged.
+Result Admin::DropPartition(const TablePath& table_path,
+                            const std::unordered_map<std::string, std::string>& partition_spec,
+                            bool ignore_if_not_exists) {
+    if (!Available()) {
+        return utils::make_client_error("Admin not available");
+    }
+
+    auto ffi_path = utils::to_ffi_table_path(table_path);
+    auto ffi_result = admin_->drop_partition(ffi_path, make_ffi_partition_spec(partition_spec),
+                                             ignore_if_not_exists);
+    return utils::from_ffi_result(ffi_result);
+}
+
+// Forwards a create-database request to the native admin. `ignore_if_exists`
+// is passed through unchanged. Returns a client error when no handle is held.
+Result Admin::CreateDatabase(const std::string& database_name, const DatabaseDescriptor& descriptor,
+                             bool ignore_if_exists) {
+    if (admin_ == nullptr) {
+        return utils::make_client_error("Admin not available");
+    }
+
+    auto desc = utils::to_ffi_database_descriptor(descriptor);
+    auto status = admin_->create_database(rust::Str(database_name), desc, ignore_if_exists);
+    return utils::from_ffi_result(status);
+}
+
+// Forwards a drop-database request to the native admin. Both flags are passed
+// through unchanged. Returns a client error when no handle is held.
+Result Admin::DropDatabase(const std::string& database_name, bool ignore_if_not_exists,
+                           bool cascade) {
+    if (admin_ == nullptr) {
+        return utils::make_client_error("Admin not available");
+    }
+
+    auto status = admin_->drop_database(rust::Str(database_name), ignore_if_not_exists, cascade);
+    return utils::from_ffi_result(status);
+}
+
+// Lists database names. On success `out` is cleared and refilled; on failure
+// it is left untouched.
+Result Admin::ListDatabases(std::vector<std::string>& out) {
+    if (admin_ == nullptr) {
+        return utils::make_client_error("Admin not available");
+    }
+
+    auto response = admin_->list_databases();
+    auto status = utils::from_ffi_result(response.result);
+    if (!status.Ok()) {
+        return status;
+    }
+
+    out.clear();
+    out.reserve(response.database_names.size());
+    for (const auto& db_name : response.database_names) {
+        out.emplace_back(std::string(db_name));
+    }
+    return status;
+}
+
+// Stores the native call's boolean answer in `out`. `out` is written only when
+// the call succeeded.
+Result Admin::DatabaseExists(const std::string& database_name, bool& out) {
+    if (admin_ == nullptr) {
+        return utils::make_client_error("Admin not available");
+    }
+
+    auto response = admin_->database_exists(rust::Str(database_name));
+    auto status = utils::from_ffi_result(response.result);
+    if (!status.Ok()) {
+        return status;
+    }
+
+    out = response.value;
+    return status;
+}
+
+// Fetches database metadata into `out`. `out` is assigned only when the native
+// call succeeded.
+Result Admin::GetDatabaseInfo(const std::string& database_name, DatabaseInfo& out) {
+    if (admin_ == nullptr) {
+        return utils::make_client_error("Admin not available");
+    }
+
+    auto response = admin_->get_database_info(rust::Str(database_name));
+    auto status = utils::from_ffi_result(response.result);
+    if (!status.Ok()) {
+        return status;
+    }
+
+    out = utils::from_ffi_database_info(response.database_info);
+    return status;
+}
+
+// Lists the table names of a database. On success `out` is cleared and
+// refilled; on failure it is left untouched.
+Result Admin::ListTables(const std::string& database_name, std::vector<std::string>& out) {
+    if (admin_ == nullptr) {
+        return utils::make_client_error("Admin not available");
+    }
+
+    auto response = admin_->list_tables(rust::Str(database_name));
+    auto status = utils::from_ffi_result(response.result);
+    if (!status.Ok()) {
+        return status;
+    }
+
+    out.clear();
+    out.reserve(response.table_names.size());
+    for (const auto& table_name : response.table_names) {
+        out.emplace_back(std::string(table_name));
+    }
+    return status;
+}
+
+// Stores the native call's boolean answer in `out`. `out` is written only when
+// the call succeeded.
+Result Admin::TableExists(const TablePath& table_path, bool& out) {
+    if (admin_ == nullptr) {
+        return utils::make_client_error("Admin not available");
+    }
+
+    auto path = utils::to_ffi_table_path(table_path);
+    auto response = admin_->table_exists(path);
+    auto status = utils::from_ffi_result(response.result);
+    if (!status.Ok()) {
+        return status;
+    }
+
+    out = response.value;
+    return status;
+}
+
+// Lists server nodes. On success `out` is cleared and refilled with one entry
+// per returned node (id, host, port, server type, uid); on failure it is left
+// untouched.
+Result Admin::GetServerNodes(std::vector<ServerNode>& out) {
+    if (admin_ == nullptr) {
+        return utils::make_client_error("Admin not available");
+    }
+
+    auto response = admin_->get_server_nodes();
+    auto status = utils::from_ffi_result(response.result);
+    if (!status.Ok()) {
+        return status;
+    }
+
+    out.clear();
+    out.reserve(response.server_nodes.size());
+    for (const auto& entry : response.server_nodes) {
+        out.push_back({entry.node_id, std::string(entry.host), entry.port,
+                       std::string(entry.server_type), std::string(entry.uid)});
+    }
+    return status;
+}
+
+}  // namespace fluss
diff --git a/fluss-rust/bindings/cpp/src/connection.cpp b/fluss-rust/bindings/cpp/src/connection.cpp
new file mode 100644
index 0000000000..6cd73017f5
--- /dev/null
+++ b/fluss-rust/bindings/cpp/src/connection.cpp
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "ffi_converter.hpp"
+#include "fluss.hpp"
+#include "lib.rs.h"
+#include "rust/cxx.h"
+
+namespace fluss {
+
+// Defaulted default constructor.
+Connection::Connection() noexcept = default;
+
+Connection::~Connection() noexcept { Destroy(); }
+
+// Frees the native connection, if any, and nulls the pointer so that a
+// repeated call does nothing.
+void Connection::Destroy() noexcept {
+    if (conn_ == nullptr) {
+        return;
+    }
+    ffi::delete_connection(conn_);
+    conn_ = nullptr;
+}
+
+// Move construction takes over the handle and leaves `other` empty.
+Connection::Connection(Connection&& other) noexcept : conn_(nullptr) {
+    conn_ = other.conn_;
+    other.conn_ = nullptr;
+}
+
+// Move assignment frees the current handle first; self-assignment is a no-op.
+Connection& Connection::operator=(Connection&& other) noexcept {
+    if (this == &other) {
+        return *this;
+    }
+    Destroy();
+    conn_ = other.conn_;
+    other.conn_ = nullptr;
+    return *this;
+}
+
+// Opens a native connection from `config` and stores it in `out`.
+//
+// On failure `out` is left untouched. On success any connection `out` already
+// owned is released before the new handle is adopted, so reusing the same
+// Connection object for several Create() calls does not leak the old handle.
+Result Connection::Create(const Configuration& config, Connection& out) {
+    auto ffi_config = utils::to_ffi_config(config);
+    auto ffi_result = ffi::new_connection(ffi_config);
+    auto result = utils::from_ffi_result(ffi_result.result);
+    if (result.Ok()) {
+        // Extract the new pointer first so `out` stays intact if that step fails.
+        auto fresh = utils::ptr_from_ffi<ffi::Connection>(ffi_result);
+        out.Destroy();
+        out.conn_ = fresh;
+    }
+    return result;
+}
+
+// True while a native connection handle is held.
+bool Connection::Available() const {
+    const bool has_handle = (conn_ != nullptr);
+    return has_handle;
+}
+
+// Obtains an admin handle from the native connection and stores it in `out`.
+//
+// Returns a client error when this Connection holds no handle. On failure
+// `out` is left untouched. On success `out` is move-assigned, which releases
+// any admin handle it previously owned instead of overwriting (and leaking) it.
+Result Connection::GetAdmin(Admin& out) {
+    if (!Available()) {
+        return utils::make_client_error("Connection not available");
+    }
+
+    auto ffi_result = conn_->get_admin();
+    auto result = utils::from_ffi_result(ffi_result.result);
+    if (result.Ok()) {
+        // Admin's move assignment calls Destroy() on the old handle first.
+        out = Admin(utils::ptr_from_ffi<ffi::Admin>(ffi_result));
+    }
+    return result;
+}
+
+// Obtains a table handle for `table_path` and stores it in `out`.
+//
+// Returns a client error when this Connection holds no handle. On failure
+// `out` is left untouched. On success `out` is move-assigned from a fresh
+// Table rather than having its pointer overwritten: overwriting would leak a
+// handle `out` already owned and would leave its other per-table state (the
+// `column_map_` member) belonging to the old table.
+// NOTE(review): this relies on Table's move assignment releasing the old
+// handle and replacing `column_map_` -- confirm against its definition.
+Result Connection::GetTable(const TablePath& table_path, Table& out) {
+    if (!Available()) {
+        return utils::make_client_error("Connection not available");
+    }
+
+    auto ffi_path = utils::to_ffi_table_path(table_path);
+    auto ffi_result = conn_->get_table(ffi_path);
+    auto result = utils::from_ffi_result(ffi_result.result);
+    if (result.Ok()) {
+        out = Table(utils::ptr_from_ffi<ffi::Table>(ffi_result));
+    }
+    return result;
+}
+
+}  // namespace fluss
diff --git a/fluss-rust/bindings/cpp/src/ffi_converter.hpp b/fluss-rust/bindings/cpp/src/ffi_converter.hpp
new file mode 100644
index 0000000000..47453d998a
--- /dev/null
+++ b/fluss-rust/bindings/cpp/src/ffi_converter.hpp
@@ -0,0 +1,407 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+#include <cassert>
+#include <stdexcept>
+
+#include "fluss.hpp"
+#include "lib.rs.h"
+
+namespace fluss {
+namespace utils {
+
+/// Compact FFI representation of a (possibly nested) array type.
+///
+/// `nesting` counts the number of ARRAY wrappers stripped to reach the leaf
+/// element type. `leaf_type`/`leaf_precision`/`leaf_scale` describe that leaf
+/// scalar. A non-array input produces a zero-initialised value (nesting == 0).
+/// `array_nullability` has `nesting + 1` entries: one per ARRAY wrapper
+/// (outermost first) plus a trailing entry for the leaf scalar's nullability.
+///
+/// Using a flat representation — rather than serialising a recursive
+/// `DataType` — keeps the cxx bridge contract small while preserving schema
+/// fidelity across the FFI boundary when paired with rebuild_array_type().
+///
+/// Field order matters: from_ffi_column() fills this struct with positional
+/// aggregate initialisation, so do not reorder the members.
+struct FlattenedArrayType {
+    // Number of ARRAY wrappers around the leaf; 0 means "not an array".
+    int32_t nesting{0};
+    // TypeId of the leaf scalar, stored as its int32_t value.
+    int32_t leaf_type{0};
+    // precision() of the leaf scalar.
+    int32_t leaf_precision{0};
+    // scale() of the leaf scalar.
+    int32_t leaf_scale{0};
+    // 1 = nullable, 0 = not nullable; wrappers outermost first, leaf last.
+    std::vector<uint8_t> array_nullability;
+};
+
+/// Converts an `ARRAY<ARRAY<...<leaf>>>` DataType into its flat form.
+///
+/// Behaviour:
+///   - Non-ARRAY input: returns a zero-valued FlattenedArrayType; the caller
+///     is expected to use the column's own `id/precision/scale` instead.
+///   - ARRAY input whose element_type() chain ends in a null pointer
+///     (malformed input): also returns a zero-valued result so the caller can
+///     reject the schema.
+///   - Otherwise: `nesting >= 1`, `array_nullability` holds `nesting + 1`
+///     flags (one per wrapper, outermost first, then the leaf), and the
+///     `leaf_*` fields describe the innermost scalar.
+inline FlattenedArrayType flatten_array_type(const DataType& data_type) {
+    FlattenedArrayType flat;
+
+    // Walk down through the ARRAY wrappers, recording each one's nullability.
+    const DataType* node = &data_type;
+    for (; node != nullptr && node->id() == TypeId::Array; node = node->element_type()) {
+        ++flat.nesting;
+        flat.array_nullability.push_back(node->nullable() ? 1 : 0);
+    }
+
+    // Either the input was not an array at all, or the chain was broken.
+    if (flat.nesting == 0 || node == nullptr) {
+        return FlattenedArrayType{};
+    }
+
+    // `node` now points at the leaf scalar.
+    flat.leaf_type = static_cast<int32_t>(node->id());
+    flat.leaf_precision = node->precision();
+    flat.leaf_scale = node->scale();
+    flat.array_nullability.push_back(node->nullable() ? 1 : 0);
+    return flat;
+}
+
+/// Reconstructs an `ARRAY<ARRAY<...<leaf>>>` DataType from its flat form; the
+/// counterpart of flatten_array_type. Intended for `flat.nesting >= 1` —
+/// callers deal with `nesting == 0` themselves by building a plain scalar.
+/// `array_nullability` is expected to hold `nesting + 1` flags (leaf last);
+/// any position missing from the vector is treated as nullable.
+inline DataType rebuild_array_type(const FlattenedArrayType& flat) {
+    const auto& flags = flat.array_nullability;
+    // Out-of-range positions default to nullable.
+    const auto nullable_at = [&flags](size_t pos) {
+        return pos >= flags.size() || flags[pos] != 0;
+    };
+
+    // Start from the leaf scalar, whose flag sits after all wrapper flags.
+    DataType result(static_cast<TypeId>(flat.leaf_type), flat.leaf_precision, flat.leaf_scale,
+                    nullable_at(static_cast<size_t>(flat.nesting)));
+
+    // Wrap from the innermost ARRAY level out to the outermost one.
+    for (int32_t depth = flat.nesting; depth > 0; --depth) {
+        const bool level_nullable = nullable_at(static_cast<size_t>(depth - 1));
+        auto wrapped = DataType::Array(std::move(result));
+        if (!level_nullable) {
+            wrapped = wrapped.NotNull();
+        }
+        result = std::move(wrapped);
+    }
+    return result;
+}
+
+inline Result make_error(int32_t code, std::string msg) { return Result{code, std::move(msg)}; }
+
+/// Builds a Result with code ErrorCode::CLIENT_ERROR and the given message.
+inline Result make_client_error(std::string msg) {
+    Result client_error{ErrorCode::CLIENT_ERROR, std::move(msg)};
+    return client_error;
+}
+
+inline Result make_ok() { return Result{0, {}}; }
+
+inline Result from_ffi_result(const ffi::FfiResult& ffi_result) {
+    return Result{ffi_result.error_code, std::string(ffi_result.error_message)};
+}
+
+/// Reinterprets the integer stored in `r.ptr` as a `T*`.
+/// The non-null check is an assert, so it only fires in builds where
+/// assertions are enabled; callers should check `r.result` before calling.
+template <typename T>
+inline T* ptr_from_ffi(const ffi::FfiPtrResult& r) {
+    assert(r.ptr != 0 && "ptr_from_ffi: null pointer in FfiPtrResult");
+    T* typed = reinterpret_cast<T*>(r.ptr);
+    return typed;
+}
+
+/// Copies the database and table names of a TablePath into an FfiTablePath.
+inline ffi::FfiTablePath to_ffi_table_path(const TablePath& path) {
+    ffi::FfiTablePath converted;
+    converted.table_name = rust::String(path.table_name);
+    converted.database_name = rust::String(path.database_name);
+    return converted;
+}
+
+/// Copies every field of a Configuration into the FfiConfig struct passed to
+/// the Rust side. String fields are converted to rust::String; all other
+/// fields are assigned as-is. When a field is added to FfiConfig in the cxx
+/// bridge, it must be copied here as well or it keeps its default value.
+inline ffi::FfiConfig to_ffi_config(const Configuration& config) {
+    ffi::FfiConfig ffi_config;
+    ffi_config.bootstrap_servers = rust::String(config.bootstrap_servers);
+    ffi_config.writer_request_max_size = config.writer_request_max_size;
+    ffi_config.writer_acks = rust::String(config.writer_acks);
+    ffi_config.writer_retries = config.writer_retries;
+    ffi_config.writer_batch_size = config.writer_batch_size;
+    ffi_config.writer_dynamic_batch_size_enabled = config.writer_dynamic_batch_size_enabled;
+    ffi_config.writer_dynamic_batch_size_min = config.writer_dynamic_batch_size_min;
+    ffi_config.writer_bucket_no_key_assigner = rust::String(config.writer_bucket_no_key_assigner);
+    ffi_config.scanner_remote_log_prefetch_num = config.scanner_remote_log_prefetch_num;
+    ffi_config.remote_file_download_thread_num = config.remote_file_download_thread_num;
+    ffi_config.scanner_remote_log_read_concurrency = config.scanner_remote_log_read_concurrency;
+    ffi_config.scanner_log_max_poll_records = config.scanner_log_max_poll_records;
+    ffi_config.scanner_log_fetch_max_bytes = config.scanner_log_fetch_max_bytes;
+    ffi_config.scanner_log_fetch_min_bytes = config.scanner_log_fetch_min_bytes;
+    ffi_config.scanner_log_fetch_wait_max_time_ms = config.scanner_log_fetch_wait_max_time_ms;
+    ffi_config.scanner_log_fetch_max_bytes_for_bucket = config.scanner_log_fetch_max_bytes_for_bucket;
+    ffi_config.writer_batch_timeout_ms = config.writer_batch_timeout_ms;
+    ffi_config.writer_enable_idempotence = config.writer_enable_idempotence;
+    ffi_config.writer_max_inflight_requests_per_bucket =
+        config.writer_max_inflight_requests_per_bucket;
+    ffi_config.writer_buffer_memory_size = config.writer_buffer_memory_size;
+    ffi_config.writer_buffer_wait_timeout_ms = config.writer_buffer_wait_timeout_ms;
+    ffi_config.connect_timeout_ms = config.connect_timeout_ms;
+    ffi_config.security_protocol = rust::String(config.security_protocol);
+    ffi_config.security_sasl_mechanism = rust::String(config.security_sasl_mechanism);
+    ffi_config.security_sasl_username = rust::String(config.security_sasl_username);
+    ffi_config.security_sasl_password = rust::String(config.security_sasl_password);
+    ffi_config.lookup_queue_size = config.lookup_queue_size;
+    ffi_config.lookup_max_batch_size = config.lookup_max_batch_size;
+    ffi_config.lookup_batch_timeout_ms = config.lookup_batch_timeout_ms;
+    ffi_config.lookup_max_inflight_requests = config.lookup_max_inflight_requests;
+    ffi_config.lookup_max_retries = config.lookup_max_retries;
+    return ffi_config;
+}
+
+/// Converts a Column into its FfiColumn form.
+///
+/// The column's own type id, nullability, precision and scale are always
+/// copied. The array fields come from flatten_array_type(): `array_nesting`
+/// and `array_nullability` mirror the flat form, and the `element_*` fields
+/// carry the leaf scalar only when the flat form has `nesting > 0` and a
+/// non-zero leaf type; otherwise they are set to 0.
+inline ffi::FfiColumn to_ffi_column(const Column& col) {
+    const auto& type = col.data_type;
+
+    ffi::FfiColumn converted;
+    converted.name = rust::String(col.name);
+    converted.comment = rust::String(col.comment);
+    converted.data_type = static_cast<int32_t>(type.id());
+    converted.nullable = type.nullable();
+    converted.precision = type.precision();
+    converted.scale = type.scale();
+
+    const auto flat = flatten_array_type(type);
+    converted.array_nesting = flat.nesting;
+    for (const auto flag : flat.array_nullability) {
+        converted.array_nullability.push_back(flag);
+    }
+
+    const bool has_leaf = flat.nesting > 0 && flat.leaf_type != 0;
+    converted.element_data_type = has_leaf ? flat.leaf_type : 0;
+    converted.element_precision = has_leaf ? flat.leaf_precision : 0;
+    converted.element_scale = has_leaf ? flat.leaf_scale : 0;
+    return converted;
+}
+
+/// Converts a Schema into an FfiSchema: each column goes through
+/// to_ffi_column() and each primary-key name becomes a rust::String, both in
+/// their original order.
+inline ffi::FfiSchema to_ffi_schema(const Schema& schema) {
+    ffi::FfiSchema converted;
+
+    for (const auto& column : schema.columns) {
+        converted.columns.push_back(to_ffi_column(column));
+    }
+
+    for (const auto& key_name : schema.primary_keys) {
+        converted.primary_keys.push_back(rust::String(key_name));
+    }
+
+    return converted;
+}
+
+/// Converts a TableDescriptor into the FfiTableDescriptor passed to Rust.
+///
+/// Copies the schema (via to_ffi_schema), partition keys, bucket count, bucket
+/// keys, both property maps (as key/value entries) and the comment. Property
+/// entries are moved into the target vectors instead of copied, matching
+/// to_ffi_database_descriptor() and avoiding a second copy of each key/value
+/// string.
+inline ffi::FfiTableDescriptor to_ffi_table_descriptor(const TableDescriptor& desc) {
+    ffi::FfiTableDescriptor ffi_desc;
+
+    ffi_desc.schema = to_ffi_schema(desc.schema);
+
+    rust::Vec<rust::String> partition_keys;
+    for (const auto& pk : desc.partition_keys) {
+        partition_keys.push_back(rust::String(pk));
+    }
+    ffi_desc.partition_keys = std::move(partition_keys);
+
+    ffi_desc.bucket_count = desc.bucket_count;
+
+    rust::Vec<rust::String> bucket_keys;
+    for (const auto& bk : desc.bucket_keys) {
+        bucket_keys.push_back(rust::String(bk));
+    }
+    ffi_desc.bucket_keys = std::move(bucket_keys);
+
+    rust::Vec<ffi::HashMapValue> props;
+    for (const auto& [k, v] : desc.properties) {
+        ffi::HashMapValue prop;
+        prop.key = rust::String(k);
+        prop.value = rust::String(v);
+        // `prop` is not used again, so hand its strings over instead of copying.
+        props.push_back(std::move(prop));
+    }
+    ffi_desc.properties = std::move(props);
+
+    rust::Vec<ffi::HashMapValue> custom_props;
+    for (const auto& [k, v] : desc.custom_properties) {
+        ffi::HashMapValue prop;
+        prop.key = rust::String(k);
+        prop.value = rust::String(v);
+        custom_props.push_back(std::move(prop));
+    }
+    ffi_desc.custom_properties = std::move(custom_props);
+
+    ffi_desc.comment = rust::String(desc.comment);
+
+    return ffi_desc;
+}
+
+/// Converts an FfiColumn received over the bridge into a Column.
+///
+/// Non-ARRAY columns are built directly from the column's own type id,
+/// precision, scale and nullability. ARRAY columns are validated and then
+/// rebuilt with rebuild_array_type(); an `array_nesting` of 0 is treated as a
+/// single ARRAY level.
+///
+/// Throws std::runtime_error for an ARRAY column when, checked in this order:
+/// `element_data_type` is 0, `array_nesting` is negative, the leaf type is
+/// itself ARRAY, or the leaf type is not one of the supported scalar types.
+inline Column from_ffi_column(const ffi::FfiColumn& ffi_col) {
+    const auto type_id = static_cast<TypeId>(ffi_col.data_type);
+
+    // Scalar column: nothing to validate.
+    if (type_id != TypeId::Array) {
+        DataType scalar(type_id, ffi_col.precision, ffi_col.scale, ffi_col.nullable);
+        return Column{std::string(ffi_col.name), std::move(scalar), std::string(ffi_col.comment)};
+    }
+
+    // Builds the exception for a malformed ARRAY column; the message is only
+    // assembled when a check actually fails.
+    const auto malformed = [&ffi_col](const std::string& detail) {
+        return std::runtime_error("Malformed ARRAY column '" + std::string(ffi_col.name) +
+                                  "': " + detail);
+    };
+
+    const int32_t leaf = ffi_col.element_data_type;
+    if (leaf == 0) {
+        throw malformed("missing element_data_type");
+    }
+    if (ffi_col.array_nesting < 0) {
+        throw malformed("array_nesting must be non-negative");
+    }
+    if (leaf == static_cast<int32_t>(TypeId::Array)) {
+        throw malformed("leaf element_data_type cannot be ARRAY");
+    }
+
+    bool leaf_supported = false;
+    switch (static_cast<TypeId>(leaf)) {
+        case TypeId::Boolean:
+        case TypeId::TinyInt:
+        case TypeId::SmallInt:
+        case TypeId::Int:
+        case TypeId::BigInt:
+        case TypeId::Float:
+        case TypeId::Double:
+        case TypeId::String:
+        case TypeId::Bytes:
+        case TypeId::Date:
+        case TypeId::Time:
+        case TypeId::Timestamp:
+        case TypeId::TimestampLtz:
+        case TypeId::Decimal:
+        case TypeId::Char:
+        case TypeId::Binary:
+            leaf_supported = true;
+            break;
+        default:
+            break;
+    }
+    if (!leaf_supported) {
+        throw malformed("unsupported leaf element_data_type " + std::to_string(leaf));
+    }
+
+    FlattenedArrayType flat;
+    // A nesting of 0 on an ARRAY column is read as one ARRAY level.
+    flat.nesting = ffi_col.array_nesting > 0 ? ffi_col.array_nesting : 1;
+    flat.leaf_type = leaf;
+    flat.leaf_precision = ffi_col.element_precision;
+    flat.leaf_scale = ffi_col.element_scale;
+    for (const auto flag : ffi_col.array_nullability) {
+        flat.array_nullability.push_back(flag);
+    }
+
+    auto array_type = rebuild_array_type(flat);
+    return Column{std::string(ffi_col.name), std::move(array_type), std::string(ffi_col.comment)};
+}
+
+/// Converts an FfiSchema into a Schema: columns go through from_ffi_column()
+/// (whose exceptions propagate to the caller) and primary-key names are copied
+/// as std::string, both in their original order.
+inline Schema from_ffi_schema(const ffi::FfiSchema& ffi_schema) {
+    Schema converted;
+
+    for (const auto& ffi_column : ffi_schema.columns) {
+        converted.columns.push_back(from_ffi_column(ffi_column));
+    }
+
+    for (const auto& key_name : ffi_schema.primary_keys) {
+        converted.primary_keys.push_back(std::string(key_name));
+    }
+
+    return converted;
+}
+
+/// Converts an FfiTableInfo into a TableInfo, copying ids, path, timestamps,
+/// key lists, bucket/partition flags, both property maps, the comment and the
+/// schema (via from_ffi_schema, whose exceptions propagate to the caller).
+inline TableInfo from_ffi_table_info(const ffi::FfiTableInfo& ffi_info) {
+    TableInfo result;
+
+    // Identity and timestamps.
+    result.table_id = ffi_info.table_id;
+    result.schema_id = ffi_info.schema_id;
+    result.table_path = TablePath{std::string(ffi_info.table_path.database_name),
+                                  std::string(ffi_info.table_path.table_name)};
+    result.created_time = ffi_info.created_time;
+    result.modified_time = ffi_info.modified_time;
+
+    // Key lists.
+    for (const auto& name : ffi_info.primary_keys) {
+        result.primary_keys.push_back(std::string(name));
+    }
+    for (const auto& name : ffi_info.bucket_keys) {
+        result.bucket_keys.push_back(std::string(name));
+    }
+    for (const auto& name : ffi_info.partition_keys) {
+        result.partition_keys.push_back(std::string(name));
+    }
+
+    // Bucketing and partitioning flags.
+    result.num_buckets = ffi_info.num_buckets;
+    result.has_primary_key = ffi_info.has_primary_key;
+    result.is_partitioned = ffi_info.is_partitioned;
+
+    // Property maps; a later entry with the same key replaces an earlier one.
+    for (const auto& entry : ffi_info.properties) {
+        result.properties[std::string(entry.key)] = std::string(entry.value);
+    }
+    for (const auto& entry : ffi_info.custom_properties) {
+        result.custom_properties[std::string(entry.key)] = std::string(entry.value);
+    }
+
+    result.comment = std::string(ffi_info.comment);
+    result.schema = from_ffi_schema(ffi_info.schema);
+
+    return result;
+}
+
+/// Converts an FfiLakeSnapshot into a LakeSnapshot: copies the snapshot id and
+/// turns each FFI bucket offset into a BucketOffset, preserving order.
+inline LakeSnapshot from_ffi_lake_snapshot(const ffi::FfiLakeSnapshot& ffi_snapshot) {
+    LakeSnapshot converted;
+    converted.snapshot_id = ffi_snapshot.snapshot_id;
+
+    for (const auto& entry : ffi_snapshot.bucket_offsets) {
+        BucketOffset bucket{entry.table_id, entry.partition_id, entry.bucket_id, entry.offset};
+        converted.bucket_offsets.push_back(bucket);
+    }
+
+    return converted;
+}
+
+/// Converts a DatabaseDescriptor into an FfiDatabaseDescriptor: copies the
+/// comment and turns each property into a key/value entry.
+inline ffi::FfiDatabaseDescriptor to_ffi_database_descriptor(const DatabaseDescriptor& desc) {
+    ffi::FfiDatabaseDescriptor converted;
+    converted.comment = rust::String(desc.comment);
+
+    for (const auto& [name, value] : desc.properties) {
+        ffi::HashMapValue entry;
+        entry.key = rust::String(name);
+        entry.value = rust::String(value);
+        converted.properties.push_back(std::move(entry));
+    }
+
+    return converted;
+}
+
+/// Converts an FfiDatabaseInfo into a DatabaseInfo, copying the name, comment,
+/// timestamps and properties.
+inline DatabaseInfo from_ffi_database_info(const ffi::FfiDatabaseInfo& ffi_info) {
+    DatabaseInfo converted;
+    converted.database_name = std::string(ffi_info.database_name);
+    converted.comment = std::string(ffi_info.comment);
+    converted.created_time = ffi_info.created_time;
+    converted.modified_time = ffi_info.modified_time;
+
+    // A later entry with the same key replaces an earlier one.
+    for (const auto& entry : ffi_info.properties) {
+        converted.properties[std::string(entry.key)] = std::string(entry.value);
+    }
+
+    return converted;
+}
+
+}  // namespace utils
+}  // namespace fluss
diff --git a/fluss-rust/bindings/cpp/src/lib.rs b/fluss-rust/bindings/cpp/src/lib.rs
new file mode 100644
index 0000000000..ed575244f6
--- /dev/null
+++ b/fluss-rust/bindings/cpp/src/lib.rs
@@ -0,0 +1,3650 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+mod types;
+
+use std::str::FromStr;
+use std::sync::{Arc, LazyLock};
+use std::time::Duration;
+
+use fluss as fcore;
+use fluss::PartitionId;
+use fluss::error::Error;
+use fluss::rpc::FlussError as CoreFlussError;
+
+/// Process-wide Tokio runtime, built on first access.
+///
+/// Uses the multi-threaded scheduler with all Tokio drivers enabled
+/// (`enable_all`). Building the runtime is not expected to fail; if it does,
+/// the `unwrap` panics on the first access.
+static RUNTIME: LazyLock<tokio::runtime::Runtime> = LazyLock::new(|| {
+    tokio::runtime::Builder::new_multi_thread()
+        .enable_all()
+        .build()
+        .unwrap()
+});
+
+#[cxx::bridge(namespace = "fluss::ffi")]
+mod ffi {
+    struct HashMapValue {
+        key: String,
+        value: String,
+    }
+
+    struct FfiConfig {
+        bootstrap_servers: String,
+        writer_request_max_size: i32,
+        writer_acks: String,
+        writer_retries: i32,
+        writer_batch_size: i32,
+        writer_dynamic_batch_size_enabled: bool,
+        writer_dynamic_batch_size_min: i32,
+        writer_bucket_no_key_assigner: String,
+        scanner_remote_log_prefetch_num: usize,
+        remote_file_download_thread_num: usize,
+        scanner_remote_log_read_concurrency: usize,
+        scanner_log_max_poll_records: usize,
+        scanner_log_fetch_max_bytes: i32,
+        scanner_log_fetch_min_bytes: i32,
+        scanner_log_fetch_wait_max_time_ms: i32,
+        scanner_log_fetch_max_bytes_for_bucket: i32,
+        writer_batch_timeout_ms: i64,
+        writer_enable_idempotence: bool,
+        writer_max_inflight_requests_per_bucket: usize,
+        writer_buffer_memory_size: usize,
+        writer_buffer_wait_timeout_ms: u64,
+        connect_timeout_ms: u64,
+        security_protocol: String,
+        security_sasl_mechanism: String,
+        security_sasl_username: String,
+        security_sasl_password: String,
+        lookup_queue_size: usize,
+        lookup_max_batch_size: usize,
+        lookup_batch_timeout_ms: u64,
+        lookup_max_inflight_requests: usize,
+        lookup_max_retries: i32,
+    }
+
+    struct FfiResult {
+        error_code: i32,
+        error_message: String,
+    }
+
+    struct FfiTablePath {
+        database_name: String,
+        table_name: String,
+    }
+
+    struct FfiColumn {
+        name: String,
+        data_type: i32,
+        nullable: bool,
+        comment: String,
+        precision: i32,
+        scale: i32,
+        array_nesting: i32,
+        array_nullability: Vec<u8>,
+        element_data_type: i32,
+        element_precision: i32,
+        element_scale: i32,
+    }
+
+    struct FfiSchema {
+        columns: Vec<FfiColumn>,
+        primary_keys: Vec<String>,
+    }
+
+    struct FfiTableDescriptor {
+        schema: FfiSchema,
+        partition_keys: Vec<String>,
+        bucket_count: i32,
+        bucket_keys: Vec<String>,
+        properties: Vec<HashMapValue>,
+        custom_properties: Vec<HashMapValue>,
+        comment: String,
+    }
+
+    struct FfiTableInfo {
+        table_id: i64,
+        schema_id: i32,
+        table_path: FfiTablePath,
+        created_time: i64,
+        modified_time: i64,
+        primary_keys: Vec<String>,
+        bucket_keys: Vec<String>,
+        partition_keys: Vec<String>,
+        num_buckets: i32,
+        has_primary_key: bool,
+        is_partitioned: bool,
+        properties: Vec<HashMapValue>,
+        custom_properties: Vec<HashMapValue>,
+        comment: String,
+        schema: FfiSchema,
+    }
+
+    struct FfiTableInfoResult {
+        result: FfiResult,
+        table_info: FfiTableInfo,
+    }
+
+    // NOTE: FfiDatum, FfiGenericRow, FfiScanRecord, FfiScanRecords, FfiScanRecordsResult
+    // have been replaced by opaque types below (ScanResultInner, GenericRowInner, LookupResultInner).
+
+    struct FfiArrowRecordBatch {
+        array_ptr: usize,
+        schema_ptr: usize,
+        table_id: i64,
+        partition_id: i64,
+        bucket_id: i32,
+        base_offset: i64,
+    }
+
+    struct FfiArrowRecordBatches {
+        batches: Vec<FfiArrowRecordBatch>,
+    }
+
+    struct FfiArrowRecordBatchesResult {
+        result: FfiResult,
+        arrow_batches: FfiArrowRecordBatches,
+    }
+
+    struct FfiLakeSnapshot {
+        snapshot_id: i64,
+        bucket_offsets: Vec<FfiBucketOffset>,
+    }
+
+    struct FfiBucketOffset {
+        table_id: i64,
+        partition_id: i64,
+        bucket_id: i32,
+        offset: i64,
+    }
+
+    struct FfiOffsetQuery {
+        offset_type: i32,
+        timestamp: i64,
+    }
+
+    struct FfiBucketInfo {
+        table_id: i64,
+        bucket_id: i32,
+        has_partition_id: bool,
+        partition_id: i64,
+        record_count: usize,
+    }
+
+    struct FfiBucketSubscription {
+        bucket_id: i32,
+        offset: i64,
+    }
+
+    struct FfiPartitionBucketSubscription {
+        partition_id: i64,
+        bucket_id: i32,
+        offset: i64,
+    }
+
+    struct FfiBucketOffsetPair {
+        bucket_id: i32,
+        offset: i64,
+    }
+
+    struct FfiListOffsetsResult {
+        result: FfiResult,
+        bucket_offsets: Vec<FfiBucketOffsetPair>,
+    }
+
+    // NOTE: FfiLookupResult replaced by opaque LookupResultInner below.
+
+    struct FfiLakeSnapshotResult {
+        result: FfiResult,
+        lake_snapshot: FfiLakeSnapshot,
+    }
+
+    struct FfiPartitionKeyValue {
+        key: String,
+        value: String,
+    }
+
+    struct FfiPartitionInfo {
+        partition_id: i64,
+        partition_name: String,
+    }
+
+    struct FfiListPartitionInfosResult {
+        result: FfiResult,
+        partition_infos: Vec<FfiPartitionInfo>,
+    }
+
+    struct FfiDatabaseDescriptor {
+        comment: String,
+        properties: Vec<HashMapValue>,
+    }
+
+    struct FfiDatabaseInfo {
+        database_name: String,
+        comment: String,
+        properties: Vec<HashMapValue>,
+        created_time: i64,
+        modified_time: i64,
+    }
+
+    struct FfiDatabaseInfoResult {
+        result: FfiResult,
+        database_info: FfiDatabaseInfo,
+    }
+
+    struct FfiListDatabasesResult {
+        result: FfiResult,
+        database_names: Vec<String>,
+    }
+
+    struct FfiListTablesResult {
+        result: FfiResult,
+        table_names: Vec<String>,
+    }
+
+    struct FfiBoolResult {
+        result: FfiResult,
+        value: bool,
+    }
+
+    struct FfiServerNode {
+        node_id: i32,
+        host: String,
+        port: u32,
+        server_type: String,
+        uid: String,
+    }
+
+    struct FfiServerNodesResult {
+        result: FfiResult,
+        server_nodes: Vec<FfiServerNode>,
+    }
+
+    struct FfiPtrResult {
+        result: FfiResult,
+        ptr: usize,
+    }
+
+    extern "Rust" {
+        type Connection;
+        type Admin;
+        type Table;
+        type AppendWriter;
+        type WriteResult;
+        type LogScanner;
+        type UpsertWriter;
+        type Lookuper;
+
+        // Opaque types for optimized FFI
+        type ScanResultInner;
+        type GenericRowInner;
+        type LookupResultInner;
+        type ArrayWriterInner;
+        type ArrayViewInner;
+
+        // Connection
+        fn new_connection(config: &FfiConfig) -> FfiPtrResult;
+        unsafe fn delete_connection(conn: *mut Connection);
+        fn get_admin(self: &Connection) -> FfiPtrResult;
+        fn get_table(self: &Connection, table_path: &FfiTablePath) -> FfiPtrResult;
+
+        // Admin
+        unsafe fn delete_admin(admin: *mut Admin);
+        fn create_table(
+            self: &Admin,
+            table_path: &FfiTablePath,
+            descriptor: &FfiTableDescriptor,
+            ignore_if_exists: bool,
+        ) -> FfiResult;
+        fn drop_table(
+            self: &Admin,
+            table_path: &FfiTablePath,
+            ignore_if_not_exists: bool,
+        ) -> FfiResult;
+        fn get_table_info(self: &Admin, table_path: &FfiTablePath) -> FfiTableInfoResult;
+        fn get_latest_lake_snapshot(
+            self: &Admin,
+            table_path: &FfiTablePath,
+        ) -> FfiLakeSnapshotResult;
+        fn list_offsets(
+            self: &Admin,
+            table_path: &FfiTablePath,
+            bucket_ids: Vec<i32>,
+            offset_query: &FfiOffsetQuery,
+        ) -> FfiListOffsetsResult;
+        fn list_partition_offsets(
+            self: &Admin,
+            table_path: &FfiTablePath,
+            partition_name: String,
+            bucket_ids: Vec<i32>,
+            offset_query: &FfiOffsetQuery,
+        ) -> FfiListOffsetsResult;
+        fn list_partition_infos(
+            self: &Admin,
+            table_path: &FfiTablePath,
+        ) -> FfiListPartitionInfosResult;
+        fn list_partition_infos_with_spec(
+            self: &Admin,
+            table_path: &FfiTablePath,
+            partition_spec: Vec<FfiPartitionKeyValue>,
+        ) -> FfiListPartitionInfosResult;
+        fn create_partition(
+            self: &Admin,
+            table_path: &FfiTablePath,
+            partition_spec: Vec<FfiPartitionKeyValue>,
+            ignore_if_exists: bool,
+        ) -> FfiResult;
+        fn drop_partition(
+            self: &Admin,
+            table_path: &FfiTablePath,
+            partition_spec: Vec<FfiPartitionKeyValue>,
+            ignore_if_not_exists: bool,
+        ) -> FfiResult;
+        fn create_database(
+            self: &Admin,
+            database_name: &str,
+            descriptor: &FfiDatabaseDescriptor,
+            ignore_if_exists: bool,
+        ) -> FfiResult;
+        fn drop_database(
+            self: &Admin,
+            database_name: &str,
+            ignore_if_not_exists: bool,
+            cascade: bool,
+        ) -> FfiResult;
+        fn list_databases(self: &Admin) -> FfiListDatabasesResult;
+        fn database_exists(self: &Admin, database_name: &str) -> FfiBoolResult;
+        fn get_database_info(self: &Admin, database_name: &str) -> FfiDatabaseInfoResult;
+        fn list_tables(self: &Admin, database_name: &str) -> FfiListTablesResult;
+        fn table_exists(self: &Admin, table_path: &FfiTablePath) -> FfiBoolResult;
+        fn get_server_nodes(self: &Admin) -> FfiServerNodesResult;
+
+        // Table
+        unsafe fn delete_table(table: *mut Table);
+        fn new_append_writer(self: &Table) -> FfiPtrResult;
+        fn create_scanner(self: &Table, column_indices: Vec<usize>, batch: bool) -> FfiPtrResult;
+        fn get_table_info_from_table(self: &Table) -> FfiTableInfo;
+        fn get_table_path(self: &Table) -> FfiTablePath;
+        fn has_primary_key(self: &Table) -> bool;
+        fn create_upsert_writer(self: &Table, column_indices: Vec<usize>) -> FfiPtrResult;
+        fn new_lookuper(self: &Table) -> FfiPtrResult;
+
+        // GenericRowInner — opaque row for writes
+        fn new_generic_row(field_count: usize) -> Box<GenericRowInner>;
+        fn gr_reset(self: &mut GenericRowInner);
+        fn gr_set_null(self: &mut GenericRowInner, idx: usize);
+        fn gr_set_bool(self: &mut GenericRowInner, idx: usize, val: bool);
+        fn gr_set_i32(self: &mut GenericRowInner, idx: usize, val: i32);
+        fn gr_set_i64(self: &mut GenericRowInner, idx: usize, val: i64);
+        fn gr_set_f32(self: &mut GenericRowInner, idx: usize, val: f32);
+        fn gr_set_f64(self: &mut GenericRowInner, idx: usize, val: f64);
+        fn gr_set_str(self: &mut GenericRowInner, idx: usize, val: &str);
+        fn gr_set_bytes(self: &mut GenericRowInner, idx: usize, val: &[u8]);
+        fn gr_set_date(self: &mut GenericRowInner, idx: usize, days: i32);
+        fn gr_set_time(self: &mut GenericRowInner, idx: usize, millis: i32);
+        fn gr_set_ts_ntz(self: &mut GenericRowInner, idx: usize, millis: i64, nanos: i32);
+        fn gr_set_ts_ltz(self: &mut GenericRowInner, idx: usize, millis: i64, nanos: i32);
+        fn gr_set_decimal_str(self: &mut GenericRowInner, idx: usize, val: &str);
+        fn gr_set_array(
+            self: &mut GenericRowInner,
+            idx: usize,
+            writer: &mut ArrayWriterInner,
+        ) -> Result<()>;
+
+        // ArrayWriterInner — opaque array builder for writes
+        fn new_array_writer(
+            size: usize,
+            element_leaf_type_id: i32,
+            precision: u32,
+            scale: u32,
+            array_nesting: u32,
+        ) -> Result<Box<ArrayWriterInner>>;
+        fn aw_size(self: &ArrayWriterInner) -> usize;
+        fn aw_set_null(self: &mut ArrayWriterInner, idx: usize) -> Result<()>;
+        fn aw_set_bool(self: &mut ArrayWriterInner, idx: usize, val: bool) -> Result<()>;
+        fn aw_set_i32(self: &mut ArrayWriterInner, idx: usize, val: i32) -> Result<()>;
+        fn aw_set_i64(self: &mut ArrayWriterInner, idx: usize, val: i64) -> Result<()>;
+        fn aw_set_f32(self: &mut ArrayWriterInner, idx: usize, val: f32) -> Result<()>;
+        fn aw_set_f64(self: &mut ArrayWriterInner, idx: usize, val: f64) -> Result<()>;
+        fn aw_set_str(self: &mut ArrayWriterInner, idx: usize, val: &str) -> Result<()>;
+        fn aw_set_bytes(self: &mut ArrayWriterInner, idx: usize, val: &[u8]) -> Result<()>;
+        fn aw_set_date(self: &mut ArrayWriterInner, idx: usize, days: i32) -> Result<()>;
+        fn aw_set_time(self: &mut ArrayWriterInner, idx: usize, millis: i32) -> Result<()>;
+        fn aw_set_ts_ntz(
+            self: &mut ArrayWriterInner,
+            idx: usize,
+            millis: i64,
+            nanos: i32,
+        ) -> Result<()>;
+        fn aw_set_ts_ltz(
+            self: &mut ArrayWriterInner,
+            idx: usize,
+            millis: i64,
+            nanos: i32,
+        ) -> Result<()>;
+        fn aw_set_decimal_str(self: &mut ArrayWriterInner, idx: usize, val: &str) -> Result<()>;
+        fn aw_set_array(
+            self: &mut ArrayWriterInner,
+            idx: usize,
+            nested: &mut ArrayWriterInner,
+        ) -> Result<()>;
+
+        // AppendWriter
+        unsafe fn delete_append_writer(writer: *mut AppendWriter);
+        fn append(self: &mut AppendWriter, row: &GenericRowInner) -> FfiPtrResult;
+        fn append_arrow_batch(
+            self: &mut AppendWriter,
+            array_ptr: usize,
+            schema_ptr: usize,
+        ) -> FfiPtrResult;
+        fn flush(self: &mut AppendWriter) -> FfiResult;
+
+        // WriteResult
+        unsafe fn delete_write_result(wr: *mut WriteResult);
+        fn wait(self: &mut WriteResult) -> FfiResult;
+
+        // UpsertWriter
+        unsafe fn delete_upsert_writer(writer: *mut UpsertWriter);
+        fn upsert(self: &mut UpsertWriter, row: &GenericRowInner) -> FfiPtrResult;
+        fn delete_row(self: &mut UpsertWriter, row: &GenericRowInner) -> FfiPtrResult;
+        fn upsert_flush(self: &mut UpsertWriter) -> FfiResult;
+
+        // Lookuper
+        unsafe fn delete_lookuper(lookuper: *mut Lookuper);
+        fn lookup(self: &mut Lookuper, pk_row: &GenericRowInner) -> Box<LookupResultInner>;
+
+        // LookupResultInner accessors
+        fn lv_has_error(self: &LookupResultInner) -> bool;
+        fn lv_error_code(self: &LookupResultInner) -> i32;
+        fn lv_error_message(self: &LookupResultInner) -> &str;
+        fn lv_found(self: &LookupResultInner) -> bool;
+        fn lv_field_count(self: &LookupResultInner) -> usize;
+        fn lv_column_name(self: &LookupResultInner, field: usize) -> Result<&str>;
+        fn lv_column_type(self: &LookupResultInner, field: usize) -> Result<i32>;
+        fn lv_is_null(self: &LookupResultInner, field: usize) -> Result<bool>;
+        fn lv_get_bool(self: &LookupResultInner, field: usize) -> Result<bool>;
+        fn lv_get_i32(self: &LookupResultInner, field: usize) -> Result<i32>;
+        fn lv_get_i64(self: &LookupResultInner, field: usize) -> Result<i64>;
+        fn lv_get_f32(self: &LookupResultInner, field: usize) -> Result<f32>;
+        fn lv_get_f64(self: &LookupResultInner, field: usize) -> Result<f64>;
+        fn lv_get_str(self: &LookupResultInner, field: usize) -> Result<&str>;
+        fn lv_get_bytes(self: &LookupResultInner, field: usize) -> Result<&[u8]>;
+        fn lv_get_date_days(self: &LookupResultInner, field: usize) -> Result<i32>;
+        fn lv_get_time_millis(self: &LookupResultInner, field: usize) -> Result<i32>;
+        fn lv_get_ts_millis(self: &LookupResultInner, field: usize) -> Result<i64>;
+        fn lv_get_ts_nanos(self: &LookupResultInner, field: usize) -> Result<i32>;
+        fn lv_is_ts_ltz(self: &LookupResultInner, field: usize) -> Result<bool>;
+        fn lv_get_decimal_str(self: &LookupResultInner, field: usize) -> Result<String>;
+
+        fn lv_get_array_size(self: &LookupResultInner, field: usize) -> Result<usize>;
+        fn lv_get_array_is_null(
+            self: &LookupResultInner,
+            field: usize,
+            element: usize,
+        ) -> Result<bool>;
+        fn lv_get_array_bool(
+            self: &LookupResultInner,
+            field: usize,
+            element: usize,
+        ) -> Result<bool>;
+        fn lv_get_array_i32(self: &LookupResultInner, field: usize, element: usize) -> Result<i32>;
+        fn lv_get_array_i64(self: &LookupResultInner, field: usize, element: usize) -> Result<i64>;
+        fn lv_get_array_f32(self: &LookupResultInner, field: usize, element: usize) -> Result<f32>;
+        fn lv_get_array_f64(self: &LookupResultInner, field: usize, element: usize) -> Result<f64>;
+        fn lv_get_array_str(
+            self: &LookupResultInner,
+            field: usize,
+            element: usize,
+        ) -> Result<String>;
+        fn lv_get_array_bytes(
+            self: &LookupResultInner,
+            field: usize,
+            element: usize,
+        ) -> Result<Vec<u8>>;
+        fn lv_get_array_date_days(
+            self: &LookupResultInner,
+            field: usize,
+            element: usize,
+        ) -> Result<i32>;
+        fn lv_get_array_time_millis(
+            self: &LookupResultInner,
+            field: usize,
+            element: usize,
+        ) -> Result<i32>;
+        fn lv_get_array_ts_millis(
+            self: &LookupResultInner,
+            field: usize,
+            element: usize,
+        ) -> Result<i64>;
+        fn lv_get_array_ts_nanos(
+            self: &LookupResultInner,
+            field: usize,
+            element: usize,
+        ) -> Result<i32>;
+        fn lv_get_array_decimal_str(
+            self: &LookupResultInner,
+            field: usize,
+            element: usize,
+        ) -> Result<String>;
+        fn lv_get_array_element_type(self: &LookupResultInner, field: usize) -> Result<i32>;
+        fn lv_get_array_view(self: &LookupResultInner, field: usize)
+        -> Result<Box<ArrayViewInner>>;
+
+        // ArrayViewInner — opaque recursive array reader for C++ bindings
+        fn av_size(self: &ArrayViewInner) -> usize;
+        fn av_element_type_id(self: &ArrayViewInner) -> i32;
+        fn av_is_null(self: &ArrayViewInner, element: usize) -> Result<bool>;
+        fn av_get_bool(self: &ArrayViewInner, element: usize) -> Result<bool>;
+        fn av_get_i32(self: &ArrayViewInner, element: usize) -> Result<i32>;
+        fn av_get_i64(self: &ArrayViewInner, element: usize) -> Result<i64>;
+        fn av_get_f32(self: &ArrayViewInner, element: usize) -> Result<f32>;
+        fn av_get_f64(self: &ArrayViewInner, element: usize) -> Result<f64>;
+        fn av_get_str(self: &ArrayViewInner, element: usize) -> Result<String>;
+        fn av_get_bytes(self: &ArrayViewInner, element: usize) -> Result<Vec<u8>>;
+        fn av_get_date_days(self: &ArrayViewInner, element: usize) -> Result<i32>;
+        fn av_get_time_millis(self: &ArrayViewInner, element: usize) -> Result<i32>;
+        fn av_get_ts_millis(self: &ArrayViewInner, element: usize) -> Result<i64>;
+        fn av_get_ts_nanos(self: &ArrayViewInner, element: usize) -> Result<i32>;
+        fn av_get_decimal_str(self: &ArrayViewInner, element: usize) -> Result<String>;
+        fn av_get_nested(self: &ArrayViewInner, element: usize) -> Result<Box<ArrayViewInner>>;
+
+        // LogScanner
+        unsafe fn delete_log_scanner(scanner: *mut LogScanner);
+        fn subscribe(self: &LogScanner, bucket_id: i32, start_offset: i64) -> FfiResult;
+        fn subscribe_buckets(
+            self: &LogScanner,
+            subscriptions: Vec<FfiBucketSubscription>,
+        ) -> FfiResult;
+        fn subscribe_partition(
+            self: &LogScanner,
+            partition_id: i64,
+            bucket_id: i32,
+            start_offset: i64,
+        ) -> FfiResult;
+        fn subscribe_partition_buckets(
+            self: &LogScanner,
+            subscriptions: Vec<FfiPartitionBucketSubscription>,
+        ) -> FfiResult;
+        fn unsubscribe(self: &LogScanner, bucket_id: i32) -> FfiResult;
+        fn unsubscribe_partition(self: &LogScanner, partition_id: i64, bucket_id: i32)
+        -> FfiResult;
+        fn poll(self: &LogScanner, timeout_ms: i64) -> Box<ScanResultInner>;
+        fn poll_record_batch(self: &LogScanner, timeout_ms: i64) -> FfiArrowRecordBatchesResult;
+        fn free_arrow_ffi_structures(array_ptr: usize, schema_ptr: usize);
+
+        // ScanResultInner accessors
+        fn sv_has_error(self: &ScanResultInner) -> bool;
+        fn sv_error_code(self: &ScanResultInner) -> i32;
+        fn sv_error_message(self: &ScanResultInner) -> &str;
+        fn sv_record_count(self: &ScanResultInner) -> usize;
+        fn sv_column_count(self: &ScanResultInner) -> usize;
+        fn sv_column_name(self: &ScanResultInner, field: usize) -> Result<&str>;
+        fn sv_column_type(self: &ScanResultInner, field: usize) -> Result<i32>;
+        fn sv_offset(self: &ScanResultInner, bucket: usize, rec: usize) -> i64;
+        fn sv_timestamp(self: &ScanResultInner, bucket: usize, rec: usize) -> i64;
+        fn sv_change_type(self: &ScanResultInner, bucket: usize, rec: usize) -> i32;
+        fn sv_field_count(self: &ScanResultInner) -> usize;
+        fn sv_is_null(
+            self: &ScanResultInner,
+            bucket: usize,
+            rec: usize,
+            field: usize,
+        ) -> Result<bool>;
+        fn sv_get_bool(
+            self: &ScanResultInner,
+            bucket: usize,
+            rec: usize,
+            field: usize,
+        ) -> Result<bool>;
+        fn sv_get_i32(
+            self: &ScanResultInner,
+            bucket: usize,
+            rec: usize,
+            field: usize,
+        ) -> Result<i32>;
+        fn sv_get_i64(
+            self: &ScanResultInner,
+            bucket: usize,
+            rec: usize,
+            field: usize,
+        ) -> Result<i64>;
+        fn sv_get_f32(
+            self: &ScanResultInner,
+            bucket: usize,
+            rec: usize,
+            field: usize,
+        ) -> Result<f32>;
+        fn sv_get_f64(
+            self: &ScanResultInner,
+            bucket: usize,
+            rec: usize,
+            field: usize,
+        ) -> Result<f64>;
+        fn sv_get_str(
+            self: &ScanResultInner,
+            bucket: usize,
+            rec: usize,
+            field: usize,
+        ) -> Result<&str>;
+        fn sv_get_bytes(
+            self: &ScanResultInner,
+            bucket: usize,
+            rec: usize,
+            field: usize,
+        ) -> Result<&[u8]>;
+        fn sv_get_date_days(
+            self: &ScanResultInner,
+            bucket: usize,
+            rec: usize,
+            field: usize,
+        ) -> Result<i32>;
+        fn sv_get_time_millis(
+            self: &ScanResultInner,
+            bucket: usize,
+            rec: usize,
+            field: usize,
+        ) -> Result<i32>;
+        fn sv_get_ts_millis(
+            self: &ScanResultInner,
+            bucket: usize,
+            rec: usize,
+            field: usize,
+        ) -> Result<i64>;
+        fn sv_get_ts_nanos(
+            self: &ScanResultInner,
+            bucket: usize,
+            rec: usize,
+            field: usize,
+        ) -> Result<i32>;
+        fn sv_is_ts_ltz(
+            self: &ScanResultInner,
+            bucket: usize,
+            rec: usize,
+            field: usize,
+        ) -> Result<bool>;
+        fn sv_get_decimal_str(
+            self: &ScanResultInner,
+            bucket: usize,
+            rec: usize,
+            field: usize,
+        ) -> Result<String>;
+
+        fn sv_get_array_size(
+            self: &ScanResultInner,
+            bucket: usize,
+            rec: usize,
+            field: usize,
+        ) -> Result<usize>;
+        fn sv_get_array_is_null(
+            self: &ScanResultInner,
+            bucket: usize,
+            rec: usize,
+            field: usize,
+            element: usize,
+        ) -> Result<bool>;
+        fn sv_get_array_bool(
+            self: &ScanResultInner,
+            bucket: usize,
+            rec: usize,
+            field: usize,
+            element: usize,
+        ) -> Result<bool>;
+        fn sv_get_array_i32(
+            self: &ScanResultInner,
+            bucket: usize,
+            rec: usize,
+            field: usize,
+            element: usize,
+        ) -> Result<i32>;
+        fn sv_get_array_i64(
+            self: &ScanResultInner,
+            bucket: usize,
+            rec: usize,
+            field: usize,
+            element: usize,
+        ) -> Result<i64>;
+        fn sv_get_array_f32(
+            self: &ScanResultInner,
+            bucket: usize,
+            rec: usize,
+            field: usize,
+            element: usize,
+        ) -> Result<f32>;
+        fn sv_get_array_f64(
+            self: &ScanResultInner,
+            bucket: usize,
+            rec: usize,
+            field: usize,
+            element: usize,
+        ) -> Result<f64>;
+        fn sv_get_array_str(
+            self: &ScanResultInner,
+            bucket: usize,
+            rec: usize,
+            field: usize,
+            element: usize,
+        ) -> Result<String>;
+        fn sv_get_array_bytes(
+            self: &ScanResultInner,
+            bucket: usize,
+            rec: usize,
+            field: usize,
+            element: usize,
+        ) -> Result<Vec<u8>>;
+        fn sv_get_array_date_days(
+            self: &ScanResultInner,
+            bucket: usize,
+            rec: usize,
+            field: usize,
+            element: usize,
+        ) -> Result<i32>;
+        fn sv_get_array_time_millis(
+            self: &ScanResultInner,
+            bucket: usize,
+            rec: usize,
+            field: usize,
+            element: usize,
+        ) -> Result<i32>;
+        fn sv_get_array_ts_millis(
+            self: &ScanResultInner,
+            bucket: usize,
+            rec: usize,
+            field: usize,
+            element: usize,
+        ) -> Result<i64>;
+        fn sv_get_array_ts_nanos(
+            self: &ScanResultInner,
+            bucket: usize,
+            rec: usize,
+            field: usize,
+            element: usize,
+        ) -> Result<i32>;
+        fn sv_get_array_decimal_str(
+            self: &ScanResultInner,
+            bucket: usize,
+            rec: usize,
+            field: usize,
+            element: usize,
+        ) -> Result<String>;
+        fn sv_get_array_element_type(self: &ScanResultInner, field: usize) -> Result<i32>;
+        fn sv_get_array_view(
+            self: &ScanResultInner,
+            bucket: usize,
+            rec: usize,
+            field: usize,
+        ) -> Result<Box<ArrayViewInner>>;
+
+        fn sv_bucket_infos(self: &ScanResultInner) -> &Vec<FfiBucketInfo>;
+    }
+}
+
+/// Connection handle: a shared (`Arc`) reference to the core `FlussConnection`.
+///
+/// `new_connection` boxes an instance and hands out its raw address;
+/// `delete_connection` reclaims it.
+pub struct Connection {
+    inner: Arc<fcore::client::FlussConnection>,
+}
+
+/// Admin handle: a shared (`Arc`) reference to the core `FlussAdmin`.
+///
+/// Created by `Connection::get_admin` and released by `delete_admin`.
+pub struct Admin {
+    inner: Arc<fcore::client::FlussAdmin>,
+}
+
+/// Table handle populated by `Connection::get_table` from the core table it resolves.
+pub struct Table {
+    /// Shared reference to the connection the table was obtained from.
+    connection: Arc<fcore::client::FlussConnection>,
+    /// Clone of the core table's metadata handle.
+    metadata: Arc<fcore::client::Metadata>,
+    /// Clone of the core table's `TableInfo`.
+    table_info: fcore::metadata::TableInfo,
+    /// Clone of the core table's path.
+    table_path: fcore::metadata::TablePath,
+    /// Whether the core table reported a primary key.
+    has_pk: bool,
+}
+
+/// Append writer handle: the core `AppendWriter` paired with the table's `TableInfo`.
+pub struct AppendWriter {
+    inner: fcore::client::AppendWriter,
+    // NOTE(review): presumably consulted when converting FFI rows for this table;
+    // confirm against the `append` implementation.
+    table_info: fcore::metadata::TableInfo,
+}
+
+/// Write result handle holding an optional core `WriteResultFuture`.
+pub struct WriteResult {
+    // NOTE(review): the `Option` presumably lets `wait` take the future exactly once;
+    // confirm against the `wait` implementation.
+    inner: Option<fcore::client::WriteResultFuture>,
+}
+
+/// The two core scanner flavours a `LogScanner` handle can wrap.
+enum ScannerKind {
+    /// Wraps the core `LogScanner`.
+    Record(fcore::client::LogScanner),
+    /// Wraps the core `RecordBatchLogScanner`.
+    Batch(fcore::client::RecordBatchLogScanner),
+}
+
+/// Log scanner handle: one of the core scanner kinds plus the columns it exposes.
+pub struct LogScanner {
+    /// The wrapped core scanner (record-based or batch-based).
+    scanner: ScannerKind,
+    /// Fluss columns matching the projected Arrow fields (1:1 by index).
+    /// For non-projected scanners this is the full table schema columns.
+    projected_columns: Vec<fcore::metadata::Column>,
+}
+
+/// Upsert writer handle: the core `UpsertWriter` paired with the table's `TableInfo`.
+pub struct UpsertWriter {
+    inner: fcore::client::UpsertWriter,
+    // NOTE(review): presumably consulted when converting FFI rows for this table;
+    // confirm against the `upsert` / `delete_row` implementations.
+    table_info: fcore::metadata::TableInfo,
+}
+
+/// Lookup handle: the core `Lookuper` paired with the table's `TableInfo`.
+pub struct Lookuper {
+    inner: fcore::client::Lookuper,
+    // NOTE(review): presumably consulted when converting the primary-key row and the
+    // looked-up result; confirm against the `lookup` implementation.
+    table_info: fcore::metadata::TableInfo,
+}
+
+/// Error code for client-side errors that did not originate from the server API protocol.
+/// It must be non-zero so that the C++ `Result::Ok()` check (`error_code == 0`) correctly
+/// reports client-side errors as failures. The value -2 lies outside the server API error
+/// code range (-1 .. 57+), so it will never collide with current or future API codes.
+const CLIENT_ERROR_CODE: i32 = -2;
+
+fn ok_result() -> ffi::FfiResult {
+    ffi::FfiResult {
+        error_code: 0,
+        error_message: String::new(),
+    }
+}
+
+/// Builds an `FfiResult` carrying `code` and `msg` verbatim.
+fn err_result(code: i32, msg: String) -> ffi::FfiResult {
+    let (error_code, error_message) = (code, msg);
+    ffi::FfiResult {
+        error_code,
+        error_message,
+    }
+}
+
+/// Create a client-side error result (not from server API).
+fn client_err(msg: String) -> ffi::FfiResult {
+    err_result(CLIENT_ERROR_CODE, msg)
+}
+
+fn err_from_core_error(e: &Error) -> ffi::FfiResult {
+    // Transport failures map to `NetworkException` (Java parity,
+    // retriable).
+    match e {
+        Error::FlussAPIError { api_error } => err_result(api_error.code, api_error.message.clone()),
+        Error::RpcError { .. } => {
+            err_result(CoreFlussError::NetworkException.code(), e.to_string())
+        }
+        _ => client_err(e.to_string()),
+    }
+}
+
+/// Pairs a success result with the given pointer value.
+fn ok_ptr(ptr: usize) -> ffi::FfiPtrResult {
+    let result = ok_result();
+    ffi::FfiPtrResult { result, ptr }
+}
+
+fn client_err_ptr(msg: String) -> ffi::FfiPtrResult {
+    ffi::FfiPtrResult {
+        result: client_err(msg),
+        ptr: 0usize,
+    }
+}
+
+fn err_ptr_from_core(e: &fcore::error::Error) -> ffi::FfiPtrResult {
+    ffi::FfiPtrResult {
+        result: err_from_core_error(e),
+        ptr: 0usize,
+    }
+}
+
+// Connection implementation
+/// Opens a connection from the FFI configuration and returns it as a raw handle.
+///
+/// The `writer_bucket_no_key_assigner` string is parsed first; a value that does not
+/// parse yields a client-side error without attempting to connect. On success the
+/// returned `ptr` is the address of a boxed `Connection` obtained from
+/// `Box::into_raw`; on failure `ptr` is zero and `result` describes the error.
+fn new_connection(config: &ffi::FfiConfig) -> ffi::FfiPtrResult {
+    let parsed = config
+        .writer_bucket_no_key_assigner
+        .parse::<fluss::config::NoKeyAssigner>();
+    let writer_bucket_no_key_assigner = match parsed {
+        Err(e) => return client_err_ptr(format!("Invalid bucket assigner type: {e}")),
+        Ok(assigner) => assigner,
+    };
+
+    // Field-by-field copy of the FFI config into the core config, grouped by concern.
+    let config_core = fluss::config::Config {
+        // Connection and security
+        bootstrap_servers: config.bootstrap_servers.to_string(),
+        connect_timeout_ms: config.connect_timeout_ms,
+        security_protocol: config.security_protocol.to_string(),
+        security_sasl_mechanism: config.security_sasl_mechanism.to_string(),
+        security_sasl_username: config.security_sasl_username.to_string(),
+        security_sasl_password: config.security_sasl_password.to_string(),
+
+        // Writer
+        writer_request_max_size: config.writer_request_max_size,
+        writer_acks: config.writer_acks.to_string(),
+        writer_retries: config.writer_retries,
+        writer_batch_size: config.writer_batch_size,
+        writer_dynamic_batch_size_enabled: config.writer_dynamic_batch_size_enabled,
+        writer_dynamic_batch_size_min: config.writer_dynamic_batch_size_min,
+        writer_batch_timeout_ms: config.writer_batch_timeout_ms,
+        writer_bucket_no_key_assigner,
+        writer_enable_idempotence: config.writer_enable_idempotence,
+        writer_max_inflight_requests_per_bucket: config.writer_max_inflight_requests_per_bucket,
+        writer_buffer_memory_size: config.writer_buffer_memory_size,
+        writer_buffer_wait_timeout_ms: config.writer_buffer_wait_timeout_ms,
+
+        // Scanner and remote log
+        scanner_remote_log_prefetch_num: config.scanner_remote_log_prefetch_num,
+        remote_file_download_thread_num: config.remote_file_download_thread_num,
+        scanner_remote_log_read_concurrency: config.scanner_remote_log_read_concurrency,
+        scanner_log_max_poll_records: config.scanner_log_max_poll_records,
+        scanner_log_fetch_max_bytes: config.scanner_log_fetch_max_bytes,
+        scanner_log_fetch_min_bytes: config.scanner_log_fetch_min_bytes,
+        scanner_log_fetch_wait_max_time_ms: config.scanner_log_fetch_wait_max_time_ms,
+        scanner_log_fetch_max_bytes_for_bucket: config.scanner_log_fetch_max_bytes_for_bucket,
+
+        // Lookup
+        lookup_queue_size: config.lookup_queue_size,
+        lookup_max_batch_size: config.lookup_max_batch_size,
+        lookup_batch_timeout_ms: config.lookup_batch_timeout_ms,
+        lookup_max_inflight_requests: config.lookup_max_inflight_requests,
+        lookup_max_retries: config.lookup_max_retries,
+    };
+
+    let connected =
+        RUNTIME.block_on(async { fcore::client::FlussConnection::new(config_core).await });
+    let core_conn = match connected {
+        Err(e) => return err_ptr_from_core(&e),
+        Ok(c) => c,
+    };
+
+    // Ownership of the box passes to the caller as a raw address.
+    let handle = Box::new(Connection {
+        inner: Arc::new(core_conn),
+    });
+    ok_ptr(Box::into_raw(handle) as usize)
+}
+
+/// Frees a `Connection` handle; a null pointer is ignored.
+///
+/// # Safety
+///
+/// A non-null `conn` must have been produced by `Box::into_raw` for a
+/// `Box<Connection>` and must not have been freed already.
+unsafe fn delete_connection(conn: *mut Connection) {
+    if conn.is_null() {
+        return;
+    }
+    // SAFETY: non-null, and the caller upholds the contract documented above.
+    let owned = unsafe { Box::from_raw(conn) };
+    drop(owned);
+}
+
+impl Connection {
+    /// Obtains the admin client from the underlying connection and returns it as a
+    /// boxed `Admin` handle (raw address in `ptr`); on failure `ptr` is zero.
+    fn get_admin(&self) -> ffi::FfiPtrResult {
+        match self.inner.get_admin() {
+            Err(e) => err_ptr_from_core(&e),
+            Ok(admin) => {
+                let handle = Box::new(Admin { inner: admin });
+                ok_ptr(Box::into_raw(handle) as usize)
+            }
+        }
+    }
+
+    /// Resolves the table at `table_path` and returns it as a boxed `Table` handle
+    /// (raw address in `ptr`); on failure `ptr` is zero.
+    ///
+    /// The handle stores a shared reference to this connection plus clones of the
+    /// core table's metadata, table info and path, and its primary-key flag.
+    fn get_table(&self, table_path: &ffi::FfiTablePath) -> ffi::FfiPtrResult {
+        let database = table_path.database_name.clone();
+        let name = table_path.table_name.clone();
+        let path = fcore::metadata::TablePath::new(database, name);
+
+        let resolved = RUNTIME.block_on(async { self.inner.get_table(&path).await });
+        let core_table = match resolved {
+            Err(e) => return err_ptr_from_core(&e),
+            Ok(t) => t,
+        };
+
+        let table = Table {
+            connection: self.inner.clone(),
+            metadata: core_table.metadata().clone(),
+            table_info: core_table.get_table_info().clone(),
+            table_path: core_table.table_path().clone(),
+            has_pk: core_table.has_primary_key(),
+        };
+        ok_ptr(Box::into_raw(Box::new(table)) as usize)
+    }
+}
+
+// Admin implementation
+/// Frees an `Admin` handle; a null pointer is ignored.
+///
+/// # Safety
+///
+/// A non-null `admin` must have been produced by `Box::into_raw` for a
+/// `Box<Admin>` and must not have been freed already.
+unsafe fn delete_admin(admin: *mut Admin) {
+    if admin.is_null() {
+        return;
+    }
+    // SAFETY: non-null, and the caller upholds the contract documented above.
+    let owned = unsafe { Box::from_raw(admin) };
+    drop(owned);
+}
+
+impl Admin {
+    /// Creates the table at `table_path` from the FFI `descriptor`.
+    ///
+    /// A descriptor that cannot be converted to the core type yields a client-side
+    /// error before any request is made. `ignore_if_exists` is forwarded unchanged
+    /// to the core admin call, whose errors are mapped by `err_from_core_error`.
+    fn create_table(
+        &self,
+        table_path: &ffi::FfiTablePath,
+        descriptor: &ffi::FfiTableDescriptor,
+        ignore_if_exists: bool,
+    ) -> ffi::FfiResult {
+        let database = table_path.database_name.clone();
+        let name = table_path.table_name.clone();
+        let path = fcore::metadata::TablePath::new(database, name);
+
+        let converted = types::ffi_descriptor_to_core(descriptor);
+        let core_descriptor = match converted {
+            Err(e) => return client_err(e.to_string()),
+            Ok(d) => d,
+        };
+
+        let outcome = RUNTIME.block_on(async {
+            self.inner
+                .create_table(&path, &core_descriptor, ignore_if_exists)
+                .await
+        });
+
+        if let Err(e) = outcome {
+            return err_from_core_error(&e);
+        }
+        ok_result()
+    }
+
+    /// Drops the table at `table_path`.
+    ///
+    /// `ignore_if_not_exists` is forwarded unchanged to the core admin call; its
+    /// errors are mapped by `err_from_core_error`.
+    fn drop_table(
+        &self,
+        table_path: &ffi::FfiTablePath,
+        ignore_if_not_exists: bool,
+    ) -> ffi::FfiResult {
+        let database = table_path.database_name.clone();
+        let name = table_path.table_name.clone();
+        let path = fcore::metadata::TablePath::new(database, name);
+
+        RUNTIME
+            .block_on(async { self.inner.drop_table(&path, ignore_if_not_exists).await })
+            .map_or_else(|e| err_from_core_error(&e), |_| ok_result())
+    }
+
+    /// Fetches the table info for `table_path` and converts it to its FFI form.
+    ///
+    /// On failure the result carries the mapped error together with
+    /// `types::empty_table_info()` as a placeholder.
+    fn get_table_info(&self, table_path: &ffi::FfiTablePath) -> ffi::FfiTableInfoResult {
+        let database = table_path.database_name.clone();
+        let name = table_path.table_name.clone();
+        let path = fcore::metadata::TablePath::new(database, name);
+
+        let fetched = RUNTIME.block_on(async { self.inner.get_table_info(&path).await });
+
+        let (result, table_info) = match fetched {
+            Ok(info) => (ok_result(), types::core_table_info_to_ffi(&info)),
+            Err(e) => (err_from_core_error(&e), types::empty_table_info()),
+        };
+        ffi::FfiTableInfoResult { result, table_info }
+    }
+
+    /// Fetches the latest lake snapshot for `table_path` and converts it to its FFI form.
+    ///
+    /// On failure the result carries the mapped error together with a placeholder
+    /// snapshot whose `snapshot_id` is `-1` and whose `bucket_offsets` is empty.
+    fn get_latest_lake_snapshot(
+        &self,
+        table_path: &ffi::FfiTablePath,
+    ) -> ffi::FfiLakeSnapshotResult {
+        let database = table_path.database_name.clone();
+        let name = table_path.table_name.clone();
+        let path = fcore::metadata::TablePath::new(database, name);
+
+        let fetched =
+            RUNTIME.block_on(async { self.inner.get_latest_lake_snapshot(&path).await });
+
+        let (result, lake_snapshot) = match fetched {
+            Ok(snapshot) => (ok_result(), types::core_lake_snapshot_to_ffi(&snapshot)),
+            Err(e) => {
+                let placeholder = ffi::FfiLakeSnapshot {
+                    snapshot_id: -1,
+                    bucket_offsets: vec![],
+                };
+                (err_from_core_error(&e), placeholder)
+            }
+        };
+        ffi::FfiLakeSnapshotResult {
+            result,
+            lake_snapshot,
+        }
+    }
+
+    /// Shared implementation behind `list_offsets` and `list_partition_offsets`.
+    ///
+    /// `offset_query.offset_type` selects the offset spec: `0` is earliest, `1` is
+    /// latest and `2` is the timestamp in `offset_query.timestamp`; any other value
+    /// yields a client-side error without issuing a request. When `partition_name`
+    /// is `Some`, the partition-scoped core call is used, otherwise the table-wide
+    /// one. Failures return an empty `bucket_offsets` list.
+    fn do_list_offsets(
+        &self,
+        table_path: &ffi::FfiTablePath,
+        partition_name: Option<&str>,
+        bucket_ids: Vec<i32>,
+        offset_query: &ffi::FfiOffsetQuery,
+    ) -> ffi::FfiListOffsetsResult {
+        use fcore::rpc::message::OffsetSpec;
+
+        let database = table_path.database_name.clone();
+        let name = table_path.table_name.clone();
+        let path = fcore::metadata::TablePath::new(database, name);
+
+        let offset_spec = match offset_query.offset_type {
+            0 => OffsetSpec::Earliest,
+            1 => OffsetSpec::Latest,
+            2 => OffsetSpec::Timestamp(offset_query.timestamp),
+            unknown => {
+                let result = client_err(format!("Invalid offset_type: {}", unknown));
+                return ffi::FfiListOffsetsResult {
+                    result,
+                    bucket_offsets: vec![],
+                };
+            }
+        };
+
+        let fetched = RUNTIME.block_on(async {
+            match partition_name {
+                Some(part_name) => {
+                    self.inner
+                        .list_partition_offsets(&path, part_name, &bucket_ids, offset_spec)
+                        .await
+                }
+                None => {
+                    self.inner
+                        .list_offsets(&path, &bucket_ids, offset_spec)
+                        .await
+                }
+            }
+        });
+
+        let offsets = match fetched {
+            Ok(offsets) => offsets,
+            Err(e) => {
+                return ffi::FfiListOffsetsResult {
+                    result: err_from_core_error(&e),
+                    bucket_offsets: vec![],
+                };
+            }
+        };
+
+        // Flatten the (bucket id, offset) pairs into the FFI pair type.
+        let mut bucket_offsets: Vec<ffi::FfiBucketOffsetPair> = Vec::new();
+        for (bucket_id, offset) in offsets {
+            bucket_offsets.push(ffi::FfiBucketOffsetPair { bucket_id, offset });
+        }
+        ffi::FfiListOffsetsResult {
+            result: ok_result(),
+            bucket_offsets,
+        }
+    }
+
+    /// Lists offsets for `bucket_ids` of the table at `table_path`, without a
+    /// partition; delegates to `do_list_offsets`.
+    fn list_offsets(
+        &self,
+        table_path: &ffi::FfiTablePath,
+        bucket_ids: Vec<i32>,
+        offset_query: &ffi::FfiOffsetQuery,
+    ) -> ffi::FfiListOffsetsResult {
+        let no_partition: Option<&str> = None;
+        self.do_list_offsets(table_path, no_partition, bucket_ids, offset_query)
+    }
+
+    /// Lists offsets for `bucket_ids` within the partition named `partition_name`;
+    /// delegates to `do_list_offsets`.
+    fn list_partition_offsets(
+        &self,
+        table_path: &ffi::FfiTablePath,
+        partition_name: String,
+        bucket_ids: Vec<i32>,
+        offset_query: &ffi::FfiOffsetQuery,
+    ) -> ffi::FfiListOffsetsResult {
+        let partition = Some(partition_name.as_str());
+        self.do_list_offsets(table_path, partition, bucket_ids, offset_query)
+    }
+
+    /// Lists partition infos for the table at `table_path` with no partition-spec
+    /// filter; delegates to `do_list_partition_infos`.
+    fn list_partition_infos(
+        &self,
+        table_path: &ffi::FfiTablePath,
+    ) -> ffi::FfiListPartitionInfosResult {
+        let no_spec = None;
+        self.do_list_partition_infos(table_path, no_spec)
+    }
+
+    /// Lists partition infos for the table at `table_path`, passing a partition spec.
+    ///
+    /// The key/value pairs are gathered into a map (a repeated key keeps its last
+    /// value), wrapped in a core `PartitionSpec` and handed to
+    /// `do_list_partition_infos`.
+    fn list_partition_infos_with_spec(
+        &self,
+        table_path: &ffi::FfiTablePath,
+        partition_spec: Vec<ffi::FfiPartitionKeyValue>,
+    ) -> ffi::FfiListPartitionInfosResult {
+        let mut spec_map: std::collections::HashMap<String, String> =
+            std::collections::HashMap::new();
+        for kv in partition_spec {
+            spec_map.insert(kv.key, kv.value);
+        }
+
+        let spec = fcore::metadata::PartitionSpec::new(spec_map);
+        self.do_list_partition_infos(table_path, Some(&spec))
+    }
+
+    /// Creates a partition of the table at `table_path`.
+    ///
+    /// The `partition_spec` pairs are gathered into a map (a repeated key keeps its
+    /// last value) and wrapped in a core `PartitionSpec`. `ignore_if_exists` is
+    /// forwarded unchanged to the core admin call.
+    fn create_partition(
+        &self,
+        table_path: &ffi::FfiTablePath,
+        partition_spec: Vec<ffi::FfiPartitionKeyValue>,
+        ignore_if_exists: bool,
+    ) -> ffi::FfiResult {
+        let database = table_path.database_name.clone();
+        let name = table_path.table_name.clone();
+        let path = fcore::metadata::TablePath::new(database, name);
+
+        let mut spec_map: std::collections::HashMap<String, String> =
+            std::collections::HashMap::new();
+        for kv in partition_spec {
+            spec_map.insert(kv.key, kv.value);
+        }
+        let partition_spec = fcore::metadata::PartitionSpec::new(spec_map);
+
+        let outcome = RUNTIME.block_on(async {
+            self.inner
+                .create_partition(&path, &partition_spec, ignore_if_exists)
+                .await
+        });
+
+        if let Err(e) = outcome {
+            return err_from_core_error(&e);
+        }
+        ok_result()
+    }
+
+    /// Drops a partition of the table at `table_path`.
+    ///
+    /// The `partition_spec` pairs are gathered into a map (a repeated key keeps its
+    /// last value) and wrapped in a core `PartitionSpec`. `ignore_if_not_exists` is
+    /// forwarded unchanged to the core admin call.
+    fn drop_partition(
+        &self,
+        table_path: &ffi::FfiTablePath,
+        partition_spec: Vec<ffi::FfiPartitionKeyValue>,
+        ignore_if_not_exists: bool,
+    ) -> ffi::FfiResult {
+        let database = table_path.database_name.clone();
+        let name = table_path.table_name.clone();
+        let path = fcore::metadata::TablePath::new(database, name);
+
+        let mut spec_map: std::collections::HashMap<String, String> =
+            std::collections::HashMap::new();
+        spec_map.extend(partition_spec.into_iter().map(|pair| (pair.key, pair.value)));
+        let partition_spec = fcore::metadata::PartitionSpec::new(spec_map);
+
+        let outcome = RUNTIME.block_on(async {
+            self.inner
+                .drop_partition(&path, &partition_spec, ignore_if_not_exists)
+                .await
+        });
+
+        match outcome {
+            Err(e) => err_from_core_error(&e),
+            Ok(_) => ok_result(),
+        }
+    }
+
+    /// Creates the database `database_name`.
+    ///
+    /// The FFI `descriptor` is converted with `types::ffi_database_descriptor_to_core`
+    /// and passed on by reference via `as_ref()`; `ignore_if_exists` is forwarded
+    /// unchanged to the core admin call.
+    fn create_database(
+        &self,
+        database_name: &str,
+        descriptor: &ffi::FfiDatabaseDescriptor,
+        ignore_if_exists: bool,
+    ) -> ffi::FfiResult {
+        let core_descriptor = types::ffi_database_descriptor_to_core(descriptor);
+
+        let outcome = RUNTIME.block_on(async {
+            self.inner
+                .create_database(database_name, core_descriptor.as_ref(), ignore_if_exists)
+                .await
+        });
+
+        if let Err(e) = outcome {
+            return err_from_core_error(&e);
+        }
+        ok_result()
+    }
+
+    /// Drops the database `database_name`.
+    ///
+    /// `ignore_if_not_exists` and `cascade` are forwarded unchanged to the core
+    /// admin call; its errors are mapped by `err_from_core_error`.
+    fn drop_database(
+        &self,
+        database_name: &str,
+        ignore_if_not_exists: bool,
+        cascade: bool,
+    ) -> ffi::FfiResult {
+        let outcome = RUNTIME.block_on(async {
+            self.inner
+                .drop_database(database_name, ignore_if_not_exists, cascade)
+                .await
+        });
+
+        outcome.map_or_else(|e| err_from_core_error(&e), |_| ok_result())
+    }
+
+    /// Lists database names; on failure the name list is empty and `result`
+    /// carries the mapped error.
+    fn list_databases(&self) -> ffi::FfiListDatabasesResult {
+        let listed = RUNTIME.block_on(async { self.inner.list_databases().await });
+
+        let (result, database_names) = match listed {
+            Ok(names) => (ok_result(), names),
+            Err(e) => (err_from_core_error(&e), Vec::new()),
+        };
+        ffi::FfiListDatabasesResult {
+            result,
+            database_names,
+        }
+    }
+
+    /// Reports whether the database `database_name` exists; on failure `value` is
+    /// `false` and `result` carries the mapped error.
+    fn database_exists(&self, database_name: &str) -> ffi::FfiBoolResult {
+        let checked =
+            RUNTIME.block_on(async { self.inner.database_exists(database_name).await });
+
+        let (result, value) = match checked {
+            Ok(exists) => (ok_result(), exists),
+            Err(e) => (err_from_core_error(&e), false),
+        };
+        ffi::FfiBoolResult { result, value }
+    }
+
+    /// Fetches the info for database `database_name` and converts it to its FFI form.
+    ///
+    /// On failure the result carries the mapped error together with a placeholder
+    /// info: empty strings, no properties and zero timestamps.
+    fn get_database_info(&self, database_name: &str) -> ffi::FfiDatabaseInfoResult {
+        let fetched =
+            RUNTIME.block_on(async { self.inner.get_database_info(database_name).await });
+
+        let (result, database_info) = match fetched {
+            Ok(info) => (ok_result(), types::core_database_info_to_ffi(&info)),
+            Err(e) => {
+                let placeholder = ffi::FfiDatabaseInfo {
+                    database_name: String::new(),
+                    comment: String::new(),
+                    properties: vec![],
+                    created_time: 0,
+                    modified_time: 0,
+                };
+                (err_from_core_error(&e), placeholder)
+            }
+        };
+        ffi::FfiDatabaseInfoResult {
+            result,
+            database_info,
+        }
+    }
+
+    /// Lists table names in `database_name`; on failure the name list is empty and
+    /// `result` carries the mapped error.
+    fn list_tables(&self, database_name: &str) -> ffi::FfiListTablesResult {
+        let listed = RUNTIME.block_on(async { self.inner.list_tables(database_name).await });
+
+        let (result, table_names) = match listed {
+            Ok(names) => (ok_result(), names),
+            Err(e) => (err_from_core_error(&e), Vec::new()),
+        };
+        ffi::FfiListTablesResult {
+            result,
+            table_names,
+        }
+    }
+
+    /// Asks the inner client whether the table named by `table_path` exists.
+    ///
+    /// The FFI path is copied into a core `TablePath` first. On failure
+    /// `value` is `false` and `result` carries the converted error.
+    fn table_exists(&self, table_path: &ffi::FfiTablePath) -> ffi::FfiBoolResult {
+        let database = table_path.database_name.clone();
+        let table = table_path.table_name.clone();
+        let core_path = fcore::metadata::TablePath::new(database, table);
+
+        let outcome = RUNTIME.block_on(async { self.inner.table_exists(&core_path).await });
+
+        let (result, value) = match outcome {
+            Ok(exists) => (ok_result(), exists),
+            Err(e) => (err_from_core_error(&e), false),
+        };
+        ffi::FfiBoolResult { result, value }
+    }
+
+    /// Lists partition infos for `table_path`, forwarding the optional
+    /// `partial_partition_spec` to the inner client unchanged.
+    ///
+    /// Each core partition info is reduced to its id and name. On failure
+    /// `partition_infos` is empty and `result` carries the converted error.
+    fn do_list_partition_infos(
+        &self,
+        table_path: &ffi::FfiTablePath,
+        partial_partition_spec: Option<&fcore::metadata::PartitionSpec>,
+    ) -> ffi::FfiListPartitionInfosResult {
+        let core_path = fcore::metadata::TablePath::new(
+            table_path.database_name.clone(),
+            table_path.table_name.clone(),
+        );
+        let listed = RUNTIME.block_on(async {
+            self.inner
+                .list_partition_infos_with_spec(&core_path, partial_partition_spec)
+                .await
+        });
+
+        // Bail out early so the conversion below only deals with the success case.
+        let infos = match listed {
+            Ok(infos) => infos,
+            Err(e) => {
+                return ffi::FfiListPartitionInfosResult {
+                    result: err_from_core_error(&e),
+                    partition_infos: vec![],
+                };
+            }
+        };
+
+        let mut partition_infos: Vec<ffi::FfiPartitionInfo> = Vec::new();
+        for info in infos {
+            partition_infos.push(ffi::FfiPartitionInfo {
+                partition_id: info.get_partition_id(),
+                partition_name: info.get_partition_name(),
+            });
+        }
+
+        ffi::FfiListPartitionInfosResult {
+            result: ok_result(),
+            partition_infos,
+        }
+    }
+
+    /// Fetches the server nodes from the inner client, blocking on `RUNTIME`.
+    ///
+    /// Each node is flattened into an `FfiServerNode`; `host`, `server_type`
+    /// and `uid` are converted with `to_string()`. On failure `server_nodes`
+    /// is empty and `result` carries the converted error.
+    fn get_server_nodes(&self) -> ffi::FfiServerNodesResult {
+        let fetched = RUNTIME.block_on(async { self.inner.get_server_nodes().await });
+
+        let nodes = match fetched {
+            Ok(nodes) => nodes,
+            Err(e) => {
+                return ffi::FfiServerNodesResult {
+                    result: err_from_core_error(&e),
+                    server_nodes: vec![],
+                };
+            }
+        };
+
+        let mut server_nodes: Vec<ffi::FfiServerNode> = Vec::new();
+        for node in nodes {
+            server_nodes.push(ffi::FfiServerNode {
+                node_id: node.id(),
+                host: node.host().to_string(),
+                port: node.port(),
+                server_type: node.server_type().to_string(),
+                uid: node.uid().to_string(),
+            });
+        }
+
+        ffi::FfiServerNodesResult {
+            result: ok_result(),
+            server_nodes,
+        }
+    }
+}
+
+// Table implementation
+/// Frees a `Table` behind a raw pointer; a null pointer is ignored.
+///
+/// # Safety
+/// `table` must be null or a pointer that `Box::from_raw` may legally
+/// reclaim (presumably one produced by `Box::into_raw` — confirm at the
+/// allocation site), and it must not be used again afterwards.
+unsafe fn delete_table(table: *mut Table) {
+    if table.is_null() {
+        return;
+    }
+    unsafe {
+        drop(Box::from_raw(table));
+    }
+}
+
+impl Table {
+    /// Builds a core `FlussTable` that borrows this table's connection and
+    /// receives clones of its metadata and table info.
+    fn fluss_table(&self) -> fcore::client::FlussTable<'_> {
+        let metadata = self.metadata.clone();
+        let table_info = self.table_info.clone();
+        fcore::client::FlussTable::new(&self.connection, metadata, table_info)
+    }
+
+    /// Maps schema column indices to cloned `Column`s, in the order given.
+    ///
+    /// Stops at the first index that is out of range and returns a message
+    /// naming that index and the schema's column count.
+    fn resolve_projected_columns(
+        &self,
+        indices: &[usize],
+    ) -> Result<Vec<fcore::metadata::Column>, String> {
+        let all_columns = self.table_info.get_schema().columns();
+        let mut resolved = Vec::new();
+        for &i in indices {
+            match all_columns.get(i) {
+                Some(column) => resolved.push(column.clone()),
+                None => {
+                    return Err(format!(
+                        "Invalid column index {i}: schema has {} columns",
+                        all_columns.len()
+                    ));
+                }
+            }
+        }
+        Ok(resolved)
+    }
+
+    /// Creates an append writer and returns it as a heap pointer.
+    ///
+    /// The pointer addresses a boxed `AppendWriter` that the caller owns
+    /// (see `delete_append_writer`). Core errors are returned through
+    /// `err_ptr_from_core`.
+    fn new_append_writer(&self) -> ffi::FfiPtrResult {
+        // Hold the runtime context for the duration of the call.
+        let _enter = RUNTIME.enter();
+
+        let appender = match self.fluss_table().new_append() {
+            Ok(appender) => appender,
+            Err(e) => return err_ptr_from_core(&e),
+        };
+        let inner = match appender.create_writer() {
+            Ok(writer) => writer,
+            Err(e) => return err_ptr_from_core(&e),
+        };
+
+        let boxed = Box::new(AppendWriter {
+            inner,
+            table_info: self.table_info.clone(),
+        });
+        ok_ptr(Box::into_raw(boxed) as usize)
+    }
+
+    /// Creates a log scanner and returns it as a heap pointer.
+    ///
+    /// An empty `column_indices` means all schema columns; otherwise the scan
+    /// is projected onto the given indices, which are validated against the
+    /// schema first. `batch` selects the record-batch scanner over the
+    /// per-record scanner. The pointer addresses a boxed `LogScanner` that the
+    /// caller owns (see `delete_log_scanner`).
+    fn create_scanner(&self, column_indices: Vec<usize>, batch: bool) -> ffi::FfiPtrResult {
+        RUNTIME.block_on(async {
+            let fluss_table = self.fluss_table();
+            let base_scan = fluss_table.new_scan();
+
+            let (projected_columns, scan) = if !column_indices.is_empty() {
+                // Validate the indices ourselves so the caller gets a client
+                // error before the core projection is attempted.
+                let cols = match self.resolve_projected_columns(&column_indices) {
+                    Ok(cols) => cols,
+                    Err(e) => return client_err_ptr(e),
+                };
+                match base_scan.project(&column_indices) {
+                    Ok(projected) => (cols, projected),
+                    Err(e) => return err_ptr_from_core(&e),
+                }
+            } else {
+                (self.table_info.get_schema().columns().to_vec(), base_scan)
+            };
+
+            let scanner = match batch {
+                true => match scan.create_record_batch_log_scanner() {
+                    Ok(s) => ScannerKind::Batch(s),
+                    Err(e) => return err_ptr_from_core(&e),
+                },
+                false => match scan.create_log_scanner() {
+                    Ok(s) => ScannerKind::Record(s),
+                    Err(e) => return err_ptr_from_core(&e),
+                },
+            };
+
+            let boxed = Box::new(LogScanner {
+                scanner,
+                projected_columns,
+            });
+            ok_ptr(Box::into_raw(boxed) as usize)
+        })
+    }
+
+    /// Converts this table's stored table info to its FFI representation.
+    fn get_table_info_from_table(&self) -> ffi::FfiTableInfo {
+        types::core_table_info_to_ffi(&self.table_info)
+    }
+
+    /// Returns the table's database and table names as an `FfiTablePath`.
+    fn get_table_path(&self) -> ffi::FfiTablePath {
+        let database_name = self.table_path.database().to_string();
+        let table_name = self.table_path.table().to_string();
+        ffi::FfiTablePath {
+            database_name,
+            table_name,
+        }
+    }
+
+    /// Returns the stored `has_pk` flag.
+    fn has_primary_key(&self) -> bool {
+        self.has_pk
+    }
+
+    /// Creates an upsert writer and returns it as a heap pointer.
+    ///
+    /// A non-empty `column_indices` is passed to `partial_update`; an empty
+    /// one leaves the upsert untouched. The pointer addresses a boxed
+    /// `UpsertWriter` that the caller owns (see `delete_upsert_writer`).
+    fn create_upsert_writer(&self, column_indices: Vec<usize>) -> ffi::FfiPtrResult {
+        // Hold the runtime context for the duration of the call.
+        let _enter = RUNTIME.enter();
+
+        let base_upsert = match self.fluss_table().new_upsert() {
+            Ok(upsert) => upsert,
+            Err(e) => return err_ptr_from_core(&e),
+        };
+
+        let table_upsert = if !column_indices.is_empty() {
+            match base_upsert.partial_update(Some(column_indices)) {
+                Ok(partial) => partial,
+                Err(e) => return err_ptr_from_core(&e),
+            }
+        } else {
+            base_upsert
+        };
+
+        let inner = match table_upsert.create_writer() {
+            Ok(writer) => writer,
+            Err(e) => return err_ptr_from_core(&e),
+        };
+
+        let boxed = Box::new(UpsertWriter {
+            inner,
+            table_info: self.table_info.clone(),
+        });
+        ok_ptr(Box::into_raw(boxed) as usize)
+    }
+
+    /// Creates a lookuper and returns it as a heap pointer.
+    ///
+    /// The pointer addresses a boxed `Lookuper` that the caller owns
+    /// (see `delete_lookuper`).
+    fn new_lookuper(&self) -> ffi::FfiPtrResult {
+        // Hold the runtime context for the duration of the call.
+        let _enter = RUNTIME.enter();
+
+        let table_lookup = match self.fluss_table().new_lookup() {
+            Ok(lookup) => lookup,
+            Err(e) => return err_ptr_from_core(&e),
+        };
+        let inner = match table_lookup.create_lookuper() {
+            Ok(lookuper) => lookuper,
+            Err(e) => return err_ptr_from_core(&e),
+        };
+
+        let boxed = Box::new(Lookuper {
+            inner,
+            table_info: self.table_info.clone(),
+        });
+        ok_ptr(Box::into_raw(boxed) as usize)
+    }
+}
+
+// AppendWriter implementation
+/// Frees an `AppendWriter` behind a raw pointer; a null pointer is ignored.
+///
+/// # Safety
+/// `writer` must be null or a pointer that `Box::from_raw` may legally
+/// reclaim (e.g. the one boxed in `Table::new_append_writer`), and it must
+/// not be used again afterwards.
+unsafe fn delete_append_writer(writer: *mut AppendWriter) {
+    if writer.is_null() {
+        return;
+    }
+    unsafe {
+        drop(Box::from_raw(writer));
+    }
+}
+
+impl AppendWriter {
+    /// Queues a single row for appending.
+    ///
+    /// The row is first passed through `types::resolve_row_types` together
+    /// with the table schema; a failure there is reported as a client error.
+    /// On success the returned pointer addresses a boxed `WriteResult` that
+    /// the caller owns (see `delete_write_result`).
+    fn append(&mut self, row: &GenericRowInner) -> ffi::FfiPtrResult {
+        let schema = self.table_info.get_schema();
+        let generic_row = match types::resolve_row_types(&row.row, Some(schema)) {
+            Ok(r) => r,
+            Err(e) => return client_err_ptr(e.to_string()),
+        };
+
+        let result_future = match self.inner.append(&generic_row) {
+            Ok(f) => f,
+            Err(e) => return err_ptr_from_core(&e),
+        };
+
+        let ptr = Box::into_raw(Box::new(WriteResult {
+            inner: Some(result_future),
+        }));
+        ok_ptr(ptr as usize)
+    }
+
+    /// Imports an Arrow record batch through the Arrow C Data Interface and
+    /// queues it for appending.
+    ///
+    /// `array_ptr` and `schema_ptr` are the addresses of a heap-allocated
+    /// `FFI_ArrowArray` and `FFI_ArrowSchema`. This function takes ownership
+    /// of both in every case, including the error paths. A null (zero)
+    /// pointer is rejected with a client error instead of being handed to
+    /// `Box::from_raw`. On success the returned pointer addresses a boxed
+    /// `WriteResult` that the caller owns (see `delete_write_result`).
+    fn append_arrow_batch(&mut self, array_ptr: usize, schema_ptr: usize) -> ffi::FfiPtrResult {
+        use arrow::ffi::{FFI_ArrowArray, FFI_ArrowSchema};
+
+        // `Box::from_raw` on a null pointer is undefined behaviour, so reject
+        // null up front. Ownership of the other pointer has still been handed
+        // to us, so release it via the null-tolerant helper to avoid a leak.
+        if array_ptr == 0 || schema_ptr == 0 {
+            free_arrow_ffi_structures(array_ptr, schema_ptr);
+            return client_err_ptr(
+                "Failed to import Arrow batch: null ArrowArray or ArrowSchema pointer".to_string(),
+            );
+        }
+
+        // Safety: C++ allocates these via `new ArrowArray/ArrowSchema` after a
+        // successful `ExportRecordBatch`, so both (non-null, checked above)
+        // pointers are valid heap allocations that we take ownership of here.
+        let ffi_array = unsafe { *Box::from_raw(array_ptr as *mut FFI_ArrowArray) };
+        let ffi_schema = unsafe { Box::from_raw(schema_ptr as *mut FFI_ArrowSchema) };
+
+        // Safety: `from_ffi` requires that the array and schema conform to the
+        // Arrow C Data Interface, which is guaranteed by C++'s ExportRecordBatch.
+        let array_data = match unsafe { arrow::ffi::from_ffi(ffi_array, &ffi_schema) } {
+            Ok(d) => d,
+            Err(e) => return client_err_ptr(format!("Failed to import Arrow batch: {e}")),
+        };
+        // ffi_array is consumed by from_ffi; ffi_schema is dropped when its Box
+        // goes out of scope at the end of this function.
+
+        // Reconstruct RecordBatch from the imported StructArray data
+        let struct_array = arrow::array::StructArray::from(array_data);
+        let batch = arrow::record_batch::RecordBatch::from(struct_array);
+
+        let result_future = match self.inner.append_arrow_batch(batch) {
+            Ok(f) => f,
+            Err(e) => return err_ptr_from_core(&e),
+        };
+
+        let ptr = Box::into_raw(Box::new(WriteResult {
+            inner: Some(result_future),
+        }));
+        ok_ptr(ptr as usize)
+    }
+
+    /// Flushes the inner writer, blocking the calling thread on `RUNTIME`.
+    fn flush(&mut self) -> ffi::FfiResult {
+        let result = RUNTIME.block_on(async { self.inner.flush().await });
+
+        match result {
+            Ok(_) => ok_result(),
+            Err(e) => err_from_core_error(&e),
+        }
+    }
+}
+
+/// Frees a `WriteResult` behind a raw pointer; a null pointer is ignored.
+///
+/// # Safety
+/// `wr` must be null or a pointer that `Box::from_raw` may legally reclaim
+/// (e.g. one boxed by the writers' append/upsert methods), and it must not
+/// be used again afterwards.
+unsafe fn delete_write_result(wr: *mut WriteResult) {
+    if wr.is_null() {
+        return;
+    }
+    unsafe {
+        drop(Box::from_raw(wr));
+    }
+}
+
+impl WriteResult {
+    /// Blocks on the pending write future and reports its outcome.
+    ///
+    /// The future is taken out of `inner`, so a second call finds nothing to
+    /// wait on and returns a client error instead.
+    fn wait(&mut self) -> ffi::FfiResult {
+        let Some(pending) = self.inner.take() else {
+            return client_err("WriteResult already consumed".to_string());
+        };
+
+        match RUNTIME.block_on(pending) {
+            Ok(_) => ok_result(),
+            Err(e) => err_from_core_error(&e),
+        }
+    }
+}
+
+// UpsertWriter implementation
+/// Frees an `UpsertWriter` behind a raw pointer; a null pointer is ignored.
+///
+/// # Safety
+/// `writer` must be null or a pointer that `Box::from_raw` may legally
+/// reclaim (e.g. the one boxed in `Table::create_upsert_writer`), and it
+/// must not be used again afterwards.
+unsafe fn delete_upsert_writer(writer: *mut UpsertWriter) {
+    if writer.is_null() {
+        return;
+    }
+    unsafe {
+        drop(Box::from_raw(writer));
+    }
+}
+
+impl UpsertWriter {
+    /// Extends a short row with `Datum::Null` up to the schema's column count.
+    /// Rows that are already at least that wide are returned unchanged, which
+    /// lets callers set only the fields they care about.
+    fn pad_row<'a>(&self, mut row: fcore::row::GenericRow<'a>) -> fcore::row::GenericRow<'a> {
+        let schema_width = self.table_info.get_schema().columns().len();
+        if schema_width > row.values.len() {
+            row.values.resize(schema_width, fcore::row::Datum::Null);
+        }
+        row
+    }
+
+    /// Queues an upsert of `row`.
+    ///
+    /// The row is resolved against the schema with `types::resolve_row_types`
+    /// (failure is a client error) and padded to full width before it is
+    /// handed to the inner writer. On success the returned pointer addresses
+    /// a boxed `WriteResult` that the caller owns (see `delete_write_result`).
+    fn upsert(&mut self, row: &GenericRowInner) -> ffi::FfiPtrResult {
+        let schema = self.table_info.get_schema();
+        let resolved = match types::resolve_row_types(&row.row, Some(schema)) {
+            Ok(resolved) => resolved,
+            Err(e) => return client_err_ptr(e.to_string()),
+        };
+        let padded = self.pad_row(resolved);
+
+        let pending = match self.inner.upsert(&padded) {
+            Ok(pending) => pending,
+            Err(e) => return err_ptr_from_core(&e),
+        };
+
+        let boxed = Box::new(WriteResult {
+            inner: Some(pending),
+        });
+        ok_ptr(Box::into_raw(boxed) as usize)
+    }
+
+    /// Queues a delete for `row`.
+    ///
+    /// Follows the same resolve-then-pad preparation as `upsert`, then calls
+    /// the inner writer's `delete`. On success the returned pointer addresses
+    /// a boxed `WriteResult` that the caller owns (see `delete_write_result`).
+    fn delete_row(&mut self, row: &GenericRowInner) -> ffi::FfiPtrResult {
+        let schema = self.table_info.get_schema();
+        let resolved = match types::resolve_row_types(&row.row, Some(schema)) {
+            Ok(resolved) => resolved,
+            Err(e) => return client_err_ptr(e.to_string()),
+        };
+        let padded = self.pad_row(resolved);
+
+        let pending = match self.inner.delete(&padded) {
+            Ok(pending) => pending,
+            Err(e) => return err_ptr_from_core(&e),
+        };
+
+        let boxed = Box::new(WriteResult {
+            inner: Some(pending),
+        });
+        ok_ptr(Box::into_raw(boxed) as usize)
+    }
+
+    /// Flushes the inner writer, blocking the calling thread on `RUNTIME`.
+    fn upsert_flush(&mut self) -> ffi::FfiResult {
+        match RUNTIME.block_on(async { self.inner.flush().await }) {
+            Ok(_) => ok_result(),
+            Err(e) => err_from_core_error(&e),
+        }
+    }
+}
+
+// Lookuper implementation
+/// Frees a `Lookuper` behind a raw pointer; a null pointer is ignored.
+///
+/// # Safety
+/// `lookuper` must be null or a pointer that `Box::from_raw` may legally
+/// reclaim (e.g. the one boxed in `Table::new_lookuper`), and it must not
+/// be used again afterwards.
+unsafe fn delete_lookuper(lookuper: *mut Lookuper) {
+    if lookuper.is_null() {
+        return;
+    }
+    unsafe {
+        drop(Box::from_raw(lookuper));
+    }
+}
+
+impl Lookuper {
+    /// Compacts a (possibly sparse) input row into a dense primary-key row.
+    /// Callers may have set PK values at their full schema positions
+    /// (e.g. [0, 2]) via name-based Set(); here they are moved to positions
+    /// [0, 1, …] to match the lookup_row_type the core KeyEncoder expects.
+    /// A PK position that lies beyond the input row is left as created by
+    /// `GenericRow::new`.
+    fn dense_pk_row<'a>(&self, mut row: fcore::row::GenericRow<'a>) -> fcore::row::GenericRow<'a> {
+        let pk_indices = self.table_info.get_schema().primary_key_indexes();
+        let mut dense = fcore::row::GenericRow::new(pk_indices.len());
+        for (dense_idx, &schema_idx) in pk_indices.iter().enumerate() {
+            // Move the value out of the source row, leaving Null behind.
+            if let Some(slot) = row.values.get_mut(schema_idx) {
+                dense.values[dense_idx] = std::mem::replace(slot, fcore::row::Datum::Null);
+            }
+        }
+        dense
+    }
+
+    /// Looks up a single row by primary key, blocking on `RUNTIME`.
+    ///
+    /// `pk_row` is resolved against the schema and compacted to a dense PK
+    /// row before the lookup. The result always comes back boxed: either an
+    /// error built with `LookupResultInner::from_error`, a miss
+    /// (`found: false`), or a hit carrying an owned copy of the row together
+    /// with the full schema columns.
+    fn lookup(&mut self, pk_row: &GenericRowInner) -> Box<LookupResultInner> {
+        let schema = self.table_info.get_schema();
+        let resolved = match types::resolve_row_types(&pk_row.row, Some(schema)) {
+            Ok(resolved) => resolved,
+            Err(e) => {
+                return Box::new(LookupResultInner::from_error(
+                    CLIENT_ERROR_CODE,
+                    e.to_string(),
+                ));
+            }
+        };
+        let generic_row = self.dense_pk_row(resolved);
+
+        let lookup_result = match RUNTIME.block_on(self.inner.lookup(&generic_row)) {
+            Ok(found) => found,
+            Err(e) => {
+                let ffi_err = err_from_core_error(&e);
+                return Box::new(LookupResultInner::from_error(
+                    ffi_err.error_code,
+                    ffi_err.error_message,
+                ));
+            }
+        };
+
+        let columns = self.table_info.get_schema().columns().to_vec();
+        let maybe_row = match lookup_result.get_single_row() {
+            Ok(maybe_row) => maybe_row,
+            Err(e) => {
+                let ffi_err = err_from_core_error(&e);
+                return Box::new(LookupResultInner::from_error(
+                    ffi_err.error_code,
+                    ffi_err.error_message,
+                ));
+            }
+        };
+
+        let Some(row) = maybe_row else {
+            // No row for this key: report a clean miss.
+            return Box::new(LookupResultInner {
+                error: None,
+                found: false,
+                row: None,
+                columns,
+            });
+        };
+
+        match types::compacted_row_to_owned(&row, &self.table_info) {
+            Ok(owned_row) => Box::new(LookupResultInner {
+                error: None,
+                found: true,
+                row: Some(owned_row),
+                columns,
+            }),
+            Err(e) => Box::new(LookupResultInner::from_error(
+                CLIENT_ERROR_CODE,
+                e.to_string(),
+            )),
+        }
+    }
+}
+
+// LogScanner implementation
+/// Frees a `LogScanner` behind a raw pointer; a null pointer is ignored.
+///
+/// # Safety
+/// `scanner` must be null or a pointer that `Box::from_raw` may legally
+/// reclaim (e.g. the one boxed in `Table::create_scanner`), and it must not
+/// be used again afterwards.
+unsafe fn delete_log_scanner(scanner: *mut LogScanner) {
+    if scanner.is_null() {
+        return;
+    }
+    unsafe {
+        drop(Box::from_raw(scanner));
+    }
+}
+
+// Helper to release the Arrow FFI structures on their own (for use after ImportRecordBatch).
+// Each address is reclaimed as a Box and dropped; a zero address is skipped.
+// NOTE(review): callers must pass addresses of heap-allocated FFI_ArrowArray /
+// FFI_ArrowSchema values that nothing else will free — confirm at call sites.
+pub extern "C" fn free_arrow_ffi_structures(array_ptr: usize, schema_ptr: usize) {
+    use arrow::ffi::{FFI_ArrowArray, FFI_ArrowSchema};
+    if array_ptr != 0 {
+        drop(unsafe { Box::from_raw(array_ptr as *mut FFI_ArrowArray) });
+    }
+    if schema_ptr != 0 {
+        drop(unsafe { Box::from_raw(schema_ptr as *mut FFI_ArrowSchema) });
+    }
+}
+
+/// Call `$method` on whichever scanner variant is active, block on it via
+/// `RUNTIME`, and convert the outcome into an `FfiResult`.
+/// This works because LogScanner and RecordBatchLogScanner share the same
+/// subscribe/unsubscribe interface.
+macro_rules! dispatch_scanner {
+    ($self:expr, $method:ident($($arg:expr),*)) => {{
+        let outcome = RUNTIME.block_on(async {
+            match &$self.scanner {
+                ScannerKind::Record(s) => s.$method($($arg),*).await,
+                ScannerKind::Batch(s) => s.$method($($arg),*).await,
+            }
+        });
+        match outcome {
+            Ok(_) => ok_result(),
+            Err(e) => err_from_core_error(&e),
+        }
+    }};
+}
+
+impl LogScanner {
+    /// Subscribes the active scanner to `bucket_id` starting at `start_offset`.
+    fn subscribe(&self, bucket_id: i32, start_offset: i64) -> ffi::FfiResult {
+        dispatch_scanner!(self, subscribe(bucket_id, start_offset))
+    }
+
+    /// Subscribes to several buckets at once.
+    ///
+    /// The subscriptions are folded into a bucket-id → offset map, so a
+    /// bucket listed more than once keeps its last offset.
+    fn subscribe_buckets(&self, subscriptions: Vec<ffi::FfiBucketSubscription>) -> ffi::FfiResult {
+        use std::collections::HashMap;
+        let mut bucket_offsets: HashMap<i32, i64> = HashMap::new();
+        for sub in subscriptions {
+            bucket_offsets.insert(sub.bucket_id, sub.offset);
+        }
+        dispatch_scanner!(self, subscribe_buckets(&bucket_offsets))
+    }
+
+    /// Subscribes to one bucket of one partition starting at `start_offset`.
+    fn subscribe_partition(
+        &self,
+        partition_id: PartitionId,
+        bucket_id: i32,
+        start_offset: i64,
+    ) -> ffi::FfiResult {
+        dispatch_scanner!(
+            self,
+            subscribe_partition(partition_id, bucket_id, start_offset)
+        )
+    }
+
+    /// Subscribes to several partition buckets at once.
+    ///
+    /// The subscriptions are folded into a (partition-id, bucket-id) → offset
+    /// map, so a pair listed more than once keeps its last offset.
+    fn subscribe_partition_buckets(
+        &self,
+        subscriptions: Vec<ffi::FfiPartitionBucketSubscription>,
+    ) -> ffi::FfiResult {
+        use std::collections::HashMap;
+        let mut offsets: HashMap<(PartitionId, i32), i64> = HashMap::new();
+        for sub in subscriptions {
+            offsets.insert((sub.partition_id, sub.bucket_id), sub.offset);
+        }
+        dispatch_scanner!(self, subscribe_partition_buckets(&offsets))
+    }
+
+    /// Unsubscribes the active scanner from `bucket_id`.
+    fn unsubscribe(&self, bucket_id: i32) -> ffi::FfiResult {
+        dispatch_scanner!(self, unsubscribe(bucket_id))
+    }
+
+    /// Unsubscribes the active scanner from one bucket of one partition.
+    fn unsubscribe_partition(&self, partition_id: PartitionId, bucket_id: i32) -> ffi::FfiResult {
+        dispatch_scanner!(self, unsubscribe_partition(partition_id, bucket_id))
+    }
+
+    /// Polls the record-based scanner for up to `timeout_ms` milliseconds.
+    ///
+    /// Negative timeouts are clamped to zero. Returns a client error if this
+    /// scanner is the batch variant. On success the records are grouped per
+    /// bucket, with one `FfiBucketInfo` per group and the total record count.
+    fn poll(&self, timeout_ms: i64) -> Box<ScanResultInner> {
+        let inner = match &self.scanner {
+            ScannerKind::Record(inner) => inner,
+            _ => {
+                return Box::new(ScanResultInner::from_error(
+                    CLIENT_ERROR_CODE,
+                    "Record-based scanner not available".to_string(),
+                ));
+            }
+        };
+
+        let timeout = Duration::from_millis(timeout_ms.max(0) as u64);
+        let polled = RUNTIME.block_on(async { inner.poll(timeout).await });
+
+        let records = match polled {
+            Ok(records) => records,
+            Err(e) => {
+                let ffi_err = err_from_core_error(&e);
+                return Box::new(ScanResultInner::from_error(
+                    ffi_err.error_code,
+                    ffi_err.error_message,
+                ));
+            }
+        };
+
+        let columns = self.projected_columns.clone();
+        let mut total_count = 0usize;
+        let mut buckets = Vec::new();
+        let mut bucket_infos = Vec::new();
+        for (table_bucket, bucket_records) in records.into_records_by_buckets() {
+            let record_count = bucket_records.len();
+            total_count += record_count;
+            // A bucket without a partition reports partition_id 0 with
+            // has_partition_id = false.
+            let info = ffi::FfiBucketInfo {
+                table_id: table_bucket.table_id(),
+                bucket_id: table_bucket.bucket_id(),
+                has_partition_id: table_bucket.partition_id().is_some(),
+                partition_id: table_bucket.partition_id().unwrap_or(0),
+                record_count,
+            };
+            bucket_infos.push(info);
+            buckets.push((table_bucket, bucket_records));
+        }
+
+        Box::new(ScanResultInner {
+            error: None,
+            buckets,
+            columns,
+            bucket_infos,
+            total_count,
+        })
+    }
+
+    /// Polls the batch-based scanner for up to `timeout_ms` milliseconds.
+    ///
+    /// Negative timeouts are clamped to zero. Returns a client error if this
+    /// scanner is the record variant. Both poll failures and conversion
+    /// failures yield an empty batch list plus an error in `result`.
+    fn poll_record_batch(&self, timeout_ms: i64) -> ffi::FfiArrowRecordBatchesResult {
+        let inner_batch = match &self.scanner {
+            ScannerKind::Batch(inner_batch) => inner_batch,
+            _ => {
+                return ffi::FfiArrowRecordBatchesResult {
+                    result: client_err("Batch-based scanner not available".to_string()),
+                    arrow_batches: ffi::FfiArrowRecordBatches { batches: vec![] },
+                };
+            }
+        };
+
+        let timeout = Duration::from_millis(timeout_ms.max(0) as u64);
+        let polled = RUNTIME.block_on(async { inner_batch.poll(timeout).await });
+
+        let batches = match polled {
+            Ok(batches) => batches,
+            Err(e) => {
+                return ffi::FfiArrowRecordBatchesResult {
+                    result: err_from_core_error(&e),
+                    arrow_batches: ffi::FfiArrowRecordBatches { batches: vec![] },
+                };
+            }
+        };
+
+        match types::core_scan_batches_to_ffi(&batches) {
+            Ok(arrow_batches) => ffi::FfiArrowRecordBatchesResult {
+                result: ok_result(),
+                arrow_batches,
+            },
+            Err(e) => ffi::FfiArrowRecordBatchesResult {
+                result: client_err(e),
+                arrow_batches: ffi::FfiArrowRecordBatches { batches: vec![] },
+            },
+        }
+    }
+}
+
+// ============================================================================
+// Opaque types: GenericRowInner (write path)
+// ============================================================================
+
+/// Opaque write-path row handed across the FFI boundary.
+///
+/// Wraps a core `GenericRow<'static>`; the setters in this file store owned
+/// string and byte values in it.
+pub struct GenericRowInner {
+    // The wrapped core row; read by the writers and the lookuper in this file.
+    row: fcore::row::GenericRow<'static>,
+}
+
+/// Allocates a boxed `GenericRowInner` whose row is created with
+/// `GenericRow::new(field_count)`.
+fn new_generic_row(field_count: usize) -> Box<GenericRowInner> {
+    let row = fcore::row::GenericRow::new(field_count);
+    Box::new(GenericRowInner { row })
+}
+
+impl GenericRowInner {
+    /// Replaces the row with a fresh `GenericRow` of the same length.
+    fn gr_reset(&mut self) {
+        self.row = fcore::row::GenericRow::new(self.row.values.len());
+    }
+
+    /// Sets field `idx` to `Datum::Null`, growing the row if needed.
+    fn gr_set_null(&mut self, idx: usize) {
+        let datum = fcore::row::Datum::Null;
+        self.ensure_size(idx);
+        self.row.set_field(idx, datum);
+    }
+
+    /// Sets field `idx` to a boolean, growing the row if needed.
+    fn gr_set_bool(&mut self, idx: usize, val: bool) {
+        let datum = fcore::row::Datum::Bool(val);
+        self.ensure_size(idx);
+        self.row.set_field(idx, datum);
+    }
+
+    /// Sets field `idx` to a 32-bit integer, growing the row if needed.
+    fn gr_set_i32(&mut self, idx: usize, val: i32) {
+        let datum = fcore::row::Datum::Int32(val);
+        self.ensure_size(idx);
+        self.row.set_field(idx, datum);
+    }
+
+    /// Sets field `idx` to a 64-bit integer, growing the row if needed.
+    fn gr_set_i64(&mut self, idx: usize, val: i64) {
+        let datum = fcore::row::Datum::Int64(val);
+        self.ensure_size(idx);
+        self.row.set_field(idx, datum);
+    }
+
+    /// Sets field `idx` to a 32-bit float, growing the row if needed.
+    fn gr_set_f32(&mut self, idx: usize, val: f32) {
+        let datum = fcore::row::Datum::Float32(val.into());
+        self.ensure_size(idx);
+        self.row.set_field(idx, datum);
+    }
+
+    /// Sets field `idx` to a 64-bit float, growing the row if needed.
+    fn gr_set_f64(&mut self, idx: usize, val: f64) {
+        let datum = fcore::row::Datum::Float64(val.into());
+        self.ensure_size(idx);
+        self.row.set_field(idx, datum);
+    }
+
+    /// Sets field `idx` to an owned copy of `val`, growing the row if needed.
+    fn gr_set_str(&mut self, idx: usize, val: &str) {
+        let owned = std::borrow::Cow::Owned(val.to_string());
+        self.ensure_size(idx);
+        self.row.set_field(idx, fcore::row::Datum::String(owned));
+    }
+
+    /// Sets field `idx` to an owned copy of the bytes, growing the row if needed.
+    fn gr_set_bytes(&mut self, idx: usize, val: &[u8]) {
+        let owned = std::borrow::Cow::Owned(val.to_vec());
+        self.ensure_size(idx);
+        self.row.set_field(idx, fcore::row::Datum::Blob(owned));
+    }
+
+    /// Sets field `idx` to a date built from `days`, growing the row if needed.
+    fn gr_set_date(&mut self, idx: usize, days: i32) {
+        self.ensure_size(idx);
+        let date = fcore::row::Date::new(days);
+        self.row.set_field(idx, fcore::row::Datum::Date(date));
+    }
+
+    /// Sets field `idx` to a time built from `millis`, growing the row if needed.
+    fn gr_set_time(&mut self, idx: usize, millis: i32) {
+        self.ensure_size(idx);
+        let time = fcore::row::Time::new(millis);
+        self.row.set_field(idx, fcore::row::Datum::Time(time));
+    }
+
+    /// Sets field `idx` to a timestamp without time zone.
+    ///
+    /// If `from_millis_nanos` rejects the inputs, the value silently falls
+    /// back to a millis-only timestamp.
+    fn gr_set_ts_ntz(&mut self, idx: usize, millis: i64, nanos: i32) {
+        self.ensure_size(idx);
+        let ts = match fcore::row::TimestampNtz::from_millis_nanos(millis, nanos) {
+            Ok(ts) => ts,
+            Err(_) => fcore::row::TimestampNtz::new(millis),
+        };
+        self.row.set_field(idx, fcore::row::Datum::TimestampNtz(ts));
+    }
+
+    /// Sets field `idx` to a timestamp with local time zone.
+    ///
+    /// If `from_millis_nanos` rejects the inputs, the value silently falls
+    /// back to a millis-only timestamp.
+    fn gr_set_ts_ltz(&mut self, idx: usize, millis: i64, nanos: i32) {
+        self.ensure_size(idx);
+        let ts = match fcore::row::TimestampLtz::from_millis_nanos(millis, nanos) {
+            Ok(ts) => ts,
+            Err(_) => fcore::row::TimestampLtz::new(millis),
+        };
+        self.row.set_field(idx, fcore::row::Datum::TimestampLtz(ts));
+    }
+
+    /// Stores a decimal given as text.
+    ///
+    /// The value is kept as a string datum here; resolve_row_types() will
+    /// parse and validate it against the schema later.
+    fn gr_set_decimal_str(&mut self, idx: usize, val: &str) {
+        let owned = std::borrow::Cow::Owned(val.to_string());
+        self.ensure_size(idx);
+        self.row.set_field(idx, fcore::row::Datum::String(owned));
+    }
+
+    /// Moves the array built by `writer` into field `idx`.
+    ///
+    /// The row is grown first, then the writer is finalized via
+    /// `complete_if_needed` and its completed array is taken. Errors from
+    /// finalizing are propagated; a missing completed array is reported as an
+    /// invariant violation.
+    fn gr_set_array(&mut self, idx: usize, writer: &mut ArrayWriterInner) -> Result<(), String> {
+        self.ensure_size(idx);
+        writer.complete_if_needed()?;
+        match writer.completed.take() {
+            Some(arr) => {
+                self.row.set_field(idx, fcore::row::Datum::Array(arr));
+                Ok(())
+            }
+            None => Err(
+                "ArrayWriter invariant violation: completed array missing after finalize"
+                    .to_string(),
+            ),
+        }
+    }
+
+    /// Grows the row with `Datum::Null` so that index `idx` is valid.
+    /// Rows that already contain `idx` are left untouched.
+    fn ensure_size(&mut self, idx: usize) {
+        if idx >= self.row.values.len() {
+            self.row.values.resize(idx + 1, fcore::row::Datum::Null);
+        }
+    }
+}
+
+// ============================================================================
+// Shared row-reading helpers (used by both ScanResultInner and LookupResultInner)
+// ============================================================================
+
+mod row_reader {
+    use super::array_reader;
+    use fcore::row::InternalRow;
+    use fluss as fcore;
+
+    use crate::types;
+
+    /// Get column at `field`, or error if out of bounds.
+    fn get_column(
+        columns: &[fcore::metadata::Column],
+        field: usize,
+    ) -> Result<&fcore::metadata::Column, String> {
+        columns.get(field).ok_or_else(|| {
+            format!(
+                "field index {field} out of range ({} columns)",
+                columns.len()
+            )
+        })
+    }
+
+    /// Checks, in order, that `field` is in bounds, that the value is not
+    /// null, and that the column's data type satisfies `allowed`.
+    /// On success the data type is returned so callers can dispatch on it;
+    /// `getter` only labels the type-mismatch message.
+    fn validate<'a>(
+        row: &dyn InternalRow,
+        columns: &'a [fcore::metadata::Column],
+        field: usize,
+        getter: &str,
+        allowed: impl FnOnce(&fcore::metadata::DataType) -> bool,
+    ) -> Result<&'a fcore::metadata::DataType, String> {
+        let col = get_column(columns, field)?;
+
+        let value_is_null = row.is_null_at(field).map_err(|e| e.to_string())?;
+        if value_is_null {
+            return Err(format!("field {field} is null"));
+        }
+
+        let dt = col.data_type();
+        if allowed(dt) {
+            Ok(dt)
+        } else {
+            Err(format!(
+                "{getter}: column {field} has incompatible type {dt}"
+            ))
+        }
+    }
+
+    /// Returns the FFI type code of the column at `field`, or an
+    /// out-of-range error.
+    pub fn column_type(columns: &[fcore::metadata::Column], field: usize) -> Result<i32, String> {
+        let column = get_column(columns, field)?;
+        Ok(types::core_data_type_to_ffi(column.data_type()))
+    }
+
+    pub fn column_name(columns: &[fcore::metadata::Column], field: usize) -> Result<&str, String> {
+        Ok(get_column(columns, field)?.name())
+    }
+
+    /// Reports whether the value at `field` is null.
+    ///
+    /// The index is bounds-checked against `columns` first, so an
+    /// out-of-range field yields the column error rather than the row's.
+    pub fn is_null(
+        row: &dyn InternalRow,
+        columns: &[fcore::metadata::Column],
+        field: usize,
+    ) -> Result<bool, String> {
+        let _column = get_column(columns, field)?;
+        match row.is_null_at(field) {
+            Ok(null) => Ok(null),
+            Err(e) => Err(e.to_string()),
+        }
+    }
+
+    /// Reads a boolean from `field`; the column must be `Boolean` and the
+    /// value non-null.
+    pub fn get_bool(
+        row: &dyn InternalRow,
+        columns: &[fcore::metadata::Column],
+        field: usize,
+    ) -> Result<bool, String> {
+        let is_boolean =
+            |dt: &fcore::metadata::DataType| matches!(dt, fcore::metadata::DataType::Boolean(_));
+        validate(row, columns, field, "get_bool", is_boolean)?;
+        row.get_boolean(field).map_err(|e| e.to_string())
+    }
+
+    /// Reads an integer from `field` as `i32`.
+    ///
+    /// Accepts `TinyInt`, `SmallInt` and `Int` columns; the two narrower
+    /// types are widened with an `as i32` cast.
+    pub fn get_i32(
+        row: &dyn InternalRow,
+        columns: &[fcore::metadata::Column],
+        field: usize,
+    ) -> Result<i32, String> {
+        use fcore::metadata::DataType;
+
+        let dt = validate(row, columns, field, "get_i32", |dt| {
+            matches!(
+                dt,
+                DataType::TinyInt(_) | DataType::SmallInt(_) | DataType::Int(_)
+            )
+        })?;
+
+        if let DataType::TinyInt(_) = dt {
+            return row
+                .get_byte(field)
+                .map(|v| v as i32)
+                .map_err(|e| e.to_string());
+        }
+        if let DataType::SmallInt(_) = dt {
+            return row
+                .get_short(field)
+                .map(|v| v as i32)
+                .map_err(|e| e.to_string());
+        }
+        row.get_int(field).map_err(|e| e.to_string())
+    }
+
+    /// Reads a 64-bit integer from `field`; the column must be `BigInt` and
+    /// the value non-null.
+    pub fn get_i64(
+        row: &dyn InternalRow,
+        columns: &[fcore::metadata::Column],
+        field: usize,
+    ) -> Result<i64, String> {
+        let is_bigint =
+            |dt: &fcore::metadata::DataType| matches!(dt, fcore::metadata::DataType::BigInt(_));
+        validate(row, columns, field, "get_i64", is_bigint)?;
+        row.get_long(field).map_err(|e| e.to_string())
+    }
+
+    /// Reads a 32-bit float from `field`; the column must be `Float` and the
+    /// value non-null.
+    pub fn get_f32(
+        row: &dyn InternalRow,
+        columns: &[fcore::metadata::Column],
+        field: usize,
+    ) -> Result<f32, String> {
+        let is_float =
+            |dt: &fcore::metadata::DataType| matches!(dt, fcore::metadata::DataType::Float(_));
+        validate(row, columns, field, "get_f32", is_float)?;
+        row.get_float(field).map_err(|e| e.to_string())
+    }
+
+    /// Reads a 64-bit float from `field`; the column must be `Double` and the
+    /// value non-null.
+    pub fn get_f64(
+        row: &dyn InternalRow,
+        columns: &[fcore::metadata::Column],
+        field: usize,
+    ) -> Result<f64, String> {
+        let is_double =
+            |dt: &fcore::metadata::DataType| matches!(dt, fcore::metadata::DataType::Double(_));
+        validate(row, columns, field, "get_f64", is_double)?;
+        row.get_double(field).map_err(|e| e.to_string())
+    }
+
+    /// Reads a string slice from `field`.
+    ///
+    /// Accepts `Char` and `String` columns. `Char` values are read with
+    /// `get_char` using the column's declared length; everything else goes
+    /// through `get_string`.
+    pub fn get_str<'a>(
+        row: &'a dyn InternalRow,
+        columns: &'a [fcore::metadata::Column],
+        field: usize,
+    ) -> Result<&'a str, String> {
+        use fcore::metadata::DataType;
+
+        let dt = validate(row, columns, field, "get_str", |dt| {
+            matches!(dt, DataType::Char(_) | DataType::String(_))
+        })?;
+
+        if let DataType::Char(ct) = dt {
+            row.get_char(field, ct.length() as usize)
+                .map_err(|e| e.to_string())
+        } else {
+            row.get_string(field).map_err(|e| e.to_string())
+        }
+    }
+
+    /// Reads column `field` of `row` as a borrowed byte slice.
+    ///
+    /// Accepts `Binary` and `Bytes` columns. `Binary` columns are read
+    /// through `get_binary` with the column's declared length; everything
+    /// else that passed validation goes through `get_bytes`.
+    pub fn get_bytes<'a>(
+        row: &'a dyn InternalRow,
+        columns: &'a [fcore::metadata::Column],
+        field: usize,
+    ) -> Result<&'a [u8], String> {
+        use fcore::metadata::DataType;
+
+        let data_type = validate(row, columns, field, "get_bytes", |dt| {
+            matches!(dt, DataType::Binary(_) | DataType::Bytes(_))
+        })?;
+        if let DataType::Binary(binary_type) = data_type {
+            return row
+                .get_binary(field, binary_type.length())
+                .map_err(|err| err.to_string());
+        }
+        row.get_bytes(field).map_err(|err| err.to_string())
+    }
+
+    /// Reads a `Date` column and returns the inner `i32` of the date value
+    /// (as exposed by `get_inner`).
+    pub fn get_date_days(
+        row: &dyn InternalRow,
+        columns: &[fcore::metadata::Column],
+        field: usize,
+    ) -> Result<i32, String> {
+        use fcore::metadata::DataType;
+
+        validate(row, columns, field, "get_date_days", |dt| {
+            matches!(dt, DataType::Date(_))
+        })?;
+        let date = row.get_date(field).map_err(|err| err.to_string())?;
+        Ok(date.get_inner())
+    }
+
+    /// Reads a `Time` column and returns the inner `i32` of the time value
+    /// (as exposed by `get_inner`).
+    pub fn get_time_millis(
+        row: &dyn InternalRow,
+        columns: &[fcore::metadata::Column],
+        field: usize,
+    ) -> Result<i32, String> {
+        use fcore::metadata::DataType;
+
+        validate(row, columns, field, "get_time_millis", |dt| {
+            matches!(dt, DataType::Time(_))
+        })?;
+        let time = row.get_time(field).map_err(|err| err.to_string())?;
+        Ok(time.get_inner())
+    }
+
+    /// Reads the millisecond component of a timestamp column.
+    ///
+    /// `Timestamp` columns are read with `get_timestamp_ntz` and report
+    /// `get_millisecond()`; `TimestampLTz` columns are read with
+    /// `get_timestamp_ltz` and report `get_epoch_millisecond()`. Both use the
+    /// precision declared on the column type.
+    pub fn get_ts_millis(
+        row: &dyn InternalRow,
+        columns: &[fcore::metadata::Column],
+        field: usize,
+    ) -> Result<i64, String> {
+        use fcore::metadata::DataType;
+
+        let validated = validate(row, columns, field, "get_ts_millis", |dt| {
+            matches!(dt, DataType::Timestamp(_) | DataType::TimestampLTz(_))
+        })?;
+        match validated {
+            DataType::Timestamp(ntz) => {
+                let value = row
+                    .get_timestamp_ntz(field, ntz.precision())
+                    .map_err(|err| err.to_string())?;
+                Ok(value.get_millisecond())
+            }
+            DataType::TimestampLTz(ltz) => {
+                let value = row
+                    .get_timestamp_ltz(field, ltz.precision())
+                    .map_err(|err| err.to_string())?;
+                Ok(value.get_epoch_millisecond())
+            }
+            // Defensive fallback: the predicate above admits only the two
+            // timestamp variants.
+            dt => Err(format!("get_ts_millis: unexpected type {dt}")),
+        }
+    }
+
+    /// Reads the nano-of-millisecond component of a timestamp column.
+    ///
+    /// Works for both `Timestamp` and `TimestampLTz` columns, using the
+    /// precision declared on the column type to decode the value.
+    pub fn get_ts_nanos(
+        row: &dyn InternalRow,
+        columns: &[fcore::metadata::Column],
+        field: usize,
+    ) -> Result<i32, String> {
+        use fcore::metadata::DataType;
+
+        let validated = validate(row, columns, field, "get_ts_nanos", |dt| {
+            matches!(dt, DataType::Timestamp(_) | DataType::TimestampLTz(_))
+        })?;
+        match validated {
+            DataType::Timestamp(ntz) => {
+                let value = row
+                    .get_timestamp_ntz(field, ntz.precision())
+                    .map_err(|err| err.to_string())?;
+                Ok(value.get_nano_of_millisecond())
+            }
+            DataType::TimestampLTz(ltz) => {
+                let value = row
+                    .get_timestamp_ltz(field, ltz.precision())
+                    .map_err(|err| err.to_string())?;
+                Ok(value.get_nano_of_millisecond())
+            }
+            // Defensive fallback: the predicate above admits only the two
+            // timestamp variants.
+            dt => Err(format!("get_ts_nanos: unexpected type {dt}")),
+        }
+    }
+
+    /// Reports whether column `field` is declared as `TimestampLTz`.
+    ///
+    /// Only column metadata is consulted; an error from `get_column` is
+    /// propagated unchanged.
+    pub fn is_ts_ltz(columns: &[fcore::metadata::Column], field: usize) -> Result<bool, String> {
+        let column = get_column(columns, field)?;
+        let is_ltz = matches!(
+            column.data_type(),
+            fcore::metadata::DataType::TimestampLTz(_)
+        );
+        Ok(is_ltz)
+    }
+
+    /// Reads a `Decimal` column and renders it as a string.
+    ///
+    /// The value is decoded with the precision and scale declared on the
+    /// column type, converted with `to_big_decimal()` and then formatted via
+    /// `to_string()`.
+    pub fn get_decimal_str(
+        row: &dyn InternalRow,
+        columns: &[fcore::metadata::Column],
+        field: usize,
+    ) -> Result<String, String> {
+        use fcore::metadata::DataType;
+
+        let validated = validate(row, columns, field, "get_decimal_str", |dt| {
+            matches!(dt, DataType::Decimal(_))
+        })?;
+        match validated {
+            DataType::Decimal(decimal_type) => {
+                let precision = decimal_type.precision() as usize;
+                let scale = decimal_type.scale() as usize;
+                let value = row
+                    .get_decimal(field, precision, scale)
+                    .map_err(|err| err.to_string())?;
+                Ok(value.to_big_decimal().to_string())
+            }
+            // Defensive fallback: the predicate above admits only `Decimal`.
+            dt => Err(format!("get_decimal_str: unexpected type {dt}")),
+        }
+    }
+
+    /// Fetches the array value stored in column `field` of `row`.
+    ///
+    /// `validate` is run with the operation name `"get_array"` and a
+    /// predicate that accepts only `Array` columns.
+    fn get_fluss_array(
+        row: &dyn InternalRow,
+        columns: &[fcore::metadata::Column],
+        field: usize,
+    ) -> Result<fcore::row::binary_array::FlussArray, String> {
+        use fcore::metadata::DataType;
+
+        validate(row, columns, field, "get_array", |dt| {
+            matches!(dt, DataType::Array(_))
+        })?;
+        match row.get_array(field) {
+            Ok(array) => Ok(array),
+            Err(err) => Err(err.to_string()),
+        }
+    }
+
+    /// Returns the declared element type of the `Array` column `field`.
+    ///
+    /// Only column metadata is inspected. A non-array column produces an
+    /// error naming the column index and its actual type.
+    pub fn get_array_element_type(
+        columns: &[fcore::metadata::Column],
+        field: usize,
+    ) -> Result<&fcore::metadata::DataType, String> {
+        match get_column(columns, field)?.data_type() {
+            fcore::metadata::DataType::Array(array_type) => Ok(array_type.get_element_type()),
+            dt => Err(format!("get_array: column {field} is not Array, got {dt}")),
+        }
+    }
+
+    /// Returns the number of elements in the array stored in column `field`.
+    pub fn get_array_size(
+        row: &dyn InternalRow,
+        columns: &[fcore::metadata::Column],
+        field: usize,
+    ) -> Result<usize, String> {
+        get_fluss_array(row, columns, field).map(|array| array.size())
+    }
+
+    /// Resolves the array stored in column `field` together with the
+    /// column's declared element type.
+    ///
+    /// The array is fetched first (via `get_fluss_array`); the element type
+    /// is looked up only when that succeeds.
+    pub fn get_array_and_elem_type<'a>(
+        row: &dyn InternalRow,
+        columns: &'a [fcore::metadata::Column],
+        field: usize,
+    ) -> Result<
+        (
+            fcore::row::binary_array::FlussArray,
+            &'a fcore::metadata::DataType,
+        ),
+        String,
+    > {
+        let array = get_fluss_array(row, columns, field)?;
+        match get_array_element_type(columns, field) {
+            Ok(element_type) => Ok((array, element_type)),
+            Err(message) => Err(message),
+        }
+    }
+
+    /// Reports whether element `element` of the array in column `field` is
+    /// null, delegating the check to `array_reader::is_null`.
+    pub fn get_array_is_null(
+        row: &dyn InternalRow,
+        columns: &[fcore::metadata::Column],
+        field: usize,
+        element: usize,
+    ) -> Result<bool, String> {
+        get_fluss_array(row, columns, field)
+            .and_then(|array| array_reader::is_null(&array, element))
+    }
+
+    /// Reads element `element` of the array in column `field` as a `bool`
+    /// by forwarding to `array_reader::get_bool`.
+    pub fn get_array_bool(
+        row: &dyn InternalRow,
+        columns: &[fcore::metadata::Column],
+        field: usize,
+        element: usize,
+    ) -> Result<bool, String> {
+        match get_array_and_elem_type(row, columns, field) {
+            Ok((array, element_type)) => array_reader::get_bool(&array, element_type, element),
+            Err(message) => Err(message),
+        }
+    }
+
+    /// Reads element `element` of the array in column `field` as an `i32`
+    /// by forwarding to `array_reader::get_i32`.
+    pub fn get_array_i32(
+        row: &dyn InternalRow,
+        columns: &[fcore::metadata::Column],
+        field: usize,
+        element: usize,
+    ) -> Result<i32, String> {
+        match get_array_and_elem_type(row, columns, field) {
+            Ok((array, element_type)) => array_reader::get_i32(&array, element_type, element),
+            Err(message) => Err(message),
+        }
+    }
+
+    /// Reads element `element` of the array in column `field` as an `i64`
+    /// by forwarding to `array_reader::get_i64`.
+    pub fn get_array_i64(
+        row: &dyn InternalRow,
+        columns: &[fcore::metadata::Column],
+        field: usize,
+        element: usize,
+    ) -> Result<i64, String> {
+        match get_array_and_elem_type(row, columns, field) {
+            Ok((array, element_type)) => array_reader::get_i64(&array, element_type, element),
+            Err(message) => Err(message),
+        }
+    }
+
+    /// Reads element `element` of the array in column `field` as an `f32`
+    /// by forwarding to `array_reader::get_f32`.
+    pub fn get_array_f32(
+        row: &dyn InternalRow,
+        columns: &[fcore::metadata::Column],
+        field: usize,
+        element: usize,
+    ) -> Result<f32, String> {
+        match get_array_and_elem_type(row, columns, field) {
+            Ok((array, element_type)) => array_reader::get_f32(&array, element_type, element),
+            Err(message) => Err(message),
+        }
+    }
+
+    /// Reads element `element` of the array in column `field` as an `f64`
+    /// by forwarding to `array_reader::get_f64`.
+    pub fn get_array_f64(
+        row: &dyn InternalRow,
+        columns: &[fcore::metadata::Column],
+        field: usize,
+        element: usize,
+    ) -> Result<f64, String> {
+        match get_array_and_elem_type(row, columns, field) {
+            Ok((array, element_type)) => array_reader::get_f64(&array, element_type, element),
+            Err(message) => Err(message),
+        }
+    }
+
+    /// Reads element `element` of the array in column `field` as an owned
+    /// `String` by forwarding to `array_reader::get_str`.
+    pub fn get_array_str(
+        row: &dyn InternalRow,
+        columns: &[fcore::metadata::Column],
+        field: usize,
+        element: usize,
+    ) -> Result<String, String> {
+        match get_array_and_elem_type(row, columns, field) {
+            Ok((array, element_type)) => array_reader::get_str(&array, element_type, element),
+            Err(message) => Err(message),
+        }
+    }
+
+    /// Reads element `element` of the array in column `field` as an owned
+    /// byte vector by forwarding to `array_reader::get_bytes`.
+    pub fn get_array_bytes(
+        row: &dyn InternalRow,
+        columns: &[fcore::metadata::Column],
+        field: usize,
+        element: usize,
+    ) -> Result<Vec<u8>, String> {
+        match get_array_and_elem_type(row, columns, field) {
+            Ok((array, element_type)) => array_reader::get_bytes(&array, element_type, element),
+            Err(message) => Err(message),
+        }
+    }
+
+    /// Reads element `element` of the array in column `field` as a date
+    /// value by forwarding to `array_reader::get_date_days`.
+    pub fn get_array_date_days(
+        row: &dyn InternalRow,
+        columns: &[fcore::metadata::Column],
+        field: usize,
+        element: usize,
+    ) -> Result<i32, String> {
+        get_array_and_elem_type(row, columns, field).and_then(|(array, element_type)| {
+            array_reader::get_date_days(&array, element_type, element)
+        })
+    }
+
+    /// Reads element `element` of the array in column `field` as a time
+    /// value by forwarding to `array_reader::get_time_millis`.
+    pub fn get_array_time_millis(
+        row: &dyn InternalRow,
+        columns: &[fcore::metadata::Column],
+        field: usize,
+        element: usize,
+    ) -> Result<i32, String> {
+        get_array_and_elem_type(row, columns, field).and_then(|(array, element_type)| {
+            array_reader::get_time_millis(&array, element_type, element)
+        })
+    }
+
+    /// Reads the millisecond component of timestamp element `element` of the
+    /// array in column `field` by forwarding to
+    /// `array_reader::get_ts_millis`.
+    pub fn get_array_ts_millis(
+        row: &dyn InternalRow,
+        columns: &[fcore::metadata::Column],
+        field: usize,
+        element: usize,
+    ) -> Result<i64, String> {
+        get_array_and_elem_type(row, columns, field).and_then(|(array, element_type)| {
+            array_reader::get_ts_millis(&array, element_type, element)
+        })
+    }
+
+    /// Reads the nano-of-millisecond component of timestamp element
+    /// `element` of the array in column `field` by forwarding to
+    /// `array_reader::get_ts_nanos`.
+    pub fn get_array_ts_nanos(
+        row: &dyn InternalRow,
+        columns: &[fcore::metadata::Column],
+        field: usize,
+        element: usize,
+    ) -> Result<i32, String> {
+        get_array_and_elem_type(row, columns, field).and_then(|(array, element_type)| {
+            array_reader::get_ts_nanos(&array, element_type, element)
+        })
+    }
+
+    /// Reads decimal element `element` of the array in column `field` and
+    /// returns its string rendering by forwarding to
+    /// `array_reader::get_decimal_str`.
+    pub fn get_array_decimal_str(
+        row: &dyn InternalRow,
+        columns: &[fcore::metadata::Column],
+        field: usize,
+        element: usize,
+    ) -> Result<String, String> {
+        get_array_and_elem_type(row, columns, field).and_then(|(array, element_type)| {
+            array_reader::get_decimal_str(&array, element_type, element)
+        })
+    }
+
+    /// Returns the FFI type id of the element type of `Array` column
+    /// `field`, as computed by `crate::types::core_data_type_to_ffi`.
+    pub fn get_array_element_type_id(
+        columns: &[fcore::metadata::Column],
+        field: usize,
+    ) -> Result<i32, String> {
+        get_array_element_type(columns, field)
+            .map(|element_type| crate::types::core_data_type_to_ffi(element_type))
+    }
+}
+
+// ============================================================================
+// array_reader — low-level accessors over an already-resolved FlussArray
+//
+// Shared by the top-level `row_reader::get_array_*` wrappers and by
+// `ArrayViewInner` (which exposes recursive/nested access to C++). Keeping
+// one implementation here guarantees identical bounds-checking, null
+// validation, type checking, and type dispatch across flat and nested reads.
+// ============================================================================
+
+mod array_reader {
+    use super::fcore;
+    use fcore::metadata::DataType;
+    use fcore::row::binary_array::FlussArray;
+
+    /// Fails unless `element` is a valid index into `arr`.
+    ///
+    /// `op` is the operation name used as the error-message prefix.
+    fn validate_index(arr: &FlussArray, element: usize, op: &str) -> Result<(), String> {
+        let size = arr.size();
+        if element >= size {
+            return Err(format!(
+                "{op}: element index out of bounds: element={element}, size={}",
+                size
+            ));
+        }
+        Ok(())
+    }
+
+    /// Fails when the element at `element` is null.
+    ///
+    /// Callers are expected to have bounds-checked `element` already.
+    fn ensure_non_null(arr: &FlussArray, element: usize, op: &str) -> Result<(), String> {
+        if !arr.is_null_at(element) {
+            return Ok(());
+        }
+        Err(format!(
+            "{op}: element at index {element} is null; call array_is_null first"
+        ))
+    }
+
+    /// Fails unless `allowed` accepts `elem_type`.
+    ///
+    /// `expected` is the human-readable type list used in the error message.
+    fn ensure_type(
+        elem_type: &DataType,
+        op: &str,
+        expected: &str,
+        allowed: impl FnOnce(&DataType) -> bool,
+    ) -> Result<(), String> {
+        if !allowed(elem_type) {
+            return Err(format!(
+                "{op}: element type is {elem_type}, expected {expected}"
+            ));
+        }
+        Ok(())
+    }
+
+    /// Runs the three pre-read checks in a fixed order: index bounds, then
+    /// element type, then non-null. The first failure is returned.
+    fn ensure_readable(
+        arr: &FlussArray,
+        elem_type: &DataType,
+        element: usize,
+        op: &str,
+        expected: &str,
+        allowed: impl FnOnce(&DataType) -> bool,
+    ) -> Result<(), String> {
+        validate_index(arr, element, op)
+            .and_then(|()| ensure_type(elem_type, op, expected, allowed))
+            .and_then(|()| ensure_non_null(arr, element, op))
+    }
+
+    /// Reports whether the element at `element` is null, after a bounds
+    /// check.
+    pub fn is_null(arr: &FlussArray, element: usize) -> Result<bool, String> {
+        validate_index(arr, element, "array_is_null").map(|()| arr.is_null_at(element))
+    }
+
+    /// Reads a `Boolean` element.
+    pub fn get_bool(
+        arr: &FlussArray,
+        elem_type: &DataType,
+        element: usize,
+    ) -> Result<bool, String> {
+        ensure_readable(arr, elem_type, element, "array_bool", "BOOLEAN", |dt| {
+            matches!(dt, DataType::Boolean(_))
+        })?;
+        let value = arr.get_boolean(element).map_err(|err| err.to_string())?;
+        Ok(value)
+    }
+
+    /// Reads a `TinyInt`, `SmallInt` or `Int` element, widening the two
+    /// narrower types to `i32`.
+    pub fn get_i32(
+        arr: &FlussArray,
+        elem_type: &DataType,
+        element: usize,
+    ) -> Result<i32, String> {
+        ensure_readable(
+            arr,
+            elem_type,
+            element,
+            "array_i32",
+            "TINYINT/SMALLINT/INT",
+            |dt| {
+                matches!(
+                    dt,
+                    DataType::TinyInt(_) | DataType::SmallInt(_) | DataType::Int(_)
+                )
+            },
+        )?;
+        match elem_type {
+            DataType::Int(_) => arr.get_int(element).map_err(|err| err.to_string()),
+            DataType::SmallInt(_) => {
+                let short = arr.get_short(element).map_err(|err| err.to_string())?;
+                Ok(short as i32)
+            }
+            DataType::TinyInt(_) => {
+                let byte = arr.get_byte(element).map_err(|err| err.to_string())?;
+                Ok(byte as i32)
+            }
+            _ => unreachable!("type validated by ensure_readable"),
+        }
+    }
+
+    /// Reads a `BigInt` element.
+    pub fn get_i64(
+        arr: &FlussArray,
+        elem_type: &DataType,
+        element: usize,
+    ) -> Result<i64, String> {
+        ensure_readable(arr, elem_type, element, "array_i64", "BIGINT", |dt| {
+            matches!(dt, DataType::BigInt(_))
+        })?;
+        let value = arr.get_long(element).map_err(|err| err.to_string())?;
+        Ok(value)
+    }
+
+    /// Reads a `Float` element.
+    pub fn get_f32(
+        arr: &FlussArray,
+        elem_type: &DataType,
+        element: usize,
+    ) -> Result<f32, String> {
+        ensure_readable(arr, elem_type, element, "array_f32", "FLOAT", |dt| {
+            matches!(dt, DataType::Float(_))
+        })?;
+        let value = arr.get_float(element).map_err(|err| err.to_string())?;
+        Ok(value)
+    }
+
+    /// Reads a `Double` element.
+    pub fn get_f64(
+        arr: &FlussArray,
+        elem_type: &DataType,
+        element: usize,
+    ) -> Result<f64, String> {
+        ensure_readable(arr, elem_type, element, "array_f64", "DOUBLE", |dt| {
+            matches!(dt, DataType::Double(_))
+        })?;
+        let value = arr.get_double(element).map_err(|err| err.to_string())?;
+        Ok(value)
+    }
+
+    /// Reads a `String` or `Char` element and returns it as an owned
+    /// `String`.
+    pub fn get_str(
+        arr: &FlussArray,
+        elem_type: &DataType,
+        element: usize,
+    ) -> Result<String, String> {
+        ensure_readable(arr, elem_type, element, "array_str", "STRING/CHAR", |dt| {
+            matches!(dt, DataType::String(_) | DataType::Char(_))
+        })?;
+        let text = arr.get_string(element).map_err(|err| err.to_string())?;
+        Ok(text.to_string())
+    }
+
+    /// Reads a `Bytes` or `Binary` element and returns it as an owned byte
+    /// vector.
+    pub fn get_bytes(
+        arr: &FlussArray,
+        elem_type: &DataType,
+        element: usize,
+    ) -> Result<Vec<u8>, String> {
+        ensure_readable(
+            arr,
+            elem_type,
+            element,
+            "array_bytes",
+            "BYTES/BINARY",
+            |dt| matches!(dt, DataType::Bytes(_) | DataType::Binary(_)),
+        )?;
+        let bytes = arr.get_binary(element).map_err(|err| err.to_string())?;
+        Ok(bytes.to_vec())
+    }
+
+    /// Reads a `Date` element and returns its inner `i32` (via
+    /// `get_inner`).
+    pub fn get_date_days(
+        arr: &FlussArray,
+        elem_type: &DataType,
+        element: usize,
+    ) -> Result<i32, String> {
+        ensure_readable(arr, elem_type, element, "array_date", "DATE", |dt| {
+            matches!(dt, DataType::Date(_))
+        })?;
+        let date = arr.get_date(element).map_err(|err| err.to_string())?;
+        Ok(date.get_inner())
+    }
+
+    /// Reads a `Time` element and returns its inner `i32` (via
+    /// `get_inner`).
+    pub fn get_time_millis(
+        arr: &FlussArray,
+        elem_type: &DataType,
+        element: usize,
+    ) -> Result<i32, String> {
+        ensure_readable(arr, elem_type, element, "array_time", "TIME", |dt| {
+            matches!(dt, DataType::Time(_))
+        })?;
+        let time = arr.get_time(element).map_err(|err| err.to_string())?;
+        Ok(time.get_inner())
+    }
+
+    /// Reads the millisecond component of a `Timestamp` or `TimestampLTz`
+    /// element, decoded with the precision declared on the element type.
+    pub fn get_ts_millis(
+        arr: &FlussArray,
+        elem_type: &DataType,
+        element: usize,
+    ) -> Result<i64, String> {
+        ensure_readable(
+            arr,
+            elem_type,
+            element,
+            "array_ts_millis",
+            "TIMESTAMP/TIMESTAMP_LTZ",
+            |dt| matches!(dt, DataType::Timestamp(_) | DataType::TimestampLTz(_)),
+        )?;
+        match elem_type {
+            DataType::Timestamp(ntz) => {
+                let value = arr
+                    .get_timestamp_ntz(element, ntz.precision())
+                    .map_err(|err| err.to_string())?;
+                Ok(value.get_millisecond())
+            }
+            DataType::TimestampLTz(ltz) => {
+                let value = arr
+                    .get_timestamp_ltz(element, ltz.precision())
+                    .map_err(|err| err.to_string())?;
+                Ok(value.get_epoch_millisecond())
+            }
+            _ => unreachable!("type validated by ensure_readable"),
+        }
+    }
+
+    /// Reads the nano-of-millisecond component of a `Timestamp` or
+    /// `TimestampLTz` element.
+    pub fn get_ts_nanos(
+        arr: &FlussArray,
+        elem_type: &DataType,
+        element: usize,
+    ) -> Result<i32, String> {
+        ensure_readable(
+            arr,
+            elem_type,
+            element,
+            "array_ts_nanos",
+            "TIMESTAMP/TIMESTAMP_LTZ",
+            |dt| matches!(dt, DataType::Timestamp(_) | DataType::TimestampLTz(_)),
+        )?;
+        match elem_type {
+            DataType::Timestamp(ntz) => {
+                let value = arr
+                    .get_timestamp_ntz(element, ntz.precision())
+                    .map_err(|err| err.to_string())?;
+                Ok(value.get_nano_of_millisecond())
+            }
+            DataType::TimestampLTz(ltz) => {
+                let value = arr
+                    .get_timestamp_ltz(element, ltz.precision())
+                    .map_err(|err| err.to_string())?;
+                Ok(value.get_nano_of_millisecond())
+            }
+            _ => unreachable!("type validated by ensure_readable"),
+        }
+    }
+
+    /// Reads a `Decimal` element with the precision and scale declared on
+    /// the element type and returns its `to_big_decimal()` string rendering.
+    pub fn get_decimal_str(
+        arr: &FlussArray,
+        elem_type: &DataType,
+        element: usize,
+    ) -> Result<String, String> {
+        ensure_readable(arr, elem_type, element, "array_decimal", "DECIMAL", |dt| {
+            matches!(dt, DataType::Decimal(_))
+        })?;
+        match elem_type {
+            DataType::Decimal(decimal_type) => {
+                let value = arr
+                    .get_decimal(element, decimal_type.precision(), decimal_type.scale())
+                    .map_err(|err| err.to_string())?;
+                Ok(value.to_big_decimal().to_string())
+            }
+            _ => unreachable!("type validated by ensure_readable"),
+        }
+    }
+
+    /// Reads an `Array` element and returns the nested array together with
+    /// an owned clone of its element type.
+    pub fn get_nested_array(
+        arr: &FlussArray,
+        elem_type: &DataType,
+        element: usize,
+    ) -> Result<(FlussArray, DataType), String> {
+        ensure_readable(arr, elem_type, element, "array_nested", "ARRAY", |dt| {
+            matches!(dt, DataType::Array(_))
+        })?;
+        match elem_type {
+            DataType::Array(array_type) => {
+                let inner = arr.get_array(element).map_err(|err| err.to_string())?;
+                let inner_type = array_type.get_element_type().clone();
+                Ok((inner, inner_type))
+            }
+            _ => unreachable!("type validated by ensure_readable"),
+        }
+    }
+}
+
+// ============================================================================
+// Macros that generate uniform sv_/lv_ array element getters (thin wrappers
+// that only forward to `row_reader::get_array_*`).
+// ============================================================================
+
+// Expands each `method, reader_fn, ReturnType;` triple into a method that
+// resolves the record at (bucket, rec) and forwards its row, the result's
+// columns, `field` and `element` to `row_reader::<reader_fn>`.
+macro_rules! sv_array_element_getters {
+    ($( $method:ident, $reader_fn:ident, $ret:ty; )+) => {
+        $(
+            fn $method(
+                &self,
+                bucket: usize,
+                rec: usize,
+                field: usize,
+                element: usize,
+            ) -> Result<$ret, String> {
+                let row = self.resolve(bucket, rec).row();
+                row_reader::$reader_fn(row, &self.columns, field, element)
+            }
+        )+
+    };
+}
+
+// Expands each `method, reader_fn, ReturnType;` triple into a method that
+// obtains the row via `self.lv_row()` (propagating its error) and forwards
+// it, the result's columns, `field` and `element` to
+// `row_reader::<reader_fn>`.
+macro_rules! lv_array_element_getters {
+    ($( $method:ident, $reader_fn:ident, $ret:ty; )+) => {
+        $(
+            fn $method(&self, field: usize, element: usize) -> Result<$ret, String> {
+                row_reader::$reader_fn(self.lv_row()?, &self.columns, field, element)
+            }
+        )+
+    };
+}
+
+// ============================================================================
+// Opaque types: ScanResultInner (scan read path)
+// ============================================================================
+
+/// Opaque holder for the outcome of a scan: either an error or the scanned
+/// records plus the column metadata needed to decode them.
+pub struct ScanResultInner {
+    // `(code, message)` pair; set by `from_error`, read by the
+    // `sv_has_error` / `sv_error_code` / `sv_error_message` accessors.
+    error: Option<(i32, String)>,
+    // Scan records grouped per table bucket; `resolve(bucket, rec)` indexes
+    // first into this vector, then into the bucket's record list.
+    buckets: Vec<(fcore::metadata::TableBucket, Vec<fcore::record::ScanRecord>)>,
+    // Column metadata passed to the `row_reader` helpers by every accessor.
+    columns: Vec<fcore::metadata::Column>,
+    // Returned by reference from `sv_bucket_infos`.
+    bucket_infos: Vec<ffi::FfiBucketInfo>,
+    // Value reported by `sv_record_count`.
+    total_count: usize,
+}
+
+impl ScanResultInner {
+    /// Builds a result that carries only an error: no buckets, no columns
+    /// and a record count of zero.
+    fn from_error(code: i32, msg: String) -> Self {
+        let error = Some((code, msg));
+        Self {
+            error,
+            buckets: Vec::new(),
+            columns: Vec::new(),
+            bucket_infos: Vec::new(),
+            total_count: 0,
+        }
+    }
+
+    /// Returns the record at position `rec` of bucket `bucket`.
+    ///
+    /// Uses plain indexing, so it panics when either index is out of range.
+    fn resolve(&self, bucket: usize, rec: usize) -> &fcore::record::ScanRecord {
+        let (_, records) = &self.buckets[bucket];
+        &records[rec]
+    }
+
+    /// True when this result was built with an error.
+    fn sv_has_error(&self) -> bool {
+        self.error.is_some()
+    }
+
+    /// Error code, or 0 when there is no error.
+    fn sv_error_code(&self) -> i32 {
+        match &self.error {
+            Some((code, _)) => *code,
+            None => 0,
+        }
+    }
+
+    /// Error message, or the empty string when there is no error.
+    fn sv_error_message(&self) -> &str {
+        match &self.error {
+            Some((_, message)) => message.as_str(),
+            None => "",
+        }
+    }
+
+    /// Stored total record count.
+    fn sv_record_count(&self) -> usize {
+        self.total_count
+    }
+
+    /// Number of columns in the result schema.
+    fn sv_column_count(&self) -> usize {
+        self.columns.len()
+    }
+
+    /// Name of column `field`, via `row_reader::column_name`.
+    fn sv_column_name(&self, field: usize) -> Result<&str, String> {
+        let columns = &self.columns;
+        row_reader::column_name(columns, field)
+    }
+
+    /// Type id of column `field`, via `row_reader::column_type`.
+    fn sv_column_type(&self, field: usize) -> Result<i32, String> {
+        let columns = &self.columns;
+        row_reader::column_type(columns, field)
+    }
+
+    /// Offset of the record at (bucket, rec).
+    fn sv_offset(&self, bucket: usize, rec: usize) -> i64 {
+        let record = self.resolve(bucket, rec);
+        record.offset()
+    }
+
+    /// Timestamp of the record at (bucket, rec).
+    fn sv_timestamp(&self, bucket: usize, rec: usize) -> i64 {
+        let record = self.resolve(bucket, rec);
+        record.timestamp()
+    }
+
+    /// Change type of the record at (bucket, rec), as its byte value
+    /// widened to `i32`.
+    fn sv_change_type(&self, bucket: usize, rec: usize) -> i32 {
+        let record = self.resolve(bucket, rec);
+        record.change_type().to_byte_value() as i32
+    }
+
+    /// Number of fields per record; same value as `sv_column_count`.
+    fn sv_field_count(&self) -> usize {
+        self.columns.len()
+    }
+
+    // Field accessors — C++ validates bounds in BucketRecords/RecordAt, validate() checks field.
+    // Each one resolves the record's row and delegates to the `row_reader`
+    // helper of the same name.
+    fn sv_is_null(&self, bucket: usize, rec: usize, field: usize) -> Result<bool, String> {
+        let row = self.resolve(bucket, rec).row();
+        row_reader::is_null(row, &self.columns, field)
+    }
+
+    fn sv_get_bool(&self, bucket: usize, rec: usize, field: usize) -> Result<bool, String> {
+        let row = self.resolve(bucket, rec).row();
+        row_reader::get_bool(row, &self.columns, field)
+    }
+
+    fn sv_get_i32(&self, bucket: usize, rec: usize, field: usize) -> Result<i32, String> {
+        let row = self.resolve(bucket, rec).row();
+        row_reader::get_i32(row, &self.columns, field)
+    }
+
+    fn sv_get_i64(&self, bucket: usize, rec: usize, field: usize) -> Result<i64, String> {
+        let row = self.resolve(bucket, rec).row();
+        row_reader::get_i64(row, &self.columns, field)
+    }
+
+    fn sv_get_f32(&self, bucket: usize, rec: usize, field: usize) -> Result<f32, String> {
+        let row = self.resolve(bucket, rec).row();
+        row_reader::get_f32(row, &self.columns, field)
+    }
+
+    fn sv_get_f64(&self, bucket: usize, rec: usize, field: usize) -> Result<f64, String> {
+        let row = self.resolve(bucket, rec).row();
+        row_reader::get_f64(row, &self.columns, field)
+    }
+
+    fn sv_get_str(&self, bucket: usize, rec: usize, field: usize) -> Result<&str, String> {
+        let row = self.resolve(bucket, rec).row();
+        row_reader::get_str(row, &self.columns, field)
+    }
+
+    fn sv_get_bytes(&self, bucket: usize, rec: usize, field: usize) -> Result<&[u8], String> {
+        let row = self.resolve(bucket, rec).row();
+        row_reader::get_bytes(row, &self.columns, field)
+    }
+
+    fn sv_get_date_days(&self, bucket: usize, rec: usize, field: usize) -> Result<i32, String> {
+        let row = self.resolve(bucket, rec).row();
+        row_reader::get_date_days(row, &self.columns, field)
+    }
+
+    fn sv_get_time_millis(&self, bucket: usize, rec: usize, field: usize) -> Result<i32, String> {
+        let row = self.resolve(bucket, rec).row();
+        row_reader::get_time_millis(row, &self.columns, field)
+    }
+
+    fn sv_get_ts_millis(&self, bucket: usize, rec: usize, field: usize) -> Result<i64, String> {
+        let row = self.resolve(bucket, rec).row();
+        row_reader::get_ts_millis(row, &self.columns, field)
+    }
+
+    fn sv_get_ts_nanos(&self, bucket: usize, rec: usize, field: usize) -> Result<i32, String> {
+        let row = self.resolve(bucket, rec).row();
+        row_reader::get_ts_nanos(row, &self.columns, field)
+    }
+
+    // Schema-only check: the bucket/record indices are accepted but unused.
+    fn sv_is_ts_ltz(&self, _bucket: usize, _rec: usize, field: usize) -> Result<bool, String> {
+        let columns = &self.columns;
+        row_reader::is_ts_ltz(columns, field)
+    }
+
+    fn sv_get_decimal_str(
+        &self,
+        bucket: usize,
+        rec: usize,
+        field: usize,
+    ) -> Result<String, String> {
+        let row = self.resolve(bucket, rec).row();
+        row_reader::get_decimal_str(row, &self.columns, field)
+    }
+
+    /// Number of elements in the array stored in `field` of the record at
+    /// (bucket, rec).
+    fn sv_get_array_size(&self, bucket: usize, rec: usize, field: usize) -> Result<usize, String> {
+        let row = self.resolve(bucket, rec).row();
+        row_reader::get_array_size(row, &self.columns, field)
+    }
+
+    // Per-element array getters, generated as thin forwards to
+    // `row_reader::get_array_*`.
+    sv_array_element_getters! {
+        sv_get_array_is_null, get_array_is_null, bool;
+        sv_get_array_bool,    get_array_bool,    bool;
+        sv_get_array_i32,     get_array_i32,     i32;
+        sv_get_array_i64,     get_array_i64,     i64;
+        sv_get_array_f32,     get_array_f32,     f32;
+        sv_get_array_f64,     get_array_f64,     f64;
+        sv_get_array_str,     get_array_str,     String;
+        sv_get_array_bytes,   get_array_bytes,   Vec<u8>;
+        sv_get_array_date_days,   get_array_date_days,   i32;
+        sv_get_array_time_millis, get_array_time_millis, i32;
+        sv_get_array_ts_millis,   get_array_ts_millis,   i64;
+        sv_get_array_ts_nanos,    get_array_ts_nanos,    i32;
+        sv_get_array_decimal_str, get_array_decimal_str, String;
+    }
+
+    /// FFI type id of the element type of array column `field`.
+    fn sv_get_array_element_type(&self, field: usize) -> Result<i32, String> {
+        let columns = &self.columns;
+        row_reader::get_array_element_type_id(columns, field)
+    }
+
+    /// Builds a boxed `ArrayViewInner` over the array stored in `field` of
+    /// the record at (bucket, rec); the view owns the array and a clone of
+    /// its element type.
+    fn sv_get_array_view(
+        &self,
+        bucket: usize,
+        rec: usize,
+        field: usize,
+    ) -> Result<Box<ArrayViewInner>, String> {
+        let row = self.resolve(bucket, rec).row();
+        row_reader::get_array_and_elem_type(row, &self.columns, field).map(|(array, elem)| {
+            Box::new(ArrayViewInner {
+                array,
+                element_type: elem.clone(),
+            })
+        })
+    }
+
+    /// Borrowed view of the stored per-bucket info entries.
+    fn sv_bucket_infos(&self) -> &Vec<ffi::FfiBucketInfo> {
+        &self.bucket_infos
+    }
+}
+
+// ============================================================================
+// Opaque types: LookupResultInner (lookup read path)
+// ============================================================================
+
+/// Opaque holder for the outcome of a lookup: either an error or an
+/// optional row plus the column metadata needed to decode it.
+pub struct LookupResultInner {
+    // `(code, message)` pair; set by `from_error`, read by the
+    // `lv_has_error` / `lv_error_code` / `lv_error_message` accessors.
+    error: Option<(i32, String)>,
+    // Value reported by `lv_found`.
+    found: bool,
+    // The looked-up row; when `None`, `lv_row` (and every accessor built on
+    // it) returns an error.
+    row: Option<fcore::row::GenericRow<'static>>,
+    // Column metadata passed to the `row_reader` helpers by every accessor.
+    columns: Vec<fcore::metadata::Column>,
+}
+
+impl LookupResultInner {
+    fn from_error(code: i32, msg: String) -> Self {
+        Self {
+            error: Some((code, msg)),
+            found: false,
+            row: None,
+            columns: Vec::new(),
+        }
+    }
+
+    /// True when this result carries an error.
+    fn lv_has_error(&self) -> bool {
+        self.error.is_some()
+    }
+
+    fn lv_error_code(&self) -> i32 {
+        self.error.as_ref().map_or(0, |e| e.0)
+    }
+
+    fn lv_error_message(&self) -> &str {
+        self.error.as_ref().map_or("", |e| e.1.as_str())
+    }
+
+    /// Returns the stored `found` flag.
+    fn lv_found(&self) -> bool {
+        self.found
+    }
+
+    /// Number of columns in the stored schema.
+    fn lv_field_count(&self) -> usize {
+        self.columns.len()
+    }
+
+    /// Type id of column `field`, via `row_reader::column_type`.
+    fn lv_column_type(&self, field: usize) -> Result<i32, String> {
+        let columns = &self.columns;
+        row_reader::column_type(columns, field)
+    }
+
+    /// Name of column `field`, via `row_reader::column_name`.
+    fn lv_column_name(&self, field: usize) -> Result<&str, String> {
+        let columns = &self.columns;
+        row_reader::column_name(columns, field)
+    }
+
+    fn lv_row(&self) -> Result<&fcore::row::GenericRow<'static>, String> {
+        self.row
+            .as_ref()
+            .ok_or_else(|| "no row available (not found or error)".to_string())
+    }
+
+    // Field accessors — delegate to shared row_reader helpers.
+    fn lv_is_null(&self, field: usize) -> Result<bool, String> {
+        let r = self.lv_row()?;
+        row_reader::is_null(r, &self.columns, field)
+    }
+    /// Reads `field` of the stored row via `row_reader::get_bool`.
+    fn lv_get_bool(&self, field: usize) -> Result<bool, String> {
+        row_reader::get_bool(self.lv_row()?, &self.columns, field)
+    }
+    /// Reads `field` of the stored row via `row_reader::get_i32`.
+    fn lv_get_i32(&self, field: usize) -> Result<i32, String> {
+        row_reader::get_i32(self.lv_row()?, &self.columns, field)
+    }
+    fn lv_get_i64(&self, field: usize) -> Result<i64, String> {
+        let r = self.lv_row()?;
+        row_reader::get_i64(r, &self.columns, field)
+    }
+    fn lv_get_f32(&self, field: usize) -> Result<f32, String> {
+        let r = self.lv_row()?;
+        row_reader::get_f32(r, &self.columns, field)
+    }
+    fn lv_get_f64(&self, field: usize) -> Result<f64, String> {
+        let r = self.lv_row()?;
+        row_reader::get_f64(r, &self.columns, field)
+    }
+    fn lv_get_str(&self, field: usize) -> Result<&str, String> {
+        let r = self.lv_row()?;
+        row_reader::get_str(r, &self.columns, field)
+    }
+    fn lv_get_bytes(&self, field: usize) -> Result<&[u8], String> {
+        let r = self.lv_row()?;
+        row_reader::get_bytes(r, &self.columns, field)
+    }
+    fn lv_get_date_days(&self, field: usize) -> Result<i32, String> {
+        let r = self.lv_row()?;
+        row_reader::get_date_days(r, &self.columns, field)
+    }
+    fn lv_get_time_millis(&self, field: usize) -> Result<i32, String> {
+        let r = self.lv_row()?;
+        row_reader::get_time_millis(r, &self.columns, field)
+    }
+    fn lv_get_ts_millis(&self, field: usize) -> Result<i64, String> {
+        let r = self.lv_row()?;
+        row_reader::get_ts_millis(r, &self.columns, field)
+    }
+    fn lv_get_ts_nanos(&self, field: usize) -> Result<i32, String> {
+        let r = self.lv_row()?;
+        row_reader::get_ts_nanos(r, &self.columns, field)
+    }
+    fn lv_is_ts_ltz(&self, field: usize) -> Result<bool, String> {
+        row_reader::is_ts_ltz(&self.columns, field)
+    }
+    fn lv_get_decimal_str(&self, field: usize) -> Result<String, String> {
+        let r = self.lv_row()?;
+        row_reader::get_decimal_str(r, &self.columns, field)
+    }
+    fn lv_get_array_size(&self, field: usize) -> Result<usize, String> {
+        let r = self.lv_row()?;
+        row_reader::get_array_size(r, &self.columns, field)
+    }
+    lv_array_element_getters! {
+        lv_get_array_is_null, get_array_is_null, bool;
+        lv_get_array_bool,    get_array_bool,    bool;
+        lv_get_array_i32,     get_array_i32,     i32;
+        lv_get_array_i64,     get_array_i64,     i64;
+        lv_get_array_f32,     get_array_f32,     f32;
+        lv_get_array_f64,     get_array_f64,     f64;
+        lv_get_array_str,     get_array_str,     String;
+        lv_get_array_bytes,   get_array_bytes,   Vec<u8>;
+        lv_get_array_date_days,   get_array_date_days,   i32;
+        lv_get_array_time_millis, get_array_time_millis, i32;
+        lv_get_array_ts_millis,   get_array_ts_millis,   i64;
+        lv_get_array_ts_nanos,    get_array_ts_nanos,    i32;
+        lv_get_array_decimal_str, get_array_decimal_str, String;
+    }
+    fn lv_get_array_element_type(&self, field: usize) -> Result<i32, String> {
+        row_reader::get_array_element_type_id(&self.columns, field)
+    }
+    fn lv_get_array_view(&self, field: usize) -> Result<Box<ArrayViewInner>, String> {
+        let r = self.lv_row()?;
+        let (arr, elem) = row_reader::get_array_and_elem_type(r, &self.columns, field)?;
+        Ok(Box::new(ArrayViewInner {
+            array: arr,
+            element_type: elem.clone(),
+        }))
+    }
+}
+
+// ============================================================================
+// Opaque types: ArrayViewInner (recursive array reader)
+//
+// Wraps an owned `FlussArray` plus its element `DataType` and exposes the
+// same accessors as `row_reader::get_array_*`, delegating to the shared
+// `array_reader` primitives. Enables C++ bindings to recurse into nested
+// arrays without per-level FFI scaffolding.
+// ============================================================================
+
+/// Read-only view over one array value together with its element type.
+pub struct ArrayViewInner {
+    // The array being read; owned by the view.
+    array: fcore::row::binary_array::FlussArray,
+    // Type of each element; forwarded to `array_reader` by the typed getters.
+    element_type: fcore::metadata::DataType,
+}
+
+// Every typed getter below forwards the array, the element type and the
+// element index to the `array_reader` function of the same name and returns
+// its `Result` unchanged.
+impl ArrayViewInner {
+    /// Number of elements in the array.
+    fn av_size(&self) -> usize {
+        self.array.size()
+    }
+
+    /// Element type converted to its FFI type id.
+    fn av_element_type_id(&self) -> i32 {
+        crate::types::core_data_type_to_ffi(&self.element_type)
+    }
+
+    /// Null check for one element; the only getter that does not need the element type.
+    fn av_is_null(&self, element: usize) -> Result<bool, String> {
+        array_reader::is_null(&self.array, element)
+    }
+
+    fn av_get_bool(&self, element: usize) -> Result<bool, String> {
+        array_reader::get_bool(&self.array, &self.element_type, element)
+    }
+
+    fn av_get_i32(&self, element: usize) -> Result<i32, String> {
+        array_reader::get_i32(&self.array, &self.element_type, element)
+    }
+
+    fn av_get_i64(&self, element: usize) -> Result<i64, String> {
+        array_reader::get_i64(&self.array, &self.element_type, element)
+    }
+
+    fn av_get_f32(&self, element: usize) -> Result<f32, String> {
+        array_reader::get_f32(&self.array, &self.element_type, element)
+    }
+
+    fn av_get_f64(&self, element: usize) -> Result<f64, String> {
+        array_reader::get_f64(&self.array, &self.element_type, element)
+    }
+
+    /// Returns the element as an owned `String`.
+    fn av_get_str(&self, element: usize) -> Result<String, String> {
+        array_reader::get_str(&self.array, &self.element_type, element)
+    }
+
+    /// Returns the element as an owned byte vector.
+    fn av_get_bytes(&self, element: usize) -> Result<Vec<u8>, String> {
+        array_reader::get_bytes(&self.array, &self.element_type, element)
+    }
+
+    fn av_get_date_days(&self, element: usize) -> Result<i32, String> {
+        array_reader::get_date_days(&self.array, &self.element_type, element)
+    }
+
+    fn av_get_time_millis(&self, element: usize) -> Result<i32, String> {
+        array_reader::get_time_millis(&self.array, &self.element_type, element)
+    }
+
+    // A timestamp element is read as two parts: `ts_millis` plus `ts_nanos`.
+    fn av_get_ts_millis(&self, element: usize) -> Result<i64, String> {
+        array_reader::get_ts_millis(&self.array, &self.element_type, element)
+    }
+
+    fn av_get_ts_nanos(&self, element: usize) -> Result<i32, String> {
+        array_reader::get_ts_nanos(&self.array, &self.element_type, element)
+    }
+
+    fn av_get_decimal_str(&self, element: usize) -> Result<String, String> {
+        array_reader::get_decimal_str(&self.array, &self.element_type, element)
+    }
+
+    /// Returns a new boxed view over the nested array stored at `element`,
+    /// built from the array and inner element type that
+    /// `array_reader::get_nested_array` returns.
+    fn av_get_nested(&self, element: usize) -> Result<Box<ArrayViewInner>, String> {
+        let (arr, elem) = array_reader::get_nested_array(&self.array, &self.element_type, element)?;
+        Ok(Box::new(ArrayViewInner {
+            array: arr,
+            element_type: elem,
+        }))
+    }
+}
+
+// ============================================================================
+// Opaque types: ArrayWriterInner (array builder for writes)
+// ============================================================================
+
+/// Builder for one array value.
+///
+/// Starts out holding a live `writer`. Finalizing (see `complete_if_needed`)
+/// takes the writer out and stores the finished array in `completed`.
+pub struct ArrayWriterInner {
+    // Live writer; `None` once it has been taken for finalization.
+    writer: Option<fcore::row::binary_array::FlussArrayWriter>,
+    // Finished array; `None` until finalization succeeds.
+    completed: Option<fcore::row::binary_array::FlussArray>,
+    // Element type the typed setters validate against.
+    element_type: fcore::metadata::DataType,
+    // Element count fixed at construction; upper bound for index checks.
+    num_elements: usize,
+}
+
+/// Creates a boxed `ArrayWriterInner` for an array of `size` elements.
+///
+/// The element type is rebuilt from its flattened FFI description
+/// (`element_leaf_type_id`, `precision`, `scale`, `array_nesting`) by
+/// `types::element_type_from_ffi`. If that conversion fails, its error is
+/// returned as a string and no writer is created.
+fn new_array_writer(
+    size: usize,
+    element_leaf_type_id: i32,
+    precision: u32,
+    scale: u32,
+    array_nesting: u32,
+) -> Result<Box<ArrayWriterInner>, String> {
+    let element_type =
+        match types::element_type_from_ffi(element_leaf_type_id, precision, scale, array_nesting) {
+            Ok(resolved) => resolved,
+            Err(err) => return Err(err.to_string()),
+        };
+    // The writer is built before `element_type` is moved into the struct.
+    let live_writer = fcore::row::binary_array::FlussArrayWriter::new(size, &element_type);
+    let inner = ArrayWriterInner {
+        writer: Some(live_writer),
+        completed: None,
+        element_type,
+        num_elements: size,
+    };
+    Ok(Box::new(inner))
+}
+
+impl ArrayWriterInner {
+    /// Mutable access to the live writer; fails once the writer has been taken.
+    fn writer_mut(&mut self) -> Result<&mut fcore::row::binary_array::FlussArrayWriter, String> {
+        match self.writer.as_mut() {
+            Some(live) => Ok(live),
+            None => Err("ArrayWriter is already finalized".to_string()),
+        }
+    }
+
+    /// Rejects any index at or beyond the element count fixed at construction.
+    fn validate_index(&self, idx: usize) -> Result<(), String> {
+        if idx >= self.num_elements {
+            return Err(format!(
+                "ArrayWriter index out of bounds: idx={idx}, size={}",
+                self.num_elements
+            ));
+        }
+        Ok(())
+    }
+
+    /// Finalizes the writer into `completed` unless that has already happened.
+    /// The writer is taken before `complete()` runs, so it is gone afterwards
+    /// whether or not completion succeeded.
+    fn complete_if_needed(&mut self) -> Result<(), String> {
+        if self.completed.is_some() {
+            return Ok(());
+        }
+        let pending = match self.writer.take() {
+            Some(w) => w,
+            None => return Err("ArrayWriter has already been finalized".to_string()),
+        };
+        let finished = pending.complete().map_err(|e| e.to_string())?;
+        self.completed = Some(finished);
+        Ok(())
+    }
+
+    /// Liveness is checked before bounds, so misuse after finalization is
+    /// reported as a finalization error rather than an index error.
+    fn ensure_writable(&self, idx: usize) -> Result<(), String> {
+        match &self.writer {
+            Some(_) => self.validate_index(idx),
+            None => Err("ArrayWriter is already finalized".to_string()),
+        }
+    }
+
+    /// Formats the shared "type mismatch" message for a setter that expected
+    /// the `expected` element type(s).
+    fn mismatch_error(&self, expected: &str) -> String {
+        format!(
+            "ArrayWriter type mismatch: expected {expected} element, got {}",
+            self.element_type
+        )
+    }
+
+    /// Element count fixed at construction.
+    fn aw_size(&self) -> usize {
+        self.num_elements
+    }
+
+    /// Marks element `idx` as null; valid for any element type.
+    fn aw_set_null(&mut self, idx: usize) -> Result<(), String> {
+        self.ensure_writable(idx)?;
+        let target = self.writer_mut()?;
+        target.set_null_at(idx);
+        Ok(())
+    }
+
+    // Typed setters. Each one checks writability and bounds first, then the
+    // element type, and only then writes.
+
+    fn aw_set_bool(&mut self, idx: usize, val: bool) -> Result<(), String> {
+        self.ensure_writable(idx)?;
+        match &self.element_type {
+            fcore::metadata::DataType::Boolean(_) => {
+                self.writer_mut()?.write_boolean(idx, val);
+                Ok(())
+            }
+            _ => Err(self.mismatch_error("BOOLEAN")),
+        }
+    }
+
+    /// Accepts TINYINT, SMALLINT and INT elements; the value must fit the
+    /// narrower types or the call fails.
+    fn aw_set_i32(&mut self, idx: usize, val: i32) -> Result<(), String> {
+        self.ensure_writable(idx)?;
+        match &self.element_type {
+            fcore::metadata::DataType::TinyInt(_) => {
+                let narrow = match i8::try_from(val) {
+                    Ok(n) => n,
+                    Err(_) => return Err(format!("Value {val} does not fit TINYINT element")),
+                };
+                self.writer_mut()?.write_byte(idx, narrow);
+            }
+            fcore::metadata::DataType::SmallInt(_) => {
+                let narrow = match i16::try_from(val) {
+                    Ok(n) => n,
+                    Err(_) => return Err(format!("Value {val} does not fit SMALLINT element")),
+                };
+                self.writer_mut()?.write_short(idx, narrow);
+            }
+            fcore::metadata::DataType::Int(_) => {
+                self.writer_mut()?.write_int(idx, val);
+            }
+            _ => return Err(self.mismatch_error("TINYINT/SMALLINT/INT")),
+        }
+        Ok(())
+    }
+
+    fn aw_set_i64(&mut self, idx: usize, val: i64) -> Result<(), String> {
+        self.ensure_writable(idx)?;
+        match &self.element_type {
+            fcore::metadata::DataType::BigInt(_) => {
+                self.writer_mut()?.write_long(idx, val);
+                Ok(())
+            }
+            _ => Err(self.mismatch_error("BIGINT")),
+        }
+    }
+
+    fn aw_set_f32(&mut self, idx: usize, val: f32) -> Result<(), String> {
+        self.ensure_writable(idx)?;
+        match &self.element_type {
+            fcore::metadata::DataType::Float(_) => {
+                self.writer_mut()?.write_float(idx, val);
+                Ok(())
+            }
+            _ => Err(self.mismatch_error("FLOAT")),
+        }
+    }
+
+    fn aw_set_f64(&mut self, idx: usize, val: f64) -> Result<(), String> {
+        self.ensure_writable(idx)?;
+        match &self.element_type {
+            fcore::metadata::DataType::Double(_) => {
+                self.writer_mut()?.write_double(idx, val);
+                Ok(())
+            }
+            _ => Err(self.mismatch_error("DOUBLE")),
+        }
+    }
+
+    fn aw_set_str(&mut self, idx: usize, val: &str) -> Result<(), String> {
+        self.ensure_writable(idx)?;
+        match &self.element_type {
+            fcore::metadata::DataType::String(_) | fcore::metadata::DataType::Char(_) => {
+                self.writer_mut()?.write_string(idx, val);
+                Ok(())
+            }
+            _ => Err(self.mismatch_error("STRING/CHAR")),
+        }
+    }
+
+    fn aw_set_bytes(&mut self, idx: usize, val: &[u8]) -> Result<(), String> {
+        self.ensure_writable(idx)?;
+        match &self.element_type {
+            fcore::metadata::DataType::Bytes(_) | fcore::metadata::DataType::Binary(_) => {
+                self.writer_mut()?.write_binary_bytes(idx, val);
+                Ok(())
+            }
+            _ => Err(self.mismatch_error("BYTES/BINARY")),
+        }
+    }
+
+    fn aw_set_date(&mut self, idx: usize, days: i32) -> Result<(), String> {
+        self.ensure_writable(idx)?;
+        match &self.element_type {
+            fcore::metadata::DataType::Date(_) => {
+                let date = fcore::row::Date::new(days);
+                self.writer_mut()?.write_date(idx, date);
+                Ok(())
+            }
+            _ => Err(self.mismatch_error("DATE")),
+        }
+    }
+
+    fn aw_set_time(&mut self, idx: usize, millis: i32) -> Result<(), String> {
+        self.ensure_writable(idx)?;
+        match &self.element_type {
+            fcore::metadata::DataType::Time(_) => {
+                let time = fcore::row::Time::new(millis);
+                self.writer_mut()?.write_time(idx, time);
+                Ok(())
+            }
+            _ => Err(self.mismatch_error("TIME")),
+        }
+    }
+
+    /// Writes a TIMESTAMP element using the precision declared by the
+    /// element type.
+    fn aw_set_ts_ntz(&mut self, idx: usize, millis: i64, nanos: i32) -> Result<(), String> {
+        self.ensure_writable(idx)?;
+        let precision = if let fcore::metadata::DataType::Timestamp(ts_type) = &self.element_type {
+            ts_type.precision()
+        } else {
+            return Err(self.mismatch_error("TIMESTAMP"));
+        };
+        let value = match fcore::row::TimestampNtz::from_millis_nanos(millis, nanos) {
+            Ok(v) => v,
+            Err(e) => return Err(e.to_string()),
+        };
+        self.writer_mut()?.write_timestamp_ntz(idx, &value, precision);
+        Ok(())
+    }
+
+    /// Writes a TIMESTAMP_LTZ element using the precision declared by the
+    /// element type.
+    fn aw_set_ts_ltz(&mut self, idx: usize, millis: i64, nanos: i32) -> Result<(), String> {
+        self.ensure_writable(idx)?;
+        let precision =
+            if let fcore::metadata::DataType::TimestampLTz(ts_type) = &self.element_type {
+                ts_type.precision()
+            } else {
+                return Err(self.mismatch_error("TIMESTAMP_LTZ"));
+            };
+        let value = match fcore::row::TimestampLtz::from_millis_nanos(millis, nanos) {
+            Ok(v) => v,
+            Err(e) => return Err(e.to_string()),
+        };
+        self.writer_mut()?.write_timestamp_ltz(idx, &value, precision);
+        Ok(())
+    }
+
+    /// Parses `val` as a decimal and converts it with the precision and scale
+    /// declared by the element type before writing it.
+    fn aw_set_decimal_str(&mut self, idx: usize, val: &str) -> Result<(), String> {
+        self.ensure_writable(idx)?;
+        let (precision, scale) =
+            if let fcore::metadata::DataType::Decimal(dec_type) = &self.element_type {
+                (dec_type.precision(), dec_type.scale())
+            } else {
+                return Err(self.mismatch_error("DECIMAL"));
+            };
+        let parsed = bigdecimal::BigDecimal::from_str(val).map_err(|e| e.to_string())?;
+        let decimal = match fcore::row::Decimal::from_big_decimal(parsed, precision, scale) {
+            Ok(d) => d,
+            Err(e) => return Err(e.to_string()),
+        };
+        self.writer_mut()?.write_decimal(idx, &decimal, precision);
+        Ok(())
+    }
+
+    /// Stores the array built by `nested` at element `idx`.
+    ///
+    /// `nested` is finalized as a side effect, but only after both type
+    /// checks have passed, so a rejected call leaves it writable.
+    fn aw_set_array(&mut self, idx: usize, nested: &mut ArrayWriterInner) -> Result<(), String> {
+        self.ensure_writable(idx)?;
+        let expected_inner =
+            if let fcore::metadata::DataType::Array(array_type) = &self.element_type {
+                array_type.get_element_type()
+            } else {
+                return Err(self.mismatch_error("ARRAY"));
+            };
+        if !structurally_compatible(expected_inner, &nested.element_type) {
+            return Err(format!(
+                "Nested ArrayWriter type mismatch: expected nested element type {}, got {}",
+                expected_inner, nested.element_type
+            ));
+        }
+        nested.complete_if_needed()?;
+        match nested.completed.as_ref() {
+            Some(finished) => {
+                self.writer_mut()?.write_array(idx, finished);
+                Ok(())
+            }
+            None => Err(
+                "ArrayWriter invariant violation: nested completed array missing after finalize"
+                    .to_string(),
+            ),
+        }
+    }
+}
+
+/// Decides whether two element types are structurally the same.
+///
+/// The variant must match. Where a type carries parameters, those must match
+/// too: precision for timestamps, length for CHAR/BINARY, precision and scale
+/// for DECIMAL, and (recursively) the element type for ARRAY. Nullability is
+/// deliberately ignored, because the Rust-side element type is always
+/// reconstructed as nullable and the encoding does not depend on it. Any
+/// variant not listed here is never considered compatible.
+fn structurally_compatible(a: &fcore::metadata::DataType, b: &fcore::metadata::DataType) -> bool {
+    use fcore::metadata::DataType;
+    match a {
+        // Parameterized types: compare the parameters as well as the variant.
+        DataType::Array(lhs) => match b {
+            DataType::Array(rhs) => {
+                structurally_compatible(lhs.get_element_type(), rhs.get_element_type())
+            }
+            _ => false,
+        },
+        DataType::Decimal(lhs) => match b {
+            DataType::Decimal(rhs) => {
+                lhs.precision() == rhs.precision() && lhs.scale() == rhs.scale()
+            }
+            _ => false,
+        },
+        DataType::Timestamp(lhs) => {
+            matches!(b, DataType::Timestamp(rhs) if lhs.precision() == rhs.precision())
+        }
+        DataType::TimestampLTz(lhs) => {
+            matches!(b, DataType::TimestampLTz(rhs) if lhs.precision() == rhs.precision())
+        }
+        DataType::Char(lhs) => matches!(b, DataType::Char(rhs) if lhs.length() == rhs.length()),
+        DataType::Binary(lhs) => {
+            matches!(b, DataType::Binary(rhs) if lhs.length() == rhs.length())
+        }
+        // Parameter-free types: the variant alone decides.
+        DataType::Boolean(_) => matches!(b, DataType::Boolean(_)),
+        DataType::TinyInt(_) => matches!(b, DataType::TinyInt(_)),
+        DataType::SmallInt(_) => matches!(b, DataType::SmallInt(_)),
+        DataType::Int(_) => matches!(b, DataType::Int(_)),
+        DataType::BigInt(_) => matches!(b, DataType::BigInt(_)),
+        DataType::Float(_) => matches!(b, DataType::Float(_)),
+        DataType::Double(_) => matches!(b, DataType::Double(_)),
+        DataType::String(_) => matches!(b, DataType::String(_)),
+        DataType::Bytes(_) => matches!(b, DataType::Bytes(_)),
+        DataType::Date(_) => matches!(b, DataType::Date(_)),
+        DataType::Time(_) => matches!(b, DataType::Time(_)),
+        _ => false,
+    }
+}
diff --git a/fluss-rust/bindings/cpp/src/table.cpp b/fluss-rust/bindings/cpp/src/table.cpp
new file mode 100644
index 0000000000..f389f7ac90
--- /dev/null
+++ b/fluss-rust/bindings/cpp/src/table.cpp
@@ -0,0 +1,1619 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <arrow/c/bridge.h>
+
+#include <cassert>
+#include <ctime>
+
+#include "ffi_converter.hpp"
+#include "fluss.hpp"
+#include "lib.rs.h"
+#include "rust/cxx.h"
+// TODO: bindings/cpp/BUILD.bazel does not yet declare Arrow include/link dependencies,
+// so in Bazel environments that do not already provide Arrow this include fails at
+// compile/link time.
+#include <arrow/record_batch.h>
+
+namespace fluss {
+
+static constexpr int kSecondsPerDay = 24 * 60 * 60;
+
+// Portable UTC calendar conversions.
+//
+// These used to wrap ::timegm / ::gmtime_r, with _mkgmtime / gmtime_s on Windows.
+// The Windows CRT functions reject instants before 1970-01-01, so dates before the
+// epoch were silently converted to wrong values there. The arithmetic below uses
+// the proleptic Gregorian calendar and behaves the same on every platform.
+
+// Number of days from 1970-01-01 to the given civil date (negative before the epoch).
+// `month` must be in [1, 12]. `day` may fall outside the month; it is applied as a
+// plain day offset, so day 0 is the last day of the previous month.
+static long long days_from_civil(long long year, long long month, long long day) {
+    // Count years from March so the leap day is the last day of the shifted year.
+    year -= (month <= 2) ? 1 : 0;
+    const long long era = (year >= 0 ? year : year - 399) / 400;
+    const long long year_of_era = year - era * 400;  // [0, 399]
+    const long long shifted_month = month > 2 ? month - 3 : month + 9;  // March = 0
+    const long long day_of_year = (153 * shifted_month + 2) / 5 + day - 1;
+    const long long day_of_era =
+        year_of_era * 365 + year_of_era / 4 - year_of_era / 100 + day_of_year;
+    return era * 146097 + day_of_era - 719468;
+}
+
+// Converts seconds since the Unix epoch into broken-down UTC time.
+// Fills the standard std::tm fields, including tm_wday and tm_yday; tm_isdst is 0.
+static std::tm gmtime_utc(std::time_t epoch_seconds) {
+    const long long total = static_cast<long long>(epoch_seconds);
+    // Floor division: the seconds-of-day part must be non-negative for negative inputs.
+    long long days = total / 86400;
+    long long second_of_day = total % 86400;
+    if (second_of_day < 0) {
+        second_of_day += 86400;
+        --days;
+    }
+
+    // Inverse of days_from_civil.
+    const long long shifted = days + 719468;
+    const long long era = (shifted >= 0 ? shifted : shifted - 146096) / 146097;
+    const long long day_of_era = shifted - era * 146097;  // [0, 146096]
+    const long long year_of_era =
+        (day_of_era - day_of_era / 1460 + day_of_era / 36524 - day_of_era / 146096) / 365;
+    const long long day_of_year =
+        day_of_era - (365 * year_of_era + year_of_era / 4 - year_of_era / 100);  // March-based
+    const long long shifted_month = (5 * day_of_year + 2) / 153;                 // March = 0
+    const long long day = day_of_year - (153 * shifted_month + 2) / 5 + 1;
+    const long long month = shifted_month < 10 ? shifted_month + 3 : shifted_month - 9;
+    const long long year = year_of_era + era * 400 + (month <= 2 ? 1 : 0);
+
+    std::tm tm{};
+    tm.tm_year = static_cast<int>(year - 1900);
+    tm.tm_mon = static_cast<int>(month - 1);
+    tm.tm_mday = static_cast<int>(day);
+    tm.tm_hour = static_cast<int>(second_of_day / 3600);
+    tm.tm_min = static_cast<int>((second_of_day % 3600) / 60);
+    tm.tm_sec = static_cast<int>(second_of_day % 60);
+    // 1970-01-01 was a Thursday (tm_wday == 4); +11 keeps the operand non-negative.
+    tm.tm_wday = static_cast<int>(((days % 7) + 11) % 7);
+    tm.tm_yday = static_cast<int>(days - days_from_civil(year, 1, 1));
+    tm.tm_isdst = 0;
+    return tm;
+}
+
+// Converts broken-down UTC time into seconds since the Unix epoch.
+// As with timegm, out-of-range fields are accepted (month 12 means January of the
+// next year) and *tm is rewritten in normalized form.
+static std::time_t timegm_utc(std::tm* tm) {
+    // Fold an out-of-range month into the year so days_from_civil sees [1, 12].
+    long long month_index = tm->tm_mon;
+    long long year = 1900LL + tm->tm_year + month_index / 12;
+    month_index %= 12;
+    if (month_index < 0) {
+        month_index += 12;
+        --year;
+    }
+    const long long days = days_from_civil(year, month_index + 1, tm->tm_mday);
+    const long long seconds =
+        days * 86400 + tm->tm_hour * 3600LL + tm->tm_min * 60LL + tm->tm_sec;
+    const std::time_t result = static_cast<std::time_t>(seconds);
+    *tm = gmtime_utc(result);
+    return result;
+}
+
+// Broken-down UTC time at midnight of the given day count since 1970-01-01.
+// Shared by the Date accessors below.
+static std::tm utc_tm_for_epoch_day(std::time_t epoch_day) {
+    return gmtime_utc(epoch_day * kSecondsPerDay);
+}
+
+// Builds a Date from a calendar year, month (1-12) and day of month, read as UTC.
+Date Date::FromYMD(int year, int month, int day) {
+    std::tm fields{};
+    fields.tm_mday = day;
+    fields.tm_mon = month - 1;     // std::tm months are 0-based
+    fields.tm_year = year - 1900;  // std::tm years count from 1900
+    const std::time_t midnight_utc = timegm_utc(&fields);
+    return {static_cast<int32_t>(midnight_utc / kSecondsPerDay)};
+}
+
+// Calendar year of this date.
+int Date::Year() const {
+    const std::tm parts = utc_tm_for_epoch_day(static_cast<std::time_t>(days_since_epoch));
+    return parts.tm_year + 1900;
+}
+
+// Calendar month of this date, 1-12.
+int Date::Month() const {
+    const std::tm parts = utc_tm_for_epoch_day(static_cast<std::time_t>(days_since_epoch));
+    return parts.tm_mon + 1;
+}
+
+// Day of month of this date.
+int Date::Day() const {
+    const std::tm parts = utc_tm_for_epoch_day(static_cast<std::time_t>(days_since_epoch));
+    return parts.tm_mday;
+}
+
+// Guard for member functions that dereference `inner_`: throws std::logic_error
+// when `inner_` is null. `name` must be a string literal, because it is joined to
+// the message by literal concatenation.
+// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
+#define CHECK_INNER(name)                                                                 \
+    do {                                                                                  \
+        if (!inner_) throw std::logic_error(name ": not available (moved-from or null)"); \
+    } while (0)
+
+// ============================================================================
+// ArrayWriter — builder for array values backed by Rust ArrayWriterInner
+// ============================================================================
+
+// Creates a writer for `size` elements of `element_type`.
+//
+// The Rust side takes the element type in flattened form: a leaf type id with its
+// precision and scale, plus the number of array levels wrapped around it. When the
+// element is itself an array, the leaf description comes from the flattened type;
+// otherwise it comes from `element_type_` directly.
+ArrayWriter::ArrayWriter(size_t size, DataType element_type) : element_type_(std::move(element_type)) {
+    const auto flat = utils::flatten_array_type(element_type_);
+    const bool element_is_array = flat.nesting > 0;
+
+    const int32_t leaf_type_id =
+        element_is_array ? flat.leaf_type : static_cast<int32_t>(element_type_.id());
+    const auto leaf_precision =
+        static_cast<uint32_t>(element_is_array ? flat.leaf_precision : element_type_.precision());
+    const auto leaf_scale =
+        static_cast<uint32_t>(element_is_array ? flat.leaf_scale : element_type_.scale());
+    const auto array_nesting = static_cast<uint32_t>(flat.nesting);
+
+    // Take ownership of the Rust object as a raw pointer; Destroy() hands it back.
+    inner_ = ffi::new_array_writer(size, leaf_type_id, leaf_precision, leaf_scale, array_nesting)
+                 .into_raw();
+}
+
+// Releases the underlying Rust writer, if any.
+ArrayWriter::~ArrayWriter() noexcept { Destroy(); }
+
+// Frees the Rust object and leaves this writer unavailable. Safe to call repeatedly.
+void ArrayWriter::Destroy() noexcept {
+    if (inner_) {
+        // Re-wrap the raw pointer in a temporary rust::Box; the temporary is
+        // destroyed at the end of the statement, which drops the Rust object.
+        rust::Box<ffi::ArrayWriterInner>::from_raw(inner_);
+        inner_ = nullptr;
+    }
+}
+
+// Move construction transfers the Rust object; `other` becomes unavailable.
+ArrayWriter::ArrayWriter(ArrayWriter&& other) noexcept
+    : inner_(other.inner_), element_type_(std::move(other.element_type_)) {
+    other.inner_ = nullptr;
+}
+
+// Move assignment releases the current Rust object first, then takes over
+// `other`'s. Self-assignment is a no-op.
+ArrayWriter& ArrayWriter::operator=(ArrayWriter&& other) noexcept {
+    if (this != &other) {
+        Destroy();
+        inner_ = other.inner_;
+        element_type_ = std::move(other.element_type_);
+        other.inner_ = nullptr;
+    }
+    return *this;
+}
+
+// True while this writer still owns a Rust object (not moved-from, not destroyed).
+bool ArrayWriter::Available() const { return inner_ != nullptr; }
+
+// Number of elements the writer was created for.
+// Precondition: Available(). This is only asserted, so in builds with assertions
+// disabled a call on an unavailable writer dereferences a null pointer.
+size_t ArrayWriter::Size() const noexcept {
+    assert(inner_ && "ArrayWriter::Size called on moved-from instance");
+    return inner_->aw_size();
+}
+
+// Guard used by the ArrayWriter setters: throws std::logic_error when `inner_` is
+// null. `name` must be a string literal (it is joined to the message by literal
+// concatenation).
+// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
+#define CHECK_AW(name)                                                                    \
+    do {                                                                                  \
+        if (!inner_) throw std::logic_error(name ": not available (moved-from or null)"); \
+    } while (0)
+
+// Element setters. Each one first checks that the writer is available and then
+// forwards to the matching Rust `aw_set_*` method, which validates the index and
+// the element type.
+// NOTE(review): failures reported by the Rust side presumably surface as
+// rust::Error exceptions (cxx's mapping of Result) — confirm against the bridge.
+void ArrayWriter::SetNull(size_t idx) { CHECK_AW("ArrayWriter"); inner_->aw_set_null(idx); }
+void ArrayWriter::SetBool(size_t idx, bool v) { CHECK_AW("ArrayWriter"); inner_->aw_set_bool(idx, v); }
+void ArrayWriter::SetInt32(size_t idx, int32_t v) { CHECK_AW("ArrayWriter"); inner_->aw_set_i32(idx, v); }
+void ArrayWriter::SetInt64(size_t idx, int64_t v) { CHECK_AW("ArrayWriter"); inner_->aw_set_i64(idx, v); }
+void ArrayWriter::SetFloat32(size_t idx, float v) { CHECK_AW("ArrayWriter"); inner_->aw_set_f32(idx, v); }
+void ArrayWriter::SetFloat64(size_t idx, double v) { CHECK_AW("ArrayWriter"); inner_->aw_set_f64(idx, v); }
+
+void ArrayWriter::SetString(size_t idx, const std::string& v) {
+    CHECK_AW("ArrayWriter");
+    inner_->aw_set_str(idx, v);
+}
+
+void ArrayWriter::SetBytes(size_t idx, const std::vector<uint8_t>& v) {
+    CHECK_AW("ArrayWriter");
+    // Pass a slice over the caller's buffer for the duration of the call.
+    inner_->aw_set_bytes(idx, rust::Slice<const uint8_t>(v.data(), v.size()));
+}
+
+void ArrayWriter::SetDate(size_t idx, fluss::Date d) {
+    CHECK_AW("ArrayWriter");
+    inner_->aw_set_date(idx, d.days_since_epoch);
+}
+
+void ArrayWriter::SetTime(size_t idx, fluss::Time t) {
+    CHECK_AW("ArrayWriter");
+    inner_->aw_set_time(idx, t.millis_since_midnight);
+}
+
+// Timestamps cross the boundary as epoch milliseconds plus nanoseconds within
+// the millisecond.
+void ArrayWriter::SetTimestampNtz(size_t idx, fluss::Timestamp ts) {
+    CHECK_AW("ArrayWriter");
+    inner_->aw_set_ts_ntz(idx, ts.epoch_millis, ts.nano_of_millisecond);
+}
+
+void ArrayWriter::SetTimestampLtz(size_t idx, fluss::Timestamp ts) {
+    CHECK_AW("ArrayWriter");
+    inner_->aw_set_ts_ltz(idx, ts.epoch_millis, ts.nano_of_millisecond);
+}
+
+// `value` is the decimal in textual form; the Rust side parses it.
+void ArrayWriter::SetDecimal(size_t idx, const std::string& value) {
+    CHECK_AW("ArrayWriter");
+    inner_->aw_set_decimal_str(idx, value);
+}
+
+// Stores the array built by `nested` at element `idx` and consumes `nested`.
+// Throws std::logic_error if `nested` is unavailable. `nested` is destroyed only
+// after the Rust call returns; if that call throws, `nested` still owns its
+// Rust object.
+void ArrayWriter::SetArray(size_t idx, ArrayWriter&& nested) {
+    CHECK_AW("ArrayWriter");
+    if (!nested.inner_) {
+        throw std::logic_error("ArrayWriter::SetArray: nested writer not available");
+    }
+    inner_->aw_set_array(idx, *nested.inner_);
+    nested.Destroy();
+}
+
+// ============================================================================
+// ArrayView — read-only recursive view into an array column value
+// ============================================================================
+
+// Releases the underlying Rust view, if any.
+ArrayView::~ArrayView() noexcept { Destroy(); }
+
+// Frees the Rust object and nulls the pointer. Safe to call repeatedly.
+void ArrayView::Destroy() noexcept {
+    if (inner_) {
+        // The temporary rust::Box is destroyed at the end of the statement,
+        // which drops the Rust object.
+        rust::Box<ffi::ArrayViewInner>::from_raw(inner_);
+        inner_ = nullptr;
+    }
+}
+
+// Move construction transfers the Rust object; `other` is left with a null pointer.
+ArrayView::ArrayView(ArrayView&& other) noexcept : inner_(other.inner_) { other.inner_ = nullptr; }
+
+// Move assignment releases the current Rust object first, then takes over
+// `other`'s. Self-assignment is a no-op.
+ArrayView& ArrayView::operator=(ArrayView&& other) noexcept {
+    if (this != &other) {
+        Destroy();
+        inner_ = other.inner_;
+        other.inner_ = nullptr;
+    }
+    return *this;
+}
+
+// Guard used by the ArrayView getters: throws std::logic_error when `inner_` is
+// null. Undefined again at the end of this section.
+// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
+#define CHECK_AV()                                                                      \
+    do {                                                                                \
+        if (!inner_) throw std::logic_error("ArrayView: not available (moved-from)");  \
+    } while (0)
+
+// Number of elements in the viewed array.
+// Precondition: the view has not been moved from. This is only asserted, so in
+// builds with assertions disabled a call on a moved-from view dereferences a
+// null pointer.
+size_t ArrayView::Size() const noexcept {
+    assert(inner_ && "ArrayView::Size called on moved-from instance");
+    return inner_->av_size();
+}
+
+// Type id of the array's elements. Same precondition as Size().
+TypeId ArrayView::ElementType() const noexcept {
+    assert(inner_ && "ArrayView::ElementType called on moved-from instance");
+    return static_cast<TypeId>(inner_->av_element_type_id());
+}
+
+// Element getters. Each one checks that the view is usable and then forwards
+// `element` to the matching Rust `av_*` method.
+// NOTE(review): failures reported by the Rust side presumably surface as
+// rust::Error exceptions (cxx's mapping of Result) — confirm against the bridge.
+bool ArrayView::IsNull(size_t element) const {
+    CHECK_AV();
+    return inner_->av_is_null(element);
+}
+
+bool ArrayView::GetBool(size_t element) const {
+    CHECK_AV();
+    return inner_->av_get_bool(element);
+}
+
+int32_t ArrayView::GetInt32(size_t element) const {
+    CHECK_AV();
+    return inner_->av_get_i32(element);
+}
+
+int64_t ArrayView::GetInt64(size_t element) const {
+    CHECK_AV();
+    return inner_->av_get_i64(element);
+}
+
+float ArrayView::GetFloat32(size_t element) const {
+    CHECK_AV();
+    return inner_->av_get_f32(element);
+}
+
+double ArrayView::GetFloat64(size_t element) const {
+    CHECK_AV();
+    return inner_->av_get_f64(element);
+}
+
+// Returns a std::string copy of the value returned by Rust.
+std::string ArrayView::GetString(size_t element) const {
+    CHECK_AV();
+    return std::string(inner_->av_get_str(element));
+}
+
+// Returns a std::vector copy of the bytes returned by Rust.
+std::vector<uint8_t> ArrayView::GetBytes(size_t element) const {
+    CHECK_AV();
+    auto rv = inner_->av_get_bytes(element);
+    return {rv.data(), rv.data() + rv.size()};
+}
+
+fluss::Date ArrayView::GetDate(size_t element) const {
+    CHECK_AV();
+    return fluss::Date{inner_->av_get_date_days(element)};
+}
+
+fluss::Time ArrayView::GetTime(size_t element) const {
+    CHECK_AV();
+    return fluss::Time{inner_->av_get_time_millis(element)};
+}
+
+// A timestamp is assembled from two reads of the same element: the millisecond
+// part and the nanosecond part. Both timestamp getters use the same two calls.
+fluss::Timestamp ArrayView::GetTimestampNtz(size_t element) const {
+    CHECK_AV();
+    return fluss::Timestamp{inner_->av_get_ts_millis(element),
+                            inner_->av_get_ts_nanos(element)};
+}
+
+fluss::Timestamp ArrayView::GetTimestampLtz(size_t element) const {
+    CHECK_AV();
+    return fluss::Timestamp{inner_->av_get_ts_millis(element),
+                            inner_->av_get_ts_nanos(element)};
+}
+
+// Returns the decimal element in textual form.
+std::string ArrayView::GetDecimalString(size_t element) const {
+    CHECK_AV();
+    return std::string(inner_->av_get_decimal_str(element));
+}
+
+// Returns a new view over the nested array stored at `element`. The returned
+// view owns its own Rust object.
+ArrayView ArrayView::GetArray(size_t element) const {
+    CHECK_AV();
+    auto box = inner_->av_get_nested(element);
+    return ArrayView(box.into_raw());
+}
+
+#undef CHECK_AV
+
+// ============================================================================
+// GenericRow — write-only row backed by opaque Rust GenericRowInner
+// ============================================================================
+
+GenericRow::GenericRow() {
+    auto box = ffi::new_generic_row(0);
+    inner_ = box.into_raw();
+}
+
+GenericRow::GenericRow(size_t field_count) {
+    auto box = ffi::new_generic_row(field_count);
+    inner_ = box.into_raw();
+}
+
+GenericRow::~GenericRow() noexcept { Destroy(); }
+
+void GenericRow::Destroy() noexcept {
+    if (inner_) {
+        rust::Box<ffi::GenericRowInner>::from_raw(inner_);
+        inner_ = nullptr;
+    }
+    column_map_.reset();
+}
+
+GenericRow::GenericRow(GenericRow&& other) noexcept
+    : inner_(other.inner_), column_map_(std::move(other.column_map_)) {
+    other.inner_ = nullptr;
+}
+
+GenericRow& GenericRow::operator=(GenericRow&& other) noexcept {
+    if (this != &other) {
+        Destroy();
+        inner_ = other.inner_;
+        column_map_ = std::move(other.column_map_);
+        other.inner_ = nullptr;
+    }
+    return *this;
+}
+
+bool GenericRow::Available() const { return inner_ != nullptr; }
+
+void GenericRow::Reset() {
+    CHECK_INNER("GenericRow");
+    inner_->gr_reset();
+}
+
+void GenericRow::SetNull(size_t idx) {
+    CHECK_INNER("GenericRow");
+    inner_->gr_set_null(idx);
+}
+void GenericRow::SetBool(size_t idx, bool v) {
+    CHECK_INNER("GenericRow");
+    inner_->gr_set_bool(idx, v);
+}
+void GenericRow::SetInt32(size_t idx, int32_t v) {
+    CHECK_INNER("GenericRow");
+    inner_->gr_set_i32(idx, v);
+}
+void GenericRow::SetInt64(size_t idx, int64_t v) {
+    CHECK_INNER("GenericRow");
+    inner_->gr_set_i64(idx, v);
+}
+void GenericRow::SetFloat32(size_t idx, float v) {
+    CHECK_INNER("GenericRow");
+    inner_->gr_set_f32(idx, v);
+}
+void GenericRow::SetFloat64(size_t idx, double v) {
+    CHECK_INNER("GenericRow");
+    inner_->gr_set_f64(idx, v);
+}
+
+// Stores a string in field `idx` by passing `v` to gr_set_str.
+// NOTE(review): `v` is a by-value parameter that dies when this function
+// returns, so the Rust side presumably copies it -- confirm.
+void GenericRow::SetString(size_t idx, std::string v) {
+    CHECK_INNER("GenericRow");
+    inner_->gr_set_str(idx, v);
+}
+
+// Stores a byte sequence in field `idx`. The bytes are handed to Rust as a
+// non-owning rust::Slice over `v`'s buffer.
+// NOTE(review): the slice only lives for the duration of the call, so the
+// Rust side presumably copies the bytes -- confirm.
+void GenericRow::SetBytes(size_t idx, std::vector<uint8_t> v) {
+    CHECK_INNER("GenericRow");
+    inner_->gr_set_bytes(idx, rust::Slice<const uint8_t>(v.data(), v.size()));
+}
+
+// Stores a date in field `idx`, passing d.days_since_epoch to the FFI.
+void GenericRow::SetDate(size_t idx, fluss::Date d) {
+    CHECK_INNER("GenericRow");
+    inner_->gr_set_date(idx, d.days_since_epoch);
+}
+
+// Stores a time of day in field `idx`, passing t.millis_since_midnight.
+void GenericRow::SetTime(size_t idx, fluss::Time t) {
+    CHECK_INNER("GenericRow");
+    inner_->gr_set_time(idx, t.millis_since_midnight);
+}
+
+// Stores a timestamp through gr_set_ts_ntz, split into epoch_millis and
+// nano_of_millisecond.
+void GenericRow::SetTimestampNtz(size_t idx, fluss::Timestamp ts) {
+    CHECK_INNER("GenericRow");
+    inner_->gr_set_ts_ntz(idx, ts.epoch_millis, ts.nano_of_millisecond);
+}
+
+// Same decomposition as SetTimestampNtz, but routed through gr_set_ts_ltz.
+void GenericRow::SetTimestampLtz(size_t idx, fluss::Timestamp ts) {
+    CHECK_INNER("GenericRow");
+    inner_->gr_set_ts_ltz(idx, ts.epoch_millis, ts.nano_of_millisecond);
+}
+
+// Stores a decimal supplied in string form; the string is forwarded as-is to
+// gr_set_decimal_str (any parsing happens outside this function).
+void GenericRow::SetDecimal(size_t idx, const std::string& value) {
+    CHECK_INNER("GenericRow");
+    inner_->gr_set_decimal_str(idx, value);
+}
+
+// Stores the array built by `writer` in field `idx`.
+// Throws std::logic_error if the writer has no inner object. After the FFI
+// call the writer is destroyed here, so the caller's ArrayWriter is left
+// without an inner object on return.
+void GenericRow::SetArray(size_t idx, ArrayWriter&& writer) {
+    CHECK_INNER("GenericRow");
+    if (!writer.inner_) {
+        throw std::logic_error("GenericRow::SetArray: ArrayWriter not available");
+    }
+    inner_->gr_set_array(idx, *writer.inner_);
+    writer.Destroy();
+}
+
+// ============================================================================
+// ScanData — destructor must live in .cpp where rust::Box is visible
+// ============================================================================
+
+// Frees the Rust ScanResultInner held in `raw`, if any, by re-wrapping the
+// pointer in a temporary rust::Box whose destructor releases it.
+detail::ScanData::~ScanData() {
+    if (raw) {
+        rust::Box<ffi::ScanResultInner>::from_raw(raw);
+    }
+}
+
+// ============================================================================
+// RowView — zero-copy read-only row view for scan results
+// ============================================================================
+
+// Guard used by the RowView accessors below: throws std::logic_error with the
+// message "<name>: not available (moved-from or null)" when data_ is null.
+// `name` must be a string literal because it is concatenated at compile time.
+// The do/while(0) wrapper makes the macro behave like a single statement.
+// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
+#define CHECK_DATA(name)                                                                 \
+    do {                                                                                 \
+        if (!data_) throw std::logic_error(name ": not available (moved-from or null)"); \
+    } while (0)
+
+// Number of fields reported by the scan result, or 0 when this view has no
+// backing data (does not throw).
+size_t RowView::FieldCount() const { return data_ ? data_->raw->sv_field_count() : 0; }
+
+// Type of column `idx`, converted from the FFI value with a static_cast.
+// Throws std::logic_error when the view has no backing data.
+TypeId RowView::GetType(size_t idx) const {
+    CHECK_DATA("RowView");
+    return static_cast<TypeId>(data_->raw->sv_column_type(idx));
+}
+
+// Primitive field getters. Each one throws std::logic_error (via CHECK_DATA)
+// when the view has no backing data, and otherwise forwards this view's
+// (bucket_idx_, rec_idx_) position plus the field index to the matching
+// sv_* FFI accessor. No index or type validation happens on the C++ side.
+
+// Whether field `idx` of this record is null.
+bool RowView::IsNull(size_t idx) const {
+    CHECK_DATA("RowView");
+    return data_->raw->sv_is_null(bucket_idx_, rec_idx_, idx);
+}
+// Field `idx` read as bool.
+bool RowView::GetBool(size_t idx) const {
+    CHECK_DATA("RowView");
+    return data_->raw->sv_get_bool(bucket_idx_, rec_idx_, idx);
+}
+// Field `idx` read as a 32-bit signed integer.
+int32_t RowView::GetInt32(size_t idx) const {
+    CHECK_DATA("RowView");
+    return data_->raw->sv_get_i32(bucket_idx_, rec_idx_, idx);
+}
+// Field `idx` read as a 64-bit signed integer.
+int64_t RowView::GetInt64(size_t idx) const {
+    CHECK_DATA("RowView");
+    return data_->raw->sv_get_i64(bucket_idx_, rec_idx_, idx);
+}
+// Field `idx` read as a single-precision float.
+float RowView::GetFloat32(size_t idx) const {
+    CHECK_DATA("RowView");
+    return data_->raw->sv_get_f32(bucket_idx_, rec_idx_, idx);
+}
+// Field `idx` read as a double-precision float.
+double RowView::GetFloat64(size_t idx) const {
+    CHECK_DATA("RowView");
+    return data_->raw->sv_get_f64(bucket_idx_, rec_idx_, idx);
+}
+
+// Returns field `idx` as a non-owning string_view built from the pointer and
+// length that sv_get_str reports; nothing is copied here.
+// NOTE(review): the view presumably stays valid only while the scan data
+// referenced by data_ is alive -- confirm the return type of sv_get_str.
+std::string_view RowView::GetString(size_t idx) const {
+    CHECK_DATA("RowView");
+    auto s = data_->raw->sv_get_str(bucket_idx_, rec_idx_, idx);
+    return std::string_view(s.data(), s.size());
+}
+
+// Returns field `idx` as a (pointer, length) pair taken from sv_get_bytes;
+// nothing is copied here.
+// NOTE(review): same lifetime caveat as GetString -- confirm.
+std::pair<const uint8_t*, size_t> RowView::GetBytes(size_t idx) const {
+    CHECK_DATA("RowView");
+    auto bytes = data_->raw->sv_get_bytes(bucket_idx_, rec_idx_, idx);
+    return {bytes.data(), bytes.size()};
+}
+
+// Field `idx` as a Date built from the value of sv_get_date_days.
+Date RowView::GetDate(size_t idx) const {
+    CHECK_DATA("RowView");
+    return Date{data_->raw->sv_get_date_days(bucket_idx_, rec_idx_, idx)};
+}
+
+// Field `idx` as a Time built from the value of sv_get_time_millis.
+Time RowView::GetTime(size_t idx) const {
+    CHECK_DATA("RowView");
+    return Time{data_->raw->sv_get_time_millis(bucket_idx_, rec_idx_, idx)};
+}
+
+// Field `idx` as a Timestamp. The two components are fetched with separate
+// FFI calls (sv_get_ts_millis, then sv_get_ts_nanos).
+Timestamp RowView::GetTimestamp(size_t idx) const {
+    CHECK_DATA("RowView");
+    return Timestamp{data_->raw->sv_get_ts_millis(bucket_idx_, rec_idx_, idx),
+                     data_->raw->sv_get_ts_nanos(bucket_idx_, rec_idx_, idx)};
+}
+
+// True when column `idx` has type TypeId::Decimal. Delegates to GetType, so
+// it throws std::logic_error when the view has no backing data.
+bool RowView::IsDecimal(size_t idx) const { return GetType(idx) == TypeId::Decimal; }
+
+// Field `idx` rendered as a decimal string; the FFI result is copied into an
+// owning std::string.
+std::string RowView::GetDecimalString(size_t idx) const {
+    CHECK_DATA("RowView");
+    return std::string(data_->raw->sv_get_decimal_str(bucket_idx_, rec_idx_, idx));
+}
+
+// Element count of the array stored in field `idx` of this record.
+size_t RowView::GetArraySize(size_t idx) const {
+    CHECK_DATA("RowView");
+    return data_->raw->sv_get_array_size(bucket_idx_, rec_idx_, idx);
+}
+
+// Element type of array column `idx`. Only the column index is passed to the
+// FFI, not the record position.
+TypeId RowView::GetArrayElementType(size_t idx) const {
+    CHECK_DATA("RowView");
+    return static_cast<TypeId>(data_->raw->sv_get_array_element_type(idx));
+}
+
+// Whether element `element` of the array in field `idx` is null.
+bool RowView::IsArrayElementNull(size_t idx, size_t element) const {
+    CHECK_DATA("RowView");
+    return data_->raw->sv_get_array_is_null(bucket_idx_, rec_idx_, idx, element);
+}
+
+// Per-element primitive getters for array fields. Each throws std::logic_error
+// (via CHECK_DATA) when the view has no backing data, then forwards the record
+// position, the field index and the element index to the sv_get_array_* FFI
+// accessor. Bounds and type checks are not performed on the C++ side.
+
+// Element `element` of array field `idx`, read as bool.
+bool RowView::GetArrayBool(size_t idx, size_t element) const {
+    CHECK_DATA("RowView");
+    return data_->raw->sv_get_array_bool(bucket_idx_, rec_idx_, idx, element);
+}
+
+// Element `element` of array field `idx`, read as a 32-bit signed integer.
+int32_t RowView::GetArrayInt32(size_t idx, size_t element) const {
+    CHECK_DATA("RowView");
+    return data_->raw->sv_get_array_i32(bucket_idx_, rec_idx_, idx, element);
+}
+
+// Element `element` of array field `idx`, read as a 64-bit signed integer.
+int64_t RowView::GetArrayInt64(size_t idx, size_t element) const {
+    CHECK_DATA("RowView");
+    return data_->raw->sv_get_array_i64(bucket_idx_, rec_idx_, idx, element);
+}
+
+// Element `element` of array field `idx`, read as a single-precision float.
+float RowView::GetArrayFloat32(size_t idx, size_t element) const {
+    CHECK_DATA("RowView");
+    return data_->raw->sv_get_array_f32(bucket_idx_, rec_idx_, idx, element);
+}
+
+// Element `element` of array field `idx`, read as a double-precision float.
+double RowView::GetArrayFloat64(size_t idx, size_t element) const {
+    CHECK_DATA("RowView");
+    return data_->raw->sv_get_array_f64(bucket_idx_, rec_idx_, idx, element);
+}
+
+// Element `element` of array field `idx` as an owning std::string copied from
+// the FFI result.
+std::string RowView::GetArrayString(size_t idx, size_t element) const {
+    CHECK_DATA("RowView");
+    return std::string(data_->raw->sv_get_array_str(bucket_idx_, rec_idx_, idx, element));
+}
+
+// Element `element` of array field `idx` as an owning byte vector, built by
+// copying the [data, data + size) range that sv_get_array_bytes reports.
+std::vector<uint8_t> RowView::GetArrayBytes(size_t idx, size_t element) const {
+    CHECK_DATA("RowView");
+    auto rv = data_->raw->sv_get_array_bytes(bucket_idx_, rec_idx_, idx, element);
+    return {rv.data(), rv.data() + rv.size()};
+}
+
+// Element `element` of array field `idx` as a Date (from
+// sv_get_array_date_days).
+fluss::Date RowView::GetArrayDate(size_t idx, size_t element) const {
+    CHECK_DATA("RowView");
+    return fluss::Date{data_->raw->sv_get_array_date_days(bucket_idx_, rec_idx_, idx, element)};
+}
+
+// Element `element` of array field `idx` as a Time (from
+// sv_get_array_time_millis).
+fluss::Time RowView::GetArrayTime(size_t idx, size_t element) const {
+    CHECK_DATA("RowView");
+    return fluss::Time{data_->raw->sv_get_array_time_millis(bucket_idx_, rec_idx_, idx, element)};
+}
+
+// Element `element` of array field `idx` as a Timestamp; the millisecond and
+// nanosecond parts come from two separate FFI calls.
+fluss::Timestamp RowView::GetArrayTimestamp(size_t idx, size_t element) const {
+    CHECK_DATA("RowView");
+    auto millis = data_->raw->sv_get_array_ts_millis(bucket_idx_, rec_idx_, idx, element);
+    auto nanos = data_->raw->sv_get_array_ts_nanos(bucket_idx_, rec_idx_, idx, element);
+    return fluss::Timestamp{millis, nanos};
+}
+
+// Element `element` of array field `idx` as a decimal string copied into an
+// owning std::string.
+std::string RowView::GetArrayDecimalString(size_t idx, size_t element) const {
+    CHECK_DATA("RowView");
+    return std::string(data_->raw->sv_get_array_decimal_str(bucket_idx_, rec_idx_, idx, element));
+}
+
+// Returns an ArrayView for the array in field `idx`. The box produced by
+// sv_get_array_view is unwrapped with into_raw() and the raw pointer is
+// passed to the ArrayView constructor.
+// NOTE(review): ArrayView presumably takes ownership of that pointer --
+// confirm against its definition, which is outside this excerpt.
+ArrayView RowView::GetArrayView(size_t idx) const {
+    CHECK_DATA("RowView");
+    auto box = data_->raw->sv_get_array_view(bucket_idx_, rec_idx_, idx);
+    return ArrayView(box.into_raw());
+}
+
+// ============================================================================
+// ScanRecords — backed by opaque Rust ScanResultInner
+// ============================================================================
+
+// ScanRecords constructor, destructor, move operations are all defaulted in the header.
+
+// Record count reported by sv_record_count, or 0 when there is no backing
+// data (does not throw).
+size_t ScanRecords::Count() const { return data_ ? data_->raw->sv_record_count() : 0; }
+
+// True when Count() is zero, which includes the no-backing-data case.
+bool ScanRecords::IsEmpty() const { return Count() == 0; }
+
+// Builds the ScanRecord at position (`bucket`, `rec_idx`): offset, timestamp
+// and change type are read through the FFI, and the row is exposed as a
+// RowView constructed from data_ and the same position.
+// Throws std::logic_error when there is no backing data. The indices are not
+// range-checked here.
+ScanRecord ScanRecords::RecordAt(size_t bucket, size_t rec_idx) const {
+    if (!data_) {
+        throw std::logic_error("ScanRecords: not available (moved-from or null)");
+    }
+    return ScanRecord{data_->raw->sv_offset(bucket, rec_idx),
+                      data_->raw->sv_timestamp(bucket, rec_idx),
+                      static_cast<ChangeType>(data_->raw->sv_change_type(bucket, rec_idx)),
+                      RowView(data_, bucket, rec_idx)};
+}
+
+// Converts an FFI bucket descriptor into the public TableBucket. The partition
+// id is carried over only when the descriptor's has_partition_id flag is set;
+// otherwise the optional stays empty.
+static TableBucket to_table_bucket(const ffi::FfiBucketInfo& g) {
+    if (g.has_partition_id) {
+        return TableBucket{g.table_id, g.bucket_id, std::optional<int64_t>(g.partition_id)};
+    }
+    return TableBucket{g.table_id, g.bucket_id, std::nullopt};
+}
+
+// Number of bucket descriptors in the scan result, or 0 without backing data.
+size_t ScanRecords::BucketCount() const { return data_ ? data_->raw->sv_bucket_infos().size() : 0; }
+
+// Materialises the record at the iterator's current (bucket, record) position
+// by delegating to ScanRecords::RecordAt.
+ScanRecord ScanRecords::Iterator::operator*() const {
+    return owner_->RecordAt(bucket_idx_, rec_idx_);
+}
+
+// Iterator positioned at bucket 0, record 0.
+// NOTE(review): unlike operator++, this does not skip a leading bucket whose
+// record_count is 0 -- confirm the Rust side never reports empty buckets.
+ScanRecords::Iterator ScanRecords::begin() const { return Iterator(this, 0, 0); }
+
+// Pre-increment: advances to the next record. When the record index runs past
+// the current bucket's record_count, the iterator moves to record 0 of the
+// next bucket; the loop repeats so buckets with no records are skipped. The
+// loop stops once bucket_idx_ reaches the number of buckets.
+// Without backing data only rec_idx_ is incremented.
+ScanRecords::Iterator& ScanRecords::Iterator::operator++() {
+    ++rec_idx_;
+    if (owner_->data_) {
+        const auto& infos = owner_->data_->raw->sv_bucket_infos();
+        while (bucket_idx_ < infos.size() && rec_idx_ >= infos[bucket_idx_].record_count) {
+            rec_idx_ = 0;
+            ++bucket_idx_;
+        }
+    }
+    return *this;
+}
+
+// Lists every bucket present in the scan result, in the order reported by
+// sv_bucket_infos. Returns an empty vector when there is no backing data.
+std::vector<TableBucket> ScanRecords::Buckets() const {
+    std::vector<TableBucket> buckets;
+    if (data_) {
+        const auto& infos = data_->raw->sv_bucket_infos();
+        buckets.reserve(infos.size());
+        for (size_t pos = 0; pos < infos.size(); ++pos) {
+            buckets.push_back(to_table_bucket(infos[pos]));
+        }
+    }
+    return buckets;
+}
+
+// Looks up the records belonging to `bucket`. The bucket descriptors are
+// searched linearly and the first match wins. When the bucket is not present,
+// or there is no backing data, an empty BucketRecords (no data, count 0) is
+// returned for the requested bucket.
+BucketRecords ScanRecords::Records(const TableBucket& bucket) const {
+    if (data_) {
+        const auto& infos = data_->raw->sv_bucket_infos();
+        size_t pos = 0;
+        for (const auto& info : infos) {
+            TableBucket candidate = to_table_bucket(info);
+            if (candidate == bucket) {
+                return BucketRecords(data_, std::move(candidate), pos, info.record_count);
+            }
+            ++pos;
+        }
+    }
+    return BucketRecords({}, bucket, 0, 0);
+}
+
+// Returns the records of the bucket at position `idx` in the bucket list.
+// Throws std::logic_error when there is no backing data and std::out_of_range
+// when `idx` is not smaller than the number of buckets.
+BucketRecords ScanRecords::BucketAt(size_t idx) const {
+    if (!data_) {
+        throw std::logic_error("ScanRecords: not available (moved-from or null)");
+    }
+    const auto& infos = data_->raw->sv_bucket_infos();
+    if (idx >= infos.size()) {
+        throw std::out_of_range("ScanRecords::BucketAt: index " + std::to_string(idx) +
+                                " out of range (" + std::to_string(infos.size()) + " buckets)");
+    }
+    return BucketRecords(data_, to_table_bucket(infos[idx]), idx, infos[idx].record_count);
+}
+
+// Returns record `idx` of this bucket. Throws std::out_of_range when `idx` is
+// not smaller than count_. data_ is dereferenced without a null check; the
+// empty BucketRecords built in ScanRecords::Records have count_ == 0, so the
+// range check above already rejects every index for them.
+ScanRecord BucketRecords::operator[](size_t idx) const {
+    if (idx >= count_) {
+        throw std::out_of_range("BucketRecords: index " + std::to_string(idx) + " out of range (" +
+                                std::to_string(count_) + " records)");
+    }
+    return ScanRecord{data_->raw->sv_offset(bucket_idx_, idx),
+                      data_->raw->sv_timestamp(bucket_idx_, idx),
+                      static_cast<ChangeType>(data_->raw->sv_change_type(bucket_idx_, idx)),
+                      RowView(data_, bucket_idx_, idx)};
+}
+
+// Dereference goes through the bounds-checked operator[] of the owning
+// BucketRecords.
+ScanRecord BucketRecords::Iterator::operator*() const { return owner_->operator[](idx_); }
+
+// ============================================================================
+// LookupResult — backed by opaque Rust LookupResultInner
+// ============================================================================
+
+// Default-constructed result; members take their in-class defaults.
+LookupResult::LookupResult() noexcept = default;
+
+// Releases the owned Rust lookup result, if any.
+LookupResult::~LookupResult() noexcept { Destroy(); }
+
+// Frees the owned LookupResultInner via a temporary rust::Box and drops the
+// cached column map. Does nothing when inner_ is already null, so repeated
+// calls are safe.
+void LookupResult::Destroy() noexcept {
+    if (inner_) {
+        rust::Box<ffi::LookupResultInner>::from_raw(inner_);
+        inner_ = nullptr;
+        column_map_.reset();
+    }
+}
+
+// Move constructor: takes over the raw pointer and column map and nulls
+// other.inner_ so only one object frees the Rust result.
+LookupResult::LookupResult(LookupResult&& other) noexcept
+    : inner_(other.inner_), column_map_(std::move(other.column_map_)) {
+    other.inner_ = nullptr;
+}
+
+// Move assignment: releases the current result first, then takes over
+// `other`'s state. Self-assignment is a no-op.
+LookupResult& LookupResult::operator=(LookupResult&& other) noexcept {
+    if (this != &other) {
+        Destroy();
+        inner_ = other.inner_;
+        column_map_ = std::move(other.column_map_);
+        other.inner_ = nullptr;
+    }
+    return *this;
+}
+
+// Rebuilds the name -> (index, type) lookup table from the column metadata
+// exposed by the Rust result and stores it in column_map_. Does nothing when
+// there is no inner result. The map is filled completely before it replaces
+// the previous one.
+void LookupResult::BuildColumnMap() const {
+    if (!inner_) return;
+    auto columns = std::make_shared<detail::ColumnMap>();
+    const auto field_total = inner_->lv_field_count();
+    for (size_t col = 0; col < field_total; ++col) {
+        auto raw_name = inner_->lv_column_name(col);
+        auto type_id = static_cast<TypeId>(inner_->lv_column_type(col));
+        std::string key(raw_name.data(), raw_name.size());
+        (*columns)[std::move(key)] = {col, type_id};
+    }
+    column_map_ = std::move(columns);
+}
+
+// True only when a Rust result is held and lv_found() reports true.
+bool LookupResult::Found() const { return inner_ && inner_->lv_found(); }
+
+// Field count reported by the Rust result, or 0 when none is held.
+size_t LookupResult::FieldCount() const { return inner_ ? inner_->lv_field_count() : 0; }
+
+// Type of column `idx`, converted from the FFI value with a static_cast.
+// CHECK_INNER is defined outside this excerpt; presumably it rejects a null
+// inner_ -- confirm against its definition.
+TypeId LookupResult::GetType(size_t idx) const {
+    CHECK_INNER("LookupResult");
+    return static_cast<TypeId>(inner_->lv_column_type(idx));
+}
+
+// Primitive field getters. Each one runs CHECK_INNER (defined outside this
+// excerpt; presumably guards against a null inner_ -- confirm) and forwards
+// the field index to the matching lv_* FFI accessor. No index or type
+// validation happens on the C++ side.
+
+// Whether field `idx` is null.
+bool LookupResult::IsNull(size_t idx) const {
+    CHECK_INNER("LookupResult");
+    return inner_->lv_is_null(idx);
+}
+// Field `idx` read as bool.
+bool LookupResult::GetBool(size_t idx) const {
+    CHECK_INNER("LookupResult");
+    return inner_->lv_get_bool(idx);
+}
+// Field `idx` read as a 32-bit signed integer.
+int32_t LookupResult::GetInt32(size_t idx) const {
+    CHECK_INNER("LookupResult");
+    return inner_->lv_get_i32(idx);
+}
+// Field `idx` read as a 64-bit signed integer.
+int64_t LookupResult::GetInt64(size_t idx) const {
+    CHECK_INNER("LookupResult");
+    return inner_->lv_get_i64(idx);
+}
+// Field `idx` read as a single-precision float.
+float LookupResult::GetFloat32(size_t idx) const {
+    CHECK_INNER("LookupResult");
+    return inner_->lv_get_f32(idx);
+}
+// Field `idx` read as a double-precision float.
+double LookupResult::GetFloat64(size_t idx) const {
+    CHECK_INNER("LookupResult");
+    return inner_->lv_get_f64(idx);
+}
+
+// Returns field `idx` as a non-owning string_view built from the pointer and
+// length that lv_get_str reports; nothing is copied here.
+// NOTE(review): the view presumably stays valid only while this LookupResult
+// owns inner_ -- confirm the return type of lv_get_str.
+std::string_view LookupResult::GetString(size_t idx) const {
+    CHECK_INNER("LookupResult");
+    auto s = inner_->lv_get_str(idx);
+    return std::string_view(s.data(), s.size());
+}
+
+// Returns field `idx` as a (pointer, length) pair taken from lv_get_bytes;
+// nothing is copied here.
+// NOTE(review): same lifetime caveat as GetString -- confirm.
+std::pair<const uint8_t*, size_t> LookupResult::GetBytes(size_t idx) const {
+    CHECK_INNER("LookupResult");
+    auto bytes = inner_->lv_get_bytes(idx);
+    return {bytes.data(), bytes.size()};
+}
+
+// Field `idx` as a Date built from the value of lv_get_date_days.
+Date LookupResult::GetDate(size_t idx) const {
+    CHECK_INNER("LookupResult");
+    return Date{inner_->lv_get_date_days(idx)};
+}
+
+// Field `idx` as a Time built from the value of lv_get_time_millis.
+Time LookupResult::GetTime(size_t idx) const {
+    CHECK_INNER("LookupResult");
+    return Time{inner_->lv_get_time_millis(idx)};
+}
+
+// Field `idx` as a Timestamp; the millisecond and nanosecond parts come from
+// two separate FFI calls.
+Timestamp LookupResult::GetTimestamp(size_t idx) const {
+    CHECK_INNER("LookupResult");
+    return Timestamp{inner_->lv_get_ts_millis(idx), inner_->lv_get_ts_nanos(idx)};
+}
+
+// True when column `idx` has type TypeId::Decimal. Delegates to GetType and
+// therefore shares its CHECK_INNER guard.
+bool LookupResult::IsDecimal(size_t idx) const { return GetType(idx) == TypeId::Decimal; }
+
+// Field `idx` rendered as a decimal string, copied into an owning std::string.
+std::string LookupResult::GetDecimalString(size_t idx) const {
+    CHECK_INNER("LookupResult");
+    return std::string(inner_->lv_get_decimal_str(idx));
+}
+
+// Element count of the array stored in field `idx`.
+size_t LookupResult::GetArraySize(size_t idx) const {
+    CHECK_INNER("LookupResult");
+    return inner_->lv_get_array_size(idx);
+}
+
+// Element type of array column `idx`, converted from the FFI value.
+TypeId LookupResult::GetArrayElementType(size_t idx) const {
+    CHECK_INNER("LookupResult");
+    return static_cast<TypeId>(inner_->lv_get_array_element_type(idx));
+}
+
+// Whether element `element` of the array in field `idx` is null.
+bool LookupResult::IsArrayElementNull(size_t idx, size_t element) const {
+    CHECK_INNER("LookupResult");
+    return inner_->lv_get_array_is_null(idx, element);
+}
+
+// Per-element primitive getters for array fields. Each runs CHECK_INNER
+// (defined outside this excerpt; presumably guards against a null inner_ --
+// confirm) and forwards the field and element indices to the matching
+// lv_get_array_* FFI accessor. No bounds or type checks happen here.
+
+// Element `element` of array field `idx`, read as bool.
+bool LookupResult::GetArrayBool(size_t idx, size_t element) const {
+    CHECK_INNER("LookupResult");
+    return inner_->lv_get_array_bool(idx, element);
+}
+
+// Element `element` of array field `idx`, read as a 32-bit signed integer.
+int32_t LookupResult::GetArrayInt32(size_t idx, size_t element) const {
+    CHECK_INNER("LookupResult");
+    return inner_->lv_get_array_i32(idx, element);
+}
+
+// Element `element` of array field `idx`, read as a 64-bit signed integer.
+int64_t LookupResult::GetArrayInt64(size_t idx, size_t element) const {
+    CHECK_INNER("LookupResult");
+    return inner_->lv_get_array_i64(idx, element);
+}
+
+// Element `element` of array field `idx`, read as a single-precision float.
+float LookupResult::GetArrayFloat32(size_t idx, size_t element) const {
+    CHECK_INNER("LookupResult");
+    return inner_->lv_get_array_f32(idx, element);
+}
+
+// Element `element` of array field `idx`, read as a double-precision float.
+double LookupResult::GetArrayFloat64(size_t idx, size_t element) const {
+    CHECK_INNER("LookupResult");
+    return inner_->lv_get_array_f64(idx, element);
+}
+
+// Element `element` of array field `idx` as an owning std::string copied from
+// the FFI result.
+std::string LookupResult::GetArrayString(size_t idx, size_t element) const {
+    CHECK_INNER("LookupResult");
+    return std::string(inner_->lv_get_array_str(idx, element));
+}
+
+// Element `element` of array field `idx` as an owning byte vector, built by
+// copying the [data, data + size) range that lv_get_array_bytes reports.
+std::vector<uint8_t> LookupResult::GetArrayBytes(size_t idx, size_t element) const {
+    CHECK_INNER("LookupResult");
+    auto rv = inner_->lv_get_array_bytes(idx, element);
+    return {rv.data(), rv.data() + rv.size()};
+}
+
+// Element `element` of array field `idx` as a Date (from
+// lv_get_array_date_days).
+fluss::Date LookupResult::GetArrayDate(size_t idx, size_t element) const {
+    CHECK_INNER("LookupResult");
+    return fluss::Date{inner_->lv_get_array_date_days(idx, element)};
+}
+
+// Element `element` of array field `idx` as a Time (from
+// lv_get_array_time_millis).
+fluss::Time LookupResult::GetArrayTime(size_t idx, size_t element) const {
+    CHECK_INNER("LookupResult");
+    return fluss::Time{inner_->lv_get_array_time_millis(idx, element)};
+}
+
+// Element `element` of array field `idx` as a Timestamp; the millisecond and
+// nanosecond parts come from two separate FFI calls.
+fluss::Timestamp LookupResult::GetArrayTimestamp(size_t idx, size_t element) const {
+    CHECK_INNER("LookupResult");
+    auto millis = inner_->lv_get_array_ts_millis(idx, element);
+    auto nanos = inner_->lv_get_array_ts_nanos(idx, element);
+    return fluss::Timestamp{millis, nanos};
+}
+
+// Element `element` of array field `idx` as a decimal string copied into an
+// owning std::string.
+std::string LookupResult::GetArrayDecimalString(size_t idx, size_t element) const {
+    CHECK_INNER("LookupResult");
+    return std::string(inner_->lv_get_array_decimal_str(idx, element));
+}
+
+// Returns an ArrayView for the array in field `idx`. The box produced by
+// lv_get_array_view is unwrapped with into_raw() and the raw pointer is
+// passed to the ArrayView constructor.
+// NOTE(review): ArrayView presumably takes ownership of that pointer --
+// confirm against its definition, which is outside this excerpt.
+ArrayView LookupResult::GetArrayView(size_t idx) const {
+    CHECK_INNER("LookupResult");
+    auto box = inner_->lv_get_array_view(idx);
+    return ArrayView(box.into_raw());
+}
+
+// ============================================================================
+// Table
+// ============================================================================
+
+// Default-constructed table; members take their in-class defaults.
+Table::Table() noexcept = default;
+
+// Wraps an FFI table pointer. The destructor passes it to ffi::delete_table,
+// so this object is responsible for releasing it.
+Table::Table(ffi::Table* table) noexcept : table_(table) {}
+
+// Releases the wrapped FFI table, if any.
+Table::~Table() noexcept { Destroy(); }
+
+// Deletes the FFI table through ffi::delete_table and nulls the pointer, so
+// repeated calls are harmless. The cached column_map_ is left untouched.
+void Table::Destroy() noexcept {
+    if (table_) {
+        ffi::delete_table(table_);
+        table_ = nullptr;
+    }
+}
+
+// Move constructor: takes over the pointer and cached column map and nulls
+// other.table_ so only one object deletes the FFI table.
+Table::Table(Table&& other) noexcept
+    : table_(other.table_), column_map_(std::move(other.column_map_)) {
+    other.table_ = nullptr;
+}
+
+// Move assignment: releases the current table first, then takes over
+// `other`'s state. Self-assignment is a no-op.
+Table& Table::operator=(Table&& other) noexcept {
+    if (this != &other) {
+        Destroy();
+        table_ = other.table_;
+        column_map_ = std::move(other.column_map_);
+        other.table_ = nullptr;
+    }
+    return *this;
+}
+
+// True while an FFI table pointer is held.
+bool Table::Available() const { return table_ != nullptr; }
+
+// Builder factories. Each builder receives the raw table_ pointer (possibly
+// null); the builders shown in this file store it without deleting it.
+// NOTE(review): a builder therefore presumably must not be used after this
+// Table is destroyed or moved from -- confirm the intended contract.
+TableAppend Table::NewAppend() { return TableAppend(table_); }
+
+TableUpsert Table::NewUpsert() { return TableUpsert(table_); }
+
+TableLookup Table::NewLookup() { return TableLookup(table_); }
+
+TableScan Table::NewScan() { return TableScan(table_); }
+
+// Returns the cached column-name -> (index, type id) map, building it from
+// GetTableInfo() on first use. When the table is not available the cache is
+// left as it is (null unless it was populated earlier) and returned.
+const std::shared_ptr<GenericRow::ColumnMap>& Table::GetColumnMap() const {
+    if (column_map_ || !Available()) {
+        return column_map_;
+    }
+    auto info = GetTableInfo();
+    auto& cols = info.schema.columns;
+    column_map_ = std::make_shared<GenericRow::ColumnMap>();
+    for (size_t pos = 0; pos < cols.size(); ++pos) {
+        (*column_map_)[cols[pos].name] = {pos, cols[pos].data_type.id()};
+    }
+    return column_map_;
+}
+
+// Creates a default-constructed GenericRow and attaches this table's column
+// map to it. The map pointer is null when the table is not available and no
+// map was cached earlier.
+GenericRow Table::NewRow() const {
+    GenericRow row;
+    row.column_map_ = GetColumnMap();
+    return row;
+}
+
+// Fetches the table metadata through the FFI and converts it with
+// utils::from_ffi_table_info. Returns a default-constructed TableInfo when the
+// table is not available.
+TableInfo Table::GetTableInfo() const {
+    if (!Available()) {
+        return TableInfo{};
+    }
+    auto ffi_info = table_->get_table_info_from_table();
+    return utils::from_ffi_table_info(ffi_info);
+}
+
+// Returns the database and table name as owning std::strings copied from the
+// FFI path, or a default-constructed TablePath when the table is not available.
+TablePath Table::GetTablePath() const {
+    if (!Available()) {
+        return TablePath{};
+    }
+    auto ffi_path = table_->get_table_path();
+    return TablePath{std::string(ffi_path.database_name), std::string(ffi_path.table_name)};
+}
+
+// Forwards to has_primary_key() on the FFI table; false when the table is not
+// available.
+bool Table::HasPrimaryKey() const {
+    if (!Available()) {
+        return false;
+    }
+    return table_->has_primary_key();
+}
+
+// ============================================================================
+// TableAppend
+// ============================================================================
+
+// Stores the table pointer; nothing in this excerpt deletes it through the
+// builder.
+TableAppend::TableAppend(ffi::Table* table) noexcept : table_(table) {}
+
+// Creates an append writer for the table and move-assigns it into `out`.
+// Returns a client error when no table is attached; otherwise returns the
+// Result converted from the FFI call. `out` is only modified on success.
+// NOTE(review): utils::ptr_from_ffi is defined elsewhere; presumably it
+// extracts the raw writer pointer from the FFI result -- confirm.
+Result TableAppend::CreateWriter(AppendWriter& out) {
+    if (table_ == nullptr) {
+        return utils::make_client_error("Table not available");
+    }
+
+    auto ffi_result = table_->new_append_writer();
+    auto result = utils::from_ffi_result(ffi_result.result);
+    if (result.Ok()) {
+        out = AppendWriter(utils::ptr_from_ffi<ffi::AppendWriter>(ffi_result));
+    }
+    return result;
+}
+
+// ============================================================================
+// TableUpsert
+// ============================================================================
+
+// Stores the table pointer; nothing in this excerpt deletes it through the
+// builder.
+TableUpsert::TableUpsert(ffi::Table* table) noexcept : table_(table) {}
+
+// Restricts the upsert to the given column indices. Throws
+// std::invalid_argument on an empty list. Any previously configured
+// name-based selection is cleared, so the last call wins.
+TableUpsert& TableUpsert::PartialUpdateByIndex(std::vector<size_t> column_indices) {
+    if (column_indices.empty()) {
+        throw std::invalid_argument("PartialUpdateByIndex requires at least one column");
+    }
+    column_indices_ = std::move(column_indices);
+    column_names_.clear();
+    return *this;
+}
+
+// Restricts the upsert to the given column names. Throws
+// std::invalid_argument on an empty list. Any previously configured
+// index-based selection is cleared, so the last call wins. The names are
+// resolved later, in CreateWriter.
+TableUpsert& TableUpsert::PartialUpdateByName(std::vector<std::string> column_names) {
+    if (column_names.empty()) {
+        throw std::invalid_argument("PartialUpdateByName requires at least one column");
+    }
+    column_names_ = std::move(column_names);
+    column_indices_.clear();
+    return *this;
+}
+
+// Translates column_names_ into schema column indices, preserving the order
+// in which the names were given. Each name is matched against the table
+// schema with a linear scan and the first match wins.
+// Throws std::runtime_error for a name that is not in the schema.
+// Dereferences table_ without a null check; the caller in this file
+// (CreateWriter) checks it first.
+std::vector<size_t> TableUpsert::ResolveNameProjection() const {
+    auto table_info = table_->get_table_info_from_table();
+    const auto& schema_cols = table_info.schema.columns;
+
+    std::vector<size_t> resolved;
+    for (const auto& wanted : column_names_) {
+        size_t pos = 0;
+        while (pos < schema_cols.size() && !(std::string(schema_cols[pos].name) == wanted)) {
+            ++pos;
+        }
+        if (pos == schema_cols.size()) {
+            throw std::runtime_error("Column '" + wanted + "' not found");
+        }
+        resolved.push_back(pos);
+    }
+    return resolved;
+}
+
+// Creates an upsert writer honouring the configured partial-update columns and
+// move-assigns it into `out` on success.
+// A name-based selection takes precedence and is resolved to indices here;
+// otherwise the index-based selection (possibly empty) is used as-is.
+// Returns a client error when no table is attached or when an exception
+// derived from std::exception is thrown inside the try block.
+Result TableUpsert::CreateWriter(UpsertWriter& out) {
+    if (table_ == nullptr) {
+        return utils::make_client_error("Table not available");
+    }
+
+    try {
+        auto resolved_indices = !column_names_.empty() ? ResolveNameProjection() : column_indices_;
+
+        // Copy the indices into a rust::Vec, which is what the FFI call takes.
+        rust::Vec<size_t> rust_indices;
+        for (size_t idx : resolved_indices) {
+            rust_indices.push_back(idx);
+        }
+        auto ffi_result = table_->create_upsert_writer(std::move(rust_indices));
+        auto result = utils::from_ffi_result(ffi_result.result);
+        if (result.Ok()) {
+            out = UpsertWriter(utils::ptr_from_ffi<ffi::UpsertWriter>(ffi_result));
+        }
+        return result;
+    } catch (const std::exception& e) {
+        // ResolveNameProjection() may throw
+        return utils::make_client_error(e.what());
+    }
+}
+
+// ============================================================================
+// TableLookup
+// ============================================================================
+
+// Stores the table pointer; nothing in this excerpt deletes it through the
+// builder.
+TableLookup::TableLookup(ffi::Table* table) noexcept : table_(table) {}
+
+// Creates a lookuper for the table and move-assigns it into `out`.
+// Returns a client error when no table is attached; otherwise returns the
+// Result converted from the FFI call. `out` is only modified on success.
+Result TableLookup::CreateLookuper(Lookuper& out) {
+    if (table_ == nullptr) {
+        return utils::make_client_error("Table not available");
+    }
+
+    auto ffi_result = table_->new_lookuper();
+    auto result = utils::from_ffi_result(ffi_result.result);
+    if (result.Ok()) {
+        out = Lookuper(utils::ptr_from_ffi<ffi::Lookuper>(ffi_result));
+    }
+    return result;
+}
+
+// ============================================================================
+// TableScan
+// ============================================================================
+
+// Stores the table pointer; nothing in this excerpt deletes it through the
+// builder.
+TableScan::TableScan(ffi::Table* table) noexcept : table_(table) {}
+
+// Selects the columns to scan by index. Clears any name-based projection, so
+// the last Project* call wins. Unlike TableUpsert::PartialUpdateByIndex, an
+// empty list is accepted here.
+TableScan& TableScan::ProjectByIndex(std::vector<size_t> column_indices) {
+    projection_ = std::move(column_indices);
+    name_projection_.clear();
+    return *this;
+}
+
+// Selects the columns to scan by name. Clears any index-based projection, so
+// the last Project* call wins. Names are resolved later, when the scanner is
+// created.
+TableScan& TableScan::ProjectByName(std::vector<std::string> column_names) {
+    name_projection_ = std::move(column_names);
+    projection_.clear();
+    return *this;
+}
+
+// Translates name_projection_ into schema column indices, preserving the
+// order in which the names were given. Each name is matched against the table
+// schema with a linear scan and the first match wins.
+// Throws std::runtime_error for a name that is not in the schema.
+// Dereferences table_ without a null check; the caller in this file
+// (DoCreateScanner) checks it first.
+std::vector<size_t> TableScan::ResolveNameProjection() const {
+    auto table_info = table_->get_table_info_from_table();
+    const auto& schema_cols = table_info.schema.columns;
+
+    std::vector<size_t> resolved;
+    for (const auto& wanted : name_projection_) {
+        size_t pos = 0;
+        while (pos < schema_cols.size() && !(std::string(schema_cols[pos].name) == wanted)) {
+            ++pos;
+        }
+        if (pos == schema_cols.size()) {
+            throw std::runtime_error("Column '" + wanted + "' not found");
+        }
+        resolved.push_back(pos);
+    }
+    return resolved;
+}
+
+// Creates a scanner with the record-batch flag off (is_record_batch = false).
+Result TableScan::CreateLogScanner(LogScanner& out) { return DoCreateScanner(out, false); }
+
+// Creates a scanner with the record-batch flag on (is_record_batch = true).
+Result TableScan::CreateRecordBatchLogScanner(LogScanner& out) {
+    return DoCreateScanner(out, true);
+}
+
+// Shared implementation behind CreateLogScanner and
+// CreateRecordBatchLogScanner.
+//
+// A name-based projection takes precedence and is resolved to indices here;
+// otherwise the index-based projection (possibly empty) is used as-is.
+// `is_record_batch` is forwarded unchanged to the FFI create_scanner call.
+//
+// Returns a client error when no table is attached or when an exception
+// derived from std::exception is thrown inside the try block. On success the
+// new scanner is stored in `out`; on failure `out` is left untouched.
+Result TableScan::DoCreateScanner(LogScanner& out, bool is_record_batch) {
+    if (table_ == nullptr) {
+        return utils::make_client_error("Table not available");
+    }
+
+    try {
+        auto resolved_indices = !name_projection_.empty() ? ResolveNameProjection() : projection_;
+        // Copy the indices into a rust::Vec, which is what the FFI call takes.
+        rust::Vec<size_t> rust_indices;
+        for (size_t idx : resolved_indices) {
+            rust_indices.push_back(idx);
+        }
+        auto ffi_result = table_->create_scanner(std::move(rust_indices), is_record_batch);
+        auto result = utils::from_ffi_result(ffi_result.result);
+        if (result.Ok()) {
+            // Release any scanner `out` already owns before overwriting the raw
+            // pointer; otherwise reusing a LogScanner would leak the old one.
+            out.Destroy();
+            out.scanner_ = utils::ptr_from_ffi<ffi::LogScanner>(ffi_result);
+        }
+        return result;
+    } catch (const std::exception& e) {
+        // ResolveNameProjection() may throw
+        return utils::make_client_error(e.what());
+    }
+}
+
+// ============================================================================
+// WriteResult
+// ============================================================================
+
+// Default-constructed result; members take their in-class defaults.
+WriteResult::WriteResult() noexcept = default;
+
+// Wraps an FFI write-result pointer. The destructor passes it to
+// ffi::delete_write_result, so this object is responsible for releasing it.
+WriteResult::WriteResult(ffi::WriteResult* inner) noexcept : inner_(inner) {}
+
+// Releases the wrapped FFI object, if any.
+WriteResult::~WriteResult() noexcept { Destroy(); }
+
+// Deletes the FFI object through ffi::delete_write_result and nulls the
+// pointer, so repeated calls are harmless.
+void WriteResult::Destroy() noexcept {
+    if (inner_) {
+        ffi::delete_write_result(inner_);
+        inner_ = nullptr;
+    }
+}
+
+// Move constructor: takes over the pointer and nulls other.inner_.
+WriteResult::WriteResult(WriteResult&& other) noexcept : inner_(other.inner_) {
+    other.inner_ = nullptr;
+}
+
+// Move assignment: releases the current object first, then takes over
+// `other`'s pointer. Self-assignment is a no-op.
+WriteResult& WriteResult::operator=(WriteResult&& other) noexcept {
+    if (this != &other) {
+        Destroy();
+        inner_ = other.inner_;
+        other.inner_ = nullptr;
+    }
+    return *this;
+}
+
+// True while an FFI write result is held.
+bool WriteResult::Available() const { return inner_ != nullptr; }
+
+// Calls wait() on the FFI write result and returns the converted Result.
+// An empty WriteResult is not treated as an error: Wait() returns OK
+// immediately in that case.
+Result WriteResult::Wait() {
+    if (!Available()) {
+        return utils::make_ok();
+    }
+
+    auto ffi_result = inner_->wait();
+    return utils::from_ffi_result(ffi_result);
+}
+
+// ============================================================================
+// AppendWriter
+// ============================================================================
+
+// Default-constructed writer; members take their in-class defaults.
+AppendWriter::AppendWriter() noexcept = default;
+
+// Wraps an FFI append-writer pointer. The destructor passes it to
+// ffi::delete_append_writer, so this object is responsible for releasing it.
+AppendWriter::AppendWriter(ffi::AppendWriter* writer) noexcept : writer_(writer) {}
+
+// Releases the wrapped FFI writer, if any.
+AppendWriter::~AppendWriter() noexcept { Destroy(); }
+
+// Deletes the FFI writer through ffi::delete_append_writer and nulls the
+// pointer, so repeated calls are harmless.
+void AppendWriter::Destroy() noexcept {
+    if (writer_) {
+        ffi::delete_append_writer(writer_);
+        writer_ = nullptr;
+    }
+}
+
+// Move constructor: takes over the pointer and nulls other.writer_.
+AppendWriter::AppendWriter(AppendWriter&& other) noexcept : writer_(other.writer_) {
+    other.writer_ = nullptr;
+}
+
+// Move assignment: releases the current writer first, then takes over
+// `other`'s pointer. Self-assignment is a no-op.
+AppendWriter& AppendWriter::operator=(AppendWriter&& other) noexcept {
+    if (this != &other) {
+        Destroy();
+        writer_ = other.writer_;
+        other.writer_ = nullptr;
+    }
+    return *this;
+}
+
+// True while an FFI append writer is held.
+bool AppendWriter::Available() const { return writer_ != nullptr; }
+
+// Appends `row` without exposing the write handle: the WriteResult is a local
+// that is destroyed when this function returns, without Wait() being called.
+Result AppendWriter::Append(const GenericRow& row) {
+    WriteResult wr;
+    return Append(row, wr);
+}
+
+// Appends `row` and, on success, move-assigns the resulting write handle into
+// `out`. Returns a client error when either the writer or the row is not
+// available; otherwise returns the Result converted from the FFI call.
+// `out` is left untouched on failure.
+Result AppendWriter::Append(const GenericRow& row, WriteResult& out) {
+    if (!Available()) {
+        return utils::make_client_error("AppendWriter not available");
+    }
+    if (!row.Available()) {
+        return utils::make_client_error("GenericRow not available");
+    }
+
+    auto ffi_result = writer_->append(*row.inner_);
+    auto result = utils::from_ffi_result(ffi_result.result);
+    if (result.Ok()) {
+        out = WriteResult(utils::ptr_from_ffi<ffi::WriteResult>(ffi_result));
+    }
+    return result;
+}
+
+// Appends an Arrow batch without exposing the write handle: the WriteResult
+// is a local that is destroyed when this function returns.
+Result AppendWriter::AppendArrowBatch(const std::shared_ptr<arrow::RecordBatch>& batch) {
+    WriteResult wr;
+    return AppendArrowBatch(batch, wr);
+}
+
+// Appends an Arrow RecordBatch by exporting it through the Arrow C Data
+// Interface and handing the exported structs to the Rust writer.
+//
+// Returns a client error when the writer is not available, when `batch` is
+// null, or when the export fails; otherwise returns the Result converted from
+// the FFI call. On success the write handle is move-assigned into `out`
+// (releasing whatever `out` held before); on failure `out` is left untouched.
+Result AppendWriter::AppendArrowBatch(const std::shared_ptr<arrow::RecordBatch>& batch,
+                                      WriteResult& out) {
+    if (!Available()) {
+        return utils::make_client_error("AppendWriter not available");
+    }
+    if (!batch) {
+        return utils::make_client_error("Arrow RecordBatch is null");
+    }
+
+    // Heap-allocate both C Data Interface structs *before* exporting and keep
+    // them in unique_ptrs until the hand-off. If either allocation throws,
+    // nothing has been exported yet and the other allocation is freed
+    // automatically, so no memory or exported Arrow data is leaked.
+    auto array_heap = std::make_unique<ArrowArray>();
+    auto schema_heap = std::make_unique<ArrowSchema>();
+
+    // Export via Arrow C Data Interface, directly into the heap structs.
+    auto status = arrow::ExportRecordBatch(*batch, array_heap.get(), schema_heap.get());
+    if (!status.ok()) {
+        return utils::make_client_error("Failed to export Arrow batch: " + status.ToString());
+    }
+
+    // Rust takes ownership of both pointers immediately via Box::from_raw(),
+    // so after this call C++ must NOT free them. release() gives up the
+    // unique_ptr ownership right at the hand-off.
+    const auto array_addr = reinterpret_cast<size_t>(array_heap.release());
+    const auto schema_addr = reinterpret_cast<size_t>(schema_heap.release());
+    auto ffi_result = writer_->append_arrow_batch(array_addr, schema_addr);
+    auto result = utils::from_ffi_result(ffi_result.result);
+    if (result.Ok()) {
+        // Same hand-off as Append(): move-assignment destroys the previous
+        // handle held by `out` before storing the new one.
+        out = WriteResult(utils::ptr_from_ffi<ffi::WriteResult>(ffi_result));
+    }
+    return result;
+}
+
+// Calls flush() on the FFI writer and returns the converted Result.
+// Returns a client error when the writer is not available.
+Result AppendWriter::Flush() {
+    if (!Available()) {
+        return utils::make_client_error("AppendWriter not available");
+    }
+
+    auto ffi_result = writer_->flush();
+    return utils::from_ffi_result(ffi_result);
+}
+
+// ============================================================================
+// UpsertWriter
+// ============================================================================
+
+// Default-constructed writer; members take their in-class defaults.
+UpsertWriter::UpsertWriter() noexcept = default;
+
+// Wraps an FFI upsert-writer pointer. The destructor passes it to
+// ffi::delete_upsert_writer, so this object is responsible for releasing it.
+UpsertWriter::UpsertWriter(ffi::UpsertWriter* writer) noexcept : writer_(writer) {}
+
+// Releases the wrapped FFI writer, if any.
+UpsertWriter::~UpsertWriter() noexcept { Destroy(); }
+
+// Deletes the FFI writer through ffi::delete_upsert_writer and nulls the
+// pointer, so repeated calls are harmless.
+void UpsertWriter::Destroy() noexcept {
+    if (writer_) {
+        ffi::delete_upsert_writer(writer_);
+        writer_ = nullptr;
+    }
+}
+
+// Move constructor: takes over the pointer and nulls other.writer_.
+UpsertWriter::UpsertWriter(UpsertWriter&& other) noexcept : writer_(other.writer_) {
+    other.writer_ = nullptr;
+}
+
+// Move assignment: releases the current writer first, then takes over
+// `other`'s pointer. Self-assignment is a no-op.
+UpsertWriter& UpsertWriter::operator=(UpsertWriter&& other) noexcept {
+    if (this != &other) {
+        Destroy();
+        writer_ = other.writer_;
+        other.writer_ = nullptr;
+    }
+    return *this;
+}
+
+// True while an FFI upsert writer is held.
+bool UpsertWriter::Available() const { return writer_ != nullptr; }
+
+// Upserts `row` without exposing the write handle: the WriteResult is a local
+// that is destroyed when this function returns, without Wait() being called.
+Result UpsertWriter::Upsert(const GenericRow& row) {
+    WriteResult wr;
+    return Upsert(row, wr);
+}
+
+// Upserts `row` and, on success, move-assigns the resulting write handle into
+// `out`. Returns a client error when either the writer or the row is not
+// available; otherwise returns the Result converted from the FFI call.
+// `out` is left untouched on failure.
+Result UpsertWriter::Upsert(const GenericRow& row, WriteResult& out) {
+    if (!Available()) {
+        return utils::make_client_error("UpsertWriter not available");
+    }
+    if (!row.Available()) {
+        return utils::make_client_error("GenericRow not available");
+    }
+
+    auto ffi_result = writer_->upsert(*row.inner_);
+    auto result = utils::from_ffi_result(ffi_result.result);
+    if (result.Ok()) {
+        out = WriteResult(utils::ptr_from_ffi<ffi::WriteResult>(ffi_result));
+    }
+    return result;
+}
+
+// Convenience overload: deletes `row` and discards the resulting WriteResult.
+Result UpsertWriter::Delete(const GenericRow& row) {
+    WriteResult discarded;
+    return Delete(row, discarded);
+}
+
+// Issues a delete for `row` through the FFI writer. On success `out` is
+// replaced by a WriteResult wrapping the pointer returned by the FFI call; on
+// any failure `out` is left untouched.
+Result UpsertWriter::Delete(const GenericRow& row, WriteResult& out) {
+    if (!Available()) {
+        return utils::make_client_error("UpsertWriter not available");
+    }
+    if (!row.Available()) {
+        return utils::make_client_error("GenericRow not available");
+    }
+
+    auto ffi_result = writer_->delete_row(*row.inner_);
+    auto status = utils::from_ffi_result(ffi_result.result);
+    if (!status.Ok()) {
+        return status;
+    }
+    out = WriteResult(utils::ptr_from_ffi<ffi::WriteResult>(ffi_result));
+    return status;
+}
+
+// Flushes the underlying FFI writer and converts the FFI status into a Result.
+// Returns a client error when no writer handle is held.
+Result UpsertWriter::Flush() {
+    if (Available()) {
+        auto flush_status = writer_->upsert_flush();
+        return utils::from_ffi_result(flush_status);
+    }
+
+    return utils::make_client_error("UpsertWriter not available");
+}
+
+// ============================================================================
+// Lookuper
+// ============================================================================
+
+// Creates an empty lookuper; Available() reports false until a handle is moved in.
+Lookuper::Lookuper() noexcept = default;
+
+// Wraps an FFI lookuper pointer. Destroy() hands it to ffi::delete_lookuper.
+Lookuper::Lookuper(ffi::Lookuper* lookuper) noexcept : lookuper_(lookuper) {}
+
+Lookuper::~Lookuper() noexcept { Destroy(); }
+
+// Deletes the held FFI lookuper, if any, and resets the handle to null.
+void Lookuper::Destroy() noexcept {
+    if (lookuper_ == nullptr) {
+        return;
+    }
+    ffi::delete_lookuper(lookuper_);
+    lookuper_ = nullptr;
+}
+
+// Move construction takes over the handle and leaves `other` empty.
+Lookuper::Lookuper(Lookuper&& other) noexcept : lookuper_(other.lookuper_) {
+    other.lookuper_ = nullptr;
+}
+
+// Move assignment deletes the current handle before taking over `other`'s.
+Lookuper& Lookuper::operator=(Lookuper&& other) noexcept {
+    if (this == &other) {
+        return *this;
+    }
+    Destroy();
+    lookuper_ = other.lookuper_;
+    other.lookuper_ = nullptr;
+    return *this;
+}
+
+bool Lookuper::Available() const { return lookuper_ != nullptr; }
+
+// Looks up the row identified by `pk_row`. On success the previous content of
+// `out` is destroyed and `out` takes the raw result pointer; when the FFI
+// result reports an error, `out` is left untouched and the error is returned.
+Result Lookuper::Lookup(const GenericRow& pk_row, LookupResult& out) {
+    if (!Available()) {
+        return utils::make_client_error("Lookuper not available");
+    }
+    if (!pk_row.Available()) {
+        return utils::make_client_error("GenericRow not available");
+    }
+
+    auto lookup_box = lookuper_->lookup(*pk_row.inner_);
+    if (!lookup_box->lv_has_error()) {
+        out.Destroy();
+        out.inner_ = lookup_box.into_raw();
+        return utils::make_ok();
+    }
+
+    return utils::make_error(lookup_box->lv_error_code(),
+                             std::string(lookup_box->lv_error_message()));
+}
+
+// ============================================================================
+// LogScanner
+// ============================================================================
+
+// Creates an empty scanner; Available() reports false until a handle is moved in.
+LogScanner::LogScanner() noexcept = default;
+
+// Wraps an FFI scanner pointer. Destroy() hands it to ffi::delete_log_scanner.
+LogScanner::LogScanner(ffi::LogScanner* scanner) noexcept : scanner_(scanner) {}
+
+LogScanner::~LogScanner() noexcept { Destroy(); }
+
+// Deletes the held FFI scanner, if any, and resets the handle to null.
+void LogScanner::Destroy() noexcept {
+    if (scanner_ == nullptr) {
+        return;
+    }
+    ffi::delete_log_scanner(scanner_);
+    scanner_ = nullptr;
+}
+
+// Move construction takes over the handle and leaves `other` empty.
+LogScanner::LogScanner(LogScanner&& other) noexcept : scanner_(other.scanner_) {
+    other.scanner_ = nullptr;
+}
+
+// Move assignment deletes the current handle before taking over `other`'s.
+LogScanner& LogScanner::operator=(LogScanner&& other) noexcept {
+    if (this == &other) {
+        return *this;
+    }
+    Destroy();
+    scanner_ = other.scanner_;
+    other.scanner_ = nullptr;
+    return *this;
+}
+
+bool LogScanner::Available() const { return scanner_ != nullptr; }
+
+// Subscribes to a single bucket, starting at `start_offset`.
+Result LogScanner::Subscribe(int32_t bucket_id, int64_t start_offset) {
+    if (Available()) {
+        auto status = scanner_->subscribe(bucket_id, start_offset);
+        return utils::from_ffi_result(status);
+    }
+
+    return utils::make_client_error("LogScanner not available");
+}
+
+// Subscribes to several buckets in one FFI call. Each entry is copied into its
+// FFI counterpart before the list is handed over to the scanner.
+Result LogScanner::Subscribe(const std::vector<BucketSubscription>& bucket_offsets) {
+    if (!Available()) {
+        return utils::make_client_error("LogScanner not available");
+    }
+
+    rust::Vec<ffi::FfiBucketSubscription> converted;
+    for (const BucketSubscription& entry : bucket_offsets) {
+        ffi::FfiBucketSubscription item;
+        item.bucket_id = entry.bucket_id;
+        item.offset = entry.offset;
+        converted.push_back(item);
+    }
+
+    auto status = scanner_->subscribe_buckets(std::move(converted));
+    return utils::from_ffi_result(status);
+}
+
+// Subscribes to one bucket of one partition, starting at `start_offset`.
+Result LogScanner::SubscribePartitionBuckets(int64_t partition_id, int32_t bucket_id,
+                                             int64_t start_offset) {
+    if (Available()) {
+        auto status = scanner_->subscribe_partition(partition_id, bucket_id, start_offset);
+        return utils::from_ffi_result(status);
+    }
+
+    return utils::make_client_error("LogScanner not available");
+}
+
+// Subscribes to several partition buckets in one FFI call. Each entry is
+// copied into its FFI counterpart before the list is handed to the scanner.
+Result LogScanner::SubscribePartitionBuckets(
+    const std::vector<PartitionBucketSubscription>& subscriptions) {
+    if (!Available()) {
+        return utils::make_client_error("LogScanner not available");
+    }
+
+    rust::Vec<ffi::FfiPartitionBucketSubscription> converted;
+    for (const PartitionBucketSubscription& entry : subscriptions) {
+        ffi::FfiPartitionBucketSubscription item;
+        item.partition_id = entry.partition_id;
+        item.bucket_id = entry.bucket_id;
+        item.offset = entry.offset;
+        converted.push_back(item);
+    }
+
+    auto status = scanner_->subscribe_partition_buckets(std::move(converted));
+    return utils::from_ffi_result(status);
+}
+
+// Removes the subscription for `bucket_id`.
+Result LogScanner::Unsubscribe(int32_t bucket_id) {
+    if (Available()) {
+        auto status = scanner_->unsubscribe(bucket_id);
+        return utils::from_ffi_result(status);
+    }
+
+    return utils::make_client_error("LogScanner not available");
+}
+
+// Removes the subscription for `bucket_id` within `partition_id`.
+Result LogScanner::UnsubscribePartition(int64_t partition_id, int32_t bucket_id) {
+    if (Available()) {
+        auto status = scanner_->unsubscribe_partition(partition_id, bucket_id);
+        return utils::from_ffi_result(status);
+    }
+
+    return utils::make_client_error("LogScanner not available");
+}
+
+// Polls the scanner with `timeout_ms` and stores the records in `out`.
+// When the FFI result reports an error, that error is returned and `out` is
+// left untouched.
+Result LogScanner::Poll(int64_t timeout_ms, ScanRecords& out) {
+    if (!Available()) {
+        return utils::make_client_error("LogScanner not available");
+    }
+
+    auto poll_box = scanner_->poll(timeout_ms);
+    if (poll_box->sv_has_error()) {
+        return utils::make_error(poll_box->sv_error_code(),
+                                 std::string(poll_box->sv_error_message()));
+    }
+
+    // Hand the raw pointer to ScanData right away so later steps cannot leak it.
+    auto scan_data =
+        std::make_shared<detail::ScanData>(poll_box.into_raw(), detail::ColumnMap{});
+
+    // The name -> (index, type) map is built once here and travels with scan_data.
+    const auto column_total = scan_data->raw->sv_column_count();
+    for (size_t idx = 0; idx < column_total; ++idx) {
+        auto column_name = scan_data->raw->sv_column_name(idx);
+        std::string key(column_name.data(), column_name.size());
+        scan_data->columns[std::move(key)] = {
+            idx, static_cast<TypeId>(scan_data->raw->sv_column_type(idx))};
+    }
+
+    out.data_ = std::move(scan_data);
+    return utils::make_ok();
+}
+
+// Bundles an imported Arrow record batch with the table/partition/bucket ids
+// and the base offset it was read from.
+ArrowRecordBatch::ArrowRecordBatch(std::shared_ptr<arrow::RecordBatch> batch, int64_t table_id,
+                                   int64_t partition_id, int32_t bucket_id,
+                                   int64_t base_offset) noexcept
+    : batch_(std::move(batch)),
+      table_id_(table_id),
+      partition_id_(partition_id),
+      bucket_id_(bucket_id),
+      base_offset_(base_offset) {}
+
+// True when a record batch is attached.
+bool ArrowRecordBatch::Available() const { return batch_ != nullptr; }
+
+// Row count of the attached batch, or 0 when no batch is attached.
+int64_t ArrowRecordBatch::NumRows() const { return Available() ? batch_->num_rows() : 0; }
+
+// Table id, or 0 when no batch is attached.
+int64_t ArrowRecordBatch::GetTableId() const { return Available() ? table_id_ : 0; }
+
+// Partition id, or -1 when no batch is attached.
+int64_t ArrowRecordBatch::GetPartitionId() const { return Available() ? partition_id_ : -1; }
+
+// Bucket id, or -1 when no batch is attached.
+int32_t ArrowRecordBatch::GetBucketId() const { return Available() ? bucket_id_ : -1; }
+
+// Offset of the first row, or -1 when no batch is attached.
+int64_t ArrowRecordBatch::GetBaseOffset() const { return Available() ? base_offset_ : -1; }
+
+// Offset of the last row (base offset + row count - 1), or -1 when no batch
+// is attached.
+int64_t ArrowRecordBatch::GetLastOffset() const {
+    if (!Available()) {
+        return -1;
+    }
+    const int64_t row_count = NumRows();
+    return base_offset_ + row_count - 1;
+}
+
+// Polls the scanner with `timeout_ms` and imports every returned Arrow batch
+// into an ArrowRecordBatch appended to `out.batches`.
+//
+// `out.batches` is cleared once the poll itself has succeeded. If importing a
+// batch fails, a client error is returned and the batches imported before the
+// failure remain in `out`. In that case the Rust-allocated container
+// structures of every batch that was not visited yet are freed as well, so an
+// import failure in the middle of the list no longer leaks the remainder.
+Result LogScanner::PollRecordBatch(int64_t timeout_ms, ArrowRecordBatches& out) {
+    if (!Available()) {
+        return utils::make_client_error("LogScanner not available");
+    }
+
+    auto ffi_result = scanner_->poll_record_batch(timeout_ms);
+    auto result = utils::from_ffi_result(ffi_result.result);
+    if (!result.Ok()) {
+        return result;
+    }
+
+    // Convert the FFI Arrow record batches to C++ ArrowRecordBatch objects
+    out.batches.clear();
+    const auto& ffi_batches = ffi_result.arrow_batches.batches;
+    for (size_t i = 0; i < ffi_batches.size(); ++i) {
+        const auto& ffi_batch = ffi_batches[i];
+        auto* c_array = reinterpret_cast<struct ArrowArray*>(ffi_batch.array_ptr);
+        auto* c_schema = reinterpret_cast<struct ArrowSchema*>(ffi_batch.schema_ptr);
+
+        auto import_result = arrow::ImportRecordBatch(c_array, c_schema);
+
+        // The container structures were allocated in Rust and must be freed
+        // whether or not the import succeeded.
+        ffi::free_arrow_ffi_structures(ffi_batch.array_ptr, ffi_batch.schema_ptr);
+
+        if (!import_result.ok()) {
+            // Free the batches that will never be imported so they do not leak.
+            // NOTE(review): relies on free_arrow_ffi_structures being safe for
+            // structures that were never imported — confirm against the Rust side.
+            for (size_t j = i + 1; j < ffi_batches.size(); ++j) {
+                ffi::free_arrow_ffi_structures(ffi_batches[j].array_ptr,
+                                               ffi_batches[j].schema_ptr);
+            }
+
+            std::string error_msg =
+                "Failed to import Arrow record batch: " + import_result.status().ToString();
+            return utils::make_client_error(error_msg);
+        }
+
+        auto batch_ptr = import_result.ValueOrDie();
+        auto batch_wrapper = std::unique_ptr<ArrowRecordBatch>(new ArrowRecordBatch(
+            std::move(batch_ptr), ffi_batch.table_id, ffi_batch.partition_id,
+            ffi_batch.bucket_id, ffi_batch.base_offset));
+        out.batches.push_back(std::move(batch_wrapper));
+    }
+
+    return utils::make_ok();
+}
+
+}  // namespace fluss
diff --git a/fluss-rust/bindings/cpp/src/types.rs b/fluss-rust/bindings/cpp/src/types.rs
new file mode 100644
index 0000000000..23ac636d4c
--- /dev/null
+++ b/fluss-rust/bindings/cpp/src/types.rs
@@ -0,0 +1,646 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::ffi;
+use anyhow::{Result, anyhow};
+use arrow::array::Array;
+use arrow::ffi::{FFI_ArrowArray, FFI_ArrowSchema};
+use fluss as fcore;
+use std::borrow::Cow;
+use std::str::FromStr;
+
+// Integer tags that identify a data type inside FFI structs such as
+// `ffi::FfiColumn::data_type`. `core_data_type_to_ffi` returns 0 for core
+// types that have no tag here, and `ffi_data_type_to_core` rejects any value
+// outside this list as an unknown data type.
+pub const DATA_TYPE_BOOLEAN: i32 = 1;
+pub const DATA_TYPE_TINYINT: i32 = 2;
+pub const DATA_TYPE_SMALLINT: i32 = 3;
+pub const DATA_TYPE_INT: i32 = 4;
+pub const DATA_TYPE_BIGINT: i32 = 5;
+pub const DATA_TYPE_FLOAT: i32 = 6;
+pub const DATA_TYPE_DOUBLE: i32 = 7;
+pub const DATA_TYPE_STRING: i32 = 8;
+pub const DATA_TYPE_BYTES: i32 = 9;
+pub const DATA_TYPE_DATE: i32 = 10;
+pub const DATA_TYPE_TIME: i32 = 11;
+pub const DATA_TYPE_TIMESTAMP: i32 = 12;
+pub const DATA_TYPE_TIMESTAMP_LTZ: i32 = 13;
+pub const DATA_TYPE_DECIMAL: i32 = 14;
+pub const DATA_TYPE_CHAR: i32 = 15;
+pub const DATA_TYPE_BINARY: i32 = 16;
+// ARRAY columns describe their leaf element through the separate `element_*`
+// and `array_*` fields of `ffi::FfiColumn`.
+pub const DATA_TYPE_ARRAY: i32 = 17;
+
+/// Separates scalar and array type specs so each variant only carries
+/// the fields it actually needs — no zeroed-out placeholders.
+///
+/// Consumed by `ffi_data_type_to_core`, which turns a spec into a core
+/// `DataType`.
+enum FfiDataTypeSpec {
+    /// A non-array type.
+    Scalar {
+        /// One of the `DATA_TYPE_*` tags (other than `DATA_TYPE_ARRAY`).
+        data_type: i32,
+        /// Precision for TIMESTAMP / TIMESTAMP_LTZ / DECIMAL; length for
+        /// CHAR / BINARY. Not read for the other tags.
+        precision: u32,
+        /// Scale; only read for DECIMAL.
+        scale: u32,
+        /// When `false` the resulting type is made non-nullable.
+        nullable: bool,
+    },
+    /// An ARRAY type described by its leaf element and nesting depth.
+    Array {
+        /// `DATA_TYPE_*` tag of the leaf (innermost) element.
+        element_data_type: i32,
+        /// Precision (or length) of the leaf element.
+        element_precision: u32,
+        /// Scale of the leaf element.
+        element_scale: u32,
+        /// Number of ARRAY wrappers around the leaf; must be >= 1.
+        array_nesting: u32,
+        /// `nesting` entries for each ARRAY wrapper (outermost first) plus
+        /// one trailing entry for the leaf scalar. Length = `nesting + 1`.
+        /// Missing entries are treated as nullable.
+        array_nullability: Vec<u8>,
+    },
+}
+
+/// Converts the type information of an FFI column into a core `DataType`.
+///
+/// ARRAY columns are described through their `element_*` / `array_*` fields;
+/// every other column through `data_type`, `precision`, `scale` and `nullable`.
+fn ffi_column_to_core_data_type(col: &ffi::FfiColumn) -> Result<fcore::metadata::DataType> {
+    let spec = if col.data_type != DATA_TYPE_ARRAY {
+        FfiDataTypeSpec::Scalar {
+            data_type: col.data_type,
+            precision: col.precision as u32,
+            scale: col.scale as u32,
+            nullable: col.nullable,
+        }
+    } else {
+        FfiDataTypeSpec::Array {
+            element_data_type: col.element_data_type,
+            element_precision: col.element_precision as u32,
+            element_scale: col.element_scale as u32,
+            // Negative nesting is clamped to 0 before the cast.
+            array_nesting: col.array_nesting.max(0) as u32,
+            array_nullability: col.array_nullability.clone(),
+        }
+    };
+    ffi_data_type_to_core(spec)
+}
+
+/// Returns the `(precision, scale)` pair reported over FFI for `dt`.
+///
+/// DECIMAL yields its precision and scale, TIMESTAMP / TIMESTAMP_LTZ yield
+/// their precision, CHAR / BINARY yield their length in the first slot, and
+/// every other type yields `(0, 0)`.
+fn type_precision_scale(dt: &fcore::metadata::DataType) -> (i32, i32) {
+    use fcore::metadata::DataType as Dt;
+
+    match dt {
+        Dt::Decimal(decimal) => {
+            let precision = decimal.precision() as i32;
+            let scale = decimal.scale() as i32;
+            (precision, scale)
+        }
+        Dt::Timestamp(timestamp) => (timestamp.precision() as i32, 0),
+        Dt::TimestampLTz(timestamp) => (timestamp.precision() as i32, 0),
+        Dt::Char(fixed) => (fixed.length() as i32, 0),
+        Dt::Binary(fixed) => (fixed.length() as i32, 0),
+        _ => (0, 0),
+    }
+}
+
+/// Flat description of a (possibly nested) ARRAY type, as produced by
+/// `flatten_array_leaf_type`: the nesting depth plus the leaf element's type.
+struct FlattenedLeafType {
+    /// Number of ARRAY wrappers around the leaf (>= 1).
+    nesting: i32,
+    /// `DATA_TYPE_*` tag of the leaf element.
+    leaf_type: i32,
+    /// First value of `type_precision_scale` for the leaf element.
+    leaf_precision: i32,
+    /// Second value of `type_precision_scale` for the leaf element.
+    leaf_scale: i32,
+    /// `nesting` entries for ARRAY wrappers (outermost first) plus one
+    /// trailing entry for the leaf scalar. Length = `nesting + 1`.
+    array_nullability: Vec<u8>,
+}
+
+/// Flattens a (possibly nested) ARRAY type into its nesting depth, leaf type
+/// and per-level nullability flags.
+///
+/// Fails when `dt` is not an ARRAY or when the leaf type has no FFI tag.
+fn flatten_array_leaf_type(dt: &fcore::metadata::DataType) -> Result<FlattenedLeafType> {
+    let mut array_nullability: Vec<u8> = Vec::new();
+    let mut leaf = dt;
+
+    // Peel ARRAY wrappers outermost-first, recording each wrapper's nullability.
+    while let fcore::metadata::DataType::Array(array_type) = leaf {
+        array_nullability.push(u8::from(leaf.is_nullable()));
+        leaf = array_type.get_element_type();
+    }
+    if array_nullability.is_empty() {
+        return Err(anyhow!("Expected ARRAY data type, got {dt}"));
+    }
+    let nesting = array_nullability.len() as i32;
+
+    let leaf_type = match core_data_type_to_ffi(leaf) {
+        0 => {
+            return Err(anyhow!(
+                "Unsupported ARRAY leaf type for C++ bindings: {leaf}"
+            ));
+        }
+        tag => tag,
+    };
+
+    // The trailing entry describes the leaf scalar itself.
+    array_nullability.push(u8::from(leaf.is_nullable()));
+    let (leaf_precision, leaf_scale) = type_precision_scale(leaf);
+
+    Ok(FlattenedLeafType {
+        nesting,
+        leaf_type,
+        leaf_precision,
+        leaf_scale,
+        array_nullability,
+    })
+}
+
+/// Rebuilds a nested ARRAY type from its flat description.
+///
+/// `array_nullability[i]` is the nullability of the i-th ARRAY wrapper
+/// (outermost first) and `array_nullability[array_nesting]` that of the leaf;
+/// missing entries default to nullable. Fails when `array_nesting` is 0 or
+/// when the leaf type cannot be converted.
+fn build_array_type_from_leaf(
+    element_data_type: i32,
+    element_precision: u32,
+    element_scale: u32,
+    array_nesting: u32,
+    array_nullability: &[u8],
+) -> Result<fcore::metadata::DataType> {
+    if array_nesting == 0 {
+        return Err(anyhow!("ARRAY nesting must be >= 1"));
+    }
+
+    let nullable_at = |level: usize| array_nullability.get(level).map_or(true, |flag| *flag != 0);
+
+    let leaf = ffi_data_type_to_core(FfiDataTypeSpec::Scalar {
+        data_type: element_data_type,
+        precision: element_precision,
+        scale: element_scale,
+        nullable: nullable_at(array_nesting as usize),
+    })?;
+
+    // Wrap from the innermost ARRAY level outwards.
+    let wrapped = (0..array_nesting).rev().fold(leaf, |inner, level| {
+        let nullable = nullable_at(level as usize);
+        fcore::metadata::DataType::Array(fcore::metadata::ArrayType::with_nullable(
+            nullable, inner,
+        ))
+    });
+    Ok(wrapped)
+}
+
+/// Converts an FFI type spec into a core `DataType`.
+///
+/// Scalar specs are mapped tag by tag and made non-nullable when `nullable`
+/// is `false`; an unknown tag is an error. Array specs are delegated to
+/// `build_array_type_from_leaf`.
+fn ffi_data_type_to_core(spec: FfiDataTypeSpec) -> Result<fcore::metadata::DataType> {
+    use fcore::metadata::{DataType, DataTypes, DecimalType};
+
+    match spec {
+        FfiDataTypeSpec::Array {
+            element_data_type,
+            element_precision,
+            element_scale,
+            array_nesting,
+            ref array_nullability,
+        } => build_array_type_from_leaf(
+            element_data_type,
+            element_precision,
+            element_scale,
+            array_nesting,
+            array_nullability,
+        ),
+        FfiDataTypeSpec::Scalar {
+            data_type,
+            precision,
+            scale,
+            nullable,
+        } => {
+            let base = match data_type {
+                DATA_TYPE_BOOLEAN => DataTypes::boolean(),
+                DATA_TYPE_TINYINT => DataTypes::tinyint(),
+                DATA_TYPE_SMALLINT => DataTypes::smallint(),
+                DATA_TYPE_INT => DataTypes::int(),
+                DATA_TYPE_BIGINT => DataTypes::bigint(),
+                DATA_TYPE_FLOAT => DataTypes::float(),
+                DATA_TYPE_DOUBLE => DataTypes::double(),
+                DATA_TYPE_STRING => DataTypes::string(),
+                DATA_TYPE_BYTES => DataTypes::bytes(),
+                DATA_TYPE_DATE => DataTypes::date(),
+                DATA_TYPE_TIME => DataTypes::time(),
+                DATA_TYPE_TIMESTAMP => DataTypes::timestamp_with_precision(precision),
+                DATA_TYPE_TIMESTAMP_LTZ => DataTypes::timestamp_ltz_with_precision(precision),
+                DATA_TYPE_DECIMAL => DataType::Decimal(DecimalType::new(precision, scale)?),
+                // CHAR and BINARY carry their length in the precision slot.
+                DATA_TYPE_CHAR => DataTypes::char(precision),
+                DATA_TYPE_BINARY => DataTypes::binary(precision as usize),
+                _ => return Err(anyhow!("Unknown data type: {}", data_type)),
+            };
+            Ok(if nullable {
+                base
+            } else {
+                base.as_non_nullable()
+            })
+        }
+    }
+}
+
+/// Maps a core `DataType` to its `DATA_TYPE_*` tag.
+///
+/// Returns 0 for any core type that has no tag.
+pub fn core_data_type_to_ffi(dt: &fcore::metadata::DataType) -> i32 {
+    use fcore::metadata::DataType as Dt;
+
+    match dt {
+        Dt::Boolean(_) => DATA_TYPE_BOOLEAN,
+        Dt::TinyInt(_) => DATA_TYPE_TINYINT,
+        Dt::SmallInt(_) => DATA_TYPE_SMALLINT,
+        Dt::Int(_) => DATA_TYPE_INT,
+        Dt::BigInt(_) => DATA_TYPE_BIGINT,
+        Dt::Float(_) => DATA_TYPE_FLOAT,
+        Dt::Double(_) => DATA_TYPE_DOUBLE,
+        Dt::String(_) => DATA_TYPE_STRING,
+        Dt::Bytes(_) => DATA_TYPE_BYTES,
+        Dt::Date(_) => DATA_TYPE_DATE,
+        Dt::Time(_) => DATA_TYPE_TIME,
+        Dt::Timestamp(_) => DATA_TYPE_TIMESTAMP,
+        Dt::TimestampLTz(_) => DATA_TYPE_TIMESTAMP_LTZ,
+        Dt::Decimal(_) => DATA_TYPE_DECIMAL,
+        Dt::Char(_) => DATA_TYPE_CHAR,
+        Dt::Binary(_) => DATA_TYPE_BINARY,
+        Dt::Array(_) => DATA_TYPE_ARRAY,
+        // No FFI tag for this type.
+        _ => 0,
+    }
+}
+
+/// Converts a core column into its FFI representation.
+///
+/// For ARRAY columns the `array_*` / `element_*` fields carry the flattened
+/// leaf description. For every other column, and for ARRAY columns whose
+/// flattening fails, those fields are zero / empty.
+fn core_column_to_ffi(col: &fcore::metadata::Column) -> ffi::FfiColumn {
+    let data_type = col.data_type();
+    let (precision, scale) = type_precision_scale(data_type);
+
+    let flat = if matches!(data_type, fcore::metadata::DataType::Array(_)) {
+        // A flattening error is deliberately dropped; the defaults below apply.
+        flatten_array_leaf_type(data_type).ok()
+    } else {
+        None
+    };
+    let (array_nesting, array_nullability, element_data_type, element_precision, element_scale) =
+        match flat {
+            Some(f) => (
+                f.nesting,
+                f.array_nullability,
+                f.leaf_type,
+                f.leaf_precision,
+                f.leaf_scale,
+            ),
+            None => (0, Vec::new(), 0, 0, 0),
+        };
+
+    ffi::FfiColumn {
+        name: col.name().to_string(),
+        data_type: core_data_type_to_ffi(data_type),
+        nullable: data_type.is_nullable(),
+        comment: col.comment().unwrap_or("").to_string(),
+        precision,
+        scale,
+        array_nesting,
+        array_nullability,
+        element_data_type,
+        element_precision,
+        element_scale,
+    }
+}
+
+/// Builds a core `TableDescriptor` from an FFI table descriptor.
+///
+/// Returns an error when a column has a negative `precision`, `scale` or
+/// `array_nesting`, when an ARRAY column has a negative `element_precision`
+/// or `element_scale`, when a column type cannot be converted, or when the
+/// schema or descriptor builder fails.
+///
+/// A `bucket_count` of 0 or less is passed on as "no explicit bucket count".
+/// Column comments, the primary key, custom properties and the table comment
+/// are only set when they are non-empty.
+pub fn ffi_descriptor_to_core(
+    descriptor: &ffi::FfiTableDescriptor,
+) -> Result<fcore::metadata::TableDescriptor> {
+    let mut schema_builder = fcore::metadata::Schema::builder();
+
+    for col in &descriptor.schema.columns {
+        if col.precision < 0 || col.scale < 0 || col.array_nesting < 0 {
+            return Err(anyhow!(
+                "Column '{}': precision, scale, and array_nesting must be non-negative",
+                col.name
+            ));
+        }
+        // For ARRAY columns the element fields are cast to `u32` during type
+        // conversion; a negative value would wrap to a huge precision/scale
+        // instead of being rejected. Non-array columns never read these fields.
+        if col.data_type == DATA_TYPE_ARRAY && (col.element_precision < 0 || col.element_scale < 0)
+        {
+            return Err(anyhow!(
+                "Column '{}': element_precision and element_scale must be non-negative",
+                col.name
+            ));
+        }
+        let dt = ffi_column_to_core_data_type(col)?;
+        schema_builder = schema_builder.column(&col.name, dt);
+        if !col.comment.is_empty() {
+            schema_builder = schema_builder.with_comment(&col.comment);
+        }
+    }
+
+    if !descriptor.schema.primary_keys.is_empty() {
+        schema_builder = schema_builder.primary_key(descriptor.schema.primary_keys.clone());
+    }
+
+    let schema = schema_builder.build()?;
+
+    let mut builder = fcore::metadata::TableDescriptor::builder()
+        .schema(schema)
+        .partitioned_by(descriptor.partition_keys.clone());
+
+    if descriptor.bucket_count > 0 {
+        builder = builder.distributed_by(
+            Some(descriptor.bucket_count),
+            descriptor.bucket_keys.clone(),
+        );
+    } else {
+        builder = builder.distributed_by(None, descriptor.bucket_keys.clone());
+    }
+
+    for prop in &descriptor.properties {
+        builder = builder.property(&prop.key, &prop.value);
+    }
+
+    if !descriptor.custom_properties.is_empty() {
+        let custom: std::collections::HashMap<String, String> = descriptor
+            .custom_properties
+            .iter()
+            .map(|kv| (kv.key.clone(), kv.value.clone()))
+            .collect();
+        builder = builder.custom_properties(custom);
+    }
+
+    if !descriptor.comment.is_empty() {
+        builder = builder.comment(&descriptor.comment);
+    }
+
+    Ok(builder.build()?)
+}
+
+/// Converts a core `TableInfo` into its FFI representation.
+///
+/// The top-level `primary_keys` come from `info.get_primary_keys()`, while
+/// `schema.primary_keys` come from the schema's primary key (empty when the
+/// schema has none). A missing table comment becomes an empty string.
+pub fn core_table_info_to_ffi(info: &fcore::metadata::TableInfo) -> ffi::FfiTableInfo {
+    let schema = info.get_schema();
+
+    let mut columns: Vec<ffi::FfiColumn> = Vec::new();
+    for column in schema.columns().iter() {
+        columns.push(core_column_to_ffi(column));
+    }
+
+    let schema_primary_keys: Vec<String> = match schema.primary_key() {
+        Some(pk) => pk.column_names().to_vec(),
+        None => Vec::new(),
+    };
+
+    let mut properties: Vec<ffi::HashMapValue> = Vec::new();
+    for (k, v) in info.get_properties().iter() {
+        properties.push(ffi::HashMapValue {
+            key: k.clone(),
+            value: v.clone(),
+        });
+    }
+
+    let mut custom_properties: Vec<ffi::HashMapValue> = Vec::new();
+    for (k, v) in info.get_custom_properties().iter() {
+        custom_properties.push(ffi::HashMapValue {
+            key: k.clone(),
+            value: v.clone(),
+        });
+    }
+
+    let path = info.get_table_path();
+    let table_path = ffi::FfiTablePath {
+        database_name: path.database().to_string(),
+        table_name: path.table().to_string(),
+    };
+
+    ffi::FfiTableInfo {
+        table_id: info.get_table_id(),
+        schema_id: info.get_schema_id(),
+        table_path,
+        created_time: info.get_created_time(),
+        modified_time: info.get_modified_time(),
+        primary_keys: info.get_primary_keys().clone(),
+        bucket_keys: info.get_bucket_keys().to_vec(),
+        partition_keys: info.get_partition_keys().to_vec(),
+        num_buckets: info.get_num_buckets(),
+        has_primary_key: info.has_primary_key(),
+        is_partitioned: info.is_partitioned(),
+        properties,
+        custom_properties,
+        comment: info.get_comment().unwrap_or("").to_string(),
+        schema: ffi::FfiSchema {
+            columns,
+            primary_keys: schema_primary_keys,
+        },
+    }
+}
+
+/// Returns an `FfiTableInfo` with every field zeroed, `false` or empty.
+pub fn empty_table_info() -> ffi::FfiTableInfo {
+    let table_path = ffi::FfiTablePath {
+        database_name: String::new(),
+        table_name: String::new(),
+    };
+    let schema = ffi::FfiSchema {
+        columns: Vec::new(),
+        primary_keys: Vec::new(),
+    };
+
+    ffi::FfiTableInfo {
+        table_id: 0,
+        schema_id: 0,
+        table_path,
+        created_time: 0,
+        modified_time: 0,
+        primary_keys: Vec::new(),
+        bucket_keys: Vec::new(),
+        partition_keys: Vec::new(),
+        num_buckets: 0,
+        has_primary_key: false,
+        is_partitioned: false,
+        properties: Vec::new(),
+        custom_properties: Vec::new(),
+        comment: String::new(),
+        schema,
+    }
+}
+
+/// Convert element type tag + precision/scale to core DataType.
+/// Used by ArrayWriterInner construction from C++.
+///
+/// Nullability is hardcoded to `true` (the default) because `ArrayWriter`
+/// only needs the type for encoding — the binary array format does not
+/// vary based on nullability. Nullability is a schema-level constraint
+/// enforced elsewhere (column definition, primary key normalization).
+pub fn element_type_from_ffi(
+    leaf_dt: i32,
+    precision: u32,
+    scale: u32,
+    array_nesting: u32,
+) -> Result<fcore::metadata::DataType> {
+    if array_nesting != 0 {
+        // One flag per ARRAY wrapper plus one for the leaf, all nullable.
+        let all_nullable = vec![1u8; (array_nesting + 1) as usize];
+        return build_array_type_from_leaf(
+            leaf_dt,
+            precision,
+            scale,
+            array_nesting,
+            &all_nullable,
+        );
+    }
+
+    // No nesting: the element is a plain nullable scalar.
+    let scalar = FfiDataTypeSpec::Scalar {
+        data_type: leaf_dt,
+        precision,
+        scale,
+        nullable: true,
+    };
+    ffi_data_type_to_core(scalar)
+}
+
+/// Convert FFI database descriptor to core. Returns None if descriptor is effectively empty
+/// (no comment and no properties), so create_database can pass Option::None to core.
+pub fn ffi_database_descriptor_to_core(
+    d: &ffi::FfiDatabaseDescriptor,
+) -> Option<fcore::metadata::DatabaseDescriptor> {
+    let has_comment = !d.comment.is_empty();
+    let has_properties = !d.properties.is_empty();
+    if !(has_comment || has_properties) {
+        return None;
+    }
+
+    let mut builder = fcore::metadata::DatabaseDescriptor::builder();
+    if has_comment {
+        builder = builder.comment(&d.comment);
+    }
+    if has_properties {
+        // FFI properties are forwarded as the descriptor's custom properties.
+        let mut props: std::collections::HashMap<String, String> =
+            std::collections::HashMap::new();
+        for kv in d.properties.iter() {
+            props.insert(kv.key.clone(), kv.value.clone());
+        }
+        builder = builder.custom_properties(props);
+    }
+    Some(builder.build())
+}
+
+/// Convert core DatabaseInfo to FFI.
+pub fn core_database_info_to_ffi(info: &fcore::metadata::DatabaseInfo) -> ffi::FfiDatabaseInfo {
+    let descriptor = info.database_descriptor();
+
+    // The descriptor's custom properties become the FFI `properties` list.
+    let mut properties: Vec<ffi::HashMapValue> = Vec::new();
+    for (k, v) in descriptor.custom_properties().iter() {
+        properties.push(ffi::HashMapValue {
+            key: k.clone(),
+            value: v.clone(),
+        });
+    }
+
+    // A missing comment is reported as an empty string.
+    let comment = descriptor.comment().unwrap_or("").to_string();
+
+    ffi::FfiDatabaseInfo {
+        database_name: info.database_name().to_string(),
+        comment,
+        properties,
+        created_time: info.created_time(),
+        modified_time: info.modified_time(),
+    }
+}
+
+/// Resolve types in a GenericRow using schema metadata.
+/// Narrows Int32 → Int8/Int16, parses decimal strings, etc.
+/// Used by both AppendWriter and UpsertWriter.
+///
+/// The result is an owned copy of `row` with the same number of fields.
+/// Columns are matched to the schema by position. When `schema` is `None`, or
+/// has no column at a given index, the datum is copied without conversion.
+///
+/// Returns an error when an Int32 does not fit a TinyInt/SmallInt column,
+/// when a string bound to a Decimal column cannot be parsed or converted to
+/// the column's precision/scale, or when the row contains a `Row` datum.
+pub fn resolve_row_types(
+    row: &fcore::row::GenericRow<'_>,
+    schema: Option<&fcore::metadata::Schema>,
+) -> Result<fcore::row::GenericRow<'static>> {
+    use fcore::row::Datum;
+
+    let mut out = fcore::row::GenericRow::new(row.values.len());
+
+    for (idx, datum) in row.values.iter().enumerate() {
+        let resolved = match datum {
+            Datum::Null => Datum::Null,
+            Datum::Bool(v) => Datum::Bool(*v),
+            // Int32 is narrowed when the schema column is TinyInt or SmallInt;
+            // a value outside the target range is an error, not a truncation.
+            Datum::Int32(v) => match schema
+                .and_then(|s| s.columns().get(idx))
+                .map(|c| c.data_type())
+            {
+                Some(fcore::metadata::DataType::TinyInt(_)) => Datum::Int8(
+                    i8::try_from(*v).map_err(|_| anyhow!("Column {idx}: {v} overflows TinyInt"))?,
+                ),
+                Some(fcore::metadata::DataType::SmallInt(_)) => Datum::Int16(
+                    i16::try_from(*v)
+                        .map_err(|_| anyhow!("Column {idx}: {v} overflows SmallInt"))?,
+                ),
+                _ => Datum::Int32(*v),
+            },
+            Datum::Int64(v) => Datum::Int64(*v),
+            Datum::Float32(v) => Datum::Float32(*v),
+            Datum::Float64(v) => Datum::Float64(*v),
+            Datum::Int8(v) => Datum::Int8(*v),
+            Datum::Int16(v) => Datum::Int16(*v),
+            Datum::String(cow) => {
+                // Check if the schema column is Decimal — if so, parse the string as decimal
+                match schema
+                    .and_then(|s| s.columns().get(idx))
+                    .map(|c| c.data_type())
+                {
+                    Some(fcore::metadata::DataType::Decimal(dt)) => {
+                        let (precision, scale) = (dt.precision(), dt.scale());
+                        let bd = bigdecimal::BigDecimal::from_str(cow.as_ref()).map_err(|e| {
+                            anyhow!("Column {idx}: invalid decimal string '{cow}': {e}")
+                        })?;
+                        let decimal = fcore::row::Decimal::from_big_decimal(bd, precision, scale)
+                            .map_err(|e| anyhow!("Column {idx}: {e}"))?;
+                        Datum::Decimal(decimal)
+                    }
+                    // Any other column type: copy the string into an owned value.
+                    _ => Datum::String(Cow::Owned(cow.to_string())),
+                }
+            }
+            // Copy the bytes into an owned value.
+            Datum::Blob(cow) => Datum::Blob(Cow::Owned(cow.to_vec())),
+            Datum::Decimal(d) => Datum::Decimal(d.clone()),
+            Datum::Date(d) => Datum::Date(*d),
+            Datum::Time(t) => Datum::Time(*t),
+            Datum::TimestampNtz(ts) => Datum::TimestampNtz(*ts),
+            Datum::TimestampLtz(ts) => Datum::TimestampLtz(*ts),
+            Datum::Array(a) => Datum::Array(a.clone()),
+            Datum::Map(m) => Datum::Map(m.clone()),
+            Datum::Row(_) => return Err(anyhow!("Row datum is not yet supported in C++ bindings")),
+        };
+        out.set_field(idx, resolved);
+    }
+
+    Ok(out)
+}
+
+/// Convert a CompactedRow (lookup result) to an owned GenericRow<'static>.
+/// One copy for strings/bytes (Cow::Owned), but no second copy into FfiDatum.
+///
+/// The output has one field per column of `table_info`'s schema, and each
+/// field is read with the getter matching the column's data type. Null fields
+/// become `Datum::Null`.
+///
+/// Returns an error when a getter on `row` fails or when a column has a data
+/// type that is not handled below.
+pub fn compacted_row_to_owned(
+    row: &dyn fcore::row::InternalRow,
+    table_info: &fcore::metadata::TableInfo,
+) -> Result<fcore::row::GenericRow<'static>> {
+    use fcore::row::Datum;
+
+    let schema = table_info.get_schema();
+    let columns = schema.columns();
+    let mut out = fcore::row::GenericRow::new(columns.len());
+
+    for (i, col) in columns.iter().enumerate() {
+        // Nulls are handled up front so the typed getters below are only
+        // called on non-null fields.
+        if row.is_null_at(i)? {
+            out.set_field(i, Datum::Null);
+            continue;
+        }
+
+        let datum = match col.data_type() {
+            fcore::metadata::DataType::Boolean(_) => Datum::Bool(row.get_boolean(i)?),
+            fcore::metadata::DataType::TinyInt(_) => Datum::Int8(row.get_byte(i)?),
+            fcore::metadata::DataType::SmallInt(_) => Datum::Int16(row.get_short(i)?),
+            fcore::metadata::DataType::Int(_) => Datum::Int32(row.get_int(i)?),
+            fcore::metadata::DataType::BigInt(_) => Datum::Int64(row.get_long(i)?),
+            fcore::metadata::DataType::Float(_) => Datum::Float32(row.get_float(i)?.into()),
+            fcore::metadata::DataType::Double(_) => Datum::Float64(row.get_double(i)?.into()),
+            fcore::metadata::DataType::String(_) => {
+                Datum::String(Cow::Owned(row.get_string(i)?.to_string()))
+            }
+            fcore::metadata::DataType::Bytes(_) => {
+                Datum::Blob(Cow::Owned(row.get_bytes(i)?.to_vec()))
+            }
+            fcore::metadata::DataType::Date(_) => Datum::Date(row.get_date(i)?),
+            fcore::metadata::DataType::Time(_) => Datum::Time(row.get_time(i)?),
+            // Timestamp getters take the column's precision.
+            fcore::metadata::DataType::Timestamp(dt) => {
+                Datum::TimestampNtz(row.get_timestamp_ntz(i, dt.precision())?)
+            }
+            fcore::metadata::DataType::TimestampLTz(dt) => {
+                Datum::TimestampLtz(row.get_timestamp_ltz(i, dt.precision())?)
+            }
+            fcore::metadata::DataType::Decimal(dt) => {
+                let decimal = row.get_decimal(i, dt.precision() as usize, dt.scale() as usize)?;
+                Datum::Decimal(decimal)
+            }
+            // CHAR and BINARY getters take the column's declared length.
+            fcore::metadata::DataType::Char(dt) => Datum::String(Cow::Owned(
+                row.get_char(i, dt.length() as usize)?.to_string(),
+            )),
+            fcore::metadata::DataType::Binary(dt) => {
+                Datum::Blob(Cow::Owned(row.get_binary(i, dt.length())?.to_vec()))
+            }
+            fcore::metadata::DataType::Array(_) => Datum::Array(row.get_array(i)?),
+            fcore::metadata::DataType::Map(_) => Datum::Map(row.get_map(i)?),
+            other => return Err(anyhow!("Unsupported data type for column {i}: {other:?}")),
+        };
+
+        out.set_field(i, datum);
+    }
+
+    Ok(out)
+}
+
+/// Converts a core `LakeSnapshot` into its FFI representation.
+///
+/// Each bucket/offset pair becomes one `FfiBucketOffset`; a bucket without a
+/// partition id is reported with `partition_id = -1`.
+pub fn core_lake_snapshot_to_ffi(snapshot: &fcore::metadata::LakeSnapshot) -> ffi::FfiLakeSnapshot {
+    let mut bucket_offsets: Vec<ffi::FfiBucketOffset> = Vec::new();
+    for (bucket, offset) in snapshot.table_buckets_offset.iter() {
+        let partition_id = bucket.partition_id().unwrap_or(-1);
+        bucket_offsets.push(ffi::FfiBucketOffset {
+            table_id: bucket.table_id(),
+            partition_id,
+            bucket_id: bucket.bucket_id(),
+            offset: *offset,
+        });
+    }
+
+    ffi::FfiLakeSnapshot {
+        snapshot_id: snapshot.snapshot_id,
+        bucket_offsets,
+    }
+}
+
+/// Exports scan batches as heap-allocated Arrow FFI structs for the C++ side.
+///
+/// Each batch is exported as an `FFI_ArrowArray` / `FFI_ArrowSchema` pair whose
+/// addresses are returned in `array_ptr` / `schema_ptr`; the receiver is
+/// responsible for freeing them. A bucket without a partition id is reported
+/// with `partition_id = -1`.
+///
+/// Returns the schema conversion error as a string. The export runs in two
+/// phases so that an error on a later batch cannot leak earlier batches: the
+/// structs stay in `Box`es (dropped on early return) until every batch has
+/// been converted, and only then are they turned into raw pointers.
+pub fn core_scan_batches_to_ffi(
+    batches: &[fcore::record::ScanBatch],
+) -> Result<ffi::FfiArrowRecordBatches, String> {
+    // Phase 1 (fallible): build every FFI struct while it is still owned.
+    let mut exported = Vec::with_capacity(batches.len());
+    for batch in batches {
+        let record_batch = batch.batch();
+        // Convert the schema first: it is the only step that can fail.
+        let ffi_schema = Box::new(
+            FFI_ArrowSchema::try_from(record_batch.schema().as_ref()).map_err(|e| e.to_string())?,
+        );
+        // Convert RecordBatch to StructArray first, then get the data
+        let struct_array = arrow::array::StructArray::from(record_batch.clone());
+        let ffi_array = Box::new(FFI_ArrowArray::new(&struct_array.into_data()));
+        exported.push((ffi_array, ffi_schema));
+    }
+
+    // Phase 2 (infallible): hand ownership over as raw pointers.
+    let ffi_batches = batches
+        .iter()
+        .zip(exported)
+        .map(|(batch, (ffi_array, ffi_schema))| ffi::FfiArrowRecordBatch {
+            array_ptr: Box::into_raw(ffi_array) as usize,
+            schema_ptr: Box::into_raw(ffi_schema) as usize,
+            table_id: batch.bucket().table_id(),
+            partition_id: batch.bucket().partition_id().unwrap_or(-1),
+            bucket_id: batch.bucket().bucket_id(),
+            base_offset: batch.base_offset(),
+        })
+        .collect();
+
+    Ok(ffi::FfiArrowRecordBatches {
+        batches: ffi_batches,
+    })
+}
diff --git a/fluss-rust/bindings/cpp/test/test_admin.cpp b/fluss-rust/bindings/cpp/test/test_admin.cpp
new file mode 100644
index 0000000000..99f93fcf1e
--- /dev/null
+++ b/fluss-rust/bindings/cpp/test/test_admin.cpp
@@ -0,0 +1,356 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <gtest/gtest.h>
+
+#include "test_utils.h"
+
+class AdminTest : public ::testing::Test {  // Fixture: tests reach Admin via FlussTestEnvironment.
+   protected:
+    fluss::Admin& admin() { return fluss_test::FlussTestEnvironment::Instance()->GetAdmin(); }
+};
+
+TEST_F(AdminTest, CreateDatabase) {
+    auto& adm = admin();
+
+    std::string db_name = "test_create_database_cpp";
+
+    // Seeded with true so the assertion below only passes if DatabaseExists writes the out-param.
+    bool exists = true;
+    ASSERT_OK(adm.DatabaseExists(db_name, exists));
+    ASSERT_FALSE(exists);
+
+    // Create the database with a comment and two custom properties.
+    fluss::DatabaseDescriptor descriptor;
+    descriptor.comment = "test_db";
+    descriptor.properties = {{"k1", "v1"}, {"k2", "v2"}};
+    ASSERT_OK(adm.CreateDatabase(db_name, descriptor, false));
+
+    // Database should exist now
+    ASSERT_OK(adm.DatabaseExists(db_name, exists));
+    ASSERT_TRUE(exists);
+
+    // The name, comment and both properties must be readable back through GetDatabaseInfo.
+    fluss::DatabaseInfo db_info;
+    ASSERT_OK(adm.GetDatabaseInfo(db_name, db_info));
+    EXPECT_EQ(db_info.database_name, db_name);
+    EXPECT_EQ(db_info.comment, "test_db");
+    EXPECT_EQ(db_info.properties.at("k1"), "v1");
+    EXPECT_EQ(db_info.properties.at("k2"), "v2");
+
+    // Drop database. NOTE(review): the flags are presumably (ignore_if_not_exists, cascade).
+    ASSERT_OK(adm.DropDatabase(db_name, false, true));
+
+    // Database should not exist now
+    ASSERT_OK(adm.DatabaseExists(db_name, exists));
+    ASSERT_FALSE(exists);
+}
+
+TEST_F(AdminTest, CreateTable) {
+    auto& adm = admin();
+
+    std::string db_name = "test_create_table_cpp_db";
+    fluss::DatabaseDescriptor db_desc;
+    db_desc.comment = "Database for test_create_table";
+
+    bool exists = true;  // seeded true: the check below fails unless DatabaseExists writes it
+    ASSERT_OK(adm.DatabaseExists(db_name, exists));
+    ASSERT_FALSE(exists);
+
+    ASSERT_OK(adm.CreateDatabase(db_name, db_desc, false));
+
+    std::string table_name = "test_user_table";
+    fluss::TablePath table_path(db_name, table_name);
+
+    // Build schema: four columns with "id" as the primary key
+    auto schema = fluss::Schema::NewBuilder()
+                      .AddColumn("id", fluss::DataType::Int())
+                      .AddColumn("name", fluss::DataType::String())
+                      .AddColumn("age", fluss::DataType::Int(), "User's age (optional)")
+                      .AddColumn("email", fluss::DataType::String())
+                      .SetPrimaryKeys({"id"})
+                      .Build();
+
+    // Build table descriptor: 3 buckets keyed on "id"; these values are read back below
+    auto table_descriptor = fluss::TableDescriptor::NewBuilder()
+                                .SetSchema(schema)
+                                .SetComment("Test table for user data (id, name, age, email)")
+                                .SetBucketCount(3)
+                                .SetBucketKeys({"id"})
+                                .SetProperty("table.replication.factor", "1")
+                                .SetLogFormat("arrow")
+                                .SetKvFormat("indexed")
+                                .Build();
+
+    // Create table
+    ASSERT_OK(adm.CreateTable(table_path, table_descriptor, false));
+
+    // Table should exist
+    ASSERT_OK(adm.TableExists(table_path, exists));
+    ASSERT_TRUE(exists);
+
+    // List tables: the fresh database holds exactly the one table created above
+    std::vector<std::string> tables;
+    ASSERT_OK(adm.ListTables(db_name, tables));
+    ASSERT_EQ(tables.size(), 1u);
+    EXPECT_EQ(tables.front(), table_name);
+
+    // Get table info
+    fluss::TableInfo table_info;
+    ASSERT_OK(adm.GetTableInfo(table_path, table_info));
+
+    EXPECT_EQ(table_info.comment, "Test table for user data (id, name, age, email)");
+    EXPECT_EQ(table_info.primary_keys, std::vector<std::string>{"id"});
+    EXPECT_EQ(table_info.num_buckets, 3);
+    EXPECT_EQ(table_info.bucket_keys, std::vector<std::string>{"id"});
+
+    // Drop table (`exists` is still true here, so the false check is not vacuous)
+    ASSERT_OK(adm.DropTable(table_path, false));
+    ASSERT_OK(adm.TableExists(table_path, exists));
+    ASSERT_FALSE(exists);
+
+    // Drop database
+    ASSERT_OK(adm.DropDatabase(db_name, false, true));
+    ASSERT_OK(adm.DatabaseExists(db_name, exists));
+    ASSERT_FALSE(exists);
+}
+
+TEST_F(AdminTest, PartitionApis) {
+    auto& adm = admin();
+
+    std::string db_name = "test_partition_apis_cpp_db";
+    fluss::DatabaseDescriptor db_desc;
+    db_desc.comment = "Database for test_partition_apis";
+    ASSERT_OK(adm.CreateDatabase(db_name, db_desc, true));
+
+    fluss::TablePath table_path(db_name, "partitioned_table");
+
+    auto schema = fluss::Schema::NewBuilder()
+                      .AddColumn("id", fluss::DataType::Int())
+                      .AddColumn("name", fluss::DataType::String())
+                      .AddColumn("dt", fluss::DataType::String())
+                      .AddColumn("region", fluss::DataType::String())
+                      .SetPrimaryKeys({"id", "dt", "region"})  // includes both partition keys
+                      .Build();
+
+    auto table_descriptor = fluss::TableDescriptor::NewBuilder()
+                                .SetSchema(schema)
+                                .SetBucketCount(3)
+                                .SetBucketKeys({"id"})
+                                .SetPartitionKeys({"dt", "region"})
+                                .SetProperty("table.replication.factor", "1")
+                                .SetLogFormat("arrow")
+                                .SetKvFormat("compacted")
+                                .Build();
+
+    ASSERT_OK(adm.CreateTable(table_path, table_descriptor, true));
+
+    // No partitions initially
+    std::vector<fluss::PartitionInfo> partitions;
+    ASSERT_OK(adm.ListPartitionInfos(table_path, partitions));
+    ASSERT_TRUE(partitions.empty());
+
+    // Create a partition by giving a value for every partition key
+    std::unordered_map<std::string, std::string> partition_spec = {
+        {"dt", "2024-01-15"}, {"region", "EMEA"}};
+    ASSERT_OK(adm.CreatePartition(table_path, partition_spec, false));
+
+    // One partition now; its name is the spec values joined by '$' in key order (dt, region)
+    ASSERT_OK(adm.ListPartitionInfos(table_path, partitions));
+    ASSERT_EQ(partitions.size(), 1u);
+    EXPECT_EQ(partitions[0].partition_name, "2024-01-15$EMEA");
+
+    // List with partial spec filter (only "dt" given) - should find the partition
+    std::unordered_map<std::string, std::string> partial_spec = {{"dt", "2024-01-15"}};
+    std::vector<fluss::PartitionInfo> partitions_with_spec;
+    ASSERT_OK(adm.ListPartitionInfos(table_path, partial_spec, partitions_with_spec));
+    ASSERT_EQ(partitions_with_spec.size(), 1u);
+    EXPECT_EQ(partitions_with_spec[0].partition_name, "2024-01-15$EMEA");
+
+    // List with non-matching spec - the call still succeeds but finds no partitions
+    std::unordered_map<std::string, std::string> non_matching_spec = {{"dt", "2024-01-16"}};
+    std::vector<fluss::PartitionInfo> empty_partitions;
+    ASSERT_OK(adm.ListPartitionInfos(table_path, non_matching_spec, empty_partitions));
+    ASSERT_TRUE(empty_partitions.empty());
+
+    // Drop partition using the same full spec it was created with
+    ASSERT_OK(adm.DropPartition(table_path, partition_spec, false));
+
+    ASSERT_OK(adm.ListPartitionInfos(table_path, partitions));
+    ASSERT_TRUE(partitions.empty());  // vector held one entry before: the call replaced it
+
+    // Cleanup
+    ASSERT_OK(adm.DropTable(table_path, true));
+    ASSERT_OK(adm.DropDatabase(db_name, true, true));
+}
+
+TEST_F(AdminTest, FlussErrorResponse) {
+    // GetTableInfo on a table that was never created must fail with TABLE_NOT_EXIST.
+    fluss::TablePath missing_table("fluss", "not_exist_cpp");
+    fluss::TableInfo table_info;
+
+    auto& adm = admin();
+    auto status = adm.GetTableInfo(missing_table, table_info);
+    ASSERT_FALSE(status.Ok());
+    EXPECT_EQ(status.error_code, fluss::ErrorCode::TABLE_NOT_EXIST);
+}
+
+TEST_F(AdminTest, ErrorDatabaseNotExist) {
+    auto& adm = admin();
+
+    // Reading the info of a database that does not exist is rejected.
+    fluss::DatabaseInfo db_info;
+    auto info_status = adm.GetDatabaseInfo("no_such_db_cpp", db_info);
+    ASSERT_FALSE(info_status.Ok());
+    EXPECT_EQ(info_status.error_code, fluss::ErrorCode::DATABASE_NOT_EXIST);
+
+    // Dropping it with both flags false is rejected with the same code.
+    auto drop_status = adm.DropDatabase("no_such_db_cpp", false, false);
+    ASSERT_FALSE(drop_status.Ok());
+    EXPECT_EQ(drop_status.error_code, fluss::ErrorCode::DATABASE_NOT_EXIST);
+
+    // Listing its tables is rejected with the same code.
+    std::vector<std::string> table_names;
+    auto list_status = adm.ListTables("no_such_db_cpp", table_names);
+    ASSERT_FALSE(list_status.Ok());
+    EXPECT_EQ(list_status.error_code, fluss::ErrorCode::DATABASE_NOT_EXIST);
+}
+
+TEST_F(AdminTest, ErrorDatabaseAlreadyExist) {
+    auto& adm = admin();
+
+    std::string database = "test_error_db_already_exist_cpp";
+    fluss::DatabaseDescriptor empty_descriptor;
+
+    ASSERT_OK(adm.CreateDatabase(database, empty_descriptor, false));
+
+    // A second create with the flag still false must report DATABASE_ALREADY_EXIST.
+    auto duplicate_status = adm.CreateDatabase(database, empty_descriptor, false);
+    ASSERT_FALSE(duplicate_status.Ok());
+    EXPECT_EQ(duplicate_status.error_code, fluss::ErrorCode::DATABASE_ALREADY_EXIST);
+
+    // The same create with the flag set to true is accepted.
+    ASSERT_OK(adm.CreateDatabase(database, empty_descriptor, true));
+
+    // Remove the database so it does not outlive the test.
+    ASSERT_OK(adm.DropDatabase(database, true, true));
+}
+
+TEST_F(AdminTest, ErrorTableAlreadyExist) {
+    auto& adm = admin();
+
+    std::string database = "test_error_tbl_already_exist_cpp_db";
+    fluss::DatabaseDescriptor database_descriptor;
+    ASSERT_OK(adm.CreateDatabase(database, database_descriptor, true));
+
+    fluss::TablePath table(database, "my_table");
+    auto schema = fluss::Schema::NewBuilder()
+                      .AddColumn("id", fluss::DataType::Int())
+                      .AddColumn("name", fluss::DataType::String())
+                      .Build();
+    auto descriptor = fluss::TableDescriptor::NewBuilder()
+                          .SetSchema(schema)
+                          .SetBucketCount(1)
+                          .SetProperty("table.replication.factor", "1")
+                          .Build();
+
+    ASSERT_OK(adm.CreateTable(table, descriptor, false));
+
+    // A second create with the flag still false must report TABLE_ALREADY_EXIST.
+    auto duplicate_status = adm.CreateTable(table, descriptor, false);
+    ASSERT_FALSE(duplicate_status.Ok());
+    EXPECT_EQ(duplicate_status.error_code, fluss::ErrorCode::TABLE_ALREADY_EXIST);
+
+    // The same create with the flag set to true is accepted.
+    ASSERT_OK(adm.CreateTable(table, descriptor, true));
+
+    // Remove the table and its database so they do not outlive the test.
+    ASSERT_OK(adm.DropTable(table, true));
+    ASSERT_OK(adm.DropDatabase(database, true, true));
+}
+
+TEST_F(AdminTest, GetServerNodes) {
+    auto& adm = admin();
+
+    // Every listed node needs host, port and uid; both server roles must be present.
+    std::vector<fluss::ServerNode> server_nodes;
+    ASSERT_OK(adm.GetServerNodes(server_nodes));
+
+    ASSERT_GT(server_nodes.size(), 0u) << "Expected at least one server node";
+
+    bool saw_coordinator = false;
+    bool saw_tablet = false;
+    for (const auto& node : server_nodes) {
+        EXPECT_FALSE(node.host.empty()) << "Server node host should not be empty";
+        EXPECT_GT(node.port, 0u) << "Server node port should be > 0";
+        EXPECT_FALSE(node.uid.empty()) << "Server node uid should not be empty";
+
+        // The two role strings differ, so a node can set at most one of the flags.
+        saw_coordinator = saw_coordinator || node.server_type == "CoordinatorServer";
+        saw_tablet = saw_tablet || node.server_type == "TabletServer";
+    }
+
+    EXPECT_TRUE(saw_coordinator) << "Expected a coordinator server node";
+    EXPECT_TRUE(saw_tablet) << "Expected at least one tablet server node";
+}
+
+TEST_F(AdminTest, ErrorTableNotExist) {
+    auto& adm = admin();
+
+    fluss::TablePath missing_table("fluss", "no_such_table_cpp");
+
+    // With the flag false, dropping a table that does not exist is an error.
+    auto drop_status = adm.DropTable(missing_table, false);
+    ASSERT_FALSE(drop_status.Ok());
+    EXPECT_EQ(drop_status.error_code, fluss::ErrorCode::TABLE_NOT_EXIST);
+
+    // With the flag true, the same drop is accepted.
+    ASSERT_OK(adm.DropTable(missing_table, true));
+}
+
+TEST_F(AdminTest, ErrorTableNotPartitioned) {
+    auto& adm = admin();
+
+    std::string database = "test_error_not_partitioned_cpp_db";
+    fluss::DatabaseDescriptor database_descriptor;
+    ASSERT_OK(adm.CreateDatabase(database, database_descriptor, true));
+
+    fluss::TablePath table(database, "non_partitioned_table");
+    auto schema = fluss::Schema::NewBuilder()
+                      .AddColumn("id", fluss::DataType::Int())
+                      .AddColumn("name", fluss::DataType::String())
+                      .Build();
+    auto descriptor = fluss::TableDescriptor::NewBuilder()
+                          .SetSchema(schema)
+                          .SetBucketCount(1)
+                          .SetProperty("table.replication.factor", "1")
+                          .Build();
+
+    ASSERT_OK(adm.CreateTable(table, descriptor, false));
+
+    // The descriptor sets no partition keys, so listing partitions must be rejected.
+    std::vector<fluss::PartitionInfo> partition_infos;
+    auto list_status = adm.ListPartitionInfos(table, partition_infos);
+    ASSERT_FALSE(list_status.Ok());
+    EXPECT_EQ(list_status.error_code, fluss::ErrorCode::TABLE_NOT_PARTITIONED_EXCEPTION);
+
+    // Remove the table and its database so they do not outlive the test.
+    ASSERT_OK(adm.DropTable(table, true));
+    ASSERT_OK(adm.DropDatabase(database, true, true));
+}
diff --git a/fluss-rust/bindings/cpp/test/test_ffi_converter.cpp b/fluss-rust/bindings/cpp/test/test_ffi_converter.cpp
new file mode 100644
index 0000000000..2078bdabb4
--- /dev/null
+++ b/fluss-rust/bindings/cpp/test/test_ffi_converter.cpp
@@ -0,0 +1,240 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <gtest/gtest.h>
+#include <stdexcept>
+
+#include "ffi_converter.hpp"
+
+namespace {
+
+// Builds an Array-typed FfiColumn. `array_nullability` is `per_level_nullability` when given;
+// otherwise one flag per nesting level (level 0 = `nullable`, deeper = 1) plus the leaf flag.
+fluss::ffi::FfiColumn MakeArrayColumn(int32_t nesting, int32_t element_type,
+                                      bool nullable = true, bool leaf_nullable = true,
+                                      std::vector<uint8_t> per_level_nullability = {}) {
+    if (per_level_nullability.empty()) {
+        for (int32_t level = 0; level < nesting; ++level) {
+            const bool level_nullable = (level != 0) || nullable;
+            per_level_nullability.push_back(level_nullable ? 1 : 0);
+        }
+        per_level_nullability.push_back(leaf_nullable ? 1 : 0);
+    }
+    fluss::ffi::FfiColumn column;
+    column.name = rust::String("bad_array");
+    column.data_type = static_cast<int32_t>(fluss::TypeId::Array);
+    column.nullable = nullable;
+    column.comment = rust::String("");
+    column.precision = column.scale = 0;
+    column.array_nesting = nesting;
+    for (uint8_t flag : per_level_nullability) {
+        column.array_nullability.push_back(flag);
+    }
+    column.element_data_type = element_type;
+    column.element_precision = column.element_scale = 0;
+    return column;
+}
+
+// Builds a non-array FfiColumn; array_nesting and the element_* fields are set to 0.
+fluss::ffi::FfiColumn MakeScalarColumn(const char* name, fluss::TypeId type_id,
+                                       bool nullable = true, int32_t precision = 0,
+                                       int32_t scale = 0) {
+    fluss::ffi::FfiColumn column;
+    column.name = rust::String(name);
+    column.data_type = static_cast<int32_t>(type_id);
+    column.nullable = nullable;
+    column.comment = rust::String("");
+    column.precision = precision;
+    column.scale = scale;
+    column.array_nesting = 0;
+    column.element_data_type = 0;
+    column.element_precision = column.element_scale = 0;
+    return column;
+}
+
+}  // namespace
+
+TEST(FfiConverterTest, RejectsArrayWithoutElementType) {
+    auto column = MakeArrayColumn(/*nesting=*/1, /*element_type=*/0);
+    EXPECT_THROW((void)fluss::utils::from_ffi_column(column), std::runtime_error);
+}
+
+TEST(FfiConverterTest, RejectsArrayWithArrayLeafType) {
+    auto column = MakeArrayColumn(/*nesting=*/2, static_cast<int32_t>(fluss::TypeId::Array));
+    EXPECT_THROW((void)fluss::utils::from_ffi_column(column), std::runtime_error);
+}
+
+TEST(FfiConverterTest, RejectsArrayWithUnknownLeafType) {
+    auto column = MakeArrayColumn(/*nesting=*/1, /*element_type=*/999);
+    EXPECT_THROW((void)fluss::utils::from_ffi_column(column), std::runtime_error);
+}
+
+TEST(FfiConverterTest, SupportsLegacyOneLevelArrayMetadata) {
+    auto column = MakeArrayColumn(/*nesting=*/0, static_cast<int32_t>(fluss::TypeId::Int));
+    auto result = fluss::utils::from_ffi_column(column);
+    EXPECT_EQ(result.data_type.id(), fluss::TypeId::Array);
+    ASSERT_NE(result.data_type.element_type(), nullptr);
+    EXPECT_EQ(result.data_type.element_type()->id(), fluss::TypeId::Int);
+}
+
+// --- Nullability tests ---
+
+TEST(DataTypeTest, DefaultNullable) {
+    auto int_type = fluss::DataType::Int();
+    EXPECT_TRUE(int_type.nullable());
+}
+
+TEST(DataTypeTest, NotNullMethod) {
+    auto int_type = fluss::DataType::Int().NotNull();
+    EXPECT_FALSE(int_type.nullable());
+    EXPECT_EQ(int_type.id(), fluss::TypeId::Int);
+}
+
+TEST(DataTypeTest, NotNullPreservesPrecisionScale) {
+    auto decimal_type = fluss::DataType::Decimal(10, 2).NotNull();
+    EXPECT_FALSE(decimal_type.nullable());
+    EXPECT_EQ(decimal_type.precision(), 10);
+    EXPECT_EQ(decimal_type.scale(), 2);
+}
+
+TEST(DataTypeTest, ArrayElementNullability) {
+    auto array_type = fluss::DataType::Array(fluss::DataType::Int().NotNull());
+    EXPECT_TRUE(array_type.nullable());  // NotNull() was applied to the element only
+    ASSERT_NE(array_type.element_type(), nullptr);
+    EXPECT_FALSE(array_type.element_type()->nullable());
+}
+
+TEST(DataTypeTest, NotNullArrayNullableElement) {
+    auto array_type = fluss::DataType::Array(fluss::DataType::Int()).NotNull();
+    EXPECT_FALSE(array_type.nullable());  // NotNull() was applied to the array only
+    ASSERT_NE(array_type.element_type(), nullptr);
+    EXPECT_TRUE(array_type.element_type()->nullable());
+}
+
+TEST(DataTypeTest, NotNullArrayNotNullElement) {
+    auto array_type = fluss::DataType::Array(fluss::DataType::Int().NotNull()).NotNull();
+    EXPECT_FALSE(array_type.nullable());
+    ASSERT_NE(array_type.element_type(), nullptr);
+    EXPECT_FALSE(array_type.element_type()->nullable());
+}
+
+TEST(FfiConverterTest, ScalarNullableRoundTrip) {
+    fluss::Column column{"id", fluss::DataType::Int(), ""};
+    auto exported = fluss::utils::to_ffi_column(column);
+    EXPECT_TRUE(exported.nullable);
+    auto restored = fluss::utils::from_ffi_column(exported);
+    EXPECT_TRUE(restored.data_type.nullable());
+}
+
+TEST(FfiConverterTest, ScalarNotNullRoundTrip) {
+    fluss::Column column{"id", fluss::DataType::Int().NotNull(), ""};
+    auto exported = fluss::utils::to_ffi_column(column);
+    EXPECT_FALSE(exported.nullable);
+    auto restored = fluss::utils::from_ffi_column(exported);
+    EXPECT_FALSE(restored.data_type.nullable());
+}
+
+TEST(FfiConverterTest, ArrayNotNullElementRoundTrip) {
+    fluss::Column column{"tags", fluss::DataType::Array(fluss::DataType::String().NotNull()), ""};
+    auto exported = fluss::utils::to_ffi_column(column);
+    EXPECT_TRUE(exported.nullable);
+    ASSERT_EQ(exported.array_nullability.size(), 2u);
+    EXPECT_EQ(exported.array_nullability[1], 0);  // index 1: the NOT NULL element
+    auto restored = fluss::utils::from_ffi_column(exported);
+    EXPECT_TRUE(restored.data_type.nullable());
+    ASSERT_NE(restored.data_type.element_type(), nullptr);
+    EXPECT_FALSE(restored.data_type.element_type()->nullable());
+}
+
+TEST(FfiConverterTest, NotNullArrayNullableElementRoundTrip) {
+    fluss::Column column{"ids", fluss::DataType::Array(fluss::DataType::Int()).NotNull(), ""};
+    auto exported = fluss::utils::to_ffi_column(column);
+    EXPECT_FALSE(exported.nullable);
+    ASSERT_EQ(exported.array_nullability.size(), 2u);
+    EXPECT_EQ(exported.array_nullability[1], 1);  // index 1: the nullable element
+    auto restored = fluss::utils::from_ffi_column(exported);
+    EXPECT_FALSE(restored.data_type.nullable());
+    ASSERT_NE(restored.data_type.element_type(), nullptr);
+    EXPECT_TRUE(restored.data_type.element_type()->nullable());
+}
+
+TEST(FfiConverterTest, NotNullArrayNotNullElementRoundTrip) {
+    fluss::Column column{
+        "strict_ids",
+        fluss::DataType::Array(fluss::DataType::Int().NotNull()).NotNull(),
+        "",
+    };
+    auto exported = fluss::utils::to_ffi_column(column);
+    EXPECT_FALSE(exported.nullable);
+    ASSERT_EQ(exported.array_nullability.size(), 2u);
+    EXPECT_EQ(exported.array_nullability[1], 0);  // index 1: the NOT NULL element
+    auto restored = fluss::utils::from_ffi_column(exported);
+    EXPECT_FALSE(restored.data_type.nullable());
+    ASSERT_NE(restored.data_type.element_type(), nullptr);
+    EXPECT_FALSE(restored.data_type.element_type()->nullable());
+}
+
+TEST(FfiConverterTest, NestedArrayIntermediateNullabilityRoundTrip) {
+    fluss::Column column{
+        "nested",
+        fluss::DataType::Array(fluss::DataType::Array(fluss::DataType::Int()).NotNull()),
+        "",
+    };
+    auto exported = fluss::utils::to_ffi_column(column);
+    auto restored = fluss::utils::from_ffi_column(exported);
+
+    EXPECT_TRUE(restored.data_type.nullable());  // outer array: nullable
+    ASSERT_NE(restored.data_type.element_type(), nullptr);
+    EXPECT_FALSE(restored.data_type.element_type()->nullable());  // inner array: NOT NULL
+    ASSERT_NE(restored.data_type.element_type()->element_type(), nullptr);
+    EXPECT_TRUE(restored.data_type.element_type()->element_type()->nullable());  // leaf
+}
+
+TEST(FfiConverterTest, NestedArrayAllLevelsNullabilityRoundTrip) {
+    fluss::Column column{
+        "strict_nested",
+        fluss::DataType::Array(
+            fluss::DataType::Array(fluss::DataType::Int().NotNull()).NotNull())
+            .NotNull(),
+        "",
+    };
+    auto exported = fluss::utils::to_ffi_column(column);
+    auto restored = fluss::utils::from_ffi_column(exported);
+
+    EXPECT_FALSE(restored.data_type.nullable());  // outer array
+    ASSERT_NE(restored.data_type.element_type(), nullptr);
+    EXPECT_FALSE(restored.data_type.element_type()->nullable());  // inner array
+    ASSERT_NE(restored.data_type.element_type()->element_type(), nullptr);
+    EXPECT_FALSE(restored.data_type.element_type()->element_type()->nullable());  // leaf
+}
+
+TEST(FfiConverterTest, FfiColumnNonNullableScalarReconstructed) {
+    auto column = MakeScalarColumn("id", fluss::TypeId::Int, /*nullable=*/false);
+    auto result = fluss::utils::from_ffi_column(column);
+    EXPECT_FALSE(result.data_type.nullable());
+    EXPECT_EQ(result.data_type.id(), fluss::TypeId::Int);
+}
+
+TEST(FfiConverterTest, FfiColumnNonNullableArrayReconstructed) {
+    auto column = MakeArrayColumn(1, static_cast<int32_t>(fluss::TypeId::String), false, false);
+    auto result = fluss::utils::from_ffi_column(column);
+    EXPECT_FALSE(result.data_type.nullable());  // array level was built with nullable = false
+    ASSERT_NE(result.data_type.element_type(), nullptr);
+    EXPECT_FALSE(result.data_type.element_type()->nullable());  // leaf_nullable = false
+}
diff --git a/fluss-rust/bindings/cpp/test/test_kv_table.cpp b/fluss-rust/bindings/cpp/test/test_kv_table.cpp
new file mode 100644
index 0000000000..5cc8f79d23
--- /dev/null
+++ b/fluss-rust/bindings/cpp/test/test_kv_table.cpp
@@ -0,0 +1,892 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <gtest/gtest.h>
+
+#include "test_utils.h"
+
+class KvTableTest : public ::testing::Test {  // Fixture: Admin/Connection of the test environment.
+   protected:
+    fluss::Admin& admin() { return fluss_test::FlussTestEnvironment::Instance()->GetAdmin(); }
+
+    fluss::Connection& connection() {
+        return fluss_test::FlussTestEnvironment::Instance()->GetConnection();
+    }
+};
+
+TEST_F(KvTableTest, UpsertDeleteAndLookup) {
+    auto& adm = admin();
+    auto& conn = connection();
+
+    fluss::TablePath table_path("fluss", "test_upsert_and_lookup_cpp");
+
+    auto schema = fluss::Schema::NewBuilder()
+                      .AddColumn("id", fluss::DataType::Int())
+                      .AddColumn("name", fluss::DataType::String())
+                      .AddColumn("age", fluss::DataType::BigInt())
+                      .SetPrimaryKeys({"id"})
+                      .Build();
+
+    auto table_descriptor = fluss::TableDescriptor::NewBuilder()
+                                .SetSchema(schema)
+                                .SetProperty("table.replication.factor", "1")
+                                .Build();
+
+    fluss_test::CreateTable(adm, table_path, table_descriptor);
+
+    fluss::Table table;
+    ASSERT_OK(conn.GetTable(table_path, table));
+
+    // Create the upsert writer; it is reused for the upserts, the update and the delete below
+    auto table_upsert = table.NewUpsert();
+    fluss::UpsertWriter upsert_writer;
+    ASSERT_OK(table_upsert.CreateWriter(upsert_writer));
+
+    // Upsert 3 rows without waiting on individual writes, then Flush() once before reading
+    struct TestData {
+        int32_t id;
+        std::string name;
+        int64_t age;
+    };
+    std::vector<TestData> test_data = {{1, "Verso", 32}, {2, "Noco", 25}, {3, "Esquie", 35}};
+
+    for (const auto& d : test_data) {
+        fluss::GenericRow row(3);
+        row.SetInt32(0, d.id);
+        row.SetString(1, d.name);
+        row.SetInt64(2, d.age);
+        ASSERT_OK(upsert_writer.Upsert(row));
+    }
+    ASSERT_OK(upsert_writer.Flush());
+
+    // Create the lookuper used for every primary-key lookup in the rest of the test
+    fluss::Lookuper lookuper;
+    ASSERT_OK(table.NewLookup().CreateLookuper(lookuper));
+
+    // Verify lookup results; each key row has 3 fields but only column 0 (id) is set
+    for (const auto& d : test_data) {
+        fluss::GenericRow key(3);
+        key.SetInt32(0, d.id);
+
+        fluss::LookupResult result;
+        ASSERT_OK(lookuper.Lookup(key, result));
+        ASSERT_TRUE(result.Found()) << "Row with id=" << d.id << " should exist";
+
+        EXPECT_EQ(result.GetInt32(0), d.id) << "id mismatch";
+        EXPECT_EQ(result.GetString(1), d.name) << "name mismatch";
+        EXPECT_EQ(result.GetInt64(2), d.age) << "age mismatch";
+    }
+
+    // Update record with id=1 (age 32 -> 33) and Wait() on its WriteResult before reading
+    {
+        fluss::GenericRow updated_row(3);
+        updated_row.SetInt32(0, 1);
+        updated_row.SetString(1, "Verso");
+        updated_row.SetInt64(2, 33);
+        fluss::WriteResult wr;
+        ASSERT_OK(upsert_writer.Upsert(updated_row, wr));
+        ASSERT_OK(wr.Wait());
+    }
+
+    // Verify the update
+    {
+        fluss::GenericRow key(3);
+        key.SetInt32(0, 1);
+        fluss::LookupResult result;
+        ASSERT_OK(lookuper.Lookup(key, result));
+        ASSERT_TRUE(result.Found());
+        EXPECT_EQ(result.GetInt64(2), 33) << "Age should be updated";
+        EXPECT_EQ(result.GetString(1), "Verso") << "Name should remain unchanged";
+    }
+
+    // Delete record with id=1 (only the key column is set) and Wait() on its WriteResult
+    {
+        fluss::GenericRow delete_row(3);
+        delete_row.SetInt32(0, 1);
+        fluss::WriteResult wr;
+        ASSERT_OK(upsert_writer.Delete(delete_row, wr));
+        ASSERT_OK(wr.Wait());
+    }
+
+    // Verify deletion: the lookup call itself succeeds but reports not found
+    {
+        fluss::GenericRow key(3);
+        key.SetInt32(0, 1);
+        fluss::LookupResult result;
+        ASSERT_OK(lookuper.Lookup(key, result));
+        ASSERT_FALSE(result.Found()) << "Record 1 should not exist after delete";
+    }
+
+    // Verify other records still exist
+    for (int id : {2, 3}) {
+        fluss::GenericRow key(3);
+        key.SetInt32(0, id);
+        fluss::LookupResult result;
+        ASSERT_OK(lookuper.Lookup(key, result));
+        ASSERT_TRUE(result.Found()) << "Record " << id
+                                    << " should still exist after deleting record 1";
+    }
+
+    // Lookup non-existent key (id 999 was never written)
+    {
+        fluss::GenericRow key(3);
+        key.SetInt32(0, 999);
+        fluss::LookupResult result;
+        ASSERT_OK(lookuper.Lookup(key, result));
+        ASSERT_FALSE(result.Found()) << "Non-existent key should return not found";
+    }
+
+    ASSERT_OK(adm.DropTable(table_path, false));
+}
+
+TEST_F(KvTableTest, LookupWithNestedArrayArrayView) {
+    auto& adm = admin();
+    auto& conn = connection();
+
+    fluss::TablePath table_path("fluss", "test_lookup_nested_array_cpp");
+
+    auto schema = fluss::Schema::NewBuilder()
+                      .AddColumn("id", fluss::DataType::Int())
+                      .AddColumn("matrix",
+                                 fluss::DataType::Array(fluss::DataType::Array(fluss::DataType::Int())))
+                      .SetPrimaryKeys({"id"})
+                      .Build();
+
+    auto table_descriptor = fluss::TableDescriptor::NewBuilder()
+                                .SetSchema(schema)
+                                .SetProperty("table.replication.factor", "1")
+                                .Build();
+
+    fluss_test::CreateTable(adm, table_path, table_descriptor);
+
+    fluss::Table table;
+    ASSERT_OK(conn.GetTable(table_path, table));
+
+    auto upsert = table.NewUpsert();
+    fluss::UpsertWriter writer;
+    ASSERT_OK(upsert.CreateWriter(writer));
+
+    {
+        auto row = table.NewRow();  // written row: id = 1, matrix = {{11, 12}, {21, 22}}
+        row.Set("id", 1);
+
+        fluss::ArrayWriter inner1(2, fluss::DataType::Int());  // first inner array: {11, 12}
+        inner1.SetInt32(0, 11);
+        inner1.SetInt32(1, 12);
+
+        fluss::ArrayWriter inner2(2, fluss::DataType::Int());  // second inner array: {21, 22}
+        inner2.SetInt32(0, 21);
+        inner2.SetInt32(1, 22);
+
+        fluss::ArrayWriter outer(2, fluss::DataType::Array(fluss::DataType::Int()));
+        outer.SetArray(0, std::move(inner1));  // inner writers are moved into the outer array
+        outer.SetArray(1, std::move(inner2));
+        row.Set("matrix", std::move(outer));
+
+        ASSERT_OK(writer.Upsert(row));
+        ASSERT_OK(writer.Flush());
+    }
+
+    fluss::Lookuper lookuper;
+    ASSERT_OK(table.NewLookup().CreateLookuper(lookuper));
+
+    auto key = table.NewRow();  // key row: only the primary-key column is set
+    key.Set("id", 1);
+
+    fluss::LookupResult result;
+    ASSERT_OK(lookuper.Lookup(key, result));
+    ASSERT_TRUE(result.Found());
+    EXPECT_EQ(result.GetArraySize("matrix"), 2u);  // column-level accessors by name
+    EXPECT_EQ(result.GetArrayElementType("matrix"), fluss::TypeId::Array);
+
+    auto outer = result.GetArrayView("matrix");  // the view must agree with those accessors
+    ASSERT_EQ(outer.Size(), 2u);
+    EXPECT_EQ(outer.ElementType(), fluss::TypeId::Array);
+
+    auto first = outer.GetArray(0);  // expected to read back as {11, 12}
+    ASSERT_EQ(first.Size(), 2u);
+    EXPECT_EQ(first.ElementType(), fluss::TypeId::Int);
+    EXPECT_EQ(first.GetInt32(0), 11);
+    EXPECT_EQ(first.GetInt32(1), 12);
+
+    auto second = outer.GetArray(1);  // expected to read back as {21, 22}
+    ASSERT_EQ(second.Size(), 2u);
+    EXPECT_EQ(second.ElementType(), fluss::TypeId::Int);
+    EXPECT_EQ(second.GetInt32(0), 21);
+    EXPECT_EQ(second.GetInt32(1), 22);
+
+    ASSERT_OK(adm.DropTable(table_path, false));
+}
+
+TEST_F(KvTableTest, LookupArrayValidationErrors) {
+    auto& cluster_admin = admin();
+    auto& cluster_conn = connection();
+    // Typed array getters are expected to throw on a wrong element type and on a null element.
+    fluss::TablePath table_path("fluss", "test_lookup_array_validation_errors_cpp");
+
+    auto schema = fluss::Schema::NewBuilder()
+                      .AddColumn("id", fluss::DataType::Int())
+                      .AddColumn("vals", fluss::DataType::Array(fluss::DataType::Int()))
+                      .SetPrimaryKeys({"id"})
+                      .Build();
+    auto table_descriptor = fluss::TableDescriptor::NewBuilder()
+                                .SetSchema(schema)
+                                .SetProperty("table.replication.factor", "1")
+                                .Build();
+    fluss_test::CreateTable(cluster_admin, table_path, table_descriptor);
+
+    fluss::Table table;
+    ASSERT_OK(cluster_conn.GetTable(table_path, table));
+    auto table_upsert = table.NewUpsert();
+    fluss::UpsertWriter upsert_writer;
+    ASSERT_OK(table_upsert.CreateWriter(upsert_writer));
+    // Seed a single row: id = 1, vals = [99, NULL].
+    auto seed_row = table.NewRow();
+    seed_row.Set("id", 1);
+    fluss::ArrayWriter array_writer(2, fluss::DataType::Int());
+    array_writer.SetInt32(0, 99);
+    array_writer.SetNull(1);
+    seed_row.Set("vals", std::move(array_writer));
+    ASSERT_OK(upsert_writer.Upsert(seed_row));
+    ASSERT_OK(upsert_writer.Flush());
+
+    fluss::Lookuper lookuper;
+    ASSERT_OK(table.NewLookup().CreateLookuper(lookuper));
+
+    auto lookup_key = table.NewRow();
+    lookup_key.Set("id", 1);
+    fluss::LookupResult found;
+    ASSERT_OK(lookuper.Lookup(lookup_key, found));
+    ASSERT_TRUE(found.Found());
+    // Row-level getter with the wrong element type (Int64 getter on an Int array).
+    bool row_getter_type_mismatch = false;
+    try {
+        (void)found.GetArrayInt64("vals", 0);
+    } catch (const std::exception&) {
+        row_getter_type_mismatch = true;
+    }
+    EXPECT_TRUE(row_getter_type_mismatch);
+    // Row-level getter aimed at the null element (index 1).
+    bool row_getter_null_element = false;
+    try {
+        (void)found.GetArrayInt32("vals", 1);
+    } catch (const std::exception&) {
+        row_getter_null_element = true;
+    }
+    EXPECT_TRUE(row_getter_null_element);
+    // The array view is expected to report the size and the null flag for index 1.
+    auto vals_view = found.GetArrayView("vals");
+    EXPECT_EQ(vals_view.Size(), 2u);
+    EXPECT_TRUE(vals_view.IsNull(1));
+    // View getter with the wrong element type.
+    bool view_getter_type_mismatch = false;
+    try {
+        (void)vals_view.GetInt64(0);
+    } catch (const std::exception&) {
+        view_getter_type_mismatch = true;
+    }
+    EXPECT_TRUE(view_getter_type_mismatch);
+    // View getter aimed at the null element.
+    bool view_getter_null_element = false;
+    try {
+        (void)vals_view.GetInt32(1);
+    } catch (const std::exception&) {
+        view_getter_null_element = true;
+    }
+    EXPECT_TRUE(view_getter_null_element);
+
+    ASSERT_OK(cluster_admin.DropTable(table_path, false));
+}
+
+TEST_F(KvTableTest, CompositePrimaryKeys) {
+    auto& adm = admin();
+    auto& conn = connection();
+
+    fluss::TablePath table_path("fluss", "test_composite_pk_cpp");
+    // Primary key is (region, user_id); "score" sits between the two in column order.
+    auto schema = fluss::Schema::NewBuilder()
+                      .AddColumn("region", fluss::DataType::String())
+                      .AddColumn("score", fluss::DataType::BigInt())
+                      .AddColumn("user_id", fluss::DataType::Int())
+                      .SetPrimaryKeys({"region", "user_id"})
+                      .Build();
+
+    auto table_descriptor = fluss::TableDescriptor::NewBuilder()
+                                .SetSchema(schema)
+                                .SetProperty("table.replication.factor", "1")
+                                .Build();
+
+    fluss_test::CreateTable(adm, table_path, table_descriptor);
+
+    fluss::Table table;
+    ASSERT_OK(conn.GetTable(table_path, table));
+
+    auto upsert = table.NewUpsert();
+    fluss::UpsertWriter writer;
+    ASSERT_OK(upsert.CreateWriter(writer));
+
+    // Seed four rows: two regions, two users each (fields are written by column name)
+    struct SeedRow {
+        std::string region;
+        int32_t user_id;
+        int64_t score;
+    };
+    std::vector<SeedRow> seed_rows = {
+        {"US", 1, 100}, {"US", 2, 200}, {"EU", 1, 150}, {"EU", 2, 250}};
+
+    for (const auto& seed : seed_rows) {
+        auto row = table.NewRow();
+        row.Set("region", seed.region);
+        row.Set("score", seed.score);
+        row.Set("user_id", seed.user_id);
+        ASSERT_OK(writer.Upsert(row));
+    }
+    ASSERT_OK(writer.Flush());
+
+    // A single lookuper serves every lookup below
+    fluss::Lookuper lookuper;
+    ASSERT_OK(table.NewLookup().CreateLookuper(lookuper));
+
+    // (US, 1) was seeded with score 100
+    {
+        auto pk = table.NewRow();
+        pk.Set("region", "US");
+        pk.Set("user_id", 1);
+        fluss::LookupResult hit;
+        ASSERT_OK(lookuper.Lookup(pk, hit));
+        ASSERT_TRUE(hit.Found());
+        EXPECT_EQ(hit.GetInt64("score"), 100) << "Score for (US, 1) should be 100";
+    }
+
+    // (EU, 2) was seeded with score 250
+    {
+        auto pk = table.NewRow();
+        pk.Set("region", "EU");
+        pk.Set("user_id", 2);
+        fluss::LookupResult hit;
+        ASSERT_OK(lookuper.Lookup(pk, hit));
+        ASSERT_TRUE(hit.Found());
+        EXPECT_EQ(hit.GetInt64("score"), 250) << "Score for (EU, 2) should be 250";
+    }
+
+    // Overwrite the score of (US, 1) and wait for the write to be acknowledged
+    {
+        auto changed = table.NewRow();
+        changed.Set("region", "US");
+        changed.Set("user_id", 1);
+        changed.Set("score", static_cast<int64_t>(500));
+        fluss::WriteResult ack;
+        ASSERT_OK(writer.Upsert(changed, ack));
+        ASSERT_OK(ack.Wait());
+    }
+
+    // The same key must now return the new score
+    {
+        auto pk = table.NewRow();
+        pk.Set("region", "US");
+        pk.Set("user_id", 1);
+        fluss::LookupResult hit;
+        ASSERT_OK(lookuper.Lookup(pk, hit));
+        ASSERT_TRUE(hit.Found());
+        EXPECT_EQ(hit.GetInt64("score"), 500) << "Row score should be updated";
+    }
+
+    ASSERT_OK(adm.DropTable(table_path, false));
+}
+
+TEST_F(KvTableTest, PartialUpdate) {
+    auto& adm = admin();
+    auto& conn = connection();
+    // A writer restricted by column name to {id, score} must leave name and age untouched.
+    fluss::TablePath table_path("fluss", "test_partial_update_cpp");
+
+    auto schema = fluss::Schema::NewBuilder()
+                      .AddColumn("id", fluss::DataType::Int())
+                      .AddColumn("name", fluss::DataType::String())
+                      .AddColumn("age", fluss::DataType::BigInt())
+                      .AddColumn("score", fluss::DataType::BigInt())
+                      .SetPrimaryKeys({"id"})
+                      .Build();
+
+    auto table_descriptor = fluss::TableDescriptor::NewBuilder()
+                                .SetSchema(schema)
+                                .SetProperty("table.replication.factor", "1")
+                                .Build();
+
+    fluss_test::CreateTable(adm, table_path, table_descriptor);
+
+    fluss::Table table;
+    ASSERT_OK(conn.GetTable(table_path, table));
+
+    // Full-row writer used to seed the initial record
+    auto full_upsert = table.NewUpsert();
+    fluss::UpsertWriter full_writer;
+    ASSERT_OK(full_upsert.CreateWriter(full_writer));
+
+    {
+        fluss::GenericRow initial(4);
+        initial.SetInt32(0, 1);
+        initial.SetString(1, "Verso");
+        initial.SetInt64(2, 32);
+        initial.SetInt64(3, 6942);
+        fluss::WriteResult ack;
+        ASSERT_OK(full_writer.Upsert(initial, ack));
+        ASSERT_OK(ack.Wait());
+    }
+
+    // Read the seeded record back before touching it
+    fluss::Lookuper lookuper;
+    ASSERT_OK(table.NewLookup().CreateLookuper(lookuper));
+
+    {
+        fluss::GenericRow pk(4);
+        pk.SetInt32(0, 1);
+        fluss::LookupResult hit;
+        ASSERT_OK(lookuper.Lookup(pk, hit));
+        ASSERT_TRUE(hit.Found());
+        EXPECT_EQ(hit.GetInt32(0), 1);
+        EXPECT_EQ(hit.GetString(1), "Verso");
+        EXPECT_EQ(hit.GetInt64(2), 32);
+        EXPECT_EQ(hit.GetInt64(3), 6942);
+    }
+
+    // Second writer, limited to the primary key plus the score column
+    auto score_upsert = table.NewUpsert();
+    score_upsert.PartialUpdateByName({"id", "score"});
+    fluss::UpsertWriter score_writer;
+    ASSERT_OK(score_upsert.CreateWriter(score_writer));
+
+    // Send a new score through the restricted writer and wait for the acknowledgment
+    {
+        fluss::GenericRow patch(4);
+        patch.SetInt32(0, 1);
+        patch.SetNull(1);  // not in partial update
+        patch.SetNull(2);  // not in partial update
+        patch.SetInt64(3, 420);
+        fluss::WriteResult ack;
+        ASSERT_OK(score_writer.Upsert(patch, ack));
+        ASSERT_OK(ack.Wait());
+    }
+
+    // Only score may differ from the seeded record
+    {
+        fluss::GenericRow pk(4);
+        pk.SetInt32(0, 1);
+        fluss::LookupResult hit;
+        ASSERT_OK(lookuper.Lookup(pk, hit));
+        ASSERT_TRUE(hit.Found());
+        EXPECT_EQ(hit.GetInt32(0), 1) << "id should remain 1";
+        EXPECT_EQ(hit.GetString(1), "Verso") << "name should remain unchanged";
+        EXPECT_EQ(hit.GetInt64(2), 32) << "age should remain unchanged";
+        EXPECT_EQ(hit.GetInt64(3), 420) << "score should be updated to 420";
+    }
+
+    ASSERT_OK(adm.DropTable(table_path, false));
+}
+
+TEST_F(KvTableTest, PartialUpdateByIndex) {
+    auto& adm = admin();
+    auto& conn = connection();
+    // A writer restricted by column index to {0, 3} must leave columns 1 and 2 untouched.
+    fluss::TablePath table_path("fluss", "test_partial_update_by_index_cpp");
+
+    auto schema = fluss::Schema::NewBuilder()
+                      .AddColumn("id", fluss::DataType::Int())
+                      .AddColumn("name", fluss::DataType::String())
+                      .AddColumn("age", fluss::DataType::BigInt())
+                      .AddColumn("score", fluss::DataType::BigInt())
+                      .SetPrimaryKeys({"id"})
+                      .Build();
+
+    auto table_descriptor = fluss::TableDescriptor::NewBuilder()
+                                .SetSchema(schema)
+                                .SetProperty("table.replication.factor", "1")
+                                .Build();
+
+    fluss_test::CreateTable(adm, table_path, table_descriptor);
+
+    fluss::Table table;
+    ASSERT_OK(conn.GetTable(table_path, table));
+
+    // Full-row writer used to seed the initial record
+    auto full_upsert = table.NewUpsert();
+    fluss::UpsertWriter full_writer;
+    ASSERT_OK(full_upsert.CreateWriter(full_writer));
+
+    {
+        fluss::GenericRow initial(4);
+        initial.SetInt32(0, 1);
+        initial.SetString(1, "Verso");
+        initial.SetInt64(2, 32);
+        initial.SetInt64(3, 6942);
+        fluss::WriteResult ack;
+        ASSERT_OK(full_writer.Upsert(initial, ack));
+        ASSERT_OK(ack.Wait());
+    }
+
+    // Read the seeded record back before touching it
+    fluss::Lookuper lookuper;
+    ASSERT_OK(table.NewLookup().CreateLookuper(lookuper));
+
+    {
+        fluss::GenericRow pk(4);
+        pk.SetInt32(0, 1);
+        fluss::LookupResult hit;
+        ASSERT_OK(lookuper.Lookup(pk, hit));
+        ASSERT_TRUE(hit.Found());
+        EXPECT_EQ(hit.GetInt32(0), 1);
+        EXPECT_EQ(hit.GetString(1), "Verso");
+        EXPECT_EQ(hit.GetInt64(2), 32);
+        EXPECT_EQ(hit.GetInt64(3), 6942);
+    }
+
+    // Second writer, limited to column indices 0 (id) and 3 (score)
+    auto score_upsert = table.NewUpsert();
+    score_upsert.PartialUpdateByIndex({0, 3});
+    fluss::UpsertWriter score_writer;
+    ASSERT_OK(score_upsert.CreateWriter(score_writer));
+
+    // Send a new score through the restricted writer and wait for the acknowledgment
+    {
+        fluss::GenericRow patch(4);
+        patch.SetInt32(0, 1);
+        patch.SetNull(1);  // not in partial update
+        patch.SetNull(2);  // not in partial update
+        patch.SetInt64(3, 420);
+        fluss::WriteResult ack;
+        ASSERT_OK(score_writer.Upsert(patch, ack));
+        ASSERT_OK(ack.Wait());
+    }
+
+    // Only score may differ from the seeded record
+    {
+        fluss::GenericRow pk(4);
+        pk.SetInt32(0, 1);
+        fluss::LookupResult hit;
+        ASSERT_OK(lookuper.Lookup(pk, hit));
+        ASSERT_TRUE(hit.Found());
+        EXPECT_EQ(hit.GetInt32(0), 1) << "id should remain 1";
+        EXPECT_EQ(hit.GetString(1), "Verso") << "name should remain unchanged";
+        EXPECT_EQ(hit.GetInt64(2), 32) << "age should remain unchanged";
+        EXPECT_EQ(hit.GetInt64(3), 420) << "score should be updated to 420";
+    }
+
+    ASSERT_OK(adm.DropTable(table_path, false));
+}
+
+TEST_F(KvTableTest, PartitionedTableUpsertAndLookup) {
+    auto& adm = admin();
+    auto& conn = connection();
+    // Covers upsert, lookup, update and delete on a KV table partitioned by "region".
+    fluss::TablePath table_path("fluss", "test_partitioned_kv_table_cpp");
+
+    // Create a partitioned KV table with region as partition key
+    auto schema = fluss::Schema::NewBuilder()
+                      .AddColumn("region", fluss::DataType::String())
+                      .AddColumn("user_id", fluss::DataType::Int())
+                      .AddColumn("name", fluss::DataType::String())
+                      .AddColumn("score", fluss::DataType::BigInt())
+                      .SetPrimaryKeys({"region", "user_id"})
+                      .Build();
+
+    auto table_descriptor = fluss::TableDescriptor::NewBuilder()
+                                .SetSchema(schema)
+                                .SetPartitionKeys({"region"})
+                                .SetProperty("table.replication.factor", "1")
+                                .Build();
+
+    fluss_test::CreateTable(adm, table_path, table_descriptor);
+
+    // Create partitions US, EU and APAC up front; "UNKNOWN_REGION" used below is never created
+    fluss_test::CreatePartitions(adm, table_path, "region", {"US", "EU", "APAC"});
+
+    fluss::Table table;
+    ASSERT_OK(conn.GetTable(table_path, table));
+
+    auto table_upsert = table.NewUpsert();
+    fluss::UpsertWriter upsert_writer;
+    ASSERT_OK(table_upsert.CreateWriter(upsert_writer));
+
+    // Insert records with different partitions (field order matches the column order)
+    struct TestData {
+        std::string region;
+        int32_t user_id;
+        std::string name;
+        int64_t score;
+    };
+    std::vector<TestData> test_data = {{"US", 1, "Gustave", 100}, {"US", 2, "Lune", 200},
+                                       {"EU", 1, "Sciel", 150},   {"EU", 2, "Maelle", 250},
+                                       {"APAC", 1, "Noco", 300}};
+
+    for (const auto& d : test_data) {
+        fluss::GenericRow row(4);
+        row.SetString(0, d.region);
+        row.SetInt32(1, d.user_id);
+        row.SetString(2, d.name);
+        row.SetInt64(3, d.score);
+        ASSERT_OK(upsert_writer.Upsert(row));
+    }
+    ASSERT_OK(upsert_writer.Flush());
+
+    // Create lookuper (shared by every lookup in this test)
+    fluss::Lookuper lookuper;
+    ASSERT_OK(table.NewLookup().CreateLookuper(lookuper));
+
+    // Lookup records: only the primary-key fields (0 and 1) of the 4-field key row are set
+    for (const auto& d : test_data) {
+        fluss::GenericRow key(4);
+        key.SetString(0, d.region);
+        key.SetInt32(1, d.user_id);
+
+        fluss::LookupResult result;
+        ASSERT_OK(lookuper.Lookup(key, result));
+        ASSERT_TRUE(result.Found());
+
+        EXPECT_EQ(std::string(result.GetString(0)), d.region) << "region mismatch";
+        EXPECT_EQ(result.GetInt32(1), d.user_id) << "user_id mismatch";
+        EXPECT_EQ(std::string(result.GetString(2)), d.name) << "name mismatch";
+        EXPECT_EQ(result.GetInt64(3), d.score) << "score mismatch";
+    }
+
+    // Update within a partition (await acknowledgment): rewrite (US, 1) with new name and score
+    {
+        fluss::GenericRow updated_row(4);
+        updated_row.SetString(0, "US");
+        updated_row.SetInt32(1, 1);
+        updated_row.SetString(2, "Gustave Updated");
+        updated_row.SetInt64(3, 999);
+        fluss::WriteResult wr;
+        ASSERT_OK(upsert_writer.Upsert(updated_row, wr));
+        ASSERT_OK(wr.Wait());
+    }
+
+    // Verify the update: both non-key columns of (US, 1) must carry the new values
+    {
+        fluss::GenericRow key(4);
+        key.SetString(0, "US");
+        key.SetInt32(1, 1);
+        fluss::LookupResult result;
+        ASSERT_OK(lookuper.Lookup(key, result));
+        ASSERT_TRUE(result.Found());
+        EXPECT_EQ(std::string(result.GetString(2)), "Gustave Updated");
+        EXPECT_EQ(result.GetInt64(3), 999);
+    }
+
+    // Lookup in non-existent partition should return not found (the call itself must succeed)
+    {
+        fluss::GenericRow key(4);
+        key.SetString(0, "UNKNOWN_REGION");
+        key.SetInt32(1, 1);
+        fluss::LookupResult result;
+        ASSERT_OK(lookuper.Lookup(key, result));
+        ASSERT_FALSE(result.Found()) << "Lookup in non-existent partition should return not found";
+    }
+
+    // Delete a record within a partition (await acknowledgment): only key fields are set
+    {
+        fluss::GenericRow delete_key(4);
+        delete_key.SetString(0, "EU");
+        delete_key.SetInt32(1, 1);
+        fluss::WriteResult wr;
+        ASSERT_OK(upsert_writer.Delete(delete_key, wr));
+        ASSERT_OK(wr.Wait());
+    }
+
+    // Verify deletion: (EU, 1) must no longer be found
+    {
+        fluss::GenericRow key(4);
+        key.SetString(0, "EU");
+        key.SetInt32(1, 1);
+        fluss::LookupResult result;
+        ASSERT_OK(lookuper.Lookup(key, result));
+        ASSERT_FALSE(result.Found()) << "Deleted record should not exist";
+    }
+
+    // Verify other records in same partition still exist: (EU, 2) is untouched by the delete
+    {
+        fluss::GenericRow key(4);
+        key.SetString(0, "EU");
+        key.SetInt32(1, 2);
+        fluss::LookupResult result;
+        ASSERT_OK(lookuper.Lookup(key, result));
+        ASSERT_TRUE(result.Found());
+        EXPECT_EQ(std::string(result.GetString(2)), "Maelle");
+    }
+
+    ASSERT_OK(adm.DropTable(table_path, false));
+}
+
+TEST_F(KvTableTest, AllSupportedDatatypes) {
+    auto& adm = admin();
+    auto& conn = connection();
+    // Round-trips one fully populated row, then a row whose 16 non-key columns are all null.
+    fluss::TablePath table_path("fluss", "test_all_datatypes_cpp");
+
+    // Create a table with all supported datatypes
+    auto schema = fluss::Schema::NewBuilder()
+                      .AddColumn("pk_int", fluss::DataType::Int())
+                      .AddColumn("col_boolean", fluss::DataType::Boolean())
+                      .AddColumn("col_tinyint", fluss::DataType::TinyInt())
+                      .AddColumn("col_smallint", fluss::DataType::SmallInt())
+                      .AddColumn("col_int", fluss::DataType::Int())
+                      .AddColumn("col_bigint", fluss::DataType::BigInt())
+                      .AddColumn("col_float", fluss::DataType::Float())
+                      .AddColumn("col_double", fluss::DataType::Double())
+                      .AddColumn("col_char", fluss::DataType::Char(10))
+                      .AddColumn("col_string", fluss::DataType::String())
+                      .AddColumn("col_decimal", fluss::DataType::Decimal(10, 2))
+                      .AddColumn("col_date", fluss::DataType::Date())
+                      .AddColumn("col_time", fluss::DataType::Time())
+                      .AddColumn("col_timestamp", fluss::DataType::Timestamp())
+                      .AddColumn("col_timestamp_ltz", fluss::DataType::TimestampLtz())
+                      .AddColumn("col_bytes", fluss::DataType::Bytes())
+                      .AddColumn("col_binary", fluss::DataType::Binary(20))
+                      .SetPrimaryKeys({"pk_int"})
+                      .Build();
+
+    auto table_descriptor = fluss::TableDescriptor::NewBuilder()
+                                .SetSchema(schema)
+                                .SetProperty("table.replication.factor", "1")
+                                .Build();
+
+    fluss_test::CreateTable(adm, table_path, table_descriptor);
+
+    fluss::Table table;
+    ASSERT_OK(conn.GetTable(table_path, table));
+
+    auto table_upsert = table.NewUpsert();
+    fluss::UpsertWriter upsert_writer;
+    ASSERT_OK(table_upsert.CreateWriter(upsert_writer));
+
+    // Test data (tinyint and smallint values are held as int32_t and use the int32 accessors)
+    int32_t pk_int = 1;
+    bool col_boolean = true;
+    int32_t col_tinyint = 127;
+    int32_t col_smallint = 32767;
+    int32_t col_int = 2147483647;
+    int64_t col_bigint = 9223372036854775807LL;
+    float col_float = 3.14f;
+    double col_double = 2.718281828459045;
+    std::string col_char = "hello";
+    std::string col_string = "world of fluss rust client";
+    std::string col_decimal = "123.45";
+    auto col_date = fluss::Date::FromDays(20476);           // 2026-01-23
+    auto col_time = fluss::Time::FromMillis(36827000);       // 10:13:47
+    auto col_timestamp = fluss::Timestamp::FromMillis(1769163227123);      // 2026-01-23 10:13:47.123
+    auto col_timestamp_ltz = fluss::Timestamp::FromMillis(1769163227123);
+    std::vector<uint8_t> col_bytes = {'b', 'i', 'n', 'a', 'r', 'y', ' ', 'd', 'a', 't', 'a'};
+    std::vector<uint8_t> col_binary = {'f', 'i', 'x', 'e', 'd', ' ', 'b', 'i', 'n', 'a',
+                                       'r', 'y', ' ', 'd', 'a', 't', 'a', '!', '!', '!'};
+
+    // Upsert a row with all datatypes (field index == column position in the schema)
+    {
+        fluss::GenericRow row(17);
+        row.SetInt32(0, pk_int);
+        row.SetBool(1, col_boolean);
+        row.SetInt32(2, col_tinyint);
+        row.SetInt32(3, col_smallint);
+        row.SetInt32(4, col_int);
+        row.SetInt64(5, col_bigint);
+        row.SetFloat32(6, col_float);
+        row.SetFloat64(7, col_double);
+        row.SetString(8, col_char);
+        row.SetString(9, col_string);
+        row.SetDecimal(10, col_decimal);
+        row.SetDate(11, col_date);
+        row.SetTime(12, col_time);
+        row.SetTimestampNtz(13, col_timestamp);
+        row.SetTimestampLtz(14, col_timestamp_ltz);
+        row.SetBytes(15, col_bytes);
+        row.SetBytes(16, col_binary);
+        fluss::WriteResult wr;
+        ASSERT_OK(upsert_writer.Upsert(row, wr));
+        ASSERT_OK(wr.Wait());
+    }
+
+    // Lookup the record
+    fluss::Lookuper lookuper;
+    ASSERT_OK(table.NewLookup().CreateLookuper(lookuper));
+
+    {
+        fluss::GenericRow key(17);
+        key.SetInt32(0, pk_int);
+
+        fluss::LookupResult result;
+        ASSERT_OK(lookuper.Lookup(key, result));
+        ASSERT_TRUE(result.Found());
+
+        // Verify all datatypes
+        EXPECT_EQ(result.GetInt32(0), pk_int) << "pk_int mismatch";
+        EXPECT_EQ(result.GetBool(1), col_boolean) << "col_boolean mismatch";
+        EXPECT_EQ(result.GetInt32(2), col_tinyint) << "col_tinyint mismatch";
+        EXPECT_EQ(result.GetInt32(3), col_smallint) << "col_smallint mismatch";
+        EXPECT_EQ(result.GetInt32(4), col_int) << "col_int mismatch";
+        EXPECT_EQ(result.GetInt64(5), col_bigint) << "col_bigint mismatch";
+        EXPECT_NEAR(result.GetFloat32(6), col_float, 1e-6f) << "col_float mismatch";
+        EXPECT_NEAR(result.GetFloat64(7), col_double, 1e-15) << "col_double mismatch";
+        EXPECT_EQ(result.GetString(8), col_char) << "col_char mismatch";
+        EXPECT_EQ(result.GetString(9), col_string) << "col_string mismatch";
+        EXPECT_EQ(result.GetDecimalString(10), col_decimal) << "col_decimal mismatch";
+        EXPECT_EQ(result.GetDate(11).days_since_epoch, col_date.days_since_epoch) << "col_date mismatch";
+        EXPECT_EQ(result.GetTime(12).millis_since_midnight, col_time.millis_since_midnight) << "col_time mismatch";
+        EXPECT_EQ(result.GetTimestamp(13).epoch_millis, col_timestamp.epoch_millis)
+            << "col_timestamp mismatch";
+        EXPECT_EQ(result.GetTimestamp(14).epoch_millis, col_timestamp_ltz.epoch_millis)
+            << "col_timestamp_ltz mismatch";
+        // Lengths are asserted (fatal) so memcmp below never reads past the expected buffer.
+        auto [bytes_ptr, bytes_len] = result.GetBytes(15);
+        ASSERT_EQ(bytes_len, col_bytes.size()) << "col_bytes length mismatch";
+        EXPECT_TRUE(std::memcmp(bytes_ptr, col_bytes.data(), bytes_len) == 0)
+            << "col_bytes mismatch";
+
+        auto [binary_ptr, binary_len] = result.GetBytes(16);
+        ASSERT_EQ(binary_len, col_binary.size()) << "col_binary length mismatch";
+        EXPECT_TRUE(std::memcmp(binary_ptr, col_binary.data(), binary_len) == 0)
+            << "col_binary mismatch";
+    }
+
+    // Test with null values for nullable columns: every column except the primary key
+    {
+        fluss::GenericRow row_with_nulls(17);
+        row_with_nulls.SetInt32(0, 2);  // pk_int = 2
+        for (size_t i = 1; i < 17; ++i) {
+            row_with_nulls.SetNull(i);
+        }
+        fluss::WriteResult wr;
+        ASSERT_OK(upsert_writer.Upsert(row_with_nulls, wr));
+        ASSERT_OK(wr.Wait());
+    }
+
+    // Lookup row with nulls: columns 1..16 must all read back as null
+    {
+        fluss::GenericRow key(17);
+        key.SetInt32(0, 2);
+
+        fluss::LookupResult result;
+        ASSERT_OK(lookuper.Lookup(key, result));
+        ASSERT_TRUE(result.Found());
+
+        EXPECT_EQ(result.GetInt32(0), 2) << "pk_int mismatch";
+        for (size_t i = 1; i < 17; ++i) {
+            EXPECT_TRUE(result.IsNull(i)) << "column " << i << " should be null";
+        }
+    }
+
+    ASSERT_OK(adm.DropTable(table_path, false));
+}
diff --git a/fluss-rust/bindings/cpp/test/test_log_table.cpp b/fluss-rust/bindings/cpp/test/test_log_table.cpp
new file mode 100644
index 0000000000..5678e4bb10
--- /dev/null
+++ b/fluss-rust/bindings/cpp/test/test_log_table.cpp
@@ -0,0 +1,1523 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <arrow/api.h>
+#include <gtest/gtest.h>
+
+#include <algorithm>
+#include <chrono>
+#include <cmath>
+#include <limits>
+#include <thread>
+#include <tuple>
+
+#include "test_utils.h"
+
+class LogTableTest : public ::testing::Test {  // fixture for the log-table tests in this file
+   protected:
+    fluss::Admin& admin() { return fluss_test::FlussTestEnvironment::Instance()->GetAdmin(); }
+    // Both accessors forward to the FlussTestEnvironment instance; the fixture holds no state.
+    fluss::Connection& connection() {
+        return fluss_test::FlussTestEnvironment::Instance()->GetConnection();
+    }
+};
+
+TEST_F(LogTableTest, AppendRecordBatchAndScan) {
+    auto& adm = admin();
+    auto& conn = connection();
+    // Appends two Arrow batches to a 3-bucket log table and reads all six rows back twice.
+    fluss::TablePath table_path("fluss", "test_append_record_batch_and_scan_cpp");
+
+    auto schema = fluss::Schema::NewBuilder()
+                      .AddColumn("c1", fluss::DataType::Int())
+                      .AddColumn("c2", fluss::DataType::String())
+                      .Build();
+
+    auto table_descriptor = fluss::TableDescriptor::NewBuilder()
+                                .SetSchema(schema)
+                                .SetBucketCount(3)
+                                .SetBucketKeys({"c1"})
+                                .SetProperty("table.replication.factor", "1")
+                                .Build();
+
+    fluss_test::CreateTable(adm, table_path, table_descriptor);
+
+    fluss::Table table;
+    ASSERT_OK(conn.GetTable(table_path, table));
+
+    // Create append writer
+    auto table_append = table.NewAppend();
+    fluss::AppendWriter append_writer;
+    ASSERT_OK(table_append.CreateWriter(append_writer));
+
+    // Append Arrow record batches; builder statuses are asserted instead of being discarded
+    {
+        auto c1 = arrow::Int32Builder();
+        ASSERT_TRUE(c1.AppendValues({1, 2, 3}).ok());
+        auto c2 = arrow::StringBuilder();
+        ASSERT_TRUE(c2.AppendValues({"a1", "a2", "a3"}).ok());
+
+        auto batch = arrow::RecordBatch::Make(
+            arrow::schema({arrow::field("c1", arrow::int32()), arrow::field("c2", arrow::utf8())}),
+            3, {c1.Finish().ValueOrDie(), c2.Finish().ValueOrDie()});
+
+        ASSERT_OK(append_writer.AppendArrowBatch(batch));
+    }
+
+    {
+        auto c1 = arrow::Int32Builder();
+        ASSERT_TRUE(c1.AppendValues({4, 5, 6}).ok());
+        auto c2 = arrow::StringBuilder();
+        ASSERT_TRUE(c2.AppendValues({"a4", "a5", "a6"}).ok());
+
+        auto batch = arrow::RecordBatch::Make(
+            arrow::schema({arrow::field("c1", arrow::int32()), arrow::field("c2", arrow::utf8())}),
+            3, {c1.Finish().ValueOrDie(), c2.Finish().ValueOrDie()});
+
+        ASSERT_OK(append_writer.AppendArrowBatch(batch));
+    }
+
+    ASSERT_OK(append_writer.Flush());
+
+    // Create scanner and subscribe to all 3 buckets
+    fluss::Table scan_table;
+    ASSERT_OK(conn.GetTable(table_path, scan_table));
+    int32_t num_buckets = scan_table.GetTableInfo().num_buckets;
+    ASSERT_EQ(num_buckets, 3) << "Table should have 3 buckets";
+
+    auto table_scan = scan_table.NewScan();
+    fluss::LogScanner log_scanner;
+    ASSERT_OK(table_scan.CreateLogScanner(log_scanner));
+
+    for (int32_t bucket_id = 0; bucket_id < num_buckets; ++bucket_id) {
+        ASSERT_OK(log_scanner.Subscribe(bucket_id, fluss::EARLIEST_OFFSET));
+    }
+
+    // Poll for records across all buckets; sorted afterwards because arrival order is not checked
+    std::vector<std::pair<int32_t, std::string>> records;
+    fluss_test::PollRecords(log_scanner, 6, [](const fluss::ScanRecord& rec) {
+        return std::make_pair(rec.row.GetInt32(0), std::string(rec.row.GetString(1)));
+    }, records);
+    ASSERT_EQ(records.size(), 6u) << "Expected 6 records";
+    std::sort(records.begin(), records.end());
+
+    std::vector<std::pair<int32_t, std::string>> expected = {
+        {1, "a1"}, {2, "a2"}, {3, "a3"}, {4, "a4"}, {5, "a5"}, {6, "a6"}};
+    EXPECT_EQ(records, expected);
+
+    // Verify per-bucket iteration via BucketRecords, using a second, independent scanner
+    {
+        fluss::Table bucket_table;
+        ASSERT_OK(conn.GetTable(table_path, bucket_table));
+        auto bucket_scan = bucket_table.NewScan();
+        fluss::LogScanner bucket_scanner;
+        ASSERT_OK(bucket_scan.CreateLogScanner(bucket_scanner));
+
+        for (int32_t bid = 0; bid < num_buckets; ++bid) {
+            ASSERT_OK(bucket_scanner.Subscribe(bid, fluss::EARLIEST_OFFSET));
+        }
+
+        std::vector<std::pair<int32_t, std::string>> bucket_records;
+        auto bucket_deadline = std::chrono::steady_clock::now() + std::chrono::seconds(10);
+        size_t buckets_with_data = 0;
+        while (bucket_records.size() < 6 && std::chrono::steady_clock::now() < bucket_deadline) {
+            fluss::ScanRecords scan_records;
+            ASSERT_OK(bucket_scanner.Poll(500, scan_records));
+
+            // Iterate by bucket. NOTE(review): a bucket seen in two polls is counted twice here.
+            for (size_t b = 0; b < scan_records.BucketCount(); ++b) {
+                auto bkt_records = scan_records.BucketAt(b);
+                if (!bkt_records.Empty()) {
+                    buckets_with_data++;
+                }
+                for (auto rec : bkt_records) {
+                    bucket_records.emplace_back(rec.row.GetInt32(0),
+                                                std::string(rec.row.GetString(1)));
+                }
+            }
+        }
+
+        ASSERT_EQ(bucket_records.size(), 6u) << "Expected 6 records via per-bucket iteration";
+        EXPECT_GT(buckets_with_data, 1u) << "Records should be distributed across multiple buckets";
+
+        std::sort(bucket_records.begin(), bucket_records.end());
+        EXPECT_EQ(bucket_records, expected);
+    }
+
+    // Test unsubscribe
+    ASSERT_OK(log_scanner.Unsubscribe(0));
+
+    // Verify unsubscribe_partition fails on a non-partitioned table
+    auto unsub_result = log_scanner.UnsubscribePartition(0, 0);
+    ASSERT_FALSE(unsub_result.Ok())
+        << "unsubscribe_partition should fail on a non-partitioned table";
+
+    ASSERT_OK(adm.DropTable(table_path, false));
+}
+
+// Checks ListOffsets (Earliest / Latest / Timestamp specs) on bucket 0 before and after an append.
+TEST_F(LogTableTest, ListOffsets) {
+    auto& adm = admin();
+    auto& conn = connection();
+    fluss::TablePath table_path("fluss", "test_list_offsets_cpp");
+
+    auto schema = fluss::Schema::NewBuilder()
+                      .AddColumn("id", fluss::DataType::Int())
+                      .AddColumn("name", fluss::DataType::String())
+                      .Build();
+
+    auto table_descriptor = fluss::TableDescriptor::NewBuilder()
+                                .SetSchema(schema)
+                                .SetProperty("table.replication.factor", "1")
+                                .Build();
+
+    fluss_test::CreateTable(adm, table_path, table_descriptor);
+
+    // Wait for table initialization
+    std::this_thread::sleep_for(std::chrono::seconds(2));
+
+    // Offsets are read with .at(0), not operator[]: operator[] would insert a default 0 for a
+    // bucket missing from the result and let the "offset is 0" checks pass vacuously.
+
+    // Earliest offset should be 0 for empty table
+    std::unordered_map<int32_t, int64_t> earliest_offsets;
+    ASSERT_OK(adm.ListOffsets(table_path, {0}, fluss::OffsetSpec::Earliest(), earliest_offsets));
+    EXPECT_EQ(earliest_offsets.at(0), 0) << "Earliest offset should be 0 for bucket 0";
+
+    // Latest offset should be 0 for empty table
+    std::unordered_map<int32_t, int64_t> latest_offsets;
+    ASSERT_OK(adm.ListOffsets(table_path, {0}, fluss::OffsetSpec::Latest(), latest_offsets));
+    EXPECT_EQ(latest_offsets.at(0), 0) << "Latest offset should be 0 for empty table";
+
+    // Wall-clock epoch milliseconds, sampled around the append to build OffsetSpec::Timestamp.
+    auto now_ms = [] {
+        using namespace std::chrono;
+        return duration_cast<milliseconds>(system_clock::now().time_since_epoch()).count();
+    };
+    auto before_append_ms = now_ms();
+
+    // Append records
+    fluss::Table table;
+    ASSERT_OK(conn.GetTable(table_path, table));
+    auto table_append = table.NewAppend();
+    fluss::AppendWriter append_writer;
+    ASSERT_OK(table_append.CreateWriter(append_writer));
+
+    {
+        auto id_builder = arrow::Int32Builder();
+        ASSERT_TRUE(id_builder.AppendValues({1, 2, 3}).ok());
+        auto name_builder = arrow::StringBuilder();
+        ASSERT_TRUE(name_builder.AppendValues({"alice", "bob", "charlie"}).ok());
+
+        auto batch = arrow::RecordBatch::Make(
+            arrow::schema(
+                {arrow::field("id", arrow::int32()), arrow::field("name", arrow::utf8())}),
+            3, {id_builder.Finish().ValueOrDie(), name_builder.Finish().ValueOrDie()});
+
+        ASSERT_OK(append_writer.AppendArrowBatch(batch));
+    }
+    ASSERT_OK(append_writer.Flush());
+
+    std::this_thread::sleep_for(std::chrono::seconds(1));
+
+    auto after_append_ms = now_ms();
+
+    // Latest offset after appending should be 3
+    std::unordered_map<int32_t, int64_t> latest_after;
+    ASSERT_OK(adm.ListOffsets(table_path, {0}, fluss::OffsetSpec::Latest(), latest_after));
+    EXPECT_EQ(latest_after.at(0), 3) << "Latest offset should be 3 after appending 3 records";
+
+    // Earliest offset should still be 0
+    std::unordered_map<int32_t, int64_t> earliest_after;
+    ASSERT_OK(adm.ListOffsets(table_path, {0}, fluss::OffsetSpec::Earliest(), earliest_after));
+    EXPECT_EQ(earliest_after.at(0), 0) << "Earliest offset should still be 0";
+
+    // Timestamp before append should resolve to offset 0
+    std::unordered_map<int32_t, int64_t> ts_offsets;
+    ASSERT_OK(adm.ListOffsets(table_path, {0}, fluss::OffsetSpec::Timestamp(before_append_ms),
+                              ts_offsets));
+    EXPECT_EQ(ts_offsets.at(0), 0) << "Timestamp before append should resolve to offset 0";
+
+    // Timestamp after append should resolve to offset 3
+    std::unordered_map<int32_t, int64_t> ts_after_offsets;
+    ASSERT_OK(adm.ListOffsets(table_path, {0}, fluss::OffsetSpec::Timestamp(after_append_ms),
+                              ts_after_offsets));
+    EXPECT_EQ(ts_after_offsets.at(0), 3) << "Timestamp after append should resolve to offset 3";
+
+    ASSERT_OK(adm.DropTable(table_path, false));
+}
+
+// Checks column projection by name and by index when scanning a three-column log table.
+TEST_F(LogTableTest, TestProject) {
+    auto& adm = admin();
+    auto& conn = connection();
+    fluss::TablePath table_path("fluss", "test_project_cpp");
+
+    auto schema = fluss::Schema::NewBuilder()
+                      .AddColumn("col_a", fluss::DataType::Int())
+                      .AddColumn("col_b", fluss::DataType::String())
+                      .AddColumn("col_c", fluss::DataType::Int())
+                      .Build();
+
+    auto table_descriptor = fluss::TableDescriptor::NewBuilder()
+                                .SetSchema(schema)
+                                .SetProperty("table.replication.factor", "1")
+                                .Build();
+
+    fluss_test::CreateTable(adm, table_path, table_descriptor);
+
+    fluss::Table table;
+    ASSERT_OK(conn.GetTable(table_path, table));
+
+    // Append 3 records (Arrow builder statuses are asserted instead of being discarded)
+    auto table_append = table.NewAppend();
+    fluss::AppendWriter append_writer;
+    ASSERT_OK(table_append.CreateWriter(append_writer));
+
+    {
+        auto col_a_builder = arrow::Int32Builder();
+        ASSERT_TRUE(col_a_builder.AppendValues({1, 2, 3}).ok());
+        auto col_b_builder = arrow::StringBuilder();
+        ASSERT_TRUE(col_b_builder.AppendValues({"x", "y", "z"}).ok());
+        auto col_c_builder = arrow::Int32Builder();
+        ASSERT_TRUE(col_c_builder.AppendValues({10, 20, 30}).ok());
+
+        auto batch = arrow::RecordBatch::Make(
+            arrow::schema({arrow::field("col_a", arrow::int32()),
+                           arrow::field("col_b", arrow::utf8()),
+                           arrow::field("col_c", arrow::int32())}),
+            3,
+            {col_a_builder.Finish().ValueOrDie(), col_b_builder.Finish().ValueOrDie(),
+             col_c_builder.Finish().ValueOrDie()});
+
+        ASSERT_OK(append_writer.AppendArrowBatch(batch));
+    }
+    ASSERT_OK(append_writer.Flush());
+
+    // Test project_by_name: select col_b and col_c only (row index 0 is col_b, 1 is col_c)
+    {
+        fluss::Table proj_table;
+        ASSERT_OK(conn.GetTable(table_path, proj_table));
+        auto scan = proj_table.NewScan();
+        scan.ProjectByName({"col_b", "col_c"});
+        fluss::LogScanner scanner;
+        ASSERT_OK(scan.CreateLogScanner(scanner));
+
+        ASSERT_OK(scanner.Subscribe(0, 0));
+
+        fluss::ScanRecords records;
+        ASSERT_OK(scanner.Poll(10000, records));
+
+        ASSERT_EQ(records.Count(), 3u) << "Should have 3 records with project_by_name";
+
+        std::vector<std::string> expected_col_b = {"x", "y", "z"};
+        std::vector<int32_t> expected_col_c = {10, 20, 30};
+
+        // Collect and sort by col_c to get deterministic order
+        std::vector<std::pair<std::string, int32_t>> collected;
+        for (auto rec : records) {
+            collected.emplace_back(std::string(rec.row.GetString(0)), rec.row.GetInt32(1));
+        }
+        std::sort(collected.begin(), collected.end(),
+                  [](const auto& a, const auto& b) { return a.second < b.second; });
+
+        for (size_t i = 0; i < 3; ++i) {
+            EXPECT_EQ(collected[i].first, expected_col_b[i]) << "col_b mismatch at index " << i;
+            EXPECT_EQ(collected[i].second, expected_col_c[i]) << "col_c mismatch at index " << i;
+        }
+    }
+
+    // Test project by column indices: select col_b (1) and col_a (0) in that order
+    {
+        fluss::Table proj_table;
+        ASSERT_OK(conn.GetTable(table_path, proj_table));
+        auto scan = proj_table.NewScan();
+        scan.ProjectByIndex({1, 0});
+        fluss::LogScanner scanner;
+        ASSERT_OK(scan.CreateLogScanner(scanner));
+
+        ASSERT_OK(scanner.Subscribe(0, 0));
+
+        fluss::ScanRecords records;
+        ASSERT_OK(scanner.Poll(10000, records));
+
+        ASSERT_EQ(records.Count(), 3u);
+
+        std::vector<std::string> expected_col_b = {"x", "y", "z"};
+        std::vector<int32_t> expected_col_a = {1, 2, 3};
+
+        std::vector<std::pair<std::string, int32_t>> collected;
+        for (auto rec : records) {
+            collected.emplace_back(std::string(rec.row.GetString(0)), rec.row.GetInt32(1));
+        }
+        std::sort(collected.begin(), collected.end(),
+                  [](const auto& a, const auto& b) { return a.second < b.second; });
+
+        for (size_t i = 0; i < 3; ++i) {
+            EXPECT_EQ(collected[i].first, expected_col_b[i]) << "col_b mismatch at index " << i;
+            EXPECT_EQ(collected[i].second, expected_col_a[i]) << "col_a mismatch at index " << i;
+        }
+    }
+
+    ASSERT_OK(adm.DropTable(table_path, false));
+}
+
+// Exercises PollRecordBatch: empty poll, offset continuation, mid-offset subscribe, projection.
+TEST_F(LogTableTest, TestPollBatches) {
+    auto& adm = admin();
+    auto& conn = connection();
+    fluss::TablePath table_path("fluss", "test_poll_batches_cpp");
+
+    auto schema = fluss::Schema::NewBuilder()
+                      .AddColumn("id", fluss::DataType::Int())
+                      .AddColumn("name", fluss::DataType::String())
+                      .Build();
+
+    auto table_descriptor = fluss::TableDescriptor::NewBuilder()
+                                .SetSchema(schema)
+                                .SetProperty("table.replication.factor", "1")
+                                .Build();
+
+    fluss_test::CreateTable(adm, table_path, table_descriptor);
+
+    std::this_thread::sleep_for(std::chrono::seconds(1));
+
+    fluss::Table table;
+    ASSERT_OK(conn.GetTable(table_path, table));
+
+    auto scan = table.NewScan();
+    fluss::LogScanner scanner;
+    ASSERT_OK(scan.CreateRecordBatchLogScanner(scanner));
+    ASSERT_OK(scanner.Subscribe(0, 0));
+
+    // Test 1: Empty table should return empty result
+    {
+        fluss::ArrowRecordBatches batches;
+        ASSERT_OK(scanner.PollRecordBatch(500, batches));
+        ASSERT_TRUE(batches.Empty());
+    }
+
+    // Append data
+    auto table_append = table.NewAppend();
+    fluss::AppendWriter writer;
+    ASSERT_OK(table_append.CreateWriter(writer));
+    // Builds an (id, name) Arrow batch; EXPECT_* is used since ASSERT_* needs a void function.
+    auto make_batch = [](const std::vector<int32_t>& ids, const std::vector<std::string>& names) {
+        auto id_builder = arrow::Int32Builder();
+        EXPECT_TRUE(id_builder.AppendValues(ids).ok());
+        auto name_builder = arrow::StringBuilder();
+        EXPECT_TRUE(name_builder.AppendValues(names).ok());
+        return arrow::RecordBatch::Make(
+            arrow::schema(
+                {arrow::field("id", arrow::int32()), arrow::field("name", arrow::utf8())}),
+            static_cast<int64_t>(ids.size()),
+            {id_builder.Finish().ValueOrDie(), name_builder.Finish().ValueOrDie()});
+    };
+
+    ASSERT_OK(writer.AppendArrowBatch(make_batch({1, 2}, {"a", "b"})));
+    ASSERT_OK(writer.AppendArrowBatch(make_batch({3, 4}, {"c", "d"})));
+    ASSERT_OK(writer.AppendArrowBatch(make_batch({5, 6}, {"e", "f"})));
+    ASSERT_OK(writer.Flush());
+
+    // Extract ids (column 0, read as Int32) from Arrow batches
+    auto extract_ids = [](const fluss::ArrowRecordBatches& batches) {
+        std::vector<int32_t> ids;
+        for (const auto& batch : batches) {
+            auto arr =
+                std::static_pointer_cast<arrow::Int32Array>(batch->GetArrowRecordBatch()->column(0));
+            for (int64_t i = 0; i < arr->length(); ++i) {
+                ids.push_back(arr->Value(i));
+            }
+        }
+        return ids;
+    };
+
+    // Test 2: Poll until we get all 6 records
+    std::vector<int32_t> all_ids;
+    fluss_test::PollRecordBatches(scanner, 6, extract_ids, all_ids);
+    ASSERT_EQ(all_ids, (std::vector<int32_t>{1, 2, 3, 4, 5, 6}));
+
+    // Test 3: Append more and verify offset continuation (no duplicates)
+    ASSERT_OK(writer.AppendArrowBatch(make_batch({7, 8}, {"g", "h"})));
+    ASSERT_OK(writer.Flush());
+
+    std::vector<int32_t> new_ids;
+    fluss_test::PollRecordBatches(scanner, 2, extract_ids, new_ids);
+    ASSERT_EQ(new_ids, (std::vector<int32_t>{7, 8}));
+
+    // Test 4: Subscribing from mid-offset (3) should truncate the batch, so ids start at 4
+    {
+        fluss::Table trunc_table;
+        ASSERT_OK(conn.GetTable(table_path, trunc_table));
+        auto trunc_scan = trunc_table.NewScan();
+        fluss::LogScanner trunc_scanner;
+        ASSERT_OK(trunc_scan.CreateRecordBatchLogScanner(trunc_scanner));
+        ASSERT_OK(trunc_scanner.Subscribe(0, 3));
+
+        std::vector<int32_t> trunc_ids;
+        fluss_test::PollRecordBatches(trunc_scanner, 5, extract_ids, trunc_ids);
+        ASSERT_EQ(trunc_ids, (std::vector<int32_t>{4, 5, 6, 7, 8}));
+    }
+
+    // Test 5: Projection should only return requested columns
+    {
+        fluss::Table proj_table;
+        ASSERT_OK(conn.GetTable(table_path, proj_table));
+        auto proj_scan = proj_table.NewScan();
+        proj_scan.ProjectByName({"id"});
+        fluss::LogScanner proj_scanner;
+        ASSERT_OK(proj_scan.CreateRecordBatchLogScanner(proj_scanner));
+        ASSERT_OK(proj_scanner.Subscribe(0, 0));
+
+        fluss::ArrowRecordBatches proj_batches;
+        ASSERT_OK(proj_scanner.PollRecordBatch(10000, proj_batches));
+
+        ASSERT_FALSE(proj_batches.Empty());
+        EXPECT_EQ(proj_batches[0]->GetArrowRecordBatch()->num_columns(), 1)
+            << "Projected batch should have 1 column (id), not 2";
+    }
+
+    ASSERT_OK(adm.DropTable(table_path, false));
+}
+
+// Round-trips one fully populated row and one all-null row through every supported column type.
+TEST_F(LogTableTest, AllSupportedDatatypes) {
+    auto& adm = admin();
+    auto& conn = connection();
+    fluss::TablePath table_path("fluss", "test_log_all_datatypes_cpp");
+
+    // Create a log table with all supported datatypes
+    auto schema =
+        fluss::Schema::NewBuilder()
+            .AddColumn("col_tinyint", fluss::DataType::TinyInt())
+            .AddColumn("col_smallint", fluss::DataType::SmallInt())
+            .AddColumn("col_int", fluss::DataType::Int())
+            .AddColumn("col_bigint", fluss::DataType::BigInt())
+            .AddColumn("col_float", fluss::DataType::Float())
+            .AddColumn("col_double", fluss::DataType::Double())
+            .AddColumn("col_boolean", fluss::DataType::Boolean())
+            .AddColumn("col_char", fluss::DataType::Char(10))
+            .AddColumn("col_string", fluss::DataType::String())
+            .AddColumn("col_decimal", fluss::DataType::Decimal(10, 2))
+            .AddColumn("col_date", fluss::DataType::Date())
+            .AddColumn("col_time", fluss::DataType::Time())
+            .AddColumn("col_timestamp", fluss::DataType::Timestamp())
+            .AddColumn("col_timestamp_ltz", fluss::DataType::TimestampLtz())
+            .AddColumn("col_bytes", fluss::DataType::Bytes())
+            .AddColumn("col_binary", fluss::DataType::Binary(4))
+            .Build();
+
+    auto table_descriptor = fluss::TableDescriptor::NewBuilder()
+                                .SetSchema(schema)
+                                .SetProperty("table.replication.factor", "1")
+                                .Build();
+
+    fluss_test::CreateTable(adm, table_path, table_descriptor);
+
+    fluss::Table table;
+    ASSERT_OK(conn.GetTable(table_path, table));
+
+    size_t field_count = table.GetTableInfo().schema.columns.size();
+
+    auto table_append = table.NewAppend();
+    fluss::AppendWriter append_writer;
+    ASSERT_OK(table_append.CreateWriter(append_writer));
+
+    // Test data
+    int32_t col_tinyint = 127;
+    int32_t col_smallint = 32767;
+    int32_t col_int = 2147483647;
+    int64_t col_bigint = 9223372036854775807LL;
+    float col_float = 3.14f;
+    double col_double = 2.718281828459045;
+    bool col_boolean = true;
+    std::string col_char = "hello";
+    std::string col_string = "world of fluss rust client";
+    std::string col_decimal = "123.45";
+    auto col_date = fluss::Date::FromDays(20476);           // 2026-01-23
+    auto col_time = fluss::Time::FromMillis(36827000);       // 10:13:47
+    auto col_timestamp = fluss::Timestamp::FromMillisNanos(1769163227123, 456000);
+    auto col_timestamp_ltz = fluss::Timestamp::FromMillisNanos(1769163227123, 456000);
+    std::vector<uint8_t> col_bytes = {'b', 'i', 'n', 'a', 'r', 'y', ' ', 'd', 'a', 't', 'a'};
+    std::vector<uint8_t> col_binary = {0xDE, 0xAD, 0xBE, 0xEF};
+
+    // Append a row with all datatypes
+    {
+        fluss::GenericRow row(field_count);
+        row.SetInt32(0, col_tinyint);
+        row.SetInt32(1, col_smallint);
+        row.SetInt32(2, col_int);
+        row.SetInt64(3, col_bigint);
+        row.SetFloat32(4, col_float);
+        row.SetFloat64(5, col_double);
+        row.SetBool(6, col_boolean);
+        row.SetString(7, col_char);
+        row.SetString(8, col_string);
+        row.SetDecimal(9, col_decimal);
+        row.SetDate(10, col_date);
+        row.SetTime(11, col_time);
+        row.SetTimestampNtz(12, col_timestamp);
+        row.SetTimestampLtz(13, col_timestamp_ltz);
+        row.SetBytes(14, col_bytes);
+        row.SetBytes(15, col_binary);
+        ASSERT_OK(append_writer.Append(row));
+    }
+
+    // Append a row with null values
+    {
+        fluss::GenericRow row_with_nulls(field_count);
+        for (size_t i = 0; i < field_count; ++i) {
+            row_with_nulls.SetNull(i);
+        }
+        ASSERT_OK(append_writer.Append(row_with_nulls));
+    }
+
+    ASSERT_OK(append_writer.Flush());
+
+    // Scan the records
+    fluss::Table scan_table;
+    ASSERT_OK(conn.GetTable(table_path, scan_table));
+    auto table_scan = scan_table.NewScan();
+    fluss::LogScanner log_scanner;
+    ASSERT_OK(table_scan.CreateLogScanner(log_scanner));
+    ASSERT_OK(log_scanner.Subscribe(0, 0));
+
+    // Poll until we get 2 records
+    std::vector<fluss::ScanRecord> all_records;
+    fluss_test::PollRecords(log_scanner, 2,
+        [](const fluss::ScanRecord& rec) { return rec; }, all_records);
+    ASSERT_EQ(all_records.size(), 2u) << "Expected 2 records";
+
+    // Verify first record (all values)
+    auto& row = all_records[0].row;
+
+    EXPECT_EQ(row.GetInt32(0), col_tinyint) << "col_tinyint mismatch";
+    EXPECT_EQ(row.GetInt32(1), col_smallint) << "col_smallint mismatch";
+    EXPECT_EQ(row.GetInt32(2), col_int) << "col_int mismatch";
+    EXPECT_EQ(row.GetInt64(3), col_bigint) << "col_bigint mismatch";
+    EXPECT_NEAR(row.GetFloat32(4), col_float, 1e-6f) << "col_float mismatch";
+    EXPECT_NEAR(row.GetFloat64(5), col_double, 1e-15) << "col_double mismatch";
+    EXPECT_EQ(row.GetBool(6), col_boolean) << "col_boolean mismatch";
+    EXPECT_EQ(row.GetString(7), col_char) << "col_char mismatch";
+    EXPECT_EQ(row.GetString(8), col_string) << "col_string mismatch";
+    EXPECT_EQ(row.GetDecimalString(9), col_decimal) << "col_decimal mismatch";
+    EXPECT_EQ(row.GetDate(10).days_since_epoch, col_date.days_since_epoch) << "col_date mismatch";
+    EXPECT_EQ(row.GetTime(11).millis_since_midnight, col_time.millis_since_midnight)
+        << "col_time mismatch";
+    EXPECT_EQ(row.GetTimestamp(12).epoch_millis, col_timestamp.epoch_millis)
+        << "col_timestamp millis mismatch";
+    EXPECT_EQ(row.GetTimestamp(12).nano_of_millisecond, col_timestamp.nano_of_millisecond)
+        << "col_timestamp nanos mismatch";
+    EXPECT_EQ(row.GetTimestamp(13).epoch_millis, col_timestamp_ltz.epoch_millis)
+        << "col_timestamp_ltz millis mismatch";
+    EXPECT_EQ(row.GetTimestamp(13).nano_of_millisecond, col_timestamp_ltz.nano_of_millisecond)
+        << "col_timestamp_ltz nanos mismatch";
+
+    // Lengths are ASSERT-ed (fatal): memcmp reads the returned length from the expected buffer too.
+    auto [bytes_ptr, bytes_len] = row.GetBytes(14);
+    ASSERT_EQ(bytes_len, col_bytes.size()) << "col_bytes length mismatch";
+    EXPECT_TRUE(std::memcmp(bytes_ptr, col_bytes.data(), bytes_len) == 0)
+        << "col_bytes mismatch";
+    auto [binary_ptr, binary_len] = row.GetBytes(15);
+    ASSERT_EQ(binary_len, col_binary.size()) << "col_binary length mismatch";
+    EXPECT_TRUE(std::memcmp(binary_ptr, col_binary.data(), binary_len) == 0)
+        << "col_binary mismatch";
+
+    // Verify second record (all nulls)
+    auto& null_row = all_records[1].row;
+    for (size_t i = 0; i < field_count; ++i) {
+        EXPECT_TRUE(null_row.IsNull(i)) << "column " << i << " should be null";
+    }
+
+    ASSERT_OK(adm.DropTable(table_path, false));
+}
+
+// Appends rows and Arrow batches to a table partitioned by region, then scans per partition.
+TEST_F(LogTableTest, PartitionedTableAppendScan) {
+    auto& adm = admin();
+    auto& conn = connection();
+    fluss::TablePath table_path("fluss", "test_partitioned_log_append_cpp");
+
+    // Create a partitioned log table
+    auto schema = fluss::Schema::NewBuilder()
+                      .AddColumn("id", fluss::DataType::Int())
+                      .AddColumn("region", fluss::DataType::String())
+                      .AddColumn("value", fluss::DataType::BigInt())
+                      .Build();
+
+    auto table_descriptor = fluss::TableDescriptor::NewBuilder()
+                                .SetSchema(schema)
+                                .SetPartitionKeys({"region"})
+                                .SetProperty("table.replication.factor", "1")
+                                .Build();
+
+    fluss_test::CreateTable(adm, table_path, table_descriptor);
+
+    // Create partitions
+    fluss_test::CreatePartitions(adm, table_path, "region", {"US", "EU"});
+
+    // Wait for partitions
+    std::this_thread::sleep_for(std::chrono::seconds(2));
+
+    fluss::Table table;
+    ASSERT_OK(conn.GetTable(table_path, table));
+
+    auto table_append = table.NewAppend();
+    fluss::AppendWriter append_writer;
+    ASSERT_OK(table_append.CreateWriter(append_writer));
+
+    // Append rows
+    struct TestData {
+        int32_t id;
+        std::string region;
+        int64_t value;
+    };
+    std::vector<TestData> test_data = {{1, "US", 100}, {2, "US", 200}, {3, "EU", 300}, {4, "EU", 400}};
+
+    for (const auto& d : test_data) {
+        fluss::GenericRow row(3);
+        row.SetInt32(0, d.id);
+        row.SetString(1, d.region);
+        row.SetInt64(2, d.value);
+        ASSERT_OK(append_writer.Append(row));
+    }
+    ASSERT_OK(append_writer.Flush());
+
+    // Append arrow batches per partition (builder statuses are asserted, not discarded)
+    {
+        auto id_builder = arrow::Int32Builder();
+        ASSERT_TRUE(id_builder.AppendValues({5, 6}).ok());
+        auto region_builder = arrow::StringBuilder();
+        ASSERT_TRUE(region_builder.AppendValues({"US", "US"}).ok());
+        auto value_builder = arrow::Int64Builder();
+        ASSERT_TRUE(value_builder.AppendValues({500, 600}).ok());
+
+        auto batch = arrow::RecordBatch::Make(
+            arrow::schema({arrow::field("id", arrow::int32()),
+                           arrow::field("region", arrow::utf8()),
+                           arrow::field("value", arrow::int64())}),
+            2,
+            {id_builder.Finish().ValueOrDie(), region_builder.Finish().ValueOrDie(),
+             value_builder.Finish().ValueOrDie()});
+
+        ASSERT_OK(append_writer.AppendArrowBatch(batch));
+    }
+
+    {
+        auto id_builder = arrow::Int32Builder();
+        ASSERT_TRUE(id_builder.AppendValues({7, 8}).ok());
+        auto region_builder = arrow::StringBuilder();
+        ASSERT_TRUE(region_builder.AppendValues({"EU", "EU"}).ok());
+        auto value_builder = arrow::Int64Builder();
+        ASSERT_TRUE(value_builder.AppendValues({700, 800}).ok());
+
+        auto batch = arrow::RecordBatch::Make(
+            arrow::schema({arrow::field("id", arrow::int32()),
+                           arrow::field("region", arrow::utf8()),
+                           arrow::field("value", arrow::int64())}),
+            2,
+            {id_builder.Finish().ValueOrDie(), region_builder.Finish().ValueOrDie(),
+             value_builder.Finish().ValueOrDie()});
+
+        ASSERT_OK(append_writer.AppendArrowBatch(batch));
+    }
+    ASSERT_OK(append_writer.Flush());
+
+    // Test list partition offsets (.at(0) fails loudly if bucket 0 is absent from the result)
+    std::unordered_map<int32_t, int64_t> us_offsets;
+    ASSERT_OK(adm.ListPartitionOffsets(table_path, "US", {0}, fluss::OffsetSpec::Latest(),
+                                       us_offsets));
+    EXPECT_EQ(us_offsets.at(0), 4) << "US partition should have 4 records";
+
+    std::unordered_map<int32_t, int64_t> eu_offsets;
+    ASSERT_OK(adm.ListPartitionOffsets(table_path, "EU", {0}, fluss::OffsetSpec::Latest(),
+                                       eu_offsets));
+    EXPECT_EQ(eu_offsets.at(0), 4) << "EU partition should have 4 records";
+
+    // Subscribe to all partitions and scan
+    fluss::Table scan_table;
+    ASSERT_OK(conn.GetTable(table_path, scan_table));
+    auto table_scan = scan_table.NewScan();
+    fluss::LogScanner log_scanner;
+    ASSERT_OK(table_scan.CreateLogScanner(log_scanner));
+
+    std::vector<fluss::PartitionInfo> partition_infos;
+    ASSERT_OK(adm.ListPartitionInfos(table_path, partition_infos));
+
+    for (const auto& pi : partition_infos) {
+        ASSERT_OK(log_scanner.SubscribePartitionBuckets(pi.partition_id, 0, 0));
+    }
+
+    // Collect all records
+    using Record = std::tuple<int32_t, std::string, int64_t>;
+    auto extract_record = [](const fluss::ScanRecord& rec) -> Record {
+        return {rec.row.GetInt32(0), std::string(rec.row.GetString(1)), rec.row.GetInt64(2)};
+    };
+    std::vector<Record> collected;
+    fluss_test::PollRecords(log_scanner, 8, extract_record, collected);
+
+    ASSERT_EQ(collected.size(), 8u) << "Expected 8 records total";
+    std::sort(collected.begin(), collected.end());
+
+    std::vector<Record> expected = {{1, "US", 100},  {2, "US", 200},  {3, "EU", 300},
+                                    {4, "EU", 400},  {5, "US", 500},  {6, "US", 600},
+                                    {7, "EU", 700},  {8, "EU", 800}};
+    EXPECT_EQ(collected, expected);
+
+    // Test unsubscribe_partition: unsubscribe EU, should only get US data
+    {
+        fluss::Table unsub_table;
+        ASSERT_OK(conn.GetTable(table_path, unsub_table));
+        auto unsub_scan = unsub_table.NewScan();
+        fluss::LogScanner unsub_scanner;
+        ASSERT_OK(unsub_scan.CreateLogScanner(unsub_scanner));
+
+        int64_t eu_partition_id = -1;
+        for (const auto& pi : partition_infos) {
+            ASSERT_OK(unsub_scanner.SubscribePartitionBuckets(pi.partition_id, 0, 0));
+            if (pi.partition_name == "EU") {
+                eu_partition_id = pi.partition_id;
+            }
+        }
+        ASSERT_GE(eu_partition_id, 0) << "EU partition should exist";
+
+        ASSERT_OK(unsub_scanner.UnsubscribePartition(eu_partition_id, 0));
+
+        std::vector<Record> us_only;
+        fluss_test::PollRecords(unsub_scanner, 4, extract_record, us_only);
+
+        ASSERT_EQ(us_only.size(), 4u) << "Should receive exactly 4 US records";
+        for (const auto& [id, region, val] : us_only) {
+            EXPECT_EQ(region, "US") << "After unsubscribe EU, only US data should be read";
+        }
+    }
+
+    // Test subscribe_partition_buckets (batch subscribe)
+    {
+        fluss::Table batch_table;
+        ASSERT_OK(conn.GetTable(table_path, batch_table));
+        auto batch_scan = batch_table.NewScan();
+        fluss::LogScanner batch_scanner;
+        ASSERT_OK(batch_scan.CreateLogScanner(batch_scanner));
+
+        std::vector<fluss::PartitionBucketSubscription> subs;
+        for (const auto& pi : partition_infos) {
+            subs.push_back({pi.partition_id, 0, 0});
+        }
+        ASSERT_OK(batch_scanner.SubscribePartitionBuckets(subs));
+
+        std::vector<Record> batch_collected;
+        fluss_test::PollRecords(batch_scanner, 8, extract_record, batch_collected);
+        ASSERT_EQ(batch_collected.size(), 8u);
+        std::sort(batch_collected.begin(), batch_collected.end());
+        EXPECT_EQ(batch_collected, expected);
+    }
+
+    ASSERT_OK(adm.DropTable(table_path, false));
+}
+
+// ============================================================================
+// Array data type tests
+// ============================================================================
+
+// Round-trips array columns (string and int elements, incl. a null element and an empty array).
+TEST_F(LogTableTest, AppendAndScanWithArray) {
+    auto& adm = admin();
+    auto& conn = connection();
+    fluss::TablePath table_path("fluss", "test_append_scan_with_array_cpp");
+
+    auto schema = fluss::Schema::NewBuilder()
+                      .AddColumn("id", fluss::DataType::Int())
+                      .AddColumn("tags", fluss::DataType::Array(fluss::DataType::String()))
+                      .AddColumn("scores", fluss::DataType::Array(fluss::DataType::Int()))
+                      .Build();
+
+    auto table_descriptor = fluss::TableDescriptor::NewBuilder()
+                                .SetSchema(schema)
+                                .SetBucketCount(1)
+                                .SetBucketKeys({"id"})
+                                .SetProperty("table.replication.factor", "1")
+                                .Build();
+
+    fluss_test::CreateTable(adm, table_path, table_descriptor);
+
+    fluss::Table table;
+    ASSERT_OK(conn.GetTable(table_path, table));
+    // The table info read back must report both array columns with their element types.
+    auto table_info = table.GetTableInfo();
+    ASSERT_GE(table_info.schema.columns.size(), 3u);
+    const auto& tags_col_type = table_info.schema.columns[1].data_type;
+    ASSERT_EQ(tags_col_type.id(), fluss::TypeId::Array);
+    ASSERT_NE(tags_col_type.element_type(), nullptr);
+    ASSERT_EQ(tags_col_type.element_type()->id(), fluss::TypeId::String);
+    const auto& scores_col_type = table_info.schema.columns[2].data_type;
+    ASSERT_EQ(scores_col_type.id(), fluss::TypeId::Array);
+    ASSERT_NE(scores_col_type.element_type(), nullptr);
+    ASSERT_EQ(scores_col_type.element_type()->id(), fluss::TypeId::Int);
+
+    fluss::AppendWriter writer;
+    ASSERT_OK(table.NewAppend().CreateWriter(writer));
+    // Row id=1: two tags and three scores.
+    {
+        auto first_row = table.NewRow();
+        first_row.Set("id", 1);
+
+        fluss::ArrayWriter tag_writer(2, fluss::DataType::String());
+        tag_writer.SetString(0, "hello");
+        tag_writer.SetString(1, "world");
+        first_row.SetArray(1, std::move(tag_writer));
+
+        fluss::ArrayWriter score_writer(3, fluss::DataType::Int());
+        score_writer.SetInt32(0, 10);
+        score_writer.SetInt32(1, 20);
+        score_writer.SetInt32(2, 30);
+        first_row.SetArray(2, std::move(score_writer));
+
+        ASSERT_OK(writer.Append(first_row));
+    }
+    {
+        auto second_row = table.NewRow();
+        second_row.Set("id", 2);
+        // Row id=2: a single null tag and an empty scores array.
+        fluss::ArrayWriter tag_writer(1, fluss::DataType::String());
+        tag_writer.SetNull(0);
+        second_row.SetArray(1, std::move(tag_writer));
+
+        fluss::ArrayWriter score_writer(0, fluss::DataType::Int());
+        second_row.SetArray(2, std::move(score_writer));
+
+        ASSERT_OK(writer.Append(second_row));
+    }
+
+    ASSERT_OK(writer.Flush());
+
+    auto table_scan = table.NewScan();
+    fluss::LogScanner scanner;
+    ASSERT_OK(table_scan.CreateLogScanner(scanner));
+    ASSERT_OK(scanner.Subscribe(0, 0));
+    // Plain-value snapshot of one scanned row.
+    struct Record {
+        int32_t id;
+        size_t tag_count;
+        std::vector<std::string> tags;
+        size_t score_count;
+        std::vector<int32_t> scores;
+    };
+
+    std::vector<Record> rows;
+    auto to_record = [](const fluss::ScanRecord& scanned) {
+        const auto& view = scanned.row;
+        Record out;
+        out.id = view.GetInt32(0);
+        // Null tag elements are recorded as the "<null>" placeholder.
+        out.tag_count = view.GetArraySize(1);
+        for (size_t idx = 0; idx < out.tag_count; ++idx) {
+            if (!view.IsArrayElementNull(1, idx)) {
+                out.tags.push_back(view.GetArrayString(1, idx));
+            } else {
+                out.tags.push_back("<null>");
+            }
+        }
+
+        out.score_count = view.GetArraySize(2);
+        for (size_t idx = 0; idx < out.score_count; ++idx) {
+            out.scores.push_back(view.GetArrayInt32(2, idx));
+        }
+
+        return out;
+    };
+
+    fluss_test::PollRecords(scanner, 2, to_record, rows);
+
+    ASSERT_EQ(rows.size(), 2u);
+    // Order by id so the checks below do not depend on scan order.
+    std::sort(rows.begin(), rows.end(),
+              [](const Record& lhs, const Record& rhs) { return lhs.id < rhs.id; });
+
+    EXPECT_EQ(rows[0].id, 1);
+    ASSERT_EQ(rows[0].tag_count, 2u);
+    EXPECT_EQ(rows[0].tags[0], "hello");
+    EXPECT_EQ(rows[0].tags[1], "world");
+    ASSERT_EQ(rows[0].score_count, 3u);
+    EXPECT_EQ(rows[0].scores[0], 10);
+    EXPECT_EQ(rows[0].scores[1], 20);
+    EXPECT_EQ(rows[0].scores[2], 30);
+
+    EXPECT_EQ(rows[1].id, 2);
+    ASSERT_EQ(rows[1].tag_count, 1u);
+    EXPECT_EQ(rows[1].tags[0], "<null>");
+    ASSERT_EQ(rows[1].score_count, 0u);
+
+    ASSERT_OK(adm.DropTable(table_path, false));
+}
+
+TEST_F(LogTableTest, AppendAndScanWithNestedArray) {
+    // Round-trips one row whose "matrix" column is ARRAY<ARRAY<INT>> = [[1, 2], [3, 4]].
+    auto& cluster_admin = admin();
+    auto& cluster_conn = connection();
+    fluss::TablePath path("fluss", "test_append_scan_nested_array_cpp");
+
+    auto nested_schema =
+        fluss::Schema::NewBuilder()
+            .AddColumn("id", fluss::DataType::Int())
+            .AddColumn("matrix",
+                       fluss::DataType::Array(fluss::DataType::Array(fluss::DataType::Int())))
+            .Build();
+
+    auto descriptor = fluss::TableDescriptor::NewBuilder()
+                          .SetSchema(nested_schema)
+                          .SetBucketCount(1)
+                          .SetBucketKeys({"id"})
+                          .SetProperty("table.replication.factor", "1")
+                          .Build();
+
+    fluss_test::CreateTable(cluster_admin, path, descriptor);
+
+    fluss::Table table;
+    ASSERT_OK(cluster_conn.GetTable(path, table));
+
+    fluss::AppendWriter writer;
+    ASSERT_OK(table.NewAppend().CreateWriter(writer));
+
+    {
+        auto row = table.NewRow();
+        row.Set("id", 1);
+
+        fluss::ArrayWriter top_row(2, fluss::DataType::Int());  // [1, 2]
+        top_row.SetInt32(0, 1);
+        top_row.SetInt32(1, 2);
+
+        fluss::ArrayWriter bottom_row(2, fluss::DataType::Int());  // [3, 4]
+        bottom_row.SetInt32(0, 3);
+        bottom_row.SetInt32(1, 4);
+
+        fluss::ArrayWriter matrix(2, fluss::DataType::Array(fluss::DataType::Int()));
+        matrix.SetArray(0, std::move(top_row));
+        matrix.SetArray(1, std::move(bottom_row));
+
+        row.SetArray(1, std::move(matrix));
+        ASSERT_OK(writer.Append(row));
+    }
+
+    ASSERT_OK(writer.Flush());
+
+    auto scan = table.NewScan();
+    fluss::LogScanner scanner;
+    ASSERT_OK(scan.CreateLogScanner(scanner));
+    ASSERT_OK(scanner.Subscribe(0, 0));
+
+    struct NestedRecord {
+        int32_t id;
+        size_t outer_count;
+        fluss::TypeId element_type;
+        std::vector<std::vector<int32_t>> values;
+    };
+
+    std::vector<NestedRecord> collected;
+    auto extract = [](const fluss::ScanRecord& scanned) {
+        const auto& row_view = scanned.row;
+        NestedRecord out;
+        out.id = row_view.GetInt32(0);
+        out.outer_count = row_view.GetArraySize(1);
+        out.element_type = row_view.GetArrayElementType(1);
+        auto matrix_view = row_view.GetArrayView(1);
+        out.values.reserve(matrix_view.Size());
+        for (size_t r = 0; r < matrix_view.Size(); ++r) {
+            auto inner_view = matrix_view.GetArray(r);
+            std::vector<int32_t> ints;
+            ints.reserve(inner_view.Size());
+            for (size_t c = 0; c < inner_view.Size(); ++c) {
+                ints.push_back(inner_view.GetInt32(c));
+            }
+            out.values.push_back(std::move(ints));
+        }
+        return out;
+    };
+
+    fluss_test::PollRecords(scanner, 1, extract, collected);
+    ASSERT_EQ(collected.size(), 1u);
+    const auto& got = collected.front();
+    EXPECT_EQ(got.id, 1);
+    EXPECT_EQ(got.outer_count, 2u);
+    EXPECT_EQ(got.element_type, fluss::TypeId::Array);
+    ASSERT_EQ(got.values.size(), 2u);
+    EXPECT_EQ(got.values[0], (std::vector<int32_t>{1, 2}));
+    EXPECT_EQ(got.values[1], (std::vector<int32_t>{3, 4}));
+    ASSERT_OK(cluster_admin.DropTable(path, false));
+}
+
+TEST_F(LogTableTest, AppendAndScanWithArrayRichTypes) {
+    auto& cluster_admin = admin();
+    auto& cluster_conn = connection();
+    // One row with BYTES, DATE, TIME, TIMESTAMP(6) and DECIMAL(10, 2) arrays, some with nulls.
+    fluss::TablePath path("fluss", "test_append_scan_array_rich_types_cpp");
+
+    auto rich_schema =
+        fluss::Schema::NewBuilder()
+            .AddColumn("id", fluss::DataType::Int())
+            .AddColumn("arr_bytes", fluss::DataType::Array(fluss::DataType::Bytes()))
+            .AddColumn("arr_date", fluss::DataType::Array(fluss::DataType::Date()))
+            .AddColumn("arr_time", fluss::DataType::Array(fluss::DataType::Time()))
+            .AddColumn("arr_ts", fluss::DataType::Array(fluss::DataType::Timestamp(6)))
+            .AddColumn("arr_decimal", fluss::DataType::Array(fluss::DataType::Decimal(10, 2)))
+            .Build();
+
+    auto descriptor = fluss::TableDescriptor::NewBuilder()
+                          .SetSchema(rich_schema)
+                          .SetBucketCount(1)
+                          .SetBucketKeys({"id"})
+                          .SetProperty("table.replication.factor", "1")
+                          .Build();
+    fluss_test::CreateTable(cluster_admin, path, descriptor);
+
+    fluss::Table table;
+    ASSERT_OK(cluster_conn.GetTable(path, table));
+    fluss::AppendWriter writer;
+    ASSERT_OK(table.NewAppend().CreateWriter(writer));
+
+    {
+        auto row = table.NewRow();
+        row.Set("id", 1);
+        // arr_bytes: [0x10 0x20 0x30, NULL]
+        fluss::ArrayWriter bytes_writer(2, fluss::DataType::Bytes());
+        bytes_writer.SetBytes(0, std::vector<uint8_t>{0x10, 0x20, 0x30});
+        bytes_writer.SetNull(1);
+        row.SetArray(1, std::move(bytes_writer));
+        // arr_date: [day 20000, NULL]
+        fluss::ArrayWriter date_writer(2, fluss::DataType::Date());
+        auto day_20000 = fluss::Date::FromDays(20000);
+        date_writer.SetDate(0, day_20000);
+        date_writer.SetNull(1);
+        row.SetArray(2, std::move(date_writer));
+        // arr_time: [3600000 ms]
+        fluss::ArrayWriter time_writer(1, fluss::DataType::Time());
+        auto one_hour = fluss::Time::FromMillis(3600000);
+        time_writer.SetTime(0, one_hour);
+        row.SetArray(3, std::move(time_writer));
+        // arr_ts: [1769163227123 ms + 456000 ns]
+        fluss::ArrayWriter ts_writer(1, fluss::DataType::Timestamp(6));
+        auto written_ts = fluss::Timestamp::FromMillisNanos(1769163227123, 456000);
+        ts_writer.SetTimestampNtz(0, written_ts);
+        row.SetArray(4, std::move(ts_writer));
+        // arr_decimal: ["123.45", NULL]
+        fluss::ArrayWriter decimal_writer(2, fluss::DataType::Decimal(10, 2));
+        decimal_writer.SetDecimal(0, "123.45");
+        decimal_writer.SetNull(1);
+        row.SetArray(5, std::move(decimal_writer));
+
+        ASSERT_OK(writer.Append(row));
+    }
+
+    ASSERT_OK(writer.Flush());
+
+    auto scan = table.NewScan();
+    fluss::LogScanner scanner;
+    ASSERT_OK(scan.CreateLogScanner(scanner));
+    ASSERT_OK(scanner.Subscribe(0, 0));
+
+    fluss::ScanRecords polled;
+    ASSERT_OK(scanner.Poll(10000, polled));
+    ASSERT_EQ(polled.Count(), 1u);
+
+    auto first = polled.begin();
+    ASSERT_TRUE(first != polled.end());
+    auto scanned = *first;
+    const auto& row_view = scanned.row;
+
+    EXPECT_EQ(row_view.GetArraySize(1), 2u);
+    auto payload = row_view.GetArrayBytes(1, 0);
+    ASSERT_EQ(payload.size(), 3u);
+    EXPECT_EQ(payload[0], 0x10);
+    EXPECT_EQ(payload[1], 0x20);
+    EXPECT_EQ(payload[2], 0x30);
+    EXPECT_TRUE(row_view.IsArrayElementNull(1, 1));
+
+    EXPECT_EQ(row_view.GetArraySize(2), 2u);
+    EXPECT_EQ(row_view.GetArrayDate(2, 0).days_since_epoch, fluss::Date::FromDays(20000).days_since_epoch);
+    EXPECT_TRUE(row_view.IsArrayElementNull(2, 1));
+
+    EXPECT_EQ(row_view.GetArraySize(3), 1u);
+    EXPECT_EQ(row_view.GetArrayTime(3, 0).millis_since_midnight, fluss::Time::FromMillis(3600000).millis_since_midnight);
+
+    EXPECT_EQ(row_view.GetArraySize(4), 1u);
+    auto read_ts = row_view.GetArrayTimestamp(4, 0);
+    EXPECT_EQ(read_ts.epoch_millis, 1769163227123);
+    EXPECT_EQ(read_ts.nano_of_millisecond, 456000);
+
+    EXPECT_EQ(row_view.GetArraySize(5), 2u);
+    EXPECT_EQ(row_view.GetArrayDecimalString(5, 0), "123.45");
+    EXPECT_TRUE(row_view.IsArrayElementNull(5, 1));
+
+    ASSERT_OK(cluster_admin.DropTable(path, false));
+}
+
+TEST_F(LogTableTest, ArrayApiValidationErrors) {
+    // Writer side: an Int32 setter on a BOOLEAN array must surface as a C++ exception.
+    {
+        fluss::ArrayWriter flags(1, fluss::DataType::Boolean());
+        bool mismatch_threw = false;
+        try {
+            flags.SetInt32(0, 42);
+        } catch (const std::exception&) {
+            mismatch_threw = true;
+        }
+        EXPECT_TRUE(mismatch_threw);
+    }
+
+    auto& cluster_admin = admin();
+    auto& cluster_conn = connection();
+    fluss::TablePath path("fluss", "test_array_api_validation_errors_cpp");
+
+    auto int_array_schema = fluss::Schema::NewBuilder()
+                                .AddColumn("id", fluss::DataType::Int())
+                                .AddColumn("vals", fluss::DataType::Array(fluss::DataType::Int()))
+                                .Build();
+    auto descriptor = fluss::TableDescriptor::NewBuilder()
+                          .SetSchema(int_array_schema)
+                          .SetBucketCount(1)
+                          .SetBucketKeys({"id"})
+                          .SetProperty("table.replication.factor", "1")
+                          .Build();
+    fluss_test::CreateTable(cluster_admin, path, descriptor);
+
+    fluss::Table table;
+    ASSERT_OK(cluster_conn.GetTable(path, table));
+    fluss::AppendWriter writer;
+    ASSERT_OK(table.NewAppend().CreateWriter(writer));
+    auto row = table.NewRow();
+    row.Set("id", 1);
+    fluss::ArrayWriter int_values(2, fluss::DataType::Int());
+    int_values.SetInt32(0, 7);
+    int_values.SetNull(1);
+    row.SetArray(1, std::move(int_values));
+    ASSERT_OK(writer.Append(row));
+    ASSERT_OK(writer.Flush());
+
+    auto scan = table.NewScan();
+    fluss::LogScanner scanner;
+    ASSERT_OK(scan.CreateLogScanner(scanner));
+    ASSERT_OK(scanner.Subscribe(0, 0));
+    fluss::ScanRecords polled;
+    ASSERT_OK(scanner.Poll(10000, polled));
+    ASSERT_EQ(polled.Count(), 1u);
+    auto first = polled.begin();
+    ASSERT_TRUE(first != polled.end());
+    auto scanned = *first;
+    auto& row_view = scanned.row;  // column 1 was written as [7, NULL]
+    bool out_of_range_threw = false;  // index 5 on a 2-element array
+    try {
+        (void)row_view.GetArrayInt32(1, 5);
+    } catch (const std::exception&) {
+        out_of_range_threw = true;
+    }
+    EXPECT_TRUE(out_of_range_threw);
+    // The column was declared ARRAY<INT>; the Int64 getter is expected to throw.
+    bool int64_getter_threw = false;
+    try {
+        (void)row_view.GetArrayInt64(1, 0);
+    } catch (const std::exception&) {
+        int64_getter_threw = true;
+    }
+    EXPECT_TRUE(int64_getter_threw);
+    // Index 1 was written with SetNull; a typed getter on it is expected to throw.
+    bool null_getter_threw = false;
+    try {
+        (void)row_view.GetArrayInt32(1, 1);
+    } catch (const std::exception&) {
+        null_getter_threw = true;
+    }
+    EXPECT_TRUE(null_getter_threw);
+
+    auto vals_view = row_view.GetArrayView(1);
+    EXPECT_EQ(vals_view.Size(), 2u);
+    EXPECT_TRUE(vals_view.IsNull(1));
+    // Repeat the wrong-type and null-element checks through the array view.
+    bool view_int64_getter_threw = false;
+    try {
+        (void)vals_view.GetInt64(0);
+    } catch (const std::exception&) {
+        view_int64_getter_threw = true;
+    }
+    EXPECT_TRUE(view_int64_getter_threw);
+
+    bool view_null_getter_threw = false;
+    try {
+        (void)vals_view.GetInt32(1);
+    } catch (const std::exception&) {
+        view_null_getter_threw = true;
+    }
+    EXPECT_TRUE(view_null_getter_threw);
+
+    ASSERT_OK(cluster_admin.DropTable(path, false));
+}
+
+TEST_F(LogTableTest, AppendAndScanWithArrayEncodingEdgeCases) {
+    auto& cluster_admin = admin();
+    auto& cluster_conn = connection();
+
+    fluss::TablePath path("fluss", "test_array_encoding_edge_cases_cpp");
+
+    auto edge_schema =
+        fluss::Schema::NewBuilder()
+            .AddColumn("id", fluss::DataType::Int())
+            .AddColumn("arr_long_str", fluss::DataType::Array(fluss::DataType::String()))
+            .AddColumn("arr_big_decimal", fluss::DataType::Array(fluss::DataType::Decimal(22, 5)))
+            .AddColumn("arr_ts_nano", fluss::DataType::Array(fluss::DataType::Timestamp(9)))
+            .AddColumn("arr_float", fluss::DataType::Array(fluss::DataType::Float()))
+            .AddColumn("arr_double", fluss::DataType::Array(fluss::DataType::Double()))
+            .AddColumn("arr_binary", fluss::DataType::Array(fluss::DataType::Binary(4)))
+            .Build();
+
+    auto descriptor = fluss::TableDescriptor::NewBuilder()
+                          .SetSchema(edge_schema)
+                          .SetBucketCount(1)
+                          .SetBucketKeys({"id"})
+                          .SetProperty("table.replication.factor", "1")
+                          .Build();
+    fluss_test::CreateTable(cluster_admin, path, descriptor);
+
+    fluss::Table table;
+    ASSERT_OK(cluster_conn.GetTable(path, table));
+    fluss::AppendWriter writer;
+    ASSERT_OK(table.NewAppend().CreateWriter(writer));
+
+    {
+        auto row = table.NewRow();
+        row.Set("id", 1);
+
+        // Strings of 8+ bytes take the heap-pointer variable-length path (inline threshold: 7)
+        fluss::ArrayWriter long_strings(2, fluss::DataType::String());
+        long_strings.SetString(0, "abcdefgh");
+        long_strings.SetString(1, "this is a much longer string that definitely exceeds inline");
+        row.SetArray(1, std::move(long_strings));
+
+        // DECIMAL precision above 18 selects the non-compact decimal encoding
+        fluss::ArrayWriter big_decimals(2, fluss::DataType::Decimal(22, 5));
+        big_decimals.SetDecimal(0, "12345678901234567.12345");
+        big_decimals.SetDecimal(1, "-99999999999999999.99999");
+        row.SetArray(2, std::move(big_decimals));
+
+        // TIMESTAMP precision above 3 selects the non-compact layout (millis + nanos-of-millis)
+        fluss::ArrayWriter nano_timestamps(1, fluss::DataType::Timestamp(9));
+        auto written_ts = fluss::Timestamp::FromMillisNanos(1769163227123, 456789);
+        nano_timestamps.SetTimestampNtz(0, written_ts);
+        row.SetArray(3, std::move(nano_timestamps));
+
+        // FLOAT specials: NaN, +Infinity, -Infinity
+        fluss::ArrayWriter float_specials(3, fluss::DataType::Float());
+        float_specials.SetFloat32(0, std::numeric_limits<float>::quiet_NaN());
+        float_specials.SetFloat32(1, std::numeric_limits<float>::infinity());
+        float_specials.SetFloat32(2, -std::numeric_limits<float>::infinity());
+        row.SetArray(4, std::move(float_specials));
+        // DOUBLE specials, same order as the FLOAT column
+        fluss::ArrayWriter double_specials(3, fluss::DataType::Double());
+        double_specials.SetFloat64(0, std::numeric_limits<double>::quiet_NaN());
+        double_specials.SetFloat64(1, std::numeric_limits<double>::infinity());
+        double_specials.SetFloat64(2, -std::numeric_limits<double>::infinity());
+        row.SetArray(5, std::move(double_specials));
+
+        // BINARY(4): one 4-byte value followed by a null element
+        fluss::ArrayWriter fixed_binary(2, fluss::DataType::Binary(4));
+        fixed_binary.SetBytes(0, std::vector<uint8_t>{0xDE, 0xAD, 0xBE, 0xEF});
+        fixed_binary.SetNull(1);
+        row.SetArray(6, std::move(fixed_binary));
+
+        ASSERT_OK(writer.Append(row));
+    }
+
+    ASSERT_OK(writer.Flush());
+
+    auto scan = table.NewScan();
+    fluss::LogScanner scanner;
+    ASSERT_OK(scan.CreateLogScanner(scanner));
+    ASSERT_OK(scanner.Subscribe(0, 0));
+
+    fluss::ScanRecords polled;
+    ASSERT_OK(scanner.Poll(10000, polled));
+    ASSERT_EQ(polled.Count(), 1u);
+
+    auto first = polled.begin();
+    ASSERT_TRUE(first != polled.end());
+    auto scanned = *first;
+    const auto& row_view = scanned.row;
+
+    // Column 1: both long strings come back unchanged
+    EXPECT_EQ(row_view.GetArraySize(1), 2u);
+    EXPECT_EQ(row_view.GetArrayString(1, 0), "abcdefgh");
+    EXPECT_EQ(row_view.GetArrayString(1, 1), "this is a much longer string that definitely exceeds inline");
+
+    // Column 2: DECIMAL(22, 5) values (precision 22 > MAX_COMPACT_PRECISION 18)
+    EXPECT_EQ(row_view.GetArraySize(2), 2u);
+    EXPECT_EQ(row_view.GetArrayDecimalString(2, 0), "12345678901234567.12345");
+    EXPECT_EQ(row_view.GetArrayDecimalString(2, 1), "-99999999999999999.99999");
+
+    // Column 3: TIMESTAMP(9) value (precision 9 > MAX_COMPACT_TIMESTAMP_PRECISION 3)
+    EXPECT_EQ(row_view.GetArraySize(3), 1u);
+    auto read_ts = row_view.GetArrayTimestamp(3, 0);
+    EXPECT_EQ(read_ts.epoch_millis, 1769163227123);
+    EXPECT_EQ(read_ts.nano_of_millisecond, 456789);
+
+    // Column 4: FLOAT NaN, then +Infinity, then -Infinity
+    EXPECT_EQ(row_view.GetArraySize(4), 3u);
+    EXPECT_TRUE(std::isnan(row_view.GetArrayFloat32(4, 0)));
+    EXPECT_TRUE(std::isinf(row_view.GetArrayFloat32(4, 1)));
+    EXPECT_GT(row_view.GetArrayFloat32(4, 1), 0.0f);
+    EXPECT_TRUE(std::isinf(row_view.GetArrayFloat32(4, 2)));
+    EXPECT_LT(row_view.GetArrayFloat32(4, 2), 0.0f);
+
+    // Column 5: DOUBLE NaN, then +Infinity, then -Infinity
+    EXPECT_EQ(row_view.GetArraySize(5), 3u);
+    EXPECT_TRUE(std::isnan(row_view.GetArrayFloat64(5, 0)));
+    EXPECT_TRUE(std::isinf(row_view.GetArrayFloat64(5, 1)));
+    EXPECT_GT(row_view.GetArrayFloat64(5, 1), 0.0);
+    EXPECT_TRUE(std::isinf(row_view.GetArrayFloat64(5, 2)));
+    EXPECT_LT(row_view.GetArrayFloat64(5, 2), 0.0);
+
+    // Column 6: the 4-byte BINARY value and its null neighbour
+    EXPECT_EQ(row_view.GetArraySize(6), 2u);
+    auto fixed = row_view.GetArrayBytes(6, 0);
+    ASSERT_EQ(fixed.size(), 4u);
+    EXPECT_EQ(fixed[0], 0xDE);
+    EXPECT_EQ(fixed[1], 0xAD);
+    EXPECT_EQ(fixed[2], 0xBE);
+    EXPECT_EQ(fixed[3], 0xEF);
+    EXPECT_TRUE(row_view.IsArrayElementNull(6, 1));
+
+    ASSERT_OK(cluster_admin.DropTable(path, false));
+}
+
+TEST_F(LogTableTest, ArrayWriterOverflowDetection) {
+    // 1000 is outside the i8 range (-128..127): SetInt32 on a TINYINT array has to throw.
+    {
+        fluss::ArrayWriter tiny(1, fluss::DataType::TinyInt());
+        EXPECT_EQ(tiny.Size(), 1u);
+        bool rejected = false;
+        try {
+            tiny.SetInt32(0, 1000);
+        } catch (const std::exception& err) {
+            const std::string what = err.what();
+            EXPECT_NE(what.find("TINYINT"), std::string::npos);
+            rejected = true;
+        }
+        EXPECT_TRUE(rejected);
+    }
+
+    // 40000 is outside the i16 range (-32768..32767): same expectation for a SMALLINT array.
+    {
+        fluss::ArrayWriter small(1, fluss::DataType::SmallInt());
+        bool rejected = false;
+        try {
+            small.SetInt32(0, 40000);
+        } catch (const std::exception& err) {
+            const std::string what = err.what();
+            EXPECT_NE(what.find("SMALLINT"), std::string::npos);
+            rejected = true;
+        }
+        EXPECT_TRUE(rejected);
+    }
+
+    // Underflow is checked too: -200 is below the TINYINT minimum.
+    {
+        fluss::ArrayWriter tiny(1, fluss::DataType::TinyInt());
+        bool rejected = false;
+        try {
+            tiny.SetInt32(0, -200);
+        } catch (const std::exception&) {
+            rejected = true;
+        }
+        EXPECT_TRUE(rejected);
+    }
+
+    // Boundary values (127, -128, 32767) are in range and must be accepted.
+    {
+        fluss::ArrayWriter tiny_max(1, fluss::DataType::TinyInt());
+        EXPECT_NO_THROW(tiny_max.SetInt32(0, 127));
+    }
+    {
+        fluss::ArrayWriter tiny_min(1, fluss::DataType::TinyInt());
+        EXPECT_NO_THROW(tiny_min.SetInt32(0, -128));
+    }
+    {
+        fluss::ArrayWriter small_max(1, fluss::DataType::SmallInt());
+        EXPECT_NO_THROW(small_max.SetInt32(0, 32767));
+    }
+}
+
+TEST_F(LogTableTest, NullabilityPreservedInTableInfo) {
+    auto& cluster_admin = admin();
+    auto& cluster_conn = connection();
+
+    fluss::TablePath path("fluss", "test_nullability_table_info_cpp");
+
+    auto declared_schema =
+        fluss::Schema::NewBuilder()
+            .AddColumn("id", fluss::DataType::Int())
+            .AddColumn("name", fluss::DataType::String())
+            .AddColumn("tags", fluss::DataType::Array(fluss::DataType::String().NotNull()))
+            .AddColumn("ids", fluss::DataType::Array(fluss::DataType::Int()).NotNull())
+            .AddColumn("nested",
+                       fluss::DataType::Array(
+                           fluss::DataType::Array(fluss::DataType::Int()).NotNull()))
+            .SetPrimaryKeys({"id"})
+            .Build();
+
+    auto descriptor = fluss::TableDescriptor::NewBuilder()
+                          .SetSchema(declared_schema)
+                          .SetProperty("table.replication.factor", "1")
+                          .Build();
+
+    fluss_test::CreateTable(cluster_admin, path, descriptor);
+
+    fluss::Table table;
+    ASSERT_OK(cluster_conn.GetTable(path, table));
+    auto info = table.GetTableInfo();
+    auto& cols = info.schema.columns;
+    ASSERT_EQ(cols.size(), 5u);
+    EXPECT_EQ(info.primary_keys, std::vector<std::string>{"id"});
+
+    // Column 0 "id": the primary key comes back NOT NULL although it was declared plain INT.
+    EXPECT_EQ(cols[0].data_type.id(), fluss::TypeId::Int);
+    EXPECT_FALSE(cols[0].data_type.nullable());
+
+    // Column 1 "name": STRING, nullable
+    EXPECT_EQ(cols[1].data_type.id(), fluss::TypeId::String);
+    EXPECT_TRUE(cols[1].data_type.nullable());
+
+    // Column 2 "tags": ARRAY<STRING NOT NULL>, the array itself nullable
+    EXPECT_EQ(cols[2].data_type.id(), fluss::TypeId::Array);
+    EXPECT_TRUE(cols[2].data_type.nullable());
+    ASSERT_NE(cols[2].data_type.element_type(), nullptr);
+    EXPECT_FALSE(cols[2].data_type.element_type()->nullable());
+
+    // Column 3 "ids": ARRAY<INT> NOT NULL, elements nullable
+    EXPECT_EQ(cols[3].data_type.id(), fluss::TypeId::Array);
+    EXPECT_FALSE(cols[3].data_type.nullable());
+    ASSERT_NE(cols[3].data_type.element_type(), nullptr);
+    EXPECT_TRUE(cols[3].data_type.element_type()->nullable());
+
+    // Column 4 "nested": ARRAY<ARRAY<INT> NOT NULL>; outer nullable, inner array NOT NULL
+    EXPECT_EQ(cols[4].data_type.id(), fluss::TypeId::Array);
+    EXPECT_TRUE(cols[4].data_type.nullable());
+    ASSERT_NE(cols[4].data_type.element_type(), nullptr);
+    EXPECT_FALSE(cols[4].data_type.element_type()->nullable());
+    ASSERT_NE(cols[4].data_type.element_type()->element_type(), nullptr);
+    EXPECT_TRUE(cols[4].data_type.element_type()->element_type()->nullable());
+
+    ASSERT_OK(cluster_admin.DropTable(path, false));
+}
diff --git a/fluss-rust/bindings/cpp/test/test_main.cpp b/fluss-rust/bindings/cpp/test/test_main.cpp
new file mode 100644
index 0000000000..48d1050b8d
--- /dev/null
+++ b/fluss-rust/bindings/cpp/test/test_main.cpp
@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <gtest/gtest.h>
+
+#include "test_utils.h"
+
+int main(int argc, char** argv) {
+    // "--cleanup" in any position: call FlussTestCluster::StopAll() and exit 0 without tests.
+    for (int arg_idx = 1; arg_idx < argc; ++arg_idx) {
+        const bool wants_cleanup = std::string(argv[arg_idx]) == "--cleanup";
+        if (!wants_cleanup) continue;
+        fluss_test::FlussTestCluster::StopAll();
+        return 0;
+    }
+    ::testing::InitGoogleTest(&argc, argv);
+    ::testing::AddGlobalTestEnvironment(fluss_test::FlussTestEnvironment::Instance());
+    return RUN_ALL_TESTS();
+}
diff --git a/fluss-rust/bindings/cpp/test/test_sasl_auth.cpp b/fluss-rust/bindings/cpp/test/test_sasl_auth.cpp
new file mode 100644
index 0000000000..5a52a1ab79
--- /dev/null
+++ b/fluss-rust/bindings/cpp/test/test_sasl_auth.cpp
@@ -0,0 +1,124 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <gtest/gtest.h>
+
+#include "test_utils.h"
+
+class SaslAuthTest : public ::testing::Test {
+   protected:
+    auto sasl_servers() -> const std::string& {  // forwards to the global test environment
+        return fluss_test::FlussTestEnvironment::Instance()->GetSaslBootstrapServers();
+    }
+    auto plaintext_servers() -> const std::string& {  // forwards to the global test environment
+        return fluss_test::FlussTestEnvironment::Instance()->GetBootstrapServers();
+    }
+};
+
+TEST_F(SaslAuthTest, SaslConnectWithValidCredentials) {
+    fluss::Configuration cfg;
+    cfg.bootstrap_servers = sasl_servers();
+    cfg.security_protocol = "sasl";
+    cfg.security_sasl_mechanism = "PLAIN";
+    cfg.security_sasl_username = "admin";
+    cfg.security_sasl_password = "admin-secret";
+
+    fluss::Connection sasl_conn;
+    ASSERT_OK(fluss::Connection::Create(cfg, sasl_conn));
+
+    fluss::Admin sasl_admin;
+    ASSERT_OK(sasl_conn.GetAdmin(sasl_admin));
+
+    // Create, look up, then drop a database to show the authenticated session is usable.
+    std::string database = "cpp_sasl_test_valid_db";
+    fluss::DatabaseDescriptor db_descriptor;
+    db_descriptor.comment = "created via SASL auth";
+    ASSERT_OK(sasl_admin.CreateDatabase(database, db_descriptor, true));
+
+    bool found = false;
+    ASSERT_OK(sasl_admin.DatabaseExists(database, found));
+    ASSERT_TRUE(found);
+
+    ASSERT_OK(sasl_admin.DropDatabase(database, true, true));
+}
+
+TEST_F(SaslAuthTest, SaslConnectWithSecondUser) {
+    fluss::Configuration cfg;
+    cfg.bootstrap_servers = sasl_servers();
+    cfg.security_protocol = "sasl";
+    cfg.security_sasl_mechanism = "PLAIN";
+    cfg.security_sasl_username = "alice";
+    cfg.security_sasl_password = "alice-secret";
+
+    fluss::Connection sasl_conn;
+    ASSERT_OK(fluss::Connection::Create(cfg, sasl_conn));
+
+    fluss::Admin sasl_admin;
+    ASSERT_OK(sasl_conn.GetAdmin(sasl_admin));
+
+    // A lookup for a database that is never created must succeed and report "not found".
+    bool found = false;
+    ASSERT_OK(sasl_admin.DatabaseExists("some_nonexistent_db_alice", found));
+    ASSERT_FALSE(found);
+}
+
+TEST_F(SaslAuthTest, SaslConnectWithWrongPassword) {
+    fluss::Configuration cfg;  // user "admin" with a password other than "admin-secret"
+    cfg.bootstrap_servers = sasl_servers();
+    cfg.security_protocol = "sasl";
+    cfg.security_sasl_mechanism = "PLAIN";
+    cfg.security_sasl_username = "admin";
+    cfg.security_sasl_password = "wrong-password";
+
+    fluss::Connection sasl_conn;
+    auto outcome = fluss::Connection::Create(cfg, sasl_conn);
+    ASSERT_FALSE(outcome.Ok());
+    EXPECT_EQ(outcome.error_code, fluss::ErrorCode::AUTHENTICATE_EXCEPTION);
+    EXPECT_NE(outcome.error_message.find("Authentication failed"), std::string::npos)
+        << "Expected 'Authentication failed' in: " << outcome.error_message;
+}
+
+TEST_F(SaslAuthTest, SaslConnectWithUnknownUser) {
+    fluss::Configuration cfg;  // a user name the other tests never log in with
+    cfg.bootstrap_servers = sasl_servers();
+    cfg.security_protocol = "sasl";
+    cfg.security_sasl_mechanism = "PLAIN";
+    cfg.security_sasl_username = "nonexistent_user";
+    cfg.security_sasl_password = "some-password";
+
+    fluss::Connection sasl_conn;
+    auto outcome = fluss::Connection::Create(cfg, sasl_conn);
+    ASSERT_FALSE(outcome.Ok());
+    EXPECT_EQ(outcome.error_code, fluss::ErrorCode::AUTHENTICATE_EXCEPTION);
+    EXPECT_NE(outcome.error_message.find("Authentication failed"), std::string::npos)
+        << "Expected 'Authentication failed' in: " << outcome.error_message;
+}
+
+TEST_F(SaslAuthTest, SaslClientToPlaintextServer) {
+    fluss::Configuration cfg;  // SASL settings pointed at plaintext_servers()
+    cfg.bootstrap_servers = plaintext_servers();
+    cfg.security_protocol = "sasl";
+    cfg.security_sasl_mechanism = "PLAIN";
+    cfg.security_sasl_username = "admin";
+    cfg.security_sasl_password = "admin-secret";
+
+    fluss::Connection sasl_conn;
+    auto outcome = fluss::Connection::Create(cfg, sasl_conn);
+    ASSERT_FALSE(outcome.Ok()) << "SASL client connecting to plaintext server should fail";
+}
diff --git a/fluss-rust/bindings/cpp/test/test_utils.h b/fluss-rust/bindings/cpp/test/test_utils.h
new file mode 100644
index 0000000000..5d40afbaf3
--- /dev/null
+++ b/fluss-rust/bindings/cpp/test/test_utils.h
@@ -0,0 +1,239 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+#include <gtest/gtest.h>
+#include <sys/wait.h>
+
+#include <chrono>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <nlohmann/json.hpp>
+#include <sstream>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "fluss.hpp"
+
+namespace fluss_test {
+namespace detail {
+
+// Converts a fluss result object into a gtest AssertionResult.
+//
+// Taking the result as a function argument guarantees the expression handed to
+// ASSERT_OK / EXPECT_OK is evaluated exactly once. Expanding the macro argument
+// twice (once for the check, once for the message) would re-run calls such as
+// `ASSERT_OK(scanner.Poll(...))` on failure and report the second call's message.
+//
+// `result` must expose `Ok()` and a streamable `error_message` member.
+template <typename ResultT>
+::testing::AssertionResult ResultIsOk(ResultT&& result) {
+    if (result.Ok()) {
+        return ::testing::AssertionSuccess();
+    }
+    return ::testing::AssertionFailure() << result.error_message;
+}
+
+}  // namespace detail
+}  // namespace fluss_test
+
+// Fatal / non-fatal checks that a fluss result is OK; the result's error_message
+// is included in the failure output. Extra context may still be streamed with `<<`.
+#define ASSERT_OK(result) ASSERT_TRUE(::fluss_test::detail::ResultIsOk(result))
+#define EXPECT_OK(result) EXPECT_TRUE(::fluss_test::detail::ResultIsOk(result))
+
+namespace fluss_test {
+
+// Resolves the `fluss-test-cluster` helper binary.
+//
+// Lookup order:
+//   1. FLUSS_TEST_CLUSTER_BIN, when set and non-empty. The process aborts if
+//      the file it names cannot be opened.
+//   2. <workspace>/target/{debug,release}/fluss-test-cluster, where the
+//      workspace directory comes from `cargo locate-project`.
+//   3. The bare name "fluss-test-cluster".
+inline std::string FindCliBinary() {
+    const char* override_path = std::getenv("FLUSS_TEST_CLUSTER_BIN");
+    const bool has_override = override_path != nullptr && override_path[0] != '\0';
+    if (has_override) {
+        if (!std::ifstream(override_path).good()) {
+            std::cerr << "FLUSS_TEST_CLUSTER_BIN is set to '" << override_path
+                      << "' but that file does not exist." << std::endl;
+            std::abort();
+        }
+        return override_path;
+    }
+
+    const std::string fallback = "fluss-test-cluster";
+
+    FILE* cargo = popen("cargo locate-project --workspace --message-format plain", "r");
+    if (cargo == nullptr) {
+        return fallback;
+    }
+    std::string manifest;
+    char chunk[512];
+    while (fgets(chunk, sizeof(chunk), cargo) != nullptr) {
+        manifest.append(chunk);
+    }
+    if (pclose(cargo) != 0) {
+        return fallback;
+    }
+
+    // cargo prints the path of Cargo.toml; drop the trailing line break(s) and
+    // then the file name to obtain the workspace directory.
+    const auto last_kept = manifest.find_last_not_of("\r\n");
+    manifest.erase(last_kept == std::string::npos ? 0 : last_kept + 1);
+    const auto separator = manifest.rfind('/');
+    if (separator == std::string::npos) {
+        return fallback;
+    }
+    const std::string workspace = manifest.substr(0, separator);
+
+    for (const char* profile : {"debug", "release"}) {
+        std::string candidate = workspace + "/target/";
+        candidate += profile;
+        candidate += "/fluss-test-cluster";
+        if (std::ifstream(candidate).good()) {
+            return candidate;
+        }
+    }
+    return fallback;
+}
+
+// Value passed as `--name` to the fluss-test-cluster CLI.
+constexpr const char* kClusterName = "shared-test";
+
+// Builds the shell command line `<binary> start --sasl --name <kClusterName>`.
+inline std::string CliStartCmd() {
+    std::string command = FindCliBinary();
+    command += " start --sasl --name ";
+    command += kClusterName;
+    return command;
+}
+
+// Marker that starts the output line carrying the cluster description as JSON.
+constexpr const char* kClusterJsonPrefix = "CLUSTER_JSON: ";
+
+// Reads the bootstrap addresses from the first line of `output` that begins
+// with kClusterJsonPrefix.
+//
+// `bootstrap` receives the required "bootstrap_servers" string; `sasl_bootstrap`
+// is only written when "sasl_bootstrap_servers" is present and not null.
+// Returns false (after logging to stderr) when no such line exists or when its
+// JSON payload cannot be parsed/read.
+inline bool ParseClusterJson(const std::string& output, std::string& bootstrap,
+                             std::string& sasl_bootstrap) {
+    const std::size_t prefix_len = std::strlen(kClusterJsonPrefix);
+    std::istringstream lines(output);
+    for (std::string current; std::getline(lines, current);) {
+        const bool has_prefix = current.compare(0, prefix_len, kClusterJsonPrefix) == 0;
+        if (!has_prefix) {
+            continue;
+        }
+        try {
+            const auto cluster = nlohmann::json::parse(current.substr(prefix_len));
+            bootstrap = cluster.at("bootstrap_servers").get<std::string>();
+            const bool has_sasl = cluster.contains("sasl_bootstrap_servers") &&
+                                  !cluster.at("sasl_bootstrap_servers").is_null();
+            if (has_sasl) {
+                sasl_bootstrap = cluster.at("sasl_bootstrap_servers").get<std::string>();
+            }
+            return true;
+        } catch (const nlohmann::json::exception& err) {
+            std::cerr << "Failed to parse cluster JSON: " << err.what() << "\n"
+                      << "Line: " << current << std::endl;
+            return false;
+        }
+    }
+    std::cerr << "No CLUSTER_JSON token found in output:\n" << output << std::endl;
+    return false;
+}
+
+// Handle on the Fluss cluster used by the integration tests: either an
+// externally provided one (via environment variables) or one started through
+// the fluss-test-cluster CLI.
+class FlussTestCluster {
+   public:
+    FlussTestCluster() = default;
+
+    // Makes the bootstrap addresses available.
+    //
+    // When FLUSS_BOOTSTRAP_SERVERS is set and non-empty it is used as-is and
+    // nothing is launched; FLUSS_SASL_BOOTSTRAP_SERVERS supplies the SASL
+    // address and defaults to the plaintext one. Otherwise the CLI start
+    // command is run and its CLUSTER_JSON output line is parsed.
+    //
+    // Returns false (after logging to stderr) if the CLI cannot be launched,
+    // terminates unsuccessfully, or prints no parsable cluster description.
+    bool Start() {
+        const char* env = std::getenv("FLUSS_BOOTSTRAP_SERVERS");
+        if (env && std::strlen(env) > 0) {
+            bootstrap_servers_ = env;
+            const char* env_sasl = std::getenv("FLUSS_SASL_BOOTSTRAP_SERVERS");
+            sasl_bootstrap_servers_ = (env_sasl && std::strlen(env_sasl) > 0) ? env_sasl : env;
+            return true;
+        }
+
+        std::string cli_cmd = CliStartCmd();
+        FILE* pipe = popen(cli_cmd.c_str(), "r");
+        if (!pipe) {
+            std::cerr << "Failed to launch fluss-test-cluster binary" << std::endl;
+            return false;
+        }
+        std::string output;
+        char buf[512];
+        while (fgets(buf, sizeof(buf), pipe)) output += buf;
+        // pclose() yields a wait status, not an exit code; decode it before reporting.
+        const int status = pclose(pipe);
+        if (status != 0) {
+            std::cerr << "fluss-test-cluster start failed (" << DescribeWaitStatus(status)
+                      << "):\n"
+                      << output << std::endl;
+            return false;
+        }
+        if (!ParseClusterJson(output, bootstrap_servers_, sasl_bootstrap_servers_)) {
+            std::cerr << "Failed to parse cluster JSON from:\n" << output << std::endl;
+            return false;
+        }
+        return true;
+    }
+
+    // Runs the CLI stop command for kClusterName. Stopping is best effort: a
+    // failure is logged to stderr but not propagated.
+    static void StopAll() {
+        std::string cmd = FindCliBinary() + " stop --name " + kClusterName;
+        const int status = std::system(cmd.c_str());
+        if (status != 0) {
+            std::cerr << "fluss-test-cluster stop failed (" << DescribeWaitStatus(status) << ")"
+                      << std::endl;
+        }
+    }
+
+    const std::string& GetBootstrapServers() const { return bootstrap_servers_; }
+    const std::string& GetSaslBootstrapServers() const { return sasl_bootstrap_servers_; }
+
+   private:
+    // Renders a wait status as returned by pclose()/system() in readable form.
+    static std::string DescribeWaitStatus(int status) {
+        if (status == -1) return "could not obtain exit status";
+        if (WIFEXITED(status)) return "exit " + std::to_string(WEXITSTATUS(status));
+        if (WIFSIGNALED(status)) return "signal " + std::to_string(WTERMSIG(status));
+        return "wait status " + std::to_string(status);
+    }
+
+    std::string bootstrap_servers_;
+    std::string sasl_bootstrap_servers_;
+};
+
+// gtest environment that brings up the cluster once and exposes a shared
+// connection and admin client to the tests.
+class FlussTestEnvironment : public ::testing::Environment {
+   public:
+    // Returns the process-wide instance, creating it on first use.
+    static FlussTestEnvironment* Instance() {
+        static FlussTestEnvironment* const singleton = new FlussTestEnvironment();
+        return singleton;
+    }
+
+    // Starts the cluster, connects to its plaintext endpoint and obtains an
+    // admin client. Any failing step skips (GTEST_SKIP) instead of failing.
+    void SetUp() override {
+        const bool started = cluster_.Start();
+        if (!started) {
+            GTEST_SKIP() << "Failed to start Fluss cluster. Skipping integration tests.";
+        }
+
+        fluss::Configuration plain_cfg;
+        plain_cfg.bootstrap_servers = cluster_.GetBootstrapServers();
+
+        auto connect_status = fluss::Connection::Create(plain_cfg, connection_);
+        if (!connect_status.Ok()) {
+            GTEST_SKIP() << "Failed to connect: " << connect_status.error_message;
+        }
+
+        auto admin_status = connection_.GetAdmin(admin_);
+        if (!admin_status.Ok()) {
+            GTEST_SKIP() << "Failed to get admin: " << admin_status.error_message;
+        }
+    }
+
+    // Nothing is torn down here.
+    void TearDown() override {}
+
+    fluss::Connection& GetConnection() { return connection_; }
+    fluss::Admin& GetAdmin() { return admin_; }
+    const std::string& GetBootstrapServers() { return cluster_.GetBootstrapServers(); }
+    const std::string& GetSaslBootstrapServers() { return cluster_.GetSaslBootstrapServers(); }
+
+   private:
+    // Construction goes through Instance() only.
+    FlussTestEnvironment() = default;
+
+    FlussTestCluster cluster_;
+    fluss::Connection connection_;
+    fluss::Admin admin_;
+};
+
+// Creates the table at `path` from `descriptor`, dropping any table that
+// already lives there first. A failing create is recorded as a fatal gtest
+// failure.
+inline void CreateTable(fluss::Admin& admin, const fluss::TablePath& path,
+                        const fluss::TableDescriptor& descriptor) {
+    // Best effort: the drop's outcome is not inspected (table may not exist yet).
+    admin.DropTable(path, true);
+
+    auto created = admin.CreateTable(path, descriptor, false);
+    ASSERT_OK(created);
+}
+
+// Creates one partition of `path` per entry in `values`, each described by the
+// single-entry spec {partition_column: value}. Stops at the first failing
+// create, which is recorded as a fatal gtest failure.
+inline void CreatePartitions(fluss::Admin& admin, const fluss::TablePath& path,
+                             const std::string& partition_column,
+                             const std::vector<std::string>& values) {
+    for (auto it = values.begin(); it != values.end(); ++it) {
+        std::unordered_map<std::string, std::string> partition_spec{{partition_column, *it}};
+        auto created = admin.CreatePartition(path, partition_spec, true);
+        ASSERT_OK(created);
+    }
+}
+
+// Polls `scanner` until `out` holds at least `expected_count` items or ten
+// seconds have elapsed, appending `extract_fn(record)` for every record seen.
+//
+// Returns silently on timeout, so callers should check `out.size()` themselves.
+// A failing poll is recorded as a fatal gtest failure and ends the loop.
+template <typename T, typename ExtractFn>
+void PollRecords(fluss::LogScanner& scanner, size_t expected_count, ExtractFn extract_fn,
+                 std::vector<T>& out) {
+    const auto deadline = std::chrono::steady_clock::now() + std::chrono::seconds(10);
+    while (out.size() < expected_count && std::chrono::steady_clock::now() < deadline) {
+        fluss::ScanRecords records;
+        // Keep the result in a variable: handing the call itself to ASSERT_OK can
+        // evaluate it a second time when the check fails, which would poll again
+        // and report the second poll's message.
+        auto polled = scanner.Poll(1000, records);
+        ASSERT_OK(polled);
+        for (auto rec : records) {
+            out.push_back(extract_fn(rec));
+        }
+    }
+}
+
+// Polls `scanner` for Arrow record batches until `out` holds at least
+// `expected_count` items or ten seconds have elapsed. Each polled batch set is
+// passed to `extract_fn`, whose returned range is appended to `out`.
+//
+// Returns silently on timeout, so callers should check `out.size()` themselves.
+// A failing poll is recorded as a fatal gtest failure and ends the loop.
+template <typename T, typename ExtractFn>
+void PollRecordBatches(fluss::LogScanner& scanner, size_t expected_count, ExtractFn extract_fn,
+                       std::vector<T>& out) {
+    const auto deadline = std::chrono::steady_clock::now() + std::chrono::seconds(10);
+    while (out.size() < expected_count && std::chrono::steady_clock::now() < deadline) {
+        fluss::ArrowRecordBatches batches;
+        // Keep the result in a variable: handing the call itself to ASSERT_OK can
+        // evaluate it a second time when the check fails, which would poll again
+        // and report the second poll's message.
+        auto polled = scanner.PollRecordBatch(1000, batches);
+        ASSERT_OK(polled);
+        auto items = extract_fn(batches);
+        out.insert(out.end(), items.begin(), items.end());
+    }
+}
+
+}  // namespace fluss_test
diff --git a/fluss-rust/bindings/elixir/.formatter.exs b/fluss-rust/bindings/elixir/.formatter.exs
new file mode 100644
index 0000000000..dd63ff521c
--- /dev/null
+++ b/fluss-rust/bindings/elixir/.formatter.exs
@@ -0,0 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Formatter configuration: the glob patterns below select the project-level
+# .exs files plus every .ex/.exs file under config/, lib/ and test/.
+[
+  inputs: ["{mix,.formatter}.exs", "{config,lib,test}/**/*.{ex,exs}"]
+]
diff --git a/fluss-rust/bindings/elixir/.gitignore b/fluss-rust/bindings/elixir/.gitignore
new file mode 100644
index 0000000000..90277ffb16
--- /dev/null
+++ b/fluss-rust/bindings/elixir/.gitignore
@@ -0,0 +1,9 @@
+# Elixir build artifacts
+_build/
+deps/
+
+# Generated NIF shared library
+priv/native/
+
+# Crash dumps
+erl_crash.dump
diff --git a/fluss-rust/bindings/elixir/README.md b/fluss-rust/bindings/elixir/README.md
new file mode 100644
index 0000000000..656b03c51d
--- /dev/null
+++ b/fluss-rust/bindings/elixir/README.md
@@ -0,0 +1,60 @@
+# Fluss Elixir Client
+
+Elixir client for [Apache Fluss (Incubating)](https://fluss.apache.org/), built on the official Rust client via [Rustler](https://github.com/rusterlium/rustler) NIFs.
+
+Currently supports **log tables** (append + scan). Primary key (KV) table support is planned.
+
+## Requirements
+
+- Elixir >= 1.15
+- Rust stable toolchain (for compiling the NIF)
+
+## Quick Start
+
+```elixir
+config = Fluss.Config.new("localhost:9123")
+conn = Fluss.Connection.new!(config)
+admin = Fluss.Admin.new!(conn)
+
+schema =
+  Fluss.Schema.build()
+  |> Fluss.Schema.column("ts", :bigint)
+  |> Fluss.Schema.column("message", :string)
+  |> Fluss.Schema.build!()
+
+:ok = Fluss.Admin.create_table(admin, "my_db", "events", Fluss.TableDescriptor.new!(schema))
+
+table = Fluss.Table.get!(conn, "my_db", "events")
+writer = Fluss.AppendWriter.new!(table)
+Fluss.AppendWriter.append(writer, [1_700_000_000, "hello"])
+:ok = Fluss.AppendWriter.flush(writer)
+
+scanner = Fluss.LogScanner.new!(table)
+:ok = Fluss.LogScanner.subscribe(scanner, 0, Fluss.earliest_offset())
+:ok = Fluss.LogScanner.poll(scanner, 5_000)
+
+receive do
+  {:fluss_records, records} ->
+    for record <- records, do: IO.inspect(record[:row])
+end
+```
+
+## Data Types
+
+Simple: `:boolean`, `:tinyint`, `:smallint`, `:int`, `:bigint`, `:float`, `:double`, `:string`, `:bytes`, `:date`, `:time`, `:timestamp`, `:timestamp_ltz`
+
+Parameterized: `{:decimal, precision, scale}`, `{:char, length}`, `{:binary, length}`
+
+## Development
+
+```bash
+cd bindings/elixir
+mix test                        # unit tests
+mix test --include integration  # starts Docker cluster
+```
+
+Set `FLUSS_BOOTSTRAP_SERVERS` to run the integration tests against an existing cluster instead of starting one in Docker.
+
+## License
+
+Apache License 2.0
diff --git a/fluss-rust/bindings/elixir/lib/fluss.ex b/fluss-rust/bindings/elixir/lib/fluss.ex
new file mode 100644
index 0000000000..25aa649160
--- /dev/null
+++ b/fluss-rust/bindings/elixir/lib/fluss.ex
@@ -0,0 +1,52 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+defmodule Fluss do
+  @moduledoc """
+  Top-level module of the Elixir client for Apache Fluss (Incubating).
+
+  The walkthrough below connects to a cluster, creates a log table, appends a
+  row and reads it back.
+
+  ## Examples
+
+      config = Fluss.Config.new("localhost:9123")
+      conn = Fluss.Connection.new!(config)
+      admin = Fluss.Admin.new!(conn)
+
+      schema =
+        Fluss.Schema.new()
+        |> Fluss.Schema.column("ts", :bigint)
+        |> Fluss.Schema.column("message", :string)
+
+      :ok = Fluss.Admin.create_table(admin, "my_db", "events", Fluss.TableDescriptor.new!(schema))
+
+      table = Fluss.Table.get!(conn, "my_db", "events")
+      writer = Fluss.AppendWriter.new!(table)
+      Fluss.AppendWriter.append(writer, [1_700_000_000, "hello"])
+      :ok = Fluss.AppendWriter.flush(writer)
+
+      scanner = Fluss.LogScanner.new!(table)
+      :ok = Fluss.LogScanner.subscribe(scanner, 0, Fluss.earliest_offset())
+      :ok = Fluss.LogScanner.poll(scanner, 5_000)
+      receive do
+        {:fluss_records, records} -> records
+      end
+
+  """
+
+  @doc """
+  Returns the value of `Fluss.Native.earliest_offset/0`.
+
+  The module example passes it as the offset argument of
+  `Fluss.LogScanner.subscribe/3`.
+  """
+  def earliest_offset do
+    Fluss.Native.earliest_offset()
+  end
+end
diff --git a/fluss-rust/bindings/elixir/lib/fluss/admin.ex b/fluss-rust/bindings/elixir/lib/fluss/admin.ex
new file mode 100644
index 0000000000..6dbdb3a9c1
--- /dev/null
+++ b/fluss-rust/bindings/elixir/lib/fluss/admin.ex
@@ -0,0 +1,111 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+defmodule Fluss.Admin do
+  @moduledoc """
+  DDL operations on a Fluss cluster: creating, dropping and listing databases
+  and tables.
+
+  Functions return `:ok`, `{:ok, value}` or `{:error, %Fluss.Error{}}`; the
+  `!` variants return the bare value and raise the error instead.
+
+  ## Examples
+
+      admin = Fluss.Admin.new!(conn)
+      :ok = Fluss.Admin.create_database(admin, "my_db")
+
+      schema = Fluss.Schema.new() |> Fluss.Schema.column("ts", :bigint)
+      descriptor = Fluss.TableDescriptor.new!(schema)
+      :ok = Fluss.Admin.create_table(admin, "my_db", "events", descriptor)
+
+  """
+
+  alias Fluss.Native
+
+  @type t :: reference()
+
+  @doc "Creates an admin client for `conn`."
+  @spec new(Fluss.Connection.t()) :: {:ok, t()} | {:error, Fluss.Error.t()}
+  def new(conn), do: conn |> Native.admin_new() |> tag_ok()
+
+  @doc "Like `new/1`, but returns the admin directly and raises on error."
+  @spec new!(Fluss.Connection.t()) :: t()
+  def new!(conn), do: conn |> new() |> unwrap!()
+
+  @doc "Creates database `name`; `ignore_if_exists` defaults to `true`."
+  @spec create_database(t(), String.t(), boolean()) :: :ok | {:error, Fluss.Error.t()}
+  def create_database(admin, name, ignore_if_exists \\ true) do
+    nif_call = Native.admin_create_database(admin, name, ignore_if_exists)
+    Native.await_nif(nif_call)
+  end
+
+  @doc "Drops database `name`; `ignore_if_not_exists` defaults to `true`."
+  @spec drop_database(t(), String.t(), boolean()) :: :ok | {:error, Fluss.Error.t()}
+  def drop_database(admin, name, ignore_if_not_exists \\ true) do
+    nif_call = Native.admin_drop_database(admin, name, ignore_if_not_exists)
+    Native.await_nif(nif_call)
+  end
+
+  @doc "Lists the database names."
+  @spec list_databases(t()) :: {:ok, [String.t()]} | {:error, Fluss.Error.t()}
+  def list_databases(admin) do
+    Native.await_nif(Native.admin_list_databases(admin))
+  end
+
+  @doc "Like `list_databases/1`, but returns the list directly and raises on error."
+  @spec list_databases!(t()) :: [String.t()]
+  def list_databases!(admin), do: admin |> list_databases() |> unwrap!()
+
+  @doc "Creates `table` in `database` from `descriptor`; `ignore_if_exists` defaults to `true`."
+  @spec create_table(t(), String.t(), String.t(), Fluss.TableDescriptor.t(), boolean()) ::
+          :ok | {:error, Fluss.Error.t()}
+  def create_table(admin, database, table, descriptor, ignore_if_exists \\ true) do
+    nif_call = Native.admin_create_table(admin, database, table, descriptor, ignore_if_exists)
+    Native.await_nif(nif_call)
+  end
+
+  @doc "Drops `table` from `database`; `ignore_if_not_exists` defaults to `true`."
+  @spec drop_table(t(), String.t(), String.t(), boolean()) :: :ok | {:error, Fluss.Error.t()}
+  def drop_table(admin, database, table, ignore_if_not_exists \\ true) do
+    nif_call = Native.admin_drop_table(admin, database, table, ignore_if_not_exists)
+    Native.await_nif(nif_call)
+  end
+
+  @doc "Lists the table names of `database`."
+  @spec list_tables(t(), String.t()) :: {:ok, [String.t()]} | {:error, Fluss.Error.t()}
+  def list_tables(admin, database) do
+    Native.await_nif(Native.admin_list_tables(admin, database))
+  end
+
+  @doc "Like `list_tables/2`, but returns the list directly and raises on error."
+  @spec list_tables!(t(), String.t()) :: [String.t()]
+  def list_tables!(admin, database), do: admin |> list_tables(database) |> unwrap!()
+
+  # Error tuples pass through untouched; anything else is wrapped as {:ok, value}.
+  defp tag_ok({:error, _} = failure), do: failure
+  defp tag_ok(value), do: {:ok, value}
+
+  # Shared body of the bang variants.
+  defp unwrap!(outcome) do
+    case outcome do
+      {:ok, value} -> value
+      {:error, %Fluss.Error{} = error} -> raise error
+    end
+  end
+end
diff --git a/fluss-rust/bindings/elixir/lib/fluss/append_writer.ex b/fluss-rust/bindings/elixir/lib/fluss/append_writer.ex
new file mode 100644
index 0000000000..5dddbf7b1d
--- /dev/null
+++ b/fluss-rust/bindings/elixir/lib/fluss/append_writer.ex
@@ -0,0 +1,75 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+defmodule Fluss.AppendWriter do
+  @moduledoc """
+  Appends records to a log table.
+
+  A record is a list of values in column order, with `nil` standing for null.
+  `append/2` returns `{:ok, handle}` where `handle` is a `Fluss.WriteHandle`:
+  ignore it for fire-and-forget writes, or pass it to `Fluss.WriteHandle.wait/1`
+  for per-record acknowledgment.
+
+  ## Examples
+
+      writer = Fluss.AppendWriter.new!(table)
+
+      # Fire-and-forget
+      Fluss.AppendWriter.append(writer, [1_700_000_000, "hello"])
+      Fluss.AppendWriter.append(writer, [1_700_000_001, "world"])
+      :ok = Fluss.AppendWriter.flush(writer)
+
+      # Per-record ack
+      {:ok, handle} = Fluss.AppendWriter.append(writer, [1_700_000_002, "critical"])
+      :ok = Fluss.WriteHandle.wait(handle)
+
+  """
+
+  alias Fluss.Native
+
+  @type t :: reference()
+
+  @doc "Creates an append writer for `table`."
+  @spec new(Fluss.Table.t()) :: {:ok, t()} | {:error, Fluss.Error.t()}
+  def new(table), do: table |> Native.append_writer_new() |> tag_ok()
+
+  @doc "Like `new/1`, but returns the writer directly and raises on error."
+  @spec new!(Fluss.Table.t()) :: t()
+  def new!(table) do
+    case new(table) do
+      {:ok, writer} -> writer
+      {:error, %Fluss.Error{} = error} -> raise error
+    end
+  end
+
+  @doc "Appends one record, given as a list of `values`, and returns its write handle."
+  @spec append(t(), list()) :: {:ok, Fluss.WriteHandle.t()} | {:error, Fluss.Error.t()}
+  def append(writer, values) when is_list(values) do
+    writer |> Native.append_writer_append(values) |> tag_ok()
+  end
+
+  @doc "Flushes the writer and waits for the result."
+  @spec flush(t()) :: :ok | {:error, Fluss.Error.t()}
+  def flush(writer) do
+    Native.await_nif(Native.append_writer_flush(writer))
+  end
+
+  # Error tuples pass through untouched; anything else is wrapped as {:ok, value}.
+  defp tag_ok({:error, _} = failure), do: failure
+  defp tag_ok(value), do: {:ok, value}
+end
diff --git a/fluss-rust/bindings/elixir/lib/fluss/config.ex b/fluss-rust/bindings/elixir/lib/fluss/config.ex
new file mode 100644
index 0000000000..8aaacf7993
--- /dev/null
+++ b/fluss-rust/bindings/elixir/lib/fluss/config.ex
@@ -0,0 +1,239 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+defmodule Fluss.Config do
+  @moduledoc """
+  Client configuration for connecting to a Fluss cluster.
+
+  Build a config with `new/1` and refine it with the `set_*` functions, each of
+  which returns an updated struct. Fields left as `nil` use the client's
+  defaults.
+
+  Setters validate their argument with guards. Size, count and millisecond
+  options must be non-negative integers, as declared in `t:t/0`; any other
+  argument raises `FunctionClauseError`.
+
+  ## Examples
+
+      config = Fluss.Config.new("localhost:9123")
+
+      config =
+        Fluss.Config.new("host1:9123,host2:9123")
+        |> Fluss.Config.set_writer_batch_size(1_048_576)
+
+  """
+
+  @enforce_keys [:bootstrap_servers]
+  defstruct bootstrap_servers: nil,
+            connect_timeout_ms: nil,
+            remote_file_download_thread_num: nil,
+            scanner_log_fetch_max_bytes: nil,
+            scanner_log_fetch_max_bytes_for_bucket: nil,
+            scanner_log_fetch_min_bytes: nil,
+            scanner_log_fetch_wait_max_time_ms: nil,
+            scanner_log_max_poll_records: nil,
+            scanner_remote_log_prefetch_num: nil,
+            scanner_remote_log_read_concurrency: nil,
+            security_protocol: nil,
+            security_sasl_mechanism: nil,
+            security_sasl_password: nil,
+            security_sasl_username: nil,
+            writer_acks: nil,
+            writer_batch_size: nil,
+            writer_batch_timeout_ms: nil,
+            writer_bucket_no_key_assigner: nil,
+            writer_buffer_memory_size: nil,
+            writer_buffer_wait_timeout_ms: nil,
+            writer_dynamic_batch_size_enabled: nil,
+            writer_dynamic_batch_size_min: nil,
+            writer_enable_idempotence: nil,
+            writer_max_inflight_requests_per_bucket: nil,
+            writer_request_max_size: nil,
+            writer_retries: nil
+
+  @type t :: %__MODULE__{
+          bootstrap_servers: String.t(),
+          connect_timeout_ms: non_neg_integer() | nil,
+          remote_file_download_thread_num: non_neg_integer() | nil,
+          scanner_log_fetch_max_bytes: non_neg_integer() | nil,
+          scanner_log_fetch_max_bytes_for_bucket: non_neg_integer() | nil,
+          scanner_log_fetch_min_bytes: non_neg_integer() | nil,
+          scanner_log_fetch_wait_max_time_ms: non_neg_integer() | nil,
+          scanner_log_max_poll_records: non_neg_integer() | nil,
+          scanner_remote_log_prefetch_num: non_neg_integer() | nil,
+          scanner_remote_log_read_concurrency: non_neg_integer() | nil,
+          security_protocol: String.t() | nil,
+          security_sasl_mechanism: String.t() | nil,
+          security_sasl_password: String.t() | nil,
+          security_sasl_username: String.t() | nil,
+          writer_acks: String.t() | nil,
+          writer_batch_size: non_neg_integer() | nil,
+          writer_batch_timeout_ms: non_neg_integer() | nil,
+          writer_bucket_no_key_assigner: :sticky | :round_robin | nil,
+          writer_buffer_memory_size: non_neg_integer() | nil,
+          writer_buffer_wait_timeout_ms: non_neg_integer() | nil,
+          writer_dynamic_batch_size_enabled: boolean() | nil,
+          writer_dynamic_batch_size_min: non_neg_integer() | nil,
+          writer_enable_idempotence: boolean() | nil,
+          writer_max_inflight_requests_per_bucket: non_neg_integer() | nil,
+          writer_request_max_size: non_neg_integer() | nil,
+          writer_retries: non_neg_integer() | nil
+        }
+
+  # Numeric options are typed non_neg_integer() in t/0, so a plain is_integer/1
+  # guard would let negative values through to the native client.
+  defguardp is_non_neg_integer(value) when is_integer(value) and value >= 0
+
+  @doc "Creates a config that only sets `bootstrap_servers`."
+  @spec new(String.t()) :: t()
+  def new(bootstrap_servers) when is_binary(bootstrap_servers) do
+    %__MODULE__{bootstrap_servers: bootstrap_servers}
+  end
+
+  @doc "Creates a config whose `bootstrap_servers` is the empty string."
+  @spec default() :: t()
+  def default, do: %__MODULE__{bootstrap_servers: ""}
+
+  @spec set_bootstrap_servers(t(), String.t()) :: t()
+  def set_bootstrap_servers(%__MODULE__{} = config, servers) when is_binary(servers),
+    do: %{config | bootstrap_servers: servers}
+
+  @spec set_connect_timeout_ms(t(), non_neg_integer()) :: t()
+  def set_connect_timeout_ms(%__MODULE__{} = config, ms) when is_non_neg_integer(ms),
+    do: %{config | connect_timeout_ms: ms}
+
+  @spec set_remote_file_download_thread_num(t(), non_neg_integer()) :: t()
+  def set_remote_file_download_thread_num(%__MODULE__{} = config, threads)
+      when is_non_neg_integer(threads),
+      do: %{config | remote_file_download_thread_num: threads}
+
+  @spec set_scanner_log_fetch_max_bytes(t(), non_neg_integer()) :: t()
+  def set_scanner_log_fetch_max_bytes(%__MODULE__{} = config, max_bytes)
+      when is_non_neg_integer(max_bytes),
+      do: %{config | scanner_log_fetch_max_bytes: max_bytes}
+
+  @spec set_scanner_log_fetch_max_bytes_for_bucket(t(), non_neg_integer()) :: t()
+  def set_scanner_log_fetch_max_bytes_for_bucket(%__MODULE__{} = config, max_bytes)
+      when is_non_neg_integer(max_bytes),
+      do: %{config | scanner_log_fetch_max_bytes_for_bucket: max_bytes}
+
+  @spec set_scanner_log_fetch_min_bytes(t(), non_neg_integer()) :: t()
+  def set_scanner_log_fetch_min_bytes(%__MODULE__{} = config, min_bytes)
+      when is_non_neg_integer(min_bytes),
+      do: %{config | scanner_log_fetch_min_bytes: min_bytes}
+
+  @spec set_scanner_log_fetch_wait_max_time_ms(t(), non_neg_integer()) :: t()
+  def set_scanner_log_fetch_wait_max_time_ms(%__MODULE__{} = config, wait_ms)
+      when is_non_neg_integer(wait_ms),
+      do: %{config | scanner_log_fetch_wait_max_time_ms: wait_ms}
+
+  @spec set_scanner_log_max_poll_records(t(), non_neg_integer()) :: t()
+  def set_scanner_log_max_poll_records(%__MODULE__{} = config, num)
+      when is_non_neg_integer(num),
+      do: %{config | scanner_log_max_poll_records: num}
+
+  @spec set_scanner_remote_log_prefetch_num(t(), non_neg_integer()) :: t()
+  def set_scanner_remote_log_prefetch_num(%__MODULE__{} = config, num)
+      when is_non_neg_integer(num),
+      do: %{config | scanner_remote_log_prefetch_num: num}
+
+  @spec set_scanner_remote_log_read_concurrency(t(), non_neg_integer()) :: t()
+  def set_scanner_remote_log_read_concurrency(%__MODULE__{} = config, concurrency)
+      when is_non_neg_integer(concurrency),
+      do: %{config | scanner_remote_log_read_concurrency: concurrency}
+
+  @spec set_security_protocol(t(), String.t()) :: t()
+  def set_security_protocol(%__MODULE__{} = config, protocol) when is_binary(protocol),
+    do: %{config | security_protocol: protocol}
+
+  @spec set_security_sasl_mechanism(t(), String.t()) :: t()
+  def set_security_sasl_mechanism(%__MODULE__{} = config, mechanism) when is_binary(mechanism),
+    do: %{config | security_sasl_mechanism: mechanism}
+
+  @spec set_security_sasl_password(t(), String.t()) :: t()
+  def set_security_sasl_password(%__MODULE__{} = config, pass) when is_binary(pass),
+    do: %{config | security_sasl_password: pass}
+
+  @spec set_security_sasl_username(t(), String.t()) :: t()
+  def set_security_sasl_username(%__MODULE__{} = config, username) when is_binary(username),
+    do: %{config | security_sasl_username: username}
+
+  @spec set_writer_acks(t(), String.t()) :: t()
+  def set_writer_acks(%__MODULE__{} = config, acks) when is_binary(acks),
+    do: %{config | writer_acks: acks}
+
+  @spec set_writer_batch_size(t(), non_neg_integer()) :: t()
+  def set_writer_batch_size(%__MODULE__{} = config, size) when is_non_neg_integer(size),
+    do: %{config | writer_batch_size: size}
+
+  @spec set_writer_batch_timeout_ms(t(), non_neg_integer()) :: t()
+  def set_writer_batch_timeout_ms(%__MODULE__{} = config, ms) when is_non_neg_integer(ms),
+    do: %{config | writer_batch_timeout_ms: ms}
+
+  @spec set_writer_bucket_no_key_assigner(t(), :sticky | :round_robin) :: t()
+  def set_writer_bucket_no_key_assigner(%__MODULE__{} = config, assigner)
+      when assigner in [:sticky, :round_robin],
+      do: %{config | writer_bucket_no_key_assigner: assigner}
+
+  @spec set_writer_buffer_memory_size(t(), non_neg_integer()) :: t()
+  def set_writer_buffer_memory_size(%__MODULE__{} = config, size)
+      when is_non_neg_integer(size),
+      do: %{config | writer_buffer_memory_size: size}
+
+  @spec set_writer_buffer_wait_timeout_ms(t(), non_neg_integer()) :: t()
+  def set_writer_buffer_wait_timeout_ms(%__MODULE__{} = config, ms)
+      when is_non_neg_integer(ms),
+      do: %{config | writer_buffer_wait_timeout_ms: ms}
+
+  @spec set_writer_dynamic_batch_size_enabled(t(), boolean()) :: t()
+  def set_writer_dynamic_batch_size_enabled(%__MODULE__{} = config, enabled)
+      when is_boolean(enabled),
+      do: %{config | writer_dynamic_batch_size_enabled: enabled}
+
+  @spec set_writer_dynamic_batch_size_min(t(), non_neg_integer()) :: t()
+  def set_writer_dynamic_batch_size_min(%__MODULE__{} = config, size)
+      when is_non_neg_integer(size),
+      do: %{config | writer_dynamic_batch_size_min: size}
+
+  @spec set_writer_enable_idempotence(t(), boolean()) :: t()
+  def set_writer_enable_idempotence(%__MODULE__{} = config, enabled)
+      when is_boolean(enabled),
+      do: %{config | writer_enable_idempotence: enabled}
+
+  @spec set_writer_max_inflight_requests_per_bucket(t(), non_neg_integer()) :: t()
+  def set_writer_max_inflight_requests_per_bucket(%__MODULE__{} = config, n)
+      when is_non_neg_integer(n),
+      do: %{config | writer_max_inflight_requests_per_bucket: n}
+
+  @spec set_writer_request_max_size(t(), non_neg_integer()) :: t()
+  def set_writer_request_max_size(%__MODULE__{} = config, size)
+      when is_non_neg_integer(size),
+      do: %{config | writer_request_max_size: size}
+
+  @spec set_writer_retries(t(), non_neg_integer()) :: t()
+  def set_writer_retries(%__MODULE__{} = config, n) when is_non_neg_integer(n),
+    do: %{config | writer_retries: n}
+
+  @doc "Returns the configured `bootstrap_servers` string."
+  @spec get_bootstrap_servers(t()) :: String.t()
+  def get_bootstrap_servers(%__MODULE__{bootstrap_servers: servers}), do: servers
+end
+
+# Custom Inspect so that a config can be logged or printed without exposing
+# the SASL password: a non-nil password is shown as "[REDACTED]".
+defimpl Inspect, for: Fluss.Config do
+  import Inspect.Algebra
+
+  def inspect(%Fluss.Config{} = config, opts) do
+    fields =
+      config
+      |> Map.from_struct()
+      |> Map.update!(:security_sasl_password, &redact/1)
+      |> Map.to_list()
+
+    container_doc("%Fluss.Config{", fields, "}", opts, &field_doc/2, separator: ",")
+  end
+
+  # Renders a single `key: value` entry.
+  defp field_doc({key, value}, opts) do
+    concat([Atom.to_string(key), ": ", to_doc(value, opts)])
+  end
+
+  # An unset password stays nil so it remains distinguishable from a set one.
+  defp redact(nil), do: nil
+  defp redact(_), do: "[REDACTED]"
+end
diff --git a/fluss-rust/bindings/elixir/lib/fluss/connection.ex b/fluss-rust/bindings/elixir/lib/fluss/connection.ex
new file mode 100644
index 0000000000..a56c72a664
--- /dev/null
+++ b/fluss-rust/bindings/elixir/lib/fluss/connection.ex
@@ -0,0 +1,51 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+defmodule Fluss.Connection do
+  @moduledoc """
+  A connection to a Fluss cluster.
+
+  Errors are per-operation, not per-connection. If the server becomes
+  unreachable, individual calls fail but the connection recovers
+  transparently — there is no need to recreate it.
+
+  ## Examples
+
+      config = Fluss.Config.new("localhost:9123")
+      {:ok, conn} = Fluss.Connection.new(config)
+
+  """
+
+  alias Fluss.Native
+
+  @type t :: reference()
+
+  @doc """
+  Opens a connection from `config` and blocks until the NIF's reply message arrives.
+  """
+  @spec new(Fluss.Config.t()) :: {:ok, t()} | {:error, Fluss.Error.t()}
+  def new(%Fluss.Config{} = config), do: Native.await_nif(Native.connection_new(config))
+
+  @doc "Like `new/1`, but returns the connection itself and raises `Fluss.Error` on failure."
+  @spec new!(Fluss.Config.t()) :: t()
+  def new!(%Fluss.Config{} = config) do
+    case new(config) do
+      {:error, %Fluss.Error{} = err} -> raise err
+      {:ok, conn} -> conn
+    end
+  end
+end
diff --git a/fluss-rust/bindings/elixir/lib/fluss/error.ex b/fluss-rust/bindings/elixir/lib/fluss/error.ex
new file mode 100644
index 0000000000..fe5d1ca8b4
--- /dev/null
+++ b/fluss-rust/bindings/elixir/lib/fluss/error.ex
@@ -0,0 +1,127 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+defmodule Fluss.Error do
+  @moduledoc """
+  Structured error returned from Fluss operations.
+
+  Fields:
+
+    * `:code` — stable atom for pattern matching.
+    * `:error_code` — raw integer code. Protocol codes `0..57`, `-1` for
+      `:unknown_server_error`, `-2` for `:client_error`.
+    * `:message` — human-readable description.
+
+  Also an exception, so `raise err` works.
+
+  `:client_error` covers any failure that didn't come from the server API
+  (bad input, transport, I/O, decode, consumed write handle, etc.) and is
+  not retriable, matching the Python and C++ bindings.
+  """
+
+  defexception [:code, :error_code, :message]
+
+  @typedoc "Error code atom."
+  @type code ::
+          :none
+          | :unknown_server_error
+          | :network_exception
+          | :unsupported_version
+          | :corrupt_message
+          | :database_not_exist
+          | :database_not_empty
+          | :database_already_exist
+          | :table_not_exist
+          | :table_already_exist
+          | :schema_not_exist
+          | :log_storage_exception
+          | :kv_storage_exception
+          | :not_leader_or_follower
+          | :record_too_large_exception
+          | :corrupt_record_exception
+          | :invalid_table_exception
+          | :invalid_database_exception
+          | :invalid_replication_factor
+          | :invalid_required_acks
+          | :log_offset_out_of_range_exception
+          | :non_primary_key_table_exception
+          | :unknown_table_or_bucket_exception
+          | :invalid_update_version_exception
+          | :invalid_coordinator_exception
+          | :fenced_leader_epoch_exception
+          | :request_time_out
+          | :storage_exception
+          | :operation_not_attempted_exception
+          | :not_enough_replicas_after_append_exception
+          | :not_enough_replicas_exception
+          | :security_token_exception
+          | :out_of_order_sequence_exception
+          | :duplicate_sequence_exception
+          | :unknown_writer_id_exception
+          | :invalid_column_projection
+          | :invalid_target_column
+          | :partition_not_exists
+          | :table_not_partitioned_exception
+          | :invalid_timestamp_exception
+          | :invalid_config_exception
+          | :lake_storage_not_configured_exception
+          | :kv_snapshot_not_exist
+          | :partition_already_exists
+          | :partition_spec_invalid_exception
+          | :leader_not_available_exception
+          | :partition_max_num_exception
+          | :authenticate_exception
+          | :security_disabled_exception
+          | :authorization_exception
+          | :bucket_max_num_exception
+          | :fenced_tiering_epoch_exception
+          | :retriable_authenticate_exception
+          | :invalid_server_rack_info_exception
+          | :lake_snapshot_not_exist
+          | :lake_table_already_exist
+          | :ineligible_replica_exception
+          | :invalid_alter_table_exception
+          | :deletion_disabled_exception
+          | :client_error
+
+  @type t :: %__MODULE__{code: code(), error_code: integer(), message: String.t()}
+
+  @retriable_codes [
+    :network_exception,
+    :corrupt_message,
+    :schema_not_exist,
+    :log_storage_exception,
+    :kv_storage_exception,
+    :not_leader_or_follower,
+    :corrupt_record_exception,
+    :unknown_table_or_bucket_exception,
+    :request_time_out,
+    :storage_exception,
+    :not_enough_replicas_after_append_exception,
+    :not_enough_replicas_exception,
+    :leader_not_available_exception
+  ]
+
+  # Exception callback: renders the code atom in brackets, followed by the raw message.
+  @impl true
+  def message(%__MODULE__{} = error), do: "Fluss error [#{error.code}]: #{error.message}"
+
+  # Any code missing from `@retriable_codes` (e.g. `:client_error`) yields `false`.
+  @doc "Returns `true` if retrying the operation may succeed."
+  @spec retriable?(t()) :: boolean()
+  def retriable?(%__MODULE__{code: code}), do: Enum.member?(@retriable_codes, code)
+end
diff --git a/fluss-rust/bindings/elixir/lib/fluss/log_scanner.ex b/fluss-rust/bindings/elixir/lib/fluss/log_scanner.ex
new file mode 100644
index 0000000000..fca4168c3e
--- /dev/null
+++ b/fluss-rust/bindings/elixir/lib/fluss/log_scanner.ex
@@ -0,0 +1,97 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+defmodule Fluss.LogScanner do
+  @moduledoc """
+  Scanner for reading records from a log table.
+
+  `poll/2` is non-blocking — it returns `:ok` immediately and sends results
+  as `{:fluss_records, records}` or `{:fluss_poll_error, %Fluss.Error{}}` to
+  the calling process. No dirty scheduler threads are held during the wait.
+
+  Each record is an atom-keyed map: `:offset`, `:timestamp`, `:change_type`, `:row`.
+  Row values are also atom-keyed (column names interned as atoms).
+
+  ## Examples
+
+      scanner = Fluss.LogScanner.new!(table)
+      :ok = Fluss.LogScanner.subscribe(scanner, 0, Fluss.earliest_offset())
+      :ok = Fluss.LogScanner.poll(scanner, 5_000)
+
+      receive do
+        {:fluss_records, records} ->
+          for record <- records, do: IO.inspect(record[:row])
+        {:fluss_poll_error, %Fluss.Error{code: code, message: msg}} ->
+          IO.puts("poll error [\#{code}]: \#{msg}")
+      end
+
+  """
+
+  alias Fluss.Native
+
+  @type t :: reference()
+  @type record :: %{atom() => term()}
+
+  @doc "Creates a scanner for `table`; a non-error NIF result is wrapped as `{:ok, scanner}`."
+  @spec new(Fluss.Table.t()) :: {:ok, t()} | {:error, Fluss.Error.t()}
+  def new(table), do: table |> Native.log_scanner_new() |> tag_ok()
+
+  # Error tuples pass through untouched; any other term is taken to be the handle.
+  defp tag_ok({:error, _} = err), do: err
+  defp tag_ok(scanner), do: {:ok, scanner}
+
+  @doc "Like `new/1`, but returns the scanner itself and raises `Fluss.Error` on failure."
+  @spec new!(Fluss.Table.t()) :: t()
+  def new!(table) do
+    case new(table) do
+      {:error, %Fluss.Error{} = err} -> raise err
+      {:ok, scanner} -> scanner
+    end
+  end
+
+  @doc "Subscribes to `bucket` starting at `offset`, blocking until the NIF replies."
+  @spec subscribe(t(), integer(), integer()) :: :ok | {:error, Fluss.Error.t()}
+  def subscribe(scanner, bucket, offset) do
+    reply_ref = Native.log_scanner_subscribe(scanner, bucket, offset)
+    Native.await_nif(reply_ref)
+  end
+
+  @doc """
+  Subscribes to multiple buckets. Takes a list of `{bucket_id, offset}` tuples.
+  """
+  @spec subscribe_buckets(t(), [{integer(), integer()}]) :: :ok | {:error, Fluss.Error.t()}
+  def subscribe_buckets(scanner, bucket_offsets) when is_list(bucket_offsets) do
+    reply_ref = Native.log_scanner_subscribe_buckets(scanner, bucket_offsets)
+    Native.await_nif(reply_ref)
+  end
+
+  @doc "Removes the subscription for `bucket`, blocking until the NIF replies."
+  @spec unsubscribe(t(), integer()) :: :ok | {:error, Fluss.Error.t()}
+  def unsubscribe(scanner, bucket) do
+    reply_ref = Native.log_scanner_unsubscribe(scanner, bucket)
+    Native.await_nif(reply_ref)
+  end
+
+  @doc """
+  Starts a non-blocking poll. Returns `:ok` immediately.
+  Results arrive later as a message to the calling process: either
+  `{:fluss_records, [record]}` or `{:fluss_poll_error, %Fluss.Error{}}`.
+  `timeout_ms` is forwarded unchanged to the NIF.
+  """
+  @spec poll(t(), non_neg_integer()) :: :ok
+  def poll(scanner, timeout_ms), do: Native.log_scanner_poll(scanner, timeout_ms)
+end
diff --git a/fluss-rust/bindings/elixir/lib/fluss/native.ex b/fluss-rust/bindings/elixir/lib/fluss/native.ex
new file mode 100644
index 0000000000..865dda142d
--- /dev/null
+++ b/fluss-rust/bindings/elixir/lib/fluss/native.ex
@@ -0,0 +1,80 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+defmodule Fluss.Native do
+  @moduledoc false
+  use Rustler, otp_app: :fluss, crate: "fluss_nif"
+
+  # Connection (the body of every stub in this module is the :nif_not_loaded fallback)
+  def connection_new(_config), do: :erlang.nif_error(:nif_not_loaded)
+
+  # Admin — argument order matches the `admin_*` functions in native/fluss_nif/src/admin.rs
+  def admin_new(_conn), do: :erlang.nif_error(:nif_not_loaded)
+
+  def admin_create_database(_admin, _name, _ignore_if_exists),
+    do: :erlang.nif_error(:nif_not_loaded)
+
+  def admin_drop_database(_admin, _name, _ignore_if_not_exists),
+    do: :erlang.nif_error(:nif_not_loaded)
+
+  def admin_list_databases(_admin), do: :erlang.nif_error(:nif_not_loaded)
+
+  def admin_create_table(_admin, _db, _table, _descriptor, _ignore_if_exists),
+    do: :erlang.nif_error(:nif_not_loaded)
+
+  def admin_drop_table(_admin, _db, _table, _ignore_if_not_exists),
+    do: :erlang.nif_error(:nif_not_loaded)
+
+  def admin_list_tables(_admin, _database), do: :erlang.nif_error(:nif_not_loaded)
+
+  # Schema / TableDescriptor — Fluss.TableDescriptor.new!/2 uses the return value directly
+  def table_descriptor_new(_schema, _bucket_count, _properties),
+    do: :erlang.nif_error(:nif_not_loaded)
+
+  # Table — Fluss.Table awaits table_get via await_nif/1; the other two are used directly
+  def table_get(_conn, _database, _table), do: :erlang.nif_error(:nif_not_loaded)
+  def table_has_primary_key(_table), do: :erlang.nif_error(:nif_not_loaded)
+  def table_column_names(_table), do: :erlang.nif_error(:nif_not_loaded)
+
+  # AppendWriter
+  def append_writer_new(_table), do: :erlang.nif_error(:nif_not_loaded)
+  def append_writer_append(_writer, _values), do: :erlang.nif_error(:nif_not_loaded)
+  def append_writer_flush(_writer), do: :erlang.nif_error(:nif_not_loaded)
+
+  # LogScanner — Fluss.LogScanner awaits the subscribe/unsubscribe calls; new and poll are direct
+  def log_scanner_new(_table), do: :erlang.nif_error(:nif_not_loaded)
+  def log_scanner_subscribe(_scanner, _bucket, _offset), do: :erlang.nif_error(:nif_not_loaded)
+
+  def log_scanner_subscribe_buckets(_scanner, _bucket_offsets),
+    do: :erlang.nif_error(:nif_not_loaded)
+
+  def log_scanner_unsubscribe(_scanner, _bucket), do: :erlang.nif_error(:nif_not_loaded)
+  def log_scanner_poll(_scanner, _timeout_ms), do: :erlang.nif_error(:nif_not_loaded)
+
+  # WriteHandle — Fluss.WriteHandle.wait/1 awaits the result via await_nif/1
+  def write_handle_wait(_handle), do: :erlang.nif_error(:nif_not_loaded)
+
+  # Constants
+  def earliest_offset, do: :erlang.nif_error(:nif_not_loaded)
+
+  @doc false
+  def await_nif(ref) do
+    receive do
+      {^ref, result} -> result
+    end
+  end
+end
diff --git a/fluss-rust/bindings/elixir/lib/fluss/schema.ex b/fluss-rust/bindings/elixir/lib/fluss/schema.ex
new file mode 100644
index 0000000000..e11911eee7
--- /dev/null
+++ b/fluss-rust/bindings/elixir/lib/fluss/schema.ex
@@ -0,0 +1,74 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+defmodule Fluss.Schema do
+  @moduledoc """
+  Schema definition for a Fluss table.
+
+  Simple types: `:boolean`, `:tinyint`, `:smallint`, `:int`, `:bigint`,
+  `:float`, `:double`, `:string`, `:bytes`, `:date`, `:time`, `:timestamp`, `:timestamp_ltz`
+
+  Parameterized types: `{:decimal, precision, scale}`, `{:char, length}`, `{:binary, length}`
+
+  ## Examples
+
+      schema =
+        Fluss.Schema.new()
+        |> Fluss.Schema.column("id", :int)
+        |> Fluss.Schema.column("name", :string)
+        |> Fluss.Schema.column("amount", {:decimal, 10, 2})
+  """
+
+  defstruct columns: [], primary_key: []
+
+  @type data_type ::
+          :boolean
+          | :tinyint
+          | :smallint
+          | :int
+          | :bigint
+          | :float
+          | :double
+          | :string
+          | :bytes
+          | :date
+          | :time
+          | :timestamp
+          | :timestamp_ltz
+          | {:decimal, non_neg_integer(), non_neg_integer()}
+          | {:char, non_neg_integer()}
+          | {:binary, non_neg_integer()}
+
+  @type t :: %__MODULE__{
+          columns: [{String.t(), data_type()}],
+          primary_key: [String.t()]
+        }
+
+  @doc "Returns an empty schema: no columns and no primary key."
+  @spec new() :: t()
+  def new, do: struct(__MODULE__)
+
+  @doc "Appends a `{name, data_type}` column after the columns already present."
+  @spec column(t(), String.t(), data_type()) :: t()
+  def column(%__MODULE__{columns: existing} = schema, name, data_type) when is_binary(name),
+    do: %{schema | columns: existing ++ [{name, data_type}]}
+
+  @doc "Replaces the schema's primary key with `keys`."
+  @spec primary_key(t(), [String.t()]) :: t()
+  def primary_key(%__MODULE__{} = schema, keys) when is_list(keys),
+    do: Map.replace!(schema, :primary_key, keys)
+end
diff --git a/fluss-rust/bindings/elixir/lib/fluss/table.ex b/fluss-rust/bindings/elixir/lib/fluss/table.ex
new file mode 100644
index 0000000000..c934fc0c01
--- /dev/null
+++ b/fluss-rust/bindings/elixir/lib/fluss/table.ex
@@ -0,0 +1,48 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+defmodule Fluss.Table do
+  @moduledoc """
+  A handle to a Fluss table, used to create writers and scanners.
+  """
+
+  alias Fluss.Native
+
+  @type t :: reference()
+
+  @doc "Looks up `table` in `database` over `conn`, blocking until the NIF replies."
+  @spec get(Fluss.Connection.t(), String.t(), String.t()) ::
+          {:ok, t()} | {:error, Fluss.Error.t()}
+  def get(conn, database, table), do: Native.await_nif(Native.table_get(conn, database, table))
+
+  @doc "Like `get/3`, but returns the table itself and raises `Fluss.Error` on failure."
+  @spec get!(Fluss.Connection.t(), String.t(), String.t()) :: t()
+  def get!(conn, database, table) do
+    case get(conn, database, table) do
+      {:error, %Fluss.Error{} = err} -> raise err
+      {:ok, handle} -> handle
+    end
+  end
+
+  @doc "Returns the NIF's answer to whether `table` has a primary key."
+  @spec has_primary_key?(t()) :: boolean()
+  def has_primary_key?(table), do: Native.table_has_primary_key(table)
+
+  @doc "Returns the column names the NIF reports for `table`."
+  @spec column_names(t()) :: [String.t()]
+  def column_names(table), do: Native.table_column_names(table)
+end
diff --git a/fluss-rust/bindings/elixir/lib/fluss/table_descriptor.ex b/fluss-rust/bindings/elixir/lib/fluss/table_descriptor.ex
new file mode 100644
index 0000000000..b95b5a503b
--- /dev/null
+++ b/fluss-rust/bindings/elixir/lib/fluss/table_descriptor.ex
@@ -0,0 +1,45 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+defmodule Fluss.TableDescriptor do
+  @moduledoc """
+  Descriptor for creating a Fluss table.
+
+  Options: `:bucket_count`, `:properties` (list of `{key, value}` string tuples).
+
+  ## Examples
+
+      Fluss.TableDescriptor.new!(schema)
+      Fluss.TableDescriptor.new!(schema, bucket_count: 3)
+  """
+
+  alias Fluss.Native
+
+  @type t :: reference()
+
+  @doc "Builds the descriptor through the NIF; raises the returned `Fluss.Error` on failure."
+  @spec new!(Fluss.Schema.t(), keyword()) :: t()
+  def new!(%Fluss.Schema{} = schema, opts \\ []) do
+    buckets = Keyword.get(opts, :bucket_count)
+    props = Keyword.get(opts, :properties, [])
+
+    case Native.table_descriptor_new(schema, buckets, props) do
+      {:error, %Fluss.Error{} = err} -> raise err
+      descriptor -> descriptor
+    end
+  end
+end
diff --git a/fluss-rust/bindings/elixir/lib/fluss/write_handle.ex b/fluss-rust/bindings/elixir/lib/fluss/write_handle.ex
new file mode 100644
index 0000000000..f5f16591f4
--- /dev/null
+++ b/fluss-rust/bindings/elixir/lib/fluss/write_handle.ex
@@ -0,0 +1,44 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+defmodule Fluss.WriteHandle do
+  @moduledoc """
+  Handle for a pending write operation.
+
+  Returned by `Fluss.AppendWriter.append/2`. Drop for fire-and-forget,
+  or call `wait/1` for per-record server acknowledgment.
+  """
+
+  alias Fluss.Native
+
+  @type t :: reference()
+
+  @doc """
+  Blocks until the NIF reports the outcome of the write behind `handle`.
+  """
+  @spec wait(t()) :: :ok | {:error, Fluss.Error.t()}
+  def wait(handle), do: Native.await_nif(Native.write_handle_wait(handle))
+
+  @doc "Like `wait/1`, but raises the `Fluss.Error` instead of returning it."
+  @spec wait!(t()) :: :ok
+  def wait!(handle) do
+    case wait(handle) do
+      {:error, %Fluss.Error{} = err} -> raise err
+      :ok -> :ok
+    end
+  end
+end
diff --git a/fluss-rust/bindings/elixir/mix.exs b/fluss-rust/bindings/elixir/mix.exs
new file mode 100644
index 0000000000..b83e9f94b9
--- /dev/null
+++ b/fluss-rust/bindings/elixir/mix.exs
@@ -0,0 +1,61 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+defmodule Fluss.MixProject do
+  use Mix.Project
+
+  @version "1.0.0"
+
+  def project do
+    env = Mix.env()
+
+    [
+      app: :fluss,
+      version: @version,
+      elixir: "~> 1.15",
+      start_permanent: env == :prod,
+      elixirc_paths: elixirc_paths(env),
+      deps: deps(),
+      description: "Elixir client for Apache Fluss",
+      package: package()
+    ]
+  end
+
+  # OTP application settings; only `:logger` is requested as an extra application.
+  def application, do: [extra_applications: [:logger]]
+
+  # Test builds additionally compile the helpers under test/support.
+  defp elixirc_paths(:test), do: ["lib", "test/support"]
+  defp elixirc_paths(_), do: ["lib"]
+
+  # ex_doc and credo are tooling-only (`runtime: false`); rustler is the only other dependency.
+  defp deps do
+    [
+      {:rustler, "~> 0.37"},
+      {:ex_doc, "~> 0.31", only: :dev, runtime: false},
+      {:credo, "~> 1.7", only: [:dev, :test], runtime: false}
+    ]
+  end
+
+  # Hex package metadata.
+  defp package do
+    [
+      licenses: ["Apache-2.0"],
+      links: %{"GitHub" => "https://github.com/apache/fluss-rust"}
+    ]
+  end
+end
diff --git a/fluss-rust/bindings/elixir/mix.lock b/fluss-rust/bindings/elixir/mix.lock
new file mode 100644
index 0000000000..b1170d3f5b
--- /dev/null
+++ b/fluss-rust/bindings/elixir/mix.lock
@@ -0,0 +1,13 @@
+%{
+  "bunt": {:hex, :bunt, "1.0.0", "081c2c665f086849e6d57900292b3a161727ab40431219529f13c4ddcf3e7a44", [:mix], [], "hexpm", "dc5f86aa08a5f6fa6b8096f0735c4e76d54ae5c9fa2c143e5a1fc7c1cd9bb6b5"},
+  "credo": {:hex, :credo, "1.7.17", "f92b6aa5b26301eaa5a35e4d48ebf5aa1e7094ac00ae38f87086c562caf8a22f", [:mix], [{:bunt, "~> 0.2.1 or ~> 1.0", [hex: :bunt, repo: "hexpm", optional: false]}, {:file_system, "~> 0.2 or ~> 1.0", [hex: :file_system, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}], "hexpm", "1eb5645c835f0b6c9b5410f94b5a185057bcf6d62a9c2b476da971cde8749645"},
+  "earmark_parser": {:hex, :earmark_parser, "1.4.44", "f20830dd6b5c77afe2b063777ddbbff09f9759396500cdbe7523efd58d7a339c", [:mix], [], "hexpm", "4778ac752b4701a5599215f7030989c989ffdc4f6df457c5f36938cc2d2a2750"},
+  "ex_doc": {:hex, :ex_doc, "0.40.1", "67542e4b6dde74811cfd580e2c0149b78010fd13001fda7cfeb2b2c2ffb1344d", [:mix], [{:earmark_parser, "~> 1.4.44", [hex: :earmark_parser, repo: "hexpm", optional: false]}, {:makeup_c, ">= 0.1.0", [hex: :makeup_c, repo: "hexpm", optional: true]}, {:makeup_elixir, "~> 0.14 or ~> 1.0", [hex: :makeup_elixir, repo: "hexpm", optional: false]}, {:makeup_erlang, "~> 0.1 or ~> 1.0", [hex: :makeup_erlang, repo: "hexpm", optional: false]}, {:makeup_html, ">= 0.1.0", [hex: :makeup_html, repo: "hexpm", optional: true]}], "hexpm", "bcef0e2d360d93ac19f01a85d58f91752d930c0a30e2681145feea6bd3516e00"},
+  "file_system": {:hex, :file_system, "1.1.1", "31864f4685b0148f25bd3fbef2b1228457c0c89024ad67f7a81a3ffbc0bbad3a", [:mix], [], "hexpm", "7a15ff97dfe526aeefb090a7a9d3d03aa907e100e262a0f8f7746b78f8f87a5d"},
+  "jason": {:hex, :jason, "1.4.4", "b9226785a9aa77b6857ca22832cffa5d5011a667207eb2a0ad56adb5db443b8a", [:mix], [{:decimal, "~> 1.0 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm", "c5eb0cab91f094599f94d55bc63409236a8ec69a21a67814529e8d5f6cc90b3b"},
+  "makeup": {:hex, :makeup, "1.2.1", "e90ac1c65589ef354378def3ba19d401e739ee7ee06fb47f94c687016e3713d1", [:mix], [{:nimble_parsec, "~> 1.4", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "d36484867b0bae0fea568d10131197a4c2e47056a6fbe84922bf6ba71c8d17ce"},
+  "makeup_elixir": {:hex, :makeup_elixir, "1.0.1", "e928a4f984e795e41e3abd27bfc09f51db16ab8ba1aebdba2b3a575437efafc2", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}, {:nimble_parsec, "~> 1.2.3 or ~> 1.3", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "7284900d412a3e5cfd97fdaed4f5ed389b8f2b4cb49efc0eb3bd10e2febf9507"},
+  "makeup_erlang": {:hex, :makeup_erlang, "1.0.3", "4252d5d4098da7415c390e847c814bad3764c94a814a0b4245176215615e1035", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}], "hexpm", "953297c02582a33411ac6208f2c6e55f0e870df7f80da724ed613f10e6706afd"},
+  "nimble_parsec": {:hex, :nimble_parsec, "1.4.2", "8efba0122db06df95bfaa78f791344a89352ba04baedd3849593bfce4d0dc1c6", [:mix], [], "hexpm", "4b21398942dda052b403bbe1da991ccd03a053668d147d53fb8c4e0efe09c973"},
+  "rustler": {:hex, :rustler, "0.37.3", "5f4e6634d43b26f0a69834dd1d3ed4e1710b022a053bf4a670220c9540c92602", [:mix], [{:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}], "hexpm", "a6872c6f53dcf00486d1e7f9e046e20e01bf1654bdacc4193016c2e8002b32a2"},
+}
diff --git a/fluss-rust/bindings/elixir/native/fluss_nif/Cargo.toml b/fluss-rust/bindings/elixir/native/fluss_nif/Cargo.toml
new file mode 100644
index 0000000000..dd4d453506
--- /dev/null
+++ b/fluss-rust/bindings/elixir/native/fluss_nif/Cargo.toml
@@ -0,0 +1,34 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+[package]
+name = "fluss_nif"
+version.workspace = true
+edition.workspace = true
+license.workspace = true
+rust-version.workspace = true
+
+[lib]
+name = "fluss_nif"
+path = "src/lib.rs"
+crate-type = ["cdylib"]
+
+[dependencies]
+bigdecimal = { workspace = true }
+fluss = { workspace = true }
+rustler = "0.37"
+tokio = { workspace = true }
diff --git a/fluss-rust/bindings/elixir/native/fluss_nif/src/admin.rs b/fluss-rust/bindings/elixir/native/fluss_nif/src/admin.rs
new file mode 100644
index 0000000000..e3f29aebcd
--- /dev/null
+++ b/fluss-rust/bindings/elixir/native/fluss_nif/src/admin.rs
@@ -0,0 +1,121 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::async_nif;
+use crate::atoms::to_nif_err;
+use crate::connection::ConnectionResource;
+use crate::schema::TableDescriptorResource;
+use fluss::client::FlussAdmin;
+use fluss::metadata::TablePath;
+use rustler::{Env, ResourceArc, Term};
+use std::sync::Arc;
+
+pub struct AdminResource {
+    pub inner: Arc<FlussAdmin>, // admin client the `admin_*` NIFs in this file call into
+}
+
+impl std::panic::RefUnwindSafe for AdminResource {} // NOTE(review): asserted by hand; confirm
+
+#[rustler::resource_impl]
+impl rustler::Resource for AdminResource {} // empty impl: no trait methods are overridden
+
+#[rustler::nif]
+fn admin_new(
+    conn: ResourceArc<ConnectionResource>,
+) -> Result<ResourceArc<AdminResource>, rustler::Error> {
+    let inner = conn.inner.get_admin().map_err(to_nif_err)?; // sync call; `?` exits on failure
+    Ok(ResourceArc::new(AdminResource { inner })) // wrap the admin in a rustler resource
+}
+
+#[rustler::nif]
+fn admin_create_database<'a>(
+    env: Env<'a>,
+    admin: ResourceArc<AdminResource>,
+    database_name: String,
+    ignore_if_exists: bool,
+) -> Term<'a> {
+    async_nif::spawn_task(env, async move {
+        admin
+            .inner
+            .create_database(&database_name, None, ignore_if_exists) // 2nd arg: always `None`
+            .await
+    })
+}
+
+#[rustler::nif]
+fn admin_drop_database<'a>(
+    env: Env<'a>,
+    admin: ResourceArc<AdminResource>,
+    database_name: String,
+    ignore_if_not_exists: bool,
+) -> Term<'a> {
+    async_nif::spawn_task(env, async move {
+        admin
+            .inner
+            .drop_database(&database_name, ignore_if_not_exists, false) // 3rd arg: always `false`
+            .await
+    })
+}
+
+#[rustler::nif] // uses `spawn_task_with_result`; the create/drop NIFs use `spawn_task`
+fn admin_list_databases<'a>(env: Env<'a>, admin: ResourceArc<AdminResource>) -> Term<'a> {
+    async_nif::spawn_task_with_result(env, async move { admin.inner.list_databases().await })
+}
+
+#[rustler::nif]
+fn admin_create_table<'a>(
+    env: Env<'a>,
+    admin: ResourceArc<AdminResource>,
+    database_name: String,
+    table_name: String,
+    descriptor: ResourceArc<TableDescriptorResource>,
+    ignore_if_exists: bool,
+) -> Term<'a> {
+    async_nif::spawn_task(env, async move {
+        let path = TablePath::new(&database_name, &table_name); // constructed inside the future
+        admin
+            .inner
+            .create_table(&path, &descriptor.inner, ignore_if_exists)
+            .await
+    })
+}
+
+#[rustler::nif]
+fn admin_drop_table<'a>(
+    env: Env<'a>,
+    admin: ResourceArc<AdminResource>,
+    database_name: String,
+    table_name: String,
+    ignore_if_not_exists: bool,
+) -> Term<'a> {
+    async_nif::spawn_task(env, async move {
+        let path = TablePath::new(&database_name, &table_name); // constructed inside the future
+        admin.inner.drop_table(&path, ignore_if_not_exists).await
+    })
+}
+
+#[rustler::nif] // uses `spawn_task_with_result`; the create/drop NIFs use `spawn_task`
+fn admin_list_tables<'a>(
+    env: Env<'a>,
+    admin: ResourceArc<AdminResource>,
+    database_name: String,
+) -> Term<'a> {
+    async_nif::spawn_task_with_result(
+        env,
+        async move { admin.inner.list_tables(&database_name).await },
+    )
+}
diff --git a/fluss-rust/bindings/elixir/native/fluss_nif/src/append_writer.rs b/fluss-rust/bindings/elixir/native/fluss_nif/src/append_writer.rs
new file mode 100644
index 0000000000..f26884419e
--- /dev/null
+++ b/fluss-rust/bindings/elixir/native/fluss_nif/src/append_writer.rs
@@ -0,0 +1,69 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::RUNTIME;
+use crate::async_nif;
+use crate::atoms::{client_err, to_nif_err};
+use crate::row_convert;
+use crate::table::TableResource;
+use crate::write_handle::WriteHandleResource;
+use fluss::client::AppendWriter;
+use fluss::metadata::Column;
+use rustler::{Env, ResourceArc, Term};
+
+/// NIF resource that owns a Fluss `AppendWriter` together with the column
+/// list captured from the table schema when the writer was created.
+pub struct AppendWriterResource {
+    /// Underlying append writer from the core client.
+    pub inner: AppendWriter,
+    /// Schema columns used to convert Elixir terms into rows on append.
+    pub columns: Vec<Column>,
+}
+
+// Manual opt-in needed because Rustler wraps NIF bodies in `catch_unwind`;
+// the rationale is spelled out in the comment at the top of `lib.rs`.
+impl std::panic::RefUnwindSafe for AppendWriterResource {}
+
+#[rustler::resource_impl]
+impl rustler::Resource for AppendWriterResource {}
+
+/// Builds an append writer for `table` and wraps it in a NIF resource.
+///
+/// The table's schema columns are copied into the resource so later appends
+/// can convert terms to rows. Core errors are converted with `to_nif_err`.
+#[rustler::nif]
+fn append_writer_new(
+    table: ResourceArc<TableResource>,
+) -> Result<ResourceArc<AppendWriterResource>, rustler::Error> {
+    // WriterClient::new() calls tokio::spawn internally, so a runtime context
+    // must be entered for the duration of this call.
+    let _runtime_guard = RUNTIME.enter();
+
+    let (writer, schema_columns) = table.with_table(|tbl| {
+        let append = tbl.new_append().map_err(to_nif_err)?;
+        let writer = append.create_writer().map_err(to_nif_err)?;
+        let schema_columns = tbl.get_table_info().schema.columns().to_vec();
+        Ok((writer, schema_columns))
+    })?;
+
+    let resource = AppendWriterResource {
+        inner: writer,
+        columns: schema_columns,
+    };
+    Ok(ResourceArc::new(resource))
+}
+
+/// Converts `values` into a row using the writer's columns and queues it.
+///
+/// Returns a write-handle resource wrapping the pending write. Conversion
+/// failures become client errors; writer failures go through `to_nif_err`.
+#[rustler::nif]
+fn append_writer_append<'a>(
+    env: Env<'a>,
+    writer: ResourceArc<AppendWriterResource>,
+    values: Term<'a>,
+) -> Result<ResourceArc<WriteHandleResource>, rustler::Error> {
+    let converted =
+        row_convert::term_to_row(env, values, &writer.columns).map_err(client_err)?;
+    let pending = writer.inner.append(&converted).map_err(to_nif_err)?;
+    let handle = WriteHandleResource::new(pending);
+    Ok(ResourceArc::new(handle))
+}
+
+/// Flushes the writer on the tokio runtime.
+///
+/// Returns a reference term immediately. The outcome is later delivered to
+/// the calling process as `{ref, :ok}` or `{ref, {:error, err}}`.
+#[rustler::nif]
+fn append_writer_flush<'a>(env: Env<'a>, writer: ResourceArc<AppendWriterResource>) -> Term<'a> {
+    let flush = async move { writer.inner.flush().await };
+    async_nif::spawn_task(env, flush)
+}
diff --git a/fluss-rust/bindings/elixir/native/fluss_nif/src/async_nif.rs b/fluss-rust/bindings/elixir/native/fluss_nif/src/async_nif.rs
new file mode 100644
index 0000000000..6b26eaaf8a
--- /dev/null
+++ b/fluss-rust/bindings/elixir/native/fluss_nif/src/async_nif.rs
@@ -0,0 +1,93 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Async NIF helpers — spawn on tokio, send `{ref, result}` back as a BEAM
+//! message instead of blocking dirty schedulers.
+
+use crate::RUNTIME;
+use crate::atoms::{self, NifFlussError};
+use fluss::error::Error as CoreError;
+use rustler::env::OwnedEnv;
+use rustler::{Encoder, Env, Term};
+use std::future::Future;
+
+fn encode_err<'a>(env: Env<'a>, err: CoreError) -> Term<'a> {
+    (atoms::error(), NifFlussError::from_core(&err)).encode(env)
+}
+
+/// Runs `future` on the shared tokio runtime and reports completion by message.
+///
+/// A fresh reference is created and returned to the caller straight away.
+/// When the future finishes, the calling process receives `{ref, :ok}` on
+/// success or `{ref, {:error, err}}` on failure. A failed send is ignored.
+pub fn spawn_task<'a, F>(env: Env<'a>, future: F) -> Term<'a>
+where
+    F: Future<Output = Result<(), CoreError>> + Send + 'static,
+{
+    let caller = env.pid();
+    let reference: Term<'a> = *env.make_ref();
+
+    // The reference is copied into an owned environment so it can travel
+    // with the task and be loaded again when the reply is built.
+    let mut reply_env = OwnedEnv::new();
+    let saved = reply_env.save(reference);
+
+    RUNTIME.spawn(async move {
+        let outcome = future.await;
+        let _ = reply_env.send_and_clear(&caller, |msg_env| {
+            let reply_ref = saved.load(msg_env);
+            let payload = match outcome {
+                Ok(()) => atoms::ok().encode(msg_env),
+                Err(err) => encode_err(msg_env, err),
+            };
+            (reply_ref, payload).encode(msg_env)
+        });
+    });
+
+    reference
+}
+
+/// Runs `future` on the shared tokio runtime and sends its value back by message.
+///
+/// A fresh reference is created and returned to the caller straight away.
+/// When the future finishes, the calling process receives
+/// `{ref, {:ok, value}}` on success or `{ref, {:error, err}}` on failure.
+/// A failed send is ignored.
+pub fn spawn_task_with_result<'a, F, T>(env: Env<'a>, future: F) -> Term<'a>
+where
+    F: Future<Output = Result<T, CoreError>> + Send + 'static,
+    T: Encoder + Send + 'static,
+{
+    let caller = env.pid();
+    let reference: Term<'a> = *env.make_ref();
+
+    // The reference is copied into an owned environment so it can travel
+    // with the task and be loaded again when the reply is built.
+    let mut reply_env = OwnedEnv::new();
+    let saved = reply_env.save(reference);
+
+    RUNTIME.spawn(async move {
+        let outcome = future.await;
+        let _ = reply_env.send_and_clear(&caller, |msg_env| {
+            let reply_ref = saved.load(msg_env);
+            let payload = match outcome {
+                Ok(value) => (atoms::ok(), value).encode(msg_env),
+                Err(err) => encode_err(msg_env, err),
+            };
+            (reply_ref, payload).encode(msg_env)
+        });
+    });
+
+    reference
+}
+
+pub fn send_client_error<'a>(env: Env<'a>, msg: &str) -> Term<'a> {
+    let pid = env.pid();
+    let ref_term: Term<'a> = *env.make_ref();
+    let mut task_env = OwnedEnv::new();
+    let saved_ref = task_env.save(ref_term);
+    let message = msg.to_string();
+
+    let _ = task_env.send_and_clear(&pid, |env| {
+        let r = saved_ref.load(env);
+        let err = NifFlussError::client(message);
+        (r, (atoms::error(), err)).encode(env)
+    });
+
+    ref_term
+}
diff --git a/fluss-rust/bindings/elixir/native/fluss_nif/src/atoms.rs b/fluss-rust/bindings/elixir/native/fluss_nif/src/atoms.rs
new file mode 100644
index 0000000000..45d5aa303a
--- /dev/null
+++ b/fluss-rust/bindings/elixir/native/fluss_nif/src/atoms.rs
@@ -0,0 +1,224 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use fluss::error::{Error as CoreError, FlussError};
+use rustler::{Atom, NifStruct};
+
+// Atoms shared between the NIF and the Elixir side. Each identifier listed
+// below becomes a zero-argument function returning the corresponding atom
+// (for example `atoms::ok()`).
+rustler::atoms! {
+    ok,
+    error,
+    nil,
+
+    // Change types
+    append_only,
+    insert,
+    update_before,
+    update_after,
+    delete,
+
+    // Poll result message tags
+    fluss_records,
+    fluss_poll_error,
+
+    // Record map keys
+    offset,
+    timestamp,
+    change_type,
+    row,
+
+    // Error code atoms (mirror of fluss::error::FlussError).
+    none,
+    unknown_server_error,
+    network_exception,
+    unsupported_version,
+    corrupt_message,
+    database_not_exist,
+    database_not_empty,
+    database_already_exist,
+    table_not_exist,
+    table_already_exist,
+    schema_not_exist,
+    log_storage_exception,
+    kv_storage_exception,
+    not_leader_or_follower,
+    record_too_large_exception,
+    corrupt_record_exception,
+    invalid_table_exception,
+    invalid_database_exception,
+    invalid_replication_factor,
+    invalid_required_acks,
+    log_offset_out_of_range_exception,
+    non_primary_key_table_exception,
+    unknown_table_or_bucket_exception,
+    invalid_update_version_exception,
+    invalid_coordinator_exception,
+    fenced_leader_epoch_exception,
+    request_time_out,
+    storage_exception,
+    operation_not_attempted_exception,
+    not_enough_replicas_after_append_exception,
+    not_enough_replicas_exception,
+    security_token_exception,
+    out_of_order_sequence_exception,
+    duplicate_sequence_exception,
+    unknown_writer_id_exception,
+    invalid_column_projection,
+    invalid_target_column,
+    partition_not_exists,
+    table_not_partitioned_exception,
+    invalid_timestamp_exception,
+    invalid_config_exception,
+    lake_storage_not_configured_exception,
+    kv_snapshot_not_exist,
+    partition_already_exists,
+    partition_spec_invalid_exception,
+    leader_not_available_exception,
+    partition_max_num_exception,
+    authenticate_exception,
+    security_disabled_exception,
+    authorization_exception,
+    bucket_max_num_exception,
+    fenced_tiering_epoch_exception,
+    retriable_authenticate_exception,
+    invalid_server_rack_info_exception,
+    lake_snapshot_not_exist,
+    lake_table_already_exist,
+    ineligible_replica_exception,
+    invalid_alter_table_exception,
+    deletion_disabled_exception,
+    client_error,
+}
+
+/// Numeric `error_code` used for errors that are neither a server API error
+/// nor an RPC transport error (see `NifFlussError::from_core` and
+/// `NifFlussError::client`).
+pub const CLIENT_ERROR_CODE: i32 = -2;
+
+// Error value encoded for Elixir as a `Fluss.Error` struct.
+//
+// `__exception__` is the marker `defexception` sets. Rustler bypasses the
+// Elixir constructor, so we must serialize it explicitly or `raise err`
+// rejects the struct at the Elixir side.
+#[derive(NifStruct)]
+#[module = "Fluss.Error"]
+pub struct NifFlussError {
+    // Atom naming the error kind (an API error atom, `network_exception`,
+    // or `client_error`).
+    pub code: Atom,
+    // Numeric code; `CLIENT_ERROR_CODE` for client-side errors.
+    pub error_code: i32,
+    // Human-readable description of the error.
+    pub message: String,
+    // Always set to `true` by the constructors in this file.
+    #[allow(non_snake_case)]
+    pub __exception__: bool,
+}
+
+impl NifFlussError {
+    /// Builds the Elixir-facing error from a core error.
+    ///
+    /// API errors keep the server's numeric code and get the matching atom,
+    /// RPC errors are reported as `:network_exception`, and everything else
+    /// is classified as a client error. The message is the error's `Display`
+    /// output in all cases.
+    pub fn from_core(error: &CoreError) -> Self {
+        let message = error.to_string();
+        match error {
+            CoreError::FlussAPIError { api_error } => Self {
+                code: api_error_atom(api_error.code),
+                error_code: api_error.code,
+                message,
+                __exception__: true,
+            },
+            // Transport failures map to `:network_exception` (Java parity,
+            // retriable).
+            CoreError::RpcError { .. } => Self {
+                code: network_exception(),
+                error_code: FlussError::NetworkException.code(),
+                message,
+                __exception__: true,
+            },
+            _ => Self::client(message),
+        }
+    }
+
+    /// Builds a client-side error (`:client_error`, `CLIENT_ERROR_CODE`)
+    /// carrying `message`.
+    pub fn client(message: String) -> Self {
+        let code = client_error();
+        Self {
+            code,
+            error_code: CLIENT_ERROR_CODE,
+            message,
+            __exception__: true,
+        }
+    }
+}
+
+fn api_error_atom(code: i32) -> Atom {
+    match FlussError::for_code(code) {
+        FlussError::UnknownServerError => unknown_server_error(),
+        FlussError::None => none(),
+        FlussError::NetworkException => network_exception(),
+        FlussError::UnsupportedVersion => unsupported_version(),
+        FlussError::CorruptMessage => corrupt_message(),
+        FlussError::DatabaseNotExist => database_not_exist(),
+        FlussError::DatabaseNotEmpty => database_not_empty(),
+        FlussError::DatabaseAlreadyExist => database_already_exist(),
+        FlussError::TableNotExist => table_not_exist(),
+        FlussError::TableAlreadyExist => table_already_exist(),
+        FlussError::SchemaNotExist => schema_not_exist(),
+        FlussError::LogStorageException => log_storage_exception(),
+        FlussError::KvStorageException => kv_storage_exception(),
+        FlussError::NotLeaderOrFollower => not_leader_or_follower(),
+        FlussError::RecordTooLargeException => record_too_large_exception(),
+        FlussError::CorruptRecordException => corrupt_record_exception(),
+        FlussError::InvalidTableException => invalid_table_exception(),
+        FlussError::InvalidDatabaseException => invalid_database_exception(),
+        FlussError::InvalidReplicationFactor => invalid_replication_factor(),
+        FlussError::InvalidRequiredAcks => invalid_required_acks(),
+        FlussError::LogOffsetOutOfRangeException => log_offset_out_of_range_exception(),
+        FlussError::NonPrimaryKeyTableException => non_primary_key_table_exception(),
+        FlussError::UnknownTableOrBucketException => unknown_table_or_bucket_exception(),
+        FlussError::InvalidUpdateVersionException => invalid_update_version_exception(),
+        FlussError::InvalidCoordinatorException => invalid_coordinator_exception(),
+        FlussError::FencedLeaderEpochException => fenced_leader_epoch_exception(),
+        FlussError::RequestTimeOut => request_time_out(),
+        FlussError::StorageException => storage_exception(),
+        FlussError::OperationNotAttemptedException => operation_not_attempted_exception(),
+        FlussError::NotEnoughReplicasAfterAppendException => {
+            not_enough_replicas_after_append_exception()
+        }
+        FlussError::NotEnoughReplicasException => not_enough_replicas_exception(),
+        FlussError::SecurityTokenException => security_token_exception(),
+        FlussError::OutOfOrderSequenceException => out_of_order_sequence_exception(),
+        FlussError::DuplicateSequenceException => duplicate_sequence_exception(),
+        FlussError::UnknownWriterIdException => unknown_writer_id_exception(),
+        FlussError::InvalidColumnProjection => invalid_column_projection(),
+        FlussError::InvalidTargetColumn => invalid_target_column(),
+        FlussError::PartitionNotExists => partition_not_exists(),
+        FlussError::TableNotPartitionedException => table_not_partitioned_exception(),
+        FlussError::InvalidTimestampException => invalid_timestamp_exception(),
+        FlussError::InvalidConfigException => invalid_config_exception(),
+        FlussError::LakeStorageNotConfiguredException => lake_storage_not_configured_exception(),
+        FlussError::KvSnapshotNotExist => kv_snapshot_not_exist(),
+        FlussError::PartitionAlreadyExists => partition_already_exists(),
+        FlussError::PartitionSpecInvalidException => partition_spec_invalid_exception(),
+        FlussError::LeaderNotAvailableException => leader_not_available_exception(),
+        FlussError::PartitionMaxNumException => partition_max_num_exception(),
+        FlussError::AuthenticateException => authenticate_exception(),
+        FlussError::SecurityDisabledException => security_disabled_exception(),
+        FlussError::AuthorizationException => authorization_exception(),
+        FlussError::BucketMaxNumException => bucket_max_num_exception(),
+        FlussError::FencedTieringEpochException => fenced_tiering_epoch_exception(),
+        FlussError::RetriableAuthenticateException => retriable_authenticate_exception(),
+        FlussError::InvalidServerRackInfoException => invalid_server_rack_info_exception(),
+        FlussError::LakeSnapshotNotExist => lake_snapshot_not_exist(),
+        FlussError::LakeTableAlreadyExist => lake_table_already_exist(),
+        FlussError::IneligibleReplicaException => ineligible_replica_exception(),
+        FlussError::InvalidAlterTableException => invalid_alter_table_exception(),
+        FlussError::DeletionDisabledException => deletion_disabled_exception(),
+    }
+}
+
+pub fn to_nif_err(e: CoreError) -> rustler::Error {
+    rustler::Error::Term(Box::new(NifFlussError::from_core(&e)))
+}
+
+pub fn client_err(msg: impl Into<String>) -> rustler::Error {
+    rustler::Error::Term(Box::new(NifFlussError::client(msg.into())))
+}
diff --git a/fluss-rust/bindings/elixir/native/fluss_nif/src/config.rs b/fluss-rust/bindings/elixir/native/fluss_nif/src/config.rs
new file mode 100644
index 0000000000..8c1bab51eb
--- /dev/null
+++ b/fluss-rust/bindings/elixir/native/fluss_nif/src/config.rs
@@ -0,0 +1,147 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use fluss::config::{Config, NoKeyAssigner};
+use rustler::{NifStruct, NifUnitEnum};
+
+/// Bucket-assigner strategy for tables without bucket keys.
+/// Maps to fluss::config::NoKeyAssigner.
+#[derive(NifUnitEnum)]
+pub enum NifNoKeyAssigner {
+    // Converted to `NoKeyAssigner::Sticky` by `NifConfig::into_core`.
+    Sticky,
+    // Converted to `NoKeyAssigner::RoundRobin` by `NifConfig::into_core`.
+    RoundRobin,
+}
+
+/// Decoded from `%Fluss.Config{}` Elixir struct.
+///
+/// Only `bootstrap_servers` is mandatory. Every `Option` field that is `None`
+/// leaves the corresponding `Config::default()` value untouched when
+/// `into_core` builds the core configuration.
+#[derive(NifStruct)]
+#[module = "Fluss.Config"]
+pub struct NifConfig {
+    pub bootstrap_servers: String,
+    pub connect_timeout_ms: Option<u64>,
+    pub remote_file_download_thread_num: Option<u64>,
+    pub scanner_log_fetch_max_bytes: Option<i32>,
+    pub scanner_log_fetch_max_bytes_for_bucket: Option<i32>,
+    pub scanner_log_fetch_min_bytes: Option<i32>,
+    pub scanner_log_fetch_wait_max_time_ms: Option<i32>,
+    pub scanner_log_max_poll_records: Option<u64>,
+    pub scanner_remote_log_prefetch_num: Option<u64>,
+    pub scanner_remote_log_read_concurrency: Option<u64>,
+    pub security_protocol: Option<String>,
+    pub security_sasl_mechanism: Option<String>,
+    pub security_sasl_password: Option<String>,
+    pub security_sasl_username: Option<String>,
+    pub writer_acks: Option<String>,
+    pub writer_batch_size: Option<i32>,
+    pub writer_batch_timeout_ms: Option<i64>,
+    pub writer_bucket_no_key_assigner: Option<NifNoKeyAssigner>,
+    pub writer_buffer_memory_size: Option<u64>,
+    pub writer_buffer_wait_timeout_ms: Option<u64>,
+    pub writer_dynamic_batch_size_enabled: Option<bool>,
+    pub writer_dynamic_batch_size_min: Option<i32>,
+    pub writer_enable_idempotence: Option<bool>,
+    pub writer_max_inflight_requests_per_bucket: Option<u64>,
+    pub writer_request_max_size: Option<i32>,
+    pub writer_retries: Option<i32>,
+}
+
+impl NifConfig {
+    /// Converts the decoded Elixir struct into the core client `Config`.
+    ///
+    /// Starts from `Config::default()` with the given bootstrap servers and
+    /// overrides only the settings that were provided. `u64` counts and
+    /// sizes are cast to `usize` where the core field requires it.
+    pub fn into_core(self) -> Config {
+        // Destructuring makes the compiler flag any field that is added to
+        // the struct but forgotten here.
+        let NifConfig {
+            bootstrap_servers,
+            connect_timeout_ms,
+            remote_file_download_thread_num,
+            scanner_log_fetch_max_bytes,
+            scanner_log_fetch_max_bytes_for_bucket,
+            scanner_log_fetch_min_bytes,
+            scanner_log_fetch_wait_max_time_ms,
+            scanner_log_max_poll_records,
+            scanner_remote_log_prefetch_num,
+            scanner_remote_log_read_concurrency,
+            security_protocol,
+            security_sasl_mechanism,
+            security_sasl_password,
+            security_sasl_username,
+            writer_acks,
+            writer_batch_size,
+            writer_batch_timeout_ms,
+            writer_bucket_no_key_assigner,
+            writer_buffer_memory_size,
+            writer_buffer_wait_timeout_ms,
+            writer_dynamic_batch_size_enabled,
+            writer_dynamic_batch_size_min,
+            writer_enable_idempotence,
+            writer_max_inflight_requests_per_bucket,
+            writer_request_max_size,
+            writer_retries,
+        } = self;
+
+        let mut cfg = Config {
+            bootstrap_servers,
+            ..Config::default()
+        };
+
+        // Connection and remote file settings.
+        if let Some(value) = connect_timeout_ms {
+            cfg.connect_timeout_ms = value;
+        }
+        if let Some(value) = remote_file_download_thread_num {
+            cfg.remote_file_download_thread_num = value as usize;
+        }
+
+        // Scanner settings.
+        if let Some(value) = scanner_log_fetch_max_bytes {
+            cfg.scanner_log_fetch_max_bytes = value;
+        }
+        if let Some(value) = scanner_log_fetch_max_bytes_for_bucket {
+            cfg.scanner_log_fetch_max_bytes_for_bucket = value;
+        }
+        if let Some(value) = scanner_log_fetch_min_bytes {
+            cfg.scanner_log_fetch_min_bytes = value;
+        }
+        if let Some(value) = scanner_log_fetch_wait_max_time_ms {
+            cfg.scanner_log_fetch_wait_max_time_ms = value;
+        }
+        if let Some(value) = scanner_log_max_poll_records {
+            cfg.scanner_log_max_poll_records = value as usize;
+        }
+        if let Some(value) = scanner_remote_log_prefetch_num {
+            cfg.scanner_remote_log_prefetch_num = value as usize;
+        }
+        if let Some(value) = scanner_remote_log_read_concurrency {
+            cfg.scanner_remote_log_read_concurrency = value as usize;
+        }
+
+        // Security settings.
+        if let Some(value) = security_protocol {
+            cfg.security_protocol = value;
+        }
+        if let Some(value) = security_sasl_mechanism {
+            cfg.security_sasl_mechanism = value;
+        }
+        if let Some(value) = security_sasl_password {
+            cfg.security_sasl_password = value;
+        }
+        if let Some(value) = security_sasl_username {
+            cfg.security_sasl_username = value;
+        }
+
+        // Writer settings.
+        if let Some(value) = writer_acks {
+            cfg.writer_acks = value;
+        }
+        if let Some(value) = writer_batch_size {
+            cfg.writer_batch_size = value;
+        }
+        if let Some(value) = writer_batch_timeout_ms {
+            cfg.writer_batch_timeout_ms = value;
+        }
+        if let Some(choice) = writer_bucket_no_key_assigner {
+            cfg.writer_bucket_no_key_assigner = match choice {
+                NifNoKeyAssigner::Sticky => NoKeyAssigner::Sticky,
+                NifNoKeyAssigner::RoundRobin => NoKeyAssigner::RoundRobin,
+            };
+        }
+        if let Some(value) = writer_buffer_memory_size {
+            cfg.writer_buffer_memory_size = value as usize;
+        }
+        if let Some(value) = writer_buffer_wait_timeout_ms {
+            cfg.writer_buffer_wait_timeout_ms = value;
+        }
+        if let Some(value) = writer_dynamic_batch_size_enabled {
+            cfg.writer_dynamic_batch_size_enabled = value;
+        }
+        if let Some(value) = writer_dynamic_batch_size_min {
+            cfg.writer_dynamic_batch_size_min = value;
+        }
+        if let Some(value) = writer_enable_idempotence {
+            cfg.writer_enable_idempotence = value;
+        }
+        if let Some(value) = writer_max_inflight_requests_per_bucket {
+            cfg.writer_max_inflight_requests_per_bucket = value as usize;
+        }
+        if let Some(value) = writer_request_max_size {
+            cfg.writer_request_max_size = value;
+        }
+        if let Some(value) = writer_retries {
+            cfg.writer_retries = value;
+        }
+
+        cfg
+    }
+}
diff --git a/fluss-rust/bindings/elixir/native/fluss_nif/src/connection.rs b/fluss-rust/bindings/elixir/native/fluss_nif/src/connection.rs
new file mode 100644
index 0000000000..4c788eeec7
--- /dev/null
+++ b/fluss-rust/bindings/elixir/native/fluss_nif/src/connection.rs
@@ -0,0 +1,43 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::async_nif;
+use crate::config::NifConfig;
+use fluss::client::FlussConnection;
+use rustler::{Env, ResourceArc, Term};
+use std::sync::Arc;
+
+/// NIF resource wrapping a shared `FlussConnection`.
+pub struct ConnectionResource {
+    /// Connection handle, reference-counted via `Arc`.
+    pub inner: Arc<FlussConnection>,
+}
+
+// Manual opt-in needed because Rustler wraps NIF bodies in `catch_unwind`;
+// the rationale is spelled out in the comment at the top of `lib.rs`.
+impl std::panic::RefUnwindSafe for ConnectionResource {}
+
+#[rustler::resource_impl]
+impl rustler::Resource for ConnectionResource {}
+
+/// Opens a connection using the given configuration.
+///
+/// The configuration is converted synchronously; connecting happens on the
+/// tokio runtime. Returns a reference term immediately. The calling process
+/// later receives `{ref, {:ok, connection}}` or `{ref, {:error, err}}`.
+#[rustler::nif]
+fn connection_new<'a>(env: Env<'a>, config: NifConfig) -> Term<'a> {
+    let core_config = config.into_core();
+    let connect = async move {
+        let connected = FlussConnection::new(core_config).await;
+        connected.map(|conn| {
+            let resource = ConnectionResource {
+                inner: Arc::new(conn),
+            };
+            ResourceArc::new(resource)
+        })
+    };
+    async_nif::spawn_task_with_result(env, connect)
+}
diff --git a/fluss-rust/bindings/elixir/native/fluss_nif/src/lib.rs b/fluss-rust/bindings/elixir/native/fluss_nif/src/lib.rs
new file mode 100644
index 0000000000..a843d65f21
--- /dev/null
+++ b/fluss-rust/bindings/elixir/native/fluss_nif/src/lib.rs
@@ -0,0 +1,47 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Rustler 0.37 wraps every NIF body in `std::panic::catch_unwind`, which requires
+// all captured values (including `ResourceArc<T>`) to be `RefUnwindSafe`.
+// `ResourceArc` contains `*mut T`, so it is only `RefUnwindSafe` when `T` is.
+// Our resource types contain `parking_lot` locks (`UnsafeCell`) which opt out of
+// the auto-trait. We manually impl `RefUnwindSafe` on each resource type because
+// panic safety is already guaranteed by the NIF boundary — a panic is caught and
+// converted to an Erlang exception, never observed by Rust code.
+
+mod admin;
+mod append_writer;
+mod async_nif;
+mod atoms;
+mod config;
+mod connection;
+mod log_scanner;
+mod row_convert;
+mod schema;
+mod table;
+mod write_handle;
+
+use std::sync::LazyLock;
+
+/// Process-wide multi-threaded tokio runtime, created lazily on first use.
+///
+/// All async NIF work is spawned here. Construction failure panics with
+/// "failed to create tokio runtime".
+static RUNTIME: LazyLock<tokio::runtime::Runtime> = LazyLock::new(|| {
+    let mut builder = tokio::runtime::Builder::new_multi_thread();
+    builder.enable_all();
+    builder.build().expect("failed to create tokio runtime")
+});
+
+// Registers this crate's NIFs under the Elixir module `Fluss.Native`.
+rustler::init!("Elixir.Fluss.Native");
diff --git a/fluss-rust/bindings/elixir/native/fluss_nif/src/log_scanner.rs b/fluss-rust/bindings/elixir/native/fluss_nif/src/log_scanner.rs
new file mode 100644
index 0000000000..62614e0e67
--- /dev/null
+++ b/fluss-rust/bindings/elixir/native/fluss_nif/src/log_scanner.rs
@@ -0,0 +1,166 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::RUNTIME;
+use crate::async_nif;
+use crate::atoms::{self, NifFlussError, to_nif_err};
+use crate::row_convert;
+use crate::table::TableResource;
+use fluss::client::{EARLIEST_OFFSET, LogScanner};
+use fluss::error::Error;
+use fluss::metadata::Column;
+use fluss::record::{ChangeType, ScanRecords};
+use rustler::env::OwnedEnv;
+use rustler::types::LocalPid;
+use rustler::{Atom, Encoder, Env, ResourceArc, Term};
+use std::collections::HashMap;
+use std::time::Duration;
+
+/// NIF resource that owns a Fluss `LogScanner` together with the column list
+/// captured from the table schema when the scanner was created.
+pub struct LogScannerResource {
+    /// Underlying log scanner from the core client.
+    pub inner: LogScanner,
+    /// Schema columns used to convert polled rows into Elixir terms.
+    pub columns: Vec<Column>,
+}
+
+// Manual opt-in needed because Rustler wraps NIF bodies in `catch_unwind`;
+// the rationale is spelled out in the comment at the top of `lib.rs`.
+impl std::panic::RefUnwindSafe for LogScannerResource {}
+
+#[rustler::resource_impl]
+impl rustler::Resource for LogScannerResource {}
+
+/// Builds a log scanner for `table` and wraps it in a NIF resource.
+///
+/// The table's schema columns are copied into the resource so polled rows
+/// can be converted later. Core errors are converted with `to_nif_err`.
+#[rustler::nif]
+fn log_scanner_new(
+    table: ResourceArc<TableResource>,
+) -> Result<ResourceArc<LogScannerResource>, rustler::Error> {
+    // The tokio runtime context stays entered for the duration of the call.
+    let _runtime_guard = RUNTIME.enter();
+
+    let (log_scanner, schema_columns) = table.with_table(|tbl| {
+        let log_scanner = tbl.new_scan().create_log_scanner().map_err(to_nif_err)?;
+        let schema_columns = tbl.get_table_info().schema.columns().to_vec();
+        Ok((log_scanner, schema_columns))
+    })?;
+
+    let resource = LogScannerResource {
+        inner: log_scanner,
+        columns: schema_columns,
+    };
+    Ok(ResourceArc::new(resource))
+}
+
+/// Subscribes the scanner to `bucket` starting at `offset`.
+///
+/// Returns a reference term immediately. The outcome is later delivered to
+/// the calling process as `{ref, :ok}` or `{ref, {:error, err}}`.
+#[rustler::nif]
+fn log_scanner_subscribe<'a>(
+    env: Env<'a>,
+    scanner: ResourceArc<LogScannerResource>,
+    bucket: i32,
+    offset: i64,
+) -> Term<'a> {
+    let subscribe = async move { scanner.inner.subscribe(bucket, offset).await };
+    async_nif::spawn_task(env, subscribe)
+}
+
+/// Subscribes the scanner to several buckets at once.
+///
+/// `bucket_offsets` is a list of `{bucket, offset}` pairs; if a bucket is
+/// listed more than once, the last offset wins. Returns a reference term
+/// immediately. The outcome is later delivered to the calling process as
+/// `{ref, :ok}` or `{ref, {:error, err}}`.
+#[rustler::nif]
+fn log_scanner_subscribe_buckets<'a>(
+    env: Env<'a>,
+    scanner: ResourceArc<LogScannerResource>,
+    bucket_offsets: Vec<(i32, i64)>,
+) -> Term<'a> {
+    let mut offsets_by_bucket: HashMap<i32, i64> = HashMap::new();
+    for (bucket, start_offset) in bucket_offsets {
+        offsets_by_bucket.insert(bucket, start_offset);
+    }
+
+    let subscribe = async move { scanner.inner.subscribe_buckets(&offsets_by_bucket).await };
+    async_nif::spawn_task(env, subscribe)
+}
+
+/// Removes the scanner's subscription to `bucket`.
+///
+/// Returns a reference term immediately. The outcome is later delivered to
+/// the calling process as `{ref, :ok}` or `{ref, {:error, err}}`.
+#[rustler::nif]
+fn log_scanner_unsubscribe<'a>(
+    env: Env<'a>,
+    scanner: ResourceArc<LogScannerResource>,
+    bucket: i32,
+) -> Term<'a> {
+    let unsubscribe = async move { scanner.inner.unsubscribe(bucket).await };
+    async_nif::spawn_task(env, unsubscribe)
+}
+
+/// Starts one poll on the tokio runtime and returns `:ok` immediately.
+///
+/// The poll waits up to `timeout_ms` milliseconds. Its outcome is sent to
+/// the calling process as `{:fluss_records, records}` on success or
+/// `{:fluss_poll_error, err}` on failure (see `send_poll_result`).
+#[rustler::nif]
+fn log_scanner_poll(env: Env, scanner: ResourceArc<LogScannerResource>, timeout_ms: u64) -> Atom {
+    let pid = env.pid();
+
+    // `scanner` is already an owned handle, so it is moved into the task
+    // directly instead of being cloned first.
+    RUNTIME.spawn(async move {
+        let result = scanner.inner.poll(Duration::from_millis(timeout_ms)).await;
+        send_poll_result(&pid, result, &scanner.columns);
+    });
+
+    atoms::ok()
+}
+
+/// Sends the outcome of a poll to `pid`.
+///
+/// Successful polls are encoded with `encode_scan_records` and sent as
+/// `{:fluss_records, records}`. A core error, or a failure while encoding
+/// the records, is sent as `{:fluss_poll_error, %Fluss.Error{}}`. A failed
+/// send is ignored.
+fn send_poll_result(pid: &LocalPid, result: Result<ScanRecords, Error>, columns: &[Column]) {
+    let mut msg_env = OwnedEnv::new();
+
+    let _ = msg_env.send_and_clear(pid, |env| {
+        // Normalise both failure sources into a single error type first.
+        let encoded = match result {
+            Ok(scan_records) => {
+                encode_scan_records(env, scan_records, columns).map_err(NifFlussError::client)
+            }
+            Err(e) => Err(NifFlussError::from_core(&e)),
+        };
+
+        match encoded {
+            Ok(records) => (atoms::fluss_records(), records).encode(env),
+            Err(nif_error) => (atoms::fluss_poll_error(), nif_error).encode(env),
+        }
+    });
+}
+
+/// Encodes polled records as a list of maps.
+///
+/// Each map has the keys `:offset`, `:timestamp`, `:change_type` and `:row`.
+/// Conversion stops at the first record that fails and returns a message
+/// describing the failure.
+fn encode_scan_records<'a>(
+    env: Env<'a>,
+    scan_records: ScanRecords,
+    columns: &[Column],
+) -> Result<rustler::Term<'a>, String> {
+    // Column atoms are interned once and reused for every row.
+    let column_atoms = row_convert::intern_column_atoms(env, columns);
+
+    let encoded = scan_records
+        .into_iter()
+        .map(|record| -> Result<rustler::Term<'a>, String> {
+            let row_term = row_convert::row_to_term(env, record.row(), columns, &column_atoms)
+                .map_err(|e| format!("failed to convert row at offset {}: {e}", record.offset()))?;
+
+            let change = match record.change_type() {
+                ChangeType::AppendOnly => atoms::append_only().encode(env),
+                ChangeType::Insert => atoms::insert().encode(env),
+                ChangeType::UpdateBefore => atoms::update_before().encode(env),
+                ChangeType::UpdateAfter => atoms::update_after().encode(env),
+                ChangeType::Delete => atoms::delete().encode(env),
+            };
+
+            let fields = [
+                (atoms::offset().encode(env), record.offset().encode(env)),
+                (
+                    atoms::timestamp().encode(env),
+                    record.timestamp().encode(env),
+                ),
+                (atoms::change_type().encode(env), change),
+                (atoms::row().encode(env), row_term),
+            ];
+
+            rustler::Term::map_from_pairs(env, &fields)
+                .map_err(|_| "failed to create record map".to_string())
+        })
+        .collect::<Result<Vec<_>, String>>()?;
+
+    Ok(encoded.encode(env))
+}
+
+/// Exposes the core client's `EARLIEST_OFFSET` constant to Elixir.
+#[rustler::nif]
+fn earliest_offset() -> i64 {
+    EARLIEST_OFFSET
+}
diff --git a/fluss-rust/bindings/elixir/native/fluss_nif/src/row_convert.rs b/fluss-rust/bindings/elixir/native/fluss_nif/src/row_convert.rs
new file mode 100644
index 0000000000..c72395e900
--- /dev/null
+++ b/fluss-rust/bindings/elixir/native/fluss_nif/src/row_convert.rs
@@ -0,0 +1,267 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::str::FromStr;
+
+use fluss::metadata::{Column, DataType};
+use fluss::row::{Date, Decimal, GenericRow, InternalRow, Time, TimestampLtz, TimestampNtz};
+use rustler::types::binary::NewBinary;
+use rustler::{Encoder, Env, Term};
+
+use crate::atoms;
+
+/// Convert column names to BEAM atoms for use as map keys.
+///
+/// The returned vector has one atom per entry of `columns`, in the same order.
+///
+/// Note: BEAM atoms are never garbage-collected. This is safe because column
+/// names come from server-defined table schemas (bounded set), not arbitrary
+/// user input. The BEAM deduplicates atoms, so repeated calls with the same
+/// column names do not grow the atom table.
+///
+/// # Panics
+///
+/// Panics if rustler cannot create an atom for a column name; the panic
+/// message names the offending column.
+pub fn intern_column_atoms(env: Env<'_>, columns: &[Column]) -> Vec<rustler::Atom> {
+    columns
+        .iter()
+        .map(|col| {
+            let name = col.name();
+            rustler::Atom::from_str(env, name)
+                .unwrap_or_else(|_| panic!("column name {name:?} is not a valid atom"))
+        })
+        .collect()
+}
+
+/// Builds a BEAM map term for `row`, keyed by pre-interned column atoms.
+///
+/// `column_atoms[i]` is used as the key for `columns[i]`; each value is
+/// produced by `field_to_term` using that column's data type.
+///
+/// Returns `Err` when fewer atoms than columns are supplied, when a field
+/// cannot be converted, or when the map cannot be created.
+pub fn row_to_term<'a>(
+    env: Env<'a>,
+    row: &dyn InternalRow,
+    columns: &[Column],
+    column_atoms: &[rustler::Atom],
+) -> Result<Term<'a>, String> {
+    // Report a mismatch as an error instead of panicking on an
+    // out-of-bounds index.
+    if column_atoms.len() < columns.len() {
+        return Err(format!(
+            "expected {} column atoms, got {}",
+            columns.len(),
+            column_atoms.len()
+        ));
+    }
+
+    let pairs: Vec<(Term<'a>, Term<'a>)> = columns
+        .iter()
+        .zip(column_atoms)
+        .enumerate()
+        .map(|(i, (col, atom))| {
+            let key = atom.encode(env);
+            let value = field_to_term(env, row, i, col.data_type())?;
+            Ok((key, value))
+        })
+        .collect::<Result<_, String>>()?;
+    Term::map_from_pairs(env, &pairs).map_err(|_| "failed to create map".to_string())
+}
+
+/// Copies `bytes` into a newly allocated BEAM binary and returns it as a term.
+fn bytes_to_term<'a>(env: Env<'a>, bytes: &[u8]) -> Term<'a> {
+    let mut buf = NewBinary::new(env, bytes.len());
+    buf.as_mut_slice().copy_from_slice(bytes);
+    let binary: rustler::Binary = buf.into();
+    binary.encode(env)
+}
+
+/// Reads the field at `pos` from `row` and encodes it as a BEAM term.
+///
+/// Encoding by `data_type`:
+/// - null fields become the `nil` atom, whatever their type;
+/// - boolean, integer, float, string and char values use their direct
+///   rustler encoding;
+/// - `Bytes` / `Binary` become BEAM binaries;
+/// - `Date` / `Time` encode the value returned by `get_inner()`;
+/// - both timestamp types become a `{millis, nano_of_millisecond}` tuple;
+/// - `Decimal` is encoded as its string representation.
+///
+/// Returns `Err` with the stringified row error when a getter fails, or an
+/// "unsupported data type" message for any other type.
+fn field_to_term<'a>(
+    env: Env<'a>,
+    row: &dyn InternalRow,
+    pos: usize,
+    data_type: &DataType,
+) -> Result<Term<'a>, String> {
+    // Null check comes first so the typed getters only see non-null fields.
+    if row.is_null_at(pos).map_err(|e| e.to_string())? {
+        return Ok(atoms::nil().encode(env));
+    }
+
+    let term = match data_type {
+        DataType::Boolean(_) => row.get_boolean(pos).map_err(|e| e.to_string())?.encode(env),
+        DataType::TinyInt(_) => row.get_byte(pos).map_err(|e| e.to_string())?.encode(env),
+        DataType::SmallInt(_) => row.get_short(pos).map_err(|e| e.to_string())?.encode(env),
+        DataType::Int(_) => row.get_int(pos).map_err(|e| e.to_string())?.encode(env),
+        DataType::BigInt(_) => row.get_long(pos).map_err(|e| e.to_string())?.encode(env),
+        DataType::Float(_) => row.get_float(pos).map_err(|e| e.to_string())?.encode(env),
+        DataType::Double(_) => row.get_double(pos).map_err(|e| e.to_string())?.encode(env),
+        DataType::String(_) => row.get_string(pos).map_err(|e| e.to_string())?.encode(env),
+        DataType::Char(ct) => row
+            .get_char(pos, ct.length() as usize)
+            .map_err(|e| e.to_string())?
+            .encode(env),
+        DataType::Bytes(_) => {
+            let bytes = row.get_bytes(pos).map_err(|e| e.to_string())?;
+            bytes_to_term(env, bytes)
+        }
+        DataType::Binary(bt) => {
+            let bytes = row
+                .get_binary(pos, bt.length())
+                .map_err(|e| e.to_string())?;
+            bytes_to_term(env, bytes)
+        }
+        DataType::Date(_) => row
+            .get_date(pos)
+            .map_err(|e| e.to_string())?
+            .get_inner()
+            .encode(env),
+        DataType::Time(_) => row
+            .get_time(pos)
+            .map_err(|e| e.to_string())?
+            .get_inner()
+            .encode(env),
+        DataType::Timestamp(ts) => {
+            let value = row
+                .get_timestamp_ntz(pos, ts.precision())
+                .map_err(|e| e.to_string())?;
+            (value.get_millisecond(), value.get_nano_of_millisecond()).encode(env)
+        }
+        DataType::TimestampLTz(ts) => {
+            let value = row
+                .get_timestamp_ltz(pos, ts.precision())
+                .map_err(|e| e.to_string())?;
+            (value.get_epoch_millisecond(), value.get_nano_of_millisecond()).encode(env)
+        }
+        DataType::Decimal(dt) => row
+            .get_decimal(pos, dt.precision() as usize, dt.scale() as usize)
+            .map_err(|e| e.to_string())?
+            .to_string()
+            .encode(env),
+        _ => return Err(format!("unsupported data type: {data_type:?}")),
+    };
+    Ok(term)
+}
+
+/// Decodes a BEAM list of values into a `GenericRow` matching `columns`.
+///
+/// The list must hold exactly one value per column, in column order. A `nil`
+/// atom leaves the corresponding field unset (null); every other value is
+/// written through `set_field_from_term` using the column's data type.
+///
+/// Returns `Err` when `values` is not a list, when its length differs from
+/// `columns.len()`, or when a value cannot be converted.
+pub fn term_to_row<'a>(
+    env: Env<'a>,
+    values: Term<'a>,
+    columns: &[Column],
+) -> Result<GenericRow<'static>, String> {
+    let list: Vec<Term<'a>> = values
+        .decode()
+        .map_err(|_| "expected a list of values".to_string())?;
+    let expected = columns.len();
+    if list.len() != expected {
+        return Err(format!("expected {} values, got {}", expected, list.len()));
+    }
+
+    // True only for the `nil` atom; other atoms are handed to the typed setters.
+    let is_nil = |term: &Term<'a>| {
+        term.is_atom() && matches!(term.decode::<rustler::Atom>(), Ok(atom) if atom == atoms::nil())
+    };
+
+    let mut row = GenericRow::new(expected);
+    for (pos, col) in columns.iter().enumerate() {
+        // Indexing is in bounds: the lengths were checked above.
+        let term = list[pos];
+        if is_nil(&term) {
+            continue; // leave as null
+        }
+        set_field_from_term(env, &mut row, pos, term, col.data_type())?;
+    }
+    Ok(row)
+}
+
+/// Decodes `term` according to `data_type` and stores it in `row` at `pos`.
+///
+/// Accepted term shapes:
+/// - booleans and integers decode directly (integer widths are range-checked
+///   by the decoder);
+/// - `Date` / `Time` take an integer;
+/// - both timestamp types take a `{millis, nanos}` tuple;
+/// - `Float` / `Double` take a float; `Float` values that do not fit in 32
+///   bits are rejected rather than silently stored as infinity;
+/// - `String` / `Char` take a string;
+/// - `Decimal` takes a string that is parsed and fitted to the column's
+///   precision and scale;
+/// - `Bytes` / `Binary` take a binary.
+///
+/// Returns `Err` with a description when the term has the wrong shape, a
+/// value is rejected by the target type, or the data type is not supported
+/// for writing. `_env` is currently unused.
+fn set_field_from_term<'a>(
+    _env: Env<'a>,
+    row: &mut GenericRow<'static>,
+    pos: usize,
+    term: Term<'a>,
+    data_type: &DataType,
+) -> Result<(), String> {
+    match data_type {
+        DataType::Boolean(_) => {
+            let v: bool = term.decode().map_err(|_| "expected boolean")?;
+            row.set_field(pos, v);
+        }
+        DataType::TinyInt(_) => {
+            let v: i8 = term
+                .decode()
+                .map_err(|_| "expected integer in range -128..127 for tinyint")?;
+            row.set_field(pos, v);
+        }
+        DataType::SmallInt(_) => {
+            let v: i16 = term
+                .decode()
+                .map_err(|_| "expected integer in range -32768..32767 for smallint")?;
+            row.set_field(pos, v);
+        }
+        DataType::Int(_) => {
+            let v: i32 = term.decode().map_err(|_| "expected integer")?;
+            row.set_field(pos, v);
+        }
+        DataType::BigInt(_) => {
+            let v: i64 = term.decode().map_err(|_| "expected integer")?;
+            row.set_field(pos, v);
+        }
+        DataType::Date(_) => {
+            let v: i32 = term
+                .decode()
+                .map_err(|_| "expected integer (days since epoch)")?;
+            row.set_field(pos, Date::new(v));
+        }
+        DataType::Time(_) => {
+            let v: i32 = term
+                .decode()
+                .map_err(|_| "expected integer (millis since midnight)")?;
+            row.set_field(pos, Time::new(v));
+        }
+        DataType::Timestamp(_) => {
+            let (millis, nanos): (i64, i32) = term
+                .decode()
+                .map_err(|_| "expected {millis, nanos} tuple for timestamp")?;
+            let ts = TimestampNtz::from_millis_nanos(millis, nanos).map_err(|e| e.to_string())?;
+            row.set_field(pos, ts);
+        }
+        DataType::TimestampLTz(_) => {
+            let (millis, nanos): (i64, i32) = term
+                .decode()
+                .map_err(|_| "expected {millis, nanos} tuple for timestamp_ltz")?;
+            let ts = TimestampLtz::from_millis_nanos(millis, nanos).map_err(|e| e.to_string())?;
+            row.set_field(pos, ts);
+        }
+        DataType::Float(_) => {
+            let v: f64 = term.decode().map_err(|_| "expected number for float")?;
+            let narrowed = v as f32;
+            // An `as` cast turns finite values beyond the f32 range into
+            // +/- infinity; reject those so the stored value is never a
+            // silent corruption of the input.
+            if v.is_finite() && narrowed.is_infinite() {
+                return Err(format!("float value {v} is out of range for a 32-bit float"));
+            }
+            row.set_field(pos, narrowed);
+        }
+        DataType::Double(_) => {
+            let v: f64 = term.decode().map_err(|_| "expected number for double")?;
+            row.set_field(pos, v);
+        }
+        DataType::String(_) | DataType::Char(_) => {
+            let v: String = term.decode().map_err(|_| "expected string")?;
+            row.set_field(pos, v);
+        }
+        DataType::Decimal(dt) => {
+            let v: String = term.decode().map_err(|_| "expected string for decimal")?;
+            let bd = bigdecimal::BigDecimal::from_str(&v)
+                .map_err(|e| format!("failed to parse decimal '{v}': {e}"))?;
+            let decimal = Decimal::from_big_decimal(bd, dt.precision(), dt.scale())
+                .map_err(|e| e.to_string())?;
+            row.set_field(pos, decimal);
+        }
+        DataType::Bytes(_) | DataType::Binary(_) => {
+            let bin: rustler::Binary = term.decode().map_err(|_| "expected binary")?;
+            // Copy out of the BEAM binary so the row owns its bytes.
+            row.set_field(pos, bin.as_slice().to_vec());
+        }
+        _ => return Err(format!("unsupported data type for writing: {data_type:?}")),
+    }
+    Ok(())
+}
diff --git a/fluss-rust/bindings/elixir/native/fluss_nif/src/schema.rs b/fluss-rust/bindings/elixir/native/fluss_nif/src/schema.rs
new file mode 100644
index 0000000000..5d61d29daf
--- /dev/null
+++ b/fluss-rust/bindings/elixir/native/fluss_nif/src/schema.rs
@@ -0,0 +1,110 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::atoms::to_nif_err;
+use fluss::metadata::{self, DataTypes, Schema, TableDescriptor};
+use rustler::{NifStruct, NifTaggedEnum, ResourceArc};
+
+/// NIF resource wrapping a built `TableDescriptor` so it can be handed to the
+/// BEAM as an opaque reference.
+pub struct TableDescriptorResource {
+    // The wrapped descriptor, produced by `table_descriptor_new`.
+    pub inner: TableDescriptor,
+}
+
+// Manual opt-in to `RefUnwindSafe`.
+// NOTE(review): presumably needed by rustler's resource handling — confirm.
+impl std::panic::RefUnwindSafe for TableDescriptorResource {}
+
+// Registers the type as a rustler resource.
+#[rustler::resource_impl]
+impl rustler::Resource for TableDescriptorResource {}
+
+/// Fluss data type for NIF interop.
+///
+/// Simple types map to atoms: `:int`, `:string`, etc.
+/// Parameterized types map to tuples: `{:decimal, 10, 2}`, `{:char, 20}`.
+///
+/// `to_fluss_type` converts each variant to the matching
+/// `fluss::metadata::DataType`.
+#[derive(NifTaggedEnum)]
+pub enum DataType {
+    Boolean,
+    Tinyint,
+    Smallint,
+    Int,
+    Bigint,
+    Float,
+    Double,
+    String,
+    Bytes,
+    Date,
+    Time,
+    Timestamp,
+    TimestampLtz,
+    // (precision, scale)
+    Decimal(u32, u32),
+    // (length)
+    Char(u32),
+    // (length)
+    Binary(usize),
+}
+
+/// Maps a NIF-level `DataType` to the corresponding Fluss metadata type by
+/// calling the matching `DataTypes` constructor.
+///
+/// Parameterized variants forward their parameters unchanged.
+fn to_fluss_type(dt: &DataType) -> metadata::DataType {
+    // Matching on the dereferenced value binds the numeric parameters by copy.
+    match *dt {
+        // Parameterized types.
+        DataType::Decimal(precision, scale) => DataTypes::decimal(precision, scale),
+        DataType::Char(length) => DataTypes::char(length),
+        DataType::Binary(length) => DataTypes::binary(length),
+        // Numeric and boolean types.
+        DataType::Boolean => DataTypes::boolean(),
+        DataType::Tinyint => DataTypes::tinyint(),
+        DataType::Smallint => DataTypes::smallint(),
+        DataType::Int => DataTypes::int(),
+        DataType::Bigint => DataTypes::bigint(),
+        DataType::Float => DataTypes::float(),
+        DataType::Double => DataTypes::double(),
+        // Text and raw bytes.
+        DataType::String => DataTypes::string(),
+        DataType::Bytes => DataTypes::bytes(),
+        // Temporal types.
+        DataType::Date => DataTypes::date(),
+        DataType::Time => DataTypes::time(),
+        DataType::Timestamp => DataTypes::timestamp(),
+        DataType::TimestampLtz => DataTypes::timestamp_ltz(),
+    }
+}
+
+/// Decoded from `%Fluss.Schema{}` Elixir struct.
+///
+/// Consumed by `table_descriptor_new` to build a Fluss `Schema`.
+#[derive(NifStruct)]
+#[module = "Fluss.Schema"]
+pub struct NifSchema {
+    // Column definitions as (name, type) pairs, added to the schema in order.
+    pub columns: Vec<(String, DataType)>,
+    // Primary-key column names; an empty list means no primary key is set.
+    pub primary_key: Vec<String>,
+}
+
+/// NIF that builds a `TableDescriptor` resource from an Elixir schema.
+///
+/// - `schema`: columns are added in order; the primary key is set only when
+///   the list is non-empty.
+/// - `bucket_count`: when present, passed to `distributed_by` with an empty
+///   bucket-key list.
+/// - `properties`: key/value pairs applied to the descriptor in order.
+///
+/// Schema or descriptor build failures are converted with `to_nif_err`.
+#[rustler::nif]
+fn table_descriptor_new(
+    schema: NifSchema,
+    bucket_count: Option<i32>,
+    properties: Vec<(String, String)>,
+) -> Result<ResourceArc<TableDescriptorResource>, rustler::Error> {
+    let with_columns = schema
+        .columns
+        .iter()
+        .fold(Schema::builder(), |acc, (name, dt)| {
+            acc.column(name, to_fluss_type(dt))
+        });
+    let schema_builder = if schema.primary_key.is_empty() {
+        with_columns
+    } else {
+        with_columns.primary_key(schema.primary_key)
+    };
+    let built_schema = schema_builder.build().map_err(to_nif_err)?;
+
+    let base = TableDescriptor::builder().schema(built_schema);
+    let distributed = match bucket_count {
+        Some(count) => base.distributed_by(Some(count), vec![]),
+        None => base,
+    };
+    let descriptor = properties
+        .into_iter()
+        .fold(distributed, |acc, (key, value)| acc.property(&key, &value))
+        .build()
+        .map_err(to_nif_err)?;
+
+    Ok(ResourceArc::new(TableDescriptorResource {
+        inner: descriptor,
+    }))
+}
diff --git a/fluss-rust/bindings/elixir/native/fluss_nif/src/table.rs b/fluss-rust/bindings/elixir/native/fluss_nif/src/table.rs
new file mode 100644
index 0000000000..d48ff7ab29
--- /dev/null
+++ b/fluss-rust/bindings/elixir/native/fluss_nif/src/table.rs
@@ -0,0 +1,89 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::async_nif;
+use crate::connection::ConnectionResource;
+use fluss::client::{FlussConnection, FlussTable, Metadata};
+use fluss::error::Error;
+use fluss::metadata::{Column, TableInfo, TablePath};
+use rustler::{Env, ResourceArc, Term};
+use std::sync::Arc;
+
+/// Holds the data needed to reconstruct FlussTable (which has a lifetime
+/// tied to FlussConnection). We store the Arc<FlussConnection> to keep
+/// it alive and reconstruct short-lived FlussTable instances on demand.
+pub struct TableResource {
+    // Shared handle that keeps the connection alive for this resource.
+    pub connection: Arc<FlussConnection>,
+    // Metadata handle cloned into each reconstructed `FlussTable`.
+    pub metadata: Arc<Metadata>,
+    // Table info (including the schema) captured when the table was fetched.
+    pub table_info: TableInfo,
+}
+
+// Manual opt-in to `RefUnwindSafe`.
+// NOTE(review): presumably needed by rustler's resource handling — confirm.
+impl std::panic::RefUnwindSafe for TableResource {}
+
+// Registers the type as a rustler resource.
+#[rustler::resource_impl]
+impl rustler::Resource for TableResource {}
+
+impl TableResource {
+    /// Returns the columns of the stored table schema.
+    pub fn columns(&self) -> &[Column] {
+        let schema = &self.table_info.schema;
+        schema.columns()
+    }
+
+    /// Runs `f` against a temporary `FlussTable` rebuilt from this resource.
+    ///
+    /// The table borrows the stored connection and receives clones of the
+    /// metadata handle and table info; it exists only for the duration of
+    /// the call, and whatever `f` returns is passed back to the caller.
+    pub fn with_table<T>(&self, f: impl FnOnce(&FlussTable<'_>) -> T) -> T {
+        let metadata = self.metadata.clone();
+        let table_info = self.table_info.clone();
+        f(&FlussTable::new(&self.connection, metadata, table_info))
+    }
+}
+
+/// NIF that looks up a table and delivers a `TableResource` asynchronously.
+///
+/// The lookup runs in a task started via `async_nif::spawn_task_with_result`;
+/// the term returned here is whatever that helper returns. On success the
+/// task yields a resource holding the connection, the table's metadata handle
+/// and a copy of its table info; a failed lookup yields the `fluss` error.
+#[rustler::nif]
+fn table_get<'a>(
+    env: Env<'a>,
+    conn: ResourceArc<ConnectionResource>,
+    database_name: String,
+    table_name: String,
+) -> Term<'a> {
+    let connection = conn.inner.clone();
+    async_nif::spawn_task_with_result(env, async move {
+        let path = TablePath::new(&database_name, &table_name);
+
+        let table = connection.get_table(&path).await?;
+        let metadata = table.metadata().clone();
+        let table_info = table.get_table_info().clone();
+        // Release the table before `connection` is moved into the resource.
+        drop(table);
+
+        let resource = TableResource {
+            connection,
+            metadata,
+            table_info,
+        };
+        Ok::<_, Error>(ResourceArc::new(resource))
+    })
+}
+
+/// NIF returning whether the table's stored `TableInfo` reports a primary key.
+#[rustler::nif]
+fn table_has_primary_key(table: ResourceArc<TableResource>) -> bool {
+    table.table_info.has_primary_key()
+}
+
+/// NIF returning the table's column names as strings, in schema order.
+#[rustler::nif]
+fn table_column_names(table: ResourceArc<TableResource>) -> Vec<String> {
+    let columns = table.columns();
+    let mut names = Vec::with_capacity(columns.len());
+    for column in columns {
+        names.push(column.name().to_string());
+    }
+    names
+}
diff --git a/fluss-rust/bindings/elixir/native/fluss_nif/src/write_handle.rs b/fluss-rust/bindings/elixir/native/fluss_nif/src/write_handle.rs
new file mode 100644
index 0000000000..08046660bf
--- /dev/null
+++ b/fluss-rust/bindings/elixir/native/fluss_nif/src/write_handle.rs
@@ -0,0 +1,47 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::async_nif;
+use fluss::client::WriteResultFuture;
+use rustler::{Env, ResourceArc, Term};
+use std::sync::Mutex;
+
+/// NIF resource holding a pending write result that can be awaited once.
+///
+/// The future sits in a `Mutex<Option<..>>` so that `write_handle_wait` can
+/// take it out through a shared reference; after that the slot is `None`.
+pub struct WriteHandleResource {
+    inner: Mutex<Option<WriteResultFuture>>,
+}
+
+// Manual opt-in to `RefUnwindSafe`.
+// NOTE(review): presumably needed by rustler's resource handling — confirm.
+impl std::panic::RefUnwindSafe for WriteHandleResource {}
+
+// Registers the type as a rustler resource.
+#[rustler::resource_impl]
+impl rustler::Resource for WriteHandleResource {}
+
+impl WriteHandleResource {
+    /// Wraps `future` in a handle whose result has not been consumed yet.
+    pub fn new(future: WriteResultFuture) -> Self {
+        let inner = Mutex::new(Some(future));
+        Self { inner }
+    }
+}
+
+/// NIF that waits for the write behind `handle` to complete.
+///
+/// The first call takes the stored future and hands it to
+/// `async_nif::spawn_task`. Any later call finds the slot empty and reports
+/// "WriteHandle already consumed" through `async_nif::send_client_error`.
+#[rustler::nif]
+fn write_handle_wait<'a>(env: Env<'a>, handle: ResourceArc<WriteHandleResource>) -> Term<'a> {
+    // A poisoned lock still guards a usable `Option`, so recover the guard
+    // instead of panicking. The guard is released at the end of this statement.
+    let future = handle
+        .inner
+        .lock()
+        .unwrap_or_else(std::sync::PoisonError::into_inner)
+        .take();
+    match future {
+        Some(f) => async_nif::spawn_task(env, f),
+        None => async_nif::send_client_error(env, "WriteHandle already consumed"),
+    }
+}
diff --git a/fluss-rust/bindings/elixir/test/config_test.exs b/fluss-rust/bindings/elixir/test/config_test.exs
new file mode 100644
index 0000000000..f4b8a11ca1
--- /dev/null
+++ b/fluss-rust/bindings/elixir/test/config_test.exs
@@ -0,0 +1,228 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+defmodule Fluss.ConfigTest do
+  # Unit tests for the `Fluss.Config` struct: the constructor, each setter,
+  # password redaction in `inspect/1`, and setter chaining.
+  use ExUnit.Case, async: true
+
+  alias Fluss.Config
+
+  # Bootstrap address used to build every config under test.
+  @bootstrap "localhost:9123"
+
+  # Fresh config with only the bootstrap servers set.
+  defp new_config, do: Config.new(@bootstrap)
+
+  test "new/1 creates config with bootstrap_servers; all other fields default to nil" do
+    assert new_config() == %Config{bootstrap_servers: "localhost:9123"}
+  end
+
+  test "set_connect_timeout_ms/2 sets the connect timeout" do
+    config = Config.set_connect_timeout_ms(new_config(), 30_000)
+    assert config.connect_timeout_ms == 30_000
+  end
+
+  test "set_remote_file_download_thread_num/2 sets the download thread num" do
+    config = Config.set_remote_file_download_thread_num(new_config(), 4)
+    assert config.remote_file_download_thread_num == 4
+  end
+
+  test "set_scanner_log_fetch_max_bytes/2 sets the fetch max bytes" do
+    config = Config.set_scanner_log_fetch_max_bytes(new_config(), 16_777_216)
+    assert config.scanner_log_fetch_max_bytes == 16_777_216
+  end
+
+  test "set_scanner_log_fetch_max_bytes_for_bucket/2 sets the per-bucket fetch limit" do
+    config = Config.set_scanner_log_fetch_max_bytes_for_bucket(new_config(), 1_048_576)
+    assert config.scanner_log_fetch_max_bytes_for_bucket == 1_048_576
+  end
+
+  test "set_scanner_log_fetch_min_bytes/2 sets the fetch min bytes" do
+    config = Config.set_scanner_log_fetch_min_bytes(new_config(), 1)
+    assert config.scanner_log_fetch_min_bytes == 1
+  end
+
+  test "set_scanner_log_fetch_wait_max_time_ms/2 sets the max wait time" do
+    config = Config.set_scanner_log_fetch_wait_max_time_ms(new_config(), 500)
+    assert config.scanner_log_fetch_wait_max_time_ms == 500
+  end
+
+  test "set_scanner_log_max_poll_records/2 sets the max poll records" do
+    config = Config.set_scanner_log_max_poll_records(new_config(), 1000)
+    assert config.scanner_log_max_poll_records == 1000
+  end
+
+  test "set_scanner_remote_log_prefetch_num/2 sets the prefetch num" do
+    config = Config.set_scanner_remote_log_prefetch_num(new_config(), 2)
+    assert config.scanner_remote_log_prefetch_num == 2
+  end
+
+  test "set_scanner_remote_log_read_concurrency/2 sets the read concurrency" do
+    config = Config.set_scanner_remote_log_read_concurrency(new_config(), 4)
+    assert config.scanner_remote_log_read_concurrency == 4
+  end
+
+  test "set_security_protocol/2 sets the security protocol" do
+    config = Config.set_security_protocol(new_config(), "sasl")
+    assert config.security_protocol == "sasl"
+  end
+
+  test "set_security_sasl_mechanism/2 sets the SASL mechanism" do
+    config = Config.set_security_sasl_mechanism(new_config(), "PLAIN")
+    assert config.security_sasl_mechanism == "PLAIN"
+  end
+
+  test "set_security_sasl_username/2 sets the SASL username" do
+    config = Config.set_security_sasl_username(new_config(), "admin")
+    assert config.security_sasl_username == "admin"
+  end
+
+  test "set_security_sasl_password/2 sets the SASL password" do
+    config = Config.set_security_sasl_password(new_config(), "secret")
+    assert config.security_sasl_password == "secret"
+  end
+
+  test "inspect/1 redacts security_sasl_password when set" do
+    output =
+      new_config()
+      |> Config.set_security_sasl_password("supersecret")
+      |> inspect()
+
+    # The raw password must never appear in the inspected form.
+    refute output =~ "supersecret"
+    assert output =~ "[REDACTED]"
+  end
+
+  test "inspect/1 leaves nil security_sasl_password as nil" do
+    output = inspect(new_config())
+    assert output =~ "security_sasl_password: nil"
+  end
+
+  test "set_writer_acks/2 sets the acks value" do
+    config = Config.set_writer_acks(new_config(), "all")
+    assert config.writer_acks == "all"
+  end
+
+  test "set_writer_bucket_no_key_assigner/2 sets a valid assigner" do
+    config = Config.set_writer_bucket_no_key_assigner(new_config(), :sticky)
+    assert config.writer_bucket_no_key_assigner == :sticky
+  end
+
+  test "set_writer_bucket_no_key_assigner/2 only accepts :sticky or :round_robin" do
+    assert_raise FunctionClauseError, fn ->
+      Config.set_writer_bucket_no_key_assigner(new_config(), :custom)
+    end
+  end
+
+  test "set_writer_buffer_memory_size/2 sets the buffer memory size" do
+    config = Config.set_writer_buffer_memory_size(new_config(), 67_108_864)
+    assert config.writer_buffer_memory_size == 67_108_864
+  end
+
+  test "set_writer_buffer_wait_timeout_ms/2 sets the wait timeout" do
+    config = Config.set_writer_buffer_wait_timeout_ms(new_config(), 5_000)
+    assert config.writer_buffer_wait_timeout_ms == 5_000
+  end
+
+  test "set_writer_enable_idempotence/2 sets the idempotence flag" do
+    config = Config.set_writer_enable_idempotence(new_config(), false)
+    assert config.writer_enable_idempotence == false
+  end
+
+  test "set_writer_max_inflight_requests_per_bucket/2 sets the inflight limit" do
+    config = Config.set_writer_max_inflight_requests_per_bucket(new_config(), 3)
+    assert config.writer_max_inflight_requests_per_bucket == 3
+  end
+
+  test "set_writer_request_max_size/2 sets the request max size" do
+    config = Config.set_writer_request_max_size(new_config(), 2_097_152)
+    assert config.writer_request_max_size == 2_097_152
+  end
+
+  test "set_writer_retries/2 sets the retry count" do
+    config = Config.set_writer_retries(new_config(), 5)
+    assert config.writer_retries == 5
+  end
+
+  test "setters chain correctly" do
+    config =
+      new_config()
+      |> Config.set_writer_acks("all")
+      |> Config.set_writer_retries(3)
+      |> Config.set_writer_bucket_no_key_assigner(:round_robin)
+
+    # Each setter's value must survive the later setters in the chain.
+    assert config.writer_acks == "all"
+    assert config.writer_retries == 3
+    assert config.writer_bucket_no_key_assigner == :round_robin
+  end
+end
diff --git a/fluss-rust/bindings/elixir/test/error_test.exs b/fluss-rust/bindings/elixir/test/error_test.exs
new file mode 100644
index 0000000000..d6d4017597
--- /dev/null
+++ b/fluss-rust/bindings/elixir/test/error_test.exs
@@ -0,0 +1,87 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+defmodule Fluss.ErrorTest do
+  # Tests for `Fluss.Error`: message formatting, the retriable/non-retriable
+  # classification of error codes, and the errors surfaced when a connection
+  # attempt fails.
+  use ExUnit.Case, async: true
+
+  alias Fluss.{Config, Connection, Error}
+
+  # Codes that `Fluss.Error.retriable?/1` must report as retriable.
+  @retriable_codes [
+    :network_exception,
+    :corrupt_message,
+    :schema_not_exist,
+    :log_storage_exception,
+    :kv_storage_exception,
+    :not_leader_or_follower,
+    :corrupt_record_exception,
+    :unknown_table_or_bucket_exception,
+    :request_time_out,
+    :storage_exception,
+    :not_enough_replicas_after_append_exception,
+    :not_enough_replicas_exception,
+    :leader_not_available_exception
+  ]
+
+  # Codes that must NOT be reported as retriable.
+  # NOTE(review): despite its name, :retriable_authenticate_exception is
+  # asserted non-retriable here — confirm this matches the core client.
+  @non_retriable_codes [
+    :client_error,
+    :unknown_server_error,
+    :none,
+    :table_not_exist,
+    :authenticate_exception,
+    :authorization_exception,
+    :record_too_large_exception,
+    :deletion_disabled_exception,
+    :invalid_coordinator_exception,
+    :fenced_leader_epoch_exception,
+    :fenced_tiering_epoch_exception,
+    :retriable_authenticate_exception
+  ]
+
+  # Minimal error carrying only the code under test.
+  defp error_with(code) do
+    %Error{code: code, error_code: 0, message: ""}
+  end
+
+  test "Exception.message/1 formats '[<code>]: <msg>'" do
+    error = %Error{code: :network_exception, error_code: 1, message: "disconnected"}
+    assert Exception.message(error) == "Fluss error [network_exception]: disconnected"
+  end
+
+  test "retriable?/1 returns true for transient protocol codes" do
+    Enum.each(@retriable_codes, fn code ->
+      assert Error.retriable?(error_with(code)), "expected #{code} to be retriable"
+    end)
+  end
+
+  test "retriable?/1 returns false for :client_error and permanent codes" do
+    Enum.each(@non_retriable_codes, fn code ->
+      refute Error.retriable?(error_with(code)), "expected #{code} to not be retriable"
+    end)
+  end
+
+  describe "NIF error surface" do
+    test "unreachable server returns %Fluss.Error{code: :network_exception, error_code: 1}" do
+      result = Connection.new(Config.new("127.0.0.1:1"))
+
+      assert {:error, %Error{code: :network_exception, error_code: 1}} = result
+    end
+
+    test "bang variant raises %Fluss.Error{}" do
+      config = Config.new("127.0.0.1:1")
+
+      assert_raise Error, ~r/\[network_exception\]/, fn -> Connection.new!(config) end
+    end
+  end
+end
diff --git a/fluss-rust/bindings/elixir/test/fluss_test.exs b/fluss-rust/bindings/elixir/test/fluss_test.exs
new file mode 100644
index 0000000000..3eee273482
--- /dev/null
+++ b/fluss-rust/bindings/elixir/test/fluss_test.exs
@@ -0,0 +1,66 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+defmodule FlussTest do
+  use ExUnit.Case
+
+  # The TableDescriptor tests make no explicit assertion: each one passes as long
+  # as building the schema and calling Fluss.TableDescriptor.new! does not raise.
+  describe "TableDescriptor" do
+    test "creates descriptor from schema" do
+      id_only = Fluss.Schema.column(Fluss.Schema.new(), "id", :int)
+      Fluss.TableDescriptor.new!(id_only)
+    end
+
+    test "creates descriptor with bucket count" do
+      id_only = Fluss.Schema.column(Fluss.Schema.new(), "id", :int)
+      Fluss.TableDescriptor.new!(id_only, bucket_count: 3)
+    end
+
+    test "accepts all simple data types" do
+      Fluss.Schema.new()
+      |> Fluss.Schema.column("a", :boolean)
+      |> Fluss.Schema.column("b", :tinyint)
+      |> Fluss.Schema.column("c", :smallint)
+      |> Fluss.Schema.column("d", :int)
+      |> Fluss.Schema.column("e", :bigint)
+      |> Fluss.Schema.column("f", :float)
+      |> Fluss.Schema.column("g", :double)
+      |> Fluss.Schema.column("h", :string)
+      |> Fluss.Schema.column("i", :bytes)
+      |> Fluss.Schema.column("j", :date)
+      |> Fluss.Schema.column("k", :time)
+      |> Fluss.Schema.column("l", :timestamp)
+      |> Fluss.Schema.column("m", :timestamp_ltz)
+      |> Fluss.TableDescriptor.new!()
+    end
+
+    test "accepts parameterized data types" do
+      Fluss.Schema.new()
+      |> Fluss.Schema.column("amount", {:decimal, 10, 2})
+      |> Fluss.Schema.column("code", {:char, 5})
+      |> Fluss.Schema.column("data", {:binary, 16})
+      |> Fluss.TableDescriptor.new!()
+    end
+  end
+
+  describe "earliest_offset/0" do
+    test "returns -2" do
+      assert -2 == Fluss.earliest_offset()
+    end
+  end
+end
diff --git a/fluss-rust/bindings/elixir/test/integration/log_table_test.exs b/fluss-rust/bindings/elixir/test/integration/log_table_test.exs
new file mode 100644
index 0000000000..b3041b9587
--- /dev/null
+++ b/fluss-rust/bindings/elixir/test/integration/log_table_test.exs
@@ -0,0 +1,413 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+defmodule Fluss.Integration.LogTableTest do
+  use ExUnit.Case, async: false
+
+  alias Fluss.Test.Cluster
+
+  @moduletag :integration
+
+  @database "fluss"
+
+  setup_all do
+    # Resolve the bootstrap servers first; a cluster start failure raises here.
+    servers =
+      case Cluster.ensure_started() do
+        {:ok, bootstrap} -> bootstrap
+        {:error, reason} -> raise "Failed to start Fluss cluster: #{reason}"
+      end
+
+    config = Fluss.Config.new(servers)
+
+    # Wait (up to 90 s) until both the connection and an admin call work.
+    {conn, admin} = connect_with_retry(config, 90)
+    %{conn: conn, admin: admin, config: config}
+  end
+
+  describe "append and scan" do
+    test "append rows and scan with log scanner", %{conn: conn, admin: admin} do
+      name = "ex_test_append_and_scan_#{:rand.uniform(100_000)}"
+      cleanup_table(admin, name)
+
+      c1_c2 =
+        Fluss.Schema.new()
+        |> Fluss.Schema.column("c1", :int)
+        |> Fluss.Schema.column("c2", :string)
+
+      descriptor = Fluss.TableDescriptor.new!(c1_c2)
+      :ok = Fluss.Admin.create_table(admin, @database, name, descriptor, false)
+
+      table = Fluss.Table.get!(conn, @database, name)
+      writer = Fluss.AppendWriter.new!(table)
+
+      # Six rows [n, "a<n>"] for n in 1..6
+      Enum.each(1..6, fn n ->
+        {:ok, _} = Fluss.AppendWriter.append(writer, [n, "a#{n}"])
+      end)
+
+      :ok = Fluss.AppendWriter.flush(writer)
+
+      # Read everything back, starting at the earliest offset
+      scanner = Fluss.LogScanner.new!(table)
+      :ok = Fluss.LogScanner.subscribe(scanner, 0, Fluss.earliest_offset())
+
+      records = poll_records(scanner, 6)
+
+      assert length(records) == 6
+
+      by_c1 = Enum.sort_by(records, & &1[:row][:c1])
+
+      Enum.each(Enum.with_index(by_c1, 1), fn {record, position} ->
+        assert record[:row][:c1] == position
+        assert record[:row][:c2] == "a#{position}"
+        assert record[:change_type] == :append_only
+      end)
+
+      # Unsubscribe should not error
+      :ok = Fluss.LogScanner.unsubscribe(scanner, 0)
+
+      cleanup_table(admin, name)
+    end
+
+    test "append with nil values", %{conn: conn, admin: admin} do
+      name = "ex_test_append_nil_#{:rand.uniform(100_000)}"
+      cleanup_table(admin, name)
+
+      id_name =
+        Fluss.Schema.new()
+        |> Fluss.Schema.column("id", :int)
+        |> Fluss.Schema.column("name", :string)
+
+      descriptor = Fluss.TableDescriptor.new!(id_name)
+      :ok = Fluss.Admin.create_table(admin, @database, name, descriptor, false)
+
+      table = Fluss.Table.get!(conn, @database, name)
+      writer = Fluss.AppendWriter.new!(table)
+
+      {:ok, _} = Fluss.AppendWriter.append(writer, [1, nil])
+      {:ok, _} = Fluss.AppendWriter.append(writer, [2, "present"])
+      :ok = Fluss.AppendWriter.flush(writer)
+
+      scanner = Fluss.LogScanner.new!(table)
+      :ok = Fluss.LogScanner.subscribe(scanner, 0, Fluss.earliest_offset())
+
+      records = poll_records(scanner, 2)
+      assert length(records) == 2
+
+      # id 1 was written with a nil name, id 2 with "present"
+      names = records |> Enum.sort_by(& &1[:row][:id]) |> Enum.map(& &1[:row][:name])
+      assert names == [nil, "present"]
+
+      cleanup_table(admin, name)
+    end
+  end
+
+  describe "multiple data types" do
+    test "tinyint, smallint, int, bigint, float, double, string, boolean", %{
+      conn: conn,
+      admin: admin
+    } do
+      name = "ex_test_data_types_#{:rand.uniform(100_000)}"
+      cleanup_table(admin, name)
+
+      typed_columns =
+        Fluss.Schema.new()
+        |> Fluss.Schema.column("a_tinyint", :tinyint)
+        |> Fluss.Schema.column("b_smallint", :smallint)
+        |> Fluss.Schema.column("c_int", :int)
+        |> Fluss.Schema.column("d_bigint", :bigint)
+        |> Fluss.Schema.column("e_float", :float)
+        |> Fluss.Schema.column("f_double", :double)
+        |> Fluss.Schema.column("g_string", :string)
+        |> Fluss.Schema.column("h_bool", :boolean)
+
+      descriptor = Fluss.TableDescriptor.new!(typed_columns)
+      :ok = Fluss.Admin.create_table(admin, @database, name, descriptor, false)
+
+      table = Fluss.Table.get!(conn, @database, name)
+      writer = Fluss.AppendWriter.new!(table)
+
+      high_row = [
+        127,
+        32_000,
+        42,
+        1_000_000_000_000,
+        3.14,
+        2.718281828,
+        "hello",
+        true
+      ]
+
+      low_row = [-128, -32_000, -1, -999, 0.0, -1.5, "", false]
+      {:ok, _} = Fluss.AppendWriter.append(writer, high_row)
+      {:ok, _} = Fluss.AppendWriter.append(writer, low_row)
+
+      :ok = Fluss.AppendWriter.flush(writer)
+
+      scanner = Fluss.LogScanner.new!(table)
+      :ok = Fluss.LogScanner.subscribe(scanner, 0, Fluss.earliest_offset())
+
+      records = poll_records(scanner, 2)
+      assert length(records) == 2
+
+      # Sorted by c_int (-1 before 42). e_float / f_double are written but not asserted below.
+      [low, high] =
+        records |> Enum.sort_by(& &1[:row][:c_int]) |> Enum.map(& &1[:row])
+
+      assert low[:a_tinyint] == -128
+      assert low[:b_smallint] == -32_000
+      assert low[:c_int] == -1
+      assert low[:d_bigint] == -999
+      assert low[:g_string] == ""
+      assert low[:h_bool] == false
+
+      assert high[:a_tinyint] == 127
+      assert high[:b_smallint] == 32_000
+      assert high[:c_int] == 42
+      assert high[:d_bigint] == 1_000_000_000_000
+      assert high[:g_string] == "hello"
+      assert high[:h_bool] == true
+
+      cleanup_table(admin, name)
+    end
+  end
+
+  describe "subscribe_buckets" do
+    test "subscribe to multiple buckets at once", %{conn: conn, admin: admin} do
+      name = "ex_test_subscribe_buckets_#{:rand.uniform(100_000)}"
+      cleanup_table(admin, name)
+
+      id_val =
+        Fluss.Schema.new()
+        |> Fluss.Schema.column("id", :int)
+        |> Fluss.Schema.column("val", :string)
+
+      descriptor = Fluss.TableDescriptor.new!(id_val, bucket_count: 3)
+      :ok = Fluss.Admin.create_table(admin, @database, name, descriptor, false)
+
+      table = Fluss.Table.get!(conn, @database, name)
+      writer = Fluss.AppendWriter.new!(table)
+
+      Enum.each(1..9, fn i ->
+        {:ok, _} = Fluss.AppendWriter.append(writer, [i, "v#{i}"])
+      end)
+
+      :ok = Fluss.AppendWriter.flush(writer)
+
+      scanner = Fluss.LogScanner.new!(table)
+      earliest = Fluss.earliest_offset()
+
+      # One {bucket, offset} pair for each of buckets 0, 1 and 2 (the table was
+      # created with bucket_count: 3), every one starting at the earliest offset.
+      subscriptions =
+        for bucket <- 0..2, do: {bucket, earliest}
+
+      :ok = Fluss.LogScanner.subscribe_buckets(scanner, subscriptions)
+
+      records = poll_records(scanner, 9)
+      assert length(records) == 9
+
+      ids = Enum.sort(Enum.map(records, & &1[:row][:id]))
+      assert ids == Enum.to_list(1..9)
+
+      cleanup_table(admin, name)
+    end
+  end
+
+  describe "admin operations" do
+    test "create and drop database", %{admin: admin} do
+      database = "ex_test_db_#{:rand.uniform(100_000)}"
+      :ok = Fluss.Admin.create_database(admin, database, true)
+
+      {:ok, listed} = Fluss.Admin.list_databases(admin)
+      assert database in listed
+
+      :ok = Fluss.Admin.drop_database(admin, database, true)
+    end
+
+    test "list tables", %{admin: admin} do
+      name = "ex_test_list_tables_#{:rand.uniform(100_000)}"
+      cleanup_table(admin, name)
+
+      # A single int column is all this test needs; only the table's
+      # presence in the listing is checked.
+      id_only = Fluss.Schema.column(Fluss.Schema.new(), "id", :int)
+
+      descriptor = Fluss.TableDescriptor.new!(id_only)
+      :ok = Fluss.Admin.create_table(admin, @database, name, descriptor, false)
+
+      {:ok, listed} = Fluss.Admin.list_tables(admin, @database)
+      assert name in listed
+
+      cleanup_table(admin, name)
+    end
+
+    test "table metadata", %{conn: conn, admin: admin} do
+      name = "ex_test_table_meta_#{:rand.uniform(100_000)}"
+      cleanup_table(admin, name)
+
+      id_name =
+        Fluss.Schema.new()
+        |> Fluss.Schema.column("id", :int)
+        |> Fluss.Schema.column("name", :string)
+
+      descriptor = Fluss.TableDescriptor.new!(id_name)
+      :ok = Fluss.Admin.create_table(admin, @database, name, descriptor, false)
+
+      table = Fluss.Table.get!(conn, @database, name)
+      assert Fluss.Table.has_primary_key?(table) == false
+      assert Fluss.Table.column_names(table) == ["id", "name"]
+
+      cleanup_table(admin, name)
+    end
+  end
+
+  describe "scan from offset" do
+    test "subscribe from specific offset skips earlier records", %{conn: conn, admin: admin} do
+      name = "ex_test_scan_offset_#{:rand.uniform(100_000)}"
+      cleanup_table(admin, name)
+
+      # Single int column; the rows written below are just the ids 1..5
+      id_only =
+        Fluss.Schema.new() |> Fluss.Schema.column("id", :int)
+
+      descriptor = Fluss.TableDescriptor.new!(id_only)
+      :ok = Fluss.Admin.create_table(admin, @database, name, descriptor, false)
+
+      table = Fluss.Table.get!(conn, @database, name)
+      writer = Fluss.AppendWriter.new!(table)
+
+      Enum.each(1..5, fn i ->
+        {:ok, _} = Fluss.AppendWriter.append(writer, [i])
+      end)
+
+      :ok = Fluss.AppendWriter.flush(writer)
+
+      # Subscribe from offset 3, should skip first 3 records
+      scanner = Fluss.LogScanner.new!(table)
+      :ok = Fluss.LogScanner.subscribe(scanner, 0, 3)
+
+      records = poll_records(scanner, 2)
+      assert length(records) == 2
+
+      ids = Enum.sort(Enum.map(records, & &1[:row][:id]))
+      assert ids == [4, 5]
+
+      cleanup_table(admin, name)
+    end
+  end
+
+  describe "multiple flushes" do
+    test "append, flush, append more, flush, scan all", %{conn: conn, admin: admin} do
+      name = "ex_test_multi_flush_#{:rand.uniform(100_000)}"
+      cleanup_table(admin, name)
+
+      id_batch =
+        Fluss.Schema.new()
+        |> Fluss.Schema.column("id", :int)
+        |> Fluss.Schema.column("batch", :string)
+
+      descriptor = Fluss.TableDescriptor.new!(id_batch)
+      :ok = Fluss.Admin.create_table(admin, @database, name, descriptor, false)
+
+      table = Fluss.Table.get!(conn, @database, name)
+      writer = Fluss.AppendWriter.new!(table)
+
+      # First batch
+      {:ok, _} = Fluss.AppendWriter.append(writer, [1, "first"])
+      {:ok, _} = Fluss.AppendWriter.append(writer, [2, "first"])
+      :ok = Fluss.AppendWriter.flush(writer)
+
+      # Second batch
+      {:ok, _} = Fluss.AppendWriter.append(writer, [3, "second"])
+      {:ok, _} = Fluss.AppendWriter.append(writer, [4, "second"])
+      :ok = Fluss.AppendWriter.flush(writer)
+
+      scanner = Fluss.LogScanner.new!(table)
+      :ok = Fluss.LogScanner.subscribe(scanner, 0, Fluss.earliest_offset())
+
+      records = poll_records(scanner, 4)
+      assert length(records) == 4
+
+      # ids 1-2 were written before the first flush, ids 3-4 before the second
+      batches =
+        records |> Enum.sort_by(& &1[:row][:id]) |> Enum.map(& &1[:row][:batch])
+
+      assert batches == ["first", "first", "second", "second"]
+
+      cleanup_table(admin, name)
+    end
+  end
+
+  # Polls until expected_count records were collected or timeout_ms ran out; returns what it has.
+  defp poll_records(scanner, expected_count, timeout_ms \\ 10_000) do
+    do_poll(scanner, expected_count, System.monotonic_time(:millisecond) + timeout_ms, [])
+  end
+
+  # Recursive worker for poll_records/3: returns acc once it holds expected_count
+  # records or the monotonic-clock deadline (milliseconds) has passed.
+  defp do_poll(_scanner, expected_count, _deadline, acc) when length(acc) >= expected_count,
+    do: acc
+
+  defp do_poll(scanner, expected_count, deadline, acc) do
+    remaining = deadline - System.monotonic_time(:millisecond)
+    if remaining <= 0 do
+      acc
+    else
+      poll_ms = min(5_000, remaining)
+      :ok = Fluss.LogScanner.poll(scanner, poll_ms)
+      # Keep a fixed 1 s grace over the poll timeout. min(6_000, remaining) lost that
+      # grace near the deadline, where a poll reply arriving at the timeout could be missed.
+      receive do
+        {:fluss_records, records} ->
+          do_poll(scanner, expected_count, deadline, acc ++ records)
+        {:fluss_poll_error, reason} ->
+          IO.warn("poll error during test: #{inspect(reason)}")
+          do_poll(scanner, expected_count, deadline, acc)
+      after
+        poll_ms + 1_000 -> acc
+      end
+    end
+  end
+
+  # Drops the table in @database; callers in this file never check the returned value.
+  defp cleanup_table(admin, table_name),
+    do: Fluss.Admin.drop_table(admin, @database, table_name, true)
+
+  # Turns timeout_s into a monotonic-clock deadline (seconds) and starts the retry loop.
+  defp connect_with_retry(config, timeout_s) do
+    do_connect_retry(config, System.monotonic_time(:second) + timeout_s, nil)
+  end
+
+  # Retries connect + list_databases every 2 s; raises with the last failure after the deadline.
+  defp do_connect_retry(config, deadline, last_error) do
+    if System.monotonic_time(:second) >= deadline,
+      do: raise("Could not connect to Fluss cluster: #{inspect(last_error)}")
+
+    try do
+      conn = Fluss.Connection.new!(config)
+      admin = Fluss.Admin.new!(conn)
+      {:ok, _databases} = Fluss.Admin.list_databases(admin)
+      {conn, admin}
+    rescue
+      failure ->
+        Process.sleep(2_000)
+        do_connect_retry(config, deadline, failure)
+    end
+  end
+end
diff --git a/fluss-rust/bindings/elixir/test/support/cluster.ex b/fluss-rust/bindings/elixir/test/support/cluster.ex
new file mode 100644
index 0000000000..40f0f68d35
--- /dev/null
+++ b/fluss-rust/bindings/elixir/test/support/cluster.ex
@@ -0,0 +1,130 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+defmodule Fluss.Test.Cluster do
+  @moduledoc false
+
+  # Shells out to the `fluss-test-cluster` CLI (from `crates/fluss-test-cluster`),
+  # the same binary used by the Python and C++ integration tests.
+
+  @cluster_name "shared-test"
+  @cluster_json_prefix "CLUSTER_JSON: "
+
+  # Returns {:ok, bootstrap_servers} or {:error, message}. When the
+  # FLUSS_BOOTSTRAP_SERVERS variable is set its value is returned and nothing is started.
+  def ensure_started do
+    case System.get_env("FLUSS_BOOTSTRAP_SERVERS") do
+      nil -> start_cluster()
+      servers -> {:ok, servers}
+    end
+  end
+
+  # Best-effort shutdown that always returns :ok: nothing is run when
+  # FLUSS_BOOTSTRAP_SERVERS is set or the CLI cannot be found; the CLI's exit status is ignored.
+  def stop do
+    with nil <- System.get_env("FLUSS_BOOTSTRAP_SERVERS"),
+         {:ok, cli} <- find_cli_binary() do
+      System.cmd(cli, ["stop", "--name", @cluster_name], stderr_to_stdout: true)
+    end
+
+    :ok
+  end
+
+  # Runs the CLI's `start` command and extracts the bootstrap address from its output.
+  defp start_cluster do
+    with {:ok, cli} <- find_cli_binary(),
+         {output, 0} <-
+           System.cmd(cli, ["start", "--sasl", "--name", @cluster_name], stderr_to_stdout: true) do
+      parse_cluster_json(output)
+    else
+      {output, code} when is_binary(output) ->
+        {:error, "fluss-test-cluster start failed (exit #{code}):\n#{output}"}
+
+      {:error, _} = err ->
+        err
+    end
+  end
+
+  # A non-empty FLUSS_TEST_CLUSTER_BIN must point at a regular file; without it
+  # the binary is looked up under the cargo workspace's target directory.
+  defp find_cli_binary do
+    case System.get_env("FLUSS_TEST_CLUSTER_BIN") do
+      bin when is_binary(bin) and bin != "" ->
+        if File.regular?(bin),
+          do: {:ok, bin},
+          else: {:error, "FLUSS_TEST_CLUSTER_BIN=#{bin} does not exist"}
+
+      _ ->
+        locate_via_cargo()
+    end
+  end
+
+  # System.cmd/3 raises when its executable cannot be found, so cargo is resolved
+  # up front and its absence is reported as an ordinary {:error, message}.
+  defp locate_via_cargo do
+    args = ["locate-project", "--workspace", "--message-format", "plain"]
+    with cargo when is_binary(cargo) <- System.find_executable("cargo"),
+         {output, 0} <- System.cmd(cargo, args, stderr_to_stdout: true) do
+      output |> String.trim() |> Path.dirname() |> find_binary_in_target()
+    else
+      nil -> {:error, "cargo executable not found in PATH"}
+      {output, code} -> {:error, "cargo locate-project failed (exit #{code}): #{output}"}
+    end
+  end
+
+  # Checks target/debug before target/release and returns the first binary that exists.
+  defp find_binary_in_target(root) do
+    Enum.find_value(
+      ["debug", "release"],
+      {:error, "fluss-test-cluster binary not found. Run: cargo build -p fluss-test-cluster"},
+      &check_binary(root, &1)
+    )
+  end
+
+  defp check_binary(root, profile) do
+    path = Path.join([root, "target", profile, "fluss-test-cluster"])
+    if File.regular?(path), do: {:ok, path}, else: nil
+  end
+
+  # Scans the output line by line for the first usable CLUSTER_JSON payload.
+  defp parse_cluster_json(output) do
+    output
+    |> String.split("\n", trim: true)
+    |> Enum.find_value(
+      {:error, "No #{@cluster_json_prefix} token in output:\n#{output}"},
+      &extract_bootstrap/1
+    )
+  end
+
+  # {:ok, servers} when the line contains the prefix followed by a decodable payload, else nil.
+  defp extract_bootstrap(line) do
+    with [_, json] <- String.split(line, @cluster_json_prefix, parts: 2),
+         {:ok, bootstrap} <- decode_bootstrap(json) do
+      {:ok, bootstrap}
+    else
+      _ -> nil
+    end
+  end
+
+  # Minimal JSON extractor for `bootstrap_servers`: avoids adding a JSON dep just for tests.
+  defp decode_bootstrap(json) do
+    case Regex.run(~r/"bootstrap_servers"\s*:\s*"([^"]+)"/, json) do
+      [_, servers] -> {:ok, servers}
+      _ -> {:error, "no bootstrap_servers in: #{json}"}
+    end
+  end
+end
diff --git a/fluss-rust/bindings/elixir/test/test_helper.exs b/fluss-rust/bindings/elixir/test/test_helper.exs
new file mode 100644
index 0000000000..b15b1f44a8
--- /dev/null
+++ b/fluss-rust/bindings/elixir/test/test_helper.exs
@@ -0,0 +1,27 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Integration tests need a Docker cluster, so they are excluded unless requested:
+#   mix test --include integration
+ExUnit.start(exclude: [:integration])
+
+# Stop Docker containers after all tests finish (matches Python's pytest_unconfigure).
+ExUnit.after_suite(fn _suite_result ->
+  if is_nil(System.get_env("FLUSS_BOOTSTRAP_SERVERS")) do
+    Fluss.Test.Cluster.stop()
+  end
+end)
diff --git a/fluss-rust/bindings/python/Cargo.toml b/fluss-rust/bindings/python/Cargo.toml
new file mode 100644
index 0000000000..30ac0469bc
--- /dev/null
+++ b/fluss-rust/bindings/python/Cargo.toml
@@ -0,0 +1,40 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+[package]
+name = "fluss_python"
+edition.workspace = true
+version.workspace = true
+license.workspace = true
+rust-version.workspace = true
+
+[lib]
+name = "fluss"  # library target name; differs from the fluss_python package name above
+crate-type = ["cdylib"]
+
+[dependencies]
+pyo3 = { version = "0.26.0", features = ["extension-module", "generate-import-lib"] }
+fluss = { workspace = true, features = ["storage-all"] }
+tokio = { workspace = true }
+arrow = { workspace = true }
+arrow-pyarrow = "57.0.0"  # NOTE(review): arrow-* versions are given here while arrow is inherited from the workspace — confirm they stay aligned
+arrow-schema = "57.0.0"
+arrow-array = "57.0.0"
+pyo3-async-runtimes = { version = "0.26.0", features = ["tokio-runtime"] }  # same 0.26.0 requirement as pyo3 above
+jiff = { workspace = true }
+bigdecimal = "0.4"
+indexmap = "2"
diff --git a/fluss-rust/bindings/python/DEPENDENCIES.rust.tsv b/fluss-rust/bindings/python/DEPENDENCIES.rust.tsv
new file mode 100644
index 0000000000..bc7b9b78f2
--- /dev/null
+++ b/fluss-rust/bindings/python/DEPENDENCIES.rust.tsv
@@ -0,0 +1,310 @@
+crate	Apache-2.0	Apache-2.0 WITH LLVM-exception	BSD-2-Clause	BSD-3-Clause	BSL-1.0	CC0-1.0	CDLA-Permissive-2.0	ISC	LGPL-2.1-or-later	MIT	Unicode-3.0	Unlicense	Zlib
+ahash@0.8.12	X									X			
+aho-corasick@1.1.4										X		X	
+android_system_properties@0.1.5	X									X			
+anstream@1.0.0	X									X			
+anstyle@1.0.14	X									X			
+anstyle-parse@1.0.0	X									X			
+anstyle-query@1.1.5	X									X			
+anstyle-wincon@3.0.11	X									X			
+anyhow@1.0.102	X									X			
+arrow@57.3.0	X												
+arrow-arith@57.3.0	X												
+arrow-array@57.3.0	X												
+arrow-buffer@57.3.0	X												
+arrow-cast@57.3.0	X												
+arrow-csv@57.3.0	X												
+arrow-data@57.3.0	X												
+arrow-ipc@57.3.0	X												
+arrow-json@57.3.0	X												
+arrow-ord@57.3.0	X												
+arrow-pyarrow@57.3.0	X												
+arrow-row@57.3.0	X												
+arrow-schema@57.3.0	X												
+arrow-select@57.3.0	X												
+arrow-string@57.3.0	X												
+async-trait@0.1.89	X									X			
+atoi@2.0.0										X			
+atomic-waker@1.1.2	X									X			
+autocfg@1.5.0	X									X			
+backon@1.6.0	X												
+base64@0.22.1	X									X			
+bigdecimal@0.4.10	X									X			
+bitflags@2.11.0	X									X			
+bitvec@1.0.1										X			
+block-buffer@0.10.4	X									X			
+bumpalo@3.20.2	X									X			
+byteorder@1.5.0										X		X	
+bytes@1.11.1										X			
+cc@1.2.57	X									X			
+cfg-if@1.0.4	X									X			
+chrono@0.4.44	X									X			
+clap@4.6.0	X									X			
+clap_builder@4.6.0	X									X			
+clap_derive@4.6.0	X									X			
+clap_lex@1.1.0	X									X			
+colorchoice@1.0.5	X									X			
+const-oid@0.9.6	X									X			
+const-random@0.1.18	X									X			
+const-random-macro@0.1.16	X									X			
+core-foundation-sys@0.8.7	X									X			
+cpufeatures@0.2.17	X									X			
+crc32c@0.6.8	X									X			
+crossbeam-utils@0.8.21	X									X			
+crunchy@0.2.4										X			
+crypto-common@0.1.7	X									X			
+csv@1.4.0										X		X	
+csv-core@0.1.13										X		X	
+dashmap@6.1.0										X			
+delegate@0.13.5	X									X			
+digest@0.10.7	X									X			
+displaydoc@0.2.5	X									X			
+either@1.15.0	X									X			
+equivalent@1.0.2	X									X			
+errno@0.3.14	X									X			
+fastrand@2.3.0	X									X			
+find-msvc-tools@0.1.9	X									X			
+fixedbitset@0.5.7	X									X			
+flatbuffers@25.12.19	X												
+fluss-rs@0.1.0	X												
+fluss_python@0.1.0	X												
+fnv@1.0.7	X									X			
+foldhash@0.1.5													X
+form_urlencoded@1.2.2	X									X			
+funty@2.0.0										X			
+futures@0.3.32	X									X			
+futures-channel@0.3.32	X									X			
+futures-core@0.3.32	X									X			
+futures-executor@0.3.32	X									X			
+futures-io@0.3.32	X									X			
+futures-macro@0.3.32	X									X			
+futures-sink@0.3.32	X									X			
+futures-task@0.3.32	X									X			
+futures-util@0.3.32	X									X			
+generic-array@0.14.7										X			
+getrandom@0.2.17	X									X			
+getrandom@0.3.4	X									X			
+getrandom@0.4.2	X									X			
+gloo-timers@0.3.0	X									X			
+h2@0.4.13										X			
+half@2.7.1	X									X			
+hashbrown@0.14.5	X									X			
+hashbrown@0.15.5	X									X			
+hashbrown@0.16.1	X									X			
+heck@0.5.0	X									X			
+hex@0.4.3	X									X			
+hmac@0.12.1	X									X			
+home@0.5.12	X									X			
+http@1.4.0	X									X			
+http-body@1.0.1										X			
+http-body-util@0.1.3										X			
+httparse@1.10.1	X									X			
+httpdate@1.0.3	X									X			
+hyper@1.8.1										X			
+hyper-rustls@0.27.7	X							X		X			
+hyper-util@0.1.20										X			
+iana-time-zone@0.1.65	X									X			
+iana-time-zone-haiku@0.1.2	X									X			
+icu_collections@2.1.1											X		
+icu_locale_core@2.1.1											X		
+icu_normalizer@2.1.1											X		
+icu_normalizer_data@2.1.1											X		
+icu_properties@2.1.2											X		
+icu_properties_data@2.1.2											X		
+icu_provider@2.1.1											X		
+idna@1.1.0	X									X			
+idna_adapter@1.2.1	X									X			
+indexmap@2.13.0	X									X			
+indoc@2.0.7	X									X			
+ipnet@2.12.0	X									X			
+iri-string@0.7.11	X									X			
+is_terminal_polyfill@1.70.2	X									X			
+itertools@0.14.0	X									X			
+itoa@1.0.18	X									X			
+jiff@0.2.23										X		X	
+jiff-tzdb@0.1.6										X		X	
+jiff-tzdb-platform@0.1.3										X		X	
+jobserver@0.1.34	X									X			
+js-sys@0.3.91	X									X			
+lexical-core@1.0.6	X									X			
+lexical-parse-float@1.0.6	X									X			
+lexical-parse-integer@1.0.6	X									X			
+lexical-util@1.0.7	X									X			
+lexical-write-float@1.0.6	X									X			
+lexical-write-integer@1.0.6	X									X			
+libc@0.2.183	X									X			
+libm@0.2.16										X			
+linked-hash-map@0.5.6	X									X			
+linux-raw-sys@0.12.1	X	X								X			
+litemap@0.8.1											X		
+lock_api@0.4.14	X									X			
+log@0.4.29	X									X			
+lz4_flex@0.12.1										X			
+md-5@0.10.6	X									X			
+memchr@2.8.0										X		X	
+memoffset@0.9.1										X			
+mio@1.1.1										X			
+multimap@0.10.1	X									X			
+num-bigint@0.4.6	X									X			
+num-complex@0.4.6	X									X			
+num-integer@0.1.46	X									X			
+num-traits@0.2.19	X									X			
+once_cell@1.21.4	X									X			
+once_cell_polyfill@1.70.2	X									X			
+opendal@0.55.0	X												
+ordered-float@5.1.0										X			
+parking_lot@0.12.5	X									X			
+parking_lot_core@0.9.12	X									X			
+parse-display@0.10.0	X									X			
+parse-display-derive@0.10.0	X									X			
+percent-encoding@2.3.2	X									X			
+petgraph@0.8.3	X									X			
+pin-project-lite@0.2.17	X									X			
+pin-utils@0.1.0	X									X			
+pkg-config@0.3.32	X									X			
+portable-atomic@1.13.1	X									X			
+portable-atomic-util@0.2.6	X									X			
+potential_utf@0.1.4											X		
+ppv-lite86@0.2.21	X									X			
+prettyplease@0.2.37	X									X			
+proc-macro2@1.0.106	X									X			
+prost@0.14.3	X												
+prost-build@0.14.3	X												
+prost-derive@0.14.3	X												
+prost-types@0.14.3	X												
+pyo3@0.26.0	X									X			
+pyo3-async-runtimes@0.26.0	X												
+pyo3-build-config@0.26.0	X									X			
+pyo3-ffi@0.26.0	X									X			
+pyo3-macros@0.26.0	X									X			
+pyo3-macros-backend@0.26.0	X									X			
+python3-dll-a@0.2.14										X			
+quick-xml@0.37.5										X			
+quick-xml@0.38.4										X			
+quote@1.0.45	X									X			
+r-efi@5.3.0	X								X	X			
+r-efi@6.0.0	X								X	X			
+radium@0.7.0										X			
+rand@0.8.5	X									X			
+rand@0.9.2	X									X			
+rand_chacha@0.3.1	X									X			
+rand_chacha@0.9.0	X									X			
+rand_core@0.6.4	X									X			
+rand_core@0.9.5	X									X			
+redox_syscall@0.5.18										X			
+regex@1.12.3	X									X			
+regex-automata@0.4.14	X									X			
+regex-syntax@0.8.10	X									X			
+reqsign@0.16.5	X												
+reqwest@0.12.28	X									X			
+ring@0.17.14	X							X					
+rustc_version@0.4.1	X									X			
+rustix@1.1.4	X	X								X			
+rustls@0.23.37	X							X		X			
+rustls-pki-types@1.14.0	X									X			
+rustls-webpki@0.103.10								X					
+rustversion@1.0.22	X									X			
+ryu@1.0.23	X				X								
+scopeguard@1.2.0	X									X			
+semver@1.0.27	X									X			
+serde@1.0.228	X									X			
+serde_core@1.0.228	X									X			
+serde_derive@1.0.228	X									X			
+serde_json@1.0.149	X									X			
+serde_urlencoded@0.7.1	X									X			
+sha1@0.10.6	X									X			
+sha2@0.10.9	X									X			
+shlex@1.3.0	X									X			
+signal-hook-registry@1.4.8	X									X			
+simdutf8@0.1.5	X									X			
+slab@0.4.12										X			
+smallvec@1.15.1	X									X			
+snafu@0.8.9	X									X			
+snafu-derive@0.8.9	X									X			
+socket2@0.6.3	X									X			
+stable_deref_trait@1.2.1	X									X			
+strsim@0.11.1										X			
+structmeta@0.3.0	X									X			
+structmeta-derive@0.3.0	X									X			
+strum@0.26.3										X			
+strum_macros@0.26.4										X			
+subtle@2.6.1				X									
+syn@2.0.117	X									X			
+sync_wrapper@1.0.2	X												
+synstructure@0.13.2										X			
+tap@1.0.1										X			
+target-lexicon@0.13.5		X											
+tempfile@3.27.0	X									X			
+thiserror@1.0.69	X									X			
+thiserror-impl@1.0.69	X									X			
+tiny-keccak@2.0.2						X							
+tinystr@0.8.2											X		
+tokio@1.50.0										X			
+tokio-macros@2.6.1										X			
+tokio-rustls@0.26.4	X									X			
+tokio-util@0.7.18										X			
+tower@0.5.3										X			
+tower-http@0.6.8										X			
+tower-layer@0.3.3										X			
+tower-service@0.3.3										X			
+tracing@0.1.44										X			
+tracing-attributes@0.1.31										X			
+tracing-core@0.1.36										X			
+try-lock@0.2.5										X			
+twox-hash@2.1.2										X			
+typenum@1.19.0	X									X			
+unicode-ident@1.0.24	X									X	X		
+unindent@0.2.4	X									X			
+untrusted@0.9.0								X					
+url@2.5.8	X									X			
+utf8_iter@1.0.4	X									X			
+utf8parse@0.2.2	X									X			
+uuid@1.22.0	X									X			
+value-bag@1.12.0	X									X			
+version_check@0.9.5	X									X			
+want@0.3.1										X			
+wasi@0.11.1+wasi-snapshot-preview1	X	X								X			
+wasip2@1.0.2+wasi-0.2.9	X	X								X			
+wasip3@0.4.0+wasi-0.3.0-rc-2026-01-06	X	X								X			
+wasm-bindgen@0.2.114	X									X			
+wasm-bindgen-futures@0.4.64	X									X			
+wasm-bindgen-macro@0.2.114	X									X			
+wasm-bindgen-macro-support@0.2.114	X									X			
+wasm-bindgen-shared@0.2.114	X									X			
+wasm-streams@0.4.2	X									X			
+web-sys@0.3.91	X									X			
+webpki-roots@1.0.6							X						
+windows-core@0.62.2	X									X			
+windows-implement@0.60.2	X									X			
+windows-interface@0.59.3	X									X			
+windows-link@0.2.1	X									X			
+windows-result@0.4.1	X									X			
+windows-strings@0.5.1	X									X			
+windows-sys@0.52.0	X									X			
+windows-sys@0.61.2	X									X			
+windows-targets@0.52.6	X									X			
+windows_aarch64_gnullvm@0.52.6	X									X			
+windows_aarch64_msvc@0.52.6	X									X			
+windows_i686_gnu@0.52.6	X									X			
+windows_i686_gnullvm@0.52.6	X									X			
+windows_i686_msvc@0.52.6	X									X			
+windows_x86_64_gnu@0.52.6	X									X			
+windows_x86_64_gnullvm@0.52.6	X									X			
+windows_x86_64_msvc@0.52.6	X									X			
+wit-bindgen@0.51.0	X	X								X			
+writeable@0.6.2											X		
+wyz@0.5.1										X			
+yoke@0.8.1											X		
+yoke-derive@0.8.1											X		
+zerocopy@0.8.47	X		X							X			
+zerocopy-derive@0.8.47	X		X							X			
+zerofrom@0.1.6											X		
+zerofrom-derive@0.1.6											X		
+zeroize@1.8.2	X									X			
+zerotrie@0.2.3											X		
+zerovec@0.11.5											X		
+zerovec-derive@0.11.2											X		
+zmij@1.0.21										X			
+zstd@0.13.3										X			
+zstd-safe@7.2.4	X									X			
+zstd-sys@2.0.16+zstd.1.5.7	X									X			
diff --git a/fluss-rust/bindings/python/DEVELOPMENT.md b/fluss-rust/bindings/python/DEVELOPMENT.md
new file mode 100644
index 0000000000..cccd0d1ee6
--- /dev/null
+++ b/fluss-rust/bindings/python/DEVELOPMENT.md
@@ -0,0 +1,95 @@
+# Development
+
+## Requirements
+
+- Python 3.9+
+- Rust 1.70+
+- [uv](https://docs.astral.sh/uv/) package manager
+- Linux or macOS
+
+> **Before you start:**
+> Please make sure you can successfully build and run the [Fluss Rust client](../../crates/fluss/README.md) on your machine.
+> The Python bindings require a working Fluss Rust backend and compatible environment.
+
+## Install Development Dependencies
+
+```bash
+cd bindings/python
+uv sync --all-extras
+```
+
+## Build Development Version
+
+```bash
+source .venv/bin/activate
+uv run maturin develop
+```
+
+## Build Release Version
+
+```bash
+uv run maturin build --release
+```
+
+## Code Formatting and Linting
+
+```bash
+uv run ruff format python/
+uv run ruff check python/
+```
+
+## Type Checking
+
+```bash
+uv run mypy python/
+```
+
+## Run Examples
+
+```bash
+uv run python example/example.py
+```
+
+## Build API Docs
+
+```bash
+uv run pdoc fluss
+```
+
+## Release
+
+```bash
+# Build wheel
+uv run maturin build --release
+
+# Publish to PyPI
+uv run maturin publish
+```
+
+## Project Structure
+
+```
+bindings/python/
+├── Cargo.toml             # Rust dependency configuration
+├── pyproject.toml         # Python project configuration
+├── README.md              # User guide
+├── DEVELOPMENT.md         # This file
+├── API_REFERENCE.md       # API reference
+├── src/                   # Rust source code (PyO3 bindings)
+│   ├── lib.rs
+│   ├── config.rs
+│   ├── connection.rs
+│   ├── admin.rs
+│   ├── table.rs
+│   └── error.rs
+├── fluss/                 # Python package
+│   ├── __init__.py
+│   ├── __init__.pyi       # Type stubs
+│   └── py.typed
+└── example/
+    └── example.py
+```
+
+## License
+
+Licensed under the Apache License, Version 2.0.
diff --git a/fluss-rust/bindings/python/PYPI_README.md b/fluss-rust/bindings/python/PYPI_README.md
new file mode 100644
index 0000000000..2e538f5ca7
--- /dev/null
+++ b/fluss-rust/bindings/python/PYPI_README.md
@@ -0,0 +1,28 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+# Fluss Python Client
+
+PyFluss is a Python library for programmatic access to Apache Fluss (Incubating).
+It provides Python APIs to work with Fluss table metadata and read or write table data.
+
+The documentation is available at <https://clients.fluss.apache.org/user-guide/python/installation/>.
+
+## Get in Touch
+
+Join the Fluss community at <https://fluss.apache.org/community/welcome/>.
diff --git a/fluss-rust/bindings/python/README.md b/fluss-rust/bindings/python/README.md
new file mode 100644
index 0000000000..54a167bc56
--- /dev/null
+++ b/fluss-rust/bindings/python/README.md
@@ -0,0 +1,21 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+# Fluss Python Client
+
+For full documentation, see the [Python user guide](../../website/docs/user-guide/python/).
diff --git a/fluss-rust/bindings/python/example/example.py b/fluss-rust/bindings/python/example/example.py
new file mode 100644
index 0000000000..23ccc6d1c1
--- /dev/null
+++ b/fluss-rust/bindings/python/example/example.py
@@ -0,0 +1,971 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import asyncio
+import traceback
+from datetime import date, datetime
+from datetime import time as dt_time
+from decimal import Decimal
+
+import pandas as pd
+import pyarrow as pa
+
+import fluss
+
+
+async def main():
+    # Create connection configuration
+    config_spec = {
+        "bootstrap.servers": "127.0.0.1:9123",
+        # Add other configuration options as needed
+        "writer.request-max-size": "10485760",  # 10 MB
+        "writer.acks": "all",  # Wait for all replicas to acknowledge
+        "writer.retries": "3",  # Retry up to 3 times on failure
+        "writer.batch-size": "1000",  # Batch size for writes
+    }
+    config = fluss.Config(config_spec)
+
+    # Create connection using the static create method
+    conn = await fluss.FlussConnection.create(config)
+
+    # Define fields for PyArrow
+    fields = [
+        pa.field("id", pa.int32()),
+        pa.field("name", pa.string()),
+        pa.field("score", pa.float32()),
+        pa.field("age", pa.int32()),
+        pa.field("birth_date", pa.date32()),
+        pa.field("check_in_time", pa.time32("ms")),
+        pa.field("created_at", pa.timestamp("us")),  # TIMESTAMP (NTZ)
+        pa.field("updated_at", pa.timestamp("us", tz="UTC")),  # TIMESTAMP_LTZ
+        pa.field("salary", pa.decimal128(10, 2)),
+    ]
+
+    # Create a PyArrow schema
+    schema = pa.schema(fields)
+
+    # Create a Fluss Schema first (this is what TableDescriptor expects)
+    fluss_schema = fluss.Schema(schema)
+
+    # Create a Fluss TableDescriptor
+    table_descriptor = fluss.TableDescriptor(fluss_schema)
+
+    # Get the admin for Fluss
+    admin = conn.get_admin()
+
+    # Create a Fluss table
+    table_path = fluss.TablePath("fluss", "sample_table_types")
+
+    try:
+        await admin.create_table(table_path, table_descriptor, True)
+        print(f"Created table: {table_path}")
+    except Exception as e:
+        print(f"Table creation failed: {e}")
+
+    # Get table information via admin
+    try:
+        table_info = await admin.get_table_info(table_path)
+        print(f"Table info: {table_info}")
+        print(f"Table ID: {table_info.table_id}")
+        print(f"Schema ID: {table_info.schema_id}")
+        print(f"Created time: {table_info.created_time}")
+        print(f"Primary keys: {table_info.get_primary_keys()}")
+    except Exception as e:
+        print(f"Failed to get table info: {e}")
+
+    # Demo: List offsets
+    print("\n--- Testing list_offsets() ---")
+    try:
+        # Query latest offsets using OffsetSpec factory method
+        offsets = await admin.list_offsets(
+            table_path,
+            bucket_ids=[0],
+            offset_spec=fluss.OffsetSpec.latest()
+        )
+        print(f"Latest offsets for table (before writes): {offsets}")
+    except Exception as e:
+        print(f"Failed to list offsets: {e}")
+
+    # Get the table instance
+    table = await conn.get_table(table_path)
+    print(f"Got table: {table}")
+
+    # Create a writer for the table
+    append_writer = table.new_append().create_writer()
+    print(f"Created append writer: {append_writer}")
+
+    try:
+        # Demo: Write PyArrow Table
+        print("\n--- Testing PyArrow Table write ---")
+        pa_table = pa.Table.from_arrays(
+            [
+                pa.array([1, 2, 3], type=pa.int32()),
+                pa.array(["Alice", "Bob", "Charlie"], type=pa.string()),
+                pa.array([95.2, 87.2, 92.1], type=pa.float32()),
+                pa.array([25, 30, 35], type=pa.int32()),
+                pa.array(
+                    [date(1999, 5, 15), date(1994, 3, 20), date(1989, 11, 8)],
+                    type=pa.date32(),
+                ),
+                pa.array(
+                    [dt_time(9, 0, 0), dt_time(9, 30, 0), dt_time(10, 0, 0)],
+                    type=pa.time32("ms"),
+                ),
+                pa.array(
+                    [
+                        datetime(2024, 1, 15, 10, 30),
+                        datetime(2024, 1, 15, 11, 0),
+                        datetime(2024, 1, 15, 11, 30),
+                    ],
+                    type=pa.timestamp("us"),
+                ),
+                pa.array(
+                    [
+                        datetime(2024, 1, 15, 10, 30),
+                        datetime(2024, 1, 15, 11, 0),
+                        datetime(2024, 1, 15, 11, 30),
+                    ],
+                    type=pa.timestamp("us", tz="UTC"),
+                ),
+                pa.array(
+                    [Decimal("75000.00"), Decimal("82000.50"), Decimal("95000.75")],
+                    type=pa.decimal128(10, 2),
+                ),
+            ],
+            schema=schema,
+        )
+
+        append_writer.write_arrow(pa_table)
+        print("Successfully wrote PyArrow Table")
+
+        # Demo: Write PyArrow RecordBatch
+        print("\n--- Testing PyArrow RecordBatch write ---")
+        pa_record_batch = pa.RecordBatch.from_arrays(
+            [
+                pa.array([4, 5], type=pa.int32()),
+                pa.array(["David", "Eve"], type=pa.string()),
+                pa.array([88.5, 91.0], type=pa.float32()),
+                pa.array([28, 32], type=pa.int32()),
+                pa.array([date(1996, 7, 22), date(1992, 12, 1)], type=pa.date32()),
+                pa.array([dt_time(14, 15, 0), dt_time(8, 45, 0)], type=pa.time32("ms")),
+                pa.array(
+                    [datetime(2024, 1, 16, 9, 0), datetime(2024, 1, 16, 9, 30)],
+                    type=pa.timestamp("us"),
+                ),
+                pa.array(
+                    [datetime(2024, 1, 16, 9, 0), datetime(2024, 1, 16, 9, 30)],
+                    type=pa.timestamp("us", tz="UTC"),
+                ),
+                pa.array(
+                    [Decimal("68000.00"), Decimal("72500.25")],
+                    type=pa.decimal128(10, 2),
+                ),
+            ],
+            schema=schema,
+        )
+
+        append_writer.write_arrow_batch(pa_record_batch)
+        print("Successfully wrote PyArrow RecordBatch")
+
+        # Test 3: Append single rows with Date, Time, Timestamp, Decimal
+        print("\n--- Testing single row append with temporal/decimal types ---")
+        # Dict input with all types including Date, Time, Timestamp, Decimal
+        append_writer.append(
+            {
+                "id": 8,
+                "name": "Helen",
+                "score": 93.5,
+                "age": 26,
+                "birth_date": date(1998, 4, 10),
+                "check_in_time": dt_time(11, 30, 45),
+                "created_at": datetime(2024, 1, 17, 14, 0, 0),
+                "updated_at": datetime(2024, 1, 17, 14, 0, 0),
+                "salary": Decimal("88000.00"),
+            }
+        )
+        print("Successfully appended row (dict with Date, Time, Timestamp, Decimal)")
+
+        # List input with all types
+        append_writer.append(
+            [
+                9,
+                "Ivan",
+                90.0,
+                31,
+                date(1993, 8, 25),
+                dt_time(16, 45, 0),
+                datetime(2024, 1, 17, 15, 30, 0),
+                datetime(2024, 1, 17, 15, 30, 0),
+                Decimal("91500.50"),
+            ]
+        )
+        print("Successfully appended row (list with Date, Time, Timestamp, Decimal)")
+
+        # Demo: Write Pandas DataFrame
+        print("\n--- Testing Pandas DataFrame write ---")
+        df = pd.DataFrame(
+            {
+                "id": [10, 11],
+                "name": ["Frank", "Grace"],
+                "score": [89.3, 94.7],
+                "age": [29, 27],
+                "birth_date": [date(1995, 2, 14), date(1997, 9, 30)],
+                "check_in_time": [dt_time(10, 0, 0), dt_time(10, 30, 0)],
+                "created_at": [
+                    datetime(2024, 1, 18, 8, 0),
+                    datetime(2024, 1, 18, 8, 30),
+                ],
+                "updated_at": [
+                    datetime(2024, 1, 18, 8, 0),
+                    datetime(2024, 1, 18, 8, 30),
+                ],
+                "salary": [Decimal("79000.00"), Decimal("85500.75")],
+            }
+        )
+
+        append_writer.write_pandas(df)
+        print("Successfully wrote Pandas DataFrame")
+
+        # Flush all pending data
+        print("\n--- Flushing data ---")
+        await append_writer.flush()
+        print("Successfully flushed data")
+
+        # Demo: Check offsets after writes
+        print("\n--- Checking offsets after writes ---")
+        try:
+            offsets = await admin.list_offsets(
+                table_path,
+                bucket_ids=[0],
+                offset_spec=fluss.OffsetSpec.latest()
+            )
+            print(f"Latest offsets after writing 7 records: {offsets}")
+        except Exception as e:
+            print(f"Failed to list offsets: {e}")
+
+    except Exception as e:
+        print(f"Error during writing: {e}")
+
+    # Now scan the table to verify data was written
+    print("\n--- Scanning table (batch scanner) ---")
+    try:
+        # Use new_scan().create_record_batch_log_scanner() for batch-based operations
+        batch_scanner = await table.new_scan().create_record_batch_log_scanner()
+        print(f"Created batch scanner: {batch_scanner}")
+
+        # Subscribe to buckets (required before to_arrow/to_pandas)
+        # Use subscribe_buckets to subscribe all buckets from EARLIEST_OFFSET
+        num_buckets = (await admin.get_table_info(table_path)).num_buckets
+        batch_scanner.subscribe_buckets({i: fluss.EARLIEST_OFFSET for i in range(num_buckets)})
+        print(f"Subscribed to {num_buckets} buckets from EARLIEST_OFFSET")
+
+        # Read all data using to_arrow()
+        print("Scanning results using to_arrow():")
+
+        # Try to get as PyArrow Table
+        try:
+            pa_table_result = await batch_scanner.to_arrow()
+            print(f"\nAs PyArrow Table: {pa_table_result}")
+        except Exception as e:
+            print(f"Could not convert to PyArrow: {e}")
+
+        # Create a new batch scanner for to_pandas() test
+        batch_scanner2 = await table.new_scan().create_record_batch_log_scanner()
+        batch_scanner2.subscribe_buckets({i: fluss.EARLIEST_OFFSET for i in range(num_buckets)})
+
+        # Try to get as Pandas DataFrame
+        try:
+            df_result = await batch_scanner2.to_pandas()
+            print(f"\nAs Pandas DataFrame:\n{df_result}")
+        except Exception as e:
+            print(f"Could not convert to Pandas: {e}")
+
+        # to_arrow_batch_reader() — returns a lazy PyArrow RecordBatchReader
+        batch_scanner_reader = await table.new_scan().create_record_batch_log_scanner()
+        batch_scanner_reader.subscribe_buckets(
+            {i: fluss.EARLIEST_OFFSET for i in range(num_buckets)}
+        )
+        arrow_reader = batch_scanner_reader.to_arrow_batch_reader()
+        reader_table = pa.Table.from_batches(list(arrow_reader), schema=arrow_reader.schema)
+        print(f"\nVia to_arrow_batch_reader(): {reader_table.num_rows} rows")
+
+        # TODO: support to_duckdb()
+
+        # Test poll_arrow() method for incremental reading as Arrow Table
+        print("\n--- Testing poll_arrow() method ---")
+        batch_scanner3 = await table.new_scan().create_record_batch_log_scanner()
+        batch_scanner3.subscribe(bucket_id=0, start_offset=fluss.EARLIEST_OFFSET)
+        print(f"Subscribed to bucket 0 at EARLIEST_OFFSET ({fluss.EARLIEST_OFFSET})")
+
+        # Poll with a timeout of 5000ms (5 seconds)
+        # Note: poll_arrow() returns an empty table (not an error) on timeout
+        try:
+            poll_result = await batch_scanner3.poll_arrow(5000)
+            print(f"Number of rows: {poll_result.num_rows}")
+
+            if poll_result.num_rows > 0:
+                poll_df = poll_result.to_pandas()
+                print(f"Polled data:\n{poll_df}")
+            else:
+                print("Empty result (no records available)")
+                # Empty table still has schema - this is useful!
+                print(f"Schema: {poll_result.schema}")
+
+        except Exception as e:
+            print(f"Error during poll_arrow: {e}")
+
+        # Test poll_record_batch() method for batches with metadata
+        print("\n--- Testing poll_record_batch() method ---")
+        batch_scanner4 = await table.new_scan().create_record_batch_log_scanner()
+        batch_scanner4.subscribe_buckets({i: fluss.EARLIEST_OFFSET for i in range(num_buckets)})
+
+        try:
+            batches = await batch_scanner4.poll_record_batch(5000)
+            print(f"Number of batches: {len(batches)}")
+
+            for i, batch in enumerate(batches):
+                print(f"  Batch {i}: bucket={batch.bucket}, "
+                      f"offsets={batch.base_offset}-{batch.last_offset}, "
+                      f"rows={batch.batch.num_rows}")
+
+        except Exception as e:
+            print(f"Error during poll_record_batch: {e}")
+
+    except Exception as e:
+        print(f"Error during batch scanning: {e}")
+
+    # Test record-based scanning with poll()
+    print("\n--- Scanning table (record scanner) ---")
+    try:
+        # Use new_scan().create_log_scanner() for record-based operations
+        record_scanner = await table.new_scan().create_log_scanner()
+        print(f"Created record scanner: {record_scanner}")
+
+        record_scanner.subscribe_buckets({i: fluss.EARLIEST_OFFSET for i in range(num_buckets)})
+
+        # Poll returns ScanRecords — records grouped by bucket
+        print("\n--- Testing poll() method (record-by-record) ---")
+        try:
+            scan_records = await record_scanner.poll(5000)
+            print(f"Total records: {scan_records.count()}, buckets: {len(scan_records.buckets())}")
+
+            # Flat iteration over all records (regardless of bucket)
+            print(f"  Flat iteration: {scan_records.count()} records")
+            for record in scan_records:
+                print(f"    offset={record.offset}, timestamp={record.timestamp}")
+
+            # Per-bucket access
+            for bucket in scan_records.buckets():
+                bucket_recs = scan_records.records(bucket)
+                print(f"  Bucket {bucket}: {len(bucket_recs)} records")
+                for record in bucket_recs[:3]:
+                    print(f"    offset={record.offset}, "
+                          f"timestamp={record.timestamp}, "
+                          f"change_type={record.change_type}, "
+                          f"row={record.row}")
+
+        except Exception as e:
+            print(f"Error during poll: {e}")
+
+    except Exception as e:
+        print(f"Error during record scanning: {e}")
+
+    # Demo: unsubscribe — unsubscribe from a bucket (non-partitioned tables)
+    print("\n--- Testing unsubscribe ---")
+    try:
+        unsub_scanner = await table.new_scan().create_record_batch_log_scanner()
+        unsub_scanner.subscribe_buckets({i: fluss.EARLIEST_OFFSET for i in range(num_buckets)})
+        print(f"Subscribed to {num_buckets} buckets")
+        # Unsubscribe from bucket 0 — future polls will skip this bucket
+        unsub_scanner.unsubscribe(bucket_id=0)
+        print("Unsubscribed from bucket 0")
+        remaining = await unsub_scanner.poll_arrow(5000)
+        print(f"After unsubscribe, got {remaining.num_rows} records (from remaining buckets)")
+    except Exception as e:
+        print(f"Error during unsubscribe test: {e}")
+
+    # =====================================================
+    # Demo: Primary Key Table with Lookup and Upsert
+    # =====================================================
+    print("\n" + "=" * 60)
+    print("--- Testing Primary Key Table (Lookup & Upsert) ---")
+    print("=" * 60)
+
+    # Create a primary key table for lookup/upsert tests
+    # Include temporal and decimal types to test full conversion
+    pk_table_fields = [
+        pa.field("user_id", pa.int32()),
+        pa.field("name", pa.string()),
+        pa.field("email", pa.string()),
+        pa.field("age", pa.int32()),
+        pa.field("birth_date", pa.date32()),
+        pa.field("login_time", pa.time32("ms")),
+        pa.field("created_at", pa.timestamp("us")),  # TIMESTAMP (NTZ)
+        pa.field("updated_at", pa.timestamp("us", tz="UTC")),  # TIMESTAMP_LTZ
+        pa.field("balance", pa.decimal128(10, 2)),
+    ]
+    pk_schema = pa.schema(pk_table_fields)
+    fluss_pk_schema = fluss.Schema(pk_schema, primary_keys=["user_id"])
+
+    # Create table descriptor
+    pk_table_descriptor = fluss.TableDescriptor(
+        fluss_pk_schema,
+        bucket_count=3,
+    )
+
+    pk_table_path = fluss.TablePath("fluss", "users_pk_table_v3")
+
+    try:
+        await admin.create_table(pk_table_path, pk_table_descriptor, True)
+        print(f"Created PK table: {pk_table_path}")
+    except Exception as e:
+        print(f"PK Table creation failed (may already exist): {e}")
+
+    # Get the PK table
+    pk_table = await conn.get_table(pk_table_path)
+    print(f"Got PK table: {pk_table}")
+    print(f"Has primary key: {pk_table.has_primary_key()}")
+
+    # --- Test Upsert ---
+    print("\n--- Testing Upsert (fire-and-forget) ---")
+    try:
+        upsert_writer = pk_table.new_upsert().create_writer()
+        print(f"Created upsert writer: {upsert_writer}")
+
+        # Fire-and-forget: queue writes synchronously, flush at end.
+        # Records are batched internally for efficiency.
+        upsert_writer.upsert(
+            {
+                "user_id": 1,
+                "name": "Alice",
+                "email": "alice@example.com",
+                "age": 25,
+                "birth_date": date(1999, 5, 15),
+                "login_time": dt_time(9, 30, 45, 123000),  # 09:30:45.123
+                "created_at": datetime(
+                    2024, 1, 15, 10, 30, 45, 123456
+                ),  # with microseconds
+                "updated_at": datetime(2024, 1, 15, 10, 30, 45, 123456),
+                "balance": Decimal("1234.56"),
+            }
+        )
+        print("Queued user_id=1 (Alice)")
+
+        upsert_writer.upsert(
+            {
+                "user_id": 2,
+                "name": "Bob",
+                "email": "bob@example.com",
+                "age": 30,
+                "birth_date": date(1994, 3, 20),
+                "login_time": dt_time(14, 15, 30, 500000),  # 14:15:30.500
+                "created_at": datetime(2024, 1, 16, 11, 22, 33, 444555),
+                "updated_at": datetime(2024, 1, 16, 11, 22, 33, 444555),
+                "balance": Decimal("5678.91"),
+            }
+        )
+        print("Queued user_id=2 (Bob)")
+
+        upsert_writer.upsert(
+            {
+                "user_id": 3,
+                "name": "Charlie",
+                "email": "charlie@example.com",
+                "age": 35,
+                "birth_date": date(1989, 11, 8),
+                "login_time": dt_time(16, 45, 59, 999000),  # 16:45:59.999
+                "created_at": datetime(2024, 1, 17, 23, 59, 59, 999999),
+                "updated_at": datetime(2024, 1, 17, 23, 59, 59, 999999),
+                "balance": Decimal("9876.54"),
+            }
+        )
+        print("Queued user_id=3 (Charlie)")
+
+        # flush() waits for all queued writes to be acknowledged by the server
+        await upsert_writer.flush()
+        print("Flushed — all 3 rows acknowledged by server")
+
+        # Per-record acknowledgment: await the returned handle to block until
+        # the server confirms this specific write, useful when you need to
+        # read-after-write or verify critical updates.
+        print("\n--- Testing Upsert (per-record acknowledgment) ---")
+        handle = upsert_writer.upsert(
+            {
+                "user_id": 1,
+                "name": "Alice Updated",
+                "email": "alice.new@example.com",
+                "age": 26,
+                "birth_date": date(1999, 5, 15),
+                "login_time": dt_time(10, 11, 12, 345000),  # 10:11:12.345
+                "created_at": datetime(2024, 1, 15, 10, 30, 45, 123456),  # unchanged
+                "updated_at": datetime(
+                    2024, 1, 20, 15, 45, 30, 678901
+                ),  # new update time
+                "balance": Decimal("2345.67"),
+            }
+        )
+        await handle.wait()  # wait for server acknowledgment
+        print("Updated user_id=1 (Alice -> Alice Updated) — server acknowledged")
+
+    except Exception as e:
+        print(f"Error during upsert: {e}")
+        traceback.print_exc()
+
+    # --- Test Lookup ---
+    print("\n--- Testing Lookup ---")
+    try:
+        lookuper = pk_table.new_lookup().create_lookuper()
+        print(f"Created lookuper: {lookuper}")
+
+        result = await lookuper.lookup({"user_id": 1})
+        if result:
+            print("Lookup user_id=1: Found!")
+            print(f"  name: {result['name']}")
+            print(f"  email: {result['email']}")
+            print(f"  age: {result['age']}")
+            print(
+                f"  birth_date: {result['birth_date']} (type: {type(result['birth_date']).__name__})"
+            )
+            print(
+                f"  login_time: {result['login_time']} (type: {type(result['login_time']).__name__})"
+            )
+            print(
+                f"  created_at: {result['created_at']} (type: {type(result['created_at']).__name__})"
+            )
+            print(
+                f"  updated_at: {result['updated_at']} (type: {type(result['updated_at']).__name__})"
+            )
+            print(
+                f"  balance: {result['balance']} (type: {type(result['balance']).__name__})"
+            )
+        else:
+            print("Lookup user_id=1: Not found")
+
+        # Lookup another row
+        result = await lookuper.lookup({"user_id": 2})
+        if result:
+            print(f"Lookup user_id=2: Found! -> {result}")
+        else:
+            print("Lookup user_id=2: Not found")
+
+        # Lookup non-existent row
+        result = await lookuper.lookup({"user_id": 999})
+        if result:
+            print(f"Lookup user_id=999: Found! -> {result}")
+        else:
+            print("Lookup user_id=999: Not found (as expected)")
+
+    except Exception as e:
+        print(f"Error during lookup: {e}")
+        traceback.print_exc()
+
+    # --- Test Delete ---
+    print("\n--- Testing Delete ---")
+    try:
+        upsert_writer = pk_table.new_upsert().create_writer()
+
+        handle = upsert_writer.delete({"user_id": 3})
+        await handle.wait()
+        print("Deleted user_id=3 — server acknowledged")
+
+        lookuper = pk_table.new_lookup().create_lookuper()
+        result = await lookuper.lookup({"user_id": 3})
+        if result:
+            print(f"Lookup user_id=3 after delete: Still found! -> {result}")
+        else:
+            print("Lookup user_id=3 after delete: Not found (deletion confirmed)")
+
+    except Exception as e:
+        print(f"Error during delete: {e}")
+        traceback.print_exc()
+
+    # --- Test Partial Update by column names ---
+    print("\n--- Testing Partial Update (by column names) ---")
+    try:
+        partial_writer = pk_table.new_upsert().partial_update_by_name(["user_id", "balance"]).create_writer()
+        handle = partial_writer.upsert({"user_id": 1, "balance": Decimal("9999.99")})
+        await handle.wait()
+        print("Partial update: set balance=9999.99 for user_id=1")
+
+        lookuper = pk_table.new_lookup().create_lookuper()
+        result = await lookuper.lookup({"user_id": 1})
+        if result:
+            print(f"Partial update verified:"
+                  f"\n  name={result['name']} (unchanged)"
+                  f"\n  balance={result['balance']} (updated)")
+        else:
+            print("ERROR: Expected to find user_id=1")
+
+    except Exception as e:
+        print(f"Error during partial update by names: {e}")
+        traceback.print_exc()
+
+    # --- Test Partial Update by column indices ---
+    print("\n--- Testing Partial Update (by column indices) ---")
+    try:
+        # Columns: 0=user_id (PK), 1=name — update name only
+        partial_writer_idx = pk_table.new_upsert().partial_update_by_index([0, 1]).create_writer()
+        handle = partial_writer_idx.upsert([1, "Alice Renamed"])
+        await handle.wait()
+        print("Partial update by indices: set name='Alice Renamed' for user_id=1")
+
+        lookuper = pk_table.new_lookup().create_lookuper()
+        result = await lookuper.lookup({"user_id": 1})
+        if result:
+            print(f"Partial update by indices verified:"
+                  f"\n  name={result['name']} (updated)"
+                  f"\n  balance={result['balance']} (unchanged)")
+        else:
+            print("ERROR: Expected to find user_id=1")
+
+    except Exception as e:
+        print(f"Error during partial update by indices: {e}")
+        traceback.print_exc()
+
+    # Demo: Column projection using builder pattern
+    print("\n--- Testing Column Projection ---")
+    try:
+        # Get bucket count for subscriptions
+        num_buckets = (await admin.get_table_info(table_path)).num_buckets
+
+        # Project specific columns by index (using batch scanner for to_pandas)
+        print("\n1. Projection by index [0, 1] (id, name):")
+        scanner_index = await table.new_scan().project([0, 1]).create_record_batch_log_scanner()
+        scanner_index.subscribe_buckets({i: fluss.EARLIEST_OFFSET for i in range(num_buckets)})
+        df_projected = await scanner_index.to_pandas()
+        print(df_projected.head())
+        print(
+            f"   Projected {df_projected.shape[1]} columns: {list(df_projected.columns)}"
+        )
+
+        # Project specific columns by name (Pythonic!)
+        print("\n2. Projection by name ['name', 'score'] (Pythonic):")
+        scanner_names = await table.new_scan() \
+            .project_by_name(["name", "score"]) \
+            .create_record_batch_log_scanner()
+        scanner_names.subscribe_buckets({i: fluss.EARLIEST_OFFSET for i in range(num_buckets)})
+        df_named = await scanner_names.to_pandas()
+        print(df_named.head())
+        print(f"   Projected {df_named.shape[1]} columns: {list(df_named.columns)}")
+
+        # Test empty result schema with projection
+        print("\n3. Testing empty result schema with projection:")
+        scanner_proj = await table.new_scan().project([0, 2]).create_record_batch_log_scanner()
+        scanner_proj.subscribe_buckets({i: fluss.EARLIEST_OFFSET for i in range(num_buckets)})
+        # Quick poll that may return empty
+        result = await scanner_proj.poll_arrow(100)
+        print(f"   Schema columns: {result.schema.names}")
+
+    except Exception as e:
+        print(f"Error during projection: {e}")
+
+
+    print("\n--- New: async context manager demo ---")
+    async with await fluss.FlussConnection.create(config) as demo_conn:
+        demo_table = await demo_conn.get_table(table_path)
+        async with demo_table.new_append().create_writer() as writer:
+            writer.append(
+                {
+                    "id": 1,
+                    "name": "demo",
+                    "score": 1.0,
+                    "age": 25,
+                    "birth_date": date(2000, 1, 1),
+                    "check_in_time": dt_time(12, 0, 0),
+                    "created_at": datetime(2024, 1, 1, 12, 0, 0),
+                    "updated_at": datetime(2024, 1, 1, 12, 0, 0),
+                    "salary": Decimal("100.00"),
+                }
+            )
+            # auto-flushes on exit
+
+    # Demo: Drop tables
+    print("\n--- Testing drop_table() ---")
+    try:
+        # Drop the log table
+        await admin.drop_table(table_path, ignore_if_not_exists=True)
+        print(f"Successfully dropped table: {table_path}")
+        # Drop the PK table
+        await admin.drop_table(pk_table_path, ignore_if_not_exists=True)
+        print(f"Successfully dropped table: {pk_table_path}")
+    except Exception as e:
+        print(f"Failed to drop table: {e}")
+
+    # =====================================================
+    # Demo: Partitioned Table with list_partition_offsets
+    # =====================================================
+    print("\n" + "=" * 60)
+    print("--- Testing Partitioned Table ---")
+    print("=" * 60)
+
+    # Create a partitioned log table
+    partitioned_fields = [
+        pa.field("id", pa.int32()),
+        pa.field("region", pa.string()),  # partition key
+        pa.field("value", pa.int64()),
+    ]
+    partitioned_schema = pa.schema(partitioned_fields)
+    fluss_partitioned_schema = fluss.Schema(partitioned_schema)
+
+    partitioned_table_descriptor = fluss.TableDescriptor(
+        fluss_partitioned_schema,
+        partition_keys=["region"],  # Partition by region
+        bucket_count=1,
+    )
+
+    partitioned_table_path = fluss.TablePath("fluss", "partitioned_log_table_py")
+
+    try:
+        # Drop if exists first
+        await admin.drop_table(partitioned_table_path, ignore_if_not_exists=True)
+        print(f"Dropped existing table: {partitioned_table_path}")
+
+        # Create the partitioned table
+        await admin.create_table(partitioned_table_path, partitioned_table_descriptor, False)
+        print(f"Created partitioned table: {partitioned_table_path}")
+
+        # Create partitions for US and EU regions
+        print("\n--- Creating partitions ---")
+        await admin.create_partition(partitioned_table_path, {"region": "US"}, ignore_if_exists=True)
+        print("Created partition: region=US")
+        await admin.create_partition(partitioned_table_path, {"region": "EU"}, ignore_if_exists=True)
+        print("Created partition: region=EU")
+
+        # List partitions
+        print("\n--- Listing partitions ---")
+        partition_infos = await admin.list_partition_infos(partitioned_table_path)
+        for p in partition_infos:
+            print(f"  {p}")  # PartitionInfo(partition_id=..., partition_name='region=...')
+
+        # Get the table and write some data
+        partitioned_table = await conn.get_table(partitioned_table_path)
+        partitioned_writer = partitioned_table.new_append().create_writer()
+
+        # Append data to US partition
+        partitioned_writer.append({"id": 1, "region": "US", "value": 100})
+        partitioned_writer.append({"id": 2, "region": "US", "value": 200})
+        # Append data to EU partition
+        partitioned_writer.append({"id": 3, "region": "EU", "value": 300})
+        partitioned_writer.append({"id": 4, "region": "EU", "value": 400})
+        await partitioned_writer.flush()
+        print("\nWrote 4 records (2 to US, 2 to EU)")
+
+        # Demo: list_partition_infos with partial spec filter
+        print("\n--- Testing list_partition_infos with spec ---")
+        us_partitions = await admin.list_partition_infos(
+            partitioned_table_path, partition_spec={"region": "US"}
+        )
+        print(f"Filtered partitions (region=US): {us_partitions}")
+
+        # Demo: list_partition_offsets
+        print("\n--- Testing list_partition_offsets ---")
+
+        # Query offsets for US partition
+        # Note: partition_name is just the value (e.g., "US"), not "region=US"
+        us_offsets = await admin.list_partition_offsets(
+            partitioned_table_path,
+            partition_name="US",
+            bucket_ids=[0],
+            offset_spec=fluss.OffsetSpec.latest()
+        )
+        print(f"US partition latest offsets: {us_offsets}")
+
+        # Query offsets for EU partition
+        eu_offsets = await admin.list_partition_offsets(
+            partitioned_table_path,
+            partition_name="EU",
+            bucket_ids=[0],
+            offset_spec=fluss.OffsetSpec.latest()
+        )
+        print(f"EU partition latest offsets: {eu_offsets}")
+
+        # Demo: subscribe_partition for reading partitioned data
+        print("\n--- Testing subscribe_partition + to_arrow() ---")
+        partitioned_scanner = await partitioned_table.new_scan().create_record_batch_log_scanner()
+
+        # Subscribe to each partition using partition_id
+        for p in partition_infos:
+            partitioned_scanner.subscribe_partition(
+                partition_id=p.partition_id,
+                bucket_id=0,
+                start_offset=fluss.EARLIEST_OFFSET
+            )
+            print(f"Subscribed to partition {p.partition_name} (id={p.partition_id})")
+
+        # Use to_arrow() - now works for partitioned tables!
+        partitioned_arrow = await partitioned_scanner.to_arrow()
+        print(f"\nto_arrow() returned {partitioned_arrow.num_rows} records from partitioned table:")
+        print(partitioned_arrow.to_pandas())
+
+        # Demo: subscribe_partition_buckets for batch subscribing to multiple partitions at once
+        print("\n--- Testing subscribe_partition_buckets + to_arrow() ---")
+        partitioned_scanner_batch = await partitioned_table.new_scan().create_record_batch_log_scanner()
+        partition_bucket_offsets = {
+            (p.partition_id, 0): fluss.EARLIEST_OFFSET for p in partition_infos
+        }
+        partitioned_scanner_batch.subscribe_partition_buckets(partition_bucket_offsets)
+        print(f"Batch subscribed to {len(partition_bucket_offsets)} partition+bucket combinations")
+        partitioned_batch_arrow = await partitioned_scanner_batch.to_arrow()
+        print(f"to_arrow() returned {partitioned_batch_arrow.num_rows} records:")
+        print(partitioned_batch_arrow.to_pandas())
+
+        # Demo: unsubscribe_partition - unsubscribe from one partition, read remaining
+        print("\n--- Testing unsubscribe_partition ---")
+        partitioned_scanner3 = await partitioned_table.new_scan().create_record_batch_log_scanner()
+        for p in partition_infos:
+            partitioned_scanner3.subscribe_partition(p.partition_id, 0, fluss.EARLIEST_OFFSET)
+        # Unsubscribe from the first partition
+        first_partition = partition_infos[0]
+        partitioned_scanner3.unsubscribe_partition(first_partition.partition_id, 0)
+        print(f"Unsubscribed from partition {first_partition.partition_name} (id={first_partition.partition_id})")
+        remaining_arrow = await partitioned_scanner3.to_arrow()
+        print(f"After unsubscribe, to_arrow() returned {remaining_arrow.num_rows} records (from remaining partitions):")
+        print(remaining_arrow.to_pandas())
+
+        # Demo: to_pandas() also works for partitioned tables
+        print("\n--- Testing to_pandas() on partitioned table ---")
+        partitioned_scanner2 = await partitioned_table.new_scan().create_record_batch_log_scanner()
+        for p in partition_infos:
+            partitioned_scanner2.subscribe_partition(p.partition_id, 0, fluss.EARLIEST_OFFSET)
+        partitioned_df = await partitioned_scanner2.to_pandas()
+        print(f"to_pandas() returned {len(partitioned_df)} records:")
+        print(partitioned_df)
+
+        # Cleanup
+        await admin.drop_table(partitioned_table_path, ignore_if_not_exists=True)
+        print(f"\nDropped partitioned table: {partitioned_table_path}")
+
+    except Exception as e:
+        print(f"Error with partitioned table: {e}")
+        traceback.print_exc()
+
+    # =====================================================
+    # Demo: Partitioned KV Table (Upsert, Lookup, Delete)
+    # =====================================================
+    print("\n" + "=" * 60)
+    print("--- Testing Partitioned KV Table ---")
+    print("=" * 60)
+
+    partitioned_kv_fields = [
+        pa.field("region", pa.string()),   # partition key + part of PK
+        pa.field("user_id", pa.int32()),   # part of PK
+        pa.field("name", pa.string()),
+        pa.field("score", pa.int64()),
+    ]
+    partitioned_kv_schema = pa.schema(partitioned_kv_fields)
+    fluss_partitioned_kv_schema = fluss.Schema(
+        partitioned_kv_schema, primary_keys=["region", "user_id"]
+    )
+
+    partitioned_kv_descriptor = fluss.TableDescriptor(
+        fluss_partitioned_kv_schema,
+        partition_keys=["region"],
+    )
+
+    partitioned_kv_path = fluss.TablePath("fluss", "partitioned_kv_table_py")
+
+    try:
+        await admin.drop_table(partitioned_kv_path, ignore_if_not_exists=True)
+        await admin.create_table(partitioned_kv_path, partitioned_kv_descriptor, False)
+        print(f"Created partitioned KV table: {partitioned_kv_path}")
+
+        # Create partitions
+        await admin.create_partition(partitioned_kv_path, {"region": "US"})
+        await admin.create_partition(partitioned_kv_path, {"region": "EU"})
+        await admin.create_partition(partitioned_kv_path, {"region": "APAC"})
+        print("Created partitions: US, EU, APAC")
+
+        partitioned_kv_table = await conn.get_table(partitioned_kv_path)
+        upsert_writer = partitioned_kv_table.new_upsert().create_writer()
+
+        # Upsert rows across partitions
+        test_data = [
+            ("US", 1, "Gustave", 100),
+            ("US", 2, "Lune", 200),
+            ("EU", 1, "Sciel", 150),
+            ("EU", 2, "Maelle", 250),
+            ("APAC", 1, "Noco", 300),
+        ]
+        for region, user_id, name, score in test_data:
+            upsert_writer.upsert({
+                "region": region, "user_id": user_id,
+                "name": name, "score": score,
+            })
+        await upsert_writer.flush()
+        print(f"Upserted {len(test_data)} rows across 3 partitions")
+
+        # Lookup all rows across partitions
+        print("\n--- Lookup across partitions ---")
+        lookuper = partitioned_kv_table.new_lookup().create_lookuper()
+        for region, user_id, name, score in test_data:
+            result = await lookuper.lookup({"region": region, "user_id": user_id})
+            assert result is not None, f"Expected to find region={region} user_id={user_id}"
+            assert result["name"] == name, f"Name mismatch: {result['name']} != {name}"
+            assert result["score"] == score, f"Score mismatch: {result['score']} != {score}"
+        print(f"All {len(test_data)} rows verified across partitions")
+
+        # Update within a partition
+        print("\n--- Update within partition ---")
+        handle = upsert_writer.upsert({
+            "region": "US", "user_id": 1,
+            "name": "Gustave Updated", "score": 999,
+        })
+        await handle.wait()
+        result = await lookuper.lookup({"region": "US", "user_id": 1})
+        assert result is not None, "Expected to find region=US user_id=1 after update"
+        assert result["name"] == "Gustave Updated"
+        assert result["score"] == 999
+        print(f"Update verified: US/1 name={result['name']} score={result['score']}")
+
+        # Lookup in non-existent partition
+        print("\n--- Lookup in non-existent partition ---")
+        result = await lookuper.lookup({"region": "UNKNOWN", "user_id": 1})
+        assert result is None, "Expected UNKNOWN partition lookup to return None"
+        print("UNKNOWN partition lookup: not found (expected)")
+
+        # Delete within a partition
+        print("\n--- Delete within partition ---")
+        handle = upsert_writer.delete({"region": "EU", "user_id": 1})
+        await handle.wait()
+        result = await lookuper.lookup({"region": "EU", "user_id": 1})
+        assert result is None, "Expected EU/1 to be deleted"
+        print("Delete verified: EU/1 not found")
+
+        # Verify sibling record still exists
+        result = await lookuper.lookup({"region": "EU", "user_id": 2})
+        assert result is not None, "Expected EU/2 to still exist"
+        assert result["name"] == "Maelle"
+        print(f"EU/2 still exists: name={result['name']}")
+
+        # Cleanup
+        await admin.drop_table(partitioned_kv_path, ignore_if_not_exists=True)
+        print(f"\nDropped partitioned KV table: {partitioned_kv_path}")
+
+    except Exception as e:
+        print(f"Error with partitioned KV table: {e}")
+        traceback.print_exc()
+
+
+
+    # Close connection
+    await conn.close()
+    print("\nConnection closed")
+
+
+if __name__ == "__main__":
+    # Script entry point: run the async main() coroutine to completion.
+    asyncio.run(main())
diff --git a/fluss-rust/bindings/python/fluss/__init__.py b/fluss-rust/bindings/python/fluss/__init__.py
new file mode 100644
index 0000000000..098014adc6
--- /dev/null
+++ b/fluss-rust/bindings/python/fluss/__init__.py
@@ -0,0 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from ._fluss import *
+
+# Version string exposed as ``fluss.__version__``.
+# NOTE(review): hard-coded here; presumably needs to stay in step with the
+# version declared in the package/crate metadata -- confirm.
+__version__ = "0.1.0"
diff --git a/fluss-rust/bindings/python/fluss/__init__.pyi b/fluss-rust/bindings/python/fluss/__init__.pyi
new file mode 100644
index 0000000000..b5bfdfab28
--- /dev/null
+++ b/fluss-rust/bindings/python/fluss/__init__.pyi
@@ -0,0 +1,1156 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""Type stubs for Fluss Python bindings."""
+
+from enum import IntEnum
+from types import TracebackType
+from typing import (
+    Any,
+    AsyncIterator,
+    Dict,
+    Iterator,
+    List,
+    Optional,
+    Tuple,
+    Union,
+    overload,
+)
+
+import pandas as pd
+import pyarrow as pa
+
+class ChangeType(IntEnum):
+    """Kind of change that a record in a log represents.
+
+    Being an ``IntEnum``, each member is also usable as its integer value
+    (0 through 4). Exposed on every record through ``ScanRecord.change_type``.
+    """
+
+    AppendOnly = 0
+    """Append-only operation"""
+    Insert = 1
+    """Insert operation"""
+    UpdateBefore = 2
+    """Update operation containing the previous content of the updated row"""
+    UpdateAfter = 3
+    """Update operation containing the new content of the updated row"""
+    Delete = 4
+    """Delete operation"""
+
+    def short_string(self) -> str:
+        """Return the compact notation for this change type.
+
+        The result is one of ``+A``, ``+I``, ``-U``, ``+U``, ``-D``.
+        NOTE(review): presumably listed here in member declaration order
+        (AppendOnly, Insert, UpdateBefore, UpdateAfter, Delete) -- confirm
+        against the binding implementation.
+        """
+        ...
+
+class ScanRecord:
+    """A single scanned row together with its log metadata.
+
+    Instances are handed out by ``ScanRecords`` (flat iteration, integer
+    indexing, or ``records(bucket)``). The bucket is the key in ScanRecords,
+    not on the individual record (matches Rust/Java).
+    """
+
+    @property
+    def offset(self) -> int:
+        """The position of this record in the log."""
+        ...
+    @property
+    def timestamp(self) -> int:
+        """The timestamp of this record.
+
+        NOTE(review): the unit/epoch is not stated in this stub -- confirm
+        against the binding implementation before doing arithmetic on it.
+        """
+        ...
+    @property
+    def change_type(self) -> ChangeType:
+        """The type of change (insert, update, delete, etc.)."""
+        ...
+    @property
+    def row(self) -> Dict[str, object]:
+        """The row data as a dictionary mapping column names to values."""
+        ...
+    def __str__(self) -> str: ...
+    def __repr__(self) -> str: ...
+
+class RecordBatch:
+    """An Arrow record batch plus the log metadata describing its origin.
+
+    Bundles the ``pyarrow.RecordBatch`` payload with the bucket it belongs to
+    and the offsets of its first and last records.
+    NOTE(review): presumably the element type produced by the scanner's
+    ``poll_record_batch()`` mentioned in ``TableScan`` -- confirm.
+    """
+
+    @property
+    def batch(self) -> pa.RecordBatch:
+        """The Arrow RecordBatch containing the data."""
+        ...
+    @property
+    def bucket(self) -> TableBucket:
+        """The bucket this batch belongs to."""
+        ...
+    @property
+    def base_offset(self) -> int:
+        """The offset of the first record in this batch."""
+        ...
+    @property
+    def last_offset(self) -> int:
+        """The offset of the last record in this batch."""
+        ...
+    def __str__(self) -> str: ...
+    def __repr__(self) -> str: ...
+
+class ScanRecords:
+    """A collection of scan records grouped by bucket.
+
+    Returned by ``LogScanner.poll()``. Supports flat iteration
+    (``for rec in records``) and per-bucket access (``records.records(bucket)``).
+    """
+
+    def buckets(self) -> List[TableBucket]:
+        """List of distinct buckets that have records."""
+        ...
+    def records(self, bucket: TableBucket) -> List[ScanRecord]:
+        """Get records for a specific bucket. Returns empty list if bucket not present."""
+        ...
+    def count(self) -> int:
+        """Total number of records across all buckets."""
+        ...
+    def is_empty(self) -> bool:
+        """Whether the result set is empty."""
+        ...
+    def keys(self) -> List[TableBucket]:
+        """Mapping protocol: alias for ``buckets()``."""
+        ...
+    def values(self) -> Iterator[List[ScanRecord]]:
+        """Mapping protocol: lazy iterator over record lists, one per bucket."""
+        ...
+    def items(self) -> Iterator[Tuple[TableBucket, List[ScanRecord]]]:
+        """Mapping protocol: lazy iterator over ``(bucket, records)`` pairs."""
+        ...
+    def __len__(self) -> int: ...
+    @overload
+    def __getitem__(self, index: int) -> ScanRecord: ...
+    @overload
+    def __getitem__(self, index: slice) -> List[ScanRecord]: ...
+    @overload
+    def __getitem__(self, bucket: TableBucket) -> List[ScanRecord]: ...
+    def __getitem__(self, key: Union[int, slice, TableBucket]) -> Union[ScanRecord, List[ScanRecord]]: ...
+    def __contains__(self, bucket: TableBucket) -> bool: ...
+    def __iter__(self) -> Iterator[ScanRecord]: ...
+    def __str__(self) -> str: ...
+    def __repr__(self) -> str: ...
+
+class Config:
+    """Client configuration passed to ``FlussConnection.create()``.
+
+    May be initialised from an optional ``str -> str`` properties dict; every
+    setting is additionally exposed as a typed read/write property. The
+    properties fall into groups by prefix: ``writer_*``, ``scanner_*`` and
+    ``security_*``, plus connection-level settings such as
+    ``bootstrap_servers`` and ``connect_timeout_ms``.
+
+    NOTE(review): the key names accepted in ``properties`` and the default
+    values are not visible in this stub -- confirm against the binding
+    implementation.
+    """
+
+    def __init__(self, properties: Optional[Dict[str, str]] = None) -> None: ...
+    @property
+    def bootstrap_servers(self) -> str: ...
+    @bootstrap_servers.setter
+    def bootstrap_servers(self, server: str) -> None: ...
+    @property
+    def writer_request_max_size(self) -> int: ...
+    @writer_request_max_size.setter
+    def writer_request_max_size(self, size: int) -> None: ...
+    @property
+    def writer_acks(self) -> str: ...
+    @writer_acks.setter
+    def writer_acks(self, acks: str) -> None: ...
+    @property
+    def writer_retries(self) -> int: ...
+    @writer_retries.setter
+    def writer_retries(self, retries: int) -> None: ...
+    @property
+    def writer_batch_size(self) -> int: ...
+    @writer_batch_size.setter
+    def writer_batch_size(self, size: int) -> None: ...
+    @property
+    def writer_dynamic_batch_size_enabled(self) -> bool: ...
+    @writer_dynamic_batch_size_enabled.setter
+    def writer_dynamic_batch_size_enabled(self, enabled: bool) -> None: ...
+    @property
+    def writer_dynamic_batch_size_min(self) -> int: ...
+    @writer_dynamic_batch_size_min.setter
+    def writer_dynamic_batch_size_min(self, size: int) -> None: ...
+    @property
+    def writer_bucket_no_key_assigner(self) -> str: ...
+    @writer_bucket_no_key_assigner.setter
+    def writer_bucket_no_key_assigner(self, value: str) -> None: ...
+    @property
+    def scanner_remote_log_prefetch_num(self) -> int: ...
+    @scanner_remote_log_prefetch_num.setter
+    def scanner_remote_log_prefetch_num(self, num: int) -> None: ...
+    @property
+    def remote_file_download_thread_num(self) -> int: ...
+    @remote_file_download_thread_num.setter
+    def remote_file_download_thread_num(self, num: int) -> None: ...
+    @property
+    def scanner_remote_log_read_concurrency(self) -> int: ...
+    @scanner_remote_log_read_concurrency.setter
+    def scanner_remote_log_read_concurrency(self, num: int) -> None: ...
+    @property
+    def scanner_log_max_poll_records(self) -> int: ...
+    @scanner_log_max_poll_records.setter
+    def scanner_log_max_poll_records(self, num: int) -> None: ...
+    @property
+    def scanner_log_fetch_max_bytes(self) -> int: ...
+    @scanner_log_fetch_max_bytes.setter
+    def scanner_log_fetch_max_bytes(self, bytes: int) -> None: ...
+    @property
+    def scanner_log_fetch_min_bytes(self) -> int: ...
+    @scanner_log_fetch_min_bytes.setter
+    def scanner_log_fetch_min_bytes(self, bytes: int) -> None: ...
+    @property
+    def scanner_log_fetch_wait_max_time_ms(self) -> int: ...
+    @scanner_log_fetch_wait_max_time_ms.setter
+    def scanner_log_fetch_wait_max_time_ms(self, ms: int) -> None: ...
+    @property
+    def scanner_log_fetch_max_bytes_for_bucket(self) -> int: ...
+    @scanner_log_fetch_max_bytes_for_bucket.setter
+    def scanner_log_fetch_max_bytes_for_bucket(self, bytes: int) -> None: ...
+    @property
+    def writer_batch_timeout_ms(self) -> int: ...
+    @writer_batch_timeout_ms.setter
+    def writer_batch_timeout_ms(self, timeout: int) -> None: ...
+    @property
+    def writer_enable_idempotence(self) -> bool: ...
+    @writer_enable_idempotence.setter
+    def writer_enable_idempotence(self, enabled: bool) -> None: ...
+    @property
+    def writer_max_inflight_requests_per_bucket(self) -> int: ...
+    @writer_max_inflight_requests_per_bucket.setter
+    def writer_max_inflight_requests_per_bucket(self, num: int) -> None: ...
+    @property
+    def writer_buffer_memory_size(self) -> int: ...
+    @writer_buffer_memory_size.setter
+    def writer_buffer_memory_size(self, size: int) -> None: ...
+    @property
+    def writer_buffer_wait_timeout_ms(self) -> int: ...
+    @writer_buffer_wait_timeout_ms.setter
+    def writer_buffer_wait_timeout_ms(self, timeout: int) -> None: ...
+    @property
+    def connect_timeout_ms(self) -> int: ...
+    @connect_timeout_ms.setter
+    def connect_timeout_ms(self, timeout: int) -> None: ...
+    @property
+    def security_protocol(self) -> str: ...
+    @security_protocol.setter
+    def security_protocol(self, protocol: str) -> None: ...
+    @property
+    def security_sasl_mechanism(self) -> str: ...
+    @security_sasl_mechanism.setter
+    def security_sasl_mechanism(self, mechanism: str) -> None: ...
+    @property
+    def security_sasl_username(self) -> str: ...
+    @security_sasl_username.setter
+    def security_sasl_username(self, username: str) -> None: ...
+    @property
+    def security_sasl_password(self) -> str: ...
+    @security_sasl_password.setter
+    def security_sasl_password(self, password: str) -> None: ...
+
+class FlussConnection:
+    """Connection handle from which admin and table objects are obtained.
+
+    Create one with ``await FlussConnection.create(config)`` and release it
+    with ``await conn.close()``. Both the ``with`` and ``async with``
+    protocols are declared, so the connection can also be scoped by a
+    context manager (presumably closing it on exit -- confirm).
+
+    NOTE(review): ``__exit__``/``__aexit__`` are annotated ``-> bool``; type
+    checkers read that as "may suppress exceptions". If the implementation
+    never suppresses, ``None`` (or ``Literal[False]``) would be more precise
+    -- confirm.
+    """
+
+    @staticmethod
+    async def create(config: Config) -> FlussConnection: ...
+    # Synchronous, unlike ``get_table`` which must be awaited.
+    def get_admin(self) -> FlussAdmin: ...
+    async def get_table(self, table_path: TablePath) -> FlussTable: ...
+    async def close(self) -> None: ...
+    def __enter__(self) -> FlussConnection: ...
+    def __exit__(
+        self,
+        exc_type: Optional[type],
+        exc_value: Optional[BaseException],
+        traceback: Optional[TracebackType],
+    ) -> bool: ...
+    async def __aenter__(self) -> FlussConnection: ...
+    async def __aexit__(
+        self,
+        exc_type: Optional[type],
+        exc_value: Optional[BaseException],
+        traceback: Optional[TracebackType],
+    ) -> bool: ...
+    def __repr__(self) -> str: ...
+
+class ServerNode:
+    """Information about a server node in the Fluss cluster.
+
+    Read-only view; a list of these is returned by
+    ``FlussAdmin.get_server_nodes()``.
+    """
+
+    @property
+    def id(self) -> int:
+        """The server node ID."""
+        ...
+    @property
+    def host(self) -> str:
+        """The hostname of the server."""
+        ...
+    @property
+    def port(self) -> int:
+        """The port number of the server."""
+        ...
+    @property
+    def server_type(self) -> str:
+        """The type of server ('CoordinatorServer' or 'TabletServer')."""
+        ...
+    @property
+    def uid(self) -> str:
+        """The unique identifier of the server (e.g. 'cs-0', 'ts-1')."""
+        ...
+    def __repr__(self) -> str: ...
+
+class FlussAdmin:
+    """Administrative operations on databases, tables, partitions and offsets.
+
+    Obtained from ``FlussConnection.get_admin()``. Every operation declared
+    here is a coroutine and must be awaited.
+    """
+
+    async def create_database(
+        self,
+        database_name: str,
+        database_descriptor: Optional["DatabaseDescriptor"] = None,
+        ignore_if_exists: bool = False,
+    ) -> None:
+        """Create a database."""
+        ...
+    async def drop_database(
+        self,
+        database_name: str,
+        ignore_if_not_exists: bool = False,
+        cascade: bool = True,
+    ) -> None:
+        """Drop a database.
+
+        NOTE(review): ``cascade`` defaults to ``True``; presumably this also
+        drops the tables contained in the database -- confirm, since that
+        makes the default destructive.
+        """
+        ...
+    async def list_databases(self) -> List[str]:
+        """List all databases."""
+        ...
+    async def database_exists(self, database_name: str) -> bool:
+        """Check if a database exists."""
+        ...
+    async def get_database_info(self, database_name: str) -> "DatabaseInfo":
+        """Get database information."""
+        ...
+    async def list_tables(self, database_name: str) -> List[str]:
+        """List all tables in a database."""
+        ...
+    async def table_exists(self, table_path: TablePath) -> bool:
+        """Check if a table exists."""
+        ...
+    async def drop_partition(
+        self,
+        table_path: TablePath,
+        partition_spec: Dict[str, str],
+        ignore_if_not_exists: bool = False,
+    ) -> None:
+        """Drop a partition from a partitioned table."""
+        ...
+    # Create a table at ``table_path`` as described by ``table_descriptor``.
+    # NOTE(review): ``ignore_if_exists`` is ``Optional[bool]`` here but plain
+    # ``bool`` on ``create_database``/``create_partition`` -- confirm whether
+    # ``None`` is really accepted.
+    async def create_table(
+        self,
+        table_path: TablePath,
+        table_descriptor: TableDescriptor,
+        ignore_if_exists: Optional[bool] = False,
+    ) -> None: ...
+    # Fetch the ``TableInfo`` for ``table_path``.
+    async def get_table_info(self, table_path: TablePath) -> TableInfo: ...
+    # Fetch the latest ``LakeSnapshot`` for ``table_path``.
+    async def get_latest_lake_snapshot(self, table_path: TablePath) -> LakeSnapshot: ...
+    # Drop the table at ``table_path``. Presumably ``ignore_if_not_exists``
+    # suppresses the error for a missing table -- confirm.
+    async def drop_table(
+        self,
+        table_path: TablePath,
+        ignore_if_not_exists: bool = False,
+    ) -> None: ...
+    async def list_offsets(
+        self,
+        table_path: TablePath,
+        bucket_ids: List[int],
+        offset_spec: "OffsetSpec",
+    ) -> Dict[int, int]:
+        """List offsets for the specified buckets.
+
+        Args:
+            table_path: Path to the table
+            bucket_ids: List of bucket IDs to query
+            offset_spec: Offset specification (OffsetSpec.earliest(), OffsetSpec.latest(),
+                or OffsetSpec.timestamp(ts))
+
+        Returns:
+            Dict mapping bucket_id -> offset
+        """
+        ...
+    async def list_partition_offsets(
+        self,
+        table_path: TablePath,
+        partition_name: str,
+        bucket_ids: List[int],
+        offset_spec: "OffsetSpec",
+    ) -> Dict[int, int]:
+        """List offsets for buckets in a specific partition.
+
+        Args:
+            table_path: Path to the table
+            partition_name: Partition value (e.g., "US" not "region=US")
+            bucket_ids: List of bucket IDs to query
+            offset_spec: Offset specification (OffsetSpec.earliest(), OffsetSpec.latest(),
+                or OffsetSpec.timestamp(ts))
+
+        Returns:
+            Dict mapping bucket_id -> offset
+        """
+        ...
+    async def create_partition(
+        self,
+        table_path: TablePath,
+        partition_spec: Dict[str, str],
+        ignore_if_exists: bool = False,
+    ) -> None:
+        """Create a partition for a partitioned table.
+
+        Args:
+            table_path: Path to the table
+            partition_spec: Dict mapping partition column name to value (e.g., {"region": "US"})
+            ignore_if_exists: If True, don't raise error if partition already exists
+        """
+        ...
+    async def list_partition_infos(
+        self,
+        table_path: TablePath,
+        partition_spec: Optional[Dict[str, str]] = None,
+    ) -> List["PartitionInfo"]:
+        """List partitions for a partitioned table.
+
+        Args:
+            table_path: Path to the table
+            partition_spec: Optional partial partition spec to filter results.
+                Dict mapping partition column name to value (e.g., {"region": "US"}).
+                If None, returns all partitions.
+
+        Returns:
+            List of PartitionInfo objects
+        """
+        ...
+    async def get_server_nodes(self) -> List[ServerNode]:
+        """Get all alive server nodes in the cluster.
+
+        Returns:
+            List of ServerNode objects (coordinator and tablet servers)
+        """
+        ...
+    def __repr__(self) -> str: ...
+
+
+class DatabaseDescriptor:
+    """Descriptor for a Fluss database (comment and custom properties).
+
+    Both constructor arguments are optional. Accepted by
+    ``FlussAdmin.create_database()`` and returned by
+    ``DatabaseInfo.get_database_descriptor()``.
+    """
+
+    def __init__(
+        self,
+        comment: Optional[str] = None,
+        custom_properties: Optional[Dict[str, str]] = None,
+    ) -> None: ...
+    # ``comment`` is a property, whereas the custom properties are read
+    # through a method call.
+    @property
+    def comment(self) -> Optional[str]: ...
+    def get_custom_properties(self) -> Dict[str, str]: ...
+    def __repr__(self) -> str: ...
+
+
+class DatabaseInfo:
+    """Information about a Fluss database.
+
+    Returned by ``FlussAdmin.get_database_info()``.
+    """
+
+    @property
+    def database_name(self) -> str: ...
+    def get_database_descriptor(self) -> DatabaseDescriptor: ...
+    # NOTE(review): the unit of ``created_time`` / ``modified_time`` is not
+    # stated in this stub (presumably an epoch timestamp) -- confirm.
+    @property
+    def created_time(self) -> int: ...
+    @property
+    def modified_time(self) -> int: ...
+    def __repr__(self) -> str: ...
+
+class TableScan:
+    """Builder for creating log scanners with flexible configuration.
+
+    Use this builder to configure projection before creating a log scanner.
+    Obtain a TableScan instance via `FlussTable.new_scan()`.
+
+    The projection methods are synchronous and return a ``TableScan`` so they
+    can be chained; the two ``create_*`` methods are coroutines and must be
+    awaited. Both of them are typed as returning ``LogScanner``.
+
+    Example:
+        ```python
+        # Record-based scanning with projection
+        scanner = await table.new_scan() \\
+            .project([0, 1, 2]) \\
+            .create_log_scanner()
+
+        # Batch-based scanning with column names
+        scanner = await table.new_scan() \\
+            .project_by_name(["id", "name"]) \\
+            .create_record_batch_log_scanner()
+        ```
+    """
+
+    def project(self, indices: List[int]) -> "TableScan":
+        """Project to specific columns by their indices.
+
+        Args:
+            indices: List of column indices (0-based) to include in the scan.
+
+        Returns:
+            Self for method chaining.
+        """
+        ...
+    def project_by_name(self, names: List[str]) -> "TableScan":
+        """Project to specific columns by their names.
+
+        Args:
+            names: List of column names to include in the scan.
+
+        Returns:
+            Self for method chaining.
+        """
+        ...
+    async def create_log_scanner(self) -> LogScanner:
+        """Create a record-based log scanner.
+
+        Use this scanner with `poll()` to get individual records with metadata
+        (offset, timestamp, change_type).
+
+        Returns:
+            LogScanner for record-by-record scanning with `poll()`
+        """
+        ...
+    async def create_record_batch_log_scanner(self) -> LogScanner:
+        """Create a batch-based log scanner.
+
+        Use this scanner with `poll_arrow()` to get Arrow Tables, or with
+        `poll_record_batch()` to get individual batches with metadata.
+
+        Returns:
+            LogScanner for batch-based scanning with `poll_arrow()` or `poll_record_batch()`
+        """
+        ...
+    def __repr__(self) -> str: ...
+
+class FlussTable:  # table handle; entry point for scan/append/upsert/lookup builders
+    def new_scan(self) -> TableScan:
+        """Create a new table scan builder for configuring and creating log scanners.
+
+        Use this method to create scanners with the builder pattern:
+
+        Example:
+            ```python
+            # Record-based scanning
+            scanner = await table.new_scan() \\
+                .project([0, 1]) \\
+                .create_log_scanner()
+
+            # Batch-based scanning
+            scanner = await table.new_scan() \\
+                .project_by_name(["id", "name"]) \\
+                .create_record_batch_log_scanner()
+            ```
+
+        Returns:
+            TableScan builder for configuring the scanner.
+        """
+        ...
+    def new_append(self) -> TableAppend: ...  # builder for an AppendWriter
+    def new_upsert(self) -> TableUpsert: ...  # builder for an UpsertWriter
+    def new_lookup(self) -> TableLookup: ...  # builder for Lookuper / PrefixLookuper
+    def get_table_info(self) -> TableInfo: ...  # TableInfo metadata for this table
+    def get_table_path(self) -> TablePath: ...  # TablePath of this table
+    def has_primary_key(self) -> bool: ...  # True if the table has a primary key
+    def __repr__(self) -> str: ...
+
+class TableAppend:
+    """Builder for creating an AppendWriter.
+
+    Obtain via `FlussTable.new_append()`, then call `create_writer()`.
+
+    Example:
+        writer = table.new_append().create_writer()
+    """
+
+    def create_writer(self) -> AppendWriter: ...
+    def __repr__(self) -> str: ...
+
+class TableUpsert:
+    """Builder for creating an UpsertWriter, with optional partial update.
+
+    Obtain via `FlussTable.new_upsert()`, then optionally call
+    `partial_update_by_name()` or `partial_update_by_index()`,
+    then call `create_writer()`.
+
+    Example:
+        # Full row upsert
+        writer = table.new_upsert().create_writer()
+
+        # Partial update by column names
+        writer = table.new_upsert().partial_update_by_name(["col1", "col2"]).create_writer()
+
+        # Partial update by column indices
+        writer = table.new_upsert().partial_update_by_index([0, 1]).create_writer()
+    """
+
+    def partial_update_by_name(self, columns: List[str]) -> "TableUpsert": ...
+    def partial_update_by_index(self, column_indices: List[int]) -> "TableUpsert": ...
+    def create_writer(self) -> UpsertWriter: ...
+    def __repr__(self) -> str: ...
+
+class TableLookup:
+    """Builder for creating a Lookuper or PrefixLookuper.
+
+    Obtain via `FlussTable.new_lookup()`, then call `create_lookuper()`
+    for primary key lookup, or `lookup_by(columns).create_lookuper()`
+    for prefix key lookup.
+
+    Example:
+        lookuper = table.new_lookup().create_lookuper()
+        prefix_lookuper = table.new_lookup().lookup_by(["a", "b"]).create_lookuper()
+    """
+
+    def create_lookuper(self) -> Lookuper: ...
+    def lookup_by(self, column_names: List[str]) -> "TablePrefixLookup":
+        """Switch to prefix-scan mode for the given lookup columns.
+
+        The columns must be the table's partition keys (if any) plus the
+        bucket keys, in that order.
+
+        Args:
+            column_names: List of column names forming the prefix key.
+
+        Returns:
+            TablePrefixLookup builder. Call `create_lookuper()` to get a PrefixLookuper.
+        """
+        ...
+    def __repr__(self) -> str: ...
+
+class TablePrefixLookup:
+    """Builder for creating a PrefixLookuper.
+
+    Obtain via `TableLookup.lookup_by(columns)`, then call `create_lookuper()`.
+
+    Example:
+        prefix_lookuper = table.new_lookup().lookup_by(["a", "b"]).create_lookuper()
+    """
+
+    def create_lookuper(self) -> "PrefixLookuper": ...
+    def __repr__(self) -> str: ...
+
+class AppendWriter:  # writer for appending rows, Arrow data and DataFrames
+    def append(self, row: dict | list | tuple) -> WriteResultHandle:
+        """Append a single row to the table.
+
+        Args:
+            row: Dictionary mapping field names to values, or
+                 list/tuple of values in schema order
+
+        Returns:
+            WriteResultHandle: Ignore for fire-and-forget, or await handle.wait() for acknowledgement.
+
+        Supported Types:
+            - Boolean, TinyInt, SmallInt, Int, BigInt (boolean and integers)
+            - Float, Double (floating point)
+            - String, Char (text)
+            - Bytes, Binary (binary data)
+            - Date, Time, Timestamp, TimestampLTZ (temporal)
+            - Decimal (arbitrary precision)
+            - Null values
+
+        Example:
+            writer.append({'id': 1, 'name': 'Alice', 'score': 95.5})
+            writer.append([1, 'Alice', 95.5])
+
+        Note:
+            For high-throughput bulk loading, prefer write_arrow_batch().
+            Use flush() to ensure all queued records are sent and acknowledged.
+        """
+        ...
+    def write_arrow(self, table: pa.Table) -> None: ...  # whole pa.Table; no handle
+    def write_arrow_batch(self, batch: pa.RecordBatch) -> WriteResultHandle: ...
+    def write_pandas(self, df: pd.DataFrame) -> None: ...  # DataFrame; no handle
+    async def flush(self) -> None: ...  # send queued records and await their acks
+    async def __aenter__(self) -> AppendWriter:
+        """
+        Enter the async context manager.
+
+        Returns:
+            The AppendWriter instance.
+        """
+        ...
+    async def __aexit__(
+        self,
+        exc_type: Optional[type],
+        exc_value: Optional[BaseException],
+        traceback: Optional[TracebackType],
+    ) -> bool:
+        """
+        Exit the async context manager.
+
+        On exit, the writer is automatically flushed to ensure
+        all pending records are sent and acknowledged.
+        """
+        ...
+    def __repr__(self) -> str: ...
+
+class UpsertWriter:
+    """Writer for upserting and deleting data in a Fluss primary key table."""
+
+    def upsert(self, row: dict | list | tuple) -> WriteResultHandle:
+        """Upsert a row into the table.
+
+        If a row with the same primary key exists, it will be updated.
+        Otherwise, a new row will be inserted.
+
+        Args:
+            row: Dictionary mapping field names to values, or
+                 list/tuple of values in schema order
+
+        Returns:
+            WriteResultHandle: Ignore for fire-and-forget, or await handle.wait() for ack.
+        """
+        ...
+    def delete(self, pk: dict | list | tuple) -> WriteResultHandle:
+        """Delete a row from the table by primary key.
+
+        Args:
+            pk: Dictionary with PK column names as keys, or
+                list/tuple of PK values in PK column order
+
+        Returns:
+            WriteResultHandle: Ignore for fire-and-forget, or await handle.wait() for ack.
+        """
+        ...
+    async def flush(self) -> None:
+        """Flush all pending upsert/delete operations to the server."""
+        ...
+    async def __aenter__(self) -> UpsertWriter:
+        """
+        Enter the async context manager.
+
+        Returns:
+            The UpsertWriter instance.
+        """
+        ...
+    async def __aexit__(
+        self,
+        exc_type: Optional[type],
+        exc_value: Optional[BaseException],
+        traceback: Optional[TracebackType],
+    ) -> bool:
+        """
+        Exit the async context manager.
+
+        On exit, the writer is automatically flushed to ensure
+        all pending records are sent and acknowledged.
+        """
+        ...
+    def __repr__(self) -> str: ...
+
+
+class WriteResultHandle:
+    """Handle for a pending write (append/upsert/delete). Ignore for fire-and-forget, or await handle.wait() for ack."""
+
+    async def wait(self) -> None:
+        """Wait for server acknowledgment of this write."""
+        ...
+    def __repr__(self) -> str: ...
+
+
+class Lookuper:
+    """Lookuper for performing primary key lookups on a Fluss table."""
+
+    async def lookup(self, pk: dict | list | tuple) -> Optional[Dict[str, object]]:
+        """Lookup a row by its primary key.
+
+        Args:
+            pk: Dictionary with PK column names as keys, or
+                list/tuple of PK values in PK column order
+
+        Returns:
+            A dict containing the row data if found, None otherwise.
+        """
+        ...
+    def __repr__(self) -> str: ...
+
+class PrefixLookuper:
+    """Lookuper for performing prefix key lookups on a Fluss table.
+
+    Returns all rows whose primary key starts with the given prefix.
+    Create via `table.new_lookup().lookup_by(columns).create_lookuper()`.
+    """
+
+    async def lookup(self, prefix: dict | list | tuple) -> List[Dict[str, object]]:
+        """Lookup all rows matching a prefix key.
+
+        Args:
+            prefix: A dict, list, or tuple containing only the prefix key values
+                (the columns specified in lookup_by()).
+                For dict: keys are prefix column names.
+                For list/tuple: values in prefix column order.
+
+        Returns:
+            A list of dicts, each containing the full row data.
+            Empty list if no matches.
+        """
+        ...
+    def __repr__(self) -> str: ...
+
+class LogScanner:
+    """Scanner for reading log data from a Fluss table.
+
+    This scanner supports two modes:
+    - Record-based scanning via `poll()` - returns individual records with metadata
+    - Batch-based scanning via `poll_arrow()` / `poll_record_batch()` - returns Arrow batches
+
+    Create scanners using the builder pattern:
+        # Record-based scanning
+        scanner = await table.new_scan().create_log_scanner()
+
+        # Batch-based scanning
+        scanner = await table.new_scan().create_record_batch_log_scanner()
+
+        # With projection
+        scanner = await table.new_scan().project([0, 1]).create_log_scanner()
+    """
+
+    def subscribe(self, bucket_id: int, start_offset: int) -> None:
+        """Subscribe to a single bucket at a specific offset (non-partitioned tables).
+
+        Args:
+            bucket_id: The bucket ID to subscribe to
+            start_offset: The offset to start reading from (use EARLIEST_OFFSET for beginning)
+        """
+        ...
+    def subscribe_buckets(self, bucket_offsets: Dict[int, int]) -> None:
+        """Subscribe to multiple buckets at specified offsets (non-partitioned tables).
+
+        Args:
+            bucket_offsets: Dict mapping bucket_id -> start_offset
+        """
+        ...
+    def subscribe_partition(
+        self, partition_id: int, bucket_id: int, start_offset: int
+    ) -> None:
+        """Subscribe to a bucket within a specific partition (partitioned tables only).
+
+        Args:
+            partition_id: The partition ID (from PartitionInfo.partition_id)
+            bucket_id: The bucket ID within the partition
+            start_offset: The offset to start reading from (use EARLIEST_OFFSET for beginning)
+        """
+        ...
+    def subscribe_partition_buckets(
+        self, partition_bucket_offsets: Dict[Tuple[int, int], int]
+    ) -> None:
+        """Subscribe to multiple partition+bucket combinations at once (partitioned tables only).
+
+        Args:
+            partition_bucket_offsets: Dict mapping (partition_id, bucket_id) tuples to start_offsets.
+                Example: {(partition_id_1, 0): EARLIEST_OFFSET, (partition_id_2, 1): 100}
+        """
+        ...
+    def unsubscribe(self, bucket_id: int) -> None:
+        """Unsubscribe from a specific bucket (non-partitioned tables only).
+
+        Args:
+            bucket_id: The bucket ID to unsubscribe from
+        """
+        ...
+    def unsubscribe_partition(self, partition_id: int, bucket_id: int) -> None:
+        """Unsubscribe from a specific partition bucket (partitioned tables only).
+
+        Args:
+            partition_id: The partition ID to unsubscribe from
+            bucket_id: The bucket ID within the partition
+        """
+        ...
+    async def poll(self, timeout_ms: int) -> ScanRecords:
+        """Poll for individual records with metadata.
+
+        Requires a record-based scanner (created with new_scan().create_log_scanner()).
+
+        Args:
+            timeout_ms: Timeout in milliseconds to wait for records.
+
+        Returns:
+            ScanRecords grouped by bucket. Supports flat iteration
+            (``for rec in records``) and per-bucket access
+            (``records.buckets()``, ``records.records(bucket)``).
+
+        Note:
+            Returns an empty ScanRecords if no records are available or timeout expires.
+        """
+        ...
+    async def poll_record_batch(self, timeout_ms: int) -> List[RecordBatch]:
+        """Poll for batches with metadata.
+
+        Requires a batch-based scanner (created with new_scan().create_record_batch_log_scanner()).
+
+        Args:
+            timeout_ms: Timeout in milliseconds to wait for batches.
+
+        Returns:
+            List of RecordBatch objects, each containing the Arrow batch along with
+            bucket, base_offset, and last_offset metadata.
+
+        Note:
+            Returns an empty list if no batches are available or timeout expires.
+        """
+        ...
+    async def poll_arrow(self, timeout_ms: int) -> pa.Table:
+        """Poll for records as an Arrow Table.
+
+        Requires a batch-based scanner (created with new_scan().create_record_batch_log_scanner()).
+
+        Args:
+            timeout_ms: Timeout in milliseconds to wait for records.
+
+        Returns:
+            PyArrow Table containing the polled records (batches merged).
+
+        Note:
+            Returns an empty table (with correct schema) if no records are available
+            or timeout expires.
+        """
+        ...
+    def to_arrow_batch_reader(self) -> pa.RecordBatchReader:
+        """Create a lazy Arrow RecordBatchReader that reads until latest offsets.
+
+        Returns a ``pyarrow.RecordBatchReader`` that lazily polls batches one at
+        a time (streaming). Prefer this when you want to process batches without
+        holding the full result in memory at once.
+
+        Do not call ``poll_arrow`` / ``poll_record_batch`` on this scanner while
+        iterating the reader; they share the same underlying scanner state.
+        Overlapping calls are not supported. Use one active
+        polling/consumption path at a time.
+
+        Requires a batch-based scanner (created with ``new_scan().create_record_batch_log_scanner()``).
+        You must call ``subscribe()``, ``subscribe_buckets()``, ``subscribe_partition()``,
+        or ``subscribe_partition_buckets()`` first.
+
+        Returns:
+            ``pyarrow.RecordBatchReader`` yielding ``RecordBatch`` objects.
+        """
+        ...
+    async def to_pandas(self) -> pd.DataFrame:
+        """Convert all data to Pandas DataFrame.
+
+        Requires a batch-based scanner (created with new_scan().create_record_batch_log_scanner()).
+        Reads from currently subscribed buckets until reaching their latest offsets.
+
+        You must call subscribe(), subscribe_buckets(), subscribe_partition(), or subscribe_partition_buckets() first.
+        """
+        ...
+    async def to_arrow(self) -> pa.Table:
+        """Convert all data to Arrow Table.
+
+        Batches are collected in Rust then combined into one table (no per-batch
+        Python iteration). Do not interleave with ``poll_arrow`` / ``poll_record_batch``
+        for the same subscription session; overlapping use is not supported.
+
+        Requires a batch-based scanner (created with new_scan().create_record_batch_log_scanner()).
+        Reads from currently subscribed buckets until reaching their latest offsets.
+
+        You must call subscribe(), subscribe_buckets(), subscribe_partition(), or subscribe_partition_buckets() first.
+        """
+        ...
+
+    def __repr__(self) -> str: ...
+    def __aiter__(self) -> AsyncIterator[Union[ScanRecord, RecordBatch]]: ...
+
+class Schema:  # table schema: a PyArrow schema plus optional primary key columns
+    def __init__(
+        self, schema: pa.Schema, primary_keys: Optional[List[str]] = None
+    ) -> None: ...
+    def get_column_names(self) -> List[str]: ...  # column names
+    def get_column_types(self) -> List[str]: ...  # column types rendered as strings
+    def get_columns(self) -> List[Tuple[str, str]]: ...  # (name, type)? TODO confirm
+    def get_primary_keys(self) -> List[str]: ...  # primary key column names
+    def __str__(self) -> str: ...
+
+class TableDescriptor:  # describes a table: a Schema plus keyword-only options
+    def __init__(
+        self,
+        schema: Schema,
+        *,  # every argument below must be passed by keyword
+        partition_keys: Optional[List[str]] = None,
+        bucket_count: Optional[int] = None,
+        bucket_keys: Optional[List[str]] = None,
+        comment: Optional[str] = None,
+        log_format: Optional[str] = None,  # accepted values not shown in this stub
+        kv_format: Optional[str] = None,  # accepted values not shown in this stub
+        properties: Optional[Dict[str, str]] = None,
+        custom_properties: Optional[Dict[str, str]] = None,
+    ) -> None: ...
+    def get_schema(self) -> Schema: ...  # the Schema of the described table
+
+class TablePath:  # identifies a table by database name and table name
+    def __init__(self, database: str, table: str) -> None: ...
+    @property
+    def database_name(self) -> str: ...  # database part of the path
+    @property
+    def table_name(self) -> str: ...  # table part of the path
+    def table_path_str(self) -> str: ...  # path as one string; format not shown here
+    def __str__(self) -> str: ...
+    def __repr__(self) -> str: ...
+    def __hash__(self) -> int: ...  # declared together with __eq__ below
+    def __eq__(self, other: object) -> bool: ...
+
+class TableInfo:  # table metadata: IDs, times, keys, properties and schema
+    @property
+    def table_id(self) -> int: ...
+    @property
+    def schema_id(self) -> int: ...
+    @property
+    def created_time(self) -> int: ...  # unit/epoch not shown here; TODO confirm
+    @property
+    def modified_time(self) -> int: ...  # unit/epoch not shown here; TODO confirm
+    @property
+    def table_path(self) -> TablePath: ...
+    @property
+    def num_buckets(self) -> int: ...  # number of buckets
+    @property
+    def comment(self) -> Optional[str]: ...  # table comment, if one is set
+    def get_primary_keys(self) -> List[str]: ...  # primary key column names
+    def get_bucket_keys(self) -> List[str]: ...  # bucket key column names
+    def get_partition_keys(self) -> List[str]: ...  # partition key column names
+    def has_primary_key(self) -> bool: ...
+    def is_partitioned(self) -> bool: ...
+    def get_properties(self) -> Dict[str, str]: ...  # table properties (str -> str)
+    def get_custom_properties(self) -> Dict[str, str]: ...  # custom properties
+    def get_schema(self) -> Schema: ...
+    def get_column_names(self) -> List[str]: ...
+    def get_column_count(self) -> int: ...  # number of columns
+
+class FlussError(Exception):  # exception carrying a message and an integer code
+    message: str
+    error_code: int  # ErrorCode value; client-side errors use CLIENT_ERROR (-2)
+    def __init__(self, message: str, error_code: int = -2) -> None: ...
+    def __str__(self) -> str: ...
+    @property
+    def is_retriable(self) -> bool: ...  # whether the error is flagged as retriable
+
+class LakeSnapshot:  # a lake snapshot ID together with per-bucket offsets
+    def __init__(self, snapshot_id: int) -> None: ...
+    @property
+    def snapshot_id(self) -> int: ...
+    @property
+    def table_buckets_offset(self) -> Dict[TableBucket, int]: ...  # bucket -> offset
+    def get_bucket_offset(self, bucket: TableBucket) -> Optional[int]: ...  # or None
+    def get_table_buckets(self) -> List[TableBucket]: ...  # list of TableBucket
+    def __str__(self) -> str: ...
+    def __repr__(self) -> str: ...
+
+class TableBucket:  # bucket identity: table ID, bucket ID, optional partition ID
+    def __init__(self, table_id: int, bucket: int) -> None: ...  # no partition argument
+    @staticmethod
+    def with_partition(
+        table_id: int, partition_id: int, bucket: int
+    ) -> TableBucket: ...  # alternate constructor that also takes a partition_id
+    @property
+    def table_id(self) -> int: ...
+    @property
+    def bucket_id(self) -> int: ...
+    @property
+    def partition_id(self) -> Optional[int]: ...  # Optional; see with_partition()
+    def __hash__(self) -> int: ...  # hashable; LakeSnapshot uses it as a dict key
+    def __eq__(self, other: object) -> bool: ...
+    def __str__(self) -> str: ...
+    def __repr__(self) -> str: ...
+
+class PartitionInfo:
+    """Information about a partition."""
+
+    @property
+    def partition_id(self) -> int:
+        """Get the partition ID (globally unique in the cluster)."""
+        ...
+    @property
+    def partition_name(self) -> str:
+        """Get the partition name."""
+        ...
+    def __repr__(self) -> str: ...
+
+class ErrorCode:
+    """Named constants for Fluss API error codes.
+
+    Server API errors have error_code > 0 or == -1.
+    Client-side errors have error_code == CLIENT_ERROR (-2).
+    These constants are convenience names — new server error codes work
+    automatically since error_code is a raw int, not a closed enum.
+    """
+
+    CLIENT_ERROR: int
+    NONE: int
+    UNKNOWN_SERVER_ERROR: int
+    NETWORK_EXCEPTION: int
+    UNSUPPORTED_VERSION: int
+    CORRUPT_MESSAGE: int
+    DATABASE_NOT_EXIST: int
+    DATABASE_NOT_EMPTY: int
+    DATABASE_ALREADY_EXIST: int
+    TABLE_NOT_EXIST: int
+    TABLE_ALREADY_EXIST: int
+    SCHEMA_NOT_EXIST: int
+    LOG_STORAGE_EXCEPTION: int
+    KV_STORAGE_EXCEPTION: int
+    NOT_LEADER_OR_FOLLOWER: int
+    RECORD_TOO_LARGE_EXCEPTION: int
+    CORRUPT_RECORD_EXCEPTION: int
+    INVALID_TABLE_EXCEPTION: int
+    INVALID_DATABASE_EXCEPTION: int
+    INVALID_REPLICATION_FACTOR: int
+    INVALID_REQUIRED_ACKS: int
+    LOG_OFFSET_OUT_OF_RANGE_EXCEPTION: int
+    NON_PRIMARY_KEY_TABLE_EXCEPTION: int
+    UNKNOWN_TABLE_OR_BUCKET_EXCEPTION: int
+    INVALID_UPDATE_VERSION_EXCEPTION: int
+    INVALID_COORDINATOR_EXCEPTION: int
+    FENCED_LEADER_EPOCH_EXCEPTION: int
+    REQUEST_TIME_OUT: int
+    STORAGE_EXCEPTION: int
+    OPERATION_NOT_ATTEMPTED_EXCEPTION: int
+    NOT_ENOUGH_REPLICAS_AFTER_APPEND_EXCEPTION: int
+    NOT_ENOUGH_REPLICAS_EXCEPTION: int
+    SECURITY_TOKEN_EXCEPTION: int
+    OUT_OF_ORDER_SEQUENCE_EXCEPTION: int
+    DUPLICATE_SEQUENCE_EXCEPTION: int
+    UNKNOWN_WRITER_ID_EXCEPTION: int
+    INVALID_COLUMN_PROJECTION: int
+    INVALID_TARGET_COLUMN: int
+    PARTITION_NOT_EXISTS: int
+    TABLE_NOT_PARTITIONED_EXCEPTION: int
+    INVALID_TIMESTAMP_EXCEPTION: int
+    INVALID_CONFIG_EXCEPTION: int
+    LAKE_STORAGE_NOT_CONFIGURED_EXCEPTION: int
+    KV_SNAPSHOT_NOT_EXIST: int
+    PARTITION_ALREADY_EXISTS: int
+    PARTITION_SPEC_INVALID_EXCEPTION: int
+    LEADER_NOT_AVAILABLE_EXCEPTION: int
+    PARTITION_MAX_NUM_EXCEPTION: int
+    AUTHENTICATE_EXCEPTION: int
+    SECURITY_DISABLED_EXCEPTION: int
+    AUTHORIZATION_EXCEPTION: int
+    BUCKET_MAX_NUM_EXCEPTION: int
+    FENCED_TIERING_EPOCH_EXCEPTION: int
+    RETRIABLE_AUTHENTICATE_EXCEPTION: int
+    INVALID_SERVER_RACK_INFO_EXCEPTION: int
+    LAKE_SNAPSHOT_NOT_EXIST: int
+    LAKE_TABLE_ALREADY_EXIST: int
+    INELIGIBLE_REPLICA_EXCEPTION: int
+    INVALID_ALTER_TABLE_EXCEPTION: int
+    DELETION_DISABLED_EXCEPTION: int
+
+class OffsetSpec:
+    """Offset specification for list_offsets(), matching Java's OffsetSpec.
+
+    Use factory methods to create instances:
+        OffsetSpec.earliest()
+        OffsetSpec.latest()
+        OffsetSpec.timestamp(ts)
+    """
+
+    @staticmethod
+    def earliest() -> "OffsetSpec":
+        """Create an OffsetSpec for the earliest available offset."""
+        ...
+    @staticmethod
+    def latest() -> "OffsetSpec":
+        """Create an OffsetSpec for the latest available offset."""
+        ...
+    @staticmethod
+    def timestamp(ts: int) -> "OffsetSpec":
+        """Create an OffsetSpec for the offset at or after the given timestamp."""
+        ...
+    def __repr__(self) -> str: ...
+
+# Constant for earliest offset (-2)
+EARLIEST_OFFSET: int
+
+__version__: str
diff --git a/fluss-rust/bindings/python/fluss/py.typed b/fluss-rust/bindings/python/fluss/py.typed
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/fluss-rust/bindings/python/pyproject.toml b/fluss-rust/bindings/python/pyproject.toml
new file mode 100644
index 0000000000..56a059c9d4
--- /dev/null
+++ b/fluss-rust/bindings/python/pyproject.toml
@@ -0,0 +1,105 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+[build-system]
+requires = ["maturin>=1.0,<2.0"]
+build-backend = "maturin"
+
+[project]
+name = "pyfluss"
+description = "Apache Fluss (incubating) Python client"
+authors = [{name = "Apache Fluss", email = "dev@fluss.apache.org"}]
+license = {text = "Apache-2.0"}
+readme = "PYPI_README.md"
+requires-python = ">=3.9"
+classifiers = [
+    "License :: OSI Approved :: Apache Software License",
+    "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+]
+
+dynamic = ["version"]
+
+dependencies = [
+    "pandas>=2.3.1",
+    "pyarrow>=10.0.0",
+]
+
+[project.urls]
+Homepage = "https://clients.fluss.apache.org/user-guide/python/installation/"
+Repository = "https://github.com/apache/fluss-rust"
+
+[project.optional-dependencies]
+dev = [
+    "mypy>=1.17.1",
+    "pytest>=8.3.5",
+    "pytest-asyncio>=0.25.3",
+    "pytest-xdist>=3.5.0",
+    "pytest-timeout>=2.3.1",
+    "filelock>=3.0",
+    "ruff>=0.9.10",
+    "maturin>=1.8.2",
+]
+docs = [
+    "pdoc>=15.0.4",
+]
+
+[tool.maturin]
+python-source = "."
+module-name = "fluss._fluss"
+features = ["pyo3/extension-module"]
+
+[tool.uv]
+cache-keys = [
+  { file = "pyproject.toml" },
+  { file = "Cargo.toml" },
+  { file = "src/**/*.rs" },
+  { file = "../../crates/**/*.rs" },
+]
+
+[tool.ruff]
+line-length = 88
+fix = true
+
+[tool.ruff.lint]
+ignore = ["E402", "F403", "F405"]
+select = ["E", "F", "I"]
+
+[tool.ruff.lint.pycodestyle]
+max-doc-length = 88
+
+[tool.ruff.lint.pydocstyle]
+convention = "google"  # docstrings here use Google-style Args:/Returns: sections
+
+[tool.ruff.format]
+docstring-code-format = true
+
+[tool.ruff.lint.isort]
+known-first-party = ["fluss"]
+
+[tool.pytest.ini_options]
+asyncio_mode = "auto"
+asyncio_default_fixture_loop_scope = "session"
+timeout = 120
+
+[tool.mypy]
+python_version = "3.9"
+warn_return_any = true
+warn_unused_configs = true
+ignore_missing_imports = true
diff --git a/fluss-rust/bindings/python/src/admin.rs b/fluss-rust/bindings/python/src/admin.rs
new file mode 100644
index 0000000000..5f4e45d5b9
--- /dev/null
+++ b/fluss-rust/bindings/python/src/admin.rs
@@ -0,0 +1,633 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::*;
+use pyo3::conversion::IntoPyObject;
+use pyo3_async_runtimes::tokio::future_into_py;
+use std::sync::Arc;
+
+/// Administrative client for managing Fluss databases, tables and partitions.
+///
+/// Wraps a shared handle to the core `FlussAdmin`. Every method clones that
+/// handle into the future it hands back to Python.
+#[pyclass]
+pub struct FlussAdmin {
+    // Reference-counted core admin client; cloned into each async call.
+    __admin: Arc<fcore::client::FlussAdmin>,
+}
+
+/// Reject any negative bucket ID.
+///
+/// Returns `Ok(())` when every ID is `>= 0` (an empty slice is accepted);
+/// otherwise returns a `FlussError` naming the first negative ID found.
+fn validate_bucket_ids(bucket_ids: &[i32]) -> PyResult<()> {
+    match bucket_ids.iter().copied().find(|&id| id < 0) {
+        Some(bucket_id) => Err(FlussError::new_err(format!(
+            "Invalid bucket_id: {bucket_id}. Bucket IDs must be non-negative"
+        ))),
+        None => Ok(()),
+    }
+}
+
+#[pymethods]
+impl FlussAdmin {
+    /// Create a database.
+    ///
+    /// Args:
+    ///     database_name: Name of the database
+    ///     database_descriptor: Optional descriptor (comment, custom_properties)
+    ///     ignore_if_exists: If True, don't raise error if database already exists
+    ///
+    /// Returns:
+    ///     Awaitable that resolves to None
+    ///
+    /// Errors from the core client are converted with
+    /// `FlussError::from_core_error` and raised when the awaitable is awaited.
+    #[pyo3(signature = (database_name, database_descriptor=None, ignore_if_exists=false))]
+    pub fn create_database<'py>(
+        &self,
+        py: Python<'py>,
+        database_name: &str,
+        database_descriptor: Option<&DatabaseDescriptor>,
+        ignore_if_exists: bool,
+    ) -> PyResult<Bound<'py, PyAny>> {
+        // The future is `async move`, so it takes owned copies of the borrowed arguments.
+        let client = Arc::clone(&self.__admin);
+        let db_name = database_name.to_owned();
+        let core_descriptor = database_descriptor.map(|d| d.to_core().clone());
+
+        future_into_py(py, async move {
+            let outcome = client
+                .create_database(&db_name, core_descriptor.as_ref(), ignore_if_exists)
+                .await;
+            outcome.map_err(|err| FlussError::from_core_error(&err))?;
+
+            Python::attach(|py| Ok(py.None()))
+        })
+    }
+
+    /// Drop a database.
+    ///
+    /// Args:
+    ///     database_name: Name of the database
+    ///     ignore_if_not_exists: If True, don't raise error if database does not exist
+    ///         (defaults to False)
+    ///     cascade: If True, drop tables in the database first (defaults to True)
+    ///
+    /// Returns:
+    ///     Awaitable that resolves to None
+    #[pyo3(signature = (database_name, ignore_if_not_exists=false, cascade=true))]
+    pub fn drop_database<'py>(
+        &self,
+        py: Python<'py>,
+        database_name: &str,
+        ignore_if_not_exists: bool,
+        cascade: bool,
+    ) -> PyResult<Bound<'py, PyAny>> {
+        let client = Arc::clone(&self.__admin);
+        let db_name = database_name.to_owned();
+
+        future_into_py(py, async move {
+            let outcome = client
+                .drop_database(&db_name, ignore_if_not_exists, cascade)
+                .await;
+            outcome.map_err(|err| FlussError::from_core_error(&err))?;
+
+            Python::attach(|py| Ok(py.None()))
+        })
+    }
+
+    /// List all databases.
+    ///
+    /// Returns:
+    ///     Awaitable that resolves to List[str]: names of all databases
+    pub fn list_databases<'py>(&self, py: Python<'py>) -> PyResult<Bound<'py, PyAny>> {
+        let client = Arc::clone(&self.__admin);
+
+        future_into_py(py, async move {
+            let response = client.list_databases().await;
+            let names = response.map_err(|err| FlussError::from_core_error(&err))?;
+
+            // Copy the names into a Python list, stopping at the first append error.
+            Python::attach(|py| {
+                let listing = pyo3::types::PyList::empty(py);
+                names
+                    .into_iter()
+                    .try_for_each(|db_name| listing.append(db_name))?;
+                Ok(listing.unbind())
+            })
+        })
+    }
+
+    /// Check if a database exists.
+    ///
+    /// Args:
+    ///     database_name: Name of the database
+    ///
+    /// Returns:
+    ///     Awaitable that resolves to bool: True if the database exists
+    pub fn database_exists<'py>(
+        &self,
+        py: Python<'py>,
+        database_name: &str,
+    ) -> PyResult<Bound<'py, PyAny>> {
+        let client = Arc::clone(&self.__admin);
+        let db_name = database_name.to_owned();
+
+        future_into_py(py, async move {
+            let lookup = client.database_exists(&db_name).await;
+            let exists = lookup.map_err(|err| FlussError::from_core_error(&err))?;
+
+            // Convert the Rust bool into an owned, type-erased Python object.
+            Python::attach(|py| {
+                let flag = exists.into_pyobject(py)?;
+                Ok(flag.to_owned().into_any().unbind())
+            })
+        })
+    }
+
+    /// Get database information.
+    ///
+    /// Args:
+    ///     database_name: Name of the database
+    ///
+    /// Returns:
+    ///     Awaitable that resolves to DatabaseInfo: database metadata
+    pub fn get_database_info<'py>(
+        &self,
+        py: Python<'py>,
+        database_name: &str,
+    ) -> PyResult<Bound<'py, PyAny>> {
+        let client = Arc::clone(&self.__admin);
+        let db_name = database_name.to_owned();
+
+        future_into_py(py, async move {
+            let response = client.get_database_info(&db_name).await;
+            let core_info = response.map_err(|err| FlussError::from_core_error(&err))?;
+
+            Python::attach(|py| {
+                let wrapped = DatabaseInfo::from_core(core_info);
+                Py::new(py, wrapped)
+            })
+        })
+    }
+
+    /// List all tables in a database.
+    ///
+    /// Args:
+    ///     database_name: Name of the database
+    ///
+    /// Returns:
+    ///     Awaitable that resolves to List[str]: names of all tables in the database
+    pub fn list_tables<'py>(
+        &self,
+        py: Python<'py>,
+        database_name: &str,
+    ) -> PyResult<Bound<'py, PyAny>> {
+        let client = Arc::clone(&self.__admin);
+        let db_name = database_name.to_owned();
+
+        future_into_py(py, async move {
+            let response = client.list_tables(&db_name).await;
+            let table_names = response.map_err(|err| FlussError::from_core_error(&err))?;
+
+            // Copy the names into a Python list, stopping at the first append error.
+            Python::attach(|py| {
+                let listing = pyo3::types::PyList::empty(py);
+                table_names
+                    .into_iter()
+                    .try_for_each(|table| listing.append(table))?;
+                Ok(listing.unbind())
+            })
+        })
+    }
+
+    /// Check if a table exists.
+    ///
+    /// Args:
+    ///     table_path: Path to the table (database, table)
+    ///
+    /// Returns:
+    ///     Awaitable that resolves to bool: True if the table exists
+    pub fn table_exists<'py>(
+        &self,
+        py: Python<'py>,
+        table_path: &TablePath,
+    ) -> PyResult<Bound<'py, PyAny>> {
+        let client = Arc::clone(&self.__admin);
+        let path = table_path.to_core();
+
+        future_into_py(py, async move {
+            let lookup = client.table_exists(&path).await;
+            let exists = lookup.map_err(|err| FlussError::from_core_error(&err))?;
+
+            // Convert the Rust bool into an owned, type-erased Python object.
+            Python::attach(|py| {
+                let flag = exists.into_pyobject(py)?;
+                Ok(flag.to_owned().into_any().unbind())
+            })
+        })
+    }
+
+    /// Drop a partition from a partitioned table.
+    ///
+    /// Args:
+    ///     table_path: Path to the table
+    ///     partition_spec: Dict mapping partition column name to value (e.g., {"region": "US"})
+    ///     ignore_if_not_exists: If True, don't raise error if partition does not exist
+    ///
+    /// Returns:
+    ///     Awaitable that resolves to None
+    #[pyo3(signature = (table_path, partition_spec, ignore_if_not_exists=false))]
+    pub fn drop_partition<'py>(
+        &self,
+        py: Python<'py>,
+        table_path: &TablePath,
+        partition_spec: std::collections::HashMap<String, String>,
+        ignore_if_not_exists: bool,
+    ) -> PyResult<Bound<'py, PyAny>> {
+        let client = Arc::clone(&self.__admin);
+        let path = table_path.to_core();
+        // Wrap the plain column -> value map in the core partition-spec type.
+        let spec = fcore::metadata::PartitionSpec::new(partition_spec);
+
+        future_into_py(py, async move {
+            let outcome = client
+                .drop_partition(&path, &spec, ignore_if_not_exists)
+                .await;
+            outcome.map_err(|err| FlussError::from_core_error(&err))?;
+
+            Python::attach(|py| Ok(py.None()))
+        })
+    }
+
+    /// Create a table with the given schema.
+    ///
+    /// Args:
+    ///     table_path: Path to the table
+    ///     table_descriptor: Descriptor of the table to create
+    ///     ignore_if_exists: Optional bool forwarded to the core client;
+    ///         None (the default) is treated as False
+    ///
+    /// Returns:
+    ///     Awaitable that resolves to None
+    #[pyo3(signature = (table_path, table_descriptor, ignore_if_exists=None))]
+    pub fn create_table<'py>(
+        &self,
+        py: Python<'py>,
+        table_path: &TablePath,
+        table_descriptor: &TableDescriptor,
+        ignore_if_exists: Option<bool>,
+    ) -> PyResult<Bound<'py, PyAny>> {
+        let client = Arc::clone(&self.__admin);
+        let path = table_path.to_core();
+        let descriptor = table_descriptor.to_core().clone();
+        // A missing flag means "raise if the table already exists".
+        let skip_existing = ignore_if_exists.unwrap_or(false);
+
+        future_into_py(py, async move {
+            let outcome = client
+                .create_table(&path, &descriptor, skip_existing)
+                .await;
+            outcome.map_err(|err| FlussError::from_core_error(&err))?;
+
+            Python::attach(|py| Ok(py.None()))
+        })
+    }
+
+    /// Get table information.
+    ///
+    /// Args:
+    ///     table_path: Path to the table
+    ///
+    /// Returns:
+    ///     Awaitable that resolves to TableInfo
+    pub fn get_table_info<'py>(
+        &self,
+        py: Python<'py>,
+        table_path: &TablePath,
+    ) -> PyResult<Bound<'py, PyAny>> {
+        let client = Arc::clone(&self.__admin);
+        let path = table_path.to_core();
+
+        future_into_py(py, async move {
+            let response = client.get_table_info(&path).await;
+            let core_info = response.map_err(|err| FlussError::from_core_error(&err))?;
+
+            // Wrap the core value in its Python-facing class.
+            Python::attach(|py| Py::new(py, TableInfo::from_core(core_info)))
+        })
+    }
+
+    /// Get the latest lake snapshot for a table.
+    ///
+    /// Args:
+    ///     table_path: Path to the table
+    ///
+    /// Returns:
+    ///     Awaitable that resolves to LakeSnapshot
+    pub fn get_latest_lake_snapshot<'py>(
+        &self,
+        py: Python<'py>,
+        table_path: &TablePath,
+    ) -> PyResult<Bound<'py, PyAny>> {
+        let client = Arc::clone(&self.__admin);
+        let path = table_path.to_core();
+
+        future_into_py(py, async move {
+            let response = client.get_latest_lake_snapshot(&path).await;
+            let core_snapshot = response.map_err(|err| FlussError::from_core_error(&err))?;
+
+            // Wrap the core value in its Python-facing class.
+            Python::attach(|py| Py::new(py, LakeSnapshot::from_core(core_snapshot)))
+        })
+    }
+
+    /// Drop a table.
+    ///
+    /// Args:
+    ///     table_path: Path to the table
+    ///     ignore_if_not_exists: If True, don't raise error if the table does not exist
+    ///
+    /// Returns:
+    ///     Awaitable that resolves to None
+    #[pyo3(signature = (table_path, ignore_if_not_exists=false))]
+    pub fn drop_table<'py>(
+        &self,
+        py: Python<'py>,
+        table_path: &TablePath,
+        ignore_if_not_exists: bool,
+    ) -> PyResult<Bound<'py, PyAny>> {
+        let client = Arc::clone(&self.__admin);
+        let path = table_path.to_core();
+
+        future_into_py(py, async move {
+            let outcome = client.drop_table(&path, ignore_if_not_exists).await;
+            outcome.map_err(|err| FlussError::from_core_error(&err))?;
+
+            Python::attach(|py| Ok(py.None()))
+        })
+    }
+
+    /// List offsets for buckets (non-partitioned tables only).
+    ///
+    /// Args:
+    ///     table_path: Path to the table
+    ///     bucket_ids: List of bucket IDs to query; every ID must be non-negative
+    ///     offset_spec: Offset specification (OffsetSpec.earliest(), OffsetSpec.latest(),
+    ///         or OffsetSpec.timestamp(ts))
+    ///
+    /// Returns:
+    ///     Awaitable that resolves to dict[int, int]: mapping of bucket_id -> offset
+    ///
+    /// A negative bucket ID raises FlussError immediately, before any
+    /// awaitable is created.
+    pub fn list_offsets<'py>(
+        &self,
+        py: Python<'py>,
+        table_path: &TablePath,
+        bucket_ids: Vec<i32>,
+        offset_spec: &OffsetSpec,
+    ) -> PyResult<Bound<'py, PyAny>> {
+        validate_bucket_ids(&bucket_ids)?;
+
+        let client = Arc::clone(&self.__admin);
+        let path = table_path.to_core();
+        let spec = offset_spec.inner.clone();
+
+        future_into_py(py, async move {
+            let response = client.list_offsets(&path, &bucket_ids, spec).await;
+            let offsets = response.map_err(|err| FlussError::from_core_error(&err))?;
+
+            // Copy the (bucket_id, offset) pairs into a Python dict.
+            Python::attach(|py| {
+                let mapping = pyo3::types::PyDict::new(py);
+                offsets
+                    .into_iter()
+                    .try_for_each(|(bucket_id, offset)| mapping.set_item(bucket_id, offset))?;
+                Ok(mapping.unbind())
+            })
+        })
+    }
+
+    /// List offsets for buckets in a specific partition of a partitioned table.
+    ///
+    /// Args:
+    ///     table_path: Path to the table
+    ///     partition_name: Partition value (e.g., "US" not "region=US")
+    ///     bucket_ids: List of bucket IDs to query; every ID must be non-negative
+    ///     offset_spec: Offset specification (OffsetSpec.earliest(), OffsetSpec.latest(),
+    ///         or OffsetSpec.timestamp(ts))
+    ///
+    /// Returns:
+    ///     Awaitable that resolves to dict[int, int]: mapping of bucket_id -> offset
+    ///
+    /// A negative bucket ID raises FlussError immediately, before any
+    /// awaitable is created.
+    pub fn list_partition_offsets<'py>(
+        &self,
+        py: Python<'py>,
+        table_path: &TablePath,
+        partition_name: &str,
+        bucket_ids: Vec<i32>,
+        offset_spec: &OffsetSpec,
+    ) -> PyResult<Bound<'py, PyAny>> {
+        validate_bucket_ids(&bucket_ids)?;
+
+        let client = Arc::clone(&self.__admin);
+        let path = table_path.to_core();
+        let partition = partition_name.to_owned();
+        let spec = offset_spec.inner.clone();
+
+        future_into_py(py, async move {
+            let response = client
+                .list_partition_offsets(&path, &partition, &bucket_ids, spec)
+                .await;
+            let offsets = response.map_err(|err| FlussError::from_core_error(&err))?;
+
+            // Copy the (bucket_id, offset) pairs into a Python dict.
+            Python::attach(|py| {
+                let mapping = pyo3::types::PyDict::new(py);
+                offsets
+                    .into_iter()
+                    .try_for_each(|(bucket_id, offset)| mapping.set_item(bucket_id, offset))?;
+                Ok(mapping.unbind())
+            })
+        })
+    }
+
+    /// Create a partition for a partitioned table.
+    ///
+    /// Args:
+    ///     table_path: Path to the table
+    ///     partition_spec: Dict mapping partition column name to value (e.g., {"region": "US"})
+    ///     ignore_if_exists: If True, don't raise error if partition already exists
+    ///
+    /// Returns:
+    ///     Awaitable that resolves to None
+    #[pyo3(signature = (table_path, partition_spec, ignore_if_exists=false))]
+    pub fn create_partition<'py>(
+        &self,
+        py: Python<'py>,
+        table_path: &TablePath,
+        partition_spec: std::collections::HashMap<String, String>,
+        ignore_if_exists: bool,
+    ) -> PyResult<Bound<'py, PyAny>> {
+        let client = Arc::clone(&self.__admin);
+        let path = table_path.to_core();
+        // Wrap the plain column -> value map in the core partition-spec type.
+        let spec = fcore::metadata::PartitionSpec::new(partition_spec);
+
+        future_into_py(py, async move {
+            let outcome = client
+                .create_partition(&path, &spec, ignore_if_exists)
+                .await;
+            outcome.map_err(|err| FlussError::from_core_error(&err))?;
+
+            Python::attach(|py| Ok(py.None()))
+        })
+    }
+
+    /// List partitions for a partitioned table.
+    ///
+    /// Args:
+    ///     table_path: Path to the table
+    ///     partition_spec: Optional partial partition spec to filter results.
+    ///         Dict mapping partition column name to value (e.g., {"region": "US"}).
+    ///         If None, returns all partitions.
+    ///
+    /// Returns:
+    ///     Awaitable that resolves to List[PartitionInfo]
+    #[pyo3(signature = (table_path, partition_spec=None))]
+    pub fn list_partition_infos<'py>(
+        &self,
+        py: Python<'py>,
+        table_path: &TablePath,
+        partition_spec: Option<std::collections::HashMap<String, String>>,
+    ) -> PyResult<Bound<'py, PyAny>> {
+        let client = Arc::clone(&self.__admin);
+        let path = table_path.to_core();
+        // `None` stays `None`; a dict is wrapped in the core partition-spec type.
+        let filter = partition_spec.map(fcore::metadata::PartitionSpec::new);
+
+        future_into_py(py, async move {
+            let response = client
+                .list_partition_infos_with_spec(&path, filter.as_ref())
+                .await;
+            let core_infos = response.map_err(|err| FlussError::from_core_error(&err))?;
+
+            // Wrap each core entry in its Python class and collect into a list.
+            Python::attach(|py| {
+                let listing = pyo3::types::PyList::empty(py);
+                core_infos.into_iter().try_for_each(|core_info| {
+                    let wrapped = Py::new(py, PartitionInfo::from_core(core_info))?;
+                    listing.append(wrapped)
+                })?;
+                Ok(listing.unbind())
+            })
+        })
+    }
+
+    /// Get all alive server nodes in the cluster.
+    ///
+    /// Returns:
+    ///     Awaitable that resolves to List[ServerNode]: server nodes
+    ///     (coordinator and tablet servers)
+    pub fn get_server_nodes<'py>(&self, py: Python<'py>) -> PyResult<Bound<'py, PyAny>> {
+        let client = Arc::clone(&self.__admin);
+
+        future_into_py(py, async move {
+            let response = client.get_server_nodes().await;
+            let core_nodes = response.map_err(|err| FlussError::from_core_error(&err))?;
+
+            // Wrap each core node in its Python class and collect into a list.
+            Python::attach(|py| {
+                let listing = pyo3::types::PyList::empty(py);
+                core_nodes.into_iter().try_for_each(|core_node| {
+                    let wrapped = Py::new(py, ServerNode::from_core(core_node))?;
+                    listing.append(wrapped)
+                })?;
+                Ok(listing.unbind())
+            })
+        })
+    }
+
+    // Fixed text: the repr exposes no state of the wrapped client.
+    fn __repr__(&self) -> String {
+        String::from("FlussAdmin()")
+    }
+}
+
+impl FlussAdmin {
+    // Internal (Rust-only) constructor: wrap an already shared core admin client.
+    // It sits outside the `#[pymethods]` block, so it is not exposed to Python.
+    pub fn from_core(admin: Arc<fcore::client::FlussAdmin>) -> Self {
+        Self { __admin: admin }
+    }
+}
+
+/// Information about a partition: its ID and its name.
+///
+/// Instances are built from the core `PartitionInfo` via `from_core`; the
+/// Python side only sees the two read-only properties below.
+#[pyclass]
+pub struct PartitionInfo {
+    // Copied from the core value's `get_partition_id()`.
+    partition_id: i64,
+    // Copied from the core value's `get_partition_name()`.
+    partition_name: String,
+}
+
+#[pymethods]
+impl PartitionInfo {
+    /// Get the partition ID (globally unique in the cluster)
+    #[getter]
+    fn partition_id(&self) -> i64 {
+        self.partition_id
+    }
+
+    /// Get the partition name (e.g., "US" for a table partitioned by region)
+    #[getter]
+    fn partition_name(&self) -> &str {
+        &self.partition_name
+    }
+
+    // Renders both fields; the name is wrapped in single quotes but not escaped.
+    fn __repr__(&self) -> String {
+        format!(
+            "PartitionInfo(partition_id={}, partition_name='{}')",
+            self.partition_id, self.partition_name
+        )
+    }
+}
+
+impl PartitionInfo {
+    // Internal (Rust-only) conversion from the core partition info.
+    pub fn from_core(info: fcore::metadata::PartitionInfo) -> Self {
+        Self {
+            partition_id: info.get_partition_id(),
+            partition_name: info.get_partition_name(),
+        }
+    }
+}
+
+/// Information about a server node in the Fluss cluster
+///
+/// A plain snapshot of the core `ServerNode`: every field is copied out in
+/// `from_core` and exposed to Python as a read-only property.
+#[pyclass]
+pub struct ServerNode {
+    id: i32,
+    host: String,
+    port: u32,
+    // String form of the core node's server type (via `to_string()`).
+    server_type: String,
+    uid: String,
+}
+
+#[pymethods]
+impl ServerNode {
+    /// Get the node ID
+    #[getter]
+    fn id(&self) -> i32 {
+        self.id
+    }
+
+    /// Get the node host
+    #[getter]
+    fn host(&self) -> &str {
+        &self.host
+    }
+
+    /// Get the node port
+    #[getter]
+    fn port(&self) -> u32 {
+        self.port
+    }
+
+    /// Get the server type as a string
+    #[getter]
+    fn server_type(&self) -> &str {
+        &self.server_type
+    }
+
+    /// Get the node UID
+    #[getter]
+    fn uid(&self) -> &str {
+        &self.uid
+    }
+
+    // Renders id, host, port and server_type; `uid` is not part of the repr.
+    fn __repr__(&self) -> String {
+        format!(
+            "ServerNode(id={}, host='{}', port={}, server_type='{}')",
+            self.id, self.host, self.port, self.server_type
+        )
+    }
+}
+
+impl ServerNode {
+    // Internal (Rust-only) conversion from the core server node.
+    pub fn from_core(node: fcore::ServerNode) -> Self {
+        Self {
+            id: node.id(),
+            host: node.host().to_string(),
+            port: node.port(),
+            server_type: node.server_type().to_string(),
+            uid: node.uid().to_string(),
+        }
+    }
+}
diff --git a/fluss-rust/bindings/python/src/config.rs b/fluss-rust/bindings/python/src/config.rs
new file mode 100644
index 0000000000..11188bf3c6
--- /dev/null
+++ b/fluss-rust/bindings/python/src/config.rs
@@ -0,0 +1,535 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::*;
+use pyo3::types::PyDict;
+
+/// Configuration for Fluss client
+///
+/// Thin wrapper around the core `Config`: the constructor fills it from
+/// string properties and the Python properties read or write its fields.
+#[pyclass]
+#[derive(Clone)]
+pub struct Config {
+    // The core configuration value that all accessors operate on.
+    inner: fcore::config::Config,
+}
+
+#[pymethods]
+impl Config {
+    /// Create a new Config with optional properties from a dictionary.
+    ///
+    /// Args:
+    ///     properties: Optional dict of property name -> value. Keys and
+    ///         values are both extracted as strings (e.g.
+    ///         {"writer.retries": "3"}); numeric options are parsed from
+    ///         their string form and boolean options must be exactly
+    ///         "true" or "false". Options that are not given keep the
+    ///         value from the core default configuration.
+    ///
+    /// Raises:
+    ///     FlussError: If a key is not a recognized property, or a value
+    ///         cannot be parsed into the type the property expects.
+    ///
+    /// A key or value that cannot be extracted as a string fails before any
+    /// parsing, with the error produced by the extraction itself.
+    #[new]
+    #[pyo3(signature = (properties = None))]
+    fn new(properties: Option<&Bound<'_, PyDict>>) -> PyResult<Self> {
+        // Parse `value` into `T`; on failure report the offending value and property name.
+        // Declared inside `new` because `#[pymethods]` blocks cannot hold generic functions.
+        fn parse_value<T>(key: &str, value: &str) -> PyResult<T>
+        where
+            T: std::str::FromStr,
+            T::Err: std::fmt::Display,
+        {
+            value.parse::<T>().map_err(|e| {
+                FlussError::new_err(format!("Invalid value '{value}' for '{key}': {e}"))
+            })
+        }
+
+        // Accept only the exact strings "true" and "false".
+        fn parse_bool(key: &str, value: &str) -> PyResult<bool> {
+            match value {
+                "true" => Ok(true),
+                "false" => Ok(false),
+                other => Err(FlussError::new_err(format!(
+                    "Invalid value '{other}' for '{key}', expected 'true' or 'false'"
+                ))),
+            }
+        }
+
+        let mut config = fcore::config::Config::default();
+
+        if let Some(props) = properties {
+            for item in props.iter() {
+                let key: String = item.0.extract()?;
+                let value: String = item.1.extract()?;
+
+                match key.as_str() {
+                    "bootstrap.servers" => config.bootstrap_servers = value,
+                    "writer.request-max-size" => {
+                        config.writer_request_max_size = parse_value::<i32>(&key, &value)?;
+                    }
+                    "writer.acks" => config.writer_acks = value,
+                    "writer.retries" => {
+                        config.writer_retries = parse_value::<i32>(&key, &value)?;
+                    }
+                    "writer.batch-size" => {
+                        config.writer_batch_size = parse_value::<i32>(&key, &value)?;
+                    }
+                    "writer.dynamic-batch-size.enabled" => {
+                        config.writer_dynamic_batch_size_enabled = parse_bool(&key, &value)?;
+                    }
+                    "writer.dynamic-batch-size-min" => {
+                        config.writer_dynamic_batch_size_min = parse_value::<i32>(&key, &value)?;
+                    }
+                    "writer.batch-timeout-ms" => {
+                        config.writer_batch_timeout_ms = parse_value::<i64>(&key, &value)?;
+                    }
+                    "scanner.remote-log.prefetch-num" => {
+                        config.scanner_remote_log_prefetch_num =
+                            parse_value::<usize>(&key, &value)?;
+                    }
+                    "remote-file.download-thread-num" => {
+                        config.remote_file_download_thread_num =
+                            parse_value::<usize>(&key, &value)?;
+                    }
+                    "scanner.remote-log.read-concurrency" => {
+                        config.scanner_remote_log_read_concurrency =
+                            parse_value::<usize>(&key, &value)?;
+                    }
+                    "scanner.log.max-poll-records" => {
+                        config.scanner_log_max_poll_records = parse_value::<usize>(&key, &value)?;
+                    }
+                    "scanner.log.fetch.max-bytes" => {
+                        config.scanner_log_fetch_max_bytes = parse_value::<i32>(&key, &value)?;
+                    }
+                    "scanner.log.fetch.min-bytes" => {
+                        config.scanner_log_fetch_min_bytes = parse_value::<i32>(&key, &value)?;
+                    }
+                    "scanner.log.fetch.wait-max-time-ms" => {
+                        config.scanner_log_fetch_wait_max_time_ms =
+                            parse_value::<i32>(&key, &value)?;
+                    }
+                    "scanner.log.fetch.max-bytes-for-bucket" => {
+                        config.scanner_log_fetch_max_bytes_for_bucket =
+                            parse_value::<i32>(&key, &value)?;
+                    }
+                    "writer.enable-idempotence" => {
+                        config.writer_enable_idempotence = parse_bool(&key, &value)?;
+                    }
+                    "writer.max-inflight-requests-per-bucket" => {
+                        config.writer_max_inflight_requests_per_bucket =
+                            parse_value::<usize>(&key, &value)?;
+                    }
+                    "writer.buffer.memory-size" => {
+                        config.writer_buffer_memory_size = parse_value::<usize>(&key, &value)?;
+                    }
+                    "writer.buffer.wait-timeout-ms" => {
+                        config.writer_buffer_wait_timeout_ms = parse_value::<u64>(&key, &value)?;
+                    }
+                    "writer.bucket.no-key-assigner" => {
+                        config.writer_bucket_no_key_assigner =
+                            parse_value::<fcore::config::NoKeyAssigner>(&key, &value)?;
+                    }
+                    "connect-timeout" => {
+                        config.connect_timeout_ms = parse_value::<u64>(&key, &value)?;
+                    }
+                    "security.protocol" => config.security_protocol = value,
+                    "security.sasl.mechanism" => config.security_sasl_mechanism = value,
+                    "security.sasl.username" => config.security_sasl_username = value,
+                    "security.sasl.password" => config.security_sasl_password = value,
+                    _ => {
+                        return Err(FlussError::new_err(format!("Unknown property: {key}")));
+                    }
+                }
+            }
+        }
+
+        Ok(Self { inner: config })
+    }
+
+    /// Get the bootstrap servers
+    ///
+    /// Returns a copy of the stored string (same option as the
+    /// "bootstrap.servers" constructor property).
+    #[getter]
+    fn bootstrap_servers(&self) -> String {
+        self.inner.bootstrap_servers.clone()
+    }
+
+    /// Set the bootstrap servers
+    ///
+    /// The string is stored as given; no validation is done here.
+    #[setter]
+    fn set_bootstrap_servers(&mut self, server: String) {
+        self.inner.bootstrap_servers = server;
+    }
+
+    /// Get the writer request max size
+    ///
+    /// Same option as the "writer.request-max-size" constructor property.
+    #[getter]
+    fn writer_request_max_size(&self) -> i32 {
+        self.inner.writer_request_max_size
+    }
+
+    /// Set the writer request max size
+    ///
+    /// The value is stored as given; no range check is done here.
+    #[setter]
+    fn set_writer_request_max_size(&mut self, size: i32) {
+        self.inner.writer_request_max_size = size;
+    }
+
+    /// Get the writer acks
+    ///
+    /// Returns a copy of the stored string (same option as the
+    /// "writer.acks" constructor property).
+    #[getter]
+    fn writer_acks(&self) -> String {
+        self.inner.writer_acks.clone()
+    }
+
+    /// Set the writer acks
+    ///
+    /// The string is stored as given; no validation is done here.
+    #[setter]
+    fn set_writer_acks(&mut self, acks: String) {
+        self.inner.writer_acks = acks;
+    }
+
+    /// Get the writer retries
+    ///
+    /// Same option as the "writer.retries" constructor property.
+    #[getter]
+    fn writer_retries(&self) -> i32 {
+        self.inner.writer_retries
+    }
+
+    /// Set the writer retries
+    ///
+    /// The value is stored as given; no range check is done here.
+    #[setter]
+    fn set_writer_retries(&mut self, retries: i32) {
+        self.inner.writer_retries = retries;
+    }
+
+    /// Get the writer batch size (held as `i32`)
+    #[getter]
+    fn writer_batch_size(&self) -> i32 {
+        self.inner.writer_batch_size
+    }
+
+    /// Set the writer batch size; any `i32` is stored unchanged (no range check here)
+    #[setter]
+    fn set_writer_batch_size(&mut self, size: i32) {
+        self.inner.writer_batch_size = size;
+    }
+
+    /// Get whether the per-table dynamic batch size estimator is enabled (`bool` flag)
+    #[getter]
+    fn writer_dynamic_batch_size_enabled(&self) -> bool {
+        self.inner.writer_dynamic_batch_size_enabled
+    }
+
+    /// Set whether the per-table dynamic batch size estimator is enabled (stores the flag only)
+    #[setter]
+    fn set_writer_dynamic_batch_size_enabled(&mut self, enabled: bool) {
+        self.inner.writer_dynamic_batch_size_enabled = enabled;
+    }
+
+    /// Get the lower bound used by the dynamic batch size estimator (held as `i32`)
+    #[getter]
+    fn writer_dynamic_batch_size_min(&self) -> i32 {
+        self.inner.writer_dynamic_batch_size_min
+    }
+
+    /// Set the lower bound used by the dynamic batch size estimator (no range check here)
+    #[setter]
+    fn set_writer_dynamic_batch_size_min(&mut self, size: i32) {
+        self.inner.writer_dynamic_batch_size_min = size;
+    }
+
+    /// Get the scanner remote log prefetch num (held as `usize`)
+    #[getter]
+    fn scanner_remote_log_prefetch_num(&self) -> usize {
+        self.inner.scanner_remote_log_prefetch_num
+    }
+
+    /// Set the scanner remote log prefetch num; any `usize` is stored unchanged
+    #[setter]
+    fn set_scanner_remote_log_prefetch_num(&mut self, num: usize) {
+        self.inner.scanner_remote_log_prefetch_num = num;
+    }
+
+    /// Get the remote file download thread num (held as `usize`)
+    #[getter]
+    fn remote_file_download_thread_num(&self) -> usize {
+        self.inner.remote_file_download_thread_num
+    }
+
+    /// Set the remote file download thread num; any `usize` is stored unchanged
+    #[setter]
+    fn set_remote_file_download_thread_num(&mut self, num: usize) {
+        self.inner.remote_file_download_thread_num = num;
+    }
+
+    /// Get the scanner remote log read concurrency (held as `usize`)
+    #[getter]
+    fn scanner_remote_log_read_concurrency(&self) -> usize {
+        self.inner.scanner_remote_log_read_concurrency
+    }
+
+    /// Set the scanner remote log read concurrency; any `usize` is stored unchanged
+    #[setter]
+    fn set_scanner_remote_log_read_concurrency(&mut self, num: usize) {
+        self.inner.scanner_remote_log_read_concurrency = num;
+    }
+
+    /// Get the scanner log max poll records (held as `usize`)
+    #[getter]
+    fn scanner_log_max_poll_records(&self) -> usize {
+        self.inner.scanner_log_max_poll_records
+    }
+
+    /// Set the scanner log max poll records; any `usize` is stored unchanged
+    #[setter]
+    fn set_scanner_log_max_poll_records(&mut self, num: usize) {
+        self.inner.scanner_log_max_poll_records = num;
+    }
+
+    /// Get the writer batch timeout in milliseconds (held as `i64`)
+    #[getter]
+    fn writer_batch_timeout_ms(&self) -> i64 {
+        self.inner.writer_batch_timeout_ms
+    }
+
+    /// Set the writer batch timeout in milliseconds; any `i64` is stored unchanged
+    #[setter]
+    fn set_writer_batch_timeout_ms(&mut self, timeout: i64) {
+        self.inner.writer_batch_timeout_ms = timeout;
+    }
+
+    /// Get the no-key bucket assignment strategy, rendered through its `to_string()` form
+    #[getter]
+    fn writer_bucket_no_key_assigner(&self) -> String {
+        self.inner.writer_bucket_no_key_assigner.to_string()
+    }
+
+    /// Parse `value` as a `NoKeyAssigner` and store it; an unparsable value raises `FlussError`
+    #[setter]
+    fn set_writer_bucket_no_key_assigner(&mut self, value: String) -> PyResult<()> {
+        let assigner = value.parse::<fcore::config::NoKeyAssigner>().map_err(|e| {
+            FlussError::new_err(format!(
+                "Invalid value '{value}' for 'writer.bucket.no-key-assigner': {e}"
+            ))
+        })?;
+        self.inner.writer_bucket_no_key_assigner = assigner;
+        Ok(())
+    }
+
+    /// Get whether idempotent writes are enabled (`bool` flag)
+    #[getter]
+    fn writer_enable_idempotence(&self) -> bool {
+        self.inner.writer_enable_idempotence
+    }
+
+    /// Set whether idempotent writes are enabled (this setter only stores the flag)
+    #[setter]
+    fn set_writer_enable_idempotence(&mut self, enabled: bool) {
+        self.inner.writer_enable_idempotence = enabled;
+    }
+
+    /// Get the max in-flight requests per bucket (held as `usize`)
+    #[getter]
+    fn writer_max_inflight_requests_per_bucket(&self) -> usize {
+        self.inner.writer_max_inflight_requests_per_bucket
+    }
+
+    /// Set the max in-flight requests per bucket; any `usize` is stored unchanged
+    #[setter]
+    fn set_writer_max_inflight_requests_per_bucket(&mut self, num: usize) {
+        self.inner.writer_max_inflight_requests_per_bucket = num;
+    }
+
+    /// Get the writer buffer memory size (held as `usize`)
+    #[getter]
+    fn writer_buffer_memory_size(&self) -> usize {
+        self.inner.writer_buffer_memory_size
+    }
+
+    /// Set the writer buffer memory size; any `usize` is stored unchanged
+    #[setter]
+    fn set_writer_buffer_memory_size(&mut self, size: usize) {
+        self.inner.writer_buffer_memory_size = size;
+    }
+
+    /// Get the writer buffer wait timeout in milliseconds (held as `u64`)
+    #[getter]
+    fn writer_buffer_wait_timeout_ms(&self) -> u64 {
+        self.inner.writer_buffer_wait_timeout_ms
+    }
+
+    /// Set the writer buffer wait timeout in milliseconds; any `u64` is stored unchanged
+    #[setter]
+    fn set_writer_buffer_wait_timeout_ms(&mut self, timeout: u64) {
+        self.inner.writer_buffer_wait_timeout_ms = timeout;
+    }
+
+    /// Get the connect timeout in milliseconds (held as `u64`)
+    #[getter]
+    fn connect_timeout_ms(&self) -> u64 {
+        self.inner.connect_timeout_ms
+    }
+
+    /// Set the connect timeout in milliseconds; any `u64` is stored unchanged
+    #[setter]
+    fn set_connect_timeout_ms(&mut self, timeout: u64) {
+        self.inner.connect_timeout_ms = timeout;
+    }
+
+    /// Get the security protocol (a copy of the stored `String`)
+    #[getter]
+    fn security_protocol(&self) -> String {
+        self.inner.security_protocol.clone()
+    }
+
+    /// Set the security protocol; stored as given, with no validation in this setter
+    #[setter]
+    fn set_security_protocol(&mut self, protocol: String) {
+        self.inner.security_protocol = protocol;
+    }
+
+    /// Get the SASL mechanism (a copy of the stored `String`)
+    #[getter]
+    fn security_sasl_mechanism(&self) -> String {
+        self.inner.security_sasl_mechanism.clone()
+    }
+
+    /// Set the SASL mechanism; stored as given, with no validation in this setter
+    #[setter]
+    fn set_security_sasl_mechanism(&mut self, mechanism: String) {
+        self.inner.security_sasl_mechanism = mechanism;
+    }
+
+    /// Get the SASL username (a copy of the stored `String`)
+    #[getter]
+    fn security_sasl_username(&self) -> String {
+        self.inner.security_sasl_username.clone()
+    }
+
+    /// Set the SASL username; stored as given, with no validation in this setter
+    #[setter]
+    fn set_security_sasl_username(&mut self, username: String) {
+        self.inner.security_sasl_username = username;
+    }
+
+    /// Get the SASL password (returned in clear text, exactly as stored)
+    #[getter]
+    fn security_sasl_password(&self) -> String {
+        self.inner.security_sasl_password.clone()
+    }
+
+    /// Set the SASL password; stored as given, with no validation in this setter
+    #[setter]
+    fn set_security_sasl_password(&mut self, password: String) {
+        self.inner.security_sasl_password = password;
+    }
+
+    /// Get the maximum bytes per fetch response for LogScanner (held as `i32`)
+    #[getter]
+    fn scanner_log_fetch_max_bytes(&self) -> i32 {
+        self.inner.scanner_log_fetch_max_bytes
+    }
+
+    /// Set the maximum bytes per fetch response for LogScanner (no range check here)
+    #[setter]
+    fn set_scanner_log_fetch_max_bytes(&mut self, bytes: i32) {
+        self.inner.scanner_log_fetch_max_bytes = bytes;
+    }
+
+    /// Get the minimum bytes to accumulate before returning a fetch response (held as `i32`)
+    #[getter]
+    fn scanner_log_fetch_min_bytes(&self) -> i32 {
+        self.inner.scanner_log_fetch_min_bytes
+    }
+
+    /// Set the minimum bytes to accumulate before returning a fetch response (no range check)
+    #[setter]
+    fn set_scanner_log_fetch_min_bytes(&mut self, bytes: i32) {
+        self.inner.scanner_log_fetch_min_bytes = bytes;
+    }
+
+    /// Get the maximum time (ms) the server may wait to satisfy min-bytes (held as `i32`)
+    #[getter]
+    fn scanner_log_fetch_wait_max_time_ms(&self) -> i32 {
+        self.inner.scanner_log_fetch_wait_max_time_ms
+    }
+
+    /// Set the maximum time (ms) the server may wait to satisfy min-bytes (no range check)
+    #[setter]
+    fn set_scanner_log_fetch_wait_max_time_ms(&mut self, ms: i32) {
+        self.inner.scanner_log_fetch_wait_max_time_ms = ms;
+    }
+
+    /// Get the maximum bytes per fetch response per bucket for LogScanner (held as `i32`)
+    #[getter]
+    fn scanner_log_fetch_max_bytes_for_bucket(&self) -> i32 {
+        self.inner.scanner_log_fetch_max_bytes_for_bucket
+    }
+
+    /// Set the maximum bytes per fetch response per bucket for LogScanner (no range check)
+    #[setter]
+    fn set_scanner_log_fetch_max_bytes_for_bucket(&mut self, bytes: i32) {
+        self.inner.scanner_log_fetch_max_bytes_for_bucket = bytes;
+    }
+}
+
+impl Config {
+    pub fn get_core_config(&self) -> fcore::config::Config {
+        self.inner.clone()
+    }
+}
diff --git a/fluss-rust/bindings/python/src/connection.rs b/fluss-rust/bindings/python/src/connection.rs
new file mode 100644
index 0000000000..3853896ce4
--- /dev/null
+++ b/fluss-rust/bindings/python/src/connection.rs
@@ -0,0 +1,150 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::*;
+use pyo3_async_runtimes::tokio::future_into_py;
+use std::sync::Arc;
+use std::time::Duration;
+
+/// Connection to a Fluss cluster (Python handle sharing the core connection through an `Arc`)
+#[pyclass]
+pub struct FlussConnection {
+    inner: Arc<fcore::client::FlussConnection>,
+}
+
+#[pymethods]
+impl FlussConnection {
+    /// Asynchronously open a connection to the cluster described by `config`.
+    ///
+    /// Returns an awaitable that resolves to a new `FlussConnection`; core
+    /// errors are converted with `FlussError::from_core_error`.
+    #[staticmethod]
+    fn create<'py>(py: Python<'py>, config: &Config) -> PyResult<Bound<'py, PyAny>> {
+        // Take an owned copy of the core config so the future owns its data.
+        let core_config = config.get_core_config();
+        future_into_py(py, async move {
+            let inner = match fcore::client::FlussConnection::new(core_config).await {
+                Ok(conn) => Arc::new(conn),
+                Err(e) => return Err(FlussError::from_core_error(&e)),
+            };
+            // Attach to the interpreter only to allocate the Python object.
+            Python::attach(|py| Py::new(py, FlussConnection { inner }))
+        })
+    }
+
+    /// Build a `FlussAdmin` Python object from the core connection's admin client.
+    ///
+    /// A core failure is converted with `FlussError::from_core_error`.
+    fn get_admin(&self, py: Python<'_>) -> PyResult<Py<FlussAdmin>> {
+        match self.inner.get_admin() {
+            Ok(core_admin) => Py::new(py, FlussAdmin::from_core(core_admin)),
+            Err(e) => Err(FlussError::from_core_error(&e)),
+        }
+    }
+
+    /// Get a table (async): the returned awaitable resolves to a `FlussTable` for `table_path`
+    fn get_table<'py>(
+        &self,
+        py: Python<'py>,
+        table_path: &TablePath,
+    ) -> PyResult<Bound<'py, PyAny>> {
+        let client = self.inner.clone();
+        let core_path = table_path.to_core().clone();
+        // Handle and path are cloned above so the `async move` block owns everything it uses.
+        future_into_py(py, async move {
+            let core_table = client
+                .get_table(&core_path)
+                .await
+                .map_err(|e| FlussError::from_core_error(&e))?;
+            // Copy what the Python wrapper needs out of the core table handle.
+            let py_table = FlussTable::new_table(
+                client.clone(),
+                core_table.metadata().clone(),
+                core_table.get_table_info().clone(),
+                core_table.table_path().clone(),
+                core_table.has_primary_key(),
+            );
+            // Attach to the interpreter to allocate the Python-side object.
+            Python::attach(|py| Py::new(py, py_table))
+        })
+    }
+
+    /// Close the connection (async).
+    ///
+    /// Gracefully shuts down the connection by draining any pending write batches.
+    /// This method is awaitable.
+    /// A failure reported by the core `close` is raised as `FlussError`.
+    fn close<'py>(&self, py: Python<'py>) -> PyResult<Bound<'py, PyAny>> {
+        // Clone the shared handle so the future owns its own reference.
+        let connection = Arc::clone(&self.inner);
+        future_into_py(py, async move {
+            // `Duration::MAX` is forwarded unchanged to the core `close`.
+            let outcome = connection.close(Duration::MAX).await;
+            outcome.map_err(|e| FlussError::from_core_error(&e))
+        })
+    }
+
+    // Enter the runtime context (for 'with' statement); hands back the same object unchanged
+    fn __enter__(slf: PyRef<Self>) -> PyRef<Self> {
+        slf
+    }
+
+    // Exit the runtime context (for 'with' statement); never suppresses exceptions.
+    #[pyo3(signature = (_exc_type=None, _exc_value=None, _traceback=None))]
+    fn __exit__(
+        &self,
+        _exc_type: Option<Bound<'_, PyAny>>,
+        _exc_value: Option<Bound<'_, PyAny>>,
+        _traceback: Option<Bound<'_, PyAny>>,
+    ) -> PyResult<bool> {
+        // Sync exit cannot await the graceful drain, so it's a no-op ('async with' closes).
+        // `&self` suffices: nothing is mutated, so no exclusive PyO3 borrow is requested.
+        Ok(false)
+    }
+
+    // Enter the async runtime context ('async with'): awaitable that resolves to this object
+    fn __aenter__<'py>(slf: PyRef<'py, Self>, py: Python<'py>) -> PyResult<Bound<'py, PyAny>> {
+        let py_slf = slf.into_pyobject(py)?.unbind();
+        future_into_py(py, async move { Ok(py_slf) })
+    }
+
+    // Exit the async runtime context (for 'async with' statement): closes the connection.
+    #[pyo3(signature = (exc_type=None, _exc_value=None, _traceback=None))]
+    fn __aexit__<'py>(
+        &self,
+        py: Python<'py>,
+        exc_type: Option<Bound<'py, PyAny>>,
+        _exc_value: Option<Bound<'py, PyAny>>,
+        _traceback: Option<Bound<'py, PyAny>>,
+    ) -> PyResult<Bound<'py, PyAny>> {
+        let connection = Arc::clone(&self.inner);
+        // True when `exc_type` is absent or Python `None`, i.e. no exception was passed in.
+        let body_succeeded = exc_type.as_ref().is_none_or(|exc| exc.is_none());
+        future_into_py(py, async move {
+            match connection.close(Duration::MAX).await {
+                // A close failure is raised only when no exception was passed in.
+                Err(e) if body_succeeded => Err(FlussError::from_core_error(&e)),
+                // Success, or a close failure while an exception was passed in: resolve to false.
+                _ => Ok(false),
+            }
+        })
+    }
+
+    fn __repr__(&self) -> String {
+        String::from("FlussConnection()")
+    }
+}
diff --git a/fluss-rust/bindings/python/src/error.rs b/fluss-rust/bindings/python/src/error.rs
new file mode 100644
index 0000000000..9d718aa66e
--- /dev/null
+++ b/fluss-rust/bindings/python/src/error.rs
@@ -0,0 +1,276 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use fluss::error::Error;
+use fluss::rpc::FlussError as CoreFlussError;
+use pyo3::exceptions::PyException;
+use pyo3::prelude::*;
+
+/// Error code for client-side errors that did not originate from the server API protocol.
+/// The value -2 is outside the server API error code range (-1 .. 57+), so it will never
+/// collide with current or future API codes. Consistent with the CPP binding.
+const CLIENT_ERROR_CODE: i32 = -2;
+
+/// Python exception (extends `Exception`) carrying a message and a numeric Fluss error code
+#[pyclass(extends=PyException)]
+#[derive(Debug, Clone)]
+pub struct FlussError {
+    #[pyo3(get)]
+    pub message: String,
+    #[pyo3(get)]
+    pub error_code: i32,
+}
+
+#[pymethods]
+impl FlussError {
+    // Python constructor; `error_code` defaults to -2, the client-side error code.
+    #[new]
+    #[pyo3(signature = (message, error_code=-2))]
+    fn new(message: String, error_code: i32) -> Self {
+        Self {
+            message,
+            error_code,
+        }
+    }
+
+    // Client-side errors print without a code; every other code is shown.
+    fn __str__(&self) -> String {
+        match self.error_code {
+            CLIENT_ERROR_CODE => format!("FlussError: {}", self.message),
+            code => format!("FlussError(code={}): {}", code, self.message),
+        }
+    }
+
+    /// Returns ``True`` if retrying the request may succeed. Client-side errors always return ``False``.
+    #[getter]
+    fn is_retriable(&self) -> bool {
+        // `&&` short-circuits, so client-side errors never reach `for_code`.
+        self.error_code != CLIENT_ERROR_CODE
+            && CoreFlussError::for_code(self.error_code).is_retriable()
+    }
+}
+
+impl FlussError {
+    /// Build a `PyErr` wrapping a client-side `FlussError` (code `CLIENT_ERROR_CODE`).
+    pub fn new_err(message: impl ToString) -> PyErr {
+        PyErr::new::<FlussError, _>((message.to_string(), CLIENT_ERROR_CODE))
+    }
+
+    /// Convert a core error into a Python `FlussError`, picking the code per variant.
+    pub fn from_core_error(error: &Error) -> PyErr {
+        PyErr::new::<FlussError, _>(match error {
+            Error::FlussAPIError { api_error } => (api_error.message.clone(), api_error.code),
+            // Transport failures map to `NetworkException` (Java parity, retriable).
+            Error::RpcError { .. } => (error.to_string(), CoreFlussError::NetworkException.code()),
+            _ => (error.to_string(), CLIENT_ERROR_CODE),
+        })
+    }
+}
+
+/// Named constants for Fluss API error codes.
+///
+/// Server API errors have error_code > 0 or == -1.
+/// Client-side errors have error_code == CLIENT_ERROR (-2).
+/// These constants match the Rust core FlussError enum and are stable across protocol versions.
+/// New server error codes work automatically (error_code is a raw int, not a closed enum) —
+/// these constants are convenience names, not an exhaustive list.
+#[pyclass]
+pub struct ErrorCode;
+
+#[pymethods]
+impl ErrorCode {
+    /// Client-side error (not from server API protocol). Check the error message for details.
+    #[classattr]
+    const CLIENT_ERROR: i32 = CLIENT_ERROR_CODE;
+    /// No error.
+    #[classattr]
+    const NONE: i32 = 0;
+    /// The server experienced an unexpected error when processing the request.
+    #[classattr]
+    const UNKNOWN_SERVER_ERROR: i32 = -1;
+    /// The server disconnected before a response was received.
+    #[classattr]
+    const NETWORK_EXCEPTION: i32 = 1;
+    /// The version of API is not supported.
+    #[classattr]
+    const UNSUPPORTED_VERSION: i32 = 2;
+    /// This message has failed its CRC checksum, exceeds the valid size, or is otherwise corrupt.
+    #[classattr]
+    const CORRUPT_MESSAGE: i32 = 3;
+    /// The database does not exist.
+    #[classattr]
+    const DATABASE_NOT_EXIST: i32 = 4;
+    /// The database is not empty.
+    #[classattr]
+    const DATABASE_NOT_EMPTY: i32 = 5;
+    /// The database already exists.
+    #[classattr]
+    const DATABASE_ALREADY_EXIST: i32 = 6;
+    /// The table does not exist.
+    #[classattr]
+    const TABLE_NOT_EXIST: i32 = 7;
+    /// The table already exists.
+    #[classattr]
+    const TABLE_ALREADY_EXIST: i32 = 8;
+    /// The schema does not exist.
+    #[classattr]
+    const SCHEMA_NOT_EXIST: i32 = 9;
+    /// Exception occurred while storing data for log in server.
+    #[classattr]
+    const LOG_STORAGE_EXCEPTION: i32 = 10;
+    /// Exception occurred while storing data for kv in server.
+    #[classattr]
+    const KV_STORAGE_EXCEPTION: i32 = 11;
+    /// Not leader or follower.
+    #[classattr]
+    const NOT_LEADER_OR_FOLLOWER: i32 = 12;
+    /// The record is too large.
+    #[classattr]
+    const RECORD_TOO_LARGE_EXCEPTION: i32 = 13;
+    /// The record is corrupt.
+    #[classattr]
+    const CORRUPT_RECORD_EXCEPTION: i32 = 14;
+    /// The client has attempted to perform an operation on an invalid table.
+    #[classattr]
+    const INVALID_TABLE_EXCEPTION: i32 = 15;
+    /// The client has attempted to perform an operation on an invalid database.
+    #[classattr]
+    const INVALID_DATABASE_EXCEPTION: i32 = 16;
+    /// The replication factor is larger than the number of available tablet servers.
+    #[classattr]
+    const INVALID_REPLICATION_FACTOR: i32 = 17;
+    /// Produce request specified an invalid value for required acks.
+    #[classattr]
+    const INVALID_REQUIRED_ACKS: i32 = 18;
+    /// The log offset is out of range.
+    #[classattr]
+    const LOG_OFFSET_OUT_OF_RANGE_EXCEPTION: i32 = 19;
+    /// The table is not a primary key table.
+    #[classattr]
+    const NON_PRIMARY_KEY_TABLE_EXCEPTION: i32 = 20;
+    /// The table or bucket does not exist.
+    #[classattr]
+    const UNKNOWN_TABLE_OR_BUCKET_EXCEPTION: i32 = 21;
+    /// The update version is invalid.
+    #[classattr]
+    const INVALID_UPDATE_VERSION_EXCEPTION: i32 = 22;
+    /// The coordinator is invalid.
+    #[classattr]
+    const INVALID_COORDINATOR_EXCEPTION: i32 = 23;
+    /// The leader epoch is invalid.
+    #[classattr]
+    const FENCED_LEADER_EPOCH_EXCEPTION: i32 = 24;
+    /// The request timed out.
+    #[classattr]
+    const REQUEST_TIME_OUT: i32 = 25;
+    /// The general storage exception.
+    #[classattr]
+    const STORAGE_EXCEPTION: i32 = 26;
+    /// The server did not attempt to execute this operation.
+    #[classattr]
+    const OPERATION_NOT_ATTEMPTED_EXCEPTION: i32 = 27;
+    /// Records are written to the server already, but to fewer in-sync replicas than required.
+    #[classattr]
+    const NOT_ENOUGH_REPLICAS_AFTER_APPEND_EXCEPTION: i32 = 28;
+    /// Messages are rejected since there are fewer in-sync replicas than required.
+    #[classattr]
+    const NOT_ENOUGH_REPLICAS_EXCEPTION: i32 = 29;
+    /// Get file access security token exception.
+    #[classattr]
+    const SECURITY_TOKEN_EXCEPTION: i32 = 30;
+    /// The tablet server received an out of order sequence batch.
+    #[classattr]
+    const OUT_OF_ORDER_SEQUENCE_EXCEPTION: i32 = 31;
+    /// The tablet server received a duplicate sequence batch.
+    #[classattr]
+    const DUPLICATE_SEQUENCE_EXCEPTION: i32 = 32;
+    /// The tablet server could not locate the writer metadata.
+    #[classattr]
+    const UNKNOWN_WRITER_ID_EXCEPTION: i32 = 33;
+    /// The requested column projection is invalid.
+    #[classattr]
+    const INVALID_COLUMN_PROJECTION: i32 = 34;
+    /// The requested target column to write is invalid.
+    #[classattr]
+    const INVALID_TARGET_COLUMN: i32 = 35;
+    /// The partition does not exist.
+    #[classattr]
+    const PARTITION_NOT_EXISTS: i32 = 36;
+    /// The table is not partitioned.
+    #[classattr]
+    const TABLE_NOT_PARTITIONED_EXCEPTION: i32 = 37;
+    /// The timestamp is invalid.
+    #[classattr]
+    const INVALID_TIMESTAMP_EXCEPTION: i32 = 38;
+    /// The config is invalid.
+    #[classattr]
+    const INVALID_CONFIG_EXCEPTION: i32 = 39;
+    /// The lake storage is not configured.
+    #[classattr]
+    const LAKE_STORAGE_NOT_CONFIGURED_EXCEPTION: i32 = 40;
+    /// The kv snapshot does not exist.
+    #[classattr]
+    const KV_SNAPSHOT_NOT_EXIST: i32 = 41;
+    /// The partition already exists.
+    #[classattr]
+    const PARTITION_ALREADY_EXISTS: i32 = 42;
+    /// The partition spec is invalid.
+    #[classattr]
+    const PARTITION_SPEC_INVALID_EXCEPTION: i32 = 43;
+    /// There is no currently available leader for the given partition.
+    #[classattr]
+    const LEADER_NOT_AVAILABLE_EXCEPTION: i32 = 44;
+    /// Exceed the maximum number of partitions.
+    #[classattr]
+    const PARTITION_MAX_NUM_EXCEPTION: i32 = 45;
+    /// Authentication failed.
+    #[classattr]
+    const AUTHENTICATE_EXCEPTION: i32 = 46;
+    /// Security is disabled.
+    #[classattr]
+    const SECURITY_DISABLED_EXCEPTION: i32 = 47;
+    /// Authorization failed.
+    #[classattr]
+    const AUTHORIZATION_EXCEPTION: i32 = 48;
+    /// Exceed the maximum number of buckets.
+    #[classattr]
+    const BUCKET_MAX_NUM_EXCEPTION: i32 = 49;
+    /// The tiering epoch is invalid.
+    #[classattr]
+    const FENCED_TIERING_EPOCH_EXCEPTION: i32 = 50;
+    /// Authentication failed with retriable exception.
+    #[classattr]
+    const RETRIABLE_AUTHENTICATE_EXCEPTION: i32 = 51;
+    /// The server rack info is invalid.
+    #[classattr]
+    const INVALID_SERVER_RACK_INFO_EXCEPTION: i32 = 52;
+    /// The lake snapshot does not exist.
+    #[classattr]
+    const LAKE_SNAPSHOT_NOT_EXIST: i32 = 53;
+    /// The lake table already exists.
+    #[classattr]
+    const LAKE_TABLE_ALREADY_EXIST: i32 = 54;
+    /// The new ISR contains at least one ineligible replica.
+    #[classattr]
+    const INELIGIBLE_REPLICA_EXCEPTION: i32 = 55;
+    /// The alter table is invalid.
+    #[classattr]
+    const INVALID_ALTER_TABLE_EXCEPTION: i32 = 56;
+    /// Deletion operations are disabled on this table.
+    #[classattr]
+    const DELETION_DISABLED_EXCEPTION: i32 = 57;
+}
diff --git a/fluss-rust/bindings/python/src/lib.rs b/fluss-rust/bindings/python/src/lib.rs
new file mode 100644
index 0000000000..2d71491a7a
--- /dev/null
+++ b/fluss-rust/bindings/python/src/lib.rs
@@ -0,0 +1,144 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::sync::LazyLock;
+
+pub use ::fluss as fcore;
+use pyo3::prelude::*;
+use tokio::runtime::Runtime;
+
+mod admin;
+mod config;
+mod connection;
+mod error;
+mod lookup;
+mod metadata;
+mod table;
+mod upsert;
+mod utils;
+mod write_handle;
+
+pub use admin::*;
+pub use config::*;
+pub use connection::*;
+pub use error::*;
+pub use lookup::*;
+pub use metadata::*;
+pub use table::*;
+pub use upsert::*;
+pub use utils::*;
+pub use write_handle::*;
+
+static TOKIO_RUNTIME: LazyLock<Runtime> = LazyLock::new(|| {
+    tokio::runtime::Builder::new_multi_thread()
+        .enable_all()
+        .build()
+        .expect("Failed to create Tokio runtime")
+});
+
+/// Offset specification for list_offsets(), matching Java's OffsetSpec.
+///
+/// Use factory methods to create instances:
+///   OffsetSpec.earliest()
+///   OffsetSpec.latest()
+///   OffsetSpec.timestamp(ts)
+#[pyclass]
+#[derive(Clone)]
+pub struct OffsetSpec {
+    pub(crate) inner: fcore::rpc::message::OffsetSpec,
+}
+
+#[pymethods]
+impl OffsetSpec {
+    /// Create an OffsetSpec for the earliest available offset.
+    ///
+    /// Wraps the core `Earliest` variant.
+    #[staticmethod]
+    fn earliest() -> Self {
+        let inner = fcore::rpc::message::OffsetSpec::Earliest;
+        Self { inner }
+    }
+
+    /// Create an OffsetSpec for the latest available offset.
+    #[staticmethod]
+    fn latest() -> Self {
+        let inner = fcore::rpc::message::OffsetSpec::Latest;
+        Self { inner }
+    }
+
+    /// Create an OffsetSpec for the offset at or after the given timestamp.
+    /// `ts` is handed to the core `Timestamp` variant unchanged.
+    #[staticmethod]
+    fn timestamp(ts: i64) -> Self {
+        let inner = fcore::rpc::message::OffsetSpec::Timestamp(ts);
+        Self { inner }
+    }
+
+    // Renders the factory call matching the wrapped variant.
+    fn __repr__(&self) -> String {
+        use fcore::rpc::message::OffsetSpec as Core;
+        match &self.inner {
+            Core::Earliest => String::from("OffsetSpec.earliest()"),
+            Core::Latest => String::from("OffsetSpec.latest()"),
+            Core::Timestamp(ts) => format!("OffsetSpec.timestamp({ts})"),
+        }
+    }
+}
+
+#[pymodule]
+fn _fluss(m: &Bound<'_, PyModule>) -> PyResult<()> {
+    // Register every Python-visible class on the module
+    m.add_class::<Config>()?;
+    m.add_class::<FlussConnection>()?;
+    m.add_class::<TablePath>()?;
+    m.add_class::<TableInfo>()?;
+    m.add_class::<TableDescriptor>()?;
+    m.add_class::<FlussAdmin>()?;
+    m.add_class::<FlussTable>()?;
+    m.add_class::<TableScan>()?;
+    m.add_class::<TableAppend>()?;
+    m.add_class::<TableUpsert>()?;
+    m.add_class::<TableLookup>()?;
+    m.add_class::<TablePrefixLookup>()?;
+    m.add_class::<AppendWriter>()?;
+    m.add_class::<UpsertWriter>()?;
+    m.add_class::<Lookuper>()?;
+    m.add_class::<PrefixLookuper>()?;
+    m.add_class::<Schema>()?;
+    m.add_class::<LogScanner>()?;
+    m.add_class::<LakeSnapshot>()?;
+    m.add_class::<TableBucket>()?;
+    m.add_class::<ChangeType>()?;
+    m.add_class::<ScanRecord>()?;
+    m.add_class::<ScanRecords>()?;
+    m.add_class::<RecordBatch>()?;
+    m.add_class::<PartitionInfo>()?;
+    m.add_class::<ServerNode>()?;
+    m.add_class::<OffsetSpec>()?;
+    m.add_class::<WriteResultHandle>()?;
+    m.add_class::<DatabaseDescriptor>()?;
+    m.add_class::<DatabaseInfo>()?;
+
+    // Expose the core EARLIEST_OFFSET constant as a module attribute
+    m.add("EARLIEST_OFFSET", fcore::client::EARLIEST_OFFSET)?;
+
+    // Register the FlussError exception type and the ErrorCode constants holder
+    m.add_class::<FlussError>()?;
+    m.add_class::<ErrorCode>()?;
+
+    Ok(())
+}
diff --git a/fluss-rust/bindings/python/src/lookup.rs b/fluss-rust/bindings/python/src/lookup.rs
new file mode 100644
index 0000000000..196faa1e81
--- /dev/null
+++ b/fluss-rust/bindings/python/src/lookup.rs
@@ -0,0 +1,214 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::table::{internal_row_to_dict, python_to_dense_generic_row};
+use crate::*;
+use pyo3_async_runtimes::tokio::future_into_py;
+use std::sync::Arc;
+use tokio::sync::Mutex;
+
+/// Python-facing handle for primary key lookups on a Fluss table.
+///
+/// The Lookuper caches key encoders and bucketing functions, making
+/// repeated lookups efficient. Create once and reuse for multiple lookups.
+///
+/// # Example (Python):
+///     lookuper = table.new_lookup().create_lookuper()
+///     result = await lookuper.lookup({"user_id": 1})
+///     result2 = await lookuper.lookup({"user_id": 2})  # Reuses cached encoders
+#[pyclass]
+pub struct Lookuper {
+    inner: Arc<Mutex<fcore::client::Lookuper>>, // core lookuper; each lookup future clones the Arc and locks it
+    table_info: Arc<fcore::metadata::TableInfo>, // table metadata used to convert keys in and rows out
+}
+
+#[pymethods]
+impl Lookuper {
+    /// Lookup a row by its primary key.
+    ///
+    /// Args:
+    ///     pk: A dict, list, or tuple containing only the primary key values.
+    ///         For dict: keys are PK column names.
+    ///         For list/tuple: values in PK column order.
+    ///
+    /// Returns:
+    ///     An awaitable resolving to a dict with the row data if found, None otherwise.
+    pub fn lookup<'py>(
+        &self,
+        py: Python<'py>,
+        pk: &Bound<'_, PyAny>,
+    ) -> PyResult<Bound<'py, PyAny>> {
+        let pk_indices = self.table_info.get_schema().primary_key_indexes();
+        let generic_row = python_to_dense_generic_row(pk, &self.table_info, &pk_indices)?; // key conversion errors surface here, before the future exists
+        let inner = self.inner.clone(); // Arc clones so the future owns its own handles
+        let table_info = self.table_info.clone();
+
+        future_into_py(py, async move {
+            // Core lookup; the mutex guard lives only inside this inner block
+            let result = {
+                let mut lookuper = inner.lock().await;
+                lookuper
+                    .lookup(&generic_row)
+                    .await
+                    .map_err(|e| FlussError::from_core_error(&e))?
+            };
+
+            // Pull the optional single row out of the lookup result
+            let row_opt = result
+                .get_single_row()
+                .map_err(|e| FlussError::from_core_error(&e))?;
+
+            // Attach to the Python interpreter to build the return value
+            Python::attach(|py| match row_opt {
+                Some(row) => internal_row_to_dict(py, &row, &table_info),
+                None => Ok(py.None()),
+            })
+        })
+    }
+
+    fn __repr__(&self) -> String { // fixed text; carries no per-instance detail
+        "Lookuper()".to_string()
+    }
+}
+
+impl Lookuper {
+    /// Build the binding-side Lookuper for one table from connection components.
+    ///
+    /// Core errors from `new_lookup()` / `create_lookuper()` are converted with `FlussError::from_core_error`.
+    pub fn new(
+        connection: &Arc<fcore::client::FlussConnection>,
+        metadata: Arc<fcore::client::Metadata>,
+        table_info: fcore::metadata::TableInfo,
+    ) -> PyResult<Self> {
+        // Run inside tokio runtime context because new_lookup()
+        // spawns a background task via tokio::spawn() in LookupClient::new().
+        let lookuper = TOKIO_RUNTIME.block_on(async {
+            let fluss_table =
+                fcore::client::FlussTable::new(connection, metadata, table_info.clone()); // clone: the original is stored on Self below
+            let table_lookup = fluss_table
+                .new_lookup()
+                .map_err(|e| FlussError::from_core_error(&e))?;
+            table_lookup
+                .create_lookuper()
+                .map_err(|e| FlussError::from_core_error(&e))
+        })?;
+
+        Ok(Self {
+            inner: Arc::new(Mutex::new(lookuper)), // wrapped so lookup futures can share and lock it
+            table_info: Arc::new(table_info),
+        })
+    }
+}
+
+/// Python-facing handle for prefix key lookups on a Fluss table.
+///
+/// Returns all rows whose primary key starts with the given prefix.
+/// Create once via `table.new_lookup().lookup_by(columns).create_lookuper()`
+/// and reuse for multiple lookups.
+#[pyclass]
+pub struct PrefixLookuper {
+    inner: Arc<Mutex<fcore::client::PrefixKeyLookuper>>, // core prefix lookuper; each lookup future clones the Arc and locks it
+    table_info: Arc<fcore::metadata::TableInfo>, // table metadata used to convert prefixes in and rows out
+    lookup_column_indices: Vec<usize>, // row-type positions of the prefix columns, resolved once in `new`
+}
+
+#[pymethods]
+impl PrefixLookuper {
+    /// Lookup all rows matching a prefix key.
+    ///
+    /// Args:
+    ///     prefix: A dict, list, or tuple containing only the prefix key values
+    ///         (the columns specified in lookup_by()).
+    ///         For dict: keys are prefix column names.
+    ///         For list/tuple: values in prefix column order.
+    ///
+    /// Returns:
+    ///     An awaitable resolving to a list of dicts, one per matching row. Empty list if no matches.
+    pub fn lookup<'py>(
+        &self,
+        py: Python<'py>,
+        prefix: &Bound<'_, PyAny>,
+    ) -> PyResult<Bound<'py, PyAny>> {
+        let generic_row =
+            python_to_dense_generic_row(prefix, &self.table_info, &self.lookup_column_indices)?; // conversion errors surface here, before the future exists
+        let inner = self.inner.clone(); // Arc clones so the future owns its own handles
+        let table_info = self.table_info.clone();
+
+        future_into_py(py, async move {
+            let result = { // the mutex guard lives only inside this inner block
+                let mut lookuper = inner.lock().await;
+                lookuper
+                    .lookup(&generic_row)
+                    .await
+                    .map_err(|e| FlussError::from_core_error(&e))?
+            };
+
+            let rows = result
+                .get_rows()
+                .map_err(|e| FlussError::from_core_error(&e))?;
+
+            Python::attach(|py| { // attach to the Python interpreter to build the result objects
+                let py_rows: Vec<Py<PyAny>> = rows
+                    .iter()
+                    .map(|row| internal_row_to_dict(py, row, &table_info))
+                    .collect::<PyResult<_>>()?; // the first row that fails to convert aborts the whole result
+                Ok(py_rows)
+            })
+        })
+    }
+
+    fn __repr__(&self) -> String { // fixed text; carries no per-instance detail
+        "PrefixLookuper()".to_string()
+    }
+}
+
+impl PrefixLookuper {
+    /// Resolve the prefix columns to row-type positions, then build the core prefix lookuper.
+    pub fn new(
+        connection: &Arc<fcore::client::FlussConnection>,
+        metadata: Arc<fcore::client::Metadata>,
+        table_info: fcore::metadata::TableInfo,
+        lookup_column_names: Vec<String>,
+    ) -> PyResult<Self> {
+        let row_type = table_info.row_type();
+        let mut lookup_column_indices: Vec<usize> = Vec::with_capacity(lookup_column_names.len());
+        for name in &lookup_column_names {
+            let position = row_type.get_field_index(name).ok_or_else(|| {
+                FlussError::new_err(format!("Unknown column name '{name}' for prefix lookup"))
+            })?;
+            lookup_column_indices.push(position);
+        }
+
+        // The core objects are created from inside `TOKIO_RUNTIME.block_on`.
+        let core_lookuper = TOKIO_RUNTIME.block_on(async {
+            let table = fcore::client::FlussTable::new(connection, metadata, table_info.clone());
+            let lookup_builder = table
+                .new_lookup()
+                .map_err(|e| FlussError::from_core_error(&e))?;
+            lookup_builder
+                .lookup_by(lookup_column_names)
+                .create_lookuper()
+                .map_err(|e| FlussError::from_core_error(&e))
+        })?;
+
+        Ok(Self {
+            inner: Arc::new(Mutex::new(core_lookuper)),
+            table_info: Arc::new(table_info),
+            lookup_column_indices,
+        })
+    }
+}
diff --git a/fluss-rust/bindings/python/src/metadata.rs b/fluss-rust/bindings/python/src/metadata.rs
new file mode 100644
index 0000000000..7b6129a489
--- /dev/null
+++ b/fluss-rust/bindings/python/src/metadata.rs
@@ -0,0 +1,767 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::*;
+use pyo3::types::PyDict;
+use std::collections::HashMap;
+
+/// Kind of change a log record represents; exposed to Python as an int-comparable enum (0..=4)
+#[pyclass(eq, eq_int)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub enum ChangeType {
+    /// Append-only operation (short form "+A")
+    AppendOnly = 0,
+    /// Insert operation (short form "+I")
+    Insert = 1,
+    /// Update operation containing the previous content of the updated row (short form "-U")
+    UpdateBefore = 2,
+    /// Update operation containing the new content of the updated row (short form "+U")
+    UpdateAfter = 3,
+    /// Delete operation (short form "-D")
+    Delete = 4,
+}
+
+#[pymethods]
+impl ChangeType {
+    /// Two-character code: a `+`/`-` sign followed by `A`, `I`, `U` or `D`.
+    pub fn short_string(&self) -> &'static str {
+        match *self {
+            Self::AppendOnly => "+A",
+            Self::Insert => "+I",
+            Self::UpdateBefore => "-U",
+            Self::UpdateAfter => "+U",
+            Self::Delete => "-D",
+        }
+    }
+    /// `str(change_type)` yields the same code as `short_string()`.
+    fn __str__(&self) -> &'static str {
+        Self::short_string(self)
+    }
+    /// `repr(change_type)` is `ChangeType.` followed by the variant's Debug name.
+    fn __repr__(&self) -> String {
+        format!("ChangeType.{:?}", self)
+    }
+}
+
+impl ChangeType {
+    /// Translate the core crate's `ChangeType` into this binding's enum, variant for variant.
+    pub fn from_core(change_type: fcore::record::ChangeType) -> Self {
+        match change_type {
+            fcore::record::ChangeType::AppendOnly => Self::AppendOnly,
+            fcore::record::ChangeType::Insert => Self::Insert,
+            fcore::record::ChangeType::UpdateBefore => Self::UpdateBefore,
+            fcore::record::ChangeType::UpdateAfter => Self::UpdateAfter,
+            fcore::record::ChangeType::Delete => Self::Delete,
+        }
+    }
+}
+
+/// Python-visible table identifier: a database name paired with a table name
+#[pyclass]
+#[derive(Clone)]
+pub struct TablePath {
+    database_name: String, // first component; printed before the dot in `table_path_str()`
+    table_name: String, // second component; printed after the dot in `table_path_str()`
+}
+
+#[pymethods]
+impl TablePath {
+    /// Python constructor: `TablePath(database_name, table_name)`.
+    #[new]
+    pub fn new(database_name: String, table_name: String) -> Self {
+        Self {
+            table_name,
+            database_name,
+        }
+    }
+
+    /// Read-only `database_name` property; hands back a copy of the stored string.
+    #[getter]
+    pub fn database_name(&self) -> String {
+        self.database_name.to_owned()
+    }
+
+    /// Read-only `table_name` property; hands back a copy of the stored string.
+    #[getter]
+    pub fn table_name(&self) -> String {
+        self.table_name.to_owned()
+    }
+
+    /// Dotted form `<database_name>.<table_name>`.
+    pub fn table_path_str(&self) -> String {
+        [self.database_name.as_str(), self.table_name.as_str()].join(".")
+    }
+    /// `str(path)` is the dotted form produced by `table_path_str()`.
+    pub fn __str__(&self) -> String {
+        Self::table_path_str(self)
+    }
+    /// `repr(path)` shows both components as single-quoted constructor arguments.
+    fn __repr__(&self) -> String {
+        format!("TablePath('{}', '{}')", &self.database_name, &self.table_name)
+    }
+
+    /// Hash for Python: database name, then table name, fed into a fresh `DefaultHasher`.
+    pub fn __hash__(&self) -> u64 {
+        use std::collections::hash_map::DefaultHasher;
+        use std::hash::{Hash, Hasher};
+        let mut state = DefaultHasher::new();
+        for part in [&self.database_name, &self.table_name] {
+            part.hash(&mut state);
+        }
+        state.finish()
+    }
+
+    /// Equality for Python: two paths are equal when both components match.
+    pub fn __eq__(&self, other: &TablePath) -> bool {
+        (&self.database_name, &self.table_name) == (&other.database_name, &other.table_name)
+    }
+}
+
+impl TablePath {
+    /// Build the core crate's `TablePath` from copies of both name components.
+    pub fn to_core(&self) -> fcore::metadata::TablePath {
+        let (db, tbl) = (self.database_name.clone(), self.table_name.clone());
+        fcore::metadata::TablePath::new(db, tbl)
+    }
+    /// Wrap a core `TablePath`, copying out its database and table names.
+    pub fn from_core(core_path: fcore::metadata::TablePath) -> Self {
+        Self::new(
+            core_path.database().to_string(),
+            core_path.table().to_string(),
+        )
+    }
+}
+
+/// Python-visible wrapper that owns a core Fluss table `Schema`
+#[pyclass]
+pub struct Schema {
+    __schema: fcore::metadata::Schema, // the wrapped core schema; every method below reads from it
+}
+
+#[pymethods]
+impl Schema {
+    /// Python constructor: build a Schema from a PyArrow schema and optional primary key names.
+    #[new]
+    #[pyo3(signature = (schema, primary_keys=None))]
+    pub fn new(
+        schema: Py<PyAny>, // PyArrow schema
+        primary_keys: Option<Vec<String>>,
+    ) -> PyResult<Self> {
+        let arrow_schema = crate::utils::Utils::pyarrow_to_arrow_schema(&schema)?;
+
+        // One Fluss column per Arrow field, added in field order.
+        let mut builder = fcore::metadata::Schema::builder();
+        for field in arrow_schema.fields() {
+            let fluss_data_type = crate::utils::Utils::arrow_field_to_fluss_type(field)?;
+            builder = builder.column(field.name(), fluss_data_type);
+            // A "comment" entry in the field metadata is forwarded via `with_comment`.
+            if let Some(comment) = field.metadata().get("comment") {
+                builder = builder.with_comment(comment);
+            }
+        }
+
+        // `None` and an empty list both leave the primary key unset.
+        if let Some(pk_columns) = primary_keys.filter(|keys| !keys.is_empty()) {
+            builder = builder.primary_key(pk_columns);
+        }
+
+        // Builder failures are reported as a FlussError carrying the builder's message.
+        let built = builder.build();
+        let core_schema =
+            built.map_err(|e| FlussError::new_err(format!("Failed to build schema: {e}")))?;
+
+        Ok(Self {
+            __schema: core_schema,
+        })
+    }
+
+    /// Column names, in the order `columns()` yields them.
+    fn get_column_names(&self) -> Vec<String> {
+        let mut names = Vec::new();
+        for col in self.__schema.columns().iter() {
+            names.push(col.name().to_string());
+        }
+        names
+    }
+
+    /// Column type strings, in the order `columns()` yields them.
+    fn get_column_types(&self) -> Vec<String> {
+        let mut type_names = Vec::new();
+        for col in self.__schema.columns().iter() {
+            type_names.push(Utils::datatype_to_string(col.data_type()));
+        }
+        type_names
+    }
+
+    /// Columns as `(name, type)` pairs, in the order `columns()` yields them.
+    ///
+    /// The type string is whatever `Utils::datatype_to_string` renders for the column.
+    fn get_columns(&self) -> Vec<(String, String)> {
+        // Same per-column conversions as `get_column_names` / `get_column_types`.
+        let mut pairs = Vec::new();
+        for col in self.__schema.columns().iter() {
+            let name = col.name().to_string();
+            let type_name = Utils::datatype_to_string(col.data_type());
+            pairs.push((name, type_name));
+        }
+        pairs
+    }
+
+    /// Primary key column names; an empty list when the schema defines no primary key.
+    fn get_primary_keys(&self) -> Vec<String> {
+        match self.__schema.primary_key() {
+            Some(pk) => pk.column_names().to_vec(),
+            None => Vec::new(),
+        }
+    }
+    /// `str(schema)`: the Debug rendering of the `(name, type)` pairs.
+    fn __str__(&self) -> String {
+        format!("Schema: columns={:?}", self.get_columns())
+    }
+}
+
+impl Schema {
+    /// Borrow the wrapped core Schema (no copy is made; callers clone if they need ownership)
+    pub fn to_core(&self) -> &fcore::metadata::Schema {
+        &self.__schema
+    }
+}
+
+/// Python-visible wrapper around the core table distribution (bucket keys and bucket count)
+#[pyclass]
+pub struct TableDistribution {
+    inner: fcore::metadata::TableDistribution, // the wrapped core value; both methods delegate to it
+}
+
+#[pymethods]
+impl TableDistribution {
+    /// Bucket key column names, returned as a copy of the core list
+    fn bucket_keys(&self) -> Vec<String> {
+        self.inner.bucket_keys().to_vec()
+    }
+
+    /// Bucket count as reported by the core value; `None` is passed through unchanged
+    fn bucket_count(&self) -> Option<i32> {
+        self.inner.bucket_count()
+    }
+}
+
+/// Python-visible wrapper that owns a core `TableDescriptor` (schema plus table settings)
+#[pyclass]
+#[derive(Clone)]
+pub struct TableDescriptor {
+    __tbl_desc: fcore::metadata::TableDescriptor, // the wrapped core descriptor, built once in `new`
+}
+
+#[pymethods]
+impl TableDescriptor {
+    /// Create a new TableDescriptor from a schema plus optional keyword settings.
+    ///
+    /// Recognised keywords: `partition_keys`, `bucket_count`, `bucket_keys`, `properties`,
+    /// `custom_properties`, `comment`, `log_format`, `kv_format`. A keyword that is omitted
+    /// or passed as `None` keeps its default; other keywords are not inspected.
+    #[new]
+    #[pyo3(signature = (schema, **kwargs))]
+    pub fn new(
+        schema: &Schema, // fluss schema
+        kwargs: Option<&Bound<'_, PyDict>>,
+    ) -> PyResult<Self> {
+        // Look up one keyword. A missing key and an explicit `None` both mean "not given";
+        // a failing dict lookup is propagated instead of being treated as absent.
+        fn kwarg<'py>(
+            kwargs: Option<&Bound<'py, PyDict>>,
+            name: &str,
+        ) -> PyResult<Option<Bound<'py, PyAny>>> {
+            match kwargs {
+                Some(dict) => Ok(dict.get_item(name)?.filter(|value| !value.is_none())),
+                None => Ok(None),
+            }
+        }
+
+        let mut partition_keys: Vec<String> = Vec::new();
+        let mut bucket_count = None;
+        let mut bucket_keys = Vec::new();
+        let mut properties: HashMap<String, String> = HashMap::new();
+        let mut custom_properties: HashMap<String, String> = HashMap::new();
+
+        if let Some(value) = kwarg(kwargs, "partition_keys")? {
+            partition_keys = value.extract()?;
+        }
+        if let Some(value) = kwarg(kwargs, "bucket_count")? {
+            bucket_count = Some(value.extract()?);
+        }
+        if let Some(value) = kwarg(kwargs, "bucket_keys")? {
+            bucket_keys = value.extract()?;
+        }
+        if let Some(value) = kwarg(kwargs, "properties")? {
+            properties = value.extract()?;
+        }
+        if let Some(value) = kwarg(kwargs, "custom_properties")? {
+            custom_properties = value.extract()?;
+        }
+
+        let mut builder = fcore::metadata::TableDescriptor::builder()
+            .schema(schema.to_core().clone())
+            .properties(properties)
+            .custom_properties(custom_properties)
+            .partitioned_by(partition_keys)
+            .distributed_by(bucket_count, bucket_keys);
+
+        // Optional settings are applied to the builder only when they were supplied.
+        if let Some(value) = kwarg(kwargs, "comment")? {
+            let comment: String = value.extract()?;
+            builder = builder.comment(&comment);
+        }
+        if let Some(value) = kwarg(kwargs, "log_format")? {
+            let format_str: String = value.extract()?;
+            let log_format = fcore::metadata::LogFormat::parse(&format_str)
+                .map_err(|e| FlussError::new_err(e.to_string()))?;
+            builder = builder.log_format(log_format);
+        }
+        if let Some(value) = kwarg(kwargs, "kv_format")? {
+            let format_str: String = value.extract()?;
+            let kv_format = fcore::metadata::KvFormat::parse(&format_str)
+                .map_err(|e| FlussError::new_err(e.to_string()))?;
+            builder = builder.kv_format(kv_format);
+        }
+
+        let core_descriptor = builder
+            .build()
+            .map_err(|e| FlussError::new_err(format!("Failed to build TableDescriptor: {e}")))?;
+
+        Ok(Self {
+            __tbl_desc: core_descriptor,
+        })
+    }
+
+    /// Get the schema of this table descriptor (a copy wrapped in a new `Schema` object)
+    pub fn get_schema(&self) -> PyResult<Schema> {
+        Ok(Schema {
+            __schema: self.__tbl_desc.schema().clone(),
+        })
+    }
+}
+
+impl TableDescriptor {
+    /// Borrow the wrapped core TableDescriptor (no copy is made)
+    pub fn to_core(&self) -> &fcore::metadata::TableDescriptor {
+        &self.__tbl_desc
+    }
+}
+
+/// Python-visible, read-only view over a core `TableInfo`
+#[pyclass]
+#[derive(Clone)]
+pub struct TableInfo {
+    __table_info: fcore::metadata::TableInfo, // the wrapped core value; every accessor delegates to it
+}
+
+#[pymethods]
+impl TableInfo {
+    /// Read-only `table_id` property
+    #[getter]
+    pub fn table_id(&self) -> i64 {
+        self.__table_info.get_table_id()
+    }
+
+    /// Read-only `schema_id` property
+    #[getter]
+    pub fn schema_id(&self) -> i32 {
+        self.__table_info.get_schema_id()
+    }
+
+    /// Read-only `table_path` property; returns a new `TablePath` built from a copy of the core path
+    #[getter]
+    pub fn table_path(&self) -> TablePath {
+        TablePath::from_core(self.__table_info.get_table_path().clone())
+    }
+
+    /// Read-only `created_time` property, passed through from the core value as an integer
+    #[getter]
+    pub fn created_time(&self) -> i64 {
+        self.__table_info.get_created_time()
+    }
+
+    /// Read-only `modified_time` property, passed through from the core value as an integer
+    #[getter]
+    pub fn modified_time(&self) -> i64 {
+        self.__table_info.get_modified_time()
+    }
+
+    /// Primary key column names (a copy of the core list)
+    pub fn get_primary_keys(&self) -> Vec<String> {
+        self.__table_info.get_primary_keys().clone()
+    }
+
+    /// Bucket key column names (a copy of the core list)
+    pub fn get_bucket_keys(&self) -> Vec<String> {
+        self.__table_info.get_bucket_keys().to_vec()
+    }
+
+    /// Partition key column names (a copy of the core list)
+    pub fn get_partition_keys(&self) -> Vec<String> {
+        self.__table_info.get_partition_keys().to_vec()
+    }
+
+    /// Read-only `num_buckets` property
+    #[getter]
+    pub fn num_buckets(&self) -> i32 {
+        self.__table_info.get_num_buckets()
+    }
+
+    /// Whether the core value reports a primary key for this table
+    pub fn has_primary_key(&self) -> bool {
+        self.__table_info.has_primary_key()
+    }
+
+    /// Whether the core value reports this table as partitioned
+    pub fn is_partitioned(&self) -> bool {
+        self.__table_info.is_partitioned()
+    }
+
+    /// Table properties as a dict (a copy of the core map)
+    pub fn get_properties(&self) -> std::collections::HashMap<String, String> {
+        self.__table_info.get_properties().clone()
+    }
+
+    /// Custom properties as a dict (a copy of the core map)
+    pub fn get_custom_properties(&self) -> std::collections::HashMap<String, String> {
+        self.__table_info.get_custom_properties().clone()
+    }
+
+    /// Read-only `comment` property; `None` when the core value has no comment
+    #[getter]
+    pub fn comment(&self) -> Option<String> {
+        self.__table_info.get_comment().map(|s| s.to_string())
+    }
+
+    /// A new `Schema` object wrapping a copy of the table's schema
+    pub fn get_schema(&self) -> Schema {
+        Schema {
+            __schema: self.__table_info.get_schema().clone(),
+        }
+    }
+
+    /// Column names, in the order the schema's `columns()` yields them
+    pub fn get_column_names(&self) -> Vec<String> {
+        self.__table_info
+            .get_schema()
+            .columns()
+            .iter()
+            .map(|col| col.name().to_string())
+            .collect()
+    }
+
+    /// Number of columns in the table's schema
+    pub fn get_column_count(&self) -> usize {
+        self.__table_info.get_schema().columns().len()
+    }
+}
+
+impl TableInfo {
+    /// Wrap a core TableInfo, taking ownership of it (internal use)
+    pub fn from_core(info: fcore::metadata::TableInfo) -> Self {
+        Self { __table_info: info }
+    }
+}
+
+/// A lake snapshot: a snapshot ID plus one offset per table bucket
+#[pyclass]
+#[derive(Clone)]
+pub struct LakeSnapshot {
+    snapshot_id: i64, // exposed to Python through the `snapshot_id` getter
+    table_buckets_offset: HashMap<fcore::metadata::TableBucket, i64>, // keyed by the core bucket type, not the Python wrapper
+}
+
+/// Identifies one bucket of a table: table ID, optional partition ID, and bucket ID
+#[pyclass]
+#[derive(Eq, Hash, PartialEq, Clone)]
+pub struct TableBucket {
+    table_id: i64, // exposed through the `table_id` getter
+    partition_id: Option<i64>, // `None` when built with `new`, `Some` when built with `with_partition`
+    bucket: i32, // exposed to Python as `bucket_id`
+}
+
+#[pymethods]
+impl TableBucket {
+    /// Python constructor `TableBucket(table_id, bucket)`; the partition ID is left unset.
+    #[new]
+    pub fn new(table_id: i64, bucket: i32) -> Self {
+        Self {
+            partition_id: None,
+            table_id,
+            bucket,
+        }
+    }
+
+    /// Static constructor for a bucket that belongs to partition `partition_id`.
+    #[staticmethod]
+    pub fn with_partition(table_id: i64, partition_id: i64, bucket: i32) -> Self {
+        // Start from the unpartitioned form and override only the partition ID.
+        Self {
+            partition_id: Some(partition_id),
+            ..Self::new(table_id, bucket)
+        }
+    }
+
+    /// Read-only `table_id` property.
+    #[getter]
+    pub fn table_id(&self) -> i64 {
+        self.table_id
+    }
+
+    /// Read-only `bucket_id` property (backed by the `bucket` field).
+    #[getter]
+    pub fn bucket_id(&self) -> i32 {
+        self.bucket
+    }
+
+    /// Read-only `partition_id` property; `None` for a bucket without a partition.
+    #[getter]
+    pub fn partition_id(&self) -> Option<i64> {
+        self.partition_id
+    }
+
+    /// `str(bucket)`: the partition ID is shown only when one is set.
+    pub fn __str__(&self) -> String {
+        let (table_id, bucket) = (self.table_id, self.bucket);
+        match self.partition_id {
+            None => format!(
+                "TableBucket(table_id={}, bucket={})",
+                table_id, bucket
+            ),
+            Some(partition_id) => format!(
+                "TableBucket(table_id={}, partition_id={}, bucket={})",
+                table_id, partition_id, bucket
+            ),
+        }
+    }
+
+    /// `repr(bucket)` is identical to `str(bucket)`.
+    pub fn __repr__(&self) -> String {
+        Self::__str__(self)
+    }
+
+    /// Hash for Python, computed with a fresh `DefaultHasher`.
+    pub fn __hash__(&self) -> u64 {
+        use std::collections::hash_map::DefaultHasher;
+        use std::hash::{Hash, Hasher};
+
+        // The derived `Hash` impl feeds table_id, partition_id and bucket
+        // into the hasher in declaration order.
+        let mut state = DefaultHasher::new();
+        self.hash(&mut state);
+        state.finish()
+    }
+
+    /// Equality for Python: all three fields must match.
+    pub fn __eq__(&self, other: &TableBucket) -> bool {
+        // The derived `PartialEq` impl compares table_id, partition_id
+        // and bucket field by field.
+        self == other
+    }
+}
+
+impl TableBucket {
+    /// Copy the three identifiers out of a core TableBucket (internal use).
+    pub fn from_core(bucket: fcore::metadata::TableBucket) -> Self {
+        let (table_id, partition_id) = (bucket.table_id(), bucket.partition_id());
+        Self {
+            table_id,
+            partition_id,
+            bucket: bucket.bucket_id(),
+        }
+    }
+
+    /// Build the equivalent core TableBucket (internal use).
+    ///
+    /// The optional partition ID is handed to the core constructor unchanged.
+    pub fn to_core(&self) -> fcore::metadata::TableBucket {
+        let (table_id, partition_id, bucket) = (self.table_id, self.partition_id, self.bucket);
+        fcore::metadata::TableBucket::new_with_partition(table_id, partition_id, bucket)
+    }
+}
+
+#[pymethods]
+impl LakeSnapshot {
+    /// Python constructor `LakeSnapshot(snapshot_id)`; starts with no bucket offsets.
+    #[new]
+    pub fn new(snapshot_id: i64) -> Self {
+        Self {
+            table_buckets_offset: HashMap::new(),
+            snapshot_id,
+        }
+    }
+
+    /// Read-only `snapshot_id` property.
+    #[getter]
+    pub fn snapshot_id(&self) -> i64 {
+        self.snapshot_id
+    }
+
+    /// Read-only property: a fresh dict mapping `TableBucket` objects to their offsets.
+    #[getter]
+    pub fn table_buckets_offset(&self, py: Python) -> PyResult<Py<PyAny>> {
+        let mapping = PyDict::new(py);
+        for (core_bucket, offset) in self.table_buckets_offset.iter() {
+            let key = Py::new(py, TableBucket::from_core(core_bucket.clone()))?;
+            mapping.set_item(key, *offset)?;
+        }
+        Ok(mapping.into())
+    }
+
+    /// Offset recorded for `bucket`, or `None` when the snapshot has no entry for it.
+    pub fn get_bucket_offset(&self, bucket: &TableBucket) -> Option<i64> {
+        // The map is keyed by the core bucket type, so convert before the lookup.
+        self.table_buckets_offset.get(&bucket.to_core()).copied()
+    }
+
+    /// List of `TableBucket` objects, one per key of the offset map.
+    pub fn get_table_buckets(&self, py: Python) -> PyResult<Vec<Py<PyAny>>> {
+        let mut handles: Vec<Py<PyAny>> = Vec::with_capacity(self.table_buckets_offset.len());
+        for core_bucket in self.table_buckets_offset.keys() {
+            let wrapped = Py::new(py, TableBucket::from_core(core_bucket.clone()))?;
+            handles.push(wrapped.into());
+        }
+        Ok(handles)
+    }
+
+    /// `str(snapshot)`: shows the snapshot ID and the number of buckets tracked.
+    pub fn __str__(&self) -> String {
+        let bucket_total = self.table_buckets_offset.len();
+        format!(
+            "LakeSnapshot(snapshot_id={}, buckets_count={})",
+            self.snapshot_id, bucket_total
+        )
+    }
+
+    /// `repr(snapshot)` is identical to `str(snapshot)`.
+    pub fn __repr__(&self) -> String {
+        Self::__str__(self)
+    }
+}
+
+impl LakeSnapshot {
+    /// Take over the ID and offset map of a core LakeSnapshot (internal use).
+    pub fn from_core(snapshot: fcore::metadata::LakeSnapshot) -> Self {
+        // Start from an empty wrapper, then move the core offset map in.
+        let mut wrapper = Self::new(snapshot.snapshot_id);
+        wrapper.table_buckets_offset = snapshot.table_buckets_offset;
+        wrapper
+    }
+}
+
+/// Python-visible wrapper around a core database descriptor (comment and custom properties)
+#[pyclass]
+#[derive(Clone)]
+pub struct DatabaseDescriptor {
+    __descriptor: fcore::metadata::DatabaseDescriptor, // the wrapped core value, built once in `new`
+}
+
+#[pymethods]
+impl DatabaseDescriptor {
+    /// Python constructor; both `comment` and `custom_properties` are optional.
+    #[new]
+    #[pyo3(signature = (comment=None, custom_properties=None))]
+    pub fn new(
+        comment: Option<String>,
+        custom_properties: Option<HashMap<String, String>>,
+    ) -> PyResult<Self> {
+        let builder = fcore::metadata::DatabaseDescriptor::builder();
+        let builder = match comment {
+            Some(text) => builder.comment(&text),
+            None => builder,
+        };
+        let builder = match custom_properties {
+            Some(props) => builder.custom_properties(props),
+            None => builder,
+        };
+        Ok(Self {
+            __descriptor: builder.build(),
+        })
+    }
+
+    /// Read-only `comment` property; `None` when no comment was set.
+    #[getter]
+    pub fn comment(&self) -> Option<String> {
+        self.__descriptor.comment().map(|s| s.to_string())
+    }
+
+    /// Custom properties as a dict (a copy of the stored map).
+    pub fn get_custom_properties(&self) -> HashMap<String, String> {
+        self.__descriptor.custom_properties().clone()
+    }
+
+    fn __repr__(&self) -> String {
+        let (comment, props) = (self.comment(), self.get_custom_properties());
+        format!("DatabaseDescriptor(comment={:?}, custom_properties={:?})", comment, props)
+    }
+}
+
+impl DatabaseDescriptor {
+    pub fn to_core(&self) -> &fcore::metadata::DatabaseDescriptor { // borrow the wrapped core descriptor; no copy is made
+        &self.__descriptor
+    }
+}
+
+/// Python-visible, read-only view over a core `DatabaseInfo`
+#[pyclass]
+pub struct DatabaseInfo {
+    __info: fcore::metadata::DatabaseInfo, // the wrapped core value; every accessor delegates to it
+}
+
+#[pymethods]
+impl DatabaseInfo {
+    /// Read-only `database_name` property; hands back an owned copy of the name.
+    #[getter]
+    pub fn database_name(&self) -> String {
+        self.__info.database_name().to_string()
+    }
+
+    /// A new `DatabaseDescriptor` wrapping a copy of the core descriptor.
+    pub fn get_database_descriptor(&self) -> DatabaseDescriptor {
+        let core_descriptor = self.__info.database_descriptor().clone();
+        DatabaseDescriptor {
+            __descriptor: core_descriptor,
+        }
+    }
+    /// Read-only `created_time` property, passed through from the core value.
+    #[getter]
+    pub fn created_time(&self) -> i64 {
+        self.__info.created_time()
+    }
+
+    /// Read-only `modified_time` property, passed through from the core value.
+    #[getter]
+    pub fn modified_time(&self) -> i64 {
+        self.__info.modified_time()
+    }
+
+    fn __repr__(&self) -> String {
+        let name = self.database_name();
+        let (created, modified) = (self.created_time(), self.modified_time());
+        format!(
+            "DatabaseInfo(database_name='{}', created_time={}, modified_time={})",
+            name, created, modified
+        )
+    }
+}
+
+impl DatabaseInfo {
+    pub fn from_core(info: fcore::metadata::DatabaseInfo) -> Self { // wrap a core DatabaseInfo, taking ownership of it
+        Self { __info: info }
+    }
+}
diff --git a/fluss-rust/bindings/python/src/table.rs b/fluss-rust/bindings/python/src/table.rs
new file mode 100644
index 0000000000..b30baeb5ca
--- /dev/null
+++ b/fluss-rust/bindings/python/src/table.rs
@@ -0,0 +1,2599 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::TOKIO_RUNTIME;
+use crate::*;
+use arrow::array::RecordBatch as ArrowRecordBatch;
+use arrow::record_batch::RecordBatchReader as _;
+use arrow_pyarrow::{FromPyArrow, ToPyArrow};
+use arrow_schema::SchemaRef;
+use fluss::record::to_arrow_schema;
+use indexmap::IndexMap;
+use pyo3::IntoPyObjectExt;
+use pyo3::exceptions::{PyIndexError, PyRuntimeError, PyTypeError};
+use pyo3::sync::PyOnceLock;
+use pyo3::types::{
+    IntoPyDict, PyBool, PyByteArray, PyBytes, PyDate, PyDateAccess, PyDateTime, PyDelta,
+    PyDeltaAccess, PyDict, PyList, PySequence, PySlice, PyString, PyTime, PyTimeAccess, PyTuple,
+    PyType, PyTzInfo,
+};
+use pyo3_async_runtimes::tokio::future_into_py;
+use std::collections::HashMap;
+use std::sync::Arc;
+use std::time::Duration;
+
+// Time conversion constants.
+// Each value is the number of the smaller unit contained in one of the larger unit.
+const MILLIS_PER_SECOND: i64 = 1_000;
+const MILLIS_PER_MINUTE: i64 = 60_000;
+const MILLIS_PER_HOUR: i64 = 3_600_000;
+const MICROS_PER_MILLI: i64 = 1_000;
+const MICROS_PER_SECOND: i64 = 1_000_000;
+const MICROS_PER_DAY: i64 = 86_400_000_000;
+const NANOS_PER_MILLI: i64 = 1_000_000;
+const NANOS_PER_MICRO: i64 = 1_000;
+// Default poll interval in milliseconds (one second).
+// NOTE(review): the use site is outside this section — confirm which poll call it feeds.
+const DEFAULT_POLL_INTERVAL_MS: i64 = 1000;
+
+/// Represents a single scan record with metadata.
+///
+/// Matches Rust/Java: offset, timestamp, change_type, row.
+/// The bucket is the key in ScanRecords, not on the individual record.
+#[pyclass]
+pub struct ScanRecord {
+    // `#[pyo3(get)]` exposes each of the next three fields as a read-only
+    // Python attribute of the same name.
+    #[pyo3(get)]
+    offset: i64,
+    #[pyo3(get)]
+    timestamp: i64,
+    #[pyo3(get)]
+    change_type: ChangeType,
+    /// Store row as a Python dict directly
+    // Not exposed via `#[pyo3(get)]`; Python reads it through the `row` getter.
+    row_dict: Py<PyDict>,
+}
+
+#[pymethods]
+impl ScanRecord {
+    /// The row payload as a Python dict (a new reference to the stored dict).
+    #[getter]
+    pub fn row(&self, py: Python) -> Py<PyDict> {
+        let stored = &self.row_dict;
+        stored.clone_ref(py)
+    }
+
+    fn __str__(&self) -> String {
+        // Only metadata is rendered; the row dict itself is left out.
+        let change = self.change_type.short_string();
+        format!(
+            "ScanRecord(offset={}, timestamp={}, change_type={})",
+            self.offset, self.timestamp, change
+        )
+    }
+
+    fn __repr__(&self) -> String {
+        // repr and str are intentionally the same text.
+        let text = self.__str__();
+        text
+    }
+}
+
+impl ScanRecord {
+    /// Build a Python-facing record from a core scan record.
+    ///
+    /// Every field of `row_type` is converted with `datum_to_python_value`
+    /// and stored in a fresh dict under the field's name; the first
+    /// conversion or insertion error aborts the whole construction.
+    pub fn from_core(
+        py: Python,
+        record: &fcore::record::ScanRecord,
+        row_type: &fcore::metadata::RowType,
+    ) -> PyResult<Self> {
+        let row_dict = PyDict::new(py);
+        let row = record.row();
+
+        row_type
+            .fields()
+            .iter()
+            .enumerate()
+            .try_for_each(|(pos, field)| -> PyResult<()> {
+                let value = datum_to_python_value(py, row, pos, field.data_type())?;
+                row_dict.set_item(field.name(), value)?;
+                Ok(())
+            })?;
+
+        let offset = record.offset();
+        let timestamp = record.timestamp();
+        let change_type = ChangeType::from_core(*record.change_type());
+        Ok(Self {
+            offset,
+            timestamp,
+            change_type,
+            row_dict: row_dict.unbind(),
+        })
+    }
+}
+
+/// Represents a batch of records with metadata
+#[pyclass]
+pub struct RecordBatch {
+    // Arrow payload; converted to a PyArrow object by the `batch` getter.
+    batch: Arc<ArrowRecordBatch>,
+    // The remaining fields are exposed to Python as read-only attributes
+    // through `#[pyo3(get)]`.
+    #[pyo3(get)]
+    bucket: TableBucket,
+    #[pyo3(get)]
+    base_offset: i64,
+    #[pyo3(get)]
+    last_offset: i64,
+}
+
+#[pymethods]
+impl RecordBatch {
+    /// Convert the stored Arrow batch into a PyArrow RecordBatch object.
+    ///
+    /// Raises a `FlussError` if the Arrow-to-PyArrow conversion fails.
+    #[getter]
+    pub fn batch(&self, py: Python) -> PyResult<Py<PyAny>> {
+        let arrow_batch: &ArrowRecordBatch = self.batch.as_ref();
+        match arrow_batch.to_pyarrow(py) {
+            Ok(converted) => Ok(converted.unbind()),
+            Err(e) => Err(FlussError::new_err(format!(
+                "Failed to convert batch: {e}"
+            ))),
+        }
+    }
+
+    fn __str__(&self) -> String {
+        let bucket = self.bucket.__str__();
+        let rows = self.batch.num_rows();
+        format!(
+            "RecordBatch(bucket={}, base_offset={}, last_offset={}, rows={})",
+            bucket, self.base_offset, self.last_offset, rows
+        )
+    }
+
+    fn __repr__(&self) -> String {
+        // repr and str are intentionally the same text.
+        let text = self.__str__();
+        text
+    }
+}
+
+impl RecordBatch {
+    /// Build a `RecordBatch` by consuming a core `ScanBatch`.
+    pub fn from_scan_batch(scan_batch: fcore::record::ScanBatch) -> Self {
+        // Read the metadata first: `into_batch()` consumes `scan_batch`.
+        let bucket = TableBucket::from_core(scan_batch.bucket().clone());
+        let base_offset = scan_batch.base_offset();
+        let last_offset = scan_batch.last_offset();
+        let batch = Arc::new(scan_batch.into_batch());
+
+        Self {
+            batch,
+            bucket,
+            base_offset,
+            last_offset,
+        }
+    }
+}
+
+/// A collection of scan records grouped by bucket.
+///
+/// Returned by `LogScanner.poll()`. Records are grouped by `TableBucket`.
+#[pyclass]
+pub struct ScanRecords {
+    // Records keyed by bucket. Flat indexing and iteration walk the buckets
+    // in this map's iteration order.
+    records_by_bucket: IndexMap<TableBucket, Vec<Py<ScanRecord>>>,
+    // Record count across all buckets; backs `count()`, `is_empty()` and
+    // `__len__`. `__getitem__` reports an internal error if it disagrees
+    // with the per-bucket lists.
+    total_count: usize,
+}
+
+#[pymethods]
+impl ScanRecords {
+    /// List of distinct buckets that have records in this result.
+    pub fn buckets(&self) -> Vec<TableBucket> {
+        self.records_by_bucket.keys().cloned().collect()
+    }
+
+    /// Get records for a specific bucket.
+    ///
+    /// Returns an empty list if the bucket is not present (matches Rust/Java behavior).
+    pub fn records(&self, py: Python, bucket: &TableBucket) -> Vec<Py<ScanRecord>> {
+        self.records_by_bucket
+            .get(bucket)
+            .map(|recs| recs.iter().map(|r| r.clone_ref(py)).collect())
+            .unwrap_or_default()
+    }
+
+    /// Total number of records across all buckets.
+    pub fn count(&self) -> usize {
+        self.total_count
+    }
+
+    /// Whether the result set is empty.
+    pub fn is_empty(&self) -> bool {
+        self.total_count == 0
+    }
+
+    fn __len__(&self) -> usize {
+        self.total_count
+    }
+
+    /// Type-dispatched indexing:
+    ///   records[0]       → ScanRecord (flat index)
+    ///   records[-1]      → ScanRecord (negative index)
+    ///   records[1:3]     → list[ScanRecord] (slice)
+    ///   records[bucket]  → list[ScanRecord] (by bucket)
+    ///
+    /// Raises `IndexError` for an out-of-range integer index and `TypeError`
+    /// for any other key type. An unknown bucket yields an empty list.
+    fn __getitem__(&self, py: Python, key: &Bound<'_, PyAny>) -> PyResult<Py<PyAny>> {
+        // Try integer index first
+        if let Ok(requested) = key.extract::<isize>() {
+            let len = self.total_count as isize;
+            // Negative indices count from the end, as with Python lists.
+            let normalized = if requested < 0 {
+                requested + len
+            } else {
+                requested
+            };
+            if normalized < 0 || normalized >= len {
+                // Report the index the caller actually passed, not the
+                // normalized one, so the message matches their code.
+                return Err(PyIndexError::new_err(format!(
+                    "index {requested} out of range for ScanRecords of size {len}"
+                )));
+            }
+            let idx = normalized as usize;
+            // Skip whole buckets until the one containing the flat index.
+            let mut offset = 0;
+            for recs in self.records_by_bucket.values() {
+                if idx < offset + recs.len() {
+                    return Ok(recs[idx - offset].clone_ref(py).into_any());
+                }
+                offset += recs.len();
+            }
+            return Err(PyRuntimeError::new_err(
+                "internal error: total_count out of sync with records",
+            ));
+        }
+        // Try slice
+        if let Ok(slice) = key.downcast::<PySlice>() {
+            let indices = slice.indices(self.total_count as isize)?;
+            // Flatten once so each selected element is a direct lookup
+            // instead of a fresh walk over every bucket.
+            let flat: Vec<&Py<ScanRecord>> = self.records_by_bucket.values().flatten().collect();
+            let mut result: Vec<Py<ScanRecord>> = Vec::new();
+            let mut i = indices.start;
+            while (indices.step > 0 && i < indices.stop) || (indices.step < 0 && i > indices.stop) {
+                if let Some(rec) = flat.get(i as usize) {
+                    result.push(rec.clone_ref(py));
+                }
+                // A very large step can overflow `isize`; overflowing means
+                // the next position is past the end, so the slice is done.
+                match i.checked_add(indices.step) {
+                    Some(next) => i = next,
+                    None => break,
+                }
+            }
+            return Ok(result.into_pyobject(py)?.into_any().unbind());
+        }
+        // Try TableBucket
+        if let Ok(bucket) = key.extract::<TableBucket>() {
+            let recs = self.records(py, &bucket);
+            return Ok(recs.into_pyobject(py)?.into_any().unbind());
+        }
+        Err(PyTypeError::new_err(
+            "index must be int, slice, or TableBucket",
+        ))
+    }
+
+    /// Support `bucket in records`.
+    fn __contains__(&self, bucket: &TableBucket) -> bool {
+        self.records_by_bucket.contains_key(bucket)
+    }
+
+    /// Mapping protocol: alias for `buckets()`.
+    pub fn keys(&self) -> Vec<TableBucket> {
+        self.buckets()
+    }
+
+    /// Mapping protocol: lazy iterator over record lists, one per bucket.
+    pub fn values(slf: Bound<'_, Self>) -> ScanRecordsBucketIter {
+        // Snapshot the keys; the temporary borrow ends with this statement,
+        // before `slf` is moved into the iterator.
+        let bucket_keys: Vec<TableBucket> = slf.borrow().buckets();
+        ScanRecordsBucketIter {
+            owner: slf.unbind(),
+            bucket_keys,
+            bucket_idx: 0,
+            with_keys: false,
+        }
+    }
+
+    /// Mapping protocol: lazy iterator over `(TableBucket, list[ScanRecord])` pairs.
+    pub fn items(slf: Bound<'_, Self>) -> ScanRecordsBucketIter {
+        // Snapshot the keys; the temporary borrow ends with this statement,
+        // before `slf` is moved into the iterator.
+        let bucket_keys: Vec<TableBucket> = slf.borrow().buckets();
+        ScanRecordsBucketIter {
+            owner: slf.unbind(),
+            bucket_keys,
+            bucket_idx: 0,
+            with_keys: true,
+        }
+    }
+
+    fn __str__(&self) -> String {
+        format!(
+            "ScanRecords(records={}, buckets={})",
+            self.total_count,
+            self.records_by_bucket.len()
+        )
+    }
+
+    fn __repr__(&self) -> String {
+        self.__str__()
+    }
+
+    /// Flat iterator over all records across all buckets (matches Java/Rust).
+    fn __iter__(slf: Bound<'_, Self>) -> ScanRecordsIter {
+        // Snapshot the keys; the temporary borrow ends with this statement,
+        // before `slf` is moved into the iterator.
+        let bucket_keys: Vec<TableBucket> = slf.borrow().buckets();
+        ScanRecordsIter {
+            owner: slf.unbind(),
+            bucket_keys,
+            bucket_idx: 0,
+            rec_idx: 0,
+        }
+    }
+}
+
+// Flat iterator state returned by `ScanRecords.__iter__`.
+#[pyclass]
+struct ScanRecordsIter {
+    // The `ScanRecords` being iterated; holding it keeps it alive.
+    owner: Py<ScanRecords>,
+    // Bucket keys cloned when the iterator was created.
+    bucket_keys: Vec<TableBucket>,
+    // Position in `bucket_keys` of the bucket currently being read.
+    bucket_idx: usize,
+    // Position of the next record inside the current bucket.
+    rec_idx: usize,
+}
+
+#[pymethods]
+impl ScanRecordsIter {
+    fn __iter__(slf: PyRef<'_, Self>) -> PyRef<'_, Self> {
+        slf
+    }
+
+    /// Yield the next record, moving on to the following bucket whenever the
+    /// current one is exhausted (or missing from the owner).
+    fn __next__(&mut self, py: Python) -> Option<Py<ScanRecord>> {
+        let owner = self.owner.borrow(py);
+        while let Some(bucket) = self.bucket_keys.get(self.bucket_idx) {
+            let rec_idx = self.rec_idx;
+            let candidate = owner
+                .records_by_bucket
+                .get(bucket)
+                .and_then(|recs| recs.get(rec_idx));
+            if let Some(rec) = candidate {
+                self.rec_idx += 1;
+                return Some(rec.clone_ref(py));
+            }
+            // Nothing left in this bucket: restart at the head of the next.
+            self.bucket_idx += 1;
+            self.rec_idx = 0;
+        }
+        None
+    }
+}
+
+/// Lazy iterator for `ScanRecords.items()` and `ScanRecords.values()`.
+///
+/// Yields one bucket at a time: `(TableBucket, list[ScanRecord])` for items,
+/// or `list[ScanRecord]` for values. Only materializes records for the
+/// current bucket on each `__next__` call.
+#[pyclass]
+pub struct ScanRecordsBucketIter {
+    // The `ScanRecords` being iterated; holding it keeps it alive.
+    owner: Py<ScanRecords>,
+    // Bucket keys cloned when the iterator was created.
+    bucket_keys: Vec<TableBucket>,
+    // Position in `bucket_keys` of the next bucket to yield.
+    bucket_idx: usize,
+    // `true` for `items()` (yield tuples), `false` for `values()` (yield lists).
+    with_keys: bool,
+}
+
+#[pymethods]
+impl ScanRecordsBucketIter {
+    fn __iter__(slf: PyRef<'_, Self>) -> PyRef<'_, Self> {
+        slf
+    }
+
+    /// Yield the next bucket's records.
+    ///
+    /// Returns `(TableBucket, list[ScanRecord])` when built by `items()` and
+    /// `list[ScanRecord]` when built by `values()`. A bucket missing from the
+    /// owner yields an empty list. Conversion failures are raised as Python
+    /// exceptions rather than panicking.
+    fn __next__(&mut self, py: Python) -> PyResult<Option<Py<PyAny>>> {
+        let bucket = match self.bucket_keys.get(self.bucket_idx) {
+            Some(bucket) => bucket.clone(),
+            // All buckets consumed: signals StopIteration to Python.
+            None => return Ok(None),
+        };
+        let owner = self.owner.borrow(py);
+        let recs = owner
+            .records_by_bucket
+            .get(&bucket)
+            .map(|recs| recs.iter().map(|r| r.clone_ref(py)).collect::<Vec<_>>())
+            .unwrap_or_default();
+        self.bucket_idx += 1;
+
+        let item = if self.with_keys {
+            (bucket, recs).into_pyobject(py)?.into_any().unbind()
+        } else {
+            recs.into_pyobject(py)?.into_any().unbind()
+        };
+        Ok(Some(item))
+    }
+}
+
+/// Represents a Fluss table for data operations
+#[pyclass]
+pub struct FlussTable {
+    // Shared core connection; cloned into the builders this table creates.
+    connection: Arc<fcore::client::FlussConnection>,
+    // Shared core metadata handle, passed along with the connection.
+    metadata: Arc<fcore::client::Metadata>,
+    // Core table info; cloned whenever a core `FlussTable` or builder is built.
+    table_info: fcore::metadata::TableInfo,
+    // Path returned by `get_table_path()` and shown in `__repr__`.
+    table_path: fcore::metadata::TablePath,
+    // Gates `new_lookup()` and `new_upsert()`, which reject non-primary-key tables.
+    has_primary_key: bool,
+}
+
+/// Builder for creating log scanners with flexible configuration.
+///
+/// Use this builder to configure projection, and in the future, filters
+/// before creating a log scanner.
+#[pyclass]
+pub struct TableScan {
+    // Shared core connection used to build the core table and fetch the admin client.
+    connection: Arc<fcore::client::FlussConnection>,
+    // Shared core metadata handle passed to the core `FlussTable`.
+    metadata: Arc<fcore::client::Metadata>,
+    // Table whose log will be scanned.
+    table_info: fcore::metadata::TableInfo,
+    // Column projection set by `project()` / `project_by_name()`; `None` scans all columns.
+    projection: Option<ProjectionType>,
+}
+
+/// Scanner type for internal use
+///
+/// Selects which core scanner `create_scanner_internal` builds.
+enum ScannerType {
+    // Record-by-record scanner (`create_log_scanner`).
+    Record,
+    // Arrow batch scanner (`create_record_batch_log_scanner`).
+    Batch,
+}
+
+#[pymethods]
+impl TableScan {
+    /// Restrict the scan to the columns at the given positions.
+    ///
+    /// Args:
+    ///     indices: List of column indices (0-based) to include in the scan.
+    ///
+    /// Returns:
+    ///     Self for method chaining.
+    pub fn project(mut slf: PyRefMut<'_, Self>, indices: Vec<usize>) -> PyRefMut<'_, Self> {
+        // Replaces any projection configured earlier on this builder.
+        let selection = ProjectionType::Indices(indices);
+        slf.projection = Some(selection);
+        slf
+    }
+
+    /// Restrict the scan to the columns with the given names.
+    ///
+    /// Args:
+    ///     names: List of column names to include in the scan.
+    ///
+    /// Returns:
+    ///     Self for method chaining.
+    pub fn project_by_name(mut slf: PyRefMut<'_, Self>, names: Vec<String>) -> PyRefMut<'_, Self> {
+        // Replaces any projection configured earlier on this builder.
+        let selection = ProjectionType::Names(names);
+        slf.projection = Some(selection);
+        slf
+    }
+
+    /// Create a record-based log scanner.
+    ///
+    /// Use this scanner with `poll()` to get individual records with metadata
+    /// (offset, timestamp, change_type).
+    ///
+    /// Returns:
+    ///     LogScanner for record-by-record scanning with `poll()`
+    pub fn create_log_scanner<'py>(&self, py: Python<'py>) -> PyResult<Bound<'py, PyAny>> {
+        let kind = ScannerType::Record;
+        self.create_scanner_internal(py, kind)
+    }
+
+    /// Create a batch-based log scanner.
+    ///
+    /// Use this scanner with `poll_arrow()` to get Arrow Tables, or with
+    /// `poll_record_batch()` to get individual batches with metadata.
+    ///
+    /// Returns:
+    ///     LogScanner for batch-based scanning with `poll_arrow()` or `poll_record_batch()`
+    pub fn create_record_batch_log_scanner<'py>(
+        &self,
+        py: Python<'py>,
+    ) -> PyResult<Bound<'py, PyAny>> {
+        let kind = ScannerType::Batch;
+        self.create_scanner_internal(py, kind)
+    }
+
+    fn __repr__(&self) -> String {
+        let path = &self.table_info.table_path;
+        format!("TableScan(table={}.{})", path.database(), path.table())
+    }
+}
+
+impl TableScan {
+    /// Shared implementation behind `create_log_scanner` and
+    /// `create_record_batch_log_scanner`.
+    ///
+    /// Hands an async block to `future_into_py` and returns the resulting
+    /// Python object. Inside the block the steps run in a fixed order, so the
+    /// first failing step decides which error surfaces: resolve projection
+    /// indices, apply the projection to the core scan, fetch the admin
+    /// client, compute the projected schema/row type, then create the core
+    /// scanner selected by `scanner_type`.
+    fn create_scanner_internal<'py>(
+        &self,
+        py: Python<'py>,
+        scanner_type: ScannerType,
+    ) -> PyResult<Bound<'py, PyAny>> {
+        // Clone everything the `async move` block needs so it owns its data
+        // and does not borrow from `self`.
+        let conn = self.connection.clone();
+        let metadata = self.metadata.clone();
+        let table_info = self.table_info.clone();
+        let projection = self.projection.clone();
+
+        future_into_py(py, async move {
+            let fluss_table = fcore::client::FlussTable::new(&conn, metadata, table_info.clone());
+
+            // Indices are resolved by reference first because `apply_projection`
+            // consumes `projection`.
+            let projection_indices = resolve_projection_indices(&projection, &table_info)?;
+            let table_scan = apply_projection(fluss_table.new_scan(), projection)?;
+
+            let admin = conn
+                .get_admin()
+                .map_err(|e| FlussError::from_core_error(&e))?;
+
+            // Schema and row type describing only the projected columns.
+            let (projected_schema, projected_row_type) =
+                calculate_projected_types(&table_info, projection_indices)?;
+
+            let scanner_kind = match scanner_type {
+                ScannerType::Record => {
+                    let s = table_scan
+                        .create_log_scanner()
+                        .map_err(|e| FlussError::from_core_error(&e))?;
+                    ScannerKind::Record(s)
+                }
+                ScannerType::Batch => {
+                    let s = table_scan
+                        .create_record_batch_log_scanner()
+                        .map_err(|e| FlussError::from_core_error(&e))?;
+                    ScannerKind::Batch(s)
+                }
+            };
+
+            let py_scanner = LogScanner::new(
+                scanner_kind,
+                admin,
+                table_info,
+                projected_schema,
+                Arc::new(projected_row_type),
+            );
+
+            // Attach to the interpreter to allocate the Python-side `LogScanner` object.
+            Python::attach(|py| Py::new(py, py_scanner))
+        })
+    }
+}
+
+/// Internal enum to represent different projection types
+#[derive(Clone)]
+enum ProjectionType {
+    // Columns selected by 0-based position (set by `TableScan.project`).
+    Indices(Vec<usize>),
+    // Columns selected by name (set by `TableScan.project_by_name`).
+    Names(Vec<String>),
+}
+
+/// Turn an optional projection into concrete column indices.
+///
+/// `None` stays `None`, an index projection is cloned as-is, and a name
+/// projection is resolved against the table schema's columns. The first name
+/// that matches no column produces a `FlussError`.
+fn resolve_projection_indices(
+    projection: &Option<ProjectionType>,
+    table_info: &fcore::metadata::TableInfo,
+) -> PyResult<Option<Vec<usize>>> {
+    match projection {
+        None => Ok(None),
+        Some(ProjectionType::Indices(indices)) => Ok(Some(indices.clone())),
+        Some(ProjectionType::Names(names)) => {
+            let schema = table_info.get_schema();
+            let columns = schema.columns();
+            names
+                .iter()
+                .map(|name| {
+                    columns
+                        .iter()
+                        .position(|c| c.name() == name)
+                        .ok_or_else(|| FlussError::new_err(format!("Column '{name}' not found")))
+                })
+                .collect::<PyResult<Vec<usize>>>()
+                .map(Some)
+        }
+    }
+}
+
+/// Apply an optional projection to a core table scan.
+///
+/// With no projection the scan is returned untouched; otherwise the matching
+/// core projection method is called and any core error becomes a `FlussError`.
+fn apply_projection(
+    table_scan: fcore::client::TableScan,
+    projection: Option<ProjectionType>,
+) -> PyResult<fcore::client::TableScan> {
+    let projected = match projection {
+        None => return Ok(table_scan),
+        Some(ProjectionType::Indices(indices)) => table_scan.project(&indices),
+        Some(ProjectionType::Names(names)) => {
+            // The core API takes borrowed `&str` names.
+            let name_refs: Vec<&str> = names.iter().map(String::as_str).collect();
+            table_scan.project_by_name(&name_refs)
+        }
+    };
+    projected.map_err(|e| FlussError::from_core_error(&e))
+}
+
+/// Calculate projected schema and row type from projection indices
+///
+/// With no projection, returns the table's full Arrow schema and a clone of
+/// its full row type. With a projection, both are rebuilt from the selected
+/// columns in the order given by `projection_indices`.
+///
+/// Returns a `FlussError` if the row type cannot be converted to an Arrow
+/// schema, or if any projection index is out of range (instead of panicking
+/// on the unchecked index).
+fn calculate_projected_types(
+    table_info: &fcore::metadata::TableInfo,
+    projection_indices: Option<Vec<usize>>,
+) -> PyResult<(SchemaRef, fcore::metadata::RowType)> {
+    let full_row_type = table_info.get_row_type();
+    let full_schema =
+        to_arrow_schema(full_row_type).map_err(|e| FlussError::from_core_error(&e))?;
+
+    let indices = match projection_indices {
+        Some(indices) => indices,
+        None => return Ok((full_schema, full_row_type.clone())),
+    };
+
+    // Both lookups below index without checking, so validate up front against
+    // the smaller of the two field counts.
+    let column_count = full_row_type
+        .fields()
+        .len()
+        .min(full_schema.fields().len());
+    if let Some(bad) = indices.iter().copied().find(|&i| i >= column_count) {
+        return Err(FlussError::new_err(format!(
+            "Projection index {bad} is out of range for a table with {column_count} columns"
+        )));
+    }
+
+    let arrow_fields: Vec<_> = indices
+        .iter()
+        .map(|&i| full_schema.field(i).clone())
+        .collect();
+    let row_fields: Vec<_> = indices
+        .iter()
+        .map(|&i| full_row_type.fields()[i].clone())
+        .collect();
+    Ok((
+        Arc::new(arrow_schema::Schema::new(arrow_fields)),
+        fcore::metadata::RowType::new(row_fields),
+    ))
+}
+
+#[pymethods]
+impl FlussTable {
+    /// Create a new table scan builder for configuring and creating log scanners.
+    ///
+    /// The builder starts with no projection. Optionally call `project()` or
+    /// `project_by_name()` on it, then `create_log_scanner()` or
+    /// `create_record_batch_log_scanner()`.
+    ///
+    /// Returns:
+    ///     TableScan builder for configuring the scanner.
+    pub fn new_scan(&self) -> TableScan {
+        TableScan {
+            connection: self.connection.clone(),
+            metadata: self.metadata.clone(),
+            table_info: self.table_info.clone(),
+            projection: None,
+        }
+    }
+
+    /// Create a new TableAppend builder for the table.
+    ///
+    /// Returns:
+    ///     TableAppend builder. Call `create_writer()` to get an AppendWriter.
+    fn new_append(&self) -> PyResult<TableAppend> {
+        // Keep the runtime entered until this function returns.
+        // NOTE(review): presumably the core builder needs a Tokio context — confirm.
+        let _guard = TOKIO_RUNTIME.enter();
+        let fluss_table = fcore::client::FlussTable::new(
+            &self.connection,
+            self.metadata.clone(),
+            self.table_info.clone(),
+        );
+
+        let table_append = fluss_table
+            .new_append()
+            .map_err(|e| FlussError::from_core_error(&e))?;
+
+        Ok(TableAppend {
+            inner: table_append,
+            table_info: self.table_info.clone(),
+        })
+    }
+
+    /// Get table information
+    pub fn get_table_info(&self) -> TableInfo {
+        TableInfo::from_core(self.table_info.clone())
+    }
+
+    /// Get table path
+    pub fn get_table_path(&self) -> TablePath {
+        TablePath::from_core(self.table_path.clone())
+    }
+
+    /// Check if table has primary key
+    pub fn has_primary_key(&self) -> bool {
+        self.has_primary_key
+    }
+
+    /// Create a new TableLookup builder for primary key lookups.
+    ///
+    /// This is only available for tables with a primary key; otherwise a
+    /// `FlussError` is raised.
+    ///
+    /// Returns:
+    ///     TableLookup builder. Call `create_lookuper()` to get a Lookuper.
+    pub fn new_lookup(&self) -> PyResult<TableLookup> {
+        if !self.has_primary_key {
+            return Err(FlussError::new_err(
+                "Lookup is only supported for primary key tables",
+            ));
+        }
+
+        Ok(TableLookup {
+            connection: self.connection.clone(),
+            metadata: self.metadata.clone(),
+            table_info: self.table_info.clone(),
+        })
+    }
+
+    /// Create a new TableUpsert builder for the table.
+    ///
+    /// This is only available for tables with a primary key; otherwise a
+    /// `FlussError` is raised.
+    ///
+    /// Returns:
+    ///     TableUpsert builder. Call `create_writer()` to get an UpsertWriter,
+    ///     or use `partial_update_by_name()` / `partial_update_by_index()` first.
+    pub fn new_upsert(&self) -> PyResult<TableUpsert> {
+        if !self.has_primary_key {
+            return Err(FlussError::new_err(
+                "Upsert is only supported for primary key tables",
+            ));
+        }
+
+        // Keep the runtime entered until this function returns.
+        // NOTE(review): presumably the core builder needs a Tokio context — confirm.
+        let _guard = TOKIO_RUNTIME.enter();
+        let fluss_table = fcore::client::FlussTable::new(
+            &self.connection,
+            self.metadata.clone(),
+            self.table_info.clone(),
+        );
+
+        let table_upsert = fluss_table
+            .new_upsert()
+            .map_err(|e| FlussError::from_core_error(&e))?;
+
+        // No partial-update columns yet: the builder starts as a full-row upsert.
+        Ok(TableUpsert {
+            inner: table_upsert,
+            table_info: self.table_info.clone(),
+            target_columns: None,
+        })
+    }
+
+    fn __repr__(&self) -> String {
+        format!(
+            "FlussTable(path={}.{})",
+            self.table_path.database(),
+            self.table_path.table()
+        )
+    }
+}
+
+impl FlussTable {
+    /// Assemble a `FlussTable` from its already-resolved parts.
+    ///
+    /// Rust-only constructor: it stores the arguments as given and performs
+    /// no validation or I/O.
+    pub fn new_table(
+        connection: Arc<fcore::client::FlussConnection>,
+        metadata: Arc<fcore::client::Metadata>,
+        table_info: fcore::metadata::TableInfo,
+        table_path: fcore::metadata::TablePath,
+        has_primary_key: bool,
+    ) -> Self {
+        FlussTable {
+            has_primary_key,
+            table_path,
+            table_info,
+            metadata,
+            connection,
+        }
+    }
+}
+
+/// Builder for creating an AppendWriter.
+///
+/// Obtain via `FlussTable.new_append()`, then call `create_writer()`.
+#[pyclass]
+pub struct TableAppend {
+    // Core append builder that actually creates the writer.
+    inner: fcore::client::TableAppend,
+    // Table info cloned into each `AppendWriter` this builder creates.
+    table_info: fcore::metadata::TableInfo,
+}
+
+#[pymethods]
+impl TableAppend {
+    /// Build an `AppendWriter` from the core builder.
+    ///
+    /// Raises a `FlussError` if the core builder fails to create the writer.
+    pub fn create_writer(&self) -> PyResult<AppendWriter> {
+        let core_writer = match self.inner.create_writer() {
+            Ok(writer) => writer,
+            Err(e) => return Err(FlussError::from_core_error(&e)),
+        };
+        let table_info = self.table_info.clone();
+        Ok(AppendWriter::from_core(core_writer, table_info))
+    }
+
+    fn __repr__(&self) -> String {
+        String::from("TableAppend()")
+    }
+}
+
+/// Builder for creating an UpsertWriter, with optional partial update configuration.
+///
+/// Obtain via `FlussTable.new_upsert()`, then optionally call
+/// `partial_update_by_name()` or `partial_update_by_index()`,
+/// then call `create_writer()`.
+#[pyclass]
+pub struct TableUpsert {
+    // Core upsert builder; the partial-update methods produce a new one
+    // rather than mutating it.
+    inner: fcore::client::TableUpsert,
+    // Table info cloned into derived builders and the created writer.
+    table_info: fcore::metadata::TableInfo,
+    /// Column indices for partial updates, tracked for Python's dict→GenericRow conversion.
+    // `None` until one of the partial-update methods is called.
+    target_columns: Option<Vec<usize>>,
+}
+
+#[pymethods]
+impl TableUpsert {
+    /// Configure partial update by column names.
+    ///
+    /// Only the specified columns will be updated on upsert.
+    ///
+    /// Args:
+    ///     columns: List of column names to update.
+    ///
+    /// Returns:
+    ///     A new TableUpsert configured for partial update.
+    pub fn partial_update_by_name(&self, columns: Vec<String>) -> PyResult<TableUpsert> {
+        // Let core validate the names first so its error wins over ours.
+        let name_refs: Vec<&str> = columns.iter().map(String::as_str).collect();
+        let updated = match self.inner.partial_update_with_column_names(&name_refs) {
+            Ok(builder) => builder,
+            Err(e) => return Err(FlussError::from_core_error(&e)),
+        };
+
+        // Mirror the name → index resolution for the Python row-conversion layer.
+        let row_type = self.table_info.row_type();
+        let mut indices: Vec<usize> = Vec::with_capacity(columns.len());
+        for name in &columns {
+            match row_type.get_field_index(name) {
+                Some(idx) => indices.push(idx),
+                None => {
+                    return Err(FlussError::new_err(format!(
+                        "Unknown column name '{name}' for partial update"
+                    )));
+                }
+            }
+        }
+
+        Ok(TableUpsert {
+            inner: updated,
+            table_info: self.table_info.clone(),
+            target_columns: Some(indices),
+        })
+    }
+
+    /// Configure partial update by column indices.
+    ///
+    /// Only the specified columns will be updated on upsert.
+    ///
+    /// Args:
+    ///     column_indices: List of column indices (0-based) to update.
+    ///
+    /// Returns:
+    ///     A new TableUpsert configured for partial update.
+    pub fn partial_update_by_index(&self, column_indices: Vec<usize>) -> PyResult<TableUpsert> {
+        // Core takes ownership of one copy and validates it; the original
+        // list is kept for the Python row-conversion layer.
+        let for_core = column_indices.clone();
+        let updated = match self.inner.partial_update(Some(for_core)) {
+            Ok(builder) => builder,
+            Err(e) => return Err(FlussError::from_core_error(&e)),
+        };
+        Ok(TableUpsert {
+            inner: updated,
+            table_info: self.table_info.clone(),
+            target_columns: Some(column_indices),
+        })
+    }
+
+    /// Build an `UpsertWriter` carrying this builder's table info and any
+    /// configured partial-update columns.
+    pub fn create_writer(&self) -> PyResult<crate::UpsertWriter> {
+        let table_info = self.table_info.clone();
+        let target_columns = self.target_columns.clone();
+        crate::UpsertWriter::new(&self.inner, table_info, target_columns)
+    }
+
+    fn __repr__(&self) -> String {
+        String::from("TableUpsert()")
+    }
+}
+
+/// Builder for creating a Lookuper.
+///
+/// Obtain via `FlussTable.new_lookup()`, then call `create_lookuper()`.
+#[pyclass]
+pub struct TableLookup {
+    // Shared core connection passed to `Lookuper::new`.
+    connection: Arc<fcore::client::FlussConnection>,
+    // Shared core metadata handle passed to `Lookuper::new`.
+    metadata: Arc<fcore::client::Metadata>,
+    // Table the lookuper will query.
+    table_info: fcore::metadata::TableInfo,
+}
+
+#[pymethods]
+impl TableLookup {
+    /// Build a `Lookuper` for this builder's table.
+    pub fn create_lookuper(&self) -> PyResult<crate::Lookuper> {
+        let metadata = self.metadata.clone();
+        let table_info = self.table_info.clone();
+        crate::Lookuper::new(&self.connection, metadata, table_info)
+    }
+
+    /// Switch to prefix-scan mode for the given lookup columns.
+    ///
+    /// The columns must be the table's partition keys (if any) plus the
+    /// bucket keys, in that order.
+    ///
+    /// Args:
+    ///     column_names: List of column names forming the prefix key.
+    ///
+    /// Returns:
+    ///     TablePrefixLookup builder. Call `create_lookuper()` to get a PrefixLookuper.
+    pub fn lookup_by(&self, column_names: Vec<String>) -> TablePrefixLookup {
+        // The new builder gets its own clones; this one stays usable.
+        let TableLookup {
+            connection,
+            metadata,
+            table_info,
+        } = self;
+        TablePrefixLookup {
+            connection: connection.clone(),
+            metadata: metadata.clone(),
+            table_info: table_info.clone(),
+            lookup_column_names: column_names,
+        }
+    }
+
+    fn __repr__(&self) -> String {
+        String::from("TableLookup()")
+    }
+}
+
+/// Builder for creating a PrefixLookuper.
+///
+/// Obtain via `TableLookup.lookup_by(columns)`, then call `create_lookuper()`.
+#[pyclass]
+pub struct TablePrefixLookup {
+    // Shared core connection passed to `PrefixLookuper::new`.
+    connection: Arc<fcore::client::FlussConnection>,
+    // Shared core metadata handle passed to `PrefixLookuper::new`.
+    metadata: Arc<fcore::client::Metadata>,
+    // Table the prefix lookuper will query.
+    table_info: fcore::metadata::TableInfo,
+    // Column names forming the prefix key, as given to `lookup_by`.
+    lookup_column_names: Vec<String>,
+}
+
+#[pymethods]
+impl TablePrefixLookup {
+    /// Build a `PrefixLookuper` for the configured table and lookup columns.
+    pub fn create_lookuper(&self) -> PyResult<crate::PrefixLookuper> {
+        let metadata = self.metadata.clone();
+        let table_info = self.table_info.clone();
+        let lookup_columns = self.lookup_column_names.clone();
+        crate::PrefixLookuper::new(&self.connection, metadata, table_info, lookup_columns)
+    }
+
+    fn __repr__(&self) -> String {
+        String::from("TablePrefixLookup()")
+    }
+}
+
+/// Writer for appending data to a Fluss table
+#[pyclass]
+pub struct AppendWriter {
+    // Shared core writer that performs the appends.
+    inner: Arc<fcore::client::AppendWriter>,
+    // Table info used to convert Python rows and to derive the expected Arrow schema.
+    table_info: fcore::metadata::TableInfo,
+}
+
+#[pymethods]
+impl AppendWriter {
+    /// Write every batch of an Arrow table (fire-and-forget; call flush() to ensure delivery).
+    ///
+    /// The table is split with its `to_batches()` method and each batch goes
+    /// through `write_arrow_batch`. The first failing batch stops the write.
+    pub fn write_arrow(&self, py: Python, table: Py<PyAny>) -> PyResult<()> {
+        let py_batches = table.call_method0(py, "to_batches")?;
+        let batches: Vec<Py<PyAny>> = py_batches.extract(py)?;
+
+        // Each write handle is dropped right away: bulk writes do not wait
+        // for per-batch acknowledgment.
+        batches
+            .into_iter()
+            .try_for_each(|batch| self.write_arrow_batch(py, batch).map(drop))
+    }
+
+    /// Append one PyArrow RecordBatch to the table.
+    ///
+    /// Returns:
+    ///     WriteResultHandle that can be ignored (fire-and-forget) or
+    ///     awaited via `handle.wait()` for server acknowledgment.
+    pub fn write_arrow_batch(&self, py: Python, batch: Py<PyAny>) -> PyResult<WriteResultHandle> {
+        // This shares the underlying Arrow buffers without copying data
+        let rust_batch = ArrowRecordBatch::from_pyarrow_bound(batch.bind(py))
+            .map_err(|e| FlussError::new_err(format!("Failed to convert RecordBatch: {e}")))?;
+
+        self.inner
+            .append_arrow_batch(rust_batch)
+            .map(WriteResultHandle::new)
+            .map_err(|e| FlussError::from_core_error(&e))
+    }
+
+    /// Append a single row to the table.
+    ///
+    /// Returns:
+    ///     WriteResultHandle that can be ignored (fire-and-forget) or
+    ///     awaited via `handle.wait()` for server acknowledgment.
+    pub fn append(&self, row: &Bound<'_, PyAny>) -> PyResult<WriteResultHandle> {
+        let generic_row = python_to_generic_row(row, &self.table_info)?;
+
+        let result_future = self
+            .inner
+            .append(&generic_row)
+            .map_err(|e| FlussError::from_core_error(&e))?;
+        Ok(WriteResultHandle::new(result_future))
+    }
+
+    /// Write Pandas DataFrame data
+    pub fn write_pandas(&self, py: Python, df: Py<PyAny>) -> PyResult<()> {
+        // Get the expected Arrow schema from the Fluss table
+        let row_type = self.table_info.get_row_type();
+        let expected_schema = fcore::record::to_arrow_schema(row_type)
+            .map_err(|e| FlussError::from_core_error(&e))?;
+
+        // Convert Arrow schema to PyArrow schema
+        let py_schema = expected_schema
+            .as_ref()
+            .to_pyarrow(py)
+            .map_err(|e| FlussError::new_err(format!("Failed to convert schema: {e}")))?;
+
+        // Import pyarrow module
+        let pyarrow = py.import("pyarrow")?;
+
+        // Get the Table class from pyarrow module
+        let table_class = pyarrow.getattr("Table")?;
+
+        // Call Table.from_pandas(df, schema=expected_schema) to ensure proper type casting
+        let pa_table = table_class.call_method(
+            "from_pandas",
+            (df,),
+            Some(&[("schema", py_schema)].into_py_dict(py)?),
+        )?;
+
+        // Then call write_arrow with the converted table
+        self.write_arrow(py, pa_table.into())
+    }
+
+    /// Flush any pending data
+    pub fn flush<'py>(&self, py: Python<'py>) -> PyResult<Bound<'py, PyAny>> {
+        let inner = self.inner.clone();
+        future_into_py(py, async move {
+            inner
+                .flush()
+                .await
+                .map_err(|e| FlussError::from_core_error(&e))
+        })
+    }
+
+    // Enter the async runtime context (for 'async with' statement)
+    fn __aenter__<'py>(slf: PyRef<'py, Self>, py: Python<'py>) -> PyResult<Bound<'py, PyAny>> {
+        let py_slf = slf.into_pyobject(py)?.unbind();
+        future_into_py(py, async move { Ok(py_slf) })
+    }
+
+    // Exit the async runtime context (for 'async with' statement)
+    /// On exit, the writer is automatically flushed.
+    #[pyo3(signature = (exc_type=None, _exc_value=None, _traceback=None))]
+    fn __aexit__<'py>(
+        &self,
+        py: Python<'py>,
+        exc_type: Option<Bound<'py, PyAny>>,
+        _exc_value: Option<Bound<'py, PyAny>>,
+        _traceback: Option<Bound<'py, PyAny>>,
+    ) -> PyResult<Bound<'py, PyAny>> {
+        let inner = self.inner.clone();
+        let is_exc_none = exc_type.as_ref().is_none_or(|e| e.is_none());
+        future_into_py(py, async move {
+            let res = inner.flush().await;
+            if let Err(e) = res {
+                if is_exc_none {
+                    return Err(FlussError::from_core_error(&e));
+                }
+            }
+            Ok(false)
+        })
+    }
+
+    fn __repr__(&self) -> String {
+        "AppendWriter()".to_string()
+    }
+}
+
+impl AppendWriter {
+    /// Wrap a core append writer and its table info in the Python-facing writer.
+    pub fn from_core(
+        append: fcore::client::AppendWriter,
+        table_info: fcore::metadata::TableInfo,
+    ) -> Self {
+        let inner = Arc::new(append);
+        Self { inner, table_info }
+    }
+}
+
+/// Represents different input shapes for a row.
+///
+/// Extracted from an arbitrary Python object via the derived `FromPyObject`;
+/// any object that is not a dict, tuple or list fails extraction.
+#[derive(FromPyObject)]
+enum RowInput<'py> {
+    // Values keyed by column name.
+    Dict(Bound<'py, PyDict>),
+    // Positional values.
+    Tuple(Bound<'py, PyTuple>),
+    // Positional values.
+    List(Bound<'py, PyList>),
+}
+
+/// Convert a Python row (dict/list/tuple) into a GenericRow that must supply
+/// a value for every column of the table schema.
+pub fn python_to_generic_row(
+    row: &Bound<PyAny>,
+    table_info: &fcore::metadata::TableInfo,
+) -> PyResult<fcore::row::GenericRow<'static>> {
+    let column_count = table_info.row_type().fields().len();
+    let every_column = (0..column_count).collect::<Vec<usize>>();
+    python_to_sparse_generic_row(row, table_info, &every_column)
+}
+
+/// Process a Python sequence (list or tuple) into datums at the target column positions.
+///
+/// The sequence must contain exactly one element per entry of `target_indices`;
+/// element `i` is converted using the data type of `fields[target_indices[i]]`.
+/// When `sparse` is true the datum is stored at the schema index
+/// (`target_indices[i]`), otherwise at position `i`.
+///
+/// Returns an error if the length does not match or if an element cannot be
+/// converted; conversion errors are prefixed with the field name.
+fn process_sequence(
+    seq: &Bound<PySequence>,
+    target_indices: &[usize],
+    fields: &[fcore::metadata::DataField],
+    datums: &mut [fcore::row::Datum<'static>],
+    sparse: bool,
+) -> PyResult<()> {
+    // Query the Python length once: `len()` is a fallible call into the
+    // interpreter, and a second call inside the error path could fail and
+    // hide the length-mismatch error it was meant to report.
+    let actual_len = seq.len()?;
+    let expected_len = target_indices.len();
+    if actual_len != expected_len {
+        return Err(FlussError::new_err(format!(
+            "Expected {expected_len} elements, got {actual_len}"
+        )));
+    }
+    for (i, &col_idx) in target_indices.iter().enumerate() {
+        let field = &fields[col_idx];
+        let value = seq.get_item(i)?;
+        // Sparse rows are full width (schema position); dense rows are packed.
+        let dest = if sparse { col_idx } else { i };
+        datums[dest] = python_value_to_datum(&value, field.data_type())
+            .map_err(|e| FlussError::new_err(format!("Field '{}': {}", field.name(), e)))?;
+    }
+    Ok(())
+}
+
+/// Convert a Python row into a full-width GenericRow: the supplied values
+/// are placed at the schema positions named by `target_indices`, and every
+/// other column is left as Null.
+pub fn python_to_sparse_generic_row(
+    row: &Bound<PyAny>,
+    table_info: &fcore::metadata::TableInfo,
+    target_indices: &[usize],
+) -> PyResult<fcore::row::GenericRow<'static>> {
+    let sparse = true;
+    python_to_generic_row_inner(row, table_info, target_indices, sparse)
+}
+
+/// Convert a Python row into a packed GenericRow holding exactly
+/// `target_indices.len()` values, in the order of `target_indices`.
+pub fn python_to_dense_generic_row(
+    row: &Bound<PyAny>,
+    table_info: &fcore::metadata::TableInfo,
+    target_indices: &[usize],
+) -> PyResult<fcore::row::GenericRow<'static>> {
+    let sparse = false;
+    python_to_generic_row_inner(row, table_info, target_indices, sparse)
+}
+
+/// Build a GenericRow from user input (dict, list or tuple).
+///
+/// When `sparse` is true, the row is full width and padded with nulls: each
+/// value is stored at its schema index. When `sparse` is false, the row has
+/// exactly `target_indices.len()` values stored in `target_indices` order.
+///
+/// Dict input must contain exactly the target column names as string keys;
+/// list/tuple input must contain one element per target index.
+///
+/// Returns a FlussError for an unsupported row type, non-string or unknown
+/// dict keys, missing fields, a wrong element count, or a value that cannot
+/// be converted to the column's data type.
+///
+/// Note: `fields[i]` indexes directly, so an index in `target_indices` that
+/// is outside the schema panics rather than returning an error.
+fn python_to_generic_row_inner(
+    row: &Bound<PyAny>,
+    table_info: &fcore::metadata::TableInfo,
+    target_indices: &[usize],
+    sparse: bool,
+) -> PyResult<fcore::row::GenericRow<'static>> {
+    let row_type = table_info.row_type();
+    let fields = row_type.fields();
+    // Column names in the same order as `target_indices`.
+    let target_names: Vec<&str> = target_indices.iter().map(|&i| fields[i].name()).collect();
+
+    let num_fields = if sparse {
+        fields.len()
+    } else {
+        target_indices.len()
+    };
+    // Start from all-Null so untouched columns of a sparse row stay Null.
+    let mut datums: Vec<fcore::row::Datum<'static>> = vec![fcore::row::Datum::Null; num_fields];
+
+    // Replace the generic extraction failure with a message naming the offending type.
+    let row_input: RowInput = row.extract().map_err(|_| {
+        let type_name = row
+            .get_type()
+            .name()
+            .map(|n| n.to_string())
+            .unwrap_or_else(|_| "unknown".to_string());
+        FlussError::new_err(format!(
+            "Row must be a dict, list, or tuple; got {type_name}"
+        ))
+    })?;
+
+    match row_input {
+        RowInput::Dict(dict) => {
+            // First pass: reject non-string keys and keys that are not target columns.
+            for (k, _) in dict.iter() {
+                let key_str = k.extract::<&str>().map_err(|_| {
+                    let key_type = k
+                        .get_type()
+                        .name()
+                        .map(|n| n.to_string())
+                        .unwrap_or_else(|_| "unknown".to_string());
+                    FlussError::new_err(format!("Dict keys must be strings; got {key_type}"))
+                })?;
+                if !target_names.contains(&key_str) {
+                    return Err(FlussError::new_err(format!(
+                        "Unknown field '{}'. Expected: {}",
+                        key_str,
+                        target_names.join(", ")
+                    )));
+                }
+            }
+            // Second pass: every target column must be present; convert each value.
+            for (i, &col_idx) in target_indices.iter().enumerate() {
+                let name = target_names[i];
+                let field = &fields[col_idx];
+                let value = dict
+                    .get_item(name)?
+                    .ok_or_else(|| FlussError::new_err(format!("Missing field: {name}")))?;
+                // Sparse rows use the schema position; dense rows are packed.
+                let dest = if sparse { col_idx } else { i };
+                datums[dest] = python_value_to_datum(&value, field.data_type())
+                    .map_err(|e| FlussError::new_err(format!("Field '{name}': {e}")))?;
+            }
+        }
+
+        // Lists and tuples share the positional conversion in `process_sequence`.
+        RowInput::List(list) => {
+            process_sequence(
+                list.as_sequence(),
+                target_indices,
+                fields,
+                &mut datums,
+                sparse,
+            )?;
+        }
+
+        RowInput::Tuple(tuple) => {
+            process_sequence(
+                tuple.as_sequence(),
+                target_indices,
+                fields,
+                &mut datums,
+                sparse,
+            )?;
+        }
+    }
+
+    Ok(fcore::row::GenericRow { values: datums })
+}
+
+/// Convert Python value to Datum based on data type.
+///
+/// Python `None` always maps to `Datum::Null`, regardless of `data_type`.
+/// Array values are converted element by element through a recursive call
+/// using the array's element type.
+///
+/// Returns an error when the value does not match the expected Python type
+/// for the column, or when `data_type` has no branch here.
+fn python_value_to_datum(
+    value: &Bound<PyAny>,
+    data_type: &fcore::metadata::DataType,
+) -> PyResult<fcore::row::Datum<'static>> {
+    use fcore::row::{Datum, F32, F64};
+
+    if value.is_none() {
+        return Ok(Datum::Null);
+    }
+
+    match data_type {
+        fcore::metadata::DataType::Boolean(_) => {
+            let v: bool = value.extract()?;
+            Ok(Datum::Bool(v))
+        }
+        fcore::metadata::DataType::TinyInt(_) => {
+            // Strict type checking: reject bool for int columns
+            if value.is_instance_of::<PyBool>() {
+                return Err(FlussError::new_err(
+                    "Expected int for TinyInt column, got bool. Use 0 or 1 explicitly.".to_string(),
+                ));
+            }
+            let v: i8 = value.extract()?;
+            Ok(Datum::Int8(v))
+        }
+        fcore::metadata::DataType::SmallInt(_) => {
+            // Same bool rejection as TinyInt.
+            if value.is_instance_of::<PyBool>() {
+                return Err(FlussError::new_err(
+                    "Expected int for SmallInt column, got bool. Use 0 or 1 explicitly."
+                        .to_string(),
+                ));
+            }
+            let v: i16 = value.extract()?;
+            Ok(Datum::Int16(v))
+        }
+        fcore::metadata::DataType::Int(_) => {
+            // Same bool rejection as TinyInt.
+            if value.is_instance_of::<PyBool>() {
+                return Err(FlussError::new_err(
+                    "Expected int for Int column, got bool. Use 0 or 1 explicitly.".to_string(),
+                ));
+            }
+            let v: i32 = value.extract()?;
+            Ok(Datum::Int32(v))
+        }
+        fcore::metadata::DataType::BigInt(_) => {
+            // Same bool rejection as TinyInt.
+            if value.is_instance_of::<PyBool>() {
+                return Err(FlussError::new_err(
+                    "Expected int for BigInt column, got bool. Use 0 or 1 explicitly.".to_string(),
+                ));
+            }
+            let v: i64 = value.extract()?;
+            Ok(Datum::Int64(v))
+        }
+        // Unlike the integer branches, the float branches do not reject bool explicitly.
+        fcore::metadata::DataType::Float(_) => {
+            let v: f32 = value.extract()?;
+            Ok(Datum::Float32(F32::from(v)))
+        }
+        fcore::metadata::DataType::Double(_) => {
+            let v: f64 = value.extract()?;
+            Ok(Datum::Float64(F64::from(v)))
+        }
+        // Char is taken as a plain string; no length check is performed here.
+        fcore::metadata::DataType::String(_) | fcore::metadata::DataType::Char(_) => {
+            let v: String = value.extract()?;
+            Ok(v.into())
+        }
+        fcore::metadata::DataType::Bytes(_) | fcore::metadata::DataType::Binary(_) => {
+            // Efficient extraction: downcast to specific type and use bulk copy.
+            // PyBytes::as_bytes() and PyByteArray::to_vec() are O(n) bulk copies of the underlying data.
+            if let Ok(bytes) = value.downcast::<PyBytes>() {
+                Ok(bytes.as_bytes().to_vec().into())
+            } else if let Ok(bytearray) = value.downcast::<PyByteArray>() {
+                Ok(bytearray.to_vec().into())
+            } else {
+                Err(FlussError::new_err(format!(
+                    "Expected bytes or bytearray, got {}",
+                    value.get_type().name()?
+                )))
+            }
+        }
+        fcore::metadata::DataType::Decimal(decimal_type) => {
+            python_decimal_to_datum(value, decimal_type.precision(), decimal_type.scale())
+        }
+        fcore::metadata::DataType::Date(_) => python_date_to_datum(value),
+        fcore::metadata::DataType::Time(_) => python_time_to_datum(value),
+        fcore::metadata::DataType::Timestamp(_) => python_datetime_to_timestamp_ntz(value),
+        fcore::metadata::DataType::TimestampLTz(_) => python_datetime_to_timestamp_ltz(value),
+        fcore::metadata::DataType::Array(array_type) => {
+            let element_type = array_type.get_element_type();
+            // Checked before the sequence downcast so a str is never treated
+            // as a sequence of characters.
+            if value.is_instance_of::<PyString>() {
+                return Err(FlussError::new_err(format!(
+                    "Expected sequence for Array column, got {}",
+                    get_type_name(value)
+                )));
+            }
+            let seq = value.downcast::<PySequence>().map_err(|_| {
+                FlussError::new_err(format!(
+                    "Expected sequence for Array column, got {}",
+                    get_type_name(value)
+                ))
+            })?;
+
+            let len = seq.len()?;
+            let mut writer = fcore::row::binary_array::FlussArrayWriter::new(len, element_type);
+
+            for i in 0..len {
+                let item = seq.get_item(i)?;
+                if item.is_none() {
+                    writer.set_null_at(i);
+                } else {
+                    // Convert the element recursively, then pick the writer
+                    // method that matches the resulting datum variant.
+                    let val_datum = python_value_to_datum(&item, element_type)?;
+                    match val_datum {
+                        Datum::Null => writer.set_null_at(i),
+                        Datum::Bool(v) => writer.write_boolean(i, v),
+                        Datum::Int8(v) => writer.write_byte(i, v),
+                        Datum::Int16(v) => writer.write_short(i, v),
+                        Datum::Int32(v) => writer.write_int(i, v),
+                        Datum::Int64(v) => writer.write_long(i, v),
+                        Datum::Float32(v) => writer.write_float(i, v.into_inner()),
+                        Datum::Float64(v) => writer.write_double(i, v.into_inner()),
+                        Datum::String(v) => writer.write_string(i, &v),
+                        Datum::Blob(v) => writer.write_binary_bytes(i, v.as_ref()),
+                        // For the three precision-carrying variants below, the
+                        // precision is read from the element type. If the
+                        // pattern did not match, nothing would be written for
+                        // slot `i`; presumably unreachable since the datum was
+                        // produced from `element_type` — TODO confirm.
+                        Datum::Decimal(v) => {
+                            if let fcore::metadata::DataType::Decimal(dt) = element_type {
+                                writer.write_decimal(i, &v, dt.precision());
+                            }
+                        }
+                        Datum::Date(v) => writer.write_date(i, v),
+                        Datum::Time(v) => writer.write_time(i, v),
+                        Datum::TimestampNtz(v) => {
+                            if let fcore::metadata::DataType::Timestamp(dt) = element_type {
+                                writer.write_timestamp_ntz(i, &v, dt.precision());
+                            }
+                        }
+                        Datum::TimestampLtz(v) => {
+                            if let fcore::metadata::DataType::TimestampLTz(dt) = element_type {
+                                writer.write_timestamp_ltz(i, &v, dt.precision());
+                            }
+                        }
+                        Datum::Array(v) => writer.write_array(i, &v),
+                        Datum::Map(v) => writer.write_map(i, &v),
+                        Datum::Row(_) => {
+                            return Err(FlussError::new_err(
+                                "Row datum is not supported as an array element",
+                            ));
+                        }
+                    }
+                }
+            }
+
+            let array = writer
+                .complete()
+                .map_err(|e| FlussError::from_core_error(&e))?;
+            Ok(Datum::Array(array))
+        }
+        // Every data type without a branch above is rejected.
+        _ => Err(FlussError::new_err(format!(
+            "Unsupported data type for row-level operations: {data_type}"
+        ))),
+    }
+}
+
+/// Convert Rust Datum to Python value based on data type.
+/// This is the reverse of python_value_to_datum.
+///
+/// Reads the field at `pos` of `row` with the accessor matching `data_type`
+/// and converts it to a Python object. A null field yields Python `None`.
+/// Arrays become Python lists, converted element by element through a
+/// recursive call.
+///
+/// Returns a FlussError when the row accessor fails or when `data_type` has
+/// no branch here.
+pub fn datum_to_python_value(
+    py: Python,
+    row: &dyn fcore::row::InternalRow,
+    pos: usize,
+    data_type: &fcore::metadata::DataType,
+) -> PyResult<Py<PyAny>> {
+    use fcore::metadata::DataType;
+
+    // Check for null first
+    if row
+        .is_null_at(pos)
+        .map_err(|e| FlussError::from_core_error(&e))?
+    {
+        return Ok(py.None());
+    }
+
+    match data_type {
+        // Primitive branches: read the typed value, convert it to a Python
+        // object, and return an owned, type-erased reference.
+        DataType::Boolean(_) => Ok(row
+            .get_boolean(pos)
+            .map_err(|e| FlussError::from_core_error(&e))?
+            .into_pyobject(py)?
+            .to_owned()
+            .into_any()
+            .unbind()),
+        DataType::TinyInt(_) => Ok(row
+            .get_byte(pos)
+            .map_err(|e| FlussError::from_core_error(&e))?
+            .into_pyobject(py)?
+            .to_owned()
+            .into_any()
+            .unbind()),
+        DataType::SmallInt(_) => Ok(row
+            .get_short(pos)
+            .map_err(|e| FlussError::from_core_error(&e))?
+            .into_pyobject(py)?
+            .to_owned()
+            .into_any()
+            .unbind()),
+        DataType::Int(_) => Ok(row
+            .get_int(pos)
+            .map_err(|e| FlussError::from_core_error(&e))?
+            .into_pyobject(py)?
+            .to_owned()
+            .into_any()
+            .unbind()),
+        DataType::BigInt(_) => Ok(row
+            .get_long(pos)
+            .map_err(|e| FlussError::from_core_error(&e))?
+            .into_pyobject(py)?
+            .to_owned()
+            .into_any()
+            .unbind()),
+        DataType::Float(_) => Ok(row
+            .get_float(pos)
+            .map_err(|e| FlussError::from_core_error(&e))?
+            .into_pyobject(py)?
+            .to_owned()
+            .into_any()
+            .unbind()),
+        DataType::Double(_) => Ok(row
+            .get_double(pos)
+            .map_err(|e| FlussError::from_core_error(&e))?
+            .into_pyobject(py)?
+            .to_owned()
+            .into_any()
+            .unbind()),
+        DataType::String(_) => {
+            let s = row
+                .get_string(pos)
+                .map_err(|e| FlussError::from_core_error(&e))?;
+            Ok(s.into_pyobject(py)?.into_any().unbind())
+        }
+        // Char and Binary accessors additionally take the declared length.
+        DataType::Char(char_type) => {
+            let s = row
+                .get_char(pos, char_type.length() as usize)
+                .map_err(|e| FlussError::from_core_error(&e))?;
+            Ok(s.into_pyobject(py)?.into_any().unbind())
+        }
+        DataType::Bytes(_) => {
+            let b = row
+                .get_bytes(pos)
+                .map_err(|e| FlussError::from_core_error(&e))?;
+            Ok(PyBytes::new(py, b).into_any().unbind())
+        }
+        DataType::Binary(binary_type) => {
+            let b = row
+                .get_binary(pos, binary_type.length())
+                .map_err(|e| FlussError::from_core_error(&e))?;
+            Ok(PyBytes::new(py, b).into_any().unbind())
+        }
+        // Decimal and temporal branches delegate to the rust_*_to_python helpers.
+        DataType::Decimal(decimal_type) => {
+            let decimal = row
+                .get_decimal(
+                    pos,
+                    decimal_type.precision() as usize,
+                    decimal_type.scale() as usize,
+                )
+                .map_err(|e| FlussError::from_core_error(&e))?;
+            rust_decimal_to_python(py, &decimal)
+        }
+        DataType::Date(_) => {
+            let date = row
+                .get_date(pos)
+                .map_err(|e| FlussError::from_core_error(&e))?;
+            rust_date_to_python(py, date)
+        }
+        DataType::Time(_) => {
+            let time = row
+                .get_time(pos)
+                .map_err(|e| FlussError::from_core_error(&e))?;
+            rust_time_to_python(py, time)
+        }
+        DataType::Timestamp(ts_type) => {
+            let ts = row
+                .get_timestamp_ntz(pos, ts_type.precision())
+                .map_err(|e| FlussError::from_core_error(&e))?;
+            rust_timestamp_ntz_to_python(py, ts)
+        }
+        DataType::TimestampLTz(ts_type) => {
+            let ts = row
+                .get_timestamp_ltz(pos, ts_type.precision())
+                .map_err(|e| FlussError::from_core_error(&e))?;
+            rust_timestamp_ltz_to_python(py, ts)
+        }
+        DataType::Array(array_type) => {
+            let array_data = row
+                .get_array(pos)
+                .map_err(|e| FlussError::from_core_error(&e))?;
+
+            let element_type = array_type.get_element_type();
+            let py_list = pyo3::types::PyList::empty(py);
+
+            // The array is passed as the row for the recursive call, so null
+            // elements go through the null check above and become `None`.
+            for i in 0..array_data.size() {
+                let py_val = datum_to_python_value(py, &array_data, i, element_type)?;
+                py_list.append(py_val)?;
+            }
+            Ok(py_list.into_any().unbind())
+        }
+        // Every data type without a branch above is rejected.
+        _ => Err(FlussError::new_err(format!(
+            "Unsupported data type for conversion to Python: {data_type}"
+        ))),
+    }
+}
+
+/// Build a Python decimal.Decimal from a Rust Decimal by passing its string
+/// form to the cached `decimal.Decimal` constructor.
+fn rust_decimal_to_python(py: Python, decimal: &fcore::row::Decimal) -> PyResult<Py<PyAny>> {
+    let decimal_cls = get_decimal_type(py)?;
+    let text = decimal.to_string();
+    decimal_cls
+        .call1((text,))
+        .map(|instance| instance.into_any().unbind())
+}
+
+/// Convert Rust Date (days since epoch) to Python datetime.date.
+///
+/// Returns a FlussError when the day count cannot be represented as a
+/// calendar date, and propagates the Python error when the resulting year is
+/// outside what `datetime.date` accepts.
+fn rust_date_to_python(py: Python, date: fcore::row::Date) -> PyResult<Py<PyAny>> {
+    let days_since_epoch = date.get_inner();
+    let epoch = jiff::civil::date(1970, 1, 1);
+
+    // Use the fallible jiff APIs: `Span::days` and `Date + Span` panic on
+    // out-of-range values, and a panic would reach Python as a PanicException
+    // instead of a regular error.
+    let offset = jiff::Span::new()
+        .try_days(days_since_epoch as i64)
+        .map_err(|e| {
+            FlussError::new_err(format!(
+                "Invalid date ({days_since_epoch} days since epoch): {e}"
+            ))
+        })?;
+    let civil_date = epoch.checked_add(offset).map_err(|e| {
+        FlussError::new_err(format!(
+            "Invalid date ({days_since_epoch} days since epoch): {e}"
+        ))
+    })?;
+
+    let py_date = PyDate::new(
+        py,
+        civil_date.year() as i32,
+        civil_date.month() as u8,
+        civil_date.day() as u8,
+    )?;
+    Ok(py_date.into_any().unbind())
+}
+
+/// Convert Rust Time (millis since midnight) to Python datetime.time.
+///
+/// Returns a FlussError when the value is negative or is 24 hours or more,
+/// since such a value does not describe a time of day.
+fn rust_time_to_python(py: Python, time: fcore::row::Time) -> PyResult<Py<PyAny>> {
+    let millis = time.get_inner() as i64;
+
+    // Validate before the narrowing casts below: `as u8` / `as u32` wrap
+    // silently, so e.g. 256 hours would otherwise turn into hour 0.
+    let millis_per_day = 24 * MILLIS_PER_HOUR;
+    if !(0..millis_per_day).contains(&millis) {
+        return Err(FlussError::new_err(format!(
+            "Invalid TIME value: {millis} ms since midnight is outside 0..{millis_per_day}"
+        )));
+    }
+
+    let hours = millis / MILLIS_PER_HOUR;
+    let minutes = (millis % MILLIS_PER_HOUR) / MILLIS_PER_MINUTE;
+    let seconds = (millis % MILLIS_PER_MINUTE) / MILLIS_PER_SECOND;
+    let microseconds = (millis % MILLIS_PER_SECOND) * MICROS_PER_MILLI;
+
+    // After the range check every component fits its target type.
+    let py_time = PyTime::new(
+        py,
+        hours as u8,
+        minutes as u8,
+        seconds as u8,
+        microseconds as u32,
+        None,
+    )?;
+    Ok(py_time.into_any().unbind())
+}
+
+/// Convert Rust TimestampNtz to Python naive datetime.
+///
+/// Nanoseconds below one microsecond are truncated by the integer division.
+/// Returns a FlussError when the value is out of range, and propagates the
+/// Python error when `datetime` rejects the resulting fields.
+fn rust_timestamp_ntz_to_python(py: Python, ts: fcore::row::TimestampNtz) -> PyResult<Py<PyAny>> {
+    let millis = ts.get_millisecond();
+    let nanos = ts.get_nano_of_millisecond();
+    // Checked arithmetic: an unchecked `millis * MICROS_PER_MILLI` overflows
+    // for extreme values (panic in debug builds, silent wrap-around to a
+    // wrong timestamp in release builds).
+    let total_micros = millis
+        .checked_mul(MICROS_PER_MILLI)
+        .and_then(|micros| micros.checked_add(nanos as i64 / NANOS_PER_MICRO))
+        .ok_or_else(|| {
+            FlussError::new_err(format!(
+                "Invalid timestamp: {millis} ms since epoch is out of range"
+            ))
+        })?;
+
+    // Convert to civil datetime via jiff; UTC is used only to break the
+    // instant into calendar fields.
+    let timestamp = jiff::Timestamp::from_microsecond(total_micros)
+        .map_err(|e| FlussError::new_err(format!("Invalid timestamp: {e}")))?;
+    let civil_dt = timestamp.to_zoned(jiff::tz::TimeZone::UTC).datetime();
+
+    let py_dt = PyDateTime::new(
+        py,
+        civil_dt.year() as i32,
+        civil_dt.month() as u8,
+        civil_dt.day() as u8,
+        civil_dt.hour() as u8,
+        civil_dt.minute() as u8,
+        civil_dt.second() as u8,
+        (civil_dt.subsec_nanosecond() / 1000) as u32, // microseconds
+        None,
+    )?;
+    Ok(py_dt.into_any().unbind())
+}
+
+/// Convert Rust TimestampLtz to Python timezone-aware datetime (UTC).
+///
+/// Nanoseconds below one microsecond are truncated by the integer division.
+/// Returns a FlussError when the value is out of range, and propagates the
+/// Python error when `datetime` rejects the resulting fields.
+fn rust_timestamp_ltz_to_python(py: Python, ts: fcore::row::TimestampLtz) -> PyResult<Py<PyAny>> {
+    let millis = ts.get_epoch_millisecond();
+    let nanos = ts.get_nano_of_millisecond();
+    // Checked arithmetic: an unchecked `millis * MICROS_PER_MILLI` overflows
+    // for extreme values (panic in debug builds, silent wrap-around to a
+    // wrong timestamp in release builds).
+    let total_micros = millis
+        .checked_mul(MICROS_PER_MILLI)
+        .and_then(|micros| micros.checked_add(nanos as i64 / NANOS_PER_MICRO))
+        .ok_or_else(|| {
+            FlussError::new_err(format!(
+                "Invalid timestamp: {millis} ms since epoch is out of range"
+            ))
+        })?;
+
+    // Convert to civil datetime via jiff
+    let timestamp = jiff::Timestamp::from_microsecond(total_micros)
+        .map_err(|e| FlussError::new_err(format!("Invalid timestamp: {e}")))?;
+    let civil_dt = timestamp.to_zoned(jiff::tz::TimeZone::UTC).datetime();
+
+    // Attach the cached `datetime.timezone.utc` so the result is tz-aware.
+    let utc = get_utc_timezone(py)?;
+    let py_dt = PyDateTime::new(
+        py,
+        civil_dt.year() as i32,
+        civil_dt.month() as u8,
+        civil_dt.day() as u8,
+        civil_dt.hour() as u8,
+        civil_dt.minute() as u8,
+        civil_dt.second() as u8,
+        (civil_dt.subsec_nanosecond() / 1000) as u32, // microseconds
+        Some(&utc),
+    )?;
+    Ok(py_dt.into_any().unbind())
+}
+
+/// Build a Python dict from an InternalRow, mapping each schema field name
+/// to the converted value found at that field's position.
+pub fn internal_row_to_dict(
+    py: Python,
+    row: &dyn fcore::row::InternalRow,
+    table_info: &fcore::metadata::TableInfo,
+) -> PyResult<Py<PyAny>> {
+    let row_type = table_info.row_type();
+    let dict = PyDict::new(py);
+
+    row_type
+        .fields()
+        .iter()
+        .enumerate()
+        .try_for_each(|(pos, field)| -> PyResult<()> {
+            let py_value = datum_to_python_value(py, row, pos, field.data_type())?;
+            dict.set_item(field.name(), py_value)
+        })?;
+
+    Ok(dict.into_any().unbind())
+}
+
+/// Cached decimal.Decimal type
+/// Uses PyOnceLock for thread-safety and subinterpreter compatibility.
+/// Initialized lazily by `get_decimal_type`.
+static DECIMAL_TYPE: PyOnceLock<Py<PyType>> = PyOnceLock::new();
+
+/// Cached UTC timezone (`datetime.timezone.utc`).
+/// Initialized lazily by `get_utc_timezone`.
+static UTC_TIMEZONE: PyOnceLock<Py<PyAny>> = PyOnceLock::new();
+
+/// Cached UTC epoch datetime (`datetime(1970, 1, 1, tzinfo=timezone.utc)`).
+/// Initialized lazily by `get_utc_epoch`.
+static UTC_EPOCH: PyOnceLock<Py<PyAny>> = PyOnceLock::new();
+
+/// Return the `decimal.Decimal` type object, importing and caching it in
+/// `DECIMAL_TYPE` on first use.
+fn get_decimal_type(py: Python) -> PyResult<Bound<PyType>> {
+    DECIMAL_TYPE
+        .get_or_try_init(py, || -> PyResult<_> {
+            let decimal_cls = py.import("decimal")?.getattr("Decimal")?;
+            Ok(decimal_cls.downcast_into::<PyType>()?.unbind())
+        })
+        .map(|cached| cached.bind(py).clone())
+}
+
+/// Return `datetime.timezone.utc`, looking it up and caching it in
+/// `UTC_TIMEZONE` on first use.
+fn get_utc_timezone(py: Python) -> PyResult<Bound<PyTzInfo>> {
+    let cached = UTC_TIMEZONE.get_or_try_init(py, || -> PyResult<_> {
+        let utc = py
+            .import("datetime")?
+            .getattr("timezone")?
+            .getattr("utc")?;
+        Ok(utc.unbind())
+    })?;
+    // PyDateTime::new() wants a PyTzInfo, so narrow the cached object here.
+    let tzinfo = cached.bind(py).clone().downcast_into::<PyTzInfo>()?;
+    Ok(tzinfo)
+}
+
+/// Return the timezone-aware datetime for 1970-01-01T00:00:00 UTC, creating
+/// and caching it in `UTC_EPOCH` on first use.
+fn get_utc_epoch(py: Python) -> PyResult<Bound<PyAny>> {
+    let cached = UTC_EPOCH.get_or_try_init(py, || -> PyResult<_> {
+        let datetime_mod = py.import("datetime")?;
+        let utc = datetime_mod.getattr("timezone")?.getattr("utc")?;
+        let datetime_cls = datetime_mod.getattr("datetime")?;
+        let epoch = datetime_cls.call1((1970, 1, 1, 0, 0, 0, 0, &utc))?;
+        Ok(epoch.unbind())
+    })?;
+    Ok(cached.bind(py).clone())
+}
+
+
+/// Succeed only when `value` is an instance of decimal.Decimal; otherwise
+/// return an error naming the actual type.
+fn ensure_is_decimal(value: &Bound<PyAny>) -> PyResult<()> {
+    let decimal_ty = get_decimal_type(value.py())?;
+    if value.is_instance(&decimal_ty.into_any())? {
+        return Ok(());
+    }
+    Err(FlussError::new_err(format!(
+        "Expected decimal.Decimal, got {}",
+        get_type_name(value)
+    )))
+}
+
+/// Turn a Python decimal.Decimal into Datum::Decimal with the given
+/// precision and scale. Any other Python type is rejected.
+fn python_decimal_to_datum(
+    value: &Bound<PyAny>,
+    precision: u32,
+    scale: u32,
+) -> PyResult<fcore::row::Datum<'static>> {
+    use std::str::FromStr;
+
+    ensure_is_decimal(value)?;
+
+    // Go through the decimal's string form to reach BigDecimal.
+    let decimal_str: String = value.str()?.extract()?;
+    let bd = match bigdecimal::BigDecimal::from_str(&decimal_str) {
+        Ok(parsed) => parsed,
+        Err(e) => {
+            return Err(FlussError::new_err(format!(
+                "Failed to parse decimal '{decimal_str}': {e}"
+            )));
+        }
+    };
+
+    match fcore::row::Decimal::from_big_decimal(bd, precision, scale) {
+        Ok(decimal) => Ok(fcore::row::Datum::Decimal(decimal)),
+        Err(e) => Err(FlussError::new_err(format!(
+            "Failed to convert decimal '{decimal_str}' to DECIMAL({precision}, {scale}): {e}"
+        ))),
+    }
+}
+
+/// Turn a Python datetime.date into Datum::Date (days since 1970-01-01).
+/// datetime.datetime values are rejected even though they subclass date.
+fn python_date_to_datum(value: &Bound<PyAny>) -> PyResult<fcore::row::Datum<'static>> {
+    // datetime.datetime would pass the PyDate downcast below, so rule it out first.
+    let is_datetime = value.downcast::<PyDateTime>().is_ok();
+    if is_datetime {
+        return Err(FlussError::new_err(
+            "Expected datetime.date, got datetime.datetime. Use a TIMESTAMP column for datetime values.",
+        ));
+    }
+
+    let Ok(date) = value.downcast::<PyDate>() else {
+        return Err(FlussError::new_err(format!(
+            "Expected datetime.date, got {}",
+            get_type_name(value)
+        )));
+    };
+
+    // Day offset from the Unix epoch.
+    let civil_date = jiff::civil::date(
+        date.get_year() as i16,
+        date.get_month() as i8,
+        date.get_day() as i8,
+    );
+    let unix_epoch = jiff::civil::date(1970, 1, 1);
+    let days_since_epoch = (civil_date - unix_epoch).get_days();
+
+    let fluss_date = fcore::row::Date::new(days_since_epoch);
+    Ok(fcore::row::Datum::Date(fluss_date))
+}
+
+/// Turn a Python datetime.time into Datum::Time.
+///
+/// Fluss TIME is stored as milliseconds since midnight (i32) whatever
+/// precision the schema declares, matching the Java Fluss wire protocol.
+/// A time whose microsecond part is not a multiple of 1000 is rejected
+/// rather than rounded, so no precision is lost silently.
+fn python_time_to_datum(value: &Bound<PyAny>) -> PyResult<fcore::row::Datum<'static>> {
+    let Ok(time) = value.downcast::<PyTime>() else {
+        return Err(FlussError::new_err(format!(
+            "Expected datetime.time, got {}",
+            get_type_name(value)
+        )));
+    };
+
+    let micros_per_milli = MICROS_PER_MILLI as i32;
+    let microsecond = time.get_microsecond() as i32;
+
+    // Refuse anything finer than a millisecond.
+    if microsecond % micros_per_milli != 0 {
+        return Err(FlussError::new_err(format!(
+            "TIME values with sub-millisecond precision are not supported. \
+             Got time with {microsecond} microseconds (not divisible by 1000). \
+             Fluss stores TIME as milliseconds since midnight. \
+             Please round to milliseconds before insertion."
+        )));
+    }
+
+    // Sum the per-component millisecond contributions.
+    let hour_ms = time.get_hour() as i32 * MILLIS_PER_HOUR as i32;
+    let minute_ms = time.get_minute() as i32 * MILLIS_PER_MINUTE as i32;
+    let second_ms = time.get_second() as i32 * MILLIS_PER_SECOND as i32;
+    let sub_second_ms = microsecond / micros_per_milli;
+    let millis = hour_ms + minute_ms + second_ms + sub_second_ms;
+
+    Ok(fcore::row::Datum::Time(fcore::row::Time::new(millis)))
+}
+
+/// Turn a Python datetime-like object into Datum::TimestampNtz.
+/// Supports: datetime.datetime (naive preferred), pd.Timestamp, np.datetime64
+fn python_datetime_to_timestamp_ntz(value: &Bound<PyAny>) -> PyResult<fcore::row::Datum<'static>> {
+    let (epoch_millis, nano_of_milli) = extract_datetime_components_ntz(value)?;
+
+    fcore::row::TimestampNtz::from_millis_nanos(epoch_millis, nano_of_milli)
+        .map(fcore::row::Datum::TimestampNtz)
+        .map_err(|e| FlussError::new_err(format!("Failed to create TimestampNtz: {e}")))
+}
+
+/// Build a `Datum::TimestampLtz` from a Python datetime-like value.
+///
+/// Component extraction is delegated to `extract_datetime_components_ltz`
+/// (naive values are taken as UTC, aware values are converted to UTC); this
+/// wrapper only assembles the timestamp and maps a construction failure to
+/// `FlussError`.
+fn python_datetime_to_timestamp_ltz(value: &Bound<PyAny>) -> PyResult<fcore::row::Datum<'static>> {
+    let (epoch_millis, nano_of_milli) = extract_datetime_components_ltz(value)?;
+
+    match fcore::row::TimestampLtz::from_millis_nanos(epoch_millis, nano_of_milli) {
+        Ok(ts) => Ok(fcore::row::Datum::TimestampLtz(ts)),
+        Err(e) => Err(FlussError::new_err(format!(
+            "Failed to create TimestampLtz: {e}"
+        ))),
+    }
+}
+
+/// Extract `(epoch_millis, nano_of_milli)` for TimestampNtz (wall-clock time, no
+/// timezone conversion). Uses integer arithmetic to avoid float precision issues.
+///
+/// Accepted inputs, checked in this order:
+/// 1. `pd.Timestamp` (naive) - read from `.value` (nanoseconds since epoch).
+/// 2. `datetime.datetime` (naive) - fields interpreted as UTC.
+/// 3. Any object exposing `to_pydatetime()` that returns a naive datetime.
+///
+/// Timezone-aware values are rejected for clarity - use TimestampLtz for those.
+///
+/// # Errors
+/// Returns `FlussError` for timezone-aware input or an unsupported type.
+fn extract_datetime_components_ntz(value: &Bound<PyAny>) -> PyResult<(i64, i32)> {
+    // pandas must be checked BEFORE the PyDateTime downcast: pd.Timestamp is a
+    // subclass of datetime.datetime, so the downcast below would accept it too
+    // and the datetime accessors only expose microseconds, silently dropping
+    // the nanosecond component that `.value` carries.
+    if is_pandas_timestamp(value) {
+        // For NTZ, reject tz-aware pandas Timestamps for consistency with datetime behavior
+        if let Ok(tz) = value.getattr("tz") {
+            if !tz.is_none() {
+                return Err(FlussError::new_err(
+                    "TIMESTAMP (without timezone) requires a naive pd.Timestamp. \
+                     Got timezone-aware Timestamp. Either use tz_localize(None) or use TIMESTAMP_LTZ column.",
+                ));
+            }
+        }
+        // Naive pandas Timestamp: .value is nanoseconds since epoch (wall-clock as UTC)
+        let nanos: i64 = value.getattr("value")?.extract()?;
+        return Ok(nanos_to_millis_and_submillis(nanos));
+    }
+
+    // Plain datetime.datetime (or a non-pandas subclass)
+    if let Ok(dt) = value.downcast::<PyDateTime>() {
+        // Reject tz-aware datetime for NTZ - it's ambiguous what the user wants
+        let tzinfo = dt.getattr("tzinfo")?;
+        if !tzinfo.is_none() {
+            return Err(FlussError::new_err(
+                "TIMESTAMP (without timezone) requires a naive datetime. \
+                 Got timezone-aware datetime. Either remove tzinfo or use TIMESTAMP_LTZ column.",
+            ));
+        }
+        return datetime_to_epoch_millis_as_utc(dt);
+    }
+
+    // Try to_pydatetime() for objects that support it
+    if let Ok(py_dt) = value.call_method0("to_pydatetime") {
+        if let Ok(dt) = py_dt.downcast::<PyDateTime>() {
+            let tzinfo = dt.getattr("tzinfo")?;
+            if !tzinfo.is_none() {
+                return Err(FlussError::new_err(
+                    "TIMESTAMP (without timezone) requires a naive datetime. \
+                     Got timezone-aware value. Use TIMESTAMP_LTZ column instead.",
+                ));
+            }
+            return datetime_to_epoch_millis_as_utc(dt);
+        }
+    }
+
+    Err(FlussError::new_err(format!(
+        "Expected naive datetime.datetime or pd.Timestamp, got {}",
+        get_type_name(value)
+    )))
+}
+
+/// Extract `(epoch_millis, nano_of_milli)` for TimestampLtz (instant in time,
+/// UTC-based). For naive datetimes, assumes UTC. For aware datetimes, converts
+/// to UTC.
+///
+/// Accepted inputs, checked in this order:
+/// 1. `pd.Timestamp` - read from `.value` (nanoseconds since the UTC epoch).
+/// 2. `datetime.datetime` - naive or timezone-aware.
+/// 3. Any object exposing `to_pydatetime()` that returns a datetime.
+///
+/// # Errors
+/// Returns `FlussError` for an unsupported type.
+fn extract_datetime_components_ltz(value: &Bound<PyAny>) -> PyResult<(i64, i32)> {
+    // pandas must be checked BEFORE the PyDateTime downcast: pd.Timestamp is a
+    // subclass of datetime.datetime, so the downcast below would accept it too
+    // and the datetime accessors only expose microseconds, silently dropping
+    // the nanosecond component that `.value` carries.
+    if is_pandas_timestamp(value) {
+        // pandas Timestamp.value is always nanoseconds since UTC epoch
+        let nanos: i64 = value.getattr("value")?.extract()?;
+        return Ok(nanos_to_millis_and_submillis(nanos));
+    }
+
+    // Plain datetime.datetime (or a non-pandas subclass)
+    if let Ok(dt) = value.downcast::<PyDateTime>() {
+        // Check if timezone-aware
+        let tzinfo = dt.getattr("tzinfo")?;
+        if tzinfo.is_none() {
+            // Naive datetime: assume UTC (treat components as UTC time)
+            return datetime_to_epoch_millis_as_utc(dt);
+        } else {
+            // Aware datetime: use timedelta from epoch to get correct UTC instant
+            return datetime_to_epoch_millis_utc_aware(dt);
+        }
+    }
+
+    // Try to_pydatetime()
+    if let Ok(py_dt) = value.call_method0("to_pydatetime") {
+        if let Ok(dt) = py_dt.downcast::<PyDateTime>() {
+            let tzinfo = dt.getattr("tzinfo")?;
+            if tzinfo.is_none() {
+                return datetime_to_epoch_millis_as_utc(dt);
+            } else {
+                return datetime_to_epoch_millis_utc_aware(dt);
+            }
+        }
+    }
+
+    Err(FlussError::new_err(format!(
+        "Expected datetime.datetime or pd.Timestamp, got {}",
+        get_type_name(value)
+    )))
+}
+
+/// Convert a datetime's wall-clock fields to `(epoch_millis, nano_of_milli)`,
+/// treating the fields as UTC (any `tzinfo` on the object is ignored here).
+///
+/// The returned `nano_of_milli` is always non-negative. For instants before the
+/// Unix epoch, jiff reports a millisecond count truncated toward zero together
+/// with a negative sub-second part, so the pair is normalized by borrowing one
+/// millisecond - the same convention `nanos_to_millis_and_submillis` uses.
+///
+/// # Errors
+/// Returns `FlussError` if jiff cannot convert the civil datetime to a timestamp.
+fn datetime_to_epoch_millis_as_utc(dt: &Bound<'_, PyDateTime>) -> PyResult<(i64, i32)> {
+    let year = dt.get_year();
+    let month = dt.get_month();
+    let day = dt.get_day();
+    let hour = dt.get_hour();
+    let minute = dt.get_minute();
+    let second = dt.get_second();
+    let microsecond = dt.get_microsecond();
+
+    // Create jiff civil datetime and convert to UTC timestamp
+    // Safe casts: hour (0-23), minute (0-59), second (0-59) all fit in i8
+    let civil_dt = jiff::civil::date(year as i16, month as i8, day as i8).at(
+        hour as i8,
+        minute as i8,
+        second as i8,
+        microsecond as i32 * 1000,
+    );
+
+    let timestamp = jiff::tz::Offset::UTC
+        .to_timestamp(civil_dt)
+        .map_err(|e| FlussError::new_err(format!("Invalid datetime: {e}")))?;
+
+    let millis = timestamp.as_millisecond();
+    let nano_of_milli = timestamp.subsec_nanosecond() % NANOS_PER_MILLI as i32;
+
+    // Pre-epoch instants with a sub-millisecond fraction yield a negative
+    // remainder (e.g. -500_000 ns with millis truncated toward zero). Shift to
+    // the floor millisecond so the nanosecond part lands in 0..NANOS_PER_MILLI.
+    if nano_of_milli < 0 {
+        Ok((millis - 1, nano_of_milli + NANOS_PER_MILLI as i32))
+    } else {
+        Ok((millis, nano_of_milli))
+    }
+}
+
+/// Convert a timezone-aware datetime to `(epoch_millis, nano_of_milli)`.
+///
+/// The instant is obtained by letting Python compute `dt - epoch`, where
+/// `epoch` comes from `get_utc_epoch`; Python's datetime subtraction takes care
+/// of the timezone offset. The resulting `timedelta` is then flattened to
+/// microseconds with integer arithmetic only.
+fn datetime_to_epoch_millis_utc_aware(dt: &Bound<'_, PyDateTime>) -> PyResult<(i64, i32)> {
+    let epoch = get_utc_epoch(dt.py())?;
+
+    // elapsed = dt - epoch, evaluated on the Python side
+    let elapsed_obj = dt.call_method1("__sub__", (epoch,))?;
+    let elapsed = elapsed_obj.downcast::<PyDelta>()?;
+
+    // Flatten (days, seconds, microseconds); `days` is negative before the epoch.
+    let total_micros = elapsed.get_days() as i64 * MICROS_PER_DAY
+        + elapsed.get_seconds() as i64 * MICROS_PER_SECOND
+        + elapsed.get_microseconds() as i64;
+
+    let whole_millis = total_micros / MICROS_PER_MILLI;
+    // Leftover microseconds scaled up to nanoseconds.
+    let leftover_nanos = ((total_micros % MICROS_PER_MILLI) * MICROS_PER_MILLI) as i32;
+
+    // A negative remainder means the division truncated toward zero; borrow one
+    // millisecond so the nanosecond part is non-negative.
+    if leftover_nanos < 0 {
+        Ok((whole_millis - 1, leftover_nanos + NANOS_PER_MILLI as i32))
+    } else {
+        Ok((whole_millis, leftover_nanos))
+    }
+}
+
+/// Split a nanosecond count into `(milliseconds, nano_of_millisecond)`.
+///
+/// Floor (Euclidean) division is used, so the nanosecond part is never negative:
+/// `-1` ns becomes `(-1, 999_999)` rather than `(0, -1)`.
+fn nanos_to_millis_and_submillis(nanos: i64) -> (i64, i32) {
+    let whole_millis = nanos.div_euclid(NANOS_PER_MILLI);
+    let leftover_nanos = nanos.rem_euclid(NANOS_PER_MILLI) as i32;
+    (whole_millis, leftover_nanos)
+}
+
+/// Report whether `value`'s type is named `Timestamp` and lives in a module
+/// whose name starts with `pandas`.
+///
+/// Only the type's `__module__` / `__name__` attributes are inspected, so
+/// pandas itself is never imported. Any lookup or extraction failure yields
+/// `false`.
+fn is_pandas_timestamp(value: &Bound<PyAny>) -> bool {
+    let ty = value.get_type();
+
+    let Ok(module_obj) = ty.getattr("__module__") else {
+        return false;
+    };
+    let Ok(module) = module_obj.extract::<&str>() else {
+        return false;
+    };
+    if !module.starts_with("pandas") {
+        return false;
+    }
+
+    let Ok(name_obj) = ty.getattr("__name__") else {
+        return false;
+    };
+    match name_obj.extract::<&str>() {
+        Ok(type_name) => type_name == "Timestamp",
+        Err(_) => false,
+    }
+}
+
+/// Return the name of `value`'s Python type, or `"unknown"` if the name
+/// cannot be retrieved. Used for building error messages.
+fn get_type_name(value: &Bound<PyAny>) -> String {
+    match value.get_type().name() {
+        Ok(type_name) => type_name.to_string(),
+        Err(_) => "unknown".to_string(),
+    }
+}
+
+/// Minimal Python iterator wrapping [`fcore::client::SyncRecordBatchLogReader`].
+/// It serves as the batch source handed to
+/// ``pa.RecordBatchReader.from_batches()`` and is not registered on the module.
+#[pyclass]
+struct PyRecordBatchLogReader {
+    sync_reader: fcore::client::SyncRecordBatchLogReader,
+}
+
+#[pymethods]
+impl PyRecordBatchLogReader {
+    /// An iterator is its own iterable.
+    fn __iter__(slf: PyRef<'_, Self>) -> PyRef<'_, Self> {
+        slf
+    }
+
+    /// Pull the next batch from the wrapped reader and hand it to Python as a
+    /// PyArrow object. Returning `Ok(None)` ends the Python iteration.
+    fn __next__(&mut self, py: Python) -> PyResult<Option<Py<PyAny>>> {
+        // Detach from the interpreter while the underlying reader produces a batch.
+        let fetched = py
+            .detach(|| self.sync_reader.next().transpose())
+            .map_err(|arrow_err| {
+                FlussError::new_err(format!("Error reading batch: {arrow_err}"))
+            })?;
+
+        let Some(batch) = fetched else {
+            return Ok(None);
+        };
+
+        let py_batch = batch
+            .to_pyarrow(py)
+            .map_err(|e| FlussError::new_err(format!("Failed to convert batch: {e}")))?;
+        Ok(Some(py_batch.unbind()))
+    }
+}
+
+/// The scanner backing a `LogScanner`: exactly one of the record-based or the
+/// batch-based implementation. Modelling this as an enum rules out the
+/// "neither" and "both" states.
+enum ScannerKind {
+    Record(fcore::client::LogScanner),
+    Batch(fcore::client::RecordBatchLogScanner),
+}
+
+impl ScannerKind {
+    /// Borrow the record-based scanner, or fail if this is the batch variant.
+    fn as_record(&self) -> PyResult<&fcore::client::LogScanner> {
+        if let Self::Record(scanner) = self {
+            return Ok(scanner);
+        }
+        Err(FlussError::new_err(
+            "poll() requires a record-based scanner. Use new_scan().create_log_scanner().",
+        ))
+    }
+
+    /// Borrow the batch-based scanner, or fail if this is the record variant.
+    fn as_batch(&self) -> PyResult<&fcore::client::RecordBatchLogScanner> {
+        if let Self::Batch(scanner) = self {
+            return Ok(scanner);
+        }
+        Err(FlussError::new_err(
+            "This method requires a batch-based scanner. Use new_scan().create_record_batch_log_scanner().",
+        ))
+    }
+}
+
+/// Dispatch a method call to whichever scanner variant is active.
+/// Both `LogScanner` and `RecordBatchLogScanner` share the same subscribe interface.
+///
+/// Usage: `with_scanner!(&self.kind, subscribe(bucket_id, start_offset))`.
+///
+/// * `$scanner` - an expression on which `.as_ref()` yields a `&ScannerKind`
+///   (e.g. `&Arc<ScannerKind>`).
+/// * `$method(args...)` - the method to invoke; it must exist on both variants.
+///
+/// The expansion `.await`s the call, so the macro can only be used inside an
+/// async context. The argument expressions are pasted into both match arms.
+macro_rules! with_scanner {
+    ($scanner:expr, $method:ident($($arg:expr),*)) => {
+        match $scanner.as_ref() {
+            ScannerKind::Record(s) => s.$method($($arg),*).await,
+            ScannerKind::Batch(s) => s.$method($($arg),*).await,
+        }
+    };
+}
+
+/// Scanner for reading log data from a Fluss table.
+///
+/// This scanner supports two modes:
+/// - Record-based scanning via `poll()` - returns individual records with metadata
+/// - Batch-based scanning via `poll_arrow()` / `poll_record_batch()` - returns Arrow batches
+///
+/// Which mode is available depends on the `ScannerKind` variant held in `kind`;
+/// calling a method of the other mode returns an error.
+#[pyclass]
+pub struct LogScanner {
+    /// The underlying scanner (record or batch variant). Held in an `Arc` so
+    /// the async poll/read futures can own a clone of the handle.
+    kind: Arc<ScannerKind>,
+    /// Admin client passed to `RecordBatchLogReader::new_until_latest` by the
+    /// read-until-latest methods.
+    admin: Arc<fcore::client::FlussAdmin>,
+    /// Table metadata; its `table_path` is shown by `__repr__`.
+    table_info: fcore::metadata::TableInfo,
+    /// The projected Arrow schema to use for empty table creation
+    projected_schema: SchemaRef,
+    /// The projected row type to use for record-based scanning
+    projected_row_type: Arc<fcore::metadata::RowType>,
+}
+
+#[pymethods]
+impl LogScanner {
+    /// Subscribe to one bucket starting at the given offset (non-partitioned tables).
+    ///
+    /// Blocks the calling thread until the subscription call completes.
+    ///
+    /// Args:
+    ///     bucket_id: The bucket ID to subscribe to
+    ///     start_offset: The offset to start reading from (use EARLIEST_OFFSET for beginning)
+    fn subscribe(&self, py: Python, bucket_id: i32, start_offset: i64) -> PyResult<()> {
+        py.detach(|| {
+            let outcome = TOKIO_RUNTIME.block_on(async {
+                with_scanner!(&self.kind, subscribe(bucket_id, start_offset))
+            });
+            outcome.map_err(|e| FlussError::from_core_error(&e))
+        })
+    }
+
+    /// Subscribe to several buckets in one call (non-partitioned tables).
+    ///
+    /// Blocks the calling thread until the subscription call completes.
+    ///
+    /// Args:
+    ///     bucket_offsets: A dict mapping bucket_id -> start_offset
+    fn subscribe_buckets(&self, py: Python, bucket_offsets: HashMap<i32, i64>) -> PyResult<()> {
+        py.detach(|| {
+            let outcome = TOKIO_RUNTIME.block_on(async {
+                with_scanner!(&self.kind, subscribe_buckets(&bucket_offsets))
+            });
+            outcome.map_err(|e| FlussError::from_core_error(&e))
+        })
+    }
+
+    /// Subscribe to one bucket of one partition (partitioned tables only).
+    ///
+    /// Blocks the calling thread until the subscription call completes.
+    ///
+    /// Args:
+    ///     partition_id: The partition ID (from PartitionInfo.partition_id)
+    ///     bucket_id: The bucket ID within the partition
+    ///     start_offset: The offset to start reading from (use EARLIEST_OFFSET for beginning)
+    fn subscribe_partition(
+        &self,
+        py: Python,
+        partition_id: i64,
+        bucket_id: i32,
+        start_offset: i64,
+    ) -> PyResult<()> {
+        py.detach(|| {
+            let outcome = TOKIO_RUNTIME.block_on(async {
+                with_scanner!(
+                    &self.kind,
+                    subscribe_partition(partition_id, bucket_id, start_offset)
+                )
+            });
+            outcome.map_err(|e| FlussError::from_core_error(&e))
+        })
+    }
+
+    /// Subscribe to several (partition, bucket) pairs in one call (partitioned tables only).
+    ///
+    /// Blocks the calling thread until the subscription call completes.
+    ///
+    /// Args:
+    ///     partition_bucket_offsets: A dict mapping (partition_id, bucket_id) tuples to start_offsets
+    fn subscribe_partition_buckets(
+        &self,
+        py: Python,
+        partition_bucket_offsets: HashMap<(i64, i32), i64>,
+    ) -> PyResult<()> {
+        py.detach(|| {
+            let outcome = TOKIO_RUNTIME.block_on(async {
+                with_scanner!(
+                    &self.kind,
+                    subscribe_partition_buckets(&partition_bucket_offsets)
+                )
+            });
+            outcome.map_err(|e| FlussError::from_core_error(&e))
+        })
+    }
+
+    /// Stop reading from one bucket (non-partitioned tables only).
+    ///
+    /// Blocks the calling thread until the call completes.
+    ///
+    /// Args:
+    ///     bucket_id: The bucket ID to unsubscribe from
+    fn unsubscribe(&self, py: Python, bucket_id: i32) -> PyResult<()> {
+        py.detach(|| {
+            let outcome = TOKIO_RUNTIME
+                .block_on(async { with_scanner!(&self.kind, unsubscribe(bucket_id)) });
+            outcome.map_err(|e| FlussError::from_core_error(&e))
+        })
+    }
+
+    /// Stop reading from one bucket of one partition (partitioned tables only).
+    ///
+    /// Blocks the calling thread until the call completes.
+    ///
+    /// Args:
+    ///     partition_id: The partition ID to unsubscribe from
+    ///     bucket_id: The bucket ID within the partition
+    fn unsubscribe_partition(&self, py: Python, partition_id: i64, bucket_id: i32) -> PyResult<()> {
+        py.detach(|| {
+            let outcome = TOKIO_RUNTIME.block_on(async {
+                with_scanner!(&self.kind, unsubscribe_partition(partition_id, bucket_id))
+            });
+            outcome.map_err(|e| FlussError::from_core_error(&e))
+        })
+    }
+
+    /// Poll for individual records with metadata.
+    ///
+    /// Returns an awaitable; the poll itself runs inside the returned future.
+    ///
+    /// Args:
+    ///     timeout_ms: Timeout in milliseconds to wait for records
+    ///
+    /// Returns:
+    ///     ScanRecords grouped by bucket. Supports flat iteration
+    ///     (`for rec in records`) and per-bucket access (`records.buckets()`,
+    ///     `records.records(bucket)`, `records[bucket]`).
+    ///
+    /// Raises:
+    ///     FlussError: if timeout_ms is negative (raised immediately), or - when
+    ///     the future is awaited - if the scanner is batch-based or the poll fails.
+    ///
+    /// Note:
+    ///     - Requires a record-based scanner (created with new_scan().create_log_scanner())
+    ///     - Returns an empty ScanRecords if no records are available
+    ///     - When timeout expires, returns an empty ScanRecords (NOT an error)
+    fn poll<'py>(&self, py: Python<'py>, timeout_ms: i64) -> PyResult<Bound<'py, PyAny>> {
+        // Validate before the `as u64` cast below: a negative i64 would wrap
+        // around to an enormous timeout.
+        if timeout_ms < 0 {
+            return Err(FlussError::new_err(format!(
+                "timeout_ms must be non-negative, got: {timeout_ms}"
+            )));
+        }
+
+        let timeout = Duration::from_millis(timeout_ms as u64);
+        // The future is `async move`, so it needs its own owned handles rather
+        // than borrows of `self`.
+        let scanner = Arc::clone(&self.kind);
+        let projected_row_type = self.projected_row_type.clone();
+
+        future_into_py(py, async move {
+            // Scanner-kind mismatch is reported here, i.e. when the future runs.
+            let scan_records = scanner
+                .as_record()?
+                .poll(timeout)
+                .await
+                .map_err(|e| FlussError::from_core_error(&e))?;
+
+            // Attach to the interpreter to build the Python-side objects.
+            Python::attach(|py| {
+                let mut records_by_bucket = IndexMap::new();
+                let mut total_count = 0usize;
+
+                for (bucket, records) in scan_records.into_records_by_buckets() {
+                    let py_bucket = TableBucket::from_core(bucket);
+                    let mut py_records = Vec::with_capacity(records.len());
+                    for record in &records {
+                        let scan_record = ScanRecord::from_core(py, record, &projected_row_type)?;
+                        py_records.push(Py::new(py, scan_record)?);
+                        total_count += 1;
+                    }
+                    records_by_bucket.insert(py_bucket, py_records);
+                }
+
+                Ok(ScanRecords {
+                    records_by_bucket,
+                    total_count,
+                })
+            })
+        })
+    }
+
+    /// Poll for Arrow batches together with their metadata.
+    ///
+    /// Returns an awaitable; the poll itself runs inside the returned future.
+    ///
+    /// Args:
+    ///     timeout_ms: Timeout in milliseconds to wait for batches
+    ///
+    /// Returns:
+    ///     List of RecordBatch objects, each containing the Arrow batch along with
+    ///     bucket, base_offset, and last_offset metadata.
+    ///
+    /// Note:
+    ///     - Requires a batch-based scanner (created with new_scan().create_record_batch_log_scanner())
+    ///     - Returns an empty list if no batches are available
+    ///     - When timeout expires, returns an empty list (NOT an error)
+    fn poll_record_batch<'py>(
+        &self,
+        py: Python<'py>,
+        timeout_ms: i64,
+    ) -> PyResult<Bound<'py, PyAny>> {
+        // Guard first: the `as u64` cast below would wrap a negative value.
+        if timeout_ms < 0 {
+            return Err(FlussError::new_err(format!(
+                "timeout_ms must be non-negative, got: {timeout_ms}"
+            )));
+        }
+
+        let kind = Arc::clone(&self.kind);
+        let wait = Duration::from_millis(timeout_ms as u64);
+
+        future_into_py(py, async move {
+            let polled = kind
+                .as_batch()?
+                .poll(wait)
+                .await
+                .map_err(|e| FlussError::from_core_error(&e))?;
+
+            // Wrap every polled batch in its Python class; stop at the first failure.
+            Python::attach(|py| {
+                let mut wrapped = Vec::new();
+                for sb in polled {
+                    wrapped.push(Py::new(py, RecordBatch::from_scan_batch(sb))?);
+                }
+                Ok(wrapped)
+            })
+        })
+    }
+
+    /// Poll for new records and return them as a single Arrow Table.
+    ///
+    /// Returns an awaitable; the poll itself runs inside the returned future.
+    ///
+    /// Args:
+    ///     timeout_ms: Timeout in milliseconds to wait for records
+    ///
+    /// Returns:
+    ///     PyArrow Table containing the polled records (batches merged)
+    ///
+    /// Note:
+    ///     - Requires a batch-based scanner (created with new_scan().create_record_batch_log_scanner())
+    ///     - Returns an empty table (with correct schema) if no records are available
+    ///     - When timeout expires, returns an empty table (NOT an error)
+    fn poll_arrow<'py>(&self, py: Python<'py>, timeout_ms: i64) -> PyResult<Bound<'py, PyAny>> {
+        // Guard first: the `as u64` cast below would wrap a negative value.
+        if timeout_ms < 0 {
+            return Err(FlussError::new_err(format!(
+                "timeout_ms must be non-negative, got: {timeout_ms}"
+            )));
+        }
+
+        let kind = Arc::clone(&self.kind);
+        let schema = self.projected_schema.clone();
+        let wait = Duration::from_millis(timeout_ms as u64);
+
+        future_into_py(py, async move {
+            let polled = kind
+                .as_batch()?
+                .poll(wait)
+                .await
+                .map_err(|e| FlussError::from_core_error(&e))?;
+
+            // Drop the scan metadata and keep only the Arrow payloads.
+            let mut arrow_batches = Vec::new();
+            for sb in polled {
+                arrow_batches.push(Arc::new(sb.into_batch()));
+            }
+
+            Python::attach(|py| Self::batches_to_arrow_table(py, arrow_batches, &schema))
+        })
+    }
+
+    /// Create a lazy Arrow RecordBatchReader that reads until latest offsets.
+    ///
+    /// This is a **blocking / synchronous** API: construction queries the
+    /// server for latest offsets (via ``block_on``), and each
+    /// ``RecordBatchReader.__next__()`` call blocks the calling thread until
+    /// the next batch is available. It is suitable for Arrow interop
+    /// (feeding into DuckDB, Polars, etc.) but should not be used
+    /// from ``asyncio`` coroutines -- see issue #545 for a planned
+    /// asyncio-native streaming alternative.
+    /// TODO(#545): Add asyncio-native streaming counterpart.
+    ///
+    /// Returns a PyArrow RecordBatchReader that lazily polls batches one at a
+    /// time. This is more memory-efficient than ``to_arrow()`` which loads all
+    /// data into a single table.
+    ///
+    /// **Concurrency:** While this reader is alive, ``subscribe*`` and
+    /// ``unsubscribe*`` calls on the scanner are rejected with an error.
+    /// You should also avoid calling ``poll_arrow`` / ``poll_record_batch``
+    /// on the same scanner — these are not blocked by the guard, but they
+    /// share the underlying fetch buffer with the reader and would
+    /// interleave batches between both consumers. Drop the reader before
+    /// resuming any of these operations.
+    ///
+    /// You must call subscribe(), subscribe_buckets(), subscribe_partition(),
+    /// or subscribe_partition_buckets() first.
+    ///
+    /// Returns:
+    ///     ``pyarrow.RecordBatchReader`` yielding ``RecordBatch`` objects
+    ///
+    /// Raises:
+    ///     FlussError: if the scanner is record-based, if creating the reader
+    ///     fails, or if the schema cannot be converted to PyArrow.
+    fn to_arrow_batch_reader(&self, py: Python) -> PyResult<Py<PyAny>> {
+        // Fails immediately for record-based scanners.
+        let scanner = self.kind.as_batch()?;
+
+        // Detach from the interpreter while blocking on the async reader
+        // construction.
+        let sync_reader = py
+            .detach(|| {
+                TOKIO_RUNTIME.block_on(async {
+                    let reader = fcore::client::RecordBatchLogReader::new_until_latest(
+                        scanner.new_shared_handle(),
+                        &self.admin,
+                    )
+                    .await?;
+                    // Wrap the async reader in its synchronous form, giving it
+                    // a handle to the shared runtime.
+                    Ok::<_, fcore::error::Error>(
+                        reader.to_record_batch_reader(TOKIO_RUNTIME.handle().clone()),
+                    )
+                })
+            })
+            .map_err(|e| FlussError::from_core_error(&e))?;
+
+        // Read the schema before `sync_reader` is moved into the Python
+        // iterator object below.
+        let py_schema = sync_reader
+            .schema()
+            .to_pyarrow(py)
+            .map_err(|e| FlussError::new_err(format!("Failed to convert schema: {e}")))?;
+
+        let py_iter = Py::new(py, PyRecordBatchLogReader { sync_reader })?;
+
+        // pyarrow.RecordBatchReader.from_batches(schema, iterator) pulls
+        // batches from `py_iter` on demand.
+        let pyarrow = py.import("pyarrow")?;
+        let batch_reader = pyarrow
+            .getattr("RecordBatchReader")?
+            .call_method1("from_batches", (py_schema, py_iter))?;
+
+        Ok(batch_reader.into())
+    }
+
+    /// Read everything up to the latest offsets and return it as one Arrow Table.
+    ///
+    /// Reads from the currently subscribed buckets until each reaches its latest
+    /// offset. Works for both partitioned and non-partitioned tables.
+    ///
+    /// All batches are collected in Rust (``RecordBatchLogReader::collect_all_batches``)
+    /// and a single PyArrow table is built afterwards, so there is no per-batch
+    /// Python iteration.
+    ///
+    /// You must call subscribe(), subscribe_buckets(), subscribe_partition(), or subscribe_partition_buckets() first.
+    ///
+    /// Returns:
+    ///     PyArrow Table containing all data from subscribed buckets
+    fn to_arrow<'py>(&self, py: Python<'py>) -> PyResult<Bound<'py, PyAny>> {
+        let kind = Arc::clone(&self.kind);
+        let admin = Arc::clone(&self.admin);
+        let schema = self.projected_schema.clone();
+
+        future_into_py(py, async move {
+            let handle = kind.as_batch()?.new_shared_handle();
+
+            let mut reader = fcore::client::RecordBatchLogReader::new_until_latest(handle, &admin)
+                .await
+                .map_err(|e| FlussError::from_core_error(&e))?;
+
+            let collected = reader
+                .collect_all_batches()
+                .await
+                .map_err(|e| FlussError::from_core_error(&e))?;
+
+            // Drop the scan metadata and keep only the Arrow payloads.
+            let mut batches: Vec<Arc<ArrowRecordBatch>> = Vec::new();
+            for sb in collected {
+                batches.push(Arc::new(sb.into_batch()));
+            }
+
+            Python::attach(|py| Self::batches_to_arrow_table(py, batches, &schema))
+        })
+    }
+
+    /// Read everything up to the latest offsets and return it as a Pandas DataFrame.
+    ///
+    /// Reads from the currently subscribed buckets until each reaches its latest
+    /// offset. Works for both partitioned and non-partitioned tables. The data is
+    /// first assembled into a PyArrow table, then converted with ``to_pandas()``.
+    ///
+    /// You must call subscribe(), subscribe_buckets(), subscribe_partition(), or subscribe_partition_buckets() first.
+    ///
+    /// Returns:
+    ///     Pandas DataFrame containing all data from subscribed buckets
+    fn to_pandas<'py>(&self, py: Python<'py>) -> PyResult<Bound<'py, PyAny>> {
+        let kind = Arc::clone(&self.kind);
+        let admin = Arc::clone(&self.admin);
+        let schema = self.projected_schema.clone();
+
+        future_into_py(py, async move {
+            let handle = kind.as_batch()?.new_shared_handle();
+
+            let mut reader = fcore::client::RecordBatchLogReader::new_until_latest(handle, &admin)
+                .await
+                .map_err(|e| FlussError::from_core_error(&e))?;
+
+            let collected = reader
+                .collect_all_batches()
+                .await
+                .map_err(|e| FlussError::from_core_error(&e))?;
+
+            // Drop the scan metadata and keep only the Arrow payloads.
+            let mut batches: Vec<Arc<ArrowRecordBatch>> = Vec::new();
+            for sb in collected {
+                batches.push(Arc::new(sb.into_batch()));
+            }
+
+            Python::attach(|py| {
+                Self::batches_to_arrow_table(py, batches, &schema)?.call_method0(py, "to_pandas")
+            })
+        })
+    }
+
+    /// Return an async iterator over this scanner (`async for item in scanner`).
+    ///
+    /// The iterator is a small Python async generator that repeatedly awaits the
+    /// scanner's poll method with `DEFAULT_POLL_INTERVAL_MS` as the timeout and
+    /// yields each returned item: `poll` for record-based scanners,
+    /// `poll_record_batch` for batch-based ones. The generator loops forever; the
+    /// consumer decides when to stop iterating.
+    ///
+    /// The generator function is compiled once and cached. A failure while
+    /// compiling or looking it up is returned as a Python exception rather than
+    /// a panic.
+    fn __aiter__<'py>(slf: PyRef<'py, Self>) -> PyResult<Bound<'py, PyAny>> {
+        let py = slf.py();
+
+        // Single lock for the generic async generator
+        static ASYNC_GEN_FN: PyOnceLock<Py<PyAny>> = PyOnceLock::new();
+
+        let gen_fn = ASYNC_GEN_FN.get_or_try_init(py, || -> PyResult<Py<PyAny>> {
+            let code = pyo3::ffi::c_str!(
+                r#"
+async def _async_scan_generic(scanner, method_name, timeout_ms):
+    poll_method = getattr(scanner, method_name)
+    while True:
+        for item in await poll_method(timeout_ms):
+            yield item
+"#
+            );
+            let globals = pyo3::types::PyDict::new(py);
+            py.run(code, Some(&globals), None)?;
+            let gen_fn = globals.get_item("_async_scan_generic")?.ok_or_else(|| {
+                FlussError::new_err(
+                    "internal error: async scan generator was not defined",
+                )
+            })?;
+            Ok(gen_fn.unbind())
+        })?;
+
+        // Pick the poll method that matches the scanner variant.
+        let method_name = match slf.kind.as_ref() {
+            ScannerKind::Record(_) => "poll",
+            ScannerKind::Batch(_) => "poll_record_batch",
+        };
+
+        gen_fn.bind(py).call1((
+            slf.into_bound_py_any(py)?,
+            method_name,
+            DEFAULT_POLL_INTERVAL_MS,
+        ))
+    }
+
+    /// Python `repr()`: `LogScanner(table=<table path>)`.
+    fn __repr__(&self) -> String {
+        let table_path = &self.table_info.table_path;
+        format!("LogScanner(table={table_path})")
+    }
+}
+
+impl LogScanner {
+    /// Assemble a `LogScanner`, wrapping the scanner variant in an `Arc` so it
+    /// can be shared with the futures created by the Python-facing methods.
+    fn new(
+        scanner: ScannerKind,
+        admin: Arc<fcore::client::FlussAdmin>,
+        table_info: fcore::metadata::TableInfo,
+        projected_schema: SchemaRef,
+        projected_row_type: Arc<fcore::metadata::RowType>,
+    ) -> Self {
+        let kind = Arc::new(scanner);
+        Self {
+            kind,
+            admin,
+            table_info,
+            projected_schema,
+            projected_row_type,
+        }
+    }
+
+    /// Build a PyArrow Table from Arrow record batches.
+    ///
+    /// With at least one batch the work is delegated to
+    /// `Utils::combine_batches_to_table`. With none, an empty table carrying
+    /// `projected_schema` is created via `pyarrow.Table.from_batches`.
+    fn batches_to_arrow_table(
+        py: Python<'_>,
+        batches: Vec<Arc<ArrowRecordBatch>>,
+        projected_schema: &SchemaRef,
+    ) -> PyResult<Py<PyAny>> {
+        if !batches.is_empty() {
+            return Utils::combine_batches_to_table(py, batches);
+        }
+
+        // No data: the schema has to be supplied explicitly.
+        let py_schema = projected_schema
+            .as_ref()
+            .to_pyarrow(py)
+            .map_err(|e| FlussError::new_err(format!("Failed to convert schema: {e}")))?;
+
+        let no_batches: Vec<Py<PyAny>> = Vec::new();
+        let empty_table = py
+            .import("pyarrow")?
+            .getattr("Table")?
+            .call_method1("from_batches", (no_batches, py_schema))?;
+
+        Ok(empty_table.into())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Table-driven check of the nanosecond split, including the floor
+    /// behaviour for negative inputs.
+    #[test]
+    fn test_nanos_to_millis_and_submillis() {
+        // (input nanoseconds, expected (millis, nano_of_milli))
+        let cases: [(i64, (i64, i32)); 8] = [
+            // Simple positive case
+            (1_500_000, (1, 500_000)),
+            // Exact millisecond boundary
+            (2_000_000, (2, 0)),
+            // Zero
+            (0, (0, 0)),
+            // Large value: 1 day in nanos
+            (86_400_000_000_000, (86_400_000, 0)),
+            // Negative: -1_500_000 ns = -2 ms + 500_000 ns
+            (-1_500_000, (-2, 500_000)),
+            // Negative exact boundary
+            (-2_000_000, (-2, 0)),
+            // Small negative
+            (-1, (-1, 999_999)),
+            // Negative with sub-millisecond part
+            (-500_000, (-1, 500_000)),
+        ];
+
+        for &(nanos, expected) in &cases {
+            assert_eq!(nanos_to_millis_and_submillis(nanos), expected);
+        }
+    }
+}
diff --git a/fluss-rust/bindings/python/src/upsert.rs b/fluss-rust/bindings/python/src/upsert.rs
new file mode 100644
index 0000000000..45244225f6
--- /dev/null
+++ b/fluss-rust/bindings/python/src/upsert.rs
@@ -0,0 +1,161 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::table::{python_to_generic_row, python_to_sparse_generic_row};
+use crate::*;
+use pyo3_async_runtimes::tokio::future_into_py;
+use std::sync::Arc;
+
+/// Writer for upserting and deleting data in a Fluss primary key table.
+///
+/// Each upsert/delete operation synchronously queues the write. Call `flush()`
+/// to ensure all queued writes are delivered to the server.
+///
+/// # Example:
+///     writer = table.new_upsert().create_writer()
+///
+///     # Fire-and-forget — ignore the returned handle
+///     writer.upsert(row1)
+///     writer.upsert(row2)
+///     await writer.flush()
+///
+///     # Per-record ack — call wait() on the handle
+///     handle = writer.upsert(critical_row)
+///     await handle.wait()
+#[pyclass]
+pub struct UpsertWriter {
+    writer: Arc<fcore::client::UpsertWriter>, // Arc: cloned into the async flush futures
+    table_info: fcore::metadata::TableInfo, // used to convert Python rows into Fluss rows
+    /// Column indices for partial updates (None = full row)
+    target_columns: Option<Vec<usize>>,
+}
+
+#[pymethods]
+impl UpsertWriter {
+    /// Insert a row, or update it when its primary key already exists.
+    ///
+    /// If a row with the same primary key already exists it is updated;
+    /// otherwise the row is inserted as new.
+    ///
+    /// The write is queued synchronously. Call `flush()` to ensure delivery.
+    ///
+    /// Args:
+    ///     row: A dict, list, or tuple containing the row data.
+    ///          For dict: keys are column names, values are column values.
+    ///          For list/tuple: values must be in schema order.
+    pub fn upsert(&self, row: &Bound<'_, PyAny>) -> PyResult<WriteResultHandle> {
+        // Writers created with target columns convert a sparse row instead of a full one.
+        let converted = match &self.target_columns {
+            Some(columns) => python_to_sparse_generic_row(row, &self.table_info, columns)?,
+            None => python_to_generic_row(row, &self.table_info)?,
+        };
+
+        // The returned handle may be dropped (fire-and-forget) or awaited via `wait()`.
+        self.writer
+            .upsert(&converted)
+            .map(WriteResultHandle::new)
+            .map_err(|e| FlussError::from_core_error(&e))
+    }
+
+    /// Delete the row identified by a primary key.
+    ///
+    /// The delete is queued synchronously. Call `flush()` to ensure delivery.
+    ///
+    /// Args:
+    ///     pk: A dict, list, or tuple containing only the primary key values.
+    ///         For dict: keys are PK column names.
+    ///         For list/tuple: values in PK column order.
+    pub fn delete(&self, pk: &Bound<'_, PyAny>) -> PyResult<WriteResultHandle> {
+        // Only the primary-key columns are populated in the row sent for deletion.
+        let key_columns = self.table_info.get_schema().primary_key_indexes();
+        let key_row = python_to_sparse_generic_row(pk, &self.table_info, &key_columns)?;
+
+        self.writer
+            .delete(&key_row)
+            .map(WriteResultHandle::new)
+            .map_err(|e| FlussError::from_core_error(&e))
+    }
+
+    /// Flush all pending upsert/delete operations to the server.
+    ///
+    /// Sends every buffered operation and waits until they are acknowledged
+    /// according to the writer's ack configuration.
+    ///
+    /// Returns:
+    ///     None on success
+    pub fn flush<'py>(&self, py: Python<'py>) -> PyResult<Bound<'py, PyAny>> {
+        // The async block is moved into the returned awaitable, so it gets its own Arc.
+        let shared_writer = Arc::clone(&self.writer);
+        let flush_all = async move {
+            let outcome = shared_writer.flush().await;
+            outcome.map_err(|e| FlussError::from_core_error(&e))
+        };
+
+        future_into_py(py, flush_all)
+    }
+
+    // `async with` entry point: the awaitable resolves to the writer object itself.
+    fn __aenter__<'py>(slf: PyRef<'py, Self>, py: Python<'py>) -> PyResult<Bound<'py, PyAny>> {
+        let this = slf.into_pyobject(py)?.unbind();
+        future_into_py(py, async move { Ok(this) })
+    }
+
+    // Exit the async runtime context (for 'async with' statement)
+    /// On exit, the writer is automatically flushed.
+    #[pyo3(signature = (exc_type=None, _exc_value=None, _traceback=None))]
+    fn __aexit__<'py>(
+        &self,
+        py: Python<'py>,
+        exc_type: Option<Bound<'py, PyAny>>,
+        _exc_value: Option<Bound<'py, PyAny>>,
+        _traceback: Option<Bound<'py, PyAny>>,
+    ) -> PyResult<Bound<'py, PyAny>> {
+        let shared_writer = Arc::clone(&self.writer);
+        // True when the `async with` body raised, i.e. `exc_type` is a non-None object.
+        let body_raised = exc_type.is_some_and(|ty| !ty.is_none());
+        future_into_py(py, async move {
+            match shared_writer.flush().await {
+                // A flush failure is raised only when the body did not already raise.
+                Err(e) if !body_raised => Err(FlussError::from_core_error(&e)),
+                // Returning false tells Python not to suppress an in-flight exception.
+                _ => Ok(false),
+            }
+        })
+    }
+
+    fn __repr__(&self) -> String {
+        String::from("UpsertWriter()")
+    }
+}
+
+impl UpsertWriter {
+    /// Build the wrapper, eagerly creating the core writer so failures surface here.
+    pub fn new(
+        table_upsert: &fcore::client::TableUpsert,
+        table_info: fcore::metadata::TableInfo,
+        target_columns: Option<Vec<usize>>,
+    ) -> PyResult<Self> {
+        match table_upsert.create_writer() {
+            Ok(core_writer) => Ok(Self {
+                writer: Arc::new(core_writer),
+                table_info,
+                target_columns,
+            }),
+            Err(e) => Err(FlussError::from_core_error(&e)),
+        }
+    }
+}
diff --git a/fluss-rust/bindings/python/src/utils.rs b/fluss-rust/bindings/python/src/utils.rs
new file mode 100644
index 0000000000..e07713976e
--- /dev/null
+++ b/fluss-rust/bindings/python/src/utils.rs
@@ -0,0 +1,246 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::*;
+use arrow_pyarrow::{FromPyArrow, ToPyArrow};
+use arrow_schema::SchemaRef;
+use std::sync::Arc;
+
+/// Stateless namespace for conversions between PyArrow, Arrow, and Fluss types and batches
+pub struct Utils;
+
+impl Utils {
+    /// Import a PyArrow schema object as a shared Rust Arrow schema.
+    pub fn pyarrow_to_arrow_schema(py_schema: &Py<PyAny>) -> PyResult<SchemaRef> {
+        // The conversion runs with the Python interpreter attached.
+        Python::attach(|py| {
+            arrow_schema::Schema::from_pyarrow_bound(py_schema.bind(py))
+                .map(Arc::new)
+                .map_err(|e| {
+                    FlussError::new_err(format!("Failed to convert PyArrow schema: {e}"))
+                })
+        })
+    }
+
+    /// Convert an Arrow Field to a Fluss DataType, preserving nullability.
+    /// Unsigned integers map to the signed Fluss type of the same width. Returns a
+    /// `FlussError` for unsupported types/time units and for a negative decimal scale.
+    pub fn arrow_field_to_fluss_type(
+        field: &arrow::datatypes::Field,
+    ) -> PyResult<fcore::metadata::DataType> {
+        use arrow::datatypes::DataType as ArrowDataType;
+        use fcore::metadata::DataTypes;
+
+        let fluss_type = match field.data_type() {
+            ArrowDataType::Boolean => DataTypes::boolean(),
+            ArrowDataType::Int8 | ArrowDataType::UInt8 => DataTypes::tinyint(),
+            ArrowDataType::Int16 | ArrowDataType::UInt16 => DataTypes::smallint(),
+            ArrowDataType::Int32 | ArrowDataType::UInt32 => DataTypes::int(),
+            ArrowDataType::Int64 | ArrowDataType::UInt64 => DataTypes::bigint(),
+            ArrowDataType::Float32 => DataTypes::float(),
+            ArrowDataType::Float64 => DataTypes::double(),
+            ArrowDataType::Utf8 | ArrowDataType::LargeUtf8 => DataTypes::string(),
+            ArrowDataType::Binary | ArrowDataType::LargeBinary => DataTypes::bytes(),
+            ArrowDataType::FixedSizeBinary(n) => DataTypes::binary(*n as usize),
+            ArrowDataType::Date32 | ArrowDataType::Date64 => DataTypes::date(),
+            ArrowDataType::Time32(unit) => match unit {
+                arrow_schema::TimeUnit::Second => DataTypes::time_with_precision(0),
+                arrow_schema::TimeUnit::Millisecond => DataTypes::time_with_precision(3),
+                _ => {
+                    return Err(FlussError::new_err(format!(
+                        "Unsupported Time32 unit: {unit:?}"
+                    )));
+                }
+            },
+            ArrowDataType::Time64(unit) => match unit {
+                arrow_schema::TimeUnit::Microsecond => DataTypes::time_with_precision(6),
+                arrow_schema::TimeUnit::Nanosecond => DataTypes::time_with_precision(9),
+                _ => {
+                    return Err(FlussError::new_err(format!(
+                        "Unsupported Time64 unit: {unit:?}"
+                    )));
+                }
+            },
+            ArrowDataType::Timestamp(unit, tz) => {
+                let precision = match unit {
+                    arrow_schema::TimeUnit::Second => 0,
+                    arrow_schema::TimeUnit::Millisecond => 3,
+                    arrow_schema::TimeUnit::Microsecond => 6,
+                    arrow_schema::TimeUnit::Nanosecond => 9,
+                };
+                // With a timezone -> Fluss TimestampLtz; without -> Fluss Timestamp (NTZ)
+                if tz.is_some() {
+                    DataTypes::timestamp_ltz_with_precision(precision)
+                } else {
+                    DataTypes::timestamp_with_precision(precision)
+                }
+            }
+            ArrowDataType::Decimal128(precision, scale) => {
+                // Arrow's scale is signed and may be negative; `as u32` would wrap it.
+                let scale = u32::try_from(*scale).map_err(|_| {
+                    FlussError::new_err(format!("Unsupported negative decimal scale: {scale}"))
+                })?;
+                DataTypes::decimal(u32::from(*precision), scale)
+            }
+            ArrowDataType::List(element_field) => {
+                let element_type = Utils::arrow_field_to_fluss_type(element_field)?;
+                DataTypes::array(element_type)
+            }
+            other => {
+                return Err(FlussError::new_err(format!(
+                    "Unsupported Arrow data type: {other:?}"
+                )));
+            }
+        };
+
+        if field.is_nullable() {
+            Ok(fluss_type)
+        } else {
+            Ok(fluss_type.as_non_nullable())
+        }
+    }
+
+    /// Render a Fluss DataType as its string form.
+    ///
+    /// Non-nullable types get a " NOT NULL" suffix (matches Java's
+    /// `withNullability` and Rust core's `Display` impl). Element, key, value,
+    /// and field types are rendered by recursive calls, so the suffix can
+    /// appear at any nesting level.
+    ///
+    /// Precision is left out when it is 0 for `time` and 6 for `timestamp` and
+    /// `timestamp_ltz`; any other precision is rendered in parentheses, e.g.
+    /// `time(3)`.
+    pub fn datatype_to_string(data_type: &fcore::metadata::DataType) -> String {
+        use fcore::metadata::DataType as FlussType;
+
+        let rendered = match data_type {
+            // Types without parameters render as a fixed keyword.
+            FlussType::Boolean(_) => String::from("boolean"),
+            FlussType::TinyInt(_) => String::from("tinyint"),
+            FlussType::SmallInt(_) => String::from("smallint"),
+            FlussType::Int(_) => String::from("int"),
+            FlussType::BigInt(_) => String::from("bigint"),
+            FlussType::Float(_) => String::from("float"),
+            FlussType::Double(_) => String::from("double"),
+            FlussType::String(_) => String::from("string"),
+            FlussType::Bytes(_) => String::from("bytes"),
+            FlussType::Date(_) => String::from("date"),
+            // `time` leaves out a precision of 0.
+            FlussType::Time(t) => match t.precision() {
+                0 => String::from("time"),
+                p => format!("time({p})"),
+            },
+            // Both timestamp flavours leave out a precision of 6.
+            FlussType::Timestamp(t) => match t.precision() {
+                6 => String::from("timestamp"),
+                p => format!("timestamp({p})"),
+            },
+            FlussType::TimestampLTz(t) => match t.precision() {
+                6 => String::from("timestamp_ltz"),
+                p => format!("timestamp_ltz({p})"),
+            },
+            // Parameterised types render their arguments in parentheses.
+            FlussType::Char(c) => format!("char({})", c.length()),
+            FlussType::Decimal(d) => {
+                format!("decimal({},{})", d.precision(), d.scale())
+            }
+            FlussType::Binary(b) => format!("binary({})", b.length()),
+            // Nested types recurse into their component types.
+            FlussType::Array(arr) => {
+                let element = Utils::datatype_to_string(arr.get_element_type());
+                format!("array<{element}>")
+            }
+            FlussType::Map(map) => {
+                let key = Utils::datatype_to_string(map.key_type());
+                let value = Utils::datatype_to_string(map.value_type());
+                format!("map<{key},{value}>")
+            }
+            // Rows list their fields as `name: type`, joined with ", ".
+            FlussType::Row(row) => {
+                let mut fields: Vec<String> = Vec::new();
+                for field in row.fields().iter() {
+                    let field_type = Utils::datatype_to_string(field.data_type());
+                    fields.push(format!("{}: {}", field.name(), field_type));
+                }
+                format!("row<{}>", fields.join(", "))
+            }
+        };
+
+        // The suffix goes last so it follows the complete rendering of this type.
+        if !data_type.is_nullable() {
+            return format!("{rendered} NOT NULL");
+        }
+        rendered
+    }
+
+    /// Parse a log format name into a `LogFormat`; bad input becomes a `FlussError`
+    pub fn parse_log_format(format_str: &str) -> PyResult<fcore::metadata::LogFormat> {
+        let parsed = fcore::metadata::LogFormat::parse(format_str);
+        parsed.map_err(|e| FlussError::new_err(format!("Invalid log format '{format_str}': {e}")))
+    }
+
+    /// Parse a kv format name into a `KvFormat`; bad input becomes a `FlussError`
+    pub fn parse_kv_format(format_str: &str) -> PyResult<fcore::metadata::KvFormat> {
+        let parsed = fcore::metadata::KvFormat::parse(format_str);
+        parsed.map_err(|e| FlussError::new_err(format!("Invalid kv format '{format_str}': {e}")))
+    }
+
+    /// Collect the Arrow record batches referenced by the given scan records.
+    ///
+    /// A batch is taken only from a record whose row id is 0; records with any
+    /// other row id add nothing (presumably so each batch appears once — confirm).
+    pub fn convert_scan_records_to_arrow(
+        _scan_records: Vec<fcore::record::ScanRecord>,
+    ) -> Vec<Arc<arrow::record_batch::RecordBatch>> {
+        _scan_records
+            .into_iter()
+            .filter_map(|record| {
+                let row = record.row();
+                (row.get_row_id() == 0).then(|| Arc::new(row.get_record_batch().clone()))
+            })
+            .collect()
+    }
+
+    /// Build a single `pyarrow.Table` from Arrow record batches.
+    ///
+    /// Each batch is exported to PyArrow and the resulting list is passed to
+    /// `pyarrow.Table.from_batches`. A failed export is reported as a
+    /// `FlussError`; errors raised while importing or calling PyArrow propagate
+    /// unchanged.
+    pub fn combine_batches_to_table(
+        py: Python,
+        batches: Vec<Arc<arrow::record_batch::RecordBatch>>,
+    ) -> PyResult<Py<PyAny>> {
+        let mut exported: Vec<Py<PyAny>> = Vec::with_capacity(batches.len());
+        // Stops at the first batch that fails to export.
+        for batch in &batches {
+            // Borrow the batch out of the Arc - no need to recreate it.
+            let py_batch = batch
+                .as_ref()
+                .to_pyarrow(py)
+                .map_err(|e| FlussError::new_err(format!("Failed to convert to PyObject: {e}")))?;
+            exported.push(py_batch.into());
+        }
+
+        // Use pyarrow.Table.from_batches to combine the exported batches.
+        let pyarrow = py.import("pyarrow")?;
+        let table_class = pyarrow.getattr("Table")?;
+        let table = table_class.call_method1("from_batches", (exported,))?;
+
+        Ok(table.into())
+    }
+}
diff --git a/fluss-rust/bindings/python/src/write_handle.rs b/fluss-rust/bindings/python/src/write_handle.rs
new file mode 100644
index 0000000000..83cbeccadc
--- /dev/null
+++ b/fluss-rust/bindings/python/src/write_handle.rs
@@ -0,0 +1,78 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::*;
+use pyo3_async_runtimes::tokio::future_into_py;
+use std::sync::Mutex;
+
+/// Handle for a pending write operation.
+///
+/// Returned by `upsert()`, `delete()`, `append()`, etc.
+/// Can be safely ignored for fire-and-forget semantics,
+/// or awaited via `wait()` for per-record acknowledgment.
+///
+/// # Example:
+///     # Fire-and-forget — just ignore the handle
+///     writer.upsert(row1)
+///     writer.upsert(row2)
+///     await writer.flush()
+///
+///     # Per-record ack — call wait()
+///     handle = writer.upsert(critical_row)
+///     await handle.wait()
+#[pyclass]
+pub struct WriteResultHandle {
+    inner: Mutex<Option<fcore::client::WriteResultFuture>>, // None after wait() takes it
+}
+
+impl WriteResultHandle {
+    /// Wrap a pending write future so that `wait()` can take it out later.
+    pub fn new(future: fcore::client::WriteResultFuture) -> Self {
+        let inner = Mutex::new(Some(future));
+        Self { inner }
+    }
+}
+
+#[pymethods]
+impl WriteResultHandle {
+    /// Wait for server acknowledgment of this specific write.
+    ///
+    /// Returns:
+    ///     None on success; raises FlussError on failure or if already consumed.
+    pub fn wait<'py>(&self, py: Python<'py>) -> PyResult<Bound<'py, PyAny>> {
+        let pending = match self.inner.lock() {
+            Ok(mut slot) => slot.take(),
+            Err(e) => return Err(FlussError::new_err(format!("Lock poisoned: {e}"))),
+        };
+        let Some(write_future) = pending else {
+            return Err(FlussError::new_err("WriteResultHandle already consumed"));
+        };
+        future_into_py(py, async move {
+            let acked = write_future.await;
+            acked.map(|_| ()).map_err(|e| FlussError::from_core_error(&e))
+        })
+    }
+
+    fn __repr__(&self) -> String {
+        // A poisoned lock falls through to the "pending" label.
+        let state = match self.inner.lock() {
+            Ok(slot) if slot.is_none() => "consumed",
+            _ => "pending",
+        };
+        format!("WriteResultHandle({state})")
+    }
+}
diff --git a/fluss-rust/bindings/python/test/conftest.py b/fluss-rust/bindings/python/test/conftest.py
new file mode 100644
index 0000000000..8b2bc732b9
--- /dev/null
+++ b/fluss-rust/bindings/python/test/conftest.py
@@ -0,0 +1,182 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import asyncio
+import json
+import os
+import subprocess
+import tempfile
+import time
+from pathlib import Path
+
+import pytest
+import pytest_asyncio
+from filelock import FileLock
+
+import fluss
+
+CLUSTER_NAME = "shared-test"
+
+
+def _find_cli_binary():
+    env_bin = os.environ.get("FLUSS_TEST_CLUSTER_BIN")
+    if env_bin:
+        if os.path.isfile(env_bin):
+            return env_bin
+        raise FileNotFoundError(f"FLUSS_TEST_CLUSTER_BIN={env_bin!r} does not exist")
+    result = subprocess.run(
+        ["cargo", "locate-project", "--workspace", "--message-format", "plain"],
+        capture_output=True,
+        text=True,
+    )
+    if result.returncode == 0:
+        root = Path(result.stdout.strip()).parent
+        for profile in ("debug", "release"):
+            bin_path = root / "target" / profile / "fluss-test-cluster"
+            if bin_path.is_file():
+                return str(bin_path)
+    raise FileNotFoundError(
+        "fluss-test-cluster not found. Run: cargo build -p fluss-test-cluster"
+    )
+
+
+def _start_cluster():
+    lock = Path(tempfile.gettempdir()) / f"fluss-{CLUSTER_NAME}.lock"
+    with FileLock(lock):
+        cli = _find_cli_binary()
+        result = subprocess.run(
+            [cli, "start", "--sasl", "--name", CLUSTER_NAME],
+            capture_output=True,
+            text=True,
+        )
+        if result.returncode != 0:
+            raise RuntimeError(
+                f"fluss-test-cluster start failed:\n{result.stderr}\n{result.stdout}"
+            )
+        prefix = "CLUSTER_JSON: "
+        for line in result.stdout.strip().split("\n"):
+            if line.startswith(prefix):
+                info = json.loads(line[len(prefix) :])
+                return info["bootstrap_servers"], info.get("sasl_bootstrap_servers")
+        raise RuntimeError(
+            f"No CLUSTER_JSON token in output:\n{result.stdout}\n{result.stderr}"
+        )
+
+
+def _stop_cluster():
+    try:
+        cli = _find_cli_binary()
+    except FileNotFoundError:
+        return
+    subprocess.run([cli, "stop", "--name", CLUSTER_NAME], capture_output=True)
+
+
+async def _connect(bootstrap_servers):
+    config = fluss.Config({"bootstrap.servers": bootstrap_servers})
+    start = time.time()
+    last_err = None
+    while time.time() - start < 60:
+        try:
+            conn = await fluss.FlussConnection.create(config)
+            admin = conn.get_admin()
+            nodes = await admin.get_server_nodes()
+            if any(n.server_type == "TabletServer" for n in nodes):
+                return conn
+            await conn.close()
+            last_err = RuntimeError("No TabletServer available yet")
+        except Exception as e:
+            last_err = e
+        await asyncio.sleep(1)
+    raise RuntimeError(f"Could not connect after 60s: {last_err}")
+
+
+def pytest_unconfigure(config):
+    if os.environ.get("FLUSS_BOOTSTRAP_SERVERS"):
+        return
+    if hasattr(config, "workerinput"):
+        return
+    if os.environ.get("FLUSS_SKIP_CLUSTER_TEARDOWN"):
+        return
+    _stop_cluster()
+
+
+@pytest.fixture(scope="session")
+def fluss_cluster():
+    env = os.environ.get("FLUSS_BOOTSTRAP_SERVERS")
+    if env:
+        sasl_env = os.environ.get("FLUSS_SASL_BOOTSTRAP_SERVERS", env)
+        yield (env, sasl_env)
+        return
+
+    plaintext_addr, sasl_addr = _start_cluster()
+    yield (plaintext_addr, sasl_addr or plaintext_addr)
+
+
+@pytest_asyncio.fixture(scope="session")
+async def connection(fluss_cluster):
+    plaintext_addr, _sasl_addr = fluss_cluster
+    conn = await _connect(plaintext_addr)
+    yield conn
+    conn.close()
+
+
+@pytest.fixture(scope="session")
+def sasl_bootstrap_servers(fluss_cluster):
+    _plaintext_addr, sasl_addr = fluss_cluster
+    return sasl_addr
+
+
+@pytest.fixture(scope="session")
+def plaintext_bootstrap_servers(fluss_cluster):
+    plaintext_addr, _sasl_addr = fluss_cluster
+    return plaintext_addr
+
+
+@pytest_asyncio.fixture(scope="session")
+async def admin(connection):
+    return connection.get_admin()  # admin client obtained from the session-wide connection
+
+
+@pytest_asyncio.fixture
+async def wait_for_table_ready(admin):
+    """
+    Fixture that returns a helper function to wait for a table or partition to be ready.
+    """
+    async def _wait(table_path, timeout=15, partition_name=None):
+        start_time = time.monotonic()
+        while time.monotonic() - start_time < timeout:
+            try:
+                if partition_name:
+                    await admin.list_partition_offsets(
+                        table_path, partition_name, [0], fluss.OffsetSpec.earliest()
+                    )
+                else:
+                    await admin.list_offsets(table_path, [0], fluss.OffsetSpec.earliest())
+                return
+            except (fluss.FlussError, Exception) as e:
+                # Catch "No leader found" or other errors that indicate the table/partition is still initializing
+                err_msg = str(e)
+                if any(msg in err_msg for msg in ["No leader found", "Table not ready", "Metadata not ready", "not leader or follower"]):
+                    await asyncio.sleep(1)
+                    continue
+                raise
+        raise TimeoutError(
+            f"Table/Partition {table_path} ({partition_name or 'standard'}) "
+            f"did not become ready within {timeout}s"
+        )
+
+    return _wait
diff --git a/fluss-rust/bindings/python/test/test_admin.py b/fluss-rust/bindings/python/test/test_admin.py
new file mode 100644
index 0000000000..646248d8d4
--- /dev/null
+++ b/fluss-rust/bindings/python/test/test_admin.py
@@ -0,0 +1,319 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""Integration tests for FlussAdmin operations.
+
+Mirrors the Rust integration tests in crates/fluss/tests/integration/admin.rs.
+"""
+
+import pyarrow as pa
+import pytest
+
+import fluss
+
+
+async def test_create_database(admin):
+    """Test the database lifecycle: create, database_exists, get_database_info, and drop."""
+    db_name = "py_test_create_database"
+
+    # Cleanup in case a prior failed run left the database behind
+    await admin.drop_database(db_name, ignore_if_not_exists=True, cascade=True)
+
+    assert not await admin.database_exists(db_name)
+
+    db_descriptor = fluss.DatabaseDescriptor(
+        comment="test_db",
+        custom_properties={"k1": "v1", "k2": "v2"},
+    )
+    await admin.create_database(db_name, db_descriptor, ignore_if_exists=False)  # strict create
+
+    assert await admin.database_exists(db_name)
+
+    db_info = await admin.get_database_info(db_name)
+    assert db_info.database_name == db_name
+
+    descriptor = db_info.get_database_descriptor()  # round-trip: must equal what was written above
+    assert descriptor.comment == "test_db"
+    assert descriptor.get_custom_properties() == {"k1": "v1", "k2": "v2"}
+
+    await admin.drop_database(db_name, ignore_if_not_exists=False, cascade=True)  # strict drop
+
+    assert not await admin.database_exists(db_name)
+
+
+async def test_create_table(admin):
+    """Test the table lifecycle: create, table_exists, list_tables, get_table_info, and drop."""
+    db_name = "py_test_create_table_db"
+
+    await admin.drop_database(db_name, ignore_if_not_exists=True, cascade=True)  # clear leftovers
+
+    assert not await admin.database_exists(db_name)
+    await admin.create_database(
+        db_name,
+        fluss.DatabaseDescriptor(comment="Database for test_create_table"),
+        ignore_if_exists=False,
+    )
+
+    table_name = "test_user_table"
+    table_path = fluss.TablePath(db_name, table_name)
+
+    schema = fluss.Schema(
+        pa.schema(
+            [
+                pa.field("id", pa.int32()),
+                pa.field("name", pa.string()),
+                pa.field("age", pa.int32()),
+                pa.field("email", pa.string()),
+            ]
+        ),
+        primary_keys=["id"],
+    )
+    assert schema.get_primary_keys() == ["id"]  # client-side check before anything is sent
+
+    table_descriptor = fluss.TableDescriptor(
+        schema,
+        bucket_count=3,
+        bucket_keys=["id"],
+        comment="Test table for user data (id, name, age, email)",
+        log_format="arrow",
+        kv_format="indexed",
+        properties={"table.replication.factor": "1"},
+    )
+
+    await admin.create_table(table_path, table_descriptor, ignore_if_exists=False)
+
+    assert await admin.table_exists(table_path)
+
+    tables = await admin.list_tables(db_name)
+    assert len(tables) == 1  # the database was dropped and re-created above, so this is its only table
+    assert table_name in tables
+
+    table_info = await admin.get_table_info(table_path)
+
+    assert table_info.comment == "Test table for user data (id, name, age, email)"
+    assert table_info.get_primary_keys() == ["id"]
+    assert table_info.num_buckets == 3
+    assert table_info.get_bucket_keys() == ["id"]
+    assert table_info.get_column_names() == ["id", "name", "age", "email"]  # declaration order
+
+    await admin.drop_table(table_path, ignore_if_not_exists=False)
+    assert not await admin.table_exists(table_path)
+
+    await admin.drop_database(db_name, ignore_if_not_exists=False, cascade=True)
+    assert not await admin.database_exists(db_name)
+
+
+async def test_partition_apis(admin):
+    """Test create_partition, list_partition_infos, and drop_partition on a two-key partitioned table."""
+    db_name = "py_test_partition_apis_db"
+
+    await admin.drop_database(db_name, ignore_if_not_exists=True, cascade=True)
+    await admin.create_database(
+        db_name,
+        fluss.DatabaseDescriptor(comment="Database for test_partition_apis"),
+        ignore_if_exists=True,
+    )
+
+    table_path = fluss.TablePath(db_name, "partitioned_table")
+
+    schema = fluss.Schema(
+        pa.schema(
+            [
+                pa.field("id", pa.int32()),
+                pa.field("name", pa.string()),
+                pa.field("dt", pa.string()),
+                pa.field("region", pa.string()),
+            ]
+        ),
+        primary_keys=["id", "dt", "region"],
+    )
+
+    table_descriptor = fluss.TableDescriptor(
+        schema,
+        partition_keys=["dt", "region"],
+        bucket_count=3,
+        bucket_keys=["id"],
+        log_format="arrow",
+        kv_format="compacted",
+        properties={"table.replication.factor": "1"},
+    )
+
+    await admin.create_table(table_path, table_descriptor, ignore_if_exists=True)
+
+    # A freshly created table is expected to have no partitions
+    partitions = await admin.list_partition_infos(table_path)
+    assert len(partitions) == 0
+
+    # Create a partition, giving one value per partition key
+    await admin.create_partition(
+        table_path,
+        {"dt": "2024-01-15", "region": "EMEA"},
+        ignore_if_exists=False,
+    )
+
+    partitions = await admin.list_partition_infos(table_path)
+    assert len(partitions) == 1
+    assert partitions[0].partition_name == "2024-01-15$EMEA"  # dt and region values joined by "$"
+
+    # Drop the partition, addressed by the same key/value spec
+    await admin.drop_partition(
+        table_path,
+        {"dt": "2024-01-15", "region": "EMEA"},
+        ignore_if_not_exists=False,
+    )
+
+    partitions = await admin.list_partition_infos(table_path)
+    assert len(partitions) == 0
+
+    await admin.drop_table(table_path, ignore_if_not_exists=True)
+    await admin.drop_database(db_name, ignore_if_not_exists=True, cascade=True)
+
+
+async def test_fluss_error_response(admin):
+    """get_table_info on a missing table must raise FlussError carrying TABLE_NOT_EXIST."""
+    missing_table = fluss.TablePath("fluss", "py_not_exist")
+    with pytest.raises(fluss.FlussError) as raised:
+        await admin.get_table_info(missing_table)
+
+    # The error code, not only the exception type, is what this test pins down.
+    assert raised.value.error_code == fluss.ErrorCode.TABLE_NOT_EXIST
+
+
+async def test_error_database_not_exist(admin):
+    """get_database_info, strict drop_database and list_tables all report DATABASE_NOT_EXIST."""
+    absent_db = "py_no_such_db"
+    expected_code = fluss.ErrorCode.DATABASE_NOT_EXIST
+
+    with pytest.raises(fluss.FlussError) as info_err:
+        await admin.get_database_info(absent_db)
+    assert info_err.value.error_code == expected_code
+
+    with pytest.raises(fluss.FlussError) as drop_err:
+        await admin.drop_database(absent_db, ignore_if_not_exists=False)
+    assert drop_err.value.error_code == expected_code
+
+    with pytest.raises(fluss.FlussError) as list_err:
+        await admin.list_tables(absent_db)
+    assert list_err.value.error_code == expected_code
+
+
+async def test_error_database_already_exist(admin):
+    """A duplicate create_database fails with DATABASE_ALREADY_EXIST unless ignore_if_exists=True."""
+    duplicate_db = "py_test_error_db_already_exist"
+
+    # Start from a clean slate, then create the database once.
+    await admin.drop_database(duplicate_db, ignore_if_not_exists=True, cascade=True)
+    await admin.create_database(duplicate_db, ignore_if_exists=False)
+
+    with pytest.raises(fluss.FlussError) as raised:  # a second strict create must be rejected
+        await admin.create_database(duplicate_db, ignore_if_exists=False)
+    assert raised.value.error_code == fluss.ErrorCode.DATABASE_ALREADY_EXIST
+
+    # With ignore_if_exists=True the same call must complete without raising.
+    await admin.create_database(duplicate_db, ignore_if_exists=True)
+
+    await admin.drop_database(duplicate_db, ignore_if_not_exists=True, cascade=True)
+
+
+async def test_error_table_already_exist(admin):
+    """Test that re-creating an existing table fails with TABLE_ALREADY_EXIST unless ignored."""
+    db_name = "py_test_error_tbl_already_exist_db"
+
+    await admin.drop_database(db_name, ignore_if_not_exists=True, cascade=True)
+    await admin.create_database(db_name, ignore_if_exists=True)
+
+    table_path = fluss.TablePath(db_name, "my_table")
+    schema = fluss.Schema(
+        pa.schema([pa.field("id", pa.int32()), pa.field("name", pa.string())])
+    )
+    table_descriptor = fluss.TableDescriptor(
+        schema,
+        bucket_count=1,
+        properties={"table.replication.factor": "1"},
+    )
+
+    await admin.create_table(table_path, table_descriptor, ignore_if_exists=False)
+
+    # Create the same table again with ignore_if_exists=False: must be rejected
+    with pytest.raises(fluss.FlussError) as exc_info:
+        await admin.create_table(table_path, table_descriptor, ignore_if_exists=False)
+    assert exc_info.value.error_code == fluss.ErrorCode.TABLE_ALREADY_EXIST
+
+    # With ignore_if_exists=True the same call should complete without raising
+    await admin.create_table(table_path, table_descriptor, ignore_if_exists=True)
+
+    await admin.drop_table(table_path, ignore_if_not_exists=True)
+    await admin.drop_database(db_name, ignore_if_not_exists=True, cascade=True)
+
+
+async def test_error_table_not_exist(admin):
+    """Dropping a missing table raises TABLE_NOT_EXIST unless ignore_if_not_exists=True."""
+    absent_table = fluss.TablePath("fluss", "py_no_such_table")
+
+    # Strict drop: the missing table must be reported as an error.
+    with pytest.raises(fluss.FlussError) as raised:
+        await admin.drop_table(absent_table, ignore_if_not_exists=False)
+    assert raised.value.error_code == fluss.ErrorCode.TABLE_NOT_EXIST
+
+    # Lenient drop: the same call must complete without raising.
+    await admin.drop_table(absent_table, ignore_if_not_exists=True)
+
+
+async def test_get_server_nodes(admin):
+    """get_server_nodes must list a coordinator, a tablet server, and well-formed node fields."""
+    cluster_nodes = await admin.get_server_nodes()
+    assert len(cluster_nodes) > 0, "Expected at least one server node"
+
+    # Both server roles have to appear among the reported server_type values.
+    reported_types = [entry.server_type for entry in cluster_nodes]
+    assert "CoordinatorServer" in reported_types, "Expected a coordinator server"
+    assert "TabletServer" in reported_types, "Expected at least one tablet server"
+
+    for server in cluster_nodes:
+        assert server.host, "Server node host should not be empty"
+        assert server.port > 0, "Server node port should be > 0"
+        assert server.uid, "Server node uid should not be empty"
+        assert repr(server).startswith("ServerNode(")
+
+
+async def test_error_table_not_partitioned(admin):
+    """Test that list_partition_infos on a table created without partition_keys raises an error."""
+    db_name = "py_test_error_not_partitioned_db"
+
+    await admin.drop_database(db_name, ignore_if_not_exists=True, cascade=True)
+    await admin.create_database(db_name, ignore_if_exists=True)
+
+    table_path = fluss.TablePath(db_name, "non_partitioned_table")
+    schema = fluss.Schema(
+        pa.schema([pa.field("id", pa.int32()), pa.field("name", pa.string())])
+    )
+    table_descriptor = fluss.TableDescriptor(  # note: no partition_keys argument is passed
+        schema,
+        bucket_count=1,
+        properties={"table.replication.factor": "1"},
+    )
+
+    await admin.create_table(table_path, table_descriptor, ignore_if_exists=False)
+
+    with pytest.raises(fluss.FlussError) as exc_info:
+        await admin.list_partition_infos(table_path)
+    assert (
+        exc_info.value.error_code == fluss.ErrorCode.TABLE_NOT_PARTITIONED_EXCEPTION
+    )
+
+    await admin.drop_table(table_path, ignore_if_not_exists=True)
+    await admin.drop_database(db_name, ignore_if_not_exists=True, cascade=True)
diff --git a/fluss-rust/bindings/python/test/test_context_manager.py b/fluss-rust/bindings/python/test/test_context_manager.py
new file mode 100644
index 0000000000..5dcb5a4c31
--- /dev/null
+++ b/fluss-rust/bindings/python/test/test_context_manager.py
@@ -0,0 +1,116 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import pytest
+import pyarrow as pa
+import time
+import fluss
+
+async def _poll_records(scanner, expected_count, timeout_s=10):
+    """Accumulate ``scanner.poll(5000)`` batches until expected_count records arrive or timeout_s elapses."""
+    gathered, deadline = [], time.monotonic() + timeout_s
+    while True:
+        if len(gathered) >= expected_count or time.monotonic() >= deadline:
+            return gathered  # may hold fewer than expected_count on timeout; callers assert on the length
+        batch = await scanner.poll(5000)
+        gathered.extend(batch)
+
+@pytest.mark.asyncio
+async def test_connection_context_manager(plaintext_bootstrap_servers):
+    config = fluss.Config({"bootstrap.servers": plaintext_bootstrap_servers})
+    async with await fluss.FlussConnection.create(config) as conn:  # create() is awaited first, then entered
+        admin = conn.get_admin()
+        nodes = await admin.get_server_nodes()
+        assert len(nodes) > 0  # the connection must be usable inside the async-with block
+
+
+@pytest.mark.asyncio
+async def test_append_writer_success_flush(connection, admin):
+    """Rows appended inside ``async with ... create_writer()`` must be readable after the block exits."""
+    table_path = fluss.TablePath("fluss", "test_append_ctx_success")
+    await admin.drop_table(table_path, ignore_if_not_exists=True)
+
+    schema = fluss.Schema(pa.schema([pa.field("a", pa.int32())]))
+    await admin.create_table(table_path, fluss.TableDescriptor(schema))
+
+    table = await connection.get_table(table_path)
+    async with table.new_append().create_writer() as writer:
+        writer.append({"a": 1})
+        writer.append({"a": 2})
+        # No explicit flush here
+
+    # After context exit, data should be flushed
+    scanner = await table.new_scan().create_log_scanner()
+    scanner.subscribe(0, fluss.EARLIEST_OFFSET)
+    records = await _poll_records(scanner, expected_count=2)
+    assert len(records) == 2
+    assert sorted([r.row["a"] for r in records]) == [1, 2]
+
+@pytest.mark.asyncio
+async def test_connection_drain_on_close(plaintext_bootstrap_servers, admin):
+    """A row appended without any flush must be readable after its connection's async-with block exits."""
+    table_path = fluss.TablePath("fluss", "test_conn_drain")
+    await admin.drop_table(table_path, ignore_if_not_exists=True)
+    schema = fluss.Schema(pa.schema([pa.field("a", pa.int32())]))
+    await admin.create_table(table_path, fluss.TableDescriptor(schema))
+
+    config = fluss.Config({"bootstrap.servers": plaintext_bootstrap_servers})
+    async with await fluss.FlussConnection.create(config) as conn:
+        table = await conn.get_table(table_path)
+        writer = table.new_append().create_writer()
+        writer.append({"a": 123})
+        # No explicit flush, no writer context exit: rely on connection.__aexit__ -> close() to drain.
+
+    # Re-connect with a new connection to verify data arrived
+    async with await fluss.FlussConnection.create(config) as conn2:
+        table2 = await conn2.get_table(table_path)
+        scanner = await table2.new_scan().create_log_scanner()
+        scanner.subscribe(0, fluss.EARLIEST_OFFSET)
+        records = await _poll_records(scanner, expected_count=1)
+        assert len(records) == 1
+        assert records[0].row["a"] == 123
+
+@pytest.mark.asyncio
+async def test_upsert_writer_context_manager(connection, admin):
+    """A row upserted inside ``async with ... create_writer()`` must be found by lookup after the block."""
+    table_path = fluss.TablePath("fluss", "test_upsert_ctx")
+    await admin.drop_table(table_path, ignore_if_not_exists=True)
+
+    schema = fluss.Schema(pa.schema([pa.field("id", pa.int32()), pa.field("v", pa.string())]), primary_keys=["id"])
+    await admin.create_table(table_path, fluss.TableDescriptor(schema))
+
+    table = await connection.get_table(table_path)
+    # Success path: no explicit flush; leaving the block is expected to flush
+    async with table.new_upsert().create_writer() as writer:
+        writer.upsert({"id": 1, "v": "a"})
+
+    lookuper = table.new_lookup().create_lookuper()
+    res = await lookuper.lookup({"id": 1})
+    assert res is not None
+    assert res["v"] == "a"
+    
+@pytest.mark.asyncio
+async def test_connection_context_manager_exception(plaintext_bootstrap_servers):
+    """An exception raised inside the connection's ``async with`` block must propagate to the caller."""
+    config = fluss.Config({"bootstrap.servers": plaintext_bootstrap_servers})
+    class TestException(Exception):
+        pass
+
+    # pytest.raises fails the test if __aexit__ swallows the error (a try/except/pass would still pass);
+    with pytest.raises(TestException):  # reaching past this block also shows __aexit__ did not hang
+        async with await fluss.FlussConnection.create(config):
+            raise TestException("connection error")
\ No newline at end of file
diff --git a/fluss-rust/bindings/python/test/test_kv_table.py b/fluss-rust/bindings/python/test/test_kv_table.py
new file mode 100644
index 0000000000..f3cddf8c3d
--- /dev/null
+++ b/fluss-rust/bindings/python/test/test_kv_table.py
@@ -0,0 +1,720 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""Integration tests for KV (primary key) table operations.
+
+Mirrors the Rust integration tests in crates/fluss/tests/integration/kv_table.rs.
+"""
+
+import math
+from datetime import date, datetime, timezone
+from datetime import time as dt_time
+from decimal import Decimal
+
+import pyarrow as pa
+import pytest
+
+import fluss
+
+
+async def _upsert_and_wait(writer, row):
+    """Upsert ``row`` through ``writer`` and await the returned handle's ``wait()``."""
+    await writer.upsert(row).wait()
+
+
+def _assert_float_specials(values):
+    """Assert that ``values`` starts with NaN, +infinity and -infinity, in that order."""
+    assert math.isnan(values[0])
+    assert math.isinf(values[1]) and values[1] > 0 and math.isinf(values[2]) and values[2] < 0
+
+
+async def test_upsert_delete_and_lookup(connection, admin):
+    """Test upsert + flush, lookup, update, delete, and lookup of a key that was never written."""
+    table_path = fluss.TablePath("fluss", "py_test_upsert_and_lookup")
+    await admin.drop_table(table_path, ignore_if_not_exists=True)
+
+    schema = fluss.Schema(
+        pa.schema(
+            [
+                pa.field("id", pa.int32()),
+                pa.field("name", pa.string()),
+                pa.field("age", pa.int64()),
+            ]
+        ),
+        primary_keys=["id"],
+    )
+    table_descriptor = fluss.TableDescriptor(schema)
+    await admin.create_table(table_path, table_descriptor, ignore_if_exists=False)
+
+    table = await connection.get_table(table_path)
+    upsert_writer = table.new_upsert().create_writer()
+
+    test_data = [(1, "Verso", 32), (2, "Noco", 25), (3, "Esquie", 35)]  # (id, name, age)
+
+    # Upsert rows without awaiting the per-row handles, then flush once
+    for id_, name, age in test_data:
+        upsert_writer.upsert({"id": id_, "name": name, "age": age})
+    await upsert_writer.flush()
+
+    # Lookup by primary key and verify every column
+    lookuper = table.new_lookup().create_lookuper()
+
+    for id_, expected_name, expected_age in test_data:
+        result = await lookuper.lookup({"id": id_})
+        assert result is not None, f"Row with id={id_} should exist"
+        assert result["id"] == id_
+        assert result["name"] == expected_name
+        assert result["age"] == expected_age
+
+    # Update record with id=1 (await the handle instead of flushing)
+    handle = upsert_writer.upsert({"id": 1, "name": "Verso", "age": 33})
+    await handle.wait()
+
+    result = await lookuper.lookup({"id": 1})
+    assert result is not None
+    assert result["age"] == 33
+    assert result["name"] == "Verso"
+
+    # Delete record with id=1 (await the handle)
+    handle = upsert_writer.delete({"id": 1})
+    await handle.wait()
+
+    result = await lookuper.lookup({"id": 1})
+    assert result is None, "Record 1 should not exist after delete"
+
+    # Verify the delete did not affect the other records
+    for id_ in [2, 3]:
+        result = await lookuper.lookup({"id": id_})
+        assert result is not None, f"Record {id_} should still exist"
+
+    # Lookup a key that was never written
+    result = await lookuper.lookup({"id": 999})
+    assert result is None, "Non-existent key should return None"
+
+    await admin.drop_table(table_path, ignore_if_not_exists=False)
+
+
+async def test_composite_primary_keys(connection, admin):
+    """Test upsert and full-key lookup with a three-column PK, plus prefix lookup on the bucket keys."""
+    table_path = fluss.TablePath("fluss", "py_test_composite_pk")
+    await admin.drop_table(table_path, ignore_if_not_exists=True)
+
+    # PK columns intentionally interleaved with non-PK column to verify
+    # that lookup correctly handles non-contiguous primary key indices.
+    schema = fluss.Schema(
+        pa.schema(
+            [
+                pa.field("region", pa.string()),
+                pa.field("score", pa.int64()),
+                pa.field("user_id", pa.int32()),
+                pa.field("event_id", pa.int64()),
+            ]
+        ),
+        primary_keys=["region", "user_id", "event_id"],
+    )
+    table_descriptor = fluss.TableDescriptor(
+        schema, bucket_count=3, bucket_keys=["region", "user_id"]
+    )
+    await admin.create_table(table_path, table_descriptor, ignore_if_exists=False)
+
+    table = await connection.get_table(table_path)
+    upsert_writer = table.new_upsert().create_writer()
+
+    test_data = [  # tuples are (region, user_id, event_id, score), not schema column order
+        ("US", 1, 1, 100),
+        ("US", 1, 2, 200),
+        ("US", 2, 1, 300),
+        ("EU", 1, 1, 150),
+        ("EU", 2, 1, 250),
+    ]
+
+    for region, user_id, event_id, score in test_data:
+        upsert_writer.upsert(
+            {
+                "region": region,
+                "user_id": user_id,
+                "event_id": event_id,
+                "score": score,
+            }
+        )
+    await upsert_writer.flush()
+
+    lookuper = table.new_lookup().create_lookuper()
+
+    # Lookup (US, 1, 1) -> score 100
+    result = await lookuper.lookup({"region": "US", "user_id": 1, "event_id": 1})
+    assert result is not None
+    assert result["score"] == 100
+
+    # Lookup (EU, 2, 1) -> score 250
+    result = await lookuper.lookup({"region": "EU", "user_id": 2, "event_id": 1})
+    assert result is not None
+    assert result["score"] == 250
+
+    # Update (US, 1, 1) score (await acknowledgment)
+    handle = upsert_writer.upsert(
+        {"region": "US", "user_id": 1, "event_id": 1, "score": 500}
+    )
+    await handle.wait()
+
+    result = await lookuper.lookup({"region": "US", "user_id": 1, "event_id": 1})
+    assert result is not None
+    assert result["score"] == 500
+
+    prefix_lookuper = table.new_lookup().lookup_by(["region", "user_id"]).create_lookuper()
+
+    # Prefix (US, 1) should match 2 rows (event_id 1 and 2); the result is a list of rows
+    rows = await prefix_lookuper.lookup({"region": "US", "user_id": 1})
+    assert len(rows) == 2
+    event_ids = sorted(row["event_id"] for row in rows)
+    assert event_ids == [1, 2]
+
+    # Also validate positional list/tuple prefix input (values in lookup_by order)
+    rows = await prefix_lookuper.lookup(["US", 1])
+    assert len(rows) == 2
+    rows = await prefix_lookuper.lookup(("EU", 2))
+    assert len(rows) == 1
+    assert rows[0]["event_id"] == 1
+
+    # Validate empty-result case: valid prefix shape but no matching rows.
+    rows = await prefix_lookuper.lookup({"region": "APAC", "user_id": 999})
+    assert rows == []
+
+    await admin.drop_table(table_path, ignore_if_not_exists=False)
+
+
+async def test_partial_update(connection, admin):
+    """Test that a writer built with partial_update_by_name leaves unlisted columns unchanged."""
+    table_path = fluss.TablePath("fluss", "py_test_partial_update")
+    await admin.drop_table(table_path, ignore_if_not_exists=True)
+
+    schema = fluss.Schema(
+        pa.schema(
+            [
+                pa.field("id", pa.int32()),
+                pa.field("name", pa.string()),
+                pa.field("age", pa.int64()),
+                pa.field("score", pa.int64()),
+            ]
+        ),
+        primary_keys=["id"],
+    )
+    table_descriptor = fluss.TableDescriptor(schema)
+    await admin.create_table(table_path, table_descriptor, ignore_if_exists=False)
+
+    table = await connection.get_table(table_path)
+
+    # Insert the initial full record with a regular (all-column) writer
+    upsert_writer = table.new_upsert().create_writer()
+    handle = upsert_writer.upsert(
+        {"id": 1, "name": "Verso", "age": 32, "score": 6942}
+    )
+    await handle.wait()
+
+    lookuper = table.new_lookup().create_lookuper()
+    result = await lookuper.lookup({"id": 1})
+    assert result is not None
+    assert result["id"] == 1
+    assert result["name"] == "Verso"
+    assert result["age"] == 32
+    assert result["score"] == 6942
+
+    # Partial update: the writer covers id (the primary key) and score; only score gets a new value
+    partial_writer = (
+        table.new_upsert().partial_update_by_name(["id", "score"]).create_writer()
+    )
+    handle = partial_writer.upsert({"id": 1, "score": 420})
+    await handle.wait()
+
+    result = await lookuper.lookup({"id": 1})
+    assert result is not None
+    assert result["id"] == 1
+    assert result["name"] == "Verso", "name should remain unchanged"
+    assert result["age"] == 32, "age should remain unchanged"
+    assert result["score"] == 420, "score should be updated to 420"
+
+    await admin.drop_table(table_path, ignore_if_not_exists=False)
+
+
+async def test_partial_update_by_index(connection, admin):
+    """Test that a writer built with partial_update_by_index leaves unlisted columns unchanged."""
+    table_path = fluss.TablePath("fluss", "py_test_partial_update_by_index")
+    await admin.drop_table(table_path, ignore_if_not_exists=True)
+
+    schema = fluss.Schema(
+        pa.schema(
+            [
+                pa.field("id", pa.int32()),
+                pa.field("name", pa.string()),
+                pa.field("age", pa.int64()),
+                pa.field("score", pa.int64()),
+            ]
+        ),
+        primary_keys=["id"],
+    )
+    table_descriptor = fluss.TableDescriptor(schema)
+    await admin.create_table(table_path, table_descriptor, ignore_if_exists=False)
+
+    table = await connection.get_table(table_path)
+
+    upsert_writer = table.new_upsert().create_writer()
+    handle = upsert_writer.upsert(
+        {"id": 1, "name": "Verso", "age": 32, "score": 6942}
+    )
+    await handle.wait()
+
+    # Partial update by schema column indices: 0=id (PK), 1=name
+    partial_writer = (
+        table.new_upsert().partial_update_by_index([0, 1]).create_writer()
+    )
+    handle = partial_writer.upsert([1, "Verso Renamed"])  # positional row: [id, name]
+    await handle.wait()
+
+    lookuper = table.new_lookup().create_lookuper()
+    result = await lookuper.lookup({"id": 1})
+    assert result is not None
+    assert result["name"] == "Verso Renamed", "name should be updated"
+    assert result["score"] == 6942, "score should remain unchanged"
+
+    await admin.drop_table(table_path, ignore_if_not_exists=False)
+
+
+async def test_partitioned_table_upsert_and_lookup(connection, admin):
+    """Test upsert, lookup, update, and delete on a KV table partitioned by ``region``."""
+    table_path = fluss.TablePath("fluss", "py_test_partitioned_kv_table")
+    await admin.drop_table(table_path, ignore_if_not_exists=True)
+
+    schema = fluss.Schema(
+        pa.schema(
+            [
+                pa.field("region", pa.string()),
+                pa.field("user_id", pa.int32()),
+                pa.field("name", pa.string()),
+                pa.field("score", pa.int64()),
+            ]
+        ),
+        primary_keys=["region", "user_id"],
+    )
+    table_descriptor = fluss.TableDescriptor(
+        schema,
+        partition_keys=["region"],
+    )
+    await admin.create_table(table_path, table_descriptor, ignore_if_exists=False)
+
+    # Create one partition per region value before any row is written
+    for region in ["US", "EU", "APAC"]:
+        await admin.create_partition(
+            table_path, {"region": region}, ignore_if_exists=True
+        )
+
+    table = await connection.get_table(table_path)
+    upsert_writer = table.new_upsert().create_writer()
+
+    test_data = [  # tuples are (region, user_id, name, score)
+        ("US", 1, "Gustave", 100),
+        ("US", 2, "Lune", 200),
+        ("EU", 1, "Sciel", 150),
+        ("EU", 2, "Maelle", 250),
+        ("APAC", 1, "Noco", 300),
+    ]
+
+    for region, user_id, name, score in test_data:
+        upsert_writer.upsert(
+            {"region": region, "user_id": user_id, "name": name, "score": score}
+        )
+    await upsert_writer.flush()
+
+    lookuper = table.new_lookup().create_lookuper()
+
+    # Verify all rows across partitions
+    for region, user_id, expected_name, expected_score in test_data:
+        result = await lookuper.lookup({"region": region, "user_id": user_id})
+        assert result is not None, f"Row ({region}, {user_id}) should exist"
+        assert result["region"] == region
+        assert result["user_id"] == user_id
+        assert result["name"] == expected_name
+        assert result["score"] == expected_score
+
+    # Update within a partition (await acknowledgment)
+    handle = upsert_writer.upsert(
+        {"region": "US", "user_id": 1, "name": "Gustave Updated", "score": 999}
+    )
+    await handle.wait()
+
+    result = await lookuper.lookup({"region": "US", "user_id": 1})
+    assert result is not None
+    assert result["name"] == "Gustave Updated"
+    assert result["score"] == 999
+
+    # Lookup in a partition that was never created should return None rather than raise
+    result = await lookuper.lookup({"region": "UNKNOWN_REGION", "user_id": 1})
+    assert result is None, "Lookup in non-existent partition should return None"
+
+    # Delete within a partition (await acknowledgment)
+    handle = upsert_writer.delete({"region": "EU", "user_id": 1})
+    await handle.wait()
+
+    result = await lookuper.lookup({"region": "EU", "user_id": 1})
+    assert result is None, "Deleted record should not exist"
+
+    # Verify the sibling record in the same partition still exists
+    result = await lookuper.lookup({"region": "EU", "user_id": 2})
+    assert result is not None
+    assert result["name"] == "Maelle"
+
+    await admin.drop_table(table_path, ignore_if_not_exists=False)
+
+
+async def test_upsert_and_lookup_with_array(connection, admin):
+    """Test that flat and nested list columns, including None at every level, round-trip via upsert/lookup."""
+    table_path = fluss.TablePath("fluss", "py_test_kv_arrays")
+    await admin.drop_table(table_path, ignore_if_not_exists=True)
+
+    schema = fluss.Schema(
+        pa.schema(
+            [
+                pa.field("id", pa.int32()),
+                pa.field("tags", pa.list_(pa.string())),
+                pa.field("scores", pa.list_(pa.int32())),
+                pa.field("matrix", pa.list_(pa.list_(pa.int32()))),
+            ]
+        ),
+        primary_keys=["id"],
+    )
+    table_descriptor = fluss.TableDescriptor(schema)
+    await admin.create_table(table_path, table_descriptor, ignore_if_exists=False)
+
+    table = await connection.get_table(table_path)
+    upsert_writer = table.new_upsert().create_writer()
+
+    await _upsert_and_wait(  # id=1: fully populated flat and nested lists
+        upsert_writer,
+        {
+            "id": 1,
+            "tags": ["hello", "world"],
+            "scores": [10, 20, 30],
+            "matrix": [[1, 2], [3, 4]],
+        },
+    )
+    await _upsert_and_wait(  # id=2: list holding a single None, empty list, None list
+        upsert_writer,
+        {"id": 2, "tags": [None], "scores": [], "matrix": None},
+    )
+    await _upsert_and_wait(  # id=3: None list, one-element list, inner lists of differing lengths
+        upsert_writer,
+        {"id": 3, "tags": None, "scores": [42], "matrix": [[], [5], [6, 7, 8]]},
+    )
+    await _upsert_and_wait(  # id=4: None as an inner element and as an inner list
+        upsert_writer,
+        {"id": 4, "tags": None, "scores": None, "matrix": [[1, None], None, []]},
+    )
+
+    lookuper = table.new_lookup().create_lookuper()
+
+    result1 = await lookuper.lookup({"id": 1})
+    assert result1 is not None
+    assert result1["tags"] == ["hello", "world"]
+    assert result1["scores"] == [10, 20, 30]
+    assert result1["matrix"] == [[1, 2], [3, 4]]
+
+    result2 = await lookuper.lookup({"id": 2})
+    assert result2 is not None
+    assert result2["tags"] == [None]
+    assert result2["scores"] == []  # an empty list must stay distinct from None
+    assert result2["matrix"] is None
+
+    result3 = await lookuper.lookup({"id": 3})
+    assert result3 is not None
+    assert result3["tags"] is None
+    assert result3["scores"] == [42]
+    assert result3["matrix"] == [[], [5], [6, 7, 8]]
+
+    result4 = await lookuper.lookup({"id": 4})
+    assert result4 is not None
+    assert result4["tags"] is None
+    assert result4["scores"] is None
+    assert result4["matrix"] == [[1, None], None, []]
+
+    await admin.drop_table(table_path, ignore_if_not_exists=False)
+
+
+async def test_upsert_and_lookup_with_array_rich_types(connection, admin):
+    """Test upsert/lookup for arrays with rich element types and encoding edge cases."""
+    table_path = fluss.TablePath("fluss", "py_test_kv_arrays_rich_types")
+    await admin.drop_table(table_path, ignore_if_not_exists=True)
+
+    schema = fluss.Schema(
+        pa.schema(
+            [
+                pa.field("id", pa.int32()),
+                pa.field("arr_bytes", pa.list_(pa.binary())),
+                pa.field("arr_date", pa.list_(pa.date32())),
+                pa.field("arr_time", pa.list_(pa.time32("ms"))),
+                pa.field("arr_ts_ntz", pa.list_(pa.timestamp("us"))),
+                pa.field("arr_ts_ltz", pa.list_(pa.timestamp("us", tz="UTC"))),
+                pa.field("arr_decimal", pa.list_(pa.decimal128(10, 2))),
+                pa.field("arr_long_str", pa.list_(pa.string())),
+                pa.field("arr_big_decimal", pa.list_(pa.decimal128(22, 5))),
+                pa.field("arr_ts_nano", pa.list_(pa.timestamp("ns"))),
+                pa.field("arr_float", pa.list_(pa.float32())),
+                pa.field("arr_double", pa.list_(pa.float64())),
+                # TODO(fluss-python#524): support PyArrow FixedSizeBinary in schema
+                # conversion. Then switch to pa.binary(4).
+                pa.field("arr_binary", pa.list_(pa.binary())),
+            ]
+        ),
+        primary_keys=["id"],
+    )
+    table_descriptor = fluss.TableDescriptor(schema)
+    await admin.create_table(table_path, table_descriptor, ignore_if_exists=False)
+
+    table = await connection.get_table(table_path)
+    upsert_writer = table.new_upsert().create_writer()
+
+    await _upsert_and_wait(
+        upsert_writer,
+        {
+            "id": 1,
+            "arr_bytes": [b"\x10\x20\x30", None],
+            "arr_date": [date(2026, 1, 23), None],
+            "arr_time": [dt_time(10, 13, 47, 123000), None],
+            "arr_ts_ntz": [datetime(2026, 1, 23, 10, 13, 47, 123000)],
+            "arr_ts_ltz": [
+                datetime(2026, 1, 23, 10, 13, 47, 123000, tzinfo=timezone.utc)
+            ],
+            "arr_decimal": [Decimal("123.45"), None],
+            "arr_long_str": [
+                "abcdefgh",
+                "this is a much longer string that definitely exceeds inline",
+            ],
+            "arr_big_decimal": [
+                Decimal("12345678901234567.12345"),
+                Decimal("-99999999999999999.99999"),
+            ],
+            "arr_ts_nano": [datetime(2026, 1, 23, 10, 13, 47, 123456)],
+            "arr_float": [float("nan"), float("inf"), float("-inf")],
+            "arr_double": [float("nan"), float("inf"), float("-inf")],
+            "arr_binary": [b"\xde\xad\xbe\xef", b"\x00\x01\x02\x03"],
+        },
+    )
+
+    lookuper = table.new_lookup().create_lookuper()
+    result = await lookuper.lookup({"id": 1})
+    assert result is not None
+
+    assert result["arr_bytes"] == [b"\x10\x20\x30", None]
+    assert result["arr_date"] == [date(2026, 1, 23), None]
+    assert result["arr_time"] == [dt_time(10, 13, 47, 123000), None]
+    assert result["arr_ts_ntz"] == [datetime(2026, 1, 23, 10, 13, 47, 123000)]
+    assert result["arr_ts_ltz"] == [
+        datetime(2026, 1, 23, 10, 13, 47, 123000, tzinfo=timezone.utc)
+    ]
+    assert result["arr_decimal"] == [Decimal("123.45"), None]
+    assert result["arr_long_str"] == [
+        "abcdefgh",
+        "this is a much longer string that definitely exceeds inline",
+    ]
+    assert result["arr_big_decimal"] == [
+        Decimal("12345678901234567.12345"),
+        Decimal("-99999999999999999.99999"),
+    ]
+    assert result["arr_ts_nano"] == [datetime(2026, 1, 23, 10, 13, 47, 123456)]
+    _assert_float_specials(result["arr_float"])
+    _assert_float_specials(result["arr_double"])
+    assert result["arr_binary"] == [b"\xde\xad\xbe\xef", b"\x00\x01\x02\x03"]
+
+    await admin.drop_table(table_path, ignore_if_not_exists=False)
+
+
+async def test_all_supported_datatypes(connection, admin):
+    """Test upsert/lookup for all supported data types, including nulls."""
+    table_path = fluss.TablePath("fluss", "py_test_kv_all_datatypes")
+    await admin.drop_table(table_path, ignore_if_not_exists=True)
+
+    schema = fluss.Schema(
+        pa.schema(
+            [
+                pa.field("pk_int", pa.int32()),
+                pa.field("col_boolean", pa.bool_()),
+                pa.field("col_tinyint", pa.int8()),
+                pa.field("col_smallint", pa.int16()),
+                pa.field("col_int", pa.int32()),
+                pa.field("col_bigint", pa.int64()),
+                pa.field("col_float", pa.float32()),
+                pa.field("col_double", pa.float64()),
+                pa.field("col_string", pa.string()),
+                pa.field("col_decimal", pa.decimal128(10, 2)),
+                pa.field("col_date", pa.date32()),
+                pa.field("col_time", pa.time32("ms")),
+                pa.field("col_timestamp_ntz", pa.timestamp("us")),
+                pa.field("col_timestamp_ltz", pa.timestamp("us", tz="UTC")),
+                pa.field("col_bytes", pa.binary()),
+                pa.field("col_array", pa.list_(pa.string())),
+                pa.field("col_binary", pa.binary(16)),
+            ]
+        ),
+        primary_keys=["pk_int"],
+    )
+    table_descriptor = fluss.TableDescriptor(schema)
+    await admin.create_table(table_path, table_descriptor, ignore_if_exists=False)
+
+    table = await connection.get_table(table_path)
+    upsert_writer = table.new_upsert().create_writer()
+
+    # Test data for all types
+    row_data = {
+        "pk_int": 1,
+        "col_boolean": True,
+        "col_tinyint": 127,
+        "col_smallint": 32767,
+        "col_int": 2147483647,
+        "col_bigint": 9223372036854775807,
+        "col_float": 3.14,
+        "col_double": 2.718281828459045,
+        "col_string": "world of fluss python client",
+        "col_decimal": Decimal("123.45"),
+        "col_date": date(2026, 1, 23),
+        "col_time": dt_time(10, 13, 47, 123000),  # millisecond precision
+        "col_timestamp_ntz": datetime(2026, 1, 23, 10, 13, 47, 123000),
+        "col_timestamp_ltz": datetime(2026, 1, 23, 10, 13, 47, 123000),
+        "col_bytes": b"binary data",
+        "col_array": ["fluss", "python"],
+        "col_binary": b"binary_data_0123",
+    }
+
+    await _upsert_and_wait(upsert_writer, row_data)
+
+    lookuper = table.new_lookup().create_lookuper()
+    result = await lookuper.lookup({"pk_int": 1})
+    assert result is not None, "Row should exist"
+
+    assert result["pk_int"] == 1
+    assert result["col_boolean"] is True
+    assert result["col_tinyint"] == 127
+    assert result["col_smallint"] == 32767
+    assert result["col_int"] == 2147483647
+    assert result["col_bigint"] == 9223372036854775807
+    assert math.isclose(result["col_float"], 3.14, rel_tol=1e-6)
+    assert math.isclose(result["col_double"], 2.718281828459045, rel_tol=1e-15)
+    assert result["col_string"] == "world of fluss python client"
+    assert result["col_decimal"] == Decimal("123.45")
+    assert result["col_date"] == date(2026, 1, 23)
+    assert result["col_time"] == dt_time(10, 13, 47, 123000)
+    assert result["col_timestamp_ntz"] == datetime(2026, 1, 23, 10, 13, 47, 123000)
+    assert result["col_timestamp_ltz"] == datetime(
+        2026, 1, 23, 10, 13, 47, 123000, tzinfo=timezone.utc
+    )
+    assert result["col_bytes"] == b"binary data"
+    assert result["col_array"] == ["fluss", "python"]
+    assert result["col_binary"] == b"binary_data_0123"
+
+    # Test with null values for all nullable columns
+    null_row = {"pk_int": 2}
+    for col in row_data:
+        if col != "pk_int":
+            null_row[col] = None
+    await _upsert_and_wait(upsert_writer, null_row)
+
+    result = await lookuper.lookup({"pk_int": 2})
+    assert result is not None, "Row with nulls should exist"
+    assert result["pk_int"] == 2
+    for col in row_data:
+        if col != "pk_int":
+            assert result[col] is None, f"{col} should be null"
+
+    await admin.drop_table(table_path, ignore_if_not_exists=False)
+
+
+async def test_prefix_lookup_validation_errors(connection, admin):
+    """Test that prefix lookup raises errors for invalid column configurations."""
+    table_path = fluss.TablePath("fluss", "py_test_prefix_lookup_validation")
+    await admin.drop_table(table_path, ignore_if_not_exists=True)
+
+    schema = fluss.Schema(
+        pa.schema(
+            [
+                pa.field("a", pa.int32()),
+                pa.field("b", pa.string()),
+                pa.field("c", pa.int64()),
+            ]
+        ),
+        primary_keys=["a", "b", "c"],
+    )
+    table_descriptor = fluss.TableDescriptor(
+        schema, bucket_count=3, bucket_keys=["a", "b"]
+    )
+    await admin.create_table(table_path, table_descriptor, ignore_if_exists=False)
+
+    table = await connection.get_table(table_path)
+
+    # lookup_by with columns equal to full PK should error
+    with pytest.raises(fluss.FlussError, match="prefix lookup"):
+        table.new_lookup().lookup_by(["a", "b", "c"]).create_lookuper()
+
+    # lookup_by with wrong column names should error
+    with pytest.raises(fluss.FlussError, match="bucket keys"):
+        table.new_lookup().lookup_by(["a", "c"]).create_lookuper()
+
+    # lookup_by with unknown column should error
+    with pytest.raises(fluss.FlussError, match="Unknown column name"):
+        table.new_lookup().lookup_by(["a", "missing_col"]).create_lookuper()
+
+    await admin.drop_table(table_path, ignore_if_not_exists=False)
+
+    # Partitioned table: lookup columns must include partition keys first,
+    # followed by bucket keys.
+    partitioned_table_path = fluss.TablePath("fluss", "py_test_prefix_lookup_validation_pt")
+    await admin.drop_table(partitioned_table_path, ignore_if_not_exists=True)
+
+    partitioned_schema = fluss.Schema(
+        pa.schema(
+            [
+                pa.field("region", pa.string()),
+                pa.field("user_id", pa.int32()),
+                pa.field("event_id", pa.int64()),
+            ]
+        ),
+        primary_keys=["region", "user_id", "event_id"],
+    )
+    partitioned_table_descriptor = fluss.TableDescriptor(
+        partitioned_schema,
+        partition_keys=["region"],
+        bucket_count=3,
+        bucket_keys=["user_id"],
+    )
+    await admin.create_table(
+        partitioned_table_path, partitioned_table_descriptor, ignore_if_exists=False
+    )
+
+    partitioned_table = await connection.get_table(partitioned_table_path)
+
+    # Missing partition key in lookup columns.
+    with pytest.raises(fluss.FlussError, match="partition fields"):
+        partitioned_table.new_lookup().lookup_by(["user_id"]).create_lookuper()
+
+    # A non-existent partition returns empty list.
+    partitioned_prefix_lookuper = (
+        partitioned_table.new_lookup().lookup_by(["region", "user_id"]).create_lookuper()
+    )
+    rows = await partitioned_prefix_lookuper.lookup({"region": "UNKNOWN_REGION", "user_id": 1})
+    assert rows == []
+
+    # After partition keys, remaining columns must equal bucket keys.
+    with pytest.raises(fluss.FlussError, match="bucket keys"):
+        partitioned_table.new_lookup().lookup_by(["region", "event_id"]).create_lookuper()
+
+    await admin.drop_table(partitioned_table_path, ignore_if_not_exists=False)
diff --git a/fluss-rust/bindings/python/test/test_log_table.py b/fluss-rust/bindings/python/test/test_log_table.py
new file mode 100644
index 0000000000..50b9078bcb
--- /dev/null
+++ b/fluss-rust/bindings/python/test/test_log_table.py
@@ -0,0 +1,1452 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""Integration tests for log (append-only) table operations.
+
+Mirrors the Rust integration tests in crates/fluss/tests/integration/log_table.rs.
+"""
+
+import asyncio
+import time
+
+import pyarrow as pa
+import pytest
+
+import fluss
+
+
+async def test_append_and_scan(connection, admin):
+    """Test appending record batches and scanning with a record-based scanner."""
+    table_path = fluss.TablePath("fluss", "py_test_append_and_scan")
+    await admin.drop_table(table_path, ignore_if_not_exists=True)
+
+    schema = fluss.Schema(
+        pa.schema([pa.field("c1", pa.int32()), pa.field("c2", pa.string())])
+    )
+    table_descriptor = fluss.TableDescriptor(
+        schema, bucket_count=3, bucket_keys=["c1"]
+    )
+    await admin.create_table(table_path, table_descriptor, ignore_if_exists=False)
+
+    table = await connection.get_table(table_path)
+    append_writer = table.new_append().create_writer()
+
+    batch1 = pa.RecordBatch.from_arrays(
+        [pa.array([1, 2, 3], type=pa.int32()), pa.array(["a1", "a2", "a3"])],
+        schema=pa.schema([pa.field("c1", pa.int32()), pa.field("c2", pa.string())]),
+    )
+    append_writer.write_arrow_batch(batch1)
+
+    batch2 = pa.RecordBatch.from_arrays(
+        [pa.array([4, 5, 6], type=pa.int32()), pa.array(["a4", "a5", "a6"])],
+        schema=pa.schema([pa.field("c1", pa.int32()), pa.field("c2", pa.string())]),
+    )
+    append_writer.write_arrow_batch(batch2)
+
+    await append_writer.flush()
+
+    # Scan with record-based scanner
+    scanner = await table.new_scan().create_log_scanner()
+    num_buckets = (await admin.get_table_info(table_path)).num_buckets
+    scanner.subscribe_buckets({i: fluss.EARLIEST_OFFSET for i in range(num_buckets)})
+
+    records = await _poll_records(scanner, expected_count=6)
+
+    assert len(records) == 6, f"Expected 6 records, got {len(records)}"
+
+    records.sort(key=lambda r: r.row["c1"])
+
+    expected_c1 = [1, 2, 3, 4, 5, 6]
+    expected_c2 = ["a1", "a2", "a3", "a4", "a5", "a6"]
+    for i, record in enumerate(records):
+        assert record.row["c1"] == expected_c1[i], f"c1 mismatch at row {i}"
+        assert record.row["c2"] == expected_c2[i], f"c2 mismatch at row {i}"
+
+    # Test unsubscribe
+    scanner.unsubscribe(bucket_id=0)
+
+    await admin.drop_table(table_path, ignore_if_not_exists=False)
+
+
+async def test_append_dict_rows(connection, admin):
+    """Test appending rows as dicts and scanning."""
+    table_path = fluss.TablePath("fluss", "py_test_append_dict_rows")
+    await admin.drop_table(table_path, ignore_if_not_exists=True)
+
+    schema = fluss.Schema(
+        pa.schema([pa.field("id", pa.int32()), pa.field("name", pa.string())])
+    )
+    table_descriptor = fluss.TableDescriptor(schema)
+    await admin.create_table(table_path, table_descriptor, ignore_if_exists=False)
+
+    table = await connection.get_table(table_path)
+    append_writer = table.new_append().create_writer()
+
+    # Append using dicts
+    append_writer.append({"id": 1, "name": "Alice"})
+    append_writer.append({"id": 2, "name": "Bob"})
+    # Append using lists
+    append_writer.append([3, "Charlie"])
+    await append_writer.flush()
+
+    scanner = await table.new_scan().create_log_scanner()
+    num_buckets = (await admin.get_table_info(table_path)).num_buckets
+    scanner.subscribe_buckets({i: fluss.EARLIEST_OFFSET for i in range(num_buckets)})
+
+    records = await _poll_records(scanner, expected_count=3)
+    assert len(records) == 3
+
+    rows = sorted([r.row for r in records], key=lambda r: r["id"])
+    assert rows[0] == {"id": 1, "name": "Alice"}
+    assert rows[1] == {"id": 2, "name": "Bob"}
+    assert rows[2] == {"id": 3, "name": "Charlie"}
+
+    await admin.drop_table(table_path, ignore_if_not_exists=False)
+
+
+async def test_list_offsets(connection, admin, wait_for_table_ready):
+    """Test listing earliest, latest, and timestamp-based offsets."""
+    table_path = fluss.TablePath("fluss", "py_test_list_offsets")
+    await admin.drop_table(table_path, ignore_if_not_exists=True)
+
+    schema = fluss.Schema(
+        pa.schema([pa.field("id", pa.int32()), pa.field("name", pa.string())])
+    )
+    table_descriptor = fluss.TableDescriptor(schema)
+    await admin.create_table(table_path, table_descriptor, ignore_if_exists=False)
+
+    await wait_for_table_ready(table_path)
+
+    # Earliest offset should be 0 for empty table
+    earliest = await admin.list_offsets(
+        table_path, bucket_ids=[0], offset_spec=fluss.OffsetSpec.earliest()
+    )
+    assert earliest[0] == 0
+
+    # Latest offset should be 0 for empty table
+    latest = await admin.list_offsets(
+        table_path, bucket_ids=[0], offset_spec=fluss.OffsetSpec.latest()
+    )
+    assert latest[0] == 0
+
+    before_append_ms = int(time.time() * 1000)
+
+    # Append some records
+    table = await connection.get_table(table_path)
+    append_writer = table.new_append().create_writer()
+    batch = pa.RecordBatch.from_arrays(
+        [
+            pa.array([1, 2, 3], type=pa.int32()),
+            pa.array(["alice", "bob", "charlie"]),
+        ],
+        schema=pa.schema([pa.field("id", pa.int32()), pa.field("name", pa.string())]),
+    )
+    append_writer.write_arrow_batch(batch)
+    await append_writer.flush()
+
+    await asyncio.sleep(1)
+
+    after_append_ms = int(time.time() * 1000)
+
+    # Latest offset should be 3 after appending 3 records
+    latest_after = await admin.list_offsets(
+        table_path, bucket_ids=[0], offset_spec=fluss.OffsetSpec.latest()
+    )
+    assert latest_after[0] == 3
+
+    # Earliest offset should still be 0
+    earliest_after = await admin.list_offsets(
+        table_path, bucket_ids=[0], offset_spec=fluss.OffsetSpec.earliest()
+    )
+    assert earliest_after[0] == 0
+
+    # Timestamp before append should resolve to offset 0
+    ts_before = await admin.list_offsets(
+        table_path,
+        bucket_ids=[0],
+        offset_spec=fluss.OffsetSpec.timestamp(before_append_ms),
+    )
+    assert ts_before[0] == 0
+
+    # Intentional sleep to avoid race condition FlussError(code=38) The timestamp is invalid
+    await asyncio.sleep(1)
+
+    # Timestamp after append should resolve to offset 3
+    ts_after = await admin.list_offsets(
+        table_path,
+        bucket_ids=[0],
+        offset_spec=fluss.OffsetSpec.timestamp(after_append_ms),
+    )
+    assert ts_after[0] == 3
+
+    await admin.drop_table(table_path, ignore_if_not_exists=False)
+
+
+async def test_project(connection, admin):
+    """Test column projection by name and by index."""
+    table_path = fluss.TablePath("fluss", "py_test_project")
+    await admin.drop_table(table_path, ignore_if_not_exists=True)
+
+    schema = fluss.Schema(
+        pa.schema(
+            [
+                pa.field("col_a", pa.int32()),
+                pa.field("col_b", pa.string()),
+                pa.field("col_c", pa.int32()),
+            ]
+        )
+    )
+    table_descriptor = fluss.TableDescriptor(schema)
+    await admin.create_table(table_path, table_descriptor, ignore_if_exists=False)
+
+    table = await connection.get_table(table_path)
+    append_writer = table.new_append().create_writer()
+
+    batch = pa.RecordBatch.from_arrays(
+        [
+            pa.array([1, 2, 3], type=pa.int32()),
+            pa.array(["x", "y", "z"]),
+            pa.array([10, 20, 30], type=pa.int32()),
+        ],
+        schema=pa.schema(
+            [
+                pa.field("col_a", pa.int32()),
+                pa.field("col_b", pa.string()),
+                pa.field("col_c", pa.int32()),
+            ]
+        ),
+    )
+    append_writer.write_arrow_batch(batch)
+    await append_writer.flush()
+
+    # Test project_by_name: select col_b and col_c only
+    scan = table.new_scan().project_by_name(["col_b", "col_c"])
+    scanner = await scan.create_log_scanner()
+    scanner.subscribe_buckets({0: 0})
+
+    records = await _poll_records(scanner, expected_count=3)
+    assert len(records) == 3
+
+    records.sort(key=lambda r: r.row["col_c"])
+    expected_col_b = ["x", "y", "z"]
+    expected_col_c = [10, 20, 30]
+    for i, record in enumerate(records):
+        assert record.row["col_b"] == expected_col_b[i]
+        assert record.row["col_c"] == expected_col_c[i]
+        # col_a should not be present in projected results
+        assert "col_a" not in record.row
+
+    # Test project by indices [1, 0] -> (col_b, col_a)
+    scanner2 = await table.new_scan().project([1, 0]).create_log_scanner()
+    scanner2.subscribe_buckets({0: 0})
+
+    records2 = await _poll_records(scanner2, expected_count=3)
+    assert len(records2) == 3
+
+    records2.sort(key=lambda r: r.row["col_a"])
+    for i, record in enumerate(records2):
+        assert record.row["col_b"] == expected_col_b[i]
+        assert record.row["col_a"] == [1, 2, 3][i]
+        assert "col_c" not in record.row
+
+    await admin.drop_table(table_path, ignore_if_not_exists=False)
+
+
+async def test_poll_batches(connection, admin, wait_for_table_ready):
+    """Test batch-based scanning with poll_arrow and poll_record_batch."""
+    table_path = fluss.TablePath("fluss", "py_test_poll_batches")
+    await admin.drop_table(table_path, ignore_if_not_exists=True)
+
+    schema = fluss.Schema(
+        pa.schema([pa.field("id", pa.int32()), pa.field("name", pa.string())])
+    )
+    table_descriptor = fluss.TableDescriptor(schema)
+    await admin.create_table(table_path, table_descriptor, ignore_if_exists=False)
+
+    await wait_for_table_ready(table_path)
+
+    table = await connection.get_table(table_path)
+    scanner = await table.new_scan().create_record_batch_log_scanner()
+    scanner.subscribe(bucket_id=0, start_offset=0)  # only bucket 0 is read here
+
+    # Nothing has been written yet, so this poll must return zero rows.
+    result = await scanner.poll_arrow(500)  # presumably a timeout in ms - confirm
+    assert result.num_rows == 0
+
+    writer = table.new_append().create_writer()
+    pa_schema = pa.schema([pa.field("id", pa.int32()), pa.field("name", pa.string())])
+    writer.write_arrow_batch(
+        pa.RecordBatch.from_arrays(
+            [pa.array([1, 2], type=pa.int32()), pa.array(["a", "b"])],
+            schema=pa_schema,
+        )
+    )
+    writer.write_arrow_batch(
+        pa.RecordBatch.from_arrays(
+            [pa.array([3, 4], type=pa.int32()), pa.array(["c", "d"])],
+            schema=pa_schema,
+        )
+    )
+    writer.write_arrow_batch(
+        pa.RecordBatch.from_arrays(
+            [pa.array([5, 6], type=pa.int32()), pa.array(["e", "f"])],
+            schema=pa_schema,
+        )
+    )
+    await writer.flush()  # the three batches above are flushed together
+
+    # Collect ids until all 6 rows have arrived (helper comes from module scope).
+    all_ids = await _poll_arrow_ids(scanner, expected_count=6)
+    assert all_ids == [1, 2, 3, 4, 5, 6]
+
+    # Append more and poll the same scanner: only the new ids may come back,
+    # i.e. it continues from where it stopped (no duplicates).
+    writer.write_arrow_batch(
+        pa.RecordBatch.from_arrays(
+            [pa.array([7, 8], type=pa.int32()), pa.array(["g", "h"])],
+            schema=pa_schema,
+        )
+    )
+    await writer.flush()
+
+    new_ids = await _poll_arrow_ids(scanner, expected_count=2)
+    assert new_ids == [7, 8]
+
+    # A fresh scanner starting at offset 3 must skip the rows at offsets 0-2.
+    trunc_scanner = await table.new_scan().create_record_batch_log_scanner()
+    trunc_scanner.subscribe(bucket_id=0, start_offset=3)
+
+    trunc_ids = await _poll_arrow_ids(trunc_scanner, expected_count=5)
+    assert trunc_ids == [4, 5, 6, 7, 8]
+
+    # Projection with the batch scanner: only the "id" column may be present.
+    proj_scanner = (
+        await table.new_scan()
+        .project_by_name(["id"])
+        .create_record_batch_log_scanner()
+    )
+    proj_scanner.subscribe(bucket_id=0, start_offset=0)
+    batches = await proj_scanner.poll_record_batch(10000)  # a single poll, no retry
+    assert len(batches) > 0
+    assert batches[0].batch.num_columns == 1
+
+    await admin.drop_table(table_path, ignore_if_not_exists=False)
+
+
+async def test_to_arrow_and_to_pandas(connection, admin):
+    """Test to_arrow() and to_pandas() convenience methods."""
+    table_path = fluss.TablePath("fluss", "py_test_to_arrow_pandas")
+    await admin.drop_table(table_path, ignore_if_not_exists=True)
+
+    schema = fluss.Schema(
+        pa.schema([pa.field("id", pa.int32()), pa.field("name", pa.string())])
+    )
+    table_descriptor = fluss.TableDescriptor(schema)
+    await admin.create_table(table_path, table_descriptor, ignore_if_exists=False)
+
+    table = await connection.get_table(table_path)
+    writer = table.new_append().create_writer()
+
+    pa_schema = pa.schema([pa.field("id", pa.int32()), pa.field("name", pa.string())])
+    writer.write_arrow_batch(
+        pa.RecordBatch.from_arrays(
+            [pa.array([1, 2, 3], type=pa.int32()), pa.array(["a", "b", "c"])],
+            schema=pa_schema,
+        )
+    )
+    await writer.flush()
+
+    num_buckets = (await admin.get_table_info(table_path)).num_buckets
+
+    # to_arrow()
+    scanner = await table.new_scan().create_record_batch_log_scanner()
+    scanner.subscribe_buckets({i: fluss.EARLIEST_OFFSET for i in range(num_buckets)})
+    arrow_table = await scanner.to_arrow()
+    assert arrow_table.num_rows == 3
+    assert arrow_table.schema.names == ["id", "name"]
+
+    # to_pandas()
+    scanner2 = await table.new_scan().create_record_batch_log_scanner()
+    scanner2.subscribe_buckets({i: fluss.EARLIEST_OFFSET for i in range(num_buckets)})
+    df = await scanner2.to_pandas()
+    assert len(df) == 3
+    assert list(df.columns) == ["id", "name"]
+
+    await admin.drop_table(table_path, ignore_if_not_exists=False)
+
+
+async def test_to_arrow_batch_reader(connection, admin):
+    """Test to_arrow_batch_reader() returns a lazy PyArrow RecordBatchReader."""
+    table_path = fluss.TablePath("fluss", "py_test_to_arrow_batch_reader")
+    await admin.drop_table(table_path, ignore_if_not_exists=True)
+
+    schema = fluss.Schema(
+        pa.schema([pa.field("id", pa.int32()), pa.field("name", pa.string())])
+    )
+    table_descriptor = fluss.TableDescriptor(schema)
+    await admin.create_table(table_path, table_descriptor, ignore_if_exists=False)
+
+    table = await connection.get_table(table_path)
+    writer = table.new_append().create_writer()
+
+    pa_schema = pa.schema([pa.field("id", pa.int32()), pa.field("name", pa.string())])
+    writer.write_arrow_batch(
+        pa.RecordBatch.from_arrays(
+            [pa.array([10, 20, 30], type=pa.int32()), pa.array(["x", "y", "z"])],
+            schema=pa_schema,
+        )
+    )
+    await writer.flush()
+
+    num_buckets = (await admin.get_table_info(table_path)).num_buckets
+
+    scanner = await table.new_scan().create_record_batch_log_scanner()
+    scanner.subscribe_buckets({i: fluss.EARLIEST_OFFSET for i in range(num_buckets)})
+
+    # to_arrow_batch_reader() is a blocking/sync API; run in a thread to
+    # avoid starving the asyncio event loop (see docstring warning).
+    def _read_all():
+        reader = scanner.to_arrow_batch_reader()
+        assert isinstance(reader, pa.RecordBatchReader)
+        assert reader.schema == pa_schema
+
+        batches = list(reader)
+        total_rows = sum(b.num_rows for b in batches)
+        assert total_rows == 3
+
+        result_table = pa.Table.from_batches(batches, schema=pa_schema)
+        assert result_table.column("id").to_pylist() == [10, 20, 30]
+        assert result_table.column("name").to_pylist() == ["x", "y", "z"]
+
+    await asyncio.to_thread(_read_all)
+
+    await admin.drop_table(table_path, ignore_if_not_exists=False)
+
+
+async def test_to_arrow_batch_reader_drop_and_guard(connection, admin):
+    """Test reader-active guard and Drop cleanup on mid-iteration drop."""
+    table_path = fluss.TablePath("fluss", "py_test_batch_reader_drop_guard")
+    await admin.drop_table(table_path, ignore_if_not_exists=True)
+
+    schema = fluss.Schema(
+        pa.schema([pa.field("id", pa.int32()), pa.field("name", pa.string())])
+    )
+    table_descriptor = fluss.TableDescriptor(schema)
+    await admin.create_table(table_path, table_descriptor, ignore_if_exists=False)
+
+    table = await connection.get_table(table_path)
+    writer = table.new_append().create_writer()
+
+    pa_schema = pa.schema([pa.field("id", pa.int32()), pa.field("name", pa.string())])
+    # Flush after every batch so the server stores several log batches per
+    # bucket. This makes it likely that the reader's first poll drains only
+    # a subset, leaving real work for the Drop cleanup loop.
+    num_flushes = 10
+    rows_per_flush = 200
+    total_rows = num_flushes * rows_per_flush  # 2000 rows overall
+    for f in range(num_flushes):
+        start = f * rows_per_flush  # first id of this batch; ids are contiguous
+        writer.write_arrow_batch(
+            pa.RecordBatch.from_arrays(
+                [
+                    pa.array(
+                        list(range(start, start + rows_per_flush)), type=pa.int32()
+                    ),
+                    pa.array(
+                        [f"row_{i}" for i in range(start, start + rows_per_flush)]
+                    ),
+                ],
+                schema=pa_schema,
+            )
+        )
+        await writer.flush()  # inside the loop: one flush per batch
+
+    num_buckets = (await admin.get_table_info(table_path)).num_buckets
+
+    scanner = await table.new_scan().create_record_batch_log_scanner()
+    scanner.subscribe_buckets({i: fluss.EARLIEST_OFFSET for i in range(num_buckets)})
+
+    # to_arrow_batch_reader() is a blocking/sync API; every blocking call
+    # below runs in a worker thread so the asyncio event loop is not starved.
+    def _test_guard_and_drop():
+        # --- Guard blocks subscribe / unsubscribe while reader is active ---
+        reader = scanner.to_arrow_batch_reader()
+        with pytest.raises(fluss.FlussError, match="RecordBatchLogReader is active"):
+            scanner.subscribe_buckets({0: fluss.EARLIEST_OFFSET})
+        with pytest.raises(fluss.FlussError, match="RecordBatchLogReader is active"):
+            scanner.unsubscribe(0)
+
+        # --- Drop mid-iteration: read one batch, then discard ---
+        first_batch = next(reader)
+        assert first_batch.num_rows > 0
+        del reader  # releases this function's reference to the reader
+
+        # --- Drop unsubscribed leftover buckets: creating a reader without
+        #     re-subscribing must fail with "No buckets subscribed" ---
+        with pytest.raises(fluss.FlussError, match="No buckets subscribed"):
+            scanner.to_arrow_batch_reader()
+
+        # --- Guard cleared after drop: scanner is reusable from a fresh subscribe ---
+        scanner.subscribe_buckets(
+            {i: fluss.EARLIEST_OFFSET for i in range(num_buckets)}
+        )
+        reader2 = scanner.to_arrow_batch_reader()
+        batches = list(reader2)
+        assert sum(b.num_rows for b in batches) == total_rows  # full re-read
+
+    await asyncio.to_thread(_test_guard_and_drop)
+
+    await admin.drop_table(table_path, ignore_if_not_exists=False)
+
+
+async def test_partitioned_table_append_scan(connection, admin, wait_for_table_ready):
+    """Round-trip rows through a log table partitioned by ``region``.
+
+    Exercises row-wise and Arrow-batch appends, per-partition offset
+    listing, per-partition subscribe / unsubscribe, and the batch
+    ``subscribe_partition_buckets`` entry point.
+    """
+    path = fluss.TablePath("fluss", "py_test_partitioned_log_append")
+    await admin.drop_table(path, ignore_if_not_exists=True)
+
+    # One Arrow schema serves both the table definition and the batches below.
+    arrow_schema = pa.schema(
+        [
+            pa.field("id", pa.int32()),
+            pa.field("region", pa.string()),
+            pa.field("value", pa.int64()),
+        ]
+    )
+    descriptor = fluss.TableDescriptor(
+        fluss.Schema(arrow_schema),
+        partition_keys=["region"],
+    )
+    await admin.create_table(path, descriptor, ignore_if_exists=False)
+
+    # Create each partition and wait until the fixture reports it ready.
+    for region_name in ("US", "EU"):
+        await admin.create_partition(
+            path, {"region": region_name}, ignore_if_exists=True
+        )
+        await wait_for_table_ready(path, partition_name=region_name)
+
+    table = await connection.get_table(path)
+    writer = table.new_append().create_writer()
+
+    # First flush: four rows appended one dict at a time.
+    row_tuples = [
+        (1, "US", 100),
+        (2, "US", 200),
+        (3, "EU", 300),
+        (4, "EU", 400),
+    ]
+    for row_id, region_name, amount in row_tuples:
+        writer.append({"id": row_id, "region": region_name, "value": amount})
+    await writer.flush()
+
+    # Second flush: one Arrow batch per partition (US first, then EU).
+    batch_columns = [
+        ([5, 6], ["US", "US"], [500, 600]),
+        ([7, 8], ["EU", "EU"], [700, 800]),
+    ]
+    for ids, regions, values in batch_columns:
+        writer.write_arrow_batch(
+            pa.RecordBatch.from_arrays(
+                [
+                    pa.array(ids, type=pa.int32()),
+                    pa.array(regions),
+                    pa.array(values, type=pa.int64()),
+                ],
+                schema=arrow_schema,
+            )
+        )
+    await writer.flush()
+
+    # Each partition should now report a latest offset of 4 on bucket 0.
+    for region_name, message in (
+        ("US", "US partition should have 4 records"),
+        ("EU", "EU partition should have 4 records"),
+    ):
+        latest = await admin.list_partition_offsets(
+            path,
+            partition_name=region_name,
+            bucket_ids=[0],
+            offset_spec=fluss.OffsetSpec.latest(),
+        )
+        assert latest[0] == 4, message
+
+    # Subscribe one scanner to bucket 0 of every partition, from offset 0.
+    scanner = await table.new_scan().create_log_scanner()
+    partition_infos = await admin.list_partition_infos(path)
+    for info in partition_infos:
+        scanner.subscribe_partition(
+            partition_id=info.partition_id, bucket_id=0, start_offset=0
+        )
+
+    expected = [
+        (1, "US", 100),
+        (2, "US", 200),
+        (3, "EU", 300),
+        (4, "EU", 400),
+        (5, "US", 500),
+        (6, "US", 600),
+        (7, "EU", 700),
+        (8, "EU", 800),
+    ]
+
+    def as_sorted_tuples(records):
+        """Flatten records to (id, region, value) tuples ordered by id."""
+        rows = [
+            (rec.row["id"], rec.row["region"], rec.row["value"]) for rec in records
+        ]
+        rows.sort(key=lambda item: item[0])
+        return rows
+
+    # Drain the scanner bucket by bucket (bounded to ~10 s), checking that a
+    # single bucket never mixes rows from different partitions.
+    seen = []
+    stop_at = time.monotonic() + 10
+    while len(seen) < 8 and time.monotonic() < stop_at:
+        polled = await scanner.poll(5000)
+        for bucket, bucket_records in polled.items():
+            assert bucket.partition_id is not None, "Partitioned table should have partition_id"
+            regions_in_bucket = {rec.row["region"] for rec in bucket_records}
+            assert len(regions_in_bucket) == 1, f"Bucket has mixed regions: {regions_in_bucket}"
+            seen.extend(bucket_records)
+
+    assert len(seen) == 8
+    assert as_sorted_tuples(seen) == expected
+
+    # unsubscribe_partition: subscribe to everything, drop EU, expect US rows only.
+    unsub_scanner = await table.new_scan().create_log_scanner()
+    eu_partition_id = next(
+        info.partition_id for info in partition_infos if info.partition_name == "EU"
+    )
+    for info in partition_infos:
+        unsub_scanner.subscribe_partition(info.partition_id, 0, 0)
+    unsub_scanner.unsubscribe_partition(eu_partition_id, 0)
+
+    us_only = await _poll_records(unsub_scanner, expected_count=4, timeout_s=5)
+    assert len(us_only) == 4
+    assert all(rec.row["region"] == "US" for rec in us_only)
+
+    # subscribe_partition_buckets: one call covering every (partition, bucket) pair.
+    batch_scanner = await table.new_scan().create_log_scanner()
+    batch_scanner.subscribe_partition_buckets(
+        {(info.partition_id, 0): fluss.EARLIEST_OFFSET for info in partition_infos}
+    )
+
+    batch_records = await _poll_records(batch_scanner, expected_count=8)
+    assert len(batch_records) == 8
+    assert as_sorted_tuples(batch_records) == expected
+
+    await admin.drop_table(path, ignore_if_not_exists=False)
+
+
+async def test_write_arrow(connection, admin):
+    """Write a whole PyArrow Table with ``write_arrow()`` and read it back with ``to_arrow()``."""
+    path = fluss.TablePath("fluss", "py_test_write_arrow")
+    await admin.drop_table(path, ignore_if_not_exists=True)
+
+    arrow_schema = pa.schema(
+        [pa.field("id", pa.int32()), pa.field("name", pa.string())]
+    )
+    await admin.create_table(
+        path,
+        fluss.TableDescriptor(fluss.Schema(arrow_schema)),
+        ignore_if_exists=False,
+    )
+
+    table = await connection.get_table(path)
+    writer = table.new_append().create_writer()
+    writer.write_arrow(
+        pa.table(
+            {
+                "id": pa.array([1, 2, 3, 4, 5], type=pa.int32()),
+                "name": pa.array(["alice", "bob", "charlie", "dave", "eve"]),
+            },
+            schema=arrow_schema,
+        )
+    )
+    await writer.flush()
+
+    # Subscribe to every bucket of the table from the earliest offset.
+    table_info = await admin.get_table_info(path)
+    scanner = await table.new_scan().create_record_batch_log_scanner()
+    scanner.subscribe_buckets(
+        {bucket: fluss.EARLIEST_OFFSET for bucket in range(table_info.num_buckets)}
+    )
+
+    result = await scanner.to_arrow()
+    assert result.num_rows == 5
+
+    # Sort (id, name) pairs by id so the check does not depend on arrival order.
+    pairs = sorted(
+        zip(result.column("id").to_pylist(), result.column("name").to_pylist())
+    )
+    assert [row_id for row_id, _ in pairs] == [1, 2, 3, 4, 5]
+    assert [name for _, name in pairs] == ["alice", "bob", "charlie", "dave", "eve"]
+
+    await admin.drop_table(path, ignore_if_not_exists=False)
+
+
+async def test_write_pandas(connection, admin):
+    """Write a Pandas DataFrame with ``write_pandas()`` and read it back with ``to_pandas()``."""
+    import pandas as pd
+
+    path = fluss.TablePath("fluss", "py_test_write_pandas")
+    await admin.drop_table(path, ignore_if_not_exists=True)
+
+    descriptor = fluss.TableDescriptor(
+        fluss.Schema(
+            pa.schema([pa.field("id", pa.int32()), pa.field("name", pa.string())])
+        )
+    )
+    await admin.create_table(path, descriptor, ignore_if_exists=False)
+
+    table = await connection.get_table(path)
+    writer = table.new_append().create_writer()
+    writer.write_pandas(pd.DataFrame({"id": [10, 20, 30], "name": ["x", "y", "z"]}))
+    await writer.flush()
+
+    # Subscribe to every bucket of the table from the earliest offset.
+    table_info = await admin.get_table_info(path)
+    scanner = await table.new_scan().create_record_batch_log_scanner()
+    scanner.subscribe_buckets(
+        dict.fromkeys(range(table_info.num_buckets), fluss.EARLIEST_OFFSET)
+    )
+
+    frame = await scanner.to_pandas()
+    assert len(frame) == 3
+
+    # Order by id before comparing so the check does not depend on row order.
+    ordered = frame.sort_values("id").reset_index(drop=True)
+    assert ordered["id"].tolist() == [10, 20, 30]
+    assert ordered["name"].tolist() == ["x", "y", "z"]
+
+    await admin.drop_table(path, ignore_if_not_exists=False)
+
+
+async def test_partitioned_table_to_arrow(connection, admin, wait_for_table_ready):
+    """Check that ``to_arrow()`` collects rows written to two different partitions."""
+    path = fluss.TablePath("fluss", "py_test_partitioned_to_arrow")
+    await admin.drop_table(path, ignore_if_not_exists=True)
+
+    columns = [
+        pa.field("id", pa.int32()),
+        pa.field("region", pa.string()),
+        pa.field("value", pa.int64()),
+    ]
+    descriptor = fluss.TableDescriptor(
+        fluss.Schema(pa.schema(columns)), partition_keys=["region"]
+    )
+    await admin.create_table(path, descriptor, ignore_if_exists=False)
+
+    # Create both partitions and wait for each one before writing.
+    for region_name in ("US", "EU"):
+        await admin.create_partition(
+            path, {"region": region_name}, ignore_if_exists=True
+        )
+        await wait_for_table_ready(path, partition_name=region_name)
+
+    table = await connection.get_table(path)
+    writer = table.new_append().create_writer()
+    for row in (
+        {"id": 1, "region": "US", "value": 100},
+        {"id": 2, "region": "EU", "value": 200},
+    ):
+        writer.append(row)
+    await writer.flush()
+
+    # Subscribe to bucket 0 of every partition from the earliest offset.
+    scanner = await table.new_scan().create_record_batch_log_scanner()
+    for info in await admin.list_partition_infos(path):
+        scanner.subscribe_partition(info.partition_id, 0, fluss.EARLIEST_OFFSET)
+
+    combined = await scanner.to_arrow()
+    # One row was written per partition, so both must appear in the result.
+    assert combined.num_rows == 2
+
+    await admin.drop_table(path, ignore_if_not_exists=False)
+
+
+async def test_scan_records_indexing_and_slicing(connection, admin):
+    """Compare ScanRecords indexing, slicing and iteration against a plain list of offsets."""
+    path = fluss.TablePath("fluss", "py_test_scan_records_indexing")
+    await admin.drop_table(path, ignore_if_not_exists=True)
+
+    arrow_schema = pa.schema(
+        [pa.field("id", pa.int32()), pa.field("val", pa.string())]
+    )
+    await admin.create_table(path, fluss.TableDescriptor(fluss.Schema(arrow_schema)))
+
+    table = await connection.get_table(path)
+    writer = table.new_append().create_writer()
+    id_values = list(range(1, 9))
+    writer.write_arrow_batch(
+        pa.RecordBatch.from_arrays(
+            [
+                pa.array(id_values, type=pa.int32()),
+                pa.array([f"v{i}" for i in id_values]),
+            ],
+            schema=arrow_schema,
+        )
+    )
+    await writer.flush()
+
+    scanner = await table.new_scan().create_log_scanner()
+    table_info = await admin.get_table_info(path)
+    scanner.subscribe_buckets(
+        {bucket: fluss.EARLIEST_OFFSET for bucket in range(table_info.num_buckets)}
+    )
+
+    # Keep polling (for up to ~10 s) until a single poll yields at least two
+    # records; the slice checks below need more than one element.
+    sr = None
+    give_up_at = time.monotonic() + 10
+    while time.monotonic() < give_up_at:
+        sr = await scanner.poll(5000)
+        if len(sr) >= 2:
+            break
+    assert sr is not None and len(sr) >= 2, "Expected at least 2 records"
+
+    count = len(sr)
+    offsets = [sr[idx].offset for idx in range(count)]
+
+    # Iterating must visit records in the same order as positional indexing.
+    assert [rec.offset for rec in sr] == offsets
+
+    # Negative indices count from the end, as with a list.
+    assert sr[-1].offset == offsets[-1]
+    assert sr[-count].offset == offsets[0]
+
+    # Each slice of the ScanRecords must match the same slice of the reference list.
+    slices_to_check = (
+        slice(1, count - 1),  # forward subrange
+        slice(None, None, -1),  # [::-1] full reverse
+        slice(count - 2, 0, -1),  # reverse with bounds
+        slice(count - 1, 0, -2),  # reverse with step
+        slice(None, None, 2),  # [::2]
+        slice(1, None, 3),  # [1::3]
+        slice(2, 2),  # empty
+    )
+    for s in slices_to_check:
+        result = [rec.offset for rec in sr[s]]
+        assert result == offsets[s], f"slice {s}: got {result}, expected {offsets[s]}"
+
+    # Indexing by bucket must give a non-empty group for every reported bucket.
+    for bucket in sr.buckets():
+        assert len(sr[bucket]) > 0
+
+    await admin.drop_table(path, ignore_if_not_exists=False)
+
+
+async def test_async_iterator(connection, admin):
+    """Test the Python asynchronous iterator loop (`async for`) on LogScanner.
+
+    Five rows are written and then consumed with ``async for`` until all
+    five have been seen. The loop runs under ``asyncio.wait_for`` so that a
+    scanner which never delivers the rows fails the test instead of hanging
+    the whole run.
+    """
+    table_path = fluss.TablePath("fluss", "py_test_async_iterator")
+    await admin.drop_table(table_path, ignore_if_not_exists=True)
+
+    schema = fluss.Schema(
+        pa.schema([pa.field("id", pa.int32()), pa.field("val", pa.string())])
+    )
+    await admin.create_table(table_path, fluss.TableDescriptor(schema))
+
+    table = await connection.get_table(table_path)
+    writer = table.new_append().create_writer()
+
+    # Write 5 records
+    writer.write_arrow_batch(
+        pa.RecordBatch.from_arrays(
+            [
+                pa.array(list(range(1, 6)), type=pa.int32()),
+                pa.array([f"async{i}" for i in range(1, 6)]),
+            ],
+            schema=pa.schema(
+                [pa.field("id", pa.int32()), pa.field("val", pa.string())]
+            ),
+        )
+    )
+    await writer.flush()
+
+    scanner = await table.new_scan().create_log_scanner()
+    num_buckets = (await admin.get_table_info(table_path)).num_buckets
+    scanner.subscribe_buckets({i: fluss.EARLIEST_OFFSET for i in range(num_buckets)})
+
+    collected = []
+
+    async def consume_scanner():
+        # Stop explicitly once everything that was written has been seen.
+        async for record in scanner:
+            collected.append(record)
+            if len(collected) == 5:
+                break
+
+    # Bound the wait, like the deadline-based poll loops used by other tests
+    # in this file; on expiry wait_for raises a timeout error.
+    await asyncio.wait_for(consume_scanner(), timeout=30)
+
+    assert len(collected) == 5, f"Expected 5 records, got {len(collected)}"
+
+    # Arrival order is not asserted; compare after sorting by id.
+    collected.sort(key=lambda r: r.row["id"])
+    for i, record in enumerate(collected):
+        assert record.row["id"] == i + 1
+        assert record.row["val"] == f"async{i + 1}"
+
+    await admin.drop_table(table_path, ignore_if_not_exists=False)
+
+
+async def test_async_iterator_break_no_leak(connection, admin):
+    """Verify that breaking out of `async for` does not leak resources.
+
+    After breaking, the scanner must still be usable for synchronous
+    `poll()` calls.  If the old implementation's tokio::spawn'd task
+    were still alive, it would hold the Mutex and cause `poll()` to
+    deadlock or error.
+
+    The `async for` phase runs under ``asyncio.wait_for`` so that missing
+    records fail the test instead of hanging it.
+    """
+    table_path = fluss.TablePath("fluss", "py_test_async_break_leak")
+    await admin.drop_table(table_path, ignore_if_not_exists=True)
+
+    schema = fluss.Schema(
+        pa.schema([pa.field("id", pa.int32()), pa.field("val", pa.string())])
+    )
+    await admin.create_table(table_path, fluss.TableDescriptor(schema))
+
+    table = await connection.get_table(table_path)
+    writer = table.new_append().create_writer()
+    writer.write_arrow_batch(
+        pa.RecordBatch.from_arrays(
+            [
+                pa.array(list(range(1, 11)), type=pa.int32()),
+                pa.array([f"v{i}" for i in range(1, 11)]),
+            ],
+            schema=pa.schema(
+                [pa.field("id", pa.int32()), pa.field("val", pa.string())]
+            ),
+        )
+    )
+    await writer.flush()
+
+    scanner = await table.new_scan().create_log_scanner()
+    num_buckets = (await admin.get_table_info(table_path)).num_buckets
+    scanner.subscribe_buckets(
+        {i: fluss.EARLIEST_OFFSET for i in range(num_buckets)}
+    )
+
+    # Phase 1: async for with early break (collect only 3 of 10)
+    collected_async = []
+
+    async def consume_and_break():
+        async for record in scanner:
+            collected_async.append(record)
+            if len(collected_async) >= 3:
+                break
+
+    # Bound the wait so a scanner that never yields cannot stall the run.
+    await asyncio.wait_for(consume_and_break(), timeout=30)
+    assert len(collected_async) == 3, (
+        f"Expected 3 records from async for, got {len(collected_async)}"
+    )
+
+    # Phase 2: sync poll() must still work — proves no leaked task / lock.
+    # With small data and few buckets, _async_poll may have fetched all
+    # records in one batch. After break, the un-yielded records from that
+    # batch are lost. So sync poll may return 0 records — the key assertion
+    # is that poll() completes without deadlock (returns within timeout).
+    remaining = await scanner.poll(2000)
+    assert remaining is not None, "poll() should return (not deadlock)"
+
+    # If we got records, verify no duplicates
+    async_ids = {r.row["id"] for r in collected_async}
+    sync_ids = {r.row["id"] for r in remaining}
+    assert async_ids.isdisjoint(sync_ids), (
+        f"Duplicate IDs between async and sync: {async_ids & sync_ids}"
+    )
+
+    # All IDs must be from the original 1-10 range
+    all_ids = async_ids | sync_ids
+    assert all_ids.issubset(set(range(1, 11))), (
+        f"Unexpected IDs: {all_ids - set(range(1, 11))}"
+    )
+
+    await admin.drop_table(table_path, ignore_if_not_exists=False)
+
+
+async def test_async_iterator_multiple_batches(connection, admin):
+    """Verify async iteration works across multiple network poll cycles.
+
+    _async_poll does a single bounded poll per call.  Writing 20 records
+    to multiple buckets ensures the Python generator must loop through
+    several _async_poll calls to collect them all.
+
+    The `async for` runs under ``asyncio.wait_for`` so that missing records
+    fail the test instead of hanging it.
+    """
+    table_path = fluss.TablePath("fluss", "py_test_async_multi_batch")
+    await admin.drop_table(table_path, ignore_if_not_exists=True)
+
+    schema = fluss.Schema(
+        pa.schema([pa.field("id", pa.int32()), pa.field("val", pa.string())])
+    )
+    table_descriptor = fluss.TableDescriptor(
+        schema, bucket_count=3, bucket_keys=["id"]
+    )
+    await admin.create_table(
+        table_path, table_descriptor, ignore_if_exists=False
+    )
+
+    table = await connection.get_table(table_path)
+    writer = table.new_append().create_writer()
+
+    num_records = 20
+    writer.write_arrow_batch(
+        pa.RecordBatch.from_arrays(
+            [
+                pa.array(list(range(1, num_records + 1)), type=pa.int32()),
+                pa.array([f"multi{i}" for i in range(1, num_records + 1)]),
+            ],
+            schema=pa.schema(
+                [pa.field("id", pa.int32()), pa.field("val", pa.string())]
+            ),
+        )
+    )
+    await writer.flush()
+
+    scanner = await table.new_scan().create_log_scanner()
+    num_buckets = (await admin.get_table_info(table_path)).num_buckets
+    scanner.subscribe_buckets(
+        {i: fluss.EARLIEST_OFFSET for i in range(num_buckets)}
+    )
+
+    collected = []
+
+    async def consume_all():
+        # Stop explicitly once every written record has been seen.
+        async for record in scanner:
+            collected.append(record)
+            if len(collected) >= num_records:
+                break
+
+    # Bound the wait so a scanner that never yields cannot stall the run.
+    await asyncio.wait_for(consume_all(), timeout=30)
+    assert len(collected) == num_records, (
+        f"Expected {num_records} records, got {len(collected)}"
+    )
+
+    # Verify all IDs are present (order may vary due to bucketing)
+    ids = sorted(r.row["id"] for r in collected)
+    assert ids == list(range(1, num_records + 1))
+
+    await admin.drop_table(table_path, ignore_if_not_exists=False)
+
+
+async def test_batch_async_iterator(connection, admin):
+    """Test the Python asynchronous iterator loop (`async for`) on a batch LogScanner.
+
+    With our __aiter__ dispatch, a batch-based scanner should yield RecordBatch
+    objects (not ScanRecord). Each yielded item has .batch (PyArrow RecordBatch),
+    .bucket, .base_offset, .last_offset.
+
+    Exactly six rows are written to a freshly created table, so the test
+    requires exactly six rows back: truncating the result before comparing
+    would hide duplicated or extra rows. The `async for` runs under
+    ``asyncio.wait_for`` so missing data fails the test instead of hanging it.
+    """
+    table_path = fluss.TablePath("fluss", "py_test_batch_async_iter")
+    await admin.drop_table(table_path, ignore_if_not_exists=True)
+
+    schema = fluss.Schema(
+        pa.schema([pa.field("id", pa.int32()), pa.field("val", pa.string())])
+    )
+    await admin.create_table(table_path, fluss.TableDescriptor(schema))
+
+    table = await connection.get_table(table_path)
+    writer = table.new_append().create_writer()
+    writer.write_arrow_batch(
+        pa.RecordBatch.from_arrays(
+            [
+                pa.array(list(range(1, 7)), type=pa.int32()),
+                pa.array([f"bv{i}" for i in range(1, 7)]),
+            ],
+            schema=pa.schema(
+                [pa.field("id", pa.int32()), pa.field("val", pa.string())]
+            ),
+        )
+    )
+    await writer.flush()
+
+    batch_scanner = await table.new_scan().create_record_batch_log_scanner()
+    num_buckets = (await admin.get_table_info(table_path)).num_buckets
+    batch_scanner.subscribe_buckets(
+        {i: fluss.EARLIEST_OFFSET for i in range(num_buckets)}
+    )
+
+    collected_batches = []
+    total_rows = 0
+
+    async def consume_batches():
+        nonlocal total_rows
+        # Batches arrive whole, so stop as soon as the row total reaches 6.
+        async for rb in batch_scanner:
+            collected_batches.append(rb)
+            total_rows += rb.batch.num_rows
+            if total_rows >= 6:
+                break
+
+    # Bound the wait so a scanner that never yields cannot stall the run.
+    await asyncio.wait_for(consume_batches(), timeout=30)
+
+    assert total_rows == 6, f"Expected 6 total rows, got {total_rows}"
+    assert len(collected_batches) > 0
+
+    # Verify each yielded item is a RecordBatch with expected attributes
+    for rb in collected_batches:
+        assert hasattr(rb, "batch"), "RecordBatch should have .batch"
+        assert hasattr(rb, "bucket"), "RecordBatch should have .bucket"
+        assert hasattr(rb, "base_offset"), "RecordBatch should have .base_offset"
+        assert hasattr(rb, "last_offset"), "RecordBatch should have .last_offset"
+        # .batch should be a PyArrow RecordBatch
+        arrow_batch = rb.batch
+        assert isinstance(arrow_batch, pa.RecordBatch), (
+            f"Expected PyArrow RecordBatch, got {type(arrow_batch).__name__}"
+        )
+        assert arrow_batch.num_columns == 2
+        assert set(arrow_batch.schema.names) == {"id", "val"}
+
+    # Verify exactly the 6 written IDs came back (no slicing, so duplicates fail)
+    all_ids = []
+    for rb in collected_batches:
+        all_ids.extend(rb.batch.column("id").to_pylist())
+    assert sorted(all_ids) == [1, 2, 3, 4, 5, 6]
+
+    await admin.drop_table(table_path, ignore_if_not_exists=False)
+
+
+async def test_batch_async_iterator_break_no_leak(connection, admin):
+    """Verify that breaking out of batch `async for` does not leak resources.
+
+    After breaking, the scanner must still be usable for synchronous
+    poll_record_batch() calls, proving no leaked task or lock.
+
+    The `async for` phase runs under ``asyncio.wait_for`` so that a scanner
+    which never yields a batch fails the test instead of hanging it.
+    """
+    table_path = fluss.TablePath("fluss", "py_test_batch_async_break")
+    await admin.drop_table(table_path, ignore_if_not_exists=True)
+
+    schema = fluss.Schema(
+        pa.schema([pa.field("id", pa.int32()), pa.field("val", pa.string())])
+    )
+    await admin.create_table(table_path, fluss.TableDescriptor(schema))
+
+    table = await connection.get_table(table_path)
+    writer = table.new_append().create_writer()
+    writer.write_arrow_batch(
+        pa.RecordBatch.from_arrays(
+            [
+                pa.array(list(range(1, 11)), type=pa.int32()),
+                pa.array([f"bl{i}" for i in range(1, 11)]),
+            ],
+            schema=pa.schema(
+                [pa.field("id", pa.int32()), pa.field("val", pa.string())]
+            ),
+        )
+    )
+    await writer.flush()
+
+    batch_scanner = await table.new_scan().create_record_batch_log_scanner()
+    num_buckets = (await admin.get_table_info(table_path)).num_buckets
+    batch_scanner.subscribe_buckets(
+        {i: fluss.EARLIEST_OFFSET for i in range(num_buckets)}
+    )
+
+    # Phase 1: async for with early break (collect just 1 batch)
+    first_batch = None
+
+    async def consume_and_break():
+        nonlocal first_batch
+        async for rb in batch_scanner:
+            first_batch = rb
+            break
+
+    # Bound the wait so a scanner that never yields cannot stall the run.
+    await asyncio.wait_for(consume_and_break(), timeout=30)
+    assert first_batch is not None, "Should have received at least 1 batch"
+    assert first_batch.batch.num_rows > 0
+
+    # Phase 2: sync poll_record_batch() must still work — proves no leak
+    remaining = await batch_scanner.poll_record_batch(2000)
+    assert remaining is not None, "poll_record_batch() should return (not deadlock)"
+
+    await admin.drop_table(table_path, ignore_if_not_exists=False)
+
+
+async def test_batch_async_iterator_multiple_batches(connection, admin):
+    """Verify batch async iteration works across multiple network poll cycles.
+
+    Writing 20 records to 3 buckets ensures the generator must loop through
+    several _async_poll_batches calls to collect them all.
+
+    Exactly 20 rows are written to a freshly created table, so exactly 20
+    IDs are required back: truncating the list before comparing would hide
+    duplicated or extra rows. The `async for` runs under
+    ``asyncio.wait_for`` so missing data fails the test instead of hanging it.
+    """
+    table_path = fluss.TablePath("fluss", "py_test_batch_async_multi")
+    await admin.drop_table(table_path, ignore_if_not_exists=True)
+
+    schema = fluss.Schema(
+        pa.schema([pa.field("id", pa.int32()), pa.field("val", pa.string())])
+    )
+    table_descriptor = fluss.TableDescriptor(
+        schema, bucket_count=3, bucket_keys=["id"]
+    )
+    await admin.create_table(
+        table_path, table_descriptor, ignore_if_exists=False
+    )
+
+    table = await connection.get_table(table_path)
+    writer = table.new_append().create_writer()
+
+    num_records = 20
+    writer.write_arrow_batch(
+        pa.RecordBatch.from_arrays(
+            [
+                pa.array(list(range(1, num_records + 1)), type=pa.int32()),
+                pa.array([f"bm{i}" for i in range(1, num_records + 1)]),
+            ],
+            schema=pa.schema(
+                [pa.field("id", pa.int32()), pa.field("val", pa.string())]
+            ),
+        )
+    )
+    await writer.flush()
+
+    batch_scanner = await table.new_scan().create_record_batch_log_scanner()
+    num_buckets = (await admin.get_table_info(table_path)).num_buckets
+    batch_scanner.subscribe_buckets(
+        {i: fluss.EARLIEST_OFFSET for i in range(num_buckets)}
+    )
+
+    all_ids = []
+
+    async def consume_all():
+        # Batches arrive whole, so stop once the ID count reaches the total written.
+        async for rb in batch_scanner:
+            all_ids.extend(rb.batch.column("id").to_pylist())
+            if len(all_ids) >= num_records:
+                break
+
+    # Bound the wait so a scanner that never yields cannot stall the run.
+    await asyncio.wait_for(consume_all(), timeout=30)
+    assert len(all_ids) == num_records, (
+        f"Expected {num_records} IDs, got {len(all_ids)}"
+    )
+    # Compare the full list (no slicing) so duplicated rows are detected.
+    assert sorted(all_ids) == list(range(1, num_records + 1))
+
+    await admin.drop_table(table_path, ignore_if_not_exists=False)
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+async def _poll_records(scanner, expected_count, timeout_s=10):
+    """Poll a record-based scanner until expected_count records are collected.
+
+    Args:
+        scanner: object exposing ``await scanner.poll(timeout_ms)`` whose
+            result can be iterated to obtain records.
+        expected_count: stop as soon as at least this many records are held.
+        timeout_s: overall time budget in seconds.
+
+    Returns:
+        The list of records collected so far; it may hold fewer than
+        ``expected_count`` items if the budget ran out first.
+    """
+    collected = []
+    deadline = time.monotonic() + timeout_s
+    while len(collected) < expected_count:
+        # Cap each poll at the time left so the helper honours timeout_s
+        # instead of overshooting it by up to one full 5 s poll.
+        remaining_ms = int((deadline - time.monotonic()) * 1000)
+        if remaining_ms <= 0:
+            break
+        records = await scanner.poll(min(5000, remaining_ms))
+        collected.extend(records)
+    return collected
+
+
+async def _poll_arrow_ids(scanner, expected_count, timeout_s=10):
+    """Poll a batch scanner and extract 'id' column values.
+
+    Args:
+        scanner: object exposing ``await scanner.poll_arrow(timeout_ms)``
+            returning a table with ``num_rows`` and ``column("id")``.
+        expected_count: stop as soon as at least this many ids are held.
+        timeout_s: overall time budget in seconds.
+
+    Returns:
+        The list of ``id`` values collected so far; it may hold fewer than
+        ``expected_count`` items if the budget ran out first.
+    """
+    all_ids = []
+    deadline = time.monotonic() + timeout_s
+    while len(all_ids) < expected_count:
+        # Cap each poll at the time left so the helper honours timeout_s
+        # instead of overshooting it by up to one full 5 s poll.
+        remaining_ms = int((deadline - time.monotonic()) * 1000)
+        if remaining_ms <= 0:
+            break
+        arrow_table = await scanner.poll_arrow(min(5000, remaining_ms))
+        if arrow_table.num_rows > 0:
+            all_ids.extend(arrow_table.column("id").to_pylist())
+    return all_ids
+
+
+async def test_append_and_scan_with_array(connection, admin):
+    """Test appending and scanning with array columns.
+
+    Two Arrow batches are written: one with ordinary lists, one mixing null
+    elements, null lists and empty lists. The rows are read back both
+    record-by-record and through ``to_arrow()``, and compared after ordering
+    by ``id``. Every bucket of the table is subscribed and the table is
+    dropped at the end, matching the other tests in this file.
+    """
+    table_path = fluss.TablePath("fluss", "py_test_append_and_scan_with_array")
+    await admin.drop_table(table_path, ignore_if_not_exists=True)
+
+    pa_schema = pa.schema(
+        [
+            pa.field("id", pa.int32()),
+            pa.field("tags", pa.list_(pa.string())),
+            pa.field("scores", pa.list_(pa.int32())),
+        ]
+    )
+    schema = fluss.Schema(pa_schema)
+    table_descriptor = fluss.TableDescriptor(schema)
+    await admin.create_table(table_path, table_descriptor, ignore_if_exists=False)
+
+    table = await connection.get_table(table_path)
+    append_writer = table.new_append().create_writer()
+
+    # Batch 1: Testing standard lists
+    batch1 = pa.RecordBatch.from_arrays(
+        [
+            pa.array([1, 2], type=pa.int32()),
+            pa.array([["a", "b"], ["c"]], type=pa.list_(pa.string())),
+            pa.array([[10, 20], [30]], type=pa.list_(pa.int32())),
+        ],
+        schema=pa_schema,
+    )
+    append_writer.write_arrow_batch(batch1)
+
+    # Batch 2: Testing null values inside arrays and null arrays
+    batch2 = pa.RecordBatch.from_arrays(
+        [
+            pa.array([3, 4, 5, 6], type=pa.int32()),
+            pa.array([["d", None], None, [], [None]], type=pa.list_(pa.string())),
+            pa.array([[40, 50], [60], None, []], type=pa.list_(pa.int32())),
+        ],
+        schema=pa_schema,
+    )
+    append_writer.write_arrow_batch(batch2)
+    await append_writer.flush()
+
+    # Expected column values for ids 1..6, in id order.
+    expected_tags = [["a", "b"], ["c"], ["d", None], None, [], [None]]
+    expected_scores = [[10, 20], [30], [40, 50], [60], None, []]
+
+    # Subscribe to every bucket rather than assuming all rows land in bucket 0.
+    num_buckets = (await admin.get_table_info(table_path)).num_buckets
+    all_buckets = {i: fluss.EARLIEST_OFFSET for i in range(num_buckets)}
+
+    # Verify via LogScanner (record-by-record)
+    scanner = await table.new_scan().create_log_scanner()
+    scanner.subscribe_buckets(all_buckets)
+    records = await _poll_records(scanner, expected_count=6)
+
+    assert len(records) == 6
+    records.sort(key=lambda r: r.row["id"])
+    assert [r.row["id"] for r in records] == [1, 2, 3, 4, 5, 6]
+    assert [r.row["tags"] for r in records] == expected_tags
+    assert [r.row["scores"] for r in records] == expected_scores
+
+    # Verify via to_arrow (batch-based)
+    scanner2 = await table.new_scan().create_record_batch_log_scanner()
+    scanner2.subscribe_buckets(all_buckets)
+    result_table = await scanner2.to_arrow()
+
+    assert result_table.num_rows == 6
+    # Order rows by id before comparing so the check does not depend on the
+    # order in which rows are concatenated into the result table.
+    ids = result_table.column("id").to_pylist()
+    order = sorted(range(len(ids)), key=ids.__getitem__)
+    tags = result_table.column("tags").to_pylist()
+    scores = result_table.column("scores").to_pylist()
+    assert [ids[i] for i in order] == [1, 2, 3, 4, 5, 6]
+    assert [tags[i] for i in order] == expected_tags
+    assert [scores[i] for i in order] == expected_scores
+
+    # Clean up so the table does not outlive the test.
+    await admin.drop_table(table_path, ignore_if_not_exists=False)
+
+
+
+
+async def test_append_rows_with_array(connection, admin):
+    """Append rows whose array columns are given as Python lists, then scan them back."""
+    path = fluss.TablePath("fluss", "py_test_append_rows_with_array")
+    await admin.drop_table(path, ignore_if_not_exists=True)
+
+    arrow_schema = pa.schema(
+        [
+            pa.field("id", pa.int32()),
+            pa.field("tags", pa.list_(pa.string())),
+            pa.field("scores", pa.list_(pa.int32())),
+        ]
+    )
+    await admin.create_table(
+        path,
+        fluss.TableDescriptor(fluss.Schema(arrow_schema)),
+        ignore_if_exists=False,
+    )
+
+    table = await connection.get_table(path)
+    writer = table.new_append().create_writer()
+
+    # Two rows passed as dicts, then one positional row carrying a null
+    # array and an array with a null element.
+    writer.append({"id": 1, "tags": ["a", "b"], "scores": [10, 20]})
+    writer.append({"id": 2, "tags": ["c"], "scores": [30]})
+    writer.append([3, None, [40, None, 60]])
+    await writer.flush()
+
+    scanner = await table.new_scan().create_log_scanner()
+    table_info = await admin.get_table_info(path)
+    scanner.subscribe_buckets(
+        dict.fromkeys(range(table_info.num_buckets), fluss.EARLIEST_OFFSET)
+    )
+
+    records = await _poll_records(scanner, expected_count=3)
+    assert len(records) == 3
+
+    # Compare in id order; exactly three rows are present at this point.
+    first, second, third = sorted(
+        (rec.row for rec in records), key=lambda row: row["id"]
+    )
+    assert first == {"id": 1, "tags": ["a", "b"], "scores": [10, 20]}
+    assert second == {"id": 2, "tags": ["c"], "scores": [30]}
+    # The positional row: null tags, and scores keeping its inner null.
+    assert third["id"] == 3
+    assert third["tags"] is None
+    assert third["scores"] == [40, None, 60]
+
+    await admin.drop_table(path, ignore_if_not_exists=False)
+
+
+async def test_append_rows_with_nested_array(connection, admin):
+    """Test appending rows with nested array data (ARRAY<ARRAY<INT>>) and scanning."""
+    table_path = fluss.TablePath("fluss", "py_test_append_rows_with_nested_array")
+    await admin.drop_table(table_path, ignore_if_not_exists=True)
+
+    pa_schema = pa.schema([
+        pa.field("id", pa.int32()),
+        pa.field("matrix", pa.list_(pa.list_(pa.int32()))),
+    ])
+    schema = fluss.Schema(pa_schema)
+    await admin.create_table(table_path, fluss.TableDescriptor(schema), ignore_if_exists=False)
+
+    table = await connection.get_table(table_path)
+    append_writer = table.new_append().create_writer()
+
+    # Append nested lists
+    append_writer.append({"id": 1, "matrix": [[1, 2], [3, 4]]})
+    append_writer.append({"id": 2, "matrix": [[], [5], [6, 7, 8]]})
+    append_writer.append({"id": 3, "matrix": None})
+    append_writer.append({"id": 4, "matrix": [[1, None], None, []]})
+    append_writer.append({"id": 5, "matrix": [[None, None]]})
+    
+    await append_writer.flush()
+
+    scanner = await table.new_scan().create_log_scanner()
+    num_buckets = (await admin.get_table_info(table_path)).num_buckets
+    scanner.subscribe_buckets({i: fluss.EARLIEST_OFFSET for i in range(num_buckets)})
+
+    records = await _poll_records(scanner, expected_count=5)
+    assert len(records) == 5
+
+    rows = sorted([r.row for r in records], key=lambda r: r["id"])
+    assert rows[0] == {"id": 1, "matrix": [[1, 2], [3, 4]]}
+    assert rows[1] == {"id": 2, "matrix": [[], [5], [6, 7, 8]]}
+    assert rows[2] == {"id": 3, "matrix": None}
+    assert rows[3] == {"id": 4, "matrix": [[1, None], None, []]}
+    assert rows[4] == {"id": 5, "matrix": [[None, None]]}
+
+    await admin.drop_table(table_path, ignore_if_not_exists=False)
+
+
+async def test_append_rows_with_invalid_array(connection, admin):
+    """Test that appending invalid data to an array column raises an error."""
+    table_path = fluss.TablePath("fluss", "py_test_append_rows_with_invalid_array")
+    await admin.drop_table(table_path, ignore_if_not_exists=True)
+
+    pa_schema = pa.schema([
+        pa.field("id", pa.int32()),
+        pa.field("tags", pa.list_(pa.string())),
+    ])
+    schema = fluss.Schema(pa_schema)
+    await admin.create_table(table_path, fluss.TableDescriptor(schema), ignore_if_exists=False)
+
+    table = await connection.get_table(table_path)
+    append_writer = table.new_append().create_writer()
+
+    # Appending a string instead of a list should raise an error
+    with pytest.raises(Exception, match="Expected sequence for Array column"):
+        append_writer.append({"id": 4, "tags": "not_a_list"})
+    
+    await admin.drop_table(table_path, ignore_if_not_exists=False)
diff --git a/fluss-rust/bindings/python/test/test_sasl_auth.py b/fluss-rust/bindings/python/test/test_sasl_auth.py
new file mode 100644
index 0000000000..6889f1ab67
--- /dev/null
+++ b/fluss-rust/bindings/python/test/test_sasl_auth.py
@@ -0,0 +1,108 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""Integration tests for SASL/PLAIN authentication.
+
+Mirrors the Rust integration tests in crates/fluss/tests/integration/sasl_auth.rs.
+"""
+
+import pytest
+
+import fluss
+
+
+async def test_sasl_connect_with_valid_credentials(sasl_bootstrap_servers):
+    """Verify that a client with correct SASL credentials can connect and perform operations."""
+    config = fluss.Config({
+        "bootstrap.servers": sasl_bootstrap_servers,
+        "security.protocol": "sasl",
+        "security.sasl.mechanism": "PLAIN",
+        "security.sasl.username": "admin",
+        "security.sasl.password": "admin-secret",
+    })
+    conn = await fluss.FlussConnection.create(config)
+    admin = conn.get_admin()
+
+    db_name = "py_sasl_test_valid_db"
+    db_descriptor = fluss.DatabaseDescriptor(comment="created via SASL auth")
+    await admin.create_database(db_name, db_descriptor, ignore_if_exists=True)
+
+    assert await admin.database_exists(db_name)
+
+    # Cleanup
+    await admin.drop_database(db_name, ignore_if_not_exists=True, cascade=True)
+    await conn.close()
+
+
+async def test_sasl_connect_with_second_user(sasl_bootstrap_servers):
+    """Verify that a second user can also authenticate successfully."""
+    config = fluss.Config({
+        "bootstrap.servers": sasl_bootstrap_servers,
+        "security.protocol": "sasl",
+        "security.sasl.mechanism": "PLAIN",
+        "security.sasl.username": "alice",
+        "security.sasl.password": "alice-secret",
+    })
+    conn = await fluss.FlussConnection.create(config)
+    admin = conn.get_admin()
+
+    # Basic operation to confirm functional connection
+    assert not await admin.database_exists("some_nonexistent_db_alice")
+    await conn.close()
+
+
+async def test_sasl_connect_with_wrong_password(sasl_bootstrap_servers):
+    """Verify that wrong credentials are rejected with AUTHENTICATE_EXCEPTION."""
+    config = fluss.Config({
+        "bootstrap.servers": sasl_bootstrap_servers,
+        "security.protocol": "sasl",
+        "security.sasl.mechanism": "PLAIN",
+        "security.sasl.username": "admin",
+        "security.sasl.password": "wrong-password",
+    })
+    with pytest.raises(fluss.FlussError) as exc_info:
+        await fluss.FlussConnection.create(config)
+
+    assert exc_info.value.error_code == fluss.ErrorCode.AUTHENTICATE_EXCEPTION
+
+
+async def test_sasl_connect_with_unknown_user(sasl_bootstrap_servers):
+    """Verify that a nonexistent user is rejected with AUTHENTICATE_EXCEPTION."""
+    config = fluss.Config({
+        "bootstrap.servers": sasl_bootstrap_servers,
+        "security.protocol": "sasl",
+        "security.sasl.mechanism": "PLAIN",
+        "security.sasl.username": "nonexistent_user",
+        "security.sasl.password": "some-password",
+    })
+    with pytest.raises(fluss.FlussError) as exc_info:
+        await fluss.FlussConnection.create(config)
+
+    assert exc_info.value.error_code == fluss.ErrorCode.AUTHENTICATE_EXCEPTION
+
+
+async def test_sasl_client_to_plaintext_server(plaintext_bootstrap_servers):
+    """Verify that a SASL-configured client fails when connecting to a plaintext server."""
+    config = fluss.Config({
+        "bootstrap.servers": plaintext_bootstrap_servers,
+        "security.protocol": "sasl",
+        "security.sasl.mechanism": "PLAIN",
+        "security.sasl.username": "admin",
+        "security.sasl.password": "admin-secret",
+    })
+    with pytest.raises(fluss.FlussError):
+        await fluss.FlussConnection.create(config)
diff --git a/fluss-rust/bindings/python/test/test_schema.py b/fluss-rust/bindings/python/test/test_schema.py
new file mode 100644
index 0000000000..dfd9cf5619
--- /dev/null
+++ b/fluss-rust/bindings/python/test/test_schema.py
@@ -0,0 +1,97 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""Unit tests for Schema (no cluster required)."""
+
+import pyarrow as pa
+
+import fluss
+
+
+def test_get_primary_keys():
+    fields = pa.schema([
+        pa.field("id", pa.int32()),
+        pa.field("name", pa.string()),
+    ])
+
+    schema_with_pk = fluss.Schema(fields, primary_keys=["id"])
+    assert schema_with_pk.get_primary_keys() == ["id"]
+
+    schema_without_pk = fluss.Schema(fields)
+    assert schema_without_pk.get_primary_keys() == []
+
+
+def test_schema_with_array():
+    # Test that a schema can be constructed from a pyarrow schema containing a list
+    fields = pa.schema(
+        [
+            pa.field("id", pa.int32()),
+            pa.field("tags", pa.list_(pa.string())),
+        ]
+    )
+    schema = fluss.Schema(fields)
+    assert schema.get_column_names() == ["id", "tags"]
+    assert schema.get_column_types() == ["int", "array<string>"]
+
+
+def test_nullable_fields():
+    fields = pa.schema(
+        [
+            pa.field("id", pa.int32(), nullable=False),
+            pa.field("name", pa.string()),
+        ]
+    )
+    schema = fluss.Schema(fields)
+    assert schema.get_column_types() == ["int NOT NULL", "string"]
+    assert schema.get_columns() == [("id", "int NOT NULL"), ("name", "string")]
+
+
+def test_pk_forces_non_nullable():
+    fields = pa.schema(
+        [
+            pa.field("id", pa.int32()),
+            pa.field("name", pa.string()),
+        ]
+    )
+    schema = fluss.Schema(fields, primary_keys=["id"])
+    types = schema.get_column_types()
+    assert types[0] == "int NOT NULL"
+    assert types[1] == "string"
+
+
+def test_nested_list_nullability():
+    fields = pa.schema(
+        [
+            pa.field(
+                "tags",
+                pa.list_(pa.field("item", pa.string(), nullable=False)),
+            ),
+            pa.field("ids", pa.list_(pa.int32()), nullable=False),
+            pa.field(
+                "strict_ids",
+                pa.list_(pa.field("item", pa.int32(), nullable=False)),
+                nullable=False,
+            ),
+        ]
+    )
+    schema = fluss.Schema(fields)
+    types = schema.get_column_types()
+    assert types[0] == "array<string NOT NULL>"
+    assert types[1] == "array<int> NOT NULL"
+    assert types[2] == "array<int NOT NULL> NOT NULL"
+
+
diff --git a/fluss-rust/copyright.txt b/fluss-rust/copyright.txt
new file mode 100644
index 0000000000..d5519133ed
--- /dev/null
+++ b/fluss-rust/copyright.txt
@@ -0,0 +1,17 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
\ No newline at end of file
diff --git a/fluss-rust/crates/examples/Cargo.toml b/fluss-rust/crates/examples/Cargo.toml
new file mode 100644
index 0000000000..45f029ee8c
--- /dev/null
+++ b/fluss-rust/crates/examples/Cargo.toml
@@ -0,0 +1,52 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+[package]
+edition = { workspace = true }
+license = { workspace = true }
+name = "fluss-examples"
+rust-version = { workspace = true }
+version = { workspace = true }
+
+
+[dependencies]
+fluss = { workspace = true, features = ["storage-all"] }
+tokio = { workspace = true }
+clap = { workspace = true }
+
+[target.'cfg(not(target_env = "msvc"))'.dependencies]
+tikv-jemallocator = "0.6"
+
+[[example]]
+name = "example-table"
+path = "src/example_table.rs"
+
+[[example]]
+name = "example-upsert-lookup"
+path = "src/example_kv_table.rs"
+
+[[example]]
+name = "example-partitioned-upsert-lookup"
+path = "src/example_partitioned_kv_table.rs"
+
+[[example]]
+name = "example-prefix-lookup"
+path = "src/example_prefix_lookup.rs"
+
+[[example]]
+name = "example-partitioned-prefix-lookup"
+path = "src/example_partitioned_prefix_lookup.rs"
diff --git a/fluss-rust/crates/examples/DEPENDENCIES.rust.tsv b/fluss-rust/crates/examples/DEPENDENCIES.rust.tsv
new file mode 100644
index 0000000000..5af4754d0c
--- /dev/null
+++ b/fluss-rust/crates/examples/DEPENDENCIES.rust.tsv
@@ -0,0 +1,300 @@
+crate	Apache-2.0	Apache-2.0 WITH LLVM-exception	BSD-2-Clause	BSD-3-Clause	BSL-1.0	CC0-1.0	CDLA-Permissive-2.0	ISC	LGPL-2.1-or-later	MIT	Unicode-3.0	Unlicense	Zlib
+ahash@0.8.12	X									X			
+aho-corasick@1.1.4										X		X	
+android_system_properties@0.1.5	X									X			
+anstream@1.0.0	X									X			
+anstyle@1.0.14	X									X			
+anstyle-parse@1.0.0	X									X			
+anstyle-query@1.1.5	X									X			
+anstyle-wincon@3.0.11	X									X			
+anyhow@1.0.102	X									X			
+arrow@57.3.0	X												
+arrow-arith@57.3.0	X												
+arrow-array@57.3.0	X												
+arrow-buffer@57.3.0	X												
+arrow-cast@57.3.0	X												
+arrow-csv@57.3.0	X												
+arrow-data@57.3.0	X												
+arrow-ipc@57.3.0	X												
+arrow-json@57.3.0	X												
+arrow-ord@57.3.0	X												
+arrow-row@57.3.0	X												
+arrow-schema@57.3.0	X												
+arrow-select@57.3.0	X												
+arrow-string@57.3.0	X												
+async-trait@0.1.89	X									X			
+atoi@2.0.0										X			
+atomic-waker@1.1.2	X									X			
+autocfg@1.5.0	X									X			
+backon@1.6.0	X												
+base64@0.22.1	X									X			
+bigdecimal@0.4.10	X									X			
+bitflags@2.11.0	X									X			
+bitvec@1.0.1										X			
+block-buffer@0.10.4	X									X			
+bumpalo@3.20.2	X									X			
+byteorder@1.5.0										X		X	
+bytes@1.11.1										X			
+cc@1.2.57	X									X			
+cfg-if@1.0.4	X									X			
+chrono@0.4.44	X									X			
+clap@4.6.0	X									X			
+clap_builder@4.6.0	X									X			
+clap_derive@4.6.0	X									X			
+clap_lex@1.1.0	X									X			
+colorchoice@1.0.5	X									X			
+const-oid@0.9.6	X									X			
+const-random@0.1.18	X									X			
+const-random-macro@0.1.16	X									X			
+core-foundation-sys@0.8.7	X									X			
+cpufeatures@0.2.17	X									X			
+crc32c@0.6.8	X									X			
+crossbeam-utils@0.8.21	X									X			
+crunchy@0.2.4										X			
+crypto-common@0.1.7	X									X			
+csv@1.4.0										X		X	
+csv-core@0.1.13										X		X	
+dashmap@6.1.0										X			
+delegate@0.13.5	X									X			
+digest@0.10.7	X									X			
+displaydoc@0.2.5	X									X			
+either@1.15.0	X									X			
+equivalent@1.0.2	X									X			
+errno@0.3.14	X									X			
+fastrand@2.3.0	X									X			
+find-msvc-tools@0.1.9	X									X			
+fixedbitset@0.5.7	X									X			
+flatbuffers@25.12.19	X												
+fluss-examples@0.1.0	X												
+fluss-rs@0.1.0	X												
+fnv@1.0.7	X									X			
+foldhash@0.1.5													X
+form_urlencoded@1.2.2	X									X			
+funty@2.0.0										X			
+futures@0.3.32	X									X			
+futures-channel@0.3.32	X									X			
+futures-core@0.3.32	X									X			
+futures-executor@0.3.32	X									X			
+futures-io@0.3.32	X									X			
+futures-macro@0.3.32	X									X			
+futures-sink@0.3.32	X									X			
+futures-task@0.3.32	X									X			
+futures-util@0.3.32	X									X			
+generic-array@0.14.7										X			
+getrandom@0.2.17	X									X			
+getrandom@0.3.4	X									X			
+getrandom@0.4.2	X									X			
+gloo-timers@0.3.0	X									X			
+h2@0.4.13										X			
+half@2.7.1	X									X			
+hashbrown@0.14.5	X									X			
+hashbrown@0.15.5	X									X			
+hashbrown@0.16.1	X									X			
+heck@0.5.0	X									X			
+hex@0.4.3	X									X			
+hmac@0.12.1	X									X			
+home@0.5.12	X									X			
+http@1.4.0	X									X			
+http-body@1.0.1										X			
+http-body-util@0.1.3										X			
+httparse@1.10.1	X									X			
+httpdate@1.0.3	X									X			
+hyper@1.8.1										X			
+hyper-rustls@0.27.7	X							X		X			
+hyper-util@0.1.20										X			
+iana-time-zone@0.1.65	X									X			
+iana-time-zone-haiku@0.1.2	X									X			
+icu_collections@2.1.1											X		
+icu_locale_core@2.1.1											X		
+icu_normalizer@2.1.1											X		
+icu_normalizer_data@2.1.1											X		
+icu_properties@2.1.2											X		
+icu_properties_data@2.1.2											X		
+icu_provider@2.1.1											X		
+idna@1.1.0	X									X			
+idna_adapter@1.2.1	X									X			
+indexmap@2.13.0	X									X			
+ipnet@2.12.0	X									X			
+iri-string@0.7.11	X									X			
+is_terminal_polyfill@1.70.2	X									X			
+itertools@0.14.0	X									X			
+itoa@1.0.18	X									X			
+jiff@0.2.23										X		X	
+jiff-tzdb@0.1.6										X		X	
+jiff-tzdb-platform@0.1.3										X		X	
+jobserver@0.1.34	X									X			
+js-sys@0.3.91	X									X			
+lexical-core@1.0.6	X									X			
+lexical-parse-float@1.0.6	X									X			
+lexical-parse-integer@1.0.6	X									X			
+lexical-util@1.0.7	X									X			
+lexical-write-float@1.0.6	X									X			
+lexical-write-integer@1.0.6	X									X			
+libc@0.2.183	X									X			
+libm@0.2.16										X			
+linked-hash-map@0.5.6	X									X			
+linux-raw-sys@0.12.1	X	X								X			
+litemap@0.8.1											X		
+lock_api@0.4.14	X									X			
+log@0.4.29	X									X			
+lz4_flex@0.12.1										X			
+md-5@0.10.6	X									X			
+memchr@2.8.0										X		X	
+mio@1.1.1										X			
+multimap@0.10.1	X									X			
+num-bigint@0.4.6	X									X			
+num-complex@0.4.6	X									X			
+num-integer@0.1.46	X									X			
+num-traits@0.2.19	X									X			
+once_cell@1.21.4	X									X			
+once_cell_polyfill@1.70.2	X									X			
+opendal@0.55.0	X												
+ordered-float@5.1.0										X			
+parking_lot@0.12.5	X									X			
+parking_lot_core@0.9.12	X									X			
+parse-display@0.10.0	X									X			
+parse-display-derive@0.10.0	X									X			
+percent-encoding@2.3.2	X									X			
+petgraph@0.8.3	X									X			
+pin-project-lite@0.2.17	X									X			
+pin-utils@0.1.0	X									X			
+pkg-config@0.3.32	X									X			
+portable-atomic@1.13.1	X									X			
+portable-atomic-util@0.2.6	X									X			
+potential_utf@0.1.4											X		
+ppv-lite86@0.2.21	X									X			
+prettyplease@0.2.37	X									X			
+proc-macro2@1.0.106	X									X			
+prost@0.14.3	X												
+prost-build@0.14.3	X												
+prost-derive@0.14.3	X												
+prost-types@0.14.3	X												
+quick-xml@0.37.5										X			
+quick-xml@0.38.4										X			
+quote@1.0.45	X									X			
+r-efi@5.3.0	X								X	X			
+r-efi@6.0.0	X								X	X			
+radium@0.7.0										X			
+rand@0.8.5	X									X			
+rand@0.9.2	X									X			
+rand_chacha@0.3.1	X									X			
+rand_chacha@0.9.0	X									X			
+rand_core@0.6.4	X									X			
+rand_core@0.9.5	X									X			
+redox_syscall@0.5.18										X			
+regex@1.12.3	X									X			
+regex-automata@0.4.14	X									X			
+regex-syntax@0.8.10	X									X			
+reqsign@0.16.5	X												
+reqwest@0.12.28	X									X			
+ring@0.17.14	X							X					
+rustc_version@0.4.1	X									X			
+rustix@1.1.4	X	X								X			
+rustls@0.23.37	X							X		X			
+rustls-pki-types@1.14.0	X									X			
+rustls-webpki@0.103.10								X					
+rustversion@1.0.22	X									X			
+ryu@1.0.23	X				X								
+scopeguard@1.2.0	X									X			
+semver@1.0.27	X									X			
+serde@1.0.228	X									X			
+serde_core@1.0.228	X									X			
+serde_derive@1.0.228	X									X			
+serde_json@1.0.149	X									X			
+serde_urlencoded@0.7.1	X									X			
+sha1@0.10.6	X									X			
+sha2@0.10.9	X									X			
+shlex@1.3.0	X									X			
+signal-hook-registry@1.4.8	X									X			
+simdutf8@0.1.5	X									X			
+slab@0.4.12										X			
+smallvec@1.15.1	X									X			
+snafu@0.8.9	X									X			
+snafu-derive@0.8.9	X									X			
+socket2@0.6.3	X									X			
+stable_deref_trait@1.2.1	X									X			
+strsim@0.11.1										X			
+structmeta@0.3.0	X									X			
+structmeta-derive@0.3.0	X									X			
+strum@0.26.3										X			
+strum_macros@0.26.4										X			
+subtle@2.6.1				X									
+syn@2.0.117	X									X			
+sync_wrapper@1.0.2	X												
+synstructure@0.13.2										X			
+tap@1.0.1										X			
+tempfile@3.27.0	X									X			
+thiserror@1.0.69	X									X			
+thiserror-impl@1.0.69	X									X			
+tikv-jemalloc-sys@0.6.1+5.3.0-1-ge13ca993e8ccb9ba9847cc330696e02839f328f7	X									X			
+tikv-jemallocator@0.6.1	X									X			
+tiny-keccak@2.0.2						X							
+tinystr@0.8.2											X		
+tokio@1.50.0										X			
+tokio-macros@2.6.1										X			
+tokio-rustls@0.26.4	X									X			
+tokio-util@0.7.18										X			
+tower@0.5.3										X			
+tower-http@0.6.8										X			
+tower-layer@0.3.3										X			
+tower-service@0.3.3										X			
+tracing@0.1.44										X			
+tracing-attributes@0.1.31										X			
+tracing-core@0.1.36										X			
+try-lock@0.2.5										X			
+twox-hash@2.1.2										X			
+typenum@1.19.0	X									X			
+unicode-ident@1.0.24	X									X	X		
+untrusted@0.9.0								X					
+url@2.5.8	X									X			
+utf8_iter@1.0.4	X									X			
+utf8parse@0.2.2	X									X			
+uuid@1.22.0	X									X			
+value-bag@1.12.0	X									X			
+version_check@0.9.5	X									X			
+want@0.3.1										X			
+wasi@0.11.1+wasi-snapshot-preview1	X	X								X			
+wasip2@1.0.2+wasi-0.2.9	X	X								X			
+wasip3@0.4.0+wasi-0.3.0-rc-2026-01-06	X	X								X			
+wasm-bindgen@0.2.114	X									X			
+wasm-bindgen-futures@0.4.64	X									X			
+wasm-bindgen-macro@0.2.114	X									X			
+wasm-bindgen-macro-support@0.2.114	X									X			
+wasm-bindgen-shared@0.2.114	X									X			
+wasm-streams@0.4.2	X									X			
+web-sys@0.3.91	X									X			
+webpki-roots@1.0.6							X						
+windows-core@0.62.2	X									X			
+windows-implement@0.60.2	X									X			
+windows-interface@0.59.3	X									X			
+windows-link@0.2.1	X									X			
+windows-result@0.4.1	X									X			
+windows-strings@0.5.1	X									X			
+windows-sys@0.52.0	X									X			
+windows-sys@0.61.2	X									X			
+windows-targets@0.52.6	X									X			
+windows_aarch64_gnullvm@0.52.6	X									X			
+windows_aarch64_msvc@0.52.6	X									X			
+windows_i686_gnu@0.52.6	X									X			
+windows_i686_gnullvm@0.52.6	X									X			
+windows_i686_msvc@0.52.6	X									X			
+windows_x86_64_gnu@0.52.6	X									X			
+windows_x86_64_gnullvm@0.52.6	X									X			
+windows_x86_64_msvc@0.52.6	X									X			
+wit-bindgen@0.51.0	X	X								X			
+writeable@0.6.2											X		
+wyz@0.5.1										X			
+yoke@0.8.1											X		
+yoke-derive@0.8.1											X		
+zerocopy@0.8.47	X		X							X			
+zerocopy-derive@0.8.47	X		X							X			
+zerofrom@0.1.6											X		
+zerofrom-derive@0.1.6											X		
+zeroize@1.8.2	X									X			
+zerotrie@0.2.3											X		
+zerovec@0.11.5											X		
+zerovec-derive@0.11.2											X		
+zmij@1.0.21										X			
+zstd@0.13.3										X			
+zstd-safe@7.2.4	X									X			
+zstd-sys@2.0.16+zstd.1.5.7	X									X			
diff --git a/fluss-rust/crates/examples/src/example_kv_table.rs b/fluss-rust/crates/examples/src/example_kv_table.rs
new file mode 100644
index 0000000000..ad12ed79cf
--- /dev/null
+++ b/fluss-rust/crates/examples/src/example_kv_table.rs
@@ -0,0 +1,118 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use clap::Parser;
+use fluss::client::FlussConnection;
+use fluss::config::Config;
+use fluss::error::Result;
+use fluss::metadata::{DataTypes, Schema, TableDescriptor, TablePath};
+use fluss::row::{GenericRow, InternalRow};
+
+#[tokio::main]
+#[allow(dead_code)]
+pub async fn main() -> Result<()> {
+    let mut config = Config::parse();
+    config.bootstrap_servers = "127.0.0.1:9123".to_string(); // overrides the value from Config::parse()
+    // Walkthrough: create a primary-key table, upsert rows, look them up, update one, delete one.
+    let conn = FlussConnection::new(config).await?;
+
+    let table_descriptor = TableDescriptor::builder()
+        .schema(
+            Schema::builder()
+                .column("id", DataTypes::int())
+                .column("name", DataTypes::string())
+                .column("age", DataTypes::bigint())
+                .primary_key(vec!["id"])
+                .build()?,
+        )
+        .build()?;
+
+    let table_path = TablePath::new("fluss", "rust_upsert_lookup_example");
+
+    let admin = conn.get_admin()?;
+    admin
+        .create_table(&table_path, &table_descriptor, true) // `true`: presumably ignore-if-exists; confirm
+        .await?;
+    println!(
+        "Created KV Table:\n {}\n",
+        admin.get_table_info(&table_path).await?
+    );
+
+    let table = conn.get_table(&table_path).await?;
+    let table_upsert = table.new_upsert()?;
+    let upsert_writer = table_upsert.create_writer()?;
+
+    println!("\n=== Upserting ===");
+    for (id, name, age) in [(1, "Verso", 32i64), (2, "Noco", 25), (3, "Esquie", 35)] {
+        let mut row = GenericRow::new(3);
+        row.set_field(0, id);
+        row.set_field(1, name);
+        row.set_field(2, age);
+        upsert_writer.upsert(&row)?; // result is not awaited here; flush() below is awaited
+        println!("Upserted: {row:?}");
+    }
+    upsert_writer.flush().await?;
+
+    println!("\n=== Looking up ===");
+    let mut lookuper = table.new_lookup()?.create_lookuper()?;
+
+    for id in 1..=3 {
+        let result = lookuper.lookup(&make_key(id)).await?;
+        let row = result.get_single_row()?.unwrap(); // panics if the lookup returned no row
+        println!(
+            "Found id={id}: name={}, age={}",
+            row.get_string(1)?,
+            row.get_long(2)?
+        );
+    }
+
+    println!("\n=== Updating ===");
+    let mut row = GenericRow::new(3);
+    row.set_field(0, 1);
+    row.set_field(1, "Verso");
+    row.set_field(2, 33i64);
+    upsert_writer.upsert(&row)?.await?; // unlike the loop above, this write is awaited directly
+    println!("Updated: {row:?}");
+
+    let result = lookuper.lookup(&make_key(1)).await?;
+    let row = result.get_single_row()?.unwrap(); // panics if the lookup returned no row
+    println!(
+        "Verified update: name={}, age={}",
+        row.get_string(1)?,
+        row.get_long(2)?
+    );
+
+    println!("\n=== Deleting ===");
+    // For delete, only primary key field needs to be set; other fields can remain null
+    let mut row = GenericRow::new(3);
+    row.set_field(0, 2);
+    upsert_writer.delete(&row)?.await?;
+    println!("Deleted row with id=2");
+
+    let result = lookuper.lookup(&make_key(2)).await?;
+    if result.get_single_row()?.is_none() { // nothing is printed if the row is still present
+        println!("Verified deletion");
+    }
+
+    Ok(())
+}
+
+fn make_key(id: i32) -> GenericRow<'static> {
+    let mut key_row = GenericRow::new(1); // one-field row that carries only `id`
+    key_row.set_field(0, id);
+    key_row
+}
diff --git a/fluss-rust/crates/examples/src/example_partitioned_kv_table.rs b/fluss-rust/crates/examples/src/example_partitioned_kv_table.rs
new file mode 100644
index 0000000000..944d8d4962
--- /dev/null
+++ b/fluss-rust/crates/examples/src/example_partitioned_kv_table.rs
@@ -0,0 +1,150 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use clap::Parser;
+use fluss::client::{FlussAdmin, FlussConnection};
+use fluss::config::Config;
+use fluss::error::Result;
+use fluss::metadata::{DataTypes, PartitionSpec, Schema, TableDescriptor, TablePath};
+use fluss::row::{GenericRow, InternalRow};
+use std::collections::HashMap;
+
+/// Demonstrates upsert, lookup, update and delete on a partitioned
+/// primary-key table.
+///
+/// The table is keyed by `(id, region, zone)` and partitioned by
+/// `(region, zone)`. Errors reported by the client are returned to the caller.
+/// A lookup that unexpectedly finds no row panics on the `unwrap`, and
+/// partition creation panics inside `create_partition` on failure.
+#[tokio::main]
+#[allow(dead_code)]
+pub async fn main() -> Result<()> {
+    let mut config = Config::parse();
+    // The example always targets a local cluster, overriding any parsed value.
+    config.bootstrap_servers = "127.0.0.1:9123".to_string();
+
+    let conn = FlussConnection::new(config).await?;
+
+    let table_descriptor = TableDescriptor::builder()
+        .schema(
+            Schema::builder()
+                .column("id", DataTypes::int())
+                .column("region", DataTypes::string())
+                .column("zone", DataTypes::bigint())
+                .column("score", DataTypes::bigint())
+                .primary_key(vec!["id", "region", "zone"])
+                .build()?,
+        )
+        .partitioned_by(vec!["region", "zone"])
+        .build()?;
+
+    let table_path = TablePath::new("fluss", "partitioned_kv_example");
+
+    let admin = conn.get_admin()?;
+    admin
+        .create_table(&table_path, &table_descriptor, true)
+        .await?;
+    println!(
+        "Created KV Table:\n {}\n",
+        admin.get_table_info(&table_path).await?
+    );
+
+    // One partition per (region, zone) pair written below.
+    create_partition(&table_path, &admin, "APAC", 1).await;
+    create_partition(&table_path, &admin, "EMEA", 2).await;
+    create_partition(&table_path, &admin, "US", 3).await;
+
+    let table = conn.get_table(&table_path).await?;
+    let table_upsert = table.new_upsert()?;
+    let upsert_writer = table_upsert.create_writer()?;
+
+    println!("\n=== Upserting ===");
+    for (id, region, zone, score) in [
+        (1001, "APAC", 1i64, 1234i64),
+        (1002, "EMEA", 2, 2234),
+        (1003, "US", 3, 3234),
+    ] {
+        let mut row = GenericRow::new(4);
+        row.set_field(0, id);
+        row.set_field(1, region);
+        row.set_field(2, zone);
+        row.set_field(3, score);
+        // Queue the write; the batch is flushed once after the loop.
+        upsert_writer.upsert(&row)?;
+        println!("Upserted: {row:?}");
+    }
+    upsert_writer.flush().await?;
+
+    println!("\n=== Looking up ===");
+    let mut lookuper = table.new_lookup()?.create_lookuper()?;
+
+    for (id, region, zone) in [(1001, "APAC", 1i64), (1002, "EMEA", 2), (1003, "US", 3)] {
+        // Propagate lookup failures like every other lookup in this example
+        // instead of panicking.
+        let result = lookuper.lookup(&make_key(id, region, zone)).await?;
+        let row = result.get_single_row()?.unwrap();
+        println!(
+            "Found id={id}: region={}, zone={}, score={}",
+            row.get_string(1)?,
+            row.get_long(2)?,
+            row.get_long(3)?
+        );
+    }
+
+    println!("\n=== Updating ===");
+    let mut row = GenericRow::new(4);
+    row.set_field(0, 1001);
+    row.set_field(1, "APAC");
+    row.set_field(2, 1i64);
+    row.set_field(3, 4321i64);
+    // Awaiting the returned handle waits for this single write to complete.
+    upsert_writer.upsert(&row)?.await?;
+    println!("Updated: {row:?}");
+
+    let result = lookuper.lookup(&make_key(1001, "APAC", 1)).await?;
+    let row = result.get_single_row()?.unwrap();
+    // `score` is the only column the update changed, so it must be printed
+    // for the output to actually verify the update.
+    println!(
+        "Verified update: region={}, zone={}, score={}",
+        row.get_string(1)?,
+        row.get_long(2)?,
+        row.get_long(3)?
+    );
+
+    println!("\n=== Deleting ===");
+    // Only the primary-key fields are set; `score` (index 3) stays unset.
+    let mut row = GenericRow::new(4);
+    row.set_field(0, 1002);
+    row.set_field(1, "EMEA");
+    row.set_field(2, 2i64);
+    upsert_writer.delete(&row)?.await?;
+    println!("Deleted: {row:?}");
+
+    let result = lookuper.lookup(&make_key(1002, "EMEA", 2)).await?;
+    if result.get_single_row()?.is_none() {
+        println!("Verified deletion");
+    }
+
+    Ok(())
+}
+
+/// Creates the partition identified by (`region`, `zone`) on `table_path`.
+///
+/// `true` is passed as the last argument of the admin call
+/// (presumably "ignore if it already exists" — confirm against `FlussAdmin`).
+///
+/// # Panics
+/// Panics if the admin call returns an error.
+async fn create_partition(table_path: &TablePath, admin: &FlussAdmin, region: &str, zone: i64) {
+    // Partition values are passed as strings keyed by partition column name.
+    let spec = PartitionSpec::new(HashMap::from([
+        ("region".to_string(), region.to_string()),
+        ("zone".to_string(), zone.to_string()),
+    ]));
+
+    let outcome = admin.create_partition(table_path, &spec, true).await;
+    outcome.unwrap();
+}
+
+/// Builds the lookup key row for `(id, region, zone)`.
+///
+/// The row is allocated with four fields; only indices 0..=2 are set, index 3
+/// is left untouched. `region` is copied into an owned `String` so the row can
+/// be `'static`.
+fn make_key(id: i32, region: &str, zone: i64) -> GenericRow<'static> {
+    let owned_region = region.to_string();
+    let mut key = GenericRow::new(4);
+    key.set_field(0, id);
+    key.set_field(1, owned_region);
+    key.set_field(2, zone);
+    key
+}
diff --git a/fluss-rust/crates/examples/src/example_partitioned_prefix_lookup.rs b/fluss-rust/crates/examples/src/example_partitioned_prefix_lookup.rs
new file mode 100644
index 0000000000..b212b0fd45
--- /dev/null
+++ b/fluss-rust/crates/examples/src/example_partitioned_prefix_lookup.rs
@@ -0,0 +1,144 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use clap::Parser;
+use fluss::client::{FlussAdmin, FlussConnection};
+use fluss::config::Config;
+use fluss::error::Result;
+use fluss::metadata::{DataTypes, PartitionSpec, Schema, TableDescriptor, TablePath};
+use fluss::row::{GenericRow, InternalRow};
+use std::collections::HashMap;
+
+/// Demonstrates prefix lookup on a partitioned primary-key table.
+///
+/// Writes a handful of session events into the `US` and `EU` partitions and
+/// then fetches, per `(region, user_id, session_id)`, every matching event.
+#[tokio::main]
+#[allow(dead_code)]
+pub async fn main() -> Result<()> {
+    let mut config = Config::parse();
+    config.bootstrap_servers = "127.0.0.1:9123".to_string();
+
+    let conn = FlussConnection::new(config).await?;
+
+    // Primary key: (region, user_id, session_id, event_seq).
+    // Partition key: `region`.
+    // Bucket key: (user_id, session_id), i.e. a prefix of the primary key once
+    // the partition column is removed. That is the requirement for prefix
+    // lookup on a partitioned table, and the lookup key has to carry the
+    // partition column as well: (region, user_id, session_id).
+    let schema = Schema::builder()
+        .column("region", DataTypes::string())
+        .column("user_id", DataTypes::int())
+        .column("session_id", DataTypes::string())
+        .column("event_seq", DataTypes::bigint())
+        .column("event_data", DataTypes::string())
+        .primary_key(vec!["region", "user_id", "session_id", "event_seq"])
+        .build()?;
+    let bucket_key = vec!["user_id".to_string(), "session_id".to_string()];
+    let table_descriptor = TableDescriptor::builder()
+        .schema(schema)
+        .partitioned_by(vec!["region"])
+        .distributed_by(Some(3), bucket_key)
+        .build()?;
+
+    let table_path = TablePath::new("fluss", "rust_partitioned_prefix_lookup_example");
+
+    let admin = conn.get_admin()?;
+    admin
+        .create_table(&table_path, &table_descriptor, true)
+        .await?;
+    let created = admin.get_table_info(&table_path).await?;
+    println!("Created partitioned KV Table:\n {}\n", created);
+
+    create_partition(&table_path, &admin, "US").await;
+    create_partition(&table_path, &admin, "EU").await;
+
+    let table = conn.get_table(&table_path).await?;
+    let table_upsert = table.new_upsert()?;
+    let upsert_writer = table_upsert.create_writer()?;
+
+    println!("\n=== Upserting session events ===");
+    let session_events = [
+        ("US", 1, "sess-a", 1i64, "open"),
+        ("US", 1, "sess-a", 2, "click"),
+        ("US", 1, "sess-a", 3, "close"),
+        ("US", 2, "sess-b", 1, "open"),
+        ("EU", 1, "sess-a", 1, "open"),
+    ];
+    for (region, user_id, session_id, event_seq, event_data) in session_events {
+        let mut row = GenericRow::new(5);
+        row.set_field(0, region);
+        row.set_field(1, user_id);
+        row.set_field(2, session_id);
+        row.set_field(3, event_seq);
+        row.set_field(4, event_data);
+        // Writes are queued here and flushed together after the loop.
+        upsert_writer.upsert(&row)?;
+        println!("Upserted: {row:?}");
+    }
+    upsert_writer.flush().await?;
+
+    println!("\n=== Prefix lookup by (region, user_id, session_id) ===");
+    let lookup_columns = vec![
+        "region".to_string(),
+        "user_id".to_string(),
+        "session_id".to_string(),
+    ];
+    let mut prefix_lookuper = table
+        .new_lookup()?
+        .lookup_by(lookup_columns)
+        .create_lookuper()?;
+
+    // The last probe targets a session that was never written.
+    let probes = [
+        ("US", 1, "sess-a"),
+        ("US", 2, "sess-b"),
+        ("EU", 1, "sess-a"),
+        ("EU", 1, "sess-missing"),
+    ];
+    for (region, user_id, session_id) in probes {
+        let prefix = make_prefix(region, user_id, session_id);
+        let found = prefix_lookuper.lookup(&prefix).await?;
+        let events = found.get_rows()?;
+        println!(
+            "region={region}, user_id={user_id}, session_id={session_id}: {} event(s)",
+            events.len()
+        );
+        for event in &events {
+            println!(
+                "  seq={}, data={}",
+                event.get_long(3)?,
+                event.get_string(4)?
+            );
+        }
+    }
+
+    Ok(())
+}
+
+/// Creates the `region` partition on `table_path`.
+///
+/// `true` is passed as the last argument of the admin call
+/// (presumably "ignore if it already exists" — confirm against `FlussAdmin`).
+///
+/// # Panics
+/// Panics if the admin call returns an error.
+async fn create_partition(table_path: &TablePath, admin: &FlussAdmin, region: &str) {
+    // Single partition column, value passed as a string.
+    let spec = PartitionSpec::new(HashMap::from([(
+        "region".to_string(),
+        region.to_string(),
+    )]));
+
+    let outcome = admin.create_partition(table_path, &spec, true).await;
+    outcome.unwrap();
+}
+
+/// Builds the three-field prefix key `(region, user_id, session_id)`.
+///
+/// Both string arguments are copied into owned `String`s so the returned row
+/// does not borrow from the caller.
+fn make_prefix(region: &str, user_id: i32, session_id: &str) -> GenericRow<'static> {
+    let owned_region = region.to_string();
+    let owned_session = session_id.to_string();
+
+    let mut prefix = GenericRow::new(3);
+    prefix.set_field(0, owned_region);
+    prefix.set_field(1, user_id);
+    prefix.set_field(2, owned_session);
+    prefix
+}
diff --git a/fluss-rust/crates/examples/src/example_prefix_lookup.rs b/fluss-rust/crates/examples/src/example_prefix_lookup.rs
new file mode 100644
index 0000000000..12fc76dc13
--- /dev/null
+++ b/fluss-rust/crates/examples/src/example_prefix_lookup.rs
@@ -0,0 +1,120 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use clap::Parser;
+use fluss::client::FlussConnection;
+use fluss::config::Config;
+use fluss::error::Result;
+use fluss::metadata::{DataTypes, Schema, TableDescriptor, TablePath};
+use fluss::row::{GenericRow, InternalRow};
+
+/// Demonstrates prefix lookup on a (non-partitioned) primary-key table.
+///
+/// Writes a few session events and then fetches, per `(user_id, session_id)`,
+/// every event stored under that prefix.
+#[tokio::main]
+#[allow(dead_code)]
+pub async fn main() -> Result<()> {
+    let mut config = Config::parse();
+    config.bootstrap_servers = "127.0.0.1:9123".to_string();
+
+    let conn = FlussConnection::new(config).await?;
+
+    // Primary key: (user_id, session_id, event_seq).
+    // Bucket key: (user_id, session_id), a strict prefix of the primary key;
+    // this is what makes prefix lookup possible.
+    let schema = Schema::builder()
+        .column("user_id", DataTypes::int())
+        .column("session_id", DataTypes::string())
+        .column("event_seq", DataTypes::bigint())
+        .column("event_data", DataTypes::string())
+        .primary_key(vec!["user_id", "session_id", "event_seq"])
+        .build()?;
+    let bucket_key = vec!["user_id".to_string(), "session_id".to_string()];
+    let table_descriptor = TableDescriptor::builder()
+        .schema(schema)
+        .distributed_by(Some(3), bucket_key)
+        .build()?;
+
+    let table_path = TablePath::new("fluss", "rust_prefix_lookup_example");
+
+    let admin = conn.get_admin()?;
+    admin
+        .create_table(&table_path, &table_descriptor, true)
+        .await?;
+    let created = admin.get_table_info(&table_path).await?;
+    println!("Created KV Table:\n {}\n", created);
+
+    let table = conn.get_table(&table_path).await?;
+    let table_upsert = table.new_upsert()?;
+    let upsert_writer = table_upsert.create_writer()?;
+
+    println!("\n=== Upserting session events ===");
+    let session_events = [
+        (1, "sess-a", 1i64, "open"),
+        (1, "sess-a", 2, "click"),
+        (1, "sess-a", 3, "close"),
+        (1, "sess-b", 1, "open"),
+        (2, "sess-c", 1, "open"),
+    ];
+    for (user_id, session_id, event_seq, event_data) in session_events {
+        let mut row = GenericRow::new(4);
+        row.set_field(0, user_id);
+        row.set_field(1, session_id);
+        row.set_field(2, event_seq);
+        row.set_field(3, event_data);
+        // Writes are queued here and flushed together after the loop.
+        upsert_writer.upsert(&row)?;
+        println!("Upserted: {row:?}");
+    }
+    upsert_writer.flush().await?;
+
+    println!("\n=== Prefix lookup by (user_id, session_id) ===");
+    // `lookup_by` names the prefix columns; the lookuper built from it yields
+    // every row whose primary key begins with the supplied prefix.
+    let lookup_columns = vec!["user_id".to_string(), "session_id".to_string()];
+    let mut prefix_lookuper = table
+        .new_lookup()?
+        .lookup_by(lookup_columns)
+        .create_lookuper()?;
+
+    // The last probe targets a session that was never written.
+    let probes = [
+        (1, "sess-a"),
+        (1, "sess-b"),
+        (2, "sess-c"),
+        (2, "sess-missing"),
+    ];
+    for (user_id, session_id) in probes {
+        let prefix = make_prefix(user_id, session_id);
+        let found = prefix_lookuper.lookup(&prefix).await?;
+        let events = found.get_rows()?;
+        println!(
+            "user_id={user_id}, session_id={session_id}: {} event(s)",
+            events.len()
+        );
+        for event in &events {
+            println!(
+                "  seq={}, data={}",
+                event.get_long(2)?,
+                event.get_string(3)?
+            );
+        }
+    }
+
+    Ok(())
+}
+
+/// Builds the two-field prefix key `(user_id, session_id)`.
+///
+/// `session_id` is copied into an owned `String` so the returned row does not
+/// borrow from the caller.
+fn make_prefix(user_id: i32, session_id: &str) -> GenericRow<'static> {
+    let owned_session = session_id.to_string();
+
+    let mut prefix = GenericRow::new(2);
+    prefix.set_field(0, user_id);
+    prefix.set_field(1, owned_session);
+    prefix
+}
diff --git a/fluss-rust/crates/examples/src/example_table.rs b/fluss-rust/crates/examples/src/example_table.rs
new file mode 100644
index 0000000000..1f751f3c98
--- /dev/null
+++ b/fluss-rust/crates/examples/src/example_table.rs
@@ -0,0 +1,97 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Use jemalloc as the process-wide allocator on every target except MSVC,
+// where this item is compiled out by the `cfg` below.
+#[cfg(not(target_env = "msvc"))]
+#[global_allocator]
+static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
+
+mod example_kv_table;
+mod example_partitioned_kv_table;
+
+use clap::Parser;
+use fluss::client::FlussConnection;
+use fluss::config::Config;
+use fluss::error::Result;
+use fluss::metadata::{DataTypes, Schema, TableDescriptor, TablePath};
+use fluss::row::{GenericRow, InternalRow};
+use std::time::Duration;
+
+/// Demonstrates appending to and scanning a log table.
+///
+/// Creates the `fluss.rust_test_long` table, appends two rows, then polls the
+/// log from bucket 0 / offset 0 forever, printing every record. The function
+/// only returns when an operation fails.
+#[tokio::main]
+pub async fn main() -> Result<()> {
+    let mut config = Config::parse();
+    config.bootstrap_servers = "127.0.0.1:9123".to_string();
+
+    let conn = FlussConnection::new(config).await?;
+
+    // No primary key is declared on this schema.
+    let schema = Schema::builder()
+        .column("c1", DataTypes::int())
+        .column("c2", DataTypes::string())
+        .column("c3", DataTypes::bigint())
+        .build()?;
+    let table_descriptor = TableDescriptor::builder().schema(schema).build()?;
+
+    let table_path = TablePath::new("fluss", "rust_test_long");
+
+    let admin = conn.get_admin()?;
+
+    admin
+        .create_table(&table_path, &table_descriptor, true)
+        .await?;
+
+    // Read the table back to show what was created.
+    let table_info = admin.get_table_info(&table_path).await?;
+    print!("Get created table:\n {table_info}\n");
+
+    let table = conn.get_table(&table_path).await?;
+    let append_writer = table.new_append()?.create_writer()?;
+
+    // Fire-and-forget: queue both rows, then flush once.
+    for (c1, c2, c3) in [
+        (22222, "t2t", 123_456_789_123i64),
+        (233333, "tt44", 987_654_321_987),
+    ] {
+        let mut pending = GenericRow::new(3);
+        pending.set_field(0, c1);
+        pending.set_field(1, c2);
+        pending.set_field(2, c3);
+        append_writer.append(&pending)?;
+    }
+    append_writer.flush().await?;
+
+    // Scan rows: subscribe with arguments (0, 0) and poll without end.
+    let log_scanner = table.new_scan().create_log_scanner()?;
+    log_scanner.subscribe(0, 0).await?;
+
+    let poll_timeout = Duration::from_secs(10);
+    loop {
+        let batch = log_scanner.poll(poll_timeout).await?;
+        println!("Start to poll records......");
+        for record in batch {
+            let fields = record.row();
+            println!(
+                "{{{}, {}, {}}}@{}",
+                fields.get_int(0)?,
+                fields.get_string(1)?,
+                fields.get_long(2)?,
+                record.offset()
+            );
+        }
+    }
+}
diff --git a/fluss-rust/crates/fluss-test-cluster/Cargo.toml b/fluss-rust/crates/fluss-test-cluster/Cargo.toml
new file mode 100644
index 0000000000..977df307b8
--- /dev/null
+++ b/fluss-rust/crates/fluss-test-cluster/Cargo.toml
@@ -0,0 +1,36 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Helper crate that starts a containerised Fluss cluster for tests.
+[package]
+name = "fluss-test-cluster"
+edition.workspace = true
+version.workspace = true
+license.workspace = true
+rust-version.workspace = true
+# Internal tooling only: never published to a registry.
+publish = false
+
+# Command-line entry point, built from src/main.rs.
+[[bin]]
+name = "fluss-test-cluster"
+path = "src/main.rs"
+
+[dependencies]
+fluss = { workspace = true }
+# Version is pinned here rather than inherited from the workspace.
+testcontainers = "0.27.2"
+tokio = { workspace = true }
+clap = { workspace = true }
+serde = { workspace = true }
+serde_json = { workspace = true }
diff --git a/fluss-rust/crates/fluss-test-cluster/build.rs b/fluss-rust/crates/fluss-test-cluster/build.rs
new file mode 100644
index 0000000000..0145196bc3
--- /dev/null
+++ b/fluss-rust/crates/fluss-test-cluster/build.rs
@@ -0,0 +1,32 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+/// Build script: exports every `KEY=VALUE` entry of `test-images.env` as a
+/// compile-time environment variable via `cargo:rustc-env`.
+///
+/// Blank lines and lines starting with `#` are ignored, as are lines without
+/// an `=`. Keys and values are trimmed of surrounding whitespace.
+///
+/// Panics if `test-images.env` cannot be read.
+fn main() {
+    // Re-run this script whenever the env file changes.
+    println!("cargo:rerun-if-changed=test-images.env");
+
+    let contents =
+        std::fs::read_to_string("test-images.env").expect("test-images.env not found");
+
+    contents
+        .lines()
+        .map(str::trim)
+        .filter(|entry| !entry.is_empty() && !entry.starts_with('#'))
+        // Split on the first `=` only, so values may themselves contain `=`.
+        .filter_map(|entry| entry.split_once('='))
+        .for_each(|(key, value)| {
+            println!("cargo:rustc-env={}={}", key.trim(), value.trim());
+        });
+}
diff --git a/fluss-rust/crates/fluss-test-cluster/src/lib.rs b/fluss-rust/crates/fluss-test-cluster/src/lib.rs
new file mode 100644
index 0000000000..76199f7ed7
--- /dev/null
+++ b/fluss-rust/crates/fluss-test-cluster/src/lib.rs
@@ -0,0 +1,515 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use fluss::client::FlussConnection;
+use fluss::config::Config;
+use std::collections::HashMap;
+use std::mem::ManuallyDrop;
+use std::sync::Arc;
+use std::time::Duration;
+use testcontainers::core::ContainerPort;
+use testcontainers::runners::AsyncRunner;
+use testcontainers::{ContainerAsync, GenericImage, ImageExt};
+
+// Image names and tags baked in at compile time through `env!`.
+// NOTE(review): presumably these variables are the ones the crate's build
+// script exports from `test-images.env` — confirm against that file.
+
+/// Compile-time default image name for the Fluss containers.
+pub const FLUSS_IMAGE: &str = env!("FLUSS_IMAGE");
+/// Compile-time default image tag for the Fluss containers.
+pub const FLUSS_VERSION: &str = env!("FLUSS_VERSION");
+/// Image name used for the ZooKeeper container.
+pub const ZOOKEEPER_IMAGE: &str = env!("ZOOKEEPER_IMAGE");
+/// Image tag used for the ZooKeeper container.
+pub const ZOOKEEPER_VERSION: &str = env!("ZOOKEEPER_VERSION");
+
+/// Serializable connection details of a started cluster, as returned by
+/// `FlussTestingClusterBuilder::build_detached`.
+#[derive(serde::Serialize, serde::Deserialize, Debug)]
+pub struct ClusterInfo {
+    /// `127.0.0.1:<port>` address to bootstrap from. This is the plaintext
+    /// client port when one is configured, otherwise the coordinator port.
+    pub bootstrap_servers: String,
+    /// `127.0.0.1:<coordinator port>` when a separate plaintext client port is
+    /// configured (i.e. SASL was enabled on the builder); `None` otherwise.
+    pub sasl_bootstrap_servers: Option<String>,
+}
+
+/// Builder for a containerised Fluss test cluster (ZooKeeper, one coordinator
+/// server and a number of tablet servers).
+pub struct FlussTestingClusterBuilder {
+    // Number of tablet-server containers to start (ids 0..n).
+    number_of_tablet_servers: u16,
+    // Name of the container network every container is attached to.
+    network: &'static str,
+    // Cluster-level configuration entries; SASL settings are added here when enabled.
+    cluster_conf: HashMap<String, String>,
+    // Suffix used in every container name, keeping clusters of different tests apart.
+    testing_name: String,
+    // Optional directory set by `with_remote_data_dir`.
+    // NOTE(review): how it is mounted into containers is not visible here — confirm.
+    remote_data_dir: Option<std::path::PathBuf>,
+    // Whether SASL was requested via `with_sasl`.
+    sasl_enabled: bool,
+    // `(username, password)` pairs written into the JAAS configuration.
+    sasl_users: Vec<(String, String)>,
+    // Host port mapped to the coordinator's CLIENT listener (default 9123).
+    coordinator_host_port: u16,
+    // Host port of the extra PLAIN_CLIENT listener; `Some` only once SASL is enabled.
+    plain_client_port: Option<u16>,
+    // Fluss image name; the `FLUSS_IMAGE` runtime env var overrides the compiled-in default.
+    image: String,
+    // Fluss image tag; the `FLUSS_VERSION` runtime env var overrides the compiled-in default.
+    image_tag: String,
+}
+
+impl FlussTestingClusterBuilder {
+    /// Creates a builder named `testing_name` with no caller-supplied cluster
+    /// configuration.
+    pub fn new(testing_name: impl Into<String>) -> Self {
+        let no_extra_conf = HashMap::new();
+        Self::new_with_cluster_conf(testing_name.into(), &no_extra_conf)
+    }
+
+    /// Records `dir` as the remote data directory, creating it (and any missing
+    /// parents) on the host first.
+    ///
+    /// # Panics
+    /// Panics if the directory cannot be created.
+    pub fn with_remote_data_dir(mut self, dir: std::path::PathBuf) -> Self {
+        let created = std::fs::create_dir_all(&dir);
+        created.expect("Failed to create remote data directory");
+        self.remote_data_dir = Some(dir);
+        self
+    }
+
+    /// Enables SASL for the cluster with the given `(username, password)` pairs.
+    ///
+    /// Also reserves an extra plaintext client port at
+    /// `coordinator_host_port + 100`, derived from the port configured at the
+    /// time of the call.
+    ///
+    /// # Panics
+    /// Panics if `coordinator_host_port + 100` does not fit in a `u16`. The
+    /// previous plain `+` wrapped around silently in release builds.
+    pub fn with_sasl(mut self, users: Vec<(String, String)>) -> Self {
+        self.sasl_enabled = true;
+        self.sasl_users = users;
+        let plain_port = self
+            .coordinator_host_port
+            .checked_add(100)
+            .expect("coordinator port is too high to derive the plaintext client port (port + 100)");
+        self.plain_client_port = Some(plain_port);
+        self
+    }
+
+    /// Sets the host port mapped to the coordinator's client listener.
+    ///
+    /// If SASL was already enabled, the plaintext client port is re-derived as
+    /// `port + 100` so both ports stay in step.
+    ///
+    /// # Panics
+    /// Panics if SASL is enabled and `port + 100` does not fit in a `u16`. The
+    /// previous plain `+` wrapped around silently in release builds.
+    pub fn with_port(mut self, port: u16) -> Self {
+        self.coordinator_host_port = port;
+        // Re-derive SASL port if SASL was already enabled.
+        if self.sasl_enabled {
+            let plain_port = port
+                .checked_add(100)
+                .expect("port is too high to derive the plaintext client port (port + 100)");
+            self.plain_client_port = Some(plain_port);
+        }
+        self
+    }
+
+    /// Creates a builder named `testing_name` seeded with a copy of `conf`.
+    ///
+    /// Two netty thread-count settings are always written on top of `conf`
+    /// (overwriting caller values for those keys). Defaults: one tablet server,
+    /// coordinator port 9123, SASL disabled.
+    pub fn new_with_cluster_conf(
+        testing_name: impl Into<String>,
+        conf: &HashMap<String, String>,
+    ) -> Self {
+        let mut cluster_conf = conf.clone();
+        for (key, value) in [
+            ("netty.server.num-network-threads", "1"),
+            ("netty.server.num-worker-threads", "3"),
+        ] {
+            cluster_conf.insert(key.to_string(), value.to_string());
+        }
+
+        // runtime env overrides the compile-time default (server-compat CI lane)
+        let image = std::env::var("FLUSS_IMAGE").unwrap_or_else(|_| FLUSS_IMAGE.to_string());
+        let image_tag =
+            std::env::var("FLUSS_VERSION").unwrap_or_else(|_| FLUSS_VERSION.to_string());
+
+        Self {
+            number_of_tablet_servers: 1,
+            network: "fluss-cluster-network",
+            cluster_conf,
+            testing_name: testing_name.into(),
+            remote_data_dir: None,
+            sasl_enabled: false,
+            sasl_users: Vec::new(),
+            coordinator_host_port: 9123,
+            plain_client_port: None,
+            image,
+            image_tag,
+        }
+    }
+
+    /// Container name of tablet server `server_id`:
+    /// `tablet-server-<testing_name>-<server_id>`.
+    fn tablet_server_container_name(&self, server_id: u16) -> String {
+        let test_name = &self.testing_name;
+        format!("tablet-server-{}-{}", test_name, server_id)
+    }
+
+    /// Container name of the coordinator server:
+    /// `coordinator-server-<testing_name>`.
+    fn coordinator_server_container_name(&self) -> String {
+        let test_name = &self.testing_name;
+        format!("coordinator-server-{}", test_name)
+    }
+
+    /// Container name of the ZooKeeper node: `zookeeper-<testing_name>`.
+    fn zookeeper_container_name(&self) -> String {
+        let test_name = &self.testing_name;
+        format!("zookeeper-{}", test_name)
+    }
+
+    /// Names of every container in the cluster, in start order: ZooKeeper, the
+    /// coordinator server, then tablet servers by ascending id.
+    fn container_names(&self) -> Vec<String> {
+        let mut names = vec![
+            self.zookeeper_container_name(),
+            self.coordinator_server_container_name(),
+        ];
+        for server_id in 0..self.number_of_tablet_servers {
+            names.push(self.tablet_server_container_name(server_id));
+        }
+        names
+    }
+
+    /// Adds the SASL/PLAIN settings to `cluster_conf`.
+    ///
+    /// Does nothing unless SASL is enabled, at least one user is configured and
+    /// the caller has not already supplied `security.protocol.map`.
+    fn inject_sasl_conf(&mut self) {
+        let sasl_requested = self.sasl_enabled && !self.sasl_users.is_empty();
+        if !sasl_requested || self.cluster_conf.contains_key("security.protocol.map") {
+            return;
+        }
+
+        // One `user_<name>="<password>"` entry per user, space separated.
+        let credentials = self
+            .sasl_users
+            .iter()
+            .map(|(user, password)| format!("user_{}=\"{}\"", user, password))
+            .collect::<Vec<String>>()
+            .join(" ");
+        let jaas_config = format!(
+            "org.apache.fluss.security.auth.sasl.plain.PlainLoginModule required {};",
+            credentials
+        );
+
+        for (key, value) in [
+            ("security.protocol.map", "CLIENT:sasl".to_string()),
+            ("security.sasl.enabled.mechanisms", "plain".to_string()),
+            ("security.sasl.plain.jaas.config", jaas_config),
+        ] {
+            self.cluster_conf.insert(key.to_string(), value);
+        }
+    }
+
+    /// Returns `(bootstrap_servers, sasl_bootstrap_servers)` as `127.0.0.1:<port>`
+    /// strings.
+    ///
+    /// With a plaintext client port configured, that port is the first element
+    /// and the coordinator port is returned as the SASL address. Otherwise the
+    /// coordinator port is the first element and the second is `None`.
+    fn bootstrap_addresses(&self) -> (String, Option<String>) {
+        let coordinator_addr = format!("127.0.0.1:{}", self.coordinator_host_port);
+        match self.plain_client_port {
+            Some(plain_port) => (format!("127.0.0.1:{}", plain_port), Some(coordinator_addr)),
+            None => (coordinator_addr, None),
+        }
+    }
+
+    /// Returns `true` when `docker ps` lists a container for every expected name.
+    ///
+    /// Each name is matched exactly (`^name$`). A failure to run `docker`
+    /// counts as "not present". Checking stops at the first missing container.
+    fn all_containers_exist(&self) -> bool {
+        for name in self.container_names() {
+            let listing = std::process::Command::new("docker")
+                .args(["ps", "-q", "--filter", &format!("name=^{}$", name)])
+                .output();
+            // `-q` prints only ids, so empty stdout means no match.
+            let present = match listing {
+                Ok(output) => !String::from_utf8_lossy(&output.stdout).trim().is_empty(),
+                Err(_) => false,
+            };
+            if !present {
+                return false;
+            }
+        }
+        true
+    }
+
+    /// Removes any leftover containers with the cluster's names, then starts
+    /// ZooKeeper, the coordinator server and all tablet servers, in that order.
+    ///
+    /// The returned vector keeps the start order.
+    async fn start_all_containers(&mut self) -> Vec<ContainerAsync<GenericImage>> {
+        // Best-effort cleanup: the outcome of `docker rm -f` is ignored.
+        let stale_names = self.container_names();
+        for name in &stale_names {
+            let _ = std::process::Command::new("docker")
+                .args(["rm", "-f", name])
+                .output();
+        }
+        self.inject_sasl_conf();
+
+        let zookeeper = self.start_zookeeper().await;
+        let coordinator = self.start_coordinator_server().await;
+        let mut started = vec![zookeeper, coordinator];
+        for server_id in 0..self.number_of_tablet_servers {
+            let tablet_server = self.start_tablet_server(server_id).await;
+            started.push(tablet_server);
+        }
+        started
+    }
+
+    /// Starts the cluster and returns a handle that owns its containers.
+    ///
+    /// Containers stop when the returned struct is dropped.
+    pub async fn build(&mut self) -> FlussTestingCluster {
+        let container_names = self.container_names();
+
+        // Start order is ZooKeeper, coordinator, then tablet servers by id.
+        let mut started = self.start_all_containers().await.into_iter();
+        let zookeeper = Arc::new(started.next().unwrap());
+        let coordinator_server = Arc::new(started.next().unwrap());
+        let tablet_servers = (0..self.number_of_tablet_servers)
+            .map(|server_id| (server_id, Arc::new(started.next().unwrap())))
+            .collect();
+
+        let (bootstrap_servers, sasl_bootstrap_servers) = self.bootstrap_addresses();
+        let remote_data_dir = self.remote_data_dir.clone();
+        let sasl_users = self.sasl_users.clone();
+
+        FlussTestingCluster {
+            zookeeper,
+            coordinator_server,
+            tablet_servers,
+            bootstrap_servers,
+            sasl_bootstrap_servers,
+            remote_data_dir,
+            sasl_users,
+            container_names,
+        }
+    }
+
+    /// Starts the cluster without tying container lifetime to this process.
+    ///
+    /// Containers outlive the process. Clean up via `stop_cluster()`.
+    /// Idempotent: if the cluster is already running, returns its info.
+    pub async fn build_detached(&mut self) -> ClusterInfo {
+        let already_running = self.all_containers_exist();
+        if !already_running {
+            let started = self.start_all_containers().await;
+            // Wrapping in `ManuallyDrop` suppresses the container handles'
+            // destructors, so the containers are left alone when this returns.
+            let _ = ManuallyDrop::new(started);
+        }
+
+        let (bootstrap_servers, sasl_bootstrap_servers) = self.bootstrap_addresses();
+        ClusterInfo {
+            bootstrap_servers,
+            sasl_bootstrap_servers,
+        }
+    }
+
+    /// Starts the ZooKeeper container on the cluster network.
+    ///
+    /// # Panics
+    /// Panics if the container fails to start.
+    async fn start_zookeeper(&self) -> ContainerAsync<GenericImage> {
+        let request = GenericImage::new(ZOOKEEPER_IMAGE, ZOOKEEPER_VERSION)
+            .with_network(self.network)
+            .with_container_name(self.zookeeper_container_name());
+
+        let started = request.start().await;
+        started.unwrap()
+    }
+
+    /// Starts the coordinator server container.
+    ///
+    /// The coordinator binds an `INTERNAL` listener and a `CLIENT` listener on
+    /// `coordinator_host_port`. When a plaintext client port is configured it
+    /// also binds and advertises a `PLAIN_CLIENT` listener, and that port is
+    /// mapped to the host too. Client listeners are advertised on `localhost`.
+    ///
+    /// # Panics
+    /// Panics if the container fails to start.
+    async fn start_coordinator_server(&mut self) -> ContainerAsync<GenericImage> {
+        let port = self.coordinator_host_port;
+        let container_name = self.coordinator_server_container_name();
+
+        // Listener strings depend on whether the extra plaintext port exists.
+        let (bind_listeners, advertised_listeners) = match self.plain_client_port {
+            Some(plain_port) => (
+                format!(
+                    "INTERNAL://{}:0, CLIENT://{}:{}, PLAIN_CLIENT://{}:{}",
+                    container_name, container_name, port, container_name, plain_port
+                ),
+                format!(
+                    "CLIENT://localhost:{}, PLAIN_CLIENT://localhost:{}",
+                    port, plain_port
+                ),
+            ),
+            None => (
+                format!(
+                    "INTERNAL://{}:0, CLIENT://{}:{}",
+                    container_name, container_name, port
+                ),
+                format!("CLIENT://localhost:{}", port),
+            ),
+        };
+
+        let mut coordinator_confs = HashMap::new();
+        coordinator_confs.insert(
+            "zookeeper.address",
+            format!("{}:2181", self.zookeeper_container_name()),
+        );
+        coordinator_confs.insert("bind.listeners", bind_listeners);
+        coordinator_confs.insert("advertised.listeners", advertised_listeners);
+        coordinator_confs.insert("internal.listener.name", "INTERNAL".to_string());
+
+        let request = GenericImage::new(&self.image, &self.image_tag)
+            .with_container_name(container_name)
+            .with_mapped_port(port, ContainerPort::Tcp(port))
+            .with_network(self.network)
+            .with_cmd(vec!["coordinatorServer"])
+            .with_env_var(
+                "FLUSS_PROPERTIES",
+                self.to_fluss_properties_with(coordinator_confs),
+            );
+
+        // Expose the plaintext client port on the host as well, when present.
+        let request = match self.plain_client_port {
+            Some(plain_port) => {
+                request.with_mapped_port(plain_port, ContainerPort::Tcp(plain_port))
+            }
+            None => request,
+        };
+
+        request.start().await.unwrap()
+    }
+
+    /// Starts the tablet server with id `server_id` and returns its handle.
+    ///
+    /// Inside the container the `CLIENT` listener binds `coordinator_host_port`;
+    /// on the host it is published (and advertised) as
+    /// `coordinator_host_port + 1 + server_id`. When `plain_client_port` is set,
+    /// a `PLAIN_CLIENT` listener follows the same scheme based on that port.
+    /// If `remote_data_dir` is set, the directory is created on the host,
+    /// passed as `remote.data.dir`, and bind-mounted at the identical path.
+    ///
+    /// Panics if the remote data directory cannot be created or the container
+    /// cannot be started.
+    async fn start_tablet_server(&self, server_id: u16) -> ContainerAsync<GenericImage> {
+        let port = self.coordinator_host_port;
+        let container_name = self.tablet_server_container_name(server_id);
+        let expose_host_port = port + 1 + server_id;
+        // (container port, host port) of the optional PLAIN_CLIENT listener.
+        let plain_ports = self
+            .plain_client_port
+            .map(|plain_port| (plain_port, plain_port + 1 + server_id));
+
+        let (bind_listeners, advertised_listeners) = match plain_ports {
+            Some((plain_port, plain_expose_host_port)) => (
+                format!(
+                    "INTERNAL://{}:0, CLIENT://{}:{}, PLAIN_CLIENT://{}:{}",
+                    container_name, container_name, port, container_name, plain_port,
+                ),
+                format!(
+                    "CLIENT://localhost:{}, PLAIN_CLIENT://localhost:{}",
+                    expose_host_port, plain_expose_host_port
+                ),
+            ),
+            None => (
+                format!(
+                    "INTERNAL://{}:0, CLIENT://{}:{}",
+                    container_name, container_name, port,
+                ),
+                format!("CLIENT://localhost:{}", expose_host_port),
+            ),
+        };
+
+        let mut tablet_server_confs = HashMap::from([
+            ("bind.listeners", bind_listeners),
+            ("advertised.listeners", advertised_listeners),
+            (
+                "zookeeper.address",
+                format!("{}:2181", self.zookeeper_container_name()),
+            ),
+            ("internal.listener.name", "INTERNAL".to_string()),
+            ("tablet-server.id", server_id.to_string()),
+        ]);
+
+        // Pair the directory with its string form so both uses below agree.
+        let remote_dir = self
+            .remote_data_dir
+            .as_ref()
+            .map(|dir| (dir, dir.to_string_lossy().to_string()));
+        if let Some((_, dir_text)) = &remote_dir {
+            tablet_server_confs.insert("remote.data.dir", dir_text.clone());
+        }
+
+        let mut request = GenericImage::new(&self.image, &self.image_tag)
+            .with_cmd(vec!["tabletServer"])
+            .with_mapped_port(expose_host_port, ContainerPort::Tcp(port))
+            .with_network(self.network)
+            .with_container_name(self.tablet_server_container_name(server_id))
+            .with_env_var(
+                "FLUSS_PROPERTIES",
+                self.to_fluss_properties_with(tablet_server_confs),
+            );
+
+        if let Some((plain_port, plain_expose_host_port)) = plain_ports {
+            request =
+                request.with_mapped_port(plain_expose_host_port, ContainerPort::Tcp(plain_port));
+        }
+
+        if let Some((dir, dir_text)) = remote_dir {
+            use testcontainers::core::Mount;
+            std::fs::create_dir_all(dir)
+                .expect("Failed to create remote data directory for mount");
+            // Host path and container path are deliberately the same string.
+            request = request.with_mount(Mount::bind_mount(dir_text.clone(), dir_text));
+        }
+
+        request.start().await.unwrap()
+    }
+
+    /// Renders the entries of `cluster_conf` followed by `extra_properties` as
+    /// `key: value` lines joined with `\n`.
+    ///
+    /// No de-duplication is performed: a key present in both sources is
+    /// emitted twice.
+    fn to_fluss_properties_with(&self, extra_properties: HashMap<&str, String>) -> String {
+        let base_lines = self
+            .cluster_conf
+            .iter()
+            .map(|(key, value)| format!("{}: {}", key, value));
+        let extra_lines = extra_properties
+            .iter()
+            .map(|(key, value)| format!("{}: {}", key, value));
+
+        base_lines.chain(extra_lines).collect::<Vec<_>>().join("\n")
+    }
+}
+
+/// Handle to a started test cluster: the container handles plus the
+/// connection details the accessor and connection helpers read.
+///
+/// Cloning is cheap for the container handles because they are shared
+/// through `Arc`.
+#[derive(Clone)]
+#[allow(dead_code)] // Fields held for RAII.
+pub struct FlussTestingCluster {
+    /// Shared handle to the ZooKeeper container.
+    zookeeper: Arc<ContainerAsync<GenericImage>>,
+    /// Shared handle to the coordinator server container.
+    coordinator_server: Arc<ContainerAsync<GenericImage>>,
+    /// Shared handles to the tablet server containers, keyed by server id.
+    tablet_servers: HashMap<u16, Arc<ContainerAsync<GenericImage>>>,
+    /// Bootstrap address used for plaintext connections.
+    bootstrap_servers: String,
+    /// Bootstrap address used for SASL connections; when `None` the SASL
+    /// helpers fall back to `bootstrap_servers`.
+    sasl_bootstrap_servers: Option<String>,
+    /// Optional data directory; `stop()` removes it recursively.
+    remote_data_dir: Option<std::path::PathBuf>,
+    /// Configured SASL users, presumably `(username, password)` pairs.
+    sasl_users: Vec<(String, String)>,
+    /// Names of the containers that `stop()` force-removes via `docker rm -f`.
+    container_names: Vec<String>,
+}
+
+impl FlussTestingCluster {
+    /// Best-effort teardown of the cluster.
+    ///
+    /// Force-removes every container listed in `container_names` with
+    /// `docker rm -f` and then deletes `remote_data_dir` if one is set.
+    /// Failures are deliberately ignored so cleanup never panics.
+    pub fn stop(&self) {
+        for name in &self.container_names {
+            let _ = std::process::Command::new("docker")
+                .args(["rm", "-f", name])
+                .output();
+        }
+        if let Some(ref dir) = self.remote_data_dir {
+            let _ = std::fs::remove_dir_all(dir);
+        }
+    }
+
+    /// Returns the SASL users this cluster was configured with.
+    pub fn sasl_users(&self) -> &[(String, String)] {
+        &self.sasl_users
+    }
+
+    /// Returns the bootstrap address of the plaintext listener.
+    pub fn plaintext_bootstrap_servers(&self) -> &str {
+        &self.bootstrap_servers
+    }
+
+    /// Connects over the plaintext listener, retrying until the cluster
+    /// accepts the connection.
+    ///
+    /// Panics if no connection can be established within the retry budget of
+    /// `connect_with_retry`.
+    pub async fn get_fluss_connection(&self) -> FlussConnection {
+        let config = Config {
+            writer_acks: "all".to_string(),
+            bootstrap_servers: self.bootstrap_servers.clone(),
+            ..Default::default()
+        };
+
+        self.connect_with_retry(config).await
+    }
+
+    /// Connects with SASL/PLAIN credentials, retrying until the cluster
+    /// accepts the connection.
+    ///
+    /// Panics if no connection can be established within the retry budget of
+    /// `connect_with_retry`.
+    pub async fn get_fluss_connection_with_sasl(
+        &self,
+        username: &str,
+        password: &str,
+    ) -> FlussConnection {
+        self.connect_with_retry(self.sasl_config(username, password))
+            .await
+    }
+
+    /// Makes a single SASL/PLAIN connection attempt and returns its result,
+    /// so callers can assert on connection failures.
+    pub async fn try_fluss_connection_with_sasl(
+        &self,
+        username: &str,
+        password: &str,
+    ) -> fluss::error::Result<FlussConnection> {
+        FlussConnection::new(self.sasl_config(username, password)).await
+    }
+
+    /// Builds the client configuration shared by both SASL connection helpers.
+    ///
+    /// Uses `sasl_bootstrap_servers` when present and falls back to the
+    /// plaintext `bootstrap_servers` otherwise.
+    fn sasl_config(&self, username: &str, password: &str) -> Config {
+        let bootstrap = self
+            .sasl_bootstrap_servers
+            .clone()
+            .unwrap_or_else(|| self.bootstrap_servers.clone());
+
+        Config {
+            writer_acks: "all".to_string(),
+            bootstrap_servers: bootstrap,
+            security_protocol: "sasl".to_string(),
+            security_sasl_mechanism: "PLAIN".to_string(),
+            security_sasl_username: username.to_string(),
+            security_sasl_password: password.to_string(),
+            ..Default::default()
+        }
+    }
+
+    /// Tries to connect up to 60 times, sleeping one second between attempts.
+    ///
+    /// Returns the first successful connection; panics with the last error
+    /// once the final attempt has failed.
+    async fn connect_with_retry(&self, config: Config) -> FlussConnection {
+        let max_retries = 60;
+        let retry_interval = Duration::from_secs(1);
+
+        let mut attempt = 1;
+        loop {
+            match FlussConnection::new(config.clone()).await {
+                Ok(connection) => return connection,
+                // Last attempt failed: give up without sleeping again.
+                Err(e) if attempt >= max_retries => panic!(
+                    "Failed to connect to Fluss cluster after {} attempts: {}",
+                    max_retries, e
+                ),
+                Err(_) => {
+                    attempt += 1;
+                    tokio::time::sleep(retry_interval).await;
+                }
+            }
+        }
+    }
+}
+
+/// Force-removes the containers of the cluster called `name`.
+///
+/// Containers are discovered with `docker ps -aq --filter name=<pattern>` and
+/// removed with `docker rm -f`. The whole operation is best-effort: failures
+/// to run `docker` or to remove a container are ignored.
+///
+/// Docker's `name` filter is an unanchored pattern match, so a bare
+/// `zookeeper-<name>` would also select unrelated containers that merely
+/// contain that text. Each pattern is therefore anchored at the start of the
+/// container name (`^/?` tolerates the leading slash some Docker versions
+/// keep on names).
+///
+/// NOTE(review): the patterns are still prefixes, so a cluster named `a` also
+/// matches the ZooKeeper/coordinator containers of a cluster named `ab`.
+/// `name` is inserted into the pattern verbatim, as before.
+pub fn stop_cluster(name: &str) {
+    let patterns = [
+        format!("^/?zookeeper-{}", name),
+        format!("^/?coordinator-server-{}", name),
+        format!("^/?tablet-server-{}-", name),
+    ];
+    for pattern in &patterns {
+        if let Ok(output) = std::process::Command::new("docker")
+            .args(["ps", "-aq", "--filter", &format!("name={}", pattern)])
+            .output()
+        {
+            // `-q` prints one container id per line; an empty listing is a no-op.
+            let ids = String::from_utf8_lossy(&output.stdout);
+            for id in ids.split_whitespace() {
+                let _ = std::process::Command::new("docker")
+                    .args(["rm", "-f", id])
+                    .output();
+            }
+        }
+    }
+}
diff --git a/fluss-rust/crates/fluss-test-cluster/src/main.rs b/fluss-rust/crates/fluss-test-cluster/src/main.rs
new file mode 100644
index 0000000000..fc3a19f60e
--- /dev/null
+++ b/fluss-rust/crates/fluss-test-cluster/src/main.rs
@@ -0,0 +1,103 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use clap::{Parser, Subcommand};
+use fluss::ServerType;
+use fluss::config::Config;
+use fluss_test_cluster::FlussTestingClusterBuilder;
+use std::time::Duration;
+
+// Top-level command-line arguments of the test-cluster binary. It carries no
+// options of its own; everything is expressed through the subcommand.
+// NOTE: plain `//` comments are used on purpose, because clap turns `///` doc
+// comments into user-visible help text.
+#[derive(Parser)]
+#[command(about = "Manage a Fluss test cluster via testcontainers")]
+struct Cli {
+    // The subcommand selected by the user (`start` or `stop`).
+    #[command(subcommand)]
+    command: Command,
+}
+
+// Subcommands understood by the binary. The existing `///` lines on the
+// variants are clap help text; maintainer notes use `//` so the help output
+// stays unchanged.
+#[derive(Subcommand)]
+enum Command {
+    /// Start a Fluss test cluster (idempotent). Prints cluster info as JSON to stdout.
+    Start {
+        // Cluster name handed to `FlussTestingClusterBuilder::new`.
+        #[arg(long, default_value = "shared-test")]
+        name: String,
+        // When set, the builder is configured with two fixed SASL users.
+        #[arg(long)]
+        sasl: bool,
+        // Port handed to the builder's `with_port`.
+        #[arg(long, default_value_t = 9123)]
+        port: u16,
+    },
+    /// Stop and remove all containers for a cluster.
+    Stop {
+        // Cluster name handed to `stop_cluster`.
+        #[arg(long, default_value = "shared-test")]
+        name: String,
+    },
+}
+
+// Entry point: dispatches to the `start` or `stop` subcommand.
+//
+// `start` brings the cluster up in detached mode, then polls the plaintext
+// endpoint once per second until at least one tablet server is reported.
+// It exits with status 1 if that has not happened after 60 seconds.
+// On success the cluster info is printed to stdout as `CLUSTER_JSON: <json>`;
+// all progress messages go to stderr.
+#[tokio::main]
+async fn main() {
+    match Cli::parse().command {
+        Command::Start { name, sasl, port } => {
+            eprintln!("Starting Fluss test cluster '{}'...", name);
+
+            let plain_builder = FlussTestingClusterBuilder::new(&name).with_port(port);
+            let mut builder = if sasl {
+                plain_builder.with_sasl(vec![
+                    ("admin".to_string(), "admin-secret".to_string()),
+                    ("alice".to_string(), "alice-secret".to_string()),
+                ])
+            } else {
+                plain_builder
+            };
+
+            let info = builder.build_detached().await;
+            // The readiness budget starts only after the containers were launched.
+            let started_at = std::time::Instant::now();
+            let ready_timeout = Duration::from_secs(60);
+            let poll_interval = Duration::from_secs(1);
+
+            // Check plaintext endpoint only — can't verify SASL without credentials.
+            eprintln!("Waiting for cluster to be ready...");
+            loop {
+                let config = Config {
+                    bootstrap_servers: info.bootstrap_servers.clone(),
+                    ..Default::default()
+                };
+                // Any failure along the probe chain simply counts as "not ready yet".
+                let tablet_server_seen =
+                    match fluss::client::FlussConnection::new(config).await {
+                        Ok(conn) => match conn.get_admin() {
+                            Ok(admin) => match admin.get_server_nodes().await {
+                                Ok(nodes) => nodes
+                                    .iter()
+                                    .any(|n| *n.server_type() == ServerType::TabletServer),
+                                Err(_) => false,
+                            },
+                            Err(_) => false,
+                        },
+                        Err(_) => false,
+                    };
+                if tablet_server_seen {
+                    break;
+                }
+                if started_at.elapsed() >= ready_timeout {
+                    eprintln!("TIMEOUT: cluster did not become ready within 60s");
+                    std::process::exit(1);
+                }
+                tokio::time::sleep(poll_interval).await;
+            }
+            eprintln!("Cluster ready.");
+            println!("CLUSTER_JSON: {}", serde_json::to_string(&info).unwrap());
+        }
+        Command::Stop { name } => {
+            eprintln!("Stopping Fluss test cluster '{}'...", name);
+            fluss_test_cluster::stop_cluster(&name);
+            eprintln!("Cluster stopped.");
+        }
+    }
+}
diff --git a/fluss-rust/crates/fluss-test-cluster/test-images.env b/fluss-rust/crates/fluss-test-cluster/test-images.env
new file mode 100644
index 0000000000..5cd914172c
--- /dev/null
+++ b/fluss-rust/crates/fluss-test-cluster/test-images.env
@@ -0,0 +1,4 @@
+FLUSS_IMAGE=apache/fluss
+FLUSS_VERSION=0.9.1-incubating
+ZOOKEEPER_IMAGE=zookeeper
+ZOOKEEPER_VERSION=3.9.2
diff --git a/fluss-rust/crates/fluss/Cargo.toml b/fluss-rust/crates/fluss/Cargo.toml
new file mode 100644
index 0000000000..feac8309f1
--- /dev/null
+++ b/fluss-rust/crates/fluss/Cargo.toml
@@ -0,0 +1,87 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+[package]
+edition = { workspace = true }
+license.workspace = true
+rust-version = { workspace = true }
+version = { workspace = true }
+name = "fluss-rs"
+authors = { workspace = true }
+description = "The official rust client of Apache Fluss (Incubating)"
+homepage = "https://clients.fluss.apache.org/user-guide/rust/installation/"
+repository = { workspace = true }
+keywords = { workspace = true }
+categories = { workspace = true }
+documentation = "https://docs.rs/fluss-rs"
+
+[lib]
+name = "fluss"
+
+[features]
+default = ["storage-memory", "storage-fs"]
+storage-all = ["storage-memory", "storage-fs", "storage-s3", "storage-oss"]
+
+storage-memory = ["opendal/services-memory"]
+storage-fs = ["opendal/services-fs"]
+storage-s3 = ["opendal/services-s3"]
+storage-oss = ["opendal/services-oss"]
+integration_tests = []
+
+[dependencies]
+arrow = { workspace = true }
+arrow-schema = "57.0.0"
+bitvec = "1"
+byteorder = "1.5"
+futures = "0.3"
+clap = { workspace = true }
+crc32c = "0.6.8"
+linked-hash-map = "0.5.6"
+prost = "0.14"
+rand = "0.9.3"
+serde = { workspace = true, features = ["rc"] }
+serde_json = { workspace = true }
+thiserror = "1.0"
+log = { version = "0.4", features = ["kv_std"] }
+metrics = { workspace = true }
+tokio = { workspace = true }
+parking_lot = "0.12"
+bytes = "1.10.1"
+dashmap = "6.1.0"
+bigdecimal = { workspace = true, features = ["serde"] }
+ordered-float = { version = "5", features = ["serde"] }
+parse-display = "0.10"
+jiff = { workspace = true }
+opendal = "0.55.0"
+url = "2.5.7"
+uuid = { version = "1.10", features = ["v4"] }
+tempfile = "3.23.0"
+snafu = "0.8.3"
+scopeguard = "1.2.0"
+delegate = "0.13.5"
+strum = "0.26"
+strum_macros = "0.26"
+
+[target.'cfg(target_arch = "wasm32")'.dependencies]
+jiff = { workspace = true, features = ["js"] }
+
+[dev-dependencies]
+metrics-util = "0.20"
+fluss-test-cluster = { path = "../fluss-test-cluster" }
+
+[build-dependencies]
+prost-build = "0.14"
diff --git a/fluss-rust/crates/fluss/DEPENDENCIES.rust.tsv b/fluss-rust/crates/fluss/DEPENDENCIES.rust.tsv
new file mode 100644
index 0000000000..85a865852a
--- /dev/null
+++ b/fluss-rust/crates/fluss/DEPENDENCIES.rust.tsv
@@ -0,0 +1,297 @@
+crate	Apache-2.0	Apache-2.0 WITH LLVM-exception	BSD-2-Clause	BSD-3-Clause	BSL-1.0	CC0-1.0	CDLA-Permissive-2.0	ISC	LGPL-2.1-or-later	MIT	Unicode-3.0	Unlicense	Zlib
+ahash@0.8.12	X									X			
+aho-corasick@1.1.4										X		X	
+android_system_properties@0.1.5	X									X			
+anstream@1.0.0	X									X			
+anstyle@1.0.14	X									X			
+anstyle-parse@1.0.0	X									X			
+anstyle-query@1.1.5	X									X			
+anstyle-wincon@3.0.11	X									X			
+anyhow@1.0.102	X									X			
+arrow@57.3.0	X												
+arrow-arith@57.3.0	X												
+arrow-array@57.3.0	X												
+arrow-buffer@57.3.0	X												
+arrow-cast@57.3.0	X												
+arrow-csv@57.3.0	X												
+arrow-data@57.3.0	X												
+arrow-ipc@57.3.0	X												
+arrow-json@57.3.0	X												
+arrow-ord@57.3.0	X												
+arrow-row@57.3.0	X												
+arrow-schema@57.3.0	X												
+arrow-select@57.3.0	X												
+arrow-string@57.3.0	X												
+async-trait@0.1.89	X									X			
+atoi@2.0.0										X			
+atomic-waker@1.1.2	X									X			
+autocfg@1.5.0	X									X			
+backon@1.6.0	X												
+base64@0.22.1	X									X			
+bigdecimal@0.4.10	X									X			
+bitflags@2.11.0	X									X			
+bitvec@1.0.1										X			
+block-buffer@0.10.4	X									X			
+bumpalo@3.20.2	X									X			
+byteorder@1.5.0										X		X	
+bytes@1.11.1										X			
+cc@1.2.57	X									X			
+cfg-if@1.0.4	X									X			
+chrono@0.4.44	X									X			
+clap@4.6.0	X									X			
+clap_builder@4.6.0	X									X			
+clap_derive@4.6.0	X									X			
+clap_lex@1.1.0	X									X			
+colorchoice@1.0.5	X									X			
+const-oid@0.9.6	X									X			
+const-random@0.1.18	X									X			
+const-random-macro@0.1.16	X									X			
+core-foundation-sys@0.8.7	X									X			
+cpufeatures@0.2.17	X									X			
+crc32c@0.6.8	X									X			
+crossbeam-utils@0.8.21	X									X			
+crunchy@0.2.4										X			
+crypto-common@0.1.7	X									X			
+csv@1.4.0										X		X	
+csv-core@0.1.13										X		X	
+dashmap@6.1.0										X			
+delegate@0.13.5	X									X			
+digest@0.10.7	X									X			
+displaydoc@0.2.5	X									X			
+either@1.15.0	X									X			
+equivalent@1.0.2	X									X			
+errno@0.3.14	X									X			
+fastrand@2.3.0	X									X			
+find-msvc-tools@0.1.9	X									X			
+fixedbitset@0.5.7	X									X			
+flatbuffers@25.12.19	X												
+fluss-rs@0.1.0	X												
+fnv@1.0.7	X									X			
+foldhash@0.1.5													X
+form_urlencoded@1.2.2	X									X			
+funty@2.0.0										X			
+futures@0.3.32	X									X			
+futures-channel@0.3.32	X									X			
+futures-core@0.3.32	X									X			
+futures-executor@0.3.32	X									X			
+futures-io@0.3.32	X									X			
+futures-macro@0.3.32	X									X			
+futures-sink@0.3.32	X									X			
+futures-task@0.3.32	X									X			
+futures-util@0.3.32	X									X			
+generic-array@0.14.7										X			
+getrandom@0.2.17	X									X			
+getrandom@0.3.4	X									X			
+getrandom@0.4.2	X									X			
+gloo-timers@0.3.0	X									X			
+h2@0.4.13										X			
+half@2.7.1	X									X			
+hashbrown@0.14.5	X									X			
+hashbrown@0.15.5	X									X			
+hashbrown@0.16.1	X									X			
+heck@0.5.0	X									X			
+hex@0.4.3	X									X			
+hmac@0.12.1	X									X			
+home@0.5.12	X									X			
+http@1.4.0	X									X			
+http-body@1.0.1										X			
+http-body-util@0.1.3										X			
+httparse@1.10.1	X									X			
+httpdate@1.0.3	X									X			
+hyper@1.8.1										X			
+hyper-rustls@0.27.7	X							X		X			
+hyper-util@0.1.20										X			
+iana-time-zone@0.1.65	X									X			
+iana-time-zone-haiku@0.1.2	X									X			
+icu_collections@2.1.1											X		
+icu_locale_core@2.1.1											X		
+icu_normalizer@2.1.1											X		
+icu_normalizer_data@2.1.1											X		
+icu_properties@2.1.2											X		
+icu_properties_data@2.1.2											X		
+icu_provider@2.1.1											X		
+idna@1.1.0	X									X			
+idna_adapter@1.2.1	X									X			
+indexmap@2.13.0	X									X			
+ipnet@2.12.0	X									X			
+iri-string@0.7.11	X									X			
+is_terminal_polyfill@1.70.2	X									X			
+itertools@0.14.0	X									X			
+itoa@1.0.18	X									X			
+jiff@0.2.23										X		X	
+jiff-tzdb@0.1.6										X		X	
+jiff-tzdb-platform@0.1.3										X		X	
+jobserver@0.1.34	X									X			
+js-sys@0.3.91	X									X			
+lexical-core@1.0.6	X									X			
+lexical-parse-float@1.0.6	X									X			
+lexical-parse-integer@1.0.6	X									X			
+lexical-util@1.0.7	X									X			
+lexical-write-float@1.0.6	X									X			
+lexical-write-integer@1.0.6	X									X			
+libc@0.2.183	X									X			
+libm@0.2.16										X			
+linked-hash-map@0.5.6	X									X			
+linux-raw-sys@0.12.1	X	X								X			
+litemap@0.8.1											X		
+lock_api@0.4.14	X									X			
+log@0.4.29	X									X			
+lz4_flex@0.12.1										X			
+md-5@0.10.6	X									X			
+memchr@2.8.0										X		X	
+mio@1.1.1										X			
+multimap@0.10.1	X									X			
+num-bigint@0.4.6	X									X			
+num-complex@0.4.6	X									X			
+num-integer@0.1.46	X									X			
+num-traits@0.2.19	X									X			
+once_cell@1.21.4	X									X			
+once_cell_polyfill@1.70.2	X									X			
+opendal@0.55.0	X												
+ordered-float@5.1.0										X			
+parking_lot@0.12.5	X									X			
+parking_lot_core@0.9.12	X									X			
+parse-display@0.10.0	X									X			
+parse-display-derive@0.10.0	X									X			
+percent-encoding@2.3.2	X									X			
+petgraph@0.8.3	X									X			
+pin-project-lite@0.2.17	X									X			
+pin-utils@0.1.0	X									X			
+pkg-config@0.3.32	X									X			
+portable-atomic@1.13.1	X									X			
+portable-atomic-util@0.2.6	X									X			
+potential_utf@0.1.4											X		
+ppv-lite86@0.2.21	X									X			
+prettyplease@0.2.37	X									X			
+proc-macro2@1.0.106	X									X			
+prost@0.14.3	X												
+prost-build@0.14.3	X												
+prost-derive@0.14.3	X												
+prost-types@0.14.3	X												
+quick-xml@0.37.5										X			
+quick-xml@0.38.4										X			
+quote@1.0.45	X									X			
+r-efi@5.3.0	X								X	X			
+r-efi@6.0.0	X								X	X			
+radium@0.7.0										X			
+rand@0.8.5	X									X			
+rand@0.9.2	X									X			
+rand_chacha@0.3.1	X									X			
+rand_chacha@0.9.0	X									X			
+rand_core@0.6.4	X									X			
+rand_core@0.9.5	X									X			
+redox_syscall@0.5.18										X			
+regex@1.12.3	X									X			
+regex-automata@0.4.14	X									X			
+regex-syntax@0.8.10	X									X			
+reqsign@0.16.5	X												
+reqwest@0.12.28	X									X			
+ring@0.17.14	X							X					
+rustc_version@0.4.1	X									X			
+rustix@1.1.4	X	X								X			
+rustls@0.23.37	X							X		X			
+rustls-pki-types@1.14.0	X									X			
+rustls-webpki@0.103.10								X					
+rustversion@1.0.22	X									X			
+ryu@1.0.23	X				X								
+scopeguard@1.2.0	X									X			
+semver@1.0.27	X									X			
+serde@1.0.228	X									X			
+serde_core@1.0.228	X									X			
+serde_derive@1.0.228	X									X			
+serde_json@1.0.149	X									X			
+serde_urlencoded@0.7.1	X									X			
+sha1@0.10.6	X									X			
+sha2@0.10.9	X									X			
+shlex@1.3.0	X									X			
+signal-hook-registry@1.4.8	X									X			
+simdutf8@0.1.5	X									X			
+slab@0.4.12										X			
+smallvec@1.15.1	X									X			
+snafu@0.8.9	X									X			
+snafu-derive@0.8.9	X									X			
+socket2@0.6.3	X									X			
+stable_deref_trait@1.2.1	X									X			
+strsim@0.11.1										X			
+structmeta@0.3.0	X									X			
+structmeta-derive@0.3.0	X									X			
+strum@0.26.3										X			
+strum_macros@0.26.4										X			
+subtle@2.6.1				X									
+syn@2.0.117	X									X			
+sync_wrapper@1.0.2	X												
+synstructure@0.13.2										X			
+tap@1.0.1										X			
+tempfile@3.27.0	X									X			
+thiserror@1.0.69	X									X			
+thiserror-impl@1.0.69	X									X			
+tiny-keccak@2.0.2						X							
+tinystr@0.8.2											X		
+tokio@1.50.0										X			
+tokio-macros@2.6.1										X			
+tokio-rustls@0.26.4	X									X			
+tokio-util@0.7.18										X			
+tower@0.5.3										X			
+tower-http@0.6.8										X			
+tower-layer@0.3.3										X			
+tower-service@0.3.3										X			
+tracing@0.1.44										X			
+tracing-attributes@0.1.31										X			
+tracing-core@0.1.36										X			
+try-lock@0.2.5										X			
+twox-hash@2.1.2										X			
+typenum@1.19.0	X									X			
+unicode-ident@1.0.24	X									X	X		
+untrusted@0.9.0								X					
+url@2.5.8	X									X			
+utf8_iter@1.0.4	X									X			
+utf8parse@0.2.2	X									X			
+uuid@1.22.0	X									X			
+value-bag@1.12.0	X									X			
+version_check@0.9.5	X									X			
+want@0.3.1										X			
+wasi@0.11.1+wasi-snapshot-preview1	X	X								X			
+wasip2@1.0.2+wasi-0.2.9	X	X								X			
+wasip3@0.4.0+wasi-0.3.0-rc-2026-01-06	X	X								X			
+wasm-bindgen@0.2.114	X									X			
+wasm-bindgen-futures@0.4.64	X									X			
+wasm-bindgen-macro@0.2.114	X									X			
+wasm-bindgen-macro-support@0.2.114	X									X			
+wasm-bindgen-shared@0.2.114	X									X			
+wasm-streams@0.4.2	X									X			
+web-sys@0.3.91	X									X			
+webpki-roots@1.0.6							X						
+windows-core@0.62.2	X									X			
+windows-implement@0.60.2	X									X			
+windows-interface@0.59.3	X									X			
+windows-link@0.2.1	X									X			
+windows-result@0.4.1	X									X			
+windows-strings@0.5.1	X									X			
+windows-sys@0.52.0	X									X			
+windows-sys@0.61.2	X									X			
+windows-targets@0.52.6	X									X			
+windows_aarch64_gnullvm@0.52.6	X									X			
+windows_aarch64_msvc@0.52.6	X									X			
+windows_i686_gnu@0.52.6	X									X			
+windows_i686_gnullvm@0.52.6	X									X			
+windows_i686_msvc@0.52.6	X									X			
+windows_x86_64_gnu@0.52.6	X									X			
+windows_x86_64_gnullvm@0.52.6	X									X			
+windows_x86_64_msvc@0.52.6	X									X			
+wit-bindgen@0.51.0	X	X								X			
+writeable@0.6.2											X		
+wyz@0.5.1										X			
+yoke@0.8.1											X		
+yoke-derive@0.8.1											X		
+zerocopy@0.8.47	X		X							X			
+zerocopy-derive@0.8.47	X		X							X			
+zerofrom@0.1.6											X		
+zerofrom-derive@0.1.6											X		
+zeroize@1.8.2	X									X			
+zerotrie@0.2.3											X		
+zerovec@0.11.5											X		
+zerovec-derive@0.11.2											X		
+zmij@1.0.21										X			
+zstd@0.13.3										X			
+zstd-safe@7.2.4	X									X			
+zstd-sys@2.0.16+zstd.1.5.7	X									X			
diff --git a/fluss-rust/crates/fluss/README.md b/fluss-rust/crates/fluss/README.md
new file mode 100644
index 0000000000..76dc0ec293
--- /dev/null
+++ b/fluss-rust/crates/fluss/README.md
@@ -0,0 +1,105 @@
+# Apache Fluss (Incubating) Official Rust Client
+
+Official Rust client library for [Apache Fluss (Incubating)](https://fluss.apache.org/).
+
+[![crates.io](https://img.shields.io/crates/v/fluss-rs.svg)](https://crates.io/crates/fluss-rs)
+[![docs.rs](https://img.shields.io/docsrs/fluss-rs)](https://docs.rs/fluss-rs/)
+
+## Usage
+
+The following example shows both **primary key (KV) tables** and **log tables** in one flow: connect, create a KV table (upsert + lookup), then create a log table (append + scan).
+
+```rust
+use fluss::client::EARLIEST_OFFSET;
+use fluss::client::FlussConnection;
+use fluss::config::Config;
+use fluss::error::Result;
+use fluss::metadata::{DataTypes, Schema, TableDescriptor, TablePath};
+use fluss::row::{GenericRow, InternalRow};
+use std::time::Duration;
+
+#[tokio::main]
+async fn main() -> Result<()> {
+    let mut config = Config::default();
+    config.bootstrap_servers = "127.0.0.1:9123".to_string();
+    let connection = FlussConnection::new(config).await?;
+    let admin = connection.get_admin()?;
+
+    // ---- Primary key (KV) table: upsert and lookup ----
+    let kv_path = TablePath::new("fluss", "users");
+    let mut kv_schema = Schema::builder()
+        .column("id", DataTypes::int())
+        .column("name", DataTypes::string())
+        .column("age", DataTypes::bigint())
+        .primary_key(vec!["id"]);
+    let kv_descriptor = TableDescriptor::builder()
+        .schema(kv_schema.build()?)
+        .build()?;
+    admin.create_table(&kv_path, &kv_descriptor, false).await?;
+
+    let kv_table = connection.get_table(&kv_path).await?;
+    let upsert_writer = kv_table.new_upsert()?.create_writer()?;
+    let mut row = GenericRow::new(3);
+    row.set_field(0, 1i32);
+    row.set_field(1, "Alice");
+    row.set_field(2, 30i64);
+    upsert_writer.upsert(&row)?;
+    upsert_writer.flush().await?;
+
+    let mut lookuper = kv_table.new_lookup()?.create_lookuper()?;
+    let mut key = GenericRow::new(1);
+    key.set_field(0, 1i32);
+    let result = lookuper.lookup(&key).await?;
+    if let Some(r) = result.get_single_row()? {
+        println!("KV lookup: id={}, name={}, age={}",
+                 r.get_int(0)?, r.get_string(1)?, r.get_long(2)?);
+    }
+
+    // ---- Log table: append and scan ----
+    let log_path = TablePath::new("fluss", "events");
+    let log_schema = Schema::builder()
+        .column("ts", DataTypes::bigint())
+        .column("message", DataTypes::string())
+        .build()?;
+    let log_descriptor = TableDescriptor::builder()
+        .schema(log_schema)
+        .build()?;
+    admin.create_table(&log_path, &log_descriptor, false).await?;
+
+    let log_table = connection.get_table(&log_path).await?;
+    let append_writer = log_table.new_append()?.create_writer()?;
+    let mut event = GenericRow::new(2);
+    event.set_field(0, 1700000000i64);
+    event.set_field(1, "hello");
+    append_writer.append(&event)?;
+    append_writer.flush().await?;
+
+    let scanner = log_table.new_scan().create_log_scanner()?;
+    scanner.subscribe(0, EARLIEST_OFFSET).await?;
+    let scan_records = scanner.poll(Duration::from_secs(1)).await?;
+    for record in scan_records {
+        let r = record.row();
+        println!("Log scan: ts={}, message={}", r.get_long(0)?, r.get_string(1)?);
+    }
+
+    Ok(())
+}
+```
+
+## Storage Support
+
+The Fluss client reads remote data by accessing Fluss’s **remote files** (e.g. log segments and snapshots) directly. The following **remote file systems** are supported; enable the matching feature(s) for your deployment:
+
+| Storage Backend | Feature Flag | Status | Description |
+|----------------|--------------|--------|-------------|
+| Local Filesystem | `storage-fs` | ✅ Stable | Local filesystem storage |
+| Amazon S3 | `storage-s3` | ✅ Stable | Amazon S3 storage |
+| Alibaba Cloud OSS | `storage-oss` | ✅ Stable | Alibaba Cloud Object Storage Service |
+
+You can enable all storage backends at once using the `storage-all` feature flag.
+
+Example usage in Cargo.toml:
+```toml
+[dependencies]
+fluss-rs = { version = "0.x.x", features = ["storage-s3", "storage-fs"] }
+```
diff --git a/fluss-rust/crates/fluss/build.rs b/fluss-rust/crates/fluss/build.rs
new file mode 100644
index 0000000000..65d58e3592
--- /dev/null
+++ b/fluss-rust/crates/fluss/build.rs
@@ -0,0 +1,41 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::io::Result;
+use std::path::Path;
+
+/// Build-script entry point: compiles `FlussApi.proto` with `prost_build`.
+///
+/// Propagates any I/O error reported by `compile_protos`.
+fn main() -> Result<()> {
+    // Published crates vendor the proto under proto/ (scripts/vendor-proto.sh);
+    // monorepo builds read the canonical proto directly from fluss-rpc.
+    let vendored = Path::new("proto/FlussApi.proto").exists();
+    let include_dir = if vendored {
+        "proto"
+    } else {
+        "../../../fluss-rpc/src/main/proto"
+    };
+    // The proto file always sits directly inside the include directory.
+    let proto = format!("{include_dir}/FlussApi.proto");
+
+    // Field paths passed to `Config::bytes`, which customises how these
+    // protobuf `bytes` fields are generated.
+    let bytes_fields = [
+        ".fluss.PbProduceLogReqForBucket.records",
+        ".fluss.PbPutKvReqForBucket.records",
+        ".fluss.PbLookupReqForBucket.keys",
+        ".fluss.PbPrefixLookupReqForBucket.keys",
+    ];
+
+    let mut generator = prost_build::Config::new();
+    generator.bytes(bytes_fields);
+    generator.compile_protos(&[proto.as_str()], &[include_dir])?;
+    Ok(())
+}
diff --git a/fluss-rust/crates/fluss/src/bucketing/mod.rs b/fluss-rust/crates/fluss/src/bucketing/mod.rs
new file mode 100644
index 0000000000..1b43d12a23
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/bucketing/mod.rs
@@ -0,0 +1,265 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::error::Error::IllegalArgument;
+use crate::error::Result;
+use crate::metadata::DataLakeFormat;
+use crate::util::murmur_hash;
+
+/// Strategy that maps an encoded bucket key onto a bucket index.
+///
+/// Implementations must be `Sync + Send` so they can be shared across threads.
+pub trait BucketingFunction: Sync + Send {
+    /// Computes the bucket for `bucket_key` given `num_buckets` buckets.
+    ///
+    /// The implementations in this module return `IllegalArgument` when
+    /// `bucket_key` is empty or `num_buckets` is not positive.
+    fn bucketing(&self, bucket_key: &[u8], num_buckets: i32) -> Result<i32>;
+}
+
+impl dyn BucketingFunction {
+    /// Selects the bucketing function that matches a table's [DataLakeFormat].
+    ///
+    /// # Arguments
+    /// * `lake_format` - The table's data lake format, or `None` when it has none
+    ///
+    /// # Returns
+    /// * A boxed [BucketingFunction] implementing the chosen strategy
+    pub fn of(lake_format: Option<&DataLakeFormat>) -> Box<dyn BucketingFunction> {
+        match lake_format {
+            Some(DataLakeFormat::Paimon) => Box::new(PaimonBucketingFunction),
+            Some(DataLakeFormat::Iceberg) => Box::new(IcebergBucketingFunction),
+            // Lance tables and tables without a lake format share the Fluss default.
+            Some(DataLakeFormat::Lance) | None => Box::new(FlussBucketingFunction),
+        }
+    }
+}
+
+/// Rejects arguments that no bucketing function in this module accepts:
+/// an empty key or a non-positive bucket count.
+fn ensure_valid_bucketing_args(bucket_key: &[u8], num_buckets: i32) -> Result<()> {
+    if bucket_key.is_empty() {
+        return Err(IllegalArgument {
+            message: "bucket_key must not be empty!".to_string(),
+        });
+    }
+    if num_buckets <= 0 {
+        return Err(IllegalArgument {
+            message: "num_buckets must be positive!".to_string(),
+        });
+    }
+    Ok(())
+}
+
+/// Default bucketing: hashes the key bytes, re-hashes the resulting `i32`,
+/// and takes the remainder by the bucket count.
+struct FlussBucketingFunction;
+impl BucketingFunction for FlussBucketingFunction {
+    fn bucketing(&self, bucket_key: &[u8], num_buckets: i32) -> Result<i32> {
+        ensure_valid_bucketing_args(bucket_key, num_buckets)?;
+
+        let bytes_hash = murmur_hash::fluss_hash_bytes(bucket_key)?;
+        let mixed_hash = murmur_hash::fluss_hash_i32(bytes_hash);
+        Ok(mixed_hash % num_buckets)
+    }
+}
+
+/// Paimon bucketing: absolute value of the key hash's remainder by the
+/// bucket count.
+struct PaimonBucketingFunction;
+impl BucketingFunction for PaimonBucketingFunction {
+    fn bucketing(&self, bucket_key: &[u8], num_buckets: i32) -> Result<i32> {
+        ensure_valid_bucketing_args(bucket_key, num_buckets)?;
+
+        let bytes_hash = murmur_hash::fluss_hash_bytes(bucket_key)?;
+        let remainder = bytes_hash % num_buckets;
+        Ok(remainder.abs())
+    }
+}
+
+/// Iceberg bucketing: clears the sign bit of the key hash before taking the
+/// remainder by the bucket count, so the result is never negative.
+struct IcebergBucketingFunction;
+impl BucketingFunction for IcebergBucketingFunction {
+    fn bucketing(&self, bucket_key: &[u8], num_buckets: i32) -> Result<i32> {
+        ensure_valid_bucketing_args(bucket_key, num_buckets)?;
+
+        let non_negative_hash = murmur_hash::hash_bytes(bucket_key) as i32 & i32::MAX;
+        Ok(non_negative_hash % num_buckets)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// `(bucket_key, num_buckets)` inputs shared by every bucketing test.
+    const SAMPLE_INPUTS: [(&[u8], i32); 4] = [
+        (&[0u8, 10u8], 7),
+        (&[0u8, 10u8, 10u8, 10u8], 12),
+        ("2bb87d68-baf9-4e64-90f9-f80910419fa6".as_bytes(), 16),
+        ("The quick brown fox jumps over the lazy dog".as_bytes(), 8),
+    ];
+
+    /// Checks that `function` maps each entry of `SAMPLE_INPUTS` to the bucket
+    /// at the same position in `expected_buckets`.
+    fn assert_buckets(function: &dyn BucketingFunction, expected_buckets: [i32; 4]) {
+        for ((bucket_key, num_buckets), expected) in SAMPLE_INPUTS.iter().zip(expected_buckets) {
+            let actual = function.bucketing(bucket_key, *num_buckets).unwrap();
+            assert_eq!(
+                expected, actual,
+                "Expecting bucket to be {expected} but got {actual}"
+            );
+        }
+    }
+
+    #[test]
+    fn test_default_bucketing() {
+        let default_bucketing = <dyn BucketingFunction>::of(None);
+        assert_buckets(&*default_bucketing, [1, 0, 6, 6]);
+    }
+
+    #[test]
+    fn test_paimon_bucketing() {
+        let paimon_bucketing = <dyn BucketingFunction>::of(Some(&DataLakeFormat::Paimon));
+        assert_buckets(&*paimon_bucketing, [1, 11, 12, 0]);
+    }
+
+    #[test]
+    fn test_lance_bucketing() {
+        let lance_bucketing = <dyn BucketingFunction>::of(Some(&DataLakeFormat::Lance));
+        assert_buckets(&*lance_bucketing, [1, 0, 6, 6]);
+    }
+
+    #[test]
+    fn test_iceberg_bucketing() {
+        let iceberg_bucketing = <dyn BucketingFunction>::of(Some(&DataLakeFormat::Iceberg));
+        assert_buckets(&*iceberg_bucketing, [3, 4, 12, 3]);
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/client/admin.rs b/fluss-rust/crates/fluss/src/client/admin.rs
new file mode 100644
index 0000000000..1eb2f80bb0
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/client/admin.rs
@@ -0,0 +1,486 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::client::metadata::Metadata;
+use crate::cluster::ServerNode;
+use crate::metadata::{
+    DatabaseDescriptor, DatabaseInfo, JsonSerde, LakeSnapshot, PartitionInfo, PartitionSpec,
+    PhysicalTablePath, Schema, SchemaInfo, TableBucket, TableDescriptor, TableInfo, TablePath,
+};
+use crate::rpc::message::{
+    CreateDatabaseRequest, CreatePartitionRequest, CreateTableRequest, DatabaseExistsRequest,
+    DropDatabaseRequest, DropPartitionRequest, DropTableRequest, GetDatabaseInfoRequest,
+    GetLatestLakeSnapshotRequest, GetTableRequest, GetTableSchemaRequestMsg, ListDatabasesRequest,
+    ListPartitionInfosRequest, ListTablesRequest, TableExistsRequest,
+};
+use crate::rpc::message::{ListOffsetsRequest, OffsetSpec};
+use crate::rpc::{RpcClient, ServerConnection};
+
+use crate::error::{Error, Result};
+use crate::proto::GetTableInfoResponse;
+use crate::{BucketId, PartitionId, TableId};
+use std::collections::{HashMap, HashSet};
+use std::sync::Arc;
+use tokio::task::JoinHandle;
+
+/// Administrative client for database, table, partition, and offset
+/// operations against a Fluss cluster.
+pub struct FlussAdmin {
+    // Cluster metadata cache used to locate the coordinator and bucket leaders.
+    metadata: Arc<Metadata>,
+    // RPC client used to obtain connections to individual servers.
+    rpc_client: Arc<RpcClient>,
+}
+
+impl FlussAdmin {
+    /// Creates an admin client that sends requests through `connections` and
+    /// resolves servers via `metadata`.
+    pub fn new(connections: Arc<RpcClient>, metadata: Arc<Metadata>) -> Self {
+        FlussAdmin {
+            metadata,
+            rpc_client: connections,
+        }
+    }
+
+    /// Returns a connection to the coordinator server.
+    ///
+    /// Fails with `Error::UnexpectedError` when the cached cluster metadata
+    /// contains no coordinator.
+    async fn admin_gateway(&self) -> Result<ServerConnection> {
+        let cluster = self.metadata.get_cluster();
+        let coordinator =
+            cluster
+                .get_coordinator_server()
+                .ok_or_else(|| Error::UnexpectedError {
+                    message: "Coordinator server not found in cluster metadata".to_string(),
+                    source: None,
+                })?;
+        self.rpc_client.get_connection(coordinator).await
+    }
+
+    /// Create a database with an optional descriptor.
+    pub async fn create_database(
+        &self,
+        database_name: &str,
+        database_descriptor: Option<&DatabaseDescriptor>,
+        ignore_if_exists: bool,
+    ) -> Result<()> {
+        let _response = self
+            .admin_gateway()
+            .await?
+            .request(CreateDatabaseRequest::new(
+                database_name,
+                database_descriptor,
+                ignore_if_exists,
+            )?)
+            .await?;
+        Ok(())
+    }
+
+    /// Create a table at `table_path` described by `table_descriptor`.
+    pub async fn create_table(
+        &self,
+        table_path: &TablePath,
+        table_descriptor: &TableDescriptor,
+        ignore_if_exists: bool,
+    ) -> Result<()> {
+        let _response = self
+            .admin_gateway()
+            .await?
+            .request(CreateTableRequest::new(
+                table_path,
+                table_descriptor,
+                ignore_if_exists,
+            )?)
+            .await?;
+        Ok(())
+    }
+
+    /// Drop the table at `table_path`.
+    pub async fn drop_table(
+        &self,
+        table_path: &TablePath,
+        ignore_if_not_exists: bool,
+    ) -> Result<()> {
+        let _response = self
+            .admin_gateway()
+            .await?
+            .request(DropTableRequest::new(table_path, ignore_if_not_exists))
+            .await?;
+        Ok(())
+    }
+
+    /// Fetch the schema for `table_path` at the given `schema_id`. Pass
+    /// `None` to request the latest.
+    pub async fn get_table_schema(
+        &self,
+        table_path: &TablePath,
+        schema_id: Option<i32>,
+    ) -> Result<SchemaInfo> {
+        let response = self
+            .admin_gateway()
+            .await?
+            .request(GetTableSchemaRequestMsg::new(table_path, schema_id))
+            .await?;
+
+        let schema_node: serde_json::Value = serde_json::from_slice(&response.schema_json)
+            .map_err(|e| Error::JsonSerdeError {
+                message: format!("Failed to parse schema_json: {e}"),
+            })?;
+        let schema = Schema::deserialize_json(&schema_node)?;
+        Ok(SchemaInfo::new(schema, response.schema_id))
+    }
+
+    /// Fetch the table info for `table_path` and refresh its cached metadata.
+    ///
+    /// Returns `Error::JsonSerdeError` when the `table_json` payload in the
+    /// response is not valid JSON.
+    pub async fn get_table_info(&self, table_path: &TablePath) -> Result<TableInfo> {
+        let response = self
+            .admin_gateway()
+            .await?
+            .request(GetTableRequest::new(table_path))
+            .await?;
+
+        // force update to avoid stale data in cache
+        self.metadata
+            .update_tables_metadata(&HashSet::from([table_path]), &HashSet::new(), vec![])
+            .await?;
+
+        let GetTableInfoResponse {
+            table_id,
+            schema_id,
+            table_json,
+            created_time,
+            modified_time,
+            remote_data_dir: _,
+        } = response;
+        // Report a malformed payload as an error instead of panicking,
+        // matching how `get_table_schema` handles `schema_json`.
+        let table_node: serde_json::Value =
+            serde_json::from_slice(&table_json[..]).map_err(|e| Error::JsonSerdeError {
+                message: format!("Failed to parse table_json: {e}"),
+            })?;
+        let table_descriptor = TableDescriptor::deserialize_json(&table_node)?;
+        Ok(TableInfo::of(
+            table_path.clone(),
+            table_id,
+            schema_id,
+            table_descriptor,
+            created_time,
+            modified_time,
+        ))
+    }
+
+    /// List all tables in the given database
+    pub async fn list_tables(&self, database_name: &str) -> Result<Vec<String>> {
+        let response = self
+            .admin_gateway()
+            .await?
+            .request(ListTablesRequest::new(database_name))
+            .await?;
+        Ok(response.table_name)
+    }
+
+    /// List all partitions in the given table.
+    pub async fn list_partition_infos(&self, table_path: &TablePath) -> Result<Vec<PartitionInfo>> {
+        self.list_partition_infos_with_spec(table_path, None).await
+    }
+
+    /// List partitions in the given table that match the partial partition spec.
+    pub async fn list_partition_infos_with_spec(
+        &self,
+        table_path: &TablePath,
+        partial_partition_spec: Option<&PartitionSpec>,
+    ) -> Result<Vec<PartitionInfo>> {
+        let response = self
+            .admin_gateway()
+            .await?
+            .request(ListPartitionInfosRequest::new(
+                table_path,
+                partial_partition_spec,
+            ))
+            .await?;
+        Ok(response.get_partitions_info())
+    }
+
+    /// Create a new partition for a partitioned table.
+    pub async fn create_partition(
+        &self,
+        table_path: &TablePath,
+        partition_spec: &PartitionSpec,
+        ignore_if_exists: bool,
+    ) -> Result<()> {
+        let _response = self
+            .admin_gateway()
+            .await?
+            .request(CreatePartitionRequest::new(
+                table_path,
+                partition_spec,
+                ignore_if_exists,
+            ))
+            .await?;
+        Ok(())
+    }
+
+    /// Drop a partition from a partitioned table.
+    pub async fn drop_partition(
+        &self,
+        table_path: &TablePath,
+        partition_spec: &PartitionSpec,
+        ignore_if_not_exists: bool,
+    ) -> Result<()> {
+        let _response = self
+            .admin_gateway()
+            .await?
+            .request(DropPartitionRequest::new(
+                table_path,
+                partition_spec,
+                ignore_if_not_exists,
+            ))
+            .await?;
+        Ok(())
+    }
+
+    /// Check if a table exists
+    pub async fn table_exists(&self, table_path: &TablePath) -> Result<bool> {
+        let response = self
+            .admin_gateway()
+            .await?
+            .request(TableExistsRequest::new(table_path))
+            .await?;
+        Ok(response.exists)
+    }
+
+    /// Drop a database
+    pub async fn drop_database(
+        &self,
+        database_name: &str,
+        ignore_if_not_exists: bool,
+        cascade: bool,
+    ) -> Result<()> {
+        let _response = self
+            .admin_gateway()
+            .await?
+            .request(DropDatabaseRequest::new(
+                database_name,
+                ignore_if_not_exists,
+                cascade,
+            ))
+            .await?;
+        Ok(())
+    }
+
+    /// List all databases
+    pub async fn list_databases(&self) -> Result<Vec<String>> {
+        let response = self
+            .admin_gateway()
+            .await?
+            .request(ListDatabasesRequest::new())
+            .await?;
+        Ok(response.database_name)
+    }
+
+    /// Check if a database exists
+    pub async fn database_exists(&self, database_name: &str) -> Result<bool> {
+        let response = self
+            .admin_gateway()
+            .await?
+            .request(DatabaseExistsRequest::new(database_name))
+            .await?;
+        Ok(response.exists)
+    }
+
+    /// Get database information
+    pub async fn get_database_info(&self, database_name: &str) -> Result<DatabaseInfo> {
+        let request = GetDatabaseInfoRequest::new(database_name);
+        let response = self.admin_gateway().await?.request(request).await?;
+
+        // Convert proto response to DatabaseInfo
+        let database_descriptor = DatabaseDescriptor::from_json_bytes(&response.database_json)?;
+
+        Ok(DatabaseInfo::new(
+            database_name.to_string(),
+            database_descriptor,
+            response.created_time,
+            response.modified_time,
+        ))
+    }
+
+    /// Get all alive server nodes in the cluster, including the coordinator
+    /// and all tablet servers. Refreshes cluster metadata before returning.
+    pub async fn get_server_nodes(&self) -> Result<Vec<ServerNode>> {
+        self.metadata.reinit_cluster().await?;
+        Ok(self.metadata.get_cluster().get_server_nodes())
+    }
+
+    /// Get the latest lake snapshot for a table
+    ///
+    /// Buckets whose snapshot carries no log offset are left out of the result.
+    pub async fn get_latest_lake_snapshot(&self, table_path: &TablePath) -> Result<LakeSnapshot> {
+        let response = self
+            .admin_gateway()
+            .await?
+            .request(GetLatestLakeSnapshotRequest::new(table_path))
+            .await?;
+
+        // Convert proto response to LakeSnapshot
+        let mut table_buckets_offset = HashMap::new();
+        for bucket_snapshot in response.bucket_snapshots {
+            let table_bucket = TableBucket::new_with_partition(
+                response.table_id,
+                bucket_snapshot.partition_id,
+                bucket_snapshot.bucket_id,
+            );
+            if let Some(log_offset) = bucket_snapshot.log_offset {
+                table_buckets_offset.insert(table_bucket, log_offset);
+            }
+        }
+
+        Ok(LakeSnapshot::new(
+            response.snapshot_id,
+            table_buckets_offset,
+        ))
+    }
+
+    /// List offset for the specified buckets. This operation enables to find the beginning offset,
+    /// end offset as well as the offset matching a timestamp in buckets.
+    pub async fn list_offsets(
+        &self,
+        table_path: &TablePath,
+        buckets_id: &[BucketId],
+        offset_spec: OffsetSpec,
+    ) -> Result<HashMap<i32, i64>> {
+        self.do_list_offsets(table_path, None, buckets_id, offset_spec)
+            .await
+    }
+
+    /// List offset for the specified buckets in a partition. This operation enables to find
+    /// the beginning offset, end offset as well as the offset matching a timestamp in buckets.
+    pub async fn list_partition_offsets(
+        &self,
+        table_path: &TablePath,
+        partition_name: &str,
+        buckets_id: &[BucketId],
+        offset_spec: OffsetSpec,
+    ) -> Result<HashMap<i32, i64>> {
+        self.do_list_offsets(table_path, Some(partition_name), buckets_id, offset_spec)
+            .await
+    }
+
+    /// Shared implementation of `list_offsets` and `list_partition_offsets`.
+    ///
+    /// Refreshes table (and, when `partition_name` is given, partition)
+    /// metadata, sends one request per leader server, and merges the
+    /// per-server results into a single map.
+    ///
+    /// Returns `Error::IllegalArgument` when `buckets_id` is empty and a
+    /// partition-not-exist error when `partition_name` cannot be resolved.
+    async fn do_list_offsets(
+        &self,
+        table_path: &TablePath,
+        partition_name: Option<&str>,
+        buckets_id: &[BucketId],
+        offset_spec: OffsetSpec,
+    ) -> Result<HashMap<i32, i64>> {
+        if buckets_id.is_empty() {
+            return Err(Error::IllegalArgument {
+                message: "Buckets are empty.".to_string(),
+            });
+        }
+
+        // force to update table metadata like java side
+        self.metadata.update_table_metadata(table_path).await?;
+
+        let cluster = self.metadata.get_cluster();
+        let table_id = cluster.get_table(table_path)?.table_id;
+
+        // Resolve partition_id from partition_name if provided
+        let partition_id = if let Some(name) = partition_name {
+            let physical_table_path = Arc::new(PhysicalTablePath::of_partitioned(
+                Arc::new(table_path.clone()),
+                Some(name.to_string()),
+            ));
+
+            // Update partition metadata like java side
+            self.metadata
+                .update_physical_table_metadata(std::slice::from_ref(&physical_table_path))
+                .await?;
+
+            // Fetch the cluster again so the lookup sees the refreshed metadata.
+            let cluster = self.metadata.get_cluster();
+            Some(
+                cluster
+                    .get_partition_id(&physical_table_path)
+                    .ok_or_else(|| {
+                        Error::partition_not_exist(format!(
+                            "Partition '{name}' not found for table '{table_path}'"
+                        ))
+                    })?,
+            )
+        } else {
+            None
+        };
+
+        // Prepare requests
+        let requests_by_server =
+            self.prepare_list_offsets_requests(table_id, partition_id, buckets_id, offset_spec)?;
+
+        // Send Requests
+        let response_futures = self.send_list_offsets_request(requests_by_server).await?;
+
+        let mut results = HashMap::new();
+
+        for response_future in response_futures {
+            // The outer error is a task join failure; the inner one is the RPC result.
+            let offsets = response_future.await.map_err(|e| Error::UnexpectedError {
+                message: "Fail to get result for list offsets.".to_string(),
+                source: Some(Box::new(e)),
+            })?;
+            results.extend(offsets?);
+        }
+        Ok(results)
+    }
+
+    /// Groups `buckets` by the id of their leader server and builds one
+    /// `ListOffsetsRequest` per leader.
+    ///
+    /// Returns `Error::UnexpectedError` when a bucket has no known leader.
+    fn prepare_list_offsets_requests(
+        &self,
+        table_id: TableId,
+        partition_id: Option<PartitionId>,
+        buckets: &[BucketId],
+        offset_spec: OffsetSpec,
+    ) -> Result<HashMap<i32, ListOffsetsRequest>> {
+        let cluster = self.metadata.get_cluster();
+        let mut node_for_bucket_list: HashMap<i32, Vec<BucketId>> = HashMap::new();
+
+        for bucket_id in buckets {
+            let table_bucket = TableBucket::new_with_partition(table_id, partition_id, *bucket_id);
+            let leader = cluster.leader_for(&table_bucket).ok_or_else(|| {
+                // todo: consider retry?
+                Error::UnexpectedError {
+                    message: format!("No leader found for table bucket: {table_bucket}."),
+                    source: None,
+                }
+            })?;
+
+            node_for_bucket_list
+                .entry(leader.id())
+                .or_default()
+                .push(*bucket_id);
+        }
+
+        let mut list_offsets_requests = HashMap::new();
+        for (leader_id, bucket_ids) in node_for_bucket_list {
+            let request =
+                ListOffsetsRequest::new(table_id, partition_id, bucket_ids, offset_spec.clone());
+            list_offsets_requests.insert(leader_id, request);
+        }
+        Ok(list_offsets_requests)
+    }
+
+    /// Spawns one Tokio task per leader server to send its request and
+    /// returns the join handles for the caller to await.
+    async fn send_list_offsets_request(
+        &self,
+        request_map: HashMap<i32, ListOffsetsRequest>,
+    ) -> Result<Vec<JoinHandle<Result<HashMap<i32, i64>>>>> {
+        let mut tasks = Vec::new();
+
+        for (leader_id, request) in request_map {
+            // Each spawned task needs its own owned handles.
+            let rpc_client = self.rpc_client.clone();
+            let metadata = self.metadata.clone();
+
+            let task = tokio::spawn(async move {
+                let cluster = metadata.get_cluster();
+                let tablet_server = cluster.get_tablet_server(leader_id).ok_or_else(|| {
+                    Error::leader_not_available(format!(
+                        "Tablet server {leader_id} is not found in metadata cache."
+                    ))
+                })?;
+                let connection = rpc_client.get_connection(tablet_server).await?;
+                let list_offsets_response = connection.request(request).await?;
+                list_offsets_response.offsets()
+            });
+            tasks.push(task);
+        }
+        Ok(tasks)
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/client/connection.rs b/fluss-rust/crates/fluss/src/client/connection.rs
new file mode 100644
index 0000000000..c31104c469
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/client/connection.rs
@@ -0,0 +1,187 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::client::WriterClient;
+use crate::client::admin::FlussAdmin;
+use crate::client::lookup::LookupClient;
+use crate::client::metadata::Metadata;
+use crate::client::table::FlussTable;
+use crate::config::Config;
+use crate::error::{Error, FlussError, Result};
+use crate::metadata::TablePath;
+use crate::rpc::RpcClient;
+use parking_lot::RwLock;
+use std::sync::Arc;
+use std::time::Duration;
+
+/// Entry point to a Fluss cluster: owns the shared metadata and RPC client
+/// and lazily creates the admin, writer, and lookup clients.
+pub struct FlussConnection {
+    // Cluster metadata shared with the clients created from this connection.
+    metadata: Arc<Metadata>,
+    // RPC client shared with the clients created from this connection.
+    network_connects: Arc<RpcClient>,
+    // Configuration this connection was created with.
+    args: Config,
+    // Created on first use by `get_or_create_writer_client`; taken by `close`.
+    writer_client: RwLock<Option<Arc<WriterClient>>>,
+    // Created on first use by `get_admin`.
+    admin_client: RwLock<Option<Arc<FlussAdmin>>>,
+    // Created on first use by `get_or_create_lookup_client`.
+    lookup_client: RwLock<Option<Arc<LookupClient>>>,
+}
+
+impl FlussConnection {
+    /// Validates `arg`, builds the RPC client, and bootstraps cluster metadata.
+    ///
+    /// Returns `Error::IllegalArgument` when the security, scanner, or writer
+    /// settings fail validation, and propagates any error from `Metadata::new`.
+    pub async fn new(arg: Config) -> Result<Self> {
+        let illegal_argument = |message: String| Error::IllegalArgument { message };
+        arg.validate_security().map_err(illegal_argument)?;
+        arg.validate_scanner().map_err(illegal_argument)?;
+        arg.validate_writer().map_err(illegal_argument)?;
+
+        // connect_timeout_ms: no lower-bound validation to match Java behavior.
+        // Java allows 0 — tracked in https://github.com/apache/fluss/issues/3068
+        let timeout = Duration::from_millis(arg.connect_timeout_ms);
+        let rpc_client = if arg.is_sasl_enabled() {
+            RpcClient::new()
+                .with_sasl(
+                    arg.security_sasl_username.clone(),
+                    arg.security_sasl_password.clone(),
+                )
+                .with_timeout(timeout)
+        } else {
+            RpcClient::new().with_timeout(timeout)
+        };
+        let connections = Arc::new(rpc_client);
+        let metadata =
+            Metadata::new(arg.bootstrap_servers.as_str(), Arc::clone(&connections)).await?;
+
+        Ok(FlussConnection {
+            metadata: Arc::new(metadata),
+            network_connects: connections,
+            args: arg,
+            writer_client: RwLock::new(None),
+            admin_client: RwLock::new(None),
+            lookup_client: RwLock::new(None),
+        })
+    }
+
+    /// Gracefully shut down the connection, draining any pending write batches.
+    ///
+    /// When a writer client exists it is removed from this connection and
+    /// closed with the given `timeout`; otherwise this is a no-op.
+    pub async fn close(&self, timeout: Duration) -> Result<()> {
+        // Take the client in its own statement so the lock guard is released
+        // before awaiting.
+        let detached_writer = self.writer_client.write().take();
+        match detached_writer {
+            Some(writer) => {
+                writer.close(timeout).await?;
+                Ok(())
+            }
+            None => Ok(()),
+        }
+    }
+
+    /// Returns a handle to the shared cluster metadata.
+    pub fn get_metadata(&self) -> Arc<Metadata> {
+        Arc::clone(&self.metadata)
+    }
+
+    /// Returns a handle to the shared RPC client.
+    pub fn get_connections(&self) -> Arc<RpcClient> {
+        Arc::clone(&self.network_connects)
+    }
+
+    /// Returns the configuration this connection was created with.
+    pub fn config(&self) -> &Config {
+        &self.args
+    }
+
+    /// Returns the cached admin client, creating it on first use.
+    pub fn get_admin(&self) -> Result<Arc<FlussAdmin>> {
+        // Fast path: only a read lock, released at the end of this statement.
+        let cached = self.admin_client.read().as_ref().map(Arc::clone);
+        if let Some(admin) = cached {
+            return Ok(admin);
+        }
+
+        // Slow path: under the write lock, reuse an instance another thread
+        // may have stored in the meantime, or create and cache a new one.
+        let mut slot = self.admin_client.write();
+        let admin = slot.get_or_insert_with(|| {
+            Arc::new(FlussAdmin::new(
+                self.network_connects.clone(),
+                self.metadata.clone(),
+            ))
+        });
+        Ok(Arc::clone(admin))
+    }
+
+    /// Returns the cached writer client, creating it on first use.
+    ///
+    /// Propagates the error from `WriterClient::new` if creation fails; in
+    /// that case nothing is cached.
+    pub fn get_or_create_writer_client(&self) -> Result<Arc<WriterClient>> {
+        // Fast path: only a read lock, released at the end of this statement.
+        let cached = self.writer_client.read().as_ref().map(Arc::clone);
+        if let Some(client) = cached {
+            return Ok(client);
+        }
+
+        // Slow path: re-check under the write lock, since another thread may
+        // have stored a client while this one was waiting.
+        let mut slot = self.writer_client.write();
+        match slot.as_ref() {
+            Some(existing) => Ok(Arc::clone(existing)),
+            None => {
+                let created =
+                    Arc::new(WriterClient::new(self.args.clone(), self.metadata.clone())?);
+                *slot = Some(Arc::clone(&created));
+                Ok(created)
+            }
+        }
+    }
+
+    /// Gets or creates a lookup client for batched lookup operations.
+    pub fn get_or_create_lookup_client(&self) -> Result<Arc<LookupClient>> {
+        // Fast path: only a read lock, released at the end of this statement.
+        let cached = self.lookup_client.read().as_ref().map(Arc::clone);
+        if let Some(client) = cached {
+            return Ok(client);
+        }
+
+        // Slow path: under the write lock, reuse an instance another thread
+        // may have stored in the meantime, or create and cache a new one.
+        let mut slot = self.lookup_client.write();
+        let client = slot
+            .get_or_insert_with(|| Arc::new(LookupClient::new(&self.args, self.metadata.clone())));
+        Ok(Arc::clone(client))
+    }
+
+    /// Refreshes the metadata for `table_path` and returns a table handle
+    /// bound to this connection.
+    ///
+    /// An `InvalidTableException` from the metadata lookup is reported as a
+    /// table-not-exist error; any other error is returned unchanged.
+    pub async fn get_table(&self, table_path: &TablePath) -> Result<FlussTable<'_>> {
+        self.metadata.update_table_metadata(table_path).await?;
+
+        let table_info = match self.metadata.get_cluster().get_table(table_path) {
+            Ok(info) => info.clone(),
+            Err(e) if e.api_error() == Some(FlussError::InvalidTableException) => {
+                return Err(Error::table_not_exist(format!(
+                    "Table not found: {table_path}"
+                )));
+            }
+            Err(e) => return Err(e),
+        };
+
+        Ok(FlussTable::new(self, self.metadata.clone(), table_info))
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/client/credentials.rs b/fluss-rust/crates/fluss/src/client/credentials.rs
new file mode 100644
index 0000000000..a954e2a916
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/client/credentials.rs
@@ -0,0 +1,437 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::client::metadata::Metadata;
+use crate::error::{Error, Result};
+use crate::rpc::RpcClient;
+use crate::rpc::message::GetSecurityTokenRequest;
+use log::{debug, info, warn};
+use parking_lot::RwLock;
+use serde::Deserialize;
+use std::collections::HashMap;
+use std::sync::Arc;
+use std::time::{Duration, SystemTime, UNIX_EPOCH};
+use tokio::sync::{oneshot, watch};
+use tokio::task::JoinHandle;
+
+/// Default renewal ratio - refresh after 80% of the time left until expiry has passed
+const DEFAULT_TOKEN_RENEWAL_RATIO: f64 = 0.8;
+/// Default retry backoff when token fetch fails
+const DEFAULT_RENEWAL_RETRY_BACKOFF: Duration = Duration::from_secs(30);
+/// Minimum delay between refreshes
+const MIN_RENEWAL_DELAY: Duration = Duration::from_secs(1);
+/// Maximum delay between refreshes (7 days) - prevents overflow and ensures periodic refresh
+const MAX_RENEWAL_DELAY: Duration = Duration::from_secs(7 * 24 * 60 * 60);
+/// Default refresh interval for tokens that report no expiration time
+const DEFAULT_NON_EXPIRING_REFRESH_INTERVAL: Duration = Duration::from_secs(7 * 24 * 60 * 60); // 7 days
+
+/// Receiver half of the watch channel that carries the credential properties.
+/// - `None` = not yet fetched, should wait
+/// - `Some(HashMap)` = fetched (may be empty if no auth needed)
+pub type CredentialsReceiver = watch::Receiver<Option<HashMap<String, String>>>;
+
+#[derive(Debug, Deserialize)]
+struct Credentials {
+    access_key_id: String, // copied into the props as `access_key_id`
+    access_key_secret: String, // copied as both `secret_access_key` and `access_key_secret`
+    security_token: Option<String>, // copied as `security_token` only when present
+}
+
+/// Translates a Hadoop filesystem config key into `(opendal_key, needs_inversion)`.
+/// `needs_inversion` is set for path_style_access -> enable_virtual_host_style.
+fn convert_hadoop_key_to_opendal(hadoop_key: &str) -> Option<(String, bool)> {
+    let (opendal_key, needs_inversion) = match hadoop_key {
+        // Endpoint, for both S3 and OSS
+        "fs.s3a.endpoint" | "fs.oss.endpoint" => ("endpoint", false),
+        // Region, for both S3 and OSS
+        "fs.s3a.endpoint.region" | "fs.oss.region" => ("region", false),
+        // The S3 path-style flag maps onto the virtual-host flag, inverted
+        "fs.s3a.path.style.access" => ("enable_virtual_host_style", true),
+        // Everything else (including "fs.s3a.connection.ssl.enabled") is unmapped
+        _ => return None,
+    };
+    Some((opendal_key.to_string(), needs_inversion))
+}
+
+/// Build remote filesystem props from credentials and additional info.
+/// The path-style flag is parsed case-insensitively, as Hadoop itself does.
+fn build_remote_fs_props(
+    credentials: &Credentials,
+    addition_infos: &HashMap<String, String>,
+) -> HashMap<String, String> {
+    let mut props = HashMap::new();
+
+    props.insert(
+        "access_key_id".to_string(),
+        credentials.access_key_id.clone(),
+    );
+
+    // S3 specific configurations
+    props.insert(
+        "secret_access_key".to_string(),
+        credentials.access_key_secret.clone(),
+    );
+
+    // OSS specific configurations, todo: consider refactor it
+    // to handle different conversion for different scheme in different method
+    props.insert(
+        "access_key_secret".to_string(),
+        credentials.access_key_secret.clone(),
+    );
+
+    if let Some(token) = &credentials.security_token {
+        props.insert("security_token".to_string(), token.clone());
+    }
+
+    for (key, value) in addition_infos {
+        if let Some((opendal_key, invert)) = convert_hadoop_key_to_opendal(key) {
+            let final_value = if invert {
+                // Hadoop reads booleans case-insensitively and ignores padding,
+                // so "TRUE" / " true " must count as path-style being enabled.
+                // Path-style on means virtual-host style off, and vice versa.
+                let path_style = value.trim().eq_ignore_ascii_case("true");
+                (!path_style).to_string()
+            } else {
+                value.clone()
+            };
+            props.insert(opendal_key, final_value);
+        }
+    }
+
+    props
+}
+
+/// Manager for security tokens that refreshes tokens in a background task.
+///
+/// This follows the pattern from Java's `DefaultSecurityTokenManager`, where
+/// a background thread periodically refreshes tokens based on their expiration time.
+///
+/// Uses a `tokio::sync::watch` channel to broadcast token updates to consumers.
+/// Consumers subscribe by calling `subscribe()`, which returns a receiver.
+///
+/// The channel value is `Option<HashMap>`:
+/// - `None` = not yet fetched, consumers should wait
+/// - `Some(HashMap)` = fetched (may be empty if no auth needed)
+///
+/// # Example
+/// ```ignore
+/// let manager = SecurityTokenManager::new(rpc_client, metadata);
+/// let credentials_rx = manager.subscribe();
+/// manager.start(); // spawns the refresh task with `tokio::spawn`
+///
+/// // Consumer can get latest credentials via:
+/// let props = credentials_rx.borrow().clone();
+/// ```
+pub struct SecurityTokenManager {
+    rpc_client: Arc<RpcClient>,
+    metadata: Arc<Metadata>,
+    token_renewal_ratio: f64, // fraction of the time left until expiry to wait before refreshing
+    renewal_retry_backoff: Duration, // wait applied after a failed token fetch
+    /// Watch channel sender for broadcasting token updates
+    credentials_tx: watch::Sender<Option<HashMap<String, String>>>,
+    /// Watch channel receiver (kept to allow cloning for new subscribers)
+    credentials_rx: watch::Receiver<Option<HashMap<String, String>>>,
+    /// Handle to the background refresh task; `None` while no task is registered
+    task_handle: RwLock<Option<JoinHandle<()>>>,
+    /// Sender to signal shutdown; taken (left as `None`) by `stop()`
+    shutdown_tx: RwLock<Option<oneshot::Sender<()>>>,
+}
+
+impl SecurityTokenManager {
+    /// Creates a stopped manager; the watch channel starts at `None` (nothing fetched yet).
+    pub fn new(rpc_client: Arc<RpcClient>, metadata: Arc<Metadata>) -> Self {
+        let (credentials_tx, credentials_rx) = watch::channel(None);
+        Self {
+            rpc_client,
+            metadata,
+            token_renewal_ratio: DEFAULT_TOKEN_RENEWAL_RATIO,
+            renewal_retry_backoff: DEFAULT_RENEWAL_RETRY_BACKOFF,
+            credentials_tx,
+            credentials_rx,
+            task_handle: RwLock::new(None),
+            shutdown_tx: RwLock::new(None),
+        }
+    }
+
+    /// Subscribe to credential updates.
+    /// Returns a receiver that always contains the latest credentials.
+    /// Consumers can call `receiver.borrow()` to get the current value.
+    pub fn subscribe(&self) -> CredentialsReceiver {
+        self.credentials_rx.clone()
+    }
+
+    /// Start the background token refresh task.
+    ///
+    /// Calling this while a task is already registered only logs a warning. The
+    /// `task_handle` write lock is held from that check until the new handle is
+    /// stored, so concurrent callers cannot both spawn a refresh loop.
+    pub fn start(&self) {
+        let mut task_slot = self.task_handle.write();
+        if task_slot.is_some() {
+            warn!("SecurityTokenManager is already started");
+            return;
+        }
+
+        let (shutdown_tx, shutdown_rx) = oneshot::channel();
+        *self.shutdown_tx.write() = Some(shutdown_tx);
+
+        // The task owns clones of everything it needs and never borrows `self`.
+        let refresh_loop = Self::token_refresh_loop(
+            Arc::clone(&self.rpc_client),
+            Arc::clone(&self.metadata),
+            self.token_renewal_ratio,
+            self.renewal_retry_backoff,
+            self.credentials_tx.clone(),
+            shutdown_rx,
+        );
+
+        *task_slot = Some(tokio::spawn(refresh_loop));
+        info!("SecurityTokenManager started");
+    }
+
+    /// Stop the background token refresh task.
+    pub fn stop(&self) {
+        // Take the sender out first so its lock is released before signalling.
+        let shutdown = self.shutdown_tx.write().take();
+        if let Some(tx) = shutdown {
+            // A send error only means the receiving side is already gone.
+            let _ = tx.send(());
+        }
+        // Dropping the handle detaches the task; it ends when it sees the signal.
+        drop(self.task_handle.write().take());
+        info!("SecurityTokenManager stopped");
+    }
+
+    /// Background task that periodically refreshes tokens.
+    ///
+    /// Each pass fetches a token, publishes it on the watch channel, then waits
+    /// for the computed delay or the shutdown signal, whichever comes first.
+    async fn token_refresh_loop(
+        rpc_client: Arc<RpcClient>,
+        metadata: Arc<Metadata>,
+        token_renewal_ratio: f64,
+        renewal_retry_backoff: Duration,
+        credentials_tx: watch::Sender<Option<HashMap<String, String>>>,
+        mut shutdown_rx: oneshot::Receiver<()>,
+    ) {
+        info!("Starting token refresh loop");
+
+        loop {
+            let next_delay = match Self::fetch_token(&rpc_client, &metadata).await {
+                Ok((props, expiration_time)) => {
+                    // `Some` marks the credentials as fetched, even when empty.
+                    if let Err(e) = credentials_tx.send(Some(props)) {
+                        debug!("No active subscribers for credentials update: {e:?}");
+                    }
+
+                    match expiration_time {
+                        Some(exp_time) => {
+                            Self::calculate_renewal_delay(exp_time, token_renewal_ratio)
+                        }
+                        None => {
+                            // No expiry reported: use the fixed long interval.
+                            info!(
+                                "Token has no expiration time (never expires), next refresh in {DEFAULT_NON_EXPIRING_REFRESH_INTERVAL:?}"
+                            );
+                            DEFAULT_NON_EXPIRING_REFRESH_INTERVAL
+                        }
+                    }
+                }
+                Err(e) => {
+                    warn!(
+                        "Failed to obtain security token: {e:?}, will retry in {renewal_retry_backoff:?}"
+                    );
+                    renewal_retry_backoff
+                }
+            };
+
+            debug!("Next token refresh in {next_delay:?}");
+
+            // Wait for the delay to elapse, or stop early when shutdown is signalled.
+            tokio::select! {
+                _ = tokio::time::sleep(next_delay) => {}
+                _ = &mut shutdown_rx => {
+                    info!("Token refresh loop received shutdown signal");
+                    break;
+                }
+            }
+        }
+    }
+
+    /// Fetch token from server.
+    /// Returns the props and expiration time if available.
+    async fn fetch_token(
+        rpc_client: &Arc<RpcClient>,
+        metadata: &Arc<Metadata>,
+    ) -> Result<(HashMap<String, String>, Option<i64>)> {
+        let cluster = metadata.get_cluster();
+        let server_node =
+            cluster
+                .get_one_available_server()
+                .ok_or_else(|| Error::UnexpectedError {
+                    message: "No tablet server available for token refresh".to_string(),
+                    source: None,
+                })?;
+
+        let conn = rpc_client.get_connection(server_node).await?;
+        let response = conn.request(GetSecurityTokenRequest::new()).await?;
+
+        // The token may be empty if remote filesystem doesn't require authentication
+        if response.token.is_empty() {
+            info!("Empty token received, remote filesystem may not require authentication");
+            return Ok((HashMap::new(), response.expiration_time));
+        }
+
+        let credentials: Credentials =
+            serde_json::from_slice(&response.token).map_err(|e| Error::JsonSerdeError {
+                message: format!("Error when parsing token from server: {e}"),
+            })?;
+
+        let mut addition_infos = HashMap::new();
+        for kv in &response.addition_info {
+            addition_infos.insert(kv.key.clone(), kv.value.clone());
+        }
+
+        let props = build_remote_fs_props(&credentials, &addition_infos);
+        debug!("Security token fetched successfully");
+
+        Ok((props, response.expiration_time))
+    }
+
+    /// Calculate the delay before next token renewal (`expiration_time` is epoch millis).
+    /// Takes `renewal_ratio` of the time left until expiry, capping that time at
+    /// MAX_RENEWAL_DELAY and never returning less than MIN_RENEWAL_DELAY.
+    fn calculate_renewal_delay(expiration_time: i64, renewal_ratio: f64) -> Duration {
+        let now = SystemTime::now()
+            .duration_since(UNIX_EPOCH)
+            .unwrap()
+            .as_millis() as i64;
+
+        // Saturating: an extreme `expiration_time` must not overflow the subtraction.
+        let time_until_expiry = expiration_time.saturating_sub(now);
+        if time_until_expiry <= 0 {
+            // Token already expired, refresh immediately
+            return MIN_RENEWAL_DELAY;
+        }
+
+        // Cap time_until_expiry to prevent overflow when casting to f64 and back
+        let max_delay_ms = MAX_RENEWAL_DELAY.as_millis() as i64;
+        let capped_time = time_until_expiry.min(max_delay_ms);
+
+        let delay_ms = (capped_time as f64 * renewal_ratio) as u64;
+        let delay = Duration::from_millis(delay_ms);
+
+        debug!(
+            "Calculated renewal delay: {delay:?} (expiration: {expiration_time}, now: {now}, ratio: {renewal_ratio})"
+        );
+
+        delay.clamp(MIN_RENEWAL_DELAY, MAX_RENEWAL_DELAY)
+    }
+}
+
+impl Drop for SecurityTokenManager {
+    fn drop(&mut self) {
+        self.stop(); // signal the refresh task to shut down when the manager is dropped
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn convert_hadoop_key_to_opendal_maps_known_keys() {
+        // S3 keys
+        let (key, invert) = convert_hadoop_key_to_opendal("fs.s3a.endpoint").expect("key");
+        assert_eq!(key, "endpoint");
+        assert!(!invert);
+
+        let (key, invert) = convert_hadoop_key_to_opendal("fs.s3a.path.style.access").expect("key");
+        assert_eq!(key, "enable_virtual_host_style");
+        assert!(invert);
+
+        assert!(convert_hadoop_key_to_opendal("fs.s3a.connection.ssl.enabled").is_none());
+
+        // OSS keys
+        let (key, invert) = convert_hadoop_key_to_opendal("fs.oss.endpoint").expect("key");
+        assert_eq!(key, "endpoint");
+        assert!(!invert);
+
+        let (key, invert) = convert_hadoop_key_to_opendal("fs.oss.region").expect("key");
+        assert_eq!(key, "region");
+        assert!(!invert);
+
+        // Unknown key
+        assert!(convert_hadoop_key_to_opendal("unknown.key").is_none());
+    }
+
+    #[test]
+    fn calculate_renewal_delay_returns_correct_delay() {
+        let now = SystemTime::now()
+            .duration_since(UNIX_EPOCH)
+            .unwrap()
+            .as_millis() as i64;
+
+        // Token expires in 1 hour
+        let expiration = now + 3600 * 1000;
+        let delay = SecurityTokenManager::calculate_renewal_delay(expiration, 0.8);
+
+        // Should be approximately 48 minutes (80% of 1 hour)
+        let expected_min = Duration::from_secs(2800); // ~46.7 minutes
+        let expected_max = Duration::from_secs(2900); // ~48.3 minutes
+        assert!(
+            delay >= expected_min && delay <= expected_max,
+            "Expected delay between {expected_min:?} and {expected_max:?}, got {delay:?}"
+        );
+    }
+
+    #[test]
+    fn calculate_renewal_delay_handles_expired_token() {
+        let now = SystemTime::now()
+            .duration_since(UNIX_EPOCH)
+            .unwrap()
+            .as_millis() as i64;
+
+        // Token already expired
+        let expiration = now - 1000;
+        let delay = SecurityTokenManager::calculate_renewal_delay(expiration, 0.8);
+
+        // Should return minimum delay
+        assert_eq!(delay, MIN_RENEWAL_DELAY);
+    }
+
+    #[test]
+    fn build_remote_fs_props_includes_all_fields() {
+        let credentials = Credentials {
+            access_key_id: "ak".to_string(),
+            access_key_secret: "sk".to_string(),
+            security_token: Some("token".to_string()),
+        };
+        let addition_infos =
+            HashMap::from([("fs.s3a.path.style.access".to_string(), "true".to_string())]);
+
+        let props = build_remote_fs_props(&credentials, &addition_infos);
+        assert_eq!(props.get("access_key_id"), Some(&"ak".to_string()));
+        assert_eq!(props.get("secret_access_key"), Some(&"sk".to_string()));
+        assert_eq!(props.get("access_key_secret"), Some(&"sk".to_string()));
+        assert_eq!(props.get("security_token"), Some(&"token".to_string()));
+        assert_eq!(
+            props.get("enable_virtual_host_style"),
+            Some(&"false".to_string())
+        );
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/client/lookup/lookup_client.rs b/fluss-rust/crates/fluss/src/client/lookup/lookup_client.rs
new file mode 100644
index 0000000000..4d507aa9bf
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/client/lookup/lookup_client.rs
@@ -0,0 +1,250 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Lookup client that batches multiple lookups together for improved throughput.
+//!
+//! This client achieves parity with the Java client by:
+//! - Queuing lookup operations instead of sending them immediately
+//! - Batching multiple lookups to the same server/bucket
+//! - Running a background sender task to process batches
+
+use super::{LookupQueue, PrefixLookupQuery, PrimaryLookupQuery, QueuedLookup};
+use crate::client::lookup::lookup_sender::LookupSender;
+use crate::client::metadata::Metadata;
+use crate::config::Config;
+use crate::error::{Error, Result};
+use crate::metadata::{TableBucket, TablePath};
+use bytes::Bytes;
+use log::{debug, error};
+use std::sync::Arc;
+use std::sync::atomic::{AtomicBool, Ordering};
+use std::time::Duration;
+use tokio::sync::{mpsc, watch};
+use tokio::task::JoinHandle;
+
+/// A client that looks up values from the server with batching support.
+///
+/// The lookup client uses a queue and background sender to batch multiple
+/// lookup operations together, reducing network round trips and improving
+/// throughput.
+///
+/// # Example
+///
+/// ```ignore
+/// let lookup_client = LookupClient::new(&config, metadata);
+/// let result = lookup_client.lookup(table_path, table_bucket, key_bytes).await?;
+/// ```
+pub struct LookupClient {
+    /// Channel to send lookup requests to the queue
+    lookup_tx: mpsc::Sender<QueuedLookup>,
+    /// Handle to the sender task; taken by `close()`, otherwise aborted on drop
+    sender_handle: Option<JoinHandle<()>>,
+    /// Watch channel for internal shutdown handling (`true` is sent by `close()`)
+    shutdown_tx: watch::Sender<bool>,
+    /// Whether the client is closed
+    closed: AtomicBool,
+}
+
+impl LookupClient {
+    /// Creates a new lookup client.
+    ///
+    /// Spawns the background sender with `tokio::spawn`, so this must be called
+    /// from within a Tokio runtime.
+    pub fn new(config: &Config, metadata: Arc<Metadata>) -> Self {
+        // Queue plus its two sending halves: new lookups and re-enqueued lookups.
+        let cluster_rx = metadata.subscribe_cluster_changes();
+        let (queue, lookup_tx, re_enqueue_tx) = LookupQueue::new(
+            config.lookup_queue_size,
+            config.lookup_max_batch_size,
+            config.lookup_batch_timeout_ms,
+            cluster_rx,
+        );
+
+        // The sender is handed the receiving half of this shutdown channel.
+        let (shutdown_tx, shutdown_rx) = watch::channel(false);
+
+        let mut sender = LookupSender::new(
+            metadata,
+            queue,
+            re_enqueue_tx,
+            config.lookup_max_inflight_requests,
+            config.lookup_max_retries,
+            shutdown_rx,
+        );
+
+        // Spawn sender task - sender handles shutdown internally
+        let sender_handle = tokio::spawn(async move {
+            sender.run().await;
+            debug!("Lookup sender completed");
+        });
+
+        Self {
+            lookup_tx,
+            sender_handle: Some(sender_handle),
+            shutdown_tx,
+            closed: AtomicBool::new(false),
+        }
+    }
+
+    /// Looks up a value by its primary key.
+    ///
+    /// This method queues the lookup operation and returns a future that will
+    /// complete when the server responds. Multiple lookups may be batched together
+    /// for improved throughput.
+    ///
+    /// # Arguments
+    /// * `table_path` - The table path
+    /// * `table_bucket` - The table bucket
+    /// * `key_bytes` - The encoded primary key bytes
+    ///
+    /// # Returns
+    /// * `Ok(Some(bytes))` - The value bytes if found
+    /// * `Ok(None)` - If the key was not found
+    /// * `Err(Error)` - If the lookup fails
+    pub async fn lookup(
+        &self,
+        table_path: TablePath,
+        table_bucket: TableBucket,
+        key_bytes: Bytes,
+    ) -> Result<Option<Vec<u8>>> {
+        self.ensure_open()?;
+
+        // The result comes back through this one-shot channel.
+        let (result_tx, result_rx) = tokio::sync::oneshot::channel();
+        let query = PrimaryLookupQuery::new(table_path, table_bucket, key_bytes, result_tx);
+        self.submit(QueuedLookup::Primary(query), result_rx).await
+    }
+
+    /// Looks up all values matching a prefix key.
+    ///
+    /// The prefix key must be a prefix subset of the table's primary key
+    /// (specifically, the bucket keys). Returns every row whose primary key
+    /// starts with the supplied prefix. Queries are batched together with
+    /// other lookups going to the same server for improved throughput.
+    ///
+    /// # Arguments
+    /// * `table_path` - The table path
+    /// * `table_bucket` - The table bucket computed from the bucket key part of the prefix
+    /// * `key_bytes` - The encoded prefix key bytes
+    ///
+    /// # Returns
+    /// * `Ok(rows)` - Every row matching the prefix (possibly empty)
+    /// * `Err(Error)` - If the lookup fails
+    pub async fn prefix_lookup(
+        &self,
+        table_path: TablePath,
+        table_bucket: TableBucket,
+        key_bytes: Bytes,
+    ) -> Result<Vec<Vec<u8>>> {
+        self.ensure_open()?;
+
+        // The result comes back through this one-shot channel.
+        let (result_tx, result_rx) = tokio::sync::oneshot::channel();
+        let query = PrefixLookupQuery::new(table_path, table_bucket, key_bytes, result_tx);
+        self.submit(QueuedLookup::Prefix(query), result_rx).await
+    }
+
+    /// Fails with the "closed" error once `close()` has marked the client closed.
+    fn ensure_open(&self) -> Result<()> {
+        if self.closed.load(Ordering::Acquire) {
+            return Err(Error::UnexpectedError {
+                message: "Lookup client is closed".to_string(),
+                source: None,
+            });
+        }
+        Ok(())
+    }
+
+    /// Enqueues `query`, then waits for its result to arrive on `result_rx`.
+    async fn submit<T>(
+        &self,
+        query: QueuedLookup,
+        result_rx: tokio::sync::oneshot::Receiver<Result<T>>,
+    ) -> Result<T> {
+        self.enqueue(query).await?;
+
+        // A sender dropped without replying surfaces as the channel-closed error.
+        result_rx.await.map_err(|_| Error::UnexpectedError {
+            message: "Lookup result channel closed".to_string(),
+            source: None,
+        })?
+    }
+
+    /// Pushes `query` onto the lookup channel, waiting for capacity if it is full.
+    /// A closed channel is logged with the query's details and returned as an error.
+    async fn enqueue(&self, query: QueuedLookup) -> Result<()> {
+        self.lookup_tx.send(query).await.map_err(|e| {
+            let failed_query = e.0;
+            error!(
+                "Failed to queue lookup: channel closed. table_path: {}, table_bucket: {:?}, key_len: {}",
+                failed_query.table_path(),
+                failed_query.table_bucket(),
+                failed_query.key().len()
+            );
+            Error::UnexpectedError {
+                message: "Failed to queue lookup: channel closed".to_string(),
+                source: None,
+            }
+        })
+    }
+
+    /// Closes the lookup client gracefully.
+    ///
+    /// Marks the client closed, signals the sender task to shut down, and waits
+    /// up to `timeout` for it to finish; if it does not, the task is aborted.
+    pub async fn close(mut self, timeout: Duration) {
+        debug!("Closing lookup client");
+
+        // Mark as closed to reject new lookups
+        self.closed.store(true, Ordering::Release);
+
+        // Send shutdown signal via watch channel
+        let _ = self.shutdown_tx.send(true);
+
+        // Wait for sender to complete with timeout
+        if let Some(handle) = self.sender_handle.take() {
+            debug!("Waiting for sender task to complete...");
+            let abort_handle = handle.abort_handle();
+
+            match tokio::time::timeout(timeout, handle).await {
+                Ok(Ok(())) => {
+                    debug!("Lookup sender task completed gracefully.");
+                }
+                Ok(Err(join_error)) => {
+                    error!("Lookup sender task panicked: {:?}", join_error);
+                }
+                Err(_elapsed) => {
+                    error!("Lookup sender task did not complete within timeout. Forcing shutdown.");
+                    abort_handle.abort();
+                }
+            }
+        } else {
+            debug!("Lookup client was already closed or never initialized properly.");
+        }
+
+        debug!("Lookup client closed");
+    }
+}
+
+impl Drop for LookupClient {
+    fn drop(&mut self) {
+        // A handle still present here was not consumed by close(): abort its task.
+        if let Some(sender_task) = self.sender_handle.take() {
+            sender_task.abort();
+        }
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/client/lookup/lookup_query.rs b/fluss-rust/crates/fluss/src/client/lookup/lookup_query.rs
new file mode 100644
index 0000000000..19830aefa5
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/client/lookup/lookup_query.rs
@@ -0,0 +1,130 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::error::{Error, Result};
+use crate::metadata::{TableBucket, TablePath};
+use bytes::Bytes;
+use tokio::sync::oneshot;
+
+pub struct LookupQuery<T> {
+    table_path: TablePath,
+    table_bucket: TableBucket,
+    key: Bytes,
+    retries: i32, // starts at 0; bumped by `increment_retries`
+    result_tx: Option<oneshot::Sender<Result<T>>>, // `None` once `complete` has used it
+}
+
+impl<T> LookupQuery<T> {
+    pub fn new(
+        table_path: TablePath,
+        table_bucket: TableBucket,
+        key: Bytes,
+        result_tx: oneshot::Sender<Result<T>>,
+    ) -> Self {
+        Self {
+            table_path,
+            table_bucket,
+            key,
+            retries: 0, // a freshly created query has not been retried
+            result_tx: Some(result_tx), // kept until the query is completed
+        }
+    }
+
+    pub fn table_path(&self) -> &TablePath {
+        &self.table_path
+    }
+
+    pub fn table_bucket(&self) -> &TableBucket {
+        &self.table_bucket
+    }
+
+    pub fn key(&self) -> &Bytes {
+        &self.key
+    }
+
+    pub fn retries(&self) -> i32 {
+        self.retries
+    }
+
+    pub fn increment_retries(&mut self) {
+        self.retries += 1;
+    }
+
+    pub fn is_done(&self) -> bool {
+        self.result_tx.is_none() // `complete` takes the sender, so `None` means done
+    }
+
+    pub fn complete(&mut self, result: Result<T>) {
+        if let Some(tx) = self.result_tx.take() {
+            let _ = tx.send(result); // a send failure is deliberately ignored
+        }
+    }
+
+    pub fn complete_with_error(&mut self, error: Error) {
+        self.complete(Err(error)); // no-op if the query was already completed
+    }
+}
+
+pub type PrimaryLookupQuery = LookupQuery<Option<Vec<u8>>>; // result: value bytes, if any
+pub type PrefixLookupQuery = LookupQuery<Vec<Vec<u8>>>; // result: a list of value byte vectors
+
+pub enum QueuedLookup {
+    Primary(PrimaryLookupQuery), // built by `LookupClient::lookup`
+    Prefix(PrefixLookupQuery), // built by `LookupClient::prefix_lookup`
+}
+
+impl QueuedLookup {
+    pub fn table_path(&self) -> &TablePath {
+        match self {
+            QueuedLookup::Primary(primary) => primary.table_path(),
+            QueuedLookup::Prefix(prefix) => prefix.table_path(),
+        }
+    }
+
+    pub fn table_bucket(&self) -> &TableBucket {
+        match self {
+            QueuedLookup::Primary(primary) => primary.table_bucket(),
+            QueuedLookup::Prefix(prefix) => prefix.table_bucket(),
+        }
+    }
+
+    pub fn key(&self) -> &Bytes {
+        match self {
+            QueuedLookup::Primary(primary) => primary.key(),
+            QueuedLookup::Prefix(prefix) => prefix.key(),
+        }
+    }
+
+    pub fn complete_with_error(&mut self, error: Error) {
+        match self {
+            QueuedLookup::Primary(primary) => primary.complete_with_error(error),
+            QueuedLookup::Prefix(prefix) => prefix.complete_with_error(error),
+        }
+    }
+}
+
+impl From<PrimaryLookupQuery> for QueuedLookup {
+    fn from(query: PrimaryLookupQuery) -> Self {
+        Self::Primary(query)
+    }
+}
+
+impl From<PrefixLookupQuery> for QueuedLookup {
+    fn from(query: PrefixLookupQuery) -> Self {
+        Self::Prefix(query)
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/client/lookup/lookup_queue.rs b/fluss-rust/crates/fluss/src/client/lookup/lookup_queue.rs
new file mode 100644
index 0000000000..295ec93d8c
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/client/lookup/lookup_queue.rs
@@ -0,0 +1,148 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Lookup queue for buffering pending lookup operations.
+//!
+//! This queue buffers lookup operations and provides batched draining
+//! to improve throughput by reducing network round trips.
+
+use super::QueuedLookup;
+use std::time::Duration;
+use tokio::sync::{mpsc, watch};
+
+/// A queue that buffers pending lookup operations and provides batched draining.
+///
+/// The queue supports two types of entries:
+/// - New lookups from client calls
+/// - Re-enqueued lookups from retry logic
+///
+/// Re-enqueued lookups are prioritized over new lookups to ensure fair processing.
+///
+/// The queue owns only the receiving halves; the matching senders are handed
+/// out by `LookupQueue::new`.
+pub struct LookupQueue {
+    /// Channel for receiving lookup requests (bounded; capacity is set in `new`)
+    lookup_rx: mpsc::Receiver<QueuedLookup>,
+    /// Channel for receiving re-enqueued lookups (unbounded)
+    re_enqueue_rx: mpsc::UnboundedReceiver<QueuedLookup>,
+    /// Maximum batch size for draining
+    max_batch_size: usize,
+    /// Timeout for batch collection, measured from the start of each `drain()` call
+    batch_timeout: Duration,
+    /// Wakes `drain()` early when the cluster changes.
+    cluster_rx: watch::Receiver<u64>,
+}
+
+impl LookupQueue {
+    /// Creates the queue together with the sender halves that feed it.
+    ///
+    /// Returns `(queue, lookup_tx, re_enqueue_tx)`: `lookup_tx` is the bounded
+    /// sender for new lookups (capacity `queue_size`) and `re_enqueue_tx` is
+    /// the unbounded sender for retried lookups. `batch_timeout_ms` is
+    /// interpreted as milliseconds.
+    ///
+    /// `queue_size` is passed straight to `tokio::sync::mpsc::channel`, which
+    /// panics when given a capacity of zero.
+    pub fn new(
+        queue_size: usize,
+        max_batch_size: usize,
+        batch_timeout_ms: u64,
+        cluster_rx: watch::Receiver<u64>,
+    ) -> (
+        Self,
+        mpsc::Sender<QueuedLookup>,
+        mpsc::UnboundedSender<QueuedLookup>,
+    ) {
+        let (lookup_tx, lookup_rx) = mpsc::channel(queue_size);
+        let (re_enqueue_tx, re_enqueue_rx) = mpsc::unbounded_channel();
+        let batch_timeout = Duration::from_millis(batch_timeout_ms);
+
+        (
+            Self {
+                lookup_rx,
+                re_enqueue_rx,
+                max_batch_size,
+                batch_timeout,
+                cluster_rx,
+            },
+            lookup_tx,
+            re_enqueue_tx,
+        )
+    }
+
+    /// Drains a batch of lookup queries from the queue.
+    ///
+    /// Collects lookups until one of the following happens:
+    /// - `max_batch_size` lookups have been gathered,
+    /// - `batch_timeout` (measured from the start of this call) has elapsed,
+    /// - the lookup channel reports that it is closed, or
+    /// - the cluster changes while at least one lookup has been collected.
+    ///
+    /// The returned batch may be empty.
+    pub async fn drain(&mut self) -> Vec<QueuedLookup> {
+        let mut lookups = Vec::with_capacity(self.max_batch_size);
+        // One deadline for the whole call, so later iterations only wait for
+        // whatever is left of the batch window.
+        let deadline = tokio::time::Instant::now() + self.batch_timeout;
+
+        loop {
+            let remaining = deadline.saturating_duration_since(tokio::time::Instant::now());
+            // Checked before any receive: with a zero `batch_timeout` the call
+            // returns an empty batch without touching either channel.
+            if remaining.is_zero() {
+                break;
+            }
+
+            // Prioritize re-enqueued lookups.
+            // Non-blocking: only takes what is already sitting in the channel.
+            while lookups.len() < self.max_batch_size {
+                match self.re_enqueue_rx.try_recv() {
+                    Ok(lookup) => lookups.push(lookup),
+                    Err(_) => break,
+                }
+            }
+            if lookups.len() >= self.max_batch_size {
+                break;
+            }
+
+            let sleep = tokio::time::sleep(remaining);
+            tokio::select! {
+                // `biased` polls the branches in the order written: new
+                // lookups first, then cluster changes, then the timer.
+                biased;
+                maybe = self.lookup_rx.recv() => {
+                    match maybe {
+                        Some(lookup) => {
+                            lookups.push(lookup);
+                            // Opportunistically grab everything else that is
+                            // immediately available, up to the batch limit.
+                            while lookups.len() < self.max_batch_size {
+                                match self.lookup_rx.try_recv() {
+                                    Ok(lookup) => lookups.push(lookup),
+                                    Err(_) => break,
+                                }
+                            }
+                        }
+                        // Nothing more will arrive on this channel; return
+                        // whatever has been collected so far.
+                        None => break,
+                    }
+                }
+                _ = self.cluster_rx.changed() => {
+                    // With lookups in hand, hand them out right away; with an
+                    // empty batch, loop again (which re-checks the re-enqueue
+                    // channel) for the rest of the window.
+                    if !lookups.is_empty() {
+                        break;
+                    }
+                }
+                // Batch window elapsed.
+                _ = sleep => break,
+            }
+
+            if lookups.len() >= self.max_batch_size {
+                break;
+            }
+        }
+
+        lookups
+    }
+
+    /// Drains all remaining lookups from the queue.
+    ///
+    /// Never waits: takes whatever is immediately available, re-enqueued
+    /// lookups first and then new lookups, and stops on each channel at the
+    /// first failed `try_recv`.
+    pub fn drain_all(&mut self) -> Vec<QueuedLookup> {
+        // Re-enqueued lookups come first in the returned batch.
+        let mut drained: Vec<QueuedLookup> =
+            std::iter::from_fn(|| self.re_enqueue_rx.try_recv().ok()).collect();
+
+        // Then everything still waiting in the main queue.
+        drained.extend(std::iter::from_fn(|| self.lookup_rx.try_recv().ok()));
+
+        drained
+    }
+
+    /// Returns true if there are undrained lookups in the queue.
+    ///
+    /// Both the main channel and the re-enqueue channel are considered.
+    pub fn has_undrained(&self) -> bool {
+        let all_empty = self.lookup_rx.is_empty() && self.re_enqueue_rx.is_empty();
+        !all_empty
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/client/lookup/lookup_sender.rs b/fluss-rust/crates/fluss/src/client/lookup/lookup_sender.rs
new file mode 100644
index 0000000000..06014bfbb7
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/client/lookup/lookup_sender.rs
@@ -0,0 +1,711 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use super::{LookupQueue, QueuedLookup};
+use crate::client::lookup::lookup_query::LookupQuery;
+use crate::client::metadata::Metadata;
+use crate::error::{Error, FlussError, Result};
+use crate::metadata::{TableBucket, TablePath};
+use crate::proto::{LookupResponse, PrefixLookupResponse};
+use crate::rpc::ServerConnection;
+use crate::rpc::message::{LookupRequest, PrefixLookupRequest, ReadType, RequestBody, WriteType};
+use crate::{BucketId, PartitionId, TableId};
+use bytes::Bytes;
+use futures::stream::{FuturesUnordered, StreamExt};
+use log::{debug, error, warn};
+use std::collections::{HashMap, HashSet};
+use std::io::Cursor;
+use std::sync::Arc;
+use std::sync::atomic::{AtomicBool, Ordering};
+use std::time::Duration;
+use tokio::sync::{OwnedSemaphorePermit, Semaphore, mpsc, watch};
+
+/// Identifier of a server, used as the key when grouping by bucket leader.
+type ServerId = i32;
+
+/// Lookup batches keyed first by leader server and then by table bucket.
+type BatchesByLeader<T> = HashMap<ServerId, HashMap<TableBucket, LookupBatch<T>>>;
+/// `BatchesByLeader` for primary lookups (value type `Option<Vec<u8>>`).
+type PrimaryBatches = BatchesByLeader<Option<Vec<u8>>>;
+/// `BatchesByLeader` for prefix lookups (value type `Vec<Vec<u8>>`).
+type PrefixBatches = BatchesByLeader<Vec<Vec<u8>>>;
+
+/// Protocol-independent view of one bucket's entry in a lookup response,
+/// produced by `LookupProtocol::decode_buckets`.
+struct BucketResponse<V> {
+    /// Partition the bucket belongs to, if the response carried one.
+    partition_id: Option<PartitionId>,
+    /// Bucket this entry answers for.
+    bucket_id: BucketId,
+    /// Error code reported for the bucket, if any.
+    error_code: Option<i32>,
+    /// Error message accompanying `error_code`, if any.
+    error_message: Option<String>,
+    /// Returned values; `LookupBatch::complete` expects one per requested key.
+    values: Vec<V>,
+}
+
+/// Abstraction over the lookup RPC flavours so that sending and response
+/// handling can be written once and instantiated per protocol.
+trait LookupProtocol {
+    /// Request message sent to the server.
+    type Request: RequestBody<ResponseBody = Self::Response> + Send + WriteType<Vec<u8>>;
+    /// Response message paired with `Request`.
+    type Response: ReadType<Cursor<Vec<u8>>> + Send;
+    /// Per-key result type delivered to the caller.
+    type Value: Send;
+
+    /// Human-readable operation name used in log and error messages.
+    const OP_NAME: &'static str;
+
+    /// Builds one request for `table_id` from `(bucket, partition, keys)` tuples.
+    fn build_request(
+        table_id: TableId,
+        keys_by_bucket: Vec<(BucketId, Option<PartitionId>, Vec<Bytes>)>,
+    ) -> Self::Request;
+
+    /// Converts a response into protocol-independent per-bucket entries.
+    fn decode_buckets(
+        response: Self::Response,
+    ) -> impl Iterator<Item = BucketResponse<Self::Value>>;
+}
+
+/// Protocol marker for `LookupRequest` / `LookupResponse`.
+struct Primary;
+impl LookupProtocol for Primary {
+    type Request = LookupRequest;
+    type Response = LookupResponse;
+    type Value = Option<Vec<u8>>;
+
+    const OP_NAME: &'static str = "Lookup";
+
+    /// Delegates to `LookupRequest::new_batched`.
+    fn build_request(
+        table_id: TableId,
+        keys_by_bucket: Vec<(BucketId, Option<PartitionId>, Vec<Bytes>)>,
+    ) -> Self::Request {
+        LookupRequest::new_batched(table_id, keys_by_bucket)
+    }
+
+    /// Maps each bucket entry of the response to a `BucketResponse`, taking
+    /// the `values` field out of every returned value message.
+    fn decode_buckets(
+        response: Self::Response,
+    ) -> impl Iterator<Item = BucketResponse<Self::Value>> {
+        response.buckets_resp.into_iter().map(|bucket| {
+            let values: Vec<Self::Value> =
+                bucket.values.into_iter().map(|pb| pb.values).collect();
+            BucketResponse {
+                partition_id: bucket.partition_id,
+                bucket_id: bucket.bucket_id,
+                error_code: bucket.error_code,
+                error_message: bucket.error_message,
+                values,
+            }
+        })
+    }
+}
+
+/// Protocol marker for `PrefixLookupRequest` / `PrefixLookupResponse`.
+struct Prefix;
+impl LookupProtocol for Prefix {
+    type Request = PrefixLookupRequest;
+    type Response = PrefixLookupResponse;
+    type Value = Vec<Vec<u8>>;
+
+    const OP_NAME: &'static str = "Prefix lookup";
+
+    /// Delegates to `PrefixLookupRequest::new_batched`.
+    fn build_request(
+        table_id: TableId,
+        keys_by_bucket: Vec<(BucketId, Option<PartitionId>, Vec<Bytes>)>,
+    ) -> Self::Request {
+        PrefixLookupRequest::new_batched(table_id, keys_by_bucket)
+    }
+
+    /// Maps each bucket entry of the response to a `BucketResponse`, taking
+    /// the `values` field out of every returned value-list message.
+    fn decode_buckets(
+        response: Self::Response,
+    ) -> impl Iterator<Item = BucketResponse<Self::Value>> {
+        response.buckets_resp.into_iter().map(|bucket| {
+            let values: Vec<Self::Value> = bucket
+                .value_lists
+                .into_iter()
+                .map(|pb| pb.values)
+                .collect();
+            BucketResponse {
+                partition_id: bucket.partition_id,
+                bucket_id: bucket.bucket_id,
+                error_code: bucket.error_code,
+                error_message: bucket.error_message,
+                values,
+            }
+        })
+    }
+}
+
+/// Outcome of grouping lookups by bucket leader: the dispatchable batches plus
+/// what needs a metadata refresh because no leader was known.
+struct GroupByLeaderResult {
+    /// Primary lookup batches, keyed by leader server and bucket.
+    primary: PrimaryBatches,
+    /// Prefix lookup batches, keyed by leader server and bucket.
+    prefix: PrefixBatches,
+    /// Tables that had at least one lookup whose bucket leader was unknown.
+    unknown_leader_tables: HashSet<TablePath>,
+    /// Partition ids of buckets whose leader was unknown.
+    unknown_leader_partition_ids: HashSet<PartitionId>,
+}
+
+impl GroupByLeaderResult {
+    /// True when there is no primary and no prefix batch to dispatch.
+    fn is_empty(&self) -> bool {
+        self.prefix.is_empty() && self.primary.is_empty()
+    }
+
+    /// Folds the batches of `other` into `self`; the unknown-leader sets of
+    /// `other` are discarded.
+    ///
+    /// Assumes no `(server, bucket)` overlap — safe because the second pass only
+    /// re-groups items unknown in the first.
+    fn merge_batches(&mut self, other: GroupByLeaderResult) {
+        let GroupByLeaderResult {
+            primary, prefix, ..
+        } = other;
+
+        for (leader, by_bucket) in primary {
+            self.primary.entry(leader).or_default().extend(by_bucket);
+        }
+        for (leader, by_bucket) in prefix {
+            self.prefix.entry(leader).or_default().extend(by_bucket);
+        }
+    }
+}
+
+/// Return value of `LookupSender::group_by_leader`.
+struct GroupingResult {
+    /// Batches grouped by leader, plus the tables/partitions lacking a leader.
+    groups: GroupByLeaderResult,
+    /// The lookups themselves for which no leader was found.
+    unknowns: Vec<QueuedLookup>,
+}
+
+/// Background worker that drains the `LookupQueue`, groups lookups by bucket
+/// leader and sends them to the servers, retrying retriable failures.
+pub struct LookupSender {
+    /// Source of cluster snapshots, server connections and metadata refreshes.
+    metadata: Arc<Metadata>,
+    /// Queue the sender drains batches from.
+    queue: LookupQueue,
+    /// Sender used to put lookups back into circulation for another attempt.
+    re_enqueue_tx: mpsc::UnboundedSender<QueuedLookup>,
+    /// One permit is held for the duration of each per-table request.
+    inflight_semaphore: Arc<Semaphore>,
+    /// Upper bound on retries for a lookup after a retriable error.
+    max_retries: i32,
+    /// Cleared by `initiate_close` to stop the main loop in `run`.
+    running: AtomicBool,
+    /// When set, `run` skips the final pass over still-queued lookups.
+    force_close: AtomicBool,
+    /// Shutdown signal; a value of `true` asks the sender to stop.
+    shutdown_rx: watch::Receiver<bool>,
+}
+
+/// All lookups of one kind that target the same table bucket.
+struct LookupBatch<T> {
+    /// Bucket every lookup in this batch targets.
+    table_bucket: TableBucket,
+    /// Pending lookups, in the order they were added.
+    lookups: Vec<LookupQuery<T>>,
+    /// Keys of `lookups` in the same order; emptied by `keys_tuple`.
+    keys: Vec<Bytes>,
+}
+
+impl<T> LookupBatch<T> {
+    /// Creates an empty batch for `table_bucket`.
+    fn new(table_bucket: TableBucket) -> Self {
+        let (lookups, keys) = (Vec::new(), Vec::new());
+        Self {
+            table_bucket,
+            lookups,
+            keys,
+        }
+    }
+
+    /// Appends `lookup` and records a copy of its key at the same position.
+    fn add_lookup(&mut self, lookup: LookupQuery<T>) {
+        let key = lookup.key().clone();
+        self.keys.push(key);
+        self.lookups.push(lookup);
+    }
+
+    /// Completes the lookups pairwise with `values`, in order.
+    ///
+    /// If the number of values differs from the number of lookups, every
+    /// lookup is completed with an `UnexpectedError` instead.
+    fn complete(&mut self, values: Vec<T>) {
+        let (returned, expected) = (values.len(), self.lookups.len());
+        if returned == expected {
+            for (lookup, value) in self.lookups.iter_mut().zip(values) {
+                lookup.complete(Ok(value));
+            }
+            return;
+        }
+
+        let err_msg = format!(
+            "The number of return values ({}) does not match the number of lookups ({})",
+            returned, expected
+        );
+        self.complete_all_with_error(&err_msg);
+    }
+
+    /// Completes every lookup in the batch with an `UnexpectedError` carrying
+    /// `error_msg`.
+    fn complete_all_with_error(&mut self, error_msg: &str) {
+        self.lookups.iter_mut().for_each(|lookup| {
+            lookup.complete_with_error(Error::UnexpectedError {
+                message: error_msg.to_string(),
+                source: None,
+            })
+        });
+    }
+
+    /// Returns `(bucket id, partition id, keys)` for building a request.
+    ///
+    /// The keys are moved out, leaving `self.keys` empty afterwards.
+    fn keys_tuple(&mut self) -> (BucketId, Option<PartitionId>, Vec<Bytes>) {
+        let keys = std::mem::take(&mut self.keys);
+        let bucket_id = self.table_bucket.bucket_id();
+        let partition_id = self.table_bucket.partition_id();
+        (bucket_id, partition_id, keys)
+    }
+}
+
+impl LookupSender {
+    /// Builds a sender in the running state.
+    ///
+    /// `max_inflight_requests` sizes the semaphore that limits concurrent
+    /// requests, and `max_retries` bounds how often a lookup is retried after
+    /// a retriable error.
+    pub fn new(
+        metadata: Arc<Metadata>,
+        queue: LookupQueue,
+        re_enqueue_tx: mpsc::UnboundedSender<QueuedLookup>,
+        max_inflight_requests: usize,
+        max_retries: i32,
+        shutdown_rx: watch::Receiver<bool>,
+    ) -> Self {
+        let inflight_semaphore = Arc::new(Semaphore::new(max_inflight_requests));
+        let running = AtomicBool::new(true);
+        let force_close = AtomicBool::new(false);
+
+        Self {
+            metadata,
+            queue,
+            re_enqueue_tx,
+            inflight_semaphore,
+            max_retries,
+            running,
+            force_close,
+            shutdown_rx,
+        }
+    }
+
+    /// Drives the sender: repeatedly drains and dispatches batches until a
+    /// shutdown is requested, then makes one final pass over what is still
+    /// queued.
+    ///
+    /// The main loop stops when the shutdown channel carries `true`, when the
+    /// shutdown sender has been dropped, or when `running` has been cleared
+    /// via `initiate_close` / `force_close`. Unless `force_close` was set,
+    /// lookups still sitting in the queue are dispatched once before
+    /// returning.
+    pub async fn run(&mut self) {
+        debug!("Starting Fluss lookup sender");
+
+        let mut shutdown_rx = self.shutdown_rx.clone();
+
+        while self.running.load(Ordering::Acquire) {
+            if *shutdown_rx.borrow() {
+                debug!("Lookup sender received shutdown signal");
+                self.initiate_close();
+                break;
+            }
+
+            // NOTE(review): when the shutdown branch wins, the in-progress
+            // `run_once` future is dropped, so lookups it had already pulled
+            // out of the queue are dropped with it — confirm this is acceptable.
+            tokio::select! {
+                biased;
+                changed = shutdown_rx.changed() => {
+                    if changed.is_err() {
+                        // The shutdown sender is gone, so no signal can ever
+                        // arrive. `changed()` would now fail immediately on
+                        // every iteration and, being polled first, keep
+                        // `run_once` from ever running. Treat it as shutdown.
+                        debug!("Lookup sender shutdown channel closed, shutting down");
+                        self.initiate_close();
+                    } else if *shutdown_rx.borrow() {
+                        debug!("Lookup sender received shutdown signal during select");
+                        self.initiate_close();
+                    }
+                }
+                result = self.run_once(false) => {
+                    if let Err(e) = result {
+                        error!("Error in lookup sender: {}", e);
+                    }
+                }
+            }
+        }
+
+        debug!("Beginning shutdown of lookup sender, sending remaining lookups");
+
+        // TODO: Check the in-flight request count in the accumulator.
+        if !self.force_close.load(Ordering::Acquire) && self.queue.has_undrained() {
+            if let Err(e) = self.run_once(true).await {
+                error!("Error during lookup sender shutdown: {}", e);
+            }
+        }
+
+        // TODO: If force close failed, add logic to abort incomplete lookup requests.
+        debug!("Lookup sender shutdown complete");
+    }
+
+    /// Pulls one batch from the queue and dispatches it.
+    ///
+    /// With `drain_all` set, everything immediately available is taken
+    /// without waiting; otherwise the timed, size-limited `drain` is used.
+    async fn run_once(&mut self, drain_all: bool) -> Result<()> {
+        if drain_all {
+            let remaining = self.queue.drain_all();
+            return self.send_lookups(remaining).await;
+        }
+
+        let batch = self.queue.drain().await;
+        self.send_lookups(batch).await
+    }
+
+    /// Groups `lookups` by bucket leader and sends them, refreshing metadata
+    /// once for lookups whose leader is unknown.
+    ///
+    /// Lookups that still have no leader after the refresh are put back on the
+    /// re-enqueue channel. Per-request failures are handled inside
+    /// `send_request`; as written, this function always returns `Ok(())`.
+    async fn send_lookups(&self, lookups: Vec<QueuedLookup>) -> Result<()> {
+        if lookups.is_empty() {
+            return Ok(());
+        }
+
+        let GroupingResult {
+            mut groups,
+            unknowns,
+        } = self.group_by_leader(lookups);
+
+        if !unknowns.is_empty() {
+            let table_paths_refs: HashSet<&TablePath> =
+                groups.unknown_leader_tables.iter().collect();
+            let partition_ids: Vec<PartitionId> = groups
+                .unknown_leader_partition_ids
+                .iter()
+                .copied()
+                .collect();
+            // A failed refresh is only logged; the regroup below then runs
+            // against whatever cluster state is currently cached.
+            if let Err(e) = self
+                .metadata
+                .update_tables_metadata(&table_paths_refs, &HashSet::new(), partition_ids)
+                .await
+            {
+                warn!("Failed to update metadata for unknown leader tables: {}", e);
+            } else {
+                debug!(
+                    "Updated metadata due to unknown leader tables during lookup: {:?}",
+                    groups.unknown_leader_tables
+                );
+            }
+
+            // Re-group with fresh cluster state; dispatch what resolved, re-enqueue the rest.
+            // NOTE(review): these lookups are re-enqueued without
+            // `increment_retries`, so `max_retries` does not bound this path —
+            // confirm that retrying until a leader appears is intended.
+            let retry = self.group_by_leader(unknowns);
+            groups.merge_batches(retry.groups);
+            for item in retry.unknowns {
+                self.re_enqueue_lookup(item);
+            }
+
+            // Nothing to dispatch even after refresh — back off to avoid a tight RPC loop.
+            // The wait ends on the next cluster change or after 100 ms,
+            // whichever comes first.
+            if groups.is_empty() {
+                let mut cluster_rx = self.metadata.subscribe_cluster_changes();
+                tokio::select! {
+                    _ = cluster_rx.changed() => {}
+                    _ = tokio::time::sleep(Duration::from_millis(100)) => {}
+                }
+                return Ok(());
+            }
+        }
+
+        // One request future per leader server; primary and prefix lookups are
+        // driven concurrently and both sets are awaited to completion.
+        let primary_fut = async {
+            let mut pending = FuturesUnordered::new();
+            for (server, batches) in groups.primary {
+                pending.push(self.send_request::<Primary>(server, batches));
+            }
+            while pending.next().await.is_some() {}
+        };
+        let prefix_fut = async {
+            let mut pending = FuturesUnordered::new();
+            for (server, batches) in groups.prefix {
+                pending.push(self.send_request::<Prefix>(server, batches));
+            }
+            while pending.next().await.is_some() {}
+        };
+        tokio::join!(primary_fut, prefix_fut);
+
+        Ok(())
+    }
+
+    /// Splits `lookups` into per-leader, per-bucket batches using the current
+    /// cluster snapshot.
+    ///
+    /// Lookups whose bucket has no known leader are returned in `unknowns`,
+    /// and their table path (and partition id, if any) is recorded so the
+    /// caller can refresh metadata for them.
+    fn group_by_leader(&self, lookups: Vec<QueuedLookup>) -> GroupingResult {
+        let cluster = self.metadata.get_cluster();
+        let mut groups = GroupByLeaderResult {
+            primary: HashMap::new(),
+            prefix: HashMap::new(),
+            unknown_leader_tables: HashSet::new(),
+            unknown_leader_partition_ids: HashSet::new(),
+        };
+        let mut unknowns: Vec<QueuedLookup> = Vec::new();
+
+        for queued in lookups {
+            let bucket = queued.table_bucket().clone();
+
+            let Some(leader_id) = cluster.leader_for(&bucket).map(|node| node.id()) else {
+                warn!("No leader found for table bucket {} during lookup", bucket);
+                groups
+                    .unknown_leader_tables
+                    .insert(queued.table_path().clone());
+                if let Some(partition_id) = bucket.partition_id() {
+                    groups.unknown_leader_partition_ids.insert(partition_id);
+                }
+                unknowns.push(queued);
+                continue;
+            };
+
+            // Route the query into the batch map matching its kind.
+            match queued {
+                QueuedLookup::Primary(query) => groups
+                    .primary
+                    .entry(leader_id)
+                    .or_default()
+                    .entry(bucket.clone())
+                    .or_insert_with(|| LookupBatch::new(bucket))
+                    .add_lookup(query),
+                QueuedLookup::Prefix(query) => groups
+                    .prefix
+                    .entry(leader_id)
+                    .or_default()
+                    .entry(bucket.clone())
+                    .or_insert_with(|| LookupBatch::new(bucket))
+                    .add_lookup(query),
+            }
+        }
+
+        GroupingResult { groups, unknowns }
+    }
+
+    /// Sends every batch destined for server `destination`, one request per
+    /// table, and waits until all of those requests have finished.
+    ///
+    /// If no connection can be obtained, `connect_or_fail` has already dealt
+    /// with the batches and nothing is sent.
+    async fn send_request<P: LookupProtocol>(
+        &self,
+        destination: ServerId,
+        batches_by_bucket: HashMap<TableBucket, LookupBatch<P::Value>>,
+    ) where
+        LookupQuery<P::Value>: Into<QueuedLookup>,
+    {
+        let mut batches_by_table = group_by_table(batches_by_bucket);
+        let Some(connection) = self
+            .connect_or_fail(destination, &mut batches_by_table)
+            .await
+        else {
+            return;
+        };
+
+        let mut in_flight = FuturesUnordered::new();
+        for (table_id, mut batches) in batches_by_table {
+            // Moves the keys out of each batch; the batches keep the lookups.
+            let mut keys_by_bucket = Vec::with_capacity(batches.len());
+            for batch in batches.iter_mut() {
+                keys_by_bucket.push(batch.keys_tuple());
+            }
+            let request = P::build_request(table_id, keys_by_bucket);
+            in_flight.push(self.send_single_table_lookup::<P>(
+                table_id,
+                destination,
+                connection.clone(),
+                request,
+                batches,
+            ));
+        }
+        while in_flight.next().await.is_some() {}
+    }
+
+    /// Resolves `destination` in the cluster snapshot and obtains a
+    /// connection to it.
+    ///
+    /// Returns `None` when the server is unknown or the connection attempt
+    /// fails; in both cases all batches have been passed to
+    /// `fail_all_batches` as a retriable error.
+    async fn connect_or_fail<T>(
+        &self,
+        destination: ServerId,
+        batches_by_table: &mut HashMap<TableId, Vec<LookupBatch<T>>>,
+    ) -> Option<ServerConnection>
+    where
+        LookupQuery<T>: Into<QueuedLookup>,
+    {
+        let cluster = self.metadata.get_cluster();
+        let Some(server) = cluster.get_tablet_server(destination) else {
+            let err_msg = format!("Server {} is not found in metadata cache", destination);
+            self.fail_all_batches(&err_msg, true, batches_by_table);
+            return None;
+        };
+        let tablet_server = server.clone();
+
+        let connected = self.metadata.get_connection(&tablet_server).await;
+        if let Err(e) = &connected {
+            let err_msg = format!("Failed to get connection to server {}: {}", destination, e);
+            self.fail_all_batches(&err_msg, true, batches_by_table);
+        }
+        connected.ok()
+    }
+
+    /// Applies `handle_batch_error` with the given message and retriability
+    /// to every batch of every table.
+    fn fail_all_batches<T>(
+        &self,
+        err_msg: &str,
+        is_retriable: bool,
+        batches_by_table: &mut HashMap<TableId, Vec<LookupBatch<T>>>,
+    ) where
+        LookupQuery<T>: Into<QueuedLookup>,
+    {
+        batches_by_table
+            .values_mut()
+            .flatten()
+            .for_each(|batch| self.handle_batch_error(err_msg, is_retriable, batch));
+    }
+
+    /// Sends one per-table request over `connection` and routes the outcome
+    /// to the batches it was built from.
+    ///
+    /// An in-flight permit is held from before the request is sent until this
+    /// function returns. A failed request is handed to `handle_batch_error`
+    /// for every batch, using the error's own retriability.
+    async fn send_single_table_lookup<P: LookupProtocol>(
+        &self,
+        table_id: TableId,
+        destination: ServerId,
+        connection: ServerConnection,
+        request: P::Request,
+        mut batches: Vec<LookupBatch<P::Value>>,
+    ) where
+        LookupQuery<P::Value>: Into<QueuedLookup>,
+    {
+        let Some(_permit) = self.acquire_inflight_permit(&mut batches).await else {
+            return;
+        };
+
+        let response = match connection.request(request).await {
+            Ok(response) => response,
+            Err(e) => {
+                let err_msg = format!("{} request failed: {}", P::OP_NAME, e);
+                let is_retriable = e.is_retriable();
+                for batch in &mut batches {
+                    self.handle_batch_error(&err_msg, is_retriable, batch);
+                }
+                return;
+            }
+        };
+
+        self.handle_response::<P>(table_id, destination, response, &mut batches);
+    }
+
+    /// Waits for a permit from the in-flight semaphore.
+    ///
+    /// If acquisition fails, every lookup in `batches` is completed with a
+    /// "Lookup sender shutdown" error and `None` is returned.
+    async fn acquire_inflight_permit<T>(
+        &self,
+        batches: &mut [LookupBatch<T>],
+    ) -> Option<OwnedSemaphorePermit> {
+        let acquired = Arc::clone(&self.inflight_semaphore)
+            .acquire_owned()
+            .await;
+
+        if acquired.is_err() {
+            error!("Semaphore closed during lookup");
+            for batch in batches.iter_mut() {
+                batch.complete_all_with_error("Lookup sender shutdown");
+            }
+        }
+        acquired.ok()
+    }
+
+    /// Matches each bucket entry of `response` to its batch and either
+    /// completes the batch with the returned values or routes the bucket's
+    /// error to `handle_batch_error`.
+    ///
+    /// Entries for buckets that were not requested are logged and skipped;
+    /// batches that received no entry at all are failed as retriable by
+    /// `fail_unprocessed_batches`.
+    fn handle_response<P: LookupProtocol>(
+        &self,
+        table_id: TableId,
+        destination: ServerId,
+        response: P::Response,
+        batches: &mut [LookupBatch<P::Value>],
+    ) where
+        LookupQuery<P::Value>: Into<QueuedLookup>,
+    {
+        let bucket_to_index = build_bucket_index(batches);
+        let mut processed = vec![false; batches.len()];
+
+        for bucket_resp in P::decode_buckets(response) {
+            let BucketResponse {
+                partition_id,
+                bucket_id,
+                error_code,
+                error_message,
+                values,
+            } = bucket_resp;
+            let table_bucket = TableBucket::new_with_partition(table_id, partition_id, bucket_id);
+
+            let idx = match bucket_to_index.get(&table_bucket) {
+                Some(&idx) => idx,
+                None => {
+                    error!(
+                        "Received {} response for unknown bucket {} from server {}",
+                        P::OP_NAME,
+                        table_bucket,
+                        destination
+                    );
+                    continue;
+                }
+            };
+            processed[idx] = true;
+            let batch = &mut batches[idx];
+
+            match extract_bucket_error(error_code, error_message, &table_bucket, P::OP_NAME) {
+                Some(err) => self.handle_batch_error(&err.message, err.is_retriable, batch),
+                None => batch.complete(values),
+            }
+        }
+
+        self.fail_unprocessed_batches(&processed, batches, destination, P::OP_NAME);
+    }
+
+    /// Fails, as retriable, every batch whose slot in `processed` is `false`,
+    /// i.e. every batch the server response contained no entry for.
+    fn fail_unprocessed_batches<T>(
+        &self,
+        processed: &[bool],
+        batches: &mut [LookupBatch<T>],
+        destination: ServerId,
+        op_name: &'static str,
+    ) where
+        LookupQuery<T>: Into<QueuedLookup>,
+    {
+        let missing = processed
+            .iter()
+            .enumerate()
+            .filter(|(_, was_processed)| !**was_processed)
+            .map(|(idx, _)| idx);
+
+        for idx in missing {
+            let batch = &mut batches[idx];
+            let err_msg = format!(
+                "Bucket {} {} response missing from server {}",
+                batch.table_bucket.bucket_id(),
+                op_name,
+                destination
+            );
+            self.handle_batch_error(&err_msg, true, batch);
+        }
+    }
+
+    /// Empties `batch`, retrying or failing each of its lookups.
+    ///
+    /// A lookup is re-enqueued (after bumping its retry counter) when the
+    /// error is retriable, the lookup has retries left and it is not already
+    /// done; otherwise it is completed with an `UnexpectedError` carrying
+    /// `error_msg`. One summary line per outcome is logged.
+    fn handle_batch_error<T>(&self, error_msg: &str, is_retriable: bool, batch: &mut LookupBatch<T>)
+    where
+        LookupQuery<T>: Into<QueuedLookup>,
+    {
+        let table_bucket = batch.table_bucket.clone();
+        let (mut retried, mut failed) = (0usize, 0usize);
+
+        for mut lookup in batch.lookups.drain(..) {
+            let can_retry =
+                is_retriable && lookup.retries() < self.max_retries && !lookup.is_done();
+            if !can_retry {
+                lookup.complete_with_error(Error::UnexpectedError {
+                    message: error_msg.to_string(),
+                    source: None,
+                });
+                failed += 1;
+                continue;
+            }
+
+            lookup.increment_retries();
+            self.re_enqueue_lookup(lookup.into());
+            retried += 1;
+        }
+
+        if retried > 0 {
+            warn!(
+                "Lookup error for bucket {}, retrying {} lookups: {}",
+                table_bucket, retried, error_msg
+            );
+        }
+        if failed > 0 {
+            warn!(
+                "Lookup failed for bucket {} ({} lookups): {}",
+                table_bucket, failed, error_msg
+            );
+        }
+    }
+
+    /// Puts `lookup` back on the re-enqueue channel.
+    ///
+    /// If the send fails, the lookup is recovered from the send error and
+    /// completed with an `UnexpectedError` so its caller is not left waiting.
+    fn re_enqueue_lookup(&self, lookup: QueuedLookup) {
+        let Err(send_err) = self.re_enqueue_tx.send(lookup) else {
+            return;
+        };
+
+        error!("Failed to re-enqueue lookup: {}", send_err);
+        let mut rejected = send_err.0;
+        rejected.complete_with_error(Error::UnexpectedError {
+            message: "Failed to re-enqueue lookup: channel closed".to_string(),
+            source: None,
+        });
+    }
+
+    /// Clears the `running` flag so the main loop in `run` stops at its next
+    /// check.
+    pub fn initiate_close(&mut self) {
+        self.running.store(false, Ordering::Release);
+    }
+
+    /// Like `initiate_close`, but also sets `force_close` so that `run` skips
+    /// its final pass over lookups still sitting in the queue.
+    #[allow(dead_code)]
+    pub fn force_close(&mut self) {
+        self.force_close.store(true, Ordering::Release);
+        self.initiate_close();
+    }
+}
+
+/// Regroups per-bucket batches by the table id of their bucket.
+///
+/// The bucket keys are dropped; each batch still carries its own
+/// `table_bucket`.
+fn group_by_table<T>(
+    batches_by_bucket: HashMap<TableBucket, LookupBatch<T>>,
+) -> HashMap<TableId, Vec<LookupBatch<T>>> {
+    batches_by_bucket.into_iter().fold(
+        HashMap::<TableId, Vec<LookupBatch<T>>>::new(),
+        |mut by_table, (bucket, batch)| {
+            by_table.entry(bucket.table_id()).or_default().push(batch);
+            by_table
+        },
+    )
+}
+
+/// Maps each batch's table bucket to the batch's position in `batches`.
+fn build_bucket_index<T>(batches: &[LookupBatch<T>]) -> HashMap<TableBucket, usize> {
+    let mut index = HashMap::with_capacity(batches.len());
+    for (position, batch) in batches.iter().enumerate() {
+        index.insert(batch.table_bucket.clone(), position);
+    }
+    index
+}
+
+/// Error reported by the server for a single bucket, as produced by
+/// `extract_bucket_error`.
+struct BucketError {
+    /// Formatted description including operation, bucket, code and message.
+    message: String,
+    /// Whether the underlying `FlussError` reports itself as retriable.
+    is_retriable: bool,
+}
+
+fn extract_bucket_error(
+    error_code: Option<i32>,
+    error_message: Option<String>,
+    table_bucket: &TableBucket,
+    op: &str,
+) -> Option<BucketError> {
+    let code = error_code?;
+    let fluss_error = FlussError::for_code(code);
+    if fluss_error == FlussError::None {
+        return None;
+    }
+    Some(BucketError {
+        message: format!(
+            "{} error for bucket {}: code={}, message={}",
+            op,
+            table_bucket,
+            code,
+            error_message.unwrap_or_default()
+        ),
+        is_retriable: fluss_error.is_retriable(),
+    })
+}
diff --git a/fluss-rust/crates/fluss/src/client/lookup/mod.rs b/fluss-rust/crates/fluss/src/client/lookup/mod.rs
new file mode 100644
index 0000000000..ac2446a9e4
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/client/lookup/mod.rs
@@ -0,0 +1,39 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Lookup client implementation with batching and queuing support.
+//!
+//! This module provides a high-throughput lookup client that batches multiple
+//! lookup operations together to reduce network round trips, achieving parity
+//! with the Java client implementation.
+//!
+//! # Example
+//!
+//! ```ignore
+//! let lookup_client = LookupClient::new(config, metadata);
+//! let future = lookup_client.lookup(table_path, table_bucket, key_bytes);
+//! let result = future.await?;
+//! ```
+
+mod lookup_client;
+mod lookup_query;
+mod lookup_queue;
+mod lookup_sender;
+
+pub use lookup_client::LookupClient;
+pub(crate) use lookup_query::{PrefixLookupQuery, PrimaryLookupQuery, QueuedLookup};
+pub(crate) use lookup_queue::LookupQueue;
diff --git a/fluss-rust/crates/fluss/src/client/metadata.rs b/fluss-rust/crates/fluss/src/client/metadata.rs
new file mode 100644
index 0000000000..1e3ee7fe1c
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/client/metadata.rs
@@ -0,0 +1,367 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::PartitionId;
+use crate::cluster::{Cluster, ServerNode, ServerType};
+use crate::error::{Error, FlussError, Result};
+use crate::metadata::{PhysicalTablePath, TableBucket, TablePath};
+use crate::proto::MetadataResponse;
+use crate::rpc::message::UpdateMetadataRequest;
+use crate::rpc::{RpcClient, ServerConnection};
+use log::info;
+use parking_lot::RwLock;
+use std::collections::HashSet;
+use std::net::{SocketAddr, ToSocketAddrs};
+use std::sync::Arc;
+use tokio::sync::watch;
+
+pub struct Metadata {
+    cluster: RwLock<Arc<Cluster>>, // current cluster snapshot; replaced wholesale on change
+    connections: Arc<RpcClient>, // source of connections to the bootstrap and cluster servers
+    bootstrap: Arc<str>, // bootstrap address, kept so the cluster can be re-initialized
+    cluster_version_tx: watch::Sender<u64>, // holds a counter bumped after every cluster change
+}
+
+impl Metadata {
+    /// Fetches the initial cluster view via `bootstrap` and wraps it in a new `Metadata`.
+    pub async fn new(bootstrap: &str, connections: Arc<RpcClient>) -> Result<Self> {
+        let initial = Self::init_cluster(bootstrap, Arc::clone(&connections)).await?;
+        Ok(Self {
+            cluster: RwLock::new(Arc::new(initial)),
+            connections,
+            bootstrap: Arc::from(bootstrap),
+            cluster_version_tx: watch::channel(0).0,
+        })
+    }
+
+    pub fn subscribe_cluster_changes(&self) -> watch::Receiver<u64> {
+        self.cluster_version_tx.subscribe() // new receiver on the cluster version counter
+    }
+
+    fn notify_cluster_changed(&self) {
+        self.cluster_version_tx
+            .send_modify(|v| *v = v.wrapping_add(1)); // bump the version; wraps on overflow
+    }
+
+    /// Resolves `boot_strap` (`host:port`) to a single socket address.
+    ///
+    /// The first IPv4 address in resolution order is preferred; if there is none, the first
+    /// resolved address (an IPv6 one) is used. A string that cannot be parsed/resolved, or
+    /// that resolves to no address at all, yields `Error::IllegalArgument`.
+    fn parse_bootstrap(boot_strap: &str) -> Result<SocketAddr> {
+        let resolved: Vec<SocketAddr> = match boot_strap.to_socket_addrs() {
+            Ok(addrs) => addrs.collect(),
+            Err(e) => {
+                return Err(Error::IllegalArgument {
+                    message: format!("Invalid bootstrap address '{boot_strap}': {e}"),
+                });
+            }
+        };
+
+        resolved
+            .iter()
+            .find(|addr| addr.is_ipv4())
+            .or_else(|| resolved.first())
+            .copied()
+            .ok_or_else(|| Error::IllegalArgument {
+                message: format!("Unable to resolve bootstrap address '{boot_strap}'"),
+            })
+    }
+
+    /// Resolves `boot_strap`, connects to it as a coordinator node with placeholder id `-1`,
+    /// and builds a fresh `Cluster` from the reply to an initial metadata request.
+    async fn init_cluster(boot_strap: &str, connections: Arc<RpcClient>) -> Result<Cluster> {
+        let addr = Self::parse_bootstrap(boot_strap)?;
+        let (host, port) = (addr.ip().to_string(), u32::from(addr.port()));
+        let bootstrap_node = ServerNode::new(-1, host, port, ServerType::CoordinatorServer);
+        let connection = connections.get_connection(&bootstrap_node).await?;
+
+        // The request names no table paths, no physical table paths and no partition ids.
+        let response = connection
+            .request(UpdateMetadataRequest::new(
+                &HashSet::default(),
+                &HashSet::new(),
+                vec![],
+            ))
+            .await?;
+        // `None`: no previous cluster is supplied to build upon.
+        Cluster::from_metadata_response(response, None)
+    }
+
+    pub(crate) async fn reinit_cluster(&self) -> Result<()> {
+        let fresh = Self::init_cluster(&self.bootstrap, Arc::clone(&self.connections)).await?;
+        *self.cluster.write() = Arc::new(fresh); // write guard is released with this statement
+        self.notify_cluster_changed();
+        Ok(())
+    }
+
+    /// Swaps in the cluster returned by `Cluster::invalidate_server(server_id, table_ids)`
+    /// and notifies subscribers once the write lock has been released.
+    pub fn invalidate_server(&self, server_id: &i32, table_ids: Vec<i64>) {
+        let mut guard = self.cluster.write();
+        *guard = Arc::new(guard.invalidate_server(server_id, table_ids));
+        drop(guard);
+        self.notify_cluster_changed();
+    }
+
+    /// Swaps in the cluster returned by `Cluster::invalidate_physical_table_meta` for the
+    /// given paths and notifies subscribers once the write lock has been released.
+    pub fn invalidate_physical_table_meta(
+        &self,
+        physical_tables_to_invalid: &HashSet<PhysicalTablePath>,
+    ) {
+        let mut guard = self.cluster.write();
+        let pruned = guard.invalidate_physical_table_meta(physical_tables_to_invalid);
+        *guard = Arc::new(pruned);
+        drop(guard);
+        self.notify_cluster_changed();
+    }
+
+    /// Builds a new cluster from `metadata_response`, handing the current snapshot to
+    /// `Cluster::from_metadata_response`, then swaps it in and notifies subscribers.
+    /// The cluster lock is not held while the new cluster is being built.
+    pub async fn update(&self, metadata_response: MetadataResponse) -> Result<()> {
+        let current = self.get_cluster();
+        let merged = Cluster::from_metadata_response(metadata_response, Some(&current))?;
+        // The write guard is a temporary, released before subscribers are notified.
+        *self.cluster.write() = Arc::new(merged);
+        self.notify_cluster_changed();
+        Ok(())
+    }
+
+    /// Refreshes cached metadata for the given tables and partitions.
+    ///
+    /// One available server is taken from the current cluster, an
+    /// `UpdateMetadataRequest` is sent to it, and the response is applied through
+    /// [`Self::update`], which also notifies cluster-change subscribers.
+    ///
+    /// If the current cluster lists no available server, the cluster is instead
+    /// re-initialized from the bootstrap address and the call returns right after,
+    /// without sending a request for the paths passed in.
+    ///
+    /// # Errors
+    /// Propagates connection, request and response-handling failures, as well as a
+    /// failed bootstrap re-initialization.
+    pub async fn update_tables_metadata(
+        &self,
+        table_paths: &HashSet<&TablePath>,
+        physical_table_paths: &HashSet<&Arc<PhysicalTablePath>>,
+        partition_ids: Vec<i64>,
+    ) -> Result<()> {
+        // The read guard is a temporary of this statement, so it is gone before any await.
+        let candidate = self.cluster.read().get_one_available_server().cloned();
+        let Some(server) = candidate else {
+            // No server to ask: rebuild the cluster view from the bootstrap address.
+            info!(
+                "No available tablet server to update metadata, attempting to re-initialize cluster using bootstrap server."
+            );
+            return self.reinit_cluster().await;
+        };
+
+        let connection = self.connections.get_connection(&server).await?;
+        let request = UpdateMetadataRequest::new(table_paths, physical_table_paths, partition_ids);
+        let response = connection.request(request).await?;
+        self.update(response).await
+    }
+
+    pub async fn update_table_metadata(&self, table_path: &TablePath) -> Result<()> {
+        self.update_tables_metadata(&HashSet::from([table_path]), &HashSet::new(), vec![])
+            .await // single-table refresh: no physical table paths, no partition ids
+    }
+
+    /// Refreshes metadata for a list of physical table paths. Paths that carry a
+    /// partition name are requested as physical (partition) paths; all others are
+    /// requested by their table path. No explicit partition ids are sent.
+    pub async fn update_physical_table_metadata(
+        &self,
+        physical_table_paths: &[Arc<PhysicalTablePath>],
+    ) -> Result<()> {
+        let mut table_paths = HashSet::new();
+        let mut partition_paths = HashSet::new();
+        for path in physical_table_paths {
+            if path.get_partition_name().is_some() {
+                partition_paths.insert(path);
+            } else {
+                table_paths.insert(path.get_table_path());
+            }
+        }
+
+        self.update_tables_metadata(&table_paths, &partition_paths, vec![])
+            .await
+    }
+
+    /// Refreshes metadata only for those `table_paths` missing from the cached cluster.
+    pub async fn check_and_update_table_metadata(&self, table_paths: &[TablePath]) -> Result<()> {
+        let cluster = self.get_cluster();
+        let missing: HashSet<&TablePath> = table_paths
+            .iter()
+            .filter(|path| cluster.opt_get_table(path).is_none())
+            .collect();
+        if missing.is_empty() {
+            return Ok(());
+        }
+        self.update_tables_metadata(&missing, &HashSet::new(), vec![])
+            .await
+    }
+
+    /// Looks up the partition id for `physical_table_path`, refreshing metadata at most
+    /// once on a cache miss. `Ok(None)` means the partition does not exist: a
+    /// `PartitionNotExists` API error from the refresh is turned into `None`, not an error.
+    pub async fn check_and_update_partition_metadata(
+        &self,
+        physical_table_path: &PhysicalTablePath,
+    ) -> Result<Option<PartitionId>> {
+        let cached = self.get_cluster().get_partition_id(physical_table_path);
+        if cached.is_some() {
+            return Ok(cached);
+        }
+
+        let requested = [Arc::new(physical_table_path.clone())];
+        if let Err(e) = self.update_physical_table_metadata(&requested).await {
+            // Only "partition does not exist" is swallowed; every other error propagates.
+            let missing = matches!(e.api_error(), Some(FlussError::PartitionNotExists));
+            return if missing { Ok(None) } else { Err(e) };
+        }
+        Ok(self.get_cluster().get_partition_id(physical_table_path))
+    }
+
+    /// Returns a connection to `server_node` obtained from the shared `RpcClient`.
+    pub async fn get_connection(&self, server_node: &ServerNode) -> Result<ServerConnection> {
+        Ok(self.connections.get_connection(server_node).await?)
+    }
+
+    /// Returns the current cluster snapshot as an `Arc` clone taken under the read lock.
+    pub fn get_cluster(&self) -> Arc<Cluster> {
+        Arc::clone(&self.cluster.read())
+    }
+
+    /// Maximum number of metadata refreshes `leader_for` performs on a cache miss.
+    const MAX_RETRY_TIMES: u8 = 3;
+
+    /// Returns the leader server of `table_bucket`.
+    ///
+    /// The cached cluster is consulted first. On a miss, metadata for `table_path`
+    /// (together with the bucket's partition id, when it has one) is refreshed and
+    /// the cache is consulted again, up to `MAX_RETRY_TIMES` times.
+    ///
+    /// # Returns
+    /// `Ok(Some(node))` as soon as a leader is known, or `Ok(None)` if none is
+    /// known after the last refresh.
+    ///
+    /// # Errors
+    /// A failed metadata refresh ends the retries and is returned to the caller.
+    pub async fn leader_for(
+        &self,
+        table_path: &TablePath,
+        table_bucket: &TableBucket,
+    ) -> Result<Option<ServerNode>> {
+        // Fast path: the leader is already in the cached cluster.
+        if let Some(leader) = self.get_leader_for(table_bucket) {
+            return Ok(Some(leader));
+        }
+
+        let table_paths = HashSet::from([table_path]);
+        for _ in 0..Self::MAX_RETRY_TIMES {
+            // A bucket without partition id contributes an empty id list.
+            let partition_ids: Vec<i64> = table_bucket.partition_id().into_iter().collect();
+            self.update_tables_metadata(&table_paths, &HashSet::new(), partition_ids)
+                .await?;
+
+            // Re-read the cache now that the refresh has been applied.
+            if let Some(leader) = self.get_leader_for(table_bucket) {
+                return Ok(Some(leader));
+            }
+        }
+
+        Ok(None)
+    }
+
+    /// Returns a clone of the cached leader for `table_bucket`, if the cluster knows one.
+    fn get_leader_for(&self, table_bucket: &TableBucket) -> Option<ServerNode> {
+        self.cluster.read().leader_for(table_bucket).cloned()
+    }
+}
+
+#[cfg(test)]
+impl Metadata {
+    /// Test-only constructor: prebuilt `cluster`, fresh `RpcClient`, empty bootstrap address.
+    pub(crate) fn new_for_test(cluster: Arc<Cluster>) -> Self {
+        Self {
+            cluster: RwLock::new(cluster),
+            connections: Arc::new(RpcClient::new()),
+            bootstrap: "".into(),
+            cluster_version_tx: watch::channel(0).0,
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::metadata::{TableBucket, TablePath};
+    use crate::test_utils::build_cluster_arc;
+
+    /// Returns the `db.tbl` path and a `Metadata` over `build_cluster_arc(&path, 1, 1)`.
+    fn single_table_metadata() -> (TablePath, Metadata) {
+        let table_path = TablePath::new("db".to_string(), "tbl".to_string());
+        let metadata = Metadata::new_for_test(build_cluster_arc(&table_path, 1, 1));
+        (table_path, metadata)
+    }
+
+    #[tokio::test]
+    async fn leader_for_returns_server() {
+        let (table_path, metadata) = single_table_metadata();
+        let bucket = TableBucket::new(1, 0);
+        // Presumably answered from the prebuilt cluster; no server exists to refresh from.
+        let leader = metadata.leader_for(&table_path, &bucket).await.unwrap();
+        assert_eq!(leader.expect("leader").id(), 1);
+    }
+
+    #[test]
+    fn invalidate_server_removes_leader() {
+        let (_table_path, metadata) = single_table_metadata();
+        metadata.invalidate_server(&1, vec![1]);
+        assert!(metadata.get_cluster().get_tablet_server(1).is_none());
+    }
+
+    #[test]
+    fn parse_bootstrap_variants() {
+        // accepted: IPv4 literal with port
+        let addr = Metadata::parse_bootstrap("127.0.0.1:8080").unwrap();
+        assert_eq!(addr.port(), 8080);
+
+        // accepted: hostname with port
+        let addr = Metadata::parse_bootstrap("localhost:9090").unwrap();
+        assert_eq!(addr.port(), 9090);
+
+        // accepted: bracketed IPv6 literal with port
+        let addr = Metadata::parse_bootstrap("[::1]:8080").unwrap();
+        assert_eq!(addr.port(), 8080);
+
+        // rejected: hostname without a port
+        assert!(Metadata::parse_bootstrap("localhost").is_err());
+
+        // rejected: port outside the valid range
+        assert!(Metadata::parse_bootstrap("localhost:99999").is_err());
+
+        // rejected: empty string
+        assert!(Metadata::parse_bootstrap("").is_err());
+
+        // rejected: text that is neither `host:port` nor an address
+        assert!(Metadata::parse_bootstrap("invalid_address").is_err());
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/client/mod.rs b/fluss-rust/crates/fluss/src/client/mod.rs
new file mode 100644
index 0000000000..f8027948ae
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/client/mod.rs
@@ -0,0 +1,34 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+mod admin;
+mod connection;
+mod credentials;
+pub mod lookup;
+mod metadata;
+mod schema_getter;
+mod table;
+mod write;
+
+pub use admin::*;
+pub use connection::*;
+pub use credentials::*;
+pub use lookup::LookupClient;
+pub use metadata::*;
+pub(crate) use schema_getter::ClientSchemaGetter;
+pub use table::*;
+pub use write::*;
diff --git a/fluss-rust/crates/fluss/src/client/schema_getter.rs b/fluss-rust/crates/fluss/src/client/schema_getter.rs
new file mode 100644
index 0000000000..4b643c0bec
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/client/schema_getter.rs
@@ -0,0 +1,76 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Per-table schema cache that lazily fetches missing schema versions
+//! from the coordinator. Used by the lookup path to decode rows that
+//! predate the table's current schema.
+
+use crate::client::admin::FlussAdmin;
+use crate::error::{Error, Result};
+use crate::metadata::{Schema, SchemaInfo, TablePath};
+use parking_lot::RwLock;
+use std::collections::HashMap;
+use std::sync::Arc;
+
+pub(crate) struct ClientSchemaGetter {
+    table_path: TablePath, // table whose schema versions are served
+    admin: Arc<FlussAdmin>, // used to fetch schema versions that are missing from `cache`
+    /// Schema id -> schema. Pre-seeded with the table's current schema so the dominant
+    /// case (every row written under the latest schema) needs zero RPCs.
+    cache: RwLock<HashMap<i32, Arc<Schema>>>,
+}
+
+impl ClientSchemaGetter {
+    /// Creates a getter for `table_path` whose cache initially holds only `latest`.
+    pub fn new(table_path: TablePath, admin: Arc<FlussAdmin>, latest: SchemaInfo) -> Self {
+        let (schema, schema_id) = latest.into_parts();
+        let seeded = HashMap::from([(schema_id, Arc::new(schema))]);
+        Self {
+            table_path,
+            admin,
+            cache: RwLock::new(seeded),
+        }
+    }
+
+    /// Returns the schema for `schema_id`, asking the admin client on a cache miss. A reply
+    /// carrying a different schema id is rejected with `Error::UnexpectedError` and not cached.
+    /// Concurrent misses for the same id are not deduplicated: one redundant RPC is accepted
+    /// in exchange for staying off `tokio::sync` machinery. Schemas are immutable per id, so
+    /// last-write-wins on the cache insert is correct.
+    pub async fn get_schema(&self, schema_id: i32) -> Result<Arc<Schema>> {
+        let cached = self.cache.read().get(&schema_id).cloned();
+        if let Some(hit) = cached {
+            return Ok(hit);
+        }
+        let fetched = self
+            .admin
+            .get_table_schema(&self.table_path, Some(schema_id))
+            .await?;
+        let (schema, fetched_id) = fetched.into_parts();
+        if fetched_id != schema_id {
+            return Err(Error::UnexpectedError {
+                message: format!(
+                    "Requested schema id {schema_id}, but server returned schema id {fetched_id}"
+                ),
+                source: None,
+            });
+        }
+        let shared = Arc::new(schema);
+        self.cache.write().insert(schema_id, Arc::clone(&shared));
+        Ok(shared)
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/client/table/append.rs b/fluss-rust/crates/fluss/src/client/table/append.rs
new file mode 100644
index 0000000000..562e8ea7e7
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/client/table/append.rs
@@ -0,0 +1,157 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::client::table::partition_getter::{PartitionGetter, get_physical_path};
+use crate::client::{WriteRecord, WriteResultFuture, WriterClient};
+use crate::error::Error::IllegalArgument;
+use crate::error::Result;
+use crate::metadata::{PhysicalTablePath, TableInfo, TablePath};
+use crate::row::{ColumnarRow, InternalRow};
+use arrow::array::RecordBatch;
+use std::sync::Arc;
+
+pub struct TableAppend {
+    table_path: Arc<TablePath>, // shared with every writer created from this handle
+    table_info: Arc<TableInfo>, // table metadata, shared with created writers as well
+    writer_client: Arc<WriterClient>, // client that created writers send their records to
+}
+
+impl TableAppend {
+    pub(super) fn new(
+        table_path: TablePath,
+        table_info: Arc<TableInfo>,
+        writer_client: Arc<WriterClient>,
+    ) -> Self {
+        Self {
+            table_path: Arc::new(table_path), // wrapped once so writers can share it cheaply
+            table_info,
+            writer_client,
+        }
+    }
+
+    /// Creates an [`AppendWriter`] that shares this handle's table path, table info and
+    /// writer client. Fails only if the partition getter cannot be built.
+    pub fn create_writer(&self) -> Result<AppendWriter> {
+        let info = &self.table_info;
+        // Only partitioned tables get a partition getter.
+        let partition_getter = info
+            .is_partitioned()
+            .then(|| PartitionGetter::new(info.row_type(), Arc::clone(info.get_partition_keys())))
+            .transpose()?;
+
+        Ok(AppendWriter {
+            table_path: Arc::clone(&self.table_path),
+            partition_getter,
+            writer_client: Arc::clone(&self.writer_client),
+            table_info: Arc::clone(info),
+        })
+    }
+}
+
+pub struct AppendWriter {
+    table_path: Arc<TablePath>, // table written to; base of every physical table path
+    partition_getter: Option<PartitionGetter>, // `Some` only when the table is partitioned
+    writer_client: Arc<WriterClient>, // receives the records built by this writer
+    table_info: Arc<TableInfo>, // supplies row type and schema id for each record
+}
+
+impl AppendWriter {
+    /// Returns `IllegalArgument` unless `row` has as many fields as the table's row type.
+    fn check_field_count<R: InternalRow>(&self, row: &R) -> Result<()> {
+        let expected = self.table_info.get_row_type().fields().len();
+        let actual = row.get_field_count();
+        if actual == expected {
+            return Ok(());
+        }
+        let message = format!(
+            "The field count of the row does not match the table schema. \
+             Expected: {}, Actual: {}",
+            expected, actual
+        );
+        Err(IllegalArgument { message })
+    }
+
+    /// Appends a single row to the table.
+    ///
+    /// The row's field count is checked against the table's row type, its physical
+    /// table path is resolved via `get_physical_path`, and the resulting record is
+    /// handed to the writer client. The call returns as soon as the write is queued,
+    /// enabling fire-and-forget semantics for efficient batching.
+    ///
+    /// # Arguments
+    /// * `row` - the row to append.
+    ///
+    /// # Returns
+    /// A [`WriteResultFuture`] that can be awaited to wait for server acknowledgment,
+    /// or dropped for fire-and-forget behavior (use `flush()` to ensure delivery).
+    ///
+    /// # Errors
+    /// Fails if the field count does not match, the physical path cannot be
+    /// resolved, or the writer client rejects the record.
+    pub fn append<R: InternalRow>(&self, row: &R) -> Result<WriteResultFuture> {
+        self.check_field_count(row)?;
+        let partition_getter = self.partition_getter.as_ref();
+        let physical_path = get_physical_path(&self.table_path, partition_getter, row)?;
+        let table_info = Arc::clone(&self.table_info);
+        let schema_id = table_info.schema_id;
+        let record = WriteRecord::for_append(table_info, Arc::new(physical_path), schema_id, row);
+        let handle = self.writer_client.send(&record)?;
+        Ok(WriteResultFuture::new(handle))
+    }
+
+    /// Appends an Arrow `RecordBatch` to the table as one write.
+    ///
+    /// This method returns a [`WriteResultFuture`] immediately after queueing the write,
+    /// enabling fire-and-forget semantics for efficient batching.
+    ///
+    /// For partitioned tables, the partition is derived from the **first row** of the batch.
+    /// Callers must ensure all rows in the batch belong to the same partition. An empty
+    /// batch, like any batch written without a partition getter, is addressed to the
+    /// table path itself.
+    ///
+    /// Unlike [`Self::append`], no field-count check is performed here.
+    ///
+    /// # Returns
+    /// A [`WriteResultFuture`] that can be awaited to wait for server acknowledgment,
+    /// or dropped for fire-and-forget behavior (use `flush()` to ensure delivery).
+    pub fn append_arrow_batch(&self, batch: RecordBatch) -> Result<WriteResultFuture> {
+        // Only a partitioned table with a non-empty batch has a first row to route by.
+        let physical_path = match self.partition_getter.as_ref() {
+            Some(getter) if batch.num_rows() > 0 => {
+                let batch_ref = Arc::new(batch.clone());
+                let row_type = Arc::new(self.table_info.row_type.clone());
+                let first_row = ColumnarRow::new(batch_ref, row_type, 0, None);
+                get_physical_path(&self.table_path, Some(getter), &first_row)?
+            }
+            // Otherwise the batch is addressed to the table path itself.
+            _ => PhysicalTablePath::of(Arc::clone(&self.table_path)),
+        };
+
+        let record = WriteRecord::for_append_record_batch(
+            Arc::clone(&self.table_info),
+            Arc::new(physical_path),
+            self.table_info.schema_id,
+            batch,
+        );
+        let handle = self.writer_client.send(&record)?;
+        Ok(WriteResultFuture::new(handle))
+    }
+
+    pub async fn flush(&self) -> Result<()> {
+        self.writer_client.flush().await // delegates to the shared `WriterClient`
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/client/table/batch_scanner.rs b/fluss-rust/crates/fluss/src/client/table/batch_scanner.rs
new file mode 100644
index 0000000000..cc0585f30e
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/client/table/batch_scanner.rs
@@ -0,0 +1,767 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Bounded batch scanner backed by a single `LimitScanRequest`, polled with
+//! `next_batch` until it returns `None` (like `RecordBatchLogReader`).
+//!
+//! The KV branch decodes a [`ValueRecordBatch`], decoding each record against
+//! its own schema id via [`FixedSchemaDecoder`] so older records are projected
+//! onto the current schema (the same path as lookup).
+
+use crate::client::ClientSchemaGetter;
+use crate::client::metadata::Metadata;
+use crate::error::{ApiError, Error, FlussError, Result};
+use crate::metadata::{KvFormat, RowType, Schema, TableBucket, TableInfo};
+use crate::proto::ErrorResponse;
+use crate::record::kv::{SCHEMA_ID_LENGTH, ValueRecordBatch};
+use crate::record::{
+    LogRecordsBatches, ReadContext as ArrowReadContext, RowAppendRecordBatchBuilder, ScanBatch,
+    to_arrow_schema,
+};
+use crate::row::FixedSchemaDecoder;
+use crate::rpc::RpcClient;
+use crate::rpc::message::LimitScanRequest;
+use arrow::array::RecordBatch;
+use arrow::compute::concat_batches;
+use arrow_schema::SchemaRef;
+use byteorder::{ByteOrder, LittleEndian};
+use bytes::Bytes;
+use std::collections::HashMap;
+use std::ops::Range;
+use std::sync::Arc;
+
+/// One-shot bounded scanner: issues a single `LimitScanRequest` and yields its result as
+/// one [`ScanBatch`]. Construction does no I/O; the request runs on the first
+/// [`next_batch`](Self::next_batch) call, which returns the batch once, then `None`.
+pub struct LimitBatchScanner {
+    bucket: TableBucket, // the bucket this scanner reads
+    /// Taken on the first `next_batch` to run the scan; `None` afterward.
+    pending: Option<PendingScan>,
+}
+
+/// Request inputs captured at creation, consumed by the first `next_batch`.
+struct PendingScan {
+    rpc_client: Arc<RpcClient>, // used to connect to the bucket leader
+    metadata: Arc<Metadata>, // used to resolve the bucket leader
+    table_info: TableInfo, // supplies table path, table id and the schema to decode into
+    schema_getter: Arc<ClientSchemaGetter>, // passed to the KV decode path
+    projected_fields: Option<Vec<usize>>, // optional column projection passed to the decoders
+    limit: i32, // row limit sent in the request and passed to the decoders
+}
+
+impl LimitBatchScanner {
+    /// Captures the scan inputs without doing any I/O; the request itself is sent by the
+    /// first `next_batch` call.
+    pub(super) fn new(
+        rpc_client: Arc<RpcClient>,
+        metadata: Arc<Metadata>,
+        table_info: TableInfo,
+        schema_getter: Arc<ClientSchemaGetter>,
+        projected_fields: Option<Vec<usize>>,
+        bucket: TableBucket,
+        limit: i32,
+    ) -> Self {
+        let pending = Some(PendingScan {
+            rpc_client,
+            metadata,
+            table_info,
+            schema_getter,
+            projected_fields,
+            limit,
+        });
+        Self { bucket, pending }
+    }
+
+    /// Runs the scan on the first call and yields its batch; every later call returns
+    /// `Ok(None)`. An error also leaves the scanner spent — create a new one to retry.
+    pub async fn next_batch(&mut self) -> Result<Option<ScanBatch>> {
+        match self.pending.take() {
+            Some(pending) => Ok(Some(run_limit_scan(&pending, &self.bucket).await?)),
+            None => Ok(None),
+        }
+    }
+
+    /// Polls `next_batch` until it yields `None`, returning the batches in arrival order.
+    pub async fn collect_all_batches(&mut self) -> Result<Vec<ScanBatch>> {
+        let mut collected = Vec::new();
+        while let Some(next) = self.next_batch().await? {
+            collected.push(next);
+        }
+        Ok(collected)
+    }
+
+    /// Returns the bucket this scanner was created for.
+    pub fn bucket(&self) -> &TableBucket {
+        &self.bucket
+    }
+}
+
+/// Runs the one-shot limit scan described by `pending` against `bucket`.
+///
+/// Resolves the bucket leader, sends a single `LimitScanRequest`, turns a response
+/// error code into `Error::FlussAPIError`, and decodes the records (log format for
+/// tables without a primary key, KV format otherwise) into one [`ScanBatch`].
+async fn run_limit_scan(pending: &PendingScan, bucket: &TableBucket) -> Result<ScanBatch> {
+    let table_info = &pending.table_info;
+    let maybe_leader = pending
+        .metadata
+        .leader_for(&table_info.table_path, bucket)
+        .await?;
+    let Some(leader) = maybe_leader else {
+        let message = format!("No leader found for table bucket: {bucket}");
+        return Err(Error::leader_not_available(message));
+    };
+    let connection = pending.rpc_client.get_connection(&leader).await?;
+
+    let request = LimitScanRequest::new(
+        table_info.table_id,
+        bucket.partition_id(),
+        bucket.bucket_id(),
+        pending.limit,
+    );
+    let response = connection.request(request).await?;
+
+    // A present error code other than `FlussError::None` means the server rejected the scan.
+    let failure_code = response
+        .error_code
+        .filter(|code| *code != FlussError::None.code());
+    if let Some(error_code) = failure_code {
+        let api_error: ApiError = ErrorResponse {
+            error_code,
+            error_message: response.error_message.clone(),
+        }
+        .into();
+        return Err(Error::FlussAPIError { api_error });
+    }
+
+    let raw = response.records.unwrap_or_default();
+    // `limit` is validated positive by `TableScan::limit`.
+    let limit = pending.limit.max(0) as usize;
+    let projected = pending.projected_fields.as_deref();
+
+    // Choose the payload format from table metadata, not the response's advisory
+    // `is_log_table` flag.
+    if !table_info.has_primary_key() {
+        let (batch, base_offset) = decode_log_batch(table_info, projected, raw, limit)?;
+        return Ok(ScanBatch::new(bucket.clone(), batch, base_offset));
+    }
+
+    // KV (primary-key) limit scan: no log offset, so base_offset is 0.
+    let schema_getter = &pending.schema_getter;
+    let batch = decode_kv_batch(table_info, schema_getter, projected, raw, limit).await?;
+    Ok(ScanBatch::new(bucket.clone(), batch, 0))
+}
+
+/// Decode the log payload into a single Arrow `RecordBatch`, concatenating any
+/// inner batches, and return it with the base log offset of the first inner batch
+/// (0 for an empty payload). The result is passed through `take_last_rows`: if more
+/// than `limit` rows are returned, the last `limit` are kept and `base_offset` is
+/// advanced by the number dropped.
+fn decode_log_batch(
+    table_info: &TableInfo,
+    projected_fields: Option<&[usize]>,
+    raw: Vec<u8>,
+    limit: usize,
+) -> Result<(RecordBatch, i64)> {
+    let row_type = Arc::new(table_info.get_row_type().clone());
+    let full_schema = to_arrow_schema(table_info.get_row_type())?;
+    // Build the read context and the schema of the decoded batches together; the
+    // projected schema is derived from `full_schema` instead of converting the row
+    // type to an Arrow schema a second time.
+    let (read_context, target_schema): (_, SchemaRef) = match projected_fields {
+        None => {
+            let context = ArrowReadContext::new(full_schema.clone(), row_type, false);
+            (context, full_schema)
+        }
+        Some(fields) => {
+            let context = ArrowReadContext::with_projection_pushdown(
+                full_schema.clone(),
+                row_type,
+                fields.to_vec(),
+                false,
+            )?;
+            (context, ArrowReadContext::project_schema(full_schema, fields)?)
+        }
+    };
+
+    if raw.is_empty() {
+        return Ok((RecordBatch::new_empty(target_schema), 0));
+    }
+
+    let mut batches: Vec<RecordBatch> = Vec::new();
+    let mut base_offset: Option<i64> = None;
+    for log_batch in LogRecordsBatches::new(raw) {
+        let log_batch = log_batch?;
+        if base_offset.is_none() {
+            base_offset = Some(log_batch.base_log_offset());
+        }
+        batches.push(log_batch.record_batch(&read_context)?);
+    }
+    let base_offset = base_offset.unwrap_or(0);
+
+    let merged = match batches.len() {
+        0 => RecordBatch::new_empty(target_schema),
+        1 => batches.remove(0),
+        _ => concat_batches(&target_schema, &batches).map_err(|e| Error::UnexpectedError {
+            message: format!("Failed to concatenate log record batches: {e}"),
+            source: None,
+        })?,
+    };
+    Ok(take_last_rows(merged, base_offset, limit))
+}
+
+/// Turn a KV limit-scan [`ValueRecordBatch`] payload into one Arrow
+/// `RecordBatch`. Each record is decoded with the decoder matching the schema
+/// id it carries, and the rows are shaped to the table's current schema.
+async fn decode_kv_batch(
+    table_info: &TableInfo,
+    schema_getter: &ClientSchemaGetter,
+    projected_fields: Option<&[usize]>,
+    raw: Vec<u8>,
+    limit: usize,
+) -> Result<RecordBatch> {
+    if raw.is_empty() {
+        // Nothing to decode: hand back an empty batch with the projected schema.
+        return empty_record_batch(table_info.get_row_type(), projected_fields);
+    }
+
+    let kv_format = table_info.table_config.get_kv_format()?;
+    let current_schema = table_info.get_schema();
+    let current_schema_id = match i16::try_from(table_info.get_schema_id()) {
+        Ok(id) => id,
+        Err(_) => {
+            return Err(Error::UnexpectedError {
+                message: format!(
+                    "Schema id {} does not fit in 16 bits — wire format violated",
+                    table_info.get_schema_id()
+                ),
+                source: None,
+            });
+        }
+    };
+
+    let value_batch = ValueRecordBatch::new(Bytes::from(raw));
+    let ranges = value_batch.value_ranges()?;
+
+    // Gather each schema id once, in order of first appearance, so that a
+    // single decoder per id can be prepared before any row is decoded.
+    let mut distinct_ids: Vec<i16> = Vec::new();
+    for span in &ranges {
+        let schema_id = read_schema_id(&value_batch.data()[span.clone()])?;
+        if distinct_ids.iter().all(|known| *known != schema_id) {
+            distinct_ids.push(schema_id);
+        }
+    }
+
+    let decoders = build_kv_decoders(
+        schema_getter,
+        current_schema,
+        current_schema_id,
+        kv_format,
+        &distinct_ids,
+    )
+    .await?;
+
+    value_records_to_record_batch(
+        &value_batch,
+        &ranges,
+        &decoders,
+        table_info.get_row_type(),
+        projected_fields,
+        limit,
+    )
+}
+
+/// Prepare a [`FixedSchemaDecoder`] for every distinct id in `schema_ids`.
+/// Records written under `target_schema_id` are decoded without projection;
+/// for any other id the schema is obtained from `schema_getter` and a decoder
+/// projecting it onto `target_schema` is built.
+async fn build_kv_decoders(
+    schema_getter: &ClientSchemaGetter,
+    target_schema: &Schema,
+    target_schema_id: i16,
+    kv_format: KvFormat,
+    schema_ids: &[i16],
+) -> Result<HashMap<i16, FixedSchemaDecoder>> {
+    let mut by_schema_id = HashMap::with_capacity(schema_ids.len());
+    for schema_id in schema_ids.iter().copied() {
+        // Repeated ids in the input only need one decoder.
+        if by_schema_id.contains_key(&schema_id) {
+            continue;
+        }
+        let decoder = if schema_id != target_schema_id {
+            let older = schema_getter.get_schema(schema_id as i32).await?;
+            FixedSchemaDecoder::new(kv_format, older.as_ref(), target_schema)?
+        } else {
+            FixedSchemaDecoder::new_no_projection(kv_format, target_schema)?
+        };
+        by_schema_id.insert(schema_id, decoder);
+    }
+    Ok(by_schema_id)
+}
+
+/// Decode each value record in `ranges` into a row of `target_row_type`,
+/// assemble the rows into one Arrow batch, trim it to the last `limit` rows and
+/// finally apply the column projection.
+fn value_records_to_record_batch(
+    batch: &ValueRecordBatch,
+    ranges: &[Range<usize>],
+    decoders: &HashMap<i16, FixedSchemaDecoder>,
+    target_row_type: &RowType,
+    projected_fields: Option<&[usize]>,
+    limit: usize,
+) -> Result<RecordBatch> {
+    let mut rows = RowAppendRecordBatchBuilder::new(target_row_type)?;
+    for span in ranges {
+        let payload = &batch.data()[span.clone()];
+        let schema_id = read_schema_id(payload)?;
+        // Every id present in the batch must have had a decoder prepared.
+        let Some(decoder) = decoders.get(&schema_id) else {
+            return Err(Error::UnexpectedError {
+                message: format!("No decoder built for schema id {schema_id}"),
+                source: None,
+            });
+        };
+        let decoded = decoder.decode(payload)?;
+        rows.append(&decoded)?;
+    }
+
+    let built = rows.build_arrow_record_batch()?;
+    // The offset component is irrelevant here, so a dummy 0 is passed in.
+    let (limited, _) = take_last_rows(Arc::unwrap_or_clone(built), 0, limit);
+    project_batch(limited, target_row_type, projected_fields)
+}
+
+/// Extract the schema id stored little-endian at the front of a
+/// `[schema_id | row]` payload. Fails when the payload is shorter than the id
+/// or when the id is negative.
+fn read_schema_id(payload: &[u8]) -> Result<i16> {
+    let Some(id_bytes) = payload.get(..SCHEMA_ID_LENGTH) else {
+        return Err(Error::UnexpectedError {
+            message: format!(
+                "Value record payload too short: {} bytes, need {} for schema id",
+                payload.len(),
+                SCHEMA_ID_LENGTH
+            ),
+            source: None,
+        });
+    };
+
+    match LittleEndian::read_i16(id_bytes) {
+        schema_id if schema_id < 0 => Err(Error::UnexpectedError {
+            message: format!("Invalid negative schema id {schema_id}; payload is corrupt"),
+            source: None,
+        }),
+        schema_id => Ok(schema_id),
+    }
+}
+
+/// Trim `batch` to its final `limit` rows. When leading rows are dropped,
+/// `base_offset` moves forward by that many; a batch that already fits within
+/// the limit is handed back untouched together with the original offset.
+fn take_last_rows(batch: RecordBatch, base_offset: i64, limit: usize) -> (RecordBatch, i64) {
+    match batch.num_rows().checked_sub(limit) {
+        // Strictly more rows than the limit: skip the surplus at the front.
+        Some(dropped) if dropped > 0 => {
+            (batch.slice(dropped, limit), base_offset + dropped as i64)
+        }
+        _ => (batch, base_offset),
+    }
+}
+
+/// Build a zero-row `RecordBatch` for `target_row_type`, narrowed to
+/// `projected_fields` when a projection is requested.
+fn empty_record_batch(
+    target_row_type: &RowType,
+    projected_fields: Option<&[usize]>,
+) -> Result<RecordBatch> {
+    let full_schema = to_arrow_schema(target_row_type)?;
+    project_batch(
+        RecordBatch::new_empty(full_schema),
+        target_row_type,
+        projected_fields,
+    )
+}
+
+/// Narrow `batch`, whose columns follow `target_row_type`, to the columns
+/// listed in `projected_fields`. Without a projection the batch is returned
+/// unchanged.
+fn project_batch(
+    batch: RecordBatch,
+    target_row_type: &RowType,
+    projected_fields: Option<&[usize]>,
+) -> Result<RecordBatch> {
+    let Some(fields) = projected_fields else {
+        return Ok(batch);
+    };
+
+    let projected_schema =
+        ArrowReadContext::project_schema(to_arrow_schema(target_row_type)?, fields)?;
+    // Pick the requested columns in the order the caller listed them.
+    let mut columns = Vec::with_capacity(fields.len());
+    for &idx in fields {
+        columns.push(batch.column(idx).clone());
+    }
+    Ok(RecordBatch::try_new(projected_schema, columns)?)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::client::WriteRecord;
+    use crate::compression::{
+        ArrowCompressionInfo, ArrowCompressionRatioEstimator, ArrowCompressionType,
+        DEFAULT_NON_ZSTD_COMPRESSION_LEVEL,
+    };
+    use crate::metadata::{
+        Column, DataField, DataType, DataTypes, PhysicalTablePath, Schema, TableDescriptor,
+        TableInfo, TablePath,
+    };
+    use crate::record::MemoryLogRecordsArrowBuilder;
+    use crate::row::GenericRow;
+    use crate::row::binary::BinaryWriter;
+    use crate::row::compacted::CompactedRowWriter;
+    use arrow::array::{Array, Int32Array, Int64Array};
+
+    /// `TableInfo` for table `db.tbl` whose row type is `(id: int, name: string)`.
+    fn build_two_col_table_info() -> TableInfo {
+        let fields = vec![
+            DataField::new("id", DataTypes::int(), None),
+            DataField::new("name", DataTypes::string(), None),
+        ];
+        let row_type = DataTypes::row(fields);
+        let schema = Schema::builder()
+            .with_row_type(&row_type)
+            .build()
+            .expect("schema build");
+        let descriptor = TableDescriptor::builder()
+            .schema(schema)
+            .distributed_by(Some(1), vec![])
+            .build()
+            .expect("descriptor build");
+        let path = TablePath::new("db".to_string(), "tbl".to_string());
+        TableInfo::of(path, 42, 1, descriptor, 0, 0)
+    }
+
+    /// Encode `rows` as one log batch for `table_info`, then overwrite the
+    /// leading bytes of the encoded buffer with `base_offset` (little-endian).
+    fn build_log_records(
+        table_info: &TableInfo,
+        base_offset: i64,
+        rows: &[(i32, &str)],
+    ) -> Vec<u8> {
+        let shared_info = Arc::new(table_info.clone());
+        let physical_path = Arc::new(PhysicalTablePath::of(Arc::new(
+            table_info.table_path.clone(),
+        )));
+        let compression = ArrowCompressionInfo {
+            compression_type: ArrowCompressionType::None,
+            compression_level: DEFAULT_NON_ZSTD_COMPRESSION_LEVEL,
+        };
+        let mut builder = MemoryLogRecordsArrowBuilder::new(
+            1,
+            table_info.get_row_type(),
+            false,
+            compression,
+            usize::MAX,
+            Arc::new(ArrowCompressionRatioEstimator::default()),
+        )
+        .expect("builder");
+
+        for (index, (id, name)) in rows.iter().enumerate() {
+            let mut row = GenericRow::new(2);
+            row.set_field(0, *id);
+            row.set_field(1, *name);
+            let record = WriteRecord::for_append(
+                Arc::clone(&shared_info),
+                physical_path.clone(),
+                (index + 1) as i32,
+                &row,
+            );
+            builder.append(&record).expect("append");
+        }
+
+        let mut encoded = builder.build().expect("build log batch");
+        // The builder always emits base_log_offset=0. Patch it here so tests can
+        // check that whatever offset the server returned is propagated as-is.
+        let offset_bytes = base_offset.to_le_bytes();
+        encoded[..offset_bytes.len()].copy_from_slice(&offset_bytes);
+        encoded
+    }
+
+    // ---- log path ----------------------------------------------------------
+
+    #[test]
+    fn decode_log_batch_empty_returns_empty_record_batch() {
+        // An empty payload decodes to zero rows over the full two-column schema.
+        let info = build_two_col_table_info();
+        let decoded = decode_log_batch(&info, None, Vec::new(), usize::MAX);
+        let (batch, base_offset) = decoded.expect("decode empty");
+        assert_eq!(batch.num_rows(), 0);
+        assert_eq!(batch.num_columns(), 2);
+        assert_eq!(base_offset, 0);
+    }
+
+    #[test]
+    fn decode_log_batch_empty_with_projection() {
+        // Even with no data, the returned schema must honour the projection.
+        let info = build_two_col_table_info();
+        let projection = [1usize];
+        let decoded = decode_log_batch(&info, Some(&projection), Vec::new(), usize::MAX);
+        let (batch, base_offset) = decoded.expect("decode empty");
+        assert_eq!(batch.num_rows(), 0);
+        assert_eq!(batch.num_columns(), 1);
+        assert_eq!(batch.schema().field(0).name(), "name");
+        assert_eq!(base_offset, 0);
+    }
+
+    #[test]
+    fn decode_log_batch_extracts_base_offset_and_rows() {
+        // Three rows encoded at base offset 17 come back intact with that offset.
+        let info = build_two_col_table_info();
+        let rows = [(1, "alice"), (2, "bob"), (3, "carol")];
+        let payload = build_log_records(&info, 17, &rows);
+
+        let decoded = decode_log_batch(&info, None, payload, usize::MAX);
+        let (batch, base_offset) = decoded.expect("decode populated");
+        assert_eq!(batch.num_rows(), 3);
+        assert_eq!(batch.num_columns(), 2);
+        assert_eq!(base_offset, 17);
+    }
+
+    #[test]
+    fn decode_log_batch_projection_keeps_requested_columns() {
+        // Projecting onto column 0 leaves only `id` in the decoded batch.
+        let info = build_two_col_table_info();
+        let payload = build_log_records(&info, 0, &[(7, "x"), (8, "y")]);
+        let projection = [0usize];
+
+        let decoded = decode_log_batch(&info, Some(&projection), payload, usize::MAX);
+        let (batch, _) = decoded.expect("decode projected");
+        assert_eq!(batch.num_rows(), 2);
+        assert_eq!(batch.num_columns(), 1);
+        assert_eq!(batch.schema().field(0).name(), "id");
+    }
+
+    #[test]
+    fn decode_log_batch_truncates_to_last_limit_rows() {
+        let info = build_two_col_table_info();
+        // The payload holds 4 rows starting at offset 100, while the limit is 2.
+        let rows = [(1, "a"), (2, "b"), (3, "c"), (4, "d")];
+        let payload = build_log_records(&info, 100, &rows);
+
+        let (batch, base_offset) = decode_log_batch(&info, None, payload, 2).expect("decode");
+        assert_eq!(batch.num_rows(), 2);
+        // Two leading rows were dropped, so the base offset moves from 100 to 102.
+        assert_eq!(base_offset, 102);
+
+        let id_column = batch.column(0);
+        let ids = id_column.as_any().downcast_ref::<Int32Array>().unwrap();
+        assert_eq!(ids.value(0), 3);
+        assert_eq!(ids.value(1), 4);
+    }
+
+    // ---- KV path -----------------------------------------------------------
+
+    /// Build a `Schema` from `(column id, name, data type)` triples.
+    fn schema_with_ids(columns: &[(i32, &str, DataType)]) -> Schema {
+        let mut cols: Vec<Column> = Vec::with_capacity(columns.len());
+        for (id, name, data_type) in columns {
+            cols.push(Column::new(*name, data_type.clone()).with_id(*id));
+        }
+        Schema::builder().with_columns(cols).build().unwrap()
+    }
+
+    /// Frame `(schema_id, compacted-row-bytes)` pairs as a value-record batch,
+    /// following the Java `DefaultValueRecordBatch` wire layout.
+    fn value_batch(records: &[(i16, Vec<u8>)]) -> ValueRecordBatch {
+        // Each record: i32 length, then the i16 schema id, then the row bytes.
+        let mut records_bytes = Vec::new();
+        for (schema_id, row) in records {
+            let record_len = (SCHEMA_ID_LENGTH + row.len()) as i32;
+            records_bytes.extend_from_slice(&record_len.to_le_bytes());
+            records_bytes.extend_from_slice(&schema_id.to_le_bytes());
+            records_bytes.extend_from_slice(row);
+        }
+
+        let length_field = (1 + 4 + records_bytes.len()) as i32;
+        let record_count = records.len() as i32;
+
+        let mut framed = Vec::new();
+        framed.extend_from_slice(&length_field.to_le_bytes()); // Length
+        framed.push(0); // Magic
+        framed.extend_from_slice(&record_count.to_le_bytes()); // RecordCount
+        framed.extend_from_slice(&records_bytes);
+        ValueRecordBatch::new(Bytes::from(framed))
+    }
+
+    /// Run `write` against a fresh `CompactedRowWriter` with `field_count`
+    /// fields and return the bytes it produced.
+    fn compacted(field_count: usize, write: impl FnOnce(&mut CompactedRowWriter)) -> Vec<u8> {
+        let mut writer = CompactedRowWriter::new(field_count);
+        write(&mut writer);
+        let encoded = writer.to_bytes();
+        encoded.as_ref().to_vec()
+    }
+
+    /// Two-column schema: `id: int` (column id 0) and `name: string` (column id 1).
+    fn id_name_schema() -> Schema {
+        let columns = [
+            (0, "id", DataTypes::int()),
+            (1, "name", DataTypes::string()),
+        ];
+        schema_with_ids(&columns)
+    }
+
+    #[test]
+    fn value_records_empty_returns_empty_batch() {
+        // A batch without records decodes to zero rows even with no decoders.
+        let schema = id_name_schema();
+        let empty = value_batch(&[]);
+        let ranges = empty.value_ranges().unwrap();
+        let no_decoders = HashMap::new();
+
+        let decoded = value_records_to_record_batch(
+            &empty,
+            &ranges,
+            &no_decoders,
+            schema.row_type(),
+            None,
+            usize::MAX,
+        );
+        let rb = decoded.expect("decode empty kv");
+        assert_eq!(rb.num_rows(), 0);
+        assert_eq!(rb.num_columns(), 2);
+    }
+
+    #[test]
+    fn empty_kv_payload_returns_empty_batch() {
+        let schema = id_name_schema();
+
+        // Without a projection both columns are present.
+        let full = empty_record_batch(schema.row_type(), None).expect("empty");
+        assert_eq!(full.num_rows(), 0);
+        assert_eq!(full.num_columns(), 2);
+
+        // With a projection onto column 1 only `name` remains.
+        let projection = [1usize];
+        let projected =
+            empty_record_batch(schema.row_type(), Some(&projection)).expect("empty projected");
+        assert_eq!(projected.num_rows(), 0);
+        assert_eq!(projected.num_columns(), 1);
+        assert_eq!(projected.schema().field(0).name(), "name");
+    }
+
+    #[test]
+    fn value_records_decode_rows() {
+        // Two records under schema id 0 decode into two rows, in order.
+        let schema = id_name_schema();
+        let mut decoders = HashMap::new();
+        decoders.insert(
+            0i16,
+            FixedSchemaDecoder::new_no_projection(KvFormat::COMPACTED, &schema).unwrap(),
+        );
+
+        let alice = compacted(2, |w| {
+            w.write_int(1);
+            w.write_string("alice");
+        });
+        let bob = compacted(2, |w| {
+            w.write_int(2);
+            w.write_string("bob");
+        });
+        let batch = value_batch(&[(0, alice), (0, bob)]);
+        let ranges = batch.value_ranges().unwrap();
+
+        let decoded = value_records_to_record_batch(
+            &batch,
+            &ranges,
+            &decoders,
+            schema.row_type(),
+            None,
+            usize::MAX,
+        );
+        let rb = decoded.expect("decode kv rows");
+        assert_eq!(rb.num_rows(), 2);
+        let ids = rb.column(0).as_any().downcast_ref::<Int32Array>().unwrap();
+        assert_eq!(ids.value(0), 1);
+        assert_eq!(ids.value(1), 2);
+    }
+
+    #[test]
+    fn value_records_limit_keeps_last_rows() {
+        let schema = id_name_schema();
+        let mut decoders = HashMap::new();
+        decoders.insert(
+            0i16,
+            FixedSchemaDecoder::new_no_projection(KvFormat::COMPACTED, &schema).unwrap(),
+        );
+
+        // Five records with ids 1..=5, all under schema id 0.
+        let mut records: Vec<(i16, Vec<u8>)> = Vec::new();
+        for value in 1..=5 {
+            let row = compacted(2, |w| {
+                w.write_int(value);
+                w.write_string("x");
+            });
+            records.push((0i16, row));
+        }
+        let batch = value_batch(&records);
+        let ranges = batch.value_ranges().unwrap();
+
+        let decoded =
+            value_records_to_record_batch(&batch, &ranges, &decoders, schema.row_type(), None, 3);
+        let rb = decoded.expect("decode kv rows");
+        assert_eq!(rb.num_rows(), 3);
+        let ids = rb.column(0).as_any().downcast_ref::<Int32Array>().unwrap();
+        // With limit 3, only the trailing 3 of [1,2,3,4,5] survive.
+        assert_eq!(ids.values(), &[3, 4, 5]);
+    }
+
+    #[test]
+    fn value_records_projection_keeps_requested_columns() {
+        // Projecting onto column 1 leaves only `name` in the output batch.
+        let schema = id_name_schema();
+        let mut decoders = HashMap::new();
+        decoders.insert(
+            0i16,
+            FixedSchemaDecoder::new_no_projection(KvFormat::COMPACTED, &schema).unwrap(),
+        );
+
+        let nine = compacted(2, |w| {
+            w.write_int(9);
+            w.write_string("nine");
+        });
+        let batch = value_batch(&[(0, nine)]);
+        let ranges = batch.value_ranges().unwrap();
+        let projection = [1usize];
+
+        let decoded = value_records_to_record_batch(
+            &batch,
+            &ranges,
+            &decoders,
+            schema.row_type(),
+            Some(&projection),
+            usize::MAX,
+        );
+        let rb = decoded.expect("decode projected kv");
+        assert_eq!(rb.num_columns(), 1);
+        assert_eq!(rb.schema().field(0).name(), "name");
+    }
+
+    #[test]
+    fn value_records_decode_across_schema_evolution() {
+        // The older schema is [id, name]; the current one appends `age`.
+        let old_schema = id_name_schema();
+        let current_schema = schema_with_ids(&[
+            (0, "id", DataTypes::int()),
+            (1, "name", DataTypes::string()),
+            (2, "age", DataTypes::bigint()),
+        ]);
+
+        // Schema id 0: written under the old schema, projected onto the current one.
+        let old_decoder =
+            FixedSchemaDecoder::new(KvFormat::COMPACTED, &old_schema, &current_schema).unwrap();
+        // Schema id 1: already in the current schema, no projection needed.
+        let current_decoder =
+            FixedSchemaDecoder::new_no_projection(KvFormat::COMPACTED, &current_schema).unwrap();
+        let mut decoders = HashMap::new();
+        decoders.insert(0i16, old_decoder);
+        decoders.insert(1i16, current_decoder);
+
+        let old_row = compacted(2, |w| {
+            w.write_int(1);
+            w.write_string("alice");
+        });
+        let new_row = compacted(3, |w| {
+            w.write_int(2);
+            w.write_string("bob");
+            w.write_long(30);
+        });
+        let batch = value_batch(&[(0, old_row), (1, new_row)]);
+        let ranges = batch.value_ranges().unwrap();
+
+        let decoded = value_records_to_record_batch(
+            &batch,
+            &ranges,
+            &decoders,
+            current_schema.row_type(),
+            None,
+            usize::MAX,
+        );
+        let rb = decoded.expect("decode mixed-schema kv");
+
+        assert_eq!(rb.num_rows(), 2);
+        assert_eq!(rb.num_columns(), 3);
+        let age = rb.column(2).as_any().downcast_ref::<Int64Array>().unwrap();
+        // The old record has no `age` value and reads as null; the new one holds 30.
+        assert!(age.is_null(0), "old-schema record must read age as null");
+        assert_eq!(age.value(1), 30);
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/client/table/log_fetch_buffer.rs b/fluss-rust/crates/fluss/src/client/table/log_fetch_buffer.rs
new file mode 100644
index 0000000000..9d45abad29
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/client/table/log_fetch_buffer.rs
@@ -0,0 +1,947 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::RecordBatch;
+use parking_lot::Mutex;
+
+use crate::client::table::remote_log::{
+    PrefetchPermit, RemoteLogDownloadFuture, RemoteLogFile, RemoteLogSegment,
+};
+use crate::error::{ApiError, Error, Result};
+use crate::metadata::TableBucket;
+use crate::record::{
+    LogRecordBatch, LogRecordIterator, LogRecordsBatches, ReadContext, ScanRecord,
+};
+use std::{
+    collections::{HashMap, VecDeque},
+    sync::{
+        Arc,
+        atomic::{AtomicBool, Ordering},
+    },
+    time::{Duration, Instant},
+};
+use tokio::sync::Notify;
+
+/// Classification attached to a failed fetch through [`FetchErrorContext::action`].
+///
+/// NOTE(review): the handling tied to each variant lives in the code that
+/// consumes this enum, which is not part of this section — confirm there.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub(crate) enum FetchErrorAction {
+    Ignore,
+    LogOffsetOutOfRange,
+    Authorization,
+    CorruptMessage,
+    Unexpected,
+}
+
+/// Log severity recorded alongside a fetch error in
+/// [`FetchErrorContext::log_level`].
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub(crate) enum FetchErrorLogLevel {
+    Debug,
+    Warn,
+}
+
+/// Bundles the action, log level and log message that accompany an API error
+/// on a completed fetch (see `LogFetchBuffer::add_api_error`).
+#[derive(Clone, Debug)]
+pub(crate) struct FetchErrorContext {
+    /// Classification of the error.
+    pub(crate) action: FetchErrorAction,
+    /// Severity associated with `log_message`.
+    pub(crate) log_level: FetchErrorLogLevel,
+    /// Message text carried with the error.
+    pub(crate) log_message: String,
+}
+
+/// Represents a completed fetch that can be consumed
+///
+/// Implementations must be `Send + Sync`; `LogFetchBuffer` stores them as
+/// boxed trait objects inside mutex-guarded queues.
+pub trait CompletedFetch: Send + Sync {
+    /// Bucket this fetch belongs to.
+    fn table_bucket(&self) -> &TableBucket;
+    /// API error attached to this fetch, if any.
+    fn api_error(&self) -> Option<&ApiError>;
+    /// Context describing how an attached API error is classified and logged, if any.
+    fn fetch_error_context(&self) -> Option<&FetchErrorContext>;
+    /// Remove and return the stored error, if any.
+    fn take_error(&mut self) -> Option<Error>;
+    /// Read up to `max_records` scan records from this fetch.
+    fn fetch_records(&mut self, max_records: usize) -> Result<Vec<ScanRecord>>;
+    /// Read up to `max_batches` Arrow batches, each paired with an `i64`
+    /// (presumably the batch's base offset — TODO confirm against implementors).
+    fn fetch_batches(&mut self, max_batches: usize) -> Result<Vec<(RecordBatch, i64)>>;
+    /// Whether this fetch has been marked consumed.
+    fn is_consumed(&self) -> bool;
+    /// Number of records read from this fetch so far.
+    fn records_read(&self) -> usize;
+    /// NOTE(review): presumably discards any remaining data and marks the
+    /// fetch consumed — confirm against implementors.
+    fn drain(&mut self);
+    /// Size of this fetch in bytes.
+    fn size_in_bytes(&self) -> usize;
+    /// High watermark reported with this fetch.
+    fn high_watermark(&self) -> i64;
+    /// Whether `set_initialized` has been called on this fetch.
+    fn is_initialized(&self) -> bool;
+    /// Mark this fetch as initialized.
+    fn set_initialized(&mut self);
+    /// Offset from which the next fetch should start.
+    fn next_fetch_offset(&self) -> i64;
+}
+
+/// Represents a pending fetch that is waiting to be completed
+///
+/// `LogFetchBuffer` keeps these in a per-bucket queue and converts them in
+/// queue order once the front entry reports completion.
+pub trait PendingFetch: Send + Sync {
+    /// Bucket this pending fetch belongs to.
+    fn table_bucket(&self) -> &TableBucket;
+    /// Whether the fetch is ready to be converted with `to_completed_fetch`.
+    fn is_completed(&self) -> bool;
+    /// Consume the pending fetch and produce its completed form, or the error
+    /// that prevented completion.
+    fn to_completed_fetch(self: Box<Self>) -> Result<Box<dyn CompletedFetch>>;
+}
+
+/// Thread-safe buffer for completed fetches
+///
+/// Each collection sits behind its own mutex; waiters are signalled through
+/// `not_empty_notify` when a completed fetch is queued.
+pub struct LogFetchBuffer {
+    /// Cloned into the error fetches this buffer synthesizes.
+    read_context: ReadContext,
+    /// FIFO queue of fetches ready to be handed out by `poll`.
+    completed_fetches: Mutex<VecDeque<Box<dyn CompletedFetch>>>,
+    /// Per-bucket queues of fetches that have not been converted yet.
+    pending_fetches: Mutex<HashMap<TableBucket, VecDeque<Box<dyn PendingFetch>>>>,
+    /// Single slot holding the fetch set by `set_next_in_line_fetch`.
+    next_in_line_fetch: Mutex<Option<Box<dyn CompletedFetch>>>,
+    /// Notified whenever a completed fetch is queued or `wakeup` is called.
+    not_empty_notify: Notify,
+    /// Set by `wakeup`; cleared by the waiter that observes it.
+    woken_up: Arc<AtomicBool>,
+}
+
+impl LogFetchBuffer {
+    pub fn new(read_context: ReadContext) -> Self {
+        Self {
+            read_context,
+            completed_fetches: Mutex::new(VecDeque::new()),
+            pending_fetches: Mutex::new(HashMap::new()),
+            next_in_line_fetch: Mutex::new(None),
+            not_empty_notify: Notify::new(),
+            woken_up: Arc::new(AtomicBool::new(false)),
+        }
+    }
+
+    /// Report whether the completed-fetch queue currently holds no entries.
+    /// Pending fetches and the next-in-line slot are not considered.
+    pub fn is_empty(&self) -> bool {
+        let completed = self.completed_fetches.lock();
+        completed.is_empty()
+    }
+
+    /// Wait until a completed fetch is buffered, a wakeup is requested, or
+    /// `timeout` elapses.
+    ///
+    /// Returns `Ok(true)` once the buffer is non-empty and `Ok(false)` on
+    /// timeout. Returns `Err(Error::WakeupError)` when the wakeup flag was set;
+    /// the flag is cleared as it is observed.
+    pub async fn await_not_empty(&self, timeout: Duration) -> Result<bool> {
+        let deadline = Instant::now() + timeout;
+
+        loop {
+            // Register interest *before* inspecting any state. A `Notified`
+            // future receives `notify_waiters()` wakeups from the moment it is
+            // created, so a producer that queues a fetch and notifies between
+            // the checks below and the `select!` is no longer missed. Creating
+            // it after the checks could stall the caller until the timeout.
+            let notified = self.not_empty_notify.notified();
+
+            // Check if buffer is not empty
+            if !self.is_empty() {
+                return Ok(true);
+            }
+
+            // Check if woken up
+            if self.woken_up.swap(false, Ordering::Acquire) {
+                return Err(Error::WakeupError {
+                    message: "The await operation was interrupted by wakeup.".to_string(),
+                });
+            }
+
+            // Check if timeout
+            let now = Instant::now();
+            if now >= deadline {
+                return Ok(false);
+            }
+
+            // Wait for notification with remaining time
+            let remaining = deadline - now;
+            tokio::select! {
+                _ = tokio::time::sleep(remaining) => {
+                    return Ok(false); // Timeout
+                }
+                _ = notified => {
+                    // Got notification, re-evaluate the conditions above
+                    continue;
+                }
+            }
+        }
+    }
+
+    #[allow(dead_code)]
+    /// Interrupt tasks blocked in `await_not_empty`.
+    ///
+    /// Sets the wakeup flag and then notifies current waiters; a waiter that
+    /// observes the flag returns `Error::WakeupError` and clears it.
+    pub fn wakeup(&self) {
+        // Publish the flag before notifying so a woken waiter sees it on re-check.
+        self.woken_up.store(true, Ordering::Release);
+        self.not_empty_notify.notify_waiters();
+    }
+
+    /// Queue a completed fetch that carries `api_error` (with its
+    /// `fetch_error_context`) for `table_bucket`, then notify waiters.
+    pub(crate) fn add_api_error(
+        &self,
+        table_bucket: TableBucket,
+        api_error: ApiError,
+        fetch_error_context: FetchErrorContext,
+        fetch_offset: i64,
+    ) {
+        let failed: Box<dyn CompletedFetch> = Box::new(DefaultCompletedFetch::from_api_error(
+            table_bucket,
+            api_error,
+            fetch_error_context,
+            fetch_offset,
+            self.read_context.clone(),
+        ));
+        {
+            // Release the queue lock before signalling waiters.
+            let mut queue = self.completed_fetches.lock();
+            queue.push_back(failed);
+        }
+        self.not_empty_notify.notify_waiters();
+    }
+
+    /// Append `pending_fetch` to the pending queue of its bucket, creating the
+    /// queue if the bucket has none yet.
+    pub fn pend(&self, pending_fetch: Box<dyn PendingFetch>) {
+        let bucket = pending_fetch.table_bucket().clone();
+        let mut pending_map = self.pending_fetches.lock();
+        let queue = pending_map.entry(bucket).or_default();
+        queue.push_back(pending_fetch);
+    }
+
+    /// Convert the leading run of completed pending fetches for `table_bucket`
+    /// into completed fetches, preserving their order. Conversion stops at the
+    /// first entry that is not completed yet, or at the first conversion
+    /// failure, which is queued as an error fetch with offset `-1`.
+    pub fn try_complete(&self, table_bucket: &TableBucket) {
+        // Results are gathered under the pending lock and moved to the
+        // completed queue only after it is released, so the two locks are
+        // never held together here.
+        let mut ready: Vec<Box<dyn CompletedFetch>> = Vec::new();
+        let mut failure: Option<Error> = None;
+        let mut progressed = false;
+        {
+            let mut pending_map = self.pending_fetches.lock();
+            if let Some(queue) = pending_map.get_mut(table_bucket) {
+                while queue.front().is_some_and(|head| head.is_completed()) {
+                    let Some(head) = queue.pop_front() else {
+                        break;
+                    };
+                    progressed = true;
+                    match head.to_completed_fetch() {
+                        Ok(fetch) => ready.push(fetch),
+                        Err(err) => {
+                            failure = Some(err);
+                            break;
+                        }
+                    }
+                }
+                // Drop the bucket entry once its queue has been fully drained.
+                if progressed && queue.is_empty() {
+                    pending_map.remove(table_bucket);
+                }
+            }
+        }
+
+        if let Some(err) = failure {
+            ready.push(Box::new(DefaultCompletedFetch::from_error(
+                table_bucket.clone(),
+                err,
+                -1,
+                self.read_context.clone(),
+            )));
+        }
+
+        if !ready.is_empty() {
+            self.completed_fetches.lock().extend(ready);
+        }
+
+        if progressed {
+            // Signal that buffer is not empty
+            self.not_empty_notify.notify_waiters();
+        }
+    }
+
+    /// Add a completed fetch to the buffer
+    ///
+    /// If the fetch's bucket still has queued pending fetches, the completed
+    /// fetch is wrapped in a `CompletedPendingFetch` and appended behind them.
+    /// In that case it is neither moved to the completed queue nor announced to
+    /// waiters here. Otherwise it goes straight onto the completed queue and
+    /// waiters are notified.
+    pub fn add(&self, completed_fetch: Box<dyn CompletedFetch>) {
+        let table_bucket = completed_fetch.table_bucket();
+        // This guard stays alive until the function returns, including across
+        // the push onto `completed_fetches` below.
+        let mut pending_map = self.pending_fetches.lock();
+
+        if let Some(pendings) = pending_map.get_mut(table_bucket)
+            && !pendings.is_empty()
+        {
+            pendings.push_back(Box::new(CompletedPendingFetch::new(completed_fetch)));
+            return;
+        }
+        // If there's no pending fetch for this table_bucket,
+        // directly add to completed_fetches
+        self.completed_fetches.lock().push_back(completed_fetch);
+        self.not_empty_notify.notify_waiters();
+    }
+
+    /// Remove and return the oldest completed fetch, or `None` when the
+    /// completed queue is empty.
+    pub fn poll(&self) -> Option<Box<dyn CompletedFetch>> {
+        let mut queue = self.completed_fetches.lock();
+        queue.pop_front()
+    }
+
+    /// Take the fetch out of the next-in-line slot, leaving the slot empty.
+    pub fn next_in_line_fetch(&self) -> Option<Box<dyn CompletedFetch>> {
+        let mut slot = self.next_in_line_fetch.lock();
+        slot.take()
+    }
+
+    /// Replace the content of the next-in-line slot with `fetch`; passing
+    /// `None` clears it.
+    pub fn set_next_in_line_fetch(&self, fetch: Option<Box<dyn CompletedFetch>>) {
+        let mut slot = self.next_in_line_fetch.lock();
+        *slot = fetch;
+    }
+
+    /// List the buckets that currently have buffered data: the next-in-line
+    /// fetch (unless consumed), every completed fetch, and every bucket with a
+    /// pending queue. A bucket may appear more than once.
+    pub fn buffered_buckets(&self) -> Vec<TableBucket> {
+        let mut buckets = Vec::new();
+
+        // Each lock is taken in its own scope so that no two are held together,
+        // which rules out lock-order inversion.
+        {
+            let slot = self.next_in_line_fetch.lock();
+            match slot.as_ref() {
+                Some(fetch) if !fetch.is_consumed() => {
+                    buckets.push(fetch.table_bucket().clone());
+                }
+                _ => {}
+            }
+        }
+
+        {
+            let completed = self.completed_fetches.lock();
+            buckets.extend(completed.iter().map(|fetch| fetch.table_bucket().clone()));
+        }
+
+        {
+            let pending = self.pending_fetches.lock();
+            buckets.extend(pending.keys().cloned());
+        }
+
+        buckets
+    }
+}
+
+/// A wrapper that makes a completed fetch look like a pending fetch
+///
+/// `LogFetchBuffer::add` uses it to queue an already-completed fetch behind
+/// the pending fetches of the same bucket.
+struct CompletedPendingFetch {
+    /// The wrapped fetch, returned unchanged by `to_completed_fetch`.
+    completed_fetch: Box<dyn CompletedFetch>,
+}
+
+impl CompletedPendingFetch {
+    /// Wrap `completed_fetch` so it can sit in a pending queue.
+    fn new(completed_fetch: Box<dyn CompletedFetch>) -> Self {
+        Self { completed_fetch }
+    }
+}
+
+impl PendingFetch for CompletedPendingFetch {
+    /// Bucket of the wrapped completed fetch.
+    fn table_bucket(&self) -> &TableBucket {
+        self.completed_fetch.table_bucket()
+    }
+
+    /// Always `true`: the wrapped fetch is already complete.
+    fn is_completed(&self) -> bool {
+        true
+    }
+
+    /// Unwrap and return the completed fetch; never fails.
+    fn to_completed_fetch(self: Box<Self>) -> Result<Box<dyn CompletedFetch>> {
+        Ok(self.completed_fetch)
+    }
+}
+
+/// Default implementation of CompletedFetch for in-memory log records
+/// Used for local fetches from tablet server
+pub struct DefaultCompletedFetch {
+    /// Bucket the fetched data belongs to.
+    table_bucket: TableBucket,
+    /// `None` for fetches built by `new` and `from_error`.
+    api_error: Option<ApiError>,
+    /// `None` for fetches built by `new` and `from_error`.
+    fetch_error_context: Option<FetchErrorContext>,
+    /// Error supplied to `from_error`; `None` for fetches built by `new`.
+    error: Option<Error>,
+    /// Source of log record batches; built from an empty buffer by `from_error`.
+    log_record_batch: LogRecordsBatches,
+    /// Read context supplied at construction.
+    read_context: ReadContext,
+    /// Starts at the fetch offset supplied at construction.
+    next_fetch_offset: i64,
+    /// High watermark supplied to `new`; `-1` for fetches built by `from_error`.
+    high_watermark: i64,
+    /// Size supplied to `new`; `0` for fetches built by `from_error`.
+    size_in_bytes: usize,
+    /// Starts as `false`.
+    consumed: bool,
+    /// Starts as `false`.
+    initialized: bool,
+    /// Starts at `0`.
+    records_read: usize,
+    /// Starts as `None`; presumably the iterator over the batch currently
+    /// being read — TODO confirm against the read path.
+    current_record_iterator: Option<LogRecordIterator>,
+    /// Starts as `None`; presumably the batch currently being read — TODO confirm.
+    current_record_batch: Option<LogRecordBatch>,
+    /// Starts as `None`.
+    last_record: Option<ScanRecord>,
+    /// Starts as `None`.
+    cached_record_error: Option<String>,
+    /// Starts as `false`.
+    corrupt_last_record: bool,
+}
+
+impl DefaultCompletedFetch {
+    /// Build a fetch over successfully received log data.
+    ///
+    /// `fetch_offset` seeds `next_fetch_offset`. No error is attached and all
+    /// iteration state starts out empty: nothing consumed, nothing read, not
+    /// initialized.
+    pub fn new(
+        table_bucket: TableBucket,
+        log_record_batch: LogRecordsBatches,
+        size_in_bytes: usize,
+        read_context: ReadContext,
+        fetch_offset: i64,
+        high_watermark: i64,
+    ) -> Self {
+        Self {
+            // Caller-supplied description of the fetch.
+            table_bucket,
+            log_record_batch,
+            read_context,
+            size_in_bytes,
+            high_watermark,
+            next_fetch_offset: fetch_offset,
+            // A regular fetch carries no failure.
+            error: None,
+            api_error: None,
+            fetch_error_context: None,
+            cached_record_error: None,
+            corrupt_last_record: false,
+            // Iteration state starts from scratch.
+            consumed: false,
+            initialized: false,
+            records_read: 0,
+            current_record_iterator: None,
+            current_record_batch: None,
+            last_record: None,
+        }
+    }
+
+    pub(crate) fn from_error(
+        table_bucket: TableBucket,
+        error: Error,
+        fetch_offset: i64,
+        read_context: ReadContext,
+    ) -> Self {
+        Self {
+            table_bucket,
+            api_error: None,
+            fetch_error_context: None,
+            error: Some(error),
+            log_record_batch: LogRecordsBatches::new(Vec::new()),
+            read_context,
+            next_fetch_offset: fetch_offset,
+            high_watermark: -1,
+            size_in_bytes: 0,
+            consumed: false,
+            initialized: false,
+            records_read: 0,
+            current_record_iterator: None,
+            current_record_batch: None,
+            last_record: None,
+            cached_record_error: None,
+            corrupt_last_record: false,
+        }
+    }
+
+    pub(crate) fn from_api_error(
+        table_bucket: TableBucket,
+        api_error: ApiError,
+        fetch_error_context: FetchErrorContext,
+        fetch_offset: i64,
+        read_context: ReadContext,
+    ) -> Self {
+        Self {
+            table_bucket,
+            api_error: Some(api_error),
+            fetch_error_context: Some(fetch_error_context),
+            error: None,
+            log_record_batch: LogRecordsBatches::new(Vec::new()),
+            read_context,
+            next_fetch_offset: fetch_offset,
+            high_watermark: -1,
+            size_in_bytes: 0,
+            consumed: false,
+            initialized: false,
+            records_read: 0,
+            current_record_iterator: None,
+            current_record_batch: None,
+            last_record: None,
+            cached_record_error: None,
+            corrupt_last_record: false,
+        }
+    }
+
+    /// Produce the next record whose offset is at or beyond `next_fetch_offset`.
+    ///
+    /// Records are taken from the current batch iterator; when it is exhausted
+    /// (or not yet set) the next batch is loaded. Once no batch is left,
+    /// `next_fetch_offset` is moved to the last batch's `next_log_offset()`,
+    /// the fetch is drained and `Ok(None)` is returned. Errors from reading or
+    /// decoding a batch are propagated.
+    fn next_fetched_record(&mut self) -> Result<Option<ScanRecord>> {
+        loop {
+            let candidate = match self.current_record_iterator.as_mut() {
+                Some(records) => records.next(),
+                None => None,
+            };
+
+            if let Some(record) = candidate {
+                // Skip records located before the requested offset.
+                if record.offset() < self.next_fetch_offset {
+                    continue;
+                }
+                return Ok(Some(record));
+            }
+
+            match self.log_record_batch.next() {
+                Some(batch_result) => {
+                    let batch = batch_result?;
+                    let records = batch.records(&self.read_context)?;
+                    self.current_record_iterator = Some(records);
+                    self.current_record_batch = Some(batch);
+                }
+                None => {
+                    // Nothing left: remember where the last batch ended.
+                    if let Some(last_batch) = self.current_record_batch.take() {
+                        self.next_fetch_offset = last_batch.next_log_offset();
+                    }
+                    self.drain();
+                    return Ok(None);
+                }
+            }
+        }
+    }
+
+    fn fetch_error(&self) -> Error {
+        let mut message = format!(
+            "Received exception when fetching the next record from {table_bucket}. If needed, please back to past the record to continue scanning.",
+            table_bucket = self.table_bucket
+        );
+        if let Some(cause) = self.cached_record_error.as_deref() {
+            message.push_str(&format!(" Cause: {cause}"));
+        }
+        Error::UnexpectedError {
+            message,
+            source: None,
+        }
+    }
+    /// Produce the next non-empty Arrow batch together with its base offset.
+    ///
+    /// Returns `(RecordBatch, base_offset)`, where `base_offset` is the offset
+    /// of the first row in the returned batch. Empty batches and batches that
+    /// lie entirely before `next_fetch_offset` are skipped; a batch that starts
+    /// before `next_fetch_offset` is sliced so it begins there. When no batch is
+    /// left the fetch is drained and `Ok(None)` is returned.
+    fn next_fetched_batch(&mut self) -> Result<Option<(RecordBatch, i64)>> {
+        while let Some(log_batch_result) = self.log_record_batch.next() {
+            let log_batch = log_batch_result?;
+            let decoded = log_batch.record_batch(&self.read_context)?;
+
+            let total_rows = decoded.num_rows();
+            if total_rows == 0 {
+                // Nothing to hand out from an empty batch.
+                continue;
+            }
+
+            let batch_start = log_batch.base_log_offset();
+            let (record_batch, effective_base_offset) = if self.next_fetch_offset > batch_start {
+                let rows_to_skip = (self.next_fetch_offset - batch_start) as usize;
+                if rows_to_skip >= total_rows {
+                    // The whole batch precedes the requested offset.
+                    continue;
+                }
+                // Drop the leading rows that precede the requested offset.
+                (
+                    decoded.slice(rows_to_skip, total_rows - rows_to_skip),
+                    self.next_fetch_offset,
+                )
+            } else {
+                (decoded, batch_start)
+            };
+
+            self.next_fetch_offset = log_batch.next_log_offset();
+            self.records_read += record_batch.num_rows();
+            return Ok(Some((record_batch, effective_base_offset)));
+        }
+
+        self.drain();
+        Ok(None)
+    }
+}
+
+impl CompletedFetch for DefaultCompletedFetch {
+    /// Bucket this fetch belongs to.
+    fn table_bucket(&self) -> &TableBucket {
+        &self.table_bucket
+    }
+
+    /// API error attached to this fetch, if any.
+    fn api_error(&self) -> Option<&ApiError> {
+        self.api_error.as_ref()
+    }
+
+    /// Context stored together with the API error, if any.
+    fn fetch_error_context(&self) -> Option<&FetchErrorContext> {
+        self.fetch_error_context.as_ref()
+    }
+
+    /// Remove and return the stored error, leaving `None` behind.
+    fn take_error(&mut self) -> Option<Error> {
+        self.error.take()
+    }
+
+    /// Return up to `max_records` records starting at `next_fetch_offset`.
+    ///
+    /// Failure handling, in order: a stored `error` is returned once; a stored
+    /// API error is returned as `Error::FlussAPIError` on every call; a record
+    /// failure remembered from an earlier call is returned as `fetch_error()`.
+    /// A consumed fetch yields an empty vector. If reading a record fails after
+    /// some records were already collected, those records are returned and the
+    /// failure is reported by the next call.
+    fn fetch_records(&mut self, max_records: usize) -> Result<Vec<ScanRecord>> {
+        if let Some(error) = self.error.take() {
+            return Err(error);
+        }
+
+        if let Some(ApiError { code, message }) = self.api_error.as_ref() {
+            return Err(Error::FlussAPIError {
+                api_error: ApiError {
+                    code: *code,
+                    message: message.clone(),
+                },
+            });
+        }
+
+        if self.corrupt_last_record {
+            return Err(self.fetch_error());
+        }
+
+        if self.consumed {
+            return Ok(Vec::new());
+        }
+
+        let mut collected = Vec::new();
+
+        for _ in 0..max_records {
+            if self.cached_record_error.is_none() {
+                // Assume the worst until the read succeeds.
+                self.corrupt_last_record = true;
+                match self.next_fetched_record() {
+                    Ok(next) => {
+                        self.corrupt_last_record = false;
+                        self.last_record = next;
+                    }
+                    Err(e) => self.cached_record_error = Some(e.to_string()),
+                }
+            }
+
+            match self.last_record.take() {
+                Some(record) => {
+                    self.next_fetch_offset = record.offset() + 1;
+                    self.records_read += 1;
+                    collected.push(record);
+                }
+                // Either the data ran out or the read failed.
+                None => break,
+            }
+        }
+
+        if collected.is_empty() && self.cached_record_error.is_some() {
+            return Err(self.fetch_error());
+        }
+
+        Ok(collected)
+    }
+
+    /// Return up to `max_batches` Arrow batches, each paired with its base offset.
+    ///
+    /// A stored `error` is returned once; a stored API error is returned as
+    /// `Error::FlussAPIError` on every call. A consumed fetch yields an empty
+    /// vector. An error while reading a batch is propagated immediately.
+    fn fetch_batches(&mut self, max_batches: usize) -> Result<Vec<(RecordBatch, i64)>> {
+        if let Some(error) = self.error.take() {
+            return Err(error);
+        }
+
+        if let Some(ApiError { code, message }) = self.api_error.as_ref() {
+            return Err(Error::FlussAPIError {
+                api_error: ApiError {
+                    code: *code,
+                    message: message.clone(),
+                },
+            });
+        }
+
+        if self.consumed {
+            return Ok(Vec::new());
+        }
+
+        let mut batches = Vec::with_capacity(max_batches.min(16));
+
+        while batches.len() < max_batches {
+            let Some(batch_with_offset) = self.next_fetched_batch()? else {
+                break;
+            };
+            batches.push(batch_with_offset);
+        }
+
+        Ok(batches)
+    }
+
+    /// Whether `drain` has marked this fetch as consumed.
+    fn is_consumed(&self) -> bool {
+        self.consumed
+    }
+
+    /// Number of records (or rows) handed out so far.
+    fn records_read(&self) -> usize {
+        self.records_read
+    }
+
+    /// Mark the fetch as consumed and clear every stored error and staged record.
+    fn drain(&mut self) {
+        // Drop all failure state.
+        self.error = None;
+        self.api_error = None;
+        self.fetch_error_context = None;
+        self.cached_record_error = None;
+        self.corrupt_last_record = false;
+        // Drop the staged record and flag the fetch as finished.
+        self.last_record = None;
+        self.consumed = true;
+    }
+
+    /// Size in bytes recorded for this fetch.
+    fn size_in_bytes(&self) -> usize {
+        self.size_in_bytes
+    }
+
+    /// High watermark recorded for this fetch.
+    fn high_watermark(&self) -> i64 {
+        self.high_watermark
+    }
+
+    /// Whether `set_initialized` has been called.
+    fn is_initialized(&self) -> bool {
+        self.initialized
+    }
+
+    /// Flag the fetch as initialized.
+    fn set_initialized(&mut self) {
+        self.initialized = true;
+    }
+
+    /// Offset from which the next record or batch will be returned.
+    fn next_fetch_offset(&self) -> i64 {
+        self.next_fetch_offset
+    }
+}
+
+/// Completed fetch for remote log segments
+/// Matches Java's RemoteCompletedFetch design - separate class for remote vs local
+/// Holds RAII permit until consumed (data is in inner)
+pub struct RemoteCompletedFetch {
+    /// The fetch that holds the data; every `CompletedFetch` call is forwarded to it.
+    inner: DefaultCompletedFetch,
+    /// Prefetch permit; `Some` until `drain` takes and drops it.
+    permit: Option<PrefetchPermit>,
+}
+
+impl RemoteCompletedFetch {
+    /// Pair `inner` with the `permit` that is held for it.
+    pub fn new(inner: DefaultCompletedFetch, permit: PrefetchPermit) -> Self {
+        Self {
+            inner,
+            permit: Some(permit),
+        }
+    }
+}
+
+// Every method forwards to `inner`; only `drain` adds behavior by also
+// releasing the permit.
+impl CompletedFetch for RemoteCompletedFetch {
+    /// Forwards to the inner fetch.
+    fn table_bucket(&self) -> &TableBucket {
+        self.inner.table_bucket()
+    }
+
+    /// Forwards to the inner fetch.
+    fn api_error(&self) -> Option<&ApiError> {
+        self.inner.api_error()
+    }
+
+    /// Forwards to the inner fetch.
+    fn fetch_error_context(&self) -> Option<&FetchErrorContext> {
+        self.inner.fetch_error_context()
+    }
+
+    /// Forwards to the inner fetch.
+    fn take_error(&mut self) -> Option<Error> {
+        self.inner.take_error()
+    }
+
+    /// Forwards to the inner fetch.
+    fn fetch_records(&mut self, max_records: usize) -> Result<Vec<ScanRecord>> {
+        self.inner.fetch_records(max_records)
+    }
+
+    /// Forwards to the inner fetch.
+    fn fetch_batches(&mut self, max_batches: usize) -> Result<Vec<(RecordBatch, i64)>> {
+        self.inner.fetch_batches(max_batches)
+    }
+
+    /// Forwards to the inner fetch.
+    fn is_consumed(&self) -> bool {
+        self.inner.is_consumed()
+    }
+
+    /// Forwards to the inner fetch.
+    fn records_read(&self) -> usize {
+        self.inner.records_read()
+    }
+
+    /// Drain the inner fetch, then drop the permit right away.
+    fn drain(&mut self) {
+        self.inner.drain();
+        // Release permit immediately (don't wait for struct drop)
+        // Critical: allows prefetch to continue even if Box<dyn CompletedFetch> kept around
+        self.permit.take(); // drops permit here, triggers recycle notification
+    }
+
+    /// Forwards to the inner fetch.
+    fn size_in_bytes(&self) -> usize {
+        self.inner.size_in_bytes()
+    }
+
+    /// Forwards to the inner fetch.
+    fn high_watermark(&self) -> i64 {
+        self.inner.high_watermark()
+    }
+
+    /// Forwards to the inner fetch.
+    fn is_initialized(&self) -> bool {
+        self.inner.is_initialized()
+    }
+
+    /// Forwards to the inner fetch.
+    fn set_initialized(&mut self) {
+        self.inner.set_initialized()
+    }
+
+    /// Forwards to the inner fetch.
+    fn next_fetch_offset(&self) -> i64 {
+        self.inner.next_fetch_offset()
+    }
+}
+// Permit released explicitly in drain() or automatically when struct drops
+
+/// Pending fetch that waits for remote log file to be downloaded
+pub struct RemotePendingFetch {
+    /// Remote segment being fetched; supplies the table bucket.
+    segment: RemoteLogSegment,
+    /// Handle to the download; polled via `is_done()` and consumed via
+    /// `take_remote_log_file()`.
+    download_future: RemoteLogDownloadFuture,
+    /// Position inside the downloaded file from which reading starts.
+    pos_in_log_segment: i32,
+    /// Passed to the completed fetch as its initial fetch offset.
+    fetch_offset: i64,
+    /// Passed through to the completed fetch.
+    high_watermark: i64,
+    /// Passed through to the completed fetch for decoding records.
+    read_context: ReadContext,
+}
+
+impl RemotePendingFetch {
+    /// Store the arguments unchanged; no I/O happens here.
+    pub fn new(
+        segment: RemoteLogSegment,
+        download_future: RemoteLogDownloadFuture,
+        pos_in_log_segment: i32,
+        fetch_offset: i64,
+        high_watermark: i64,
+        read_context: ReadContext,
+    ) -> Self {
+        Self {
+            segment,
+            download_future,
+            pos_in_log_segment,
+            fetch_offset,
+            high_watermark,
+            read_context,
+        }
+    }
+}
+
+impl PendingFetch for RemotePendingFetch {
+    /// Bucket of the remote segment being fetched.
+    fn table_bucket(&self) -> &TableBucket {
+        &self.segment.table_bucket
+    }
+
+    /// True once the download future reports that it is done.
+    fn is_completed(&self) -> bool {
+        self.download_future.is_done()
+    }
+
+    /// Turn the finished download into a `RemoteCompletedFetch`.
+    ///
+    /// Opens the downloaded file, builds file-backed record batches starting at
+    /// `pos_in_log_segment`, and wraps the resulting `DefaultCompletedFetch`
+    /// together with the download's permit.
+    ///
+    /// # Errors
+    /// Fails if the remote log file cannot be taken from the download future,
+    /// if the file cannot be opened or inspected, if the file-backed batches
+    /// cannot be created, or if a positive position is not smaller than the
+    /// file size.
+    fn to_completed_fetch(self: Box<Self>) -> Result<Box<dyn CompletedFetch>> {
+        // Take the RemoteLogFile and destructure
+        let remote_log_file = self.download_future.take_remote_log_file()?;
+        let RemoteLogFile {
+            file_path,
+            file_size: _,
+            permit,
+        } = remote_log_file;
+
+        // Open file for streaming (no memory allocation for entire file)
+        // The recorded `file_size` is ignored above; the size is re-read from
+        // the file's metadata instead.
+        let file = std::fs::File::open(&file_path)?;
+        let file_size = file.metadata()?.len() as usize;
+
+        // Create file-backed LogRecordsBatches with cleanup (streaming!)
+        // Data will be read batch-by-batch on-demand, not all at once
+        // FileSource will delete the file when dropped (after file is closed)
+        // NOTE(review): `pos_in_log_segment` is an `i32` converted with `as usize`;
+        // a negative value would wrap to a very large position here, and the
+        // bounds check below only runs afterwards and only for positive values —
+        // confirm callers never pass a negative position.
+        let log_record_batch =
+            LogRecordsBatches::from_file(file, self.pos_in_log_segment as usize, file_path)?;
+
+        // Calculate size based on position offset
+        let size_in_bytes = if self.pos_in_log_segment > 0 {
+            let pos = self.pos_in_log_segment as usize;
+            if pos >= file_size {
+                return Err(Error::UnexpectedError {
+                    message: format!("Position {pos} exceeds file size {file_size}"),
+                    source: None,
+                });
+            }
+            file_size - pos
+        } else {
+            file_size
+        };
+
+        // Create DefaultCompletedFetch
+        let inner_fetch = DefaultCompletedFetch::new(
+            self.segment.table_bucket.clone(),
+            log_record_batch,
+            size_in_bytes,
+            self.read_context,
+            self.fetch_offset,
+            self.high_watermark,
+        );
+
+        // Wrap it with RemoteCompletedFetch to hold the permit
+        // Permit manages the prefetch slot (releases semaphore and notifies coordinator) when dropped;
+        // file deletion is handled by FileCleanupGuard in the file-backed source created via from_file
+        Ok(Box::new(RemoteCompletedFetch::new(inner_fetch, permit)))
+    }
+}
+
+// Unit tests for the fetch buffer and the default completed fetch.
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::client::WriteRecord;
+    use crate::compression::{
+        ArrowCompressionInfo, ArrowCompressionRatioEstimator, ArrowCompressionType,
+        DEFAULT_NON_ZSTD_COMPRESSION_LEVEL,
+    };
+    use crate::metadata::{DataField, DataTypes, PhysicalTablePath, RowType, TablePath};
+    use crate::record::{MemoryLogRecordsArrowBuilder, ReadContext, to_arrow_schema};
+    use crate::row::GenericRow;
+    use crate::test_utils::build_table_info;
+    use std::sync::Arc;
+
+    /// Read context over a single `id: int` column, shared by the buffer tests.
+    fn test_read_context() -> Result<ReadContext> {
+        let row_type = RowType::new(vec![DataField::new("id", DataTypes::int(), None)]);
+        Ok(ReadContext::new(
+            to_arrow_schema(&row_type)?,
+            Arc::new(row_type),
+            false,
+        ))
+    }
+
+    /// Pending fetch that reports completion but always fails to convert.
+    struct ErrorPendingFetch {
+        table_bucket: TableBucket,
+    }
+
+    impl PendingFetch for ErrorPendingFetch {
+        fn table_bucket(&self) -> &TableBucket {
+            &self.table_bucket
+        }
+
+        fn is_completed(&self) -> bool {
+            true
+        }
+
+        fn to_completed_fetch(self: Box<Self>) -> Result<Box<dyn CompletedFetch>> {
+            Err(Error::UnexpectedError {
+                message: "pending fetch failure".to_string(),
+                source: None,
+            })
+        }
+    }
+
+    // A wakeup issued before waiting must surface as `Error::WakeupError`.
+    #[tokio::test]
+    async fn await_not_empty_returns_wakeup_error() {
+        let buffer = LogFetchBuffer::new(test_read_context().unwrap());
+        buffer.wakeup();
+
+        let result = buffer.await_not_empty(Duration::from_millis(10)).await;
+        assert!(matches!(result, Err(Error::WakeupError { .. })));
+    }
+
+    // A pending fetch whose conversion fails must still make the buffer
+    // non-empty, and the polled completed fetch must carry the error.
+    #[tokio::test]
+    async fn await_not_empty_returns_pending_error() {
+        let buffer = LogFetchBuffer::new(test_read_context().unwrap());
+        let table_bucket = TableBucket::new(1, 0);
+        buffer.pend(Box::new(ErrorPendingFetch {
+            table_bucket: table_bucket.clone(),
+        }));
+        buffer.try_complete(&table_bucket);
+
+        let result = buffer.await_not_empty(Duration::from_millis(10)).await;
+        assert!(matches!(result, Ok(true)));
+
+        let mut completed = buffer.poll().expect("completed fetch");
+        assert!(completed.take_error().is_some());
+    }
+
+    // Builds a one-record in-memory log, reads it back through
+    // `DefaultCompletedFetch`, and checks that a second read is empty.
+    #[test]
+    fn default_completed_fetch_reads_records() -> Result<()> {
+        let row_type = RowType::new(vec![
+            DataField::new("id", DataTypes::int(), None),
+            DataField::new("name", DataTypes::string(), None),
+        ]);
+        let table_path = TablePath::new("db".to_string(), "tbl".to_string());
+        let table_info = Arc::new(build_table_info(table_path.clone(), 1, 1));
+        let physical_table_path = Arc::new(PhysicalTablePath::of(Arc::new(table_path)));
+
+        let mut builder = MemoryLogRecordsArrowBuilder::new(
+            1,
+            &row_type,
+            false,
+            ArrowCompressionInfo {
+                compression_type: ArrowCompressionType::None,
+                compression_level: DEFAULT_NON_ZSTD_COMPRESSION_LEVEL,
+            },
+            usize::MAX,
+            Arc::new(ArrowCompressionRatioEstimator::default()),
+        )?;
+
+        // Append a single row: (1, "alice").
+        let mut row = GenericRow::new(2);
+        row.set_field(0, 1_i32);
+        row.set_field(1, "alice");
+        let record = WriteRecord::for_append(table_info, physical_table_path, 1, &row);
+        builder.append(&record)?;
+
+        let data = builder.build()?;
+        let log_records = LogRecordsBatches::new(data.clone());
+        let read_context = ReadContext::new(to_arrow_schema(&row_type)?, Arc::new(row_type), false);
+        let mut fetch = DefaultCompletedFetch::new(
+            TableBucket::new(1, 0),
+            log_records,
+            data.len(),
+            read_context,
+            0,
+            0,
+        );
+
+        // First read returns the single record at offset 0.
+        let records = fetch.fetch_records(10)?;
+        assert_eq!(records.len(), 1);
+        assert_eq!(records[0].offset(), 0);
+
+        // Nothing is left for a second read.
+        let empty = fetch.fetch_records(10)?;
+        assert!(empty.is_empty());
+
+        Ok(())
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/client/table/lookup.rs b/fluss-rust/crates/fluss/src/client/table/lookup.rs
new file mode 100644
index 0000000000..51a0a0714d
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/client/table/lookup.rs
@@ -0,0 +1,774 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::bucketing::BucketingFunction;
+use crate::client::ClientSchemaGetter;
+use crate::client::lookup::LookupClient;
+use crate::client::metadata::Metadata;
+use crate::client::table::partition_getter::PartitionGetter;
+use crate::error::{Error, Result};
+use crate::metadata::{
+    KvFormat, PhysicalTablePath, RowType, Schema, TableBucket, TableInfo, TablePath,
+};
+use crate::record::RowAppendRecordBatchBuilder;
+use crate::record::kv::SCHEMA_ID_LENGTH;
+use crate::row::encode::{KeyEncoder, KeyEncoderFactory};
+use crate::row::{FixedSchemaDecoder, InternalRow, LookupRow};
+use arrow::array::RecordBatch;
+use byteorder::{ByteOrder, LittleEndian};
+use futures::future::try_join_all;
+use parking_lot::RwLock;
+use std::collections::HashMap;
+use std::sync::Arc;
+
+/// Decoder cache owned by one lookuper.
+///
+/// The decoder for the target schema is stored as a plain field, so the
+/// common case needs no lock and no map lookup. Decoders for any other
+/// schema id live in a lock-protected map that is filled on demand.
+struct DecoderCache {
+    /// Schema id served by `target_decoder`.
+    target_id: i16,
+    /// Decoder for rows written with the target schema.
+    target_decoder: Arc<FixedSchemaDecoder>,
+    /// Decoders for all other schema ids, added through `insert`.
+    others: RwLock<HashMap<i16, Arc<FixedSchemaDecoder>>>,
+}
+
+impl DecoderCache {
+    /// Create a cache that initially knows only the target schema.
+    fn new(target_id: i16, target_decoder: Arc<FixedSchemaDecoder>) -> Self {
+        let others = RwLock::new(HashMap::new());
+        Self {
+            target_id,
+            target_decoder,
+            others,
+        }
+    }
+
+    /// Decode `bytes` with the decoder registered for `schema_id`.
+    ///
+    /// Returns `Error::RowConvertError` when no decoder is registered for a
+    /// non-target schema id.
+    fn decode<'a>(&self, schema_id: i16, bytes: &'a [u8]) -> Result<LookupRow<'a>> {
+        if schema_id != self.target_id {
+            // The read lock is released at the end of this statement.
+            let cached = self.others.read().get(&schema_id).cloned();
+            return match cached {
+                Some(decoder) => decoder.decode(bytes),
+                None => Err(Error::RowConvertError {
+                    message: format!("No decoder available for schema id {schema_id}"),
+                }),
+            };
+        }
+        self.target_decoder.decode(bytes)
+    }
+
+    /// Whether a decoder is available for `schema_id`.
+    fn contains(&self, schema_id: i16) -> bool {
+        if schema_id == self.target_id {
+            return true;
+        }
+        self.others.read().contains_key(&schema_id)
+    }
+
+    /// Register `decoder` for a non-target `schema_id`, replacing any previous entry.
+    fn insert(&self, schema_id: i16, decoder: Arc<FixedSchemaDecoder>) {
+        let mut others = self.others.write();
+        others.insert(schema_id, decoder);
+    }
+
+    /// Test helper: fetch the decoder registered for `schema_id`, if any.
+    #[cfg(test)]
+    fn get(&self, schema_id: i16) -> Option<Arc<FixedSchemaDecoder>> {
+        if schema_id != self.target_id {
+            return self.others.read().get(&schema_id).cloned();
+        }
+        Some(Arc::clone(&self.target_decoder))
+    }
+}
+
+/// The rows produced by one lookup.
+///
+/// A primary-key lookup yields zero or one row, a prefix-key lookup any
+/// number. Each row is decoded with the schema it was written under and
+/// projected to the schema captured when the `Lookuper` was created. Schema
+/// changes made after that point are not visible through an existing
+/// `Lookuper`; create a new one to pick them up.
+pub struct LookupResult {
+    /// Raw row payloads; each starts with its schema id.
+    rows: Vec<Vec<u8>>,
+    /// Row type used when building an Arrow record batch.
+    target_row_type: Arc<RowType>,
+    /// Decoders used to turn payloads into rows.
+    decoders: Arc<DecoderCache>,
+}
+
+impl LookupResult {
+    /// Bundle raw rows with the row type and decoders needed to read them.
+    fn new(rows: Vec<Vec<u8>>, target_row_type: Arc<RowType>, decoders: Arc<DecoderCache>) -> Self {
+        Self {
+            rows,
+            target_row_type,
+            decoders,
+        }
+    }
+
+    /// Read the little-endian schema id stored in the first
+    /// `SCHEMA_ID_LENGTH` bytes of a row payload.
+    ///
+    /// Fails with `Error::RowConvertError` when the payload is shorter than
+    /// the prefix or the id is negative.
+    fn read_schema_id(bytes: &[u8]) -> Result<i16> {
+        let Some(prefix) = bytes.get(..SCHEMA_ID_LENGTH) else {
+            return Err(Error::RowConvertError {
+                message: format!(
+                    "Row payload too short: {} bytes, need at least {} for schema id",
+                    bytes.len(),
+                    SCHEMA_ID_LENGTH
+                ),
+            });
+        };
+        match LittleEndian::read_i16(prefix) {
+            schema_id if schema_id < 0 => Err(Error::RowConvertError {
+                message: format!("Invalid negative schema id {schema_id}; row prefix is corrupt"),
+            }),
+            schema_id => Ok(schema_id),
+        }
+    }
+
+    /// Decode one payload with the decoder matching its schema id.
+    fn decode<'a>(&self, bytes: &'a [u8]) -> Result<LookupRow<'a>> {
+        Self::read_schema_id(bytes).and_then(|schema_id| self.decoders.decode(schema_id, bytes))
+    }
+
+    /// Return the only row, `None` when the result is empty, or an error
+    /// when the result holds more than one row.
+    pub fn get_single_row(&self) -> Result<Option<LookupRow<'_>>> {
+        match self.rows.as_slice() {
+            [] => Ok(None),
+            [only] => self.decode(only).map(Some),
+            _ => Err(Error::UnexpectedError {
+                message: "LookupResult contains multiple rows, use get_rows() instead".to_string(),
+                source: None,
+            }),
+        }
+    }
+
+    /// Decode every row, stopping at the first row that fails to decode.
+    pub fn get_rows(&self) -> Result<Vec<LookupRow<'_>>> {
+        let mut decoded = Vec::with_capacity(self.rows.len());
+        for bytes in &self.rows {
+            decoded.push(self.decode(bytes)?);
+        }
+        Ok(decoded)
+    }
+
+    /// Decode every row and assemble them into one Arrow `RecordBatch`
+    /// using the target row type.
+    pub fn to_record_batch(&self) -> Result<RecordBatch> {
+        let mut builder = RowAppendRecordBatchBuilder::new(&self.target_row_type)?;
+        self.rows.iter().try_for_each(|bytes| -> Result<()> {
+            let row = self.decode(bytes)?;
+            builder.append(&row)?;
+            Ok(())
+        })?;
+        let batch = builder.build_arrow_record_batch()?;
+        Ok(Arc::unwrap_or_clone(batch))
+    }
+}
+
+/// Schema state shared by the lookups of one lookuper: the target schema
+/// captured at creation time plus the decoders needed to read rows written
+/// under other schema ids.
+struct LookupSchemaCtx {
+    /// Schema every decoded row is projected to.
+    target_schema: Arc<Schema>,
+    /// Row type handed to each `LookupResult`.
+    target_row_type: Arc<RowType>,
+    /// KV format used when constructing decoders.
+    kv_format: KvFormat,
+    /// Source of schemas for ids that are not cached yet.
+    schema_getter: Arc<ClientSchemaGetter>,
+    /// Decoder cache shared with every `LookupResult`.
+    decoders: Arc<DecoderCache>,
+}
+
+impl LookupSchemaCtx {
+    /// Capture the table's current schema and build its decoder.
+    ///
+    /// Fails when the schema id is negative or does not fit in an `i16`, when
+    /// the KV format cannot be read from the table config, or when the
+    /// target decoder cannot be built.
+    fn new(table_info: &TableInfo, schema_getter: Arc<ClientSchemaGetter>) -> Result<Self> {
+        let raw_schema_id = table_info.get_schema_id();
+        let target_id = match i16::try_from(raw_schema_id) {
+            Ok(id) if id >= 0 => id,
+            _ => {
+                return Err(Error::UnexpectedError {
+                    message: format!(
+                        "Schema id {raw_schema_id} does not fit in 16 bits — wire format violated"
+                    ),
+                    source: None,
+                });
+            }
+        };
+
+        let target_schema = Arc::new(table_info.get_schema().clone());
+        let target_row_type = Arc::new(table_info.row_type().clone());
+        let kv_format = table_info.get_table_config().get_kv_format()?;
+        let target_decoder =
+            FixedSchemaDecoder::new_no_projection(kv_format, target_schema.as_ref())?;
+        let decoders = Arc::new(DecoderCache::new(target_id, Arc::new(target_decoder)));
+
+        Ok(Self {
+            target_schema,
+            target_row_type,
+            kv_format,
+            schema_getter,
+            decoders,
+        })
+    }
+
+    /// Make sure a decoder exists for every schema id found in `rows`.
+    ///
+    /// Unknown ids are collected once each, their schemas are fetched
+    /// concurrently, and a decoder projecting to the target schema is cached
+    /// for each. The first failure aborts the whole operation.
+    async fn ensure_decoders(&self, rows: &[Vec<u8>]) -> Result<()> {
+        let mut missing: Vec<i16> = Vec::new();
+        for bytes in rows {
+            let schema_id = LookupResult::read_schema_id(bytes)?;
+            let already_known = self.decoders.contains(schema_id) || missing.contains(&schema_id);
+            if !already_known {
+                missing.push(schema_id);
+            }
+        }
+        if missing.is_empty() {
+            return Ok(());
+        }
+
+        let loads = missing.into_iter().map(|schema_id| {
+            // Each future owns its own handles, so it does not borrow `self`.
+            let decoders = Arc::clone(&self.decoders);
+            let getter = Arc::clone(&self.schema_getter);
+            let target = Arc::clone(&self.target_schema);
+            let kv_format = self.kv_format;
+            async move {
+                let source = getter.get_schema(schema_id as i32).await?;
+                let decoder = FixedSchemaDecoder::new(kv_format, source.as_ref(), target.as_ref())?;
+                decoders.insert(schema_id, Arc::new(decoder));
+                Ok::<(), Error>(())
+            }
+        });
+        try_join_all(loads).await.map(|_| ())
+    }
+
+    /// Wrap `rows` in a `LookupResult`, loading any missing decoders first.
+    async fn build_result(&self, rows: Vec<Vec<u8>>) -> Result<LookupResult> {
+        let needs_decoders = !rows.is_empty();
+        if needs_decoders {
+            self.ensure_decoders(&rows).await?;
+        }
+        let row_type = Arc::clone(&self.target_row_type);
+        let decoders = Arc::clone(&self.decoders);
+        Ok(LookupResult::new(rows, row_type, decoders))
+    }
+
+    /// A `LookupResult` without any rows.
+    fn empty_result(&self) -> LookupResult {
+        let row_type = Arc::clone(&self.target_row_type);
+        let decoders = Arc::clone(&self.decoders);
+        LookupResult::new(Vec::new(), row_type, decoders)
+    }
+}
+
+/// Entry point for building lookups on a table.
+///
+/// `create_lookuper()` yields a primary-key `Lookuper`, while
+/// `lookup_by(columns).create_lookuper()` yields a `PrefixKeyLookuper` for
+/// prefix scans.
+// TODO: Add create_typed_lookuper<T>() for typed lookups with POJO mapping
+pub struct TableLookup {
+    /// Shared client that executes the lookups.
+    lookup_client: Arc<LookupClient>,
+    /// Description of the table being looked up.
+    table_info: TableInfo,
+    /// Metadata handle passed on to the lookuper.
+    metadata: Arc<Metadata>,
+    /// Schema source used for rows written under other schema ids.
+    schema_getter: Arc<ClientSchemaGetter>,
+}
+
+impl TableLookup {
+    /// Assemble a builder from its parts; nothing is validated here.
+    pub(super) fn new(
+        lookup_client: Arc<LookupClient>,
+        table_info: TableInfo,
+        metadata: Arc<Metadata>,
+        schema_getter: Arc<ClientSchemaGetter>,
+    ) -> Self {
+        Self {
+            lookup_client,
+            table_info,
+            metadata,
+            schema_getter,
+        }
+    }
+
+    /// Turn this builder into a prefix-scan builder.
+    ///
+    /// `lookup_column_names` must name the table's partition keys (if any)
+    /// followed by the bucket keys, in that order. This makes it a
+    /// **bucket-key prefix** scan rather than an arbitrary primary-key
+    /// prefix. The columns are validated later, in `create_lookuper()`.
+    pub fn lookup_by(self, lookup_column_names: Vec<String>) -> TablePrefixLookup {
+        let Self {
+            lookup_client,
+            table_info,
+            metadata,
+            schema_getter,
+        } = self;
+        TablePrefixLookup {
+            lookup_client,
+            table_info,
+            metadata,
+            schema_getter,
+            lookup_column_names,
+        }
+    }
+
+    /// Build a `Lookuper` for primary-key lookups.
+    ///
+    /// The lookuper encodes each key and picks its bucket with the
+    /// bucketing function that matches the table's data lake format. It
+    /// uses the shared `LookupClient`, which batches lookups to cut network
+    /// round trips, matching the Java client.
+    ///
+    /// Fails when the table config cannot be read, the primary-key
+    /// projection fails, an encoder or partition getter cannot be built, or
+    /// the schema context cannot be created.
+    pub fn create_lookuper(self) -> Result<Lookuper> {
+        let Self {
+            lookup_client,
+            table_info,
+            metadata,
+            schema_getter,
+        } = self;
+
+        let num_buckets = table_info.get_num_buckets();
+
+        // The bucketing function depends on the configured data lake format.
+        let data_lake_format = table_info.get_table_config().get_datalake_format()?;
+        let bucketing_function = <dyn BucketingFunction>::of(data_lake_format.as_ref());
+
+        // Lookup rows contain only the primary-key columns.
+        let lookup_row_type = table_info
+            .row_type()
+            .project_with_field_names(table_info.get_primary_keys())?;
+
+        let physical_primary_keys = table_info.get_physical_primary_keys().to_vec();
+        let primary_key_encoder =
+            KeyEncoderFactory::of(&lookup_row_type, &physical_primary_keys, &data_lake_format)?;
+
+        // A dedicated bucket-key encoder is needed only for a non-default bucket key.
+        let bucket_key_encoder = if table_info.is_default_bucket_key() {
+            None
+        } else {
+            let bucket_keys = table_info.get_bucket_keys().to_vec();
+            let encoder = KeyEncoderFactory::of(&lookup_row_type, &bucket_keys, &data_lake_format)?;
+            Some(encoder)
+        };
+
+        let partition_getter = table_info
+            .is_partitioned()
+            .then(|| {
+                PartitionGetter::new(
+                    &lookup_row_type,
+                    Arc::clone(table_info.get_partition_keys()),
+                )
+            })
+            .transpose()?;
+
+        let schema_ctx = LookupSchemaCtx::new(&table_info, schema_getter)?;
+        let table_path = Arc::new(table_info.table_path.clone());
+
+        Ok(Lookuper {
+            table_path,
+            table_info,
+            metadata,
+            lookup_client,
+            bucketing_function,
+            primary_key_encoder,
+            bucket_key_encoder,
+            partition_getter,
+            num_buckets,
+            schema_ctx,
+        })
+    }
+}
+
+/// Performs key-based lookups against a primary key table.
+///
+/// The `Lookuper` automatically encodes the lookup key, computes the target
+/// bucket, and retrieves the value using the batched `LookupClient`.
+///
+/// # Example
+/// ```ignore
+/// let mut lookuper = table.new_lookup()?.create_lookuper()?; // `lookup` takes `&mut self`
+/// let row = GenericRow::from_data(vec![Datum::Int32(42)]); // lookup key
+/// let result = lookuper.lookup(&row).await?;
+/// ```
+pub struct Lookuper {
+    table_path: Arc<TablePath>,
+    table_info: TableInfo,
+    metadata: Arc<Metadata>,
+    lookup_client: Arc<LookupClient>,
+    bucketing_function: Box<dyn BucketingFunction>,
+    primary_key_encoder: Box<dyn KeyEncoder>,
+    bucket_key_encoder: Option<Box<dyn KeyEncoder>>, // `None`: primary key bytes pick the bucket
+    partition_getter: Option<PartitionGetter>, // `Some` only for partitioned tables
+    num_buckets: i32,
+    schema_ctx: LookupSchemaCtx,
+}
+
+impl Lookuper {
+    /// Fetches the row stored under the primary key carried by `row`.
+    ///
+    /// The key is encoded and the bucket is derived with the table's
+    /// bucketing function. The lookup is queued and batched with other
+    /// lookups for improved throughput.
+    ///
+    /// # Arguments
+    /// * `row` - The row containing the primary key field values
+    ///
+    /// # Returns
+    /// * `Ok(LookupResult)` - The lookup result (empty if the key is not found
+    ///   or no partition id is resolved for the row)
+    /// * `Err(Error)` - If key encoding, partition resolution or the lookup fails
+    pub async fn lookup(&mut self, row: &dyn InternalRow) -> Result<LookupResult> {
+        let pk_bytes = self.primary_key_encoder.encode_key(row)?;
+        // Without a dedicated bucket key encoder the primary key bytes pick the bucket.
+        let bk_bytes = if let Some(encoder) = self.bucket_key_encoder.as_mut() {
+            encoder.encode_key(row)?
+        } else {
+            pk_bytes.clone()
+        };
+
+        // Partitioned tables resolve the row's partition id before bucketing.
+        let mut partition_id = None;
+        if let Some(getter) = self.partition_getter.as_ref() {
+            let partition_name = getter.get_partition(row)?;
+            let physical_table_path = PhysicalTablePath::of_partitioned(
+                Arc::clone(&self.table_path),
+                Some(partition_name),
+            );
+            partition_id = self
+                .metadata
+                .check_and_update_partition_metadata(&physical_table_path)
+                .await?;
+            if partition_id.is_none() {
+                return Ok(self.schema_ctx.empty_result());
+            }
+        }
+
+        let bucket_id = self
+            .bucketing_function
+            .bucketing(&bk_bytes, self.num_buckets)?;
+        let table_id = self.table_info.get_table_id();
+        let table_bucket = TableBucket::new_with_partition(table_id, partition_id, bucket_id);
+
+        // Use the batched lookup client
+        let table_path = (*self.table_path).clone();
+        let found = self
+            .lookup_client
+            .lookup(table_path, table_bucket, pk_bytes)
+            .await?;
+
+        // The client hands back at most one value for this key.
+        let rows: Vec<_> = found.into_iter().collect();
+        self.schema_ctx.build_result(rows).await
+    }
+
+    /// Returns a reference to the table info.
+    pub fn table_info(&self) -> &TableInfo {
+        &self.table_info
+    }
+}
+
+/// Configuration for prefix lookups on a primary key table.
+///
+/// `lookup_column_names` lists the columns each lookup row provides; they
+/// are validated when `create_lookuper` builds the `PrefixKeyLookuper`.
+pub struct TablePrefixLookup {
+    lookup_client: Arc<LookupClient>,
+    table_info: TableInfo,
+    metadata: Arc<Metadata>,
+    schema_getter: Arc<ClientSchemaGetter>,
+    lookup_column_names: Vec<String>,
+}
+
+impl TablePrefixLookup {
+    /// Validates the lookup columns, then builds the `PrefixKeyLookuper`.
+    pub fn create_lookuper(self) -> Result<PrefixKeyLookuper> {
+        let info = &self.table_info;
+        validate_prefix_lookup(info, &self.lookup_column_names)?;
+
+        let num_buckets = info.get_num_buckets();
+        let lake_format = info.get_table_config().get_datalake_format()?;
+        let bucketing_function = <dyn BucketingFunction>::of(lake_format.as_ref());
+
+        let lookup_row_type = info
+            .row_type()
+            .project_with_field_names(&self.lookup_column_names)?;
+        let bucket_keys = info.get_bucket_keys().to_vec();
+        let prefix_key_encoder =
+            KeyEncoderFactory::of(&lookup_row_type, &bucket_keys, &lake_format)?;
+        let partition_getter = info
+            .is_partitioned()
+            .then(|| PartitionGetter::new(&lookup_row_type, Arc::clone(info.get_partition_keys())))
+            .transpose()?;
+        let schema_ctx = LookupSchemaCtx::new(info, self.schema_getter)?;
+
+        Ok(PrefixKeyLookuper {
+            table_path: Arc::new(info.table_path.clone()),
+            table_info: self.table_info,
+            metadata: self.metadata,
+            lookup_client: self.lookup_client,
+            bucketing_function,
+            prefix_key_encoder,
+            partition_getter,
+            num_buckets,
+            schema_ctx,
+        })
+    }
+}
+
+/// Verifies that `lookup_columns` can drive a prefix lookup on the table.
+///
+/// The rules are checked in the order below and the first violation is
+/// returned as `Error::IllegalArgument`:
+/// 1. the table has a primary key;
+/// 2. the table has bucket keys;
+/// 3. the bucket keys are a prefix of the physical primary keys;
+/// 4. on a partitioned table, every partition key is a lookup column;
+/// 5. the lookup columns without the partition keys equal the bucket keys,
+///    in the same order;
+/// 6. the bucket keys differ from the physical primary keys (otherwise a
+///    plain primary key lookup should be used).
+fn validate_prefix_lookup(table_info: &TableInfo, lookup_columns: &[String]) -> Result<()> {
+    // Every violation is reported the same way: an `IllegalArgument` with a message.
+    let reject = |message: String| -> Result<()> { Err(Error::IllegalArgument { message }) };
+
+    // Rule 1: tables without a primary key cannot be looked up.
+    if !table_info.has_primary_key() {
+        return reject(format!(
+            "Log table {} doesn't support prefix lookup",
+            table_info.get_table_path()
+        ));
+    }
+
+    // Rule 2: the prefix key is the bucket key, so there has to be one.
+    let bucket_keys = table_info.get_bucket_keys();
+    if bucket_keys.is_empty() {
+        return reject(format!(
+            "Can not perform prefix lookup on table '{}', because it has no bucket keys.",
+            table_info.get_table_path()
+        ));
+    }
+
+    // Rule 3: the bucket keys must lead the physical primary keys.
+    let physical_primary_keys = table_info.get_physical_primary_keys();
+    if !physical_primary_keys.starts_with(bucket_keys) {
+        return reject(format!(
+            "Can not perform prefix lookup on table '{}', because the bucket keys {:?} \
+             is not a prefix subset of the physical primary keys {:?} \
+             (excluded partition fields if present).",
+            table_info.get_table_path(),
+            bucket_keys,
+            physical_primary_keys,
+        ));
+    }
+
+    // Rule 4: a partitioned table needs every partition key among the lookup columns.
+    let partition_keys: &[String] = table_info.get_partition_keys();
+    if table_info.is_partitioned()
+        && !partition_keys.iter().all(|key| lookup_columns.contains(key))
+    {
+        return reject(format!(
+            "Can not perform prefix lookup on table '{}', because the lookup columns \
+             {:?} must contain all partition fields {:?}.",
+            table_info.get_table_path(),
+            lookup_columns,
+            partition_keys,
+        ));
+    }
+
+    // Rule 5: with partition columns skipped, the rest must be the bucket keys in order.
+    let physical_lookup_columns = lookup_columns
+        .iter()
+        .filter(|column| !partition_keys.contains(*column));
+    if !physical_lookup_columns.eq(bucket_keys.iter()) {
+        return reject(format!(
+            "Can not perform prefix lookup on table '{}', because the lookup columns {:?} \
+             must contain all bucket keys {:?} in order.",
+            table_info.get_table_path(),
+            lookup_columns,
+            bucket_keys,
+        ));
+    }
+
+    // Rule 6: a "prefix" covering the whole physical primary key is a point lookup.
+    if bucket_keys == physical_primary_keys {
+        return reject(format!(
+            "Can not perform prefix lookup on table '{}', because the lookup columns {:?} \
+             equals the physical primary keys {:?}. \
+             Please use primary key lookup (Lookuper without lookup_by) instead.",
+            table_info.get_table_path(),
+            lookup_columns,
+            physical_primary_keys,
+        ));
+    }
+
+    Ok(())
+}
+
+/// Performs prefix-key lookups against a primary key table.
+pub struct PrefixKeyLookuper {
+    table_path: Arc<TablePath>,
+    table_info: TableInfo,
+    metadata: Arc<Metadata>,
+    lookup_client: Arc<LookupClient>,
+    bucketing_function: Box<dyn BucketingFunction>,
+    prefix_key_encoder: Box<dyn KeyEncoder>,
+    partition_getter: Option<PartitionGetter>,
+    num_buckets: i32,
+    schema_ctx: LookupSchemaCtx,
+}
+
+impl PrefixKeyLookuper {
+    /// Looks up all rows matching the prefix key encoded from `row`.
+    /// Yields an empty result when no partition id is resolved for the row.
+    pub async fn lookup(&mut self, row: &dyn InternalRow) -> Result<LookupResult> {
+        let prefix_bytes = self.prefix_key_encoder.encode_key(row)?;
+
+        let mut partition_id = None;
+        if let Some(getter) = self.partition_getter.as_ref() {
+            let partition_name = getter.get_partition(row)?;
+            let physical_table_path = PhysicalTablePath::of_partitioned(
+                Arc::clone(&self.table_path),
+                Some(partition_name),
+            );
+            partition_id = self
+                .metadata
+                .check_and_update_partition_metadata(&physical_table_path)
+                .await?;
+            if partition_id.is_none() {
+                return Ok(self.schema_ctx.empty_result());
+            }
+        }
+
+        let bucket_id = self
+            .bucketing_function
+            .bucketing(&prefix_bytes, self.num_buckets)?;
+        let table_id = self.table_info.get_table_id();
+        let table_bucket = TableBucket::new_with_partition(table_id, partition_id, bucket_id);
+
+        let rows = self
+            .lookup_client
+            .prefix_lookup((*self.table_path).clone(), table_bucket, prefix_bytes)
+            .await?;
+        self.schema_ctx.build_result(rows).await
+    }
+
+    /// Returns a reference to the table info.
+    pub fn table_info(&self) -> &TableInfo {
+        &self.table_info
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::metadata::{Column, DataTypes, Schema};
+    use crate::row::binary::BinaryWriter;
+    use crate::row::compacted::CompactedRowWriter;
+    use arrow::array::Int32Array;
+
+    fn make_row_bytes(schema_id: i16, row_data: &[u8]) -> Vec<u8> {
+        let mut bytes = Vec::with_capacity(SCHEMA_ID_LENGTH + row_data.len());
+        bytes.extend_from_slice(&schema_id.to_le_bytes());
+        bytes.extend_from_slice(row_data);
+        bytes
+    }
+
+    fn schema_with_ids(columns: &[(i32, &str, crate::metadata::DataType)]) -> Schema {
+        let cols: Vec<Column> = columns
+            .iter()
+            .map(|(id, name, dt)| Column::new(*name, dt.clone()).with_id(*id))
+            .collect();
+        Schema::builder().with_columns(cols).build().unwrap()
+    }
+
+    fn cache_with(
+        target_id: i16,
+        target_decoder: FixedSchemaDecoder,
+        others: Vec<(i16, FixedSchemaDecoder)>,
+    ) -> Arc<DecoderCache> {
+        let cache = DecoderCache::new(target_id, Arc::new(target_decoder));
+        for (id, decoder) in others {
+            cache.insert(id, Arc::new(decoder));
+        }
+        Arc::new(cache)
+    }
+
+    fn lookup_result_from(
+        rows: Vec<Vec<u8>>,
+        target_schema: &Schema,
+        decoders: Arc<DecoderCache>,
+    ) -> LookupResult {
+        LookupResult::new(rows, Arc::new(target_schema.row_type().clone()), decoders)
+    }
+
+    #[test]
+    fn test_to_record_batch_empty() {
+        let target = schema_with_ids(&[(0, "id", DataTypes::int())]);
+        let decoder = FixedSchemaDecoder::new_no_projection(KvFormat::COMPACTED, &target).unwrap();
+        let result = lookup_result_from(Vec::new(), &target, cache_with(0, decoder, vec![]));
+        let batch = result.to_record_batch().unwrap();
+        assert_eq!(batch.num_rows(), 0);
+        assert_eq!(batch.num_columns(), 1);
+    }
+
+    #[test]
+    fn test_to_record_batch_with_row_at_target_schema() {
+        let target = schema_with_ids(&[(0, "id", DataTypes::int())]);
+        // Compacted single-int row (42) prefixed with schema id 0.
+        let mut writer = CompactedRowWriter::new(1);
+        writer.write_int(42);
+        let row_bytes = make_row_bytes(0, writer.buffer());
+        // The cache's target schema id is 0 as well, matching the row's prefix.
+        let decoder = FixedSchemaDecoder::new_no_projection(KvFormat::COMPACTED, &target).unwrap();
+        let result = lookup_result_from(vec![row_bytes], &target, cache_with(0, decoder, vec![]));
+        // The single decoded row must carry the encoded value.
+        let batch = result.to_record_batch().unwrap();
+        assert_eq!(batch.num_rows(), 1);
+        let col = batch
+            .column(0)
+            .as_any()
+            .downcast_ref::<Int32Array>()
+            .unwrap();
+        assert_eq!(col.value(0), 42);
+    }
+
+    #[test]
+    fn test_get_rows_decodes_per_row_schema_id_with_projection() {
+        let source = schema_with_ids(&[(0, "a", DataTypes::int())]);
+        let target = schema_with_ids(&[(0, "a", DataTypes::int()), (1, "b", DataTypes::string())]);
+        // Row with only column `a`, tagged with schema id 3.
+        let mut w = CompactedRowWriter::new(1);
+        w.write_int(7);
+        let old_row = make_row_bytes(3, w.buffer());
+        // Row with columns `a` and `b`, tagged with schema id 7.
+        let mut w = CompactedRowWriter::new(2);
+        w.write_int(8);
+        w.write_string("eight");
+        let new_row = make_row_bytes(7, w.buffer());
+        // Id 7 is the cache target (no projection); id 3 gets a `source` -> `target` decoder.
+        let target_decoder =
+            FixedSchemaDecoder::new_no_projection(KvFormat::COMPACTED, &target).unwrap();
+        let projection_decoder =
+            FixedSchemaDecoder::new(KvFormat::COMPACTED, &source, &target).unwrap();
+        let cache = cache_with(7, target_decoder, vec![(3, projection_decoder)]);
+        let result = lookup_result_from(vec![old_row, new_row], &target, cache);
+        // Column `b` is missing from the id-3 row, so it is expected to read as null.
+        let rows = result.get_rows().unwrap();
+        assert_eq!(rows.len(), 2);
+        assert_eq!(rows[0].get_int(0).unwrap(), 7);
+        assert!(rows[0].is_null_at(1).unwrap());
+        assert_eq!(rows[1].get_int(0).unwrap(), 8);
+        assert_eq!(rows[1].get_string(1).unwrap(), "eight");
+    }
+
+    #[test]
+    fn test_to_record_batch_payload_too_short() {
+        let target = schema_with_ids(&[(0, "id", DataTypes::int())]);
+        let decoder = FixedSchemaDecoder::new_no_projection(KvFormat::COMPACTED, &target).unwrap();
+        let result = lookup_result_from(vec![vec![0u8]], &target, cache_with(0, decoder, vec![]));
+        assert!(result.to_record_batch().is_err());
+    }
+
+    #[test]
+    fn test_get_rows_errors_when_no_decoder_for_schema_id() {
+        let target = schema_with_ids(&[(0, "id", DataTypes::int())]);
+        let decoder = FixedSchemaDecoder::new_no_projection(KvFormat::COMPACTED, &target).unwrap();
+        let mut w = CompactedRowWriter::new(1);
+        w.write_int(1);
+        let row = make_row_bytes(99, w.buffer());
+        let result = lookup_result_from(vec![row], &target, cache_with(0, decoder, vec![]));
+        // Schema id 99 was never registered in the cache, so decoding must fail.
+        let err = result
+            .get_rows()
+            .map(|_| ())
+            .map_err(|e| e.to_string())
+            .unwrap_err();
+        assert!(err.contains("schema id 99"), "{err}");
+    }
+
+    #[test]
+    fn test_read_schema_id_rejects_negative() {
+        let bytes = [0xFFu8, 0xFFu8, 0u8];
+        let err = LookupResult::read_schema_id(&bytes).unwrap_err();
+        assert!(
+            err.to_string().contains("Invalid negative schema id"),
+            "{err}"
+        );
+    }
+
+    #[test]
+    fn test_decoder_cache_target_lookup_skips_lock() {
+        let target = schema_with_ids(&[(0, "a", DataTypes::int())]);
+        let target_decoder =
+            Arc::new(FixedSchemaDecoder::new_no_projection(KvFormat::COMPACTED, &target).unwrap());
+        let cache = DecoderCache::new(7, Arc::clone(&target_decoder));
+        // The target id resolves to the very same `Arc`; an unknown id misses.
+        let returned = cache.get(7).expect("target id must hit the cache");
+        assert!(Arc::ptr_eq(&returned, &target_decoder));
+        assert!(cache.get(99).is_none());
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/client/table/mod.rs b/fluss-rust/crates/fluss/src/client/table/mod.rs
new file mode 100644
index 0000000000..657a44bfe8
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/client/table/mod.rs
@@ -0,0 +1,167 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::client::connection::FlussConnection;
+use crate::client::metadata::Metadata;
+use crate::client::schema_getter::ClientSchemaGetter;
+use crate::error::{Error, Result};
+use crate::metadata::{SchemaInfo, TableInfo, TablePath};
+use std::sync::Arc;
+
+pub const EARLIEST_OFFSET: i64 = -2;
+
+mod append;
+mod batch_scanner;
+mod lookup;
+
+mod log_fetch_buffer;
+mod partition_getter;
+mod reader;
+mod remote_log;
+mod scanner;
+mod upsert;
+
+pub use append::{AppendWriter, TableAppend};
+pub use batch_scanner::LimitBatchScanner;
+pub use lookup::{LookupResult, Lookuper, PrefixKeyLookuper, TableLookup, TablePrefixLookup};
+pub use reader::{RecordBatchLogReader, SyncRecordBatchLogReader};
+pub use remote_log::{
+    DEFAULT_REMOTE_FILE_DOWNLOAD_THREAD_NUM, DEFAULT_SCANNER_REMOTE_LOG_PREFETCH_NUM,
+};
+pub use scanner::{LogScanner, RecordBatchLogScanner, TableScan};
+pub use upsert::{TableUpsert, UpsertWriter};
+
+/// Handle to a single table; entry point for append, scan, lookup and upsert operations.
+#[allow(dead_code)]
+pub struct FlussTable<'a> {
+    conn: &'a FlussConnection,
+    metadata: Arc<Metadata>,
+    table_info: TableInfo,
+    table_path: TablePath,
+    has_primary_key: bool,
+}
+
+impl<'a> FlussTable<'a> {
+    pub fn new(conn: &'a FlussConnection, metadata: Arc<Metadata>, table_info: TableInfo) -> Self {
+        FlussTable {
+            conn,
+            table_path: table_info.table_path.clone(),
+            has_primary_key: table_info.has_primary_key(),
+            table_info,
+            metadata,
+        }
+    }
+
+    /// Creates a `TableAppend`; only log tables (without primary key) are supported.
+    pub fn new_append(&self) -> Result<TableAppend> {
+        if self.has_primary_key {
+            return Err(Error::UnsupportedOperation {
+                message: "Append is only supported for log tables (without primary key)"
+                    .to_string(),
+            });
+        }
+        Ok(TableAppend::new(
+            self.table_path.clone(),
+            Arc::new(self.table_info.clone()),
+            self.conn.get_or_create_writer_client()?,
+        ))
+    }
+
+    /// Creates a `TableScan` for this table.
+    pub fn new_scan(&self) -> TableScan<'_> {
+        TableScan::new(self.conn, self.table_info.clone(), self.metadata.clone())
+    }
+
+    pub fn metadata(&self) -> &Arc<Metadata> {
+        &self.metadata
+    }
+
+    pub fn get_table_info(&self) -> &TableInfo {
+        &self.table_info
+    }
+
+    pub fn table_path(&self) -> &TablePath {
+        &self.table_path
+    }
+
+    pub fn has_primary_key(&self) -> bool {
+        self.has_primary_key
+    }
+
+    /// Creates a new `TableLookup` for configuring lookup operations; like
+    /// `new_scan()` and `new_append()` it returns a configuration object, which
+    /// can be used to create a `Lookuper`.
+    ///
+    /// # Returns
+    /// * `Ok(TableLookup)` - A lookup configuration object
+    /// * `Err(Error)` - If the table doesn't have a primary key, or the lookup
+    ///   client or admin cannot be obtained from the connection
+    ///
+    /// # Example
+    /// ```ignore
+    /// let table = conn.get_table(&table_path).await?;
+    /// let mut lookuper = table.new_lookup()?.create_lookuper()?;
+    /// let key = GenericRow::from_data(vec![Datum::Int32(42)]); // primary key row
+    /// let result = lookuper.lookup(&key).await?; // `LookupResult`, may be empty
+    /// let rows = result.get_rows()?;
+    /// ```
+    pub fn new_lookup(&self) -> Result<TableLookup> {
+        if !self.has_primary_key {
+            return Err(Error::UnsupportedOperation {
+                message: "Lookup is only supported for primary key tables".to_string(),
+            });
+        }
+        let lookup_client = self.conn.get_or_create_lookup_client()?;
+        // Pre-seed the schema getter with the table's current schema —
+        // rows written under it (the dominant case) never trigger an RPC.
+        let latest = SchemaInfo::new(
+            self.table_info.get_schema().clone(),
+            self.table_info.get_schema_id(),
+        );
+        let schema_getter = Arc::new(ClientSchemaGetter::new(
+            self.table_path.clone(),
+            self.conn.get_admin()?,
+            latest,
+        ));
+        Ok(TableLookup::new(
+            lookup_client,
+            self.table_info.clone(),
+            self.metadata.clone(),
+            schema_getter,
+        ))
+    }
+
+    /// Creates a `TableUpsert`; only primary key tables are supported.
+    pub fn new_upsert(&self) -> Result<TableUpsert> {
+        if !self.has_primary_key {
+            return Err(Error::UnsupportedOperation {
+                message: "Upsert is only supported for primary key tables".to_string(),
+            });
+        }
+        Ok(TableUpsert::new(
+            self.table_path.clone(),
+            self.table_info.clone(),
+            self.conn.get_or_create_writer_client()?,
+        ))
+    }
+}
+
+impl<'a> Drop for FlussTable<'a> {
+    fn drop(&mut self) {
+        // Intentionally empty for now: this impl performs no cleanup of its own.
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/client/table/partition_getter.rs b/fluss-rust/crates/fluss/src/client/table/partition_getter.rs
new file mode 100644
index 0000000000..1115ded3bd
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/client/table/partition_getter.rs
@@ -0,0 +1,199 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::error::Error::IllegalArgument;
+use crate::error::Result;
+use crate::metadata::{DataType, PhysicalTablePath, ResolvedPartitionSpec, RowType, TablePath};
+use crate::row::InternalRow;
+use crate::row::field_getter::FieldGetter;
+use crate::util::partition;
+use std::sync::Arc;
+
+/// Get the physical table path for a row, handling partitioned vs non-partitioned tables.
+///
+/// With a `partition_getter` the partition name is extracted from `row` and
+/// attached to the path; without one the plain table path is returned.
+/// Fails if the partition cannot be extracted from `row`.
+pub fn get_physical_path<R: InternalRow>(
+    table_path: &Arc<TablePath>,
+    partition_getter: Option<&PartitionGetter>,
+    row: &R,
+) -> Result<PhysicalTablePath> {
+    let partition = match partition_getter {
+        Some(getter) => Some(getter.get_partition(row)?),
+        None => return Ok(PhysicalTablePath::of(Arc::clone(table_path))),
+    };
+    Ok(PhysicalTablePath::of_partitioned(Arc::clone(table_path), partition))
+}
+
+/// A getter to get partition name from a row.
+///
+/// For each partition key it keeps the column's data type and a field getter.
+#[allow(dead_code)]
+pub struct PartitionGetter {
+    partition_keys: Arc<[String]>,
+    partitions: Vec<(DataType, FieldGetter)>,
+}
+
+#[allow(dead_code)]
+impl PartitionGetter {
+    /// Creates a getter for `partition_keys`, resolving each key to its column in `row_type`.
+    ///
+    /// Fails with `IllegalArgument` if a partition key is not a field of `row_type`.
+    pub fn new(row_type: &RowType, partition_keys: Arc<[String]>) -> Result<Self> {
+        let mut partitions = Vec::with_capacity(partition_keys.len());
+
+        for partition_key in partition_keys.iter() {
+            let idx = row_type
+                .get_field_index(partition_key.as_str())
+                .ok_or_else(|| IllegalArgument {
+                    message: format!(
+                        "The partition column {partition_key} is not in the row {row_type}."
+                    ),
+                })?;
+            let data_type = row_type.fields().get(idx).unwrap().data_type.clone();
+            let field_getter = FieldGetter::create(&data_type, idx);
+            partitions.push((data_type, field_getter));
+        }
+
+        Ok(Self {
+            partition_keys,
+            partitions,
+        })
+    }
+
+    /// Returns the partition name of `row`.
+    pub fn get_partition(&self, row: &dyn InternalRow) -> Result<String> {
+        let spec = self.get_partition_spec(row)?;
+        Ok(spec.get_partition_name())
+    }
+
+    /// Builds the resolved partition spec of `row`; a null partition value is an error.
+    pub fn get_partition_spec(&self, row: &dyn InternalRow) -> Result<ResolvedPartitionSpec> {
+        let mut partition_values = Vec::with_capacity(self.partitions.len());
+
+        for (data_type, field_getter) in self.partitions.iter() {
+            let value = field_getter.get_field(row)?;
+            if value.is_null() {
+                return Err(IllegalArgument {
+                    message: "Partition value shouldn't be null.".to_string(),
+                });
+            }
+            partition_values.push(partition::convert_value_of_type(&value, data_type)?);
+        }
+
+        ResolvedPartitionSpec::new(Arc::clone(&self.partition_keys), partition_values)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::metadata::{DataField, IntType, StringType};
+    use crate::row::{Datum, GenericRow};
+
+    #[test]
+    fn test_partition_getter_single_key() {
+        let row_type = RowType::new(vec![
+            DataField::new("id", DataType::Int(IntType::new()), None),
+            DataField::new("region", DataType::String(StringType::new()), None),
+        ]);
+        // Partition on the `region` column only.
+        let getter = PartitionGetter::new(&row_type, Arc::from(["region".to_string()]))
+            .expect("should succeed");
+        // With a single key the partition name is the column value itself.
+        let row = GenericRow::from_data(vec![Datum::Int32(42), Datum::from("US")]);
+        let partition_name = getter.get_partition(&row).expect("should succeed");
+        assert_eq!(partition_name, "US");
+    }
+
+    #[test]
+    fn test_partition_getter_multiple_keys() {
+        let row_type = RowType::new(vec![
+            DataField::new("id", DataType::Int(IntType::new()), None),
+            DataField::new("date", DataType::String(StringType::new()), None),
+            DataField::new("region", DataType::String(StringType::new()), None),
+        ]);
+        // Two partition keys, given in the order `date`, `region`.
+        let getter = PartitionGetter::new(
+            &row_type,
+            Arc::from(["date".to_string(), "region".to_string()]),
+        )
+        .expect("should succeed");
+        // The expected name joins the values with `$` in key order.
+        let row = GenericRow::from_data(vec![
+            Datum::Int32(42),
+            Datum::from("2024-01-15"),
+            Datum::from("US"),
+        ]);
+        let partition_name = getter.get_partition(&row).expect("should succeed");
+        assert_eq!(partition_name, "2024-01-15$US");
+    }
+
+    #[test]
+    fn test_partition_getter_invalid_column() {
+        let row_type = RowType::new(vec![DataField::new(
+            "id",
+            DataType::Int(IntType::new()),
+            None,
+        )]);
+        // `nonexistent` is not a field of the row type, so construction must fail.
+        let result = PartitionGetter::new(&row_type, Arc::from(["nonexistent".to_string()]));
+        assert!(result.is_err());
+    }
+
+    #[test]
+    fn test_partition_getter_null_value() {
+        let row_type = RowType::new(vec![
+            DataField::new("id", DataType::Int(IntType::new()), None),
+            DataField::new("region", DataType::String(StringType::new()), None),
+        ]);
+        // Building the getter succeeds; the failure comes from the row below.
+        let getter = PartitionGetter::new(&row_type, Arc::from(["region".to_string()]))
+            .expect("should succeed");
+        // A null partition value is rejected.
+        let row = GenericRow::from_data(vec![Datum::Int32(42), Datum::Null]);
+        let result = getter.get_partition(&row);
+        assert!(result.is_err());
+    }
+
+    #[test]
+    fn test_get_partition_spec() {
+        let row_type = RowType::new(vec![
+            DataField::new("id", DataType::Int(IntType::new()), None),
+            DataField::new("date", DataType::String(StringType::new()), None),
+            DataField::new("region", DataType::String(StringType::new()), None),
+        ]);
+
+        let getter = PartitionGetter::new(
+            &row_type,
+            Arc::from(["date".to_string(), "region".to_string()]),
+        )
+        .expect("should succeed");
+
+        let row = GenericRow::from_data(vec![
+            Datum::Int32(42),
+            Datum::from("2024-01-15"),
+            Datum::from("US"),
+        ]);
+        let spec = getter.get_partition_spec(&row).expect("should succeed");
+        // The spec exposes the keys, the values and the `$`-joined name.
+        assert_eq!(spec.get_partition_keys(), &["date", "region"]);
+        assert_eq!(spec.get_partition_values(), &["2024-01-15", "US"]);
+        assert_eq!(spec.get_partition_name(), "2024-01-15$US");
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/client/table/reader.rs b/fluss-rust/crates/fluss/src/client/table/reader.rs
new file mode 100644
index 0000000000..518c68a222
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/client/table/reader.rs
@@ -0,0 +1,701 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Bounded log reader that polls until stopping offsets, then terminates.
+//!
+//! Unlike [`RecordBatchLogScanner`] which is unbounded (continuous streaming),
+//! [`RecordBatchLogReader`] reads log data up to a finite set of stopping
+//! offsets and then signals completion. This enables "snapshot-style" reads
+//! from a streaming log: capture the latest offsets, then consume all data
+//! up to those offsets.
+//!
+//! The reader **takes ownership** of the scanner (move, not clone). Once the
+//! scanner is moved into a reader, the compiler prevents concurrent polls.
+//!
+//! The reader also provides a synchronous [`arrow::record_batch::RecordBatchReader`]
+//! adapter via [`RecordBatchLogReader::to_record_batch_reader`] for Arrow
+//! ecosystem interop and FFI consumers (Python, C++).
+
+use crate::client::admin::FlussAdmin;
+use crate::client::table::RecordBatchLogScanner;
+use crate::error::{Error, Result};
+use crate::metadata::TableBucket;
+use crate::record::ScanBatch;
+use crate::rpc::message::OffsetSpec;
+use arrow::record_batch::RecordBatch;
+use arrow_schema::SchemaRef;
+use log::warn;
+use std::collections::{HashMap, VecDeque};
+use std::time::Duration;
+
+/// Timeout handed to each `scanner.poll` call issued by
+/// [`RecordBatchLogReader::next_batch`]. An empty poll result simply causes
+/// another poll, so this bounds a single wait, not the whole read.
+const DEFAULT_POLL_TIMEOUT: Duration = Duration::from_millis(500);
+
+/// Bounded log reader that consumes log data up to specified stopping offsets.
+///
+/// This type wraps a [`RecordBatchLogScanner`] and adds stopping semantics:
+/// it polls batches from the scanner, filters/slices them against per-bucket
+/// stopping offsets, and signals completion when all buckets are caught up.
+///
+/// The reader takes **ownership** of the scanner. Once moved in, no other code
+/// can poll the same scanner concurrently.
+///
+/// # Construction
+///
+/// Use [`RecordBatchLogReader::new_until_latest`] for the common case of
+/// reading all currently-available data, or [`RecordBatchLogReader::new_until_offsets`]
+/// for custom stopping offsets.
+///
+/// # Async iteration
+///
+/// Call [`next_batch`](RecordBatchLogReader::next_batch) repeatedly to get
+/// [`ScanBatch`]es lazily, one at a time. Returns `None` when all buckets
+/// have reached their stopping offsets.
+///
+/// # Sync adapter
+///
+/// Call [`to_record_batch_reader`](RecordBatchLogReader::to_record_batch_reader)
+/// to get a synchronous [`arrow::record_batch::RecordBatchReader`] suitable
+/// for Arrow FFI consumers.
+pub struct RecordBatchLogReader {
+    // Underlying scanner that `next_batch` polls; moved in at construction.
+    scanner: RecordBatchLogScanner,
+    // Stopping offset for every bucket still being read. An entry is removed
+    // once its bucket completes, so an empty map means the read is finished.
+    stopping_offsets: HashMap<TableBucket, i64>,
+    // Batches already filtered/sliced but not yet handed out by `next_batch`.
+    buffer: VecDeque<ScanBatch>,
+    // Arrow schema captured from the scanner at construction; returned by `schema()`.
+    schema: SchemaRef,
+}
+
+impl RecordBatchLogReader {
+    /// Create a reader that reads until the latest offsets at the time of creation.
+    ///
+    /// Queries the server for the current latest offset of each subscribed
+    /// bucket, then reads until those offsets are reached. Buckets whose
+    /// subscribed offset already meets or exceeds the latest offset are
+    /// excluded (nothing to read).
+    ///
+    /// Partition metadata is fetched once during construction; no caching
+    /// is needed since each reader is typically short-lived.
+    ///
+    /// # Errors
+    ///
+    /// - Propagates the error from `try_set_reader_active` when the
+    ///   reader-active guard cannot be acquired.
+    /// - Returns [`Error::IllegalArgument`] when the scanner has no subscribed
+    ///   buckets.
+    /// - Propagates any error raised while querying the latest offsets.
+    ///
+    /// Once the guard has been acquired it is released again on every
+    /// unsuccessful exit, including the returned future being dropped while
+    /// the offset query is still in flight.
+    pub async fn new_until_latest(
+        scanner: RecordBatchLogScanner,
+        admin: &FlussAdmin,
+    ) -> Result<Self> {
+        // Acquire the guard first so no concurrent unsubscribe can mutate
+        // state between reading subscriptions and using them.
+        scanner.try_set_reader_active()?;
+
+        // Wrap the scanner in a reader right away. From here on the `Drop`
+        // impl owns releasing the guard, so early returns, `?`, a panic, or
+        // cancellation of this future at the `.await` below cannot leave the
+        // guard stuck on scanner state shared with a binding layer. The
+        // stopping-offset map is still empty, so `Drop` unsubscribes nothing.
+        let schema = scanner.schema();
+        let mut reader = Self {
+            scanner,
+            stopping_offsets: HashMap::new(),
+            buffer: VecDeque::new(),
+            schema,
+        };
+
+        let subscribed = reader.scanner.get_subscribed_buckets();
+        if subscribed.is_empty() {
+            return Err(Error::IllegalArgument {
+                message: "No buckets subscribed. Call subscribe() before creating a reader."
+                    .to_string(),
+            });
+        }
+
+        let stopping_offsets = query_latest_offsets(admin, &reader.scanner, &subscribed).await?;
+        reader.stopping_offsets = stopping_offsets;
+
+        Ok(reader)
+    }
+
+    /// Create a reader with explicit stopping offsets per bucket.
+    ///
+    /// # Note
+    ///
+    /// Every key in `stopping_offsets` **must** correspond to a bucket that is
+    /// currently subscribed on the `scanner`. If a stopping offset refers to a
+    /// bucket that will never appear in polled batches, the reader will loop
+    /// indefinitely waiting for data that never arrives.
+    ///
+    /// Use [`new_until_latest`](Self::new_until_latest) for the common case;
+    /// it queries the server and builds a validated stopping-offset map
+    /// automatically.
+    ///
+    /// # Errors
+    ///
+    /// Propagates the error from `try_set_reader_active` when the
+    /// reader-active guard cannot be acquired.
+    pub fn new_until_offsets(
+        scanner: RecordBatchLogScanner,
+        stopping_offsets: HashMap<TableBucket, i64>,
+    ) -> Result<Self> {
+        scanner.try_set_reader_active()?;
+        let schema = scanner.schema();
+        Ok(Self {
+            scanner,
+            stopping_offsets,
+            buffer: VecDeque::new(),
+            schema,
+        })
+    }
+
+    /// Returns the Arrow schema for batches produced by this reader.
+    pub fn schema(&self) -> SchemaRef {
+        self.schema.clone()
+    }
+
+    /// Drain all remaining batches until stopping offsets are satisfied.
+    ///
+    /// This is a convenience for callers (e.g. bindings building a single Arrow
+    /// table) that want to materialize the full result in Rust without per-batch
+    /// iteration. The first error from [`next_batch`](Self::next_batch) is
+    /// returned and the batches collected so far are discarded.
+    pub async fn collect_all_batches(&mut self) -> Result<Vec<ScanBatch>> {
+        let mut out = Vec::new();
+        while let Some(b) = self.next_batch().await? {
+            out.push(b);
+        }
+        Ok(out)
+    }
+
+    /// Fetch the next [`ScanBatch`], or `None` if all buckets are caught up.
+    ///
+    /// Each call may internally poll multiple batches from the scanner,
+    /// buffer them, and return one at a time. Batches that cross a stopping
+    /// offset boundary are sliced to exclude records at or beyond the stop point.
+    ///
+    /// Completed buckets are unsubscribed from the scanner to avoid wasting
+    /// network traffic on data the reader will discard.
+    ///
+    /// # Errors
+    ///
+    /// Propagates any error returned by the scanner's `poll`.
+    pub async fn next_batch(&mut self) -> Result<Option<ScanBatch>> {
+        loop {
+            // Hand out already-filtered batches before touching the network.
+            if let Some(batch) = self.buffer.pop_front() {
+                return Ok(Some(batch));
+            }
+
+            // Buffer drained and no bucket left to wait for: the read is done.
+            if self.stopping_offsets.is_empty() {
+                return Ok(None);
+            }
+
+            let scan_batches = self.scanner.poll(DEFAULT_POLL_TIMEOUT).await?;
+
+            // A poll that timed out without data is not an end-of-stream
+            // signal; keep polling until every stopping offset is reached.
+            if scan_batches.is_empty() {
+                continue;
+            }
+
+            let completed =
+                filter_batches(scan_batches, &mut self.stopping_offsets, &mut self.buffer);
+
+            // Use the `_sync` unsubscribe variants here: the active-reader
+            // guard rejects calls to the async `unsubscribe*` methods, but
+            // the reader is allowed to clean up its own completed buckets.
+            // The sync variants do the same map removal without the guard
+            // check, and the partitioned/non-partitioned mismatch they
+            // silently ignore is unreachable since the reader inherits the
+            // scanner's partition mode.
+            for tb in completed {
+                if let Some(partition_id) = tb.partition_id() {
+                    self.scanner
+                        .unsubscribe_partition_sync(partition_id, tb.bucket_id());
+                } else {
+                    self.scanner.unsubscribe_sync(tb.bucket_id());
+                }
+            }
+        }
+    }
+
+    /// Convert this async reader into a synchronous [`arrow::record_batch::RecordBatchReader`].
+    ///
+    /// The returned adapter calls [`tokio::runtime::Handle::block_on`] on each
+    /// iterator step. **Do not** call this from inside a Tokio worker thread
+    /// while the same runtime is driving async work (nested `block_on` can
+    /// panic or deadlock). Prefer [`next_batch`](RecordBatchLogReader::next_batch)
+    /// in async Rust code. This is intended for sync/FFI boundaries (C++, some
+    /// Python call paths).
+    pub fn to_record_batch_reader(
+        self,
+        handle: tokio::runtime::Handle,
+    ) -> SyncRecordBatchLogReader {
+        SyncRecordBatchLogReader {
+            reader: self,
+            handle,
+        }
+    }
+}
+
+/// Best-effort teardown for a reader that is dropped before every bucket has
+/// reached its stopping offset (early `break`, a failure in the consumer, ...).
+///
+/// # Why cleanup is needed although the scanner is owned
+///
+/// - **Pure Rust:** dropping the reader drops the owned
+///   `RecordBatchLogScanner`, the `Arc<LogScannerInner>` count reaches zero and
+///   the inner state (subscriptions included) is freed. The work done here is
+///   then redundant.
+/// - **Binding layer** (Python today, C++/Elixir later): the binding keeps its
+///   own `Arc<LogScannerInner>` and gives the reader a second handle obtained
+///   from [`RecordBatchLogScanner::new_shared_handle`]. If the reader goes away
+///   mid-iteration the inner state survives, and buckets the reader had not
+///   finished would linger in `LogScannerStatus.bucket_status_map` as "ghost"
+///   subscriptions (extra buckets being polled, stale offsets, etc.) visible
+///   to later operations on the original `LogScanner`.
+///
+/// # What it does
+///
+/// `next_batch` unsubscribes each bucket as it completes, so whatever is still
+/// in `stopping_offsets` is exactly the still-active set. Each of those is
+/// unsubscribed through the `_sync` escape hatches (the underlying
+/// `LogScannerStatus` operations do not await), which makes this safe from any
+/// context: sync, async, a Tokio worker, or a Python thread holding the GIL.
+/// Finally the `reader_active` guard is cleared so the original scanner held
+/// by the binding layer accepts new subscriptions again.
+///
+/// # Caveats
+///
+/// - Batches already sitting in `LogFetcher.log_fetch_buffer` for a bucket
+///   unsubscribed here are not drained. They are either filtered out by the
+///   next `RecordBatchLogReader` (its "bucket not in stopping_offsets" branch)
+///   or surface to a direct `poll_arrow` caller, who was sharing scanner state
+///   in the first place.
+/// - `Drop` cannot report errors. The `_sync` variants no-op on a
+///   partitioned/non-partitioned mismatch, but that mismatch cannot occur here
+///   because the reader was built from this scanner and inherited its
+///   partition mode.
+impl Drop for RecordBatchLogReader {
+    fn drop(&mut self) {
+        // Empty the map up front; only the bucket keys matter for cleanup.
+        let unfinished: Vec<TableBucket> = self
+            .stopping_offsets
+            .drain()
+            .map(|(bucket, _stop_at)| bucket)
+            .collect();
+
+        for bucket in unfinished {
+            match bucket.partition_id() {
+                Some(partition_id) => self
+                    .scanner
+                    .unsubscribe_partition_sync(partition_id, bucket.bucket_id()),
+                None => self.scanner.unsubscribe_sync(bucket.bucket_id()),
+            };
+        }
+
+        // Release the guard last, after all subscriptions are gone.
+        self.scanner.clear_reader_active();
+    }
+}
+
+/// Synchronous adapter that implements [`arrow::record_batch::RecordBatchReader`].
+///
+/// Created via [`RecordBatchLogReader::to_record_batch_reader`].
+/// Blocks the current thread on each `next()` call using the provided
+/// Tokio runtime handle.
+///
+/// The iterator yields plain [`RecordBatch`]es (bucket/offset metadata from
+/// [`ScanBatch`] is stripped to satisfy the Arrow trait contract).
+pub struct SyncRecordBatchLogReader {
+    // Async reader whose `next_batch` future is driven on every `next()` call.
+    reader: RecordBatchLogReader,
+    // Runtime handle used to `block_on` that future.
+    handle: tokio::runtime::Handle,
+}
+
+impl Iterator for SyncRecordBatchLogReader {
+    type Item = std::result::Result<RecordBatch, arrow::error::ArrowError>;
+
+    /// Blocks on the wrapped reader's `next_batch` and adapts its result:
+    /// a batch becomes `Some(Ok(..))` with the scan metadata stripped, the
+    /// end of the read becomes `None`, and a reader error is wrapped in
+    /// `ArrowError::ExternalError`.
+    fn next(&mut self) -> Option<Self::Item> {
+        let polled = self.handle.block_on(self.reader.next_batch());
+
+        let maybe_batch = match polled {
+            Err(e) => {
+                return Some(Err(arrow::error::ArrowError::ExternalError(Box::new(e))));
+            }
+            Ok(maybe_batch) => maybe_batch,
+        };
+
+        maybe_batch.map(|scan_batch| Ok(scan_batch.into_batch()))
+    }
+}
+
+impl arrow::record_batch::RecordBatchReader for SyncRecordBatchLogReader {
+    /// Arrow schema of the batches this adapter yields; the same schema the
+    /// wrapped [`RecordBatchLogReader`] reports.
+    fn schema(&self) -> SchemaRef {
+        self.reader.schema.clone()
+    }
+}
+
+/// Build the stopping-offset map from the server's latest offsets for every
+/// subscribed bucket. Partitioned tables are delegated to
+/// [`query_partitioned_offsets`]; non-partitioned tables are handled here.
+///
+/// A bucket is left out of the result when:
+/// - its latest offset is negative (unexpected from the server; a warning is
+///   logged),
+/// - its latest offset is `0` (the bucket is empty),
+/// - the server reported a bucket that is not in `subscribed`, or
+/// - its subscribed offset already meets or exceeds the latest offset, so
+///   there is nothing to read.
+async fn query_latest_offsets(
+    admin: &FlussAdmin,
+    scanner: &RecordBatchLogScanner,
+    subscribed: &[(TableBucket, i64)],
+) -> Result<HashMap<TableBucket, i64>> {
+    let table_path = scanner.table_path();
+
+    if scanner.is_partitioned() {
+        return query_partitioned_offsets(admin, scanner, subscribed).await;
+    }
+
+    let mut bucket_ids: Vec<i32> = Vec::with_capacity(subscribed.len());
+    for (tb, _) in subscribed {
+        bucket_ids.push(tb.bucket_id());
+    }
+
+    let offsets = admin
+        .list_offsets(table_path, &bucket_ids, OffsetSpec::Latest)
+        .await?;
+
+    // Where each bucket's subscription currently starts, keyed by bucket id.
+    let mut subscribed_offset_by_bucket: HashMap<i32, i64> = HashMap::new();
+    for (tb, off) in subscribed {
+        subscribed_offset_by_bucket.insert(tb.bucket_id(), *off);
+    }
+
+    let table_id = scanner.table_id();
+    let mut stopping: HashMap<TableBucket, i64> = HashMap::new();
+
+    for (bucket_id, latest_offset) in offsets {
+        if latest_offset < 0 {
+            warn!(
+                "Server returned negative latest offset {latest_offset} for bucket {bucket_id} of table {table_id}; skipping bucket."
+            );
+            continue;
+        }
+        if latest_offset == 0 {
+            continue;
+        }
+        match subscribed_offset_by_bucket.get(&bucket_id) {
+            Some(&subscribed_offset) if subscribed_offset < latest_offset => {
+                stopping.insert(TableBucket::new(table_id, bucket_id), latest_offset);
+            }
+            // Unknown bucket, or the subscription is already caught up.
+            _ => {}
+        }
+    }
+
+    Ok(stopping)
+}
+
+/// Build the stopping-offset map for a partitioned table.
+///
+/// Subscribed buckets are grouped by partition id and one latest-offset
+/// request is issued per partition. Partition metadata is fetched once per
+/// call (not cached), since each [`RecordBatchLogReader`] is typically
+/// short-lived and consumed. Subscribed buckets that carry no partition id
+/// are ignored.
+///
+/// The per-bucket skip rules mirror the non-partitioned path: negative latest
+/// offsets are logged and skipped, `0` (empty bucket) is skipped, buckets the
+/// server reports but that are not subscribed are skipped, and buckets whose
+/// subscribed offset already meets or exceeds the latest offset are skipped.
+///
+/// # Errors
+///
+/// Returns [`Error::UnexpectedError`] when a subscribed partition id is not
+/// among the partitions listed by the server, and propagates errors from the
+/// admin calls.
+async fn query_partitioned_offsets(
+    admin: &FlussAdmin,
+    scanner: &RecordBatchLogScanner,
+    subscribed: &[(TableBucket, i64)],
+) -> Result<HashMap<TableBucket, i64>> {
+    let table_path = scanner.table_path();
+    let table_id = scanner.table_id();
+
+    // partition id -> partition name, as currently known to the server.
+    let partition_infos = admin.list_partition_infos(table_path).await?;
+    let mut partition_id_to_name: HashMap<i64, String> = HashMap::new();
+    for info in partition_infos {
+        partition_id_to_name.insert(info.get_partition_id(), info.get_partition_name());
+    }
+
+    let subscribed_offset_map: HashMap<TableBucket, i64> = subscribed
+        .iter()
+        .map(|(tb, off)| (tb.clone(), *off))
+        .collect();
+
+    // partition id -> bucket ids subscribed within that partition.
+    let mut by_partition: HashMap<i64, Vec<i32>> = HashMap::new();
+    subscribed
+        .iter()
+        .filter_map(|(tb, _)| tb.partition_id().map(|pid| (pid, tb.bucket_id())))
+        .for_each(|(pid, bucket_id)| by_partition.entry(pid).or_default().push(bucket_id));
+
+    let mut result: HashMap<TableBucket, i64> = HashMap::new();
+
+    for (partition_id, bucket_ids) in by_partition {
+        let Some(partition_name) = partition_id_to_name.get(&partition_id) else {
+            return Err(Error::UnexpectedError {
+                message: format!("Unknown partition_id: {partition_id}"),
+                source: None,
+            });
+        };
+
+        let offsets = admin
+            .list_partition_offsets(table_path, partition_name, &bucket_ids, OffsetSpec::Latest)
+            .await?;
+
+        for (bucket_id, latest_offset) in offsets {
+            if latest_offset <= 0 {
+                // Zero means an empty bucket; only negatives are worth a warning.
+                if latest_offset < 0 {
+                    warn!(
+                        "Server returned negative latest offset {latest_offset} for bucket {bucket_id} of partition {partition_id} (table {table_id}); skipping bucket."
+                    );
+                }
+                continue;
+            }
+
+            let tb = TableBucket::new_with_partition(table_id, Some(partition_id), bucket_id);
+            match subscribed_offset_map.get(&tb) {
+                Some(&subscribed_offset) if subscribed_offset < latest_offset => {
+                    result.insert(tb, latest_offset);
+                }
+                // Not subscribed, or the subscription is already caught up.
+                _ => {}
+            }
+        }
+    }
+
+    Ok(result)
+}
+
+/// Apply per-bucket stopping offsets to a freshly polled set of batches.
+///
+/// Stopping offsets are exclusive: a record at offset `stop_at` or beyond is
+/// never emitted. Each batch is handled as follows:
+/// - bucket absent from `stopping_offsets`: the batch is ignored;
+/// - `base_offset >= stop_at`: nothing is emitted and the bucket is finished;
+/// - `last_offset >= stop_at`: only the leading `stop_at - base_offset` rows
+///   are kept (the sliced batch keeps the original `base_offset`);
+/// - otherwise the batch is kept whole.
+///
+/// Kept batches are appended to `buffer` only when they contain at least one
+/// row, so consumers never observe an empty `ScanBatch`. A bucket is finished
+/// (removed from `stopping_offsets`) once a batch reaches offset
+/// `stop_at - 1` or later.
+///
+/// Returns the buckets that finished during this call, in the order they
+/// were encountered.
+fn filter_batches(
+    scan_batches: Vec<ScanBatch>,
+    stopping_offsets: &mut HashMap<TableBucket, i64>,
+    buffer: &mut VecDeque<ScanBatch>,
+) -> Vec<TableBucket> {
+    let mut completed = Vec::new();
+
+    for scan_batch in scan_batches {
+        let bucket = scan_batch.bucket().clone();
+        let stop_at = match stopping_offsets.get(&bucket) {
+            Some(&stop_at) => stop_at,
+            None => continue,
+        };
+
+        let base_offset = scan_batch.base_offset();
+        let last_offset = scan_batch.last_offset();
+
+        if base_offset < stop_at {
+            let surviving = if last_offset < stop_at {
+                scan_batch
+            } else {
+                // The batch straddles the stop point: keep the prefix only.
+                let rows = scan_batch.into_batch();
+                let wanted = (stop_at - base_offset) as usize;
+                let keep = wanted.min(rows.num_rows());
+                ScanBatch::new(bucket.clone(), rows.slice(0, keep), base_offset)
+            };
+
+            if surviving.batch().num_rows() > 0 {
+                buffer.push_back(surviving);
+            }
+
+            // The last wanted record (stop_at - 1) has not been seen yet.
+            if last_offset < stop_at - 1 {
+                continue;
+            }
+        }
+
+        // Either the batch started at/after the stop point, or it covered
+        // the final wanted offset: this bucket is done.
+        stopping_offsets.remove(&bucket);
+        completed.push(bucket);
+    }
+
+    completed
+}
+
+// Rust-level end-to-end coverage for `new_until_latest`, partitioned tables,
+// and `new_until_offsets` stopping semantics lives in
+// `crates/fluss/tests/integration/record_batch_log_reader.rs`. Drop cleanup and the
+// reader-active guard remain covered by the Python integration test
+// `test_to_arrow_batch_reader_drop_and_guard`.
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use arrow::array::{ArrayRef, Int32Array};
+    use arrow_schema::{DataType, Field, Schema};
+    use std::sync::Arc;
+
+    /// State left behind by one `filter_batches` call:
+    /// (remaining stopping offsets, output buffer, completed buckets).
+    type Outcome = (
+        HashMap<TableBucket, i64>,
+        VecDeque<ScanBatch>,
+        Vec<TableBucket>,
+    );
+
+    /// Bucket `id` of the fixed test table (table id 1).
+    fn bucket(id: i32) -> TableBucket {
+        TableBucket::new(1, id)
+    }
+
+    /// Single non-nullable Int32 column named `v`.
+    fn test_schema() -> SchemaRef {
+        let column = Field::new("v", DataType::Int32, false);
+        Arc::new(Schema::new(vec![column]))
+    }
+
+    /// One-column record batch holding `values`.
+    fn make_batch(values: &[i32]) -> RecordBatch {
+        let column: ArrayRef = Arc::new(Int32Array::from(values.to_vec()));
+        RecordBatch::try_new(test_schema(), vec![column]).unwrap()
+    }
+
+    /// Scan batch for `bucket` whose first row sits at `base_offset`.
+    fn make_scan_batch(bucket: TableBucket, base_offset: i64, values: &[i32]) -> ScanBatch {
+        ScanBatch::new(bucket, make_batch(values), base_offset)
+    }
+
+    /// Run `filter_batches` with `(bucket id, stop offset)` pairs and an
+    /// initially empty buffer, returning everything the call touched.
+    fn run_filter(stops: &[(i32, i64)], batches: Vec<ScanBatch>) -> Outcome {
+        let mut offsets: HashMap<TableBucket, i64> = stops
+            .iter()
+            .map(|&(id, stop_at)| (bucket(id), stop_at))
+            .collect();
+        let mut buffer = VecDeque::new();
+        let completed = filter_batches(batches, &mut offsets, &mut buffer);
+        (offsets, buffer, completed)
+    }
+
+    #[test]
+    fn filter_batch_entirely_before_stop() {
+        let (offsets, buffer, completed) =
+            run_filter(&[(0, 100)], vec![make_scan_batch(bucket(0), 10, &[1, 2, 3])]);
+
+        assert_eq!(buffer.len(), 1);
+        assert_eq!(buffer[0].batch().num_rows(), 3);
+        assert!(offsets.contains_key(&bucket(0)));
+        assert!(completed.is_empty());
+    }
+
+    #[test]
+    fn filter_batch_crossing_stop_offset_is_sliced() {
+        // Rows occupy offsets 10..=14; with stop_at=12 only 10 and 11 survive.
+        let (offsets, buffer, completed) = run_filter(
+            &[(0, 12)],
+            vec![make_scan_batch(bucket(0), 10, &[1, 2, 3, 4, 5])],
+        );
+
+        assert_eq!(buffer.len(), 1);
+        assert_eq!(buffer[0].batch().num_rows(), 2);
+        assert!(!offsets.contains_key(&bucket(0)));
+        assert_eq!(completed, vec![bucket(0)]);
+    }
+
+    #[test]
+    fn filter_batch_at_or_after_stop_offset_is_skipped() {
+        // The batch starts exactly at the stop point, so nothing is emitted.
+        let (offsets, buffer, completed) =
+            run_filter(&[(0, 10)], vec![make_scan_batch(bucket(0), 10, &[1, 2, 3])]);
+
+        assert!(buffer.is_empty());
+        assert!(!offsets.contains_key(&bucket(0)));
+        assert_eq!(completed, vec![bucket(0)]);
+    }
+
+    #[test]
+    fn filter_batch_ending_exactly_at_stop_minus_one() {
+        // Rows occupy offsets 10..=12 and stop_at=13: the batch is kept whole
+        // and, since it ends on stop_at - 1, the bucket is finished.
+        let (offsets, buffer, completed) =
+            run_filter(&[(0, 13)], vec![make_scan_batch(bucket(0), 10, &[1, 2, 3])]);
+
+        assert_eq!(buffer.len(), 1);
+        assert_eq!(buffer[0].batch().num_rows(), 3);
+        assert!(!offsets.contains_key(&bucket(0)));
+        assert_eq!(completed, vec![bucket(0)]);
+    }
+
+    #[test]
+    fn filter_unknown_bucket_is_ignored() {
+        let (offsets, buffer, completed) =
+            run_filter(&[(0, 100)], vec![make_scan_batch(bucket(99), 0, &[1, 2])]);
+
+        assert!(buffer.is_empty());
+        assert!(offsets.contains_key(&bucket(0)));
+        assert!(completed.is_empty());
+    }
+
+    #[test]
+    fn filter_multiple_buckets_independent_tracking() {
+        let batches = vec![
+            // bucket 0: last=12, stop=12 -> two rows kept, bucket finished
+            make_scan_batch(bucket(0), 10, &[1, 2, 3]),
+            // bucket 1: last=2, stop=5 -> kept whole, still in progress
+            make_scan_batch(bucket(1), 0, &[10, 20, 30]),
+        ];
+        let (offsets, buffer, completed) = run_filter(&[(0, 12), (1, 5)], batches);
+
+        assert_eq!(buffer.len(), 2);
+        assert_eq!(buffer[0].batch().num_rows(), 2); // bucket 0: sliced
+        assert_eq!(buffer[1].batch().num_rows(), 3); // bucket 1: full
+        assert!(!offsets.contains_key(&bucket(0))); // bucket 0: done
+        assert!(offsets.contains_key(&bucket(1))); // bucket 1: still tracking
+        assert_eq!(completed, vec![bucket(0)]);
+    }
+
+    #[test]
+    fn filter_empty_batch_at_stop() {
+        // Zero-row batch whose base offset equals the stop point: the
+        // base >= stop branch finishes the bucket without emitting anything.
+        let (offsets, buffer, completed) =
+            run_filter(&[(0, 5)], vec![make_scan_batch(bucket(0), 5, &[])]);
+
+        assert!(buffer.is_empty());
+        assert!(!offsets.contains_key(&bucket(0)));
+        assert_eq!(completed, vec![bucket(0)]);
+    }
+
+    #[test]
+    fn filter_drops_empty_batch_before_stop() {
+        // Zero-row batch far below the stop point (base=5 -> last=4, stop=100).
+        // It takes the "keep whole batch" path but must never reach the
+        // consumer because it has no rows; the bucket stays in progress.
+        let (offsets, buffer, completed) =
+            run_filter(&[(0, 100)], vec![make_scan_batch(bucket(0), 5, &[])]);
+
+        assert!(buffer.is_empty());
+        assert!(offsets.contains_key(&bucket(0)));
+        assert!(completed.is_empty());
+    }
+
+    #[test]
+    fn filter_single_row_batch_before_stop() {
+        let (offsets, buffer, completed) =
+            run_filter(&[(0, 10)], vec![make_scan_batch(bucket(0), 5, &[42])]);
+
+        assert_eq!(buffer.len(), 1);
+        assert_eq!(buffer[0].batch().num_rows(), 1);
+        assert!(offsets.contains_key(&bucket(0)));
+        assert!(completed.is_empty());
+    }
+
+    #[test]
+    fn filter_single_row_batch_at_stop_boundary() {
+        // One row at offset 4 with stop=5: kept whole, and because it sits on
+        // stop - 1 the bucket is finished.
+        let (offsets, buffer, completed) =
+            run_filter(&[(0, 5)], vec![make_scan_batch(bucket(0), 4, &[42])]);
+
+        assert_eq!(buffer.len(), 1);
+        assert_eq!(buffer[0].batch().num_rows(), 1);
+        assert!(!offsets.contains_key(&bucket(0)));
+        assert_eq!(completed, vec![bucket(0)]);
+    }
+
+    #[test]
+    fn filter_preserves_scan_batch_metadata() {
+        let (_, buffer, _) =
+            run_filter(&[(3, 100)], vec![make_scan_batch(bucket(3), 42, &[1, 2])]);
+
+        let sb = &buffer[0];
+        assert_eq!(*sb.bucket(), bucket(3));
+        assert_eq!(sb.base_offset(), 42);
+    }
+
+    #[test]
+    fn filter_sliced_batch_preserves_base_offset() {
+        let (_, buffer, _) = run_filter(
+            &[(0, 12)],
+            vec![make_scan_batch(bucket(0), 10, &[1, 2, 3, 4, 5])],
+        );
+
+        let sb = &buffer[0];
+        assert_eq!(sb.base_offset(), 10);
+        assert_eq!(*sb.bucket(), bucket(0));
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/client/table/remote_log.rs b/fluss-rust/crates/fluss/src/client/table/remote_log.rs
new file mode 100644
index 0000000000..c48bdccabf
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/client/table/remote_log.rs
@@ -0,0 +1,1343 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use crate::client::credentials::CredentialsReceiver;
+use crate::error::{Error, Result};
+use crate::io::{FileIO, Storage};
+use crate::metadata::TableBucket;
+use crate::metrics::ScannerMetrics;
+use crate::proto::{PbRemoteLogFetchInfo, PbRemoteLogSegment};
+use futures::TryStreamExt;
+use parking_lot::Mutex;
+use std::{
+    cmp::{Ordering, Reverse},
+    collections::{BinaryHeap, HashMap},
+    future::Future,
+    io, mem,
+    path::{Path, PathBuf},
+    pin::Pin,
+    sync::Arc,
+    time::Duration,
+};
+
+#[cfg(test)]
+use std::{
+    env,
+    time::{SystemTime, UNIX_EPOCH},
+};
+use tempfile::TempDir;
+use tokio::io::AsyncWriteExt;
+use tokio::sync::{Notify, OwnedSemaphorePermit, Semaphore, mpsc, oneshot};
+use tokio::task::JoinSet;
+
+/// Default maximum number of remote log segments to prefetch
+/// Matches Java's CLIENT_SCANNER_REMOTE_LOG_PREFETCH_NUM (default: 4)
+pub const DEFAULT_SCANNER_REMOTE_LOG_PREFETCH_NUM: usize = 4;
+
+/// Default maximum concurrent remote log downloads
+/// Matches Java's REMOTE_FILE_DOWNLOAD_THREAD_NUM (default: 3)
+pub const DEFAULT_REMOTE_FILE_DOWNLOAD_THREAD_NUM: usize = 3;
+
+/// Initial retry backoff delay (milliseconds)
+/// Prevents hot-spin retry loops on persistent failures
+///
+/// `calculate_backoff_delay` multiplies this base by `2^retry_count`
+/// (exponent capped at 10) before applying the cap and jitter.
+const RETRY_BACKOFF_BASE_MS: u64 = 100;
+
+/// Maximum retry backoff delay (milliseconds)
+/// Caps exponential backoff to avoid excessive delays
+///
+/// The cap is applied before jitter, so with the +25% jitter bound the
+/// actual delay can reach 1.25x this value.
+const RETRY_BACKOFF_MAX_MS: u64 = 5_000;
+
+/// Maximum number of retries before giving up
+/// After this many retries, the download will fail permanently
+///
+/// `spawn_download_task` gives up once `retry_count + 1` exceeds this value.
+const MAX_RETRY_COUNT: u32 = 10;
+
+/// Compute the delay to wait before retry number `retry_count`.
+///
+/// The delay is `RETRY_BACKOFF_BASE_MS * 2^retry_count` (exponent clamped to 10),
+/// capped at `RETRY_BACKOFF_MAX_MS`, then scaled by a random factor in
+/// `[0.75, 1.25]` so that concurrent retries do not fire in lockstep.
+fn calculate_backoff_delay(retry_count: u32) -> tokio::time::Duration {
+    use rand::Rng;
+
+    // Clamp the exponent so the shift itself can never overflow.
+    let exponent = retry_count.min(10);
+    let uncapped_ms = RETRY_BACKOFF_BASE_MS.saturating_mul(1 << exponent);
+    let bounded_ms = uncapped_ms.min(RETRY_BACKOFF_MAX_MS);
+
+    // +/-25% jitter, applied after the cap.
+    let jitter_factor = rand::rng().random_range(0.75..=1.25);
+    let jittered_ms = ((bounded_ms as f64) * jitter_factor) as u64;
+
+    tokio::time::Duration::from_millis(jittered_ms)
+}
+
+/// Result of a fetch operation containing file path and size
+///
+/// Returned by `RemoteLogFetcher::fetch` and copied into a `RemoteLogFile`
+/// by `spawn_download_task` on success.
+#[derive(Debug)]
+pub struct FetchResult {
+    /// Path of the fetched segment file.
+    pub file_path: PathBuf,
+    /// Size of the fetched file in bytes (also reported to the fetch-bytes metric).
+    pub file_size: usize,
+}
+
+/// Trait for fetching remote log segments (allows dependency injection for testing)
+///
+/// Implementations must be `Send + Sync` because the fetcher is shared through
+/// an `Arc` by every spawned download task.
+pub trait RemoteLogFetcher: Send + Sync {
+    /// Start fetching the segment described by `request`.
+    ///
+    /// The returned future carries no borrow of `request`, so implementations
+    /// have to copy whatever they need out of it before returning. It resolves
+    /// to the fetched file's path and size, or to an error; one call is one
+    /// download attempt (retries are scheduled by the coordinator).
+    fn fetch(
+        &self,
+        request: &RemoteLogDownloadRequest,
+    ) -> Pin<Box<dyn Future<Output = Result<FetchResult>> + Send>>;
+}
+
+/// Represents a remote log segment that needs to be downloaded
+#[derive(Debug, Clone)]
+pub struct RemoteLogSegment {
+    /// Remote segment identifier; part of both the remote path and the local file name.
+    pub segment_id: String,
+    /// Start offset of the segment; zero-padded to 20 digits in file names and
+    /// used to order requests within one bucket.
+    pub start_offset: i64,
+    /// End offset as reported by the server (currently unused here).
+    #[allow(dead_code)]
+    pub end_offset: i64,
+    /// Segment size in bytes as reported by the server (currently unused here).
+    #[allow(dead_code)]
+    pub size_in_bytes: i32,
+    /// Bucket this segment belongs to.
+    pub table_bucket: TableBucket,
+    /// Maximum timestamp in the segment, or `-1` when the server did not send one;
+    /// used to order requests across different buckets.
+    pub max_timestamp: i64,
+}
+
+impl RemoteLogSegment {
+    /// Build a segment descriptor from its protobuf form, attaching the bucket
+    /// it belongs to.
+    ///
+    /// A missing `max_timestamp` becomes `-1`, matching Java's behavior
+    /// (Java: CommonRpcMessageUtils.java:171-174).
+    pub fn from_proto(segment: &PbRemoteLogSegment, table_bucket: TableBucket) -> Self {
+        let segment_id = segment.remote_log_segment_id.clone();
+        let max_timestamp = segment.max_timestamp.unwrap_or(-1);
+
+        Self {
+            segment_id,
+            start_offset: segment.remote_log_start_offset,
+            end_offset: segment.remote_log_end_offset,
+            size_in_bytes: segment.segment_size_in_bytes,
+            table_bucket,
+            max_timestamp,
+        }
+    }
+
+    /// Name of the local file this segment is downloaded to:
+    /// `${remote_segment_id}_${start_offset zero-padded to 20 digits}.log`.
+    pub fn local_file_name(&self) -> String {
+        let padded_offset = format!("{:020}", self.start_offset);
+        let segment_id = &self.segment_id;
+        format!("{}_{}.log", segment_id, padded_offset)
+    }
+}
+
+/// Represents remote log fetch information
+#[derive(Debug, Clone)]
+pub struct RemoteLogFetchInfo {
+    /// Remote directory of the tablet's log; the prefix of every segment's remote path.
+    pub remote_log_tablet_dir: String,
+    /// Partition name as sent by the server, if any (currently unused here).
+    #[allow(dead_code)]
+    pub partition_name: Option<String>,
+    /// Segments to download, in the order the server listed them.
+    pub remote_log_segments: Vec<RemoteLogSegment>,
+    /// `first_start_pos` from the server response, `0` when absent.
+    // NOTE(review): presumably the byte position to start reading from in the
+    // first segment — confirm against the consumer of this field.
+    pub first_start_pos: i32,
+}
+
+impl RemoteLogFetchInfo {
+    /// Convert the protobuf fetch info into its client-side form.
+    ///
+    /// Every segment is tagged with a clone of `table_bucket`; a missing
+    /// `first_start_pos` defaults to `0`.
+    pub fn from_proto(info: &PbRemoteLogFetchInfo, table_bucket: TableBucket) -> Self {
+        let mut remote_log_segments = Vec::with_capacity(info.remote_log_segments.len());
+        for pb_segment in &info.remote_log_segments {
+            remote_log_segments.push(RemoteLogSegment::from_proto(
+                pb_segment,
+                table_bucket.clone(),
+            ));
+        }
+
+        let first_start_pos = info.first_start_pos.unwrap_or(0);
+
+        Self {
+            remote_log_tablet_dir: info.remote_log_tablet_dir.clone(),
+            partition_name: info.partition_name.clone(),
+            remote_log_segments,
+            first_start_pos,
+        }
+    }
+}
+
+/// RAII guard for prefetch permit that notifies coordinator on drop
+///
+/// NOTE: File deletion is now handled by FileSource::drop(), not here.
+/// This ensures the file is closed before deletion
+#[derive(Debug)]
+pub struct PrefetchPermit {
+    /// The held prefetch-semaphore slot; wrapped in `Option` so `Drop` can
+    /// release it explicitly before sending the notification.
+    permit: Option<OwnedSemaphorePermit>,
+    /// Notifier shared with the download coordinator; signalled after the slot is released.
+    recycle_notify: Arc<Notify>,
+}
+
+impl PrefetchPermit {
+    /// Wrap an acquired semaphore permit together with the notifier that is
+    /// signalled when the permit is eventually released.
+    fn new(permit: OwnedSemaphorePermit, recycle_notify: Arc<Notify>) -> Self {
+        let permit = Some(permit);
+        Self {
+            permit,
+            recycle_notify,
+        }
+    }
+}
+
+impl Drop for PrefetchPermit {
+    fn drop(&mut self) {
+        // Order matters: give the slot back to the semaphore first...
+        drop(self.permit.take());
+
+        // ...and only then wake the coordinator, so the permit is already
+        // available when it tries to acquire one.
+        self.recycle_notify.notify_one();
+    }
+}
+
+/// Downloaded remote log file with prefetch permit
+/// File remains on disk for memory efficiency; file deletion is handled by FileCleanupGuard in FileSource
+///
+/// Holding this value keeps one prefetch slot occupied: the slot is released
+/// only when `permit` is dropped.
+#[derive(Debug)]
+pub struct RemoteLogFile {
+    /// Path to the downloaded file on local disk
+    pub file_path: PathBuf,
+    /// Size of the file in bytes
+    /// Currently unused but kept for potential future use (logging, metrics, etc.)
+    #[allow(dead_code)]
+    pub file_size: usize,
+    /// RAII permit that releases prefetch semaphore slot and notifies coordinator when dropped
+    pub permit: PrefetchPermit,
+}
+
+/// Represents a request to download a remote log segment with priority ordering
+///
+/// Requests live in the coordinator's priority queue; their `Ord` impl decides
+/// which one is started next.
+#[derive(Debug)]
+pub struct RemoteLogDownloadRequest {
+    /// Segment to download.
+    segment: RemoteLogSegment,
+    /// Remote directory of the tablet's log, used to build the remote path.
+    remote_log_tablet_dir: String,
+    /// Channel on which the final outcome is delivered; a closed channel is
+    /// treated as cancellation.
+    result_sender: oneshot::Sender<Result<RemoteLogFile>>,
+    /// Number of retries scheduled so far (`0` for a fresh request).
+    retry_count: u32,
+    /// Earliest instant at which the next attempt may start; `None` until the
+    /// first failure.
+    next_retry_at: Option<tokio::time::Instant>,
+}
+
+impl RemoteLogDownloadRequest {
+    /// Get the segment (used by test fetcher implementations)
+    ///
+    /// Only compiled for tests; code in this module reads the private field directly.
+    #[cfg(test)]
+    pub fn segment(&self) -> &RemoteLogSegment {
+        &self.segment
+    }
+}
+
+// Ordering used by the coordinator's priority queue (Rust requirement: cmp==Equal implies Eq).
+// Primary key follows Java semantics: start_offset within one bucket, max_timestamp across buckets.
+// Tie-breakers: table_bucket fields (table_id, partition_id, bucket_id), then segment_id.
+// Only `segment` takes part; retry state, sender and tablet dir are ignored.
+// NOTE(review): mixing the two primary keys is only transitive if max_timestamp never
+// decreases as start_offset grows within a bucket — confirm that the server guarantees this.
+impl Ord for RemoteLogDownloadRequest {
+    fn cmp(&self, other: &Self) -> Ordering {
+        let (lhs, rhs) = (&self.segment, &other.segment);
+
+        if lhs.table_bucket != rhs.table_bucket {
+            // Different buckets: older max_timestamp first, then the bucket
+            // identity fields and finally segment_id to break ties.
+            let (lhs_bucket, rhs_bucket) = (&lhs.table_bucket, &rhs.table_bucket);
+            return lhs
+                .max_timestamp
+                .cmp(&rhs.max_timestamp)
+                .then_with(|| lhs_bucket.table_id().cmp(&rhs_bucket.table_id()))
+                .then_with(|| lhs_bucket.partition_id().cmp(&rhs_bucket.partition_id()))
+                .then_with(|| lhs_bucket.bucket_id().cmp(&rhs_bucket.bucket_id()))
+                .then_with(|| lhs.segment_id.cmp(&rhs.segment_id));
+        }
+
+        // Same bucket: earlier start_offset first, segment_id as tie-breaker.
+        lhs.start_offset
+            .cmp(&rhs.start_offset)
+            .then_with(|| lhs.segment_id.cmp(&rhs.segment_id))
+    }
+}
+
+// Delegates to `Ord::cmp`, so the partial order is always `Some` and agrees
+// with the ordering used by the priority queue.
+impl PartialOrd for RemoteLogDownloadRequest {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+// Equality is defined through the ordering: two requests are equal exactly
+// when `cmp` returns `Equal`, which keeps `Eq` and `Ord` consistent.
+impl PartialEq for RemoteLogDownloadRequest {
+    fn eq(&self, other: &Self) -> bool {
+        matches!(self.cmp(other), Ordering::Equal)
+    }
+}
+
+// Marker impl required by `Ord`; equality itself comes from `PartialEq` above.
+impl Eq for RemoteLogDownloadRequest {}
+
+/// Result of a download task
+///
+/// Produced by `spawn_download_task` and handled by `coordinator_loop`, which
+/// is the only place that talks to the requester's channel or the queue.
+enum DownloadResult {
+    /// Successful download - deliver result to future
+    Success {
+        /// Downloaded file, still holding its prefetch permit.
+        result: RemoteLogFile,
+        /// Channel of the original request.
+        result_sender: oneshot::Sender<Result<RemoteLogFile>>,
+    },
+    /// Download failed - re-queue request for retry (Java pattern)
+    /// The request already carries the updated `retry_count` / `next_retry_at`.
+    FailedRetry { request: RemoteLogDownloadRequest },
+    /// Download failed permanently after max retries - fail the future
+    FailedPermanently {
+        /// Error to hand to the requester.
+        error: Error,
+        /// Channel of the original request.
+        result_sender: oneshot::Sender<Result<RemoteLogFile>>,
+    },
+    /// Cancelled - don't deliver, don't re-queue
+    Cancelled,
+}
+
+/// Production implementation of RemoteLogFetcher that downloads from actual storage
+struct ProductionFetcher {
+    /// Source of remote filesystem properties; `None` in the channel means
+    /// they have not been fetched yet.
+    credentials_rx: CredentialsReceiver,
+    /// Temporary directory that downloaded segment files are written into.
+    local_log_dir: Arc<TempDir>,
+    /// Read concurrency forwarded to `RemoteLogDownloader::download_file`.
+    remote_log_read_concurrency: usize,
+}
+
+impl RemoteLogFetcher for ProductionFetcher {
+    /// Download one segment into the local log directory.
+    ///
+    /// Steps performed by the returned future:
+    /// 1. derive the local and remote paths from the segment,
+    /// 2. obtain remote filesystem properties, waiting for the first update of
+    ///    the credentials channel if none have been published yet,
+    /// 3. download the remote file to the local path,
+    /// 4. read the resulting file's size.
+    ///
+    /// Errors: a closed credentials channel (reported as `BrokenPipe`), missing
+    /// credentials after a change notification, and any download or metadata failure.
+    fn fetch(
+        &self,
+        request: &RemoteLogDownloadRequest,
+    ) -> Pin<Box<dyn Future<Output = Result<FetchResult>> + Send>> {
+        // Take owned copies of everything the future needs so that it does not
+        // borrow from `self` or `request`.
+        let segment = request.segment.clone();
+        let remote_log_tablet_dir = request.remote_log_tablet_dir.to_string();
+        let local_log_dir = self.local_log_dir.clone();
+        let remote_log_read_concurrency = self.remote_log_read_concurrency;
+        let mut credentials_rx = self.credentials_rx.clone();
+
+        Box::pin(async move {
+            // Local destination: <local_log_dir>/<segment file name>
+            let local_file_path = local_log_dir.path().join(segment.local_file_name());
+
+            // Remote source: <tablet_dir>/<segment_id>/<zero-padded start offset>.log
+            let offset_prefix = format!("{:020}", segment.start_offset);
+            let remote_path = format!(
+                "{}/{}/{}.log",
+                remote_log_tablet_dir, segment.segment_id, offset_prefix
+            );
+
+            // Credentials come from a watch channel:
+            // - None = not yet fetched, wait for the first update
+            // - Some(props) = fetched (may be empty if no auth needed)
+            let cached_props = credentials_rx.borrow().clone();
+            let remote_fs_props = if let Some(props) = cached_props {
+                props
+            } else {
+                log::info!("Waiting for credentials to be available...");
+
+                // A dropped sender (e.g. during shutdown) is surfaced as an error
+                // rather than silently continuing with empty credentials.
+                credentials_rx.changed().await.map_err(|e| {
+                    io::Error::new(
+                        io::ErrorKind::BrokenPipe,
+                        format!(
+                            "credentials manager shut down before credentials were obtained: {e}"
+                        ),
+                    )
+                })?;
+
+                // A change notification without credentials is also an error:
+                // defaulting to an empty map could break auth flows.
+                let refreshed_props = credentials_rx.borrow().clone();
+                refreshed_props.ok_or_else(|| Error::UnexpectedError {
+                    message: "credentials not available after watch notification".to_string(),
+                    source: None,
+                })?
+            };
+
+            // Download file to disk (streaming, no memory spike)
+            let file_path = RemoteLogDownloader::download_file(
+                &remote_log_tablet_dir,
+                &remote_path,
+                &local_file_path,
+                &remote_fs_props,
+                remote_log_read_concurrency,
+            )
+            .await?;
+
+            // Size of what actually landed on disk.
+            let file_size = tokio::fs::metadata(&file_path).await?.len() as usize;
+
+            // The file itself stays on disk; only its location and size are returned.
+            Ok(FetchResult {
+                file_path,
+                file_size,
+            })
+        })
+    }
+}
+
+/// Coordinator that owns all download state and orchestrates downloads
+///
+/// A single instance is moved into `coordinator_loop`, so none of the plain
+/// fields are accessed concurrently.
+struct DownloadCoordinator {
+    /// Pending requests; `Reverse` turns the max-heap into a min-heap so the
+    /// smallest request under `Ord` is popped first.
+    download_queue: BinaryHeap<Reverse<RemoteLogDownloadRequest>>,
+    /// Download tasks that have been spawned and not yet joined.
+    active_downloads: JoinSet<DownloadResult>,
+    /// Number of tasks in `active_downloads`; incremented on spawn, decremented on join.
+    in_flight: usize,
+    /// Limits how many segments are downloading or downloaded-but-unconsumed;
+    /// a slot is freed when the task fails or the `PrefetchPermit` is dropped.
+    prefetch_semaphore: Arc<Semaphore>,
+    /// Upper bound on `in_flight`.
+    max_concurrent_downloads: usize,
+    /// Signalled by `PrefetchPermit::drop` after a prefetch slot is released.
+    recycle_notify: Arc<Notify>,
+    /// Performs the actual download; injectable for tests.
+    fetcher: Arc<dyn RemoteLogFetcher>,
+    /// Per-table scanner metric handles cloned by every spawned download
+    /// task to attribute remote-fetch metrics to the owning scanner's
+    /// `(database, table)`.
+    metrics: Arc<ScannerMetrics>,
+}
+
+impl DownloadCoordinator {
+    /// Whether the event loop should listen for permit-recycle notifications.
+    ///
+    /// True whenever there is queued work and the concurrency limit leaves room
+    /// to start it; in that state a freed prefetch slot may be what is needed.
+    ///
+    /// This deliberately does NOT check `available_permits() == 0`. That check
+    /// runs after `drain()` has already failed to acquire a permit. If a permit
+    /// was released in between, the semaphore is non-empty by the time the
+    /// check runs, the recycle branch gets disabled, and the notification
+    /// stored by `notify_one()` is never consumed. The queued request would
+    /// then sit idle until some unrelated event woke the loop. Listening
+    /// unconditionally costs at most one spurious wake-up per notification.
+    fn should_wait_for_recycle(&self) -> bool {
+        !self.download_queue.is_empty() && self.in_flight < self.max_concurrent_downloads
+    }
+
+    /// Find the earliest retry deadline among pending requests
+    ///
+    /// Returns `None` when no queued request has a retry deadline. The deadline
+    /// may already lie in the past (a retry that is due but has not been
+    /// started yet). Scans the whole queue on every call.
+    fn next_retry_deadline(&self) -> Option<tokio::time::Instant> {
+        self.download_queue
+            .iter()
+            .filter_map(|Reverse(req)| req.next_retry_at)
+            .min()
+    }
+}
+
+impl DownloadCoordinator {
+    /// Start as many queued downloads as current capacity allows.
+    ///
+    /// Requests are popped in priority order. A request is skipped when its
+    /// retry backoff has not elapsed (it is put back afterwards) or when its
+    /// receiver is gone (it is discarded). Scanning stops when the queue is
+    /// empty, the concurrency limit is reached, no prefetch permit is free, or
+    /// the per-call scan budget is used up.
+    fn drain(&mut self) {
+        // Requests still in backoff; returned to the queue once the scan is over.
+        let mut postponed = Vec::new();
+        // Look at each queued request at most once, and never more than 100 per call.
+        let scan_limit = self.download_queue.len().min(100);
+        let mut examined = 0;
+
+        loop {
+            let out_of_work = self.download_queue.is_empty();
+            let at_concurrency_limit = self.in_flight >= self.max_concurrent_downloads;
+            if out_of_work || at_concurrency_limit || examined >= scan_limit {
+                break;
+            }
+
+            // Reserve a prefetch slot up front (non-blocking); stop if none is free.
+            let Ok(permit) = self.prefetch_semaphore.clone().try_acquire_owned() else {
+                break;
+            };
+
+            // Highest-priority request = smallest under `Ord`, thanks to `Reverse`.
+            let Some(Reverse(request)) = self.download_queue.pop() else {
+                drop(permit);
+                break;
+            };
+            examined += 1;
+
+            // Backoff not over yet: give the slot back and keep scanning so that
+            // one waiting retry does not block requests behind it.
+            let in_backoff = request
+                .next_retry_at
+                .is_some_and(|retry_at| retry_at > tokio::time::Instant::now());
+            if in_backoff {
+                drop(permit);
+                postponed.push(request);
+                continue;
+            }
+
+            // Nobody is waiting for the result any more: discard the request.
+            if request.result_sender.is_closed() {
+                drop(permit);
+                continue;
+            }
+
+            // Hand the request, its permit and shared handles to a new task.
+            let fetcher = self.fetcher.clone();
+            let recycle_notify = self.recycle_notify.clone();
+            let metrics = Arc::clone(&self.metrics);
+            self.active_downloads.spawn(spawn_download_task(
+                request,
+                permit,
+                fetcher,
+                recycle_notify,
+                metrics,
+            ));
+            self.in_flight += 1;
+        }
+
+        // Postponed requests go back into the heap, which restores their priority order.
+        for request in postponed {
+            self.download_queue.push(Reverse(request));
+        }
+    }
+}
+
+/// Run a single download attempt for `request`.
+/// Matches Java's RemoteLogDownloader.java
+///
+/// The attempt is made exactly once; retrying is the coordinator's job. This
+/// keeps a failing segment from occupying a prefetch slot while it backs off
+/// and lets other segments make progress in the meantime.
+///
+/// Outcomes:
+/// - `Cancelled` if the receiver is gone before the attempt or after a failure,
+/// - `Success` carrying the file and the still-held permit,
+/// - `FailedRetry` with updated retry state while retries remain,
+/// - `FailedPermanently` once `retry_count + 1` exceeds `MAX_RETRY_COUNT`.
+///
+/// On every non-success outcome the permit is released before returning.
+async fn spawn_download_task(
+    request: RemoteLogDownloadRequest,
+    permit: tokio::sync::OwnedSemaphorePermit,
+    fetcher: Arc<dyn RemoteLogFetcher>,
+    recycle_notify: Arc<Notify>,
+    metrics: Arc<ScannerMetrics>,
+) -> DownloadResult {
+    // Nobody is listening any more: skip the download entirely.
+    if request.result_sender.is_closed() {
+        drop(permit);
+        return DownloadResult::Cancelled;
+    }
+
+    // Java reference: RemoteLogDownloader.java increments `remoteFetchRequestCount`
+    // immediately before initiating the download, so every retry counts as its
+    // own request.
+    metrics.record_remote_fetch_request();
+
+    // Exactly one attempt. Success returns right here; failure falls through.
+    let e = match fetcher.fetch(&request).await {
+        Ok(fetched) => {
+            metrics.record_remote_fetch_bytes(fetched.file_size as u64);
+            let FetchResult {
+                file_path,
+                file_size,
+            } = fetched;
+            // The permit travels with the file and is released when the file is dropped.
+            return DownloadResult::Success {
+                result: RemoteLogFile {
+                    file_path,
+                    file_size,
+                    permit: PrefetchPermit::new(permit, recycle_notify.clone()),
+                },
+                result_sender: request.result_sender,
+            };
+        }
+        Err(e) => e,
+    };
+
+    // Failed, but the receiver went away meanwhile: release the slot, no retry.
+    if request.result_sender.is_closed() {
+        drop(permit);
+        return DownloadResult::Cancelled;
+    }
+
+    // Errors are counted per attempt, so each retry contributes one.
+    metrics.record_remote_fetch_error();
+    let retry_count = request.retry_count + 1;
+
+    if retry_count <= MAX_RETRY_COUNT {
+        // Schedule another attempt after an exponential backoff.
+        let backoff_delay = calculate_backoff_delay(retry_count);
+        let next_retry_at = tokio::time::Instant::now() + backoff_delay;
+
+        log::warn!(
+            "Failed to download remote log segment {}: {}. Retry {}/{} after {:?}",
+            request.segment.segment_id,
+            e,
+            retry_count,
+            MAX_RETRY_COUNT,
+            backoff_delay
+        );
+        drop(permit); // Release immediately so the slot is usable during the backoff.
+
+        // Same request (and same result channel) with updated retry state; the
+        // requester's future stays pending until a later attempt resolves it.
+        return DownloadResult::FailedRetry {
+            request: RemoteLogDownloadRequest {
+                retry_count,
+                next_retry_at: Some(next_retry_at),
+                ..request
+            },
+        };
+    }
+
+    // Retry budget exhausted: report a permanent failure to the requester.
+    log::error!(
+        "Failed to download remote log segment {} after {} retries: {}. Giving up.",
+        request.segment.segment_id,
+        retry_count,
+        e
+    );
+    drop(permit); // Release immediately
+
+    DownloadResult::FailedPermanently {
+        error: Error::UnexpectedError {
+            message: format!(
+                "Failed to download remote log segment after {retry_count} retries: {e}"
+            ),
+            source: Some(Box::new(e)),
+        },
+        result_sender: request.result_sender,
+    }
+}
+
+/// Coordinator event loop - owns all download state and reacts to events
+///
+/// Each iteration first calls `drain()` to start whatever can run, then waits
+/// for exactly one of:
+/// 1. a new request on `request_receiver` (pushed onto the priority queue),
+/// 2. a finished download task (result delivered, re-queued or dropped),
+/// 3. a prefetch-permit recycle notification (only while
+///    `should_wait_for_recycle()` holds),
+/// 4. the retry timer for the earliest deferred request.
+///
+/// The loop ends through the `else` arm once every branch is disabled.
+async fn coordinator_loop(
+    mut coordinator: DownloadCoordinator,
+    mut request_receiver: mpsc::UnboundedReceiver<RemoteLogDownloadRequest>,
+) {
+    loop {
+        // Drain once at start of iteration to process ready work
+        coordinator.drain();
+
+        // How long to wait before looking at deferred retries again
+        // (`None` when no queued request carries a retry deadline).
+        let next_retry_sleep = coordinator.next_retry_deadline().map(|deadline| {
+            let remaining = deadline.saturating_duration_since(tokio::time::Instant::now());
+            if !remaining.is_zero() {
+                // Backoff still running: sleep exactly until it expires.
+                return remaining;
+            }
+
+            // The deadline has passed, yet drain() (which just ran) left the
+            // request in the queue.
+            let has_capacity = coordinator.in_flight < coordinator.max_concurrent_downloads
+                && coordinator.prefetch_semaphore.available_permits() > 0;
+            if has_capacity {
+                // Capacity exists, so the deadline expired right after drain()
+                // looked at it: go around again immediately.
+                Duration::ZERO
+            } else {
+                // Blocked on permits or on the concurrency limit. A zero-length
+                // sleep here would make the loop spin (drain + full queue scan
+                // per iteration) until capacity frees up. Capacity changes are
+                // normally reported by events 2 and 3; this timer is only a
+                // slow fallback poll.
+                Duration::from_millis(RETRY_BACKOFF_BASE_MS)
+            }
+        });
+
+        tokio::select! {
+            // Event 1: NewRequest
+            Some(request) = request_receiver.recv() => {
+                coordinator.download_queue.push(Reverse(request));
+                // Immediately try to start this download
+                continue;
+            }
+
+            // Event 2: DownloadFinished
+            Some(result) = coordinator.active_downloads.join_next() => {
+                coordinator.in_flight -= 1;
+
+                match result {
+                    Ok(DownloadResult::Success { result, result_sender }) => {
+                        // Success - deliver result to future
+                        if !result_sender.is_closed() {
+                            let _ = result_sender.send(Ok(result));
+                        }
+                        // Permit held in RemoteLogFile until consumed
+                    }
+                    Ok(DownloadResult::FailedRetry { request }) => {
+                        // Re-queue immediately (don't block coordinator with sleep)
+                        // The retry time will be checked in drain() before processing
+                        // (Java line 177: segmentsToFetch.add(request))
+                        // Permit already released (Java line 174)
+                        coordinator.download_queue.push(Reverse(request));
+                    }
+                    Ok(DownloadResult::FailedPermanently { error, result_sender }) => {
+                        // Permanent failure - deliver error to future
+                        if !result_sender.is_closed() {
+                            let _ = result_sender.send(Err(error));
+                        }
+                        // Permit already released
+                    }
+                    Ok(DownloadResult::Cancelled) => {
+                        // Cancelled - permit already released, nothing to do
+                    }
+                    Err(e) => {
+                        log::error!("Download task panicked: {e:?}");
+                        // Permit already released via RAII
+                    }
+                }
+                // Immediately try to start another download
+                continue;
+            }
+
+            // Event 3: Recycled (only wait when blocked on permits with pending work)
+            _ = coordinator.recycle_notify.notified(),
+                if coordinator.should_wait_for_recycle() => {
+                // Wake up to try draining
+                continue;
+            }
+
+            // Event 4: Retry timer - wake up when next retry is ready
+            // (the 3600s value is never used: the branch is disabled when there is no deadline)
+            _ = tokio::time::sleep(next_retry_sleep.unwrap_or(tokio::time::Duration::from_secs(3600))),
+                if next_retry_sleep.is_some() => {
+                // Wake up to retry deferred requests
+                continue;
+            }
+
+            else => break,  // All channels closed AND no work pending
+        }
+    }
+}
+
+/// Callback invoked once when a remote log download finishes (successfully or not).
+/// It may run on the background task that awaits the download, hence `Send + Sync`.
+type CompletionCallback = Box<dyn Fn() + Send + Sync>;
+
+/// Future for a remote log download request
+///
+/// Not a `std::future::Future`: completion is observed by polling `is_done()`
+/// or by registering a callback with `on_complete()`.
+pub struct RemoteLogDownloadFuture {
+    /// Download outcome; `None` until the download finishes and again after
+    /// `take_remote_log_file()` has removed it.
+    result: Arc<Mutex<Option<Result<RemoteLogFile>>>>,
+    /// Callbacks registered before completion; emptied when they are invoked.
+    completion_callbacks: Arc<Mutex<Vec<CompletionCallback>>>,
+}
+
+impl RemoteLogDownloadFuture {
+    /// Create a future fed by `receiver`.
+    ///
+    /// Spawns a background task (so a Tokio runtime must be running) that waits
+    /// for the download outcome, stores it, and then runs every callback
+    /// registered so far. A dropped sender is recorded as an error.
+    pub fn new(receiver: oneshot::Receiver<Result<RemoteLogFile>>) -> Self {
+        let result = Arc::new(Mutex::new(None));
+        let completion_callbacks: Arc<Mutex<Vec<CompletionCallback>>> =
+            Arc::new(Mutex::new(Vec::new()));
+
+        let result_slot = Arc::clone(&result);
+        let registered_callbacks = Arc::clone(&completion_callbacks);
+
+        tokio::spawn(async move {
+            // Flatten "channel closed" into the same error type as a failed download.
+            let outcome = receiver.await.unwrap_or_else(|e| {
+                Err(Error::UnexpectedError {
+                    message: format!("Download & Read future cancelled: {e:?}"),
+                    source: None,
+                })
+            });
+
+            // Publish the outcome first...
+            *result_slot.lock() = Some(outcome);
+
+            // ...then take the callbacks out under the lock and run them after
+            // releasing it. Callbacks registered from now on see the stored
+            // outcome and are invoked directly by on_complete().
+            let ready_callbacks: Vec<CompletionCallback> =
+                mem::take(&mut *registered_callbacks.lock());
+            ready_callbacks.into_iter().for_each(|callback| callback());
+        });
+
+        Self {
+            result,
+            completion_callbacks,
+        }
+    }
+
+    /// Register a callback to be called when download completes (similar to Java's onComplete)
+    ///
+    /// If an outcome is already stored, the callback runs immediately on the
+    /// calling thread; otherwise it is queued for the background task.
+    pub fn on_complete<F>(&self, callback: F)
+    where
+        F: Fn() + Send + Sync + 'static,
+    {
+        // Hold the callbacks lock across the check so the background task cannot
+        // take the callback list between "not done yet" and the push below.
+        let mut pending_callbacks = self.completion_callbacks.lock();
+        let still_running = self.result.lock().is_none();
+
+        if still_running {
+            // The background task will find this callback when it takes the list.
+            pending_callbacks.push(Box::new(callback));
+            return;
+        }
+
+        // Already completed: release the lock before running user code.
+        drop(pending_callbacks);
+        callback();
+    }
+
+    /// Whether an outcome is currently stored.
+    ///
+    /// Becomes `true` when the download finishes and `false` again once
+    /// `take_remote_log_file()` has removed the outcome.
+    pub fn is_done(&self) -> bool {
+        let stored = self.result.lock();
+        stored.is_some()
+    }
+
+    /// Take the RemoteLogFile (including the permit) from this future
+    /// This should only be called when the download is complete
+    /// This is the correct way to consume the download - it transfers permit ownership
+    ///
+    /// The stored outcome is removed either way, so a second call (or a call
+    /// before completion) returns an error.
+    pub fn take_remote_log_file(&self) -> Result<RemoteLogFile> {
+        let Some(outcome) = self.result.lock().take() else {
+            return Err(Error::IoUnexpectedError {
+                message: "Remote log file already taken or not ready".to_string(),
+                source: io::Error::other("Remote log file already taken or not ready"),
+            });
+        };
+
+        // A download error is re-wrapped as an I/O error carrying its message.
+        outcome.map_err(|e| {
+            let error_msg = format!("{e}");
+            Error::IoUnexpectedError {
+                message: format!("Fail to get remote log file: {error_msg}"),
+                source: io::Error::other(error_msg),
+            }
+        })
+    }
+}
+
+/// Downloader for remote log segment files.
+///
+/// Requests are forwarded over an unbounded channel to a coordinator task that
+/// is spawned when the downloader is constructed.
+///
+/// # Shutdown behavior
+///
+/// When the downloader is dropped, the request channel closes, signaling the coordinator
+/// to stop accepting new work. The coordinator will finish any in-flight downloads but
+/// won't wait for completion. Pending futures will fail.
+pub struct RemoteLogDownloader {
+    /// Sending half of the request channel to the coordinator task.
+    // NOTE(review): presumably an `Option` so the sender can be dropped
+    // explicitly on shutdown — confirm against the rest of the impl.
+    request_sender: Option<mpsc::UnboundedSender<RemoteLogDownloadRequest>>,
+}
+
+impl RemoteLogDownloader {
+    /// Creates a downloader backed by a `ProductionFetcher`.
+    ///
+    /// `local_log_dir`, `credentials_rx` and `remote_log_read_concurrency` go into the
+    /// fetcher; the remaining arguments are forwarded to [`Self::new_with_fetcher`].
+    pub(crate) fn new(
+        local_log_dir: TempDir,
+        max_prefetch_segments: usize,
+        max_concurrent_downloads: usize,
+        remote_log_read_concurrency: usize,
+        credentials_rx: CredentialsReceiver,
+        metrics: Arc<ScannerMetrics>,
+    ) -> Result<Self> {
+        let production_fetcher = ProductionFetcher {
+            credentials_rx,
+            local_log_dir: Arc::new(local_log_dir),
+            remote_log_read_concurrency,
+        };
+
+        Self::new_with_fetcher(
+            Arc::new(production_fetcher),
+            max_prefetch_segments,
+            max_concurrent_downloads,
+            metrics,
+        )
+    }
+
+    /// Creates a downloader around an arbitrary fetcher (used by tests).
+    ///
+    /// Spawns the coordinator task with `tokio::spawn`; the returned downloader only
+    /// keeps the sending half of the request channel.
+    pub(crate) fn new_with_fetcher(
+        fetcher: Arc<dyn RemoteLogFetcher>,
+        max_prefetch_segments: usize,
+        max_concurrent_downloads: usize,
+        metrics: Arc<ScannerMetrics>,
+    ) -> Result<Self> {
+        let (request_sender, request_receiver) = mpsc::unbounded_channel();
+
+        // The coordinator task exits once `request_sender` has been dropped.
+        tokio::spawn(coordinator_loop(
+            DownloadCoordinator {
+                download_queue: BinaryHeap::new(),
+                active_downloads: JoinSet::new(),
+                in_flight: 0,
+                prefetch_semaphore: Arc::new(Semaphore::new(max_prefetch_segments)),
+                max_concurrent_downloads,
+                recycle_notify: Arc::new(Notify::new()),
+                fetcher,
+                metrics,
+            },
+            request_receiver,
+        ));
+
+        Ok(Self {
+            request_sender: Some(request_sender),
+        })
+    }
+
+    /// Queues a download of `segment` and returns a future for its result without blocking.
+    ///
+    /// If the coordinator has already shut down, the returned future is resolved
+    /// immediately with an `UnexpectedError`.
+    pub fn request_remote_log(
+        &self,
+        remote_log_tablet_dir: &str,
+        segment: &RemoteLogSegment,
+    ) -> RemoteLogDownloadFuture {
+        let (result_sender, result_receiver) = oneshot::channel();
+
+        let request = RemoteLogDownloadRequest {
+            segment: segment.clone(),
+            remote_log_tablet_dir: remote_log_tablet_dir.to_string(),
+            result_sender,
+            retry_count: 0,
+            next_retry_at: None,
+        };
+
+        // Hand the request to the coordinator; sending on an unbounded channel never blocks.
+        let coordinator_gone = match self.request_sender.as_ref() {
+            Some(sender) => sender.send(request).is_err(),
+            None => false,
+        };
+
+        if !coordinator_gone {
+            return RemoteLogDownloadFuture::new(result_receiver);
+        }
+
+        // Nobody will ever answer on the original channel, so fail through a fresh one.
+        let (error_sender, error_receiver) = oneshot::channel();
+        let _ = error_sender.send(Err(Error::UnexpectedError {
+            message: "RemoteLogDownloader coordinator has shut down".to_string(),
+            source: None,
+        }));
+        RemoteLogDownloadFuture::new(error_receiver)
+    }
+}
+
+impl Drop for RemoteLogDownloader {
+    /// Closes the request channel by releasing the sender.
+    fn drop(&mut self) {
+        // Clearing the slot drops the sender, which closes the request channel.
+        // `request_receiver.recv()` then yields `None`, and the coordinator task
+        // winds down on its own once it notices the closed channel.
+        self.request_sender = None;
+    }
+}
+
+impl RemoteLogDownloader {
+    /// Download a file from remote storage to local using streaming read/write.
+    ///
+    /// `remote_log_tablet_dir` selects the storage backend: a value without `://` is
+    /// treated as a local path and prefixed with `file://`. `remote_fs_props` is only
+    /// applied for `s3://`, `s3a://` and `oss://` locations. Returns `local_path` as an
+    /// owned `PathBuf` on success.
+    async fn download_file(
+        remote_log_tablet_dir: &str,
+        remote_path: &str,
+        local_path: &Path,
+        remote_fs_props: &HashMap<String, String>,
+        remote_log_read_concurrency: usize,
+    ) -> Result<PathBuf> {
+        // Handle both URL (e.g., "s3://bucket/path") and local file paths
+        // If the path doesn't contain "://", treat it as a local file path
+        let remote_log_tablet_dir_url = if remote_log_tablet_dir.contains("://") {
+            remote_log_tablet_dir.to_string()
+        } else {
+            format!("file://{remote_log_tablet_dir}")
+        };
+
+        // Create FileIO from the remote log tablet dir URL to get the storage
+        let file_io_builder = FileIO::from_url(&remote_log_tablet_dir_url)?;
+
+        // For S3/S3A/OSS URLs, inject the credentials/config carried in the props
+        let file_io_builder = if remote_log_tablet_dir.starts_with("s3://")
+            || remote_log_tablet_dir.starts_with("s3a://")
+            || remote_log_tablet_dir.starts_with("oss://")
+        {
+            file_io_builder.with_props(
+                remote_fs_props
+                    .iter()
+                    .map(|(k, v)| (k.as_str(), v.as_str())),
+            )
+        } else {
+            file_io_builder
+        };
+
+        // Build storage and create operator directly
+        let storage = Storage::build(file_io_builder)?;
+        let (op, relative_path) = storage.create(remote_path)?;
+
+        // Timeout for remote storage operations (30 seconds)
+        const REMOTE_OP_TIMEOUT: Duration = Duration::from_secs(30);
+        const CHUNK_SIZE: usize = 8 * 1024 * 1024; // 8MiB
+
+        Self::download_file_streaming(
+            &op,
+            relative_path,
+            remote_path,
+            local_path,
+            CHUNK_SIZE,
+            remote_log_read_concurrency,
+            REMOTE_OP_TIMEOUT,
+        )
+        .await?;
+
+        Ok(local_path.to_path_buf())
+    }
+
+    /// Streams `relative_path` from `op` into a newly created file at `local_path`.
+    ///
+    /// Each remote step (creating the reader, creating the byte stream, fetching a
+    /// chunk) is bounded by `remote_op_timeout`. If anything fails after the local file
+    /// has been created, the partially written file is removed on a best-effort basis,
+    /// so a truncated segment is not left on disk.
+    async fn download_file_streaming(
+        op: &opendal::Operator,
+        relative_path: &str,
+        remote_path: &str,
+        local_path: &Path,
+        chunk_size: usize,
+        streaming_read_concurrency: usize,
+        remote_op_timeout: Duration,
+    ) -> Result<()> {
+        let mut local_file = tokio::fs::File::create(local_path).await?;
+
+        let outcome = Self::stream_remote_to_local_file(
+            op,
+            relative_path,
+            remote_path,
+            &mut local_file,
+            chunk_size,
+            streaming_read_concurrency,
+            remote_op_timeout,
+        )
+        .await;
+
+        if outcome.is_err() {
+            // Close the handle before unlinking; a cleanup failure must not mask the
+            // original download error, so it is only logged.
+            drop(local_file);
+            if let Err(cleanup_err) = tokio::fs::remove_file(local_path).await {
+                log::debug!(
+                    "Failed to remove partial remote log download {}: {cleanup_err}",
+                    local_path.display()
+                );
+            }
+        }
+
+        outcome
+    }
+
+    /// Copies the remote object chunk by chunk into `local_file` and syncs it to disk.
+    async fn stream_remote_to_local_file(
+        op: &opendal::Operator,
+        relative_path: &str,
+        remote_path: &str,
+        local_file: &mut tokio::fs::File,
+        chunk_size: usize,
+        streaming_read_concurrency: usize,
+        remote_op_timeout: Duration,
+    ) -> Result<()> {
+        let reader_future = op
+            .reader_with(relative_path)
+            .chunk(chunk_size)
+            .concurrent(streaming_read_concurrency);
+        let reader = tokio::time::timeout(remote_op_timeout, reader_future)
+            .await
+            .map_err(|e| Error::IoUnexpectedError {
+                message: format!("Timeout creating streaming reader for {remote_path}: {e}."),
+                source: io::ErrorKind::TimedOut.into(),
+            })??;
+
+        let mut stream = tokio::time::timeout(remote_op_timeout, reader.into_bytes_stream(..))
+            .await
+            .map_err(|e| Error::IoUnexpectedError {
+                message: format!("Timeout creating streaming bytes stream for {remote_path}: {e}."),
+                source: io::ErrorKind::TimedOut.into(),
+            })??;
+
+        let mut chunk_count = 0u64;
+        while let Some(chunk) = tokio::time::timeout(remote_op_timeout, stream.try_next())
+            .await
+            .map_err(|e| Error::IoUnexpectedError {
+                message: format!(
+                    "Timeout streaming chunk from remote storage: {remote_path}, exception: {e}."
+                ),
+                source: io::ErrorKind::TimedOut.into(),
+            })??
+        {
+            chunk_count += 1;
+            // Log the first few chunks and then every tenth to keep debug output bounded.
+            if chunk_count <= 3 || chunk_count % 10 == 0 {
+                log::debug!("Remote log streaming download: chunk #{chunk_count} ({remote_path})");
+            }
+            local_file.write_all(&chunk).await?;
+        }
+
+        local_file.sync_all().await?;
+        Ok(())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::metadata::TablePath;
+    use crate::test_utils::test_scanner_metrics;
+    use std::sync::atomic::{AtomicUsize, Ordering};
+
+    /// Helper function to create a TableBucket for testing.
+    ///
+    /// Thin wrapper over `TableBucket::new(table_id, bucket_id)`.
+    fn create_table_bucket(table_id: i64, bucket_id: i32) -> TableBucket {
+        TableBucket::new(table_id, bucket_id)
+    }
+
+    /// Builds the `ScannerMetrics` passed to the downloaders in these tests.
+    ///
+    /// The table path labels are arbitrary: no test in this module installs a
+    /// metrics recorder, so the value only has to exist to satisfy the API.
+    fn metrics() -> Arc<ScannerMetrics> {
+        let table_path = TablePath::new("db", "tbl");
+        test_scanner_metrics(&table_path)
+    }
+
+    /// Simplified fake fetcher for testing
+    struct FakeFetcher {
+        /// Gate a fetch waits on before finishing when `auto_complete` is false.
+        completion_gate: Arc<Notify>,
+        /// Number of fetches that have started but not yet passed the failure check.
+        in_flight: Arc<AtomicUsize>,
+        /// Highest `in_flight` value observed so far.
+        max_seen_in_flight: Arc<AtomicUsize>,
+        /// Remaining number of fetches that should fail; decremented on each failure.
+        fail_count: Arc<Mutex<usize>>,
+        /// When true, a fetch only yields once instead of waiting on the gate.
+        auto_complete: bool,
+    }
+
+    impl FakeFetcher {
+        /// Creates a fetcher that fails `fail_count` fetches before succeeding.
+        /// With `auto_complete == false` each fetch blocks until the gate is signalled.
+        fn new(fail_count: usize, auto_complete: bool) -> Self {
+            Self {
+                completion_gate: Arc::new(Notify::new()),
+                in_flight: Arc::new(AtomicUsize::new(0)),
+                max_seen_in_flight: Arc::new(AtomicUsize::new(0)),
+                fail_count: Arc::new(Mutex::new(fail_count)),
+                auto_complete,
+            }
+        }
+
+        /// Highest number of simultaneously running fetches seen so far.
+        fn max_seen_in_flight(&self) -> usize {
+            self.max_seen_in_flight.load(Ordering::SeqCst)
+        }
+
+        /// Number of fetches currently running.
+        fn in_flight(&self) -> usize {
+            self.in_flight.load(Ordering::SeqCst)
+        }
+
+        /// Signals the completion gate with `notify_one`.
+        fn release_one(&self) {
+            self.completion_gate.notify_one();
+        }
+
+        /// Signals the completion gate with `notify_waiters`.
+        fn release_all(&self) {
+            self.completion_gate.notify_waiters();
+        }
+    }
+
+    impl RemoteLogFetcher for FakeFetcher {
+        /// Simulates a download: tracks concurrency, optionally waits on the gate,
+        /// then either fails (while `fail_count` is positive) or writes a small fake
+        /// segment file into the system temp directory and returns it.
+        fn fetch(
+            &self,
+            request: &RemoteLogDownloadRequest,
+        ) -> Pin<Box<dyn Future<Output = Result<FetchResult>> + Send>> {
+            /// Decrements the in-flight counter when dropped, so the counter stays
+            /// accurate even if the fetch future is dropped at an await point.
+            struct InFlightGuard(Arc<AtomicUsize>);
+
+            impl Drop for InFlightGuard {
+                fn drop(&mut self) {
+                    self.0.fetch_sub(1, Ordering::SeqCst);
+                }
+            }
+
+            let gate = self.completion_gate.clone();
+            let in_flight = self.in_flight.clone();
+            let max_seen = self.max_seen_in_flight.clone();
+            let fail_count = self.fail_count.clone();
+            let segment_id = request.segment().segment_id.clone();
+            let auto_complete = self.auto_complete;
+
+            Box::pin(async move {
+                // Track in-flight; the guard undoes the increment on every exit path.
+                let current = in_flight.fetch_add(1, Ordering::SeqCst) + 1;
+                let in_flight_guard = InFlightGuard(in_flight);
+                max_seen.fetch_max(current, Ordering::SeqCst);
+
+                // Wait for gate (or auto-complete)
+                if !auto_complete {
+                    gate.notified().await;
+                } else {
+                    tokio::task::yield_now().await;
+                }
+
+                // Check if should fail
+                let should_fail = {
+                    let mut count = fail_count.lock();
+                    if *count > 0 {
+                        *count -= 1;
+                        true
+                    } else {
+                        false
+                    }
+                };
+
+                // Normal completion: release the slot before producing the result.
+                drop(in_flight_guard);
+
+                if should_fail {
+                    Err(Error::UnexpectedError {
+                        message: format!("Fake fetch failed for {segment_id}"),
+                        source: None,
+                    })
+                } else {
+                    let fake_data = vec![1, 2, 3, 4];
+                    let temp_dir = env::temp_dir();
+                    // Nanosecond timestamp keeps file names distinct across fetches.
+                    let timestamp = SystemTime::now()
+                        .duration_since(UNIX_EPOCH)
+                        .unwrap()
+                        .as_nanos();
+                    let file_path =
+                        temp_dir.join(format!("fake_segment_{segment_id}_{timestamp}.log"));
+                    tokio::fs::write(&file_path, &fake_data).await?;
+
+                    Ok(FetchResult {
+                        file_path,
+                        file_size: fake_data.len(),
+                    })
+                }
+            })
+        }
+    }
+
+    /// Builds a `RemoteLogSegment` fixture.
+    ///
+    /// The segment always spans 1000 offsets from `start_offset` and reports a
+    /// size of 1024 bytes; everything else comes from the arguments.
+    fn create_segment(
+        segment_id: &str,
+        start_offset: i64,
+        max_timestamp: i64,
+        table_bucket: TableBucket,
+    ) -> RemoteLogSegment {
+        const SEGMENT_SPAN: i64 = 1000;
+        let end_offset = start_offset + SEGMENT_SPAN;
+
+        RemoteLogSegment {
+            table_bucket,
+            segment_id: segment_id.to_owned(),
+            start_offset,
+            end_offset,
+            max_timestamp,
+            size_in_bytes: 1024,
+        }
+    }
+
+    /// Wraps `segment` in a fresh `RemoteLogDownloadRequest` (no retries yet).
+    fn create_request(segment: RemoteLogSegment) -> RemoteLogDownloadRequest {
+        // Only the sending half is kept; the receiver is dropped right away.
+        let result_sender = oneshot::channel().0;
+
+        RemoteLogDownloadRequest {
+            segment,
+            remote_log_tablet_dir: String::from("test_dir"),
+            retry_count: 0,
+            next_retry_at: None,
+            result_sender,
+        }
+    }
+
+    #[test]
+    fn test_priority_ordering_matching_java_test_case() {
+        // Expected order: ascending max_timestamp across buckets, then ascending
+        // start_offset within one bucket. Tie-breakers (segment_id) are implementation
+        // details and are deliberately not exercised: all timestamps/offsets are distinct.
+        let bucket_of = |bucket_id: i32| create_table_bucket(1, bucket_id);
+
+        // Listed in the (shuffled) order in which they are pushed onto the heap.
+        let insertion_order = [
+            create_segment("seg_2000", 0, 2000, bucket_of(3)),
+            create_segment("seg_sb_100", 100, 5000, bucket_of(0)),
+            create_segment("seg_1000", 0, 1000, bucket_of(2)),
+            create_segment("seg_zero", 0, 0, bucket_of(1)),
+            create_segment("seg_neg", 0, -1, bucket_of(0)),
+            create_segment("seg_sb_50", 50, 5000, bucket_of(0)),
+        ];
+
+        let mut heap = BinaryHeap::new();
+        for segment in insertion_order {
+            heap.push(Reverse(create_request(segment)));
+        }
+
+        let mut next_request = || heap.pop().unwrap().0;
+
+        // Distinct timestamps come out in ascending order.
+        let first = next_request();
+        assert_eq!(first.segment.max_timestamp, -1, "Lowest timestamp first");
+        for expected_timestamp in [0, 1000, 2000] {
+            let request = next_request();
+            assert_eq!(request.segment.max_timestamp, expected_timestamp);
+        }
+
+        // The remaining two share bucket and timestamp (5000): offset decides.
+        let fifth = next_request();
+        assert_eq!(fifth.segment.max_timestamp, 5000);
+        assert_eq!(
+            fifth.segment.start_offset, 50,
+            "Lower offset first within bucket"
+        );
+
+        let sixth = next_request();
+        assert_eq!(sixth.segment.max_timestamp, 5000);
+        assert_eq!(sixth.segment.start_offset, 100);
+    }
+
+    /// Checks that no more than `max_concurrent_downloads` fetches run at once.
+    /// Timing-based: relies on short sleeps to let the coordinator make progress.
+    #[tokio::test]
+    async fn test_concurrency_and_priority() {
+        // Test concurrency limiting and priority-based scheduling together
+        let fake_fetcher = Arc::new(FakeFetcher::new(0, false)); // Manual control
+
+        let downloader = RemoteLogDownloader::new_with_fetcher(
+            fake_fetcher.clone(),
+            10, // High prefetch limit
+            2,  // Max concurrent downloads = 2
+            metrics(),
+        )
+        .unwrap();
+
+        let bucket = create_table_bucket(1, 0);
+
+        // Request 4 segments in the same bucket with the same max_timestamp (only
+        // start_offset differs), so the assertions below focus on concurrency limiting
+        let segs: Vec<_> = (0..4)
+            .map(|i| create_segment(&format!("seg{i}"), i * 100, 1000, bucket.clone()))
+            .collect();
+
+        // Keep the futures alive for the whole test; they are never awaited.
+        let _futures: Vec<_> = segs
+            .iter()
+            .map(|seg| downloader.request_remote_log("dir", seg))
+            .collect();
+
+        // Wait for exactly 2 to start
+        tokio::time::sleep(Duration::from_millis(50)).await;
+        assert_eq!(
+            fake_fetcher.in_flight(),
+            2,
+            "Concurrency limit: exactly 2 should be in-flight"
+        );
+
+        // Release one
+        fake_fetcher.release_one();
+        tokio::time::sleep(Duration::from_millis(50)).await;
+
+        // Max should never exceed 2
+        assert_eq!(
+            fake_fetcher.max_seen_in_flight(),
+            2,
+            "Max concurrent should not exceed 2"
+        );
+
+        // Release all
+        fake_fetcher.release_all();
+    }
+
+    /// Checks that at most `max_prefetch_segments` downloads complete until some of
+    /// the returned futures are dropped. Timing-based (polling with deadlines).
+    #[tokio::test]
+    async fn test_prefetch_limit() {
+        // Test that prefetch semaphore limits outstanding downloads
+        let fake_fetcher = Arc::new(FakeFetcher::new(0, true)); // Auto-complete
+
+        let downloader = RemoteLogDownloader::new_with_fetcher(
+            fake_fetcher,
+            2,  // Max prefetch = 2
+            10, // High concurrent limit
+            metrics(),
+        )
+        .unwrap();
+
+        let bucket = create_table_bucket(1, 0);
+
+        // Request 4 downloads
+        let segs: Vec<_> = (0..4)
+            .map(|i| create_segment(&format!("seg{i}"), i * 100, 1000, bucket.clone()))
+            .collect();
+
+        let mut futures: Vec<_> = segs
+            .iter()
+            .map(|seg| downloader.request_remote_log("dir", seg))
+            .collect();
+
+        // Wait for first 2 to complete (poll every 10ms, give up after 2s)
+        let deadline = tokio::time::Instant::now() + Duration::from_secs(2);
+        loop {
+            if futures.iter().filter(|f| f.is_done()).count() >= 2 {
+                break;
+            }
+            if tokio::time::Instant::now() > deadline {
+                panic!("Timeout waiting for first 2 downloads");
+            }
+            tokio::time::sleep(Duration::from_millis(10)).await;
+        }
+
+        // Verify 3rd and 4th are blocked (prefetch limit)
+        tokio::time::sleep(Duration::from_millis(50)).await;
+        assert_eq!(
+            futures.iter().filter(|f| f.is_done()).count(),
+            2,
+            "Prefetch limit: only 2 should complete"
+        );
+
+        // Drop first 2 (releases permits): pop the last two futures out of the
+        // vector first, then drop what remains.
+        let f4 = futures.pop().unwrap();
+        let f3 = futures.pop().unwrap();
+        drop(futures);
+
+        // 3rd and 4th should now complete
+        let deadline = tokio::time::Instant::now() + Duration::from_secs(2);
+        loop {
+            if f3.is_done() && f4.is_done() {
+                break;
+            }
+            if tokio::time::Instant::now() > deadline {
+                panic!("Timeout after permit release");
+            }
+            tokio::time::sleep(Duration::from_millis(10)).await;
+        }
+    }
+
+    /// Two scenarios in one test: a download that completes after two failed
+    /// attempts, and a download whose future is dropped while it keeps failing.
+    #[tokio::test]
+    async fn test_retry_and_cancellation() {
+        // Test retry with exponential backoff
+        let fake_fetcher = Arc::new(FakeFetcher::new(2, true)); // Fail twice, succeed third time
+
+        let downloader =
+            RemoteLogDownloader::new_with_fetcher(fake_fetcher.clone(), 10, 1, metrics()).unwrap();
+
+        let bucket = create_table_bucket(1, 0);
+        let seg = create_segment("seg1", 0, 1000, bucket);
+
+        let future = downloader.request_remote_log("dir", &seg);
+
+        // Should succeed after retries (poll every 50ms, give up after 5s)
+        let deadline = tokio::time::Instant::now() + Duration::from_secs(5);
+        loop {
+            if future.is_done() {
+                break;
+            }
+            if tokio::time::Instant::now() > deadline {
+                panic!("Timeout waiting for retry to succeed");
+            }
+            tokio::time::sleep(Duration::from_millis(50)).await;
+        }
+
+        // NOTE(review): `is_done()` is the only thing checked here; confirm that it
+        // implies a successful result rather than just a finished one.
+        assert!(future.is_done(), "Should succeed after retries");
+
+        // Test cancellation
+        let seg2 = create_segment("seg2", 100, 1000, create_table_bucket(1, 0));
+        let fake_fetcher2 = Arc::new(FakeFetcher::new(100, true)); // Fail forever
+        let downloader2 =
+            RemoteLogDownloader::new_with_fetcher(fake_fetcher2.clone(), 10, 1, metrics()).unwrap();
+
+        let future2 = downloader2.request_remote_log("dir", &seg2);
+        tokio::time::sleep(Duration::from_millis(50)).await;
+
+        // Drop to cancel
+        drop(future2);
+        tokio::time::sleep(Duration::from_millis(50)).await;
+
+        // After the grace period no fetch may still be counted as running.
+        assert_eq!(
+            fake_fetcher2.in_flight(),
+            0,
+            "Cancellation should release resources"
+        );
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/client/table/scanner.rs b/fluss-rust/crates/fluss/src/client/table/scanner.rs
new file mode 100644
index 0000000000..f0cb320171
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/client/table/scanner.rs
@@ -0,0 +1,2763 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::client::ClientSchemaGetter;
+use crate::client::connection::FlussConnection;
+use crate::client::credentials::SecurityTokenManager;
+use crate::client::metadata::Metadata;
+use crate::client::table::batch_scanner::LimitBatchScanner;
+use crate::client::table::log_fetch_buffer::{
+    CompletedFetch, DefaultCompletedFetch, FetchErrorAction, FetchErrorContext, FetchErrorLogLevel,
+    LogFetchBuffer, RemotePendingFetch,
+};
+use crate::client::table::remote_log::{RemoteLogDownloader, RemoteLogFetchInfo};
+use crate::config::Config;
+use crate::error::Error::UnsupportedOperation;
+use crate::error::{ApiError, Error, FlussError, Result};
+use crate::metadata::{
+    LogFormat, PhysicalTablePath, RowType, SchemaInfo, TableBucket, TableInfo, TablePath,
+};
+use crate::metrics::ScannerMetrics;
+use crate::proto::{
+    ErrorResponse, FetchLogRequest, FetchLogResponse, PbFetchLogReqForBucket, PbFetchLogReqForTable,
+};
+use crate::record::{
+    LogRecordsBatches, ReadContext, ScanBatch, ScanRecord, ScanRecords, to_arrow_schema,
+};
+use crate::rpc::{RpcClient, RpcError, message};
+use crate::util::FairBucketStatusMap;
+use crate::{PartitionId, TableId};
+use arrow_schema::SchemaRef;
+use log::{debug, warn};
+use parking_lot::{Mutex, RwLock};
+use prost::Message;
+use std::{
+    collections::{HashMap, HashSet},
+    slice::from_ref,
+    sync::Arc,
+    time::{Duration, Instant},
+};
+use tempfile::TempDir;
+
+/// Builder for scans over a single table.
+///
+/// Configure it with `project`, `project_by_name` and `limit`, then consume it with
+/// one of the `create_*` methods to obtain a scanner.
+pub struct TableScan<'a> {
+    /// Connection used to obtain RPC connections, client config and the admin client.
+    conn: &'a FlussConnection,
+    /// Description of the table being scanned.
+    table_info: TableInfo,
+    /// Shared client metadata passed on to the scanners created from this scan.
+    metadata: Arc<Metadata>,
+    /// Column indices to project. None means all columns, Some(vec) means only the specified columns (non-empty).
+    projected_fields: Option<Vec<usize>>,
+    /// Optional row limit. When set, callers may construct a [`LimitBatchScanner`] for a one-shot bounded scan.
+    limit: Option<i32>,
+}
+
+impl<'a> TableScan<'a> {
+    /// Creates a scan over `table_info` with no projection and no limit configured.
+    pub fn new(conn: &'a FlussConnection, table_info: TableInfo, metadata: Arc<Metadata>) -> Self {
+        Self {
+            conn,
+            table_info,
+            metadata,
+            projected_fields: None,
+            limit: None,
+        }
+    }
+
+    /// Configures a row limit, which is required by [`Self::create_bucket_batch_scanner`].
+    ///
+    /// Only strictly positive limits are accepted. The log scanners refuse to be
+    /// created from a scan that has a limit set.
+    ///
+    /// # Errors
+    /// Returns [`Error::IllegalArgument`] when `n` is zero or negative.
+    pub fn limit(mut self, n: i32) -> Result<Self> {
+        if n > 0 {
+            self.limit = Some(n);
+            return Ok(self);
+        }
+        Err(Error::IllegalArgument {
+            message: format!("Scan limit must be positive, got {n}"),
+        })
+    }
+
+    /// Guard for the log-scanner constructors: limit pushdown is not supported
+    /// there, so a configured limit is reported as an error instead of being
+    /// silently dropped. `scanner` is the scanner name used in the message.
+    fn reject_limit(&self, scanner: &str) -> Result<()> {
+        match self.limit {
+            None => Ok(()),
+            Some(limit) => {
+                let table_path = &self.table_info.table_path;
+                Err(Error::UnsupportedOperation {
+                    message: format!(
+                        "{scanner} doesn't support limit pushdown. Table: {table_path}, requested limit: {limit}"
+                    ),
+                })
+            }
+        }
+    }
+
+    /// Builds a [`LimitBatchScanner`] performing a one-shot bounded scan of `table_bucket`.
+    ///
+    /// A limit must have been configured through [`Self::limit`] beforehand. Building
+    /// the scanner is cheap; the `LimitScanRequest` runs on the first
+    /// [`LimitBatchScanner::next_batch`].
+    ///
+    /// # Errors
+    /// Returns [`Error::IllegalArgument`] when no limit is set, when the bucket belongs
+    /// to a different table, or when its bucket id is out of range. Errors from scan
+    /// validation and from obtaining the admin client are propagated.
+    pub fn create_bucket_batch_scanner(
+        self,
+        table_bucket: TableBucket,
+    ) -> Result<LimitBatchScanner> {
+        let Some(limit) = self.limit else {
+            return Err(Error::IllegalArgument {
+                message: "create_bucket_batch_scanner requires a limit configured via .limit(n)"
+                    .to_string(),
+            });
+        };
+
+        // The bucket must belong to the table this scan was created for.
+        if table_bucket.table_id() != self.table_info.table_id {
+            return Err(Error::IllegalArgument {
+                message: format!(
+                    "Bucket table_id {} does not match scan table_id {}",
+                    table_bucket.table_id(),
+                    self.table_info.table_id
+                ),
+            });
+        }
+
+        let num_buckets = self.table_info.get_num_buckets();
+        let bucket_id = table_bucket.bucket_id();
+        if !(0..num_buckets).contains(&bucket_id) {
+            return Err(Error::IllegalArgument {
+                message: format!(
+                    "Bucket id {bucket_id} out of range for table with {num_buckets} buckets"
+                ),
+            });
+        }
+
+        // Log tables decode as Arrow IPC, so only ARROW format is supported (KV
+        // tables use the value-record path and are exempt).
+        let is_log_table = !self.table_info.has_primary_key();
+        if is_log_table {
+            validate_scan_support(&self.table_info.table_path, &self.table_info)?;
+        }
+
+        // Pre-seed the current schema; older versions are fetched lazily during
+        // KV decode. Mirrors `Table::new_lookup`.
+        let latest = SchemaInfo::new(
+            self.table_info.get_schema().clone(),
+            self.table_info.get_schema_id(),
+        );
+        let table_path = self.table_info.table_path.clone();
+        let admin = self.conn.get_admin()?;
+        let schema_getter = Arc::new(ClientSchemaGetter::new(table_path, admin, latest));
+
+        let connections = self.conn.get_connections();
+        let metadata = self.metadata.clone();
+        Ok(LimitBatchScanner::new(
+            connections,
+            metadata,
+            self.table_info,
+            schema_getter,
+            self.projected_fields,
+            table_bucket,
+            limit,
+        ))
+    }
+
+    /// Projects the scan to only include specified columns by their indices.
+    ///
+    /// # Arguments
+    /// * `column_indices` - Zero-based indices of columns to include in the scan
+    ///
+    /// # Errors
+    /// Returns an error if `column_indices` is empty or if any column index is out of range.
+    ///
+    /// # Example
+    /// ```
+    /// # use fluss::client::FlussConnection;
+    /// # use fluss::config::Config;
+    /// # use fluss::error::Result;
+    /// # use fluss::metadata::{DataTypes, Schema, TableDescriptor, TablePath};
+    /// # use fluss::row::InternalRow;
+    /// # use std::time::Duration;
+    ///
+    /// # pub async fn example() -> Result<()> {
+    ///     let mut config = Config::default();
+    ///     config.bootstrap_servers = "127.0.0.1:9123".to_string();
+    ///     let conn = FlussConnection::new(config).await?;
+    ///
+    ///     let table_descriptor = TableDescriptor::builder()
+    ///         .schema(
+    ///             Schema::builder()
+    ///                 .column("col1", DataTypes::int())
+    ///                 .column("col2", DataTypes::string())
+    ///                 .column("col3", DataTypes::string())
+    ///                 .column("col4", DataTypes::string())
+    ///             .build()?,
+    ///         ).build()?;
+    ///     let table_path = TablePath::new("fluss".to_owned(), "rust_test_long".to_owned());
+    ///     let admin = conn.get_admin()?;
+    ///     admin.create_table(&table_path, &table_descriptor, true)
+    ///         .await?;
+    ///     let table_info = admin.get_table_info(&table_path).await?;
+    ///     let table = conn.get_table(&table_path).await?;
+    ///
+    ///     // Project columns by indices
+    ///     let scanner = table.new_scan().project(&[0, 2, 3])?.create_log_scanner()?;
+    ///     let scan_records = scanner.poll(Duration::from_secs(10)).await?;
+    ///     for record in scan_records {
+    ///         let row = record.row();
+    ///         // Fields are read by their position in the projection (0..3 here),
+    ///         // not by their index in the full table schema.
+    ///         println!(
+    ///             "{{{}, {}, {}}}@{}",
+    ///             row.get_int(0)?,
+    ///             row.get_string(1)?,
+    ///             row.get_string(2)?,
+    ///             record.offset()
+    ///         );
+    ///     }
+    ///     # Ok(())
+    /// # }
+    /// ```
+    pub fn project(mut self, column_indices: &[usize]) -> Result<Self> {
+        if column_indices.is_empty() {
+            return Err(Error::IllegalArgument {
+                message: "Column indices cannot be empty".to_string(),
+            });
+        }
+        let field_count = self.table_info.row_type().fields().len();
+        for &idx in column_indices {
+            if idx >= field_count {
+                return Err(Error::IllegalArgument {
+                    message: format!(
+                        "Column index {} out of range (max: {})",
+                        idx,
+                        // Saturating: a row type without fields must yield this
+                        // error instead of underflowing `usize`.
+                        field_count.saturating_sub(1)
+                    ),
+                });
+            }
+        }
+        self.projected_fields = Some(column_indices.to_vec());
+        Ok(self)
+    }
+
+    /// Projects the scan to only include specified columns by their names.
+    ///
+    /// # Arguments
+    /// * `column_names` - Names of columns to include in the scan
+    ///
+    /// # Errors
+    /// Returns an error if `column_names` is empty or if any column name is not found in the table schema.
+    ///
+    /// # Example
+    /// ```
+    /// # use fluss::client::FlussConnection;
+    /// # use fluss::config::Config;
+    /// # use fluss::error::Result;
+    /// # use fluss::metadata::{DataTypes, Schema, TableDescriptor, TablePath};
+    /// # use fluss::row::InternalRow;
+    /// # use std::time::Duration;
+    ///
+    /// # pub async fn example() -> Result<()> {
+    ///     let mut config = Config::default();
+    ///     config.bootstrap_servers = "127.0.0.1:9123".to_string();
+    ///     let conn = FlussConnection::new(config).await?;
+    ///
+    ///     let table_descriptor = TableDescriptor::builder()
+    ///         .schema(
+    ///             Schema::builder()
+    ///                 .column("col1", DataTypes::int())
+    ///                 .column("col2", DataTypes::string())
+    ///                 .column("col3", DataTypes::string())
+    ///             .build()?,
+    ///         ).build()?;
+    ///     let table_path = TablePath::new("fluss".to_owned(), "rust_test_long".to_owned());
+    ///     let admin = conn.get_admin()?;
+    ///     admin.create_table(&table_path, &table_descriptor, true)
+    ///         .await?;
+    ///     let table = conn.get_table(&table_path).await?;
+    ///
+    ///     // Project columns by column names
+    ///     let scanner = table.new_scan().project_by_name(&["col1", "col3"])?.create_log_scanner()?;
+    ///     let scan_records = scanner.poll(Duration::from_secs(10)).await?;
+    ///     for record in scan_records {
+    ///         let row = record.row();
+    ///         println!(
+    ///             "{{{}, {}}}@{}",
+    ///             row.get_int(0)?,
+    ///             row.get_string(1)?,
+    ///             record.offset()
+    ///         );
+    ///     }
+    ///     # Ok(())
+    /// # }
+    /// ```
+    pub fn project_by_name(mut self, column_names: &[&str]) -> Result<Self> {
+        if column_names.is_empty() {
+            return Err(Error::IllegalArgument {
+                message: "Column names cannot be empty".to_string(),
+            });
+        }
+
+        // Resolve every name to its schema position; the first unknown name aborts.
+        let row_type = self.table_info.row_type();
+        let resolved = column_names
+            .iter()
+            .map(|name| {
+                row_type
+                    .fields()
+                    .iter()
+                    .position(|f| f.name() == *name)
+                    .ok_or_else(|| Error::IllegalArgument {
+                        message: format!("Column '{name}' not found"),
+                    })
+            })
+            .collect::<Result<Vec<usize>>>()?;
+
+        self.projected_fields = Some(resolved);
+        Ok(self)
+    }
+
+    /// Consumes the scan builder and produces a record-oriented [`LogScanner`].
+    ///
+    /// Any error from `reject_limit`, `validate_scan_support` or
+    /// `LogScannerInner::new` is returned to the caller unchanged.
+    pub fn create_log_scanner(self) -> Result<LogScanner> {
+        self.reject_limit("LogScanner")?;
+        validate_scan_support(&self.table_info.table_path, &self.table_info)?;
+
+        // Build the shared implementation, then wrap it for shared ownership.
+        let shared = LogScannerInner::new(
+            &self.table_info,
+            self.metadata.clone(),
+            self.conn.get_connections(),
+            self.conn.config(),
+            self.projected_fields,
+        )
+        .map(Arc::new)?;
+
+        Ok(LogScanner { inner: shared })
+    }
+
+    /// Consumes the scan builder and produces a batch-oriented
+    /// [`RecordBatchLogScanner`].
+    ///
+    /// Any error from `reject_limit`, `validate_scan_support` or
+    /// `LogScannerInner::new` is returned to the caller unchanged.
+    pub fn create_record_batch_log_scanner(self) -> Result<RecordBatchLogScanner> {
+        self.reject_limit("RecordBatchLogScanner")?;
+        validate_scan_support(&self.table_info.table_path, &self.table_info)?;
+
+        // Build the shared implementation, then wrap it for shared ownership.
+        let shared = LogScannerInner::new(
+            &self.table_info,
+            self.metadata.clone(),
+            self.conn.get_connections(),
+            self.conn.config(),
+            self.projected_fields,
+        )
+        .map(Arc::new)?;
+
+        Ok(RecordBatchLogScanner { inner: shared })
+    }
+}
+
+/// Scanner for reading log records one at a time with per-record metadata.
+///
+/// Use this scanner when you need access to individual record offsets and timestamps.
+/// For batch-level access, use [`RecordBatchLogScanner`] instead.
+pub struct LogScanner {
+    /// Shared scanner implementation; every method on this type delegates to it.
+    inner: Arc<LogScannerInner>,
+}
+
+/// Scanner for reading log data as Arrow RecordBatches.
+///
+/// More efficient than [`LogScanner`] for batch-level analytics where per-record
+/// metadata (offsets, timestamps) is not needed.
+///
+/// This type is intentionally **not** `Clone`. To perform a bounded read, move
+/// the scanner into a [`crate::client::RecordBatchLogReader`] — the compiler
+/// then prevents concurrent polls by construction.
+pub struct RecordBatchLogScanner {
+    /// Shared scanner implementation. `new_shared_handle` clones this `Arc`
+    /// to hand out a second handle onto the same state.
+    inner: Arc<LogScannerInner>,
+}
+
+/// Private shared implementation for both scanner types
+struct LogScannerInner {
+    /// Path of the scanned table; passed to the metadata refresh performed
+    /// when buckets are subscribed.
+    table_path: TablePath,
+    /// Table id used to build the `TableBucket` keys for (un)subscription.
+    table_id: TableId,
+    /// Metadata handle refreshed for `table_path` before buckets are assigned.
+    metadata: Arc<Metadata>,
+    /// Subscription state (bucket and offset assignments); the same `Arc` is
+    /// handed to `log_fetcher` at construction.
+    log_scanner_status: Arc<LogScannerStatus>,
+    /// Sends fetch requests and collects the fetched records / batches that
+    /// the poll methods return.
+    log_fetcher: LogFetcher,
+    /// Decides which subscribe / unsubscribe variants are accepted: the
+    /// partition-aware ones when `true`, the plain ones when `false`.
+    is_partitioned_table: bool,
+    /// Arrow schema of the scanned rows, restricted to the projected columns
+    /// when a projection was supplied.
+    arrow_schema: SchemaRef,
+    /// Guards against subscription changes while a
+    /// [`crate::client::RecordBatchLogReader`] is iterating.
+    reader_active: std::sync::atomic::AtomicBool,
+    /// Holds the snapshot fields used by [`PollGuard`] to derive the
+    /// scanner poll-timing metrics. The mutex makes the state updates
+    /// in `record_poll_start` / `record_poll_end` atomic; metric
+    /// emission and `log::warn!` calls happen after the lock is
+    /// released. The start↔end pairing depends on the single-consumer
+    /// contract documented on [`LogScanner::poll`] and
+    /// [`RecordBatchLogScanner::poll`] (mirrors Java's
+    /// `LogScannerImpl.acquire()`). Overlapping polls on the same
+    /// scanner trip a `debug_assert!` in `record_poll_start` (debug
+    /// builds) or emit a `log::warn!` (release builds).
+    poll_state: Mutex<PollState>,
+    /// Per-table scanner metric handles, pre-bound with `database`/`table`
+    /// labels.
+    metrics: Arc<ScannerMetrics>,
+}
+
+/// Snapshot state used to derive the scanner poll-timing metrics.
+///
+/// The mutex makes the state updates in `record_poll_start` /
+/// `record_poll_end` atomic with respect to themselves; metric
+/// emission (`metrics::gauge!(...).set(...)`) and `log::warn!` calls
+/// happen after the lock is released so a user-installed recorder or
+/// logger cannot stall the critical section. The mutex does **not** by
+/// itself preserve start↔end pairing across overlapping `poll()` calls
+/// — that invariant relies on the single-consumer contract that
+/// mirrors Java's `LogScannerImpl.acquire()`. Concurrent polls on the
+/// same scanner are detected by a `debug_assert!` in
+/// `record_poll_start` (panics in debug / tests) and a `log::warn!` on
+/// both anomalous paths (`record_poll_start` sees a stale `Some`;
+/// `record_poll_end` sees `None`) for release-build observability.
+#[derive(Default, Debug)]
+struct PollState {
+    /// Instant captured at the most recent `record_poll_start()`. `None`
+    /// before the first poll.
+    last_poll_at: Option<Instant>,
+    /// Instant captured at the start of the in-flight poll. `None` after
+    /// the last `record_poll_end()`. `record_poll_start` treats a `Some`
+    /// found here as an overlapping poll.
+    poll_start_at: Option<Instant>,
+    /// Cached ms between the two most recent poll starts, used to compute
+    /// `poll_idle_ratio` in `record_poll_end`. Set to `0.0` by the first
+    /// `record_poll_start`, when there is no previous poll to measure from.
+    time_between_poll_ms: f64,
+}
+
+/// Pairs `record_poll_start` with `record_poll_end`. Created
+/// at the top of `poll_records` / `poll_batches`; `record_poll_end` runs on
+/// drop, including the cancellation path (caller drops the future).
+struct PollGuard<'a> {
+    /// Scanner whose poll-timing state this guard brackets.
+    inner: &'a LogScannerInner,
+}
+
+impl<'a> PollGuard<'a> {
+    /// Records the poll start on `inner` and returns a guard that records the
+    /// matching poll end when dropped.
+    fn new(inner: &'a LogScannerInner) -> Self {
+        // The start is recorded before the guard exists, so if
+        // `record_poll_start` panics no `record_poll_end` runs for it.
+        inner.record_poll_start();
+        Self { inner }
+    }
+}
+
+impl Drop for PollGuard<'_> {
+    /// Records the poll end for the poll started in [`PollGuard::new`].
+    fn drop(&mut self) {
+        self.inner.record_poll_end();
+    }
+}
+
+impl LogScannerInner {
+    /// Builds the shared scanner state for `table_info`.
+    ///
+    /// When `projected_fields` is `Some`, the exposed Arrow schema contains
+    /// only the row-type fields at those indices, in the given order;
+    /// otherwise it covers the full row type. The same projection is passed
+    /// on to the `LogFetcher`.
+    ///
+    /// Returns an error if the Arrow schema conversion or the `LogFetcher`
+    /// construction fails.
+    fn new(
+        table_info: &TableInfo,
+        metadata: Arc<Metadata>,
+        connections: Arc<RpcClient>,
+        config: &Config,
+        projected_fields: Option<Vec<usize>>,
+    ) -> Result<Self> {
+        let status = Arc::new(LogScannerStatus::new());
+
+        let row_type = table_info.get_row_type();
+        let arrow_schema = if let Some(selected) = projected_fields.as_ref() {
+            let kept = selected
+                .iter()
+                .map(|&idx| row_type.fields()[idx].clone())
+                .collect::<Vec<_>>();
+            to_arrow_schema(&crate::metadata::RowType::new(kept))?
+        } else {
+            to_arrow_schema(row_type)?
+        };
+
+        // One metrics handle is shared between this scanner and its fetcher.
+        let metrics = Arc::new(ScannerMetrics::new(&table_info.table_path));
+        let log_fetcher = LogFetcher::new(
+            table_info.clone(),
+            connections,
+            Arc::clone(&metadata),
+            Arc::clone(&status),
+            config,
+            projected_fields,
+            Arc::clone(&metrics),
+        )?;
+
+        Ok(Self {
+            table_path: table_info.table_path.clone(),
+            table_id: table_info.table_id,
+            is_partitioned_table: table_info.is_partitioned(),
+            metadata,
+            log_scanner_status: status,
+            log_fetcher,
+            arrow_schema,
+            reader_active: std::sync::atomic::AtomicBool::new(false),
+            poll_state: Mutex::new(PollState::default()),
+            metrics,
+        })
+    }
+
+    /// Returns `Err(IllegalArgument)` while the `reader_active` flag is set,
+    /// `Ok(())` otherwise. Called at the top of every subscribe / unsubscribe
+    /// method on this type.
+    fn check_no_active_reader(&self) -> Result<()> {
+        let reader_is_active = self
+            .reader_active
+            .load(std::sync::atomic::Ordering::Acquire);
+        if !reader_is_active {
+            return Ok(());
+        }
+
+        Err(Error::IllegalArgument {
+            message: "Cannot modify subscriptions while a RecordBatchLogReader is active. \
+                      Drop the reader first."
+                .to_string(),
+        })
+    }
+
+    /// Polls for records, waiting at most `timeout` for data to arrive.
+    ///
+    /// Returns the collected records as soon as any are available, after
+    /// sending the next round of fetches. Returns an empty [`ScanRecords`] if
+    /// the timeout elapses first. Errors from fetching or from waiting on the
+    /// fetch buffer are propagated.
+    ///
+    /// The remaining wait time is derived from the elapsed time rather than
+    /// from an absolute `Instant` deadline: `Instant + Duration` may panic on
+    /// overflow, so a very large `timeout` (e.g. `Duration::MAX`) must never
+    /// be added to `Instant::now()`.
+    async fn poll_records(&self, timeout: Duration) -> Result<ScanRecords> {
+        // Pairs record_poll_start (now) with record_poll_end
+        // (drop). Runs on every exit, including the cancellation path
+        // where the caller drops this future.
+        let _poll_guard = PollGuard::new(self);
+        let start = Instant::now();
+
+        loop {
+            // Try to collect fetches
+            let fetch_result = self.poll_for_fetches().await?;
+
+            if !fetch_result.is_empty() {
+                // We have data, send next round of fetches and return
+                // This enables pipelining while user processes the data
+                self.log_fetcher.send_fetches().await?;
+                return Ok(ScanRecords::new(fetch_result));
+            }
+
+            // No data available, check if we should wait
+            let elapsed = start.elapsed();
+            if elapsed >= timeout {
+                // Timeout reached, return empty result
+                return Ok(ScanRecords::new(HashMap::new()));
+            }
+
+            // Wait for buffer to become non-empty with remaining time.
+            // Cannot underflow: `elapsed < timeout` was checked just above.
+            let remaining = timeout - elapsed;
+            let has_data = self
+                .log_fetcher
+                .log_fetch_buffer
+                .await_not_empty(remaining)
+                .await?;
+
+            if !has_data {
+                // Timeout while waiting
+                return Ok(ScanRecords::new(HashMap::new()));
+            }
+
+            // Buffer became non-empty, try again
+        }
+    }
+
+    /// Marks the beginning of a `poll()` call and publishes
+    /// `SCANNER_TIME_BETWEEN_POLL_MS`. The very first poll publishes `0.0`,
+    /// in line with Java's `ScannerMetricGroup.recordPollStart`
+    /// (`timeMsBetweenPoll = lastPollMs != 0L ? pollStartMs - lastPollMs : 0L`).
+    ///
+    /// Single-consumer contract: the previous poll must already have recorded
+    /// its end. Java enforces this through `LogScannerImpl.acquire()` (throws
+    /// `ConcurrentModificationException`). Here a violation shows up as:
+    /// - debug builds: the `debug_assert!` panics (caught by tests),
+    /// - release builds: a `log::warn!` is emitted and the in-flight
+    ///   `poll_start_at` is overwritten so the metric series keeps moving;
+    ///   `time_between_poll_ms` / `poll_idle_ratio` for the overlapping polls
+    ///   carry no meaning until the overlap clears.
+    fn record_poll_start(&self) {
+        let started_at = Instant::now();
+
+        // All state is updated while the lock is held; the guard is released
+        // before the logger or the metrics recorder is invoked so neither can
+        // stall the next poll.
+        let mut state = self.poll_state.lock();
+        let overlapping = state.poll_start_at.is_some();
+        debug_assert!(
+            !overlapping,
+            "concurrent poll() detected on the same scanner; \
+             LogScanner / RecordBatchLogScanner are single-consumer \
+             (see LogScannerImpl.acquire() for Java parity)"
+        );
+        let gap_ms = state.last_poll_at.map_or(0.0, |previous| {
+            started_at.duration_since(previous).as_secs_f64() * 1000.0
+        });
+        state.time_between_poll_ms = gap_ms;
+        state.last_poll_at = Some(started_at);
+        state.poll_start_at = Some(started_at);
+        drop(state);
+
+        if overlapping {
+            warn!(
+                "concurrent poll() detected on scanner; single-consumer \
+                 contract violated, poll-timing metrics will be inaccurate \
+                 until the overlap clears"
+            );
+        }
+        self.metrics.record_time_between_poll_ms(gap_ms);
+    }
+
+    /// Publishes `poll_idle_ratio = poll_time / (poll_time + between_time)`.
+    /// For the first poll `between_time` is 0, so the ratio is 1.0
+    /// (poll-bound). Nothing is published when the denominator is not
+    /// positive.
+    ///
+    /// Orphan call: when no `record_poll_start` is in flight, a `log::warn!`
+    /// is emitted (the single-consumer contract may have been violated, e.g.
+    /// in release builds where the start-side `debug_assert!` is compiled
+    /// out) and the metric update is skipped.
+    fn record_poll_end(&self) {
+        let ended_at = Instant::now();
+
+        // Only the state hand-off happens under the lock; the arithmetic,
+        // the metric emission and the warning all run after it is released.
+        let in_flight = {
+            let mut state = self.poll_state.lock();
+            let between_ms = state.time_between_poll_ms;
+            state
+                .poll_start_at
+                .take()
+                .map(|started_at| (started_at, between_ms))
+        };
+
+        match in_flight {
+            None => {
+                warn!(
+                    "record_poll_end called without a matching record_poll_start; \
+                     single-consumer contract may have been violated, idle ratio \
+                     for this poll is not emitted"
+                );
+            }
+            Some((started_at, between_ms)) => {
+                let poll_ms = ended_at.duration_since(started_at).as_secs_f64() * 1000.0;
+                let total_ms = poll_ms + between_ms;
+                if total_ms > 0.0 {
+                    self.metrics.record_poll_idle_ratio(poll_ms / total_ms);
+                }
+            }
+        }
+    }
+
+    /// Subscribes one bucket of a non-partitioned table, starting at `offset`.
+    ///
+    /// Fails while a `RecordBatchLogReader` is active, or with
+    /// `UnsupportedOperation` when the table is partitioned. Otherwise the
+    /// table metadata is refreshed and the bucket assignment is recorded.
+    async fn subscribe(&self, bucket: i32, offset: i64) -> Result<()> {
+        self.check_no_active_reader()?;
+        if self.is_partitioned_table {
+            let message = "The table is a partitioned table, please use \"subscribe_partition\" to \
+                subscribe a partitioned bucket instead."
+                .to_string();
+            return Err(Error::UnsupportedOperation { message });
+        }
+
+        // The bucket is only assigned once the metadata refresh succeeded.
+        let paths = from_ref(&self.table_path);
+        self.metadata.check_and_update_table_metadata(paths).await?;
+
+        let target = TableBucket::new(self.table_id, bucket);
+        self.log_scanner_status.assign_scan_bucket(target, offset);
+        Ok(())
+    }
+
+    /// Subscribes several buckets of a non-partitioned table at once.
+    ///
+    /// `bucket_offsets` maps bucket id to start offset. Fails while a
+    /// `RecordBatchLogReader` is active, with `UnsupportedOperation` when the
+    /// table is partitioned, or with the error of `do_subscribe_buckets`
+    /// (which rejects an empty map).
+    async fn subscribe_buckets(&self, bucket_offsets: &HashMap<i32, i64>) -> Result<()> {
+        self.check_no_active_reader()?;
+        if self.is_partitioned_table {
+            let message =
+                "The table is a partitioned table, please use \"subscribe_partition_buckets\" instead."
+                    .to_string();
+            return Err(Error::UnsupportedOperation { message });
+        }
+
+        // Re-key the request by full `TableBucket` before handing it on.
+        let keyed: HashMap<TableBucket, i64> = bucket_offsets
+            .iter()
+            .map(|(&bucket_id, &offset)| (TableBucket::new(self.table_id, bucket_id), offset))
+            .collect();
+        self.do_subscribe_buckets(keyed).await
+    }
+
+    /// Subscribes one bucket of one partition of a partitioned table,
+    /// starting at `offset`.
+    ///
+    /// Fails while a `RecordBatchLogReader` is active, or with
+    /// `UnsupportedOperation` when the table is not partitioned. Otherwise the
+    /// table metadata is refreshed and the bucket assignment is recorded.
+    async fn subscribe_partition(
+        &self,
+        partition_id: PartitionId,
+        bucket: i32,
+        offset: i64,
+    ) -> Result<()> {
+        self.check_no_active_reader()?;
+        if !self.is_partitioned_table {
+            let message = "The table is not a partitioned table, please use \"subscribe\" to \
+                subscribe a non-partitioned bucket instead."
+                .to_string();
+            return Err(Error::UnsupportedOperation { message });
+        }
+
+        // The bucket is only assigned once the metadata refresh succeeded.
+        let paths = from_ref(&self.table_path);
+        self.metadata.check_and_update_table_metadata(paths).await?;
+
+        let target = TableBucket::new_with_partition(self.table_id, Some(partition_id), bucket);
+        self.log_scanner_status.assign_scan_bucket(target, offset);
+        Ok(())
+    }
+
+    /// Subscribes several partition buckets of a partitioned table at once.
+    ///
+    /// `partition_bucket_offsets` maps `(partition id, bucket id)` to start
+    /// offset. Fails while a `RecordBatchLogReader` is active, with
+    /// `UnsupportedOperation` when the table is not partitioned, or with the
+    /// error of `do_subscribe_buckets` (which rejects an empty map).
+    async fn subscribe_partition_buckets(
+        &self,
+        partition_bucket_offsets: &HashMap<(PartitionId, i32), i64>,
+    ) -> Result<()> {
+        self.check_no_active_reader()?;
+        if !self.is_partitioned_table {
+            let message = "The table is not a partitioned table, please use \"subscribe_buckets\" \
+                    to subscribe to non-partitioned buckets instead."
+                .to_string();
+            return Err(UnsupportedOperation { message });
+        }
+
+        // Re-key the request by full `TableBucket` before handing it on.
+        let keyed: HashMap<TableBucket, i64> = partition_bucket_offsets
+            .iter()
+            .map(|(&(partition_id, bucket_id), &offset)| {
+                let target =
+                    TableBucket::new_with_partition(self.table_id, Some(partition_id), bucket_id);
+                (target, offset)
+            })
+            .collect();
+        self.do_subscribe_buckets(keyed).await
+    }
+
+    /// Shared tail of the bulk subscribe methods: rejects an empty request
+    /// with `UnexpectedError`, refreshes the table metadata, then records all
+    /// assignments in one call.
+    async fn do_subscribe_buckets(&self, bucket_offsets: HashMap<TableBucket, i64>) -> Result<()> {
+        if bucket_offsets.is_empty() {
+            let message = "Bucket offsets are empty.".to_string();
+            return Err(Error::UnexpectedError {
+                message,
+                source: None,
+            });
+        }
+
+        // Assignments are only recorded once the metadata refresh succeeded.
+        let paths = from_ref(&self.table_path);
+        self.metadata.check_and_update_table_metadata(paths).await?;
+
+        self.log_scanner_status.assign_scan_buckets(bucket_offsets);
+        Ok(())
+    }
+
+    /// Removes the subscription for one bucket of a non-partitioned table.
+    ///
+    /// Fails while a `RecordBatchLogReader` is active, or with
+    /// `UnsupportedOperation` when the table is partitioned.
+    async fn unsubscribe(&self, bucket: i32) -> Result<()> {
+        self.check_no_active_reader()?;
+        if self.is_partitioned_table {
+            let message =
+                "The table is a partitioned table, please use \"unsubscribe_partition\" to \
+                    unsubscribe a partitioned bucket instead."
+                    .to_string();
+            return Err(Error::UnsupportedOperation { message });
+        }
+
+        let target = TableBucket::new(self.table_id, bucket);
+        let targets = from_ref(&target);
+        self.log_scanner_status.unassign_scan_buckets(targets);
+        Ok(())
+    }
+
+    /// Removes the subscription for one bucket of one partition.
+    ///
+    /// Fails while a `RecordBatchLogReader` is active, or with
+    /// `UnsupportedOperation` when the table is not partitioned.
+    async fn unsubscribe_partition(&self, partition_id: PartitionId, bucket: i32) -> Result<()> {
+        self.check_no_active_reader()?;
+        if !self.is_partitioned_table {
+            let message = "Can't unsubscribe a partition for a non-partitioned table.".to_string();
+            return Err(Error::UnsupportedOperation { message });
+        }
+
+        let target = TableBucket::new_with_partition(self.table_id, Some(partition_id), bucket);
+        let targets = from_ref(&target);
+        self.log_scanner_status.unassign_scan_buckets(targets);
+        Ok(())
+    }
+
+    /// Collects fetched records grouped by bucket.
+    ///
+    /// If a first collection yields nothing, new fetches are sent and the
+    /// collection is attempted once more; that second result is returned
+    /// even when it is empty.
+    async fn poll_for_fetches(&self) -> Result<HashMap<TableBucket, Vec<ScanRecord>>> {
+        let already_buffered = self.log_fetcher.collect_fetches()?;
+        if already_buffered.is_empty() {
+            // send any new fetches (won't resend pending fetches).
+            self.log_fetcher.send_fetches().await?;
+
+            // Collect completed fetches from buffer
+            self.log_fetcher.collect_fetches()
+        } else {
+            Ok(already_buffered)
+        }
+    }
+
+    /// Polls for record batches, waiting at most `timeout` for data to arrive.
+    ///
+    /// Returns the collected batches as soon as any are available, after
+    /// sending the next round of fetches. Returns an empty `Vec` if the
+    /// timeout elapses first. Errors from fetching or from waiting on the
+    /// fetch buffer are propagated.
+    ///
+    /// The remaining wait time is derived from the elapsed time rather than
+    /// from an absolute `Instant` deadline: `Instant + Duration` may panic on
+    /// overflow, so a very large `timeout` (e.g. `Duration::MAX`) must never
+    /// be added to `Instant::now()`.
+    async fn poll_batches(&self, timeout: Duration) -> Result<Vec<ScanBatch>> {
+        // Pairs record_poll_start (now) with record_poll_end (drop), on every
+        // exit path including cancellation of this future.
+        let _poll_guard = PollGuard::new(self);
+        let start = Instant::now();
+
+        loop {
+            let batches = self.poll_for_batches().await?;
+
+            if !batches.is_empty() {
+                // Data is available: queue the next fetches before returning
+                // so they proceed while the caller processes this result.
+                self.log_fetcher.send_fetches().await?;
+                return Ok(batches);
+            }
+
+            let elapsed = start.elapsed();
+            if elapsed >= timeout {
+                return Ok(Vec::new());
+            }
+
+            // Cannot underflow: `elapsed < timeout` was checked just above.
+            let remaining = timeout - elapsed;
+            let has_data = self
+                .log_fetcher
+                .log_fetch_buffer
+                .await_not_empty(remaining)
+                .await?;
+
+            if !has_data {
+                // Timed out while waiting on the buffer.
+                return Ok(Vec::new());
+            }
+        }
+    }
+
+    /// Collects fetched batches.
+    ///
+    /// If a first collection yields nothing, new fetches are sent and the
+    /// collection is attempted once more; that second result is returned
+    /// even when it is empty.
+    async fn poll_for_batches(&self) -> Result<Vec<ScanBatch>> {
+        let already_buffered = self.log_fetcher.collect_batches()?;
+        if already_buffered.is_empty() {
+            self.log_fetcher.send_fetches().await?;
+            self.log_fetcher.collect_batches()
+        } else {
+            Ok(already_buffered)
+        }
+    }
+}
+
+// Implementation for LogScanner (records mode)
+impl LogScanner {
+    /// Polls for records, waiting at most `timeout` for data to arrive.
+    ///
+    /// Returns an empty [`ScanRecords`] when no data became available in time.
+    ///
+    /// # Single-consumer contract
+    ///
+    /// Do not run overlapping `poll` calls on the same scanner. An overlap
+    /// trips a `debug_assert!` in debug builds; in release builds it is logged
+    /// with `log::warn!` and the poll-timing metrics are inaccurate until the
+    /// overlap clears.
+    pub async fn poll(&self, timeout: Duration) -> Result<ScanRecords> {
+        self.inner.poll_records(timeout).await
+    }
+
+    /// Subscribes `bucket` of a non-partitioned table starting at `offset`.
+    ///
+    /// Returns `UnsupportedOperation` for a partitioned table; use
+    /// [`subscribe_partition`](Self::subscribe_partition) there.
+    pub async fn subscribe(&self, bucket: i32, offset: i64) -> Result<()> {
+        self.inner.subscribe(bucket, offset).await
+    }
+
+    /// Subscribes several buckets of a non-partitioned table at once.
+    ///
+    /// `bucket_offsets` maps bucket id to start offset; an empty map is
+    /// rejected with an error.
+    pub async fn subscribe_buckets(&self, bucket_offsets: &HashMap<i32, i64>) -> Result<()> {
+        self.inner.subscribe_buckets(bucket_offsets).await
+    }
+
+    /// Subscribes `bucket` of partition `partition_id` starting at `offset`.
+    ///
+    /// Returns `UnsupportedOperation` for a non-partitioned table; use
+    /// [`subscribe`](Self::subscribe) there.
+    pub async fn subscribe_partition(
+        &self,
+        partition_id: PartitionId,
+        bucket: i32,
+        offset: i64,
+    ) -> Result<()> {
+        self.inner
+            .subscribe_partition(partition_id, bucket, offset)
+            .await
+    }
+
+    /// Subscribes several partition buckets of a partitioned table at once.
+    ///
+    /// `partition_bucket_offsets` maps `(partition id, bucket id)` to start
+    /// offset; an empty map is rejected with an error.
+    pub async fn subscribe_partition_buckets(
+        &self,
+        partition_bucket_offsets: &HashMap<(PartitionId, i32), i64>,
+    ) -> Result<()> {
+        self.inner
+            .subscribe_partition_buckets(partition_bucket_offsets)
+            .await
+    }
+
+    /// Removes the subscription for `bucket` of a non-partitioned table.
+    ///
+    /// Returns `UnsupportedOperation` for a partitioned table.
+    pub async fn unsubscribe(&self, bucket: i32) -> Result<()> {
+        self.inner.unsubscribe(bucket).await
+    }
+
+    /// Removes the subscription for `bucket` of partition `partition_id`.
+    ///
+    /// Returns `UnsupportedOperation` for a non-partitioned table.
+    pub async fn unsubscribe_partition(
+        &self,
+        partition_id: PartitionId,
+        bucket: i32,
+    ) -> Result<()> {
+        self.inner.unsubscribe_partition(partition_id, bucket).await
+    }
+}
+
+// Implementation for RecordBatchLogScanner (batches mode)
+impl RecordBatchLogScanner {
+    /// Poll for batches with metadata (bucket and offset information).
+    ///
+    /// Waits at most `timeout` for data and returns an empty `Vec` when none
+    /// became available in time.
+    ///
+    /// # Single-consumer contract
+    ///
+    /// Do not run overlapping `poll` calls on the same scanner. An overlap
+    /// trips a `debug_assert!` in debug builds; in release builds it is logged
+    /// with `log::warn!` and the poll-timing metrics are inaccurate until the
+    /// overlap clears.
+    pub async fn poll(&self, timeout: Duration) -> Result<Vec<ScanBatch>> {
+        self.inner.poll_batches(timeout).await
+    }
+
+    /// Subscribes `bucket` of a non-partitioned table starting at `offset`.
+    ///
+    /// Returns an error while a reader is active on this scanner, or
+    /// `UnsupportedOperation` for a partitioned table.
+    pub async fn subscribe(&self, bucket: i32, offset: i64) -> Result<()> {
+        self.inner.subscribe(bucket, offset).await
+    }
+
+    /// Subscribes several buckets of a non-partitioned table at once.
+    ///
+    /// `bucket_offsets` maps bucket id to start offset; an empty map is
+    /// rejected with an error.
+    pub async fn subscribe_buckets(&self, bucket_offsets: &HashMap<i32, i64>) -> Result<()> {
+        self.inner.subscribe_buckets(bucket_offsets).await
+    }
+
+    /// Subscribes `bucket` of partition `partition_id` starting at `offset`.
+    ///
+    /// Returns an error while a reader is active on this scanner, or
+    /// `UnsupportedOperation` for a non-partitioned table.
+    pub async fn subscribe_partition(
+        &self,
+        partition_id: PartitionId,
+        bucket: i32,
+        offset: i64,
+    ) -> Result<()> {
+        self.inner
+            .subscribe_partition(partition_id, bucket, offset)
+            .await
+    }
+
+    /// Returns whether the table is partitioned
+    pub fn is_partitioned(&self) -> bool {
+        self.inner.is_partitioned_table
+    }
+
+    /// Returns all subscribed buckets with their current offsets
+    pub fn get_subscribed_buckets(&self) -> Vec<(TableBucket, i64)> {
+        self.inner.log_scanner_status.get_all_subscriptions()
+    }
+
+    /// Subscribes several partition buckets of a partitioned table at once.
+    ///
+    /// `partition_bucket_offsets` maps `(partition id, bucket id)` to start
+    /// offset; an empty map is rejected with an error.
+    pub async fn subscribe_partition_buckets(
+        &self,
+        partition_bucket_offsets: &HashMap<(PartitionId, i32), i64>,
+    ) -> Result<()> {
+        self.inner
+            .subscribe_partition_buckets(partition_bucket_offsets)
+            .await
+    }
+
+    /// Removes the subscription for `bucket` of a non-partitioned table.
+    ///
+    /// Returns an error while a reader is active on this scanner, or
+    /// `UnsupportedOperation` for a partitioned table.
+    pub async fn unsubscribe(&self, bucket: i32) -> Result<()> {
+        self.inner.unsubscribe(bucket).await
+    }
+
+    /// Removes the subscription for `bucket` of partition `partition_id`.
+    ///
+    /// Returns an error while a reader is active on this scanner, or
+    /// `UnsupportedOperation` for a non-partitioned table.
+    pub async fn unsubscribe_partition(
+        &self,
+        partition_id: PartitionId,
+        bucket: i32,
+    ) -> Result<()> {
+        self.inner.unsubscribe_partition(partition_id, bucket).await
+    }
+
+    /// Returns the Arrow schema for batches produced by this scanner.
+    pub fn schema(&self) -> SchemaRef {
+        self.inner.arrow_schema.clone()
+    }
+
+    /// Returns the path of the table this scanner reads from.
+    pub fn table_path(&self) -> &TablePath {
+        &self.inner.table_path
+    }
+
+    /// Returns the id of the table this scanner reads from.
+    pub fn table_id(&self) -> TableId {
+        self.inner.table_id
+    }
+
+    /// Creates a new handle to the same underlying scanner state.
+    ///
+    /// Binding layers (Python, C++) that hold the scanner behind shared
+    /// ownership (`Arc`) cannot move it into a [`crate::client::RecordBatchLogReader`].
+    /// This method produces a second handle so the reader can take ownership
+    /// while the binding retains its reference for subscription management.
+    ///
+    /// **Not intended for general use** — prefer moving the scanner directly.
+    #[doc(hidden)]
+    pub fn new_shared_handle(&self) -> Self {
+        RecordBatchLogScanner {
+            inner: Arc::clone(&self.inner),
+        }
+    }
+
+    /// Atomically marks the scanner as having an active reader.
+    ///
+    /// Returns `Err(IllegalArgument)` if another reader is already active on
+    /// this scanner — only one [`crate::client::RecordBatchLogReader`] may
+    /// iterate per scanner at a time. This mirrors Java's
+    /// `LogScannerImpl.acquire()` single-consumer guard.
+    pub(crate) fn try_set_reader_active(&self) -> Result<()> {
+        use std::sync::atomic::Ordering;
+        // The compare-exchange only succeeds on a `false` -> `true`
+        // transition, so a second caller sees the failure branch.
+        self.inner
+            .reader_active
+            .compare_exchange(false, true, Ordering::AcqRel, Ordering::Acquire)
+            .map(|_| ())
+            .map_err(|_| Error::IllegalArgument {
+                message: "Another RecordBatchLogReader is already active on this scanner. \
+                          Drop the existing reader first."
+                    .to_string(),
+            })
+    }
+
+    /// Clears the active-reader guard, re-enabling subscription changes.
+    pub(crate) fn clear_reader_active(&self) {
+        self.inner
+            .reader_active
+            .store(false, std::sync::atomic::Ordering::Release);
+    }
+
+    /// Synchronous, infallible counterpart to [`unsubscribe`](Self::unsubscribe).
+    ///
+    /// Exists so [`crate::client::RecordBatchLogReader`]'s `Drop` impl can
+    /// release lingering subscriptions without `.await`. The async version is
+    /// also synchronous under the hood (it only acquires a lock and removes
+    /// from a map — no IO), so this exposes the same work without the
+    /// async wrapper. Silently no-ops on partitioned/non-partitioned mismatch
+    /// because `Drop` cannot return errors; callers must pick the correct
+    /// variant.
+    ///
+    /// Unlike the async version, this does not check the active-reader guard.
+    ///
+    /// **Not intended for general use** — prefer the async
+    /// [`unsubscribe`](Self::unsubscribe).
+    pub(crate) fn unsubscribe_sync(&self, bucket: i32) {
+        if self.inner.is_partitioned_table {
+            return;
+        }
+        let table_bucket = TableBucket::new(self.inner.table_id, bucket);
+        self.inner
+            .log_scanner_status
+            .unassign_scan_buckets(from_ref(&table_bucket));
+    }
+
+    /// Synchronous, infallible counterpart to
+    /// [`unsubscribe_partition`](Self::unsubscribe_partition). See
+    /// [`unsubscribe_sync`](Self::unsubscribe_sync) for rationale.
+    pub(crate) fn unsubscribe_partition_sync(&self, partition_id: PartitionId, bucket: i32) {
+        if !self.inner.is_partitioned_table {
+            return;
+        }
+        let table_bucket =
+            TableBucket::new_with_partition(self.inner.table_id, Some(partition_id), bucket);
+        self.inner
+            .log_scanner_status
+            .unassign_scan_buckets(from_ref(&table_bucket));
+    }
+}
+
+/// Fetch-side state of a scanner: sends fetch requests and collects the
+/// fetched records / batches that the scanner's poll methods return.
+struct LogFetcher {
+    /// RPC client handle; a clone is also given to the
+    /// `SecurityTokenManager` at construction.
+    conns: Arc<RpcClient>,
+    /// Metadata handle; a clone is also given to the
+    /// `SecurityTokenManager` at construction.
+    metadata: Arc<Metadata>,
+    /// Path of the fetched table, copied from the `TableInfo`.
+    table_path: TablePath,
+    /// Whether the fetched table is partitioned, taken from the `TableInfo`.
+    is_partitioned: bool,
+    /// Subscription state supplied by the caller of `LogFetcher::new`.
+    log_scanner_status: Arc<LogScannerStatus>,
+    /// Read context built with `is_from_remote = false`; a clone seeds
+    /// `log_fetch_buffer`.
+    read_context: ReadContext,
+    /// Read context built with `is_from_remote = true`.
+    remote_read_context: ReadContext,
+    /// Downloader for remote log data; receives credential updates from
+    /// `security_token_manager`.
+    remote_log_downloader: Arc<RemoteLogDownloader>,
+    /// Background security token manager for remote filesystem access.
+    /// Kept alive to run the background refresh task; stopped on drop.
+    #[allow(dead_code)]
+    security_token_manager: Arc<SecurityTokenManager>,
+    /// Buffer the scanner waits on (`await_not_empty`) when a poll finds no
+    /// data.
+    log_fetch_buffer: Arc<LogFetchBuffer>,
+    /// NOTE(review): presumably ids of the nodes that currently have a fetch
+    /// request in flight — confirm against `send_fetches`.
+    nodes_with_pending_fetch_requests: Arc<Mutex<HashSet<i32>>>,
+    /// Per-table scanner metric handles shared with the owning
+    /// `LogScannerInner` and `RemoteLogDownloader`.
+    metrics: Arc<ScannerMetrics>,
+    /// Copied from `Config::scanner_log_max_poll_records`.
+    max_poll_records: usize,
+    /// Copied from `Config::scanner_log_fetch_max_bytes`.
+    fetch_max_bytes: i32,
+    /// Copied from `Config::scanner_log_fetch_min_bytes`.
+    fetch_min_bytes: i32,
+    /// Copied from `Config::scanner_log_fetch_wait_max_time_ms`.
+    fetch_wait_max_time_ms: i32,
+    /// Copied from `Config::scanner_log_fetch_max_bytes_for_bucket`.
+    fetch_max_bytes_for_bucket: i32,
+}
+
+/// Bundle of handles and timing data for processing one FetchLog response.
+///
+/// NOTE(review): the field types mirror those on `LogFetcher`; presumably
+/// they are clones of the fetcher's own handles — confirm at the
+/// construction site.
+struct FetchResponseContext {
+    /// Metadata handle.
+    metadata: Arc<Metadata>,
+    /// Fetch buffer handle.
+    log_fetch_buffer: Arc<LogFetchBuffer>,
+    /// Subscription state handle.
+    log_scanner_status: Arc<LogScannerStatus>,
+    /// Read context for the non-remote read path (naming mirrors
+    /// `LogFetcher::read_context`).
+    read_context: ReadContext,
+    /// Read context for the remote read path (naming mirrors
+    /// `LogFetcher::remote_read_context`).
+    remote_read_context: ReadContext,
+    /// Remote log downloader handle.
+    remote_log_downloader: Arc<RemoteLogDownloader>,
+    /// Per-table scanner metric handles for `scanner.fetch_*` recording.
+    metrics: Arc<ScannerMetrics>,
+    /// `Instant` captured immediately before the FetchLog RPC; used to compute
+    /// `scanner.fetch_latency_ms` on a successful response.
+    request_start_time: Instant,
+}
+
+impl LogFetcher {
+    /// Builds the fetcher for `table_info`.
+    ///
+    /// Creates the local and remote read contexts (applying
+    /// `projected_fields` when present), a temporary directory for remote
+    /// logs, the fetch buffer, the security token manager and the remote log
+    /// downloader. The token manager's background refresh is started only
+    /// after the downloader was created successfully.
+    ///
+    /// Returns an error if the Arrow schema conversion, a read context, the
+    /// temporary directory or the remote log downloader cannot be created.
+    pub fn new(
+        table_info: TableInfo,
+        conns: Arc<RpcClient>,
+        metadata: Arc<Metadata>,
+        log_scanner_status: Arc<LogScannerStatus>,
+        config: &Config,
+        projected_fields: Option<Vec<usize>>,
+        metrics: Arc<ScannerMetrics>,
+    ) -> Result<Self> {
+        let full_row_type = table_info.get_row_type();
+        let full_arrow_schema = to_arrow_schema(full_row_type)?;
+
+        // Row type as seen by the reader: the selected fields when a
+        // projection is present, the full row type otherwise.
+        let projected_row_type = Arc::new(match projected_fields.as_deref() {
+            Some(selected) => RowType::new(
+                selected
+                    .iter()
+                    .map(|&idx| full_row_type.fields()[idx].clone())
+                    .collect(),
+            ),
+            None => full_row_type.clone(),
+        });
+
+        // The two contexts differ only in the `is_from_remote` flag.
+        let read_context = Self::create_read_context(
+            full_arrow_schema.clone(),
+            Arc::clone(&projected_row_type),
+            projected_fields.clone(),
+            false,
+        )?
+        .with_fluss_row_type(Arc::clone(&projected_row_type));
+        let remote_read_context = Self::create_read_context(
+            full_arrow_schema,
+            Arc::clone(&projected_row_type),
+            projected_fields.clone(),
+            true,
+        )?
+        .with_fluss_row_type(projected_row_type);
+
+        let remote_log_dir = TempDir::with_prefix("fluss-remote-logs")?;
+        let log_fetch_buffer = Arc::new(LogFetchBuffer::new(read_context.clone()));
+
+        // Create security token manager for background token refresh
+        let security_token_manager = Arc::new(SecurityTokenManager::new(
+            Arc::clone(&conns),
+            Arc::clone(&metadata),
+        ));
+
+        // Subscribe to credentials updates and pass to remote log downloader
+        let credentials_rx = security_token_manager.subscribe();
+        let downloader = RemoteLogDownloader::new(
+            remote_log_dir,
+            config.scanner_remote_log_prefetch_num,
+            config.remote_file_download_thread_num,
+            config.scanner_remote_log_read_concurrency,
+            credentials_rx,
+            Arc::clone(&metrics),
+        )?;
+        let remote_log_downloader = Arc::new(downloader);
+
+        // Start the background token refresh task
+        security_token_manager.start();
+
+        let table_path = table_info.table_path.clone();
+        let is_partitioned = table_info.is_partitioned();
+        Ok(LogFetcher {
+            conns,
+            metadata,
+            table_path,
+            is_partitioned,
+            log_scanner_status,
+            read_context,
+            remote_read_context,
+            remote_log_downloader,
+            security_token_manager,
+            log_fetch_buffer,
+            nodes_with_pending_fetch_requests: Arc::new(Mutex::new(HashSet::new())),
+            metrics,
+            max_poll_records: config.scanner_log_max_poll_records,
+            fetch_max_bytes: config.scanner_log_fetch_max_bytes,
+            fetch_min_bytes: config.scanner_log_fetch_min_bytes,
+            fetch_wait_max_time_ms: config.scanner_log_fetch_wait_max_time_ms,
+            fetch_max_bytes_for_bucket: config.scanner_log_fetch_max_bytes_for_bucket,
+        })
+    }
+
+    /// Builds a [`ReadContext`] over `full_arrow_schema` and `row_type`.
+    ///
+    /// With `projected_fields` present the context is created through
+    /// `ReadContext::with_projection_pushdown`, whose error (if any) is
+    /// returned; without a projection the construction cannot fail.
+    fn create_read_context(
+        full_arrow_schema: SchemaRef,
+        row_type: Arc<RowType>,
+        projected_fields: Option<Vec<usize>>,
+        is_from_remote: bool,
+    ) -> Result<ReadContext> {
+        if let Some(fields) = projected_fields {
+            return ReadContext::with_projection_pushdown(
+                full_arrow_schema,
+                row_type,
+                fields,
+                is_from_remote,
+            );
+        }
+
+        Ok(ReadContext::new(full_arrow_schema, row_type, is_from_remote))
+    }
+
+    fn describe_fetch_error(
+        error: FlussError,
+        table_bucket: &TableBucket,
+        fetch_offset: i64,
+        error_message: &str,
+    ) -> FetchErrorContext {
+        match error {
+            FlussError::NotLeaderOrFollower
+            | FlussError::LogStorageException
+            | FlussError::KvStorageException
+            | FlussError::StorageException
+            | FlussError::FencedLeaderEpochException
+            | FlussError::LeaderNotAvailableException => FetchErrorContext {
+                action: FetchErrorAction::Ignore,
+                log_level: FetchErrorLogLevel::Debug,
+                log_message: format!(
+                    "Error in fetch for bucket {table_bucket}: {error:?}: {error_message}"
+                ),
+            },
+            FlussError::UnknownTableOrBucketException => FetchErrorContext {
+                action: FetchErrorAction::Ignore,
+                log_level: FetchErrorLogLevel::Warn,
+                log_message: format!(
+                    "Received unknown table or bucket error in fetch for bucket {table_bucket}"
+                ),
+            },
+            FlussError::LogOffsetOutOfRangeException => FetchErrorContext {
+                action: FetchErrorAction::LogOffsetOutOfRange,
+                log_level: FetchErrorLogLevel::Debug,
+                log_message: format!(
+                    "The fetching offset {fetch_offset} is out of range for bucket {table_bucket}: {error_message}"
+                ),
+            },
+            FlussError::AuthorizationException => FetchErrorContext {
+                action: FetchErrorAction::Authorization,
+                log_level: FetchErrorLogLevel::Debug,
+                log_message: format!(
+                    "Authorization error while fetching offset {fetch_offset} for bucket {table_bucket}: {error_message}"
+                ),
+            },
+            FlussError::UnknownServerError => FetchErrorContext {
+                action: FetchErrorAction::Ignore,
+                log_level: FetchErrorLogLevel::Warn,
+                log_message: format!(
+                    "Unknown server error while fetching offset {fetch_offset} for bucket {table_bucket}: {error_message}"
+                ),
+            },
+            FlussError::CorruptMessage => FetchErrorContext {
+                action: FetchErrorAction::CorruptMessage,
+                log_level: FetchErrorLogLevel::Debug,
+                log_message: format!(
+                    "Encountered corrupt message when fetching offset {fetch_offset} for bucket {table_bucket}: {error_message}"
+                ),
+            },
+            _ => FetchErrorContext {
+                action: FetchErrorAction::Unexpected,
+                log_level: FetchErrorLogLevel::Debug,
+                log_message: format!(
+                    "Unexpected error code {error:?} while fetching at offset {fetch_offset} from bucket {table_bucket}: {error_message}"
+                ),
+            },
+        }
+    }
+
+    fn should_invalidate_table_meta(error: FlussError) -> bool {
+        matches!(
+            error,
+            FlussError::NotLeaderOrFollower
+                | FlussError::LeaderNotAvailableException
+                | FlussError::FencedLeaderEpochException
+                | FlussError::UnknownTableOrBucketException
+                | FlussError::InvalidCoordinatorException
+        )
+    }
+
+    /// Refreshes cluster metadata for buckets whose leader is currently unknown.
+    ///
+    /// * For a partitioned table, the partition ids of all leaderless buckets are
+    ///   collected (each id at most once) and refreshed via
+    ///   `update_tables_metadata`.
+    /// * For a non-partitioned table, a single leaderless bucket is enough to
+    ///   trigger `update_table_metadata` for the whole table.
+    ///
+    /// Connection-level RPC failures (`RpcError::ConnectionError` /
+    /// `RpcError::Poisoned`) are logged and swallowed so the caller can retry on
+    /// the next round; every other error is returned.
+    ///
+    /// # Errors
+    ///
+    /// Returns `Error::UnexpectedError` if a bucket of a partitioned table carries
+    /// no partition id (previously this panicked), or the error of the metadata
+    /// update when it is not retriable.
+    async fn check_and_update_metadata(&self, table_buckets: &[TableBucket]) -> Result<()> {
+        let mut partition_ids = Vec::new();
+        let mut need_update = false;
+
+        for tb in table_buckets {
+            if self.get_table_bucket_leader(tb).is_some() {
+                continue;
+            }
+
+            if !self.is_partitioned {
+                // One leaderless bucket is enough to refresh the whole table.
+                need_update = true;
+                break;
+            }
+
+            let Some(partition_id) = tb.partition_id() else {
+                return Err(Error::UnexpectedError {
+                    message: format!(
+                        "Bucket {tb} belongs to a partitioned table but has no partition id"
+                    ),
+                    source: None,
+                });
+            };
+            // Several buckets usually share a partition; request each partition only once.
+            if !partition_ids.contains(&partition_id) {
+                partition_ids.push(partition_id);
+            }
+        }
+
+        let update_result = if self.is_partitioned && !partition_ids.is_empty() {
+            self.metadata
+                .update_tables_metadata(
+                    &HashSet::from([&self.table_path]),
+                    &HashSet::new(),
+                    partition_ids,
+                )
+                .await
+        } else if need_update {
+            self.metadata.update_table_metadata(&self.table_path).await
+        } else {
+            Ok(())
+        };
+
+        // TODO: Handle PartitionNotExist error like java side
+        update_result.or_else(|e| {
+            if let Error::RpcError { source, .. } = &e
+                && matches!(source, RpcError::ConnectionError(_) | RpcError::Poisoned(_))
+            {
+                // Transient connectivity problem: keep going and retry on the next fetch round.
+                warn!("Retrying after encountering error while updating table metadata: {e}");
+                Ok(())
+            } else {
+                Err(e)
+            }
+        })
+    }
+
+    /// Send fetch requests asynchronously without waiting for responses
+    ///
+    /// First refreshes metadata for the currently fetchable buckets, then builds one
+    /// fetch request per leader node and spawns a tokio task for each of them. Every
+    /// task resolves the tablet server, obtains a connection, performs the RPC and
+    /// hands the response to `handle_fetch_response`. Failures inside a task are
+    /// logged and routed to `handle_fetch_failure`; they are not returned to the caller.
+    ///
+    /// Returns an error only if the metadata check at the beginning fails.
+    async fn send_fetches(&self) -> Result<()> {
+        self.check_and_update_metadata(self.fetchable_buckets().as_slice())
+            .await?;
+        let fetch_request = self.prepare_fetch_log_requests().await;
+
+        for (leader, fetch_request) in fetch_request {
+            debug!("Adding pending request for node id {leader}");
+            // Mark this node as having an in-flight fetch request. The mark is removed
+            // by the scope guard inside the spawned task. The extra block keeps the
+            // lock guard scoped to the insert.
+            {
+                self.nodes_with_pending_fetch_requests.lock().insert(leader);
+            }
+
+            let cluster = self.metadata.get_cluster().clone();
+
+            // Clone the shared handles that are moved into the spawned task.
+            let conns = Arc::clone(&self.conns);
+            let log_fetch_buffer = self.log_fetch_buffer.clone();
+            let log_scanner_status = self.log_scanner_status.clone();
+            let read_context = self.read_context.clone();
+            let remote_read_context = self.remote_read_context.clone();
+            let remote_log_downloader = Arc::clone(&self.remote_log_downloader);
+            let nodes_with_pending = self.nodes_with_pending_fetch_requests.clone();
+            let metadata = self.metadata.clone();
+            let metrics = Arc::clone(&self.metrics);
+            // Spawn async task to handle the fetch request
+            // Note: These tasks are not explicitly tracked or cancelled when LogFetcher is dropped.
+            // This is acceptable because:
+            // 1. Tasks will naturally complete (network requests will return or timeout)
+            // 2. Tasks use Arc references, so resources are properly shared
+            // 3. When the program exits, tokio runtime will clean up all tasks
+            // 4. Tasks are short-lived (network I/O operations)
+            tokio::spawn(async move {
+                // Make sure the leader is always removed from the pending nodes,
+                // whichever path the task takes to finish.
+                let _guard = scopeguard::guard((), |_| {
+                    nodes_with_pending.lock().remove(&leader);
+                });
+
+                let server_node = match cluster.get_tablet_server(leader) {
+                    Some(node) => node,
+                    None => {
+                        warn!("No server node found for leader {leader}, retrying");
+                        Self::handle_fetch_failure(metadata, &leader, &fetch_request).await;
+                        return;
+                    }
+                };
+
+                let con = match conns.get_connection(server_node).await {
+                    Ok(con) => con,
+                    Err(e) => {
+                        warn!("Retrying after error getting connection to destination node: {e:?}");
+                        Self::handle_fetch_failure(metadata, &leader, &fetch_request).await;
+                        return;
+                    }
+                };
+
+                // Java increments the fetch counter and captures `requestStartTime` immediately
+                // before the RPC. Failed connection acquisition above is not counted.
+                let request_start_time = Instant::now();
+                metrics.record_fetch_request();
+
+                let fetch_response = match con
+                    .request(message::FetchLogRequest::new(fetch_request.clone()))
+                    .await
+                {
+                    Ok(resp) => resp,
+                    Err(e) => {
+                        warn!(
+                            "Retrying after error fetching log from destination node {server_node:?}: {e:?}"
+                        );
+                        Self::handle_fetch_failure(metadata, &leader, &fetch_request).await;
+                        return;
+                    }
+                };
+
+                // Build the context after the RPC so `request_start_time` measures only RPC wall-clock
+                // time, not tablet-server lookup or connection acquisition, which matches Java's behaviour.
+                // Building it here also skips the allocation on the early-return error paths above.
+                let response_context = FetchResponseContext {
+                    metadata: metadata.clone(),
+                    log_fetch_buffer,
+                    log_scanner_status,
+                    read_context,
+                    remote_read_context,
+                    remote_log_downloader,
+                    metrics,
+                    request_start_time,
+                };
+                Self::handle_fetch_response(fetch_response, response_context).await;
+            });
+        }
+
+        Ok(())
+    }
+
+    /// Reacts to a failed fetch attempt against `server_id`.
+    ///
+    /// Gathers the ids of every table referenced by `request` and passes them,
+    /// together with the server id, to `Metadata::invalidate_server`.
+    async fn handle_fetch_failure(
+        metadata: Arc<Metadata>,
+        server_id: &i32,
+        request: &FetchLogRequest,
+    ) {
+        let affected_table_ids = request
+            .tables_req
+            .iter()
+            .map(|table_req| table_req.table_id)
+            .collect();
+
+        metadata.invalidate_server(server_id, affected_table_ids);
+    }
+
+    /// Handle fetch response and add completed fetches to buffer
+    ///
+    /// Records the fetch latency and response size metrics, then processes every
+    /// bucket contained in the response:
+    ///
+    /// * buckets that are no longer subscribed are skipped;
+    /// * buckets carrying an API error may trigger a table metadata invalidation,
+    ///   are moved to the end of the scanner status, logged, and queued in the
+    ///   buffer as an error fetch;
+    /// * buckets pointing at remote log segments are registered as pending remote
+    ///   fetches;
+    /// * buckets with in-memory records are added to the buffer as completed fetches.
+    async fn handle_fetch_response(
+        fetch_response: FetchLogResponse,
+        context: FetchResponseContext,
+    ) {
+        let FetchResponseContext {
+            metadata,
+            log_fetch_buffer,
+            log_scanner_status,
+            read_context,
+            remote_read_context,
+            remote_log_downloader,
+            metrics,
+            request_start_time,
+        } = context;
+
+        // Wall-clock time since the request was started, in milliseconds.
+        let elapsed_ms = request_start_time.elapsed().as_secs_f64() * 1000.0;
+        metrics.record_fetch_latency_ms(elapsed_ms);
+
+        // `encoded_len()` mirrors Java's `fetchLogResponse.totalSize()`:
+        // both report the serialized API message body size, excluding protocol
+        // headers and framing. Recorded unconditionally (including zero-record
+        // responses) to match Java's histogram semantics.
+        metrics.record_bytes_per_request(fetch_response.encoded_len() as f64);
+
+        for table_resp in fetch_response.tables_resp {
+            let table_id = table_resp.table_id;
+
+            for bucket_resp in table_resp.buckets_resp {
+                let bucket_id: i32 = bucket_resp.bucket_id;
+                let table_bucket =
+                    TableBucket::new_with_partition(table_id, bucket_resp.partition_id, bucket_id);
+
+                let Some(fetch_offset) = log_scanner_status.get_bucket_offset(&table_bucket) else {
+                    debug!(
+                        "Ignoring fetch log response for bucket {table_bucket} because the bucket has been unsubscribed."
+                    );
+                    continue;
+                };
+
+                // Only a present, non-`None` error code counts as a failure.
+                let failed_code = bucket_resp
+                    .error_code
+                    .filter(|code| *code != FlussError::None.code());
+                if let Some(error_code) = failed_code {
+                    let api_error: ApiError = ErrorResponse {
+                        error_code,
+                        error_message: bucket_resp.error_message.clone(),
+                    }
+                    .into();
+                    let error = FlussError::for_code(error_code);
+
+                    if Self::should_invalidate_table_meta(error) {
+                        // TODO: Consider triggering table meta invalidation from sender/lookup paths.
+                        let table_id = table_bucket.table_id();
+                        let cluster = metadata.get_cluster();
+                        match cluster.get_table_path_by_id(table_id) {
+                            Some(table_path) => {
+                                let physical_table =
+                                    PhysicalTablePath::of(Arc::new(table_path.clone()));
+                                let physical_tables = HashSet::from([physical_table]);
+                                metadata.invalidate_physical_table_meta(&physical_tables);
+                            }
+                            None => {
+                                warn!(
+                                    "Table id {table_id} is missing from table_path_by_id while invalidating table metadata"
+                                );
+                            }
+                        }
+                    }
+
+                    let error_context = Self::describe_fetch_error(
+                        error,
+                        &table_bucket,
+                        fetch_offset,
+                        api_error.message.as_str(),
+                    );
+                    log_scanner_status.move_bucket_to_end(table_bucket.clone());
+
+                    // Log at the level chosen by the error classification.
+                    match error_context.log_level {
+                        FetchErrorLogLevel::Warn => {
+                            warn!("{}", error_context.log_message);
+                        }
+                        FetchErrorLogLevel::Debug => {
+                            debug!("{}", error_context.log_message);
+                        }
+                    }
+
+                    log_fetch_buffer.add_api_error(
+                        table_bucket.clone(),
+                        api_error,
+                        error_context,
+                        fetch_offset,
+                    );
+                    continue;
+                }
+
+                // A missing high watermark is represented as -1.
+                let high_watermark = bucket_resp.high_watermark.unwrap_or(-1);
+
+                // Check if this is a remote log fetch
+                if let Some(ref remote_log_fetch_info) = bucket_resp.remote_log_fetch_info {
+                    // Remote fs props are already set by the background SecurityTokenManager
+                    let remote_fetch_info =
+                        RemoteLogFetchInfo::from_proto(remote_log_fetch_info, table_bucket.clone());
+
+                    Self::pending_remote_fetches(
+                        remote_log_downloader.clone(),
+                        log_fetch_buffer.clone(),
+                        remote_read_context.clone(),
+                        &table_bucket,
+                        remote_fetch_info,
+                        fetch_offset,
+                        high_watermark,
+                    );
+                } else if let Some(records) = bucket_resp.records {
+                    // Handle regular in-memory records - create completed fetch directly
+                    let size_in_bytes = records.len();
+                    let completed_fetch = DefaultCompletedFetch::new(
+                        table_bucket.clone(),
+                        LogRecordsBatches::new(records),
+                        size_in_bytes,
+                        read_context.clone(),
+                        fetch_offset,
+                        high_watermark,
+                    );
+                    log_fetch_buffer.add(Box::new(completed_fetch));
+                }
+            }
+        }
+    }
+
+    /// Registers one pending fetch per remote log segment described by
+    /// `remote_fetch_info`.
+    ///
+    /// The first segment is read starting at `first_start_pos` / `fetch_offset`;
+    /// every following segment is read from position 0 at its own `start_offset`.
+    /// For each segment a download is requested from `remote_log_downloader`, a
+    /// completion callback that calls `try_complete` on the buffer for this bucket
+    /// is attached, and the resulting `RemotePendingFetch` is handed to
+    /// `log_fetch_buffer.pend`.
+    fn pending_remote_fetches(
+        remote_log_downloader: Arc<RemoteLogDownloader>,
+        log_fetch_buffer: Arc<LogFetchBuffer>,
+        read_context: ReadContext,
+        table_bucket: &TableBucket,
+        remote_fetch_info: RemoteLogFetchInfo,
+        fetch_offset: i64,
+        high_watermark: i64,
+    ) {
+        for (index, segment) in remote_fetch_info.remote_log_segments.iter().enumerate() {
+            // Only the first segment honours the requested start position/offset.
+            let (pos_in_log_segment, segment_fetch_offset) = if index == 0 {
+                (remote_fetch_info.first_start_pos, fetch_offset)
+            } else {
+                (0, segment.start_offset)
+            };
+
+            // todo:
+            // 1: control the max threads to download remote segment
+            // 2: introduce priority queue to priority highest for earliest segment
+            let download_future = remote_log_downloader
+                .request_remote_log(&remote_fetch_info.remote_log_tablet_dir, segment);
+
+            // Register callback to be called when download completes
+            // (similar to Java's downloadFuture.onComplete)
+            // This must be done before creating RemotePendingFetch to avoid move issues
+            let callback_bucket = table_bucket.clone();
+            let callback_buffer = log_fetch_buffer.clone();
+            download_future.on_complete(move || {
+                callback_buffer.try_complete(&callback_bucket);
+            });
+
+            // Add to pending fetches in buffer (similar to Java's logFetchBuffer.pend)
+            log_fetch_buffer.pend(Box::new(RemotePendingFetch::new(
+                segment.clone(),
+                download_future,
+                pos_in_log_segment,
+                segment_fetch_offset,
+                high_watermark,
+                read_context.clone(),
+            )));
+        }
+    }
+
+    /// Collect completed fetches from buffer
+    /// Reference: LogFetchCollector.collectFetch in Java
+    ///
+    /// Drains up to `max_poll_records` records from the fetch buffer, grouped by
+    /// bucket. Fetches polled from the buffer are initialized first (offset and
+    /// subscription checks, high watermark update) and then consumed as the
+    /// "next in line" fetch; a fetch that is only partly consumed is put back for
+    /// the next call.
+    ///
+    /// # Errors
+    ///
+    /// An error is returned only when nothing has been collected yet. If records
+    /// were already collected when an error occurs, those records are returned and
+    /// the error is logged instead. Their offsets have already been advanced by
+    /// `fetch_records_from_fetch`, so dropping them would lose data.
+    fn collect_fetches(&self) -> Result<HashMap<TableBucket, Vec<ScanRecord>>> {
+        let mut result: HashMap<TableBucket, Vec<ScanRecord>> = HashMap::new();
+        let mut records_remaining = self.max_poll_records;
+
+        // A labeled block lets failures `break` out with the error while keeping
+        // `result` alive. A plain `return Err(..)` in here would leave the whole
+        // function and make the partial-result handling below unreachable.
+        let collect_result: Result<()> = 'collect: {
+            while records_remaining > 0 {
+                match self.log_fetch_buffer.next_in_line_fetch() {
+                    // Fetch records from next_in_line
+                    Some(mut next_fetch) if !next_fetch.is_consumed() => {
+                        let records = match self
+                            .fetch_records_from_fetch(&mut next_fetch, records_remaining)
+                        {
+                            Ok(records) => records,
+                            Err(e) => {
+                                if !next_fetch.is_consumed() {
+                                    self.log_fetch_buffer
+                                        .set_next_in_line_fetch(Some(next_fetch));
+                                }
+                                break 'collect Err(e);
+                            }
+                        };
+
+                        if !records.is_empty() {
+                            let records_count = records.len();
+                            // Merge with existing records for this bucket
+                            result
+                                .entry(next_fetch.table_bucket().clone())
+                                .or_default()
+                                .extend(records);
+
+                            records_remaining = records_remaining.saturating_sub(records_count);
+                        }
+
+                        // If the fetch is not fully consumed, put it back for the next round
+                        if !next_fetch.is_consumed() {
+                            self.log_fetch_buffer
+                                .set_next_in_line_fetch(Some(next_fetch));
+                        }
+                        // If consumed, next_fetch will be dropped here (which is correct)
+                    }
+                    // Nothing in line (or it is fully consumed): get a new fetch from buffer
+                    _ => {
+                        let Some(completed_fetch) = self.log_fetch_buffer.poll() else {
+                            // No more fetches available
+                            break;
+                        };
+
+                        if completed_fetch.is_initialized() {
+                            self.log_fetch_buffer
+                                .set_next_in_line_fetch(Some(completed_fetch));
+                            continue;
+                        }
+
+                        // Note: poll() already removed the fetch from buffer, so unlike the
+                        // Java implementation there is nothing left to remove on failure.
+                        match self.initialize_fetch(completed_fetch) {
+                            Ok(initialized) => {
+                                self.log_fetch_buffer.set_next_in_line_fetch(initialized);
+                            }
+                            Err(e) => break 'collect Err(e),
+                        }
+                    }
+                }
+            }
+            Ok(())
+        };
+
+        match collect_result {
+            Ok(()) => Ok(result),
+            Err(e) if result.is_empty() => Err(e),
+            Err(e) => {
+                // Hand out what was collected; the failure is only logged here.
+                warn!("Returning already collected records despite error while collecting fetches: {e}");
+                Ok(result)
+            }
+        }
+    }
+
+    /// Initialize a completed fetch, checking offset match and updating high watermark
+    ///
+    /// Returns:
+    /// * `Err(..)` if the fetch carries a stored error, or an API error whose
+    ///   action is anything other than `FetchErrorAction::Ignore`;
+    /// * `Ok(None)` if the fetch must be discarded (ignored API error, bucket no
+    ///   longer subscribed, or fetch offset differing from the current offset);
+    /// * `Ok(Some(fetch))` with the fetch marked as initialized otherwise.
+    fn initialize_fetch(
+        &self,
+        mut completed_fetch: Box<dyn CompletedFetch>,
+    ) -> Result<Option<Box<dyn CompletedFetch>>> {
+        if let Some(error) = completed_fetch.take_error() {
+            return Err(error);
+        }
+
+        let table_bucket = completed_fetch.table_bucket().clone();
+        let fetch_offset = completed_fetch.next_fetch_offset();
+
+        if let Some(api_error) = completed_fetch.api_error() {
+            let error = FlussError::for_code(api_error.code);
+            let error_message = api_error.message.as_str();
+            self.log_scanner_status
+                .move_bucket_to_end(table_bucket.clone());
+
+            // Without a recorded context the error is treated as unexpected.
+            let action = completed_fetch
+                .fetch_error_context()
+                .map_or(FetchErrorAction::Unexpected, |context| context.action);
+
+            // Small constructor for the repeated `UnexpectedError` shape.
+            let unexpected = |message: String| Error::UnexpectedError {
+                message,
+                source: None,
+            };
+
+            return match action {
+                FetchErrorAction::Ignore => Ok(None),
+                FetchErrorAction::Authorization => Err(Error::FlussAPIError {
+                    api_error: ApiError {
+                        code: api_error.code,
+                        message: api_error.message.to_string(),
+                    },
+                }),
+                FetchErrorAction::LogOffsetOutOfRange => Err(unexpected(format!(
+                    "The fetching offset {fetch_offset} is out of range: {error_message}"
+                ))),
+                FetchErrorAction::CorruptMessage => Err(unexpected(format!(
+                    "Encountered corrupt message when fetching offset {fetch_offset} for bucket {table_bucket}: {error_message}"
+                ))),
+                FetchErrorAction::Unexpected => Err(unexpected(format!(
+                    "Unexpected error code {error:?} while fetching at offset {fetch_offset} from bucket {table_bucket}: {error_message}"
+                ))),
+            };
+        }
+
+        // Check if bucket is still subscribed
+        let current_offset = match self.log_scanner_status.get_bucket_offset(&table_bucket) {
+            Some(offset) => offset,
+            None => {
+                warn!(
+                    "Discarding stale fetch response for bucket {table_bucket:?} since the bucket has been unsubscribed"
+                );
+                return Ok(None);
+            }
+        };
+
+        // Check if offset matches
+        if fetch_offset != current_offset {
+            warn!(
+                "Discarding stale fetch response for bucket {table_bucket:?} since its offset {fetch_offset} does not match the expected offset {current_offset}"
+            );
+            return Ok(None);
+        }
+
+        // Update high watermark (negative values are not applied)
+        let high_watermark = completed_fetch.high_watermark();
+        if high_watermark >= 0 {
+            self.log_scanner_status
+                .update_high_watermark(&table_bucket, high_watermark);
+        }
+
+        completed_fetch.set_initialized();
+        Ok(Some(completed_fetch))
+    }
+
+    /// Fetch records from a completed fetch, checking offset match
+    ///
+    /// Returns up to `max_records` records when the fetch starts exactly at the
+    /// bucket's current offset, advancing the tracked offset afterwards. If the
+    /// bucket is no longer subscribed, or the fetch does not start at the current
+    /// offset, the fetch is drained and an empty vector is returned.
+    fn fetch_records_from_fetch(
+        &self,
+        next_in_line_fetch: &mut Box<dyn CompletedFetch>,
+        max_records: usize,
+    ) -> Result<Vec<ScanRecord>> {
+        let table_bucket = next_in_line_fetch.table_bucket().clone();
+
+        let Some(current_offset) = self.log_scanner_status.get_bucket_offset(&table_bucket) else {
+            warn!(
+                "Ignoring fetched records for {table_bucket:?} since the bucket has been unsubscribed"
+            );
+            next_in_line_fetch.drain();
+            return Ok(Vec::new());
+        };
+
+        let fetch_offset = next_in_line_fetch.next_fetch_offset();
+        if fetch_offset != current_offset {
+            // These records aren't next in line, ignore them
+            warn!(
+                "Ignoring fetched records for {table_bucket:?} at offset {fetch_offset} since the current offset is {current_offset}"
+            );
+            next_in_line_fetch.drain();
+            return Ok(Vec::new());
+        }
+
+        // This fetch is next in line: hand out its records.
+        let records = next_in_line_fetch.fetch_records(max_records)?;
+
+        let next_fetch_offset = next_in_line_fetch.next_fetch_offset();
+        if next_fetch_offset > current_offset {
+            self.log_scanner_status
+                .update_offset(&table_bucket, next_fetch_offset);
+        }
+
+        let fully_read = next_in_line_fetch.is_consumed() && next_in_line_fetch.records_read() > 0;
+        if fully_read {
+            self.log_scanner_status
+                .move_bucket_to_end(table_bucket.clone());
+        }
+
+        Ok(records)
+    }
+
+    /// Collect completed fetches as ScanBatches (with bucket and offset metadata)
+    ///
+    /// Collects at most `MAX_BATCHES` batches per call and stops once roughly
+    /// `MAX_BYTES` of Arrow array memory has been gathered (soft cap: the last
+    /// fetch may push the total over the limit).
+    ///
+    /// # Errors
+    ///
+    /// An error is returned only when no batch has been collected yet. If batches
+    /// were already collected when an error occurs, they are returned and the
+    /// error is logged instead. Their offsets have already been advanced by
+    /// `fetch_batches_from_fetch`, so dropping them would lose data.
+    fn collect_batches(&self) -> Result<Vec<ScanBatch>> {
+        // Limit memory usage with both batch count and byte size constraints.
+        // Max 100 batches per poll, but also check total bytes (soft cap ~64MB).
+        const MAX_BATCHES: usize = 100;
+        const MAX_BYTES: usize = 64 * 1024 * 1024; // 64MB soft cap
+        let mut result: Vec<ScanBatch> = Vec::new();
+        let mut batches_remaining = MAX_BATCHES;
+        let mut bytes_consumed: usize = 0;
+
+        // A labeled block lets failures `break` out with the error while keeping
+        // `result` alive. `?` or `return` in here would leave the whole function
+        // and make the partial-result handling below unreachable.
+        let collect_result: Result<()> = 'collect: {
+            while batches_remaining > 0 && bytes_consumed < MAX_BYTES {
+                match self.log_fetch_buffer.next_in_line_fetch() {
+                    Some(mut next_fetch) if !next_fetch.is_consumed() => {
+                        let scan_batches = match self
+                            .fetch_batches_from_fetch(&mut next_fetch, batches_remaining)
+                        {
+                            Ok(batches) => batches,
+                            Err(e) => break 'collect Err(e),
+                        };
+                        let batch_count = scan_batches.len();
+
+                        if !scan_batches.is_empty() {
+                            // Track bytes consumed (soft cap - may exceed by one fetch)
+                            let batch_bytes: usize = scan_batches
+                                .iter()
+                                .map(|sb| sb.batch().get_array_memory_size())
+                                .sum();
+                            bytes_consumed += batch_bytes;
+
+                            result.extend(scan_batches);
+                            batches_remaining = batches_remaining.saturating_sub(batch_count);
+                        }
+
+                        // If the fetch is not fully consumed, put it back for the next round
+                        if !next_fetch.is_consumed() {
+                            self.log_fetch_buffer
+                                .set_next_in_line_fetch(Some(next_fetch));
+                        }
+                    }
+                    // Nothing in line (or it is fully consumed): get a new fetch from buffer
+                    _ => {
+                        let Some(completed_fetch) = self.log_fetch_buffer.poll() else {
+                            // No more fetches available
+                            break;
+                        };
+
+                        if completed_fetch.is_initialized() {
+                            self.log_fetch_buffer
+                                .set_next_in_line_fetch(Some(completed_fetch));
+                            continue;
+                        }
+
+                        let size_in_bytes = completed_fetch.size_in_bytes();
+                        match self.initialize_fetch(completed_fetch) {
+                            Ok(initialized) => {
+                                self.log_fetch_buffer.set_next_in_line_fetch(initialized);
+                            }
+                            Err(e) => {
+                                if result.is_empty() && size_in_bytes == 0 {
+                                    // The empty fetch is skipped as before, but no longer silently.
+                                    // NOTE(review): `collect_fetches` returns the error in this
+                                    // situation - confirm whether skipping is intended here.
+                                    warn!("Skipping empty completed fetch that failed to initialize: {e}");
+                                    continue;
+                                }
+                                break 'collect Err(e);
+                            }
+                        }
+                    }
+                }
+            }
+            Ok(())
+        };
+
+        match collect_result {
+            Ok(()) => Ok(result),
+            Err(e) if result.is_empty() => Err(e),
+            Err(e) => {
+                // Hand out what was collected; the failure is only logged here.
+                warn!("Returning already collected batches despite error while collecting fetches: {e}");
+                Ok(result)
+            }
+        }
+    }
+
+    /// Fetch batches from a completed fetch, checking offset match
+    ///
+    /// Returns up to `max_batches` batches, each wrapped in a [`ScanBatch`] with
+    /// its bucket and base offset, when the fetch starts exactly at the bucket's
+    /// current offset; the tracked offset is advanced afterwards. If the bucket
+    /// is no longer subscribed, or the fetch does not start at the current
+    /// offset, the fetch is drained and an empty vector is returned.
+    fn fetch_batches_from_fetch(
+        &self,
+        next_in_line_fetch: &mut Box<dyn CompletedFetch>,
+        max_batches: usize,
+    ) -> Result<Vec<ScanBatch>> {
+        let table_bucket = next_in_line_fetch.table_bucket().clone();
+
+        let Some(current_offset) = self.log_scanner_status.get_bucket_offset(&table_bucket) else {
+            warn!(
+                "Ignoring fetched batches for {table_bucket:?} since the bucket has been unsubscribed"
+            );
+            next_in_line_fetch.drain();
+            return Ok(Vec::new());
+        };
+
+        let fetch_offset = next_in_line_fetch.next_fetch_offset();
+        if fetch_offset != current_offset {
+            // These batches aren't next in line, ignore them
+            warn!(
+                "Ignoring fetched batches for {table_bucket:?} at offset {fetch_offset} since the current offset is {current_offset}"
+            );
+            next_in_line_fetch.drain();
+            return Ok(Vec::new());
+        }
+
+        let batches_with_offsets = next_in_line_fetch.fetch_batches(max_batches)?;
+
+        let next_fetch_offset = next_in_line_fetch.next_fetch_offset();
+        if next_fetch_offset > current_offset {
+            self.log_scanner_status
+                .update_offset(&table_bucket, next_fetch_offset);
+        }
+
+        // Convert to ScanBatch with bucket info
+        let mut scan_batches = Vec::new();
+        for (batch, base_offset) in batches_with_offsets {
+            scan_batches.push(ScanBatch::new(table_bucket.clone(), batch, base_offset));
+        }
+        Ok(scan_batches)
+    }
+
+    /// Builds one `FetchLogRequest` per leader server for the buckets that are
+    /// currently fetchable.
+    ///
+    /// A bucket is left out when it has no tracked offset, when no leader is
+    /// known for it, or when its leader is already listed in
+    /// `nodes_with_pending_fetch_requests`. The returned map is keyed by leader
+    /// id and is empty when no bucket is ready.
+    async fn prepare_fetch_log_requests(&self) -> HashMap<i32, FetchLogRequest> {
+        let buckets = self.fetchable_buckets();
+        // Every request is stamped with the table id of the first fetchable bucket.
+        let table_id = buckets.first().map(|bucket| bucket.table_id());
+
+        let mut bucket_reqs_by_leader = HashMap::new();
+        for bucket in buckets {
+            let offset = match self.log_scanner_status.get_bucket_offset(&bucket) {
+                Some(offset) => offset,
+                None => {
+                    debug!(
+                        "Skipping fetch request for bucket {bucket} because the bucket has been unsubscribed."
+                    );
+                    continue;
+                }
+            };
+
+            let leader = match self.get_table_bucket_leader(&bucket) {
+                Some(leader) => leader,
+                None => {
+                    log::trace!(
+                        "Skipping fetch request for bucket {bucket} because leader is not available."
+                    );
+                    continue;
+                }
+            };
+
+            // The lock guard is released at the end of this statement.
+            let leader_is_busy = self
+                .nodes_with_pending_fetch_requests
+                .lock()
+                .contains(&leader);
+            if leader_is_busy {
+                log::trace!(
+                    "Skipping fetch request for bucket {bucket} because previous request to server {leader} has not been processed."
+                );
+                continue;
+            }
+
+            bucket_reqs_by_leader
+                .entry(leader)
+                .or_insert_with(Vec::new)
+                .push(PbFetchLogReqForBucket {
+                    partition_id: bucket.partition_id(),
+                    bucket_id: bucket.bucket_id(),
+                    fetch_offset: offset,
+                    max_fetch_bytes: self.fetch_max_bytes_for_bucket,
+                });
+        }
+
+        // Nothing was grouped, so no bucket is ready to be fetched.
+        if bucket_reqs_by_leader.is_empty() {
+            return HashMap::new();
+        }
+
+        let (projection_pushdown_enabled, projected_fields) =
+            if let Some(fields) = self.read_context.project_fields_in_order() {
+                (true, fields.iter().map(|&i| i as i32).collect())
+            } else {
+                (false, vec![])
+            };
+
+        // At least one bucket was grouped, so `buckets` was non-empty and the id is set.
+        let table_id = table_id.unwrap();
+
+        let mut requests = HashMap::new();
+        for (leader_id, buckets_req) in bucket_reqs_by_leader {
+            let req_for_table = PbFetchLogReqForTable {
+                table_id,
+                projection_pushdown_enabled,
+                projected_fields: projected_fields.clone(),
+                buckets_req,
+                filter_predicate: None,
+                filter_schema_id: None,
+            };
+
+            requests.insert(
+                leader_id,
+                FetchLogRequest {
+                    follower_server_id: -1,
+                    max_bytes: self.fetch_max_bytes,
+                    tables_req: vec![req_for_table],
+                    max_wait_ms: Some(self.fetch_wait_max_time_ms),
+                    min_bytes: Some(self.fetch_min_bytes),
+                },
+            );
+        }
+        requests
+    }
+
+    /// Lists the buckets known to `log_scanner_status` that are not already
+    /// present in the fetch buffer.
+    fn fetchable_buckets(&self) -> Vec<TableBucket> {
+        let already_buffered: HashSet<TableBucket> = self
+            .log_fetch_buffer
+            .buffered_buckets()
+            .into_iter()
+            .collect();
+        self.log_scanner_status
+            .fetchable_buckets(|bucket| !already_buffered.contains(bucket))
+    }
+
+    /// Looks up the id of the leader for `tb` in the current cluster metadata,
+    /// returning `None` when no leader is known.
+    fn get_table_bucket_leader(&self, tb: &TableBucket) -> Option<i32> {
+        let cluster = self.metadata.get_cluster();
+        let node = cluster.leader_for(tb)?;
+        Some(node.id())
+    }
+}
+
+/// Shared record of the buckets being scanned: maps each bucket to its
+/// `BucketScanStatus`, guarded by a read/write lock.
+pub struct LogScannerStatus {
+    // Bucket -> scan status; the `Arc` lets the map be shared between holders.
+    bucket_status_map: Arc<RwLock<FairBucketStatusMap<BucketScanStatus>>>,
+}
+
+#[allow(dead_code)]
+impl LogScannerStatus {
+    /// Creates a tracker with an empty bucket map.
+    pub fn new() -> Self {
+        let bucket_status_map = Arc::new(RwLock::new(FairBucketStatusMap::new()));
+        Self { bucket_status_map }
+    }
+
+    /// Returns `true` when at least one bucket is currently tracked.
+    pub fn prepare_to_poll(&self) -> bool {
+        let guard = self.bucket_status_map.read();
+        let tracked = guard.size();
+        tracked > 0
+    }
+
+    /// Delegates to `FairBucketStatusMap::move_to_end` for `table_bucket`
+    /// while holding the write lock.
+    pub fn move_bucket_to_end(&self, table_bucket: TableBucket) {
+        self.bucket_status_map.write().move_to_end(table_bucket);
+    }
+
+    /// Returns the tracked offset of `table_bucket`, or `None` when the bucket
+    /// is not in the map.
+    pub fn get_bucket_offset(&self, table_bucket: &TableBucket) -> Option<i64> {
+        let guard = self.bucket_status_map.read();
+        let status = guard.status_value(table_bucket)?;
+        Some(status.offset())
+    }
+
+    /// Stores `high_watermark` for `table_bucket`; does nothing when the
+    /// bucket is not tracked.
+    pub fn update_high_watermark(&self, table_bucket: &TableBucket, high_watermark: i64) {
+        let tracked = self.get_status(table_bucket);
+        if let Some(status) = tracked {
+            status.set_high_watermark(high_watermark);
+        }
+    }
+
+    /// Stores `offset` for `table_bucket`; does nothing when the bucket is not
+    /// tracked.
+    pub fn update_offset(&self, table_bucket: &TableBucket, offset: i64) {
+        let tracked = self.get_status(table_bucket);
+        if let Some(status) = tracked {
+            status.set_offset(offset);
+        }
+    }
+
+    /// Assigns every bucket in `scan_bucket_offsets` at the given offset.
+    ///
+    /// A bucket that is already tracked keeps its existing status object and
+    /// only has its offset overwritten; an untracked bucket gets a new status.
+    pub fn assign_scan_buckets(&self, scan_bucket_offsets: HashMap<TableBucket, i64>) {
+        let mut guard = self.bucket_status_map.write();
+        for (bucket, offset) in scan_bucket_offsets {
+            let status = match guard.status_value(&bucket) {
+                Some(existing) => Arc::clone(existing),
+                None => Arc::new(BucketScanStatus::new(offset)),
+            };
+            status.set_offset(offset);
+            guard.update(bucket, status);
+        }
+    }
+
+    /// Assigns a single bucket with a freshly created status at `offset`.
+    pub fn assign_scan_bucket(&self, table_bucket: TableBucket, offset: i64) {
+        let fresh_status = Arc::new(BucketScanStatus::new(offset));
+        let mut guard = self.bucket_status_map.write();
+        guard.update(table_bucket, fresh_status);
+    }
+
+    /// Removes every bucket in `buckets` from the map.
+    pub fn unassign_scan_buckets(&self, buckets: &[TableBucket]) {
+        let mut guard = self.bucket_status_map.write();
+        buckets.iter().for_each(|bucket| {
+            guard.remove(bucket);
+        });
+    }
+
+    /// Collects the tracked buckets for which `is_available` returns `true`.
+    pub fn fetchable_buckets<F>(&self, is_available: F) -> Vec<TableBucket>
+    where
+        F: Fn(&TableBucket) -> bool,
+    {
+        let guard = self.bucket_status_map.read();
+        let mut available = Vec::new();
+        guard.for_each(|bucket, _| {
+            if is_available(bucket) {
+                available.push(bucket.clone());
+            }
+        });
+        available
+    }
+
+    /// Returns every tracked bucket paired with its current offset.
+    pub fn get_all_subscriptions(&self) -> Vec<(TableBucket, i64)> {
+        let guard = self.bucket_status_map.read();
+        let mut subscriptions = Vec::new();
+        guard.for_each(|bucket, status| {
+            let entry = (bucket.clone(), status.offset());
+            subscriptions.push(entry);
+        });
+        subscriptions
+    }
+
+    /// Clones the shared status handle for `table_bucket`, if it is tracked.
+    fn get_status(&self, table_bucket: &TableBucket) -> Option<Arc<BucketScanStatus>> {
+        let guard = self.bucket_status_map.read();
+        guard.status_value(table_bucket).cloned()
+    }
+}
+
+/// The default value is the same empty tracker produced by `LogScannerStatus::new`.
+impl Default for LogScannerStatus {
+    fn default() -> Self {
+        LogScannerStatus::new()
+    }
+}
+
+/// Scan position for a single bucket: its current offset and the most
+/// recently stored high watermark, each behind its own lock.
+#[derive(Debug)]
+#[allow(dead_code)]
+pub struct BucketScanStatus {
+    // Current offset for the bucket; set at construction and via `set_offset`.
+    offset: RwLock<i64>,
+    // Starts at 0 and is overwritten via `set_high_watermark`.
+    high_watermark: RwLock<i64>,
+}
+
+#[allow(dead_code)]
+impl BucketScanStatus {
+    /// Creates a status positioned at `offset` with a high watermark of 0.
+    pub fn new(offset: i64) -> Self {
+        let offset = RwLock::new(offset);
+        let high_watermark = RwLock::new(0);
+        Self {
+            offset,
+            high_watermark,
+        }
+    }
+
+    /// Reads the current offset.
+    pub fn offset(&self) -> i64 {
+        let guard = self.offset.read();
+        *guard
+    }
+
+    /// Overwrites the current offset.
+    pub fn set_offset(&self, offset: i64) {
+        let mut guard = self.offset.write();
+        *guard = offset;
+    }
+
+    /// Reads the stored high watermark.
+    pub fn high_watermark(&self) -> i64 {
+        let guard = self.high_watermark.read();
+        *guard
+    }
+
+    /// Overwrites the stored high watermark.
+    pub fn set_high_watermark(&self, high_watermark: i64) {
+        let mut guard = self.high_watermark.write();
+        *guard = high_watermark;
+    }
+}
+
+/// Checks that the table can be read with a log scan.
+///
+/// Returns `UnsupportedOperation` when the table's schema has a primary key,
+/// or when its configured log format is anything other than `ARROW`. An error
+/// from reading the log format out of the table config is propagated as-is.
+fn validate_scan_support(table_path: &TablePath, table_info: &TableInfo) -> Result<()> {
+    let has_primary_key = table_info.schema.primary_key().is_some();
+    if has_primary_key {
+        let message = format!("Table {table_path} is not a Log Table and doesn't support scan.");
+        return Err(UnsupportedOperation { message });
+    }
+
+    let log_format = table_info.table_config.get_log_format()?;
+    if log_format == LogFormat::ARROW {
+        Ok(())
+    } else {
+        let message = format!(
+            "Scan is only supported for ARROW format and table {table_path} uses {log_format} format"
+        );
+        Err(UnsupportedOperation { message })
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::client::WriteRecord;
+    use crate::client::metadata::Metadata;
+    use crate::compression::{
+        ArrowCompressionInfo, ArrowCompressionRatioEstimator, ArrowCompressionType,
+        DEFAULT_NON_ZSTD_COMPRESSION_LEVEL,
+    };
+    use crate::metadata::{DataTypes, PhysicalTablePath, Schema, TableInfo, TablePath};
+    use crate::proto::{PbFetchLogRespForBucket, PbFetchLogRespForTable};
+    use crate::record::MemoryLogRecordsArrowBuilder;
+    use crate::row::{Datum, GenericRow};
+    use crate::rpc::FlussError;
+    use crate::test_utils::{
+        assert_scanner_entries_labeled, build_cluster_arc, build_table_info, test_scanner_metrics,
+    };
+
+    /// Serializes a single appended row (`Int32(1)`) for `table_info` into
+    /// uncompressed Arrow log-record bytes.
+    fn build_records(table_info: &TableInfo, table_path: Arc<TablePath>) -> Result<Vec<u8>> {
+        let no_compression = ArrowCompressionInfo {
+            compression_type: ArrowCompressionType::None,
+            compression_level: DEFAULT_NON_ZSTD_COMPRESSION_LEVEL,
+        };
+        let ratio_estimator = Arc::new(ArrowCompressionRatioEstimator::default());
+        let mut builder = MemoryLogRecordsArrowBuilder::new(
+            1,
+            table_info.get_row_type(),
+            false,
+            no_compression,
+            usize::MAX,
+            ratio_estimator,
+        )?;
+
+        let row = GenericRow {
+            values: vec![Datum::Int32(1)],
+        };
+        let record = WriteRecord::for_append(
+            Arc::new(table_info.clone()),
+            Arc::new(PhysicalTablePath::of(table_path)),
+            1,
+            &row,
+        );
+        builder.append(&record)?;
+        builder.build()
+    }
+
+    /// A buffered fetch starting at the subscribed offset is returned by
+    /// `collect_fetches`, and the bucket's tracked offset moves to 1.
+    #[tokio::test]
+    async fn collect_fetches_updates_offset() -> Result<()> {
+        let table_path = TablePath::new("db".to_string(), "tbl".to_string());
+        let table_info = build_table_info(table_path.clone(), 1, 1);
+        let metadata = Arc::new(Metadata::new_for_test(build_cluster_arc(&table_path, 1, 1)));
+        let scanner_status = Arc::new(LogScannerStatus::new());
+        let fetcher = LogFetcher::new(
+            table_info.clone(),
+            Arc::new(RpcClient::new()),
+            metadata,
+            Arc::clone(&scanner_status),
+            &Config::default(),
+            None,
+            test_scanner_metrics(&table_path),
+        )?;
+
+        // Track the bucket at offset 0 so the buffered fetch lines up with it.
+        let bucket = TableBucket::new(1, 0);
+        scanner_status.assign_scan_bucket(bucket.clone(), 0);
+
+        let data = build_records(&table_info, Arc::new(table_path))?;
+        let data_len = data.len();
+        let row_type = Arc::new(table_info.get_row_type().clone());
+        let read_context = ReadContext::new(to_arrow_schema(&row_type)?, row_type, false);
+        let completed = DefaultCompletedFetch::new(
+            bucket.clone(),
+            LogRecordsBatches::new(data),
+            data_len,
+            read_context,
+            0,
+            0,
+        );
+        fetcher.log_fetch_buffer.add(Box::new(completed));
+
+        let fetched = fetcher.collect_fetches()?;
+        let records_for_bucket = fetched.get(&bucket).unwrap();
+        assert_eq!(records_for_bucket.len(), 1);
+        assert_eq!(scanner_status.get_bucket_offset(&bucket), Some(1));
+        Ok(())
+    }
+
+    /// When the bucket was never assigned to the scanner status,
+    /// `fetch_records_from_fetch` yields nothing and leaves the fetch consumed.
+    #[tokio::test]
+    async fn fetch_records_from_fetch_drains_unassigned_bucket() -> Result<()> {
+        let table_path = TablePath::new("db".to_string(), "tbl".to_string());
+        let table_info = build_table_info(table_path.clone(), 1, 1);
+        let metadata = Arc::new(Metadata::new_for_test(build_cluster_arc(&table_path, 1, 1)));
+        // Deliberately left empty: no bucket is assigned in this test.
+        let scanner_status = Arc::new(LogScannerStatus::new());
+        let fetcher = LogFetcher::new(
+            table_info.clone(),
+            Arc::new(RpcClient::new()),
+            metadata,
+            scanner_status,
+            &Config::default(),
+            None,
+            test_scanner_metrics(&table_path),
+        )?;
+
+        let data = build_records(&table_info, Arc::new(table_path))?;
+        let data_len = data.len();
+        let row_type = Arc::new(table_info.get_row_type().clone());
+        let read_context = ReadContext::new(to_arrow_schema(&row_type)?, row_type, false);
+        let unassigned_fetch = DefaultCompletedFetch::new(
+            TableBucket::new(1, 0),
+            LogRecordsBatches::new(data),
+            data_len,
+            read_context,
+            0,
+            0,
+        );
+        let mut completed: Box<dyn CompletedFetch> = Box::new(unassigned_fetch);
+
+        let records = fetcher.fetch_records_from_fetch(&mut completed, 10)?;
+        assert!(records.is_empty());
+        assert!(completed.is_consumed());
+        Ok(())
+    }
+
+    /// No request is built while node 1 is marked as having a pending fetch.
+    #[tokio::test]
+    async fn prepare_fetch_log_requests_skips_pending() -> Result<()> {
+        let table_path = TablePath::new("db".to_string(), "tbl".to_string());
+        let table_info = build_table_info(table_path.clone(), 1, 1);
+        let metadata = Arc::new(Metadata::new_for_test(build_cluster_arc(&table_path, 1, 1)));
+
+        let scanner_status = Arc::new(LogScannerStatus::new());
+        scanner_status.assign_scan_bucket(TableBucket::new(1, 0), 0);
+
+        let fetcher = LogFetcher::new(
+            table_info,
+            Arc::new(RpcClient::new()),
+            metadata,
+            scanner_status,
+            &Config::default(),
+            None,
+            test_scanner_metrics(&table_path),
+        )?;
+
+        // NOTE(review): presumably node 1 is the bucket's leader in the test cluster; confirm in `build_cluster_arc`.
+        fetcher.nodes_with_pending_fetch_requests.lock().insert(1);
+
+        let built_requests = fetcher.prepare_fetch_log_requests().await;
+        assert!(built_requests.is_empty());
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn handle_fetch_response_sets_error() -> Result<()> {
+        let table_path = TablePath::new("db".to_string(), "tbl".to_string());
+        let table_info = build_table_info(table_path.clone(), 1, 1);
+        let cluster = build_cluster_arc(&table_path, 1, 1);
+        let metadata = Arc::new(Metadata::new_for_test(cluster));
+        let status = Arc::new(LogScannerStatus::new());
+        status.assign_scan_bucket(TableBucket::new(1, 0), 5);
+        let fetcher = LogFetcher::new(
+            table_info.clone(),
+            Arc::new(RpcClient::new()),
+            metadata.clone(),
+            status.clone(),
+            &Config::default(),
+            None,
+            test_scanner_metrics(&table_path),
+        )?;
+
+        let response = FetchLogResponse {
+            tables_resp: vec![PbFetchLogRespForTable {
+                table_id: 1,
+                buckets_resp: vec![PbFetchLogRespForBucket {
+                    partition_id: None,
+                    bucket_id: 0,
+                    error_code: Some(FlussError::AuthorizationException.code()),
+                    error_message: Some("denied".to_string()),
+                    high_watermark: None,
+                    log_start_offset: None,
+                    remote_log_fetch_info: None,
+                    records: None,
+                    filtered_end_offset: None,
+                }],
+            }],
+        };
+
+        let response_context = FetchResponseContext {
+            metadata: metadata.clone(),
+            log_fetch_buffer: fetcher.log_fetch_buffer.clone(),
+            log_scanner_status: fetcher.log_scanner_status.clone(),
+            read_context: fetcher.read_context.clone(),
+            remote_read_context: fetcher.remote_read_context.clone(),
+            remote_log_downloader: fetcher.remote_log_downloader.clone(),
+            metrics: Arc::clone(&fetcher.metrics),
+            request_start_time: Instant::now(),
+        };
+
+        LogFetcher::handle_fetch_response(response, response_context).await;
+
+        let completed = fetcher.log_fetch_buffer.poll().expect("completed fetch");
+        let api_error = completed.api_error().expect("api error");
+        assert_eq!(api_error.code, FlussError::AuthorizationException.code());
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn handle_fetch_response_invalidates_table_meta() -> Result<()> {
+        let table_path = TablePath::new("db".to_string(), "tbl".to_string());
+        let table_info = build_table_info(table_path.clone(), 1, 1);
+        let cluster = build_cluster_arc(&table_path, 1, 1);
+        let metadata = Arc::new(Metadata::new_for_test(cluster.clone()));
+        let status = Arc::new(LogScannerStatus::new());
+        status.assign_scan_bucket(TableBucket::new(1, 0), 5);
+        let fetcher = LogFetcher::new(
+            table_info.clone(),
+            Arc::new(RpcClient::new()),
+            metadata.clone(),
+            status.clone(),
+            &Config::default(),
+            None,
+            test_scanner_metrics(&table_path),
+        )?;
+
+        let bucket = TableBucket::new(1, 0);
+        assert!(metadata.leader_for(&table_path, &bucket).await?.is_some());
+
+        let response = FetchLogResponse {
+            tables_resp: vec![PbFetchLogRespForTable {
+                table_id: 1,
+                buckets_resp: vec![PbFetchLogRespForBucket {
+                    partition_id: None,
+                    bucket_id: 0,
+                    error_code: Some(FlussError::NotLeaderOrFollower.code()),
+                    error_message: Some("not leader".to_string()),
+                    high_watermark: None,
+                    log_start_offset: None,
+                    remote_log_fetch_info: None,
+                    records: None,
+                    filtered_end_offset: None,
+                }],
+            }],
+        };
+
+        let response_context = FetchResponseContext {
+            metadata: metadata.clone(),
+            log_fetch_buffer: fetcher.log_fetch_buffer.clone(),
+            log_scanner_status: fetcher.log_scanner_status.clone(),
+            read_context: fetcher.read_context.clone(),
+            remote_read_context: fetcher.remote_read_context.clone(),
+            remote_log_downloader: fetcher.remote_log_downloader.clone(),
+            metrics: Arc::clone(&fetcher.metrics),
+            request_start_time: Instant::now(),
+        };
+
+        LogFetcher::handle_fetch_response(response, response_context).await;
+
+        assert!(metadata.get_cluster().leader_for(&bucket).is_none());
+        Ok(())
+    }
+
+    /// Builds a two-column (`id`, `name`) table fixture for
+    /// `validate_scan_support` tests.
+    ///
+    /// `has_primary_key` makes `id` the primary key; `log_format`, when given,
+    /// is stored under the `table.log.format` property.
+    fn create_test_table_info(
+        has_primary_key: bool,
+        log_format: Option<&str>,
+    ) -> (TableInfo, TablePath) {
+        let columns = Schema::builder()
+            .column("id", DataTypes::int())
+            .column("name", DataTypes::string());
+        let schema_builder = if has_primary_key {
+            columns.primary_key(vec!["id"])
+        } else {
+            columns
+        };
+        let schema = schema_builder.build().unwrap();
+
+        let table_path = TablePath::new("test_db", "test_table");
+
+        let mut properties = HashMap::new();
+        if let Some(format) = log_format {
+            properties.insert("table.log.format".to_string(), format.to_string());
+        }
+
+        let custom_properties = HashMap::new();
+        let table_info = TableInfo::new(
+            table_path.clone(),
+            1,
+            1,
+            schema,
+            vec![],
+            Arc::from(vec![]),
+            1,
+            properties,
+            custom_properties,
+            None,
+            0,
+            0,
+        );
+
+        (table_info, table_path)
+    }
+
+    /// Covers the two rejection paths (primary key, non-ARROW format) and the
+    /// two accepted configurations (no format set, explicit ARROW).
+    #[test]
+    fn test_validate_scan_support() {
+        // Rejected: the table has a primary key.
+        let (pk_info, pk_path) = create_test_table_info(true, Some("ARROW"));
+        let pk_result = validate_scan_support(&pk_path, &pk_info);
+        assert!(pk_result.is_err());
+        let pk_err = pk_result.unwrap_err();
+        assert!(matches!(pk_err, UnsupportedOperation { .. }));
+        let pk_expected =
+            format!("Table {pk_path} is not a Log Table and doesn't support scan.");
+        assert!(pk_err.to_string().contains(pk_expected.as_str()));
+
+        // Rejected: the log format is INDEXED.
+        let (indexed_info, indexed_path) = create_test_table_info(false, Some("INDEXED"));
+        let indexed_result = validate_scan_support(&indexed_path, &indexed_info);
+        assert!(indexed_result.is_err());
+        let indexed_err = indexed_result.unwrap_err();
+        assert!(matches!(indexed_err, UnsupportedOperation { .. }));
+        let indexed_expected = format!(
+            "Scan is only supported for ARROW format and table {indexed_path} uses INDEXED format"
+        );
+        assert!(indexed_err.to_string().contains(indexed_expected.as_str()));
+
+        // Accepted: no log format property set.
+        let (default_info, default_path) = create_test_table_info(false, None);
+        assert!(validate_scan_support(&default_path, &default_info).is_ok());
+
+        // Accepted: explicit ARROW format.
+        let (arrow_info, arrow_path) = create_test_table_info(false, Some("ARROW"));
+        assert!(validate_scan_support(&arrow_path, &arrow_info).is_ok());
+    }
+
+    /// The scanner fetch settings from `Config` are copied into every built
+    /// request and every per-bucket sub-request.
+    #[tokio::test]
+    async fn prepare_fetch_log_requests_uses_configured_fetch_params() -> Result<()> {
+        let table_path = TablePath::new("db".to_string(), "tbl".to_string());
+        let table_info = build_table_info(table_path.clone(), 1, 1);
+        let metadata = Arc::new(Metadata::new_for_test(build_cluster_arc(&table_path, 1, 1)));
+        let scanner_status = Arc::new(LogScannerStatus::new());
+        scanner_status.assign_scan_bucket(TableBucket::new(1, 0), 0);
+
+        // Distinctive values so each setting can be recognised in the requests.
+        let config = Config {
+            scanner_log_fetch_max_bytes: 1234,
+            scanner_log_fetch_min_bytes: 7,
+            scanner_log_fetch_wait_max_time_ms: 89,
+            scanner_log_fetch_max_bytes_for_bucket: 512,
+            ..Config::default()
+        };
+
+        let fetcher = LogFetcher::new(
+            table_info,
+            Arc::new(RpcClient::new()),
+            metadata,
+            scanner_status,
+            &config,
+            None,
+            test_scanner_metrics(&table_path),
+        )?;
+
+        let built_requests = fetcher.prepare_fetch_log_requests().await;
+        // The leader id is not asserted; every built request is checked instead.
+        assert!(!built_requests.is_empty());
+        for (_, request) in built_requests.iter() {
+            assert_eq!(request.max_bytes, 1234);
+            assert_eq!(request.min_bytes, Some(7));
+            assert_eq!(request.max_wait_ms, Some(89));
+
+            let bucket_reqs = request
+                .tables_req
+                .iter()
+                .flat_map(|table_req| table_req.buckets_req.iter());
+            for bucket_req in bucket_reqs {
+                assert_eq!(bucket_req.max_fetch_bytes, 512);
+            }
+        }
+        Ok(())
+    }
+
+    /// Runs `body` against a freshly built `LogScannerInner` (table `db.tbl`)
+    /// from inside a `current_thread` Tokio runtime, so poll-timing tests can
+    /// create and drop `PollGuard`s synchronously.
+    fn with_test_log_scanner_inner<F: FnOnce(&LogScannerInner)>(body: F) {
+        let mut runtime_builder = tokio::runtime::Builder::new_current_thread();
+        let rt = runtime_builder
+            .enable_all()
+            .build()
+            .expect("build current_thread runtime");
+
+        rt.block_on(async {
+            let table_path = TablePath::new("db".to_string(), "tbl".to_string());
+            let table_info = build_table_info(table_path.clone(), 1, 1);
+            let metadata = Arc::new(Metadata::new_for_test(build_cluster_arc(&table_path, 1, 1)));
+            let rpc_client = Arc::new(RpcClient::new());
+            let inner =
+                LogScannerInner::new(&table_info, metadata, rpc_client, &Config::default(), None)
+                    .expect("build LogScannerInner");
+            body(&inner);
+        });
+    }
+
+    /// Returns the value of the first gauge named `name` in a fresh snapshot,
+    /// or `None` when no gauge with that name was recorded.
+    fn snapshot_gauge(
+        snapshotter: &metrics_util::debugging::Snapshotter,
+        name: &str,
+    ) -> Option<f64> {
+        use metrics_util::debugging::DebugValue;
+        for (key, _, _, value) in snapshotter.snapshot().into_vec() {
+            if key.key().name() != name {
+                continue;
+            }
+            // Entries with a matching name but a non-gauge value are skipped.
+            if let DebugValue::Gauge(gauge) = value {
+                return Some(gauge.into_inner());
+            }
+        }
+        None
+    }
+
+    /// Runs two back-to-back `PollGuard` lifecycles separated by a short gap
+    /// and checks that both poll-timing gauges were emitted: the
+    /// time-between-poll gauge is positive after the second poll, and the idle
+    /// ratio (emitted when a guard is dropped) lies in `[0, 1]`. Guard drop is
+    /// also the cancellation path, since dropping the `poll()` future drops
+    /// the guard.
+    #[test]
+    fn poll_guard_emits_time_between_poll_and_idle_ratio() {
+        use crate::metrics::{SCANNER_POLL_IDLE_RATIO, SCANNER_TIME_BETWEEN_POLL_MS};
+        use metrics_util::debugging::DebuggingRecorder;
+
+        let recorder = DebuggingRecorder::new();
+        let snapshotter = recorder.snapshotter();
+        let pause = std::time::Duration::from_millis(5);
+
+        metrics::with_local_recorder(&recorder, || {
+            with_test_log_scanner_inner(|inner| {
+                // Poll #1: there is no previous poll, so time_between_poll_ms
+                // is emitted as 0 (Java parity with
+                // ScannerMetricGroup.recordPollStart). Dropping the guard
+                // emits an idle ratio of 1.0 (poll_time / (poll_time + 0)).
+                {
+                    let _first_poll = PollGuard::new(inner);
+                    std::thread::sleep(pause);
+                }
+
+                // Gap between polls so the next time_between_poll_ms is > 0.
+                std::thread::sleep(pause);
+
+                // Poll #2: overwrites time_between_poll_ms with a positive
+                // value and emits a fresh idle ratio on drop.
+                {
+                    let _second_poll = PollGuard::new(inner);
+                    std::thread::sleep(pause);
+                }
+            });
+        });
+
+        let between = snapshot_gauge(&snapshotter, SCANNER_TIME_BETWEEN_POLL_MS)
+            .expect("time_between_poll_ms must be emitted on every poll");
+        assert!(
+            between > 0.0,
+            "second-poll time_between_poll_ms must be positive, got {between}"
+        );
+
+        let ratio = snapshot_gauge(&snapshotter, SCANNER_POLL_IDLE_RATIO)
+            .expect("poll_idle_ratio must be emitted on poll end");
+        assert!(
+            (0.0..=1.0).contains(&ratio),
+            "poll_idle_ratio must be in [0, 1], got {ratio}"
+        );
+
+        // The fixture in `with_test_log_scanner_inner` uses database `db` and
+        // table `tbl`; both gauges must carry those labels.
+        let entries = snapshotter.snapshot().into_vec();
+        assert_scanner_entries_labeled(&entries, "db", "tbl");
+    }
+
+    /// The very first poll must emit `time_between_poll_ms = 0`, matching
+    /// Java's `ScannerMetricGroup.recordPollStart` (`timeMsBetweenPoll = 0`),
+    /// so dashboards see the metric series from poll #1.
+    #[test]
+    fn time_between_poll_ms_emits_zero_on_first_poll() {
+        use crate::metrics::SCANNER_TIME_BETWEEN_POLL_MS;
+        use metrics_util::debugging::DebuggingRecorder;
+
+        let recorder = DebuggingRecorder::new();
+        let snapshotter = recorder.snapshotter();
+
+        metrics::with_local_recorder(&recorder, || {
+            with_test_log_scanner_inner(|inner| {
+                // SCANNER_TIME_BETWEEN_POLL_MS is emitted when the guard is
+                // created; the drop at the end of this closure only completes
+                // the poll.
+                let _only_poll = PollGuard::new(inner);
+            });
+        });
+
+        let between = snapshot_gauge(&snapshotter, SCANNER_TIME_BETWEEN_POLL_MS)
+            .expect("time_between_poll_ms must be emitted on the first poll");
+        assert_eq!(
+            between, 0.0,
+            "first-poll time_between_poll_ms must be 0.0 (Java parity), got {between}"
+        );
+
+        let entries = snapshotter.snapshot().into_vec();
+        assert_scanner_entries_labeled(&entries, "db", "tbl");
+    }
+
+    /// Single-consumer contract: creating a second `PollGuard` while the first
+    /// is still alive must trip the `debug_assert!` in `record_poll_start`.
+    /// The check is compiled out of release builds, hence the
+    /// `debug_assertions` gate.
+    #[cfg(debug_assertions)]
+    #[test]
+    #[should_panic(expected = "concurrent poll() detected")]
+    fn overlapping_polls_panic_in_debug_builds() {
+        with_test_log_scanner_inner(|inner| {
+            let _still_open = PollGuard::new(inner);
+            // `_still_open` has not been dropped, so poll_start_at is still
+            // Some and this second start must panic.
+            let _overlapping = PollGuard::new(inner);
+        });
+    }
+
+    /// Drives `handle_fetch_response` against a local metrics recorder and
+    /// asserts that latency + bytes-per-request histograms are emitted with
+    /// values that mirror what Java would record. This complements the unit
+    /// tests in `metrics.rs` (which only verify the facade) by exercising
+    /// the actual instrumented call path.
+    ///
+    /// Note: uses a `current_thread` runtime inside `with_local_recorder`
+    /// (rather than `#[tokio::test]`) because the metrics facade installs a
+    /// thread-local recorder; running the async work on the same thread is
+    /// the only way to observe the emitted metrics in the snapshot. Both
+    /// the fetcher construction and the `handle_fetch_response` call run
+    /// inside the runtime (the security-token manager and remote-log
+    /// downloader require a Tokio reactor).
+    #[test]
+    fn handle_fetch_response_emits_latency_and_bytes_metrics() {
+        use crate::metrics::{SCANNER_BYTES_PER_REQUEST, SCANNER_FETCH_LATENCY_MS};
+        use metrics_util::debugging::{DebugValue, DebuggingRecorder};
+
+        // The snapshotter is taken up front so the recorded values can be
+        // inspected after the recorder-scoped closure has finished.
+        let recorder = DebuggingRecorder::new();
+        let snapshotter = recorder.snapshotter();
+
+        // The closure returns the encoded size of the response so the bytes
+        // histogram can be compared against it once the closure has returned.
+        let expected_bytes = metrics::with_local_recorder(&recorder, || {
+            let rt = tokio::runtime::Builder::new_current_thread()
+                .enable_all()
+                .build()
+                .expect("build current_thread runtime");
+
+            rt.block_on(async {
+                let table_path = TablePath::new("db".to_string(), "tbl".to_string());
+                let table_info = build_table_info(table_path.clone(), 1, 1);
+                let cluster = build_cluster_arc(&table_path, 1, 1);
+                let metadata = Arc::new(Metadata::new_for_test(cluster));
+                let status = Arc::new(LogScannerStatus::new());
+                // Assign the bucket that the fake response below reports on
+                // (table 1, bucket 0). NOTE(review): the `5` is presumably the
+                // starting scan offset — confirm against `assign_scan_bucket`.
+                status.assign_scan_bucket(TableBucket::new(1, 0), 5);
+                let fetcher = LogFetcher::new(
+                    table_info,
+                    Arc::new(RpcClient::new()),
+                    metadata.clone(),
+                    status,
+                    &Config::default(),
+                    None,
+                    test_scanner_metrics(&table_path),
+                )
+                .expect("build LogFetcher");
+
+                // One table with one bucket, `FlussError::None` as the error
+                // code and no records attached.
+                let response = FetchLogResponse {
+                    tables_resp: vec![PbFetchLogRespForTable {
+                        table_id: 1,
+                        buckets_resp: vec![PbFetchLogRespForBucket {
+                            partition_id: None,
+                            bucket_id: 0,
+                            error_code: Some(FlussError::None.code()),
+                            error_message: None,
+                            high_watermark: Some(7),
+                            log_start_offset: Some(0),
+                            remote_log_fetch_info: None,
+                            records: None,
+                            filtered_end_offset: None,
+                        }],
+                    }],
+                };
+                // Measure the size now: `response` is moved into
+                // `handle_fetch_response` below.
+                let expected_bytes = response.encoded_len() as f64;
+                // The context reuses the fetcher's own buffer, status, read
+                // contexts, downloader and metrics handle.
+                let response_context = FetchResponseContext {
+                    metadata: metadata.clone(),
+                    log_fetch_buffer: fetcher.log_fetch_buffer.clone(),
+                    log_scanner_status: fetcher.log_scanner_status.clone(),
+                    read_context: fetcher.read_context.clone(),
+                    remote_read_context: fetcher.remote_read_context.clone(),
+                    remote_log_downloader: fetcher.remote_log_downloader.clone(),
+                    metrics: Arc::clone(&fetcher.metrics),
+                    request_start_time: Instant::now(),
+                };
+
+                LogFetcher::handle_fetch_response(response, response_context).await;
+                expected_bytes
+            })
+        });
+
+        let entries: Vec<_> = snapshotter.snapshot().into_vec();
+        // Returns the samples of the first histogram entry named `name`, or
+        // an empty `Vec` when the snapshot has no such histogram.
+        let find_histogram = |name: &str| -> Vec<f64> {
+            entries
+                .iter()
+                .find_map(|(key, _, _, val)| {
+                    if key.key().name() == name {
+                        if let DebugValue::Histogram(v) = val {
+                            return Some(v.iter().map(|f| f.into_inner()).collect());
+                        }
+                    }
+                    None
+                })
+                .unwrap_or_default()
+        };
+
+        // Latency depends on wall-clock time, so only the sample count and
+        // sign can be asserted.
+        let latency_samples = find_histogram(SCANNER_FETCH_LATENCY_MS);
+        assert_eq!(latency_samples.len(), 1, "expected one latency sample");
+        assert!(
+            latency_samples[0] >= 0.0,
+            "latency must be non-negative, got {}",
+            latency_samples[0]
+        );
+
+        let bytes_samples = find_histogram(SCANNER_BYTES_PER_REQUEST);
+        assert_eq!(
+            bytes_samples,
+            vec![expected_bytes],
+            "bytes histogram must record encoded_len() for parity with Java fetchLogResponse.totalSize()",
+        );
+
+        // Every emitted scanner metric must carry both `database` and `table`
+        // labels — that's the whole point of `ScannerMetrics`. If a future
+        // contributor adds a new `metrics::*!` macro inline (bypassing
+        // `ScannerMetrics`), this assertion catches it.
+        assert_scanner_entries_labeled(&entries, "db", "tbl");
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/client/table/upsert.rs b/fluss-rust/crates/fluss/src/client/table/upsert.rs
new file mode 100644
index 0000000000..52ec37b37b
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/client/table/upsert.rs
@@ -0,0 +1,560 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::client::{RowBytes, WriteFormat, WriteRecord, WriteResultFuture, WriterClient};
+use crate::error::Error::{IllegalArgument, UnexpectedError};
+use crate::error::Result;
+use crate::metadata::{RowType, TableInfo, TablePath};
+use crate::row::InternalRow;
+use crate::row::encode::{KeyEncoder, KeyEncoderFactory, RowEncoder, RowEncoderFactory};
+use crate::row::field_getter::FieldGetter;
+use std::sync::{Arc, Mutex};
+
+use crate::client::table::partition_getter::{PartitionGetter, get_physical_path};
+use bitvec::prelude::bitvec;
+use bytes::Bytes;
+
+/// Handle for creating upsert writers against a single table.
+///
+/// It carries the table identity, the table metadata and the shared writer
+/// client; `create_writer` turns it into an [`UpsertWriter`].
+#[allow(dead_code)]
+pub struct TableUpsert {
+    // Path of the table that writers created from this handle target.
+    table_path: TablePath,
+    // Table metadata; its row type is used to validate target columns.
+    table_info: TableInfo,
+    // Shared client handed to every writer created from this handle.
+    writer_client: Arc<WriterClient>,
+    // Column indexes selected through `partial_update`; `None` after `new`.
+    target_columns: Option<Arc<Vec<usize>>>,
+}
+
+#[allow(dead_code)]
+impl TableUpsert {
+    /// Creates an upsert handle for `table_path` with no target columns
+    /// selected.
+    pub fn new(
+        table_path: TablePath,
+        table_info: TableInfo,
+        writer_client: Arc<WriterClient>,
+    ) -> Self {
+        Self {
+            target_columns: None,
+            table_path,
+            table_info,
+            writer_client,
+        }
+    }
+
+    /// Returns a copy of this handle whose target columns are replaced by
+    /// `target_columns` (column indexes into the table's row type).
+    ///
+    /// # Errors
+    /// Returns `IllegalArgument` naming the first index that is not smaller
+    /// than the number of fields in the row type.
+    pub fn partial_update(&self, target_columns: Option<Vec<usize>>) -> Result<Self> {
+        if let Some(columns) = target_columns.as_deref() {
+            let num_columns = self.table_info.row_type().fields().len();
+            let first_out_of_range = columns.iter().copied().find(|col| *col >= num_columns);
+
+            if let Some(invalid_column) = first_out_of_range {
+                return Err(IllegalArgument {
+                    message: format!(
+                        "Invalid target column index: {invalid_column} for table {}. The table only has {num_columns} columns.",
+                        self.table_path
+                    ),
+                });
+            }
+        }
+
+        let target_columns = target_columns.map(Arc::new);
+        Ok(Self {
+            table_path: self.table_path.clone(),
+            table_info: self.table_info.clone(),
+            writer_client: self.writer_client.clone(),
+            target_columns,
+        })
+    }
+
+    /// Same as [`Self::partial_update`], but the target columns are given by
+    /// name and resolved against the table's row type.
+    ///
+    /// # Errors
+    /// Returns `IllegalArgument` for the first name that the row type does
+    /// not contain.
+    pub fn partial_update_with_column_names(&self, target_column_names: &[&str]) -> Result<Self> {
+        let row_type = self.table_info.row_type();
+        let mut resolved_indices: Vec<usize> = Vec::with_capacity(target_column_names.len());
+
+        for &column_name in target_column_names {
+            match row_type.get_field_index(column_name) {
+                Some(index) => resolved_indices.push(index),
+                None => {
+                    return Err(IllegalArgument {
+                        message: format!(
+                            "Cannot find target column `{}` for table {}.",
+                            column_name, self.table_path
+                        ),
+                    });
+                }
+            }
+        }
+
+        self.partial_update(Some(resolved_indices))
+    }
+
+    /// Builds an [`UpsertWriter`] for this table, passing along the currently
+    /// selected target columns (if any).
+    pub fn create_writer(&self) -> Result<UpsertWriter> {
+        let table_path = Arc::new(self.table_path.clone());
+        let table_info = Arc::new(self.table_info.clone());
+        let target_columns = self.target_columns.clone();
+        let writer_client = Arc::clone(&self.writer_client);
+
+        UpsertWriterFactory::create(table_path, table_info, target_columns, writer_client)
+    }
+}
+
+/// Writer that upserts and deletes rows of one table through a shared
+/// `WriterClient`.
+///
+/// The encoders are wrapped in `Mutex`es and locked per call inside the
+/// `&self` methods.
+pub struct UpsertWriter {
+    // Table path used when resolving the physical path of a row.
+    table_path: Arc<TablePath>,
+    // Client that queues write records and performs `flush`.
+    writer_client: Arc<WriterClient>,
+    // Set only when the table is partitioned; used to resolve the physical path.
+    partition_field_getter: Option<PartitionGetter>,
+    // Encodes the primary key of a row.
+    primary_key_encoder: Mutex<Box<dyn KeyEncoder>>,
+    // Partial-update column indexes forwarded with every write record.
+    target_columns: Option<Arc<Vec<usize>>>,
+    // Use primary key encoder as bucket key encoder when None
+    bucket_key_encoder: Option<Mutex<Box<dyn KeyEncoder>>>,
+    // Write format derived from the table's KV format.
+    write_format: WriteFormat,
+    // Encodes a row when the row cannot hand out bytes already in `write_format`.
+    row_encoder: Mutex<Box<dyn RowEncoder>>,
+    // Getters built from the row type; visited in order by position when encoding.
+    field_getters: Box<[FieldGetter]>,
+    // Table metadata (row type, schema id) shared with every write record.
+    table_info: Arc<TableInfo>,
+}
+
+/// Stateless namespace for constructing [`UpsertWriter`]s: it validates the
+/// table / target-column combination and wires up the encoders.
+struct UpsertWriterFactory;
+
+impl UpsertWriterFactory {
+    /// Builds an [`UpsertWriter`] for the given table.
+    ///
+    /// The target-column selection is validated first (see `sanity_check`);
+    /// afterwards the key encoders, write format, field getters, optional
+    /// partition getter and row encoder are derived from `table_info`.
+    ///
+    /// # Arguments
+    /// * `table_path` - path of the table to write to.
+    /// * `table_info` - metadata of that table.
+    /// * `partial_update_columns` - column indexes for a partial update, or
+    ///   `None` when no target columns were selected.
+    /// * `writer_client` - client the writer sends its records through.
+    ///
+    /// # Errors
+    /// Propagates validation failures and any error raised while reading the
+    /// table configuration or creating the encoders.
+    pub fn create(
+        table_path: Arc<TablePath>,
+        table_info: Arc<TableInfo>,
+        partial_update_columns: Option<Arc<Vec<usize>>>,
+        writer_client: Arc<WriterClient>,
+    ) -> Result<UpsertWriter> {
+        let data_lake_format = &table_info.table_config.get_datalake_format()?;
+        let row_type = table_info.row_type();
+        let physical_pks = table_info.get_physical_primary_keys();
+
+        let names = table_info.get_schema().auto_increment_col_names();
+
+        Self::sanity_check(
+            row_type,
+            &table_info.primary_keys,
+            names,
+            &partial_update_columns,
+        )?;
+
+        let primary_key_encoder = KeyEncoderFactory::of(row_type, physical_pks, data_lake_format)?;
+        let bucket_key_encoder = if !table_info.is_default_bucket_key() {
+            Some(KeyEncoderFactory::of(
+                row_type,
+                table_info.get_bucket_keys(),
+                data_lake_format,
+            )?)
+        } else {
+            // Defaults to using primary key encoder when None for bucket key
+            None
+        };
+
+        let kv_format = table_info.get_table_config().get_kv_format()?;
+        let write_format = WriteFormat::from_kv_format(&kv_format)?;
+
+        let field_getters = FieldGetter::create_field_getters(row_type);
+
+        // Only partitioned tables need a getter for the partition value.
+        let partition_field_getter = if table_info.is_partitioned() {
+            Some(PartitionGetter::new(
+                row_type,
+                Arc::clone(table_info.get_partition_keys()),
+            )?)
+        } else {
+            None
+        };
+
+        Ok(UpsertWriter {
+            table_path,
+            partition_field_getter,
+            writer_client,
+            primary_key_encoder: Mutex::new(primary_key_encoder),
+            target_columns: partial_update_columns,
+            bucket_key_encoder: bucket_key_encoder.map(Mutex::new),
+            write_format,
+            row_encoder: Mutex::new(Box::new(RowEncoderFactory::create(
+                kv_format,
+                row_type.clone(),
+            )?)),
+            field_getters,
+            table_info: table_info.clone(),
+        })
+    }
+
+    /// Validates that the table can be written with the given target columns.
+    ///
+    /// Without target columns the only requirement is that the table has no
+    /// auto-increment column. With target columns, all of the following must
+    /// hold:
+    /// * every target index is smaller than the number of fields;
+    /// * every primary key exists in `row_type` and is part of the target
+    ///   columns;
+    /// * no auto-increment column is part of the target columns;
+    /// * every column that is neither a primary key nor an auto-increment
+    ///   column is nullable.
+    ///
+    /// # Errors
+    /// Returns `IllegalArgument` describing the first violated rule.
+    #[allow(dead_code)]
+    fn sanity_check(
+        row_type: &RowType,
+        primary_keys: &Vec<String>,
+        auto_increment_col_names: &Vec<String>,
+        target_columns: &Option<Arc<Vec<usize>>>,
+    ) -> Result<()> {
+        let columns = match target_columns.as_deref() {
+            Some(columns) => columns,
+            None => {
+                // Full-row writes would have to supply the auto-increment
+                // value explicitly, which is not allowed.
+                if !auto_increment_col_names.is_empty() {
+                    return Err(IllegalArgument {
+                        message: format!(
+                            "This table has auto increment column {}. Explicitly specifying values for an auto increment column is not allowed. Please Specify non-auto-increment columns as target columns using partialUpdate first.",
+                            auto_increment_col_names.join(", ")
+                        ),
+                    });
+                }
+                return Ok(());
+            }
+        };
+
+        let field_count = row_type.fields().len();
+
+        let mut target_column_set = bitvec![0; field_count];
+
+        for &target_index in columns {
+            // Setting a bit past the end of the bit vector panics, so an
+            // out-of-range index is reported as an error instead.
+            if target_index >= field_count {
+                return Err(IllegalArgument {
+                    message: format!(
+                        "Invalid target column index: {target_index}. The table only has {field_count} columns."
+                    ),
+                });
+            }
+            target_column_set.set(target_index, true);
+        }
+
+        let mut pk_column_set = bitvec![0; field_count];
+
+        // check the target columns contains the primary key
+        for primary_key in primary_keys {
+            let pk_index = row_type.get_field_index(primary_key.as_str());
+            match pk_index {
+                Some(pk_index) => {
+                    if !target_column_set[pk_index] {
+                        return Err(IllegalArgument {
+                            message: format!(
+                                "The target write columns {} must contain the primary key columns {}",
+                                row_type.project(columns)?.get_field_names().join(", "),
+                                primary_keys.join(", ")
+                            ),
+                        });
+                    }
+                    pk_column_set.set(pk_index, true);
+                }
+                None => {
+                    return Err(IllegalArgument {
+                        message: format!(
+                            "The specified primary key {primary_key} is not in row type {row_type}"
+                        ),
+                    });
+                }
+            }
+        }
+
+        let mut auto_increment_column_set = bitvec![0; field_count];
+        // explicitly specifying values for an auto increment column is not allowed
+        for auto_increment_col_name in auto_increment_col_names {
+            let auto_increment_field_index =
+                row_type.get_field_index(auto_increment_col_name.as_str());
+
+            // Names that are not part of the row type are skipped.
+            if let Some(index) = auto_increment_field_index {
+                if target_column_set[index] {
+                    return Err(IllegalArgument {
+                        message: format!(
+                            "Explicitly specifying values for the auto increment column {auto_increment_col_name} is not allowed."
+                        ),
+                    });
+                }
+
+                auto_increment_column_set.set(index, true);
+            }
+        }
+
+        // Every column that is neither a primary key nor an auto-increment
+        // column must be nullable, whether or not it is a target column.
+        for (i, field) in row_type.fields().iter().enumerate() {
+            let must_be_nullable = !pk_column_set[i] && !auto_increment_column_set[i];
+            if must_be_nullable && !field.data_type.is_nullable() {
+                return Err(IllegalArgument {
+                    message: format!(
+                        "Partial Update requires all columns except primary key to be nullable, but column {} is NOT NULL.",
+                        field.name()
+                    ),
+                });
+            }
+        }
+
+        Ok(())
+    }
+}
+
+impl UpsertWriter {
+    /// Rejects rows whose field count differs from the number of fields in
+    /// the table's row type.
+    fn check_field_count<R: InternalRow>(&self, row: &R) -> Result<()> {
+        let expected = self.table_info.get_row_type().fields().len();
+        let actual = row.get_field_count();
+        if actual == expected {
+            return Ok(());
+        }
+
+        Err(IllegalArgument {
+            message: format!(
+                "The field count of the row does not match the table schema. Expected: {}, Actual: {}",
+                expected, actual
+            ),
+        })
+    }
+
+    /// Encodes the primary key and the bucket key of `row`.
+    ///
+    /// The bucket key comes from the dedicated bucket-key encoder when one
+    /// is configured; otherwise it is a clone of the primary key. The second
+    /// tuple element is therefore always `Some`.
+    fn get_keys(&self, row: &dyn InternalRow) -> Result<(Bytes, Option<Bytes>)> {
+        let key = self
+            .primary_key_encoder
+            .lock()
+            .map_err(|e| UnexpectedError {
+                message: format!("primary_key_encoder lock poisoned: {e}"),
+                source: None,
+            })?
+            .encode_key(row)?;
+
+        let bucket_key = if let Some(encoder) = self.bucket_key_encoder.as_ref() {
+            encoder
+                .lock()
+                .map_err(|e| UnexpectedError {
+                    message: format!("bucket_key_encoder lock poisoned: {e}"),
+                    source: None,
+                })?
+                .encode_key(row)?
+        } else {
+            key.clone()
+        };
+
+        Ok((key, Some(bucket_key)))
+    }
+
+    /// Encodes `row` with the writer's row encoder, visiting the field
+    /// getters in order and passing each value on with its position.
+    fn encode_row<R: InternalRow>(&self, row: &R) -> Result<Bytes> {
+        let mut row_encoder = self.row_encoder.lock().map_err(|e| UnexpectedError {
+            message: format!("row_encoder lock poisoned: {e}"),
+            source: None,
+        })?;
+
+        row_encoder.start_new_row()?;
+        for (pos, getter) in self.field_getters.iter().enumerate() {
+            row_encoder.encode_field(pos, getter.get_field(row)?)?;
+        }
+        row_encoder.finish_row()
+    }
+
+    /// Sends all buffered records that have not been sent yet and waits until
+    /// the corresponding requests complete.
+    ///
+    /// A request counts as complete once it has been acknowledged according
+    /// to the configured CLIENT_WRITER_ACKS option, or once it has failed
+    /// with an error.
+    pub async fn flush(&self) -> Result<()> {
+        self.writer_client.flush().await
+    }
+
+    /// Inserts the row if it does not exist yet, or updates it if it does.
+    ///
+    /// The write is only queued here; the returned [`WriteResultFuture`] is
+    /// handed back immediately, which allows fire-and-forget usage and
+    /// efficient batching.
+    ///
+    /// # Arguments
+    /// * row - the row to upsert.
+    ///
+    /// # Returns
+    /// A [`WriteResultFuture`] that can be awaited for the server
+    /// acknowledgment, or dropped for fire-and-forget behavior (call
+    /// `flush()` to ensure delivery).
+    pub fn upsert<R: InternalRow>(&self, row: &R) -> Result<WriteResultFuture> {
+        self.check_field_count(row)?;
+
+        let (key, bucket_key) = self.get_keys(row)?;
+
+        // Reuse the row's own bytes when it is already encoded in the
+        // writer's format; otherwise encode it field by field.
+        let row_bytes: RowBytes<'_> =
+            if let Some(encoded) = row.as_encoded_bytes(self.write_format) {
+                RowBytes::Borrowed(encoded)
+            } else {
+                RowBytes::Owned(self.encode_row(row)?)
+            };
+
+        let physical_path = Arc::new(get_physical_path(
+            &self.table_path,
+            self.partition_field_getter.as_ref(),
+            row,
+        )?);
+
+        let write_record = WriteRecord::for_upsert(
+            Arc::clone(&self.table_info),
+            physical_path,
+            self.table_info.schema_id,
+            key,
+            bucket_key,
+            self.write_format,
+            self.target_columns.clone(),
+            Some(row_bytes),
+        );
+
+        Ok(WriteResultFuture::new(
+            self.writer_client.send(&write_record)?,
+        ))
+    }
+
+    /// Deletes the row identified by the primary key contained in the input
+    /// row.
+    ///
+    /// The delete is only queued here; the returned [`WriteResultFuture`] is
+    /// handed back immediately, which allows fire-and-forget usage and
+    /// efficient batching.
+    ///
+    /// # Arguments
+    /// * row - the row to delete (must contain the primary key fields).
+    ///
+    /// # Returns
+    /// A [`WriteResultFuture`] that can be awaited for the server
+    /// acknowledgment, or dropped for fire-and-forget behavior (call
+    /// `flush()` to ensure delivery).
+    pub fn delete<R: InternalRow>(&self, row: &R) -> Result<WriteResultFuture> {
+        self.check_field_count(row)?;
+
+        let (key, bucket_key) = self.get_keys(row)?;
+
+        let physical_path = Arc::new(get_physical_path(
+            &self.table_path,
+            self.partition_field_getter.as_ref(),
+            row,
+        )?);
+
+        // A delete is an upsert record that carries no row bytes.
+        let write_record = WriteRecord::for_upsert(
+            Arc::clone(&self.table_info),
+            physical_path,
+            self.table_info.schema_id,
+            key,
+            bucket_key,
+            self.write_format,
+            self.target_columns.clone(),
+            None,
+        );
+
+        Ok(WriteResultFuture::new(
+            self.writer_client.send(&write_record)?,
+        ))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::metadata::{DataField, DataTypes};
+
+    /// Converts borrowed column names into the owned form `sanity_check` takes.
+    fn owned_names(names: &[&str]) -> Vec<String> {
+        names.iter().map(|name| name.to_string()).collect()
+    }
+
+    /// Runs `UpsertWriterFactory::sanity_check` on the given table shape and
+    /// asserts that it fails with a message containing `expected_fragment`.
+    fn assert_rejected(
+        fields: Vec<DataField>,
+        primary_keys: &[&str],
+        auto_increment_col_names: &[&str],
+        target_columns: Option<Vec<usize>>,
+        expected_fragment: &str,
+    ) {
+        let row_type = RowType::new(fields);
+        let primary_keys = owned_names(primary_keys);
+        let auto_increment_col_names = owned_names(auto_increment_col_names);
+        let target_columns = target_columns.map(Arc::new);
+
+        let result = UpsertWriterFactory::sanity_check(
+            &row_type,
+            &primary_keys,
+            &auto_increment_col_names,
+            &target_columns,
+        );
+
+        assert!(
+            result
+                .unwrap_err()
+                .to_string()
+                .contains(expected_fragment)
+        );
+    }
+
+    #[test]
+    fn sanity_check() {
+        // No target columns specified but table has auto-increment column
+        assert_rejected(
+            vec![
+                DataField::new("id", DataTypes::int().as_non_nullable(), None),
+                DataField::new("name", DataTypes::string(), None),
+            ],
+            &["id"],
+            &["id"],
+            None,
+            "This table has auto increment column id. Explicitly specifying values for an auto increment column is not allowed. Please Specify non-auto-increment columns as target columns using partialUpdate first.",
+        );
+
+        // Target columns do not contain primary key
+        assert_rejected(
+            vec![
+                DataField::new("id", DataTypes::int().as_non_nullable(), None),
+                DataField::new("name", DataTypes::string(), None),
+                DataField::new("value", DataTypes::int(), None),
+            ],
+            &["id"],
+            &[],
+            Some(vec![1usize]),
+            "The target write columns name must contain the primary key columns id",
+        );
+
+        // Primary key column not found in row type
+        assert_rejected(
+            vec![
+                DataField::new("id", DataTypes::int().as_non_nullable(), None),
+                DataField::new("name", DataTypes::string(), None),
+            ],
+            &["nonexistent_pk"],
+            &[],
+            Some(vec![0usize, 1]),
+            "The specified primary key nonexistent_pk is not in row type",
+        );
+
+        // Target columns include auto-increment column
+        assert_rejected(
+            vec![
+                DataField::new("id", DataTypes::int().as_non_nullable(), None),
+                DataField::new("seq", DataTypes::bigint().as_non_nullable(), None),
+                DataField::new("name", DataTypes::string(), None),
+            ],
+            &["id"],
+            &["seq"],
+            Some(vec![0usize, 1, 2]),
+            "Explicitly specifying values for the auto increment column seq is not allowed.",
+        );
+
+        // Non-nullable column not in target columns (partial update requires nullable)
+        assert_rejected(
+            vec![
+                DataField::new("id", DataTypes::int().as_non_nullable(), None),
+                DataField::new(
+                    "required_field",
+                    DataTypes::string().as_non_nullable(),
+                    None,
+                ),
+                DataField::new("optional_field", DataTypes::int(), None),
+            ],
+            &["id"],
+            &[],
+            Some(vec![0usize]),
+            "Partial Update requires all columns except primary key to be nullable, but column required_field is NOT NULL.",
+        );
+    }
+}
+
+/// The result of upserting a record.
+///
+/// Currently this is an empty struct, which leaves room to add fields later
+/// in a compatible way.
+#[derive(Default)]
+#[allow(dead_code)]
+pub struct UpsertResult;
+
+/// The result of deleting a record.
+///
+/// Currently this is an empty struct, which leaves room to add fields later
+/// in a compatible way.
+#[derive(Default)]
+#[allow(dead_code)]
+pub struct DeleteResult;
diff --git a/fluss-rust/crates/fluss/src/client/write/accumulator.rs b/fluss-rust/crates/fluss/src/client/write/accumulator.rs
new file mode 100644
index 0000000000..244edf7399
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/client/write/accumulator.rs
@@ -0,0 +1,1759 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::client::broadcast;
+use crate::client::write::IdempotenceManager;
+use crate::client::write::batch::WriteBatch::{ArrowLog, Kv};
+use crate::client::write::batch::{ArrowLogWriteBatch, KvWriteBatch, WriteBatch};
+use crate::client::write::dynamic_batch_size::DynamicWriteBatchSizeEstimator;
+use crate::client::{LogWriteRecord, Record, ResultHandle, WriteRecord};
+use crate::cluster::{BucketLocation, Cluster, ServerNode};
+use crate::compression::ArrowCompressionRatioEstimator;
+use crate::config::Config;
+use crate::error::{Error, Result};
+use crate::metadata::{PhysicalTablePath, TableBucket};
+use crate::record::{NO_BATCH_SEQUENCE, NO_WRITER_ID};
+use crate::util::current_time_ms;
+use crate::{BucketId, PartitionId, TableId};
+use dashmap::DashMap;
+use parking_lot::{Condvar, Mutex, RwLock};
+use std::collections::{HashMap, HashSet, VecDeque};
+use std::sync::Arc;
+use std::sync::atomic::{AtomicBool, AtomicI32, AtomicI64, AtomicUsize, Ordering};
+use std::time::{Duration, Instant};
+use tokio::sync::Notify;
+
+/// Byte-counting semaphore that blocks producers when total buffered memory
+/// exceeds the configured limit. Matches Java's `LazyMemorySegmentPool` behavior.
+///
+/// Memory is reserved through `acquire`, which hands back a `MemoryPermit`;
+/// dropping the permit returns the bytes and wakes blocked producers.
+///
+/// TODO: Replace `notify_all()` with per-waiter FIFO signaling (Java uses per-request
+/// Condition objects in a Deque) to avoid thundering herd under high contention.
+///
+/// TODO: Track actual batch memory usage instead of reserving a fixed `writer_batch_size`
+/// per batch. This over-counts when batches don't fill completely, reducing effective
+/// throughput. Requires tighter coupling with batch internals.
+pub(crate) struct MemoryLimiter {
+    /// Number of bytes currently reserved by outstanding permits.
+    state: Mutex<usize>,
+    /// Signalled whenever bytes are released or the limiter is closed.
+    cond: Condvar,
+    /// Upper bound on the total number of bytes that may be reserved at once.
+    max_memory: usize,
+    /// Maximum time a single `acquire` call waits for memory to become available.
+    wait_timeout: Duration,
+    /// Set by `close()`; once set, `acquire` fails with `Error::WriterClosed`.
+    closed: AtomicBool,
+    /// Number of callers currently blocked inside `acquire`; read by `has_waiters`.
+    waiting_count: AtomicUsize,
+}
+
+impl MemoryLimiter {
+    /// Creates a limiter that hands out at most `max_memory` bytes in total and
+    /// lets a single `acquire` call block for at most `wait_timeout`.
+    pub fn new(max_memory: usize, wait_timeout: Duration) -> Self {
+        Self {
+            state: Mutex::new(0),
+            cond: Condvar::new(),
+            max_memory,
+            wait_timeout,
+            closed: AtomicBool::new(false),
+            waiting_count: AtomicUsize::new(0),
+        }
+    }
+
+    /// Builds the error returned to callers once the limiter has been closed.
+    fn closed_error() -> Error {
+        Error::WriterClosed {
+            message: "Memory limiter is closed".to_string(),
+        }
+    }
+
+    /// Try to acquire `size` bytes. Blocks until memory is available,
+    /// the timeout expires, or the limiter is closed.
+    /// Returns a `MemoryPermit` on success.
+    ///
+    /// # Errors
+    /// * `Error::WriterClosed` if the limiter is (or becomes) closed.
+    /// * `Error::IllegalArgument` if `size` exceeds the total memory limit, since
+    ///   such a request could never be satisfied.
+    /// * `Error::BufferExhausted` if the memory did not become available within
+    ///   the configured wait timeout.
+    pub fn acquire(self: &Arc<Self>, size: usize) -> Result<MemoryPermit> {
+        if self.closed.load(Ordering::Acquire) {
+            return Err(Self::closed_error());
+        }
+
+        if size > self.max_memory {
+            return Err(Error::IllegalArgument {
+                message: format!(
+                    "Batch size {} exceeds total buffer memory limit {}",
+                    size, self.max_memory
+                ),
+            });
+        }
+
+        let mut used = self.state.lock();
+        // `None` means the timeout is too large to be represented as an `Instant`
+        // (`Instant + Duration` would panic); treat that as "no deadline".
+        let deadline = Instant::now().checked_add(self.wait_timeout);
+        while *used + size > self.max_memory {
+            // Re-check under the state lock before sleeping. `close()` flips the
+            // flag while holding this lock, so either we observe it here or we
+            // are already waiting when `close()` notifies — the wakeup can no
+            // longer be lost between the check and the wait.
+            if self.closed.load(Ordering::Acquire) {
+                return Err(Self::closed_error());
+            }
+
+            self.waiting_count.fetch_add(1, Ordering::Relaxed);
+            let timed_out = match deadline {
+                Some(deadline) => self.cond.wait_until(&mut used, deadline).timed_out(),
+                None => {
+                    self.cond.wait(&mut used);
+                    false
+                }
+            };
+            self.waiting_count.fetch_sub(1, Ordering::Relaxed);
+
+            if self.closed.load(Ordering::Acquire) {
+                return Err(Self::closed_error());
+            }
+            // A timeout only fails the request if the memory is still unavailable;
+            // otherwise the loop condition ends the wait and the bytes are granted.
+            if timed_out && *used + size > self.max_memory {
+                return Err(Error::BufferExhausted {
+                    message: format!(
+                        "Failed to allocate {} bytes for write batch within {}ms. \
+                         {} of {} bytes in use, {} threads waiting.",
+                        size,
+                        self.wait_timeout.as_millis(),
+                        *used,
+                        self.max_memory,
+                        self.waiting_count.load(Ordering::Relaxed),
+                    ),
+                });
+            }
+        }
+
+        *used += size;
+        Ok(MemoryPermit {
+            limiter: Arc::clone(self),
+            size,
+        })
+    }
+
+    /// Returns `size` bytes to the pool and wakes every blocked producer so
+    /// each can re-check whether its request now fits.
+    fn release(&self, size: usize) {
+        let mut used = self.state.lock();
+        *used = used.saturating_sub(size);
+        self.cond.notify_all();
+    }
+
+    /// Returns true if any producers are currently blocked waiting for memory.
+    /// Used by `ready()` to mark all batches as immediately sendable when
+    /// memory is exhausted (matching Java's `exhausted` flag).
+    pub fn has_waiters(&self) -> bool {
+        self.waiting_count.load(Ordering::Relaxed) > 0
+    }
+
+    /// Mark the limiter as closed and wake all blocked producers.
+    ///
+    /// The state lock is held while the flag is set and the condition variable
+    /// is notified. Without it, a producer that has checked `closed` but not
+    /// yet started waiting would miss the notification and stay blocked until
+    /// its timeout expires.
+    fn close(&self) {
+        let _state = self.state.lock();
+        self.closed.store(true, Ordering::Release);
+        self.cond.notify_all();
+    }
+}
+
+/// RAII guard that releases memory back to the `MemoryLimiter` on drop.
+///
+/// Created by `MemoryLimiter::acquire`; the reserved bytes stay accounted for
+/// as long as the permit is alive.
+pub(crate) struct MemoryPermit {
+    /// Limiter the bytes were reserved from and will be returned to.
+    limiter: Arc<MemoryLimiter>,
+    /// Number of bytes this permit holds.
+    size: usize,
+}
+
+impl std::fmt::Debug for MemoryPermit {
+    /// Renders only the reserved `size`; the limiter handle is elided and the
+    /// output is marked as non-exhaustive.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let mut builder = f.debug_struct("MemoryPermit");
+        builder.field("size", &self.size);
+        builder.finish_non_exhaustive()
+    }
+}
+
+impl Drop for MemoryPermit {
+    /// Hands the reserved bytes back to the limiter.
+    fn drop(&mut self) {
+        // A zero-sized permit reserved nothing, so there is nothing to return.
+        if self.size == 0 {
+            return;
+        }
+        self.limiter.release(self.size);
+    }
+}
+
+// Type alias to simplify complex nested types: a list of buckets, each paired
+// with a shared handle to that bucket's queue of pending write batches.
+type BucketBatches = Vec<(BucketId, Arc<Mutex<VecDeque<WriteBatch>>>)>;
+
+/// Buffers write records into per-bucket batch queues until the sender drains
+/// them, and tracks the memory reserved for batches that are not yet complete.
+#[allow(dead_code)]
+pub struct RecordAccumulator {
+    /// Client configuration; supplies the default batch size and is passed on
+    /// when per-table state is created.
+    config: Config,
+    /// Per physical table path: its buckets and their queues of pending batches.
+    write_batches: DashMap<Arc<PhysicalTablePath>, BucketAndWriteBatches>,
+    // batch_id -> (complete callback, memory permit)
+    incomplete_batches: RwLock<HashMap<i64, (ResultHandle, MemoryPermit)>>,
+    /// Copy of `config.writer_batch_timeout_ms`; a batch that has waited at
+    /// least this long is considered expired and therefore sendable.
+    batch_timeout_ms: i64,
+    /// Set by `close()`; makes every batch immediately sendable.
+    closed: AtomicBool,
+    /// Counter of flushes in progress (presumably what `flush_in_progress()`
+    /// consults — that method is defined outside this view; confirm).
+    flushes_in_progress: AtomicI32,
+    /// NOTE(review): not read or written by any code visible here.
+    appends_in_progress: i32,
+    /// Node id -> index of the bucket last visited by drain, used to resume
+    /// round-robin draining on the next call.
+    nodes_drain_index: Mutex<HashMap<i32, usize>>,
+    /// Source of batch ids; incremented by one for every new batch.
+    batch_id: AtomicI64,
+    /// Writer-id / batch-sequence bookkeeping used while draining and re-enqueuing.
+    idempotence_manager: Arc<IdempotenceManager>,
+    /// Bounds the total memory reserved for batches.
+    memory_limiter: Arc<MemoryLimiter>,
+    /// Wakes the sender task when new batches are created or existing batches
+    /// become full, so the sender can drain them immediately instead of waiting
+    /// for its next poll cycle. This is the Rust equivalent of Java's
+    /// `Sender.wakeup()` / Kafka's `RecordAccumulator.wakeup()`.
+    sender_wakeup: Notify,
+}
+
+impl RecordAccumulator {
+    /// Builds an empty accumulator.
+    ///
+    /// The batch timeout and the memory limiter (total buffer size and maximum
+    /// wait time) are derived from `config`; all counters start at zero and all
+    /// maps start empty.
+    pub fn new(config: Config, idempotence_manager: Arc<IdempotenceManager>) -> Self {
+        let wait_timeout = Duration::from_millis(config.writer_buffer_wait_timeout_ms);
+        let limiter = MemoryLimiter::new(config.writer_buffer_memory_size, wait_timeout);
+        let batch_timeout_ms = config.writer_batch_timeout_ms;
+
+        Self {
+            batch_timeout_ms,
+            memory_limiter: Arc::new(limiter),
+            idempotence_manager,
+            sender_wakeup: Notify::new(),
+            write_batches: Default::default(),
+            incomplete_batches: Default::default(),
+            nodes_drain_index: Default::default(),
+            closed: AtomicBool::new(false),
+            flushes_in_progress: AtomicI32::new(0),
+            appends_in_progress: 0,
+            batch_id: AtomicI64::new(0),
+            config,
+        }
+    }
+
+    /// Attempts to add `record` to the last batch already queued in `dq`.
+    ///
+    /// Returns `Ok(None)` when the queue is empty or the last batch did not
+    /// accept the record; otherwise returns the append result, flagged as
+    /// "batch full" when more than one batch is queued or the last batch is
+    /// closed after the append.
+    fn try_append(
+        &self,
+        record: &WriteRecord,
+        dq: &mut VecDeque<WriteBatch>,
+    ) -> Result<Option<RecordAppendResult>> {
+        let queued = dq.len();
+        let Some(tail) = dq.back_mut() else {
+            return Ok(None);
+        };
+        let Some(handle) = tail.try_append(record)? else {
+            return Ok(None);
+        };
+
+        let batch_is_full = queued > 1 || tail.is_closed();
+        Ok(Some(RecordAppendResult::new(
+            handle,
+            batch_is_full,
+            false,
+            false,
+        )))
+    }
+
+    /// Creates a fresh batch for `record`, appends the record to it, queues the
+    /// batch at the back of `dq` and registers it as incomplete.
+    ///
+    /// `permit` is the memory reservation backing the batch; it is stored with
+    /// the batch's result handle in `incomplete_batches`. `alloc_size` is the
+    /// size the batch is created with.
+    ///
+    /// # Errors
+    /// Propagates failures from the table lookup, the compression-info lookup,
+    /// batch construction and the first append.
+    ///
+    /// # Panics
+    /// Panics if the brand-new batch refuses the record.
+    fn append_new_batch(
+        &self,
+        cluster: &Cluster,
+        record: &WriteRecord,
+        dq: &mut VecDeque<WriteBatch>,
+        permit: MemoryPermit,
+        alloc_size: usize,
+        compression_ratio_estimator: Arc<ArrowCompressionRatioEstimator>,
+    ) -> Result<RecordAppendResult> {
+        let physical_table_path = &record.physical_table_path;
+        let table_info = cluster.get_table(physical_table_path.get_table_path())?;
+        let arrow_compression_info = table_info.get_table_config().get_arrow_compression_info()?;
+        let schema_id = table_info.schema_id;
+
+        // The batch flavour follows the record flavour: Arrow log vs. KV.
+        let mut new_batch: WriteBatch = match record.record() {
+            Record::Log(_) => {
+                let log_batch = ArrowLogWriteBatch::new(
+                    self.batch_id.fetch_add(1, Ordering::Relaxed),
+                    Arc::clone(physical_table_path),
+                    schema_id,
+                    arrow_compression_info,
+                    &table_info.row_type,
+                    current_time_ms(),
+                    matches!(&record.record, Record::Log(LogWriteRecord::RecordBatch(_))),
+                    alloc_size,
+                    compression_ratio_estimator,
+                )?;
+                ArrowLog(log_batch)
+            }
+            Record::Kv(kv_record) => {
+                let kv_batch = KvWriteBatch::new(
+                    self.batch_id.fetch_add(1, Ordering::Relaxed),
+                    Arc::clone(physical_table_path),
+                    schema_id,
+                    alloc_size,
+                    record.write_format.to_kv_format()?,
+                    kv_record.target_columns.clone(),
+                    current_time_ms(),
+                );
+                Kv(kv_batch)
+            }
+        };
+
+        let id = new_batch.batch_id();
+        let handle = new_batch
+            .try_append(record)?
+            .expect("must append to a new batch");
+
+        // Capture the closed flag before the batch is moved into the queue.
+        let closed_after_append = new_batch.is_closed();
+        dq.push_back(new_batch);
+        let batch_is_full = dq.len() > 1 || closed_after_append;
+
+        self.incomplete_batches
+            .write()
+            .insert(id, (handle.clone(), permit));
+
+        Ok(RecordAppendResult::new(handle, batch_is_full, true, false))
+    }
+
+    /// Appends `record` to the batch queue of `bucket_id`, creating a new batch
+    /// when the last queued batch cannot take it.
+    ///
+    /// Steps:
+    /// 1. Look up (or create) the per-table state and the per-bucket queue.
+    /// 2. Try to append to the last queued batch.
+    /// 3. If that fails and `abort_if_batch_full` is set, return a result without
+    ///    a result handle instead of allocating a new batch.
+    /// 4. Otherwise reserve memory (blocking, with the queue lock released),
+    ///    retry the append, and finally create a new batch.
+    ///
+    /// # Errors
+    /// Propagates errors from the table lookup, from appending to a batch, from
+    /// the memory limiter (`acquire`), and from creating the new batch.
+    pub fn append(
+        &self,
+        record: &WriteRecord<'_>,
+        bucket_id: BucketId,
+        cluster: &Cluster,
+        abort_if_batch_full: bool,
+    ) -> Result<RecordAppendResult> {
+        let physical_table_path = &record.physical_table_path;
+        let table_path = physical_table_path.get_table_path();
+        let table_info = cluster.get_table(table_path)?;
+        let is_partitioned_table = table_info.is_partitioned();
+
+        // Only partitioned tables have a partition id to resolve.
+        let partition_id = if is_partitioned_table {
+            cluster.get_partition_id(physical_table_path)
+        } else {
+            None
+        };
+
+        // Scope the map entry so it is released before any queue lock is taken
+        // or any blocking happens below.
+        let (dq, compression_ratio_estimator, dynamic_target) = {
+            let mut binding = self
+                .write_batches
+                .entry(Arc::clone(physical_table_path))
+                .or_insert_with(|| {
+                    BucketAndWriteBatches::new(
+                        table_info.table_id,
+                        is_partitioned_table,
+                        partition_id,
+                        &self.config,
+                    )
+                });
+            let bucket_and_batches = binding.value_mut();
+            let dq = bucket_and_batches
+                .batches
+                .entry(bucket_id)
+                .or_insert_with(|| Arc::new(Mutex::new(VecDeque::new())))
+                .clone();
+            // Current dynamic batch-size estimate, if this table has an estimator.
+            let dynamic_target = bucket_and_batches
+                .dynamic_batch_size
+                .as_ref()
+                .map(|est| est.current());
+            (
+                dq,
+                Arc::clone(&bucket_and_batches.compression_ratio_estimator),
+                dynamic_target,
+            )
+        };
+
+        // Fast path: the last queued batch accepts the record.
+        let mut dq_guard = dq.lock();
+        if let Some(append_result) = self.try_append(record, &mut dq_guard)? {
+            return Ok(append_result);
+        }
+
+        // The caller asked not to open a new batch; report that without a handle.
+        if abort_if_batch_full {
+            return Ok(RecordAppendResult::new_without_result_handle(
+                true, false, true,
+            ));
+        }
+
+        // Drop dq lock before blocking on memory to prevent deadlock:
+        // producer holds dq + blocks on memory, while sender needs dq to drain.
+        drop(dq_guard);
+
+        // Reserve at least enough for this record, even if it is larger than
+        // the target batch size.
+        let batch_size = dynamic_target.unwrap_or(self.config.writer_batch_size as usize);
+        let record_size = record.estimated_record_size();
+        let alloc_size = batch_size.max(record_size);
+        let permit = self.memory_limiter.acquire(alloc_size)?;
+
+        // Re-acquire dq lock after memory is available
+        let mut dq_guard = dq.lock();
+        // Re-try: another thread may have created a batch while we waited
+        if let Some(append_result) = self.try_append(record, &mut dq_guard)? {
+            return Ok(append_result); // permit drops here, memory released
+        }
+
+        self.append_new_batch(
+            cluster,
+            record,
+            &mut dq_guard,
+            permit,
+            alloc_size,
+            compression_ratio_estimator,
+        )
+    }
+
+    /// Determines which leader nodes have at least one sendable batch.
+    ///
+    /// Returns the set of ready nodes, the delay (in ms) until the next check is
+    /// worthwhile, and the tables for which a leader (or partition) is unknown.
+    ///
+    /// # Errors
+    /// Propagates errors from the per-table readiness check.
+    pub fn ready(&self, cluster: &Arc<Cluster>) -> Result<ReadyCheckResult> {
+        // Copy out only the shared handles we need so no map entry stays
+        // borrowed while the per-bucket checks run.
+        let mut snapshot: Vec<(Arc<PhysicalTablePath>, Option<PartitionId>, BucketBatches)> =
+            Vec::new();
+        for entry in self.write_batches.iter() {
+            let table_state = entry.value();
+            let mut deques: BucketBatches = Vec::new();
+            for (bucket_id, deque) in table_state.batches.iter() {
+                deques.push((*bucket_id, deque.clone()));
+            }
+            snapshot.push((Arc::clone(entry.key()), table_state.partition_id, deques));
+        }
+
+        let mut ready_nodes = HashSet::new();
+        let mut next_ready_check_delay_ms = self.batch_timeout_ms;
+        let mut unknown_leader_tables = HashSet::new();
+        // Producers blocked on memory make every batch sendable right away.
+        let exhausted = self.memory_limiter.has_waiters();
+
+        for (physical_table_path, mut partition_id, bucket_batches) in snapshot {
+            let is_partitioned_table = physical_table_path.get_partition_name().is_some();
+            next_ready_check_delay_ms = self.bucket_ready(
+                &physical_table_path,
+                is_partitioned_table,
+                &mut partition_id,
+                bucket_batches,
+                &mut ready_nodes,
+                &mut unknown_leader_tables,
+                cluster,
+                next_ready_check_delay_ms,
+                exhausted,
+            )?;
+        }
+
+        Ok(ReadyCheckResult {
+            ready_nodes,
+            next_ready_check_delay_ms,
+            unknown_leader_tables,
+        })
+    }
+
+    /// Runs the readiness check for every bucket of one table.
+    ///
+    /// Leaders with a sendable head batch are added to `ready_nodes`; the table
+    /// is added to `unknown_leader_tables` when its partition cannot be resolved
+    /// or a bucket has no known leader. Returns the (possibly reduced) delay
+    /// until the next readiness check.
+    ///
+    /// # Errors
+    /// Propagates errors from resolving a bucket's `TableBucket`.
+    #[allow(clippy::too_many_arguments)]
+    fn bucket_ready(
+        &self,
+        physical_table_path: &Arc<PhysicalTablePath>,
+        is_partitioned_table: bool,
+        partition_id: &mut Option<PartitionId>,
+        bucket_batches: BucketBatches,
+        ready_nodes: &mut HashSet<ServerNode>,
+        unknown_leader_tables: &mut HashSet<Arc<PhysicalTablePath>>,
+        cluster: &Cluster,
+        next_ready_check_delay_ms: i64,
+        exhausted: bool,
+    ) -> Result<i64> {
+        // A partitioned table needs a known partition id before its buckets
+        // can be considered.
+        if is_partitioned_table && partition_id.is_none() {
+            match cluster.get_partition_id(physical_table_path) {
+                resolved @ Some(_) => {
+                    // Remember the id in the cached per-table state.
+                    if let Some(mut entry) = self.write_batches.get_mut(physical_table_path) {
+                        entry.partition_id = resolved;
+                    }
+                }
+                None => {
+                    log::debug!(
+                        "Partition does not exist for {}, bucket will not be set to ready",
+                        physical_table_path.as_ref()
+                    );
+
+                    // TODO: we shouldn't add unready partitions to unknownLeaderTables,
+                    // because it causes PartitionNotExistException later
+                    unknown_leader_tables.insert(Arc::clone(physical_table_path));
+                    return Ok(next_ready_check_delay_ms);
+                }
+            }
+        }
+
+        let mut delay = next_ready_check_delay_ms;
+        for (bucket_id, deque) in bucket_batches {
+            let queue = deque.lock();
+            // Nothing queued for this bucket.
+            let Some(head) = queue.front() else {
+                continue;
+            };
+
+            let waited_time_ms = head.waited_time_ms(current_time_ms());
+            let full = queue.len() > 1 || head.is_closed();
+            let table_bucket = cluster.get_table_bucket(physical_table_path, bucket_id)?;
+            match cluster.leader_for(&table_bucket) {
+                Some(leader) => {
+                    delay = self.batch_ready(
+                        leader,
+                        waited_time_ms,
+                        full,
+                        exhausted,
+                        ready_nodes,
+                        delay,
+                    );
+                }
+                None => {
+                    unknown_leader_tables.insert(Arc::clone(physical_table_path));
+                }
+            }
+        }
+        Ok(delay)
+    }
+
+    /// Decides whether the head batch of one bucket makes its `leader` ready.
+    ///
+    /// A batch is sendable when it is full, has waited at least
+    /// `batch_timeout_ms`, memory is exhausted, the accumulator is closed, or a
+    /// flush is in progress. If it is not sendable, the returned delay is capped
+    /// by the time left until the batch expires. Leaders already in
+    /// `ready_nodes` are left untouched.
+    fn batch_ready(
+        &self,
+        leader: &ServerNode,
+        waited_time_ms: i64,
+        full: bool,
+        exhausted: bool,
+        ready_nodes: &mut HashSet<ServerNode>,
+        next_ready_check_delay_ms: i64,
+    ) -> i64 {
+        // Already marked ready by another bucket: nothing to decide.
+        if ready_nodes.contains(leader) {
+            return next_ready_check_delay_ms;
+        }
+
+        let expired = waited_time_ms >= self.batch_timeout_ms;
+        let sendable = full
+            || expired
+            || exhausted
+            || self.closed.load(Ordering::Acquire)
+            || self.flush_in_progress();
+
+        if !sendable {
+            let remaining_ms = self.batch_timeout_ms.saturating_sub(waited_time_ms);
+            return next_ready_check_delay_ms.min(remaining_ms);
+        }
+
+        ready_nodes.insert(leader.clone());
+        next_ready_check_delay_ms
+    }
+
+    /// Drains sendable batches for each of `nodes`, limiting every node to
+    /// roughly `max_size` bytes.
+    ///
+    /// Returns a map from node id to the batches drained for it; nodes that
+    /// yielded no batch are omitted.
+    ///
+    /// # Errors
+    /// Propagates the first error raised while draining a node.
+    pub fn drain(
+        &self,
+        cluster: Arc<Cluster>,
+        nodes: &HashSet<ServerNode>,
+        max_size: i32,
+    ) -> Result<HashMap<i32, Vec<ReadyWriteBatch>>> {
+        let mut drained_by_node = HashMap::new();
+        // With no nodes the loop body never runs and the empty map is returned.
+        for node in nodes {
+            let node_batches = self.drain_batches_for_one_node(&cluster, node, max_size)?;
+            if node_batches.is_empty() {
+                continue;
+            }
+            drained_by_node.insert(node.id(), node_batches);
+        }
+
+        Ok(drained_by_node)
+    }
+
+    /// Matches Java's `shouldStopDrainBatchesForBucket`. Returns true if
+    /// this bucket should be skipped during drain.
+    ///
+    /// `first` is the head batch of the bucket's queue. With idempotence
+    /// disabled nothing is ever skipped; with an invalid writer id everything is.
+    fn should_stop_drain_batches_for_bucket(
+        &self,
+        first: &WriteBatch,
+        table_bucket: &TableBucket,
+    ) -> bool {
+        let idempotence = &self.idempotence_manager;
+
+        if !idempotence.is_enabled() {
+            return false;
+        }
+        if !idempotence.is_writer_id_valid() {
+            return true;
+        }
+
+        // Nothing in flight for this bucket: the head batch may always go.
+        if idempotence.in_flight_count(table_bucket) == 0 {
+            return false;
+        }
+
+        if first.has_batch_sequence() {
+            // Re-enqueued batch: it may only go if it is the first in-flight one.
+            //
+            // Use batch_id comparison instead of sequence comparison. After
+            // handle_failed_batch adjusts InFlightBatch sequences, the WriteBatch's
+            // stored sequence may be stale (re_enqueue syncs it, but this is more
+            // robust). Java can compare sequences because resetWriterState mutates
+            // the batch directly; Rust uses lightweight InFlightBatch proxies.
+            return !idempotence.is_first_in_flight_batch(table_bucket, first.batch_id());
+        }
+
+        // Fresh batch: respect max in-flight limit
+        !idempotence.can_send_more_requests(table_bucket)
+    }
+
+    /// Drains head batches for the buckets returned by
+    /// `get_all_buckets_in_current_node` for `node`, visiting each bucket at
+    /// most once per call and taking at most one batch (the queue head) from it.
+    ///
+    /// Buckets are visited round-robin, starting from the index stored for this
+    /// node by the previous call. Draining stops once adding the next batch
+    /// would exceed `max_size` bytes, unless nothing has been drained yet.
+    /// Fresh batches receive a writer id and batch sequence when idempotence
+    /// is enabled.
+    ///
+    /// Returns the drained batches in visiting order.
+    fn drain_batches_for_one_node(
+        &self,
+        cluster: &Cluster,
+        node: &ServerNode,
+        max_size: i32,
+    ) -> Result<Vec<ReadyWriteBatch>> {
+        // Running total of the estimated size of the batches drained so far.
+        let mut size: usize = 0;
+        let buckets = self.get_all_buckets_in_current_node(node, cluster);
+        let mut ready = Vec::new();
+
+        if buckets.is_empty() {
+            return Ok(ready);
+        }
+
+        // Resume where the previous drain for this node left off; the modulo
+        // guards against the bucket list having shrunk since then.
+        let start = {
+            let mut nodes_drain_index_guard = self.nodes_drain_index.lock();
+            let drain_index = nodes_drain_index_guard.entry(node.id()).or_insert(0);
+            *drain_index % buckets.len()
+        };
+
+        let mut current_index = start;
+        let mut last_processed_index;
+
+        loop {
+            let bucket = &buckets[current_index];
+            let table_path = bucket.physical_table_path();
+            let table_bucket = bucket.table_bucket.clone();
+            // Record the bucket being visited, then advance (wrapping around).
+            last_processed_index = current_index;
+            current_index = (current_index + 1) % buckets.len();
+
+            // Clone the queue handle so the map entry is not held while the
+            // queue itself is locked.
+            let deque = self
+                .write_batches
+                .get(table_path)
+                .and_then(|bucket_and_write_batches| {
+                    bucket_and_write_batches
+                        .batches
+                        .get(&table_bucket.bucket_id())
+                        .cloned()
+                });
+
+            if let Some(deque) = deque {
+                let mut maybe_batch = None;
+                {
+                    // The queue lock is held only inside this scope.
+                    let mut batch_lock = deque.lock();
+                    if !batch_lock.is_empty() {
+                        let first_batch = batch_lock.front().unwrap();
+
+                        if size + first_batch.estimated_size_in_bytes() > max_size as usize
+                            && !ready.is_empty()
+                        {
+                            // there is a rare case that a single batch size is larger than the request size
+                            // due to compression; in this case we will still eventually send this batch in
+                            // a single request.
+                            break;
+                        }
+
+                        // Improvement: `continue` instead of `break` to skip
+                        // only this bucket, not all buckets for the node.
+                        if self.should_stop_drain_batches_for_bucket(first_batch, &table_bucket) {
+                            // `continue` bypasses the wrap-around check at the
+                            // bottom of the loop, so repeat it here.
+                            if current_index == start {
+                                break;
+                            }
+                            continue;
+                        }
+
+                        maybe_batch = Some(batch_lock.pop_front().unwrap());
+                    }
+                }
+
+                if let Some(ref mut batch) = maybe_batch {
+                    // Assign writer state to fresh batches (matching Java's drain loop)
+                    let writer_id = if self.idempotence_manager.is_enabled() {
+                        self.idempotence_manager.writer_id()
+                    } else {
+                        NO_WRITER_ID
+                    };
+                    // Batches that already carry a sequence (re-enqueued ones)
+                    // keep their existing writer state.
+                    if writer_id != NO_WRITER_ID && !batch.has_batch_sequence() {
+                        self.idempotence_manager
+                            .maybe_update_writer_id(&table_bucket);
+                        let seq = self
+                            .idempotence_manager
+                            .next_sequence_and_increment(&table_bucket);
+                        batch.set_writer_state(writer_id, seq);
+                        self.idempotence_manager.add_in_flight_batch(
+                            &table_bucket,
+                            seq,
+                            batch.batch_id(),
+                        );
+                    }
+                }
+
+                if let Some(mut batch) = maybe_batch {
+                    let current_batch_size = batch.estimated_size_in_bytes();
+                    size += current_batch_size;
+
+                    // Feed the observed size to the table's dynamic batch-size
+                    // estimator, if it has one.
+                    self.record_actual_batch_size(table_path, current_batch_size);
+
+                    // mark the batch as drained.
+                    batch.drained(current_time_ms());
+                    ready.push(ReadyWriteBatch {
+                        table_bucket,
+                        write_batch: batch,
+                    });
+                }
+            }
+            // Every bucket has been visited once.
+            if current_index == start {
+                break;
+            }
+        }
+
+        // Store the last processed index to maintain round-robin fairness
+        {
+            let mut nodes_drain_index_guard = self.nodes_drain_index.lock();
+            nodes_drain_index_guard.insert(node.id(), last_processed_index);
+        }
+
+        Ok(ready)
+    }
+
+    /// Forgets the incomplete-batch entry for `batch_id`, if any. Dropping the
+    /// entry also drops its memory permit, returning the bytes to the limiter.
+    pub fn remove_incomplete_batches(&self, batch_id: i64) {
+        let mut incomplete = self.incomplete_batches.write();
+        incomplete.remove(&batch_id);
+    }
+
+    /// Feeds the observed size of a drained batch into the table's dynamic
+    /// batch-size estimator and logs when the estimate changes. Does nothing if
+    /// the table is unknown or has no estimator.
+    fn record_actual_batch_size(&self, table_path: &Arc<PhysicalTablePath>, actual: usize) {
+        if let Some(entry) = self.write_batches.get(table_path) {
+            if let Some(estimator) = entry.dynamic_batch_size.as_ref() {
+                let prev = estimator.current();
+                let next = estimator.update(actual);
+                if next != prev {
+                    log::debug!(
+                        "Set estimated batch size for {} from {} to {}",
+                        table_path.as_ref(),
+                        prev,
+                        next
+                    );
+                }
+            }
+        }
+    }
+
+    /// Test helper: current dynamic batch-size estimate for `table_path`, or
+    /// `None` if the table is unknown or has no estimator.
+    #[cfg(test)]
+    fn estimated_batch_size(&self, table_path: &Arc<PhysicalTablePath>) -> Option<usize> {
+        let entry = self.write_batches.get(table_path)?;
+        let estimator = entry.dynamic_batch_size.as_ref()?;
+        Some(estimator.current())
+    }
+
+    /// Puts a previously drained batch back into its bucket queue so it can be
+    /// sent again.
+    ///
+    /// With idempotence enabled the batch's sequence is first synced with the
+    /// idempotence manager and the batch is inserted in sequence order;
+    /// otherwise it simply goes to the front of the queue.
+    pub fn re_enqueue(&self, mut ready_write_batch: ReadyWriteBatch) {
+        ready_write_batch.write_batch.re_enqueued();
+
+        // Sync WriteBatch sequence with IdempotenceManager's adjusted sequence.
+        // When handle_failed_batch adjusts InFlightBatch sequences (after a prior
+        // batch fails), the WriteBatch is not updated (unlike Java which calls
+        // resetWriterState on the actual batch). We must sync here so that:
+        // 1. should_stop_drain_batches_for_bucket comparisons work correctly
+        // 2. build() produces bytes with the correct (adjusted) sequence
+        if self.idempotence_manager.is_enabled()
+            && ready_write_batch.write_batch.has_batch_sequence()
+        {
+            let adjusted = self.idempotence_manager.get_adjusted_sequence(
+                &ready_write_batch.table_bucket,
+                ready_write_batch.write_batch.batch_id(),
+            );
+            match adjusted {
+                Some(seq) if seq != ready_write_batch.write_batch.batch_sequence() => {
+                    let writer_id = ready_write_batch.write_batch.writer_id();
+                    ready_write_batch
+                        .write_batch
+                        .set_writer_state(writer_id, seq);
+                }
+                _ => {}
+            }
+        }
+
+        let deque = self.get_or_create_deque(&ready_write_batch);
+        let mut queue = deque.lock();
+        if !self.idempotence_manager.is_enabled() {
+            queue.push_front(ready_write_batch.write_batch);
+        } else {
+            self.insert_in_sequence_order(&mut queue, ready_write_batch);
+        }
+    }
+
+    /// Insert a re-enqueued batch in sequence order. Matches Java's
+    /// `insertInSequenceOrder`. If the batch is the next expected in-flight,
+    /// push to front; otherwise, find the correct sorted position.
+    ///
+    /// The sorted position is directly after the leading run of batches whose
+    /// sequence is not greater than this batch's, i.e. in front of the first
+    /// batch that either has a greater sequence or has no sequence yet. Fresh
+    /// (unsequenced) batches receive their sequence at drain time, so a
+    /// re-enqueued batch must never end up behind them: it would otherwise be
+    /// sent after batches carrying higher sequences.
+    ///
+    /// In debug builds, asserts that the batch carries a sequence and that its
+    /// bucket has in-flight batches tracked.
+    fn insert_in_sequence_order(
+        &self,
+        dq: &mut VecDeque<WriteBatch>,
+        ready_write_batch: ReadyWriteBatch,
+    ) {
+        debug_assert!(
+            ready_write_batch.write_batch.batch_sequence() != NO_BATCH_SEQUENCE,
+            "Re-enqueuing a batch without a sequence (batch_id={})",
+            ready_write_batch.write_batch.batch_id()
+        );
+        debug_assert!(
+            self.idempotence_manager
+                .in_flight_count(&ready_write_batch.table_bucket)
+                > 0,
+            "Re-enqueuing a batch not tracked in in-flight (batch_id={}, bucket={})",
+            ready_write_batch.write_batch.batch_id(),
+            ready_write_batch.table_bucket
+        );
+
+        if dq.is_empty() {
+            dq.push_front(ready_write_batch.write_batch);
+            return;
+        }
+
+        // If it's the first in-flight batch for its bucket, push to front
+        if self.idempotence_manager.is_first_in_flight_batch(
+            &ready_write_batch.table_bucket,
+            ready_write_batch.write_batch.batch_id(),
+        ) {
+            dq.push_front(ready_write_batch.write_batch);
+            return;
+        }
+
+        // Find the correct position sorted by batch_sequence. The scan stops at
+        // the first unsequenced batch as well, so the re-enqueued batch always
+        // stays ahead of fresh batches.
+        let batch_seq = ready_write_batch.write_batch.batch_sequence();
+        let insert_pos = dq
+            .iter()
+            .position(|existing| {
+                !existing.has_batch_sequence() || existing.batch_sequence() > batch_seq
+            })
+            .unwrap_or(dq.len());
+        dq.insert(insert_pos, ready_write_batch.write_batch);
+    }
+
+    /// Returns the batch queue for the bucket `ready_write_batch` belongs to,
+    /// creating the per-table state and/or the queue if they do not exist yet.
+    ///
+    /// A table is treated as partitioned when the batch's `TableBucket` carries
+    /// a partition id.
+    fn get_or_create_deque(
+        &self,
+        ready_write_batch: &ReadyWriteBatch,
+    ) -> Arc<Mutex<VecDeque<WriteBatch>>> {
+        let target = &ready_write_batch.table_bucket;
+        let table_id = target.table_id();
+        let partition_id = target.partition_id();
+        let bucket_id = target.bucket_id();
+        let path_key = Arc::clone(ready_write_batch.write_batch.physical_table_path());
+
+        let mut table_entry = self.write_batches.entry(path_key).or_insert_with(|| {
+            BucketAndWriteBatches::new(
+                table_id,
+                partition_id.is_some(),
+                partition_id,
+                &self.config,
+            )
+        });
+
+        let deque = table_entry
+            .value_mut()
+            .batches
+            .entry(bucket_id)
+            .or_insert_with(|| Arc::new(Mutex::new(VecDeque::new())))
+            .clone();
+        deque
+    }
+
+    /// Mark the accumulator as closed. All batches become immediately ready
+    /// (sendable) in `batch_ready`, triggering a full drain without waiting
+    /// for `batch_timeout_ms`. Matches Java's `RecordAccumulator.close()`.
+    pub fn close(&self) {
+        let closed_flag = &self.closed;
+        closed_flag.store(true, Ordering::Release);
+        // Wake the sender so it notices the closed state right away.
+        self.sender_wakeup.notify_one();
+    }
+
+    pub fn is_closed(&self) -> bool {
+        self.closed.load(Ordering::Acquire) // true once `close` has stored the flag
+    }
+
+    pub fn abort_batches(&self, error: broadcast::Error) {
+        self.memory_limiter.close();
+        // Complete batches still in deques (not yet drained).
+        for mut entry in self.write_batches.iter_mut() {
+            for (_bucket_id, deque) in entry.value_mut().batches.iter_mut() {
+                let mut dq = deque.lock();
+                while let Some(batch) = dq.pop_front() {
+                    batch.complete(Err(error.clone()));
+                }
+            }
+        }
+        // Fail any remaining handles (including in-flight batches that were
+        // drained but not yet completed). This is a no-op for handles already
+        // completed above via WriteBatch::complete.
+        let mut incomplete = self.incomplete_batches.write();
+        for (handle, _permit) in incomplete.values() {
+            handle.fail(error.clone());
+        }
+        incomplete.clear();
+    }
+
+    pub fn has_incomplete(&self) -> bool {
+        !self.incomplete_batches.read().is_empty() // any handle still registered?
+    }
+
+    /// Wakes the sender task (via `notify_one`) so it can drain ready batches immediately.
+    pub fn wakeup_sender(&self) {
+        self.sender_wakeup.notify_one();
+    }
+
+    /// Returns a future that completes when `wakeup_sender()` is called.
+    pub fn notified(&self) -> tokio::sync::futures::Notified<'_> {
+        self.sender_wakeup.notified() // tokio `Notified` future borrowed from `&self`
+    }
+
+    /// Collects a clone of every bucket location in `cluster` whose leader is `current`.
+    fn get_all_buckets_in_current_node(
+        &self,
+        current: &ServerNode,
+        cluster: &Cluster,
+    ) -> Vec<BucketLocation> {
+        cluster
+            .get_bucket_locations_by_path()
+            .values()
+            .flatten()
+            // Locations without a leader never match.
+            .filter(|location| {
+                matches!(location.leader(), Some(leader) if current.id() == leader.id())
+            })
+            .cloned()
+            .collect()
+    }
+
+    /// Returns `true` when at least one bucket deque still holds a batch.
+    pub fn has_undrained(&self) -> bool {
+        self.write_batches.iter().any(|table_entry| {
+            table_entry
+                .value()
+                .batches
+                .values()
+                .any(|deque| !deque.lock().is_empty())
+        })
+    }
+
+    /// Snapshot of the physical table paths that currently have an entry in `write_batches`.
+    pub fn get_physical_table_paths_in_batches(&self) -> Vec<Arc<PhysicalTablePath>> {
+        let table_entries = self.write_batches.iter();
+        let paths = table_entries.map(|table_entry| Arc::clone(table_entry.key()));
+        paths.collect()
+    }
+
+    fn flush_in_progress(&self) -> bool {
+        self.flushes_in_progress.load(Ordering::SeqCst) > 0 // begin_flush calls not yet matched
+    }
+
+    pub fn begin_flush(&self) {
+        self.flushes_in_progress.fetch_add(1, Ordering::SeqCst); // one more flush pending
+        self.sender_wakeup.notify_one(); // same call `wakeup_sender` makes
+    }
+
+    /// Waits for every batch that is incomplete at call time, then ends the flush begun by
+    /// `begin_flush`. An error from `wait()` is returned; the per-batch outcome is discarded.
+    #[allow(unused_must_use)]
+    pub async fn await_flush_completion(&self) -> Result<()> {
+        // Decrement in `Drop` so the counter is restored on success, on error, and also when
+        // this future is dropped mid-await (cancellation), which a trailing statement misses.
+        struct EndFlush<'a>(&'a RecordAccumulator);
+        impl Drop for EndFlush<'_> {
+            fn drop(&mut self) {
+                self.0.flushes_in_progress.fetch_sub(1, Ordering::SeqCst);
+            }
+        }
+        let _end_flush = EndFlush(self);
+        // Clone the handles first so the RwLock read guard is not held across await points.
+        let handles: Vec<_> = self
+            .incomplete_batches
+            .read()
+            .values()
+            .map(|(h, _)| h.clone())
+            .collect();
+        for result_handle in handles {
+            result_handle.wait().await?;
+        }
+        Ok(())
+    }
+}
+
+pub struct ReadyWriteBatch {
+    pub table_bucket: TableBucket, // bucket whose deque this batch is queued under
+    pub write_batch: WriteBatch, // the batch itself
+}
+
+impl ReadyWriteBatch {
+    pub fn write_batch(&self) -> &WriteBatch {
+        &self.write_batch // borrowing accessor for the public field
+    }
+}
+
+#[allow(dead_code)]
+struct BucketAndWriteBatches {
+    table_id: TableId,
+    is_partitioned_table: bool, // passed in by the creator alongside `partition_id`
+    partition_id: Option<PartitionId>,
+    batches: HashMap<BucketId, Arc<Mutex<VecDeque<WriteBatch>>>>, // one shared deque per bucket
+    /// Compression ratio estimator shared across Arrow log batches for this table.
+    compression_ratio_estimator: Arc<ArrowCompressionRatioEstimator>,
+    /// `None` when `writer_dynamic_batch_size_enabled` is false.
+    dynamic_batch_size: Option<DynamicWriteBatchSizeEstimator>,
+}
+
+impl BucketAndWriteBatches {
+    /// Builds an entry with no bucket deques and a default compression-ratio estimator.
+    /// The dynamic batch-size estimator is created only when
+    /// `config.writer_dynamic_batch_size_enabled` is set; otherwise the field is `None`.
+    fn new(
+        table_id: TableId,
+        is_partitioned_table: bool,
+        partition_id: Option<PartitionId>,
+        config: &Config,
+    ) -> Self {
+        let dynamic_min = config.writer_dynamic_batch_size_min as usize;
+        let batch_size = config.writer_batch_size as usize;
+        let new_estimator = || DynamicWriteBatchSizeEstimator::new(dynamic_min, batch_size);
+        Self {
+            table_id,
+            is_partitioned_table,
+            partition_id,
+            batches: Default::default(),
+            compression_ratio_estimator: Arc::new(ArrowCompressionRatioEstimator::default()),
+            dynamic_batch_size: config.writer_dynamic_batch_size_enabled.then(new_estimator),
+        }
+    }
+}
+
+pub struct RecordAppendResult {
+    pub batch_is_full: bool,
+    pub new_batch_created: bool,
+    pub abort_record_for_new_batch: bool,
+    pub result_handle: Option<ResultHandle>, // `None` when built by `new_without_result_handle`
+}
+
+impl RecordAppendResult {
+    /// Builds a result that carries `result_handle`; the three flags are stored as given.
+    fn new(
+        result_handle: ResultHandle,
+        batch_is_full: bool,
+        new_batch_created: bool,
+        abort_record_for_new_batch: bool,
+    ) -> Self {
+        let (full, created, abort) = (batch_is_full, new_batch_created, abort_record_for_new_batch);
+        let mut result = Self::new_without_result_handle(full, created, abort);
+        result.result_handle = Some(result_handle);
+        result
+    }
+
+    /// Builds a result with the given flags and no result handle.
+    fn new_without_result_handle(
+        batch_is_full: bool,
+        new_batch_created: bool,
+        abort_record_for_new_batch: bool,
+    ) -> Self {
+        Self {
+            result_handle: None,
+            abort_record_for_new_batch,
+            new_batch_created,
+            batch_is_full,
+        }
+    }
+}
+
+pub struct ReadyCheckResult {
+    pub ready_nodes: HashSet<ServerNode>,
+    pub next_ready_check_delay_ms: i64, // milliseconds, per the `_ms` suffix
+    pub unknown_leader_tables: HashSet<Arc<PhysicalTablePath>>, // per name: leader unknown
+}
+
+impl ReadyCheckResult {
+    pub fn new(
+        ready_nodes: HashSet<ServerNode>,
+        next_ready_check_delay_ms: i64,
+        unknown_leader_tables: HashSet<Arc<PhysicalTablePath>>,
+    ) -> Self { // plain field-for-field constructor; nothing is validated or derived
+        Self {
+            unknown_leader_tables,
+            next_ready_check_delay_ms,
+            ready_nodes,
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::client::write::write_format::WriteFormat;
+    use crate::client::write::{RowBytes, WriteRecord};
+    use crate::metadata::TablePath;
+    use crate::row::{Datum, GenericRow};
+    use crate::test_utils::{build_cluster, build_table_info};
+    use bytes::Bytes;
+    use std::sync::Arc;
+
+    fn disabled_idempotence() -> Arc<IdempotenceManager> {
+        Arc::new(IdempotenceManager::new(false, 5)) // idempotence off; 5 = max in-flight
+    }
+
+    fn enabled_idempotence() -> Arc<IdempotenceManager> {
+        Arc::new(IdempotenceManager::new(true, 5)) // idempotence on; 5 = max in-flight
+    }
+
+    /// A drained batch starts at zero attempts; re-enqueueing and draining it again yields one.
+    #[tokio::test]
+    async fn re_enqueue_increments_attempts() -> Result<()> {
+        let accumulator = RecordAccumulator::new(Config::default(), disabled_idempotence());
+        let table_path = TablePath::new("db".to_string(), "tbl".to_string());
+        let physical_table_path = Arc::new(PhysicalTablePath::of(Arc::new(table_path.clone())));
+        let table_info = Arc::new(build_table_info(table_path.clone(), 1, 1));
+        let cluster = Arc::new(build_cluster(&table_path, 1, 1));
+        let row = GenericRow {
+            values: vec![Datum::Int32(1)],
+        };
+        let record = WriteRecord::for_append(table_info, physical_table_path, 1, &row);
+        accumulator.append(&record, 0, &cluster, false)?;
+
+        let nodes = HashSet::from([cluster.get_tablet_server(1).expect("server").clone()]);
+        // Drains everything for server 1 and hands back the last batch of the result.
+        let drain_one = || -> Result<ReadyWriteBatch> {
+            let mut by_node = accumulator.drain(cluster.clone(), &nodes, 1024 * 1024)?;
+            let mut drained = by_node.remove(&1).expect("drained batches");
+            Ok(drained.pop().expect("batch"))
+        };
+
+        let first = drain_one()?;
+        assert_eq!(first.write_batch.attempts(), 0);
+        accumulator.re_enqueue(first);
+
+        let second = drain_one()?;
+        assert_eq!(second.write_batch.attempts(), 1);
+        Ok(())
+    }
+
+    /// `await_flush_completion` must undo `begin_flush` even when waiting on a handle
+    /// ends in an error.
+    #[tokio::test]
+    async fn flush_counter_decremented_on_error() -> Result<()> {
+        use crate::client::write::broadcast::BroadcastOnce;
+
+        let accumulator = RecordAccumulator::new(Config::default(), disabled_idempotence());
+        let flushes = || accumulator.flushes_in_progress.load(Ordering::SeqCst);
+
+        accumulator.begin_flush();
+        assert_eq!(flushes(), 1);
+
+        // Register a handle whose broadcaster is dropped without ever broadcasting,
+        // so waiting on that handle reports an error.
+        let doomed = BroadcastOnce::default();
+        let receiver = doomed.receiver();
+        let handle = ResultHandle::new(receiver);
+        let permit = accumulator.memory_limiter.acquire(1024).unwrap();
+        accumulator
+            .incomplete_batches
+            .write()
+            .insert(1, (handle, permit));
+        drop(doomed);
+
+        // The flush wait fails ...
+        let outcome = accumulator.await_flush_completion().await;
+        assert!(outcome.is_err());
+
+        // ... yet the counter is back at zero.
+        assert_eq!(flushes(), 0);
+        assert!(!accumulator.flush_in_progress());
+
+        Ok(())
+    }
+
+    /// Appends one single-int row to `bucket_id`, drains server 1, and returns the last batch.
+    fn append_and_drain(
+        accumulator: &RecordAccumulator,
+        cluster: &Arc<crate::cluster::Cluster>,
+        table_path: &TablePath,
+        bucket_id: i32,
+    ) -> Result<ReadyWriteBatch> {
+        let path = Arc::new(table_path.clone());
+        let info = Arc::new(build_table_info(table_path.clone(), 1, 2));
+        let values = vec![Datum::Int32(1)];
+        let row = GenericRow { values };
+        let record = WriteRecord::for_append(info, Arc::new(PhysicalTablePath::of(path)), 1, &row);
+        accumulator.append(&record, bucket_id, cluster, false)?;
+
+        let nodes = HashSet::from([cluster.get_tablet_server(1).expect("server").clone()]);
+        let mut by_node = accumulator.drain(Arc::clone(cluster), &nodes, 1024 * 1024)?;
+        let mut drained = by_node.remove(&1).expect("drained batches");
+        Ok(drained.pop().expect("batch"))
+    }
+
+    /// A batch without a sequence must wait while its bucket is at the in-flight limit.
+    #[test]
+    fn test_should_stop_drain_for_fresh_batch_over_limit() {
+        let idempotence = Arc::new(IdempotenceManager::new(true, 2));
+        idempotence.set_writer_id(42);
+        let accumulator = RecordAccumulator::new(Config::default(), Arc::clone(&idempotence));
+        let table_path = TablePath::new("db".to_string(), "tbl".to_string());
+        let cluster = Arc::new(build_cluster(&table_path, 1, 1));
+        let table_info = Arc::new(build_table_info(table_path.clone(), 1, 1));
+        let physical_table_path = Arc::new(PhysicalTablePath::of(Arc::new(table_path.clone())));
+        let row = GenericRow {
+            values: vec![Datum::Int32(1)],
+        };
+        let record = WriteRecord::for_append(table_info, physical_table_path, 1, &row);
+        accumulator
+            .append(&record, 0, &cluster, false)
+            .expect("append");
+
+        // Two in-flight batches put the bucket exactly at max_in_flight = 2.
+        let table_bucket = TableBucket::new(1, 0);
+        idempotence.add_in_flight_batch(&table_bucket, 0, 100);
+        idempotence.add_in_flight_batch(&table_bucket, 1, 101);
+
+        let entry = accumulator
+            .write_batches
+            .get(&PhysicalTablePath::of(Arc::new(table_path)))
+            .unwrap();
+        let deque = entry.batches.get(&0).unwrap();
+
+        // Runs the stop check against the batch at the head of the deque; the
+        // deque lock is held only for the duration of one call.
+        let front_must_wait = || {
+            let guard = deque.lock();
+            accumulator.should_stop_drain_batches_for_bucket(guard.front().unwrap(), &table_bucket)
+        };
+
+        // The queued batch is fresh (no sequence yet) and the bucket is full: wait.
+        assert!(!deque.lock().front().unwrap().has_batch_sequence());
+        assert!(front_must_wait());
+
+        // One in-flight batch fewer puts the bucket under the limit: go ahead.
+        idempotence.remove_in_flight_batch(&table_bucket, 101);
+        assert!(!front_must_wait());
+    }
+
+    #[test] // A retried batch that is not the oldest in-flight batch of its bucket must wait.
+    fn test_should_stop_drain_for_retry_not_first_inflight() {
+        let idempotence = enabled_idempotence();
+        idempotence.set_writer_id(42);
+        let config = Config::default();
+        let accumulator = RecordAccumulator::new(config, Arc::clone(&idempotence));
+        let table_path = TablePath::new("db".to_string(), "tbl".to_string());
+        let cluster = Arc::new(build_cluster(&table_path, 1, 1));
+
+        // Append and drain twice on bucket 0 to obtain batch0 (seq=0) and batch1 (seq=1).
+        let batch0 =
+            append_and_drain(&accumulator, &cluster, &table_path, 0).expect("drain batch0");
+        let batch1 =
+            append_and_drain(&accumulator, &cluster, &table_path, 0).expect("drain batch1");
+
+        assert_eq!(batch0.write_batch.batch_sequence(), 0);
+        assert_eq!(batch1.write_batch.batch_sequence(), 1);
+
+        let batch1_id = batch1.write_batch.batch_id();
+        let table_bucket = batch0.table_bucket.clone();
+
+        // Re-enqueue only batch1: batch0 stays in flight while batch1 is being retried.
+        accumulator.re_enqueue(batch1);
+
+        let entry = accumulator
+            .write_batches
+            .get(&PhysicalTablePath::of(Arc::new(table_path)))
+            .unwrap();
+        let dq = entry.batches.get(&0).unwrap();
+        let dq_guard = dq.lock();
+        let first_batch = dq_guard.front().unwrap();
+
+        // batch1 heads the deque with seq=1, but the first in-flight batch is batch0 (seq=0).
+        // Its batch_id differs from the first in-flight batch_id, so draining must stop.
+        assert!(first_batch.has_batch_sequence());
+        assert_eq!(first_batch.batch_id(), batch1_id);
+        assert!(accumulator.should_stop_drain_batches_for_bucket(first_batch, &table_bucket));
+    }
+
+    #[tokio::test] // Re-enqueued batches are ordered by sequence and drained one at a time.
+    async fn test_insert_in_sequence_order() -> Result<()> {
+        let idempotence = enabled_idempotence();
+        idempotence.set_writer_id(42);
+        let config = Config::default();
+        let accumulator = RecordAccumulator::new(config, Arc::clone(&idempotence));
+        let table_path = TablePath::new("db".to_string(), "tbl".to_string());
+        let cluster = Arc::new(build_cluster(&table_path, 1, 2));
+
+        // Create and drain 3 batches to get them with sequences 0, 1, 2
+        let batch0 = append_and_drain(&accumulator, &cluster, &table_path, 0)?;
+        let batch1 = append_and_drain(&accumulator, &cluster, &table_path, 0)?;
+        let batch2 = append_and_drain(&accumulator, &cluster, &table_path, 0)?;
+
+        assert_eq!(batch0.write_batch.batch_sequence(), 0);
+        assert_eq!(batch1.write_batch.batch_sequence(), 1);
+        assert_eq!(batch2.write_batch.batch_sequence(), 2);
+
+        let batch0_id = batch0.write_batch.batch_id();
+        let batch1_id = batch1.write_batch.batch_id();
+        let batch2_id = batch2.write_batch.batch_id();
+        let table_bucket = batch0.table_bucket.clone();
+
+        // Re-enqueue out of order: 2, 0, 1.
+        // The deque is expected to end up sorted by sequence: 0, 1, 2.
+        accumulator.re_enqueue(batch2);
+        accumulator.re_enqueue(batch0);
+        accumulator.re_enqueue(batch1);
+
+        // Verify the deque order directly
+        let entry = accumulator
+            .write_batches
+            .get(&PhysicalTablePath::of(Arc::new(table_path)))
+            .unwrap();
+        let dq = entry.batches.get(&0).unwrap();
+        let dq_guard = dq.lock();
+        assert_eq!(dq_guard.len(), 3);
+        // batch0 (seq=0) is the first in-flight, so it should be at front
+        assert_eq!(dq_guard[0].batch_id(), batch0_id);
+        assert_eq!(dq_guard[0].batch_sequence(), 0);
+        assert_eq!(dq_guard[1].batch_id(), batch1_id);
+        assert_eq!(dq_guard[1].batch_sequence(), 1);
+        assert_eq!(dq_guard[2].batch_id(), batch2_id);
+        assert_eq!(dq_guard[2].batch_sequence(), 2);
+        drop(dq_guard);
+
+        // First drain: exactly one batch comes out, and it is the first in-flight one (seq=0).
+        let server = cluster.get_tablet_server(1).expect("server");
+        let nodes = HashSet::from([server.clone()]);
+        let mut batches = accumulator.drain(cluster.clone(), &nodes, 1024 * 1024)?;
+        let drained = batches.remove(&1).expect("drained batches");
+        assert_eq!(drained.len(), 1);
+        assert_eq!(drained[0].write_batch.batch_sequence(), 0);
+
+        // Complete batch0 so batch1 becomes first in-flight
+        idempotence.handle_completed_batch(&table_bucket, batch0_id, 42);
+
+        let mut batches = accumulator.drain(cluster.clone(), &nodes, 1024 * 1024)?;
+        let drained = batches.remove(&1).expect("drained");
+        assert_eq!(drained[0].write_batch.batch_sequence(), 1);
+
+        idempotence.handle_completed_batch(&table_bucket, batch1_id, 42);
+
+        let mut batches = accumulator.drain(cluster, &nodes, 1024 * 1024)?;
+        let drained = batches.remove(&1).expect("drained");
+        assert_eq!(drained[0].write_batch.batch_sequence(), 2);
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_abort_batches() -> Result<()> {
+        let idempotence = disabled_idempotence();
+        let config = Config::default();
+        let accumulator = RecordAccumulator::new(config, Arc::clone(&idempotence));
+        let table_path = TablePath::new("db".to_string(), "tbl".to_string());
+        let physical_table_path = Arc::new(PhysicalTablePath::of(Arc::new(table_path.clone())));
+        let table_info = Arc::new(build_table_info(table_path.clone(), 1, 1));
+        let cluster = Arc::new(build_cluster(&table_path, 1, 1));
+        let row = GenericRow {
+            values: vec![Datum::Int32(1)],
+        };
+        let record = WriteRecord::for_append(table_info, physical_table_path, 1, &row);
+
+        let result = accumulator.append(&record, 0, &cluster, false)?;
+        let handle = result.result_handle.expect("handle");
+        assert!(accumulator.has_incomplete());
+
+        accumulator.abort_batches(broadcast::Error::Client {
+            message: "test abort".to_string(),
+        });
+
+        assert!(!accumulator.has_incomplete());
+        assert!(!accumulator.has_undrained());
+
+        // The handle should receive the error
+        let batch_result = handle.wait().await?;
+        assert!(matches!(
+            batch_result,
+            Err(broadcast::Error::Client { message }) if message == "test abort"
+        ));
+        Ok(())
+    }
+
+    /// A bucket at its in-flight limit is skipped by `drain` without blocking other buckets.
+    #[tokio::test]
+    async fn test_drain_skips_blocked_bucket_continues_others() -> Result<()> {
+        // Use max_in_flight=1 so that one in-flight batch blocks further draining
+        let idempotence = Arc::new(IdempotenceManager::new(true, 1));
+        idempotence.set_writer_id(42);
+        let config = Config::default();
+        let accumulator = RecordAccumulator::new(config, Arc::clone(&idempotence));
+        let table_path = TablePath::new("db".to_string(), "tbl".to_string());
+        let cluster = Arc::new(build_cluster(&table_path, 1, 2));
+
+        let table_info = Arc::new(build_table_info(table_path.clone(), 1, 2));
+        let physical_table_path = Arc::new(PhysicalTablePath::of(Arc::new(table_path.clone())));
+        let row = GenericRow {
+            values: vec![Datum::Int32(1)],
+        };
+        // Appends one fresh record to the given bucket.
+        let append_to = |bucket_id: i32| -> Result<()> {
+            let record =
+                WriteRecord::for_append(table_info.clone(), physical_table_path.clone(), 1, &row);
+            accumulator.append(&record, bucket_id, &cluster, false)?;
+            Ok(())
+        };
+        append_to(0)?;
+        append_to(1)?;
+
+        // Drain once — both buckets get batches assigned with sequences
+        let server = cluster.get_tablet_server(1).expect("server");
+        let nodes = HashSet::from([server.clone()]);
+        let batches = accumulator.drain(cluster.clone(), &nodes, 1024 * 1024)?;
+        let drained = batches.get(&1).expect("drained");
+        // Both buckets should produce batches
+        assert_eq!(drained.len(), 2);
+
+        // Now: both buckets have 1 in-flight each (added during drain).
+        // Append another record to each bucket.
+        append_to(0)?;
+        append_to(1)?;
+
+        // With max_in_flight=1, both buckets are at limit → should_stop returns true
+        // for fresh batches. The drain should skip both (continue, not break).
+        let batches2 = accumulator.drain(cluster.clone(), &nodes, 1024 * 1024)?;
+        // No batches should be drained (both blocked)
+        assert!(
+            batches2.is_empty() || batches2.get(&1).is_none_or(|b| b.is_empty()),
+            "Expected no batches when all buckets are blocked"
+        );
+
+        // Complete the in-flight batch of bucket 0. Look it up by bucket id instead of
+        // assuming it is `drained[0]`, so the test does not depend on drain output order.
+        let bucket0_batch = drained
+            .iter()
+            .find(|batch| batch.table_bucket.bucket_id() == 0)
+            .expect("bucket 0 batch");
+        idempotence.handle_completed_batch(
+            &bucket0_batch.table_bucket,
+            bucket0_batch.write_batch.batch_id(),
+            42,
+        );
+
+        // Now bucket 0 is unblocked but bucket 1 is still blocked
+        let batches3 = accumulator.drain(cluster.clone(), &nodes, 1024 * 1024)?;
+        let drained3 = batches3.get(&1).expect("some drained");
+        // Only bucket 0 should produce a batch (continue skipped bucket 1)
+        assert_eq!(drained3.len(), 1);
+        assert_eq!(drained3[0].table_bucket.bucket_id(), 0);
+
+        Ok(())
+    }
+
+    /// Usage tracked by the limiter rises with each permit and falls as permits are dropped.
+    #[test]
+    fn test_memory_limiter_acquire_release() {
+        let limiter = Arc::new(MemoryLimiter::new(1024, Duration::from_secs(1)));
+        let used = || *limiter.state.lock();
+
+        // Two half-capacity permits fill the limiter completely.
+        let first = limiter.acquire(512).unwrap();
+        let second = limiter.acquire(512).unwrap();
+        assert_eq!(used(), 1024);
+
+        // Every dropped permit hands its bytes back.
+        drop(first);
+        assert_eq!(used(), 512);
+        drop(second);
+        assert_eq!(used(), 0);
+    }
+
+    #[test]
+    fn test_memory_limiter_oversized_batch_fails_immediately() {
+        // A request larger than the whole 1024-byte budget is an `IllegalArgument` error.
+        let limiter = Arc::new(MemoryLimiter::new(1024, Duration::from_secs(60)));
+        let outcome = limiter.acquire(2048);
+        assert!(matches!(outcome.unwrap_err(), Error::IllegalArgument { .. }));
+    }
+
+    /// An acquire that does not fit waits for a release instead of failing.
+    #[test]
+    fn test_memory_limiter_blocks_then_unblocks() {
+        let limiter = Arc::new(MemoryLimiter::new(1024, Duration::from_secs(5)));
+        let used = || *limiter.state.lock();
+        let full_permit = limiter.acquire(1024).unwrap();
+        assert_eq!(used(), 1024);
+        // A second thread asks for memory that is not available yet.
+        let waiter = {
+            let limiter = Arc::clone(&limiter);
+            std::thread::spawn(move || limiter.acquire(512))
+        };
+
+        // After a grace period usage is unchanged: the waiter has been granted nothing.
+        std::thread::sleep(Duration::from_millis(50));
+        assert_eq!(used(), 1024);
+
+        // Releasing the full permit lets the waiter through.
+        drop(full_permit);
+        let result = waiter.join().unwrap();
+        assert!(result.is_ok());
+        let _granted = result.unwrap();
+        assert_eq!(used(), 512);
+    }
+
+    /// With the budget exhausted, `acquire` gives up with `BufferExhausted` after the timeout.
+    #[test]
+    fn test_memory_limiter_timeout() {
+        let limiter = Arc::new(MemoryLimiter::new(1024, Duration::from_millis(100)));
+        let _held = limiter.acquire(1024).unwrap();
+
+        let started = Instant::now();
+        let outcome = limiter.acquire(512);
+        let waited = started.elapsed();
+
+        assert!(matches!(outcome.unwrap_err(), Error::BufferExhausted { .. }));
+        // Configured wait is 100 ms; 80 ms leaves slack for timer granularity.
+        assert!(waited >= Duration::from_millis(80));
+    }
+
+    /// After `close`, `acquire` fails with `WriterClosed` instead of waiting for the timeout.
+    #[test]
+    fn test_memory_limiter_close_fails_immediately() {
+        let limiter = Arc::new(MemoryLimiter::new(1024, Duration::from_secs(60)));
+        let _permit = limiter.acquire(512).unwrap();
+
+        limiter.close();
+
+        let start = Instant::now();
+        let result = limiter.acquire(256);
+        let elapsed = start.elapsed();
+
+        assert!(matches!(result.unwrap_err(), Error::WriterClosed { .. }));
+        // Far below the 60 s timeout, yet loose enough to survive a stalled CI runner.
+        assert!(elapsed < Duration::from_secs(5));
+    }
+
+    /// `close` must wake a thread that is blocked in `acquire` and fail it with `WriterClosed`.
+    #[test]
+    fn test_memory_limiter_close_unblocks_waiting_threads() {
+        let limiter = Arc::new(MemoryLimiter::new(1024, Duration::from_secs(60)));
+        let _permit = limiter.acquire(1024).unwrap(); // fill the limiter completely
+        // Spawn a thread that blocks waiting for memory
+        let limiter2 = Arc::clone(&limiter);
+        let handle = std::thread::spawn(move || {
+            let start = Instant::now();
+            let result = limiter2.acquire(512);
+            (result, start.elapsed())
+        });
+
+        // Give the thread time to block; if it is still not registered after the usual
+        // 50 ms (slow CI), keep polling up to 5 s rather than failing on scheduling alone.
+        let deadline = Instant::now() + Duration::from_secs(5);
+        std::thread::sleep(Duration::from_millis(50));
+        while limiter.waiting_count.load(Ordering::Relaxed) != 1 && Instant::now() < deadline {
+            std::thread::sleep(Duration::from_millis(10));
+        }
+        assert_eq!(limiter.waiting_count.load(Ordering::Relaxed), 1);
+        limiter.close(); // should unblock the waiting thread
+        let (result, elapsed) = handle.join().unwrap();
+        assert!(matches!(result.unwrap_err(), Error::WriterClosed { .. }));
+        assert!(elapsed < Duration::from_secs(5)); // should not wait the full 60s
+    }
+
+    /// Appending a KV record larger than `writer_batch_size` must succeed rather
+    /// than panic.
+    #[test]
+    fn test_oversized_kv_record_does_not_panic() {
+        // Batch size (64 bytes) is deliberately smaller than the record built below.
+        let config = Config {
+            writer_batch_size: 64,
+            writer_buffer_memory_size: 1024 * 1024,
+            ..Config::default()
+        };
+
+        let accumulator = RecordAccumulator::new(config, disabled_idempotence());
+        let table_path = TablePath::new("db".to_string(), "tbl".to_string());
+        let cluster = Arc::new(build_cluster(&table_path, 1, 1));
+        let table_info = Arc::new(build_table_info(table_path.clone(), 1, 1));
+        let physical_table_path = Arc::new(PhysicalTablePath::of(Arc::new(table_path)));
+
+        // 32-byte key plus 256-byte value: the record is larger than the
+        // 64-byte batch size.
+        let key = Bytes::from(vec![0u8; 32]);
+        let value = vec![0u8; 256];
+        let record = WriteRecord::for_upsert(
+            table_info,
+            physical_table_path,
+            1,
+            key,
+            None,
+            WriteFormat::CompactedKv,
+            None,
+            Some(RowBytes::Owned(Bytes::from(value))),
+        );
+
+        // This used to panic with "must append to a new batch" because
+        // the KV write limit was hardcoded to DEFAULT_WRITE_LIMIT (256 bytes)
+        // instead of using alloc_size = max(batch_size, record_size).
+        let outcome = accumulator.append(&record, 0, &cluster, false);
+        assert!(outcome.is_ok(), "oversized KV record should not panic");
+    }
+
+
+    /// The memory permit taken for an oversized record covers the record's estimated
+    /// size, not just `writer_batch_size`.
+    #[test]
+    fn test_memory_permit_accounts_for_oversized_record() {
+        // Batch size (64 bytes) is deliberately smaller than the record built below.
+        let config = Config {
+            writer_batch_size: 64,
+            writer_buffer_memory_size: 1024 * 1024,
+            ..Config::default()
+        };
+        let accumulator = RecordAccumulator::new(config, disabled_idempotence());
+
+        let table_path = TablePath::new("db".to_string(), "tbl".to_string());
+        let cluster = Arc::new(build_cluster(&table_path, 1, 1));
+        let table_info = Arc::new(build_table_info(table_path.clone(), 1, 1));
+        let physical_table_path = Arc::new(PhysicalTablePath::of(Arc::new(table_path)));
+
+        // 32-byte key plus 256-byte value.
+        let key = Bytes::from(vec![0u8; 32]);
+        let value = vec![0u8; 256];
+        let record = WriteRecord::for_upsert(
+            table_info,
+            physical_table_path,
+            1,
+            key,
+            None,
+            WriteFormat::CompactedKv,
+            None,
+            Some(RowBytes::Owned(Bytes::from(value))),
+        );
+
+        // estimated_record_size includes batch header overhead
+        let expected_alloc = record.estimated_record_size();
+        assert!(expected_alloc > 64, "record should exceed batch_size=64");
+
+        accumulator.append(&record, 0, &cluster, false).unwrap();
+
+        // The permit should reserve max(batch_size, estimated_record_size) bytes.
+        let reserved = *accumulator.memory_limiter.state.lock();
+        assert_eq!(
+            reserved, expected_alloc,
+            "memory limiter should reserve max(batch_size, estimated_record_size)"
+        );
+    }
+
+    /// A wakeup issued before the `notified()` future is polled must still complete it.
+    #[tokio::test]
+    async fn test_sender_wakeup_notifies() {
+        let accumulator = RecordAccumulator::new(Config::default(), disabled_idempotence());
+
+        let wakeup = accumulator.notified();
+        accumulator.wakeup_sender();
+
+        // Bounded by a timeout so a lost wakeup fails the test instead of hanging it.
+        let waited = tokio::time::timeout(Duration::from_millis(100), wakeup).await;
+        waited.expect("notified should complete after wakeup_sender");
+    }
+
+    /// After a batch holding a single small row is drained, the next batch reserves less memory.
+    #[test]
+    fn dynamic_batch_size_shrinks_after_small_drained_batch() {
+        let target = 256 * 1024;
+        let config = Config {
+            writer_dynamic_batch_size_enabled: true,
+            writer_batch_size: target,
+            writer_dynamic_batch_size_min: 4 * 1024,
+            writer_buffer_memory_size: 1024 * 1024,
+            ..Config::default()
+        };
+        let accumulator = RecordAccumulator::new(config, disabled_idempotence());
+        let reserved = || *accumulator.memory_limiter.state.lock();
+        let table_path = TablePath::new("db".to_string(), "tbl".to_string());
+        let table_info = Arc::new(build_table_info(table_path.clone(), 1, 1));
+        let physical_table_path = Arc::new(PhysicalTablePath::of(Arc::new(table_path.clone())));
+        let cluster = Arc::new(build_cluster(&table_path, 1, 1));
+        let row = GenericRow {
+            values: vec![Datum::Int32(1)],
+        };
+        let record = WriteRecord::for_append(table_info, physical_table_path, 1, &row);
+        accumulator.append(&record, 0, &cluster, false).unwrap();
+        assert_eq!(reserved(), target as usize);
+
+        let nodes = HashSet::from([cluster.get_tablet_server(1).expect("server").clone()]);
+        let mut by_node = accumulator
+            .drain(cluster.clone(), &nodes, 1024 * 1024)
+            .unwrap();
+        let mut drained = by_node.remove(&1).expect("drained batches");
+        let batch = drained.pop().expect("batch");
+        accumulator.remove_incomplete_batches(batch.write_batch.batch_id());
+        assert_eq!(reserved(), 0);
+
+        accumulator.append(&record, 0, &cluster, false).unwrap();
+        let second = reserved();
+        assert!(second < target as usize, "{second} >= {target}");
+    }
+
+    /// With dynamic batch sizing enabled, the per-table batch-size estimate must
+    /// first drop after a nearly empty batch is drained and then rise again after
+    /// a batch filled to roughly 90% of that estimate is drained.
+    #[test]
+    fn dynamic_batch_size_grows_after_full_drained_batch() {
+        let max = 256 * 1024;
+        let config = Config {
+            writer_dynamic_batch_size_enabled: true,
+            writer_batch_size: max,
+            writer_dynamic_batch_size_min: 4 * 1024,
+            writer_buffer_memory_size: 4 * 1024 * 1024,
+            ..Config::default()
+        };
+        let accumulator = RecordAccumulator::new(config, disabled_idempotence());
+        let table_path = TablePath::new("db".to_string(), "tbl".to_string());
+        let table_info = Arc::new(build_table_info(table_path.clone(), 1, 1));
+        let physical_table_path = Arc::new(PhysicalTablePath::of(Arc::new(table_path.clone())));
+        let cluster = Arc::new(build_cluster(&table_path, 1, 1));
+        let nodes = HashSet::from([cluster.get_tablet_server(1).unwrap().clone()]);
+
+        // Builds an upsert record with a 32-byte zeroed key and a `size`-byte zeroed value.
+        let kv = |size: usize| {
+            WriteRecord::for_upsert(
+                Arc::clone(&table_info),
+                Arc::clone(&physical_table_path),
+                1,
+                Bytes::from(vec![0u8; 32]),
+                None,
+                WriteFormat::CompactedKv,
+                None,
+                Some(RowBytes::Owned(Bytes::from(vec![0u8; size]))),
+            )
+        };
+        // Drains the batch for tablet server 1 and removes it from the incomplete set.
+        let drain_one = || {
+            let mut d = accumulator.drain(cluster.clone(), &nodes, max).unwrap();
+            let b = d.remove(&1).unwrap().pop().unwrap();
+            accumulator.remove_incomplete_batches(b.write_batch.batch_id());
+        };
+        // Reads the accumulator's current batch-size estimate for this table.
+        let target = || {
+            accumulator
+                .estimated_batch_size(&physical_table_path)
+                .unwrap()
+        };
+
+        // Phase 1: a 1-byte value leaves the drained batch nearly empty, so the
+        // estimate is expected to fall below `max`.
+        accumulator.append(&kv(1), 0, &cluster, false).unwrap();
+        drain_one();
+        let after_shrink = target();
+        assert!(
+            after_shrink < max as usize,
+            "shrink failed: after_shrink={after_shrink} max={max}"
+        );
+
+        // Phase 2: fill a batch to about 90% of the shrunken estimate and expect growth.
+        // 0.9 sits safely above GROW_THRESHOLD (0.8) to avoid f64 boundary noise.
+        accumulator
+            .append(&kv(after_shrink * 9 / 10), 0, &cluster, false)
+            .unwrap();
+        drain_one();
+        let after_grow = target();
+        assert!(
+            after_grow > after_shrink,
+            "grow failed: after_grow={after_grow} after_shrink={after_shrink}"
+        );
+    }
+
+    /// With dynamic batch sizing disabled, every append must reserve exactly the
+    /// configured `writer_batch_size`, no matter how small the drained batches are.
+    #[test]
+    fn dynamic_batch_size_disabled_keeps_static_target() {
+        let target = 256 * 1024;
+        let config = Config {
+            writer_dynamic_batch_size_enabled: false,
+            writer_batch_size: target,
+            writer_dynamic_batch_size_min: 4 * 1024,
+            writer_buffer_memory_size: 1024 * 1024,
+            ..Config::default()
+        };
+        let accumulator = RecordAccumulator::new(config, disabled_idempotence());
+        let table_path = TablePath::new("db".to_string(), "tbl".to_string());
+        let table_info = Arc::new(build_table_info(table_path.clone(), 1, 1));
+        let physical_table_path = Arc::new(PhysicalTablePath::of(Arc::new(table_path.clone())));
+        let cluster = Arc::new(build_cluster(&table_path, 1, 1));
+        let row = GenericRow {
+            values: vec![Datum::Int32(1)],
+        };
+        let record = WriteRecord::for_append(table_info, physical_table_path, 1, &row);
+
+        let server = cluster.get_tablet_server(1).expect("server");
+        let nodes = HashSet::from([server.clone()]);
+        // Repeat the append/drain cycle three times; the reservation observed right
+        // after each append must stay equal to `target`.
+        for _ in 0..3 {
+            accumulator.append(&record, 0, &cluster, false).unwrap();
+            assert_eq!(*accumulator.memory_limiter.state.lock(), target as usize);
+
+            // Drain the single-record batch and remove it from the incomplete set
+            // before the next iteration appends again.
+            let mut drained = accumulator
+                .drain(cluster.clone(), &nodes, 1024 * 1024)
+                .unwrap();
+            let mut batches = drained.remove(&1).expect("drained batches");
+            let batch = batches.pop().expect("batch");
+            accumulator.remove_incomplete_batches(batch.write_batch.batch_id());
+        }
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/client/write/batch.rs b/fluss-rust/crates/fluss/src/client/write/batch.rs
new file mode 100644
index 0000000000..fd70cb9715
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/client/write/batch.rs
@@ -0,0 +1,790 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::client::broadcast::{BatchWriteResult, BroadcastOnce};
+use crate::client::{Record, ResultHandle, WriteRecord};
+use crate::compression::{ArrowCompressionInfo, ArrowCompressionRatioEstimator};
+use crate::error::{Error, Result};
+use crate::metadata::{KvFormat, PhysicalTablePath, RowType};
+use crate::record::MemoryLogRecordsArrowBuilder;
+use crate::record::kv::KvRecordBatchBuilder;
+use crate::record::{NO_BATCH_SEQUENCE, NO_WRITER_ID};
+use bytes::Bytes;
+use std::cmp::max;
+use std::sync::Arc;
+use std::sync::atomic::{AtomicBool, AtomicI32, Ordering};
+
+/// Bookkeeping shared by every write-batch variant: identity, timing, retry
+/// count, completion state and writer id / batch sequence.
+pub struct InnerWriteBatch {
+    /// Identifier supplied at construction.
+    batch_id: i64,
+    /// Physical table path supplied at construction.
+    physical_table_path: Arc<PhysicalTablePath>,
+    /// Creation timestamp in milliseconds; the baseline for `waited_time_ms`.
+    create_ms: i64,
+    /// One-shot broadcast through which `complete` publishes the write result.
+    results: BroadcastOnce<BatchWriteResult>,
+    /// Flipped from `false` to `true` by the first successful `complete` call.
+    completed: AtomicBool,
+    /// Incremented by one on every `re_enqueued` call; starts at 0.
+    attempts: AtomicI32,
+    /// Largest timestamp passed to `drained`; `-1` until the first call.
+    drained_ms: i64,
+    /// Batch sequence; `NO_BATCH_SEQUENCE` until writer state is assigned.
+    batch_sequence: i32,
+    /// Writer id; `NO_WRITER_ID` until writer state is assigned.
+    writer_id: i64,
+}
+
+impl InnerWriteBatch {
+    /// Creates the bookkeeping for a new batch: not completed, zero attempts,
+    /// never drained (`drained_ms == -1`) and no writer state assigned.
+    fn new(batch_id: i64, physical_table_path: Arc<PhysicalTablePath>, create_ms: i64) -> Self {
+        InnerWriteBatch {
+            batch_id,
+            physical_table_path,
+            create_ms,
+            results: Default::default(),
+            completed: AtomicBool::new(false),
+            attempts: AtomicI32::new(0),
+            drained_ms: -1,
+            batch_sequence: NO_BATCH_SEQUENCE,
+            writer_id: NO_WRITER_ID,
+        }
+    }
+
+    /// Returns the batch sequence, or `NO_BATCH_SEQUENCE` if none was assigned.
+    pub fn batch_sequence(&self) -> i32 {
+        self.batch_sequence
+    }
+
+    /// Returns the writer id, or `NO_WRITER_ID` if none was assigned.
+    pub fn writer_id(&self) -> i64 {
+        self.writer_id
+    }
+
+    /// Returns `true` once a batch sequence other than `NO_BATCH_SEQUENCE` is set.
+    pub fn has_batch_sequence(&self) -> bool {
+        self.batch_sequence != NO_BATCH_SEQUENCE
+    }
+
+    /// Milliseconds elapsed between `create_ms` and `now`, never negative.
+    ///
+    /// The subtraction saturates, so extreme timestamp pairs cannot overflow
+    /// `i64` (a plain `-` would panic in debug builds and wrap in release builds).
+    fn waited_time_ms(&self, now: i64) -> i64 {
+        max(0i64, now.saturating_sub(self.create_ms))
+    }
+
+    /// Publishes `write_result` to all result receivers exactly once.
+    ///
+    /// Returns `true` if this call completed the batch and `false` if the batch
+    /// had already been completed, in which case `write_result` is discarded.
+    fn complete(&self, write_result: BatchWriteResult) -> bool {
+        // Only the caller that flips `completed` from false to true may broadcast.
+        if self
+            .completed
+            .compare_exchange(false, true, Ordering::AcqRel, Ordering::Acquire)
+            .is_err()
+        {
+            return false;
+        }
+        self.results.broadcast(write_result);
+        true
+    }
+
+    /// Records a drain at `now_ms`; the stored timestamp never moves backwards.
+    fn drained(&mut self, now_ms: i64) {
+        self.drained_ms = max(self.drained_ms, now_ms);
+    }
+
+    /// Returns the physical table path supplied at construction.
+    fn physical_table_path(&self) -> &Arc<PhysicalTablePath> {
+        &self.physical_table_path
+    }
+
+    /// Returns how many times `re_enqueued` has been called.
+    fn attempts(&self) -> i32 {
+        self.attempts.load(Ordering::Acquire)
+    }
+
+    /// Increments the attempt counter by one.
+    fn re_enqueued(&self) {
+        self.attempts.fetch_add(1, Ordering::AcqRel);
+    }
+
+    /// Returns `true` once `complete` has succeeded.
+    fn is_done(&self) -> bool {
+        self.completed.load(Ordering::Acquire)
+    }
+}
+
+/// A write batch in one of two flavours; the methods on this enum dispatch to
+/// the wrapped batch or to its shared `InnerWriteBatch`.
+pub enum WriteBatch {
+    /// Batch backed by an `ArrowLogWriteBatch`.
+    ArrowLog(ArrowLogWriteBatch),
+    /// Batch backed by a `KvWriteBatch`.
+    Kv(KvWriteBatch),
+}
+
+impl WriteBatch {
+    /// Borrows the bookkeeping state shared by both variants.
+    pub fn inner_batch(&self) -> &InnerWriteBatch {
+        match self {
+            Self::ArrowLog(log) => &log.write_batch,
+            Self::Kv(kv) => &kv.write_batch,
+        }
+    }
+
+    /// Mutably borrows the bookkeeping state shared by both variants.
+    pub fn inner_batch_mut(&mut self) -> &mut InnerWriteBatch {
+        match self {
+            Self::ArrowLog(log) => &mut log.write_batch,
+            Self::Kv(kv) => &mut kv.write_batch,
+        }
+    }
+
+    /// Forwards the record to the wrapped batch's `try_append`.
+    pub fn try_append(&mut self, write_record: &WriteRecord) -> Result<Option<ResultHandle>> {
+        match self {
+            Self::ArrowLog(log) => log.try_append(write_record),
+            Self::Kv(kv) => kv.try_append(write_record),
+        }
+    }
+
+    /// Milliseconds this batch has waited since creation, measured at `now`.
+    pub fn waited_time_ms(&self, now: i64) -> i64 {
+        let inner = self.inner_batch();
+        inner.waited_time_ms(now)
+    }
+
+    /// Closes the wrapped batch. The Arrow variant always reports success;
+    /// the KV variant returns whatever its own `close` returns.
+    pub fn close(&mut self) -> Result<()> {
+        match self {
+            Self::ArrowLog(log) => {
+                log.close();
+                Ok(())
+            }
+            Self::Kv(kv) => kv.close(),
+        }
+    }
+
+    /// Size estimate, in bytes, reported by the wrapped batch.
+    pub fn estimated_size_in_bytes(&self) -> usize {
+        match self {
+            Self::ArrowLog(log) => log.estimated_size_in_bytes(),
+            Self::Kv(kv) => kv.estimated_size_in_bytes(),
+        }
+    }
+
+    /// Whether the wrapped batch reports itself as closed.
+    pub fn is_closed(&self) -> bool {
+        match self {
+            Self::ArrowLog(log) => log.is_closed(),
+            Self::Kv(kv) => kv.is_closed(),
+        }
+    }
+
+    /// Records on the shared state that the batch was drained at `now_ms`.
+    pub fn drained(&mut self, now_ms: i64) {
+        let inner = self.inner_batch_mut();
+        inner.drained(now_ms);
+    }
+
+    /// Builds the wrapped batch into its serialized bytes.
+    pub fn build(&mut self) -> Result<Bytes> {
+        match self {
+            Self::ArrowLog(log) => log.build(),
+            Self::Kv(kv) => kv.build(),
+        }
+    }
+
+    /// Completes the batch with `write_result`; `false` means it was already completed.
+    pub fn complete(&self, write_result: BatchWriteResult) -> bool {
+        let inner = self.inner_batch();
+        inner.complete(write_result)
+    }
+
+    /// Identifier assigned to the batch at construction.
+    pub fn batch_id(&self) -> i64 {
+        let inner = self.inner_batch();
+        inner.batch_id
+    }
+
+    /// Physical table path assigned to the batch at construction.
+    pub fn physical_table_path(&self) -> &Arc<PhysicalTablePath> {
+        let inner = self.inner_batch();
+        inner.physical_table_path()
+    }
+
+    /// Number of times the batch has been re-enqueued.
+    pub fn attempts(&self) -> i32 {
+        let inner = self.inner_batch();
+        inner.attempts()
+    }
+
+    /// Bumps the re-enqueue counter on the shared state.
+    pub fn re_enqueued(&self) {
+        let inner = self.inner_batch();
+        inner.re_enqueued();
+    }
+
+    /// Whether the batch has already been completed.
+    pub fn is_done(&self) -> bool {
+        let inner = self.inner_batch();
+        inner.is_done()
+    }
+
+    /// Batch sequence stored on the shared state.
+    pub fn batch_sequence(&self) -> i32 {
+        let inner = self.inner_batch();
+        inner.batch_sequence()
+    }
+
+    /// Writer id stored on the shared state.
+    pub fn writer_id(&self) -> i64 {
+        let inner = self.inner_batch();
+        inner.writer_id()
+    }
+
+    /// Whether a batch sequence has been assigned.
+    pub fn has_batch_sequence(&self) -> bool {
+        let inner = self.inner_batch();
+        inner.has_batch_sequence()
+    }
+
+    /// Forwards the writer id and base sequence to the wrapped batch.
+    pub fn set_writer_state(&mut self, writer_id: i64, batch_base_sequence: i32) {
+        match self {
+            Self::ArrowLog(log) => log.set_writer_state(writer_id, batch_base_sequence),
+            Self::Kv(kv) => kv.set_writer_state(writer_id, batch_base_sequence),
+        }
+    }
+}
+
+/// Write batch that accumulates records through a `MemoryLogRecordsArrowBuilder`
+/// and caches the serialized bytes once built.
+pub struct ArrowLogWriteBatch {
+    /// Bookkeeping shared with the other batch variant.
+    pub write_batch: InnerWriteBatch,
+    /// Builder that receives appended records and produces the serialized batch.
+    pub arrow_builder: MemoryLogRecordsArrowBuilder,
+    /// Cached output of `build`; reset to `None` whenever writer state changes.
+    built_records: Option<Bytes>,
+}
+
+impl ArrowLogWriteBatch {
+    /// Creates an empty batch together with its Arrow builder.
+    ///
+    /// Fails if `MemoryLogRecordsArrowBuilder::new` fails.
+    #[allow(clippy::too_many_arguments)]
+    pub fn new(
+        batch_id: i64,
+        physical_table_path: Arc<PhysicalTablePath>,
+        schema_id: i32,
+        arrow_compression_info: ArrowCompressionInfo,
+        row_type: &RowType,
+        create_ms: i64,
+        to_append_record_batch: bool,
+        write_limit: usize,
+        compression_ratio_estimator: Arc<ArrowCompressionRatioEstimator>,
+    ) -> Result<Self> {
+        let write_batch = InnerWriteBatch::new(batch_id, physical_table_path, create_ms);
+        let arrow_builder = MemoryLogRecordsArrowBuilder::new(
+            schema_id,
+            row_type,
+            to_append_record_batch,
+            arrow_compression_info,
+            write_limit,
+            compression_ratio_estimator,
+        )?;
+        Ok(Self {
+            write_batch,
+            arrow_builder,
+            built_records: None,
+        })
+    }
+
+    /// Identifier assigned to this batch at construction.
+    pub fn batch_id(&self) -> i64 {
+        self.write_batch.batch_id
+    }
+
+    /// Tries to add `write_record` to the batch.
+    ///
+    /// Returns `Ok(None)` when the builder is closed, full, or declines the
+    /// record; otherwise returns a handle tied to this batch's result broadcast.
+    pub fn try_append(&mut self, write_record: &WriteRecord) -> Result<Option<ResultHandle>> {
+        // A closed or full builder takes no more records.
+        if self.arrow_builder.is_closed() || self.arrow_builder.is_full() {
+            return Ok(None);
+        }
+
+        let accepted = self.arrow_builder.append(write_record)?;
+        if !accepted {
+            // The builder declined the record.
+            return Ok(None);
+        }
+
+        let receiver = self.write_batch.results.receiver();
+        Ok(Some(ResultHandle::new(receiver)))
+    }
+
+    /// Stores the writer id and base sequence on both the builder and the
+    /// shared state, and drops any cached build output so the next `build`
+    /// call runs the builder again.
+    pub fn set_writer_state(&mut self, writer_id: i64, batch_base_sequence: i32) {
+        self.arrow_builder
+            .set_writer_state(writer_id, batch_base_sequence);
+        let shared = &mut self.write_batch;
+        shared.batch_sequence = batch_base_sequence;
+        shared.writer_id = writer_id;
+        self.built_records = None;
+    }
+
+    /// Returns the serialized batch, building it on first use and serving the
+    /// cached bytes on later calls.
+    pub fn build(&mut self) -> Result<Bytes> {
+        match self.built_records.clone() {
+            Some(cached) => Ok(cached),
+            None => {
+                let fresh = Bytes::from(self.arrow_builder.build()?);
+                self.built_records = Some(fresh.clone());
+                Ok(fresh)
+            }
+        }
+    }
+
+    /// Whether the underlying Arrow builder is closed.
+    pub fn is_closed(&self) -> bool {
+        self.arrow_builder.is_closed()
+    }
+
+    /// Closes the underlying Arrow builder.
+    pub fn close(&mut self) {
+        self.arrow_builder.close()
+    }
+
+    /// Size of the batch in bytes: the exact length of the built bytes when a
+    /// build is cached, otherwise the Arrow builder's estimate.
+    pub fn estimated_size_in_bytes(&self) -> usize {
+        match &self.built_records {
+            Some(built) => built.len(),
+            None => self.arrow_builder.estimated_size_in_bytes(),
+        }
+    }
+}
+
+/// Write batch that accumulates key/value rows through a `KvRecordBatchBuilder`.
+pub struct KvWriteBatch {
+    /// Bookkeeping shared with the other batch variant.
+    write_batch: InnerWriteBatch,
+    /// Builder that receives appended rows and produces the serialized batch.
+    kv_batch_builder: KvRecordBatchBuilder,
+    /// Target columns every appended record must match exactly (`None` included).
+    target_columns: Option<Arc<Vec<usize>>>,
+    /// Schema id every appended record must carry.
+    schema_id: i32,
+}
+
+impl KvWriteBatch {
+    /// Creates an empty KV batch bound to one schema id and one set of target columns.
+    #[allow(clippy::too_many_arguments)]
+    pub fn new(
+        batch_id: i64,
+        physical_table_path: Arc<PhysicalTablePath>,
+        schema_id: i32,
+        write_limit: usize,
+        kv_format: KvFormat,
+        target_columns: Option<Arc<Vec<usize>>>,
+        create_ms: i64,
+    ) -> Self {
+        Self {
+            write_batch: InnerWriteBatch::new(batch_id, physical_table_path, create_ms),
+            kv_batch_builder: KvRecordBatchBuilder::new(schema_id, write_limit, kv_format),
+            target_columns,
+            schema_id,
+        }
+    }
+
+    /// Tries to add a KV record to the batch.
+    ///
+    /// Returns `Ok(None)` when the batch is closed or the builder has no room
+    /// for the row, and `Ok(Some(handle))` once the row has been appended.
+    ///
+    /// # Errors
+    /// * `UnsupportedOperation` if the record is not a KV record.
+    /// * `UnexpectedError` if the schema id or target columns differ from the
+    ///   batch's, or if the builder fails to append the row.
+    pub fn try_append(&mut self, write_record: &WriteRecord) -> Result<Option<ResultHandle>> {
+        let kv_record = if let Record::Kv(kv) = &write_record.record {
+            kv
+        } else {
+            return Err(Error::UnsupportedOperation {
+                message: "Only KvRecord to append to KvWriteBatch ".to_string(),
+            });
+        };
+
+        let key = kv_record.key.as_ref();
+
+        // Every record in a batch must share the batch's schema id.
+        if self.schema_id != write_record.schema_id {
+            let message = format!(
+                "schema id {} of the write record to append is not the same as the current schema id {} in the batch.",
+                write_record.schema_id, self.schema_id
+            );
+            return Err(Error::UnexpectedError {
+                message,
+                source: None,
+            });
+        }
+
+        // Every record in a batch must also share the batch's target columns.
+        if self.target_columns != kv_record.target_columns {
+            let message = format!(
+                "target columns {:?} of the write record to append are not the same as the current target columns {:?} in the batch.",
+                kv_record.target_columns,
+                self.target_columns.as_deref()
+            );
+            return Err(Error::UnexpectedError {
+                message,
+                source: None,
+            });
+        }
+
+        let row_bytes = kv_record.row_bytes();
+
+        // A closed batch, or one without room for this row, takes nothing more.
+        if self.is_closed() || !self.kv_batch_builder.has_room_for_row(key, row_bytes) {
+            return Ok(None);
+        }
+
+        self.kv_batch_builder
+            .append_row(key, row_bytes)
+            .map_err(|cause| Error::UnexpectedError {
+                message: "Failed to append row to KvWriteBatch".to_string(),
+                source: Some(Box::new(cause)),
+            })?;
+
+        let receiver = self.write_batch.results.receiver();
+        Ok(Some(ResultHandle::new(receiver)))
+    }
+
+    /// Builds the serialized batch by delegating to the KV builder.
+    pub fn build(&mut self) -> Result<Bytes> {
+        self.kv_batch_builder.build()
+    }
+
+    /// Whether the underlying KV builder is closed.
+    pub fn is_closed(&self) -> bool {
+        self.kv_batch_builder.is_closed()
+    }
+
+    /// Closes the underlying KV builder, returning its result.
+    pub fn close(&mut self) -> Result<()> {
+        self.kv_batch_builder.close()
+    }
+
+    /// Stores the writer id and base sequence on both the builder and the shared state.
+    pub fn set_writer_state(&mut self, writer_id: i64, batch_base_sequence: i32) {
+        self.kv_batch_builder
+            .set_writer_state(writer_id, batch_base_sequence);
+        let shared = &mut self.write_batch;
+        shared.batch_sequence = batch_base_sequence;
+        shared.writer_id = writer_id;
+    }
+
+    /// Target columns this batch was created with, if any.
+    pub fn target_columns(&self) -> Option<&Arc<Vec<usize>>> {
+        self.target_columns.as_ref()
+    }
+
+    /// Current size in bytes as reported by the KV builder's `get_size_in_bytes`.
+    pub fn estimated_size_in_bytes(&self) -> usize {
+        self.kv_batch_builder.get_size_in_bytes()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::client::{RowBytes, WriteFormat};
+    use crate::metadata::TablePath;
+    use crate::test_utils::build_table_info;
+
+    /// The first `complete` call is accepted; a second call is rejected.
+    #[test]
+    fn complete_only_once() {
+        let physical_path = Arc::new(PhysicalTablePath::of(Arc::new(TablePath::new(
+            "db".to_string(),
+            "tbl".to_string(),
+        ))));
+        let batch = InnerWriteBatch::new(1, physical_path, 0);
+
+        let first_accepted = batch.complete(Ok(()));
+        assert!(first_accepted);
+
+        let second_accepted = batch.complete(Err(crate::client::broadcast::Error::Dropped));
+        assert!(!second_accepted);
+    }
+
+    /// A fresh batch reports zero attempts; each `re_enqueued` call adds one.
+    #[test]
+    fn attempts_increment_on_reenqueue() {
+        let physical_path = Arc::new(PhysicalTablePath::of(Arc::new(TablePath::new(
+            "db".to_string(),
+            "tbl".to_string(),
+        ))));
+        let batch = InnerWriteBatch::new(1, physical_path, 0);
+
+        let before = batch.attempts();
+        assert_eq!(before, 0);
+
+        batch.re_enqueued();
+
+        let after = batch.attempts();
+        assert_eq!(after, 1);
+    }
+
+    /// Checks, for both Arrow builder modes (row-by-row append and a pre-built
+    /// `RecordBatch`), that the size estimate taken before `build()` is within
+    /// 10% of the length of the bytes `build()` returns.
+    #[test]
+    fn test_arrow_log_write_batch_estimated_size() {
+        use crate::client::WriteRecord;
+        use crate::compression::{
+            ArrowCompressionInfo, ArrowCompressionType, DEFAULT_NON_ZSTD_COMPRESSION_LEVEL,
+        };
+        use crate::metadata::{DataField, DataTypes, RowType};
+        use crate::row::GenericRow;
+        use arrow::array::{Int32Array, RecordBatch, StringArray};
+        use std::sync::Arc;
+
+        // Two-column schema: an int `id` and a string `name`.
+        let row_type = RowType::new(vec![
+            DataField::new("id".to_string(), DataTypes::int(), None),
+            DataField::new("name".to_string(), DataTypes::string(), None),
+        ]);
+        let table_path = TablePath::new("db".to_string(), "tbl".to_string());
+        let table_info = Arc::new(build_table_info(table_path.clone(), 1, 1));
+        let physical_table_path = Arc::new(PhysicalTablePath::of(Arc::new(table_path)));
+
+        // Test 1: RowAppendRecordBatchBuilder (to_append_record_batch=false)
+        {
+            // Uncompressed batch with a 2 MiB write limit.
+            let mut batch = ArrowLogWriteBatch::new(
+                1,
+                Arc::clone(&physical_table_path),
+                1,
+                ArrowCompressionInfo {
+                    compression_type: ArrowCompressionType::None,
+                    compression_level: DEFAULT_NON_ZSTD_COMPRESSION_LEVEL,
+                },
+                &row_type,
+                0,
+                false,
+                2 * 1024 * 1024,
+                Arc::new(ArrowCompressionRatioEstimator::default()),
+            )
+            .unwrap();
+
+            // Append 200 identical rows one at a time.
+            for _ in 0..200 {
+                let mut row = GenericRow::new(2);
+                row.set_field(0, 1_i32);
+                row.set_field(1, "hello");
+                let record = WriteRecord::for_append(
+                    Arc::clone(&table_info),
+                    Arc::clone(&physical_table_path),
+                    1,
+                    &row,
+                );
+                batch.try_append(&record).unwrap();
+            }
+
+            // Take the estimate before building so it comes from the builder,
+            // not from the cached built bytes.
+            let estimated_size = batch.estimated_size_in_bytes();
+            assert!(estimated_size > 0);
+
+            let built_data = batch.build().unwrap();
+            let actual_size = built_data.len();
+
+            let diff = actual_size.abs_diff(estimated_size);
+            let threshold = actual_size / 10; // 10% tolerance
+            assert!(
+                diff <= threshold,
+                "RowAppend: estimated_size {estimated_size} and actual_size {actual_size} differ by more than 10%"
+            );
+        }
+
+        // Test 2: PrebuiltRecordBatchBuilder (to_append_record_batch=true)
+        {
+            // Same configuration as above, except the batch takes a pre-built RecordBatch.
+            let mut batch = ArrowLogWriteBatch::new(
+                1,
+                physical_table_path.clone(),
+                1,
+                ArrowCompressionInfo {
+                    compression_type: ArrowCompressionType::None,
+                    compression_level: DEFAULT_NON_ZSTD_COMPRESSION_LEVEL,
+                },
+                &row_type,
+                0,
+                true,
+                2 * 1024 * 1024,
+                Arc::new(ArrowCompressionRatioEstimator::default()),
+            )
+            .unwrap();
+
+            // Create a pre-built RecordBatch with 200 rows: ids 0..200 and the name "hello".
+            let schema = crate::record::to_arrow_schema(&row_type).unwrap();
+            let ids: Vec<i32> = (0..200).collect();
+            let names: Vec<&str> = (0..200).map(|_| "hello").collect();
+            let record_batch = RecordBatch::try_new(
+                schema,
+                vec![
+                    Arc::new(Int32Array::from(ids)),
+                    Arc::new(StringArray::from(names)),
+                ],
+            )
+            .unwrap();
+
+            // Append the whole RecordBatch as a single write record.
+            let record = WriteRecord::for_append_record_batch(
+                Arc::clone(&table_info),
+                Arc::clone(&physical_table_path),
+                1,
+                record_batch,
+            );
+            batch.try_append(&record).unwrap();
+
+            // As in Test 1, estimate first and build second.
+            let estimated_size = batch.estimated_size_in_bytes();
+            assert!(estimated_size > 0);
+
+            let built_data = batch.build().unwrap();
+            let actual_size = built_data.len();
+
+            let diff = actual_size.abs_diff(estimated_size);
+            let threshold = actual_size / 10; // 10% tolerance
+            assert!(
+                diff <= threshold,
+                "Prebuilt: estimated_size {estimated_size} and actual_size {actual_size} differ by more than 10%"
+            );
+        }
+    }
+
+    /// For a KV batch, the size reported before `build()` must equal the exact
+    /// length of the bytes that `build()` returns.
+    #[test]
+    fn test_kv_write_batch_estimated_size() {
+        use crate::metadata::KvFormat;
+
+        let path = TablePath::new("db".to_string(), "tbl".to_string());
+        let info = Arc::new(build_table_info(path.clone(), 1, 1));
+        let physical = Arc::new(PhysicalTablePath::of(Arc::new(path)));
+
+        // Batch id 1, schema id 1, 256-byte write limit, no target columns, created at t=0.
+        let mut kv_batch = KvWriteBatch::new(
+            1,
+            Arc::clone(&physical),
+            1,
+            256,
+            KvFormat::COMPACTED,
+            None,
+            0,
+        );
+
+        // Offer 200 identical 3-byte-key / 3-byte-value upserts; the append
+        // outcome (accepted or not) is deliberately ignored, only errors fail.
+        for _attempt in 0..200 {
+            let key = Bytes::from(vec![1_u8, 2_u8, 3_u8]);
+            let value = Bytes::from(vec![1_u8, 2_u8, 3_u8]);
+            let upsert = WriteRecord::for_upsert(
+                Arc::clone(&info),
+                Arc::clone(&physical),
+                1,
+                key,
+                None,
+                WriteFormat::CompactedKv,
+                None,
+                Some(RowBytes::Owned(value)),
+            );
+            kv_batch.try_append(&upsert).unwrap();
+        }
+
+        let estimated_size = kv_batch.estimated_size_in_bytes();
+        let built = kv_batch.build().unwrap();
+        let actual_size = built.len();
+
+        assert_eq!(
+            actual_size, estimated_size,
+            "estimated size {estimated_size} is not equal to actual size"
+        );
+    }
+
+    /// Verifies byte-size-based fullness:
+    /// 1. Without compression, the built size stays within the configured
+    ///    write limit plus 20% slack.
+    /// 2. Old 256-record cap is gone — a 2 MiB batch accepts 1000 small rows.
+    /// 3. Compression feedback loop: the shared estimator drops below 1.0 after
+    ///    a ZSTD `build()`, and a second batch using the same estimator accepts
+    ///    more than 500 records.
+    #[test]
+    fn test_arrow_batch_byte_size_fullness() {
+        use crate::client::WriteRecord;
+        use crate::compression::{
+            ArrowCompressionInfo, ArrowCompressionRatioEstimator, ArrowCompressionType,
+            DEFAULT_NON_ZSTD_COMPRESSION_LEVEL,
+        };
+        use crate::metadata::{DataField, DataTypes, RowType};
+        use crate::row::GenericRow;
+        use std::sync::Arc;
+
+        // Two-column schema (int `id`, string `name`) used by parts 1 and 3.
+        let row_type = RowType::new(vec![
+            DataField::new("id".to_string(), DataTypes::int(), None),
+            DataField::new("name".to_string(), DataTypes::string(), None),
+        ]);
+        let table_path = TablePath::new("db".to_string(), "tbl".to_string());
+        let table_info = Arc::new(build_table_info(table_path.clone(), 1, 1));
+        let physical_table_path = Arc::new(PhysicalTablePath::of(Arc::new(table_path)));
+
+        // --- Part 1: actual built size stays within limit (uncompressed) ---
+        let write_limit: usize = 16 * 1024;
+        let mut batch = ArrowLogWriteBatch::new(
+            1,
+            Arc::clone(&physical_table_path),
+            1,
+            ArrowCompressionInfo {
+                compression_type: ArrowCompressionType::None,
+                compression_level: DEFAULT_NON_ZSTD_COMPRESSION_LEVEL,
+            },
+            &row_type,
+            0,
+            false,
+            write_limit,
+            Arc::new(ArrowCompressionRatioEstimator::default()),
+        )
+        .unwrap();
+
+        // Keep appending until the batch refuses a record; the 100_000 upper
+        // bound only guards against a batch that never fills.
+        let mut appended = 0;
+        for i in 0..100_000 {
+            let mut row = GenericRow::new(2);
+            row.set_field(0, i);
+            row.set_field(1, "hello_world");
+            let record = WriteRecord::for_append(
+                Arc::clone(&table_info),
+                Arc::clone(&physical_table_path),
+                1,
+                &row,
+            );
+            match batch.try_append(&record).unwrap() {
+                Some(_) => appended += 1,
+                None => break,
+            }
+        }
+
+        assert!(
+            appended > 0 && appended < 100_000,
+            "batch should have filled, appended: {appended}"
+        );
+        // The built bytes may exceed the limit by at most 20%.
+        let built = batch.build().unwrap();
+        assert!(
+            built.len() <= write_limit * 120 / 100,
+            "actual size {} exceeds write_limit {write_limit} by more than 20%",
+            built.len()
+        );
+
+        // --- Part 2: old 256-record cap is gone ---
+        // Single int column so each row is as small as possible.
+        let row_type_small = RowType::new(vec![DataField::new(
+            "id".to_string(),
+            DataTypes::int(),
+            None,
+        )]);
+        let mut batch = ArrowLogWriteBatch::new(
+            2,
+            Arc::clone(&physical_table_path),
+            1,
+            ArrowCompressionInfo {
+                compression_type: ArrowCompressionType::None,
+                compression_level: DEFAULT_NON_ZSTD_COMPRESSION_LEVEL,
+            },
+            &row_type_small,
+            0,
+            false,
+            2 * 1024 * 1024,
+            Arc::new(ArrowCompressionRatioEstimator::default()),
+        )
+        .unwrap();
+
+        // All 1000 rows must be accepted by the 2 MiB batch.
+        let mut appended = 0;
+        for i in 0..1000 {
+            let mut row = GenericRow::new(1);
+            row.set_field(0, i);
+            let record = WriteRecord::for_append(
+                Arc::clone(&table_info),
+                Arc::clone(&physical_table_path),
+                1,
+                &row,
+            );
+            match batch.try_append(&record).unwrap() {
+                Some(_) => appended += 1,
+                None => break,
+            }
+        }
+        assert_eq!(appended, 1000, "2MB batch should fit 1000 tiny rows");
+
+        // --- Part 3: compression feedback loop ---
+        // A fresh estimator starts at a ratio of exactly 1.0.
+        let estimator = Arc::new(ArrowCompressionRatioEstimator::default());
+        assert_eq!(estimator.estimation(), 1.0);
+
+        let write_limit = 64 * 1024;
+        let compression = ArrowCompressionInfo {
+            compression_type: ArrowCompressionType::Zstd,
+            compression_level: 3,
+        };
+
+        // First batch: append up to 500 rows (stopping early if full) and build with ZSTD.
+        let mut batch1 = ArrowLogWriteBatch::new(
+            3,
+            Arc::clone(&physical_table_path),
+            1,
+            compression.clone(),
+            &row_type,
+            0,
+            false,
+            write_limit,
+            Arc::clone(&estimator),
+        )
+        .unwrap();
+
+        for i in 0..500 {
+            let mut row = GenericRow::new(2);
+            row.set_field(0, i);
+            row.set_field(1, "aaaaaaaaaaaaaaaa");
+            let record = WriteRecord::for_append(
+                Arc::clone(&table_info),
+                Arc::clone(&physical_table_path),
+                1,
+                &row,
+            );
+            if batch1.try_append(&record).unwrap().is_none() {
+                break;
+            }
+        }
+        batch1.build().unwrap();
+
+        // Estimator should have decreased (ZSTD compresses repeated data well).
+        assert!(
+            estimator.estimation() < 1.0,
+            "ratio should decrease after compressed build, got: {}",
+            estimator.estimation()
+        );
+
+        // Second batch: same estimator → knows data compresses well → accepts more rows.
+        let mut batch2 = ArrowLogWriteBatch::new(
+            4,
+            Arc::clone(&physical_table_path),
+            1,
+            compression,
+            &row_type,
+            0,
+            false,
+            write_limit,
+            Arc::clone(&estimator),
+        )
+        .unwrap();
+
+        // Count how many rows the second batch accepts before reporting full.
+        let mut appended2 = 0;
+        for i in 0..10_000 {
+            let mut row = GenericRow::new(2);
+            row.set_field(0, i);
+            row.set_field(1, "aaaaaaaaaaaaaaaa");
+            let record = WriteRecord::for_append(
+                Arc::clone(&table_info),
+                Arc::clone(&physical_table_path),
+                1,
+                &row,
+            );
+            match batch2.try_append(&record).unwrap() {
+                Some(_) => appended2 += 1,
+                None => break,
+            }
+        }
+        assert!(
+            appended2 > 500,
+            "second batch should accept more records with updated ratio, got: {appended2}"
+        );
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/client/write/broadcast.rs b/fluss-rust/crates/fluss/src/client/write/broadcast.rs
new file mode 100644
index 0000000000..9e00403586
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/client/write/broadcast.rs
@@ -0,0 +1,133 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use parking_lot::RwLock;
+use std::sync::Arc;
+use thiserror::Error;
+use tokio::sync::Notify;
+
+/// Result alias for this module; the error type defaults to this module's [`Error`].
+pub type Result<T, E = Error> = std::result::Result<T, E>;
+
+/// Outcome of a batch write: `Ok(())` on success, otherwise one of the [`Error`] variants.
+pub type BatchWriteResult = Result<(), Error>;
+
+/// Errors that can be delivered through a [`BroadcastOnceReceiver`].
+///
+/// The type is `Clone` so that every receiver can obtain its own copy of the
+/// stored outcome.
+#[derive(Debug, Error, Clone, PartialEq, Eq)]
+pub enum Error {
+    /// Stored by the `Drop` impl of [`BroadcastOnce`] when nothing was published.
+    #[error("BroadcastOnce dropped")]
+    Dropped,
+    /// A write failure described by a numeric `code` and a `message`.
+    #[error("Write failed: {message} (code {code})")]
+    WriteFailed { code: i32, message: String },
+    /// A failure that, per its display text, occurred before the request was sent.
+    #[error("Write failed before request was sent: {message}")]
+    Client { message: String },
+}
+
+/// Receiving handle for a one-shot broadcast.
+///
+/// Cloning is cheap: every clone shares the same `Arc`-backed state, so all
+/// clones observe the same outcome once it has been stored.
+#[derive(Debug, Clone)]
+pub struct BroadcastOnceReceiver<T> {
+    // State shared with the `BroadcastOnce` sender and all other receivers.
+    shared: Arc<Shared<T>>,
+}
+
+impl<T: Clone + Send + Sync> BroadcastOnceReceiver<T> {
+    /// Returns `Some(_)` if data has been produced
+    ///
+    /// Never waits: yields `None` while the shared slot is still empty,
+    /// otherwise a clone of the stored `Ok(value)` or `Err(error)`.
+    pub fn peek(&self) -> Option<Result<T>> {
+        self.shared.data.read().clone()
+    }
+
+    /// Waits for [`BroadcastOnce::broadcast`] to be called or returns an error
+    /// if the [`BroadcastOnce`] is dropped without a value being published
+    ///
+    /// Returns immediately when an outcome has already been stored.
+    ///
+    /// # Panics
+    /// Panics with "just got notified" if the slot is still empty after the
+    /// notification resolves.
+    pub async fn receive(&self) -> Result<T> {
+        // NOTE(review): the `Notified` future is created *before* the slot is
+        // checked, presumably so that a publish landing between the check and
+        // the await is not missed — confirm against tokio's
+        // `Notify::notify_waiters` contract before reordering these lines.
+        let notified = self.shared.notify.notified();
+
+        // Fast path: an outcome is already available.
+        if let Some(v) = self.peek() {
+            return v;
+        }
+
+        notified.await;
+
+        // Every `notify_waiters()` call in this file is issued after the slot
+        // has been filled under the write lock, hence the `expect`.
+        self.peek().expect("just got notified")
+    }
+
+    /// Force-complete with an error if not already completed.
+    /// Used by `abort_batches` to fail in-flight handles that can't be
+    /// reached through `WriteBatch::complete`.
+    ///
+    /// Does nothing when an outcome is already stored; otherwise stores
+    /// `Err(error)` and wakes the current waiters.
+    pub(crate) fn fail(&self, error: Error) {
+        // The write lock is held across the check and the store, so the slot
+        // is filled at most once by this method.
+        let mut data = self.shared.data.write();
+        if data.is_none() {
+            *data = Some(Err(error));
+            self.shared.notify.notify_waiters();
+        }
+    }
+}
+
+/// State shared between a [`BroadcastOnce`] and its [`BroadcastOnceReceiver`]s.
+#[derive(Debug)]
+struct Shared<T> {
+    // `None` until an outcome is stored; afterwards the published value or an error.
+    data: RwLock<Option<Result<T>>>,
+    // Signalled via `notify_waiters()` right after `data` has been filled.
+    notify: Notify,
+}
+
+/// Sending side of a one-shot broadcast.
+///
+/// A value is published with `broadcast`; if the sender is dropped while the
+/// slot is still empty, its `Drop` impl stores [`Error::Dropped`] instead.
+#[derive(Debug)]
+pub struct BroadcastOnce<T>
+where
+    T: Send + Sync,
+{
+    // State shared with every receiver handed out by `receiver()`.
+    shared: Arc<Shared<T>>,
+}
+
+impl<T> Default for BroadcastOnce<T>
+where
+    T: Send + Sync,
+{
+    /// Creates a sender whose shared slot is still empty (`None`) together
+    /// with a fresh notifier.
+    fn default() -> Self {
+        let shared = Shared {
+            data: RwLock::new(None),
+            notify: Notify::new(),
+        };
+        Self {
+            shared: Arc::new(shared),
+        }
+    }
+}
+
+impl<T: Clone + Send + Sync> BroadcastOnce<T> {
+    /// Hands out a new [`BroadcastOnceReceiver`] bound to this sender's
+    /// shared state; it observes whatever outcome is eventually stored.
+    pub fn receiver(&self) -> BroadcastOnceReceiver<T> {
+        let shared = Arc::clone(&self.shared);
+        BroadcastOnceReceiver { shared }
+    }
+
+    /// Publishes `r` as the successful outcome and wakes the current waiters.
+    ///
+    /// # Panics
+    /// Panics with "double publish" if the slot is already filled. Note that
+    /// `BroadcastOnceReceiver::fail` fills the same slot, so broadcasting
+    /// after a receiver has force-failed the handle also panics.
+    pub fn broadcast(&self, r: T) {
+        let mut slot = self.shared.data.write();
+        assert!(slot.is_none(), "double publish");
+
+        *slot = Some(Ok(r));
+        self.shared.notify.notify_waiters();
+    }
+}
+
+impl<T> Drop for BroadcastOnce<T>
+where
+    T: Send + Sync,
+{
+    /// Stores [`Error::Dropped`] and wakes the current waiters when the
+    /// sender goes away while the slot is still empty.
+    fn drop(&mut self) {
+        let mut slot = self.shared.data.write();
+        if slot.is_some() {
+            // An outcome was already stored; nothing left to do.
+            return;
+        }
+
+        log::warn!("BroadcastOnce dropped without producing");
+        *slot = Some(Err(Error::Dropped));
+        self.shared.notify.notify_waiters();
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/client/write/bucket_assigner.rs b/fluss-rust/crates/fluss/src/client/write/bucket_assigner.rs
new file mode 100644
index 0000000000..8ad38e3d42
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/client/write/bucket_assigner.rs
@@ -0,0 +1,259 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::bucketing::BucketingFunction;
+use crate::cluster::Cluster;
+use crate::error::Error::IllegalArgument;
+use crate::error::Result;
+use crate::metadata::PhysicalTablePath;
+use bytes::Bytes;
+use rand::Rng;
+use std::sync::Arc;
+use std::sync::atomic::{AtomicI32, Ordering};
+
+/// Strategy for choosing the bucket a record is written to.
+///
+/// Implementations in this file: [StickyBucketAssigner],
+/// [RoundRobinBucketAssigner] and [HashBucketAssigner].
+pub trait BucketAssigner: Sync + Send {
+    /// Flag consulted by the caller when a batch is full. In this file only
+    /// the sticky assigner returns `true`.
+    // NOTE(review): presumably `true` asks the caller to abort the append so a
+    // new bucket can be chosen — confirm against the call site.
+    fn abort_if_batch_full(&self) -> bool;
+
+    /// Hook invoked with the id of the bucket used so far (`prev_bucket_id`).
+    /// The sticky assigner uses it to re-pick its bucket; the other
+    /// implementations in this file ignore it.
+    fn on_new_batch(&self, cluster: &Cluster, prev_bucket_id: i32);
+
+    /// Returns the bucket id for a record. `bucket_key` is only read by the
+    /// hash-based implementation, which returns an error when it is `None`.
+    fn assign_bucket(&self, bucket_key: Option<&Bytes>, cluster: &Cluster) -> Result<i32>;
+}
+
+/// A [BucketAssigner] that keeps returning the same bucket until
+/// `on_new_batch` triggers a new pick.
+#[derive(Debug)]
+pub struct StickyBucketAssigner {
+    // Table whose buckets are looked up in the cluster.
+    table_path: Arc<PhysicalTablePath>,
+    // Currently chosen bucket id; negative (initially -1) means "not chosen yet".
+    current_bucket_id: AtomicI32,
+}
+
+impl StickyBucketAssigner {
+    /// Builds an assigner for `table_path` with no bucket chosen yet
+    /// (the stored id starts at `-1`).
+    pub fn new(table_path: Arc<PhysicalTablePath>) -> Self {
+        let current_bucket_id = AtomicI32::new(-1);
+        Self {
+            table_path,
+            current_bucket_id,
+        }
+    }
+
+    /// Picks the bucket to stick to and returns the id stored afterwards.
+    ///
+    /// A new candidate is drawn only when nothing is stored yet (negative id)
+    /// or when the stored id equals `prev_bucket_id`. Otherwise the stored id
+    /// is kept as the candidate.
+    fn next_bucket(&self, cluster: &Cluster, prev_bucket_id: i32) -> i32 {
+        let stored = self.current_bucket_id.load(Ordering::Relaxed);
+        let needs_new_pick = stored < 0 || stored == prev_bucket_id;
+
+        let mut candidate = stored;
+        if needs_new_pick {
+            let available = cluster.get_available_buckets_for_table_path(&self.table_path);
+            match available.len() {
+                0 => {
+                    // No bucket reported as available: draw from the table's
+                    // full bucket count instead.
+                    let mut rng = rand::rng();
+                    let raw: i32 = rng.random();
+                    let non_negative = raw & i32::MAX;
+                    candidate =
+                        non_negative % cluster.get_bucket_count(self.table_path.get_table_path());
+                }
+                1 => candidate = available[0].table_bucket.bucket_id(),
+                count => {
+                    let mut rng = rand::rng();
+                    // Redraw until the pick is valid and differs from the
+                    // bucket that was stored on entry.
+                    while candidate < 0 || candidate == stored {
+                        let raw: i32 = rng.random();
+                        let slot = ((raw & i32::MAX) % count as i32) as usize;
+                        candidate = available[slot].bucket_id();
+                    }
+                }
+            }
+        }
+
+        if stored < 0 {
+            self.current_bucket_id.store(candidate, Ordering::Relaxed);
+        } else {
+            // Only swap when the stored id still equals `prev_bucket_id`; a
+            // failed exchange is deliberately ignored.
+            let _ = self.current_bucket_id.compare_exchange(
+                prev_bucket_id,
+                candidate,
+                Ordering::Relaxed,
+                Ordering::Relaxed,
+            );
+        }
+        self.current_bucket_id.load(Ordering::Relaxed)
+    }
+}
+
+impl BucketAssigner for StickyBucketAssigner {
+    /// Always `true` for the sticky strategy.
+    fn abort_if_batch_full(&self) -> bool {
+        true
+    }
+
+    /// Re-evaluates the sticky bucket relative to `prev_bucket_id`; the
+    /// resulting id is not needed here and is discarded.
+    fn on_new_batch(&self, cluster: &Cluster, prev_bucket_id: i32) {
+        let _ = self.next_bucket(cluster, prev_bucket_id);
+    }
+
+    /// Returns the currently stored bucket, picking one first when none has
+    /// been chosen yet. The bucket key is ignored.
+    fn assign_bucket(&self, _bucket_key: Option<&Bytes>, cluster: &Cluster) -> Result<i32> {
+        let stored = self.current_bucket_id.load(Ordering::Relaxed);
+        if stored >= 0 {
+            return Ok(stored);
+        }
+        Ok(self.next_bucket(cluster, stored))
+    }
+}
+
+/// Unlike [StickyBucketAssigner], each record is assigned to the next bucket
+/// in a rotating sequence, providing even data distribution across all buckets.
+pub struct RoundRobinBucketAssigner {
+    // Table whose available buckets are looked up in the cluster.
+    table_path: Arc<PhysicalTablePath>,
+    // Modulus used when the cluster reports no available buckets.
+    num_buckets: i32,
+    // Monotonic ticket counter; seeded with a random value in `new`.
+    counter: AtomicI32,
+}
+
+impl RoundRobinBucketAssigner {
+    /// Builds a round-robin assigner for `table_path`.
+    ///
+    /// The rotation starts at a random counter value; `num_buckets` is the
+    /// modulus used when no available buckets are reported by the cluster.
+    pub fn new(table_path: Arc<PhysicalTablePath>, num_buckets: i32) -> Self {
+        let mut rng = rand::rng();
+        let seed: i32 = rng.random();
+        Self {
+            table_path,
+            num_buckets,
+            counter: AtomicI32::new(seed),
+        }
+    }
+}
+
+impl BucketAssigner for RoundRobinBucketAssigner {
+    /// Always `false` for the round-robin strategy.
+    fn abort_if_batch_full(&self) -> bool {
+        false
+    }
+
+    /// No-op: the rotation does not depend on batch boundaries.
+    fn on_new_batch(&self, _cluster: &Cluster, _prev_bucket_id: i32) {}
+
+    /// Takes the next ticket from the counter and maps it onto the available
+    /// buckets, or onto `0..num_buckets` when none are reported as available.
+    /// The bucket key is ignored.
+    fn assign_bucket(&self, _bucket_key: Option<&Bytes>, cluster: &Cluster) -> Result<i32> {
+        // Clearing the sign bit keeps the ticket non-negative for the modulo.
+        let ticket = self.counter.fetch_add(1, Ordering::Relaxed) & i32::MAX;
+        let available = cluster.get_available_buckets_for_table_path(&self.table_path);
+        if !available.is_empty() {
+            let slot = (ticket % available.len() as i32) as usize;
+            return Ok(available[slot].bucket_id());
+        }
+        Ok(ticket % self.num_buckets)
+    }
+}
+
+/// A [BucketAssigner] which assigns based on a modulo hashing function
+pub struct HashBucketAssigner {
+    // Bucket count handed to the bucketing function on every assignment.
+    num_buckets: i32,
+    // Maps a bucket key and the bucket count to a bucket id.
+    bucketing_function: Box<dyn BucketingFunction>,
+}
+
+#[allow(dead_code)]
+impl HashBucketAssigner {
+    /// Builds a [HashBucketAssigner] that maps bucket keys onto `num_buckets`
+    /// buckets by delegating to `bucketing_function`.
+    ///
+    /// # Arguments
+    /// * `num_buckets` - The number of buckets
+    /// * `bucketing_function` - The bucketing function, e.g. one obtained
+    ///   from `<dyn BucketingFunction>::of`
+    ///
+    /// # Returns
+    /// * [HashBucketAssigner] - The hash bucket assigner
+    pub fn new(num_buckets: i32, bucketing_function: Box<dyn BucketingFunction>) -> Self {
+        Self {
+            bucketing_function,
+            num_buckets,
+        }
+    }
+}
+
+impl BucketAssigner for HashBucketAssigner {
+    /// Always `false` for the hash strategy.
+    fn abort_if_batch_full(&self) -> bool {
+        false
+    }
+
+    /// No-op: the bucket depends only on the key.
+    fn on_new_batch(&self, _: &Cluster, _: i32) {}
+
+    /// Delegates to the bucketing function with the configured bucket count.
+    ///
+    /// # Errors
+    /// Returns `IllegalArgument` when `bucket_key` is `None`; errors from the
+    /// bucketing function are passed through unchanged.
+    fn assign_bucket(&self, bucket_key: Option<&Bytes>, _: &Cluster) -> Result<i32> {
+        match bucket_key {
+            Some(key) => self.bucketing_function.bucketing(key, self.num_buckets),
+            None => Err(IllegalArgument {
+                message: "no bucket key provided".to_string(),
+            }),
+        }
+    }
+}
+
+// Unit tests for the sticky, round-robin and hash bucket assigners.
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::bucketing::BucketingFunction;
+    use crate::cluster::Cluster;
+    use crate::metadata::TablePath;
+    use crate::test_utils::build_cluster;
+    use std::sync::Arc;
+
+    /// The sticky assigner returns an in-range bucket both on first use and
+    /// after `on_new_batch` has been called with the previous bucket.
+    #[test]
+    fn sticky_bucket_assigner_picks_available_bucket() {
+        let table_path = TablePath::new("db".to_string(), "tbl".to_string());
+        // NOTE(review): presumably the last argument is the bucket count — confirm in test_utils.
+        let cluster = build_cluster(&table_path, 1, 2);
+        let assigner = StickyBucketAssigner::new(Arc::new(PhysicalTablePath::of(Arc::new(
+            table_path.clone(),
+        ))));
+        let bucket = assigner.assign_bucket(None, &cluster).expect("bucket");
+        assert!((0..2).contains(&bucket));
+
+        assigner.on_new_batch(&cluster, bucket);
+        let next_bucket = assigner.assign_bucket(None, &cluster).expect("bucket");
+        assert!((0..2).contains(&next_bucket));
+    }
+
+    /// Six consecutive picks over three buckets repeat with period three.
+    #[test]
+    fn round_robin_assigner_cycles_through_buckets() {
+        let table_path = TablePath::new("db".to_string(), "tbl".to_string());
+        let num_buckets = 3;
+        let cluster = build_cluster(&table_path, 1, num_buckets);
+        let physical = Arc::new(PhysicalTablePath::of(Arc::new(table_path)));
+        let assigner = RoundRobinBucketAssigner::new(physical, num_buckets);
+
+        let mut seen = Vec::new();
+        for _ in 0..(num_buckets * 2) {
+            let bucket = assigner.assign_bucket(None, &cluster).expect("bucket");
+            assert!((0..num_buckets).contains(&bucket));
+            seen.push(bucket);
+        }
+
+        // The start offset is random, so only the periodicity is asserted.
+        assert_eq!(seen[0], seen[3]);
+        assert_eq!(seen[1], seen[4]);
+        assert_eq!(seen[2], seen[5]);
+    }
+
+    /// Round-robin reports `false` from `abort_if_batch_full`.
+    #[test]
+    fn round_robin_assigner_does_not_abort_on_batch_full() {
+        let table_path = TablePath::new("db".to_string(), "tbl".to_string());
+        let physical = Arc::new(PhysicalTablePath::of(Arc::new(table_path)));
+        let assigner = RoundRobinBucketAssigner::new(physical, 3);
+        assert!(!assigner.abort_if_batch_full());
+    }
+
+    /// A missing bucket key is rejected with `IllegalArgument`.
+    #[test]
+    fn hash_bucket_assigner_requires_key() {
+        let assigner = HashBucketAssigner::new(3, <dyn BucketingFunction>::of(None));
+        let cluster = Cluster::default();
+        let err = assigner.assign_bucket(None, &cluster).unwrap_err();
+        assert!(matches!(err, IllegalArgument { .. }));
+    }
+
+    /// A provided key hashes to a bucket inside `0..num_buckets`.
+    #[test]
+    fn hash_bucket_assigner_hashes_key() {
+        let assigner = HashBucketAssigner::new(4, <dyn BucketingFunction>::of(None));
+        let cluster = Cluster::default();
+        let bucket = assigner
+            .assign_bucket(Some(&Bytes::from_static(b"key")), &cluster)
+            .expect("bucket");
+        assert!((0..4).contains(&bucket));
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/client/write/dynamic_batch_size.rs b/fluss-rust/crates/fluss/src/client/write/dynamic_batch_size.rs
new file mode 100644
index 0000000000..408263ee5f
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/client/write/dynamic_batch_size.rs
@@ -0,0 +1,147 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Per-table batch size estimator. Mirrors Java's `DynamicWriteBatchSizeEstimator`:
+//! grow 10% above 80% fill, shrink 5% below 50%, clamped to `[min, max]`.
+
+use std::sync::atomic::{AtomicUsize, Ordering};
+
+/// The estimate grows when the observed size exceeds this fraction of it.
+const GROW_THRESHOLD: f64 = 0.8;
+/// The estimate shrinks when the observed size falls below this fraction of it.
+const SHRINK_THRESHOLD: f64 = 0.5;
+/// Multiplier applied to the estimate when growing.
+const GROW_FACTOR: f64 = 1.1;
+/// Multiplier applied to the estimate when shrinking.
+const SHRINK_FACTOR: f64 = 0.95;
+
+/// Adaptive batch-size estimate kept within `[min, max]`.
+#[derive(Debug)]
+pub(crate) struct DynamicWriteBatchSizeEstimator {
+    // Current estimate; starts at `max`.
+    current: AtomicUsize,
+    // Lower bound, never larger than `max`.
+    min: usize,
+    // Upper bound.
+    max: usize,
+}
+
+impl DynamicWriteBatchSizeEstimator {
+    /// Creates an estimator that starts at `max_size`. A `min_size` larger
+    /// than `max_size` is lowered to `max_size`.
+    pub fn new(min_size: usize, max_size: usize) -> Self {
+        let min = if min_size > max_size {
+            max_size
+        } else {
+            min_size
+        };
+        Self {
+            current: AtomicUsize::new(max_size),
+            min,
+            max: max_size,
+        }
+    }
+
+    /// Returns the current estimate.
+    pub fn current(&self) -> usize {
+        self.current.load(Ordering::Relaxed)
+    }
+
+    /// Feeds one observed size into the estimator and returns the new estimate.
+    ///
+    /// The load and the store are separate operations, so concurrent callers
+    /// race with last-write-wins semantics (matching Java's
+    /// `ConcurrentHashMap.put`).
+    pub fn update(&self, actual: usize) -> usize {
+        let previous = self.current.load(Ordering::Relaxed);
+        let size = previous as f64;
+        let observed = actual as f64;
+
+        let should_grow = observed > size * GROW_THRESHOLD;
+        let should_shrink = observed < size * SHRINK_THRESHOLD;
+        let proposed = match (should_grow, should_shrink) {
+            (true, _) => size * GROW_FACTOR,
+            (false, true) => size * SHRINK_FACTOR,
+            // Between the two thresholds the estimate is left untouched.
+            (false, false) => size,
+        };
+
+        let bounded = (proposed as usize).clamp(self.min, self.max);
+        if bounded != previous {
+            self.current.store(bounded, Ordering::Relaxed);
+        }
+        bounded
+    }
+}
+
+// Unit tests for `DynamicWriteBatchSizeEstimator`.
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    const MIN: usize = 256 * 1024;
+    const MAX: usize = 2 * 1024 * 1024;
+    /// ~41 shrink steps, ~22 grow steps; 50 covers both with margin.
+    const CONVERGENCE_STEPS: usize = 50;
+
+    /// A fresh estimator reports the configured maximum.
+    #[test]
+    fn starts_at_max() {
+        let est = DynamicWriteBatchSizeEstimator::new(MIN, MAX);
+        assert_eq!(est.current(), MAX);
+    }
+
+    /// With `min > max` the lower bound collapses to `max`, so the estimate cannot move.
+    #[test]
+    fn min_clamped_to_max_when_misconfigured() {
+        let est = DynamicWriteBatchSizeEstimator::new(MAX * 2, MAX);
+        assert_eq!(est.current(), MAX);
+        assert_eq!(est.update(0), MAX);
+    }
+
+    /// After shrinking to `MIN`, an observation at 90% of it grows the estimate by `GROW_FACTOR`.
+    #[test]
+    fn grows_when_above_grow_threshold() {
+        let est = DynamicWriteBatchSizeEstimator::new(MIN, MAX);
+        // Drive the estimate down to the lower bound first.
+        for _ in 0..CONVERGENCE_STEPS {
+            est.update(0);
+        }
+        assert_eq!(est.current(), MIN);
+
+        // 0.9 sits safely past the 0.8 threshold and avoids f64 boundary noise.
+        let next = est.update((MIN as f64 * 0.9) as usize);
+        assert_eq!(next, ((MIN as f64) * GROW_FACTOR) as usize);
+    }
+
+    /// An observation at 40% of the estimate shrinks it by `SHRINK_FACTOR`.
+    #[test]
+    fn shrinks_when_below_shrink_threshold() {
+        let est = DynamicWriteBatchSizeEstimator::new(MIN, MAX);
+        // 0.4 sits safely below the strict 0.5 threshold.
+        let next = est.update((MAX as f64 * 0.4) as usize);
+        assert_eq!(next, ((MAX as f64) * SHRINK_FACTOR) as usize);
+    }
+
+    /// Repeated empty observations bottom out exactly at `MIN`.
+    #[test]
+    fn shrink_clamps_to_min() {
+        let est = DynamicWriteBatchSizeEstimator::new(MIN, MAX);
+        for _ in 0..CONVERGENCE_STEPS {
+            est.update(0);
+        }
+        assert_eq!(est.current(), MIN);
+    }
+
+    /// Repeated full observations top out exactly at `MAX`.
+    #[test]
+    fn grow_clamps_to_max() {
+        let est = DynamicWriteBatchSizeEstimator::new(MIN, MAX);
+        // Shrink to the floor, then feed the current size back to force growth.
+        for _ in 0..CONVERGENCE_STEPS {
+            est.update(0);
+        }
+        for _ in 0..CONVERGENCE_STEPS {
+            est.update(est.current());
+        }
+        assert_eq!(est.current(), MAX);
+    }
+
+    /// An observation far above `MAX` cannot push the estimate past `MAX`.
+    #[test]
+    fn oversized_actual_clamps_at_max() {
+        let est = DynamicWriteBatchSizeEstimator::new(MIN, MAX);
+        assert_eq!(est.update(MAX * 4), MAX);
+    }
+
+    /// Observations between the shrink and grow thresholds leave the estimate unchanged.
+    #[test]
+    fn dead_zone_is_a_fixed_point() {
+        let est = DynamicWriteBatchSizeEstimator::new(MIN, MAX);
+        let initial = est.current();
+        for _ in 0..20 {
+            est.update((est.current() as f64 * 0.65) as usize);
+        }
+        assert_eq!(est.current(), initial);
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/client/write/idempotence.rs b/fluss-rust/crates/fluss/src/client/write/idempotence.rs
new file mode 100644
index 0000000000..eeec8761b2
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/client/write/idempotence.rs
@@ -0,0 +1,767 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::metadata::TableBucket;
+use crate::record::{NO_BATCH_SEQUENCE, NO_WRITER_ID};
+use crate::rpc::FlussError;
+use log::debug;
+use parking_lot::Mutex;
+use std::collections::{HashMap, HashSet};
+use std::sync::atomic::{AtomicI64, Ordering};
+
+/// Bookkeeping record for one batch tracked as in-flight for a bucket.
+struct InFlightBatch {
+    // Sequence currently attributed to the batch; `handle_failed_batch` may decrement it.
+    batch_sequence: i32,
+    // Identifier used to look the batch up; it is never modified by this module.
+    batch_id: i64,
+}
+
+/// Per-bucket sequence state kept by `IdempotenceManager`.
+struct BucketEntry {
+    // Writer id this entry's sequences belong to; refreshed by `maybe_update_writer_id`.
+    writer_id: i64,
+    // Sequence that `next_sequence_and_increment` hands out next.
+    next_sequence: i32,
+    // Highest sequence recorded by `handle_completed_batch`; -1 when none yet.
+    last_acked_sequence: i32,
+    // Tracked batches; `add_in_flight_batch` inserts in `batch_sequence` order.
+    in_flight: Vec<InFlightBatch>,
+    // Ids of batches whose sequence was decremented by `handle_failed_batch`.
+    reset_batch_ids: HashSet<i64>,
+}
+
+impl BucketEntry {
+    /// Creates an empty entry: no writer id, sequences starting at 0,
+    /// nothing acknowledged (`-1`) and nothing in flight.
+    fn new() -> Self {
+        Self {
+            writer_id: NO_WRITER_ID,
+            next_sequence: 0,
+            last_acked_sequence: -1,
+            in_flight: Vec::new(),
+            reset_batch_ids: HashSet::new(),
+        }
+    }
+}
+
+/// Tracks the writer id and per-bucket batch sequences for in-flight batches.
+pub struct IdempotenceManager {
+    // Current writer id; `NO_WRITER_ID` while none is set.
+    writer_id: AtomicI64,
+    // Per-bucket sequence state, guarded by a single mutex.
+    bucket_entries: Mutex<HashMap<TableBucket, BucketEntry>>,
+    // Flag fixed at construction and reported by `is_enabled`.
+    enabled: bool,
+    // Upper bound checked by `can_send_more_requests`.
+    max_in_flight_requests_per_bucket: usize,
+}
+
+impl IdempotenceManager {
+    /// Creates a manager with no writer id (`NO_WRITER_ID`) and no bucket state.
+    ///
+    /// `enabled` is only stored and reported by `is_enabled`;
+    /// `max_in_flight_requests_per_bucket` is the limit used by
+    /// `can_send_more_requests`.
+    pub fn new(enabled: bool, max_in_flight_requests_per_bucket: usize) -> Self {
+        Self {
+            writer_id: AtomicI64::new(NO_WRITER_ID),
+            bucket_entries: Mutex::new(HashMap::new()),
+            enabled,
+            max_in_flight_requests_per_bucket,
+        }
+    }
+
+    /// Returns the `enabled` flag given at construction.
+    pub fn is_enabled(&self) -> bool {
+        self.enabled
+    }
+
+    /// Returns the current writer id (`NO_WRITER_ID` when none is set).
+    pub fn writer_id(&self) -> i64 {
+        self.writer_id.load(Ordering::Acquire)
+    }
+
+    /// Returns true when a writer id other than `NO_WRITER_ID` is set.
+    pub fn has_writer_id(&self) -> bool {
+        self.writer_id() != NO_WRITER_ID
+    }
+
+    /// Currently identical to `has_writer_id`.
+    pub fn is_writer_id_valid(&self) -> bool {
+        self.has_writer_id()
+    }
+
+    /// Returns the number of in-flight batches tracked for `bucket`
+    /// (0 when the bucket has no entry).
+    pub fn in_flight_count(&self, bucket: &TableBucket) -> usize {
+        let entries = self.bucket_entries.lock();
+        entries.get(bucket).map_or(0, |e| e.in_flight.len())
+    }
+
+    /// Returns true while the bucket's in-flight count is below the configured limit.
+    pub fn can_send_more_requests(&self, bucket: &TableBucket) -> bool {
+        self.in_flight_count(bucket) < self.max_in_flight_requests_per_bucket
+    }
+
+    /// Sets the writer id. Per-bucket state is left untouched.
+    pub fn set_writer_id(&self, id: i64) {
+        self.writer_id.store(id, Ordering::Release);
+    }
+
+    /// Clears the writer id back to `NO_WRITER_ID` and drops all per-bucket state.
+    pub fn reset_writer_id(&self) {
+        self.writer_id.store(NO_WRITER_ID, Ordering::Release);
+        self.bucket_entries.lock().clear();
+    }
+
+    /// Returns the bucket's next sequence and advances it by one, creating
+    /// the bucket entry on first use.
+    pub fn next_sequence_and_increment(&self, bucket: &TableBucket) -> i32 {
+        let mut entries = self.bucket_entries.lock();
+        let entry = entries
+            .entry(bucket.clone())
+            .or_insert_with(BucketEntry::new);
+        let seq = entry.next_sequence;
+        entry.next_sequence += 1;
+        seq
+    }
+
+    /// Starts tracking a batch as in-flight for `bucket`, creating the bucket
+    /// entry if needed.
+    ///
+    /// In debug builds, asserts that `batch_sequence` is not `NO_BATCH_SEQUENCE`.
+    pub fn add_in_flight_batch(&self, bucket: &TableBucket, batch_sequence: i32, batch_id: i64) {
+        debug_assert!(
+            batch_sequence != NO_BATCH_SEQUENCE,
+            "Can't track batch for bucket {bucket} when batch sequence is not set"
+        );
+        let mut entries = self.bucket_entries.lock();
+        let entry = entries
+            .entry(bucket.clone())
+            .or_insert_with(BucketEntry::new);
+        // Insert sorted by batch_sequence
+        // (both `Ok` and `Err` of the binary search yield a valid insert position)
+        let pos = entry
+            .in_flight
+            .binary_search_by_key(&batch_sequence, |b| b.batch_sequence)
+            .unwrap_or_else(|e| e);
+        entry.in_flight.insert(
+            pos,
+            InFlightBatch {
+                batch_sequence,
+                batch_id,
+            },
+        );
+    }
+
+    /// Records the successful completion of a batch.
+    ///
+    /// Ignored when `batch_writer_id` differs from the current writer id.
+    /// Otherwise the batch is removed from the in-flight list, its reset
+    /// marker is cleared, and `last_acked_sequence` is raised to the batch's
+    /// (possibly adjusted) sequence if that is higher. Unknown buckets and
+    /// unknown batch ids are silently ignored.
+    pub fn handle_completed_batch(
+        &self,
+        bucket: &TableBucket,
+        batch_id: i64,
+        batch_writer_id: i64,
+    ) {
+        if batch_writer_id != self.writer_id() {
+            debug!(
+                "Ignoring completed batch for bucket {bucket} with stale writer_id {batch_writer_id} (current: {})",
+                self.writer_id()
+            );
+            return;
+        }
+        let mut entries = self.bucket_entries.lock();
+        if let Some(entry) = entries.get_mut(bucket) {
+            // Find by batch_id to handle the case where the in-flight entry's sequence
+            // was adjusted by a prior handle_failed_batch call.
+            if let Some(pos) = entry.in_flight.iter().position(|b| b.batch_id == batch_id) {
+                let adjusted_seq = entry.in_flight[pos].batch_sequence;
+                entry.in_flight.remove(pos);
+                entry.reset_batch_ids.remove(&batch_id);
+                if adjusted_seq > entry.last_acked_sequence {
+                    entry.last_acked_sequence = adjusted_seq;
+                }
+            }
+        }
+    }
+
+    /// Handle a failed batch. Matches Java's `IdempotenceManager.handleFailedBatch`.
+    ///
+    /// For `OutOfOrderSequenceException` or `UnknownWriterIdException`, resets ALL
+    /// writer state (matching Java: "we cannot make any guarantees about the previously
+    /// committed message").
+    ///
+    /// For other errors, removes the specific in-flight entry by `batch_id` and
+    /// optionally adjusts downstream sequences. `adjust_sequences` should only be true
+    /// when the batch has NOT exhausted its retries.
+    ///
+    /// Calls whose `batch_writer_id` differs from the current writer id are ignored.
+    pub fn handle_failed_batch(
+        &self,
+        bucket: &TableBucket,
+        batch_id: i64,
+        batch_writer_id: i64,
+        error: Option<FlussError>,
+        adjust_sequences: bool,
+    ) {
+        if batch_writer_id != self.writer_id() {
+            debug!(
+                "Ignoring failed batch for bucket {bucket} with stale writer_id {batch_writer_id} (current: {})",
+                self.writer_id()
+            );
+            return;
+        }
+
+        let mut entries = self.bucket_entries.lock();
+
+        // Matches Java: OutOfOrderSequence or UnknownWriterId → reset all writer state.
+        // Java's synchronized handleFailedBatch can call synchronized resetWriterId
+        // because Java monitors are reentrant. We inline the reset here to stay in
+        // the same lock scope.
+        if let Some(e) = error {
+            if e == FlussError::OutOfOrderSequenceException
+                || e == FlussError::UnknownWriterIdException
+            {
+                debug!(
+                    "Resetting writer ID due to {e:?} for bucket {bucket} \
+                     (writer_id={batch_writer_id}, batch_id={batch_id})"
+                );
+                self.writer_id.store(NO_WRITER_ID, Ordering::Release);
+                entries.clear();
+                return;
+            }
+        }
+        if let Some(entry) = entries.get_mut(bucket) {
+            // Find and remove by batch_id, capturing the (possibly adjusted) sequence
+            let failed_sequence = entry
+                .in_flight
+                .iter()
+                .position(|b| b.batch_id == batch_id)
+                .map(|pos| {
+                    let seq = entry.in_flight[pos].batch_sequence;
+                    entry.in_flight.remove(pos);
+                    seq
+                });
+            // The failed batch is gone for good, so its reset marker goes too.
+            entry.reset_batch_ids.remove(&batch_id);
+            if adjust_sequences {
+                if let Some(failed_seq) = failed_sequence {
+                    // Decrement sequences of in-flight batches that have higher sequences
+                    // and mark each of them as reset.
+                    for b in &mut entry.in_flight {
+                        if b.batch_sequence > failed_seq {
+                            b.batch_sequence -= 1;
+                            debug_assert!(
+                                b.batch_sequence >= 0,
+                                "Batch sequence for batch_id={} went negative: {}",
+                                b.batch_id,
+                                b.batch_sequence
+                            );
+                            entry.reset_batch_ids.insert(b.batch_id);
+                        }
+                    }
+                    // Roll back next_sequence
+                    if entry.next_sequence > failed_seq {
+                        entry.next_sequence -= 1;
+                        debug_assert!(
+                            entry.next_sequence >= 0,
+                            "Next sequence went negative: {}",
+                            entry.next_sequence
+                        );
+                    }
+                }
+            }
+        }
+    }
+
+    /// Test-only helper: drops the in-flight record for `batch_id` without
+    /// touching sequences, the acked sequence or the reset markers.
+    #[cfg(test)]
+    pub fn remove_in_flight_batch(&self, bucket: &TableBucket, batch_id: i64) {
+        let mut entries = self.bucket_entries.lock();
+        if let Some(entry) = entries.get_mut(bucket) {
+            entry.in_flight.retain(|b| b.batch_id != batch_id);
+        }
+    }
+
+    /// If the bucket's stored writer_id doesn't match the current writer_id
+    /// and there are no in-flight batches, reset the bucket entry to start
+    /// sequences from 0. Matches Java's `IdempotenceManager.maybeUpdateWriterId`.
+    ///
+    /// Creates the bucket entry if it does not exist yet.
+    pub fn maybe_update_writer_id(&self, bucket: &TableBucket) {
+        let current_writer_id = self.writer_id();
+        let mut entries = self.bucket_entries.lock();
+        let entry = entries
+            .entry(bucket.clone())
+            .or_insert_with(BucketEntry::new);
+        if entry.writer_id != current_writer_id && entry.in_flight.is_empty() {
+            entry.writer_id = current_writer_id;
+            entry.next_sequence = 0;
+            entry.last_acked_sequence = -1;
+            debug!(
+                "Writer id of bucket {bucket} set to {current_writer_id}. Reinitialize batch sequence at beginning."
+            );
+        }
+    }
+
+    /// Returns true if the given batch (identified by `batch_id`) is the first
+    /// in-flight batch for its bucket. Uses batch_id rather than batch_sequence
+    /// because sequence adjustment (`handle_failed_batch` with `adjust_sequences`)
+    /// modifies InFlightBatch sequences without updating the actual WriteBatch,
+    /// so batch_sequence on the WriteBatch may be stale.
+    pub fn is_first_in_flight_batch(&self, bucket: &TableBucket, batch_id: i64) -> bool {
+        let entries = self.bucket_entries.lock();
+        entries
+            .get(bucket)
+            .and_then(|e| e.in_flight.first())
+            .is_some_and(|b| b.batch_id == batch_id)
+    }
+
+    /// Returns the current (possibly adjusted) in-flight sequence for a batch.
+    /// Used by `re_enqueue` to sync the WriteBatch's sequence with the adjusted
+    /// InFlightBatch sequence.
+    ///
+    /// Does NOT clear `reset_batch_ids` — the reset marker must survive
+    /// re-enqueue so that `can_retry_for_error` can still see it on subsequent
+    /// retries. It is cleared only on terminal events: `handle_completed_batch`
+    /// or `handle_failed_batch`. This matches Java where `reopened` persists
+    /// across retries and is only cleared in `close()` (resource cleanup).
+    ///
+    /// Returns `None` when the bucket or the batch is not tracked.
+    pub fn get_adjusted_sequence(&self, bucket: &TableBucket, batch_id: i64) -> Option<i32> {
+        let entries = self.bucket_entries.lock();
+        let entry = entries.get(bucket)?;
+        entry
+            .in_flight
+            .iter()
+            .find(|b| b.batch_id == batch_id)
+            .map(|b| b.batch_sequence)
+    }
+
+    /// Returns true if `batch_sequence` directly follows the bucket's last
+    /// acknowledged sequence.
+    pub fn is_next_sequence(&self, bucket: &TableBucket, batch_sequence: i32) -> bool {
+        let entries = self.bucket_entries.lock();
+        if let Some(entry) = entries.get(bucket) {
+            entry.last_acked_sequence + 1 == batch_sequence
+        } else {
+            // No entry means sequence 0 is expected (last_acked = -1, so -1 + 1 = 0)
+            batch_sequence == 0
+        }
+    }
+
+    /// Returns true if the batch has already been committed on the server.
+    ///
+    /// If the batch's sequence is less than or equal to `last_acked_sequence`, it means
+    /// a higher-sequence batch has already been acknowledged. This implies the current batch
+    /// was also successfully written on the server (otherwise the higher-sequence batch could
+    /// not have been committed).
+    ///
+    /// Always false for a bucket without an entry or without any acknowledged sequence.
+    pub fn is_already_committed(&self, bucket: &TableBucket, batch_sequence: i32) -> bool {
+        let entries = self.bucket_entries.lock();
+        entries
+            .get(bucket)
+            .is_some_and(|e| e.last_acked_sequence >= 0 && batch_sequence <= e.last_acked_sequence)
+    }
+
+    /// Decides whether a batch that failed with `error` may be retried.
+    ///
+    /// * Without a writer id: never.
+    /// * `OutOfOrderSequenceException`: when the batch carries a reset marker
+    ///   or its sequence is not the one directly after the last acknowledged.
+    /// * `UnknownWriterIdException`: only when the batch carries a reset marker.
+    /// * Any other error: never.
+    pub fn can_retry_for_error(
+        &self,
+        bucket: &TableBucket,
+        batch_sequence: i32,
+        batch_id: i64,
+        error: FlussError,
+    ) -> bool {
+        if !self.has_writer_id() {
+            return false;
+        }
+        let entries = self.bucket_entries.lock();
+        let entry = entries.get(bucket);
+        // A batch is "reset" when `handle_failed_batch` shifted its sequence.
+        let is_reset = entry.is_some_and(|e| e.reset_batch_ids.contains(&batch_id));
+
+        if error == FlussError::OutOfOrderSequenceException {
+            // Inline is_next_sequence logic to avoid double-locking
+            let is_next = entry.map_or(batch_sequence == 0, |e| {
+                e.last_acked_sequence + 1 == batch_sequence
+            });
+            return is_reset || !is_next;
+        }
+        if error == FlussError::UnknownWriterIdException {
+            return is_reset;
+        }
+        false
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Builds the `TableBucket` used by these tests: the first constructor
+    /// argument is fixed to 1, the second is `bucket_id`.
+    fn test_bucket(bucket_id: i32) -> TableBucket {
+        TableBucket::new(1, bucket_id)
+    }
+
+    /// Fixture: a manager with writer id 42 and, for bucket 0, three in-flight
+    /// batches registered as (seq, batch_id) = (0, 100), (1, 101), (2, 102).
+    fn setup_three_in_flight() -> (IdempotenceManager, TableBucket) {
+        let mgr = IdempotenceManager::new(true, 5);
+        mgr.set_writer_id(42);
+        let b0 = test_bucket(0);
+        // Hand out sequences 0, 1, 2 first, then register the batches.
+        for _ in 0..3 {
+            let _ = mgr.next_sequence_and_increment(&b0);
+        }
+        for (seq, batch_id) in [(0, 100), (1, 101), (2, 102)] {
+            mgr.add_in_flight_batch(&b0, seq, batch_id);
+        }
+        (mgr, b0)
+    }
+
+    /// Covers `handle_completed_batch` for an unadjusted batch and for a batch
+    /// whose sequence was shifted by an earlier failure.
+    #[test]
+    fn test_handle_completed_batch() {
+        let (mgr, b0) = setup_three_in_flight();
+
+        // Basic: complete middle batch, verify removal and last_acked update
+        mgr.handle_completed_batch(&b0, 101, 42);
+        {
+            let entries = mgr.bucket_entries.lock();
+            let entry = entries.get(&b0).unwrap();
+            assert_eq!(entry.last_acked_sequence, 1);
+            assert_eq!(entry.in_flight.len(), 2);
+            assert_eq!(entry.in_flight[0].batch_sequence, 0);
+            assert_eq!(entry.in_flight[1].batch_sequence, 2);
+        }
+
+        // Adjusted: fail batch_id=101 (seq=1) with adjustment, then complete
+        // batch_id=102 whose seq was adjusted from 2→1. last_acked should use
+        // the adjusted sequence.
+        let (mgr, b0) = setup_three_in_flight();
+        mgr.handle_failed_batch(&b0, 101, 42, None, true);
+        mgr.handle_completed_batch(&b0, 102, 42);
+        {
+            let entries = mgr.bucket_entries.lock();
+            let entry = entries.get(&b0).unwrap();
+            assert_eq!(entry.last_acked_sequence, 1); // adjusted, not original 2
+            assert_eq!(entry.in_flight.len(), 1);
+            assert_eq!(entry.in_flight[0].batch_id, 100);
+        }
+    }
+
+    #[test]
+    fn test_handle_failed_batch() {
+        // Captures (in-flight sequences in order, next_sequence) for a bucket.
+        let snapshot = |mgr: &IdempotenceManager, bucket: &TableBucket| {
+            let guard = mgr.bucket_entries.lock();
+            let state = guard.get(bucket).unwrap();
+            let sequences: Vec<i32> = state.in_flight.iter().map(|b| b.batch_sequence).collect();
+            (sequences, state.next_sequence)
+        };
+
+        // With sequence adjustment: the later batch (2→1) and next_sequence
+        // (3→2) both move down by one.
+        let (mgr, b0) = setup_three_in_flight();
+        mgr.handle_failed_batch(&b0, 101, 42, None, true);
+        assert_eq!(snapshot(&mgr, &b0), (vec![0, 1], 2));
+
+        // Without sequence adjustment (retries exhausted): nothing is decremented.
+        let (mgr, b0) = setup_three_in_flight();
+        mgr.handle_failed_batch(&b0, 101, 42, None, false);
+        assert_eq!(snapshot(&mgr, &b0), (vec![0, 2], 3));
+
+        // OOS / UnknownWriterId errors reset all writer state
+        let resetting_errors = [
+            FlussError::OutOfOrderSequenceException,
+            FlussError::UnknownWriterIdException,
+        ];
+        for error in resetting_errors {
+            let (mgr, b0) = setup_three_in_flight();
+            mgr.handle_failed_batch(&b0, 100, 42, Some(error), true);
+            assert!(!mgr.has_writer_id());
+            assert!(mgr.bucket_entries.lock().is_empty());
+        }
+    }
+
+    #[test]
+    fn test_can_retry_out_of_order() {
+        let mgr = IdempotenceManager::new(true, 5);
+        let b0 = test_bucket(0);
+        // Shorthand: is (seq, batch_id) retriable after an OutOfOrderSequence error?
+        let oos_retriable = |seq: i32, batch_id: i64| {
+            mgr.can_retry_for_error(&b0, seq, batch_id, FlussError::OutOfOrderSequenceException)
+        };
+
+        // Before a writer id is set, nothing is retriable
+        assert!(!oos_retriable(0, 100));
+
+        mgr.set_writer_id(42);
+        mgr.add_in_flight_batch(&b0, 0, 100);
+        mgr.add_in_flight_batch(&b0, 1, 101);
+
+        // seq=0 is the next expected sequence (last_acked=-1, -1+1=0):
+        // a genuine violation, so NOT retriable
+        assert!(!oos_retriable(0, 100));
+        // seq=1 is not the next expected sequence → retriable
+        assert!(oos_retriable(1, 101));
+    }
+
+    #[test]
+    fn test_can_retry_after_sequence_reset() {
+        // Case 1 — OOS: a batch whose adjusted sequence equals last_acked+1 is
+        // still retriable because it carries the reset marker.
+        {
+            let (mgr, b0) = setup_three_in_flight();
+            mgr.handle_completed_batch(&b0, 100, 42); // last_acked=0
+            mgr.handle_failed_batch(&b0, 101, 42, None, true); // batch_id=102 adjusted to seq=1
+
+            // seq=1 == last_acked(0)+1, but batch was reset → retriable
+            let retriable =
+                mgr.can_retry_for_error(&b0, 1, 102, FlussError::OutOfOrderSequenceException);
+            assert!(retriable);
+        }
+
+        // Case 2 — UnknownWriterId: only a reset batch is retriable.
+        {
+            let (mgr, b0) = setup_three_in_flight();
+            let before_reset =
+                mgr.can_retry_for_error(&b0, 0, 100, FlussError::UnknownWriterIdException);
+            assert!(!before_reset);
+
+            mgr.handle_failed_batch(&b0, 101, 42, None, true); // batch_id=102 is reset
+            let after_reset =
+                mgr.can_retry_for_error(&b0, 1, 102, FlussError::UnknownWriterIdException);
+            assert!(after_reset);
+        }
+    }
+
+    /// Checks that `maybe_update_writer_id` leaves a bucket entry alone while
+    /// it still has in-flight batches, and switches the entry to the new
+    /// writer id (with sequence state back at its initial values) once the
+    /// in-flight batch has completed.
+    #[test]
+    fn test_maybe_update_writer_id() {
+        let mgr = IdempotenceManager::new(true, 5);
+        mgr.set_writer_id(42);
+        let b0 = test_bucket(0);
+
+        mgr.maybe_update_writer_id(&b0);
+        let seq = mgr.next_sequence_and_increment(&b0);
+        mgr.add_in_flight_batch(&b0, seq, 100);
+
+        // With in-flight batches: rotation is deferred
+        mgr.set_writer_id(99);
+        mgr.maybe_update_writer_id(&b0);
+        {
+            let entries = mgr.bucket_entries.lock();
+            let entry = entries.get(&b0).unwrap();
+            assert_eq!(entry.writer_id, 42); // unchanged
+            assert_eq!(entry.next_sequence, 1);
+        }
+
+        // Complete must use the writer_id that was active when batch was sent
+        // NOTE(review): the call below passes 99 (the id set just above), while
+        // the bucket entry still holds 42 at this point — confirm which id
+        // `handle_completed_batch` compares against and reword if needed.
+        mgr.handle_completed_batch(&b0, 100, 99);
+        mgr.maybe_update_writer_id(&b0);
+        {
+            let entries = mgr.bucket_entries.lock();
+            let entry = entries.get(&b0).unwrap();
+            assert_eq!(entry.writer_id, 99);
+            assert_eq!(entry.next_sequence, 0);
+            assert_eq!(entry.last_acked_sequence, -1);
+        }
+    }
+
+    #[test]
+    fn test_is_first_in_flight_batch() {
+        let (mgr, b0) = setup_three_in_flight();
+
+        // Initially batch 100 is first, batch 101 is not
+        for (batch_id, expect_first) in [(100_i64, true), (101, false)] {
+            assert_eq!(mgr.is_first_in_flight_batch(&b0, batch_id), expect_first);
+        }
+
+        // After adjustment + completion, batch_id still identifies first correctly
+        mgr.handle_failed_batch(&b0, 101, 42, None, true);
+        mgr.handle_completed_batch(&b0, 100, 42);
+        for (batch_id, expect_first) in [(102_i64, true), (100, false)] {
+            assert_eq!(mgr.is_first_in_flight_batch(&b0, batch_id), expect_first);
+        }
+    }
+
+    #[test]
+    fn test_can_send_more_requests() {
+        // Manager constructed with 2 as its second argument; the third
+        // in-flight slot is refused below.
+        let mgr = IdempotenceManager::new(true, 2);
+        let b0 = test_bucket(0);
+        let has_capacity = || mgr.can_send_more_requests(&b0);
+
+        assert!(has_capacity()); // nothing in flight
+
+        mgr.add_in_flight_batch(&b0, 0, 100);
+        assert!(has_capacity()); // one in flight
+
+        mgr.add_in_flight_batch(&b0, 1, 101);
+        assert!(!has_capacity()); // at limit
+
+        mgr.remove_in_flight_batch(&b0, 100);
+        assert!(has_capacity()); // under limit again
+    }
+
+    #[test]
+    fn test_is_already_committed() {
+        let mgr = IdempotenceManager::new(true, 5);
+        let b0 = test_bucket(0);
+        mgr.set_writer_id(42);
+        let committed = |seq: i32| mgr.is_already_committed(&b0, seq);
+
+        // Bucket has no entry yet → nothing counts as committed
+        assert!(!committed(0));
+
+        // Create the entry and acknowledge seq=0
+        let _ = mgr.next_sequence_and_increment(&b0); // 0
+        mgr.add_in_flight_batch(&b0, 0, 100);
+        mgr.handle_completed_batch(&b0, 100, 42); // last_acked=0
+
+        assert!(committed(0)); // 0 <= last_acked(0)
+        assert!(!committed(1)); // 1 > last_acked(0)
+
+        // Acknowledge seq 1..=4 with batch ids 101..=104
+        for seq in 1..=4_i32 {
+            let batch_id = 100 + i64::from(seq);
+            let _ = mgr.next_sequence_and_increment(&b0);
+            mgr.add_in_flight_batch(&b0, seq, batch_id);
+            mgr.handle_completed_batch(&b0, batch_id, 42);
+        }
+        assert!(committed(0)); // seq=0 <= last_acked(4)
+        assert!(committed(4)); // seq=4 <= last_acked(4)
+        assert!(!committed(5)); // seq=5 > last_acked(4)
+    }
+
+    #[test]
+    fn test_reset_batch_ids_cleaned_on_complete() {
+        let (mgr, b0) = setup_three_in_flight();
+        // True when `batch_id` is currently listed in the bucket's reset set.
+        let is_marked_reset = |batch_id: i64| {
+            mgr.bucket_entries
+                .lock()
+                .get(&b0)
+                .unwrap()
+                .reset_batch_ids
+                .contains(&batch_id)
+        };
+
+        // Fail batch_id=100 → batch_id=101 and 102 marked as reset
+        mgr.handle_failed_batch(&b0, 100, 42, None, true);
+        assert!(is_marked_reset(101));
+        assert!(is_marked_reset(102));
+
+        // Complete batch_id=101 → only that id leaves the reset set
+        mgr.handle_completed_batch(&b0, 101, 42);
+        assert!(!is_marked_reset(101));
+        assert!(is_marked_reset(102)); // still there
+    }
+
+    #[test]
+    fn test_get_adjusted_sequence() {
+        let (mgr, b0) = setup_three_in_flight();
+        let adjusted = |batch_id: i64| mgr.get_adjusted_sequence(&b0, batch_id);
+
+        // A bucket that has no entry yields None
+        let unknown_bucket = test_bucket(9);
+        assert_eq!(mgr.get_adjusted_sequence(&unknown_bucket, 100), None);
+
+        // Before adjustment: original sequence for a known id, None for an unknown id
+        assert_eq!(adjusted(101), Some(1));
+        assert_eq!(adjusted(999), None);
+
+        // After adjustment: the failed batch is gone, the rest moved down by one
+        mgr.handle_failed_batch(&b0, 100, 42, None, true);
+        assert_eq!(adjusted(100), None); // removed
+        assert_eq!(adjusted(101), Some(0)); // was 1
+        assert_eq!(adjusted(102), Some(1)); // was 2
+
+        // Reset flag survives get_adjusted_sequence (unlike the old take_ variant).
+        // This matches Java where `reopened` persists across retries.
+        let guard = mgr.bucket_entries.lock();
+        let state = guard.get(&b0).unwrap();
+        for batch_id in [101_i64, 102] {
+            assert!(state.reset_batch_ids.contains(&batch_id));
+        }
+    }
+
+    // --- Scenario tests ---
+    // Simulate Sender-level orchestration on IdempotenceManager.
+    // Each test mirrors a Java SenderTest integration test, exercising the same
+    // state transitions that Sender.handle_write_batch_error / complete_batch make.
+    //
+    // Convention: retriable failures make NO IdempotenceManager call (batch stays
+    // in-flight, Sender re-enqueues via accumulator). Non-retriable failures call
+    // handle_failed_batch. Successes call handle_completed_batch.
+
+    /// Three in-flight batches: the later two are retriable on OOS, only the
+    /// head of the in-flight list counts as first, and completing them one by
+    /// one advances `last_acked_sequence` 0 → 1 → 2.
+    #[test]
+    fn scenario_multiple_inflight_retried_in_order() {
+        // Java: testIdempotenceWithMultipleInflightBatchesRetriedInOrder
+        // 3 batches in-flight, batch 0 times out, batches 1+2 get OOS.
+        // All are retriable and must be retried one-at-a-time in sequence order.
+        let (mgr, b0) = setup_three_in_flight();
+
+        // Batch 0 (seq=0) times out → retriable, stays in in-flight
+        // Batch 1 (seq=1) OOS → retriable (not next expected seq)
+        assert!(mgr.can_retry_for_error(&b0, 1, 101, FlussError::OutOfOrderSequenceException));
+        // Batch 2 (seq=2) OOS → retriable
+        assert!(mgr.can_retry_for_error(&b0, 2, 102, FlussError::OutOfOrderSequenceException));
+
+        // Retry phase: only first-in-flight batch should be drained
+        assert!(mgr.is_first_in_flight_batch(&b0, 100));
+        assert!(!mgr.is_first_in_flight_batch(&b0, 101));
+
+        // Retry batch 0 succeeds → last_acked=0
+        mgr.handle_completed_batch(&b0, 100, 42);
+        assert_eq!(last_acked(&mgr, &b0), 0);
+
+        // Batch 1 is now first, retry succeeds → last_acked=1
+        assert!(mgr.is_first_in_flight_batch(&b0, 101));
+        mgr.handle_completed_batch(&b0, 101, 42);
+        assert_eq!(last_acked(&mgr, &b0), 1);
+
+        // Batch 2 is now first, retry succeeds → last_acked=2
+        assert!(mgr.is_first_in_flight_batch(&b0, 102));
+        mgr.handle_completed_batch(&b0, 102, 42);
+        assert_eq!(last_acked(&mgr, &b0), 2);
+    }
+
+    /// Two in-flight batches where the second one's OOS response is handled
+    /// first: it is retriable, and the batches then complete in sequence order.
+    #[test]
+    fn scenario_out_of_order_responses() {
+        // Java: testCorrectHandlingOfOutOfOrderResponses
+        // Server responds to batch 1 (OOS) before batch 0 (timeout).
+        // Both re-enqueued, retried in order.
+        let mgr = IdempotenceManager::new(true, 5);
+        mgr.set_writer_id(42);
+        let b0 = test_bucket(0);
+        let _ = mgr.next_sequence_and_increment(&b0);
+        let _ = mgr.next_sequence_and_increment(&b0);
+        mgr.add_in_flight_batch(&b0, 0, 100);
+        mgr.add_in_flight_batch(&b0, 1, 101);
+
+        // Batch 1 response arrives first: OOS → retriable (seq 1 ≠ next expected 0)
+        assert!(mgr.can_retry_for_error(&b0, 1, 101, FlussError::OutOfOrderSequenceException));
+        // Batch 0 response: timeout → retriable (no IdempotenceManager call)
+
+        // Retry: batch 0 must go first
+        assert!(mgr.is_first_in_flight_batch(&b0, 100));
+        mgr.handle_completed_batch(&b0, 100, 42);
+        assert_eq!(last_acked(&mgr, &b0), 0);
+
+        // Then batch 1
+        assert!(mgr.is_first_in_flight_batch(&b0, 101));
+        mgr.handle_completed_batch(&b0, 101, 42);
+        assert_eq!(last_acked(&mgr, &b0), 1);
+    }
+
+    /// Completing the higher-sequence batch first must not let the later
+    /// completion of the lower-sequence batch lower `last_acked_sequence`; the
+    /// in-flight list ends up empty.
+    #[test]
+    fn scenario_second_batch_succeeds_first() {
+        // Java: testCorrectHandlingOfOutOfOrderResponsesWhenSecondSucceeds
+        //       + testCorrectHandlingOfDuplicateSequenceError (same at this level)
+        // Batch 1 succeeds before batch 0. last_acked jumps ahead, then batch 0
+        // completes without regressing last_acked.
+        let mgr = IdempotenceManager::new(true, 5);
+        mgr.set_writer_id(42);
+        let b0 = test_bucket(0);
+        let _ = mgr.next_sequence_and_increment(&b0);
+        let _ = mgr.next_sequence_and_increment(&b0);
+        mgr.add_in_flight_batch(&b0, 0, 100);
+        mgr.add_in_flight_batch(&b0, 1, 101);
+
+        // Batch 1 succeeds first → last_acked jumps to 1
+        mgr.handle_completed_batch(&b0, 101, 42);
+        assert_eq!(last_acked(&mgr, &b0), 1);
+
+        // Batch 0 timeout → retriable → re-enqueued → retry succeeds
+        mgr.handle_completed_batch(&b0, 100, 42);
+        // last_acked stays 1 (0 < 1, higher wins)
+        assert_eq!(last_acked(&mgr, &b0), 1);
+        assert!(
+            mgr.bucket_entries
+                .lock()
+                .get(&b0)
+                .unwrap()
+                .in_flight
+                .is_empty()
+        );
+    }
+
+    /// A non-retriable `UnknownWriterIdException` passed to
+    /// `handle_failed_batch` clears the writer id and all bucket entries; after
+    /// a new writer id is set, the bucket's sequence starts again at 0.
+    #[test]
+    fn scenario_unknown_writer_id_resets_and_restarts() {
+        // Java: testRetryAfterResettingInFlightBatchSequence
+        // Batch 0 times out (retriable), batch 1 gets UnknownWriterId (non-retriable).
+        // UnknownWriterId resets all state. After new writer ID, sequences restart at 0.
+        let mgr = IdempotenceManager::new(true, 5);
+        mgr.set_writer_id(42);
+        let b0 = test_bucket(0);
+        let _ = mgr.next_sequence_and_increment(&b0);
+        let _ = mgr.next_sequence_and_increment(&b0);
+        mgr.add_in_flight_batch(&b0, 0, 100);
+        mgr.add_in_flight_batch(&b0, 1, 101);
+
+        // Batch 0 times out → retriable (stays in in-flight)
+        // Batch 1 UnknownWriterId → NOT retriable (non-reset batch)
+        assert!(!mgr.can_retry_for_error(&b0, 1, 101, FlussError::UnknownWriterIdException));
+
+        // Sender calls fail_batch → handle_failed_batch with error → full reset
+        mgr.handle_failed_batch(
+            &b0,
+            101,
+            42,
+            Some(FlussError::UnknownWriterIdException),
+            true,
+        );
+        assert!(!mgr.has_writer_id());
+        assert!(mgr.bucket_entries.lock().is_empty());
+
+        // New writer ID allocated, sequences restart at 0
+        mgr.set_writer_id(99);
+        assert_eq!(mgr.next_sequence_and_increment(&b0), 0);
+    }
+
+    /// Test helper: reads `last_acked_sequence` for `bucket`. Panics if the
+    /// bucket has no entry.
+    fn last_acked(mgr: &IdempotenceManager, bucket: &TableBucket) -> i32 {
+        let guard = mgr.bucket_entries.lock();
+        let state = guard.get(bucket).unwrap();
+        state.last_acked_sequence
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/client/write/mod.rs b/fluss-rust/crates/fluss/src/client/write/mod.rs
new file mode 100644
index 0000000000..a65b5d5af1
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/client/write/mod.rs
@@ -0,0 +1,265 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+mod accumulator;
+mod batch;
+mod dynamic_batch_size;
+mod idempotence;
+
+use crate::client::broadcast::{self as client_broadcast, BatchWriteResult, BroadcastOnceReceiver};
+use crate::error::Error;
+use crate::metadata::{PhysicalTablePath, TableInfo};
+
+use crate::row::InternalRow;
+pub use accumulator::*;
+use arrow::array::RecordBatch;
+use bytes::Bytes;
+use std::future::Future;
+use std::pin::Pin;
+use std::sync::Arc;
+use std::task::{Context, Poll};
+
+pub(crate) mod broadcast;
+mod bucket_assigner;
+
+mod sender;
+mod write_format;
+mod writer_client;
+
+pub(crate) use idempotence::IdempotenceManager;
+pub use write_format::WriteFormat;
+pub(crate) use writer_client::WriterClient;
+
+/// A single record to be written, together with the table, schema and format
+/// information that accompanies it. Built through `for_append`,
+/// `for_append_record_batch` or `for_upsert`.
+#[allow(dead_code)]
+pub struct WriteRecord<'a> {
+    /// Payload: either a log record or a KV record.
+    record: Record<'a>,
+    /// Shared physical table path this record targets.
+    physical_table_path: Arc<PhysicalTablePath>,
+    /// `None` for the append constructors; caller-supplied for `for_upsert`.
+    bucket_key: Option<Bytes>,
+    /// Schema id supplied by the caller of the constructor.
+    schema_id: i32,
+    /// `WriteFormat::ArrowLog` for the append constructors; caller-supplied
+    /// for `for_upsert`.
+    write_format: WriteFormat,
+    /// Shared table metadata supplied by the caller of the constructor.
+    table_info: Arc<TableInfo>,
+}
+
+impl<'a> WriteRecord<'a> {
+    /// Borrows the payload (log or KV) carried by this write.
+    pub fn record(&self) -> &Record<'a> {
+        &self.record
+    }
+
+    /// Borrows the shared physical table path this record targets.
+    pub fn physical_table_path(&self) -> &Arc<PhysicalTablePath> {
+        &self.physical_table_path
+    }
+
+    /// Minimum batch capacity needed to hold this record, batch header
+    /// overhead included. It is used to size memory reservations and KV write
+    /// limits so that oversized records don't panic on append.
+    ///
+    /// KV records: `RECORD_BATCH_HEADER_SIZE` plus `KvRecord::size_of` for the
+    /// key and optional row bytes. Log records: always 0.
+    pub fn estimated_record_size(&self) -> usize {
+        match &self.record {
+            // Arrow batches use record count, not byte size
+            Record::Log(_) => 0,
+            Record::Kv(kv) => {
+                let payload_size = crate::record::kv::KvRecord::size_of(&kv.key, kv.row_bytes());
+                crate::record::kv::RECORD_BATCH_HEADER_SIZE + payload_size
+            }
+        }
+    }
+}
+
+/// Payload of a `WriteRecord`: either a log record or a KV record.
+pub enum Record<'a> {
+    /// Log payload; see `LogWriteRecord`.
+    Log(LogWriteRecord<'a>),
+    /// KV payload; see `KvWriteRecord`.
+    Kv(KvWriteRecord<'a>),
+}
+
+/// Log payload: a single borrowed row or a whole Arrow `RecordBatch`.
+pub enum LogWriteRecord<'a> {
+    /// One row, borrowed for `'a`.
+    InternalRow(&'a dyn InternalRow),
+    /// A reference-counted Arrow record batch.
+    RecordBatch(Arc<RecordBatch>),
+}
+
+/// Encoded row bytes, either borrowed for `'a` or owned as `Bytes`.
+/// `as_slice` gives uniform access to both forms.
+#[derive(Clone)]
+pub enum RowBytes<'a> {
+    /// Bytes borrowed from the caller.
+    Borrowed(&'a [u8]),
+    /// Bytes owned by this value.
+    Owned(Bytes),
+}
+
+impl<'a> RowBytes<'a> {
+    /// Returns the row bytes as a plain slice, whichever variant holds them.
+    pub fn as_slice(&self) -> &[u8] {
+        match *self {
+            Self::Borrowed(borrowed) => borrowed,
+            Self::Owned(ref owned) => &owned[..],
+        }
+    }
+}
+
+/// KV payload of a write: a key, optional target columns and optional row
+/// bytes.
+pub struct KvWriteRecord<'a> {
+    /// Encoded key bytes.
+    key: Bytes,
+    /// Optional shared list of column indexes.
+    /// NOTE(review): presumably the columns a partial update applies to — confirm.
+    target_columns: Option<Arc<Vec<usize>>>,
+    /// Optional encoded row value.
+    /// NOTE(review): `None` presumably denotes a delete — confirm against the writer.
+    row_bytes: Option<RowBytes<'a>>,
+}
+
+impl<'a> KvWriteRecord<'a> {
+    /// Assembles a KV record from its three parts without any validation.
+    fn new(
+        key: Bytes,
+        target_columns: Option<Arc<Vec<usize>>>,
+        row_bytes: Option<RowBytes<'a>>,
+    ) -> Self {
+        Self {
+            key,
+            target_columns,
+            row_bytes,
+        }
+    }
+
+    /// Returns the row bytes as a slice, or `None` when the record carries no
+    /// row bytes.
+    pub fn row_bytes(&self) -> Option<&[u8]> {
+        self.row_bytes.as_ref().map(RowBytes::as_slice)
+    }
+}
+
+impl<'a> WriteRecord<'a> {
+    /// Builds a log-append record for a single borrowed row. The record has no
+    /// bucket key and uses `WriteFormat::ArrowLog`.
+    pub fn for_append(
+        table_info: Arc<TableInfo>,
+        physical_table_path: Arc<PhysicalTablePath>,
+        schema_id: i32,
+        row: &'a dyn InternalRow,
+    ) -> Self {
+        let record = Record::Log(LogWriteRecord::InternalRow(row));
+        Self {
+            record,
+            physical_table_path,
+            bucket_key: None,
+            schema_id,
+            write_format: WriteFormat::ArrowLog,
+            table_info,
+        }
+    }
+
+    /// Builds a log-append record for a whole Arrow `RecordBatch` (passed in
+    /// as `row` and wrapped in an `Arc`). The record has no bucket key and
+    /// uses `WriteFormat::ArrowLog`.
+    pub fn for_append_record_batch(
+        table_info: Arc<TableInfo>,
+        physical_table_path: Arc<PhysicalTablePath>,
+        schema_id: i32,
+        row: RecordBatch,
+    ) -> Self {
+        let record = Record::Log(LogWriteRecord::RecordBatch(Arc::new(row)));
+        Self {
+            record,
+            physical_table_path,
+            bucket_key: None,
+            schema_id,
+            write_format: WriteFormat::ArrowLog,
+            table_info,
+        }
+    }
+
+    /// Builds a KV record from `key`, `target_columns` and `row_bytes`; the
+    /// bucket key and write format are taken from the caller as given.
+    #[allow(clippy::too_many_arguments)]
+    pub fn for_upsert(
+        table_info: Arc<TableInfo>,
+        physical_table_path: Arc<PhysicalTablePath>,
+        schema_id: i32,
+        key: Bytes,
+        bucket_key: Option<Bytes>,
+        write_format: WriteFormat,
+        target_columns: Option<Arc<Vec<usize>>>,
+        row_bytes: Option<RowBytes<'a>>,
+    ) -> Self {
+        let kv = KvWriteRecord::new(key, target_columns, row_bytes);
+        Self {
+            record: Record::Kv(kv),
+            physical_table_path,
+            bucket_key,
+            schema_id,
+            write_format,
+            table_info,
+        }
+    }
+}
+
+/// Cloneable handle onto the outcome of a write, backed by a
+/// `BroadcastOnceReceiver<BatchWriteResult>`.
+#[derive(Debug, Clone)]
+pub struct ResultHandle {
+    /// Receiver that `wait` awaits and `fail` force-completes.
+    receiver: BroadcastOnceReceiver<BatchWriteResult>,
+}
+
+impl ResultHandle {
+    /// Wraps an existing receiver.
+    pub fn new(receiver: BroadcastOnceReceiver<BatchWriteResult>) -> Self {
+        Self { receiver }
+    }
+
+    /// Force-complete with an error if not already completed.
+    pub(crate) fn fail(&self, error: client_broadcast::Error) {
+        self.receiver.fail(error);
+    }
+
+    /// Awaits the broadcast result. A receive failure is reported as
+    /// `Error::UnexpectedError` whose message embeds the debug form of the
+    /// underlying error.
+    pub async fn wait(&self) -> Result<BatchWriteResult, Error> {
+        match self.receiver.receive().await {
+            Ok(outcome) => Ok(outcome),
+            Err(e) => Err(Error::UnexpectedError {
+                message: format!("Fail to wait write result {e:?}"),
+                source: None,
+            }),
+        }
+    }
+
+    /// Converts a received `BatchWriteResult` into the crate-level result:
+    /// `WriteFailed` becomes `Error::FlussAPIError` carrying the same code and
+    /// message; `Client` and `Dropped` become `Error::UnexpectedError`.
+    pub fn result(&self, batch_result: BatchWriteResult) -> Result<(), Error> {
+        let failure = match batch_result {
+            Ok(done) => return Ok(done),
+            Err(e) => e,
+        };
+        Err(match failure {
+            client_broadcast::Error::WriteFailed { code, message } => Error::FlussAPIError {
+                api_error: crate::rpc::ApiError { code, message },
+            },
+            client_broadcast::Error::Client { message } => Error::UnexpectedError {
+                message,
+                source: None,
+            },
+            client_broadcast::Error::Dropped => Error::UnexpectedError {
+                message: "Fail to get write result because broadcast was dropped.".to_string(),
+                source: None,
+            },
+        })
+    }
+}
+
+/// A future that represents a pending write operation.
+///
+/// This type implements [`Future`], allowing users to either:
+/// 1. Await immediately to block on acknowledgment: `writer.upsert(&row)?.await?`
+/// 2. Fire-and-forget with later flush: `writer.upsert(&row)?; writer.flush().await?`
+///
+/// This pattern is similar to rdkafka's `DeliveryFuture` and allows for efficient batching
+/// when users don't need immediate per-record acknowledgment.
+pub struct WriteResultFuture {
+    /// Boxed, pinned, `Send` future that `poll` delegates to; it resolves to
+    /// the write's final `Result<(), Error>`.
+    inner: Pin<Box<dyn Future<Output = Result<(), Error>> + Send>>,
+}
+
+impl std::fmt::Debug for WriteResultFuture {
+    /// Prints only the type name with a non-exhaustive marker; the boxed
+    /// inner future is not shown.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let mut builder = f.debug_struct("WriteResultFuture");
+        builder.finish_non_exhaustive()
+    }
+}
+
+impl WriteResultFuture {
+    /// Create a new WriteResultFuture from a ResultHandle.
+    ///
+    /// The future takes ownership of the handle, awaits `wait()` and then maps
+    /// the received value through `result()`.
+    pub fn new(result_handle: ResultHandle) -> Self {
+        let pending = async move {
+            let outcome = result_handle.wait().await?;
+            result_handle.result(outcome)
+        };
+        Self {
+            inner: Box::pin(pending),
+        }
+    }
+}
+
+impl Future for WriteResultFuture {
+    type Output = Result<(), Error>;
+
+    /// Forwards the poll to the boxed inner future.
+    fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
+        let this = self.get_mut();
+        this.inner.as_mut().poll(cx)
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/client/write/sender.rs b/fluss-rust/crates/fluss/src/client/write/sender.rs
new file mode 100644
index 0000000000..8e738d0dc5
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/client/write/sender.rs
@@ -0,0 +1,1398 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::client::broadcast;
+use crate::client::metadata::Metadata;
+use crate::client::write::IdempotenceManager;
+use crate::client::write::batch::WriteBatch;
+use crate::client::{ReadyWriteBatch, RecordAccumulator};
+use crate::error::Error::UnexpectedError;
+use crate::error::{FlussError, Result};
+use crate::metadata::{PhysicalTablePath, TableBucket, TablePath};
+use crate::proto::{
+    PbProduceLogRespForBucket, PbPutKvRespForBucket, PbTablePath, ProduceLogResponse, PutKvResponse,
+};
+use crate::record::{NO_BATCH_SEQUENCE, NO_WRITER_ID};
+use crate::rpc::ServerConnection;
+use crate::rpc::message::{InitWriterRequest, ProduceLogRequest, PutKvRequest};
+use crate::{PartitionId, TableId};
+use futures::StreamExt;
+use futures::stream::FuturesUnordered;
+use log::{debug, warn};
+use parking_lot::Mutex;
+use std::collections::{HashMap, HashSet};
+use std::future::Future;
+use std::pin::Pin;
+use std::sync::Arc;
+use std::sync::atomic::{AtomicBool, Ordering};
+use std::time::Duration;
+use tokio::sync::mpsc;
+
+/// Boxed, `Send` future for one in-flight send; resolves to `Result<()>` and
+/// may borrow for the lifetime `'a`.
+type SendFuture<'a> = Pin<Box<dyn Future<Output = Result<()>> + Send + 'a>>;
+
+/// Result of a synchronous drain: send futures, optional delay, and unknown leader tables.
+///
+/// The delay is in milliseconds and is only populated when no node was ready
+/// to receive data; in that case the list of send futures is empty.
+type DrainResult<'a> = (
+    Vec<SendFuture<'a>>,
+    Option<u64>,
+    HashSet<Arc<PhysicalTablePath>>,
+);
+
+/// Drains ready batches from the [`RecordAccumulator`], sends them to tablet
+/// servers as write requests, and completes, retries, or fails each batch
+/// according to the response.
+#[allow(dead_code)]
+pub struct Sender {
+    /// Initialised to `true` by [`Sender::new`].
+    running: AtomicBool,
+    /// Cluster metadata cache; also used to obtain server connections.
+    metadata: Arc<Metadata>,
+    /// Source of ready batches and destination of re-enqueued ones.
+    accumulator: Arc<RecordAccumulator>,
+    /// Ids of the batches currently in flight, grouped by table bucket.
+    in_flight_batches: Mutex<HashMap<TableBucket, Vec<i64>>>,
+    /// Size limit handed to the accumulator when draining.
+    max_request_size: i32,
+    /// Acknowledgement setting forwarded in every write request.
+    ack: i16,
+    /// Timeout in milliseconds forwarded in every write request.
+    max_request_timeout_ms: i32,
+    /// A batch whose attempt count has reached this value is not retried.
+    retries: i32,
+    /// Writer ID and batch-sequence bookkeeping for idempotent writes.
+    idempotence_manager: Arc<IdempotenceManager>,
+}
+
+impl Sender {
+    /// Build a sender over the shared metadata cache and accumulator. The
+    /// sender starts flagged as running, with no batches in flight.
+    pub fn new(
+        metadata: Arc<Metadata>,
+        accumulator: Arc<RecordAccumulator>,
+        max_request_size: i32,
+        max_request_timeout_ms: i32,
+        ack: i16,
+        retries: i32,
+        idempotence_manager: Arc<IdempotenceManager>,
+    ) -> Self {
+        let running = AtomicBool::new(true);
+        let in_flight_batches = Mutex::new(HashMap::new());
+        Self {
+            running,
+            metadata,
+            accumulator,
+            in_flight_batches,
+            max_request_size,
+            ack,
+            max_request_timeout_ms,
+            retries,
+            idempotence_manager,
+        }
+    }
+
+    /// Number of retries allowed after the first failed writer ID allocation.
+    const WRITER_ID_RETRY_TIMES: u32 = 3;
+    /// Base of the exponential backoff between writer ID allocation attempts:
+    /// the n-th retry waits `WRITER_ID_RETRY_INTERVAL_MS * 2^n` milliseconds.
+    const WRITER_ID_RETRY_INTERVAL_MS: u64 = 100;
+
+    /// Ensure a writer ID is available before batches are sent.
+    ///
+    /// Returns immediately when idempotence is disabled or a writer ID is
+    /// already set. Otherwise calls `try_init_writer_id` until it succeeds,
+    /// sleeping `WRITER_ID_RETRY_INTERVAL_MS * 2^n` ms before the n-th retry
+    /// (n starts at 1, so the waits are 200 ms, 400 ms, 800 ms).
+    ///
+    /// # Errors
+    /// Authorization errors are returned at once. Any other error is returned
+    /// once `WRITER_ID_RETRY_TIMES` retries have been used up.
+    async fn maybe_wait_for_writer_id(&self) -> Result<()> {
+        if !self.idempotence_manager.is_enabled() || self.idempotence_manager.has_writer_id() {
+            return Ok(());
+        }
+        // Counts the retries already scheduled, not the attempts made.
+        let mut retry_count = 0u32;
+        loop {
+            match self.try_init_writer_id().await {
+                Ok(()) => return Ok(()),
+                Err(e) => {
+                    // Authorization errors are not transient — fail immediately.
+                    if e.api_error() == Some(FlussError::AuthorizationException) {
+                        return Err(e);
+                    }
+                    // Retry budget exhausted: surface the most recent error.
+                    if retry_count >= Self::WRITER_ID_RETRY_TIMES {
+                        return Err(e);
+                    }
+                    // Stale metadata may be the cause; refresh it before retrying.
+                    // A failed refresh is only logged and does not stop the retry.
+                    if e.api_error().is_some_and(Self::is_invalid_metadata_error) {
+                        let physical_paths = self.accumulator.get_physical_table_paths_in_batches();
+                        let physical_refs: HashSet<&Arc<PhysicalTablePath>> =
+                            physical_paths.iter().collect();
+                        if let Err(meta_err) = self
+                            .metadata
+                            .update_tables_metadata(&HashSet::new(), &physical_refs, vec![])
+                            .await
+                        {
+                            warn!("Failed to refresh metadata after writer ID error: {meta_err}");
+                        }
+                    }
+                    // The counter is bumped before the delay is computed, so the
+                    // first backoff is already twice the base interval.
+                    retry_count += 1;
+                    let delay_ms = Self::WRITER_ID_RETRY_INTERVAL_MS * 2u64.pow(retry_count);
+                    warn!(
+                        "Failed to allocate writer ID (attempt {retry_count}/{}), retrying in {delay_ms}ms: {e}",
+                        Self::WRITER_ID_RETRY_TIMES,
+                    );
+                    tokio::time::sleep(Duration::from_millis(delay_ms)).await;
+                }
+            }
+        }
+    }
+
+    /// Request a writer ID from a server and store it in the idempotence manager.
+    ///
+    /// The request lists every distinct (database, table) pair that currently
+    /// has batches in the accumulator. When there are none, the call returns
+    /// `Ok(())` without contacting a server and without setting a writer ID.
+    ///
+    /// # Errors
+    /// Returns `UnexpectedError` when the cluster reports no available server,
+    /// and propagates connection and RPC failures.
+    async fn try_init_writer_id(&self) -> Result<()> {
+        // Deduplicate by (database, table) since multiple physical paths (partitions)
+        // may share the same table. Matches Java's Set<TablePath> dedup.
+        let mut seen = HashSet::new();
+        let table_paths: Vec<PbTablePath> = self
+            .accumulator
+            .get_physical_table_paths_in_batches()
+            .iter()
+            .filter_map(|path| {
+                let key = (
+                    path.get_database_name().to_string(),
+                    path.get_table_name().to_string(),
+                );
+                // `insert` returns false for a pair that was already emitted.
+                if seen.insert(key.clone()) {
+                    Some(PbTablePath {
+                        database_name: key.0,
+                        table_name: key.1,
+                    })
+                } else {
+                    None
+                }
+            })
+            .collect();
+        if table_paths.is_empty() {
+            debug!("No table paths in batches, skipping writer ID allocation");
+            return Ok(());
+        }
+        let cluster = self.metadata.get_cluster();
+        // Build the error lazily so the message is only allocated when no
+        // server is actually available.
+        let server = cluster
+            .get_one_available_server()
+            .ok_or_else(|| UnexpectedError {
+                message: "No tablet server available to allocate writer ID".to_string(),
+                source: None,
+            })?;
+        let connection = self.metadata.get_connection(server).await?;
+        let response = connection
+            .request(InitWriterRequest::new(table_paths))
+            .await?;
+        self.idempotence_manager.set_writer_id(response.writer_id);
+        debug!(
+            "Allocated writer ID {} for idempotent writes",
+            response.writer_id
+        );
+        Ok(())
+    }
+
+    fn maybe_abort_batches(&self, error: &crate::error::Error) {
+        if self.accumulator.has_incomplete() {
+            warn!("Aborting write batches due to fatal error: {error}");
+            self.accumulator.abort_batches(broadcast::Error::Client {
+                message: format!("Writer ID allocation failed: {error}"),
+            });
+        }
+    }
+
+    /// Runs writer ID initialisation, the drain, and the unknown-leader
+    /// metadata refresh one after another. Used by `run_once` (shutdown),
+    /// where blocking is acceptable.
+    ///
+    /// A writer ID failure aborts the pending batches and yields an empty
+    /// result rather than an error; a failed metadata refresh is only logged.
+    async fn prepare_sends(&self) -> Result<(Vec<SendFuture<'_>>, Option<u64>)> {
+        if let Err(e) = self.maybe_wait_for_writer_id().await {
+            warn!("Failed to allocate writer ID after retries: {e}");
+            self.maybe_abort_batches(&e);
+            return Ok((Vec::new(), None));
+        }
+
+        let (futures, delay, unknown_leaders) = self.drain_ready_sends()?;
+        if unknown_leaders.is_empty() {
+            return Ok((futures, delay));
+        }
+
+        if let Err(e) = self.refresh_unknown_leaders(&unknown_leaders).await {
+            warn!("Metadata refresh for unknown leaders failed: {e}");
+        }
+        Ok((futures, delay))
+    }
+
+    /// Synchronous drain step: ask the accumulator which nodes are ready
+    /// (`ready()`), pull their batches (`drain()`), and wrap one send future
+    /// per leader. Contains no `.await`, so it is safe on the hot path and
+    /// cannot starve `pending.next()`. The unknown-leader tables are handed
+    /// back so the caller can schedule a concurrent metadata refresh.
+    fn drain_ready_sends(&self) -> Result<DrainResult<'_>> {
+        let cluster = self.metadata.get_cluster();
+        let readiness = self.accumulator.ready(&cluster)?;
+        let unknown_leaders = readiness.unknown_leader_tables;
+
+        // Nothing is ready yet: report how long to wait before checking again.
+        if readiness.ready_nodes.is_empty() {
+            let delay_ms = readiness.next_ready_check_delay_ms as u64;
+            return Ok((Vec::new(), Some(delay_ms), unknown_leaders));
+        }
+
+        let drained = self.accumulator.drain(
+            cluster.clone(),
+            &readiness.ready_nodes,
+            self.max_request_size,
+        )?;
+        if drained.is_empty() {
+            return Ok((Vec::new(), None, unknown_leaders));
+        }
+
+        // Record the batches as in flight before any request future exists.
+        self.add_to_inflight_batches(&drained);
+        let mut sends = Vec::with_capacity(drained.len());
+        for (leader_id, leader_batches) in drained {
+            let send: SendFuture<'_> =
+                Box::pin(self.send_write_request(leader_id, self.ack, leader_batches));
+            sends.push(send);
+        }
+
+        Ok((sends, None, unknown_leaders))
+    }
+
+    /// Refresh metadata for buckets with unknown leaders. Meant to run as a
+    /// concurrent maintenance task so it never blocks the response-processing
+    /// hot path.
+    ///
+    /// Paths that carry a partition name are refreshed as physical paths; the
+    /// rest are refreshed by table path. A `PartitionNotExists` failure is
+    /// logged and tolerated; every other failure is returned.
+    async fn refresh_unknown_leaders(
+        &self,
+        unknown_leaders: &HashSet<Arc<PhysicalTablePath>>,
+    ) -> Result<()> {
+        let mut table_paths: HashSet<&TablePath> = HashSet::new();
+        let mut physical_table_paths: HashSet<&Arc<PhysicalTablePath>> = HashSet::new();
+
+        for path in unknown_leaders {
+            match path.get_partition_name() {
+                Some(_) => {
+                    physical_table_paths.insert(path);
+                }
+                None => {
+                    table_paths.insert(path.get_table_path());
+                }
+            }
+        }
+
+        let refreshed = self
+            .metadata
+            .update_tables_metadata(&table_paths, &physical_table_paths, vec![])
+            .await;
+        if let Err(e) = refreshed {
+            if e.api_error() != Some(FlussError::PartitionNotExists) {
+                return Err(e);
+            }
+            warn!("Partition does not exist during metadata update, continuing: {e}");
+        }
+
+        debug!("Updated metadata for unknown leader tables: {unknown_leaders:?}");
+        Ok(())
+    }
+
+    /// Blocking drain-and-send pass, used while draining at shutdown.
+    ///
+    /// When the drain reports a delay, sleeps for that long and returns.
+    /// Otherwise waits for every send to finish and returns the first error
+    /// among the results, if any.
+    async fn run_once(&self) -> Result<()> {
+        let (sends, wait_ms) = self.prepare_sends().await?;
+        match wait_ms {
+            Some(ms) => tokio::time::sleep(Duration::from_millis(ms)).await,
+            None => {
+                for outcome in futures::future::join_all(sends).await {
+                    outcome?;
+                }
+            }
+        }
+        Ok(())
+    }
+
+    /// Record the id of every drained batch under its table bucket in the
+    /// in-flight map. The lock is held for the whole pass.
+    fn add_to_inflight_batches(&self, batches: &HashMap<i32, Vec<ReadyWriteBatch>>) {
+        let mut guard = self.in_flight_batches.lock();
+        for ready in batches.values().flatten() {
+            let batch_id = ready.write_batch.batch_id();
+            guard
+                .entry(ready.table_bucket.clone())
+                .or_default()
+                .push(batch_id);
+        }
+    }
+
+    /// Send all `batches` destined for one server, one request per table.
+    ///
+    /// If the destination node is missing from the metadata cache, or a
+    /// connection cannot be obtained, every batch is routed through
+    /// `handle_batches_with_error` and the call returns `Ok(())`. A table whose
+    /// request cannot be built has its batches failed locally and is skipped.
+    /// Requests are sent and their responses handled one table at a time.
+    async fn send_write_request(
+        &self,
+        destination: i32,
+        acks: i16,
+        batches: Vec<ReadyWriteBatch>,
+    ) -> Result<()> {
+        if batches.is_empty() {
+            return Ok(());
+        }
+        // Batch lookup by bucket, plus the list of buckets belonging to each table.
+        let mut records_by_bucket = HashMap::new();
+        let mut write_batch_by_table: HashMap<TableId, Vec<TableBucket>> = HashMap::new();
+
+        for batch in batches {
+            let table_bucket = batch.table_bucket.clone();
+            write_batch_by_table
+                .entry(table_bucket.table_id())
+                .or_default()
+                .push(table_bucket.clone());
+            records_by_bucket.insert(table_bucket, batch);
+        }
+
+        let cluster = self.metadata.get_cluster();
+
+        let destination_node = match cluster.get_tablet_server(destination) {
+            Some(node) => node,
+            None => {
+                self.handle_batches_with_error(
+                    records_by_bucket.into_values().collect(),
+                    FlussError::LeaderNotAvailableException,
+                    format!("Destination node not found in metadata cache {destination}."),
+                )
+                .await?;
+                return Ok(());
+            }
+        };
+        let connection = match self.metadata.get_connection(destination_node).await {
+            Ok(connection) => connection,
+            Err(e) => {
+                self.handle_batches_with_error(
+                    records_by_bucket.into_values().collect(),
+                    FlussError::NetworkException,
+                    format!("Failed to connect destination node {destination}: {e}"),
+                )
+                .await?;
+                return Ok(());
+            }
+        };
+
+        for (table_id, table_buckets) in write_batch_by_table {
+            // Take this table's batches out of the map so the request builder
+            // can borrow them mutably.
+            let mut request_batches: Vec<ReadyWriteBatch> = table_buckets
+                .iter()
+                .filter_map(|bucket| records_by_bucket.remove(bucket))
+                .collect();
+
+            if request_batches.is_empty() {
+                continue;
+            }
+
+            let write_request = match Self::build_write_request(
+                table_id,
+                acks,
+                self.max_request_timeout_ms,
+                &mut request_batches,
+            ) {
+                Ok(req) => req,
+                Err(e) => {
+                    self.handle_batches_with_local_error(
+                        request_batches,
+                        format!("Failed to build write request: {e}"),
+                    )?;
+                    continue;
+                }
+            };
+
+            // Put the batches back into records_by_bucket,
+            // since the response handler looks them up there.
+            for request_batch in request_batches {
+                records_by_bucket.insert(request_batch.table_bucket.clone(), request_batch);
+            }
+
+            self.send_and_handle_response(
+                &connection,
+                write_request,
+                table_id,
+                &table_buckets,
+                &mut records_by_bucket,
+            )
+            .await?;
+        }
+
+        Ok(())
+    }
+
+    /// Build the RPC request for a group of batches that target one table.
+    ///
+    /// The first batch decides the request kind: an Arrow log batch produces a
+    /// `ProduceLogRequest`, a KV batch produces a `PutKvRequest`. For KV, every
+    /// remaining batch must also be a KV batch with the same target columns.
+    ///
+    /// # Errors
+    /// Returns `UnexpectedError` when `request_batches` is empty or when the
+    /// KV consistency checks fail, and propagates any error from the request
+    /// constructors.
+    fn build_write_request(
+        table_id: i64,
+        acks: i16,
+        timeout_ms: i32,
+        request_batches: &mut [ReadyWriteBatch],
+    ) -> Result<WriteRequest> {
+        // An empty slice is a caller bug; report it instead of panicking.
+        let Some(first) = request_batches.first() else {
+            return Err(UnexpectedError {
+                message: "Cannot build a write request from an empty batch list".to_string(),
+                source: None,
+            });
+        };
+
+        let request = match &first.write_batch {
+            WriteBatch::ArrowLog(_) => {
+                let req = ProduceLogRequest::new(table_id, acks, timeout_ms, request_batches)?;
+                WriteRequest::ProduceLog(req)
+            }
+            WriteBatch::Kv(kv_write_batch) => {
+                let target_columns = kv_write_batch.target_columns();
+                // The first batch set the reference columns; check the others against it.
+                for batch in request_batches.iter().skip(1) {
+                    match &batch.write_batch {
+                        WriteBatch::ArrowLog(_) => {
+                            return Err(UnexpectedError {
+                                message: "Expecting KvWriteBatch but found ArrowLogWriteBatch"
+                                    .to_string(),
+                                source: None,
+                            });
+                        }
+                        WriteBatch::Kv(kvb) => {
+                            if target_columns != kvb.target_columns() {
+                                return Err(UnexpectedError {
+                                    message: format!(
+                                        "All the write batches to make put kv request should have the same target columns, but got {:?} and {:?}.",
+                                        target_columns,
+                                        kvb.target_columns()
+                                    ),
+                                    source: None,
+                                });
+                            }
+                        }
+                    }
+                }
+                // Absent target columns become an empty column list.
+                let cols = target_columns
+                    .map(|arc| arc.iter().map(|&c| c as i32).collect())
+                    .unwrap_or_default();
+                let req = PutKvRequest::new(table_id, acks, timeout_ms, cols, request_batches)?;
+                WriteRequest::PutKv(req)
+            }
+        };
+
+        Ok(request)
+    }
+
+    /// Send one write request over `connection` and process the outcome.
+    ///
+    /// A successful RPC is handed to `handle_write_response`. A failed RPC
+    /// removes this request's batches from `records_by_bucket` and routes them
+    /// through `handle_batches_with_error` as a `NetworkException`.
+    async fn send_and_handle_response(
+        &self,
+        connection: &ServerConnection,
+        write_request: WriteRequest,
+        table_id: i64,
+        table_buckets: &[TableBucket],
+        records_by_bucket: &mut HashMap<TableBucket, ReadyWriteBatch>,
+    ) -> Result<()> {
+        // The two request variants carry different types, so the shared
+        // send-and-dispatch logic is written once as a local macro.
+        macro_rules! send {
+            ($request:expr) => {
+                match connection.request($request).await {
+                    Ok(response) => {
+                        self.handle_write_response(
+                            table_id,
+                            table_buckets,
+                            records_by_bucket,
+                            response,
+                        )
+                        .await
+                    }
+                    Err(e) => {
+                        self.handle_batches_with_error(
+                            table_buckets
+                                .iter()
+                                .filter_map(|b| records_by_bucket.remove(b))
+                                .collect(),
+                            FlussError::NetworkException,
+                            format!("Failed to send write request: {e}"),
+                        )
+                        .await
+                    }
+                }
+            };
+        }
+
+        match write_request {
+            WriteRequest::ProduceLog(req) => send!(req),
+            WriteRequest::PutKv(req) => send!(req),
+        }
+    }
+
+    /// Process the per-bucket results of one write response.
+    ///
+    /// Every bucket entry in the response is matched with its batch in
+    /// `records_by_bucket`: an error code other than `FlussError::None` goes
+    /// through `handle_write_batch_error`, anything else completes the batch.
+    /// An entry for a bucket with no matching batch (not part of this request,
+    /// or reported twice) is logged and ignored. Buckets from
+    /// `request_buckets` that received no entry are handled as
+    /// `UnknownServerError`. Finally, metadata is refreshed for every table
+    /// that reported an invalid-metadata error.
+    async fn handle_write_response<R: WriteResponse>(
+        &self,
+        table_id: i64,
+        request_buckets: &[TableBucket],
+        records_by_bucket: &mut HashMap<TableBucket, ReadyWriteBatch>,
+        response: R,
+    ) -> Result<()> {
+        let mut invalid_metadata_tables: HashSet<TablePath> = HashSet::new();
+        let mut invalid_physical_table_paths: HashSet<Arc<PhysicalTablePath>> = HashSet::new();
+        // Buckets that were sent but have not been answered yet.
+        let mut pending_buckets: HashSet<TableBucket> = request_buckets.iter().cloned().collect();
+
+        for bucket_resp in response.buckets_resp() {
+            let tb = TableBucket::new_with_partition(
+                table_id,
+                bucket_resp.partition_id(),
+                bucket_resp.bucket_id(),
+            );
+            // The response content is controlled by the server. An unexpected
+            // or duplicated bucket entry must not take the sender task down,
+            // so it is skipped instead of panicking.
+            let Some(ready_batch) = records_by_bucket.remove(&tb) else {
+                warn!(
+                    "Ignoring response for table bucket {tb}: no matching in-flight batch in this request"
+                );
+                continue;
+            };
+            pending_buckets.remove(&tb);
+
+            match bucket_resp.error_code() {
+                Some(code) if code != FlussError::None.code() => {
+                    let error = FlussError::for_code(code);
+                    // Fall back to the generic message of the error code.
+                    let message = bucket_resp
+                        .error_message()
+                        .cloned()
+                        .unwrap_or_else(|| error.message().to_string());
+                    if let Some(physical_table_path) =
+                        self.handle_write_batch_error(ready_batch, error, message)?
+                    {
+                        invalid_metadata_tables
+                            .insert(physical_table_path.get_table_path().clone());
+                        invalid_physical_table_paths.insert(physical_table_path);
+                    }
+                }
+                _ => self.complete_batch(ready_batch),
+            }
+        }
+
+        // Whatever is still pending was never answered by the server.
+        for bucket in pending_buckets {
+            if let Some(ready_batch) = records_by_bucket.remove(&bucket) {
+                if let Some(physical_table_path) = self.handle_write_batch_error(
+                    ready_batch,
+                    FlussError::UnknownServerError,
+                    format!("Missing response for table bucket {bucket}"),
+                )? {
+                    invalid_metadata_tables.insert(physical_table_path.get_table_path().clone());
+                    invalid_physical_table_paths.insert(physical_table_path);
+                }
+            }
+        }
+
+        self.update_metadata_if_needed(invalid_metadata_tables, invalid_physical_table_paths)
+            .await;
+        Ok(())
+    }
+
+    // TODO: Java has a second overload `completeBatch(batch, bucket, logEndOffset)` used for
+    // KV responses. When callers need write offset info, change BatchWriteResult to carry
+    // optional offset metadata and plumb it through BroadcastOnce → ResultHandle → WriteResultFuture.
+    /// Mark a batch as successfully written. With idempotence enabled and a
+    /// real batch sequence, the idempotence manager is told about the
+    /// completion before the batch is finished.
+    fn complete_batch(&self, ready_write_batch: ReadyWriteBatch) {
+        let batch = &ready_write_batch.write_batch;
+        let tracks_sequence = self.idempotence_manager.is_enabled()
+            && batch.batch_sequence() != NO_BATCH_SEQUENCE;
+        if tracks_sequence {
+            self.idempotence_manager.handle_completed_batch(
+                &ready_write_batch.table_bucket,
+                batch.batch_id(),
+                batch.writer_id(),
+            );
+        }
+        self.finish_batch(ready_write_batch, Ok(()));
+    }
+
+    fn fail_batch(
+        &self,
+        ready_write_batch: ReadyWriteBatch,
+        error: broadcast::Error,
+        fluss_error: Option<FlussError>,
+        adjust_sequences: bool,
+    ) {
+        if self.idempotence_manager.is_enabled()
+            && ready_write_batch.write_batch.batch_sequence() != NO_BATCH_SEQUENCE
+        {
+            self.idempotence_manager.handle_failed_batch(
+                &ready_write_batch.table_bucket,
+                ready_write_batch.write_batch.batch_id(),
+                ready_write_batch.write_batch.writer_id(),
+                fluss_error,
+                adjust_sequences,
+            );
+        }
+        self.finish_batch(ready_write_batch, Err(error));
+    }
+
+    /// Deliver `result` to the batch. Only when `complete` reports `true` is
+    /// the batch dropped from the in-flight map and from the accumulator's
+    /// incomplete set.
+    fn finish_batch(&self, ready_write_batch: ReadyWriteBatch, result: broadcast::Result<()>) {
+        if !ready_write_batch.write_batch.complete(result) {
+            return;
+        }
+        self.remove_from_inflight_batches(&ready_write_batch);
+        let batch_id = ready_write_batch.write_batch.batch_id();
+        self.accumulator.remove_incomplete_batches(batch_id);
+    }
+
+    /// Run every batch through `handle_write_batch_error` with the same
+    /// `error` and `message`, then refresh metadata for the tables that the
+    /// handler reported back.
+    async fn handle_batches_with_error(
+        &self,
+        batches: Vec<ReadyWriteBatch>,
+        error: FlussError,
+        message: String,
+    ) -> Result<()> {
+        let mut stale_tables: HashSet<TablePath> = HashSet::new();
+        let mut stale_physical_paths: HashSet<Arc<PhysicalTablePath>> = HashSet::new();
+
+        for batch in batches {
+            let outcome = self.handle_write_batch_error(batch, error, message.clone())?;
+            let Some(path) = outcome else {
+                continue;
+            };
+            stale_tables.insert(path.get_table_path().clone());
+            stale_physical_paths.insert(path);
+        }
+
+        self.update_metadata_if_needed(stale_tables, stale_physical_paths)
+            .await;
+        Ok(())
+    }
+
+    fn handle_batches_with_local_error(
+        &self,
+        batches: Vec<ReadyWriteBatch>,
+        message: String,
+    ) -> Result<()> {
+        for batch in batches {
+            // Local errors (e.g. build failure) — server never saw the batch,
+            // so it's always safe to adjust sequences.
+            self.fail_batch(
+                batch,
+                broadcast::Error::Client {
+                    message: message.clone(),
+                },
+                None,
+                true,
+            );
+        }
+        Ok(())
+    }
+
+    /// Decide what happens to a batch that came back with `error`.
+    ///
+    /// In order:
+    /// 1. `DuplicateSequenceException` is treated as success.
+    /// 2. `OutOfOrderSequenceException` for a sequence the idempotence manager
+    ///    reports as already committed is treated as success.
+    /// 3. If `can_retry` allows it, the batch is re-enqueued, unless the
+    ///    writer ID changed since the batch was sent, in which case it is failed.
+    /// 4. Otherwise the batch is failed with `error`.
+    ///
+    /// Returns the batch's physical table path when `error` is an
+    /// invalid-metadata error, so the caller can refresh metadata for it;
+    /// the two success cases always return `None`.
+    fn handle_write_batch_error(
+        &self,
+        ready_write_batch: ReadyWriteBatch,
+        error: FlussError,
+        message: String,
+    ) -> Result<Option<Arc<PhysicalTablePath>>> {
+        // Cloned up front because the batch is moved into whichever handler runs.
+        let physical_table_path = Arc::clone(ready_write_batch.write_batch.physical_table_path());
+
+        if error == FlussError::DuplicateSequenceException {
+            warn!(
+                "Duplicate sequence for {} on bucket {}: {message}",
+                physical_table_path.as_ref(),
+                ready_write_batch.table_bucket.bucket_id()
+            );
+            self.complete_batch(ready_write_batch);
+            return Ok(None);
+        }
+
+        if error == FlussError::OutOfOrderSequenceException
+            && self.idempotence_manager.is_enabled()
+            && self.idempotence_manager.is_already_committed(
+                &ready_write_batch.table_bucket,
+                ready_write_batch.write_batch.batch_sequence(),
+            )
+        {
+            warn!(
+                "Batch for {} on bucket {} with sequence {} received OutOfOrderSequenceException \
+                 but has already been committed. Treating as success due to lost response.",
+                physical_table_path.as_ref(),
+                ready_write_batch.table_bucket.bucket_id(),
+                ready_write_batch.write_batch.batch_sequence(),
+            );
+            self.complete_batch(ready_write_batch);
+            return Ok(None);
+        }
+
+        if self.can_retry(&ready_write_batch, error) {
+            // Logged before the writer ID check below, so this line also
+            // appears for batches that end up failed rather than retried.
+            warn!(
+                "Retrying write batch for {} on bucket {} after error {error:?}: {message}",
+                physical_table_path.as_ref(),
+                ready_write_batch.table_bucket.bucket_id()
+            );
+
+            // If idempotence is enabled, only retry if the current writer ID still matches
+            // the batch's writer ID. If the writer ID was reset (e.g., by another bucket's
+            // error), fail the batch instead of retrying with stale state.
+            if self.idempotence_manager.is_enabled() {
+                let batch_writer_id = ready_write_batch.write_batch.writer_id();
+                if batch_writer_id != NO_WRITER_ID
+                    && self.idempotence_manager.writer_id() != batch_writer_id
+                {
+                    warn!(
+                        "Writer ID changed from {} to {} since batch was sent, failing instead of retrying",
+                        batch_writer_id,
+                        self.idempotence_manager.writer_id()
+                    );
+                    self.fail_batch(
+                        ready_write_batch,
+                        broadcast::Error::WriteFailed {
+                            code: FlussError::UnknownWriterIdException.code(),
+                            message: format!(
+                                "Attempted to retry sending a batch but the writer id has changed from {} to {}. This batch will be dropped.",
+                                batch_writer_id,
+                                self.idempotence_manager.writer_id()
+                            ),
+                        },
+                        Some(FlussError::UnknownWriterIdException),
+                        false,
+                    );
+                    // The metadata hint is still based on the original error.
+                    return Ok(
+                        Self::is_invalid_metadata_error(error).then_some(physical_table_path)
+                    );
+                }
+            }
+
+            self.re_enqueue_batch(ready_write_batch);
+            return Ok(Self::is_invalid_metadata_error(error).then_some(physical_table_path));
+        }
+
+        // Generic error path. handle_failed_batch will detect remaining
+        // OutOfOrderSequence (not already committed) / UnknownWriterId cases and
+        // reset all writer state internally (matching Java).
+        // For other errors, only adjust sequences if the batch didn't exhaust its retries.
+        let can_adjust = ready_write_batch.write_batch.attempts() < self.retries;
+        self.fail_batch(
+            ready_write_batch,
+            broadcast::Error::WriteFailed {
+                code: error.code(),
+                message,
+            },
+            Some(error),
+            can_adjust,
+        );
+        Ok(Self::is_invalid_metadata_error(error).then_some(physical_table_path))
+    }
+
+    /// Hand a batch back to the accumulator for another attempt, after
+    /// removing it from the in-flight map.
+    fn re_enqueue_batch(&self, ready_write_batch: ReadyWriteBatch) {
+        self.remove_from_inflight_batches(&ready_write_batch);
+        // TODO: add retry metrics (Java: writerMetricGroup.recordsRetryTotal().inc(recordCount))
+        self.accumulator.re_enqueue(ready_write_batch);
+    }
+
+    /// Drop the batch's id from the in-flight list of its bucket, and drop the
+    /// bucket entry itself once that list is empty. A bucket that has no
+    /// entry is left alone.
+    fn remove_from_inflight_batches(&self, ready_write_batch: &ReadyWriteBatch) {
+        let batch_id = ready_write_batch.write_batch.batch_id();
+        let bucket = &ready_write_batch.table_bucket;
+        let mut guard = self.in_flight_batches.lock();
+        let Some(ids) = guard.get_mut(bucket) else {
+            return;
+        };
+        ids.retain(|id| *id != batch_id);
+        if ids.is_empty() {
+            guard.remove(bucket);
+        }
+    }
+
+    fn can_retry(&self, ready_write_batch: &ReadyWriteBatch, error: FlussError) -> bool {
+        if ready_write_batch.write_batch.attempts() >= self.retries
+            || ready_write_batch.write_batch.is_done()
+        {
+            return false;
+        }
+        if Self::is_retriable_error(error) {
+            return true;
+        }
+        // Idempotent-specific retry logic
+        let seq = ready_write_batch.write_batch.batch_sequence();
+        if self.idempotence_manager.is_enabled() && seq != NO_BATCH_SEQUENCE {
+            return self.idempotence_manager.can_retry_for_error(
+                &ready_write_batch.table_bucket,
+                seq,
+                ready_write_batch.write_batch.batch_id(),
+                error,
+            );
+        }
+        false
+    }
+
+    /// Refresh metadata for the given tables and physical paths. Skipped when
+    /// `table_paths` is empty; a failed refresh is logged and not propagated.
+    async fn update_metadata_if_needed(
+        &self,
+        table_paths: HashSet<TablePath>,
+        physical_table_path: HashSet<Arc<PhysicalTablePath>>,
+    ) {
+        if table_paths.is_empty() {
+            return;
+        }
+        // The metadata API takes sets of references, so borrow from the owned sets.
+        let tables: HashSet<&TablePath> = table_paths.iter().collect();
+        let physical: HashSet<&Arc<PhysicalTablePath>> = physical_table_path.iter().collect();
+        let refreshed = self
+            .metadata
+            .update_tables_metadata(&tables, &physical, vec![])
+            .await;
+        if let Err(e) = refreshed {
+            warn!("Failed to update metadata after write error: {e:?}");
+        }
+    }
+
+    /// Whether `error` is one of the errors after which this sender refreshes
+    /// its cached metadata.
+    fn is_invalid_metadata_error(error: FlussError) -> bool {
+        matches!(
+            error,
+            FlussError::NotLeaderOrFollower
+                | FlussError::UnknownTableOrBucketException
+                | FlussError::LeaderNotAvailableException
+                | FlussError::NetworkException
+        )
+    }
+
+    /// Whether `error` is retriable regardless of idempotence state. Every
+    /// invalid-metadata error is also in this list.
+    fn is_retriable_error(error: FlussError) -> bool {
+        matches!(
+            error,
+            FlussError::NetworkException
+                | FlussError::NotLeaderOrFollower
+                | FlussError::UnknownTableOrBucketException
+                | FlussError::LeaderNotAvailableException
+                | FlussError::LogStorageException
+                | FlussError::KvStorageException
+                | FlussError::StorageException
+                | FlussError::RequestTimeOut
+                | FlussError::NotEnoughReplicasAfterAppendException
+                | FlussError::NotEnoughReplicasException
+                | FlussError::CorruptMessage
+                | FlussError::CorruptRecordException
+        )
+    }
+
+    /// Runs the sender event loop until shutdown is signalled, then drains.
+    ///
+    /// Event-loop sender: drain batches and fire RPCs into a `FuturesUnordered`,
+    /// then process responses as they arrive. This interleaves drain cycles with
+    /// response handling — when a fast leader responds, we immediately drain and
+    /// send more batches for its buckets while slow leaders are still in-flight.
+    ///
+    /// Slow work (writer-ID init with retry backoff, metadata refresh for
+    /// unknown leaders) runs as concurrent maintenance tasks so it never blocks
+    /// `pending.next()`. The drain path (`drain_ready_sends`) is fully
+    /// synchronous — no `.await` on the hot path. Without this separation,
+    /// backoff sleeps during writer-ID init could stall response processing
+    /// and cause severe backpressure when the accumulator memory budget is full
+    /// (responses not polled → memory not freed → writers block).
+    ///
+    /// The loop is a single-`select!` event loop driven by a `need_drain` tick.
+    ///
+    /// Invariants:
+    /// - `need_drain` is a one-shot "try a drain tick ASAP" flag.
+    /// - Each iteration first performs a sync drain tick (if flagged) and then
+    ///   blocks in a single `tokio::select!`.
+    /// - `accumulator.notified()` is always listened to (producer wakeups).
+    /// - The idle timer is only armed when truly idle (no futures in any pool).
+    /// - When writer_id isn't ready, a drain tick is a no-op but the loop stays
+    ///   responsive (notified/init/meta can still wake it).
+    ///
+    /// Shutdown: once `shutdown_rx.recv()` completes the loop exits, remaining
+    /// batches are drained through `run_once` while the accumulator reports
+    /// undrained data, every future still in `pending` is awaited, and `close()`
+    /// is called. Futures left in `init_futs` / `meta_futs` are not awaited.
+    ///
+    /// Returns `Ok(())` once shutdown completes; errors from individual drains,
+    /// sends, writer-ID init and metadata refreshes are logged with `warn!` and
+    /// are not propagated.
+    pub async fn run_with_shutdown(&self, mut shutdown_rx: mpsc::Receiver<()>) -> Result<()> {
+        // Three independent future pools: in-flight sends, writer-ID init, and
+        // metadata refresh for unknown leaders.
+        let mut pending: FuturesUnordered<SendFuture<'_>> = FuturesUnordered::new();
+        let mut init_futs: FuturesUnordered<SendFuture<'_>> = FuturesUnordered::new();
+        let mut meta_futs: FuturesUnordered<SendFuture<'_>> = FuturesUnordered::new();
+        // Unknown-leader paths reported by drains, waiting for the next refresh.
+        let mut pending_unknown: HashSet<Arc<PhysicalTablePath>> = HashSet::new();
+
+        let mut need_drain = true; // drain on first iteration to pick up any pre-existing batches
+        // Idle-timer delay; overwritten whenever a drain returns a delay hint.
+        let mut next_delay_ms: u64 = 1;
+
+        loop {
+            // Spawn writer-ID init task if needed and not already running.
+            if init_futs.is_empty()
+                && self.idempotence_manager.is_enabled()
+                && !self.idempotence_manager.has_writer_id()
+                && self.accumulator.has_undrained()
+            {
+                init_futs.push(Box::pin(self.maybe_wait_for_writer_id()));
+            }
+
+            // Spawn metadata refresh if we have accumulated unknown leaders
+            // and no refresh is currently running.
+            if !pending_unknown.is_empty() && meta_futs.is_empty() {
+                // Take the whole set so paths reported later start a fresh batch.
+                let leaders = std::mem::take(&mut pending_unknown);
+                meta_futs.push(Box::pin(async move {
+                    self.refresh_unknown_leaders(&leaders).await
+                }));
+            }
+
+            // Drain tick: synchronous, never blocks response processing.
+            // Clear unconditionally — "need_drain" means "try", not "must succeed".
+            if need_drain {
+                need_drain = false;
+
+                // Only drain when idempotence is off or a writer ID is available.
+                if !self.idempotence_manager.is_enabled()
+                    || self.idempotence_manager.has_writer_id()
+                {
+                    match self.drain_ready_sends() {
+                        Ok((futures, delay, unknown_leaders)) => {
+                            if let Some(d) = delay {
+                                next_delay_ms = d;
+                            }
+                            pending_unknown.extend(unknown_leaders);
+                            for f in futures {
+                                pending.push(f);
+                            }
+                        }
+                        Err(e) => {
+                            warn!("Error in drain cycle: {e}");
+                        }
+                    }
+                }
+            }
+
+            let truly_idle = pending.is_empty() && init_futs.is_empty() && meta_futs.is_empty();
+            debug_assert!(next_delay_ms >= 1);
+
+            // One select to rule them all.
+            tokio::select! {
+                // Any completion of `recv()` ends the loop (the `_` pattern
+                // accepts whatever value it yields).
+                _ = shutdown_rx.recv() => break,
+
+                // Always listen for producer wakeups.
+                _ = self.accumulator.notified() => {
+                    need_drain = true;
+                }
+
+                // Process in-flight send responses.
+                Some(result) = pending.next(), if !pending.is_empty() => {
+                    if let Err(e) = result {
+                        warn!("Uncaught error in send request, continuing: {e}");
+                    }
+                    need_drain = true;
+                }
+
+                // Writer-ID init completed.
+                Some(result) = init_futs.next(), if !init_futs.is_empty() => {
+                    match result {
+                        Ok(()) => need_drain = true,
+                        Err(e) => {
+                            // No drain is requested on failure; the error is
+                            // handed to `maybe_abort_batches`.
+                            warn!("Failed to allocate writer ID after retries: {e}");
+                            self.maybe_abort_batches(&e);
+                        }
+                    }
+                }
+
+                // Metadata refresh completed — new leaders may now be known.
+                Some(result) = meta_futs.next(), if !meta_futs.is_empty() => {
+                    if let Err(e) = result {
+                        warn!("Metadata refresh for unknown leaders failed: {e}");
+                    }
+                    need_drain = true;
+                }
+
+                // Idle timer: batch timeout / linger expiry.
+                _ = tokio::time::sleep(Duration::from_millis(next_delay_ms)), if truly_idle => {
+                    need_drain = true;
+                }
+            }
+        }
+
+        // Graceful shutdown: drain remaining batches, then wait for all
+        // in-flight sends to complete.
+        while self.accumulator.has_undrained() {
+            if let Err(e) = self.run_once().await {
+                warn!("Error during shutdown drain, continuing: {e}");
+            }
+        }
+        while let Some(result) = pending.next().await {
+            if let Err(e) = result {
+                warn!("Error in send during shutdown, continuing: {e}");
+            }
+        }
+        self.close();
+        Ok(())
+    }
+
+    /// Marks the sender as stopped by storing `false` into the `running` flag
+    /// (relaxed ordering). `run_with_shutdown` calls this as its last step.
+    pub fn close(&self) {
+        self.running.store(false, Ordering::Relaxed);
+    }
+}
+
+/// A write request in one of its two concrete protocol forms.
+enum WriteRequest {
+    /// Wraps a `ProduceLogRequest`.
+    ProduceLog(ProduceLogRequest),
+    /// Wraps a `PutKvRequest`.
+    PutKv(PutKvRequest),
+}
+
+/// Read-only view over one per-bucket entry of a write response.
+///
+/// Implemented below for the produce-log and put-kv per-bucket messages so
+/// that response handling can be written once against this trait.
+trait BucketResponse {
+    /// Bucket id carried by this entry.
+    fn bucket_id(&self) -> i32;
+    /// Error code of this entry, if one is set.
+    fn error_code(&self) -> Option<i32>;
+    /// Error message of this entry, if one is set.
+    fn error_message(&self) -> Option<&String>;
+
+    /// Partition id of this entry, if one is set.
+    fn partition_id(&self) -> Option<PartitionId>;
+}
+
+/// Exposes the fields of a produce-log per-bucket response through
+/// `BucketResponse`; every accessor returns the same-named field.
+impl BucketResponse for PbProduceLogRespForBucket {
+    fn bucket_id(&self) -> i32 {
+        self.bucket_id
+    }
+    fn error_code(&self) -> Option<i32> {
+        self.error_code
+    }
+    fn error_message(&self) -> Option<&String> {
+        // Borrow the message instead of cloning it.
+        self.error_message.as_ref()
+    }
+
+    fn partition_id(&self) -> Option<PartitionId> {
+        self.partition_id
+    }
+}
+
+/// Exposes the fields of a put-kv per-bucket response through
+/// `BucketResponse`; every accessor returns the same-named field.
+impl BucketResponse for PbPutKvRespForBucket {
+    fn bucket_id(&self) -> i32 {
+        self.bucket_id
+    }
+    fn error_code(&self) -> Option<i32> {
+        self.error_code
+    }
+    fn error_message(&self) -> Option<&String> {
+        // Borrow the message instead of cloning it.
+        self.error_message.as_ref()
+    }
+
+    fn partition_id(&self) -> Option<PartitionId> {
+        self.partition_id
+    }
+}
+
+/// A write response that carries a list of per-bucket results.
+trait WriteResponse {
+    /// Concrete per-bucket result type of this response.
+    type BucketResp: BucketResponse;
+    /// Per-bucket results contained in this response, borrowed as a slice.
+    fn buckets_resp(&self) -> &[Self::BucketResp];
+}
+
+/// `ProduceLogResponse` exposes its `buckets_resp` field as the per-bucket list.
+impl WriteResponse for ProduceLogResponse {
+    type BucketResp = PbProduceLogRespForBucket;
+    fn buckets_resp(&self) -> &[Self::BucketResp] {
+        &self.buckets_resp
+    }
+}
+
+/// `PutKvResponse` exposes its `buckets_resp` field as the per-bucket list.
+impl WriteResponse for PutKvResponse {
+    type BucketResp = PbPutKvRespForBucket;
+    fn buckets_resp(&self) -> &[Self::BucketResp] {
+        &self.buckets_resp
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::client::WriteRecord;
+    use crate::cluster::Cluster;
+    use crate::config::Config;
+    use crate::metadata::{PhysicalTablePath, TablePath};
+    use crate::proto::{PbProduceLogRespForBucket, ProduceLogResponse};
+    use crate::row::{Datum, GenericRow};
+    use crate::rpc::FlussError;
+    use crate::test_utils::{build_cluster_arc, build_table_info};
+    use std::collections::{HashMap, HashSet};
+
+    /// Test helper: an `IdempotenceManager` with idempotence switched off
+    /// (first argument `false`) and `5` as its second constructor argument.
+    fn disabled_idempotence() -> Arc<IdempotenceManager> {
+        Arc::new(IdempotenceManager::new(false, 5))
+    }
+
+    /// Test helper: an `IdempotenceManager` with idempotence switched on
+    /// (first argument `true`) and `5` as its second constructor argument.
+    fn enabled_idempotence() -> Arc<IdempotenceManager> {
+        Arc::new(IdempotenceManager::new(true, 5))
+    }
+
+    /// Test helper: appends one single-column `Int32` row for `table_path` to
+    /// bucket 0 of `accumulator`, drains the accumulator for tablet server 1,
+    /// and returns the drained batch together with the append's result handle.
+    ///
+    /// Panics (via `expect`) if the append yields no result handle, if server 1
+    /// is unknown to `cluster`, or if the drain returns nothing for server 1.
+    fn build_ready_batch(
+        accumulator: &RecordAccumulator,
+        cluster: Arc<Cluster>,
+        table_path: Arc<TablePath>,
+    ) -> Result<(ReadyWriteBatch, crate::client::ResultHandle)> {
+        let table_info = Arc::new(build_table_info(table_path.as_ref().clone(), 1, 1));
+        let physical_table_path = Arc::new(PhysicalTablePath::of(table_path));
+        let row = GenericRow {
+            values: vec![Datum::Int32(1)],
+        };
+        let record = WriteRecord::for_append(table_info, physical_table_path, 1, &row);
+        // Append to bucket 0 without the abort-if-full behaviour.
+        let result = accumulator.append(&record, 0, &cluster, false)?;
+        let result_handle = result.result_handle.expect("result handle");
+        let server = cluster.get_tablet_server(1).expect("server");
+        let nodes = HashSet::from([server.clone()]);
+        // Drain only for server 1 and pull the single batch out of the result map.
+        let mut batches = accumulator.drain(cluster, &nodes, 1024 * 1024)?;
+        let mut drained = batches.remove(&1).expect("drained batches");
+        let batch = drained.pop().expect("batch");
+        Ok((batch, result_handle))
+    }
+
+    /// With idempotence disabled and `retries = 1`, reporting `RequestTimeOut`
+    /// for a batch must make it drainable again, with `attempts()` equal to 1.
+    #[tokio::test]
+    async fn handle_write_batch_error_retries() -> Result<()> {
+        let table_path = Arc::new(TablePath::new("db".to_string(), "tbl".to_string()));
+        let cluster = build_cluster_arc(table_path.as_ref(), 1, 1);
+        let metadata = Arc::new(Metadata::new_for_test(cluster.clone()));
+        let idempotence = disabled_idempotence();
+        let accumulator = Arc::new(RecordAccumulator::new(
+            Config::default(),
+            Arc::clone(&idempotence),
+        ));
+        // Sender arguments after the accumulator: max request size, 1000,
+        // ack = 1, retries = 1.
+        let sender = Sender::new(
+            metadata,
+            accumulator.clone(),
+            1024 * 1024,
+            1000,
+            1,
+            1,
+            idempotence,
+        );
+
+        let (batch, _handle) =
+            build_ready_batch(accumulator.as_ref(), cluster.clone(), table_path.clone())?;
+        // Register the batch with the sender's in-flight tracking, then take it
+        // back out of the map so it can be passed by value below.
+        let mut inflight = HashMap::new();
+        inflight.insert(1, vec![batch]);
+        sender.add_to_inflight_batches(&inflight);
+        let batch = inflight.remove(&1).unwrap().pop().unwrap();
+
+        sender.handle_write_batch_error(
+            batch,
+            FlussError::RequestTimeOut,
+            "timeout".to_string(),
+        )?;
+
+        // A second drain must return the batch again, now with one attempt recorded.
+        let server = cluster.get_tablet_server(1).expect("server");
+        let nodes = HashSet::from([server.clone()]);
+        let mut batches = accumulator.drain(cluster, &nodes, 1024 * 1024)?;
+        let mut drained = batches.remove(&1).expect("drained batches");
+        let batch = drained.pop().expect("batch");
+        assert_eq!(batch.write_batch.attempts(), 1);
+        Ok(())
+    }
+
+    /// With `retries = 0`, reporting `InvalidTableException` must complete the
+    /// batch's handle with `WriteFailed` carrying that error's code.
+    #[tokio::test]
+    async fn handle_write_batch_error_fails() -> Result<()> {
+        let table_path = Arc::new(TablePath::new("db".to_string(), "tbl".to_string()));
+        let cluster = build_cluster_arc(table_path.as_ref(), 1, 1);
+        let metadata = Arc::new(Metadata::new_for_test(cluster.clone()));
+        let idempotence = disabled_idempotence();
+        let accumulator = Arc::new(RecordAccumulator::new(
+            Config::default(),
+            Arc::clone(&idempotence),
+        ));
+        // ack = 1, retries = 0.
+        let sender = Sender::new(
+            metadata,
+            accumulator.clone(),
+            1024 * 1024,
+            1000,
+            1,
+            0,
+            idempotence,
+        );
+
+        let (batch, handle) = build_ready_batch(accumulator.as_ref(), cluster.clone(), table_path)?;
+        sender.handle_write_batch_error(
+            batch,
+            FlussError::InvalidTableException,
+            "invalid".to_string(),
+        )?;
+
+        // The handle must resolve to a failure with the reported error code.
+        let batch_result = handle.wait().await?;
+        assert!(matches!(
+            batch_result,
+            Err(broadcast::Error::WriteFailed { code, .. })
+                if code == FlussError::InvalidTableException.code()
+        ));
+        Ok(())
+    }
+
+    /// A produce-log response whose bucket entry carries
+    /// `DuplicateSequenceException` must complete the batch's handle with `Ok(())`.
+    #[tokio::test]
+    async fn handle_produce_response_duplicate_sequence_completes() -> Result<()> {
+        let table_path = Arc::new(TablePath::new("db".to_string(), "tbl".to_string()));
+        let cluster = build_cluster_arc(table_path.as_ref(), 1, 1);
+        let metadata = Arc::new(Metadata::new_for_test(cluster.clone()));
+        let idempotence = disabled_idempotence();
+        let accumulator = Arc::new(RecordAccumulator::new(
+            Config::default(),
+            Arc::clone(&idempotence),
+        ));
+        // ack = 1, retries = 0.
+        let sender = Sender::new(
+            metadata,
+            accumulator.clone(),
+            1024 * 1024,
+            1000,
+            1,
+            0,
+            idempotence,
+        );
+
+        let (batch, handle) = build_ready_batch(accumulator.as_ref(), cluster, table_path)?;
+        // The request covered exactly the one bucket of the drained batch.
+        let request_buckets = vec![batch.table_bucket.clone()];
+        let mut records_by_bucket = HashMap::new();
+        records_by_bucket.insert(batch.table_bucket.clone(), batch);
+
+        // Response entry for bucket 0 carrying the duplicate-sequence error.
+        let response = ProduceLogResponse {
+            buckets_resp: vec![PbProduceLogRespForBucket {
+                bucket_id: 0,
+                error_code: Some(FlussError::DuplicateSequenceException.code()),
+                error_message: Some("dup".to_string()),
+                ..Default::default()
+            }],
+        };
+
+        sender
+            .handle_write_response(1, &request_buckets, &mut records_by_bucket, response)
+            .await?;
+
+        let batch_result = handle.wait().await?;
+        assert!(matches!(batch_result, Ok(())));
+        Ok(())
+    }
+
+    /// With idempotence enabled, `UnknownWriterIdException` must reset the
+    /// writer ID and fail the batch (no retry), even with `retries = i32::MAX`.
+    #[tokio::test]
+    async fn test_unknown_writer_id_resets() -> Result<()> {
+        let table_path = Arc::new(TablePath::new("db".to_string(), "tbl".to_string()));
+        let cluster = build_cluster_arc(table_path.as_ref(), 1, 1);
+        let metadata = Arc::new(Metadata::new_for_test(cluster.clone()));
+        let idempotence = enabled_idempotence();
+        let accumulator = Arc::new(RecordAccumulator::new(
+            Config::default(),
+            Arc::clone(&idempotence),
+        ));
+        idempotence.set_writer_id(42);
+        // ack = -1, retries = i32::MAX.
+        let sender = Sender::new(
+            metadata,
+            accumulator.clone(),
+            1024 * 1024,
+            1000,
+            -1,
+            i32::MAX,
+            Arc::clone(&idempotence),
+        );
+
+        // build_ready_batch drains the batch, which assigns seq=0 and adds in-flight
+        let (batch, handle) = build_ready_batch(accumulator.as_ref(), cluster.clone(), table_path)?;
+        assert_eq!(batch.write_batch.batch_sequence(), 0);
+        assert_eq!(batch.write_batch.writer_id(), 42);
+
+        sender.handle_write_batch_error(
+            batch,
+            FlussError::UnknownWriterIdException,
+            "unknown writer".to_string(),
+        )?;
+
+        // Writer ID should be reset
+        assert!(!idempotence.has_writer_id());
+
+        // Batch should be failed (not retried)
+        let batch_result = handle.wait().await?;
+        assert!(matches!(
+            batch_result,
+            Err(broadcast::Error::WriteFailed { code, .. })
+                if code == FlussError::UnknownWriterIdException.code()
+        ));
+        Ok(())
+    }
+
+    /// With idempotence enabled and `retries = 0`, `OutOfOrderSequenceException`
+    /// must reset the writer ID and fail the batch with that error's code.
+    #[tokio::test]
+    async fn test_out_of_order_sequence_non_retriable_resets() -> Result<()> {
+        let table_path = Arc::new(TablePath::new("db".to_string(), "tbl".to_string()));
+        let cluster = build_cluster_arc(table_path.as_ref(), 1, 1);
+        let metadata = Arc::new(Metadata::new_for_test(cluster.clone()));
+        let idempotence = enabled_idempotence();
+        let accumulator = Arc::new(RecordAccumulator::new(
+            Config::default(),
+            Arc::clone(&idempotence),
+        ));
+        idempotence.set_writer_id(42);
+        // retries=0 means can_retry returns false immediately (attempts >= retries)
+        let sender = Sender::new(
+            metadata,
+            accumulator.clone(),
+            1024 * 1024,
+            1000,
+            -1,
+            0,
+            Arc::clone(&idempotence),
+        );
+
+        // build_ready_batch drains the batch, which assigns seq=0 and adds in-flight
+        let (batch, handle) = build_ready_batch(accumulator.as_ref(), cluster.clone(), table_path)?;
+        assert_eq!(batch.write_batch.batch_sequence(), 0);
+
+        // OutOfOrderSequence with retries exhausted → non-retriable → resets writer ID
+        sender.handle_write_batch_error(
+            batch,
+            FlussError::OutOfOrderSequenceException,
+            "out of order".to_string(),
+        )?;
+
+        // Writer ID should be reset (matching Java behavior)
+        assert!(!idempotence.has_writer_id());
+
+        // Batch should be failed
+        let batch_result = handle.wait().await?;
+        assert!(matches!(
+            batch_result,
+            Err(broadcast::Error::WriteFailed { code, .. })
+                if code == FlussError::OutOfOrderSequenceException.code()
+        ));
+        Ok(())
+    }
+
+    /// A batch stamped with writer ID 42 must not be retried after the manager
+    /// has moved on to writer ID 99: a `NetworkException` then fails the batch
+    /// with the `UnknownWriterIdException` code instead of re-enqueueing it.
+    #[tokio::test]
+    async fn test_stale_writer_id_prevents_retry() -> Result<()> {
+        let table_path = Arc::new(TablePath::new("db".to_string(), "tbl".to_string()));
+        let cluster = build_cluster_arc(table_path.as_ref(), 1, 1);
+        let metadata = Arc::new(Metadata::new_for_test(cluster.clone()));
+        let idempotence = enabled_idempotence();
+        let accumulator = Arc::new(RecordAccumulator::new(
+            Config::default(),
+            Arc::clone(&idempotence),
+        ));
+        idempotence.set_writer_id(42);
+        // ack = -1, retries = i32::MAX, so the retry budget is not what stops the retry.
+        let sender = Sender::new(
+            metadata,
+            accumulator.clone(),
+            1024 * 1024,
+            1000,
+            -1,
+            i32::MAX,
+            Arc::clone(&idempotence),
+        );
+
+        // build_ready_batch drains the batch, which assigns seq=0 and adds in-flight
+        let (batch, handle) = build_ready_batch(accumulator.as_ref(), cluster.clone(), table_path)?;
+        assert_eq!(batch.write_batch.writer_id(), 42);
+        // Register with the sender's in-flight tracking, then take the batch back.
+        let mut inflight = HashMap::new();
+        inflight.insert(1, vec![batch]);
+        sender.add_to_inflight_batches(&inflight);
+        let batch = inflight.remove(&1).unwrap().pop().unwrap();
+
+        // Simulate writer ID reset (e.g., another bucket got UnknownWriterIdException)
+        idempotence.reset_writer_id();
+        idempotence.set_writer_id(99); // new writer ID allocated
+
+        // NetworkException is normally retriable, but writer ID changed
+        sender.handle_write_batch_error(
+            batch,
+            FlussError::NetworkException,
+            "connection reset".to_string(),
+        )?;
+
+        // Batch should be failed (not retried) because writer ID is stale
+        let batch_result = handle.wait().await?;
+        assert!(matches!(
+            batch_result,
+            Err(broadcast::Error::WriteFailed { code, .. })
+                if code == FlussError::UnknownWriterIdException.code()
+        ));
+        Ok(())
+    }
+
+    /// With idempotence enabled and writer ID 99 set, the first drained batch
+    /// must carry `batch_sequence() == 0` and `writer_id() == 99`.
+    #[tokio::test]
+    async fn test_writer_state_assigned_on_drain() -> Result<()> {
+        let table_path = Arc::new(TablePath::new("db".to_string(), "tbl".to_string()));
+        let cluster = build_cluster_arc(table_path.as_ref(), 1, 1);
+        let idempotence = enabled_idempotence();
+        let accumulator = Arc::new(RecordAccumulator::new(
+            Config::default(),
+            Arc::clone(&idempotence),
+        ));
+        idempotence.set_writer_id(99);
+
+        // Append a record to the accumulator
+        let table_info = Arc::new(build_table_info(table_path.as_ref().clone(), 1, 1));
+        let physical_table_path = Arc::new(PhysicalTablePath::of(table_path));
+        let row = GenericRow {
+            values: vec![Datum::Int32(42)],
+        };
+        let record = WriteRecord::for_append(table_info, physical_table_path, 1, &row);
+        accumulator.append(&record, 0, &cluster, false)?;
+
+        // Drain the batches — accumulator now assigns writer state during drain
+        let server = cluster.get_tablet_server(1).expect("server");
+        let nodes = HashSet::from([server.clone()]);
+        let batches = accumulator.drain(cluster, &nodes, 1024 * 1024)?;
+
+        // Verify the batch got writer state assigned by the accumulator
+        let batch_list = batches.values().next().unwrap();
+        let batch = &batch_list[0];
+        assert_eq!(batch.write_batch.batch_sequence(), 0);
+        assert_eq!(batch.write_batch.writer_id(), 99);
+        Ok(())
+    }
+
+    /// A batch that is re-enqueued and drained a second time must keep its
+    /// original writer ID and sequence (0), and the next sequence handed out
+    /// for its bucket must be 1 — i.e. the second drain allocated nothing.
+    #[tokio::test]
+    async fn test_reenqueued_batch_keeps_sequence_on_redrain() -> Result<()> {
+        let table_path = Arc::new(TablePath::new("db".to_string(), "tbl".to_string()));
+        let cluster = build_cluster_arc(table_path.as_ref(), 1, 1);
+        let idempotence = enabled_idempotence();
+        let accumulator = Arc::new(RecordAccumulator::new(
+            Config::default(),
+            Arc::clone(&idempotence),
+        ));
+        idempotence.set_writer_id(99);
+
+        // build_ready_batch drains the batch, which now assigns writer state
+        // (seq=0) during drain since idempotence is enabled.
+        let (batch, _handle) =
+            build_ready_batch(accumulator.as_ref(), cluster.clone(), table_path)?;
+
+        let writer_id = idempotence.writer_id();
+        assert_eq!(batch.write_batch.batch_sequence(), 0);
+        assert!(batch.write_batch.has_batch_sequence());
+        assert_eq!(batch.write_batch.writer_id(), writer_id);
+
+        // Re-enqueue the batch (simulating a retriable error)
+        accumulator.re_enqueue(batch);
+
+        // Drain again
+        let server = cluster.get_tablet_server(1).expect("server");
+        let nodes = HashSet::from([server.clone()]);
+        let mut batches = accumulator.drain(cluster, &nodes, 1024 * 1024)?;
+        let batch_list = batches.values_mut().next().unwrap();
+        let ready_batch = &mut batch_list[0];
+
+        // Re-enqueued batch keeps its original sequence
+        assert!(ready_batch.write_batch.has_batch_sequence());
+        assert_eq!(ready_batch.write_batch.writer_id(), writer_id);
+        assert_eq!(ready_batch.write_batch.batch_sequence(), 0);
+        // Only one sequence was allocated (during the first drain)
+        assert_eq!(
+            idempotence.next_sequence_and_increment(&ready_batch.table_bucket),
+            1
+        );
+        Ok(())
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/client/write/write_format.rs b/fluss-rust/crates/fluss/src/client/write/write_format.rs
new file mode 100644
index 0000000000..147152cae4
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/client/write/write_format.rs
@@ -0,0 +1,66 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::error::Error::IllegalArgument;
+use crate::error::Result;
+use crate::metadata::KvFormat;
+use std::fmt::Display;
+
+/// Formats a writer can produce.
+///
+/// `ArrowLog` and `CompactedLog` are the log formats (`is_log()` is `true`);
+/// `CompactedKv` is the only KV format and is the counterpart of
+/// `KvFormat::COMPACTED`.
+///
+/// Derives `Debug`, `PartialEq` and `Eq` in addition to `Copy`/`Clone` so the
+/// type can be logged with `{:?}` and compared in assertions.
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+pub enum WriteFormat {
+    /// Log format; displayed as `ArrowLog`.
+    ArrowLog,
+    /// Log format; displayed as `CompactedLog`.
+    CompactedLog,
+    /// KV format; displayed as `CompactedKv`.
+    CompactedKv,
+}
+
+impl WriteFormat {
+    /// Returns `true` for the log formats (`ArrowLog`, `CompactedLog`).
+    pub const fn is_log(&self) -> bool {
+        matches!(self, Self::ArrowLog | Self::CompactedLog)
+    }
+
+    /// Returns `true` for every format that is not a log format
+    /// (currently only `CompactedKv`).
+    ///
+    /// `const` like `is_log`, so both predicates are usable in const contexts.
+    pub const fn is_kv(&self) -> bool {
+        !self.is_log()
+    }
+
+    /// Converts this write format into the matching `KvFormat`.
+    ///
+    /// # Errors
+    /// Returns `IllegalArgument` for any format other than `CompactedKv`.
+    pub fn to_kv_format(&self) -> Result<KvFormat> {
+        match self {
+            WriteFormat::CompactedKv => Ok(KvFormat::COMPACTED),
+            other => Err(IllegalArgument {
+                message: format!("WriteFormat `{other}` is not a KvFormat"),
+            }),
+        }
+    }
+
+    /// Converts a `KvFormat` into the matching write format.
+    ///
+    /// # Errors
+    /// Returns `IllegalArgument` for any `KvFormat` other than `COMPACTED`.
+    pub fn from_kv_format(kv_format: &KvFormat) -> Result<Self> {
+        match kv_format {
+            KvFormat::COMPACTED => Ok(WriteFormat::CompactedKv),
+            other => Err(IllegalArgument {
+                message: format!("Unknown KvFormat: `{other}`"),
+            }),
+        }
+    }
+}
+
+/// Displays a `WriteFormat` as the bare name of its variant.
+impl Display for WriteFormat {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        // Pick the label first, then emit it with a single write.
+        let label = match self {
+            WriteFormat::ArrowLog => "ArrowLog",
+            WriteFormat::CompactedLog => "CompactedLog",
+            WriteFormat::CompactedKv => "CompactedKv",
+        };
+        f.write_str(label)
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/client/write/writer_client.rs b/fluss-rust/crates/fluss/src/client/write/writer_client.rs
new file mode 100644
index 0000000000..ffdf96b1df
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/client/write/writer_client.rs
@@ -0,0 +1,241 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::BucketId;
+use crate::bucketing::BucketingFunction;
+use crate::client::metadata::Metadata;
+use crate::client::write::IdempotenceManager;
+use crate::client::write::broadcast;
+use crate::client::write::bucket_assigner::{
+    BucketAssigner, HashBucketAssigner, RoundRobinBucketAssigner, StickyBucketAssigner,
+};
+use crate::client::write::sender::Sender;
+use crate::client::{RecordAccumulator, ResultHandle, WriteRecord};
+use crate::config::Config;
+use crate::config::NoKeyAssigner;
+use crate::error::{Error, Result};
+use crate::metadata::{PhysicalTablePath, TableInfo};
+use bytes::Bytes;
+use dashmap::DashMap;
+use log::warn;
+use parking_lot::Mutex;
+use std::sync::Arc;
+use std::time::Duration;
+use tokio::sync::mpsc;
+use tokio::task::JoinHandle;
+
+/// Client-side writer: assigns records to buckets, appends them to a shared
+/// `RecordAccumulator`, and owns the background sender task spawned by `new`.
+#[allow(dead_code)]
+pub struct WriterClient {
+    /// Configuration this client was built from.
+    config: Config,
+    /// Copy of `config.writer_request_max_size` taken at construction.
+    max_request_size: i32,
+    /// Accumulator shared with the sender task; `send` appends records to it.
+    accumulate: Arc<RecordAccumulator>,
+    /// Shutdown channel sender; `close` takes it (leaving `None`) and drops it.
+    shutdown_tx: Mutex<Option<mpsc::Sender<()>>>,
+    /// Handle of the spawned sender task; `close` takes it and waits on it.
+    sender_join_handle: Mutex<Option<JoinHandle<()>>>,
+    /// Metadata handle; `send` reads the current cluster from it.
+    metadata: Arc<Metadata>,
+    /// Bucket assigners cached per physical table path.
+    bucket_assigners: DashMap<Arc<PhysicalTablePath>, Arc<dyn BucketAssigner>>,
+    /// Idempotence manager shared with the accumulator and the sender.
+    idempotence_manager: Arc<IdempotenceManager>,
+}
+
+impl WriterClient {
+    /// Creates a writer client and starts its background sender task.
+    ///
+    /// Builds the shared `IdempotenceManager`, `RecordAccumulator` and `Sender`
+    /// from `config`, then runs `Sender::run_with_shutdown` on a task created
+    /// with `tokio::spawn`, so this must be called from within a Tokio runtime.
+    ///
+    /// # Errors
+    /// Returns `Error::IllegalArgument` when `config.writer_acks` is neither
+    /// `"all"` (case-insensitive) nor parseable as an `i16`.
+    pub fn new(config: Config, metadata: Arc<Metadata>) -> Result<Self> {
+        let ack = Self::get_ack(&config)?;
+        let max_request_size = config.writer_request_max_size;
+
+        let idempotence_manager = Arc::new(IdempotenceManager::new(
+            config.writer_enable_idempotence,
+            config.writer_max_inflight_requests_per_bucket,
+        ));
+        let accumulate = Arc::new(RecordAccumulator::new(
+            config.clone(),
+            Arc::clone(&idempotence_manager),
+        ));
+
+        // NOTE(review): the literal 30_000 is passed through unchanged; its
+        // meaning is defined by `Sender::new` — confirm there.
+        let sender = Arc::new(Sender::new(
+            Arc::clone(&metadata),
+            Arc::clone(&accumulate),
+            max_request_size,
+            30_000,
+            ack,
+            config.writer_retries,
+            Arc::clone(&idempotence_manager),
+        ));
+
+        // The receiving half moves into the sender task; `close` later drops
+        // the sending half to end the loop.
+        let (shutdown_tx, shutdown_rx) = mpsc::channel(1);
+        let sender_task = tokio::spawn(async move {
+            if let Err(e) = sender.run_with_shutdown(shutdown_rx).await {
+                warn!("Sender loop exited with error: {e}");
+            }
+        });
+
+        Ok(Self {
+            config,
+            max_request_size,
+            accumulate,
+            shutdown_tx: Mutex::new(Some(shutdown_tx)),
+            sender_join_handle: Mutex::new(Some(sender_task)),
+            metadata,
+            bucket_assigners: Default::default(),
+            idempotence_manager,
+        })
+    }
+
+    /// Translates `config.writer_acks` into the numeric ack value.
+    ///
+    /// `"all"` (compared case-insensitively) maps to `-1`; any other value must
+    /// parse as an `i16` and is returned as parsed.
+    ///
+    /// # Errors
+    /// Returns `Error::IllegalArgument` when the value is not `"all"` and does
+    /// not parse as an `i16`.
+    fn get_ack(config: &Config) -> Result<i16> {
+        let acks = config.writer_acks.as_str();
+        if acks.eq_ignore_ascii_case("all") {
+            return Ok(-1);
+        }
+
+        match acks.parse::<i16>() {
+            Ok(ack) => Ok(ack),
+            Err(e) => Err(Error::IllegalArgument {
+                message: format!("invalid writer ack '{acks}': {e}"),
+            }),
+        }
+    }
+
+    /// Appends `record` to the accumulator and returns a handle for its result.
+    ///
+    /// The record is routed to a bucket chosen by the table's bucket assigner.
+    /// If the accumulator asks to abort the record for a new batch, the
+    /// assigner is told about the new batch, a bucket is assigned again, and
+    /// the record is appended a second time (without the abort option). The
+    /// sender is woken when the final append reports a full or newly created
+    /// batch.
+    ///
+    /// # Errors
+    /// Returns `Error::WriterClosed` when the accumulator is already closed,
+    /// and propagates errors from bucket assignment and from `append`.
+    ///
+    /// # Panics
+    /// Panics if the final append result carries no result handle.
+    pub fn send(&self, record: &WriteRecord<'_>) -> Result<ResultHandle> {
+        if self.accumulate.is_closed() {
+            return Err(Error::WriterClosed {
+                message: "Cannot send: writer is closed".to_string(),
+            });
+        }
+
+        let cluster = self.metadata.get_cluster();
+        let bucket_key = record.bucket_key.as_ref();
+        let (bucket_assigner, first_bucket) =
+            self.assign_bucket(&record.table_info, bucket_key, &record.physical_table_path)?;
+
+        let mut outcome = self.accumulate.append(
+            record,
+            first_bucket,
+            &cluster,
+            bucket_assigner.abort_if_batch_full(),
+        )?;
+
+        if outcome.abort_record_for_new_batch {
+            // Let the assigner react to the new batch, then retry on the
+            // bucket it picks next.
+            bucket_assigner.on_new_batch(&cluster, first_bucket);
+            let next_bucket = bucket_assigner.assign_bucket(bucket_key, &cluster)?;
+            outcome = self
+                .accumulate
+                .append(record, next_bucket, &cluster, false)?;
+        }
+
+        let sender_has_work = outcome.batch_is_full || outcome.new_batch_created;
+        if sender_has_work {
+            self.accumulate.wakeup_sender();
+        }
+
+        Ok(outcome.result_handle.expect("result_handle should exist"))
+    }
+    /// Returns the bucket assigner cached for `table_path` (creating and
+    /// caching it on first use) together with the bucket it picks for
+    /// `bucket_key` in the current cluster.
+    ///
+    /// A missing assigner is created through the map's entry API, so concurrent
+    /// first sends for the same table end up sharing one assigner instance
+    /// instead of each building its own and overwriting one another.
+    ///
+    /// # Errors
+    /// Propagates failures from `create_bucket_assigner` and from the
+    /// assigner's `assign_bucket`.
+    fn assign_bucket(
+        &self,
+        table_info: &Arc<TableInfo>,
+        bucket_key: Option<&Bytes>,
+        table_path: &Arc<PhysicalTablePath>,
+    ) -> Result<(Arc<dyn BucketAssigner>, BucketId)> {
+        let cluster = self.metadata.get_cluster();
+
+        // Fast path: plain lookup. The map guard is consumed inside `map`, so
+        // no lock on the map is held once `cached` has been computed.
+        let cached = self
+            .bucket_assigners
+            .get(table_path)
+            .map(|entry| Arc::clone(entry.value()));
+
+        let bucket_assigner = match cached {
+            Some(assigner) => assigner,
+            // Slow path: re-check and create under the entry, so only one
+            // caller creates the assigner for a given table path.
+            None => match self.bucket_assigners.entry(Arc::clone(table_path)) {
+                dashmap::mapref::entry::Entry::Occupied(existing) => Arc::clone(existing.get()),
+                dashmap::mapref::entry::Entry::Vacant(slot) => {
+                    let created = Self::create_bucket_assigner(
+                        table_info,
+                        Arc::clone(table_path),
+                        bucket_key,
+                        &self.config,
+                    )?;
+                    slot.insert(Arc::clone(&created));
+                    created
+                }
+            },
+        };
+
+        let bucket_id = bucket_assigner.assign_bucket(bucket_key, &cluster)?;
+        Ok((bucket_assigner, bucket_id))
+    }
+
+    /// Close the writer with a timeout. Matches Java's two-phase shutdown:
+    ///
+    /// 1. **Graceful**: Signal the sender to drain all remaining batches.
+    ///    `accumulator.close()` makes all batches immediately ready (no need
+    ///    to wait for `batch_timeout_ms`).
+    /// 2. **Force** (if timeout exceeded): Abort the sender task and fail
+    ///    all remaining batches with an error.
+    ///
+    /// Idempotent: calling `close` a second time returns `Ok(())` immediately.
+    /// That early return does not wait for a first `close` call that may still
+    /// be in progress.
+    ///
+    /// This function currently always returns `Ok(())`; a panicked sender task
+    /// and a timed-out graceful shutdown are both reported via `warn!` only.
+    pub async fn close(&self, timeout: Duration) -> Result<()> {
+        // Take shutdown_tx and join_handle out of their Mutexes.
+        // Second call sees None and returns early.
+        let shutdown_tx = self.shutdown_tx.lock().take();
+        let join_handle = self.sender_join_handle.lock().take();
+
+        let Some(mut join_handle) = join_handle else {
+            return Ok(());
+        };
+
+        // Phase 1: Signal graceful shutdown.
+        // Mark accumulator closed so all batches become immediately sendable.
+        self.accumulate.close();
+        // Drop the shutdown sender — recv() returns None, breaking the sender loop.
+        drop(shutdown_tx);
+
+        // Phase 2: Wait for graceful drain, bounded by timeout.
+        tokio::select! {
+            // Borrow the handle so it is still usable in the timeout branch.
+            result = &mut join_handle => {
+                if let Err(e) = result {
+                    warn!("Sender task panicked during shutdown: {e}");
+                }
+            }
+            _ = tokio::time::sleep(timeout) => {
+                // Phase 3: Force close — timeout exceeded.
+                warn!("Graceful shutdown timed out after {timeout:?}, force closing");
+                join_handle.abort();
+                let _ = join_handle.await; // Wait for cancellation to complete
+                self.accumulate.abort_batches(broadcast::Error::Client {
+                    message: "Writer force closed (shutdown timeout exceeded)".to_string(),
+                });
+            }
+        }
+        Ok(())
+    }
+
+    /// Starts a flush on the accumulator and waits until it reports completion.
+    ///
+    /// # Errors
+    /// Propagates the error returned by `await_flush_completion`.
+    pub async fn flush(&self) -> Result<()> {
+        self.accumulate.begin_flush();
+        self.accumulate.await_flush_completion().await?;
+        Ok(())
+    }
+
+    /// Builds the bucket assigner to use for a table.
+    ///
+    /// - Without a bucket key, the assigner is chosen by
+    ///   `config.writer_bucket_no_key_assigner`: sticky or round-robin.
+    /// - With a bucket key, a `HashBucketAssigner` is built over the table's
+    ///   bucket count, using the bucketing function selected by the table's
+    ///   datalake format.
+    ///
+    /// # Errors
+    /// Propagates the error from reading the table's datalake format (keyed
+    /// case only).
+    pub fn create_bucket_assigner(
+        table_info: &Arc<TableInfo>,
+        table_path: Arc<PhysicalTablePath>,
+        bucket_key: Option<&Bytes>,
+        config: &Config,
+    ) -> Result<Arc<dyn BucketAssigner>> {
+        if bucket_key.is_none() {
+            // No key: the choice is purely configuration-driven.
+            let assigner: Arc<dyn BucketAssigner> = match config.writer_bucket_no_key_assigner {
+                NoKeyAssigner::Sticky => Arc::new(StickyBucketAssigner::new(table_path)),
+                NoKeyAssigner::RoundRobin => Arc::new(RoundRobinBucketAssigner::new(
+                    table_path,
+                    table_info.num_buckets,
+                )),
+            };
+            return Ok(assigner);
+        }
+
+        // Keyed records are bucketed by hashing with the table's function.
+        let datalake_format = table_info.get_table_config().get_datalake_format()?;
+        let function = <dyn BucketingFunction>::of(datalake_format.as_ref());
+        let hash_assigner = HashBucketAssigner::new(table_info.num_buckets, function);
+        Ok(Arc::new(hash_assigner))
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/cluster/cluster.rs b/fluss-rust/crates/fluss/src/cluster/cluster.rs
new file mode 100644
index 0000000000..d5518709ec
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/cluster/cluster.rs
@@ -0,0 +1,541 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::cluster::{BucketLocation, ServerNode, ServerType};
+use crate::error::{Error, Result};
+use crate::metadata::{
+    JsonSerde, PhysicalTablePath, TableBucket, TableDescriptor, TableInfo, TablePath,
+};
+use crate::proto::{MetadataResponse, PbBucketMetadata};
+use crate::rpc::{from_pb_server_node, from_pb_table_path};
+use crate::{BucketId, PartitionId, TableId};
+use rand::random_range;
+use std::collections::{HashMap, HashSet};
+use std::sync::Arc;
+
+/// Shared empty location list, handed out by
+/// `Cluster::get_available_buckets_for_table_path` when a path has no entry.
+static EMPTY: Vec<BucketLocation> = Vec::new();
+
+/// In-memory view of cluster metadata: the known server nodes, bucket
+/// locations, and the table / partition lookup tables.
+#[derive(Default)]
+pub struct Cluster {
+    /// Coordinator server node, when one is known.
+    coordinator_server: Option<ServerNode>,
+    /// Tablet servers keyed by server id.
+    alive_tablet_servers_by_id: HashMap<i32, ServerNode>,
+    /// The values of `alive_tablet_servers_by_id` as a list (derived in `new`).
+    alive_tablet_servers: Vec<ServerNode>,
+    /// Bucket locations grouped by physical table path.
+    available_locations_by_path: HashMap<Arc<PhysicalTablePath>, Vec<BucketLocation>>,
+    /// Bucket location looked up by bucket; `leader_for` reads from this map.
+    available_locations_by_bucket: HashMap<TableBucket, BucketLocation>,
+    /// Table id looked up by table path.
+    table_id_by_path: HashMap<TablePath, TableId>,
+    /// Inverse of `table_id_by_path` (derived in `new`).
+    table_path_by_id: HashMap<TableId, TablePath>,
+    /// Table info looked up by table path.
+    table_info_by_path: HashMap<TablePath, TableInfo>,
+    /// Partition id looked up by physical table path.
+    partitions_id_by_path: HashMap<Arc<PhysicalTablePath>, PartitionId>,
+    /// Partition name looked up by partition id (derived in `new` from the
+    /// paths in `partitions_id_by_path` that carry a partition name).
+    partition_name_by_id: HashMap<PartitionId, String>,
+}
+
+impl Cluster {
+    /// Builds a cluster view from the supplied lookup maps.
+    ///
+    /// Three indexes are derived here rather than passed in:
+    /// * the flat list of tablet servers,
+    /// * the table-id -> table-path map (inverse of `table_id_by_path`),
+    /// * the partition-id -> partition-name map, taken from every physical
+    ///   path in `partitions_id_by_path` that carries a partition name.
+    #[allow(clippy::too_many_arguments)]
+    pub fn new(
+        coordinator_server: Option<ServerNode>,
+        alive_tablet_servers_by_id: HashMap<i32, ServerNode>,
+        available_locations_by_path: HashMap<Arc<PhysicalTablePath>, Vec<BucketLocation>>,
+        available_locations_by_bucket: HashMap<TableBucket, BucketLocation>,
+        table_id_by_path: HashMap<TablePath, TableId>,
+        table_info_by_path: HashMap<TablePath, TableInfo>,
+        partitions_id_by_path: HashMap<Arc<PhysicalTablePath>, PartitionId>,
+    ) -> Self {
+        let mut alive_tablet_servers = Vec::with_capacity(alive_tablet_servers_by_id.len());
+        alive_tablet_servers.extend(alive_tablet_servers_by_id.values().cloned());
+
+        let mut table_path_by_id = HashMap::new();
+        for (path, table_id) in &table_id_by_path {
+            table_path_by_id.insert(*table_id, path.clone());
+        }
+
+        // Paths without a partition name (non-partitioned tables) contribute nothing.
+        let mut partition_name_by_id = HashMap::new();
+        for (path, partition_id) in &partitions_id_by_path {
+            if let Some(name) = path.get_partition_name() {
+                partition_name_by_id.insert(*partition_id, name.clone());
+            }
+        }
+
+        Self {
+            coordinator_server,
+            alive_tablet_servers_by_id,
+            alive_tablet_servers,
+            available_locations_by_path,
+            available_locations_by_bucket,
+            table_id_by_path,
+            table_path_by_id,
+            table_info_by_path,
+            partitions_id_by_path,
+            partition_name_by_id,
+        }
+    }
+
+    /// Returns a new cluster in which `server_id` is no longer an alive tablet
+    /// server and the bucket locations of the given tables have been dropped.
+    ///
+    /// Table ids with no known path are ignored. The coordinator and the
+    /// table-id, table-info and partition-id lookups are carried over unchanged.
+    pub fn invalidate_server(&self, server_id: &i32, table_ids: Vec<TableId>) -> Self {
+        let mut remaining_servers = HashMap::new();
+        for (id, node) in &self.alive_tablet_servers_by_id {
+            if id != server_id {
+                remaining_servers.insert(*id, node.clone());
+            }
+        }
+
+        let mut affected_tables: HashSet<&TablePath> = HashSet::new();
+        for table_id in &table_ids {
+            if let Some(path) = self.table_path_by_id.get(table_id) {
+                affected_tables.insert(path);
+            }
+        }
+
+        let (locations_by_path, locations_by_bucket) =
+            self.filter_bucket_locations_by_path(&affected_tables);
+
+        Cluster::new(
+            self.coordinator_server.clone(),
+            remaining_servers,
+            locations_by_path,
+            locations_by_bucket,
+            self.table_id_by_path.clone(),
+            self.table_info_by_path.clone(),
+            self.partitions_id_by_path.clone(),
+        )
+    }
+
+    /// Returns a new cluster with the bucket locations of the given physical
+    /// tables dropped.
+    ///
+    /// Invalidation is per table: each physical path is reduced to its table
+    /// path, and every location belonging to that table is removed. Servers and
+    /// the table / partition lookups are carried over unchanged.
+    pub fn invalidate_physical_table_meta(
+        &self,
+        physical_tables_to_invalid: &HashSet<PhysicalTablePath>,
+    ) -> Self {
+        let mut affected_tables: HashSet<&TablePath> = HashSet::new();
+        for physical_path in physical_tables_to_invalid {
+            affected_tables.insert(physical_path.get_table_path());
+        }
+
+        let (locations_by_path, locations_by_bucket) =
+            self.filter_bucket_locations_by_path(&affected_tables);
+
+        Cluster::new(
+            self.coordinator_server.clone(),
+            self.alive_tablet_servers_by_id.clone(),
+            locations_by_path,
+            locations_by_bucket,
+            self.table_id_by_path.clone(),
+            self.table_info_by_path.clone(),
+            self.partitions_id_by_path.clone(),
+        )
+    }
+
+    /// Replaces the entire contents of this cluster with `cluster`.
+    ///
+    /// Every field is taken from the new value; nothing is merged with the
+    /// previous state.
+    pub fn update(&mut self, cluster: Cluster) {
+        // A whole-value move assigns every field at once and keeps working
+        // unchanged when fields are added to `Cluster`.
+        *self = cluster;
+    }
+
+    /// Copies both bucket-location maps, leaving out every entry that belongs
+    /// to one of `table_paths`.
+    ///
+    /// Returns `(locations by physical path, location by bucket)`.
+    fn filter_bucket_locations_by_path(
+        &self,
+        table_paths: &HashSet<&TablePath>,
+    ) -> (
+        HashMap<Arc<PhysicalTablePath>, Vec<BucketLocation>>,
+        HashMap<TableBucket, BucketLocation>,
+    ) {
+        let mut locations_by_path = HashMap::new();
+        for (path, locations) in &self.available_locations_by_path {
+            if !table_paths.contains(path.get_table_path()) {
+                locations_by_path.insert(path.clone(), locations.clone());
+            }
+        }
+
+        let mut locations_by_bucket = HashMap::new();
+        for (bucket, location) in &self.available_locations_by_bucket {
+            // The owning table is read from the location, not from the bucket key.
+            let owning_table = location.physical_table_path.get_table_path();
+            if !table_paths.contains(owning_table) {
+                locations_by_bucket.insert(bucket.clone(), location.clone());
+            }
+        }
+
+        (locations_by_path, locations_by_bucket)
+    }
+
+    /// Builds a cluster view from a metadata response, optionally layered on
+    /// top of an existing cluster.
+    ///
+    /// When `origin_cluster` is given, its table, partition and bucket-location
+    /// entries are copied first and then overwritten by whatever the response
+    /// carries. The tablet-server list and the coordinator always come from the
+    /// response alone.
+    ///
+    /// Partition metadata is only processed when `origin_cluster` is given,
+    /// because the owning table path is resolved through the origin's table-id
+    /// index; without an origin it is skipped.
+    ///
+    /// # Errors
+    /// * `Error::JsonSerdeError` when a table's `table_json` is not valid JSON,
+    ///   plus any error returned by `TableDescriptor::deserialize_json`.
+    /// * An invalid-table error when partition metadata refers to a table id
+    ///   that the origin cluster does not know.
+    pub fn from_metadata_response(
+        metadata_response: MetadataResponse,
+        origin_cluster: Option<&Cluster>,
+    ) -> Result<Cluster> {
+        let mut servers = HashMap::with_capacity(metadata_response.tablet_servers.len());
+        for pb_server in metadata_response.tablet_servers {
+            let server_id = pb_server.node_id;
+            let server_node = from_pb_server_node(pb_server, ServerType::TabletServer);
+            servers.insert(server_id, server_node);
+        }
+
+        let coordinator_server = metadata_response
+            .coordinator_server
+            .map(|node| from_pb_server_node(node, ServerType::CoordinatorServer));
+
+        let mut table_id_by_path = HashMap::new();
+        let mut table_info_by_path = HashMap::new();
+        let mut partitions_id_by_path = HashMap::new();
+        let mut tmp_available_locations_by_path = HashMap::new();
+        let mut tmp_available_location_by_bucket = HashMap::new();
+
+        // Start from the origin's state so that tables and partitions not
+        // mentioned in this response are preserved.
+        if let Some(origin) = origin_cluster {
+            table_info_by_path.extend(origin.get_table_info_by_path().clone());
+            table_id_by_path.extend(origin.get_table_id_by_path().clone());
+            partitions_id_by_path.extend(origin.partitions_id_by_path.clone());
+            tmp_available_locations_by_path.extend(origin.available_locations_by_path.clone());
+            tmp_available_location_by_bucket.extend(origin.available_locations_by_bucket.clone());
+        }
+
+        // iterate all table metadata
+        for table_metadata in metadata_response.table_metadata {
+            let table_id = table_metadata.table_id;
+            let table_path = from_pb_table_path(&table_metadata.table_path);
+            let table_descriptor = TableDescriptor::deserialize_json(
+                &serde_json::from_slice(table_metadata.table_json.as_slice()).map_err(|e| {
+                    Error::JsonSerdeError {
+                        message: format!(
+                            "Error deserializing table_json into TableDescriptor for table_id {table_id} and table_path {table_path}: {e}"
+                        )
+                    }
+                })?,
+            )?;
+            let table_info = TableInfo::of(
+                table_path.clone(),
+                table_id,
+                table_metadata.schema_id,
+                table_descriptor,
+                table_metadata.created_time,
+                table_metadata.modified_time,
+            );
+            table_info_by_path.insert(table_path.clone(), table_info);
+            table_id_by_path.insert(table_path.clone(), table_id);
+
+            let bucket_metadata = table_metadata.bucket_metadata;
+            let physical_table_path = Arc::new(PhysicalTablePath::of(Arc::new(table_path.clone())));
+
+            let bucket_locations = get_bucket_locations(
+                &mut servers,
+                bucket_metadata.as_slice(),
+                table_id,
+                None,
+                &physical_table_path,
+            );
+            tmp_available_locations_by_path.insert(physical_table_path, bucket_locations);
+        }
+
+        // iterate all partition metadata
+        for partition_metadata in metadata_response.partition_metadata {
+            let table_id = partition_metadata.table_id;
+
+            // Without an origin cluster there is no table-id index to resolve
+            // the owning table path, so the partition entry is skipped.
+            if let Some(cluster) = origin_cluster {
+                let partition_name = partition_metadata.partition_name;
+                // Surface an unknown table id as an error instead of panicking:
+                // the response comes from the network and the caller already
+                // handles `Result`.
+                let table_path = cluster.get_table_path_by_id(table_id).ok_or_else(|| {
+                    Error::invalid_table(format!(
+                        "Table path not found for table_id {table_id} while loading partition metadata"
+                    ))
+                })?;
+                let partition_id = partition_metadata.partition_id;
+
+                let physical_table_path = Arc::new(PhysicalTablePath::of_partitioned(
+                    Arc::new(table_path.clone()),
+                    Some(partition_name),
+                ));
+
+                partitions_id_by_path.insert(Arc::clone(&physical_table_path), partition_id);
+
+                let bucket_locations = get_bucket_locations(
+                    &mut servers,
+                    partition_metadata.bucket_metadata.as_slice(),
+                    table_id,
+                    Some(partition_id),
+                    &physical_table_path,
+                );
+
+                tmp_available_locations_by_path.insert(physical_table_path, bucket_locations);
+            }
+        }
+
+        // Index every location that currently has a leader by its bucket.
+        // NOTE(review): entries copied from the origin are never removed here,
+        // so a bucket whose refreshed location has no leader keeps its old
+        // entry — confirm that this is intended.
+        for bucket_locations in tmp_available_locations_by_path.values() {
+            for location in bucket_locations {
+                if location.leader().is_some() {
+                    tmp_available_location_by_bucket
+                        .insert(location.table_bucket.clone(), location.clone());
+                }
+            }
+        }
+
+        Ok(Cluster::new(
+            coordinator_server,
+            servers,
+            tmp_available_locations_by_path,
+            tmp_available_location_by_bucket,
+            table_id_by_path,
+            table_info_by_path,
+            partitions_id_by_path,
+        ))
+    }
+
+    /// Returns the coordinator server node, if one is known.
+    pub fn get_coordinator_server(&self) -> Option<&ServerNode> {
+        self.coordinator_server.as_ref()
+    }
+
+    /// Returns the leader of `table_bucket`.
+    ///
+    /// `None` means either that the bucket has no recorded location or that
+    /// its location has no leader.
+    pub fn leader_for(&self, table_bucket: &TableBucket) -> Option<&ServerNode> {
+        self.available_locations_by_bucket
+            .get(table_bucket)
+            .and_then(|location| location.leader().as_ref())
+    }
+
+    /// Returns the alive tablet server with the given id, if any.
+    pub fn get_tablet_server(&self, id: i32) -> Option<&ServerNode> {
+        self.alive_tablet_servers_by_id.get(&id)
+    }
+
+    /// Builds the `TableBucket` for `bucket_id` within `physical_table_path`.
+    ///
+    /// # Errors
+    /// * The error from `get_table` when the table itself is unknown.
+    /// * A partition-not-exist error when the path names a partition for which
+    ///   no partition id is recorded.
+    pub fn get_table_bucket(
+        &self,
+        physical_table_path: &PhysicalTablePath,
+        bucket_id: BucketId,
+    ) -> Result<TableBucket> {
+        let table_info = self.get_table(physical_table_path.get_table_path())?;
+        let partition_id = self.get_partition_id(physical_table_path);
+
+        // A named partition must resolve to an id; a path without a partition
+        // name legitimately has none.
+        if let (Some(partition_name), None) =
+            (physical_table_path.get_partition_name(), partition_id)
+        {
+            return Err(Error::partition_not_exist(format!(
+                "The partition {partition_name} is not found in cluster"
+            )));
+        }
+
+        Ok(TableBucket::new_with_partition(
+            table_info.table_id,
+            partition_id,
+            bucket_id,
+        ))
+    }
+
+    /// Returns the partition id recorded for `physical_table_path`, if any.
+    pub fn get_partition_id(&self, physical_table_path: &PhysicalTablePath) -> Option<PartitionId> {
+        self.partitions_id_by_path.get(physical_table_path).copied()
+    }
+
+    /// Returns the partition name recorded for `partition_id`, if any.
+    pub fn get_partition_name(&self, partition_id: PartitionId) -> Option<&String> {
+        self.partition_name_by_id.get(&partition_id)
+    }
+
+    /// Returns the table id recorded for `table_path`, if any.
+    pub fn get_table_id(&self, table_path: &TablePath) -> Option<i64> {
+        self.table_id_by_path.get(table_path).copied()
+    }
+
+    /// Returns the full map of bucket locations keyed by physical table path.
+    pub fn get_bucket_locations_by_path(
+        &self,
+    ) -> &HashMap<Arc<PhysicalTablePath>, Vec<BucketLocation>> {
+        &self.available_locations_by_path
+    }
+
+    /// Returns the full map of table info keyed by table path.
+    pub fn get_table_info_by_path(&self) -> &HashMap<TablePath, TableInfo> {
+        &self.table_info_by_path
+    }
+
+    /// Returns the full map of table ids keyed by table path.
+    pub fn get_table_id_by_path(&self) -> &HashMap<TablePath, i64> {
+        &self.table_id_by_path
+    }
+
+    /// Returns the table path recorded for `table_id`, if any.
+    pub fn get_table_path_by_id(&self, table_id: TableId) -> Option<&TablePath> {
+        self.table_path_by_id.get(&table_id)
+    }
+
+    /// Returns the bucket locations recorded for `table_path`.
+    ///
+    /// An unknown path yields a reference to the shared empty list `EMPTY`
+    /// rather than an `Option`, so callers can iterate unconditionally.
+    pub fn get_available_buckets_for_table_path(
+        &self,
+        table_path: &PhysicalTablePath,
+    ) -> &Vec<BucketLocation> {
+        self.available_locations_by_path
+            .get(table_path)
+            .unwrap_or(&EMPTY)
+    }
+
+    /// Returns clones of all known server nodes: the coordinator first (when
+    /// present), followed by the alive tablet servers.
+    pub fn get_server_nodes(&self) -> Vec<ServerNode> {
+        self.coordinator_server
+            .iter()
+            .chain(self.alive_tablet_servers.iter())
+            .cloned()
+            .collect()
+    }
+
+    /// Picks one alive tablet server at random, or `None` when there are none.
+    pub fn get_one_available_server(&self) -> Option<&ServerNode> {
+        let servers = &self.alive_tablet_servers;
+        match servers.len() {
+            // Checked first so `random_range` is never given an empty range.
+            0 => None,
+            count => servers.get(random_range(0..count)),
+        }
+    }
+
+    /// Returns the bucket count recorded for `table_path`.
+    ///
+    /// # Panics
+    /// Panics when no table info is recorded for `table_path`. Use `get_table`
+    /// or `opt_get_table` when the table may be unknown.
+    pub fn get_bucket_count(&self, table_path: &TablePath) -> i32 {
+        self.table_info_by_path
+            .get(table_path)
+            .unwrap_or_else(|| panic!("can't find table info by path {table_path}"))
+            .num_buckets
+    }
+
+    /// Returns the table info recorded for `table_path`.
+    ///
+    /// # Errors
+    /// Returns an invalid-table error when the path is unknown.
+    pub fn get_table(&self, table_path: &TablePath) -> Result<&TableInfo> {
+        let Some(table_info) = self.table_info_by_path.get(table_path) else {
+            return Err(Error::invalid_table(format!(
+                "Table info not found for {table_path}"
+            )));
+        };
+        Ok(table_info)
+    }
+
+    /// Like `get_table`, but reports an unknown path as `None`.
+    pub fn opt_get_table(&self, table_path: &TablePath) -> Option<&TableInfo> {
+        self.table_info_by_path.get(table_path)
+    }
+
+    /// Returns the full map of partition ids keyed by physical table path.
+    pub fn get_partition_id_by_path(&self) -> &HashMap<Arc<PhysicalTablePath>, PartitionId> {
+        &self.partitions_id_by_path
+    }
+}
+
+/// Converts protobuf bucket metadata into `BucketLocation`s for one physical
+/// table path.
+///
+/// A location gets a leader only when the metadata names a leader id and that
+/// id is present in `servers`; otherwise it is created without a leader.
+fn get_bucket_locations(
+    servers: &mut HashMap<i32, ServerNode>,
+    bucket_metadata: &[PbBucketMetadata],
+    table_id: i64,
+    partition_id: Option<PartitionId>,
+    physical_table_path: &Arc<PhysicalTablePath>,
+) -> Vec<BucketLocation> {
+    bucket_metadata
+        .iter()
+        .map(|metadata| {
+            let leader = metadata
+                .leader_id
+                .and_then(|leader_id| servers.get(&leader_id))
+                .cloned();
+            BucketLocation::new(
+                TableBucket::new_with_partition(table_id, partition_id, metadata.bucket_id),
+                leader,
+                Arc::clone(physical_table_path),
+            )
+        })
+        .collect()
+}
+
+// Unit tests for `ServerNode`, `ServerType` and `Cluster::get_server_nodes`.
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // Fixture: a coordinator node with id 0.
+    fn make_coordinator() -> ServerNode {
+        ServerNode::new(
+            0,
+            "coord-host".to_string(),
+            9123,
+            ServerType::CoordinatorServer,
+        )
+    }
+
+    // Fixture: two tablet servers keyed by their ids (1 and 2).
+    fn make_tablet_servers() -> HashMap<i32, ServerNode> {
+        let mut servers = HashMap::new();
+        servers.insert(
+            1,
+            ServerNode::new(1, "ts1-host".to_string(), 9124, ServerType::TabletServer),
+        );
+        servers.insert(
+            2,
+            ServerNode::new(2, "ts2-host".to_string(), 9125, ServerType::TabletServer),
+        );
+        servers
+    }
+
+    // Accessors return the constructor arguments; uid and url are derived.
+    #[test]
+    fn test_server_node_getters() {
+        let node = ServerNode::new(5, "myhost".to_string(), 8080, ServerType::TabletServer);
+        assert_eq!(node.id(), 5);
+        assert_eq!(node.host(), "myhost");
+        assert_eq!(node.port(), 8080);
+        assert_eq!(node.server_type(), &ServerType::TabletServer);
+        assert_eq!(node.uid(), "ts-5");
+        assert_eq!(node.url(), "myhost:8080");
+    }
+
+    // `Display` prints the variant name.
+    #[test]
+    fn test_server_type_display() {
+        assert_eq!(ServerType::TabletServer.to_string(), "TabletServer");
+        assert_eq!(
+            ServerType::CoordinatorServer.to_string(),
+            "CoordinatorServer"
+        );
+    }
+
+    // Coordinator plus two tablet servers yields three nodes in total.
+    #[test]
+    fn test_get_server_nodes_with_coordinator_and_tablets() {
+        let cluster = Cluster::new(
+            Some(make_coordinator()),
+            make_tablet_servers(),
+            HashMap::new(),
+            HashMap::new(),
+            HashMap::new(),
+            HashMap::new(),
+            HashMap::new(),
+        );
+
+        let nodes = cluster.get_server_nodes();
+        assert_eq!(nodes.len(), 3);
+
+        let coordinator_count = nodes
+            .iter()
+            .filter(|n| *n.server_type() == ServerType::CoordinatorServer)
+            .count();
+        assert_eq!(coordinator_count, 1);
+
+        let tablet_count = nodes
+            .iter()
+            .filter(|n| *n.server_type() == ServerType::TabletServer)
+            .count();
+        assert_eq!(tablet_count, 2);
+    }
+
+    // Without a coordinator only the tablet servers are returned.
+    #[test]
+    fn test_get_server_nodes_no_coordinator() {
+        let cluster = Cluster::new(
+            None,
+            make_tablet_servers(),
+            HashMap::new(),
+            HashMap::new(),
+            HashMap::new(),
+            HashMap::new(),
+            HashMap::new(),
+        );
+
+        let nodes = cluster.get_server_nodes();
+        assert_eq!(nodes.len(), 2);
+        assert!(
+            nodes
+                .iter()
+                .all(|n| *n.server_type() == ServerType::TabletServer)
+        );
+    }
+
+    // A default-constructed cluster has no server nodes at all.
+    #[test]
+    fn test_get_server_nodes_empty_cluster() {
+        let cluster = Cluster::default();
+        let nodes = cluster.get_server_nodes();
+        assert!(nodes.is_empty());
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/cluster/mod.rs b/fluss-rust/crates/fluss/src/cluster/mod.rs
new file mode 100644
index 0000000000..863f8ed509
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/cluster/mod.rs
@@ -0,0 +1,143 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::BucketId;
+use crate::metadata::{PhysicalTablePath, TableBucket};
+use std::fmt;
+use std::sync::Arc;
+
+#[allow(clippy::module_inception)]
+mod cluster;
+
+pub use cluster::Cluster;
+
+/// A single server in the cluster: numeric id, network address and server type.
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub struct ServerNode {
+    id: i32,
+    /// Derived in `new`: `cs-<id>` for a coordinator, `ts-<id>` for a tablet server.
+    uid: String,
+    host: String,
+    port: u32,
+    server_type: ServerType,
+}
+
+impl ServerNode {
+    /// Creates a node and derives its `uid` from the server type and id.
+    pub fn new(id: i32, host: String, port: u32, server_type: ServerType) -> ServerNode {
+        let uid_prefix = match server_type {
+            ServerType::CoordinatorServer => "cs",
+            ServerType::TabletServer => "ts",
+        };
+        Self {
+            id,
+            uid: format!("{uid_prefix}-{id}"),
+            host,
+            port,
+            server_type,
+        }
+    }
+
+    /// The derived unique id (`cs-<id>` or `ts-<id>`).
+    pub fn uid(&self) -> &str {
+        self.uid.as_str()
+    }
+
+    /// The address formatted as `host:port`.
+    pub fn url(&self) -> String {
+        let Self { host, port, .. } = self;
+        format!("{host}:{port}")
+    }
+
+    /// The numeric server id.
+    pub fn id(&self) -> i32 {
+        self.id
+    }
+
+    /// The host name.
+    pub fn host(&self) -> &str {
+        self.host.as_str()
+    }
+
+    /// The port number.
+    pub fn port(&self) -> u32 {
+        self.port
+    }
+
+    /// The kind of server this node is.
+    pub fn server_type(&self) -> &ServerType {
+        &self.server_type
+    }
+}
+
+/// The kind of server a `ServerNode` represents.
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub enum ServerType {
+    TabletServer,
+    CoordinatorServer,
+}
+
+impl ServerType {
+    /// Numeric type id: `1` for the coordinator server, `2` for a tablet server.
+    pub fn to_type_id(&self) -> i32 {
+        match self {
+            Self::CoordinatorServer => 1,
+            Self::TabletServer => 2,
+        }
+    }
+
+    /// Inverse of `to_type_id`; any id that no variant maps to yields `None`.
+    pub fn from_type_id(type_id: i32) -> Option<ServerType> {
+        [Self::CoordinatorServer, Self::TabletServer]
+            .into_iter()
+            .find(|candidate| candidate.to_type_id() == type_id)
+    }
+}
+
+impl fmt::Display for ServerType {
+    /// Writes the variant name.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let variant_name = match self {
+            Self::TabletServer => "TabletServer",
+            Self::CoordinatorServer => "CoordinatorServer",
+        };
+        f.write_str(variant_name)
+    }
+}
+
+/// Where a table bucket lives: the bucket, its leader (if known) and the
+/// physical table path it belongs to.
+#[derive(Debug, Clone)]
+pub struct BucketLocation {
+    /// The bucket this location describes.
+    pub table_bucket: TableBucket,
+    /// Leader server of the bucket; `None` when no leader is known.
+    leader: Option<ServerNode>,
+    /// Physical table path that owns the bucket.
+    physical_table_path: Arc<PhysicalTablePath>,
+}
+
+impl BucketLocation {
+    /// Creates a location from its three parts, stored as given.
+    pub fn new(
+        table_bucket: TableBucket,
+        leader: Option<ServerNode>,
+        physical_table_path: Arc<PhysicalTablePath>,
+    ) -> BucketLocation {
+        BucketLocation {
+            table_bucket,
+            leader,
+            physical_table_path,
+        }
+    }
+
+    /// The leader server, or `None` when no leader is known.
+    pub fn leader(&self) -> &Option<ServerNode> {
+        &self.leader
+    }
+
+    /// The bucket this location describes.
+    pub fn table_bucket(&self) -> &TableBucket {
+        &self.table_bucket
+    }
+
+    /// The bucket id, as reported by the underlying `TableBucket`.
+    pub fn bucket_id(&self) -> BucketId {
+        self.table_bucket.bucket_id()
+    }
+
+    /// The physical table path that owns the bucket.
+    pub fn physical_table_path(&self) -> &Arc<PhysicalTablePath> {
+        &self.physical_table_path
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/compression/arrow_compression.rs b/fluss-rust/crates/fluss/src/compression/arrow_compression.rs
new file mode 100644
index 0000000000..8121a512b1
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/compression/arrow_compression.rs
@@ -0,0 +1,264 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::error::{Error, Result};
+use arrow::ipc::CompressionType;
+use arrow_schema::ArrowError;
+use std::collections::HashMap;
+
+/// Table property holding the ZSTD compression level.
+pub const TABLE_LOG_ARROW_COMPRESSION_ZSTD_LEVEL: &str = "table.log.arrow.compression.zstd.level";
+/// Table property holding the Arrow compression type.
+pub const TABLE_LOG_ARROW_COMPRESSION_TYPE: &str = "table.log.arrow.compression.type";
+/// Level recorded for every compression type other than ZSTD.
+pub const DEFAULT_NON_ZSTD_COMPRESSION_LEVEL: i32 = -1;
+/// Level used for ZSTD when the level property is absent.
+pub const DEFAULT_ZSTD_COMPRESSION_LEVEL: i32 = 3;
+
+/// Compression applied to Arrow log data.
+#[derive(Clone, Debug, PartialEq)]
+pub enum ArrowCompressionType {
+    None,
+    Lz4Frame,
+    Zstd,
+}
+
+impl ArrowCompressionType {
+    /// Reads the compression type from table properties.
+    ///
+    /// A missing property means ZSTD. Recognised values are `NONE`,
+    /// `LZ4_FRAME` and `ZSTD` (exact, case-sensitive match).
+    ///
+    /// # Errors
+    /// `Error::IllegalArgument` for any other value.
+    fn from_conf(properties: &HashMap<String, String>) -> Result<Self> {
+        let Some(configured) = properties.get(TABLE_LOG_ARROW_COMPRESSION_TYPE) else {
+            return Ok(Self::Zstd);
+        };
+
+        match configured.as_str() {
+            "NONE" => Ok(Self::None),
+            "LZ4_FRAME" => Ok(Self::Lz4Frame),
+            "ZSTD" => Ok(Self::Zstd),
+            other => Err(Error::IllegalArgument {
+                message: format!("Unsupported compression type: {other}"),
+            }),
+        }
+    }
+}
+
+/// Compression type together with the level to use for it.
+#[derive(Clone, Debug)]
+pub struct ArrowCompressionInfo {
+    pub compression_type: ArrowCompressionType,
+    pub compression_level: i32,
+}
+
+impl ArrowCompressionInfo {
+    /// Reads compression type and level from table properties.
+    ///
+    /// Non-ZSTD types always get `DEFAULT_NON_ZSTD_COMPRESSION_LEVEL`; the
+    /// level property is not consulted for them. For ZSTD a missing level
+    /// means `DEFAULT_ZSTD_COMPRESSION_LEVEL`.
+    ///
+    /// # Errors
+    /// * `Error::IllegalArgument` for an unsupported type, a level that is not
+    ///   an integer, or a level outside `1..=22`.
+    /// * `Error::ArrowError` for an in-range ZSTD level other than the default.
+    pub fn from_conf(properties: &HashMap<String, String>) -> Result<Self> {
+        let compression_type = ArrowCompressionType::from_conf(properties)?;
+
+        if compression_type != ArrowCompressionType::Zstd {
+            return Ok(Self {
+                compression_type,
+                compression_level: DEFAULT_NON_ZSTD_COMPRESSION_LEVEL,
+            });
+        }
+
+        let Some(raw_level) = properties.get(TABLE_LOG_ARROW_COMPRESSION_ZSTD_LEVEL) else {
+            return Ok(Self {
+                compression_type,
+                compression_level: DEFAULT_ZSTD_COMPRESSION_LEVEL,
+            });
+        };
+
+        let level = raw_level
+            .parse::<i32>()
+            .map_err(|e| Error::IllegalArgument {
+                message: format!(
+                    "Invalid ZSTD compression level. Expected a value between 1 and 22. {e}"
+                ),
+            })?;
+
+        if !(1..=22).contains(&level) {
+            return Err(Error::IllegalArgument {
+                message: format!(
+                    "Invalid ZSTD compression level: {level}. Expected a value between 1 and 22."
+                ),
+            });
+        }
+
+        // TODO Remove once non-default ZSTD compression level is implemented https://github.com/apache/fluss-rust/issues/109
+        if level != DEFAULT_ZSTD_COMPRESSION_LEVEL {
+            return Err(Error::ArrowError {
+                message: format!(
+                    "Rust client currently only implements default ZSTD compression level {DEFAULT_ZSTD_COMPRESSION_LEVEL}. Got: {level}."
+                ),
+                source: ArrowError::NotYetImplemented(format!("zstd compression level {level}.")),
+            });
+        }
+
+        Ok(Self {
+            compression_type,
+            compression_level: level,
+        })
+    }
+
+    /// Test-only constructor that bypasses property parsing and validation.
+    #[cfg(test)]
+    fn new(compression_type: ArrowCompressionType, compression_level: i32) -> ArrowCompressionInfo {
+        Self {
+            compression_type,
+            compression_level,
+        }
+    }
+
+    /// Maps to the Arrow IPC compression type; `None` means uncompressed.
+    pub fn get_compression_type(&self) -> Option<CompressionType> {
+        match &self.compression_type {
+            ArrowCompressionType::None => None,
+            ArrowCompressionType::Lz4Frame => Some(CompressionType::LZ4_FRAME),
+            ArrowCompressionType::Zstd => Some(CompressionType::ZSTD),
+        }
+    }
+}
+
+// Unit tests for parsing compression settings from table properties.
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::collections::HashMap;
+
+    // Missing property defaults to ZSTD; each supported name maps to its variant.
+    #[test]
+    fn test_from_conf() {
+        assert_eq!(
+            ArrowCompressionType::from_conf(&HashMap::new()).unwrap(),
+            ArrowCompressionType::Zstd
+        );
+
+        assert_eq!(
+            ArrowCompressionType::from_conf(&mk_map(&[(
+                "table.log.arrow.compression.type",
+                "NONE"
+            )]))
+            .unwrap(),
+            ArrowCompressionType::None
+        );
+
+        assert_eq!(
+            ArrowCompressionType::from_conf(&mk_map(&[(
+                "table.log.arrow.compression.type",
+                "LZ4_FRAME"
+            )]))
+            .unwrap(),
+            ArrowCompressionType::Lz4Frame
+        );
+
+        assert_eq!(
+            ArrowCompressionType::from_conf(&mk_map(&[(
+                "table.log.arrow.compression.type",
+                "ZSTD"
+            )]))
+            .unwrap(),
+            ArrowCompressionType::Zstd
+        );
+    }
+
+    // An unknown compression type is rejected with an illegal-argument error.
+    #[test]
+    fn test_from_conf_invalid_compression_type() {
+        let props = mk_map(&[("table.log.arrow.compression.type", "FOO")]);
+
+        assert!(
+            ArrowCompressionInfo::from_conf(&props)
+                .unwrap_err()
+                .to_string()
+                .contains(
+                    "Fluss hitting illegal argument error Unsupported compression type: FOO."
+                )
+        );
+    }
+
+    // ZSTD without an explicit level uses the default level 3.
+    #[test]
+    fn test_from_conf_zstd_compression_level() {
+        let compression_info = ArrowCompressionInfo::from_conf(&mk_map(&[(
+            "table.log.arrow.compression.type",
+            "ZSTD",
+        )]));
+        assert_eq!(compression_info.unwrap().compression_level, 3);
+    }
+
+    // TODO Remove once non-default ZSTD compression level is implemented https://github.com/apache/fluss-rust/issues/109
+    #[test]
+    fn test_from_conf_zstd_compression_level_error_when_non_default() {
+        let result = ArrowCompressionInfo::from_conf(&mk_map(&[
+            ("table.log.arrow.compression.type", "ZSTD"),
+            ("table.log.arrow.compression.zstd.level", "1"),
+        ]));
+        assert!(result.is_err());
+        assert!(result.unwrap_err().to_string().contains(
+            "Rust client currently only implements default ZSTD compression level 3. Got: 1."
+        ));
+    }
+
+    // Levels just outside 1..=22 (0 and 23) are rejected.
+    #[test]
+    fn test_from_conf_compression_level_out_of_range() {
+        let props = mk_map(&[
+            ("table.log.arrow.compression.type", "ZSTD"),
+            ("table.log.arrow.compression.zstd.level", "0"),
+        ]);
+
+        assert!(
+            ArrowCompressionInfo::from_conf(&props)
+                .unwrap_err()
+                .to_string()
+                .contains("Expected a value between 1 and 22.")
+        );
+
+        let props = mk_map(&[
+            ("table.log.arrow.compression.type", "ZSTD"),
+            ("table.log.arrow.compression.zstd.level", "23"),
+        ]);
+
+        assert!(
+            ArrowCompressionInfo::from_conf(&props)
+                .unwrap_err()
+                .to_string()
+                .contains("Expected a value between 1 and 22.")
+        );
+    }
+
+    // A level that is not an integer is rejected with the same range hint.
+    #[test]
+    fn test_from_conf_compression_level_parse_error() {
+        let props = mk_map(&[
+            ("table.log.arrow.compression.type", "ZSTD"),
+            ("table.log.arrow.compression.zstd.level", "not-a-number"),
+        ]);
+
+        assert!(
+            ArrowCompressionInfo::from_conf(&props)
+                .unwrap_err()
+                .to_string()
+                .contains("Expected a value between 1 and 22.")
+        );
+    }
+
+    // Each internal compression type maps to the matching Arrow IPC type.
+    #[test]
+    fn get_compression_type_maps_correctly() {
+        assert_eq!(
+            ArrowCompressionInfo::new(ArrowCompressionType::None, -1).get_compression_type(),
+            None
+        );
+        assert_eq!(
+            ArrowCompressionInfo::new(ArrowCompressionType::Lz4Frame, -1).get_compression_type(),
+            Some(CompressionType::LZ4_FRAME)
+        );
+        assert_eq!(
+            ArrowCompressionInfo::new(ArrowCompressionType::Zstd, -1).get_compression_type(),
+            Some(CompressionType::ZSTD)
+        );
+    }
+
+    // Helper: builds an owned property map from borrowed key/value pairs.
+    fn mk_map(pairs: &[(&str, &str)]) -> HashMap<String, String> {
+        pairs
+            .iter()
+            .map(|(k, v)| (k.to_string(), v.to_string()))
+            .collect()
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/compression/arrow_compression_ratio_estimator.rs b/fluss-rust/crates/fluss/src/compression/arrow_compression_ratio_estimator.rs
new file mode 100644
index 0000000000..08b8048aa4
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/compression/arrow_compression_ratio_estimator.rs
@@ -0,0 +1,105 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::sync::atomic::{AtomicU32, Ordering};
+
+/// Adaptive estimator for Arrow compression ratios.
+///
+/// Tracks the ratio between compressed and uncompressed Arrow body sizes.
+/// The estimate adjusts asymmetrically: it increases quickly when compression
+/// worsens (to avoid underestimating batch sizes) and decreases slowly when
+/// compression improves (conservative).
+///
+/// Thread-safe: uses atomic f32 (stored as u32 bits) matching Java's `volatile float`.
+/// Each load and store is individually atomic (`Ordering::Relaxed`), but
+/// `update_estimation` performs a load followed by a separate store rather than
+/// a single read-modify-write, so concurrent updates can overwrite one another.
+///
+/// Matching Java's `ArrowCompressionRatioEstimator`.
+pub struct ArrowCompressionRatioEstimator {
+    /// Stored as `f32::to_bits()` for atomic access.
+    /// Decode with `f32::from_bits` (see `estimation`).
+    ratio_bits: AtomicU32,
+}
+
+// Largest amount the estimate may drop in one `update_estimation` call when the
+// observed ratio is lower than the current estimate.
+const COMPRESSION_RATIO_IMPROVING_STEP: f32 = 0.005;
+// Smallest amount the estimate rises in one `update_estimation` call when the
+// observed ratio is higher than the current estimate.
+const COMPRESSION_RATIO_DETERIORATE_STEP: f32 = 0.05;
+// Estimate reported by a freshly constructed estimator.
+const DEFAULT_COMPRESSION_RATIO: f32 = 1.0;
+
+impl ArrowCompressionRatioEstimator {
+    /// Creates an estimator whose initial estimate is `DEFAULT_COMPRESSION_RATIO`.
+    pub fn new() -> Self {
+        let initial_bits = DEFAULT_COMPRESSION_RATIO.to_bits();
+        Self {
+            ratio_bits: AtomicU32::new(initial_bits),
+        }
+    }
+
+    /// Returns the current estimated compression ratio.
+    pub fn estimation(&self) -> f32 {
+        let bits = self.ratio_bits.load(Ordering::Relaxed);
+        f32::from_bits(bits)
+    }
+
+    /// Folds a newly observed ratio into the estimate.
+    ///
+    /// * observed > current: the estimate becomes
+    ///   `max(current + COMPRESSION_RATIO_DETERIORATE_STEP, observed)`.
+    /// * observed < current: the estimate becomes
+    ///   `max(current - COMPRESSION_RATIO_IMPROVING_STEP, observed)`.
+    /// * otherwise (equal, or not comparable because a value is NaN): no change.
+    pub fn update_estimation(&self, observed_ratio: f32) {
+        let previous = self.estimation();
+        let stepped = match observed_ratio.partial_cmp(&previous) {
+            Some(std::cmp::Ordering::Greater) => previous + COMPRESSION_RATIO_DETERIORATE_STEP,
+            Some(std::cmp::Ordering::Less) => previous - COMPRESSION_RATIO_IMPROVING_STEP,
+            // Equal or unordered: keep the stored estimate untouched.
+            Some(std::cmp::Ordering::Equal) | None => return,
+        };
+        // In both directions the observed ratio acts as a lower bound.
+        let updated = stepped.max(observed_ratio);
+        self.ratio_bits.store(updated.to_bits(), Ordering::Relaxed);
+    }
+}
+
+// `Default` simply delegates to `new()`, so both construction paths start from
+// the same initial estimate.
+impl Default for ArrowCompressionRatioEstimator {
+    fn default() -> Self {
+        ArrowCompressionRatioEstimator::new()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// A fresh estimator reports the default ratio.
+    #[test]
+    fn test_default_ratio_is_one() {
+        assert_eq!(ArrowCompressionRatioEstimator::new().estimation(), 1.0);
+    }
+
+    /// An observation above the estimate raises it by at least the
+    /// deterioration step.
+    #[test]
+    fn test_deterioration_jumps_quickly() {
+        let estimator = ArrowCompressionRatioEstimator::new();
+        estimator.update_estimation(1.1);
+
+        let after = estimator.estimation();
+        assert!(after >= 1.05);
+    }
+
+    /// An observation below the estimate lowers it by at most the improving
+    /// step.
+    #[test]
+    fn test_improvement_moves_slowly() {
+        let estimator = ArrowCompressionRatioEstimator::new();
+        estimator.update_estimation(0.5);
+
+        let distance = (estimator.estimation() - 0.995).abs();
+        assert!(distance < 0.001);
+    }
+
+    /// Repeating the same observation many times converges on it.
+    #[test]
+    fn test_converges_to_observed() {
+        let estimator = ArrowCompressionRatioEstimator::new();
+        (0..1000).for_each(|_| estimator.update_estimation(0.7));
+
+        let distance = (estimator.estimation() - 0.7).abs();
+        assert!(distance < 0.01);
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/compression/mod.rs b/fluss-rust/crates/fluss/src/compression/mod.rs
new file mode 100644
index 0000000000..29923c0a84
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/compression/mod.rs
@@ -0,0 +1,22 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+mod arrow_compression;
+mod arrow_compression_ratio_estimator;
+
+pub use arrow_compression::*;
+pub use arrow_compression_ratio_estimator::*;
diff --git a/fluss-rust/crates/fluss/src/config.rs b/fluss-rust/crates/fluss/src/config.rs
new file mode 100644
index 0000000000..cad8d9cb55
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/config.rs
@@ -0,0 +1,683 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use clap::{Parser, ValueEnum};
+use serde::{Deserialize, Serialize};
+use strum_macros::{Display, EnumString};
+
+// Default values for `Config`. They are used by the clap `default_value_t`
+// attributes on the struct fields; most are also reused by
+// `impl Default for Config`.
+const DEFAULT_BOOTSTRAP_SERVER: &str = "127.0.0.1:9123";
+// 10 * 1024 * 1024 = 10_485_760 (presumably bytes — TODO confirm unit).
+const DEFAULT_REQUEST_MAX_SIZE: i32 = 10 * 1024 * 1024;
+// 2 * 1024 * 1024 = 2_097_152 (presumably bytes — TODO confirm unit).
+const DEFAULT_WRITER_BATCH_SIZE: i32 = 2 * 1024 * 1024;
+// Mirrors Java's `2 * pageSize` floor with default pageSize = 128 KB.
+const DEFAULT_WRITER_DYNAMIC_BATCH_SIZE_MIN: i32 = 256 * 1024;
+const DEFAULT_WRITER_DYNAMIC_BATCH_SIZE_ENABLED: bool = true;
+// Largest `i32`; `validate_writer` requires retries > 0 when idempotence is on.
+const DEFAULT_RETRIES: i32 = i32::MAX;
+const DEFAULT_PREFETCH_NUM: usize = 4;
+const DEFAULT_DOWNLOAD_THREADS: usize = 3;
+const DEFAULT_SCANNER_REMOTE_LOG_READ_CONCURRENCY: usize = 4;
+const DEFAULT_MAX_POLL_RECORDS: usize = 500;
+// 16 * 1024 * 1024 = 16_777_216 bytes per fetch response.
+const DEFAULT_SCANNER_LOG_FETCH_MAX_BYTES: i32 = 16 * 1024 * 1024;
+const DEFAULT_SCANNER_LOG_FETCH_MIN_BYTES: i32 = 1;
+const DEFAULT_SCANNER_LOG_FETCH_WAIT_MAX_TIME_MS: i32 = 500;
+const DEFAULT_WRITER_BATCH_TIMEOUT_MS: i64 = 100;
+// 1024 * 1024 = 1_048_576 bytes per bucket in a fetch response.
+const DEFAULT_SCANNER_LOG_FETCH_MAX_BYTES_FOR_BUCKET: i32 = 1024 * 1024;
+const DEFAULT_WRITER_MAX_INFLIGHT_REQUESTS_PER_BUCKET: usize = 5;
+const DEFAULT_WRITER_BUFFER_MEMORY_SIZE: usize = 64 * 1024 * 1024; // 64MB, matching Java
+// Largest `u64`. NOTE(review): presumably means "wait without a practical
+// limit" — confirm against the code that consumes this timeout.
+const DEFAULT_WRITER_BUFFER_WAIT_TIMEOUT_MS: u64 = u64::MAX;
+
+// Upper bound that `validate_writer` enforces on
+// `writer_max_inflight_requests_per_bucket` when idempotence is enabled.
+const MAX_IN_FLIGHT_REQUESTS_PER_BUCKET_FOR_IDEMPOTENCE: usize = 5;
+// "all" satisfies the acks requirement checked by `validate_writer` for idempotent writes.
+const DEFAULT_ACKS: &str = "all";
+// 120_000 ms = 120 seconds.
+const DEFAULT_CONNECT_TIMEOUT_MS: u64 = 120_000;
+// Not "sasl", so `is_sasl_enabled()` is false for a default config.
+const DEFAULT_SECURITY_PROTOCOL: &str = "PLAINTEXT";
+// The only mechanism `validate_security` accepts.
+const DEFAULT_SASL_MECHANISM: &str = "PLAIN";
+
+/// Bucket assigner strategy for tables without bucket keys.
+/// Matches Java `client.writer.bucket.no-key-assigner`.
+// String forms: serde renames variants to snake_case (`rename_all`), and the
+// strum derives use the `serialize` names given on each variant, with ASCII
+// case-insensitive matching enabled (`ascii_case_insensitive`).
+// NOTE(review): clap's `ValueEnum` derive chooses its own CLI spelling
+// (presumably kebab-case, i.e. `round-robin`) — confirm the command-line value
+// is the one intended.
+// NOTE(review): `///` comments on the variants presumably double as clap help
+// text for the possible values, so keep them user-facing.
+#[derive(
+    Debug, Clone, Copy, PartialEq, Eq, ValueEnum, Deserialize, Serialize, EnumString, Display,
+)]
+#[serde(rename_all = "snake_case")]
+#[strum(ascii_case_insensitive)]
+pub enum NoKeyAssigner {
+    /// Sticks to one bucket until the batch is full, then switches.
+    #[strum(serialize = "sticky")]
+    Sticky,
+    /// Assigns each record to the next bucket in a rotating sequence.
+    #[strum(serialize = "round_robin")]
+    RoundRobin,
+}
+
+// Client configuration. The struct derives clap's `Parser` (each field is a
+// `--long` argument with the default shown in its `default_value_t`) and
+// serde's `Deserialize`/`Serialize`. `Debug` is implemented by hand further
+// down so the SASL password can be redacted.
+// NOTE(review): clap's derive uses `///` doc comments on fields as `--help`
+// text, so reviewer-only notes in this struct are written with `//`.
+#[derive(Parser, Clone, Deserialize, Serialize)]
+#[command(author, version, about, long_about = None)]
+pub struct Config {
+    // Bootstrap server address; default "127.0.0.1:9123".
+    // NOTE(review): presumably accepts a list of `host:port` entries — confirm.
+    #[arg(long, default_value_t = String::from(DEFAULT_BOOTSTRAP_SERVER))]
+    pub bootstrap_servers: String,
+
+    // `validate_writer` requires this to be > 0 and >= `writer_batch_size`.
+    #[arg(long, default_value_t = DEFAULT_REQUEST_MAX_SIZE)]
+    pub writer_request_max_size: i32,
+
+    // With idempotence enabled, `validate_writer` only accepts "all"
+    // (ASCII case-insensitive) or "-1".
+    #[arg(long, default_value_t = String::from(DEFAULT_ACKS))]
+    pub writer_acks: String,
+
+    // Must be > 0 when idempotence is enabled (see `validate_writer`).
+    #[arg(long, default_value_t = DEFAULT_RETRIES)]
+    pub writer_retries: i32,
+
+    // `validate_writer` requires 0 < value <= `writer_request_max_size` and
+    // value <= `writer_buffer_memory_size`.
+    #[arg(long, default_value_t = DEFAULT_WRITER_BATCH_SIZE)]
+    pub writer_batch_size: i32,
+
+    /// Tune the per-table writer batch size from observed fill ratios.
+    /// Default: true (matching Java `client.writer.dynamic-batch-size.enabled`).
+    #[arg(long, default_value_t = DEFAULT_WRITER_DYNAMIC_BATCH_SIZE_ENABLED)]
+    pub writer_dynamic_batch_size_enabled: bool,
+
+    /// Lower bound for the dynamic batch size estimator.
+    /// Default: 262144 (256 KB), matching Java's `2 * pageSize` floor.
+    /// Ignored when `writer_dynamic_batch_size_enabled` is false.
+    // `validate_writer` checks 0 < value <= `writer_batch_size` regardless of
+    // whether dynamic batch sizing is enabled.
+    #[arg(long, default_value_t = DEFAULT_WRITER_DYNAMIC_BATCH_SIZE_MIN)]
+    pub writer_dynamic_batch_size_min: i32,
+
+    // Bucket assignment strategy for tables without bucket keys; default sticky.
+    #[arg(long, value_enum, default_value_t = NoKeyAssigner::Sticky)]
+    pub writer_bucket_no_key_assigner: NoKeyAssigner,
+
+    /// Maximum number of remote log segments to prefetch
+    /// Default: 4 (matching Java CLIENT_SCANNER_REMOTE_LOG_PREFETCH_NUM)
+    #[arg(long, default_value_t = DEFAULT_PREFETCH_NUM)]
+    pub scanner_remote_log_prefetch_num: usize,
+
+    /// Maximum concurrent remote log downloads
+    /// Default: 3 (matching Java REMOTE_FILE_DOWNLOAD_THREAD_NUM)
+    #[arg(long, default_value_t = DEFAULT_DOWNLOAD_THREADS)]
+    pub remote_file_download_thread_num: usize,
+
+    /// Intra-file remote log read concurrency for each remote segment download.
+    /// Download path always uses streaming reader.
+    // Default: 4; `validate_scanner` rejects 0.
+    #[arg(long, default_value_t = DEFAULT_SCANNER_REMOTE_LOG_READ_CONCURRENCY)]
+    pub scanner_remote_log_read_concurrency: usize,
+
+    /// Maximum number of records returned in a single call to poll() for LogScanner.
+    /// Default: 500 (matching Java CLIENT_SCANNER_LOG_MAX_POLL_RECORDS)
+    #[arg(long, default_value_t = DEFAULT_MAX_POLL_RECORDS)]
+    pub scanner_log_max_poll_records: usize,
+
+    /// Maximum bytes per fetch response for LogScanner.
+    /// Default: 16777216 (16MB)
+    #[arg(long, default_value_t = DEFAULT_SCANNER_LOG_FETCH_MAX_BYTES)]
+    pub scanner_log_fetch_max_bytes: i32,
+
+    /// Minimum bytes to accumulate before returning a fetch response.
+    /// Default: 1
+    #[arg(long, default_value_t = DEFAULT_SCANNER_LOG_FETCH_MIN_BYTES)]
+    pub scanner_log_fetch_min_bytes: i32,
+
+    /// Maximum time the server may wait (ms) to satisfy min-bytes.
+    /// Default: 500
+    #[arg(long, default_value_t = DEFAULT_SCANNER_LOG_FETCH_WAIT_MAX_TIME_MS)]
+    pub scanner_log_fetch_wait_max_time_ms: i32,
+
+    /// The maximum time to wait for a batch to be completed in milliseconds.
+    /// Default: 100 (matching Java CLIENT_WRITER_BATCH_TIMEOUT)
+    #[arg(long, default_value_t = DEFAULT_WRITER_BATCH_TIMEOUT_MS)]
+    pub writer_batch_timeout_ms: i64,
+
+    /// Maximum bytes per fetch response **per bucket** for LogScanner.
+    /// Default: 1048576 (1MB)
+    // `validate_scanner` requires 0 < value <= `scanner_log_fetch_max_bytes`.
+    #[arg(long, default_value_t = DEFAULT_SCANNER_LOG_FETCH_MAX_BYTES_FOR_BUCKET)]
+    pub scanner_log_fetch_max_bytes_for_bucket: i32,
+
+    /// Whether to enable idempotent writes. When enabled, each batch is tagged with
+    /// a server-allocated writer ID and per-bucket sequence number so the server can
+    /// detect and deduplicate retried batches.
+    /// Default: true (matching Java CLIENT_WRITER_ENABLE_IDEMPOTENCE)
+    #[arg(long, default_value_t = true)]
+    pub writer_enable_idempotence: bool,
+
+    /// Maximum number of in-flight requests per bucket for idempotent writes.
+    /// Default: 5 (matching Java client.writer.max-inflight-requests-per-bucket)
+    // Must be > 0 always, and <= 5 when idempotence is enabled (`validate_writer`).
+    #[arg(long, default_value_t = DEFAULT_WRITER_MAX_INFLIGHT_REQUESTS_PER_BUCKET)]
+    pub writer_max_inflight_requests_per_bucket: usize,
+
+    /// Total memory available for buffering write batches across all buckets.
+    /// When this limit is reached, `upsert()`/`append()` will block until
+    /// in-flight batches complete and free memory.
+    /// Default: 64MB (matching Java's LazyMemorySegmentPool: 512 pages x 128KB)
+    #[arg(long, default_value_t = DEFAULT_WRITER_BUFFER_MEMORY_SIZE)]
+    pub writer_buffer_memory_size: usize,
+
+    /// Maximum time in milliseconds to block waiting for buffer memory.
+    /// If the timeout is exceeded, the write call returns an error.
+    // Default: `u64::MAX`.
+    #[arg(long, default_value_t = DEFAULT_WRITER_BUFFER_WAIT_TIMEOUT_MS)]
+    pub writer_buffer_wait_timeout_ms: u64,
+
+    /// Connect timeout in milliseconds for TCP transport connect.
+    /// Default: 120000 (120 seconds).
+    #[arg(long, default_value_t = DEFAULT_CONNECT_TIMEOUT_MS)]
+    pub connect_timeout_ms: u64,
+
+    // "sasl" (ASCII case-insensitive) turns SASL authentication on, see
+    // `is_sasl_enabled`; default "PLAINTEXT".
+    #[arg(long, default_value_t = String::from(DEFAULT_SECURITY_PROTOCOL))]
+    pub security_protocol: String,
+
+    // Only "PLAIN" (ASCII case-insensitive) passes `validate_security`.
+    #[arg(long, default_value_t = String::from(DEFAULT_SASL_MECHANISM))]
+    pub security_sasl_mechanism: String,
+
+    // Must be non-empty when SASL is enabled.
+    #[arg(long, default_value_t = String::new())]
+    pub security_sasl_username: String,
+
+    // Must be non-empty when SASL is enabled. Skipped when serializing and
+    // printed as "[REDACTED]" by the manual `Debug` impl.
+    #[arg(long, default_value_t = String::new())]
+    #[serde(skip_serializing)]
+    pub security_sasl_password: String,
+
+    /// Maximum number of pending lookup operations
+    /// Default: 25600 (matching Java CLIENT_LOOKUP_QUEUE_SIZE)
+    #[arg(long, default_value_t = 25600)]
+    pub lookup_queue_size: usize,
+
+    /// Maximum batch size of merging lookup operations to one lookup request
+    /// Default: 128 (matching Java CLIENT_LOOKUP_MAX_BATCH_SIZE)
+    #[arg(long, default_value_t = 128)]
+    pub lookup_max_batch_size: usize,
+
+    /// Maximum time to wait for the lookup batch to fill (in milliseconds)
+    /// Default: 100 (matching Java CLIENT_LOOKUP_BATCH_TIMEOUT)
+    #[arg(long, default_value_t = 100)]
+    pub lookup_batch_timeout_ms: u64,
+
+    /// Maximum number of unacknowledged lookup requests
+    /// Default: 128 (matching Java CLIENT_LOOKUP_MAX_INFLIGHT_SIZE)
+    #[arg(long, default_value_t = 128)]
+    pub lookup_max_inflight_requests: usize,
+
+    /// Maximum number of lookup retries
+    /// Default: i32::MAX (matching Java CLIENT_LOOKUP_MAX_RETRIES)
+    #[arg(long, default_value_t = i32::MAX)]
+    pub lookup_max_retries: i32,
+}
+
+// Hand-written `Debug` so the SASL password is never printed: every field of
+// `Config` is shown as-is except `security_sasl_password`, which is replaced by
+// the literal "[REDACTED]". Keep this list in sync with the struct definition.
+impl std::fmt::Debug for Config {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("Config")
+            .field("bootstrap_servers", &self.bootstrap_servers)
+            .field("writer_request_max_size", &self.writer_request_max_size)
+            .field("writer_acks", &self.writer_acks)
+            .field("writer_retries", &self.writer_retries)
+            .field("writer_batch_size", &self.writer_batch_size)
+            .field(
+                "writer_dynamic_batch_size_enabled",
+                &self.writer_dynamic_batch_size_enabled,
+            )
+            .field(
+                "writer_dynamic_batch_size_min",
+                &self.writer_dynamic_batch_size_min,
+            )
+            .field(
+                "writer_bucket_no_key_assigner",
+                &self.writer_bucket_no_key_assigner,
+            )
+            .field(
+                "scanner_remote_log_prefetch_num",
+                &self.scanner_remote_log_prefetch_num,
+            )
+            .field(
+                "remote_file_download_thread_num",
+                &self.remote_file_download_thread_num,
+            )
+            // Previously missing from the output although it is a regular,
+            // non-sensitive field of the struct.
+            .field(
+                "scanner_remote_log_read_concurrency",
+                &self.scanner_remote_log_read_concurrency,
+            )
+            .field(
+                "scanner_log_max_poll_records",
+                &self.scanner_log_max_poll_records,
+            )
+            .field(
+                "scanner_log_fetch_max_bytes",
+                &self.scanner_log_fetch_max_bytes,
+            )
+            .field(
+                "scanner_log_fetch_min_bytes",
+                &self.scanner_log_fetch_min_bytes,
+            )
+            .field(
+                "scanner_log_fetch_max_bytes_for_bucket",
+                &self.scanner_log_fetch_max_bytes_for_bucket,
+            )
+            .field(
+                "scanner_log_fetch_wait_max_time_ms",
+                &self.scanner_log_fetch_wait_max_time_ms,
+            )
+            .field("writer_batch_timeout_ms", &self.writer_batch_timeout_ms)
+            .field("writer_enable_idempotence", &self.writer_enable_idempotence)
+            .field(
+                "writer_max_inflight_requests_per_bucket",
+                &self.writer_max_inflight_requests_per_bucket,
+            )
+            .field("writer_buffer_memory_size", &self.writer_buffer_memory_size)
+            .field(
+                "writer_buffer_wait_timeout_ms",
+                &self.writer_buffer_wait_timeout_ms,
+            )
+            .field("connect_timeout_ms", &self.connect_timeout_ms)
+            .field("security_protocol", &self.security_protocol)
+            .field("security_sasl_mechanism", &self.security_sasl_mechanism)
+            .field("security_sasl_username", &self.security_sasl_username)
+            // Never expose the secret, not even in debug logs.
+            .field("security_sasl_password", &"[REDACTED]")
+            .field("lookup_queue_size", &self.lookup_queue_size)
+            .field("lookup_max_batch_size", &self.lookup_max_batch_size)
+            .field("lookup_batch_timeout_ms", &self.lookup_batch_timeout_ms)
+            .field(
+                "lookup_max_inflight_requests",
+                &self.lookup_max_inflight_requests,
+            )
+            .field("lookup_max_retries", &self.lookup_max_retries)
+            .finish()
+    }
+}
+
+// Programmatic defaults. Every value must equal the corresponding clap
+// `default_value_t` on the `Config` field so that `Config::default()` and a
+// command line without arguments produce the same configuration.
+impl Default for Config {
+    fn default() -> Self {
+        Self {
+            bootstrap_servers: String::from(DEFAULT_BOOTSTRAP_SERVER),
+            writer_request_max_size: DEFAULT_REQUEST_MAX_SIZE,
+            writer_acks: String::from(DEFAULT_ACKS),
+            // Use the shared constant (same value as before, `i32::MAX`) so this
+            // cannot drift from the clap default, which references it too.
+            writer_retries: DEFAULT_RETRIES,
+            writer_batch_size: DEFAULT_WRITER_BATCH_SIZE,
+            writer_dynamic_batch_size_enabled: DEFAULT_WRITER_DYNAMIC_BATCH_SIZE_ENABLED,
+            writer_dynamic_batch_size_min: DEFAULT_WRITER_DYNAMIC_BATCH_SIZE_MIN,
+            writer_bucket_no_key_assigner: NoKeyAssigner::Sticky,
+            scanner_remote_log_prefetch_num: DEFAULT_PREFETCH_NUM,
+            remote_file_download_thread_num: DEFAULT_DOWNLOAD_THREADS,
+            scanner_remote_log_read_concurrency: DEFAULT_SCANNER_REMOTE_LOG_READ_CONCURRENCY,
+            scanner_log_max_poll_records: DEFAULT_MAX_POLL_RECORDS,
+            scanner_log_fetch_max_bytes: DEFAULT_SCANNER_LOG_FETCH_MAX_BYTES,
+            scanner_log_fetch_min_bytes: DEFAULT_SCANNER_LOG_FETCH_MIN_BYTES,
+            scanner_log_fetch_wait_max_time_ms: DEFAULT_SCANNER_LOG_FETCH_WAIT_MAX_TIME_MS,
+            scanner_log_fetch_max_bytes_for_bucket: DEFAULT_SCANNER_LOG_FETCH_MAX_BYTES_FOR_BUCKET,
+            writer_batch_timeout_ms: DEFAULT_WRITER_BATCH_TIMEOUT_MS,
+            writer_enable_idempotence: true,
+            writer_max_inflight_requests_per_bucket:
+                DEFAULT_WRITER_MAX_INFLIGHT_REQUESTS_PER_BUCKET,
+            writer_buffer_memory_size: DEFAULT_WRITER_BUFFER_MEMORY_SIZE,
+            writer_buffer_wait_timeout_ms: DEFAULT_WRITER_BUFFER_WAIT_TIMEOUT_MS,
+            connect_timeout_ms: DEFAULT_CONNECT_TIMEOUT_MS,
+            security_protocol: String::from(DEFAULT_SECURITY_PROTOCOL),
+            security_sasl_mechanism: String::from(DEFAULT_SASL_MECHANISM),
+            security_sasl_username: String::new(),
+            security_sasl_password: String::new(),
+            // The lookup defaults are literals here and in the `#[arg]`
+            // attributes on `Config`; change both places together.
+            lookup_queue_size: 25600,
+            lookup_max_batch_size: 128,
+            lookup_batch_timeout_ms: 100,
+            lookup_max_inflight_requests: 128,
+            lookup_max_retries: i32::MAX,
+        }
+    }
+}
+
+impl Config {
+    /// Reports whether `security_protocol` selects SASL authentication.
+    ///
+    /// The comparison against `"sasl"` ignores ASCII case, matching Java's
+    /// `SaslAuthenticationPlugin` which registers as `"sasl"`.
+    pub fn is_sasl_enabled(&self) -> bool {
+        "sasl".eq_ignore_ascii_case(&self.security_protocol)
+    }
+
+    /// Checks the SASL settings.
+    ///
+    /// Returns `Ok(())` when SASL is disabled or fully configured. Otherwise
+    /// returns the message for the first problem found, in this order:
+    /// unsupported mechanism, empty username, empty password.
+    pub fn validate_security(&self) -> Result<(), String> {
+        if !self.is_sasl_enabled() {
+            return Ok(());
+        }
+        let mechanism = self.security_sasl_mechanism.as_str();
+        if !mechanism.eq_ignore_ascii_case("PLAIN") {
+            return Err(format!(
+                "Unsupported SASL mechanism: '{}'. Only 'PLAIN' is supported.",
+                mechanism
+            ));
+        }
+        Self::first_violation(&[
+            (
+                self.security_sasl_username.is_empty(),
+                "security_sasl_username must be set when security_protocol is 'sasl'",
+            ),
+            (
+                self.security_sasl_password.is_empty(),
+                "security_sasl_password must be set when security_protocol is 'sasl'",
+            ),
+        ])
+    }
+
+    /// Checks the scanner / remote-log settings and returns the message of the
+    /// first violated rule, evaluated in the order listed below.
+    pub fn validate_scanner(&self) -> Result<(), String> {
+        let min_bytes = self.scanner_log_fetch_min_bytes;
+        let max_bytes = self.scanner_log_fetch_max_bytes;
+        let bucket_max_bytes = self.scanner_log_fetch_max_bytes_for_bucket;
+
+        // scanner_log_max_poll_records: validation intentionally omitted to match Java behavior.
+        // Java allows 0 — tracked in https://github.com/apache/fluss/issues/3068
+        Self::first_violation(&[
+            (
+                self.scanner_remote_log_prefetch_num == 0,
+                "scanner_remote_log_prefetch_num must be > 0",
+            ),
+            (
+                self.scanner_remote_log_read_concurrency == 0,
+                "scanner_remote_log_read_concurrency must be > 0",
+            ),
+            (
+                self.remote_file_download_thread_num == 0,
+                "remote_file_download_thread_num must be > 0",
+            ),
+            (min_bytes <= 0, "scanner_log_fetch_min_bytes must be > 0"),
+            (max_bytes <= 0, "scanner_log_fetch_max_bytes must be > 0"),
+            (
+                max_bytes < min_bytes,
+                "scanner_log_fetch_max_bytes must be >= scanner_log_fetch_min_bytes",
+            ),
+            (
+                self.scanner_log_fetch_wait_max_time_ms < 0,
+                "scanner_log_fetch_wait_max_time_ms must be >= 0",
+            ),
+            (
+                bucket_max_bytes <= 0,
+                "scanner_log_fetch_max_bytes_for_bucket must be > 0",
+            ),
+            (
+                bucket_max_bytes > max_bytes,
+                "scanner_log_fetch_max_bytes_for_bucket must be <= scanner_log_fetch_max_bytes",
+            ),
+        ])
+    }
+
+    /// Checks the writer settings.
+    ///
+    /// The size/limit rules are always applied. The idempotence rules (acks,
+    /// retries, in-flight bound) are applied only when
+    /// `writer_enable_idempotence` is true. The first violated rule wins.
+    pub fn validate_writer(&self) -> Result<(), String> {
+        let batch_size = self.writer_batch_size;
+        let dynamic_min = self.writer_dynamic_batch_size_min;
+        let inflight = self.writer_max_inflight_requests_per_bucket;
+
+        Self::first_violation(&[
+            (
+                self.writer_request_max_size <= 0,
+                "writer_request_max_size must be > 0",
+            ),
+            (batch_size <= 0, "writer_batch_size must be > 0"),
+            (
+                self.writer_batch_timeout_ms < 0,
+                "writer_batch_timeout_ms must be >= 0",
+            ),
+            (
+                inflight == 0,
+                "writer_max_inflight_requests_per_bucket must be > 0",
+            ),
+            (
+                self.writer_buffer_memory_size == 0,
+                "writer_buffer_memory_size must be > 0",
+            ),
+            (
+                batch_size > self.writer_request_max_size,
+                "writer_batch_size must be <= writer_request_max_size",
+            ),
+            (
+                batch_size as usize > self.writer_buffer_memory_size,
+                "writer_batch_size must be <= writer_buffer_memory_size",
+            ),
+            (dynamic_min <= 0, "writer_dynamic_batch_size_min must be > 0"),
+            (
+                dynamic_min > batch_size,
+                "writer_dynamic_batch_size_min must be <= writer_batch_size",
+            ),
+        ])?;
+
+        // idempotence checks
+        if !self.writer_enable_idempotence {
+            return Ok(());
+        }
+        let acks = self.writer_acks.as_str();
+        if !(acks.eq_ignore_ascii_case("all") || acks == "-1") {
+            Err(format!(
+                "Idempotent writes require acks='all' (-1), but got acks='{}'",
+                acks
+            ))
+        } else if self.writer_retries <= 0 {
+            Err(format!(
+                "Idempotent writes require retries > 0, but got retries={}",
+                self.writer_retries
+            ))
+        } else if inflight > MAX_IN_FLIGHT_REQUESTS_PER_BUCKET_FOR_IDEMPOTENCE {
+            Err(format!(
+                "Idempotent writes require max-inflight-requests-per-bucket <= {}, but got {}",
+                MAX_IN_FLIGHT_REQUESTS_PER_BUCKET_FOR_IDEMPOTENCE, inflight
+            ))
+        } else {
+            Ok(())
+        }
+    }
+
+    /// Returns `Err` carrying the message of the first rule whose flag is
+    /// `true`, or `Ok(())` when no rule is violated.
+    fn first_violation(rules: &[(bool, &str)]) -> Result<(), String> {
+        match rules.iter().find(|(violated, _)| *violated) {
+            Some((_, message)) => Err((*message).to_string()),
+            None => Ok(()),
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Default config with only the four SASL-related fields overridden.
+    fn sasl_config(protocol: &str, mechanism: &str, username: &str, password: &str) -> Config {
+        Config {
+            security_protocol: protocol.to_string(),
+            security_sasl_mechanism: mechanism.to_string(),
+            security_sasl_username: username.to_string(),
+            security_sasl_password: password.to_string(),
+            ..Config::default()
+        }
+    }
+
+    /// Asserts that `validate_scanner` rejects the given config.
+    fn assert_scanner_rejects(config: Config) {
+        assert!(config.validate_scanner().is_err());
+    }
+
+    /// Asserts that `validate_writer` rejects the given config.
+    fn assert_writer_rejects(config: Config) {
+        assert!(config.validate_writer().is_err());
+    }
+
+    // ---- security ----
+
+    #[test]
+    fn test_default_is_not_sasl() {
+        let defaults = Config::default();
+        assert!(!defaults.is_sasl_enabled());
+        assert!(defaults.validate_security().is_ok());
+    }
+
+    #[test]
+    fn test_sasl_enabled_valid() {
+        let config = sasl_config("sasl", "PLAIN", "admin", "secret");
+        assert!(config.is_sasl_enabled());
+        assert!(config.validate_security().is_ok());
+    }
+
+    #[test]
+    fn test_sasl_enabled_case_insensitive() {
+        // Mechanism stays at its default value.
+        let config = sasl_config("SASL", DEFAULT_SASL_MECHANISM, "admin", "secret");
+        assert!(config.is_sasl_enabled());
+        assert!(config.validate_security().is_ok());
+    }
+
+    #[test]
+    fn test_sasl_missing_username() {
+        // Empty username is the default value of the field.
+        let config = sasl_config("sasl", DEFAULT_SASL_MECHANISM, "", "secret");
+        assert!(config.validate_security().is_err());
+    }
+
+    #[test]
+    fn test_sasl_missing_password() {
+        // Empty password is the default value of the field.
+        let config = sasl_config("sasl", DEFAULT_SASL_MECHANISM, "admin", "");
+        assert!(config.validate_security().is_err());
+    }
+
+    #[test]
+    fn test_sasl_unsupported_mechanism() {
+        let config = sasl_config("sasl", "SCRAM-SHA-256", "admin", "secret");
+        assert!(config.validate_security().is_err());
+    }
+
+    // ---- scanner ----
+
+    #[test]
+    fn test_scanner_defaults_valid() {
+        assert!(Config::default().validate_scanner().is_ok());
+    }
+
+    #[test]
+    fn test_scanner_remote_log_prefetch_num_zero() {
+        assert_scanner_rejects(Config {
+            scanner_remote_log_prefetch_num: 0,
+            ..Config::default()
+        });
+    }
+
+    #[test]
+    fn test_scanner_remote_log_read_concurrency_zero() {
+        assert_scanner_rejects(Config {
+            scanner_remote_log_read_concurrency: 0,
+            ..Config::default()
+        });
+    }
+
+    #[test]
+    fn test_remote_file_download_thread_num_zero() {
+        assert_scanner_rejects(Config {
+            remote_file_download_thread_num: 0,
+            ..Config::default()
+        });
+    }
+
+    #[test]
+    fn test_scanner_fetch_invalid_ranges() {
+        // min > max must be rejected.
+        assert_scanner_rejects(Config {
+            scanner_log_fetch_min_bytes: 2,
+            scanner_log_fetch_max_bytes: 1,
+            ..Config::default()
+        });
+    }
+
+    #[test]
+    fn test_scanner_fetch_negative_wait() {
+        assert_scanner_rejects(Config {
+            scanner_log_fetch_wait_max_time_ms: -1,
+            ..Config::default()
+        });
+    }
+
+    // ---- writer ----
+
+    #[test]
+    fn test_writer_defaults_valid() {
+        assert!(Config::default().validate_writer().is_ok());
+    }
+
+    #[test]
+    fn test_writer_request_max_size_zero() {
+        assert_writer_rejects(Config {
+            writer_request_max_size: 0,
+            ..Config::default()
+        });
+    }
+
+    #[test]
+    fn test_writer_batch_size_zero() {
+        assert_writer_rejects(Config {
+            writer_batch_size: 0,
+            ..Config::default()
+        });
+    }
+
+    #[test]
+    fn test_writer_batch_timeout_negative() {
+        assert_writer_rejects(Config {
+            writer_batch_timeout_ms: -1,
+            ..Config::default()
+        });
+    }
+
+    #[test]
+    fn test_writer_max_inflight_requests_per_bucket_zero() {
+        assert_writer_rejects(Config {
+            writer_max_inflight_requests_per_bucket: 0,
+            ..Config::default()
+        });
+    }
+
+    #[test]
+    fn test_writer_buffer_memory_size_zero() {
+        assert_writer_rejects(Config {
+            writer_buffer_memory_size: 0,
+            ..Config::default()
+        });
+    }
+
+    #[test]
+    fn test_writer_batch_size_exceeds_request_max_size() {
+        assert_writer_rejects(Config {
+            writer_batch_size: 20 * 1024 * 1024,
+            writer_request_max_size: 10 * 1024 * 1024,
+            ..Config::default()
+        });
+    }
+
+    #[test]
+    fn test_writer_batch_size_exceeds_buffer_memory_size() {
+        assert_writer_rejects(Config {
+            writer_batch_size: 128 * 1024 * 1024,
+            writer_buffer_memory_size: 64 * 1024 * 1024,
+            ..Config::default()
+        });
+    }
+
+    // ---- idempotence ----
+
+    #[test]
+    fn test_idempotence_disabled_skips_validation() {
+        // None of the idempotence-only rules apply once the feature is off.
+        let relaxed = Config {
+            writer_enable_idempotence: false,
+            writer_acks: "0".to_string(),
+            writer_retries: 0,
+            writer_max_inflight_requests_per_bucket: 100,
+            ..Config::default()
+        };
+        assert!(relaxed.validate_writer().is_ok());
+    }
+
+    #[test]
+    fn test_idempotence_requires_acks_all() {
+        assert_writer_rejects(Config {
+            writer_enable_idempotence: true,
+            writer_acks: "1".to_string(),
+            ..Config::default()
+        });
+    }
+
+    #[test]
+    fn test_idempotence_requires_retries() {
+        assert_writer_rejects(Config {
+            writer_enable_idempotence: true,
+            writer_retries: 0,
+            ..Config::default()
+        });
+    }
+
+    #[test]
+    fn test_idempotence_requires_bounded_inflight() {
+        assert_writer_rejects(Config {
+            writer_enable_idempotence: true,
+            writer_max_inflight_requests_per_bucket: 10,
+            ..Config::default()
+        });
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/error.rs b/fluss-rust/crates/fluss/src/error.rs
new file mode 100644
index 0000000000..4bd0690ead
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/error.rs
@@ -0,0 +1,250 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+pub use crate::rpc::RpcError;
+pub use crate::rpc::{ApiError, FlussError};
+
+use arrow_schema::ArrowError;
+use snafu::Snafu;
+use std::{io, result};
+use strum::ParseError;
+
+/// Result alias whose error type is fixed to this module's [`Error`].
+pub type Result<T> = result::Result<T, Error>;
+
+/// Error type carried by the [`Result`] alias defined in this module.
+///
+/// Each variant's `Display` text comes from the `#[snafu(display(...))]`
+/// attribute attached to it.
+#[derive(Debug, Snafu)]
+pub enum Error {
+    /// Catch-all variant (snafu `whatever`): a free-form message plus an optional boxed source.
+    #[snafu(
+        whatever,
+        display("Fluss hitting unexpected error {}: {:?}", message, source)
+    )]
+    UnexpectedError {
+        message: String,
+        /// see <https://github.com/shepmaster/snafu/issues/446>
+        #[snafu(source(from(Box<dyn std::error::Error + Send + Sync + 'static>, Some)))]
+        source: Option<Box<dyn std::error::Error + Send + Sync + 'static>>,
+    },
+
+    /// An [`io::Error`] with a message; also built by the `From<io::Error>` impl in this file.
+    #[snafu(
+        visibility(pub(crate)),
+        display("Fluss hitting unexpected io error {}: {:?}", message, source)
+    )]
+    IoUnexpectedError { message: String, source: io::Error },
+
+    /// An `opendal::Error` with a message; also built by the `From<opendal::Error>` impl in this file.
+    #[snafu(
+        visibility(pub(crate)),
+        display(
+            "Fluss hitting remote storage unexpected error {}: {:?}",
+            message,
+            source
+        )
+    )]
+    RemoteStorageUnexpectedError {
+        message: String,
+        source: opendal::Error,
+    },
+
+    /// JSON serde failure described only by a message (no source error is kept).
+    #[snafu(
+        visibility(pub(crate)),
+        display("Fluss hitting json serde error {}.", message)
+    )]
+    JsonSerdeError { message: String },
+
+    /// An [`RpcError`] with a message; reported as retriable by [`Error::is_retriable`].
+    #[snafu(
+        visibility(pub(crate)),
+        display("Fluss hitting unexpected rpc error {}: {:?}", message, source)
+    )]
+    RpcError { message: String, source: RpcError },
+
+    /// Row conversion failure described only by a message.
+    #[snafu(
+        visibility(pub(crate)),
+        display("Fluss hitting row convert error {}.", message)
+    )]
+    RowConvertError { message: String },
+
+    /// An [`ArrowError`] with a message; also built by the `From<ArrowError>` impl in this file.
+    #[snafu(
+        visibility(pub(crate)),
+        display("Fluss hitting Arrow error {}: {:?}.", message, source)
+    )]
+    ArrowError { message: String, source: ArrowError },
+
+    /// Illegal-argument failure; the `From<ParseError>` impl in this file also maps to it.
+    #[snafu(
+        visibility(pub(crate)),
+        display("Fluss hitting illegal argument error {}.", message)
+    )]
+    IllegalArgument { message: String },
+
+    /// "IO not supported" failure described only by a message.
+    #[snafu(
+        visibility(pub(crate)),
+        display("Fluss hitting IO not supported error {}.", message)
+    )]
+    IoUnsupported { message: String },
+
+    /// Wakeup failure described only by a message.
+    #[snafu(
+        visibility(pub(crate)),
+        display("Fluss hitting wakeup error {}.", message)
+    )]
+    WakeupError { message: String },
+    /// Unsupported-operation failure described only by a message.
+    #[snafu(
+        visibility(pub(crate)),
+        display("Fluss hitting unsupported operation error {}.", message)
+    )]
+    UnsupportedOperation { message: String },
+
+    /// "Writer closed" failure described only by a message.
+    #[snafu(visibility(pub(crate)), display("Fluss writer closed: {}.", message))]
+    WriterClosed { message: String },
+
+    /// "Buffer exhausted" failure described only by a message.
+    #[snafu(
+        visibility(pub(crate)),
+        display("Fluss buffer exhausted: {}.", message)
+    )]
+    BufferExhausted { message: String },
+
+    /// Carries an [`ApiError`]; [`Error::api_error`] and [`Error::is_retriable`] inspect it.
+    #[snafu(visibility(pub(crate)), display("Fluss API Error: {}.", api_error))]
+    FlussAPIError { api_error: ApiError },
+
+    /// Unsupported API version, described only by a message.
+    #[snafu(
+        visibility(pub(crate)),
+        display("Unsupported API version: {}.", message)
+    )]
+    UnsupportedVersion { message: String },
+
+    /// The server advertised a `server_type` that does not match the one expected
+    /// for the target `ServerNode` (e.g. connecting to a coordinator on a tablet
+    /// server address).
+    #[snafu(visibility(pub(crate)), display("Invalid server type: {}.", message))]
+    InvalidServerType { message: String },
+}
+
+/// Client-side constructors and classification helpers for [`Error`].
+///
+/// The constructors produce `FlussAPIError` values that carry the protocol error code of
+/// the matching [`FlussError`] kind, consistent with Java where e.g.
+/// `InvalidTableException` always carries code 15.
+impl Error {
+    /// Wraps `message` in a `FlussAPIError` tagged with the protocol code of `kind`.
+    fn api_error_of(kind: FlussError, message: impl Into<String>) -> Self {
+        let api_error = ApiError {
+            code: kind.code(),
+            message: message.into(),
+        };
+        Error::FlussAPIError { api_error }
+    }
+
+    /// API error carrying the `TableNotExist` code.
+    pub fn table_not_exist(message: impl Into<String>) -> Self {
+        Self::api_error_of(FlussError::TableNotExist, message)
+    }
+
+    /// API error carrying the `InvalidTableException` code.
+    pub fn invalid_table(message: impl Into<String>) -> Self {
+        Self::api_error_of(FlussError::InvalidTableException, message)
+    }
+
+    /// API error carrying the `PartitionNotExists` code.
+    pub fn partition_not_exist(message: impl Into<String>) -> Self {
+        Self::api_error_of(FlussError::PartitionNotExists, message)
+    }
+
+    /// API error carrying the `PartitionSpecInvalidException` code.
+    pub fn invalid_partition(message: impl Into<String>) -> Self {
+        Self::api_error_of(FlussError::PartitionSpecInvalidException, message)
+    }
+
+    /// API error carrying the `LeaderNotAvailableException` code.
+    pub fn leader_not_available(message: impl Into<String>) -> Self {
+        Self::api_error_of(FlussError::LeaderNotAvailableException, message)
+    }
+
+    /// For a `FlussAPIError`, the [`FlussError`] kind looked up from its code; `None` for
+    /// every other variant. Intended for ergonomic pattern matching.
+    pub fn api_error(&self) -> Option<FlussError> {
+        match self {
+            Error::FlussAPIError { api_error } => Some(FlussError::for_code(api_error.code)),
+            _ => None,
+        }
+    }
+
+    /// Whether retrying the request may succeed.
+    ///
+    /// [`Error::RpcError`] is always retriable, [`Error::FlussAPIError`] defers to
+    /// [`ApiError::is_retriable`], and every other variant is not retriable.
+    pub fn is_retriable(&self) -> bool {
+        if let Error::FlussAPIError { api_error } = self {
+            return api_error.is_retriable();
+        }
+        matches!(self, Error::RpcError { .. })
+    }
+}
+
+/// Wraps an [`ArrowError`] in [`Error::ArrowError`], using its `Display` text as the message.
+impl From<ArrowError> for Error {
+    fn from(value: ArrowError) -> Self {
+        let message = value.to_string();
+        Error::ArrowError {
+            message,
+            source: value,
+        }
+    }
+}
+
+/// Wraps an [`RpcError`] in [`Error::RpcError`], using its `Display` text as the message.
+impl From<RpcError> for Error {
+    fn from(value: RpcError) -> Self {
+        let message = value.to_string();
+        Error::RpcError {
+            message,
+            source: value,
+        }
+    }
+}
+
+/// Wraps an [`io::Error`] in [`Error::IoUnexpectedError`], using its `Display` text as the message.
+impl From<io::Error> for Error {
+    fn from(value: io::Error) -> Self {
+        let message = value.to_string();
+        Error::IoUnexpectedError {
+            message,
+            source: value,
+        }
+    }
+}
+
+/// Wraps an `opendal::Error` in [`Error::RemoteStorageUnexpectedError`], using its
+/// `Display` text as the message.
+impl From<opendal::Error> for Error {
+    fn from(value: opendal::Error) -> Self {
+        let message = value.to_string();
+        Error::RemoteStorageUnexpectedError {
+            message,
+            source: value,
+        }
+    }
+}
+
+/// Wraps an [`ApiError`] in [`Error::FlussAPIError`] without altering it.
+impl From<ApiError> for Error {
+    fn from(api_error: ApiError) -> Self {
+        Error::FlussAPIError { api_error }
+    }
+}
+
+/// Maps a strum [`ParseError`] to [`Error::IllegalArgument`], keeping only its `Display` text.
+impl From<ParseError> for Error {
+    fn from(value: ParseError) -> Self {
+        let message = format!("{value}");
+        Error::IllegalArgument { message }
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/io/file_io.rs b/fluss-rust/crates/fluss/src/io/file_io.rs
new file mode 100644
index 0000000000..adca333f6b
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/io/file_io.rs
@@ -0,0 +1,157 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+use crate::error::*;
+use std::collections::HashMap;
+use std::ops::Range;
+use std::sync::Arc;
+
+use bytes::Bytes;
+use jiff::Timestamp;
+use opendal::Operator;
+
+use url::Url;
+
+use super::Storage;
+
+use crate::error::Result;
+
+/// Cloneable handle around a shared [`Storage`]; inputs are opened with [`FileIO::new_input`].
+#[derive(Clone, Debug)]
+pub struct FileIO {
+    // Storage backend produced by `FileIOBuilder::build`; shared between clones via `Arc`.
+    storage: Arc<Storage>,
+}
+
+impl FileIO {
+    /// Parses `path` as a URL and returns a [`FileIOBuilder`] seeded with the URL's scheme.
+    ///
+    /// Returns [`Error::IllegalArgument`] when `path` cannot be parsed as a URL.
+    pub fn from_url(path: &str) -> Result<FileIOBuilder> {
+        match Url::parse(path) {
+            Ok(url) => Ok(FileIOBuilder::new(url.scheme())),
+            Err(e) => Err(Error::IllegalArgument {
+                message: format!("Invalid URL '{path}': {e}"),
+            }),
+        }
+    }
+
+    /// Creates an [`InputFile`] for reading `path`.
+    ///
+    /// The storage backend yields an operator together with the part of `path` that is
+    /// relative to it; the returned file keeps the full path plus the byte offset at which
+    /// that relative part starts.
+    pub fn new_input(&self, path: &str) -> Result<InputFile> {
+        let (op, relative_path) = self.storage.create(path)?;
+        let relative_path_pos = path.len() - relative_path.len();
+        Ok(InputFile {
+            op,
+            path: path.to_owned(),
+            relative_path_pos,
+        })
+    }
+}
+
+/// Builder for [`FileIO`]: a storage scheme plus string key/value properties.
+#[derive(Debug)]
+pub struct FileIOBuilder {
+    // Scheme name; `into_parts` substitutes an empty string when this is `None`.
+    scheme_str: Option<String>,
+    // Properties returned by `into_parts` alongside the scheme.
+    props: HashMap<String, String>,
+}
+
+impl FileIOBuilder {
+    /// Starts a builder for the given scheme with no properties set.
+    pub fn new(scheme_str: impl ToString) -> Self {
+        let scheme = scheme_str.to_string();
+        Self {
+            scheme_str: Some(scheme),
+            props: HashMap::new(),
+        }
+    }
+
+    /// Consumes the builder, yielding the scheme (empty string when unset) and the properties.
+    pub(crate) fn into_parts(self) -> (String, HashMap<String, String>) {
+        let Self { scheme_str, props } = self;
+        (scheme_str.unwrap_or_default(), props)
+    }
+
+    /// Sets one property, replacing any value already stored under the same key.
+    pub fn with_prop(mut self, key: impl ToString, value: impl ToString) -> Self {
+        let (k, v) = (key.to_string(), value.to_string());
+        self.props.insert(k, v);
+        self
+    }
+
+    /// Adds every `(key, value)` pair from `args`; a later pair replaces an earlier value
+    /// stored under the same key.
+    pub fn with_props(
+        mut self,
+        args: impl IntoIterator<Item = (impl ToString, impl ToString)>,
+    ) -> Self {
+        for (key, value) in args {
+            self.props.insert(key.to_string(), value.to_string());
+        }
+        self
+    }
+
+    /// Builds the [`Storage`] described by this builder and wraps it in a [`FileIO`].
+    pub fn build(self) -> Result<FileIO> {
+        Storage::build(self).map(|storage| FileIO {
+            storage: Arc::new(storage),
+        })
+    }
+}
+
+/// Asynchronous ranged reads; implemented in this file for `opendal::Reader`.
+pub trait FileRead: Send + Unpin + 'static {
+    /// Reads `range` and resolves to the data as [`Bytes`].
+    // NOTE(review): `range` is presumably a byte-offset range into the file — confirm.
+    fn read(&self, range: Range<u64>) -> impl Future<Output = Result<Bytes>> + Send;
+}
+
+/// [`FileRead`] backed by an opendal reader.
+impl FileRead for opendal::Reader {
+    /// Delegates to `opendal::Reader::read` and converts the result with `to_bytes`.
+    async fn read(&self, range: Range<u64>) -> Result<Bytes> {
+        let buffer = opendal::Reader::read(self, range).await?;
+        Ok(buffer.to_bytes())
+    }
+}
+
+/// A readable file: an operator plus the path it was created for (see `FileIO::new_input`).
+#[derive(Debug)]
+pub struct InputFile {
+    // Operator every read/stat/exists call goes through.
+    op: Operator,
+    // Full path as passed to `FileIO::new_input`; returned by `location`.
+    path: String,
+    // Byte offset into `path` where the operator-relative part begins.
+    relative_path_pos: usize,
+}
+
+impl InputFile {
+    /// The tail of `path` that is handed to the operator.
+    fn relative_path(&self) -> &str {
+        &self.path[self.relative_path_pos..]
+    }
+
+    /// Full path this input was created with.
+    pub fn location(&self) -> &str {
+        self.path.as_str()
+    }
+
+    /// Asks the operator whether the file exists.
+    pub async fn exists(&self) -> Result<bool> {
+        let found = self.op.exists(self.relative_path()).await?;
+        Ok(found)
+    }
+
+    /// Stats the file and converts the operator's metadata into a [`FileStatus`].
+    pub async fn metadata(&self) -> Result<FileStatus> {
+        let meta = self.op.stat(self.relative_path()).await?;
+        let status = FileStatus {
+            size: meta.content_length(),
+            is_dir: meta.is_dir(),
+            path: self.path.clone(),
+            last_modified: meta.last_modified().map(Into::into),
+        };
+        Ok(status)
+    }
+
+    /// Reads the file through the operator and returns the content as [`Bytes`].
+    pub async fn read(&self) -> Result<Bytes> {
+        let buffer = self.op.read(self.relative_path()).await?;
+        Ok(buffer.to_bytes())
+    }
+
+    /// Opens a reader on the file for ranged reads via [`FileRead`].
+    pub async fn reader(&self) -> Result<impl FileRead> {
+        let reader = self.op.reader(self.relative_path()).await?;
+        Ok(reader)
+    }
+}
+
+/// File metadata as produced by `InputFile::metadata`.
+#[derive(Clone, Debug)]
+pub struct FileStatus {
+    /// Content length reported by the operator's metadata.
+    pub size: u64,
+    /// Whether the operator's metadata reports a directory.
+    pub is_dir: bool,
+    /// Full path of the `InputFile` the metadata was requested for.
+    pub path: String,
+    /// Last-modified time, when the operator's metadata provides one.
+    pub last_modified: Option<Timestamp>,
+}
diff --git a/fluss-rust/crates/fluss/src/io/mod.rs b/fluss-rust/crates/fluss/src/io/mod.rs
new file mode 100644
index 0000000000..74265017aa
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/io/mod.rs
@@ -0,0 +1,44 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+mod file_io;
+
+pub use file_io::*;
+
+mod storage;
+pub use storage::*;
+
+#[cfg(feature = "storage-fs")]
+mod storage_fs;
+#[cfg(feature = "storage-fs")]
+use storage_fs::*;
+
+#[cfg(feature = "storage-memory")]
+mod storage_memory;
+#[cfg(feature = "storage-memory")]
+use storage_memory::*;
+
+#[cfg(feature = "storage-s3")]
+mod storage_s3;
+#[cfg(feature = "storage-s3")]
+use storage_s3::*;
+
+#[cfg(feature = "storage-oss")]
+mod storage_oss;
+#[cfg(feature = "storage-oss")]
+use storage_oss::*;
diff --git a/fluss-rust/crates/fluss/src/io/storage.rs b/fluss-rust/crates/fluss/src/io/storage.rs
new file mode 100644
index 0000000000..a57351783e
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/io/storage.rs
@@ -0,0 +1,108 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+use crate::error;
+use crate::error::Result;
+use crate::io::FileIOBuilder;
+use opendal::{Operator, Scheme};
+#[cfg(any(feature = "storage-s3", feature = "storage-oss"))]
+use std::collections::HashMap;
+
+/// The storage carries all supported storage services in fluss
+///
+/// Each variant exists only when the matching `storage-*` cargo feature is enabled.
+#[derive(Debug)]
+pub enum Storage {
+    /// In-memory backend (feature `storage-memory`).
+    #[cfg(feature = "storage-memory")]
+    Memory,
+    /// Local filesystem backend (feature `storage-fs`).
+    #[cfg(feature = "storage-fs")]
+    LocalFs,
+    /// S3 backend (feature `storage-s3`); `props` are the properties taken from the builder.
+    #[cfg(feature = "storage-s3")]
+    S3 { props: HashMap<String, String> },
+    /// OSS backend (feature `storage-oss`); `props` are the properties taken from the builder.
+    #[cfg(feature = "storage-oss")]
+    Oss { props: HashMap<String, String> },
+}
+
+impl Storage {
+    /// Selects the storage backend for the scheme held by `file_io_builder`.
+    ///
+    /// Returns [`error::Error::IoUnsupported`] when the scheme resolves to a backend whose
+    /// cargo feature is not enabled, or to a scheme this crate has no backend for.
+    #[allow(unused_variables)]
+    pub(crate) fn build(file_io_builder: FileIOBuilder) -> Result<Self> {
+        let (scheme_str, props) = file_io_builder.into_parts();
+        let scheme = Self::parse_scheme(&scheme_str)?;
+
+        match scheme {
+            #[cfg(feature = "storage-memory")]
+            Scheme::Memory => Ok(Self::Memory),
+            #[cfg(feature = "storage-fs")]
+            Scheme::Fs => Ok(Self::LocalFs),
+            #[cfg(feature = "storage-s3")]
+            Scheme::S3 => Ok(Self::S3 { props }),
+            #[cfg(feature = "storage-oss")]
+            Scheme::Oss => Ok(Self::Oss { props }),
+            _ => Err(error::Error::IoUnsupported {
+                message: format!("Unsupported storage feature {scheme_str}"),
+            }),
+        }
+    }
+
+    /// Strips `scheme_prefix` from `path`, or failing that a single leading `/`.
+    ///
+    /// A path with neither is returned unchanged. Unlike slicing with `&path[1..]`, this
+    /// cannot panic on an empty path or on a path whose first character is multi-byte, and
+    /// it never drops the first character of a path that has no leading `/`.
+    #[cfg(any(feature = "storage-memory", feature = "storage-fs"))]
+    fn strip_root<'a>(path: &'a str, scheme_prefix: &str) -> &'a str {
+        path.strip_prefix(scheme_prefix)
+            .or_else(|| path.strip_prefix('/'))
+            .unwrap_or(path)
+    }
+
+    /// Builds an operator for this backend and returns it with the operator-relative part
+    /// of `path`.
+    ///
+    /// The returned `&str` is always a suffix of `path`. For S3 and OSS the bucket parsed
+    /// from `path` is added to a copy of the stored properties under the key `bucket`
+    /// before the operator is built.
+    ///
+    /// NOTE(review): an operator is built on every call; for the memory backend this
+    /// presumably yields a separate, empty store each time — confirm that is intended.
+    pub(crate) fn create<'a>(&self, path: &'a str) -> Result<(Operator, &'a str)> {
+        match self {
+            #[cfg(feature = "storage-memory")]
+            Storage::Memory => {
+                let op = super::memory_config_build()?;
+                Ok((op, Self::strip_root(path, "memory:/")))
+            }
+            #[cfg(feature = "storage-fs")]
+            Storage::LocalFs => {
+                let op = super::fs_config_build()?;
+                Ok((op, Self::strip_root(path, "file:/")))
+            }
+            #[cfg(feature = "storage-s3")]
+            Storage::S3 { props } => {
+                let (bucket, key) = super::parse_s3_path(path);
+                let mut s3_props = props.clone();
+                s3_props.insert("bucket".to_string(), bucket.to_string());
+                let op = super::s3_config_build(&s3_props)?;
+                Ok((op, key))
+            }
+            #[cfg(feature = "storage-oss")]
+            Storage::Oss { props } => {
+                let (bucket, key) = super::parse_oss_path(path);
+                let mut oss_props = props.clone();
+                oss_props.insert("bucket".to_string(), bucket.to_string());
+                let op = super::oss_config_build(&oss_props)?;
+                Ok((op, key))
+            }
+        }
+    }
+
+    /// Maps a scheme string to an opendal [`Scheme`].
+    ///
+    /// `memory`, `file` (or an empty string), `s3`/`s3a` and `oss` are mapped explicitly;
+    /// any other string is handed to `Scheme`'s own string parser.
+    fn parse_scheme(scheme: &str) -> Result<Scheme> {
+        match scheme {
+            "memory" => Ok(Scheme::Memory),
+            "file" | "" => Ok(Scheme::Fs),
+            "s3" | "s3a" => Ok(Scheme::S3),
+            "oss" => Ok(Scheme::Oss),
+            s => Ok(s.parse::<Scheme>()?),
+        }
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/io/storage_fs.rs b/fluss-rust/crates/fluss/src/io/storage_fs.rs
new file mode 100644
index 0000000000..95ca6fa95f
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/io/storage_fs.rs
@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+use opendal::Operator;
+use opendal::services::FsConfig;
+
+use crate::error::Result;
+
+/// Builds an opendal operator for the local filesystem with its root set to `/`.
+pub(crate) fn fs_config_build() -> Result<Operator> {
+    let mut fs_config = FsConfig::default();
+    fs_config.root = Some(String::from("/"));
+
+    let builder = Operator::from_config(fs_config)?;
+    Ok(builder.finish())
+}
diff --git a/fluss-rust/crates/fluss/src/io/storage_memory.rs b/fluss-rust/crates/fluss/src/io/storage_memory.rs
new file mode 100644
index 0000000000..af73a90174
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/io/storage_memory.rs
@@ -0,0 +1,24 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+use crate::error::Result;
+use opendal::Operator;
+use opendal::services::MemoryConfig;
+
+/// Builds an opendal operator from a default `MemoryConfig`.
+pub(crate) fn memory_config_build() -> Result<Operator> {
+    let memory_config = MemoryConfig::default();
+    let builder = Operator::from_config(memory_config)?;
+    Ok(builder.finish())
+}
diff --git a/fluss-rust/crates/fluss/src/io/storage_oss.rs b/fluss-rust/crates/fluss/src/io/storage_oss.rs
new file mode 100644
index 0000000000..3d5d05499a
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/io/storage_oss.rs
@@ -0,0 +1,45 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::error::Result;
+use opendal::Configurator;
+use opendal::Operator;
+use opendal::layers::TimeoutLayer;
+use opendal::services::OssConfig;
+use std::collections::HashMap;
+use std::time::Duration;
+
+/// Builds an OSS operator configured from `props` and wraps it in a timeout layer
+/// (10 s timeout, 30 s IO timeout).
+pub(crate) fn oss_config_build(props: &HashMap<String, String>) -> Result<Operator> {
+    // Add timeout layer to prevent hanging on OSS operations
+    let timeouts = TimeoutLayer::new()
+        .with_timeout(Duration::from_secs(10))
+        .with_io_timeout(Duration::from_secs(30));
+
+    let oss_config = OssConfig::from_iter(props.clone())?;
+    let operator = Operator::from_config(oss_config)?.finish();
+
+    Ok(operator.layer(timeouts))
+}
+
+/// Splits an OSS path into `(bucket, key)`.
+///
+/// A leading `oss://` is removed when present; the bucket is everything before the first
+/// `/` of what remains and the key everything after it. Without any `/`, the whole
+/// remainder is the bucket and the key is empty.
+pub(crate) fn parse_oss_path(path: &str) -> (&str, &str) {
+    let without_scheme = path.strip_prefix("oss://").unwrap_or(path);
+
+    without_scheme
+        .split_once('/')
+        .unwrap_or((without_scheme, ""))
+}
diff --git a/fluss-rust/crates/fluss/src/io/storage_s3.rs b/fluss-rust/crates/fluss/src/io/storage_s3.rs
new file mode 100644
index 0000000000..8000d091dd
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/io/storage_s3.rs
@@ -0,0 +1,48 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::error::Result;
+use opendal::Configurator;
+use opendal::Operator;
+use opendal::layers::TimeoutLayer;
+use opendal::services::S3Config;
+use std::collections::HashMap;
+use std::time::Duration;
+
+/// Builds an S3 operator configured from `props` and wraps it in a timeout layer
+/// (10 s timeout, 30 s IO timeout).
+pub(crate) fn s3_config_build(props: &HashMap<String, String>) -> Result<Operator> {
+    // Add timeout layer to prevent hanging on S3 operations
+    let timeouts = TimeoutLayer::new()
+        .with_timeout(Duration::from_secs(10))
+        .with_io_timeout(Duration::from_secs(30));
+
+    let s3_config = S3Config::from_iter(props.clone())?;
+    let operator = Operator::from_config(s3_config)?.finish();
+
+    Ok(operator.layer(timeouts))
+}
+
+/// Splits an S3 path into `(bucket, key)`.
+///
+/// A leading `s3a://` or `s3://` is removed when present; the bucket is everything before
+/// the first `/` of what remains and the key everything after it. Without any `/`, the
+/// whole remainder is the bucket and the key is empty.
+pub(crate) fn parse_s3_path(path: &str) -> (&str, &str) {
+    let without_scheme = ["s3a://", "s3://"]
+        .into_iter()
+        .find_map(|scheme| path.strip_prefix(scheme))
+        .unwrap_or(path);
+
+    without_scheme
+        .split_once('/')
+        .unwrap_or((without_scheme, ""))
+}
diff --git a/fluss-rust/crates/fluss/src/lib.rs b/fluss-rust/crates/fluss/src/lib.rs
new file mode 100644
index 0000000000..027465235c
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/lib.rs
@@ -0,0 +1,153 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Apache Fluss (Incubating) Official Rust Client
+//!
+//! Official Rust client library for [Apache Fluss (Incubating)](https://fluss.apache.org/).
+//! It supports **primary key (KV) tables** (upsert + lookup) and **log tables** (append + scan).
+//!
+//! # Examples
+//!
+//! ## Primary key table and log table
+//!
+//! Connect to a cluster, create a KV table (upsert and lookup), then a log table (append and scan):
+//!
+//! ```rust,no_run
+//! use fluss::client::EARLIEST_OFFSET;
+//! use fluss::client::FlussConnection;
+//! use fluss::config::Config;
+//! use fluss::error::Result;
+//! use fluss::metadata::{DataTypes, Schema, TableDescriptor, TablePath};
+//! use fluss::row::{GenericRow, InternalRow};
+//! use std::time::Duration;
+//!
+//! #[tokio::main]
+//! async fn main() -> Result<()> {
+//!     let mut config = Config::default();
+//!     config.bootstrap_servers = "127.0.0.1:9123".to_string();
+//!     let connection = FlussConnection::new(config).await?;
+//!     let admin = connection.get_admin()?;
+//!
+//!     // ---- Primary key (KV) table: upsert and lookup ----
+//!     let kv_path = TablePath::new("fluss", "users");
+//!     let mut kv_schema = Schema::builder()
+//!         .column("id", DataTypes::int())
+//!         .column("name", DataTypes::string())
+//!         .column("age", DataTypes::bigint())
+//!         .primary_key(vec!["id"]);
+//!     let kv_descriptor = TableDescriptor::builder()
+//!         .schema(kv_schema.build()?)
+//!         .build()?;
+//!     admin.create_table(&kv_path, &kv_descriptor, false).await?;
+//!
+//!     let kv_table = connection.get_table(&kv_path).await?;
+//!     let upsert_writer = kv_table.new_upsert()?.create_writer()?;
+//!     let mut row = GenericRow::new(3);
+//!     row.set_field(0, 1i32);
+//!     row.set_field(1, "Alice");
+//!     row.set_field(2, 30i64);
+//!     upsert_writer.upsert(&row)?;
+//!     upsert_writer.flush().await?;
+//!
+//!     let mut lookuper = kv_table.new_lookup()?.create_lookuper()?;
+//!     let mut key = GenericRow::new(1);
+//!     key.set_field(0, 1i32);
+//!     let result = lookuper.lookup(&key).await?;
+//!     if let Some(r) = result.get_single_row()? {
+//!         println!("KV lookup: id={}, name={}, age={}",
+//!                  r.get_int(0)?, r.get_string(1)?, r.get_long(2)?);
+//!     }
+//!
+//!     // ---- Log table: append and scan ----
+//!     let log_path = TablePath::new("fluss", "events");
+//!     let mut log_schema_builder = Schema::builder()
+//!         .column("ts", DataTypes::bigint())
+//!         .column("message", DataTypes::string());
+//!     let log_descriptor = TableDescriptor::builder()
+//!         .schema(log_schema_builder.build()?)
+//!         .build()?;
+//!     admin.create_table(&log_path, &log_descriptor, false).await?;
+//!
+//!     let log_table = connection.get_table(&log_path).await?;
+//!     let append_writer = log_table.new_append()?.create_writer()?;
+//!     let mut event = GenericRow::new(2);
+//!     event.set_field(0, 1700000000i64);
+//!     event.set_field(1, "hello");
+//!     append_writer.append(&event)?;
+//!     append_writer.flush().await?;
+//!
+//!     let scanner = log_table.new_scan().create_log_scanner()?;
+//!     scanner.subscribe(0, EARLIEST_OFFSET).await?;
+//!     let scan_records = scanner.poll(Duration::from_secs(1)).await?;
+//!     for record in scan_records {
+//!         let r = record.row();
+//!         println!("Log scan: ts={}, message={}", r.get_long(0)?, r.get_string(1)?);
+//!     }
+//!
+//!     Ok(())
+//! }
+//! ```
+//!
+//! # Performance
+//!
+//! For production deployments on Linux, we recommend using
+//! [jemalloc](https://crates.io/crates/tikv-jemallocator) as the global allocator.
+//! The default glibc allocator (ptmalloc2) can cause RSS bloat and fragmentation under
+//! sustained write loads due to repeated same-size alloc/free cycles in Arrow batch building.
+//! jemalloc's thread-local size-class bins handle this pattern efficiently.
+//!
+//! ```toml
+//! [target.'cfg(not(target_env = "msvc"))'.dependencies]
+//! tikv-jemallocator = "0.6"
+//! ```
+//!
+//! ```rust,ignore
+//! #[cfg(not(target_env = "msvc"))]
+//! #[global_allocator]
+//! static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
+//! ```
+
+pub mod client;
+pub mod metadata;
+pub mod record;
+pub mod row;
+pub mod rpc;
+
+mod cluster;
+pub use cluster::{ServerNode, ServerType};
+
+pub mod config;
+pub mod error;
+pub mod metrics;
+
+mod bucketing;
+mod compression;
+pub mod io;
+mod util;
+
+#[cfg(test)]
+mod test_utils;
+
+/// Integer type used for table ids (meaning taken from the alias name).
+pub type TableId = i64;
+/// Integer type used for partition ids (meaning taken from the alias name).
+pub type PartitionId = i64;
+/// Integer type used for bucket ids (meaning taken from the alias name).
+pub type BucketId = i32;
+
+/// Protobuf bindings, pulled in from the `fluss.rs` file located in the build's `OUT_DIR`.
+pub mod proto {
+    // generated from the canonical proto; its doc comments aren't clippy-clean
+    #![allow(clippy::doc_lazy_continuation)]
+    // The file is produced outside this source tree, so it is included rather than declared as a module.
+    include!(concat!(env!("OUT_DIR"), "/fluss.rs"));
+}
diff --git a/fluss-rust/crates/fluss/src/metadata/data_lake_format.rs b/fluss-rust/crates/fluss/src/metadata/data_lake_format.rs
new file mode 100644
index 0000000000..77e5ad3c1c
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/metadata/data_lake_format.rs
@@ -0,0 +1,64 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use strum_macros::{Display, EnumString};
+
+/// Identifies the logical format of a data lake table supported by Fluss.
+///
+/// This enum is typically used in metadata and configuration to distinguish
+/// between different table formats so that the appropriate integration and
+/// semantics can be applied.
+///
+/// Parsing through `FromStr` (derived by `EnumString`) matches the names
+/// listed on each variant ASCII case-insensitively; anything else is an error.
+///
+/// The enum is fieldless, so it also derives `Clone`, `Copy`, `Eq` and `Hash`
+/// to let callers copy it freely and use it as a map or set key.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, EnumString, Display)]
+#[strum(ascii_case_insensitive)]
+pub enum DataLakeFormat {
+    /// Serialized name: `paimon`.
+    #[strum(serialize = "paimon")]
+    Paimon,
+
+    /// Serialized name: `lance`.
+    #[strum(serialize = "lance")]
+    Lance,
+
+    /// Serialized name: `iceberg`.
+    #[strum(serialize = "iceberg")]
+    Iceberg,
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::metadata::DataLakeFormat;
+    use crate::metadata::DataLakeFormat::{Iceberg, Lance, Paimon};
+
+    /// Every supported name parses regardless of ASCII casing; unknown and
+    /// empty inputs are rejected.
+    #[test]
+    fn test_parse() {
+        let accepted = [
+            ("paimon", Paimon),
+            ("Paimon", Paimon),
+            ("PAIMON", Paimon),
+            ("lance", Lance),
+            ("LANCE", Lance),
+            ("iceberg", Iceberg),
+            ("ICEBERG", Iceberg),
+        ];
+        for (text, want) in accepted {
+            let got: DataLakeFormat = text.parse().unwrap();
+            assert_eq!(got, want, "failed to parse: {text}");
+        }
+
+        // negative cases: inputs that name no known format
+        for rejected in ["unknown", ""] {
+            assert!(rejected.parse::<DataLakeFormat>().is_err());
+        }
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/metadata/database.rs b/fluss-rust/crates/fluss/src/metadata/database.rs
new file mode 100644
index 0000000000..15fefb5496
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/metadata/database.rs
@@ -0,0 +1,239 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::error::Error::JsonSerdeError;
+use crate::error::Result;
+use crate::metadata::JsonSerde;
+use serde::{Deserialize, Serialize};
+use serde_json::{Value, json};
+use std::collections::HashMap;
+
+/// User-supplied description of a database: an optional comment plus a map of
+/// custom string properties. Instances are assembled through
+/// [`DatabaseDescriptorBuilder`].
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+pub struct DatabaseDescriptor {
+    // Optional free-form comment; `None` when never set.
+    comment: Option<String>,
+    // Custom key/value properties; empty when none were provided.
+    custom_properties: HashMap<String, String>,
+}
+
+/// Read-only bundle of a database's name, its descriptor and two timestamps.
+#[derive(Debug, Clone)]
+pub struct DatabaseInfo {
+    database_name: String,
+    database_descriptor: DatabaseDescriptor,
+    // NOTE(review): the unit and epoch of both timestamps are not visible in
+    // this file — confirm against whatever produces these values.
+    created_time: i64,
+    modified_time: i64,
+}
+
+impl DatabaseInfo {
+    /// Stores the given values as-is; nothing is validated.
+    pub fn new(
+        database_name: String,
+        database_descriptor: DatabaseDescriptor,
+        created_time: i64,
+        modified_time: i64,
+    ) -> Self {
+        DatabaseInfo {
+            database_name,
+            database_descriptor,
+            created_time,
+            modified_time,
+        }
+    }
+
+    /// Name of the database.
+    pub fn database_name(&self) -> &str {
+        self.database_name.as_str()
+    }
+
+    /// Descriptor (comment and custom properties) of the database.
+    pub fn database_descriptor(&self) -> &DatabaseDescriptor {
+        let descriptor = &self.database_descriptor;
+        descriptor
+    }
+
+    /// The stored creation timestamp.
+    pub fn created_time(&self) -> i64 {
+        let Self { created_time, .. } = self;
+        *created_time
+    }
+
+    /// The stored last-modification timestamp.
+    pub fn modified_time(&self) -> i64 {
+        let Self { modified_time, .. } = self;
+        *modified_time
+    }
+}
+
+/// Builder for [`DatabaseDescriptor`]. The derived `Default` starts with no
+/// comment and an empty property map.
+#[derive(Debug, Default)]
+pub struct DatabaseDescriptorBuilder {
+    // Comment to place on the descriptor, if one was set.
+    comment: Option<String>,
+    // Properties accumulated so far.
+    custom_properties: HashMap<String, String>,
+}
+
+impl DatabaseDescriptor {
+    /// Starts a builder with no comment and no custom properties.
+    pub fn builder() -> DatabaseDescriptorBuilder {
+        Default::default()
+    }
+
+    /// The comment, or `None` when the descriptor has none.
+    pub fn comment(&self) -> Option<&str> {
+        let Self { comment, .. } = self;
+        comment.as_deref()
+    }
+
+    /// All custom properties of this descriptor.
+    pub fn custom_properties(&self) -> &HashMap<String, String> {
+        let Self {
+            custom_properties, ..
+        } = self;
+        custom_properties
+    }
+}
+
+impl DatabaseDescriptorBuilder {
+    /// Sets the comment, replacing any comment set earlier.
+    pub fn comment<C: Into<String>>(self, comment: C) -> Self {
+        Self {
+            comment: Some(comment.into()),
+            ..self
+        }
+    }
+
+    /// Adds every entry of `properties`; an entry whose key is already
+    /// present overwrites the previous value.
+    pub fn custom_properties<K: Into<String>, V: Into<String>>(
+        mut self,
+        properties: HashMap<K, V>,
+    ) -> Self {
+        let converted = properties
+            .into_iter()
+            .map(|(name, value)| (name.into(), value.into()));
+        self.custom_properties.extend(converted);
+        self
+    }
+
+    /// Adds a single property, overwriting any existing value for `key`.
+    pub fn custom_property<K: Into<String>, V: Into<String>>(self, key: K, value: V) -> Self {
+        let mut updated = self;
+        updated.custom_properties.insert(key.into(), value.into());
+        updated
+    }
+
+    /// Consumes the builder and produces the descriptor.
+    pub fn build(self) -> DatabaseDescriptor {
+        let Self {
+            comment,
+            custom_properties,
+        } = self;
+        DatabaseDescriptor {
+            comment,
+            custom_properties,
+        }
+    }
+}
+
+// Field names and format version used by the JSON representation.
+impl DatabaseDescriptor {
+    // JSON key holding the custom properties object.
+    const CUSTOM_PROPERTIES_NAME: &'static str = "custom_properties";
+    // JSON key holding the optional comment string.
+    const COMMENT_NAME: &'static str = "comment";
+    // JSON key holding the format version number.
+    const VERSION_KEY: &'static str = "version";
+    // Version number written under `VERSION_KEY`.
+    const VERSION: u32 = 1;
+}
+
+impl JsonSerde for DatabaseDescriptor {
+    /// Builds a JSON object holding the version, the comment (only when one is
+    /// set) and the custom properties.
+    fn serialize_json(&self) -> Result<Value> {
+        let mut fields = serde_json::Map::new();
+
+        fields.insert(Self::VERSION_KEY.to_string(), Value::from(Self::VERSION));
+
+        // The comment key is omitted entirely when there is no comment.
+        if let Some(text) = self.comment() {
+            fields.insert(Self::COMMENT_NAME.to_string(), Value::from(text));
+        }
+
+        fields.insert(
+            Self::CUSTOM_PROPERTIES_NAME.to_string(),
+            json!(self.custom_properties()),
+        );
+
+        Ok(Value::Object(fields))
+    }
+
+    /// Reads a descriptor from a JSON node.
+    ///
+    /// A missing comment or properties key is treated as "not set" / empty.
+    /// Returns `JsonSerdeError` when the comment is not a string, when the
+    /// properties node is not an object, or when a property value is not a
+    /// string. The version key is not read here.
+    fn deserialize_json(node: &Value) -> Result<Self> {
+        // Comment is validated first so its error takes precedence.
+        let comment = node
+            .get(Self::COMMENT_NAME)
+            .map(|raw| {
+                raw.as_str().ok_or_else(|| JsonSerdeError {
+                    message: format!("{} should be a string", Self::COMMENT_NAME),
+                })
+            })
+            .transpose()?;
+
+        let properties = match node.get(Self::CUSTOM_PROPERTIES_NAME) {
+            None => HashMap::new(),
+            Some(raw) => {
+                let entries = raw.as_object().ok_or_else(|| JsonSerdeError {
+                    message: "Custom properties should be an object".to_string(),
+                })?;
+
+                // Stops at the first value that is not a JSON string.
+                entries
+                    .iter()
+                    .map(|(name, value)| {
+                        value
+                            .as_str()
+                            .map(|text| (name.clone(), text.to_owned()))
+                            .ok_or_else(|| JsonSerdeError {
+                                message: "Property value should be a string".to_string(),
+                            })
+                    })
+                    .collect::<Result<HashMap<String, String>>>()?
+            }
+        };
+
+        let mut builder = DatabaseDescriptor::builder();
+        if let Some(text) = comment {
+            builder = builder.comment(text);
+        }
+        Ok(builder.custom_properties(properties).build())
+    }
+}
+
+impl DatabaseDescriptor {
+    /// Create DatabaseDescriptor from JSON bytes (equivalent to Java's fromJsonBytes).
+    ///
+    /// Returns `JsonSerdeError` when the bytes are not valid JSON, or whatever
+    /// error `deserialize_json` reports for the parsed value.
+    pub fn from_json_bytes(bytes: &[u8]) -> Result<Self> {
+        match serde_json::from_slice::<Value>(bytes) {
+            Ok(parsed) => Self::deserialize_json(&parsed),
+            Err(e) => Err(JsonSerdeError {
+                message: format!("Failed to parse JSON: {e}"),
+            }),
+        }
+    }
+
+    /// Convert DatabaseDescriptor to JSON bytes.
+    ///
+    /// Returns `JsonSerdeError` when the JSON value cannot be written out.
+    pub fn to_json_bytes(&self) -> Result<Vec<u8>> {
+        let tree = self.serialize_json()?;
+        match serde_json::to_vec(&tree) {
+            Ok(encoded) => Ok(encoded),
+            Err(e) => Err(JsonSerdeError {
+                message: format!("Failed to serialize to JSON: {e}"),
+            }),
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// A descriptor with a comment and two properties survives a JSON round trip.
+    #[test]
+    fn test_database_descriptor_json_serde() {
+        let custom_props: HashMap<String, String> = HashMap::from([
+            ("key1".to_string(), "value1".to_string()),
+            ("key2".to_string(), "value2".to_string()),
+        ]);
+        let original = DatabaseDescriptor::builder()
+            .comment("Test database")
+            .custom_properties(custom_props)
+            .build();
+
+        // Test serialization
+        let encoded = original.to_json_bytes().unwrap();
+        println!("Serialized JSON: {}", String::from_utf8_lossy(&encoded));
+
+        // Test deserialization
+        let decoded = DatabaseDescriptor::from_json_bytes(&encoded).unwrap();
+        assert_eq!(original, decoded);
+    }
+
+    /// A descriptor with nothing set survives a JSON round trip.
+    #[test]
+    fn test_empty_database_descriptor() {
+        let original = DatabaseDescriptor::builder().build();
+        let encoded = original.to_json_bytes().unwrap();
+        let decoded = DatabaseDescriptor::from_json_bytes(&encoded).unwrap();
+        assert_eq!(original, decoded);
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/metadata/datatype.rs b/fluss-rust/crates/fluss/src/metadata/datatype.rs
new file mode 100644
index 0000000000..60a44ba718
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/metadata/datatype.rs
@@ -0,0 +1,1992 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::error::Error::IllegalArgument;
+use crate::error::Result;
+use serde::{Deserialize, Serialize};
+use std::fmt::{Display, Formatter};
+
+/// Data type for Fluss table.
+///
+/// Each variant wraps a dedicated type struct that carries the nullability
+/// flag and any type parameters (length, precision, scale, nested types).
+///
+/// TODO: link the reference implementation this enum mirrors.
+#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
+pub enum DataType {
+    Boolean(BooleanType),
+    TinyInt(TinyIntType),
+    SmallInt(SmallIntType),
+    Int(IntType),
+    BigInt(BigIntType),
+    Float(FloatType),
+    Double(DoubleType),
+    Char(CharType),
+    String(StringType),
+    Decimal(DecimalType),
+    Date(DateType),
+    Time(TimeType),
+    Timestamp(TimestampType),
+    TimestampLTz(TimestampLTzType),
+    Bytes(BytesType),
+    Binary(BinaryType),
+    Array(ArrayType),
+    Map(MapType),
+    Row(RowType),
+}
+
+impl DataType {
+    /// Returns the `nullable` flag of the wrapped type.
+    pub fn is_nullable(&self) -> bool {
+        match self {
+            DataType::Boolean(v) => v.nullable,
+            DataType::TinyInt(v) => v.nullable,
+            DataType::SmallInt(v) => v.nullable,
+            DataType::Int(v) => v.nullable,
+            DataType::BigInt(v) => v.nullable,
+            DataType::Decimal(v) => v.nullable,
+            DataType::Double(v) => v.nullable,
+            DataType::Float(v) => v.nullable,
+            DataType::Binary(v) => v.nullable,
+            DataType::Char(v) => v.nullable,
+            DataType::String(v) => v.nullable,
+            DataType::Date(v) => v.nullable,
+            DataType::TimestampLTz(v) => v.nullable,
+            DataType::Time(v) => v.nullable,
+            DataType::Timestamp(v) => v.nullable,
+            DataType::Array(v) => v.nullable,
+            DataType::Map(v) => v.nullable,
+            DataType::Row(v) => v.nullable,
+            DataType::Bytes(v) => v.nullable,
+        }
+    }
+
+    /// Returns the same variant wrapping the result of the inner type's own
+    /// `as_non_nullable()`.
+    pub fn as_non_nullable(&self) -> Self {
+        match self {
+            DataType::Boolean(v) => DataType::Boolean(v.as_non_nullable()),
+            DataType::TinyInt(v) => DataType::TinyInt(v.as_non_nullable()),
+            DataType::SmallInt(v) => DataType::SmallInt(v.as_non_nullable()),
+            DataType::Int(v) => DataType::Int(v.as_non_nullable()),
+            DataType::BigInt(v) => DataType::BigInt(v.as_non_nullable()),
+            DataType::Decimal(v) => DataType::Decimal(v.as_non_nullable()),
+            DataType::Double(v) => DataType::Double(v.as_non_nullable()),
+            DataType::Float(v) => DataType::Float(v.as_non_nullable()),
+            DataType::Binary(v) => DataType::Binary(v.as_non_nullable()),
+            DataType::Char(v) => DataType::Char(v.as_non_nullable()),
+            DataType::String(v) => DataType::String(v.as_non_nullable()),
+            DataType::Date(v) => DataType::Date(v.as_non_nullable()),
+            DataType::TimestampLTz(v) => DataType::TimestampLTz(v.as_non_nullable()),
+            DataType::Time(v) => DataType::Time(v.as_non_nullable()),
+            DataType::Timestamp(v) => DataType::Timestamp(v.as_non_nullable()),
+            DataType::Array(v) => DataType::Array(v.as_non_nullable()),
+            DataType::Map(v) => DataType::Map(v.as_non_nullable()),
+            DataType::Row(v) => DataType::Row(v.as_non_nullable()),
+            DataType::Bytes(v) => DataType::Bytes(v.as_non_nullable()),
+        }
+    }
+
+    /// Structural equality that never compares nullability flags, at any
+    /// nesting level, and performs no cloning.
+    ///
+    /// Two types are equal here when they are the same variant and their
+    /// parameters match: length for `Char`/`Binary`, precision and scale for
+    /// `Decimal`, precision for `Time`/`Timestamp`/`TimestampLTz`. `Array` and
+    /// `Map` recurse into their element / key / value types. `Row` requires
+    /// the same number of fields with pairwise equal names and recursively
+    /// equal data types; only name and data type of a field are inspected.
+    ///
+    /// NOTE(review): earlier wording called this equivalent to comparing
+    /// `as_non_nullable()` on both sides; whether that holds depends on how
+    /// the nested types implement `as_non_nullable` — confirm before relying
+    /// on it.
+    pub(crate) fn eq_ignore_nullable(&self, other: &DataType) -> bool {
+        match self {
+            DataType::Boolean(_) => matches!(other, DataType::Boolean(_)),
+            DataType::TinyInt(_) => matches!(other, DataType::TinyInt(_)),
+            DataType::SmallInt(_) => matches!(other, DataType::SmallInt(_)),
+            DataType::Int(_) => matches!(other, DataType::Int(_)),
+            DataType::BigInt(_) => matches!(other, DataType::BigInt(_)),
+            DataType::Float(_) => matches!(other, DataType::Float(_)),
+            DataType::Double(_) => matches!(other, DataType::Double(_)),
+            DataType::Date(_) => matches!(other, DataType::Date(_)),
+            DataType::String(_) => matches!(other, DataType::String(_)),
+            DataType::Bytes(_) => matches!(other, DataType::Bytes(_)),
+            DataType::Char(a) => {
+                matches!(other, DataType::Char(b) if a.length() == b.length())
+            }
+            DataType::Binary(a) => {
+                matches!(other, DataType::Binary(b) if a.length() == b.length())
+            }
+            DataType::Decimal(a) => matches!(
+                other,
+                DataType::Decimal(b) if a.precision() == b.precision() && a.scale() == b.scale()
+            ),
+            DataType::Time(a) => {
+                matches!(other, DataType::Time(b) if a.precision() == b.precision())
+            }
+            DataType::Timestamp(a) => {
+                matches!(other, DataType::Timestamp(b) if a.precision() == b.precision())
+            }
+            DataType::TimestampLTz(a) => {
+                matches!(other, DataType::TimestampLTz(b) if a.precision() == b.precision())
+            }
+            // Nested types recurse so inner nullability is ignored as well.
+            DataType::Array(a) => matches!(
+                other,
+                DataType::Array(b) if a.get_element_type().eq_ignore_nullable(b.get_element_type())
+            ),
+            DataType::Map(a) => matches!(
+                other,
+                DataType::Map(b)
+                    if a.key_type().eq_ignore_nullable(b.key_type())
+                        && a.value_type().eq_ignore_nullable(b.value_type())
+            ),
+            // The length check comes first because `zip` stops at the shorter side.
+            DataType::Row(a) => matches!(
+                other,
+                DataType::Row(b) if a.fields().len() == b.fields().len()
+                    && a.fields().iter().zip(b.fields().iter()).all(|(x, y)| {
+                        x.name() == y.name() && x.data_type().eq_ignore_nullable(y.data_type())
+                    })
+            ),
+        }
+    }
+}
+
+impl Display for DataType {
+    /// Formats the value by delegating to the wrapped type's own `Display`
+    /// implementation; the enum adds no text of its own.
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        match self {
+            DataType::Boolean(v) => write!(f, "{v}"),
+            DataType::TinyInt(v) => write!(f, "{v}"),
+            DataType::SmallInt(v) => write!(f, "{v}"),
+            DataType::Int(v) => write!(f, "{v}"),
+            DataType::BigInt(v) => write!(f, "{v}"),
+            DataType::Float(v) => write!(f, "{v}"),
+            DataType::Double(v) => write!(f, "{v}"),
+            DataType::Char(v) => write!(f, "{v}"),
+            DataType::String(v) => write!(f, "{v}"),
+            DataType::Decimal(v) => write!(f, "{v}"),
+            DataType::Date(v) => write!(f, "{v}"),
+            DataType::Time(v) => write!(f, "{v}"),
+            DataType::Timestamp(v) => write!(f, "{v}"),
+            DataType::TimestampLTz(v) => write!(f, "{v}"),
+            DataType::Bytes(v) => write!(f, "{v}"),
+            DataType::Binary(v) => write!(f, "{v}"),
+            DataType::Array(v) => write!(f, "{v}"),
+            DataType::Map(v) => write!(f, "{v}"),
+            DataType::Row(v) => write!(f, "{v}"),
+        }
+    }
+}
+
+/// `BOOLEAN` type descriptor; its only state is the nullability flag.
+#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
+pub struct BooleanType {
+    nullable: bool,
+}
+
+impl Default for BooleanType {
+    /// Same as [`BooleanType::new`]: nullable.
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl BooleanType {
+    /// Creates a nullable `BOOLEAN` type.
+    pub fn new() -> Self {
+        Self { nullable: true }
+    }
+
+    /// Creates a `BOOLEAN` type with the given nullability.
+    pub fn with_nullable(nullable: bool) -> Self {
+        Self { nullable }
+    }
+
+    /// Returns this type with `nullable` set to `false`.
+    pub fn as_non_nullable(&self) -> Self {
+        Self { nullable: false }
+    }
+}
+
+impl Display for BooleanType {
+    /// Writes `BOOLEAN`, followed by ` NOT NULL` when the type is not nullable.
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        let constraint = if self.nullable { "" } else { " NOT NULL" };
+        write!(f, "BOOLEAN{constraint}")
+    }
+}
+
+/// `TINYINT` type descriptor; its only state is the nullability flag.
+#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
+pub struct TinyIntType {
+    nullable: bool,
+}
+
+impl Default for TinyIntType {
+    /// Same as [`TinyIntType::new`]: nullable.
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl TinyIntType {
+    /// Creates a nullable `TINYINT` type.
+    pub fn new() -> Self {
+        Self { nullable: true }
+    }
+
+    /// Creates a `TINYINT` type with the given nullability.
+    pub fn with_nullable(nullable: bool) -> Self {
+        Self { nullable }
+    }
+
+    /// Returns this type with `nullable` set to `false`.
+    pub fn as_non_nullable(&self) -> Self {
+        Self { nullable: false }
+    }
+}
+
+impl Display for TinyIntType {
+    /// Writes `TINYINT`, followed by ` NOT NULL` when the type is not nullable.
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        let constraint = if self.nullable { "" } else { " NOT NULL" };
+        write!(f, "TINYINT{constraint}")
+    }
+}
+
+/// `SMALLINT` type descriptor; its only state is the nullability flag.
+#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
+pub struct SmallIntType {
+    nullable: bool,
+}
+
+impl Default for SmallIntType {
+    /// Same as [`SmallIntType::new`]: nullable.
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SmallIntType {
+    /// Creates a nullable `SMALLINT` type.
+    pub fn new() -> Self {
+        Self { nullable: true }
+    }
+
+    /// Creates a `SMALLINT` type with the given nullability.
+    pub fn with_nullable(nullable: bool) -> Self {
+        Self { nullable }
+    }
+
+    /// Returns this type with `nullable` set to `false`.
+    pub fn as_non_nullable(&self) -> Self {
+        Self { nullable: false }
+    }
+}
+
+impl Display for SmallIntType {
+    /// Writes `SMALLINT`, followed by ` NOT NULL` when the type is not nullable.
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        let constraint = if self.nullable { "" } else { " NOT NULL" };
+        write!(f, "SMALLINT{constraint}")
+    }
+}
+
+/// `INT` type descriptor; its only state is the nullability flag.
+#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
+pub struct IntType {
+    nullable: bool,
+}
+
+impl Default for IntType {
+    /// Same as [`IntType::new`]: nullable.
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl IntType {
+    /// Creates a nullable `INT` type.
+    pub fn new() -> Self {
+        Self { nullable: true }
+    }
+
+    /// Creates an `INT` type with the given nullability.
+    pub fn with_nullable(nullable: bool) -> Self {
+        Self { nullable }
+    }
+
+    /// Returns this type with `nullable` set to `false`.
+    pub fn as_non_nullable(&self) -> Self {
+        Self { nullable: false }
+    }
+}
+
+impl Display for IntType {
+    /// Writes `INT`, followed by ` NOT NULL` when the type is not nullable.
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        let constraint = if self.nullable { "" } else { " NOT NULL" };
+        write!(f, "INT{constraint}")
+    }
+}
+
+/// `BIGINT` type descriptor; its only state is the nullability flag.
+#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
+pub struct BigIntType {
+    nullable: bool,
+}
+
+impl Default for BigIntType {
+    /// Same as [`BigIntType::new`]: nullable.
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl BigIntType {
+    /// Creates a nullable `BIGINT` type.
+    pub fn new() -> Self {
+        Self { nullable: true }
+    }
+
+    /// Creates a `BIGINT` type with the given nullability.
+    pub fn with_nullable(nullable: bool) -> Self {
+        Self { nullable }
+    }
+
+    /// Returns this type with `nullable` set to `false`.
+    pub fn as_non_nullable(&self) -> Self {
+        Self { nullable: false }
+    }
+}
+
+impl Display for BigIntType {
+    /// Writes `BIGINT`, followed by ` NOT NULL` when the type is not nullable.
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        let constraint = if self.nullable { "" } else { " NOT NULL" };
+        write!(f, "BIGINT{constraint}")
+    }
+}
+
+/// `FLOAT` type descriptor; its only state is the nullability flag.
+#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
+pub struct FloatType {
+    nullable: bool,
+}
+
+impl Default for FloatType {
+    /// Same as [`FloatType::new`]: nullable.
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl FloatType {
+    /// Creates a nullable `FLOAT` type.
+    pub fn new() -> Self {
+        Self { nullable: true }
+    }
+
+    /// Creates a `FLOAT` type with the given nullability.
+    pub fn with_nullable(nullable: bool) -> Self {
+        Self { nullable }
+    }
+
+    /// Returns this type with `nullable` set to `false`.
+    pub fn as_non_nullable(&self) -> Self {
+        Self { nullable: false }
+    }
+}
+
+impl Display for FloatType {
+    /// Writes `FLOAT`, followed by ` NOT NULL` when the type is not nullable.
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        let constraint = if self.nullable { "" } else { " NOT NULL" };
+        write!(f, "FLOAT{constraint}")
+    }
+}
+
+/// `DOUBLE` type descriptor; its only state is the nullability flag.
+#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
+pub struct DoubleType {
+    nullable: bool,
+}
+
+impl Default for DoubleType {
+    /// Same as [`DoubleType::new`]: nullable.
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl DoubleType {
+    /// Creates a nullable `DOUBLE` type.
+    pub fn new() -> Self {
+        Self { nullable: true }
+    }
+
+    /// Creates a `DOUBLE` type with the given nullability.
+    pub fn with_nullable(nullable: bool) -> Self {
+        Self { nullable }
+    }
+
+    /// Returns this type with `nullable` set to `false`.
+    pub fn as_non_nullable(&self) -> Self {
+        Self { nullable: false }
+    }
+}
+
+impl Display for DoubleType {
+    /// Writes `DOUBLE`, followed by ` NOT NULL` when the type is not nullable.
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        let constraint = if self.nullable { "" } else { " NOT NULL" };
+        write!(f, "DOUBLE{constraint}")
+    }
+}
+
+/// `CHAR(n)` type descriptor: a length plus the nullability flag.
+#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
+pub struct CharType {
+    nullable: bool,
+    length: u32,
+}
+
+impl CharType {
+    /// Creates a nullable `CHAR` type of the given length.
+    pub fn new(length: u32) -> Self {
+        Self {
+            nullable: true,
+            length,
+        }
+    }
+
+    /// Creates a `CHAR` type with the given length and nullability.
+    /// The length is stored as-is without validation.
+    pub fn with_nullable(length: u32, nullable: bool) -> Self {
+        Self { nullable, length }
+    }
+
+    /// Returns this type with `nullable` set to `false`, keeping the length.
+    pub fn as_non_nullable(&self) -> Self {
+        Self {
+            nullable: false,
+            length: self.length,
+        }
+    }
+
+    /// The declared length.
+    pub fn length(&self) -> u32 {
+        self.length
+    }
+}
+
+impl Default for CharType {
+    /// A nullable `CHAR(1)`.
+    fn default() -> Self {
+        Self::new(1)
+    }
+}
+
+impl Display for CharType {
+    /// Writes `CHAR(<length>)`, followed by ` NOT NULL` when the type is not
+    /// nullable.
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        let constraint = if self.nullable { "" } else { " NOT NULL" };
+        write!(f, "CHAR({}){constraint}", self.length)
+    }
+}
+
+/// `STRING` type descriptor; its only state is the nullability flag.
+#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
+pub struct StringType {
+    nullable: bool,
+}
+
+impl Default for StringType {
+    /// Same as [`StringType::new`]: nullable.
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl StringType {
+    /// Creates a nullable `STRING` type.
+    pub fn new() -> Self {
+        Self { nullable: true }
+    }
+
+    /// Creates a `STRING` type with the given nullability.
+    pub fn with_nullable(nullable: bool) -> Self {
+        Self { nullable }
+    }
+
+    /// Returns this type with `nullable` set to `false`.
+    pub fn as_non_nullable(&self) -> Self {
+        Self { nullable: false }
+    }
+}
+
+impl Display for StringType {
+    /// Writes `STRING`, followed by ` NOT NULL` when the type is not nullable.
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        let constraint = if self.nullable { "" } else { " NOT NULL" };
+        write!(f, "STRING{constraint}")
+    }
+}
+
+/// `DECIMAL(precision, scale)` type descriptor.
+///
+/// The constructors validate precision and scale. The derived `Deserialize`
+/// fills the fields directly and does not run that validation.
+#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
+pub struct DecimalType {
+    nullable: bool,
+    precision: u32,
+    scale: u32,
+}
+
+impl DecimalType {
+    pub const MIN_PRECISION: u32 = 1;
+
+    pub const MAX_PRECISION: u32 = 38;
+
+    pub const DEFAULT_PRECISION: u32 = 10;
+
+    pub const MIN_SCALE: u32 = 0;
+
+    pub const DEFAULT_SCALE: u32 = 0;
+
+    /// Creates a nullable decimal type; see [`DecimalType::with_nullable`]
+    /// for the validation rules and errors.
+    pub fn new(precision: u32, scale: u32) -> Result<Self> {
+        Self::with_nullable(true, precision, scale)
+    }
+
+    /// Create a DecimalType with validation, returning an error if parameters are invalid.
+    ///
+    /// Returns `IllegalArgument` when `precision` lies outside
+    /// `MIN_PRECISION..=MAX_PRECISION` or when `scale` exceeds `precision`.
+    pub fn with_nullable(nullable: bool, precision: u32, scale: u32) -> Result<Self> {
+        // Validate precision
+        if !(Self::MIN_PRECISION..=Self::MAX_PRECISION).contains(&precision) {
+            return Err(IllegalArgument {
+                message: format!(
+                    "Decimal precision must be between {} and {} (both inclusive), got: {}",
+                    Self::MIN_PRECISION,
+                    Self::MAX_PRECISION,
+                    precision
+                ),
+            });
+        }
+        // Validate scale; the lower bound needs no check because `scale` is
+        // unsigned and `MIN_SCALE` is 0.
+        if scale > precision {
+            return Err(IllegalArgument {
+                message: format!(
+                    "Decimal scale must be between {} and the precision {} (both inclusive), got: {}",
+                    Self::MIN_SCALE,
+                    precision,
+                    scale
+                ),
+            });
+        }
+        Ok(DecimalType {
+            nullable,
+            precision,
+            scale,
+        })
+    }
+
+    /// The declared precision.
+    pub fn precision(&self) -> u32 {
+        self.precision
+    }
+
+    /// The declared scale.
+    pub fn scale(&self) -> u32 {
+        self.scale
+    }
+
+    /// Returns this type with `nullable` set to `false`, keeping precision
+    /// and scale. Never panics.
+    pub fn as_non_nullable(&self) -> Self {
+        // Built directly rather than through `with_nullable`: re-validating
+        // here would turn an out-of-range value (possible via `Deserialize`,
+        // which skips the constructor checks) into a panic.
+        Self {
+            nullable: false,
+            precision: self.precision,
+            scale: self.scale,
+        }
+    }
+}
+
+impl Default for DecimalType {
+    /// A nullable `DECIMAL(DEFAULT_PRECISION, DEFAULT_SCALE)`.
+    fn default() -> Self {
+        Self::new(Self::DEFAULT_PRECISION, Self::DEFAULT_SCALE)
+            .expect("Invalid default decimal precision or scale")
+    }
+}
+
+impl Display for DecimalType {
+    /// Writes `DECIMAL(<precision>, <scale>)`, followed by ` NOT NULL` when
+    /// the type is not nullable.
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        write!(f, "DECIMAL({}, {})", self.precision, self.scale)?;
+        if !self.nullable {
+            write!(f, " NOT NULL")?;
+        }
+        Ok(())
+    }
+}
+
+/// `DATE` type descriptor; its only state is the nullability flag.
+#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
+pub struct DateType {
+    nullable: bool,
+}
+
+impl Default for DateType {
+    /// Same as [`DateType::new`]: nullable.
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl DateType {
+    /// Creates a nullable `DATE` type.
+    pub fn new() -> Self {
+        Self { nullable: true }
+    }
+
+    /// Creates a `DATE` type with the given nullability.
+    pub fn with_nullable(nullable: bool) -> Self {
+        Self { nullable }
+    }
+
+    /// Returns this type with `nullable` set to `false`.
+    pub fn as_non_nullable(&self) -> Self {
+        Self { nullable: false }
+    }
+}
+
+impl Display for DateType {
+    /// Writes `DATE`, followed by ` NOT NULL` when the type is not nullable.
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        let constraint = if self.nullable { "" } else { " NOT NULL" };
+        write!(f, "DATE{constraint}")
+    }
+}
+
+/// `TIME(precision)` type descriptor.
+///
+/// The constructors validate the precision. The derived `Deserialize` fills
+/// the fields directly and does not run that validation.
+#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
+pub struct TimeType {
+    nullable: bool,
+    precision: u32,
+}
+
+impl Default for TimeType {
+    /// A nullable `TIME(DEFAULT_PRECISION)`.
+    fn default() -> Self {
+        Self::new(Self::DEFAULT_PRECISION).expect("Invalid default time precision")
+    }
+}
+
+impl TimeType {
+    pub const MIN_PRECISION: u32 = 0;
+
+    pub const MAX_PRECISION: u32 = 9;
+
+    pub const DEFAULT_PRECISION: u32 = 0;
+
+    /// Creates a nullable time type; see [`TimeType::with_nullable`] for the
+    /// validation rule and error.
+    pub fn new(precision: u32) -> Result<Self> {
+        Self::with_nullable(true, precision)
+    }
+
+    /// Create a TimeType with validation, returning an error if precision is invalid.
+    ///
+    /// Returns `IllegalArgument` when `precision` lies outside
+    /// `MIN_PRECISION..=MAX_PRECISION`.
+    pub fn with_nullable(nullable: bool, precision: u32) -> Result<Self> {
+        // Validate precision
+        if !(Self::MIN_PRECISION..=Self::MAX_PRECISION).contains(&precision) {
+            return Err(IllegalArgument {
+                message: format!(
+                    "Time precision must be between {} and {} (both inclusive), got: {}",
+                    Self::MIN_PRECISION,
+                    Self::MAX_PRECISION,
+                    precision
+                ),
+            });
+        }
+        Ok(TimeType {
+            nullable,
+            precision,
+        })
+    }
+
+    /// The declared precision.
+    pub fn precision(&self) -> u32 {
+        self.precision
+    }
+
+    /// Returns this type with `nullable` set to `false`, keeping the
+    /// precision. Never panics.
+    pub fn as_non_nullable(&self) -> Self {
+        // Built directly rather than through `with_nullable`: re-validating
+        // here would turn an out-of-range value (possible via `Deserialize`,
+        // which skips the constructor checks) into a panic.
+        Self {
+            nullable: false,
+            precision: self.precision,
+        }
+    }
+}
+
+impl Display for TimeType {
+    /// Writes `TIME(<precision>)`, followed by ` NOT NULL` when the type is
+    /// not nullable.
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        write!(f, "TIME({})", self.precision)?;
+        if !self.nullable {
+            write!(f, " NOT NULL")?;
+        }
+        Ok(())
+    }
+}
+
+/// `TIMESTAMP(precision)` type descriptor.
+///
+/// The constructors validate the precision. The derived `Deserialize` fills
+/// the fields directly and does not run that validation.
+#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
+pub struct TimestampType {
+    nullable: bool,
+    precision: u32,
+}
+
+impl Default for TimestampType {
+    /// A nullable `TIMESTAMP(DEFAULT_PRECISION)`.
+    fn default() -> Self {
+        Self::new(Self::DEFAULT_PRECISION).expect("Invalid default timestamp precision")
+    }
+}
+
+impl TimestampType {
+    pub const MIN_PRECISION: u32 = 0;
+
+    pub const MAX_PRECISION: u32 = 9;
+
+    pub const DEFAULT_PRECISION: u32 = 6;
+
+    /// Creates a nullable timestamp type; see
+    /// [`TimestampType::with_nullable`] for the validation rule and error.
+    pub fn new(precision: u32) -> Result<Self> {
+        Self::with_nullable(true, precision)
+    }
+
+    /// Create a TimestampType with validation, returning an error if precision is invalid.
+    ///
+    /// Returns `IllegalArgument` when `precision` lies outside
+    /// `MIN_PRECISION..=MAX_PRECISION`.
+    pub fn with_nullable(nullable: bool, precision: u32) -> Result<Self> {
+        // Validate precision
+        if !(Self::MIN_PRECISION..=Self::MAX_PRECISION).contains(&precision) {
+            return Err(IllegalArgument {
+                message: format!(
+                    "Timestamp precision must be between {} and {} (both inclusive), got: {}",
+                    Self::MIN_PRECISION,
+                    Self::MAX_PRECISION,
+                    precision
+                ),
+            });
+        }
+        Ok(TimestampType {
+            nullable,
+            precision,
+        })
+    }
+
+    /// The declared precision.
+    pub fn precision(&self) -> u32 {
+        self.precision
+    }
+
+    /// Returns this type with `nullable` set to `false`, keeping the
+    /// precision. Never panics.
+    pub fn as_non_nullable(&self) -> Self {
+        // Built directly rather than through `with_nullable`: re-validating
+        // here would turn an out-of-range value (possible via `Deserialize`,
+        // which skips the constructor checks) into a panic.
+        Self {
+            nullable: false,
+            precision: self.precision,
+        }
+    }
+}
+
+impl Display for TimestampType {
+    /// Writes `TIMESTAMP(<precision>)`, followed by ` NOT NULL` when the type
+    /// is not nullable.
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        write!(f, "TIMESTAMP({})", self.precision)?;
+        if !self.nullable {
+            write!(f, " NOT NULL")?;
+        }
+        Ok(())
+    }
+}
+
+#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
+pub struct TimestampLTzType {
+    nullable: bool,
+    precision: u32,
+}
+
+impl Default for TimestampLTzType {
+    /// Nullable `TIMESTAMP_LTZ` using [`TimestampLTzType::DEFAULT_PRECISION`].
+    fn default() -> Self {
+        let built = Self::with_nullable(true, Self::DEFAULT_PRECISION);
+        built.expect("Invalid default timestamp with local time zone precision")
+    }
+}
+
+impl TimestampLTzType {
+    /// Smallest accepted number of fractional-second digits.
+    pub const MIN_PRECISION: u32 = 0;
+
+    /// Largest accepted number of fractional-second digits.
+    pub const MAX_PRECISION: u32 = 9;
+
+    /// Precision used by the `Default` implementation.
+    pub const DEFAULT_PRECISION: u32 = 6;
+
+    /// Creates a nullable `TIMESTAMP_LTZ(precision)`.
+    ///
+    /// Returns an `IllegalArgument` error when `precision` lies outside
+    /// `MIN_PRECISION..=MAX_PRECISION`.
+    pub fn new(precision: u32) -> Result<Self> {
+        Self::with_nullable(true, precision)
+    }
+
+    /// Create a TimestampLTzType with validation, returning an error if precision is invalid.
+    pub fn with_nullable(nullable: bool, precision: u32) -> Result<Self> {
+        // Validate precision
+        if !(Self::MIN_PRECISION..=Self::MAX_PRECISION).contains(&precision) {
+            return Err(IllegalArgument {
+                message: format!(
+                    "Timestamp with local time zone precision must be between {} and {} (both inclusive), got: {}",
+                    Self::MIN_PRECISION,
+                    Self::MAX_PRECISION,
+                    precision
+                ),
+            });
+        }
+        Ok(TimestampLTzType {
+            nullable,
+            precision,
+        })
+    }
+
+    /// Number of digits of fractional seconds.
+    pub fn precision(&self) -> u32 {
+        self.precision
+    }
+
+    /// Returns a copy of this type with the same precision that is NOT NULL.
+    pub fn as_non_nullable(&self) -> Self {
+        // Copy the fields instead of re-validating behind `expect`: the signature is
+        // infallible, so this must never panic, however `self` was constructed.
+        Self {
+            nullable: false,
+            precision: self.precision,
+        }
+    }
+}
+
+impl Display for TimestampLTzType {
+    /// Writes `TIMESTAMP_LTZ(p)`, followed by ` NOT NULL` when the type is not nullable.
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        write!(f, "TIMESTAMP_LTZ({})", self.precision)?;
+        if self.nullable {
+            return Ok(());
+        }
+        f.write_str(" NOT NULL")
+    }
+}
+
+/// Type rendered as `BYTES`; its only attribute is nullability.
+#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
+pub struct BytesType {
+    /// Whether values of this type may be NULL.
+    nullable: bool,
+}
+
+impl Default for BytesType {
+    /// Nullable `BYTES`, the same value `BytesType::new()` produces.
+    fn default() -> Self {
+        Self::with_nullable(true)
+    }
+}
+
+impl BytesType {
+    /// Creates a nullable `BYTES` type.
+    pub const fn new() -> Self {
+        Self { nullable: true }
+    }
+
+    /// Creates a `BYTES` type with the given nullability.
+    pub const fn with_nullable(nullable: bool) -> Self {
+        BytesType { nullable }
+    }
+
+    /// Returns the NOT NULL variant of this type.
+    pub fn as_non_nullable(&self) -> Self {
+        Self { nullable: false }
+    }
+}
+
+impl Display for BytesType {
+    /// Writes `BYTES`, followed by ` NOT NULL` when the type is not nullable.
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        f.write_str("BYTES")?;
+        if self.nullable {
+            return Ok(());
+        }
+        f.write_str(" NOT NULL")
+    }
+}
+
+/// Binary type parameterized by a length, rendered as `BINARY(length)`.
+#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
+pub struct BinaryType {
+    /// Whether values of this type may be NULL.
+    nullable: bool,
+    /// Declared length, the `n` in `BINARY(n)`.
+    length: usize,
+}
+
+impl BinaryType {
+    /// Declared lower bound for `length`.
+    // NOTE(review): neither bound is checked by the constructors in this impl; confirm
+    // whether validation is expected to happen elsewhere.
+    pub const MIN_LENGTH: usize = 1;
+
+    /// Declared upper bound for `length`.
+    pub const MAX_LENGTH: usize = usize::MAX;
+
+    /// Length used by the `Default` implementation.
+    pub const DEFAULT_LENGTH: usize = 1;
+
+    /// Creates a nullable `BINARY(length)`.
+    pub fn new(length: usize) -> Self {
+        Self {
+            nullable: true,
+            length,
+        }
+    }
+
+    /// Creates a `BINARY(length)` with the given nullability.
+    pub fn with_nullable(nullable: bool, length: usize) -> Self {
+        BinaryType { nullable, length }
+    }
+
+    /// Declared length of this type.
+    pub fn length(&self) -> usize {
+        self.length
+    }
+
+    /// Returns a copy of this type with the same length that is NOT NULL.
+    pub fn as_non_nullable(&self) -> Self {
+        Self {
+            nullable: false,
+            length: self.length,
+        }
+    }
+}
+
+impl Default for BinaryType {
+    /// Nullable `BINARY` of [`BinaryType::DEFAULT_LENGTH`].
+    fn default() -> Self {
+        Self::with_nullable(true, Self::DEFAULT_LENGTH)
+    }
+}
+
+impl Display for BinaryType {
+    /// Writes `BINARY(n)`, followed by ` NOT NULL` when the type is not nullable.
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        write!(f, "BINARY({})", self.length)?;
+        if self.nullable {
+            return Ok(());
+        }
+        f.write_str(" NOT NULL")
+    }
+}
+
+/// Array type rendered as `ARRAY<element>`.
+#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
+pub struct ArrayType {
+    /// Whether the array value itself may be NULL.
+    nullable: bool,
+    /// Type of the elements; boxed because `DataType` can nest arrays.
+    element_type: Box<DataType>,
+}
+
+impl ArrayType {
+    /// Creates a nullable array of `element_type`.
+    pub fn new(element_type: DataType) -> Self {
+        Self {
+            nullable: true,
+            element_type: Box::new(element_type),
+        }
+    }
+
+    /// Creates an array of `element_type` with the given nullability.
+    pub fn with_nullable(nullable: bool, element_type: DataType) -> Self {
+        let element_type = Box::new(element_type);
+        ArrayType {
+            nullable,
+            element_type,
+        }
+    }
+
+    /// Returns a NOT NULL copy; the element type is cloned unchanged.
+    pub fn as_non_nullable(&self) -> Self {
+        Self {
+            nullable: false,
+            ..self.clone()
+        }
+    }
+
+    /// Type of the array's elements.
+    pub fn get_element_type(&self) -> &DataType {
+        &*self.element_type
+    }
+}
+
+impl Display for ArrayType {
+    /// Writes `ARRAY<element>`, followed by ` NOT NULL` when the array is not nullable.
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        write!(f, "ARRAY<{}>", self.element_type)?;
+        if self.nullable {
+            return Ok(());
+        }
+        f.write_str(" NOT NULL")
+    }
+}
+
+/// Map type rendered as `MAP<key, value>`.
+///
+/// `Deserialize` is not derived here; it is implemented manually for this type.
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Hash)]
+pub struct MapType {
+    /// Whether the map value itself may be NULL.
+    nullable: bool,
+    /// Key type; `MapType::with_nullable` stores it in its non-nullable form.
+    key_type: Box<DataType>,
+    /// Value type, stored as given.
+    value_type: Box<DataType>,
+}
+
+// Deserialization goes through `MapType::with_nullable` rather than a derive, so a
+// Serde-built MapType ends up in the same canonical form as one built by the
+// constructor. Without this, equivalent maps could compare unequal under `PartialEq`.
+impl<'de> Deserialize<'de> for MapType {
+    fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        // Plain mirror of the serialized fields, with no normalization applied yet.
+        #[derive(Deserialize)]
+        struct Raw {
+            nullable: bool,
+            key_type: Box<DataType>,
+            value_type: Box<DataType>,
+        }
+
+        let Raw {
+            nullable,
+            key_type,
+            value_type,
+        } = Raw::deserialize(deserializer)?;
+        Ok(MapType::with_nullable(nullable, *key_type, *value_type))
+    }
+}
+
+impl MapType {
+    /// Creates a nullable map from `key_type` to `value_type`.
+    pub fn new(key_type: DataType, value_type: DataType) -> Self {
+        Self::with_nullable(true, key_type, value_type)
+    }
+
+    /// Creates a map with the given nullability.
+    ///
+    /// The key type is always stored in its non-nullable form, whatever
+    /// nullability the caller passed in.
+    pub fn with_nullable(nullable: bool, key_type: DataType, value_type: DataType) -> Self {
+        let key_type = Box::new(key_type.as_non_nullable());
+        let value_type = Box::new(value_type);
+        MapType {
+            nullable,
+            key_type,
+            value_type,
+        }
+    }
+
+    /// Returns a NOT NULL copy; key and value types are cloned unchanged.
+    pub fn as_non_nullable(&self) -> Self {
+        Self {
+            nullable: false,
+            ..self.clone()
+        }
+    }
+
+    /// Type of the map's keys.
+    pub fn key_type(&self) -> &DataType {
+        &*self.key_type
+    }
+
+    /// Type of the map's values.
+    pub fn value_type(&self) -> &DataType {
+        &*self.value_type
+    }
+}
+
+impl Display for MapType {
+    /// Writes `MAP<key, value>`, followed by ` NOT NULL` when the map is not nullable.
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        write!(f, "MAP<{}, {}>", self.key_type, self.value_type)?;
+        if self.nullable {
+            return Ok(());
+        }
+        f.write_str(" NOT NULL")
+    }
+}
+
+/// Row type: an ordered sequence of named fields, rendered as `ROW<name TYPE, ...>`.
+#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize, Hash)]
+pub struct RowType {
+    /// Whether the row value itself may be NULL.
+    nullable: bool,
+    /// Fields in declaration order.
+    fields: Vec<DataField>,
+}
+
+impl RowType {
+    /// Creates a nullable row type from `fields`.
+    pub const fn new(fields: Vec<DataField>) -> Self {
+        Self::with_nullable(true, fields)
+    }
+
+    /// Creates a row type from `fields` with the given nullability.
+    pub const fn with_nullable(nullable: bool, fields: Vec<DataField>) -> Self {
+        Self { nullable, fields }
+    }
+
+    /// Returns a NOT NULL copy with the same fields.
+    pub fn as_non_nullable(&self) -> Self {
+        Self::with_nullable(false, self.fields.clone())
+    }
+
+    /// Fields of this row in declaration order.
+    pub fn fields(&self) -> &Vec<DataField> {
+        &self.fields
+    }
+
+    /// Position of the first field named `field_name`, or `None` if there is none.
+    pub fn get_field_index(&self, field_name: &str) -> Option<usize> {
+        self.fields.iter().position(|f| f.name == field_name)
+    }
+
+    /// Iterates over the data types of the fields, in field order.
+    pub fn field_types(&self) -> impl Iterator<Item = &DataType> + '_ {
+        self.fields.iter().map(|f| &f.data_type)
+    }
+
+    /// Names of the fields, in field order.
+    pub fn get_field_names(&self) -> Vec<&str> {
+        self.fields.iter().map(|f| f.name.as_str()).collect()
+    }
+
+    /// Projects this row onto the named fields, in the order given.
+    ///
+    /// Returns an `IllegalArgument` error for the first name that does not exist.
+    pub fn project_with_field_names(&self, field_names: &[String]) -> Result<RowType> {
+        let indices: Vec<usize> = field_names
+            .iter()
+            .map(|name| {
+                self.get_field_index(name).ok_or_else(|| IllegalArgument {
+                    message: format!("Field '{name}' does not exist in the row type"),
+                })
+            })
+            .collect::<Result<Vec<_>>>()?;
+
+        self.project(indices.as_slice())
+    }
+
+    /// Projects this row onto the fields at `project_field_positions`, in the order given.
+    ///
+    /// The result keeps this row's nullability. Returns an `IllegalArgument` error
+    /// for the first position that is out of bounds.
+    pub fn project(&self, project_field_positions: &[usize]) -> Result<RowType> {
+        let mut projected = Vec::with_capacity(project_field_positions.len());
+        for &pos in project_field_positions {
+            let field = self.fields.get(pos).ok_or_else(|| IllegalArgument {
+                message: format!("invalid field position: {pos}"),
+            })?;
+            projected.push(field.clone());
+        }
+        Ok(RowType::with_nullable(self.nullable, projected))
+    }
+
+    /// Test helper: nullable row whose fields are named `f0`, `f1`, ... in order.
+    #[cfg(test)]
+    pub fn with_data_types(data_types: Vec<DataType>) -> Self {
+        // The vector is owned, so move each type into its field instead of cloning it.
+        let fields = data_types
+            .into_iter()
+            .enumerate()
+            .map(|(idx, data_type)| DataField::new(format!("f{idx}"), data_type, None))
+            .collect::<Vec<_>>();
+
+        Self::with_nullable(true, fields)
+    }
+
+    /// Test helper: nullable row pairing each data type with the name at the same index.
+    ///
+    /// `zip` stops at the shorter input, so surplus types or names are ignored.
+    #[cfg(test)]
+    pub fn with_data_types_and_field_names(
+        data_types: Vec<DataType>,
+        field_names: Vec<&str>,
+    ) -> Self {
+        // `into_iter` already yields owned types; no clone is needed.
+        let fields = data_types
+            .into_iter()
+            .zip(field_names)
+            .map(|(data_type, field_name)| DataField::new(field_name.to_string(), data_type, None))
+            .collect::<Vec<_>>();
+
+        Self::with_nullable(true, fields)
+    }
+}
+
+impl Display for RowType {
+    /// Writes `ROW<f1, f2, ...>` using each field's own `Display`, followed by
+    /// ` NOT NULL` when the row is not nullable.
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        f.write_str("ROW<")?;
+        let mut remaining = self.fields.iter();
+        if let Some(first) = remaining.next() {
+            write!(f, "{first}")?;
+            // Every later field is preceded by the separator.
+            for field in remaining {
+                f.write_str(", ")?;
+                write!(f, "{field}")?;
+            }
+        }
+        f.write_str(">")?;
+        if self.nullable {
+            return Ok(());
+        }
+        f.write_str(" NOT NULL")
+    }
+}
+
+/// Namespace for factory functions that build `DataType` and `DataField` values.
+pub struct DataTypes;
+
+impl DataTypes {
+    /// Nullable `BINARY(length)` data type.
+    pub fn binary(length: usize) -> DataType {
+        DataType::Binary(BinaryType::new(length))
+    }
+
+    /// Nullable `BYTES` data type.
+    pub const fn bytes() -> DataType {
+        DataType::Bytes(BytesType::new())
+    }
+
+    /// `BOOLEAN` data type as built by `BooleanType::new()`.
+    pub fn boolean() -> DataType {
+        DataType::Boolean(BooleanType::new())
+    }
+
+    /// `INT` data type as built by `IntType::new()`.
+    pub fn int() -> DataType {
+        DataType::Int(IntType::new())
+    }
+
+    /// Data type of a 1-byte signed integer with values from -128 to 127.
+    pub fn tinyint() -> DataType {
+        DataType::TinyInt(TinyIntType::new())
+    }
+
+    /// Data type of a 2-byte signed integer with values from -32,768 to 32,767.
+    pub fn smallint() -> DataType {
+        DataType::SmallInt(SmallIntType::new())
+    }
+
+    /// `BIGINT` data type as built by `BigIntType::new()`.
+    pub fn bigint() -> DataType {
+        DataType::BigInt(BigIntType::new())
+    }
+
+    /// Data type of a 4-byte single precision floating point number.
+    pub fn float() -> DataType {
+        DataType::Float(FloatType::new())
+    }
+
+    /// Data type of an 8-byte double precision floating point number.
+    pub fn double() -> DataType {
+        DataType::Double(DoubleType::new())
+    }
+
+    /// `CHAR(length)` data type as built by `CharType::new(length)`.
+    pub fn char(length: u32) -> DataType {
+        DataType::Char(CharType::new(length))
+    }
+
+    /// Data type of a variable-length character string.
+    pub fn string() -> DataType {
+        DataType::String(StringType::new())
+    }
+
+    /// Data type of a decimal number with fixed precision and scale `DECIMAL(p, s)` where
+    /// `p` is the number of digits in a number (=precision) and `s` is the number of
+    /// digits to the right of the decimal point in a number (=scale). `p` must have a value
+    /// between 1 and 38 (both inclusive). `s` must have a value between 0 and `p` (both inclusive).
+    ///
+    /// # Panics
+    ///
+    /// Panics if `DecimalType::new` rejects `precision` or `scale`.
+    pub fn decimal(precision: u32, scale: u32) -> DataType {
+        DataType::Decimal(DecimalType::new(precision, scale).expect("Invalid decimal parameters"))
+    }
+
+    /// `DATE` data type as built by `DateType::new()`.
+    pub fn date() -> DataType {
+        DataType::Date(DateType::new())
+    }
+
+    /// Data type of a time WITHOUT time zone `TIME` with no fractional seconds by default.
+    pub fn time() -> DataType {
+        DataType::Time(TimeType::default())
+    }
+
+    /// Data type of a time WITHOUT time zone `TIME(p)` where `p` is the number of digits
+    /// of fractional seconds (=precision). `p` must have a value between 0 and 9 (both inclusive).
+    ///
+    /// # Panics
+    ///
+    /// Panics if `TimeType::new` rejects `precision`.
+    pub fn time_with_precision(precision: u32) -> DataType {
+        DataType::Time(TimeType::new(precision).expect("Invalid time precision"))
+    }
+
+    /// Data type of a timestamp WITHOUT time zone `TIMESTAMP` with 6 digits of fractional
+    /// seconds by default.
+    pub fn timestamp() -> DataType {
+        DataType::Timestamp(TimestampType::default())
+    }
+
+    /// Data type of a timestamp WITHOUT time zone `TIMESTAMP(p)` where `p` is the number
+    /// of digits of fractional seconds (=precision). `p` must have a value between 0 and 9
+    /// (both inclusive).
+    ///
+    /// # Panics
+    ///
+    /// Panics if `precision` is outside that range.
+    pub fn timestamp_with_precision(precision: u32) -> DataType {
+        DataType::Timestamp(TimestampType::new(precision).expect("Invalid timestamp precision"))
+    }
+
+    /// Data type of a timestamp WITH LOCAL time zone `TIMESTAMP_LTZ` with 6 digits of
+    /// fractional seconds by default.
+    pub fn timestamp_ltz() -> DataType {
+        DataType::TimestampLTz(TimestampLTzType::default())
+    }
+
+    /// Data type of a timestamp WITH LOCAL time zone `TIMESTAMP_LTZ(p)` where `p` is the number
+    /// of digits of fractional seconds (=precision). `p` must have a value between 0 and 9 (both inclusive).
+    ///
+    /// # Panics
+    ///
+    /// Panics if `precision` is outside that range.
+    pub fn timestamp_ltz_with_precision(precision: u32) -> DataType {
+        DataType::TimestampLTz(
+            TimestampLTzType::new(precision)
+                .expect("Invalid timestamp with local time zone precision"),
+        )
+    }
+
+    /// Data type of an array of elements with same subtype.
+    pub fn array(element: DataType) -> DataType {
+        DataType::Array(ArrayType::new(element))
+    }
+
+    /// Data type of an associative array that maps keys to values.
+    ///
+    /// The key type is stored in its non-nullable form (see `MapType::with_nullable`).
+    pub fn map(key_type: DataType, value_type: DataType) -> DataType {
+        DataType::Map(MapType::new(key_type, value_type))
+    }
+
+    /// Field definition with field name and data type.
+    pub fn field<N: Into<String>>(name: N, data_type: DataType) -> DataField {
+        DataField::new(name, data_type, None)
+    }
+
+    /// Field definition with field name, data type, and a description.
+    pub fn field_with_description<N: Into<String>>(
+        name: N,
+        data_type: DataType,
+        description: String,
+    ) -> DataField {
+        DataField::new(name, data_type, Some(description))
+    }
+
+    /// Data type of a sequence of fields.
+    pub fn row(fields: Vec<DataField>) -> DataType {
+        DataType::Row(RowType::new(fields))
+    }
+
+    /// Data type of a sequence of fields with generated field names (f0, f1, f2, ...).
+    pub fn row_from_types(field_types: Vec<DataType>) -> DataType {
+        let fields = field_types
+            .into_iter()
+            .enumerate()
+            .map(|(i, dt)| DataField::new(format!("f{i}"), dt, None))
+            .collect();
+        DataType::Row(RowType::new(fields))
+    }
+}
+
+/// Field id given to fields created through `DataField::new`, i.e. before an
+/// explicit id has been assigned.
+pub const UNASSIGNED_FIELD_ID: i32 = -1;
+
+/// Returns a copy of `data_type` in which every row field, at any nesting depth,
+/// carries a freshly assigned field id.
+///
+/// Ids come from `counter`, which is incremented before each use, so the first id
+/// handed out is `*counter + 1`. A field receives its id before the fields nested
+/// inside its own type do. Array and map nullability is preserved; types other than
+/// array, map and row are cloned as they are.
+pub fn reassign_field_ids(data_type: &DataType, counter: &mut i32) -> DataType {
+    match data_type {
+        DataType::Array(array) => {
+            let element = reassign_field_ids(array.get_element_type(), counter);
+            DataType::Array(ArrayType::with_nullable(array.nullable, element))
+        }
+        DataType::Map(map) => {
+            // Key first, then value: this fixes the order in which ids are consumed.
+            let key = reassign_field_ids(map.key_type(), counter);
+            let value = reassign_field_ids(map.value_type(), counter);
+            DataType::Map(MapType::with_nullable(map.nullable, key, value))
+        }
+        DataType::Row(row) => {
+            let mut renumbered: Vec<DataField> = Vec::with_capacity(row.fields().len());
+            for field in row.fields() {
+                // Claim this field's id before descending into its own type.
+                *counter += 1;
+                let id = *counter;
+                let inner = reassign_field_ids(&field.data_type, counter);
+                renumbered.push(DataField::with_field_id(
+                    field.name.clone(),
+                    inner,
+                    field.description.clone(),
+                    id,
+                ));
+            }
+            DataType::Row(RowType::with_nullable(row.nullable, renumbered))
+        }
+        other => other.clone(),
+    }
+}
+
+/// A named, typed field of a row, with an optional description and a numeric id.
+///
+/// Equality and hashing are implemented manually for this type and ignore `field_id`.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct DataField {
+    /// Field name.
+    pub name: String,
+    /// Data type of the field.
+    pub data_type: DataType,
+    /// Optional free-text description.
+    pub description: Option<String>,
+    /// Field id; `UNASSIGNED_FIELD_ID` when the field was created via `DataField::new`.
+    pub field_id: i32,
+}
+
+// Equality ignores `field_id`, matching Java's DataField.equals/hashCode; the same
+// exclusion applies to the Eq and Hash impls for this type.
+impl PartialEq for DataField {
+    fn eq(&self, other: &Self) -> bool {
+        // Exhaustive destructuring makes the omission of `field_id` explicit.
+        let Self {
+            name,
+            data_type,
+            description,
+            field_id: _,
+        } = self;
+        *name == other.name && *data_type == other.data_type && *description == other.description
+    }
+}
+
+// Marker impl: the manual `PartialEq` for `DataField` compares only `name`,
+// `data_type` and `description`.
+impl Eq for DataField {}
+
+impl std::hash::Hash for DataField {
+    /// Hashes `name`, `data_type` and `description`, in that order; `field_id` is
+    /// left out.
+    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
+        let Self {
+            name,
+            data_type,
+            description,
+            field_id: _,
+        } = self;
+        name.hash(state);
+        data_type.hash(state);
+        description.hash(state);
+    }
+}
+
+impl DataField {
+    /// Creates a field whose id is `UNASSIGNED_FIELD_ID`.
+    pub fn new<N: Into<String>>(
+        name: N,
+        data_type: DataType,
+        description: Option<String>,
+    ) -> DataField {
+        Self::with_field_id(name, data_type, description, UNASSIGNED_FIELD_ID)
+    }
+
+    /// Creates a field with an explicit `field_id`.
+    pub fn with_field_id<N: Into<String>>(
+        name: N,
+        data_type: DataType,
+        description: Option<String>,
+        field_id: i32,
+    ) -> DataField {
+        let name = name.into();
+        Self {
+            name,
+            data_type,
+            description,
+            field_id,
+        }
+    }
+
+    /// Field name.
+    pub fn name(&self) -> &str {
+        self.name.as_str()
+    }
+
+    /// Data type of the field.
+    pub fn data_type(&self) -> &DataType {
+        &self.data_type
+    }
+
+    /// Field id of this field.
+    pub fn field_id(&self) -> i32 {
+        self.field_id
+    }
+}
+
+impl Display for DataField {
+    /// Writes `<name> <data type>`; description and field id are not rendered.
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        let Self {
+            name, data_type, ..
+        } = self;
+        write!(f, "{} {}", name, data_type)
+    }
+}
+
+#[test]
+fn test_primitive_types_display() {
+    // (rendered type, expected text) for every parameterless primitive type:
+    // the default (nullable) form first, then the NOT NULL form.
+    let cases = [
+        (BooleanType::new().to_string(), "BOOLEAN"),
+        (
+            BooleanType::with_nullable(false).to_string(),
+            "BOOLEAN NOT NULL",
+        ),
+        (TinyIntType::new().to_string(), "TINYINT"),
+        (
+            TinyIntType::with_nullable(false).to_string(),
+            "TINYINT NOT NULL",
+        ),
+        (SmallIntType::new().to_string(), "SMALLINT"),
+        (
+            SmallIntType::with_nullable(false).to_string(),
+            "SMALLINT NOT NULL",
+        ),
+        (IntType::new().to_string(), "INT"),
+        (IntType::with_nullable(false).to_string(), "INT NOT NULL"),
+        (BigIntType::new().to_string(), "BIGINT"),
+        (
+            BigIntType::with_nullable(false).to_string(),
+            "BIGINT NOT NULL",
+        ),
+        (FloatType::new().to_string(), "FLOAT"),
+        (
+            FloatType::with_nullable(false).to_string(),
+            "FLOAT NOT NULL",
+        ),
+        (DoubleType::new().to_string(), "DOUBLE"),
+        (
+            DoubleType::with_nullable(false).to_string(),
+            "DOUBLE NOT NULL",
+        ),
+        (StringType::new().to_string(), "STRING"),
+        (
+            StringType::with_nullable(false).to_string(),
+            "STRING NOT NULL",
+        ),
+        (DateType::new().to_string(), "DATE"),
+        (DateType::with_nullable(false).to_string(), "DATE NOT NULL"),
+        (BytesType::new().to_string(), "BYTES"),
+        (
+            BytesType::with_nullable(false).to_string(),
+            "BYTES NOT NULL",
+        ),
+    ];
+
+    for (rendered, expected) in cases {
+        assert_eq!(rendered, expected);
+    }
+}
+
+#[test]
+fn test_parameterized_types_display() {
+    // (rendered type, expected text) for types that carry a length, precision or scale.
+    let cases = [
+        (CharType::new(10).to_string(), "CHAR(10)"),
+        (
+            CharType::with_nullable(20, false).to_string(),
+            "CHAR(20) NOT NULL",
+        ),
+        (BinaryType::new(100).to_string(), "BINARY(100)"),
+        (
+            BinaryType::with_nullable(false, 256).to_string(),
+            "BINARY(256) NOT NULL",
+        ),
+        (
+            DecimalType::new(10, 2).unwrap().to_string(),
+            "DECIMAL(10, 2)",
+        ),
+        (
+            DecimalType::with_nullable(false, 38, 10)
+                .unwrap()
+                .to_string(),
+            "DECIMAL(38, 10) NOT NULL",
+        ),
+        (TimeType::new(0).unwrap().to_string(), "TIME(0)"),
+        (TimeType::new(3).unwrap().to_string(), "TIME(3)"),
+        (
+            TimeType::with_nullable(false, 9).unwrap().to_string(),
+            "TIME(9) NOT NULL",
+        ),
+        (TimestampType::new(6).unwrap().to_string(), "TIMESTAMP(6)"),
+        (TimestampType::new(0).unwrap().to_string(), "TIMESTAMP(0)"),
+        (
+            TimestampType::with_nullable(false, 9).unwrap().to_string(),
+            "TIMESTAMP(9) NOT NULL",
+        ),
+        (
+            TimestampLTzType::new(6).unwrap().to_string(),
+            "TIMESTAMP_LTZ(6)",
+        ),
+        (
+            TimestampLTzType::new(3).unwrap().to_string(),
+            "TIMESTAMP_LTZ(3)",
+        ),
+        (
+            TimestampLTzType::with_nullable(false, 9)
+                .unwrap()
+                .to_string(),
+            "TIMESTAMP_LTZ(9) NOT NULL",
+        ),
+    ];
+
+    for (rendered, expected) in cases {
+        assert_eq!(rendered, expected);
+    }
+}
+
+#[test]
+fn test_array_display() {
+    // Nullable array, NOT NULL array, and an array nested in an array.
+    assert_eq!(ArrayType::new(DataTypes::int()).to_string(), "ARRAY<INT>");
+    assert_eq!(
+        ArrayType::with_nullable(false, DataTypes::string()).to_string(),
+        "ARRAY<STRING> NOT NULL"
+    );
+
+    let inner = DataTypes::array(DataTypes::int());
+    assert_eq!(ArrayType::new(inner).to_string(), "ARRAY<ARRAY<INT>>");
+}
+
+#[test]
+fn test_map_display() {
+    // Keys always render as NOT NULL, whatever the nullability of the map itself.
+    assert_eq!(
+        MapType::new(DataTypes::string(), DataTypes::int()).to_string(),
+        "MAP<STRING NOT NULL, INT>"
+    );
+    assert_eq!(
+        MapType::with_nullable(false, DataTypes::int(), DataTypes::string()).to_string(),
+        "MAP<INT NOT NULL, STRING> NOT NULL"
+    );
+
+    let inner_value = DataTypes::map(DataTypes::int(), DataTypes::boolean());
+    assert_eq!(
+        MapType::new(DataTypes::string(), inner_value).to_string(),
+        "MAP<STRING NOT NULL, MAP<INT NOT NULL, BOOLEAN>>"
+    );
+}
+
+#[test]
+fn test_map_deserialize_normalises_key_nullability() {
+    // The payload declares a nullable key; deserialization must still yield the
+    // canonical non-nullable key, exactly like the constructor does.
+    let payload = r#"{
+        "nullable": true,
+        "key_type": {"Int": {"nullable": true}},
+        "value_type": {"String": {"nullable": true}}
+    }"#;
+    let decoded = serde_json::from_str::<MapType>(payload).expect("deserialize");
+    let expected = MapType::new(DataTypes::int(), DataTypes::string());
+    assert_eq!(decoded, expected);
+    assert!(!decoded.key_type().is_nullable());
+}
+
+#[test]
+fn test_map_deserialize_normalises_nested_map_keys() {
+    // Both the outer key and the key of the map nested in the value are declared
+    // nullable in the payload; both must come back non-nullable.
+    let payload = r#"{
+        "nullable": true,
+        "key_type": {"String": {"nullable": true}},
+        "value_type": {"Map": {
+            "nullable": true,
+            "key_type": {"Int": {"nullable": true}},
+            "value_type": {"Boolean": {"nullable": true}}
+        }}
+    }"#;
+    let decoded = serde_json::from_str::<MapType>(payload).expect("deserialize");
+    let expected = MapType::new(
+        DataTypes::string(),
+        DataTypes::map(DataTypes::int(), DataTypes::boolean()),
+    );
+    assert_eq!(decoded, expected);
+    assert!(!decoded.key_type().is_nullable());
+    match decoded.value_type() {
+        DataType::Map(inner) => assert!(!inner.key_type().is_nullable()),
+        other => panic!("expected nested Map, got {other:?}"),
+    }
+}
+
+#[test]
+fn test_row_display() {
+    // Nullable row with two fields.
+    let nullable_row = RowType::new(vec![
+        DataTypes::field("id", DataTypes::int()),
+        DataTypes::field("name", DataTypes::string()),
+    ]);
+    assert_eq!(nullable_row.to_string(), "ROW<id INT, name STRING>");
+
+    // NOT NULL row with a single field.
+    let non_null_row =
+        RowType::with_nullable(false, vec![DataTypes::field("age", DataTypes::bigint())]);
+    assert_eq!(non_null_row.to_string(), "ROW<age BIGINT> NOT NULL");
+}
+
+#[test]
+fn test_datatype_display() {
+    // (data type built through the `DataTypes` factory, expected text)
+    let cases = [
+        (DataTypes::boolean(), "BOOLEAN"),
+        (DataTypes::int(), "INT"),
+        (DataTypes::string(), "STRING"),
+        (DataTypes::char(50), "CHAR(50)"),
+        (DataTypes::decimal(10, 2), "DECIMAL(10, 2)"),
+        (DataTypes::time_with_precision(3), "TIME(3)"),
+        (DataTypes::timestamp_with_precision(6), "TIMESTAMP(6)"),
+        (
+            DataTypes::timestamp_ltz_with_precision(9),
+            "TIMESTAMP_LTZ(9)",
+        ),
+        (DataTypes::array(DataTypes::int()), "ARRAY<INT>"),
+        (
+            DataTypes::map(DataTypes::string(), DataTypes::int()),
+            "MAP<STRING NOT NULL, INT>",
+        ),
+    ];
+
+    for (data_type, expected) in cases {
+        assert_eq!(data_type.to_string(), expected);
+    }
+}
+
+#[test]
+fn test_datafield_display() {
+    // (field, expected "<name> <type>" rendering)
+    let cases = [
+        (
+            DataTypes::field("user_id", DataTypes::bigint()),
+            "user_id BIGINT",
+        ),
+        (
+            DataTypes::field("email", DataTypes::string()),
+            "email STRING",
+        ),
+        (
+            DataTypes::field("score", DataTypes::decimal(10, 2)),
+            "score DECIMAL(10, 2)",
+        ),
+    ];
+
+    for (field, expected) in cases {
+        assert_eq!(field.to_string(), expected);
+    }
+}
+
+#[test]
+fn test_complex_nested_display() {
+    // A row mixing a primitive, an array and a map field.
+    let id = DataTypes::field("id", DataTypes::int());
+    let tags = DataTypes::field("tags", DataTypes::array(DataTypes::string()));
+    let metadata = DataTypes::field(
+        "metadata",
+        DataTypes::map(DataTypes::string(), DataTypes::string()),
+    );
+
+    let rendered = DataTypes::row(vec![id, tags, metadata]).to_string();
+    assert_eq!(
+        rendered,
+        "ROW<id INT, tags ARRAY<STRING>, metadata MAP<STRING NOT NULL, STRING>>"
+    );
+}
+
+#[test]
+fn test_non_nullable_datatype() {
+    // `as_non_nullable` must add the NOT NULL marker to the rendering.
+    let int_type = DataTypes::int();
+    assert_eq!(int_type.to_string(), "INT");
+    assert_eq!(int_type.as_non_nullable().to_string(), "INT NOT NULL");
+}
+
+#[test]
+fn test_deeply_nested_types() {
+    // Array of map of row: each level must render inside its parent.
+    let point = DataTypes::row(vec![
+        DataTypes::field("x", DataTypes::int()),
+        DataTypes::field("y", DataTypes::int()),
+    ]);
+    let by_name = DataTypes::map(DataTypes::string(), point);
+    let rendered = DataTypes::array(by_name).to_string();
+    assert_eq!(rendered, "ARRAY<MAP<STRING NOT NULL, ROW<x INT, y INT>>>");
+}
+
+// ============================================================================
+// DecimalType validation tests
+// ============================================================================
+
+#[test]
+fn test_decimal_invalid_precision() {
+    // A precision of 50 is above the allowed maximum and must be rejected.
+    let message = DecimalType::with_nullable(true, 50, 2)
+        .unwrap_err()
+        .to_string();
+    assert!(message.contains("Decimal precision must be between 1 and 38"));
+}
+
+#[test]
+fn test_decimal_invalid_scale() {
+    // A scale larger than the precision must be rejected.
+    let message = DecimalType::with_nullable(true, 10, 15)
+        .unwrap_err()
+        .to_string();
+    assert!(message.contains("Decimal scale must be between 0 and the precision 10"));
+}
+
+// ============================================================================
+// DecimalType validation tests - edge cases
+// ============================================================================
+
+#[test]
+fn test_decimal_valid_precision_and_scale() {
+    // precision=10, scale=2, nullable: rendering must not mention NOT NULL.
+    let mid_range = DecimalType::with_nullable(true, 10, 2).unwrap();
+    assert_eq!((mid_range.precision(), mid_range.scale()), (10, 2));
+    assert!(!mid_range.to_string().contains("NOT NULL"));
+
+    // precision=38, scale=0: the largest precision.
+    let widest = DecimalType::with_nullable(true, 38, 0).unwrap();
+    assert_eq!((widest.precision(), widest.scale()), (38, 0));
+
+    // precision=1, scale=0, non-nullable: rendering must mention NOT NULL.
+    let narrowest = DecimalType::with_nullable(false, 1, 0).unwrap();
+    assert_eq!((narrowest.precision(), narrowest.scale()), (1, 0));
+    assert!(narrowest.to_string().contains("NOT NULL"));
+}
+
+#[test]
+fn test_decimal_invalid_precision_zero() {
+    // precision=0 is below the allowed minimum and must be rejected.
+    let message = DecimalType::with_nullable(true, 0, 0)
+        .unwrap_err()
+        .to_string();
+    assert!(message.contains("Decimal precision must be between 1 and 38"));
+}
+
+#[test]
+fn test_decimal_scale_equals_precision_boundary() {
+    // scale == precision is the upper boundary for scale and must be accepted.
+    let decimal = DecimalType::with_nullable(true, 10, 10).unwrap();
+    assert_eq!((decimal.precision(), decimal.scale()), (10, 10));
+}
+
+// ============================================================================
+// TimeType validation tests
+// ============================================================================
+
+#[test]
+fn test_time_valid_precision() {
+    // Every precision from 0 through 9 must be accepted and reported back unchanged.
+    for precision in 0..=9 {
+        let time = TimeType::with_nullable(true, precision)
+            .unwrap_or_else(|_| panic!("precision {precision} should be valid"));
+        assert_eq!(time.precision(), precision);
+    }
+}
+
+#[test]
+fn test_time_invalid_precision() {
+    // A precision of 10 is one past the maximum and must be rejected.
+    let message = TimeType::with_nullable(true, 10).unwrap_err().to_string();
+    assert!(message.contains("Time precision must be between 0 and 9"));
+}
+
+// ============================================================================
+// TimestampType and TimestampLTzType validation tests
+// ============================================================================
+
+#[test]
+fn test_timestamp_valid_precision() {
+    // Every precision from 0 through 9 must be accepted and reported back unchanged.
+    for precision in 0..=9 {
+        let timestamp_type = TimestampType::with_nullable(true, precision)
+            .unwrap_or_else(|_| panic!("precision {precision} should be valid"));
+        assert_eq!(timestamp_type.precision(), precision);
+    }
+}
+
+#[test]
+fn test_timestamp_invalid_precision() {
+    // A precision of 10 is one past the maximum and must be rejected.
+    let message = TimestampType::with_nullable(true, 10)
+        .unwrap_err()
+        .to_string();
+    assert!(message.contains("Timestamp precision must be between 0 and 9"));
+}
+
+#[test]
+fn test_timestamp_ltz_invalid_precision() {
+    // A precision of 10 is one past the maximum and must be rejected.
+    let message = TimestampLTzType::with_nullable(true, 10)
+        .unwrap_err()
+        .to_string();
+    assert!(message.contains("Timestamp with local time zone precision must be between 0 and 9"));
+}
+
+// ============================================================================
+// RowType projection tests
+// ============================================================================
+
+#[test]
+fn test_row_type_project_valid_indices() {
+    // Source row: (id INT, name STRING, age BIGINT).
+    let column_types = vec![DataTypes::int(), DataTypes::string(), DataTypes::bigint()];
+    let column_names = vec!["id", "name", "age"];
+    let source = RowType::with_data_types_and_field_names(column_types, column_names);
+
+    // Keeping positions 0 and 2 must yield "id" followed by "age".
+    let projected = source.project(&[0, 2]).unwrap();
+    let kept = projected.fields();
+    assert_eq!(kept.len(), 2);
+    assert_eq!(kept[0].name, "id");
+    assert_eq!(kept[1].name, "age");
+}
+
+#[test]
+fn test_row_type_project_empty_indices() {
+    // Source row: (id INT, name STRING, age BIGINT).
+    let column_types = vec![DataTypes::int(), DataTypes::string(), DataTypes::bigint()];
+    let column_names = vec!["id", "name", "age"];
+    let source = RowType::with_data_types_and_field_names(column_types, column_names);
+
+    // Projecting onto no positions succeeds and produces a row type without fields.
+    let projected = source.project(&[]).unwrap();
+    let kept = projected.fields();
+    assert_eq!(kept.len(), 0);
+}
+
+#[test]
+fn test_row_type_project_with_field_names_valid() {
+    // Source row: (id INT, name STRING, age BIGINT).
+    let column_types = vec![DataTypes::int(), DataTypes::string(), DataTypes::bigint()];
+    let column_names = vec!["id", "name", "age"];
+    let source = RowType::with_data_types_and_field_names(column_types, column_names);
+
+    // Selecting "id" and "name" by name keeps both, in the requested order.
+    let wanted = ["id".to_string(), "name".to_string()];
+    let projected = source.project_with_field_names(&wanted).unwrap();
+    let kept = projected.fields();
+    assert_eq!(kept.len(), 2);
+    assert_eq!(kept[0].name, "id");
+    assert_eq!(kept[1].name, "name");
+}
+
+#[test]
+fn test_row_type_project_index_out_of_bounds() {
+    // Source row: (id INT, name STRING, age BIGINT).
+    let column_types = vec![DataTypes::int(), DataTypes::string(), DataTypes::bigint()];
+    let column_names = vec!["id", "name", "age"];
+    let source = RowType::with_data_types_and_field_names(column_types, column_names);
+
+    // Position 5 does not exist in a three-field row, so the projection must fail.
+    let outcome = source.project(&[0, 5]);
+    assert!(outcome.is_err());
+
+    // The error must name the offending position.
+    let message = outcome.unwrap_err().to_string();
+    assert!(message.contains("invalid field position: 5"));
+}
+
+#[test]
+fn test_row_type_project_with_field_names_nonexistent() {
+    // Source row: (id INT, name STRING, age BIGINT).
+    let column_types = vec![DataTypes::int(), DataTypes::string(), DataTypes::bigint()];
+    let column_names = vec!["id", "name", "age"];
+    let source = RowType::with_data_types_and_field_names(column_types, column_names);
+
+    let expected = "Field 'nonexistent' does not exist in the row type";
+
+    // A projection consisting only of an unknown field name must return an error.
+    let only_unknown = ["nonexistent".to_string()];
+    let outcome = source.project_with_field_names(&only_unknown);
+    assert!(outcome.is_err());
+    assert!(outcome.unwrap_err().to_string().contains(expected));
+
+    // Mixing a known name with an unknown one must fail the same way, naming the unknown field.
+    let known_then_unknown = ["id".to_string(), "nonexistent".to_string()];
+    let outcome = source.project_with_field_names(&known_then_unknown);
+    assert!(outcome.is_err());
+    assert!(outcome.unwrap_err().to_string().contains(expected));
+}
+
+#[test]
+fn test_row_type_project_duplicate_indices() {
+    // Source row: (id INT, name STRING, age BIGINT).
+    let column_types = vec![DataTypes::int(), DataTypes::string(), DataTypes::bigint()];
+    let column_names = vec!["id", "name", "age"];
+    let source = RowType::with_data_types_and_field_names(column_types, column_names);
+
+    // Pins the current contract: repeated positions are allowed and each
+    // occurrence produces its own output field.
+    let projected = source.project(&[0, 0, 1]).unwrap();
+    let kept = projected.fields();
+    assert_eq!(kept.len(), 3);
+    assert_eq!(kept[0].name, "id");
+    assert_eq!(kept[1].name, "id");
+    assert_eq!(kept[2].name, "name");
+}
+
+#[cfg(test)]
+mod eq_ignore_nullable_tests {
+    use super::*;
+
+    /// `PartialEq` tells the two INT types apart, while `eq_ignore_nullable`
+    /// treats them as equal in both directions.
+    #[test]
+    fn ignores_nullability_at_top_level() {
+        let lhs = DataType::Int(IntType::new());
+        let rhs = DataType::Int(IntType::with_nullable(false));
+
+        assert_ne!(lhs, rhs, "PartialEq still distinguishes");
+        assert!(lhs.eq_ignore_nullable(&rhs));
+        assert!(rhs.eq_ignore_nullable(&lhs));
+    }
+
+    /// Two different type kinds never compare equal.
+    #[test]
+    fn rejects_different_kinds() {
+        let int = DataType::Int(IntType::new());
+        let bigint = DataType::BigInt(BigIntType::new());
+
+        assert!(!int.eq_ignore_nullable(&bigint));
+    }
+
+    /// Type parameters (length, precision, scale) still take part in the comparison.
+    #[test]
+    fn compares_parameterized_types() {
+        // Char length must match.
+        let char_of = |length, nullable| DataType::Char(CharType::with_nullable(length, nullable));
+        assert!(char_of(10, true).eq_ignore_nullable(&char_of(10, false)));
+        assert!(!char_of(10, true).eq_ignore_nullable(&char_of(11, true)));
+
+        // Decimal precision + scale must match.
+        let decimal_of = |nullable, scale| {
+            DataType::Decimal(DecimalType::with_nullable(nullable, 10, scale).unwrap())
+        };
+        let base = decimal_of(true, 2);
+        let same_params = decimal_of(false, 2);
+        let other_scale = decimal_of(true, 3);
+        assert!(base.eq_ignore_nullable(&same_params));
+        assert!(!base.eq_ignore_nullable(&other_scale));
+    }
+
+    /// Nullability is ignored inside ARRAY elements and MAP keys/values as well.
+    #[test]
+    fn recurses_into_array_and_map() {
+        // Array<Int NULL> ~ Array<Int NOT NULL>
+        let array_lhs = DataType::Array(ArrayType::with_nullable(
+            true,
+            DataType::Int(IntType::new()),
+        ));
+        let array_rhs = DataType::Array(ArrayType::with_nullable(
+            false,
+            DataType::Int(IntType::with_nullable(false)),
+        ));
+        assert!(array_lhs.eq_ignore_nullable(&array_rhs));
+
+        // Map<String, Int> on both sides, mixed nullability.
+        let map_lhs = DataType::Map(MapType::with_nullable(
+            true,
+            DataType::String(StringType::new()),
+            DataType::Int(IntType::new()),
+        ));
+        let map_rhs = DataType::Map(MapType::with_nullable(
+            false,
+            DataType::String(StringType::with_nullable(false)),
+            DataType::Int(IntType::with_nullable(false)),
+        ));
+        assert!(map_lhs.eq_ignore_nullable(&map_rhs));
+
+        // A different value type is still a mismatch.
+        let map_other_value = DataType::Map(MapType::with_nullable(
+            true,
+            DataType::String(StringType::new()),
+            DataType::BigInt(BigIntType::new()),
+        ));
+        assert!(!map_lhs.eq_ignore_nullable(&map_other_value));
+    }
+
+    /// Nullability is ignored inside ROW fields, but field names must agree.
+    #[test]
+    fn recurses_into_row_fields() {
+        let row_lhs = DataType::Row(RowType::new(vec![
+            DataField::new("a", DataType::Int(IntType::new()), None),
+            DataField::new("b", DataType::String(StringType::new()), None),
+        ]));
+
+        let non_nullable_fields = vec![
+            DataField::new("a", DataType::Int(IntType::with_nullable(false)), None),
+            DataField::new(
+                "b",
+                DataType::String(StringType::with_nullable(false)),
+                None,
+            ),
+        ];
+        let row_rhs = DataType::Row(RowType::with_nullable(false, non_nullable_fields));
+        assert!(row_lhs.eq_ignore_nullable(&row_rhs));
+
+        // Field name mismatch must fail.
+        let row_renamed = DataType::Row(RowType::new(vec![
+            DataField::new("renamed_a", DataType::Int(IntType::new()), None),
+            DataField::new("b", DataType::String(StringType::new()), None),
+        ]));
+        assert!(!row_lhs.eq_ignore_nullable(&row_renamed));
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/metadata/json_serde.rs b/fluss-rust/crates/fluss/src/metadata/json_serde.rs
new file mode 100644
index 0000000000..b08159ae66
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/metadata/json_serde.rs
@@ -0,0 +1,1154 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::error::Error::JsonSerdeError;
+use crate::error::{Error, Result};
+use crate::metadata::datatype::{
+    DataField, DataType, DataTypes, DecimalType, TimeType, TimestampLTzType, TimestampType,
+    UNASSIGNED_FIELD_ID,
+};
+use crate::metadata::table::{Column, Schema, TableDescriptor};
+use serde_json::{Value, json};
+use std::collections::HashMap;
+
+/// Conversion between a value and its `serde_json::Value` representation.
+///
+/// Implemented in this file for `DataType`, `DataField`, `Column`, `Schema`
+/// and `TableDescriptor`.
+pub trait JsonSerde: Sized {
+    /// Builds the JSON representation of `self`.
+    fn serialize_json(&self) -> Result<Value>;
+
+    /// Reconstructs a value from `node`, returning an error when the node
+    /// cannot be interpreted as `Self`.
+    fn deserialize_json(node: &Value) -> Result<Self>;
+}
+
+impl DataType {
+    /// Returns the type-root name written to the JSON `type` field for this
+    /// data type (for example `"INTEGER"` for `DataType::Int`).
+    ///
+    /// Every arm yields a string literal, so the result is `'static` and may
+    /// outlive the `DataType` it was obtained from.
+    pub fn to_type_root(&self) -> &'static str {
+        match self {
+            DataType::Boolean(_) => "BOOLEAN",
+            DataType::TinyInt(_) => "TINYINT",
+            DataType::SmallInt(_) => "SMALLINT",
+            DataType::Int(_) => "INTEGER",
+            DataType::BigInt(_) => "BIGINT",
+            DataType::Float(_) => "FLOAT",
+            DataType::Double(_) => "DOUBLE",
+            DataType::Char(_) => "CHAR",
+            DataType::String(_) => "STRING",
+            DataType::Decimal(_) => "DECIMAL",
+            DataType::Date(_) => "DATE",
+            DataType::Time(_) => "TIME_WITHOUT_TIME_ZONE",
+            DataType::Timestamp(_) => "TIMESTAMP_WITHOUT_TIME_ZONE",
+            DataType::TimestampLTz(_) => "TIMESTAMP_WITH_LOCAL_TIME_ZONE",
+            DataType::Bytes(_) => "BYTES",
+            DataType::Binary(_) => "BINARY",
+            DataType::Array(_) => "ARRAY",
+            DataType::Map(_) => "MAP",
+            DataType::Row(_) => "ROW",
+        }
+    }
+}
+
+/// JSON field names used when (de)serializing a `DataType`.
+impl DataType {
+    // Common to every type.
+    const FIELD_NAME_TYPE_NAME: &'static str = "type";
+    const FIELD_NAME_NULLABLE: &'static str = "nullable";
+    // CHAR / BINARY
+    const FIELD_NAME_LENGTH: &'static str = "length";
+    // DECIMAL / TIME / TIMESTAMP
+    const FIELD_NAME_PRECISION: &'static str = "precision";
+    const FIELD_NAME_SCALE: &'static str = "scale";
+    // ARRAY
+    const FIELD_NAME_ELEMENT_TYPE: &'static str = "element_type";
+    // MAP
+    const FIELD_NAME_KEY_TYPE: &'static str = "key_type";
+    const FIELD_NAME_VALUE_TYPE: &'static str = "value_type";
+    // ROW
+    const FIELD_NAME_FIELDS: &'static str = "fields";
+    // The per-field names below are kept under `allow(dead_code)` as in the
+    // original; `DataField` defines its own constants for the same keys.
+    #[allow(dead_code)]
+    const FIELD_NAME_FIELD_NAME: &'static str = "name";
+    #[allow(dead_code)]
+    const FIELD_NAME_FIELD_TYPE: &'static str = "field_type";
+    #[allow(dead_code)]
+    const FIELD_NAME_FIELD_DESCRIPTION: &'static str = "description";
+}
+
+impl JsonSerde for DataType {
+    /// Serializes this type into a JSON object.
+    ///
+    /// The object always carries the type root under `type`. `nullable` is
+    /// written only when the type is NOT nullable. Parameterized types add
+    /// their parameters (`length`, `precision`, `scale`), and nested types
+    /// (`ARRAY`, `MAP`, `ROW`) serialize their children recursively.
+    fn serialize_json(&self) -> Result<Value> {
+        let mut obj = serde_json::Map::new();
+
+        obj.insert(
+            Self::FIELD_NAME_TYPE_NAME.to_string(),
+            json!(Self::to_type_root(self)),
+        );
+        if !self.is_nullable() {
+            obj.insert(Self::FIELD_NAME_NULLABLE.to_string(), json!(false));
+        }
+
+        match self {
+            DataType::Boolean(_)
+            | DataType::TinyInt(_)
+            | DataType::SmallInt(_)
+            | DataType::Int(_)
+            | DataType::BigInt(_)
+            | DataType::Float(_)
+            | DataType::Double(_)
+            | DataType::String(_)
+            | DataType::Bytes(_)
+            | DataType::Date(_) => {
+                // No parameters beyond the type root and nullability.
+            }
+            DataType::Char(char_type) => {
+                obj.insert(
+                    Self::FIELD_NAME_LENGTH.to_string(),
+                    json!(char_type.length()),
+                );
+            }
+            DataType::Binary(binary_type) => {
+                obj.insert(
+                    Self::FIELD_NAME_LENGTH.to_string(),
+                    json!(binary_type.length()),
+                );
+            }
+            DataType::Decimal(decimal_type) => {
+                obj.insert(
+                    Self::FIELD_NAME_PRECISION.to_string(),
+                    json!(decimal_type.precision()),
+                );
+                obj.insert(
+                    Self::FIELD_NAME_SCALE.to_string(),
+                    json!(decimal_type.scale()),
+                );
+            }
+            DataType::Time(time_type) => {
+                obj.insert(
+                    Self::FIELD_NAME_PRECISION.to_string(),
+                    json!(time_type.precision()),
+                );
+            }
+            DataType::Timestamp(timestamp_type) => {
+                obj.insert(
+                    Self::FIELD_NAME_PRECISION.to_string(),
+                    json!(timestamp_type.precision()),
+                );
+            }
+            DataType::TimestampLTz(timestamp_ltz_type) => {
+                obj.insert(
+                    Self::FIELD_NAME_PRECISION.to_string(),
+                    json!(timestamp_ltz_type.precision()),
+                );
+            }
+            DataType::Array(array_type) => {
+                obj.insert(
+                    Self::FIELD_NAME_ELEMENT_TYPE.to_string(),
+                    array_type.get_element_type().serialize_json()?,
+                );
+            }
+            DataType::Map(map_type) => {
+                obj.insert(
+                    Self::FIELD_NAME_KEY_TYPE.to_string(),
+                    map_type.key_type().serialize_json()?,
+                );
+                obj.insert(
+                    Self::FIELD_NAME_VALUE_TYPE.to_string(),
+                    map_type.value_type().serialize_json()?,
+                );
+            }
+            DataType::Row(row_type) => {
+                let fields: Vec<Value> = row_type
+                    .fields()
+                    .iter()
+                    .map(|field| field.serialize_json())
+                    .collect::<Result<_>>()?;
+                obj.insert(Self::FIELD_NAME_FIELDS.to_string(), json!(fields));
+            }
+        }
+        Ok(Value::Object(obj))
+    }
+
+    /// Rebuilds a `DataType` from the JSON object produced by `serialize_json`.
+    ///
+    /// Defaults applied when an optional parameter is absent (or is not an
+    /// unsigned integer): DECIMAL `scale` = 0, TIME `precision` = 0,
+    /// TIMESTAMP / TIMESTAMP_LTZ `precision` = 6, BINARY `length` = 1.
+    /// A missing or non-boolean `nullable` leaves the type nullable.
+    ///
+    /// # Errors
+    /// Returns `Error::JsonSerdeError` when the `type` field is missing or
+    /// unknown, when a required parameter or child node is missing, when a
+    /// numeric parameter does not fit the target integer type, or when the
+    /// type constructor rejects the parameters.
+    fn deserialize_json(node: &Value) -> Result<Self> {
+        let type_root = node
+            .get(Self::FIELD_NAME_TYPE_NAME)
+            .and_then(|v| v.as_str())
+            .ok_or_else(|| Error::JsonSerdeError {
+                message: format!(
+                    "Couldn't find field {} while deserializing datatype.",
+                    Self::FIELD_NAME_TYPE_NAME
+                ),
+            })?;
+
+        // Error used whenever a mandatory field is absent.
+        let missing = |field: &str| Error::JsonSerdeError {
+            message: format!("Missing required field: {field}"),
+        };
+
+        // Reads an unsigned integer field. `Ok(None)` means "absent or not an
+        // unsigned integer" (the caller decides whether that is an error or a
+        // default). A value that does not fit in u32 is rejected instead of
+        // being silently truncated by an `as` cast.
+        let read_u32 = |field: &str| -> Result<Option<u32>> {
+            node.get(field)
+                .and_then(|v| v.as_u64())
+                .map(|raw| {
+                    u32::try_from(raw).map_err(|_| Error::JsonSerdeError {
+                        message: format!("Value {raw} of field {field} does not fit in u32"),
+                    })
+                })
+                .transpose()
+        };
+
+        let data_type = match type_root {
+            "BOOLEAN" => DataTypes::boolean(),
+            "TINYINT" => DataTypes::tinyint(),
+            "SMALLINT" => DataTypes::smallint(),
+            "INTEGER" => DataTypes::int(),
+            "BIGINT" => DataTypes::bigint(),
+            "FLOAT" => DataTypes::float(),
+            "DOUBLE" => DataTypes::double(),
+            "CHAR" => {
+                let length = read_u32(Self::FIELD_NAME_LENGTH)?
+                    .ok_or_else(|| missing(Self::FIELD_NAME_LENGTH))?;
+                DataTypes::char(length)
+            }
+            "STRING" => DataTypes::string(),
+            "DECIMAL" => {
+                let precision = read_u32(Self::FIELD_NAME_PRECISION)?
+                    .ok_or_else(|| missing(Self::FIELD_NAME_PRECISION))?;
+                let scale = read_u32(Self::FIELD_NAME_SCALE)?.unwrap_or(0);
+                DataType::Decimal(DecimalType::with_nullable(true, precision, scale).map_err(
+                    |e| Error::JsonSerdeError {
+                        message: format!("Invalid DECIMAL parameters: {e}"),
+                    },
+                )?)
+            }
+            "DATE" => DataTypes::date(),
+            "TIME_WITHOUT_TIME_ZONE" => {
+                let precision = read_u32(Self::FIELD_NAME_PRECISION)?.unwrap_or(0);
+                DataType::Time(TimeType::with_nullable(true, precision).map_err(|e| {
+                    Error::JsonSerdeError {
+                        message: format!("Invalid TIME_WITHOUT_TIME_ZONE precision: {e}"),
+                    }
+                })?)
+            }
+            "TIMESTAMP_WITHOUT_TIME_ZONE" => {
+                let precision = read_u32(Self::FIELD_NAME_PRECISION)?.unwrap_or(6);
+                DataType::Timestamp(TimestampType::with_nullable(true, precision).map_err(|e| {
+                    Error::JsonSerdeError {
+                        message: format!("Invalid TIMESTAMP_WITHOUT_TIME_ZONE precision: {e}"),
+                    }
+                })?)
+            }
+            "TIMESTAMP_WITH_LOCAL_TIME_ZONE" => {
+                let precision = read_u32(Self::FIELD_NAME_PRECISION)?.unwrap_or(6);
+                DataType::TimestampLTz(TimestampLTzType::with_nullable(true, precision).map_err(
+                    |e| Error::JsonSerdeError {
+                        message: format!("Invalid TIMESTAMP_WITH_LOCAL_TIME_ZONE precision: {e}"),
+                    },
+                )?)
+            }
+            "BYTES" => DataTypes::bytes(),
+            "BINARY" => {
+                // Same overflow guard as `read_u32`, but for the usize length
+                // that `DataTypes::binary` takes.
+                let length = match node.get(Self::FIELD_NAME_LENGTH).and_then(|v| v.as_u64()) {
+                    Some(raw) => usize::try_from(raw).map_err(|_| Error::JsonSerdeError {
+                        message: format!(
+                            "Value {raw} of field {} does not fit in usize",
+                            Self::FIELD_NAME_LENGTH
+                        ),
+                    })?,
+                    None => 1,
+                };
+                DataTypes::binary(length)
+            }
+            "ARRAY" => {
+                let element_type_node = node
+                    .get(Self::FIELD_NAME_ELEMENT_TYPE)
+                    .ok_or_else(|| missing(Self::FIELD_NAME_ELEMENT_TYPE))?;
+                let element_type = DataType::deserialize_json(element_type_node)?;
+                DataTypes::array(element_type)
+            }
+            "MAP" => {
+                let key_type_node = node
+                    .get(Self::FIELD_NAME_KEY_TYPE)
+                    .ok_or_else(|| missing(Self::FIELD_NAME_KEY_TYPE))?;
+                let key_type = DataType::deserialize_json(key_type_node)?;
+                let value_type_node = node
+                    .get(Self::FIELD_NAME_VALUE_TYPE)
+                    .ok_or_else(|| missing(Self::FIELD_NAME_VALUE_TYPE))?;
+                let value_type = DataType::deserialize_json(value_type_node)?;
+                DataTypes::map(key_type, value_type)
+            }
+            "ROW" => {
+                let fields_node = node
+                    .get(Self::FIELD_NAME_FIELDS)
+                    .ok_or_else(|| missing(Self::FIELD_NAME_FIELDS))?
+                    .as_array()
+                    .ok_or_else(|| Error::JsonSerdeError {
+                        message: format!("{} must be an array", Self::FIELD_NAME_FIELDS),
+                    })?;
+                let fields = fields_node
+                    .iter()
+                    .map(DataField::deserialize_json)
+                    .collect::<Result<Vec<_>>>()?;
+                DataTypes::row(fields)
+            }
+            _ => {
+                return Err(Error::JsonSerdeError {
+                    message: format!("Unknown type root: {type_root}"),
+                });
+            }
+        };
+
+        // All types above are built nullable; only an explicit
+        // `"nullable": false` switches the result to non-nullable.
+        let explicitly_non_nullable =
+            node.get(Self::FIELD_NAME_NULLABLE).and_then(Value::as_bool) == Some(false);
+        if explicitly_non_nullable {
+            return Ok(data_type.as_non_nullable());
+        }
+        Ok(data_type)
+    }
+}
+
+/// JSON field names used when (de)serializing a `DataField`.
+impl DataField {
+    const NAME: &'static str = "name";
+    const FIELD_TYPE: &'static str = "field_type";
+    // Written only when the field has a description.
+    const DESCRIPTION: &'static str = "description";
+    const FIELD_ID: &'static str = "field_id";
+}
+
+impl JsonSerde for DataField {
+    /// Serializes the field as an object with `name`, `field_type`, an
+    /// optional `description`, and `field_id`.
+    fn serialize_json(&self) -> Result<Value> {
+        let mut obj = serde_json::Map::new();
+
+        obj.insert(Self::NAME.to_string(), json!(self.name()));
+        obj.insert(
+            Self::FIELD_TYPE.to_string(),
+            self.data_type.serialize_json()?,
+        );
+
+        if let Some(description) = &self.description {
+            obj.insert(Self::DESCRIPTION.to_string(), json!(description));
+        }
+
+        obj.insert(Self::FIELD_ID.to_string(), json!(self.field_id()));
+
+        Ok(Value::Object(obj))
+    }
+
+    /// Rebuilds a `DataField` from its JSON object.
+    ///
+    /// `name` and `field_type` are required. `description` is optional. A
+    /// missing (or non-integer) `field_id` falls back to `UNASSIGNED_FIELD_ID`.
+    ///
+    /// # Errors
+    /// Returns `Error::JsonSerdeError` when a required field is missing, when
+    /// the nested type cannot be deserialized, or when `field_id` does not fit
+    /// in `i32`.
+    fn deserialize_json(node: &Value) -> Result<DataField> {
+        let name = node
+            .get(Self::NAME)
+            .and_then(|v| v.as_str())
+            .ok_or_else(|| Error::JsonSerdeError {
+                message: format!("Missing required field: {}", Self::NAME),
+            })?
+            .to_string();
+
+        let field_type_node = node
+            .get(Self::FIELD_TYPE)
+            .ok_or_else(|| Error::JsonSerdeError {
+                message: format!("Missing required field: {}", Self::FIELD_TYPE),
+            })?;
+
+        let data_type = DataType::deserialize_json(field_type_node)?;
+
+        let description = node
+            .get(Self::DESCRIPTION)
+            .and_then(|v| v.as_str())
+            .map(|s| s.to_string());
+
+        // Reject out-of-range ids instead of wrapping them with `as i32`,
+        // matching how `Column` validates its `id`.
+        let field_id = match node.get(Self::FIELD_ID).and_then(|v| v.as_i64()) {
+            Some(raw) => i32::try_from(raw).map_err(|_| Error::JsonSerdeError {
+                message: format!("Field id {raw} does not fit in i32"),
+            })?,
+            None => UNASSIGNED_FIELD_ID,
+        };
+
+        Ok(DataField::with_field_id(
+            name,
+            data_type,
+            description,
+            field_id,
+        ))
+    }
+}
+
+/// JSON field names used when (de)serializing a `Column`.
+impl Column {
+    const NAME: &'static str = "name";
+    const DATA_TYPE: &'static str = "data_type";
+    // Written only when the column has a comment.
+    const COMMENT: &'static str = "comment";
+    const ID: &'static str = "id";
+}
+
+impl JsonSerde for Column {
+    /// Serializes the column as an object with `name`, `data_type`, an
+    /// optional `comment`, and `id`.
+    fn serialize_json(&self) -> Result<Value> {
+        let mut json_column = serde_json::Map::new();
+
+        let serialized_type = self.data_type().serialize_json()?;
+        json_column.insert(Self::NAME.to_string(), json!(self.name()));
+        json_column.insert(Self::DATA_TYPE.to_string(), serialized_type);
+
+        if let Some(comment) = self.comment() {
+            json_column.insert(Self::COMMENT.to_string(), json!(comment));
+        }
+
+        // The Java client requires `id` on input.
+        json_column.insert(Self::ID.to_string(), json!(self.id()));
+
+        Ok(Value::Object(json_column))
+    }
+
+    /// Rebuilds a `Column` from its JSON object.
+    ///
+    /// `name` and `data_type` are required; `comment` and `id` are applied
+    /// only when present. An `id` that does not fit in `i32` is an error.
+    fn deserialize_json(node: &Value) -> Result<Column> {
+        let name = match node.get(Self::NAME).and_then(Value::as_str) {
+            Some(name) => name,
+            None => {
+                return Err(Error::JsonSerdeError {
+                    message: format!("Missing required field: {}", Self::NAME),
+                });
+            }
+        };
+
+        let data_type = match node.get(Self::DATA_TYPE) {
+            Some(type_node) => DataType::deserialize_json(type_node)?,
+            None => {
+                return Err(Error::JsonSerdeError {
+                    message: format!("Missing required field: {}", Self::DATA_TYPE),
+                });
+            }
+        };
+
+        let column = Column::new(name, data_type);
+        let column = match node.get(Self::COMMENT).and_then(Value::as_str) {
+            Some(comment) => column.with_comment(comment),
+            None => column,
+        };
+
+        // Pre-id JSON is treated as unassigned; SchemaBuilder will
+        // auto-assign on build.
+        match node.get(Self::ID).and_then(Value::as_i64) {
+            None => Ok(column),
+            Some(raw_id) => {
+                let id = i32::try_from(raw_id).map_err(|_| Error::JsonSerdeError {
+                    message: format!("Column id {raw_id} does not fit in i32"),
+                })?;
+                Ok(column.with_id(id))
+            }
+        }
+    }
+}
+
+/// JSON field names and format version used when (de)serializing a `Schema`.
+impl Schema {
+    const COLUMNS_NAME: &'static str = "columns";
+    // Written only when the schema has a primary key.
+    const PRIMARY_KEY_NAME: &'static str = "primary_key";
+    const HIGHEST_FIELD_ID: &'static str = "highest_field_id";
+    const VERSION_KEY: &'static str = "version";
+    // Value written under `VERSION_KEY` by `serialize_json`.
+    const VERSION: u32 = 1;
+}
+
+impl JsonSerde for Schema {
+    /// Serializes the schema as an object with `version`, `columns`, an
+    /// optional `primary_key` (array of column names), and `highest_field_id`.
+    fn serialize_json(&self) -> Result<Value> {
+        let mut json_schema = serde_json::Map::new();
+
+        // Format version comes first.
+        json_schema.insert(Self::VERSION_KEY.to_string(), json!(Self::VERSION));
+
+        // Columns, each serialized through its own `JsonSerde` impl.
+        let mut json_columns: Vec<Value> = Vec::new();
+        for column in self.columns().iter() {
+            json_columns.push(column.serialize_json()?);
+        }
+        json_schema.insert(Self::COLUMNS_NAME.to_string(), Value::Array(json_columns));
+
+        // Primary key column names, only when a primary key is defined.
+        if let Some(primary_key) = &self.primary_key() {
+            let mut key_names: Vec<Value> = Vec::new();
+            for column_name in primary_key.column_names().iter() {
+                key_names.push(json!(column_name));
+            }
+            json_schema.insert(Self::PRIMARY_KEY_NAME.to_string(), Value::Array(key_names));
+        }
+
+        let highest_field_id = json!(self.highest_field_id());
+        json_schema.insert(Self::HIGHEST_FIELD_ID.to_string(), highest_field_id);
+
+        Ok(Value::Object(json_schema))
+    }
+
+    /// Rebuilds a `Schema` from its JSON object via `Schema::builder()`.
+    ///
+    /// `columns` is required and must be an array. `primary_key`, when
+    /// present, must be an array of strings. Only these two fields are read.
+    fn deserialize_json(node: &Value) -> Result<Schema> {
+        let column_nodes = match node.get(Self::COLUMNS_NAME) {
+            None => {
+                return Err(Error::JsonSerdeError {
+                    message: format!("Missing required field: {}", Self::COLUMNS_NAME),
+                });
+            }
+            Some(columns_value) => match columns_value.as_array() {
+                Some(nodes) => nodes,
+                None => {
+                    return Err(Error::JsonSerdeError {
+                        message: format!("{} must be an array", Self::COLUMNS_NAME),
+                    });
+                }
+            },
+        };
+
+        let columns = column_nodes
+            .iter()
+            .map(Column::deserialize_json)
+            .collect::<Result<Vec<_>>>()?;
+
+        let builder = Schema::builder().with_columns(columns);
+
+        let builder = match node.get(Self::PRIMARY_KEY_NAME) {
+            None => builder,
+            Some(pk_node) => {
+                let name_nodes = pk_node
+                    .as_array()
+                    .ok_or_else(|| Error::invalid_table("Primary key must be an array"))?;
+
+                let primary_keys = name_nodes
+                    .iter()
+                    .map(|name_node| {
+                        name_node.as_str().ok_or_else(|| {
+                            Error::invalid_table("Primary key element must be a string")
+                        })
+                    })
+                    .collect::<Result<Vec<_>>>()?;
+
+                builder.primary_key(primary_keys)
+            }
+        };
+
+        builder.build()
+    }
+}
+
+/// JSON field names, format version, and helpers used when (de)serializing a
+/// `TableDescriptor`.
+impl TableDescriptor {
+    const SCHEMA_NAME: &'static str = "schema";
+    const COMMENT_NAME: &'static str = "comment";
+    const PARTITION_KEY_NAME: &'static str = "partition_key";
+    const BUCKET_KEY_NAME: &'static str = "bucket_key";
+    const BUCKET_COUNT_NAME: &'static str = "bucket_count";
+    const PROPERTIES_NAME: &'static str = "properties";
+    const CUSTOM_PROPERTIES_NAME: &'static str = "custom_properties";
+    const VERSION_KEY: &'static str = "version";
+    const VERSION: u32 = 1;
+
+    /// Converts a JSON object whose values are all strings into a
+    /// `HashMap<String, String>`.
+    ///
+    /// Fails with `Error::JsonSerdeError` when `node` is not an object or when
+    /// any value is not a string.
+    fn deserialize_properties(node: &Value) -> Result<HashMap<String, String>> {
+        let entries = match node.as_object() {
+            Some(entries) => entries,
+            None => {
+                return Err(Error::JsonSerdeError {
+                    message: "Properties must be an object".to_string(),
+                });
+            }
+        };
+
+        entries
+            .iter()
+            .map(|(key, value)| match value.as_str() {
+                Some(text) => Ok((key.clone(), text.to_owned())),
+                None => Err(Error::JsonSerdeError {
+                    message: "Property value must be a string".to_string(),
+                }),
+            })
+            .collect()
+    }
+}
+
+impl JsonSerde for TableDescriptor {
+    fn serialize_json(&self) -> Result<Value> {
+        let mut obj = serde_json::Map::new();
+
+        // Serialize version
+        obj.insert(Self::VERSION_KEY.to_string(), json!(Self::VERSION));
+
+        // Serialize schema
+        obj.insert(
+            Self::SCHEMA_NAME.to_string(),
+            self.schema().serialize_json()?,
+        );
+
+        // Serialize comment if present
+        if let Some(comment) = &self.comment() {
+            obj.insert(Self::COMMENT_NAME.to_string(), json!(comment));
+        }
+
+        // Serialize partition keys
+        let partition_keys: Vec<Value> =
+            self.partition_keys().iter().map(|key| json!(key)).collect();
+        obj.insert(Self::PARTITION_KEY_NAME.to_string(), json!(partition_keys));
+
+        // Serialize table distribution if present
+        if let Some(dist) = &self.table_distribution() {
+            let bucket_keys: Vec<Value> = dist.bucket_keys().iter().map(|key| json!(key)).collect();
+            obj.insert(Self::BUCKET_KEY_NAME.to_string(), json!(bucket_keys));
+
+            if let Some(count) = dist.bucket_count() {
+                obj.insert(Self::BUCKET_COUNT_NAME.to_string(), json!(count));
+            }
+        }
+
+        // Serialize properties
+        obj.insert(Self::PROPERTIES_NAME.to_string(), json!(self.properties()));
+
+        obj.insert(
+            Self::CUSTOM_PROPERTIES_NAME.to_string(),
+            json!(self.custom_properties()),
+        );
+
+        Ok(Value::Object(obj))
+    }
+
+    /// Rebuilds a `TableDescriptor` from its JSON object form.
+    ///
+    /// The schema, partition-key, properties and custom-properties fields are
+    /// required; the comment, bucket-key and bucket-count fields are optional.
+    /// A distribution is only configured on the builder when a bucket count or
+    /// at least one bucket key was read.
+    ///
+    /// # Errors
+    /// Returns a `JsonSerdeError` when a required field is missing, when a field
+    /// has the wrong JSON type, or when the bucket count is not a non-negative
+    /// integer that fits in `i32`. Failures from `Schema::deserialize_json`,
+    /// `Self::deserialize_properties` and `builder.build()` are propagated.
+    fn deserialize_json(node: &Value) -> Result<Self> {
+        let mut builder = TableDescriptor::builder();
+
+        // Schema (required).
+        let schema_node = node.get(Self::SCHEMA_NAME).ok_or_else(|| JsonSerdeError {
+            message: format!("Missing required field: {}", Self::SCHEMA_NAME),
+        })?;
+        let schema = Schema::deserialize_json(schema_node)?;
+        builder = builder.schema(schema);
+
+        // Comment (optional, but must be a string when present). The borrowed
+        // `&str` is handed to the builder directly; no intermediate `String`.
+        if let Some(comment_node) = node.get(Self::COMMENT_NAME) {
+            let comment = comment_node.as_str().ok_or_else(|| JsonSerdeError {
+                message: format!("{} must be a string", Self::COMMENT_NAME),
+            })?;
+            builder = builder.comment(comment);
+        }
+
+        // Partition keys (required array of strings; may be empty).
+        let partition_node = node
+            .get(Self::PARTITION_KEY_NAME)
+            .ok_or_else(|| JsonSerdeError {
+                message: format!("Missing required field: {}", Self::PARTITION_KEY_NAME),
+            })?
+            .as_array()
+            .ok_or_else(|| JsonSerdeError {
+                message: format!("{} must be an array", Self::PARTITION_KEY_NAME),
+            })?;
+
+        let mut partition_keys = Vec::with_capacity(partition_node.len());
+        for key_node in partition_node {
+            partition_keys.push(
+                key_node
+                    .as_str()
+                    .ok_or_else(|| JsonSerdeError {
+                        message: format!("{} element must be a string", Self::PARTITION_KEY_NAME),
+                    })?
+                    .to_owned(),
+            );
+        }
+        builder = builder.partitioned_by(partition_keys);
+
+        // Bucket keys (optional array of strings).
+        let mut bucket_keys = vec![];
+        if let Some(bucket_key_node) = node.get(Self::BUCKET_KEY_NAME) {
+            let bucket_key_node = bucket_key_node.as_array().ok_or_else(|| JsonSerdeError {
+                message: format!("{} must be an array", Self::BUCKET_KEY_NAME),
+            })?;
+
+            for key_node in bucket_key_node {
+                bucket_keys.push(
+                    key_node
+                        .as_str()
+                        .ok_or_else(|| JsonSerdeError {
+                            message: "Bucket key must be a string".to_string(),
+                        })?
+                        .to_owned(),
+                );
+            }
+        }
+
+        // Bucket count (optional). A missing field or an explicit JSON null
+        // means "no bucket count". Any other value must be a non-negative
+        // integer that fits in `i32`; it is rejected rather than silently
+        // dropped or wrapped by an `as` cast.
+        let bucket_count = match node.get(Self::BUCKET_COUNT_NAME) {
+            None | Some(Value::Null) => None,
+            Some(bucket_count_node) => {
+                let raw = bucket_count_node.as_u64().ok_or_else(|| JsonSerdeError {
+                    message: format!(
+                        "{} must be a non-negative integer",
+                        Self::BUCKET_COUNT_NAME
+                    ),
+                })?;
+                let count = i32::try_from(raw).map_err(|_| JsonSerdeError {
+                    message: format!(
+                        "{} value {raw} does not fit in i32",
+                        Self::BUCKET_COUNT_NAME
+                    ),
+                })?;
+                Some(count)
+            }
+        };
+
+        if bucket_count.is_some() || !bucket_keys.is_empty() {
+            builder = builder.distributed_by(bucket_count, bucket_keys);
+        }
+
+        // Properties (required).
+        let properties =
+            Self::deserialize_properties(node.get(Self::PROPERTIES_NAME).ok_or_else(|| {
+                JsonSerdeError {
+                    message: format!("Missing required field: {}", Self::PROPERTIES_NAME),
+                }
+            })?)?;
+        builder = builder.properties(properties);
+
+        // Custom properties (required).
+        let custom_properties = Self::deserialize_properties(
+            node.get(Self::CUSTOM_PROPERTIES_NAME)
+                .ok_or_else(|| JsonSerdeError {
+                    message: format!("Missing required field: {}", Self::CUSTOM_PROPERTIES_NAME),
+                })?,
+        )?;
+        builder = builder.custom_properties(custom_properties);
+
+        builder.build()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::metadata::reassign_field_ids;
+    use crate::metadata::{
+        Column, DataField, DataType, DataTypes as DT, DataTypes, MapType, Schema,
+    };
+
+    /// A column with an explicit id and comment keeps both across a
+    /// serialize -> deserialize round trip, and the id is written under "id".
+    #[test]
+    fn column_id_round_trip_through_json() {
+        let original = Column::new("a", DataTypes::int())
+            .with_id(7)
+            .with_comment("desc");
+
+        let serialized = original.serialize_json().unwrap();
+        let serialized_id = serialized.get("id").and_then(|v| v.as_i64());
+        assert_eq!(serialized_id, Some(7));
+
+        let restored = Column::deserialize_json(&serialized).unwrap();
+        assert_eq!(restored, original);
+    }
+
+    /// Columns built without ids are numbered 0, 1, ...; columns that all carry
+    /// explicit ids keep them.
+    #[test]
+    fn schema_assigns_ids_when_absent_and_preserves_when_present() {
+        // Collects the top-level column ids in declaration order.
+        fn column_ids(schema: &Schema) -> Vec<i32> {
+            schema.columns().iter().map(|col| col.id()).collect()
+        }
+
+        let auto_assigned = Schema::builder()
+            .column("a", DataTypes::int())
+            .column("b", DataTypes::string())
+            .build()
+            .unwrap();
+        assert_eq!(column_ids(&auto_assigned), vec![0, 1]);
+
+        let explicit_columns = vec![
+            Column::new("a", DataTypes::int()).with_id(3),
+            Column::new("b", DataTypes::string()).with_id(7),
+        ];
+        let preserved = Schema::builder()
+            .with_columns(explicit_columns)
+            .build()
+            .unwrap();
+        assert_eq!(column_ids(&preserved), vec![3, 7]);
+    }
+
+    /// Two top-level columns sharing the same explicit id make `build()` fail.
+    #[test]
+    fn schema_rejects_duplicate_ids() {
+        let clashing_columns = vec![
+            Column::new("a", DataTypes::int()).with_id(7),
+            Column::new("b", DataTypes::string()).with_id(7),
+        ];
+
+        let build_result = Schema::builder().with_columns(clashing_columns).build();
+
+        let err = build_result.unwrap_err();
+        assert!(err.to_string().contains("Duplicate field id 7"), "{err}");
+    }
+
+    /// An explicit id of -7 is rejected by `build()` as invalid.
+    #[test]
+    fn schema_rejects_negative_non_sentinel_ids() {
+        let negative_id_column = Column::new("a", DataTypes::int()).with_id(-7);
+
+        let build_result = Schema::builder()
+            .with_columns(vec![negative_id_column])
+            .build();
+
+        let err = build_result.unwrap_err();
+        assert!(err.to_string().contains("invalid id -7"), "{err}");
+    }
+
+    /// A JSON column whose "id" is one past `i32::MAX` fails to deserialize.
+    #[test]
+    fn column_json_id_overflow_errors() {
+        // Borrow a valid "data_type" node from a freshly serialized column.
+        let template = Column::new("a", DataTypes::int()).serialize_json().unwrap();
+        let data_type_node = template.get("data_type").unwrap();
+        let too_large_id = (i32::MAX as i64) + 1;
+
+        let json = serde_json::json!({
+            "name": "a",
+            "data_type": data_type_node,
+            "id": too_large_id,
+        });
+
+        let err = Column::deserialize_json(&json).unwrap_err();
+        assert!(err.to_string().contains("does not fit in i32"), "{err}");
+    }
+
+    /// Mixing a column that has an explicit id with one that has none is rejected.
+    #[test]
+    fn schema_rejects_partially_assigned_ids() {
+        let mixed_columns = vec![
+            Column::new("a", DataTypes::int()).with_id(0),
+            Column::new("b", DataTypes::string()),
+        ];
+
+        let build_result = Schema::builder().with_columns(mixed_columns).build();
+
+        let err = build_result.unwrap_err();
+        let rendered = err.to_string();
+        assert!(rendered.contains("All columns must have an id"), "{err}");
+    }
+
+    /// Auto-assigned ids cover nested row fields as well as top-level columns.
+    ///
+    /// The numbering asserted below is:
+    /// `id`=0, `nested`=1 { `x`=2, `label`=3 }, `deep`=4 { `inner`=5 { `n`=6 } },
+    /// i.e. a column's nested fields are numbered before the next column.
+    /// Per the test name this is meant to match the Java implementation's order.
+    #[test]
+    fn schema_assigns_nested_field_ids_in_java_dfs_order() {
+        let inner_row = DataTypes::row(vec![DataField::new("n", DataTypes::int(), None)]);
+        let nested_row = DataTypes::row(vec![
+            DataField::new("x", DataTypes::int(), None),
+            DataField::new("label", DataTypes::string(), None),
+        ]);
+        let deep_row = DataTypes::row(vec![DataField::new("inner", inner_row, None)]);
+
+        let schema = Schema::builder()
+            .column("id", DataTypes::int())
+            .column("nested", nested_row)
+            .column("deep", deep_row)
+            .build()
+            .unwrap();
+
+        // Top-level ids skip over the ids consumed by nested fields.
+        let top_ids: Vec<i32> = schema.columns().iter().map(|c| c.id()).collect();
+        assert_eq!(top_ids, vec![0, 1, 4]);
+
+        // Returns the `idx`-th field of a Row type; panics for any other type.
+        fn nested_field(dt: &DataType, idx: usize) -> &DataField {
+            match dt {
+                DataType::Row(rt) => &rt.fields()[idx],
+                _ => panic!("not a Row"),
+            }
+        }
+        let nested_dt = schema.columns()[1].data_type();
+        assert_eq!(nested_field(nested_dt, 0).field_id(), 2); // x
+        assert_eq!(nested_field(nested_dt, 1).field_id(), 3); // label
+
+        let deep_dt = schema.columns()[2].data_type();
+        let inner_field = nested_field(deep_dt, 0); // inner
+        assert_eq!(inner_field.field_id(), 5);
+        let n_field = nested_field(inner_field.data_type(), 0); // n
+        assert_eq!(n_field.field_id(), 6);
+
+        // The highest id is the last one handed out (the innermost field `n`).
+        assert_eq!(schema.highest_field_id(), 6);
+
+        // No top-level column is left with the "unassigned" sentinel.
+        for c in schema.columns() {
+            assert_ne!(c.id(), UNASSIGNED_FIELD_ID);
+        }
+    }
+
+    /// Row fields nested inside an array element type also receive ids:
+    /// `id`=0, `events`=1, element fields `seq`=2 and `label`=3.
+    #[test]
+    fn schema_array_of_row_assigns_nested_ids() {
+        let element_row = DataTypes::row(vec![
+            DataField::new("seq", DataTypes::int(), None),
+            DataField::new("label", DataTypes::string(), None),
+        ]);
+        let schema = Schema::builder()
+            .column("id", DataTypes::int())
+            .column("events", DataTypes::array(element_row))
+            .build()
+            .unwrap();
+
+        assert_eq!(schema.highest_field_id(), 3);
+
+        // Drill through Array -> Row to reach the element's fields.
+        let element_fields = match schema.columns()[1].data_type() {
+            DataType::Array(array_type) => match array_type.get_element_type() {
+                DataType::Row(row_type) => row_type.fields(),
+                _ => unreachable!(),
+            },
+            _ => unreachable!(),
+        };
+        assert_eq!(element_fields[0].field_id(), 2);
+        assert_eq!(element_fields[1].field_id(), 3);
+    }
+
+    /// A schema containing a nested row survives a JSON round trip, including
+    /// the serialized "highest_field_id" and the top-level column ids.
+    #[test]
+    fn schema_nested_row_round_trips_through_json() {
+        let nested_row = DataTypes::row(vec![
+            DataField::new("x", DataTypes::int(), None),
+            DataField::new("label", DataTypes::string(), None),
+        ]);
+        let original = Schema::builder()
+            .column("id", DataTypes::int())
+            .column("nested", nested_row)
+            .build()
+            .unwrap();
+
+        let serialized = original.serialize_json().unwrap();
+        let serialized_highest = serialized
+            .get("highest_field_id")
+            .and_then(|v| v.as_i64());
+        assert_eq!(serialized_highest, Some(3));
+
+        let restored = Schema::deserialize_json(&serialized).unwrap();
+        assert_eq!(restored.highest_field_id(), 3);
+
+        let restored_ids = restored
+            .columns()
+            .iter()
+            .map(|col| col.id())
+            .collect::<Vec<_>>();
+        assert_eq!(restored_ids, vec![0, 1]);
+
+        assert_eq!(restored, original);
+    }
+
+    /// A nested field id (0) that collides with a top-level column id (0) is
+    /// reported as a duplicate.
+    #[test]
+    fn schema_rejects_duplicate_nested_field_ids() {
+        let nested_row = DataTypes::row(vec![
+            DataField::with_field_id("x", DT::int(), None, 0),
+            DataField::with_field_id("y", DT::int(), None, 2),
+        ]);
+        let columns = vec![
+            Column::new("a", DT::int()).with_id(0),
+            Column::new("b", nested_row).with_id(1),
+        ];
+
+        let build_result = Schema::builder().with_columns(columns).build();
+
+        let err = build_result.unwrap_err();
+        assert!(err.to_string().contains("Duplicate field id 0"), "{err}");
+    }
+
+    /// Top-level columns with explicit ids combined with a nested field that has
+    /// no id are rejected.
+    #[test]
+    fn schema_rejects_partially_assigned_nested_field_ids() {
+        let nested_row = DataTypes::row(vec![DataField::new("x", DT::int(), None)]);
+        let columns = vec![
+            Column::new("a", DT::int()).with_id(0),
+            Column::new("b", nested_row).with_id(1),
+        ];
+
+        let build_result = Schema::builder().with_columns(columns).build();
+
+        let err = build_result.unwrap_err();
+        let rendered = err.to_string();
+        assert!(
+            rendered.contains("nested DataField ids are unassigned"),
+            "{err}"
+        );
+    }
+
+    /// Explicit ids are kept verbatim even when nested ids are not contiguous
+    /// with their parent's id.
+    #[test]
+    fn schema_preserves_nested_ids_with_gaps() {
+        // Layout under test: f0=0, f1=1 (nested n0=6, n1=7, n2=8 with
+        // n2.m1=11), f2=2 (nested n0=9, n1=10).
+        let inner_for_n2 = DataTypes::row(vec![DataField::with_field_id(
+            "m1",
+            DataTypes::tinyint(),
+            None,
+            11,
+        )]);
+        let f1_row = DataTypes::row(vec![
+            DataField::with_field_id("n0", DataTypes::tinyint(), None, 6),
+            DataField::with_field_id("n1", DataTypes::string(), None, 7),
+            DataField::with_field_id("n2", inner_for_n2, None, 8),
+        ]);
+        let f2_row = DataTypes::row(vec![
+            DataField::with_field_id("n0", DataTypes::tinyint(), None, 9),
+            DataField::with_field_id("n1", DataTypes::string(), None, 10),
+        ]);
+
+        let schema = Schema::builder()
+            .with_columns(vec![
+                Column::new("f0", DataTypes::string().as_non_nullable()).with_id(0),
+                Column::new("f1", f1_row).with_id(1),
+                Column::new("f2", f2_row).with_id(2),
+            ])
+            .build()
+            .unwrap();
+
+        let top_ids: Vec<i32> = schema.columns().iter().map(|c| c.id()).collect();
+        assert_eq!(top_ids, vec![0, 1, 2]);
+
+        // Returns the fields of a Row type; panics for any other type.
+        fn row_fields(dt: &DataType) -> &[DataField] {
+            match dt {
+                DataType::Row(rt) => rt.fields(),
+                _ => panic!("not a Row"),
+            }
+        }
+        let f1_fields = row_fields(schema.columns()[1].data_type());
+        assert_eq!(f1_fields[0].field_id(), 6); // n0
+        assert_eq!(f1_fields[1].field_id(), 7); // n1
+        assert_eq!(f1_fields[2].field_id(), 8); // n2
+        let n2_fields = row_fields(f1_fields[2].data_type());
+        assert_eq!(n2_fields[0].field_id(), 11); // m1 — the "gap"
+
+        let f2_fields = row_fields(schema.columns()[2].data_type());
+        assert_eq!(f2_fields[0].field_id(), 9);
+        assert_eq!(f2_fields[1].field_id(), 10);
+
+        // The highest id is the largest explicit id anywhere in the tree (m1).
+        assert_eq!(schema.highest_field_id(), 11);
+    }
+
+    /// JSON without per-column "id" and without "highest_field_id" still
+    /// deserializes; ids are assigned 0..N-1 and the primary key is kept.
+    #[test]
+    fn schema_deserializes_legacy_json_without_column_ids() {
+        let payload: Value = serde_json::from_str(
+            r#"{
+                "version": 1,
+                "columns": [
+                    {"name": "a", "data_type": {"type": "INTEGER", "nullable": false}, "comment": "first"},
+                    {"name": "b", "data_type": {"type": "STRING"}, "comment": "second"},
+                    {"name": "c", "data_type": {"type": "CHAR", "nullable": false, "length": 10}, "comment": "third"}
+                ],
+                "primary_key": ["a", "c"]
+            }"#,
+        )
+        .unwrap();
+
+        let restored = Schema::deserialize_json(&payload).expect("legacy JSON must deserialize");
+
+        let assigned_ids: Vec<i32> = restored.columns().iter().map(|col| col.id()).collect();
+        assert_eq!(assigned_ids, vec![0, 1, 2], "missing IDs auto-assigned 0..N-1");
+        assert_eq!(restored.highest_field_id(), 2);
+        assert!(restored.primary_key().is_some());
+    }
+
+    /// A schema with no columns reports -1 as its highest field id, both in
+    /// memory and in its JSON form.
+    #[test]
+    fn empty_schema_has_minus_one_highest_field_id() {
+        let empty = Schema::builder().build().unwrap();
+        assert_eq!(empty.highest_field_id(), -1);
+
+        let serialized = empty.serialize_json().unwrap();
+        let serialized_highest = serialized
+            .get("highest_field_id")
+            .and_then(|v| v.as_i64());
+        assert_eq!(serialized_highest, Some(-1));
+    }
+
+    /// `reassign_field_ids` reaches row fields nested under an array element
+    /// type and under a map value type. With the counter starting at -1 the
+    /// first assigned id is 0, and afterwards the counter equals the last id
+    /// that was handed out.
+    #[test]
+    fn reassign_field_ids_walks_array_map_row() {
+        // Case 1: ARRAY<ROW<a, b>> -> a=0, b=1.
+        let dt = DataTypes::array(DataTypes::row(vec![
+            DataField::new("a", DataTypes::int(), None),
+            DataField::new("b", DataTypes::string(), None),
+        ]));
+        let mut counter = -1_i32;
+        let assigned = reassign_field_ids(&dt, &mut counter);
+        match assigned {
+            DataType::Array(at) => match at.get_element_type() {
+                DataType::Row(rt) => {
+                    assert_eq!(rt.fields()[0].field_id(), 0);
+                    assert_eq!(rt.fields()[1].field_id(), 1);
+                }
+                _ => panic!("expected Row"),
+            },
+            _ => panic!("expected Array"),
+        }
+        assert_eq!(counter, 1);
+
+        // Case 2: MAP<INT, ROW<x>> -> x=0 (fresh counter).
+        let dt = DataType::Map(MapType::new(
+            DataTypes::int(),
+            DataTypes::row(vec![DataField::new("x", DataTypes::int(), None)]),
+        ));
+        let mut counter = -1_i32;
+        let assigned = reassign_field_ids(&dt, &mut counter);
+        let value_type = match &assigned {
+            DataType::Map(mt) => mt.value_type(),
+            _ => panic!("expected Map"),
+        };
+        match value_type {
+            DataType::Row(rt) => assert_eq!(rt.fields()[0].field_id(), 0),
+            _ => panic!("expected Row"),
+        }
+        assert_eq!(counter, 0);
+    }
+
+    /// Every listed data type must equal itself after serialize -> deserialize.
+    #[test]
+    fn test_datatype_json_serde() {
+        let row_sample = DataTypes::row(vec![
+            DataField::new("f1".to_string(), DataTypes::int(), None),
+            DataField::new(
+                "f2".to_string(),
+                DataTypes::string(),
+                Some("desc".to_string()),
+            ),
+        ]);
+
+        let samples = vec![
+            DataTypes::boolean(),
+            DataTypes::tinyint(),
+            DataTypes::smallint(),
+            DataTypes::int().as_non_nullable(),
+            DataTypes::bigint(),
+            DataTypes::float(),
+            DataTypes::double(),
+            DataTypes::char(10),
+            DataTypes::string(),
+            DataTypes::decimal(10, 2),
+            DataTypes::date(),
+            DataTypes::time(),
+            DataTypes::timestamp(),
+            DataTypes::timestamp_ltz(),
+            DataTypes::bytes(),
+            DataTypes::binary(100),
+            DataTypes::array(DataTypes::int()),
+            DataTypes::map(DataTypes::string(), DataTypes::int()),
+            row_sample,
+        ];
+
+        for original in samples {
+            let serialized = original.serialize_json().unwrap();
+            let restored = DataType::deserialize_json(&serialized).unwrap();
+            assert_eq!(original, restored);
+        }
+    }
+
+    /// Out-of-range type parameters are rejected during deserialization with a
+    /// message naming the offending type.
+    #[test]
+    fn test_invalid_datatype_validation() {
+        use serde_json::json;
+
+        // Deserializes `node` and checks that it fails with an error whose
+        // text contains `expected_fragment`.
+        fn assert_rejected(node: &Value, expected_fragment: &str) {
+            let result = DataType::deserialize_json(node);
+            assert!(result.is_err());
+            assert!(
+                result
+                    .unwrap_err()
+                    .to_string()
+                    .contains(expected_fragment)
+            );
+        }
+
+        // Invalid DECIMAL precision (> 38)
+        assert_rejected(
+            &json!({
+                "type": "DECIMAL",
+                "precision": 50,
+                "scale": 2
+            }),
+            "Invalid DECIMAL parameters",
+        );
+
+        // Invalid TIME precision (> 9)
+        assert_rejected(
+            &json!({
+                "type": "TIME_WITHOUT_TIME_ZONE",
+                "precision": 15
+            }),
+            "Invalid TIME_WITHOUT_TIME_ZONE precision",
+        );
+
+        // Invalid TIMESTAMP precision (> 9)
+        assert_rejected(
+            &json!({
+                "type": "TIMESTAMP_WITHOUT_TIME_ZONE",
+                "precision": 20
+            }),
+            "Invalid TIMESTAMP_WITHOUT_TIME_ZONE precision",
+        );
+
+        // Invalid TIMESTAMP_LTZ precision (> 9)
+        assert_rejected(
+            &json!({
+                "type": "TIMESTAMP_WITH_LOCAL_TIME_ZONE",
+                "precision": 10
+            }),
+            "Invalid TIMESTAMP_WITH_LOCAL_TIME_ZONE precision",
+        );
+
+        // Invalid DECIMAL scale (> precision)
+        assert_rejected(
+            &json!({
+                "type": "DECIMAL",
+                "precision": 10,
+                "scale": 15
+            }),
+            "Invalid DECIMAL parameters",
+        );
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/metadata/mod.rs b/fluss-rust/crates/fluss/src/metadata/mod.rs
new file mode 100644
index 0000000000..c1d1b72c69
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/metadata/mod.rs
@@ -0,0 +1,32 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+mod data_lake_format;
+mod database;
+mod datatype;
+mod json_serde;
+mod partition;
+mod schema_util;
+mod table;
+
+pub use data_lake_format::*;
+pub use database::*;
+pub use datatype::*;
+pub use json_serde::*;
+pub use partition::*;
+pub(crate) use schema_util::{UNEXIST_MAPPING, index_mapping};
+pub use table::*;
diff --git a/fluss-rust/crates/fluss/src/metadata/partition.rs b/fluss-rust/crates/fluss/src/metadata/partition.rs
new file mode 100644
index 0000000000..c63fe296c5
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/metadata/partition.rs
@@ -0,0 +1,476 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::error::{Error, Result};
+use crate::proto::{PbKeyValue, PbPartitionInfo, PbPartitionSpec};
+use crate::{PartitionId, TableId};
+use std::collections::HashMap;
+use std::fmt::{Display, Formatter};
+use std::sync::Arc;
+
+/// Represents a partition spec in fluss, stored as a map from partition column
+/// name to partition value.
+///
+/// Partition columns and values are NOT in strict order here; they need to be
+/// re-arranged into the correct order by comparing with a list of strictly
+/// ordered partition keys (see `ResolvedPartitionSpec`).
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct PartitionSpec {
+    // Partition column name -> partition value.
+    partition_spec: HashMap<String, String>,
+}
+
+impl PartitionSpec {
+    /// Builds a spec from any map whose keys and values convert into `String`.
+    pub fn new<K: Into<String>, V: Into<String>>(partition_spec: HashMap<K, V>) -> Self {
+        let partition_spec = partition_spec
+            .into_iter()
+            .map(|(key, value)| (key.into(), value.into()))
+            .collect();
+        Self { partition_spec }
+    }
+
+    /// Returns the underlying column-name -> value map.
+    pub fn get_spec_map(&self) -> &HashMap<String, String> {
+        &self.partition_spec
+    }
+
+    /// Converts to the protobuf form, one `PbKeyValue` per map entry.
+    ///
+    /// Entries come out in the map's iteration order, which is not a
+    /// meaningful partition-key order.
+    pub fn to_pb(&self) -> PbPartitionSpec {
+        let mut partition_key_values = Vec::with_capacity(self.partition_spec.len());
+        for (key, value) in &self.partition_spec {
+            partition_key_values.push(PbKeyValue {
+                key: key.clone(),
+                value: value.clone(),
+            });
+        }
+        PbPartitionSpec {
+            partition_key_values,
+        }
+    }
+
+    /// Builds a spec from the protobuf form. If the same key appears more than
+    /// once, the last occurrence wins.
+    pub fn from_pb(pb: &PbPartitionSpec) -> Self {
+        let mut partition_spec = HashMap::new();
+        for kv in &pb.partition_key_values {
+            partition_spec.insert(kv.key.clone(), kv.value.clone());
+        }
+        Self { partition_spec }
+    }
+}
+
+impl Display for PartitionSpec {
+    /// Formats as `PartitionSpec{<Debug rendering of the map>}`.
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        let spec_map = &self.partition_spec;
+        write!(f, "PartitionSpec{{{spec_map:?}}}")
+    }
+}
+
+/// Represents a partition, which is the resolved version of PartitionSpec. The partition
+/// spec is re-arranged into the correct order by comparing it with a list of strictly ordered
+/// partition keys.
+///
+/// `partition_values[i]` is the value for `partition_keys[i]`. Only `new`
+/// checks that both have the same length.
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub struct ResolvedPartitionSpec {
+    // Ordered partition key names.
+    partition_keys: Arc<[String]>,
+    // Partition values, positionally aligned with `partition_keys`.
+    partition_values: Vec<String>,
+}
+
+/// Separator placed between partition values in a partition name
+/// (`value1$value2$...$valueN`).
+pub const PARTITION_SPEC_SEPARATOR: &str = "$";
+
+impl ResolvedPartitionSpec {
+    /// Creates a resolved spec from ordered partition keys and their values.
+    ///
+    /// # Errors
+    /// Returns `Error::IllegalArgument` when the number of keys and the number
+    /// of values differ.
+    pub fn new(partition_keys: Arc<[String]>, partition_values: Vec<String>) -> Result<Self> {
+        if partition_keys.len() != partition_values.len() {
+            return Err(Error::IllegalArgument {
+                message: "The number of partition keys and partition values should be the same."
+                    .to_string(),
+            });
+        }
+
+        Ok(Self {
+            partition_keys,
+            partition_values,
+        })
+    }
+
+    /// Resolves an unordered `PartitionSpec` against the ordered `partition_keys`.
+    ///
+    /// A key that is absent from `partition_spec` resolves to an empty string.
+    pub fn from_partition_spec(
+        partition_keys: Arc<[String]>,
+        partition_spec: &PartitionSpec,
+    ) -> Self {
+        let partition_values =
+            Self::get_reordered_partition_values(&partition_keys, partition_spec);
+        Self {
+            partition_keys,
+            partition_values,
+        }
+    }
+
+    /// Parses a partition name of the form `value1$value2$...$valueN`.
+    ///
+    /// The number of parsed values is NOT checked against `partition_keys`;
+    /// the other methods tolerate such a mismatch instead of panicking.
+    pub fn from_partition_name(partition_keys: Arc<[String]>, partition_name: &str) -> Self {
+        let partition_values: Vec<String> = partition_name
+            .split(PARTITION_SPEC_SEPARATOR)
+            .map(str::to_string)
+            .collect();
+        Self {
+            partition_keys,
+            partition_values,
+        }
+    }
+
+    /// Parses a qualified partition name of the form
+    /// `key1=value1/key2=value2/.../keyN=valueN`.
+    ///
+    /// Only the first `=` of each segment separates key from value, so values
+    /// may themselves contain `=`.
+    ///
+    /// # Errors
+    /// Returns `Error::IllegalArgument` when a segment contains no `=`.
+    pub fn from_partition_qualified_name(qualified_partition_name: &str) -> Result<Self> {
+        let mut keys = Vec::new();
+        let mut values = Vec::new();
+
+        for pair in qualified_partition_name.split('/') {
+            match pair.split_once('=') {
+                Some((key, value)) => {
+                    keys.push(key.to_string());
+                    values.push(value.to_string());
+                }
+                None => {
+                    return Err(Error::IllegalArgument {
+                        message: format!(
+                            "Invalid partition name format. Expected key=value, got: {pair}"
+                        ),
+                    });
+                }
+            }
+        }
+
+        Ok(Self {
+            partition_keys: Arc::from(keys),
+            partition_values: values,
+        })
+    }
+
+    /// Returns the ordered partition keys.
+    pub fn get_partition_keys(&self) -> &[String] {
+        &self.partition_keys
+    }
+
+    /// Returns the partition values, positionally aligned with the keys.
+    pub fn get_partition_values(&self) -> &[String] {
+        &self.partition_values
+    }
+
+    /// Converts back to an unordered `PartitionSpec`.
+    pub fn to_partition_spec(&self) -> PartitionSpec {
+        // `zip` pairs keys with values without indexing, so a spec with fewer
+        // values than keys cannot cause an out-of-bounds panic.
+        let spec_map: HashMap<String, String> = self
+            .partition_keys
+            .iter()
+            .cloned()
+            .zip(self.partition_values.iter().cloned())
+            .collect();
+        PartitionSpec::new(spec_map)
+    }
+
+    /// Generate the partition name for a partition table with specified partition values.
+    ///
+    /// The partition name is in the following format: value1$value2$...$valueN
+    pub fn get_partition_name(&self) -> String {
+        self.partition_values.join(PARTITION_SPEC_SEPARATOR)
+    }
+
+    /// Returns the qualified partition name for a partition spec.
+    /// The format is: key1=value1/key2=value2/.../keyN=valueN
+    pub fn get_partition_qualified_name(&self) -> String {
+        // `zip` instead of indexing: no panic if values are shorter than keys.
+        self.partition_keys
+            .iter()
+            .zip(self.partition_values.iter())
+            .map(|(key, value)| format!("{key}={value}"))
+            .collect::<Vec<_>>()
+            .join("/")
+    }
+
+    /// Returns whether `other` agrees with this spec on every key `other` names.
+    ///
+    /// For each key of `other`, the value this spec holds for that key is looked
+    /// up; the result is `true` when those values equal `other`'s values
+    /// element by element.
+    ///
+    /// # Errors
+    /// Returns `Error::IllegalArgument` when `other` names a key this spec does
+    /// not have, or when this spec holds no value at that key's position.
+    pub fn contains(&self, other: &ResolvedPartitionSpec) -> Result<bool> {
+        let other_partition_keys = other.get_partition_keys();
+        let other_partition_values = other.get_partition_values();
+
+        let mut expected_partition_values: Vec<&str> =
+            Vec::with_capacity(other_partition_keys.len());
+        for other_partition_key in other_partition_keys {
+            let key_index = self
+                .partition_keys
+                .iter()
+                .position(|k| k == other_partition_key)
+                .ok_or_else(|| Error::IllegalArgument {
+                    message: format!(
+                        "table does not contain partitionKey: {other_partition_key}"
+                    ),
+                })?;
+            let value = self
+                .partition_values
+                .get(key_index)
+                .ok_or_else(|| Error::IllegalArgument {
+                    message: format!("no partition value for partitionKey: {other_partition_key}"),
+                })?;
+            expected_partition_values.push(value.as_str());
+        }
+
+        // Compare value by value rather than via `$`-joined strings: joining
+        // would treat ["a$b", "c"] and ["a", "b$c"] as equal.
+        Ok(expected_partition_values
+            .iter()
+            .copied()
+            .eq(other_partition_values.iter().map(String::as_str)))
+    }
+
+    /// Converts to the protobuf form, emitting key/value pairs in key order.
+    pub fn to_pb(&self) -> PbPartitionSpec {
+        PbPartitionSpec {
+            partition_key_values: self
+                .partition_keys
+                .iter()
+                .zip(self.partition_values.iter())
+                .map(|(k, v)| PbKeyValue {
+                    key: k.clone(),
+                    value: v.clone(),
+                })
+                .collect(),
+        }
+    }
+
+    /// Builds a resolved spec from the protobuf form, keeping the order in
+    /// which the key/value pairs appear in the message.
+    pub fn from_pb(pb: &PbPartitionSpec) -> Self {
+        let partition_keys = pb
+            .partition_key_values
+            .iter()
+            .map(|kv| kv.key.clone())
+            .collect();
+        let partition_values = pb
+            .partition_key_values
+            .iter()
+            .map(|kv| kv.value.clone())
+            .collect();
+
+        Self {
+            partition_keys,
+            partition_values,
+        }
+    }
+
+    /// Returns the values of `partition_spec` ordered like `partition_keys`.
+    ///
+    /// A key missing from the spec yields an empty string at its position.
+    fn get_reordered_partition_values(
+        partition_keys: &Arc<[String]>,
+        partition_spec: &PartitionSpec,
+    ) -> Vec<String> {
+        let partition_spec_map = partition_spec.get_spec_map();
+        partition_keys
+            .iter()
+            .map(|key| partition_spec_map.get(key).cloned().unwrap_or_default())
+            .collect()
+    }
+}
+
+impl Display for ResolvedPartitionSpec {
+    /// Formats as the qualified partition name (`key1=value1/.../keyN=valueN`).
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        let qualified_name = self.get_partition_qualified_name();
+        f.write_str(&qualified_name)
+    }
+}
+
+/// Information of a partition metadata, includes the partition's name and the partition id that
+/// represents the unique identifier of the partition.
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub struct PartitionInfo {
+    // Identifier of the partition.
+    partition_id: PartitionId,
+    // Ordered keys and values from which the partition name is derived.
+    partition_spec: ResolvedPartitionSpec,
+}
+
+impl PartitionInfo {
+    /// Pairs a partition id with its resolved spec.
+    pub fn new(partition_id: PartitionId, partition_spec: ResolvedPartitionSpec) -> Self {
+        Self {
+            partition_id,
+            partition_spec,
+        }
+    }
+
+    /// Get the partition id. The id is globally unique in the Fluss cluster.
+    pub fn get_partition_id(&self) -> PartitionId {
+        self.partition_id
+    }
+
+    /// Get the partition name, i.e. the spec's values joined as `value1$...$valueN`.
+    pub fn get_partition_name(&self) -> String {
+        let spec = &self.partition_spec;
+        spec.get_partition_name()
+    }
+
+    /// Borrows the resolved (ordered) partition spec.
+    pub fn get_resolved_partition_spec(&self) -> &ResolvedPartitionSpec {
+        &self.partition_spec
+    }
+
+    /// Returns a newly built unordered `PartitionSpec` for this partition.
+    pub fn get_partition_spec(&self) -> PartitionSpec {
+        let spec = &self.partition_spec;
+        spec.to_partition_spec()
+    }
+
+    /// Converts to the protobuf form; `remote_data_dir` is always left unset.
+    pub fn to_pb(&self) -> PbPartitionInfo {
+        let partition_spec = self.partition_spec.to_pb();
+        PbPartitionInfo {
+            partition_id: self.partition_id,
+            partition_spec,
+            remote_data_dir: None,
+        }
+    }
+
+    /// Builds from the protobuf form, reading only the id and the spec.
+    pub fn from_pb(pb: &PbPartitionInfo) -> Self {
+        let partition_spec = ResolvedPartitionSpec::from_pb(&pb.partition_spec);
+        Self {
+            partition_id: pb.partition_id,
+            partition_spec,
+        }
+    }
+}
+
+impl Display for PartitionInfo {
+    /// Formats as `Partition{name='<partition name>', id=<partition id>}`.
+    // `Formatter<'_>` spells out the elided lifetime, matching the other
+    // `Display` impls in this file.
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        write!(
+            f,
+            "Partition{{name='{}', id={}}}",
+            self.get_partition_name(),
+            self.partition_id
+        )
+    }
+}
+
+/// Identifies a table partition by the pair (table id, partition id).
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub struct TablePartition {
+    // Id of the table the partition belongs to.
+    table_id: TableId,
+    // Id of the partition.
+    partition_id: PartitionId,
+}
+
+impl TablePartition {
+    /// Creates an identifier for partition `partition_id` of table `table_id`.
+    pub fn new(table_id: TableId, partition_id: PartitionId) -> Self {
+        Self {
+            table_id,
+            partition_id,
+        }
+    }
+
+    /// Returns the table id.
+    // Declared as `TableId` (the field's type) rather than a bare `i64`, so the
+    // getter mirrors `get_partition_id` and follows the alias if it changes.
+    pub fn get_table_id(&self) -> TableId {
+        self.table_id
+    }
+
+    /// Returns the partition id.
+    pub fn get_partition_id(&self) -> PartitionId {
+        self.partition_id
+    }
+}
+
+impl Display for TablePartition {
+    /// Formats as `TablePartition{tableId=<table id>, partitionId=<partition id>}`.
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        let Self {
+            table_id,
+            partition_id,
+        } = self;
+        write!(
+            f,
+            "TablePartition{{tableId={table_id}, partitionId={partition_id}}}"
+        )
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// The plain name joins values with `$`; the qualified name joins
+    /// `key=value` pairs with `/`.
+    #[test]
+    fn test_resolved_partition_spec_name() {
+        let keys: Arc<[String]> = Arc::from(["date".to_string(), "region".to_string()]);
+        let values = vec!["2024-01-15".to_string(), "US".to_string()];
+        let spec = ResolvedPartitionSpec::new(keys, values).unwrap();
+
+        let plain_name = spec.get_partition_name();
+        assert_eq!(plain_name, "2024-01-15$US");
+
+        let qualified_name = spec.get_partition_qualified_name();
+        assert_eq!(qualified_name, "date=2024-01-15/region=US");
+    }
+
+    /// A `$`-separated partition name is split back into its values.
+    #[test]
+    fn test_resolved_partition_spec_from_partition_name() {
+        let keys: Arc<[String]> = Arc::from(["date".to_string(), "region".to_string()]);
+
+        let parsed = ResolvedPartitionSpec::from_partition_name(keys, "2024-01-15$US");
+
+        assert_eq!(parsed.get_partition_values(), &["2024-01-15", "US"]);
+    }
+
+    /// Qualified names parse into ordered keys and values; only the first `=`
+    /// of a segment is a separator, and a segment without `=` is an error.
+    #[test]
+    fn test_resolved_partition_spec_from_qualified_name() {
+        let spec =
+            ResolvedPartitionSpec::from_partition_qualified_name("date=2024-01-15/region=US")
+                .unwrap();
+
+        assert_eq!(spec.get_partition_keys(), &["date", "region"]);
+        assert_eq!(spec.get_partition_values(), &["2024-01-15", "US"]);
+
+        // A value may itself contain '='.
+        let spec = ResolvedPartitionSpec::from_partition_qualified_name("expr=a=b").unwrap();
+        assert_eq!(spec.get_partition_keys(), &["expr"]);
+        assert_eq!(spec.get_partition_values(), &["a=b"]);
+
+        // Segments without '=' (including the empty string) are rejected.
+        assert!(
+            ResolvedPartitionSpec::from_partition_qualified_name("date=2024-01-15/US").is_err()
+        );
+        assert!(ResolvedPartitionSpec::from_partition_qualified_name("").is_err());
+    }
+
+    /// `new` refuses a spec with two keys but only one value.
+    #[test]
+    fn test_resolved_partition_spec_mismatched_lengths() {
+        let two_keys: Arc<[String]> = Arc::from(["date".to_string(), "region".to_string()]);
+        let one_value = vec!["2024-01-15".to_string()];
+
+        let outcome = ResolvedPartitionSpec::new(two_keys, one_value);
+
+        assert!(outcome.is_err());
+    }
+
+    /// `PartitionInfo` exposes the id it was built with and the spec's name.
+    #[test]
+    fn test_partition_info() {
+        let keys: Arc<[String]> = Arc::from(["date".to_string()]);
+        let values = vec!["2024-01-15".to_string()];
+        let resolved = ResolvedPartitionSpec::new(keys, values).unwrap();
+
+        let partition = PartitionInfo::new(42, resolved);
+
+        assert_eq!(partition.get_partition_id(), 42);
+        assert_eq!(partition.get_partition_name(), "2024-01-15");
+    }
+
+    /// `TablePartition` returns the table id and partition id it was built with.
+    #[test]
+    fn test_table_partition() {
+        let (table_id, partition_id) = (100, 42);
+
+        let table_partition = TablePartition::new(table_id, partition_id);
+
+        assert_eq!(table_partition.get_table_id(), 100);
+        assert_eq!(table_partition.get_partition_id(), 42);
+    }
+
+    /// A `PartitionSpec` is unchanged by to_pb -> from_pb. The whole value is
+    /// compared (not a single key) and two entries are used, so a dropped or
+    /// altered entry fails the test.
+    #[test]
+    fn test_partition_spec_pb_roundtrip() {
+        let mut map = HashMap::new();
+        map.insert("date".to_string(), "2024-01-15".to_string());
+        map.insert("region".to_string(), "US".to_string());
+        let spec = PartitionSpec::new(map);
+
+        let pb = spec.to_pb();
+        assert_eq!(pb.partition_key_values.len(), 2);
+
+        let restored = PartitionSpec::from_pb(&pb);
+        assert_eq!(spec, restored);
+    }
+
+    /// A `PartitionInfo` is unchanged by to_pb -> from_pb. Comparing the whole
+    /// value also checks the partition keys, which the id/name comparison
+    /// alone does not.
+    #[test]
+    fn test_partition_info_pb_roundtrip() {
+        let spec = ResolvedPartitionSpec::new(
+            Arc::from(["date".to_string(), "region".to_string()]),
+            vec!["2024-01-15".to_string(), "US".to_string()],
+        )
+        .unwrap();
+        let info = PartitionInfo::new(42, spec);
+
+        let pb = info.to_pb();
+        let restored = PartitionInfo::from_pb(&pb);
+
+        assert_eq!(info.get_partition_id(), restored.get_partition_id());
+        assert_eq!(info.get_partition_name(), restored.get_partition_name());
+        assert_eq!(info, restored);
+    }
+
+    /// `contains` is true for a matching partial spec, false for a partial spec
+    /// with a different value, and an error for a key the full spec lacks.
+    #[test]
+    fn test_contains() {
+        let full_spec = ResolvedPartitionSpec::new(
+            Arc::from(["date".to_string(), "region".to_string()]),
+            vec!["2024-01-15".to_string(), "US".to_string()],
+        )
+        .unwrap();
+
+        // Same value for a subset of the keys.
+        let partial_spec = ResolvedPartitionSpec::new(
+            Arc::from(["date".to_string()]),
+            vec!["2024-01-15".to_string()],
+        )
+        .unwrap();
+        assert!(full_spec.contains(&partial_spec).unwrap());
+
+        // Known key, different value.
+        let other_date = ResolvedPartitionSpec::new(
+            Arc::from(["date".to_string()]),
+            vec!["2024-01-16".to_string()],
+        )
+        .unwrap();
+        assert!(!full_spec.contains(&other_date).unwrap());
+
+        // Key that the full spec does not have.
+        let unknown_key = ResolvedPartitionSpec::new(
+            Arc::from(["country".to_string()]),
+            vec!["US".to_string()],
+        )
+        .unwrap();
+        assert!(full_spec.contains(&unknown_key).is_err());
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/metadata/schema_util.rs b/fluss-rust/crates/fluss/src/metadata/schema_util.rs
new file mode 100644
index 0000000000..498a526eae
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/metadata/schema_util.rs
@@ -0,0 +1,204 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::error::{Error, Result};
+use crate::metadata::{Schema, UNKNOWN_COLUMN_ID};
+use std::collections::{HashMap, HashSet};
+
+/// Sentinel index (`-1`) for an expected column that does not exist in the
+/// origin schema. Used by [`index_mapping`] and [`crate::row::ProjectedRow`].
+pub(crate) const UNEXIST_MAPPING: i32 = -1;
+
+/// Builds a positional mapping from `expected_schema` onto `origin_schema`.
+///
+/// The result has one entry per column of `expected_schema`, in order: the
+/// index of the `origin_schema` column carrying the same column id, or
+/// [`UNEXIST_MAPPING`] when no origin column has that id. Columns are paired
+/// by id rather than by name, which keeps mappings stable across
+/// `ALTER TABLE … RENAME`.
+///
+/// # Errors
+///
+/// Returns [`Error::RowConvertError`] when a column of either schema still
+/// carries [`UNKNOWN_COLUMN_ID`], when a column id occurs twice within one
+/// schema, or when two columns paired by id differ in data type
+/// (nullability is ignored for that comparison).
+pub(crate) fn index_mapping(origin_schema: &Schema, expected_schema: &Schema) -> Result<Vec<i32>> {
+    let origin_columns = origin_schema.columns();
+    let expected_columns = expected_schema.columns();
+
+    // Pass 1: index every origin column by its id, rejecting unassigned and
+    // repeated ids along the way.
+    let mut position_by_id: HashMap<i32, usize> = HashMap::with_capacity(origin_columns.len());
+    for (position, column) in origin_columns.iter().enumerate() {
+        let id = column.id();
+        if id == UNKNOWN_COLUMN_ID {
+            let message = format!(
+                "origin schema column '{}' has no assigned id; cannot build index mapping",
+                column.name()
+            );
+            return Err(Error::RowConvertError { message });
+        }
+        if position_by_id.insert(id, position).is_some() {
+            let message = format!("duplicate column id {} in origin schema", id);
+            return Err(Error::RowConvertError { message });
+        }
+    }
+
+    // Pass 2: resolve each expected column against that index.
+    let mut seen_expected_ids: HashSet<i32> = HashSet::with_capacity(expected_columns.len());
+    let mut mapping = Vec::with_capacity(expected_columns.len());
+    for wanted in expected_columns {
+        let id = wanted.id();
+        if id == UNKNOWN_COLUMN_ID {
+            let message = format!(
+                "expected schema column '{}' has no assigned id; cannot build index mapping",
+                wanted.name()
+            );
+            return Err(Error::RowConvertError { message });
+        }
+        if !seen_expected_ids.insert(id) {
+            let message = format!("duplicate column id {} in expected schema", id);
+            return Err(Error::RowConvertError { message });
+        }
+
+        // No origin column with this id: record the sentinel and move on.
+        let Some(&position) = position_by_id.get(&id) else {
+            mapping.push(UNEXIST_MAPPING);
+            continue;
+        };
+
+        let origin = &origin_columns[position];
+        if !origin.data_type().eq_ignore_nullable(wanted.data_type()) {
+            let message = format!(
+                "Expected datatype of column(id={},name={}) is [{}], while the actual datatype is [{}]",
+                id,
+                wanted.name(),
+                wanted.data_type(),
+                origin.data_type()
+            );
+            return Err(Error::RowConvertError { message });
+        }
+        mapping.push(position as i32);
+    }
+
+    Ok(mapping)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::metadata::{Column, DataType, DataTypes};
+
+    /// Builds a schema from `(name, type)` pairs without explicit column ids,
+    /// leaving id assignment to the schema builder.
+    fn schema_auto(columns: &[(&str, DataType)]) -> Schema {
+        columns
+            .iter()
+            .fold(Schema::builder(), |builder, (name, data_type)| {
+                builder.column(*name, data_type.clone())
+            })
+            .build()
+            .expect("schema build")
+    }
+
+    /// Builds a schema from `(id, name, type)` triples, pinning every column
+    /// to the given id.
+    fn schema_with_ids(columns: &[(i32, &str, DataType)]) -> Schema {
+        let mut cols: Vec<Column> = Vec::with_capacity(columns.len());
+        for (id, name, data_type) in columns {
+            cols.push(Column::new(*name, data_type.clone()).with_id(*id));
+        }
+        Schema::builder()
+            .with_columns(cols)
+            .build()
+            .expect("schema build")
+    }
+
+    /// Three-column origin schema (`a` BIGINT, `b` STRING, `c` INT) shared by
+    /// several tests; ids are assigned by the builder.
+    fn abc_origin() -> Schema {
+        schema_auto(&[
+            ("a", DataTypes::bigint()),
+            ("b", DataTypes::string()),
+            ("c", DataTypes::int()),
+        ])
+    }
+
+    #[test]
+    fn identity_mapping_when_schemas_equal() {
+        let schema = abc_origin();
+        let mapping = index_mapping(&schema, &schema).unwrap();
+        assert_eq!(mapping, vec![0, 1, 2]);
+    }
+
+    #[test]
+    fn projection_subset_in_order() {
+        let origin = abc_origin();
+        let projected =
+            schema_with_ids(&[(0, "a", DataTypes::bigint()), (2, "c", DataTypes::int())]);
+        let mapping = index_mapping(&origin, &projected).unwrap();
+        assert_eq!(mapping, vec![0, 2]);
+    }
+
+    #[test]
+    fn reorder_mapping() {
+        let origin = abc_origin();
+        let reordered = schema_with_ids(&[
+            (2, "c", DataTypes::int()),
+            (0, "a", DataTypes::bigint()),
+            (1, "b", DataTypes::string()),
+        ]);
+        let mapping = index_mapping(&origin, &reordered).unwrap();
+        assert_eq!(mapping, vec![2, 0, 1]);
+    }
+
+    #[test]
+    fn missing_column_returns_sentinel() {
+        let origin = schema_auto(&[("a", DataTypes::bigint())]);
+        let widened = schema_with_ids(&[
+            (0, "a", DataTypes::bigint()),
+            (1, "new_col", DataTypes::string()),
+        ]);
+        let mapping = index_mapping(&origin, &widened).unwrap();
+        assert_eq!(mapping, vec![0, UNEXIST_MAPPING]);
+    }
+
+    #[test]
+    fn rename_preserves_mapping_when_id_matches() {
+        let before = schema_with_ids(&[(0, "old_name", DataTypes::int())]);
+        let after = schema_with_ids(&[(0, "new_name", DataTypes::int())]);
+        let mapping = index_mapping(&before, &after).unwrap();
+        assert_eq!(mapping, vec![0]);
+    }
+
+    #[test]
+    fn drop_then_add_with_same_name_does_not_alias() {
+        // Same name, different id: the two columns must not be paired.
+        let before = schema_with_ids(&[(0, "a", DataTypes::int())]);
+        let after = schema_with_ids(&[(5, "a", DataTypes::int())]);
+        let mapping = index_mapping(&before, &after).unwrap();
+        assert_eq!(mapping, vec![UNEXIST_MAPPING]);
+    }
+
+    #[test]
+    fn datatype_mismatch_returns_error() {
+        let origin = schema_auto(&[("a", DataTypes::bigint())]);
+        let narrowed = schema_with_ids(&[(0, "a", DataTypes::int())]);
+        let msg = index_mapping(&origin, &narrowed).unwrap_err().to_string();
+        for fragment in ["id=0", "name=a", "INT", "BIGINT"] {
+            assert!(msg.contains(fragment), "{msg}");
+        }
+    }
+
+    #[test]
+    fn nullability_difference_does_not_error() {
+        // Primary-key normalization makes the origin non-nullable while
+        // the expected is nullable.
+        let origin = Schema::builder()
+            .column("a", DataTypes::int())
+            .primary_key(["a"])
+            .build()
+            .unwrap();
+        let expected = schema_with_ids(&[(0, "a", DataTypes::int())]);
+        let mapping = index_mapping(&origin, &expected).unwrap();
+        assert_eq!(mapping, vec![0]);
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/metadata/table.rs b/fluss-rust/crates/fluss/src/metadata/table.rs
new file mode 100644
index 0000000000..390bdbfcc9
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/metadata/table.rs
@@ -0,0 +1,1646 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::compression::ArrowCompressionInfo;
+use crate::error::Error::IllegalArgument;
+use crate::error::{Error, Result};
+use crate::metadata::DataLakeFormat;
+use crate::metadata::datatype::{
+    DataField, DataType, RowType, UNASSIGNED_FIELD_ID, reassign_field_ids,
+};
+use crate::{BucketId, PartitionId, TableId};
+use core::fmt;
+use serde::{Deserialize, Serialize};
+use std::collections::{HashMap, HashSet};
+use std::fmt::{Display, Formatter};
+use std::sync::Arc;
+use strum_macros::EnumString;
+
+/// Sentinel (`-1`) for a column whose stable id has not yet been assigned.
+/// [`Column::new`] initialises every column with this value.
+pub const UNKNOWN_COLUMN_ID: i32 = -1;
+
+/// A single table column: name, data type, optional comment and column id.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+pub struct Column {
+    /// Column name.
+    name: String,
+    /// Data type of the column.
+    data_type: DataType,
+    /// Optional comment attached to the column.
+    comment: Option<String>,
+    /// Column id; [`UNKNOWN_COLUMN_ID`] until one has been assigned.
+    id: i32,
+}
+
+impl Column {
+    /// Creates a column named `name` of type `data_type`, with no comment
+    /// and with its id left at [`UNKNOWN_COLUMN_ID`].
+    pub fn new<N: Into<String>>(name: N, data_type: DataType) -> Self {
+        Self {
+            name: name.into(),
+            data_type,
+            comment: None,
+            id: UNKNOWN_COLUMN_ID,
+        }
+    }
+
+    /// Consumes the column and returns it with `comment` attached,
+    /// replacing any previous comment.
+    pub fn with_comment<C: Into<String>>(mut self, comment: C) -> Self {
+        self.comment = Some(comment.into());
+        self
+    }
+
+    /// Returns a copy of this column whose type is replaced by `data_type`;
+    /// name, comment and id are carried over unchanged.
+    pub fn with_data_type(&self, data_type: DataType) -> Self {
+        Self {
+            name: self.name.clone(),
+            // `data_type` is already owned, so it is moved in rather than
+            // cloned a second time.
+            data_type,
+            comment: self.comment.clone(),
+            id: self.id,
+        }
+    }
+
+    /// Consumes the column and returns it with its id set to `id`.
+    pub fn with_id(mut self, id: i32) -> Self {
+        self.id = id;
+        self
+    }
+
+    // Getters...
+
+    /// Returns the column name.
+    pub fn name(&self) -> &str {
+        &self.name
+    }
+
+    /// Returns the column's data type.
+    pub fn data_type(&self) -> &DataType {
+        &self.data_type
+    }
+
+    /// Returns the column comment, if one was set.
+    pub fn comment(&self) -> Option<&str> {
+        self.comment.as_deref()
+    }
+
+    /// Returns the stable column id, or [`UNKNOWN_COLUMN_ID`] when the
+    /// id has not yet been assigned by a [`SchemaBuilder`].
+    pub fn id(&self) -> i32 {
+        self.id
+    }
+}
+
+/// A named primary-key constraint over one or more columns.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+pub struct PrimaryKey {
+    /// Name of the constraint.
+    constraint_name: String,
+    /// Names of the columns that make up the key, in the order given.
+    column_names: Vec<String>,
+}
+
+impl PrimaryKey {
+    /// Creates a primary-key constraint called `constraint_name` over
+    /// `column_names`, kept in the given order.
+    pub fn new<N: Into<String>>(constraint_name: N, column_names: Vec<String>) -> Self {
+        let constraint_name = constraint_name.into();
+        Self {
+            constraint_name,
+            column_names,
+        }
+    }
+
+    /// Returns the name of the constraint.
+    pub fn constraint_name(&self) -> &str {
+        self.constraint_name.as_str()
+    }
+
+    /// Returns the names of the key columns.
+    pub fn column_names(&self) -> &[String] {
+        self.column_names.as_slice()
+    }
+}
+
+/// Walks `data_type` recursively through row, array and map types.
+///
+/// For every row field encountered, `max_id` is raised to the field's id if
+/// that id is assigned and larger, and `has_unassigned` is set to `true` if
+/// the id equals `UNASSIGNED_FIELD_ID`. All other data types are ignored.
+fn collect_field_id_state(data_type: &DataType, max_id: &mut i32, has_unassigned: &mut bool) {
+    match data_type {
+        DataType::Row(row) => {
+            for field in rt_fields(row) {
+                if field.field_id != UNASSIGNED_FIELD_ID {
+                    *max_id = field.field_id.max(*max_id);
+                } else {
+                    *has_unassigned = true;
+                }
+                // A field's own type may nest further rows.
+                collect_field_id_state(&field.data_type, max_id, has_unassigned);
+            }
+        }
+        DataType::Array(array) => {
+            collect_field_id_state(array.get_element_type(), max_id, has_unassigned)
+        }
+        DataType::Map(map) => {
+            collect_field_id_state(map.key_type(), max_id, has_unassigned);
+            collect_field_id_state(map.value_type(), max_id, has_unassigned);
+        }
+        _ => {}
+    }
+
+    /// Returns the fields of `row`; kept local so the traversal above reads
+    /// top-down.
+    fn rt_fields(row: &RowType) -> &[DataField] {
+        row.fields()
+    }
+}
+
+/// Appends to `ids` the id of every row field reachable from `data_type`
+/// (through nested row, array and map types), skipping fields whose id is
+/// `UNASSIGNED_FIELD_ID`. Ids are pushed in depth-first order.
+fn collect_nested_field_ids(data_type: &DataType, ids: &mut Vec<i32>) {
+    match data_type {
+        DataType::Row(row) => {
+            for field in row.fields() {
+                let id = field.field_id;
+                if id != UNASSIGNED_FIELD_ID {
+                    ids.push(id);
+                }
+                collect_nested_field_ids(&field.data_type, ids);
+            }
+        }
+        DataType::Array(array) => {
+            collect_nested_field_ids(array.get_element_type(), ids);
+        }
+        DataType::Map(map) => {
+            collect_nested_field_ids(map.key_type(), ids);
+            collect_nested_field_ids(map.value_type(), ids);
+        }
+        _ => {}
+    }
+}
+
+/// A table schema as produced by [`SchemaBuilder::build`].
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+pub struct Schema {
+    /// Columns in declaration order, with their ids assigned.
+    columns: Vec<Column>,
+    /// Primary-key constraint, if the table has one.
+    primary_key: Option<PrimaryKey>,
+    /// Row type built from `columns` (one field per column).
+    row_type: RowType,
+    /// Names of the auto-increment columns.
+    auto_increment_col_names: Vec<String>,
+    /// Highest field id reported by the builder when ids were assigned or
+    /// validated.
+    highest_field_id: i32,
+}
+
+impl Schema {
+    /// Builds a schema with no columns, no primary key and no
+    /// auto-increment columns.
+    pub fn empty() -> Result<Self> {
+        SchemaBuilder::new().build()
+    }
+
+    /// Returns a fresh [`SchemaBuilder`].
+    pub fn builder() -> SchemaBuilder {
+        SchemaBuilder::new()
+    }
+
+    /// Returns the columns in declaration order.
+    pub fn columns(&self) -> &[Column] {
+        self.columns.as_slice()
+    }
+
+    /// Returns the primary-key constraint, if any.
+    pub fn primary_key(&self) -> Option<&PrimaryKey> {
+        self.primary_key.as_ref()
+    }
+
+    /// Returns the row type built from the columns.
+    pub fn row_type(&self) -> &RowType {
+        &self.row_type
+    }
+
+    /// Returns the position of each primary-key column within `columns`,
+    /// in primary-key order. Key names that match no column are skipped;
+    /// the result is empty when there is no primary key.
+    pub fn primary_key_indexes(&self) -> Vec<usize> {
+        let Some(pk) = self.primary_key.as_ref() else {
+            return Vec::new();
+        };
+        pk.column_names
+            .iter()
+            .filter_map(|pk_name| self.columns.iter().position(|col| col.name == *pk_name))
+            .collect()
+    }
+
+    /// Returns the primary-key column names, or an empty vector when there
+    /// is no primary key.
+    pub fn primary_key_column_names(&self) -> Vec<&str> {
+        match &self.primary_key {
+            Some(pk) => pk.column_names.iter().map(String::as_str).collect(),
+            None => Vec::new(),
+        }
+    }
+
+    /// Returns the names of all columns in declaration order.
+    pub fn column_names(&self) -> Vec<&str> {
+        let mut names = Vec::with_capacity(self.columns.len());
+        for column in &self.columns {
+            names.push(column.name.as_str());
+        }
+        names
+    }
+
+    /// Returns the names of the auto-increment columns.
+    pub fn auto_increment_col_names(&self) -> &Vec<String> {
+        &self.auto_increment_col_names
+    }
+
+    /// Returns the highest field id recorded when the schema was built.
+    pub fn highest_field_id(&self) -> i32 {
+        self.highest_field_id
+    }
+}
+
+/// A schema together with its server-assigned version id.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct SchemaInfo {
+    /// The schema itself.
+    schema: Schema,
+    /// Version id of `schema`.
+    schema_id: i32,
+}
+
+impl SchemaInfo {
+    /// Pairs `schema` with its version id `schema_id`.
+    pub fn new(schema: Schema, schema_id: i32) -> Self {
+        SchemaInfo { schema, schema_id }
+    }
+
+    /// Returns the schema.
+    pub fn schema(&self) -> &Schema {
+        &self.schema
+    }
+
+    /// Returns the schema's version id.
+    pub fn schema_id(&self) -> i32 {
+        self.schema_id
+    }
+
+    /// Consumes the value and returns `(schema, schema_id)`.
+    pub fn into_parts(self) -> (Schema, i32) {
+        let SchemaInfo { schema, schema_id } = self;
+        (schema, schema_id)
+    }
+}
+
+/// Accumulates columns, an optional primary key and auto-increment column
+/// names, and turns them into a validated [`Schema`] via `build`.
+#[derive(Debug, Default)]
+pub struct SchemaBuilder {
+    /// Columns added so far, in insertion order.
+    columns: Vec<Column>,
+    /// Primary-key constraint, if one was declared.
+    primary_key: Option<PrimaryKey>,
+    /// Names of the columns declared as auto-increment.
+    auto_increment_col_names: Vec<String>,
+}
+
+impl SchemaBuilder {
+    /// Creates an empty builder: no columns, no primary key, no
+    /// auto-increment columns.
+    pub fn new() -> Self {
+        Default::default()
+    }
+
+    /// Appends one column per field of `row_type`, using each field's name
+    /// and data type.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `row_type` is not a `DataType::Row`.
+    pub fn with_row_type(self, row_type: &DataType) -> Self {
+        let DataType::Row(row) = row_type else {
+            panic!("data type must be row type")
+        };
+
+        let mut builder = self;
+        for field in row.fields() {
+            builder = builder.column(&field.name, field.data_type.clone());
+        }
+        builder
+    }
+
+    /// Appends a column named `name` of type `data_type`; its id is left
+    /// unassigned.
+    pub fn column<N: Into<String>>(mut self, name: N, data_type: DataType) -> Self {
+        let column = Column::new(name, data_type);
+        self.columns.push(column);
+        self
+    }
+
+    /// Appends all of `columns`, in order, after the columns already
+    /// present in the builder.
+    pub fn with_columns(mut self, columns: Vec<Column>) -> Self {
+        // `columns` is owned: move its elements in instead of cloning each
+        // one and then dropping the originals.
+        self.columns.extend(columns);
+        self
+    }
+
+    /// Attaches `comment` to the most recently added column, replacing any
+    /// comment it already had. Does nothing when no column has been added
+    /// yet.
+    pub fn with_comment<C: Into<String>>(mut self, comment: C) -> Self {
+        if let Some(last) = self.columns.last_mut() {
+            // Set the field in place; cloning the whole column (name and
+            // data type included) just to replace its comment is wasted work.
+            last.comment = Some(comment.into());
+        }
+        self
+    }
+
+    /// Declares the primary key over `column_names`. The constraint is
+    /// named `PK_` followed by the column names joined with `_`.
+    pub fn primary_key<I, S>(self, column_names: I) -> Self
+    where
+        I: IntoIterator<Item = S>,
+        S: Into<String>,
+    {
+        let names = column_names
+            .into_iter()
+            .map(Into::into)
+            .collect::<Vec<String>>();
+        let constraint_name = format!("PK_{}", names.join("_"));
+        self.primary_key_named(constraint_name, names)
+    }
+
+    /// Declares the primary key over `column_names` under the explicit
+    /// name `constraint_name`, replacing any previously declared key.
+    pub fn primary_key_named<N: Into<String>, P: Into<String>>(
+        mut self,
+        constraint_name: N,
+        column_names: Vec<P>,
+    ) -> Self {
+        let column_names: Vec<String> = column_names.into_iter().map(Into::into).collect();
+        self.primary_key = Some(PrimaryKey::new(constraint_name, column_names));
+        self
+    }
+
+    /// Declares `column_name` as an auto-increment column: each newly
+    /// inserted row is assigned the next available value from the
+    /// auto-increment sequence. A table can have at most one such column.
+    ///
+    /// # Errors
+    ///
+    /// Returns `IllegalArgument` if an auto-increment column has already
+    /// been declared on this builder.
+    pub fn enable_auto_increment<N: Into<String>>(mut self, column_name: N) -> Result<Self> {
+        if self.auto_increment_col_names.is_empty() {
+            self.auto_increment_col_names.push(column_name.into());
+            return Ok(self);
+        }
+
+        Err(IllegalArgument {
+            message: "Multiple auto increment columns are not supported yet.".to_string(),
+        })
+    }
+
+    /// Validates the accumulated state and produces a [`Schema`].
+    ///
+    /// Columns are first normalized against the primary key, then field ids
+    /// are assigned or validated, then every auto-increment column name is
+    /// checked against the resulting columns. The schema's row type gets one
+    /// field per column (name, data type, comment and id).
+    ///
+    /// # Errors
+    ///
+    /// Propagates errors from column normalization and id assignment, and
+    /// returns `IllegalArgument` when an auto-increment column name matches
+    /// no column.
+    pub fn build(&self) -> Result<Schema> {
+        let normalized = Self::normalize_columns(&self.columns, self.primary_key.as_ref())?;
+        let (columns, highest_field_id) = Self::assign_all_field_ids(normalized)?;
+
+        for auto_inc_col in &self.auto_increment_col_names {
+            let declared = columns.iter().any(|column| column.name == *auto_inc_col);
+            if !declared {
+                return Err(IllegalArgument {
+                    message: format!(
+                        "Auto increment column '{auto_inc_col}' is not found in the schema columns."
+                    ),
+                });
+            }
+        }
+
+        let row_type = RowType::new(
+            columns
+                .iter()
+                .map(|column| DataField {
+                    name: column.name.clone(),
+                    data_type: column.data_type.clone(),
+                    description: column.comment.clone(),
+                    field_id: column.id,
+                })
+                .collect(),
+        );
+
+        Ok(Schema {
+            columns,
+            primary_key: self.primary_key.clone(),
+            row_type,
+            auto_increment_col_names: self.auto_increment_col_names.clone(),
+            highest_field_id,
+        })
+    }
+
+    /// Assigns or validates field ids for `columns`, returning the columns
+    /// together with the highest id in use.
+    ///
+    /// Top-level ids follow an all-or-none rule; a mix of assigned and
+    /// unassigned column ids is an error. Three outcomes are possible:
+    ///
+    /// * every top-level id is set and no nested field id is unassigned:
+    ///   the ids are validated (non-negative, unique across top-level and
+    ///   nested fields) and the columns are returned unchanged;
+    /// * every top-level id is set but some nested field id is unassigned:
+    ///   an error is returned;
+    /// * no top-level id is set: each column receives the next counter
+    ///   value and its data type is passed through `reassign_field_ids`.
+    ///
+    /// With zero columns the first outcome applies and the returned highest
+    /// id is `-1`.
+    fn assign_all_field_ids(columns: Vec<Column>) -> Result<(Vec<Column>, i32)> {
+        let with_top_id = columns.iter().filter(|c| c.id != UNKNOWN_COLUMN_ID).count();
+        let none_set = with_top_id == 0;
+        let all_top_set = with_top_id == columns.len();
+
+        // Mixed input: some columns carry an id and some do not.
+        if !none_set && !all_top_set {
+            return Err(IllegalArgument {
+                message: "All columns must have an id assigned, or none of them must.".to_string(),
+            });
+        }
+
+        // Survey the nested fields of every column: largest assigned id and
+        // whether any nested id is still unassigned.
+        let mut max_nested_id = -1_i32;
+        let mut has_unassigned_nested = false;
+        for c in &columns {
+            collect_field_id_state(&c.data_type, &mut max_nested_id, &mut has_unassigned_nested);
+        }
+
+        // Everything is pre-assigned: validate only, keep the ids as given.
+        if all_top_set && !has_unassigned_nested {
+            // `seen` is shared by top-level and nested ids, so an id may not
+            // be reused anywhere in the schema.
+            let mut seen: HashSet<i32> = HashSet::new();
+            let mut max_id = -1_i32;
+            for col in &columns {
+                if col.id < 0 {
+                    return Err(IllegalArgument {
+                        message: format!(
+                            "Column '{}' has invalid id {}; ids must be non-negative",
+                            col.name, col.id
+                        ),
+                    });
+                }
+                if !seen.insert(col.id) {
+                    return Err(IllegalArgument {
+                        message: format!("Duplicate field id {} in schema", col.id),
+                    });
+                }
+                max_id = max_id.max(col.id);
+
+                let mut nested_ids = Vec::new();
+                collect_nested_field_ids(&col.data_type, &mut nested_ids);
+                for id in nested_ids {
+                    if id < 0 {
+                        return Err(IllegalArgument {
+                            message: format!(
+                                "Nested DataField in column '{}' has invalid id {}; ids must be non-negative",
+                                col.name, id
+                            ),
+                        });
+                    }
+                    if !seen.insert(id) {
+                        return Err(IllegalArgument {
+                            message: format!(
+                                "Duplicate field id {} in schema (column '{}')",
+                                id, col.name
+                            ),
+                        });
+                    }
+                }
+            }
+            max_id = max_id.max(max_nested_id);
+            return Ok((columns, max_id));
+        }
+
+        // Top-level ids are present but the nested ones are incomplete.
+        if all_top_set && has_unassigned_nested {
+            return Err(IllegalArgument {
+                message: "Top-level column ids are set but some nested DataField ids are unassigned; reassign all or none."
+                    .to_string(),
+            });
+        }
+
+        // No top-level ids: number the columns from 0 using a shared counter.
+        // NOTE(review): `reassign_field_ids` receives the same counter and
+        // presumably advances it for nested fields — confirm in `datatype`.
+        let mut counter: i32 = -1;
+        let new_columns: Vec<Column> = columns
+            .into_iter()
+            .map(|c| {
+                counter += 1;
+                let id = counter;
+                let new_data_type = reassign_field_ids(&c.data_type, &mut counter);
+                Column {
+                    name: c.name,
+                    data_type: new_data_type,
+                    comment: c.comment,
+                    id,
+                }
+            })
+            .collect();
+        Ok((new_columns, counter))
+    }
+
+    /// Assigns or validates top-level column ids under an all-or-none rule.
+    ///
+    /// If no column has an id, columns are numbered `0..N-1` in order. If
+    /// every column has one, the ids are kept after checking that none is
+    /// negative and none is repeated. A mix of the two is an error.
+    #[allow(dead_code)]
+    fn assign_column_ids(columns: Vec<Column>) -> Result<Vec<Column>> {
+        let assigned = columns.iter().filter(|c| c.id != UNKNOWN_COLUMN_ID).count();
+
+        if assigned == 0 {
+            let mut numbered = Vec::with_capacity(columns.len());
+            for (index, column) in columns.into_iter().enumerate() {
+                numbered.push(column.with_id(index as i32));
+            }
+            return Ok(numbered);
+        }
+
+        if assigned != columns.len() {
+            return Err(IllegalArgument {
+                message: "All columns must have an id assigned, or none of them must.".to_string(),
+            });
+        }
+
+        // Every column carries an id: validate range first, then uniqueness.
+        let mut seen: HashSet<i32> = HashSet::with_capacity(columns.len());
+        for col in &columns {
+            let id = col.id;
+            if id < 0 {
+                return Err(IllegalArgument {
+                    message: format!(
+                        "Column '{}' has invalid id {}; ids must be non-negative",
+                        col.name, id
+                    ),
+                });
+            }
+            let first_occurrence = seen.insert(id);
+            if !first_occurrence {
+                return Err(IllegalArgument {
+                    message: format!("Duplicate column id {} in schema", id),
+                });
+            }
+        }
+        Ok(columns)
+    }
+
+    /// Validates `columns` against `primary_key` and returns a normalized
+    /// copy.
+    ///
+    /// Column names must be unique, and every primary-key column must exist
+    /// among `columns`. Primary-key columns whose data type is nullable are
+    /// replaced by a non-nullable variant; all other columns are cloned
+    /// unchanged. Without a primary key the columns are returned as-is.
+    fn normalize_columns(
+        columns: &[Column],
+        primary_key: Option<&PrimaryKey>,
+    ) -> Result<Vec<Column>> {
+        let names: Vec<&String> = columns.iter().map(|c| &c.name).collect();
+        if let Some(duplicates) = Self::find_duplicates(&names) {
+            let message = format!("Duplicate column names found: {duplicates:?}");
+            return Err(Error::invalid_table(message));
+        }
+
+        let pk = match primary_key {
+            Some(pk) => pk,
+            None => return Ok(columns.to_vec()),
+        };
+
+        let pk_set: HashSet<_> = pk.column_names.iter().collect();
+        let has_unknown_pk_column = pk_set.iter().any(|pk_name| !names.contains(pk_name));
+        if has_unknown_pk_column {
+            let message = format!("Primary key columns {pk_set:?} not found in schema");
+            return Err(Error::invalid_table(message));
+        }
+
+        let mut normalized = Vec::with_capacity(columns.len());
+        for col in columns {
+            let needs_non_null = pk_set.contains(&col.name) && col.data_type.is_nullable();
+            let column = if needs_non_null {
+                col.with_data_type(col.data_type.as_non_nullable())
+            } else {
+                col.clone()
+            };
+            normalized.push(column);
+        }
+        Ok(normalized)
+    }
+
+    /// Returns the set of names that occur more than once in `names`, or
+    /// `None` when every name is unique.
+    fn find_duplicates<'a>(names: &'a [&String]) -> Option<HashSet<&'a String>> {
+        let mut seen: HashSet<&String> = HashSet::new();
+        let mut duplicates: HashSet<&'a String> = HashSet::new();
+
+        for &name in names {
+            // `insert` reports `false` when the name was already recorded.
+            let first_occurrence = seen.insert(name);
+            if !first_occurrence {
+                duplicates.insert(name);
+            }
+        }
+
+        if duplicates.is_empty() {
+            return None;
+        }
+        Some(duplicates)
+    }
+}
+
+/// Bucket distribution of a table: an optional bucket count and the bucket
+/// key columns.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
+pub struct TableDistribution {
+    /// Number of buckets, if one was specified.
+    bucket_count: Option<i32>,
+    /// Names of the bucket key columns.
+    bucket_keys: Vec<String>,
+}
+
+impl TableDistribution {
+    /// Returns the names of the bucket key columns.
+    pub fn bucket_keys(&self) -> &[String] {
+        self.bucket_keys.as_slice()
+    }
+
+    /// Returns the bucket count, or `None` when none was specified.
+    pub fn bucket_count(&self) -> Option<i32> {
+        let Self { bucket_count, .. } = self;
+        *bucket_count
+    }
+}
+
+/// Builder for [`TableDescriptor`]; a schema must be supplied before
+/// `build` is called.
+#[derive(Debug, Default)]
+pub struct TableDescriptorBuilder {
+    /// Table schema; `None` until `schema` is called.
+    schema: Option<Schema>,
+    /// Table properties, keyed by name (for example `table.log.format`).
+    properties: HashMap<String, String>,
+    /// Custom properties, kept separately from `properties`.
+    custom_properties: HashMap<String, String>,
+    /// Names of the partition key columns; empty when not partitioned.
+    partition_keys: Arc<[String]>,
+    /// Optional table comment.
+    comment: Option<String>,
+    /// Requested bucket distribution, if any.
+    table_distribution: Option<TableDistribution>,
+}
+
+impl TableDescriptorBuilder {
+    /// Creates an empty builder.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Sets the table schema, replacing any schema set earlier.
+    pub fn schema(mut self, schema: Schema) -> Self {
+        self.schema = Some(schema);
+        self
+    }
+
+    /// Stores `log_format` under the `table.log.format` property.
+    pub fn log_format(mut self, log_format: LogFormat) -> Self {
+        self.properties
+            .insert("table.log.format".to_string(), log_format.to_string());
+        self
+    }
+
+    /// Stores `kv_format` under the `table.kv.format` property.
+    pub fn kv_format(mut self, kv_format: KvFormat) -> Self {
+        self.properties
+            .insert("table.kv.format".to_string(), kv_format.to_string());
+        self
+    }
+
+    /// Sets a single table property, overwriting an existing value for the
+    /// same key.
+    pub fn property<K: Into<String>, V: Into<String>>(mut self, key: K, value: V) -> Self {
+        self.properties.insert(key.into(), value.into());
+        self
+    }
+
+    /// Merges `properties` into the table properties; entries with an
+    /// existing key overwrite the previous value.
+    pub fn properties<K: Into<String>, V: Into<String>>(
+        mut self,
+        properties: HashMap<K, V>,
+    ) -> Self {
+        self.properties
+            .extend(properties.into_iter().map(|(k, v)| (k.into(), v.into())));
+        self
+    }
+
+    /// Sets a single custom property, overwriting an existing value for the
+    /// same key.
+    pub fn custom_property<K: Into<String>, V: Into<String>>(mut self, key: K, value: V) -> Self {
+        self.custom_properties.insert(key.into(), value.into());
+        self
+    }
+
+    /// Merges `custom_properties` into the custom properties; entries with
+    /// an existing key overwrite the previous value.
+    pub fn custom_properties<K: Into<String>, V: Into<String>>(
+        mut self,
+        custom_properties: HashMap<K, V>,
+    ) -> Self {
+        self.custom_properties.extend(
+            custom_properties
+                .into_iter()
+                .map(|(k, v)| (k.into(), v.into())),
+        );
+        self
+    }
+
+    /// Sets the partition key columns, replacing any set earlier.
+    pub fn partitioned_by<P: Into<String>>(mut self, partition_keys: Vec<P>) -> Self {
+        self.partition_keys = Arc::from(
+            partition_keys
+                .into_iter()
+                .map(Into::into)
+                .collect::<Vec<String>>(),
+        );
+        self
+    }
+
+    /// Sets the requested bucket distribution: an optional bucket count and
+    /// the bucket key columns.
+    pub fn distributed_by(mut self, bucket_count: Option<i32>, bucket_keys: Vec<String>) -> Self {
+        self.table_distribution = Some(TableDistribution {
+            bucket_count,
+            bucket_keys,
+        });
+        self
+    }
+
+    /// Sets the table comment.
+    pub fn comment<S: Into<String>>(mut self, comment: S) -> Self {
+        self.comment = Some(comment.into());
+        self
+    }
+
+    /// Consumes the builder and produces a [`TableDescriptor`], normalizing
+    /// the requested distribution against the schema and partition keys.
+    ///
+    /// # Errors
+    ///
+    /// Returns `IllegalArgument` when no schema was set, and propagates any
+    /// error from normalizing the distribution.
+    pub fn build(self) -> Result<TableDescriptor> {
+        // This function already reports failures through `Result`, so a
+        // missing schema is surfaced as an error instead of a panic.
+        let Some(schema) = self.schema else {
+            return Err(IllegalArgument {
+                message: "Schema must be set".to_string(),
+            });
+        };
+        let table_distribution = TableDescriptor::normalize_distribution(
+            &schema,
+            &self.partition_keys,
+            self.table_distribution,
+        )?;
+        Ok(TableDescriptor {
+            schema,
+            comment: self.comment,
+            partition_keys: self.partition_keys,
+            table_distribution,
+            properties: self.properties,
+            custom_properties: self.custom_properties,
+        })
+    }
+}
+
+/// Description of a table: schema, comment, partition keys, bucket
+/// distribution and properties. Built through [`TableDescriptorBuilder`].
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
+pub struct TableDescriptor {
+    /// Table schema.
+    schema: Schema,
+    /// Optional table comment.
+    comment: Option<String>,
+    /// Names of the partition key columns; empty when not partitioned.
+    partition_keys: Arc<[String]>,
+    /// Bucket distribution after normalization, if any.
+    table_distribution: Option<TableDistribution>,
+    /// Table properties, keyed by name.
+    properties: HashMap<String, String>,
+    /// Custom properties, kept separately from `properties`.
+    custom_properties: HashMap<String, String>,
+}
+
+impl TableDescriptor {
+    /// Returns a fresh [`TableDescriptorBuilder`].
+    pub fn builder() -> TableDescriptorBuilder {
+        TableDescriptorBuilder::new()
+    }
+
+    /// Returns the table schema.
+    pub fn schema(&self) -> &Schema {
+        &self.schema
+    }
+
+    /// Returns the bucket key column names, or an empty vector when the
+    /// descriptor has no distribution.
+    pub fn bucket_keys(&self) -> Vec<&str> {
+        match &self.table_distribution {
+            Some(distribution) => distribution
+                .bucket_keys
+                .iter()
+                .map(String::as_str)
+                .collect(),
+            None => Vec::new(),
+        }
+    }
+
+    /// Reports whether the bucket keys are the default ones.
+    ///
+    /// For a table with a primary key, the default is the primary-key
+    /// columns minus the partition keys, compared in order. For a table
+    /// without a primary key, the default is no bucket keys at all.
+    ///
+    /// # Errors
+    ///
+    /// Propagates the error raised when the default bucket key of a
+    /// primary-key table cannot be derived.
+    pub fn is_default_bucket_key(&self) -> Result<bool> {
+        if self.schema.primary_key().is_none() {
+            return Ok(self.bucket_keys().is_empty());
+        }
+
+        let defaults =
+            Self::default_bucket_key_of_primary_key_table(self.schema(), &self.partition_keys)?;
+        let default_names: Vec<&str> = defaults.iter().map(String::as_str).collect();
+        Ok(self.bucket_keys() == default_names)
+    }
+
+    /// Returns `true` when at least one partition key is defined.
+    pub fn is_partitioned(&self) -> bool {
+        !self.partition_keys.is_empty()
+    }
+
+    /// Returns `true` when the schema declares a primary key.
+    pub fn has_primary_key(&self) -> bool {
+        self.schema.primary_key().is_some()
+    }
+
+    /// Returns the partition key column names; empty when the table is not
+    /// partitioned.
+    pub fn partition_keys(&self) -> &[String] {
+        &self.partition_keys
+    }
+
+    /// Returns the bucket distribution, if any.
+    pub fn table_distribution(&self) -> Option<&TableDistribution> {
+        self.table_distribution.as_ref()
+    }
+
+    /// Returns the table properties.
+    pub fn properties(&self) -> &HashMap<String, String> {
+        &self.properties
+    }
+
+    /// Returns the custom properties.
+    pub fn custom_properties(&self) -> &HashMap<String, String> {
+        &self.custom_properties
+    }
+
+    /// Returns the value of the `table.replication.factor` property parsed
+    /// as an integer.
+    ///
+    /// # Errors
+    ///
+    /// Returns an invalid-table error when the property is absent or cannot
+    /// be parsed as an `i32`.
+    pub fn replication_factor(&self) -> Result<i32> {
+        let Some(raw) = self.properties.get("table.replication.factor") else {
+            return Err(Error::invalid_table("Replication factor is not set"));
+        };
+        match raw.parse::<i32>() {
+            Ok(factor) => Ok(factor),
+            Err(_e) => Err(Error::invalid_table(
+                "Replication factor can't be converted to int",
+            )),
+        }
+    }
+
+    /// Returns a copy of this descriptor whose table properties are
+    /// replaced (not merged) by `new_properties`; every other part is
+    /// cloned unchanged.
+    pub fn with_properties<K: Into<String>, V: Into<String>>(
+        &self,
+        new_properties: HashMap<K, V>,
+    ) -> Self {
+        let properties: HashMap<String, String> = new_properties
+            .into_iter()
+            .map(|(k, v)| (k.into(), v.into()))
+            .collect();
+        let mut replaced = self.clone();
+        replaced.properties = properties;
+        replaced
+    }
+
+    /// Returns a copy of this descriptor with the
+    /// `table.replication.factor` property set to `new_replication_factor`;
+    /// all other properties are kept.
+    pub fn with_replication_factor(&self, new_replication_factor: i32) -> Self {
+        let key = "table.replication.factor".to_string();
+        let value = new_replication_factor.to_string();
+
+        let mut updated = self.properties.clone();
+        updated.insert(key, value);
+        self.with_properties(updated)
+    }
+
+    /// Returns a copy of this descriptor whose distribution uses
+    /// `new_bucket_count`. Existing bucket keys are kept; when the
+    /// descriptor had no distribution, the new one has no bucket keys.
+    pub fn with_bucket_count(&self, new_bucket_count: i32) -> Self {
+        let bucket_keys = match &self.table_distribution {
+            Some(existing) => existing.bucket_keys.clone(),
+            None => Vec::new(),
+        };
+
+        let mut updated = self.clone();
+        updated.table_distribution = Some(TableDistribution {
+            bucket_count: Some(new_bucket_count),
+            bucket_keys,
+        });
+        updated
+    }
+
+    /// Returns the table comment as a string slice, or `None` when no comment is set.
+    pub fn comment(&self) -> Option<&str> {
+        self.comment.as_deref()
+    }
+
+    /// Derives the default bucket keys of a primary-key table: the primary-key
+    /// columns, in their declared order, with every partition key removed.
+    ///
+    /// Returns an invalid-table error when the schema has no primary key, or
+    /// when every primary-key column is also a partition key (which would leave
+    /// no column to bucket by).
+    fn default_bucket_key_of_primary_key_table(
+        schema: &Schema,
+        partition_keys: &[String],
+    ) -> Result<Vec<String>> {
+        // Report a missing primary key as an error instead of panicking: this
+        // helper already returns `Result`, so callers can surface the problem.
+        let primary_key = schema
+            .primary_key()
+            .ok_or_else(|| Error::invalid_table("Primary key must be set"))?;
+
+        let bucket_keys: Vec<String> = primary_key
+            .column_names()
+            .iter()
+            .filter(|k| !partition_keys.contains(*k))
+            .cloned()
+            .collect();
+
+        if bucket_keys.is_empty() {
+            return Err(Error::invalid_table(format!(
+                "Primary Key constraint {:?} should not be same with partition fields {:?}.",
+                primary_key.column_names(),
+                partition_keys
+            )));
+        }
+
+        Ok(bucket_keys)
+    }
+
+    /// Validates a user-supplied distribution against the schema and partition
+    /// keys, filling in default bucket keys where needed.
+    ///
+    /// * No distribution and no primary key: returns `Ok(None)`.
+    /// * No distribution but a primary key: returns a distribution with no
+    ///   bucket count and the default bucket keys.
+    /// * A distribution whose bucket keys overlap the partition keys: error.
+    /// * A distribution on a primary-key table with empty bucket keys: the
+    ///   default bucket keys are substituted, keeping the bucket count.
+    /// * A distribution on a primary-key table with bucket keys that are not
+    ///   all primary-key columns: error.
+    /// * Otherwise the distribution is returned unchanged.
+    fn normalize_distribution(
+        schema: &Schema,
+        partition_keys: &[String],
+        origin_distribution: Option<TableDistribution>,
+    ) -> Result<Option<TableDistribution>> {
+        let primary_key = schema.primary_key();
+
+        let distribution = match origin_distribution {
+            Some(distribution) => distribution,
+            None => {
+                if primary_key.is_none() {
+                    return Ok(None);
+                }
+                let bucket_keys =
+                    Self::default_bucket_key_of_primary_key_table(schema, partition_keys)?;
+                return Ok(Some(TableDistribution {
+                    bucket_count: None,
+                    bucket_keys,
+                }));
+            }
+        };
+
+        let overlaps_partition_keys = distribution
+            .bucket_keys
+            .iter()
+            .any(|k| partition_keys.contains(k));
+        if overlaps_partition_keys {
+            return Err(Error::invalid_table(format!(
+                "Bucket key {:?} shouldn't include any column in partition keys {:?}.",
+                distribution.bucket_keys, partition_keys
+            )));
+        }
+
+        // Tables without a primary key accept the distribution as given.
+        let pk = match primary_key {
+            Some(pk) => pk,
+            None => return Ok(Some(distribution)),
+        };
+
+        if distribution.bucket_keys.is_empty() {
+            let bucket_keys =
+                Self::default_bucket_key_of_primary_key_table(schema, partition_keys)?;
+            return Ok(Some(TableDistribution {
+                bucket_count: distribution.bucket_count,
+                bucket_keys,
+            }));
+        }
+
+        let pk_columns: HashSet<_> = pk.column_names().iter().collect();
+        let within_primary_key = distribution
+            .bucket_keys
+            .iter()
+            .all(|k| pk_columns.contains(k));
+        if !within_primary_key {
+            return Err(Error::invalid_table(format!(
+                "Bucket keys must be a subset of primary keys excluding partition keys for primary-key tables. \
+                The primary keys are {:?}, the partition keys are {:?}, but the user-defined bucket keys are {:?}.",
+                pk.column_names(),
+                partition_keys,
+                distribution.bucket_keys
+            )));
+        }
+
+        Ok(Some(distribution))
+    }
+}
+
+/// Log format of a table, as read from the `table.log.format` property by
+/// `TableConfig::get_log_format`.
+///
+/// Each variant is displayed and parsed under its upper-case variant name.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
+pub enum LogFormat {
+    ARROW,
+    INDEXED,
+}
+
+impl Display for LogFormat {
+    /// Writes the upper-case variant name (`ARROW` or `INDEXED`).
+    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
+        let name = match self {
+            LogFormat::ARROW => "ARROW",
+            LogFormat::INDEXED => "INDEXED",
+        };
+        f.write_str(name)
+    }
+}
+
+impl LogFormat {
+    /// Parses a log format name, ignoring case (the input is upper-cased
+    /// before comparison).
+    ///
+    /// Returns an invalid-table error that echoes the original input when the
+    /// name is not recognised.
+    pub fn parse(s: &str) -> Result<Self> {
+        let normalized = s.to_uppercase();
+        if normalized == "ARROW" {
+            return Ok(LogFormat::ARROW);
+        }
+        if normalized == "INDEXED" {
+            return Ok(LogFormat::INDEXED);
+        }
+        Err(Error::invalid_table(format!("Unknown log format: {s}")))
+    }
+}
+
+/// KV format of a table, as read from the `table.kv.format` property by
+/// `TableConfig::get_kv_format`.
+///
+/// Each variant is displayed and parsed under its upper-case variant name.
+#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, EnumString)]
+pub enum KvFormat {
+    INDEXED,
+    COMPACTED,
+}
+
+impl Display for KvFormat {
+    /// Writes the upper-case variant name (`INDEXED` or `COMPACTED`).
+    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
+        let name = match self {
+            KvFormat::INDEXED => "INDEXED",
+            KvFormat::COMPACTED => "COMPACTED",
+        };
+        f.write_str(name)
+    }
+}
+
+impl KvFormat {
+    /// Parses a KV format name, ignoring case (the input is upper-cased
+    /// before comparison).
+    ///
+    /// Returns an invalid-table error that echoes the original input when the
+    /// name is not recognised.
+    pub fn parse(s: &str) -> Result<Self> {
+        let normalized = s.to_uppercase();
+        if normalized == "INDEXED" {
+            return Ok(KvFormat::INDEXED);
+        }
+        if normalized == "COMPACTED" {
+            return Ok(KvFormat::COMPACTED);
+        }
+        Err(Error::invalid_table(format!("Unknown kv format: {s}")))
+    }
+}
+
+/// A database name and table name pair; displayed as `database.table`.
+#[derive(Debug, PartialEq, Eq, Hash, Clone, Serialize, Deserialize)]
+pub struct TablePath {
+    // Database name; not validated on construction.
+    database: String,
+    // Table name; not validated on construction.
+    table: String,
+}
+
+impl Display for TablePath {
+    /// Writes the path as `database.table`.
+    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
+        f.write_str(&self.database)?;
+        f.write_str(".")?;
+        f.write_str(&self.table)
+    }
+}
+
+/// Maximum identifier length accepted by `TablePath::detect_invalid_name`,
+/// compared against `str::len` (i.e. measured in bytes).
+const MAX_NAME_LENGTH: usize = 200;
+
+/// Prefix rejected by `TablePath::validate_prefix` because it is reserved for
+/// internal databases, tables and partitions.
+const INTERNAL_NAME_PREFIX: &str = "__";
+
+impl TablePath {
+    /// Creates a table path from a database name and a table name.
+    ///
+    /// No validation is performed here; use [`TablePath::detect_invalid_name`]
+    /// and [`TablePath::validate_prefix`] to check the individual names.
+    pub fn new<D: Into<String>, T: Into<String>>(db: D, tbl: T) -> Self {
+        let database = db.into();
+        let table = tbl.into();
+        TablePath { database, table }
+    }
+
+    /// Returns the database name.
+    #[inline]
+    pub fn database(&self) -> &str {
+        self.database.as_str()
+    }
+
+    /// Returns the table name.
+    #[inline]
+    pub fn table(&self) -> &str {
+        self.table.as_str()
+    }
+
+    /// Checks whether `identifier` is an acceptable database or table name.
+    ///
+    /// Returns `None` when the name is valid, otherwise `Some(reason)`. The
+    /// checks run in this order: empty string, `.`, `..`, length above
+    /// `MAX_NAME_LENGTH` bytes, and finally any character other than ASCII
+    /// alphanumerics, `_` and `-`.
+    pub fn detect_invalid_name(identifier: &str) -> Option<String> {
+        let reason = match identifier {
+            "" => "the empty string is not allowed".to_string(),
+            "." => "'.' is not allowed".to_string(),
+            ".." => "'..' is not allowed".to_string(),
+            _ if identifier.len() > MAX_NAME_LENGTH => format!(
+                "the length of '{identifier}' is longer than the max allowed length {MAX_NAME_LENGTH}"
+            ),
+            _ if Self::contains_invalid_pattern(identifier) => format!(
+                "'{identifier}' contains one or more characters other than ASCII alphanumerics, '_' and '-'"
+            ),
+            _ => return None,
+        };
+        Some(reason)
+    }
+
+    /// Checks that `identifier` does not start with the reserved internal
+    /// prefix; returns `Some(reason)` when it does and `None` otherwise.
+    pub fn validate_prefix(identifier: &str) -> Option<String> {
+        if !identifier.starts_with(INTERNAL_NAME_PREFIX) {
+            return None;
+        }
+        Some(format!(
+            "'{INTERNAL_NAME_PREFIX}' is not allowed as prefix, since it is reserved for internal databases/internal tables/internal partitions in Fluss server"
+        ))
+    }
+
+    // Valid characters for Fluss table names are the ASCII alphanumerics, '_' and '-'.
+    // Returns true as soon as any other character is found.
+    fn contains_invalid_pattern(identifier: &str) -> bool {
+        identifier
+            .chars()
+            .any(|c| !(c.is_ascii_alphanumeric() || c == '_' || c == '-'))
+    }
+}
+
+/// A database name, table name and partition name combo. It's used to represent the physical path of
+/// a bucket. If the bucket belongs to a partition (i.e., the table is a partitioned table),
+/// `partition_name` will be `Some(...)`; otherwise, it will be `None`.
+///
+/// Displayed as `database.table` or, with a partition, `database.table(p=partition)`.
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub struct PhysicalTablePath {
+    // Shared (reference-counted) logical table path.
+    table_path: Arc<TablePath>,
+    // Partition name; `None` when the path does not refer to a partition.
+    partition_name: Option<String>,
+}
+
+impl PhysicalTablePath {
+    /// Creates a physical path without a partition name.
+    pub fn of(table_path: Arc<TablePath>) -> Self {
+        Self::of_partitioned(table_path, None)
+    }
+
+    /// Creates a physical path with an optional partition name.
+    pub fn of_partitioned(table_path: Arc<TablePath>, partition_name: Option<String>) -> Self {
+        PhysicalTablePath {
+            table_path,
+            partition_name,
+        }
+    }
+
+    /// Creates a physical path from raw names, building a new [`TablePath`]
+    /// from `database_name` and `table_name`.
+    pub fn of_with_names<D: Into<String>, T: Into<String>, P: Into<String>>(
+        database_name: D,
+        table_name: T,
+        partition_name: Option<P>,
+    ) -> Self {
+        let table_path = Arc::new(TablePath::new(database_name, table_name));
+        Self::of_partitioned(table_path, partition_name.map(Into::into))
+    }
+
+    /// Returns the logical table path.
+    pub fn get_table_path(&self) -> &TablePath {
+        &self.table_path
+    }
+
+    /// Returns the database name of the underlying table path.
+    pub fn get_database_name(&self) -> &str {
+        self.get_table_path().database()
+    }
+
+    /// Returns the table name of the underlying table path.
+    pub fn get_table_name(&self) -> &str {
+        self.get_table_path().table()
+    }
+
+    /// Returns the partition name, or `None` when this path has no partition.
+    pub fn get_partition_name(&self) -> Option<&String> {
+        self.partition_name.as_ref()
+    }
+}
+
+impl Display for PhysicalTablePath {
+    /// Writes the table path, followed by `(p=<partition>)` when a partition
+    /// name is present.
+    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
+        write!(f, "{}", self.table_path)?;
+        if let Some(partition) = &self.partition_name {
+            write!(f, "(p={})", partition)?;
+        }
+        Ok(())
+    }
+}
+
+/// Resolved information about a table: its path and identifiers, schema,
+/// key layout, bucket count, properties and timestamps.
+#[derive(Debug, Clone)]
+pub struct TableInfo {
+    pub table_path: TablePath,
+    pub table_id: TableId,
+    pub schema_id: i32,
+    pub schema: Schema,
+    /// Copy of `schema.row_type`, taken when the value is built by `TableInfo::new`.
+    pub row_type: RowType,
+    /// Primary-key column names taken from the schema (empty when there is no primary key).
+    pub primary_keys: Vec<String>,
+    /// `primary_keys` with every partition key filtered out.
+    pub physical_primary_keys: Vec<String>,
+    pub bucket_keys: Vec<String>,
+    pub partition_keys: Arc<[String]>,
+    pub num_buckets: i32,
+    pub properties: HashMap<String, String>,
+    /// Typed view built from a clone of `properties` by `TableInfo::new`.
+    pub table_config: TableConfig,
+    pub custom_properties: HashMap<String, String>,
+    pub comment: Option<String>,
+    /// Creation timestamp; the unit is not visible in this file (TODO confirm).
+    pub created_time: i64,
+    /// Last-modification timestamp; the unit is not visible in this file (TODO confirm).
+    pub modified_time: i64,
+}
+
+impl TableInfo {
+    /// Returns the row type of the table (same value as `get_row_type`).
+    pub fn row_type(&self) -> &RowType {
+        &self.row_type
+    }
+}
+
+/// Auto-partition settings read from the `table.auto-partition.*` table properties.
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
+pub struct AutoPartitionStrategy {
+    // `table.auto-partition.enabled`; defaults to `false`.
+    auto_partition_enabled: bool,
+    // `table.auto-partition.key`; `None` when the property is absent.
+    auto_partition_key: Option<String>,
+    // `table.auto-partition.time-unit`; defaults to "DAY".
+    auto_partition_time_unit: String,
+    // `table.auto-partition.num-precreate`; defaults to 2.
+    auto_partition_num_precreate: i32,
+    // `table.auto-partition.num-retention`; defaults to 7.
+    auto_partition_num_retention: i32,
+    // `table.auto-partition.time-zone`; defaults to the system zone's IANA name, else "UTC".
+    auto_partition_timezone: String,
+}
+
+impl AutoPartitionStrategy {
+    /// Builds a strategy from the `table.auto-partition.*` entries of `properties`.
+    ///
+    /// Defaults: disabled, no key, time unit `DAY`, 2 pre-created partitions,
+    /// 7 retained partitions, and the system time zone's IANA name (or `UTC`
+    /// when that name is unavailable). A boolean or integer value that fails
+    /// to parse is treated as absent and falls back to its default.
+    pub fn from(properties: &HashMap<String, String>) -> Self {
+        let auto_partition_enabled = properties
+            .get("table.auto-partition.enabled")
+            .and_then(|raw| raw.parse::<bool>().ok())
+            .unwrap_or(false);
+
+        let auto_partition_key = properties.get("table.auto-partition.key").cloned();
+
+        let auto_partition_time_unit = match properties.get("table.auto-partition.time-unit") {
+            Some(unit) => unit.clone(),
+            None => "DAY".to_string(),
+        };
+
+        let auto_partition_num_precreate = properties
+            .get("table.auto-partition.num-precreate")
+            .and_then(|raw| raw.parse::<i32>().ok())
+            .unwrap_or(2);
+
+        let auto_partition_num_retention = properties
+            .get("table.auto-partition.num-retention")
+            .and_then(|raw| raw.parse::<i32>().ok())
+            .unwrap_or(7);
+
+        // The system time zone is only looked up when no zone is configured.
+        let auto_partition_timezone = match properties.get("table.auto-partition.time-zone") {
+            Some(zone) => zone.clone(),
+            None => jiff::tz::TimeZone::system()
+                .iana_name()
+                .unwrap_or("UTC")
+                .to_string(),
+        };
+
+        Self {
+            auto_partition_enabled,
+            auto_partition_key,
+            auto_partition_time_unit,
+            auto_partition_num_precreate,
+            auto_partition_num_retention,
+            auto_partition_timezone,
+        }
+    }
+
+    /// Returns whether auto partitioning is enabled.
+    pub fn is_auto_partition_enabled(&self) -> bool {
+        self.auto_partition_enabled
+    }
+
+    /// Returns the configured auto-partition key, if any.
+    pub fn key(&self) -> Option<&str> {
+        self.auto_partition_key.as_deref()
+    }
+
+    /// Returns the auto-partition time unit as stored (e.g. the default `DAY`).
+    pub fn time_unit(&self) -> &str {
+        self.auto_partition_time_unit.as_str()
+    }
+
+    /// Returns the configured number of partitions to pre-create.
+    pub fn num_precreate(&self) -> i32 {
+        self.auto_partition_num_precreate
+    }
+
+    /// Returns the configured number of partitions to retain.
+    pub fn num_retention(&self) -> i32 {
+        self.auto_partition_num_retention
+    }
+
+    /// Returns the time zone name used for auto partitioning.
+    pub fn timezone(&self) -> &str {
+        self.auto_partition_timezone.as_str()
+    }
+}
+
+/// Typed accessors over a table's raw string properties.
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
+pub struct TableConfig {
+    /// Raw property map that the typed getters read from.
+    pub properties: HashMap<String, String>,
+}
+
+impl TableConfig {
+    /// Wraps the given raw properties in a `TableConfig`.
+    pub fn from_properties(properties: HashMap<String, String>) -> Self {
+        TableConfig { properties }
+    }
+
+    /// Builds the Arrow compression settings from the stored properties.
+    pub fn get_arrow_compression_info(&self) -> Result<ArrowCompressionInfo> {
+        ArrowCompressionInfo::from_conf(&self.properties)
+    }
+
+    /// Returns the data lake format if configured, or None if not set.
+    ///
+    /// Returns an error when `table.datalake.format` is present but cannot be parsed.
+    pub fn get_datalake_format(&self) -> Result<Option<DataLakeFormat>> {
+        self.properties
+            .get("table.datalake.format")
+            .map(|f| f.parse().map_err(Error::from))
+            .transpose()
+    }
+
+    /// Returns the KV format from `table.kv.format`, defaulting to `COMPACTED`
+    /// when the property is absent.
+    ///
+    /// The value is parsed with [`KvFormat::parse`], so it is matched
+    /// case-insensitively, the same way `get_log_format` treats
+    /// `table.log.format`. An unknown value yields an invalid-table error.
+    pub fn get_kv_format(&self) -> Result<KvFormat> {
+        // TODO: Consolidate configurations logic, constants, defaults in a single place
+        const DEFAULT_KV_FORMAT: &str = "COMPACTED";
+        let kv_format = self
+            .properties
+            .get("table.kv.format")
+            .map(String::as_str)
+            .unwrap_or(DEFAULT_KV_FORMAT);
+        KvFormat::parse(kv_format)
+    }
+
+    /// Returns the log format from `table.log.format`, defaulting to `ARROW`
+    /// when the property is absent. An unknown value yields an invalid-table error.
+    pub fn get_log_format(&self) -> Result<LogFormat> {
+        // TODO: Consolidate configurations logic, constants, defaults in a single place
+        const DEFAULT_LOG_FORMAT: &str = "ARROW";
+        let log_format = self
+            .properties
+            .get("table.log.format")
+            .map(String::as_str)
+            .unwrap_or(DEFAULT_LOG_FORMAT);
+        LogFormat::parse(log_format)
+    }
+
+    /// Builds the auto-partition strategy from the stored properties.
+    pub fn get_auto_partition_strategy(&self) -> AutoPartitionStrategy {
+        AutoPartitionStrategy::from(&self.properties)
+    }
+}
+
+impl TableInfo {
+    /// Builds a `TableInfo` from a [`TableDescriptor`] plus the identifiers and
+    /// timestamps that the descriptor does not carry.
+    ///
+    /// # Panics
+    ///
+    /// Panics when the descriptor has no table distribution, or when the
+    /// distribution has no bucket count: both are required to fill in
+    /// `bucket_keys` and `num_buckets`.
+    pub fn of(
+        table_path: TablePath,
+        table_id: TableId,
+        schema_id: i32,
+        table_descriptor: TableDescriptor,
+        created_time: i64,
+        modified_time: i64,
+    ) -> TableInfo {
+        let TableDescriptor {
+            schema,
+            table_distribution,
+            comment,
+            partition_keys,
+            properties,
+            custom_properties,
+        } = table_descriptor;
+        let TableDistribution {
+            bucket_count,
+            bucket_keys,
+        } = table_distribution
+            .expect("TableInfo::of requires a table descriptor with a table distribution");
+        let num_buckets =
+            bucket_count.expect("TableInfo::of requires a table distribution with a bucket count");
+        TableInfo::new(
+            table_path,
+            table_id,
+            schema_id,
+            schema,
+            bucket_keys,
+            partition_keys,
+            num_buckets,
+            properties,
+            custom_properties,
+            comment,
+            created_time,
+            modified_time,
+        )
+    }
+
+    /// Creates a `TableInfo` from its parts.
+    ///
+    /// The derived fields are computed here: `row_type` is cloned from the
+    /// schema, `primary_keys` come from the schema's primary-key column names,
+    /// `physical_primary_keys` are the primary keys minus the partition keys,
+    /// and `table_config` wraps a clone of `properties`.
+    #[allow(clippy::too_many_arguments)]
+    pub fn new(
+        table_path: TablePath,
+        table_id: TableId,
+        schema_id: i32,
+        schema: Schema,
+        bucket_keys: Vec<String>,
+        partition_keys: Arc<[String]>,
+        num_buckets: i32,
+        properties: HashMap<String, String>,
+        custom_properties: HashMap<String, String>,
+        comment: Option<String>,
+        created_time: i64,
+        modified_time: i64,
+    ) -> Self {
+        let row_type = schema.row_type.clone();
+        let primary_keys: Vec<String> = schema
+            .primary_key_column_names()
+            .iter()
+            .map(|col| (*col).to_string())
+            .collect();
+        let physical_primary_keys =
+            Self::generate_physical_primary_key(&primary_keys, &partition_keys);
+        let table_config = TableConfig::from_properties(properties.clone());
+
+        TableInfo {
+            table_path,
+            table_id,
+            schema_id,
+            schema,
+            row_type,
+            primary_keys,
+            physical_primary_keys,
+            bucket_keys,
+            partition_keys,
+            num_buckets,
+            properties,
+            table_config,
+            custom_properties,
+            comment,
+            created_time,
+            modified_time,
+        }
+    }
+
+    /// Returns the table path.
+    pub fn get_table_path(&self) -> &TablePath {
+        &self.table_path
+    }
+
+    /// Returns the table id.
+    pub fn get_table_id(&self) -> i64 {
+        self.table_id
+    }
+
+    /// Returns the schema id.
+    pub fn get_schema_id(&self) -> i32 {
+        self.schema_id
+    }
+
+    /// Returns the table schema.
+    pub fn get_schema(&self) -> &Schema {
+        &self.schema
+    }
+
+    /// Returns the row type of the table.
+    pub fn get_row_type(&self) -> &RowType {
+        &self.row_type
+    }
+
+    /// Returns `true` when the table has at least one primary-key column.
+    pub fn has_primary_key(&self) -> bool {
+        !self.primary_keys.is_empty()
+    }
+
+    /// Returns the primary-key column names (including any partition keys).
+    pub fn get_primary_keys(&self) -> &Vec<String> {
+        &self.primary_keys
+    }
+
+    /// Returns the primary-key column names with the partition keys removed.
+    pub fn get_physical_primary_keys(&self) -> &[String] {
+        &self.physical_primary_keys
+    }
+
+    /// Returns `true` when at least one bucket key is configured.
+    pub fn has_bucket_key(&self) -> bool {
+        !self.bucket_keys.is_empty()
+    }
+
+    /// Returns `true` when the bucket keys are the defaults: the physical
+    /// primary keys for a primary-key table, or no bucket keys otherwise.
+    pub fn is_default_bucket_key(&self) -> bool {
+        if self.has_primary_key() {
+            self.bucket_keys == self.physical_primary_keys
+        } else {
+            self.bucket_keys.is_empty()
+        }
+    }
+
+    /// Returns the bucket keys.
+    pub fn get_bucket_keys(&self) -> &[String] {
+        &self.bucket_keys
+    }
+
+    /// Returns `true` when the table has at least one partition key.
+    pub fn is_partitioned(&self) -> bool {
+        !self.partition_keys.is_empty()
+    }
+
+    /// Returns `true` when the table is partitioned and auto partitioning is
+    /// enabled in its configuration.
+    pub fn is_auto_partitioned(&self) -> bool {
+        self.is_partitioned()
+            && self
+                .table_config
+                .get_auto_partition_strategy()
+                .is_auto_partition_enabled()
+    }
+
+    /// Returns the partition keys.
+    pub fn get_partition_keys(&self) -> &Arc<[String]> {
+        &self.partition_keys
+    }
+
+    /// Returns the number of buckets.
+    pub fn get_num_buckets(&self) -> i32 {
+        self.num_buckets
+    }
+
+    /// Returns the raw table properties.
+    pub fn get_properties(&self) -> &HashMap<String, String> {
+        &self.properties
+    }
+
+    /// Returns the typed configuration view built from the table properties.
+    pub fn get_table_config(&self) -> &TableConfig {
+        &self.table_config
+    }
+
+    /// Returns the custom properties.
+    pub fn get_custom_properties(&self) -> &HashMap<String, String> {
+        &self.custom_properties
+    }
+
+    /// Returns the table comment, or `None` when no comment is set.
+    pub fn get_comment(&self) -> Option<&str> {
+        self.comment.as_deref()
+    }
+
+    /// Returns the creation timestamp.
+    pub fn get_created_time(&self) -> i64 {
+        self.created_time
+    }
+
+    /// Returns the last-modification timestamp.
+    pub fn get_modified_time(&self) -> i64 {
+        self.modified_time
+    }
+
+    /// Rebuilds a [`TableDescriptor`] from this table info.
+    ///
+    /// The schema, partition keys, bucket count and keys, properties, custom
+    /// properties and (when present) the comment are passed to the builder;
+    /// any error from `build` is returned.
+    pub fn to_table_descriptor(&self) -> Result<TableDescriptor> {
+        let mut builder = TableDescriptor::builder()
+            .schema(self.schema.clone())
+            .partitioned_by(self.partition_keys.to_vec())
+            .distributed_by(Some(self.num_buckets), self.bucket_keys.clone())
+            .properties(self.properties.clone())
+            .custom_properties(self.custom_properties.clone());
+
+        if let Some(comment) = &self.comment {
+            builder = builder.comment(comment.clone());
+        }
+
+        builder.build()
+    }
+
+    /// Returns `primary_keys` without the entries that are also partition
+    /// keys, preserving the original order.
+    fn generate_physical_primary_key(
+        primary_keys: &[String],
+        partition_keys: &[String],
+    ) -> Vec<String> {
+        primary_keys
+            .iter()
+            .filter(|pk| !partition_keys.contains(*pk))
+            .cloned()
+            .collect()
+    }
+}
+
+impl Display for TableInfo {
+    /// Writes a single-line `TableInfo{ ... }` summary. `row_type`,
+    /// `primary_keys` and `table_config` are not part of the output.
+    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
+        let Self {
+            table_path,
+            table_id,
+            schema_id,
+            schema,
+            physical_primary_keys,
+            bucket_keys,
+            partition_keys,
+            num_buckets,
+            properties,
+            custom_properties,
+            comment,
+            created_time,
+            modified_time,
+            ..
+        } = self;
+        write!(
+            f,
+            "TableInfo{{ table_path={:?}, table_id={}, schema_id={}, schema={:?}, physical_primary_keys={:?}, bucket_keys={:?}, partition_keys={:?}, num_buckets={}, properties={:?}, custom_properties={:?}, comment={:?}, created_time={}, modified_time={} }}",
+            table_path,
+            table_id,
+            schema_id,
+            schema,
+            physical_primary_keys,
+            bucket_keys,
+            partition_keys,
+            num_buckets,
+            properties,
+            custom_properties,
+            comment,
+            created_time,
+            modified_time
+        )
+    }
+}
+
+/// Identifies a bucket by table id, optional partition id and bucket id.
+#[derive(Debug, Clone, Serialize, Deserialize, Hash, PartialEq, Eq)]
+pub struct TableBucket {
+    table_id: TableId,
+    // `None` when the bucket was created without a partition (see `TableBucket::new`).
+    partition_id: Option<PartitionId>,
+    bucket: BucketId,
+}
+
+impl TableBucket {
+    /// Creates a bucket identifier without a partition id.
+    pub fn new(table_id: TableId, bucket: BucketId) -> Self {
+        Self::new_with_partition(table_id, None, bucket)
+    }
+
+    /// Creates a bucket identifier with an optional partition id.
+    pub fn new_with_partition(
+        table_id: TableId,
+        partition_id: Option<PartitionId>,
+        bucket: BucketId,
+    ) -> Self {
+        Self {
+            table_id,
+            partition_id,
+            bucket,
+        }
+    }
+
+    /// Returns the table id.
+    pub fn table_id(&self) -> TableId {
+        self.table_id
+    }
+
+    /// Returns the bucket id.
+    pub fn bucket_id(&self) -> BucketId {
+        self.bucket
+    }
+
+    /// Returns the partition id, or `None` when the bucket has no partition.
+    pub fn partition_id(&self) -> Option<PartitionId> {
+        self.partition_id
+    }
+}
+
+impl Display for TableBucket {
+    /// Writes `TableBucket(table_id=.., partition_id=.., bucket=..)`, leaving
+    /// out the `partition_id` part when there is no partition.
+    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
+        match self.partition_id {
+            Some(partition_id) => write!(
+                f,
+                "TableBucket(table_id={}, partition_id={}, bucket={})",
+                self.table_id, partition_id, self.bucket
+            ),
+            None => write!(
+                f,
+                "TableBucket(table_id={}, bucket={})",
+                self.table_id, self.bucket
+            ),
+        }
+    }
+}
+
+/// A lake snapshot: a snapshot id together with one offset per table bucket.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct LakeSnapshot {
+    /// Identifier of the snapshot.
+    pub snapshot_id: i64,
+    /// Offset recorded for each table bucket in this snapshot.
+    pub table_buckets_offset: HashMap<TableBucket, i64>,
+}
+
+impl LakeSnapshot {
+    /// Creates a snapshot from its id and per-bucket offsets.
+    pub fn new(snapshot_id: i64, table_buckets_offset: HashMap<TableBucket, i64>) -> Self {
+        Self {
+            snapshot_id,
+            table_buckets_offset,
+        }
+    }
+
+    /// Returns the snapshot id.
+    pub fn snapshot_id(&self) -> i64 {
+        self.snapshot_id
+    }
+
+    /// Returns the offset recorded for each table bucket.
+    pub fn table_buckets_offset(&self) -> &HashMap<TableBucket, i64> {
+        &self.table_buckets_offset
+    }
+}
+
+/// Tests for [`TablePath`] name validation and [`TableInfo::is_auto_partitioned`].
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::metadata::DataTypes;
+
+    /// Asserts that `name` is rejected and that the reported reason contains
+    /// `expected_message`.
+    fn assert_invalid_name(name: &str, expected_message: &str) {
+        let reason = match TablePath::detect_invalid_name(name) {
+            Some(reason) => reason,
+            None => panic!("Expected '{name}' to be invalid, but it was valid"),
+        };
+        assert!(
+            reason.contains(expected_message),
+            "Expected message containing '{}', but got '{}'",
+            expected_message,
+            reason
+        );
+    }
+
+    #[test]
+    fn test_validate() {
+        // A well-formed path passes validation and displays as `db.table`.
+        let path = TablePath::new("db_2-abc3".to_string(), "table-1_abc_2".to_string());
+        assert!(TablePath::detect_invalid_name(path.database()).is_none());
+        assert!(TablePath::detect_invalid_name(path.table()).is_none());
+        assert_eq!(path.to_string(), "db_2-abc3.table-1_abc_2");
+
+        // The reserved prefix is rejected.
+        let prefix_error = TablePath::validate_prefix("__table-1").unwrap();
+        assert!(prefix_error.contains("'__' is not allowed as prefix"));
+
+        // A name of exactly the maximum length is still accepted.
+        let long_name = "a".repeat(200);
+        assert!(TablePath::detect_invalid_name(&long_name).is_none());
+
+        // Each invalid name is paired with a fragment of its expected reason.
+        let invalid_cases = [
+            ("*abc", "'*abc' contains one or more characters other than"),
+            (
+                "table.abc",
+                "'table.abc' contains one or more characters other than",
+            ),
+            ("", "the empty string is not allowed"),
+            (" ", "' ' contains one or more characters other than"),
+            (".", "'.' is not allowed"),
+            ("..", "'..' is not allowed"),
+        ];
+        for &(name, expected_message) in &invalid_cases {
+            assert_invalid_name(name, expected_message);
+        }
+
+        // One character over the limit is rejected.
+        let invalid_long_name = "a".repeat(201);
+        let too_long_message = format!(
+            "the length of '{invalid_long_name}' is longer than the max allowed length {MAX_NAME_LENGTH}"
+        );
+        assert_invalid_name(&invalid_long_name, &too_long_message);
+    }
+
+    #[test]
+    fn test_is_auto_partitioned() {
+        let schema = Schema::builder()
+            .column("id", DataTypes::int())
+            .column("name", DataTypes::string())
+            .primary_key(vec!["id".to_string()])
+            .build()
+            .unwrap();
+
+        let table_path = TablePath::new("db".to_string(), "tbl".to_string());
+
+        // Builds a table info that differs only in partition keys and properties.
+        let build = |partition_keys: Vec<String>, properties: &HashMap<String, String>| {
+            TableInfo::new(
+                table_path.clone(),
+                1,
+                1,
+                schema.clone(),
+                vec!["id".to_string()],
+                Arc::from(partition_keys),
+                1,
+                properties.clone(),
+                HashMap::new(),
+                None,
+                0,
+                0,
+            )
+        };
+        let enabled_key = "table.auto-partition.enabled".to_string();
+
+        // 1. Not partitioned, auto partition disabled
+        let mut properties = HashMap::new();
+        assert!(!build(vec![], &properties).is_auto_partitioned());
+
+        // 2. Not partitioned, auto partition enabled
+        properties.insert(enabled_key.clone(), "true".to_string());
+        assert!(!build(vec![], &properties).is_auto_partitioned());
+
+        // 3. Partitioned, auto partition disabled
+        properties.insert(enabled_key.clone(), "false".to_string());
+        assert!(!build(vec!["name".to_string()], &properties).is_auto_partitioned());
+
+        // 4. Partitioned, auto partition enabled
+        properties.insert(enabled_key.clone(), "true".to_string());
+        assert!(build(vec!["name".to_string()], &properties).is_auto_partitioned());
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/metrics.rs b/fluss-rust/crates/fluss/src/metrics.rs
new file mode 100644
index 0000000000..7c62738c4e
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/metrics.rs
@@ -0,0 +1,617 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Metric name constants and helpers for fluss-rust client instrumentation.
+//!
+//! Uses the [`metrics`] crate facade pattern: library code emits metrics via
+//! `counter!`/`gauge!`/`histogram!` macros, and the application installs a
+//! recorder (e.g. `metrics-exporter-prometheus`) to collect them. When no
+//! recorder is installed, all metric calls are no-ops with zero overhead.
+
+use crate::metadata::TablePath;
+use crate::rpc::ApiKey;
+
+// ---------------------------------------------------------------------------
+// Label keys
+// ---------------------------------------------------------------------------
+
+/// Label key under which connection/RPC metrics carry the API name
+/// (the label values are produced by `api_key_label`).
+pub const LABEL_API_KEY: &str = "api_key";
+
+/// Label key for the database name on per-table scanner metrics.
+pub const LABEL_DATABASE: &str = "database";
+/// Label key for the table name on per-table scanner metrics.
+pub const LABEL_TABLE: &str = "table";
+
+// ---------------------------------------------------------------------------
+// Connection / RPC metrics
+//
+// Java reference: ConnectionMetrics.java, ClientMetricGroup.java, MetricNames.java
+//
+// Byte counting matches Java semantics: both sides count only the API message
+// body, excluding the protocol header and framing.
+// Java: rawRequest.totalSize() / response.totalSize() (see MessageCodec.java).
+// Rust: buf.len() - REQUEST_HEADER_LENGTH for sent bytes,
+//       buffer.len() - cursor.position() for received bytes.
+// ---------------------------------------------------------------------------
+
+// Metric kinds, as exercised by this module's tests: the four `*.total` names
+// are counters, `request_latency_ms` is a histogram, and `requests_in_flight`
+// is a gauge that is incremented and later decremented by the same amount.
+// All six carry the `api_key` label in those tests.
+// NOTE(review): the production recording site is not in this file — confirm
+// it uses the same kinds and label.
+pub const CLIENT_REQUESTS_TOTAL: &str = "fluss.client.requests.total";
+pub const CLIENT_RESPONSES_TOTAL: &str = "fluss.client.responses.total";
+pub const CLIENT_BYTES_SENT_TOTAL: &str = "fluss.client.bytes_sent.total";
+pub const CLIENT_BYTES_RECEIVED_TOTAL: &str = "fluss.client.bytes_received.total";
+pub const CLIENT_REQUEST_LATENCY_MS: &str = "fluss.client.request_latency_ms";
+pub const CLIENT_REQUESTS_IN_FLIGHT: &str = "fluss.client.requests_in_flight";
+
+// ---------------------------------------------------------------------------
+// Scanner poll-timing metrics
+//
+// Java reference: ScannerMetricGroup.java, LogScannerImpl.java
+//
+// These track consumer liveness and processing efficiency at the `poll()`
+// boundary. Java records via `volatile long` fields read by gauge suppliers;
+// Rust snapshots the values at poll start/end.
+//
+// Java's `lastPollSecondsAgo` gauge is intentionally NOT ported. Java
+// implements it as a gauge supplier evaluated at scrape time, which the
+// `metrics` crate facade has no equivalent for. A snapshot-at-poll-start
+// port would just duplicate `time_between_poll_ms / 1000` and would not
+// advance while a consumer is hung — defeating the metric's purpose
+// (detecting a stuck consumer). Revisit if the `metrics` crate gains a
+// supplier abstraction or we add a background liveness task.
+// ---------------------------------------------------------------------------
+
+/// Gauge: milliseconds between the start of consecutive `poll()` calls. A
+/// large value usually means the consumer's downstream processing is slow.
+/// Set through `ScannerMetrics::record_time_between_poll_ms`.
+pub const SCANNER_TIME_BETWEEN_POLL_MS: &str = "fluss.client.scanner.time_between_poll_ms";
+
+/// Gauge: fraction of wall-clock time spent inside `poll()` —
+/// `poll_time_ms / (poll_time_ms + time_between_poll_ms)`. A value near 1.0
+/// means the scanner is starved for data; a low value means the consumer is
+/// the bottleneck. Set through `ScannerMetrics::record_poll_idle_ratio`.
+///
+/// NOTE(review): `time_between_poll_ms` is documented above as start-to-start,
+/// which would already include the time spent in `poll()`; confirm the
+/// denominator against the code that computes this ratio.
+pub const SCANNER_POLL_IDLE_RATIO: &str = "fluss.client.scanner.poll_idle_ratio";
+
+// ---------------------------------------------------------------------------
+// Scanner fetch + remote download metrics
+//
+// Fetch metrics are recorded in the LogFetcher fetch loop on response
+// completion. Remote metrics are recorded inside RemoteLogDownloader's
+// download task.
+//
+// Java uses a volatile-long gauge for fetch latency and Counter+MeterView
+// for rates. Rust uses a histogram for latency (richer percentile data)
+// and counters for throughput; the recorder/exporter handles rate
+// computation (e.g. Prometheus `rate()`).
+//
+// Java emits one `ScannerMetricGroup` per (database, table); Rust matches
+// that by attaching `database` + `table` labels to every scanner metric
+// (see `ScannerMetrics` below).
+// ---------------------------------------------------------------------------
+
+/// Histogram: elapsed ms for each successful FetchLog RPC.
+/// Recorded through `ScannerMetrics::record_fetch_latency_ms`.
+pub const SCANNER_FETCH_LATENCY_MS: &str = "fluss.client.scanner.fetch_latency_ms";
+
+/// Counter: total FetchLog RPC requests attempted after connection acquisition.
+/// Incremented by one through `ScannerMetrics::record_fetch_request`.
+pub const SCANNER_FETCH_REQUESTS_TOTAL: &str = "fluss.client.scanner.fetch_requests.total";
+
+/// Histogram: serialized bytes per successful FetchLog response.
+/// Recorded through `ScannerMetrics::record_bytes_per_request`.
+pub const SCANNER_BYTES_PER_REQUEST: &str = "fluss.client.scanner.bytes_per_request";
+
+/// Counter: total remote log download attempts (includes per-segment retries).
+/// Incremented by one through `ScannerMetrics::record_remote_fetch_request`.
+pub const SCANNER_REMOTE_FETCH_REQUESTS_TOTAL: &str =
+    "fluss.client.scanner.remote_fetch_requests.total";
+
+/// Counter: total bytes downloaded from remote log storage.
+/// Incremented by a byte count through `ScannerMetrics::record_remote_fetch_bytes`.
+pub const SCANNER_REMOTE_FETCH_BYTES_TOTAL: &str = "fluss.client.scanner.remote_fetch_bytes.total";
+
+/// Counter: total remote log download failures (each retry attempt counts).
+/// Incremented by one through `ScannerMetrics::record_remote_fetch_error`.
+pub const SCANNER_REMOTE_FETCH_ERRORS_TOTAL: &str =
+    "fluss.client.scanner.remote_fetch_errors.total";
+
+// ---------------------------------------------------------------------------
+// Per-table scanner metric handles
+// ---------------------------------------------------------------------------
+
+/// Cached `(database, table)`-labeled scanner metric handles.
+///
+/// Adding a new scanner metric: declare the constant above, add one
+/// field plus an initializer line in [`Self::new`] using the matching
+/// `scanner_{gauge,counter,histogram}` helper, and a `record_*` method.
+/// The helpers are the single source of truth for the label set, so a
+/// future label addition (e.g. `cluster_id`) is a one-line change.
+///
+/// # Recorder binding
+///
+/// `metrics::counter!(...)` / `gauge!(...)` / `histogram!(...)` resolve
+/// the recorder at the macro callsite. Because this struct caches the
+/// returned handles, every cached handle is bound to whichever recorder
+/// is installed when [`Self::new`] runs. Construct the scanner *after*
+/// installing the production recorder; in tests, construct it inside
+/// the `metrics::with_local_recorder(...)` closure. With no recorder
+/// installed, all `record_*` calls are zero-overhead no-ops.
+pub(crate) struct ScannerMetrics {
+    /// Gauge handle for `SCANNER_TIME_BETWEEN_POLL_MS`.
+    time_between_poll_ms: metrics::Gauge,
+    /// Gauge handle for `SCANNER_POLL_IDLE_RATIO`.
+    poll_idle_ratio: metrics::Gauge,
+    /// Counter handle for `SCANNER_FETCH_REQUESTS_TOTAL`.
+    fetch_requests_total: metrics::Counter,
+    /// Histogram handle for `SCANNER_FETCH_LATENCY_MS`.
+    fetch_latency_ms: metrics::Histogram,
+    /// Histogram handle for `SCANNER_BYTES_PER_REQUEST`.
+    bytes_per_request: metrics::Histogram,
+    /// Counter handle for `SCANNER_REMOTE_FETCH_REQUESTS_TOTAL`.
+    remote_fetch_requests_total: metrics::Counter,
+    /// Counter handle for `SCANNER_REMOTE_FETCH_BYTES_TOTAL`.
+    remote_fetch_bytes_total: metrics::Counter,
+    /// Counter handle for `SCANNER_REMOTE_FETCH_ERRORS_TOTAL`.
+    remote_fetch_errors_total: metrics::Counter,
+}
+
+impl ScannerMetrics {
+    /// Creates the handle cache for `table_path`.
+    ///
+    /// Each handle is obtained exactly once, from whichever recorder is
+    /// active at the time of this call, and is labeled with the path's
+    /// database and table names.
+    pub(crate) fn new(table_path: &TablePath) -> Self {
+        let (database, table) = (table_path.database(), table_path.table());
+
+        // One small factory per metric kind, so that the label pair is
+        // spelled out only once in this constructor.
+        let gauge = |name: &'static str| scanner_gauge(name, database, table);
+        let counter = |name: &'static str| scanner_counter(name, database, table);
+        let histogram = |name: &'static str| scanner_histogram(name, database, table);
+
+        Self {
+            time_between_poll_ms: gauge(SCANNER_TIME_BETWEEN_POLL_MS),
+            poll_idle_ratio: gauge(SCANNER_POLL_IDLE_RATIO),
+            fetch_requests_total: counter(SCANNER_FETCH_REQUESTS_TOTAL),
+            fetch_latency_ms: histogram(SCANNER_FETCH_LATENCY_MS),
+            bytes_per_request: histogram(SCANNER_BYTES_PER_REQUEST),
+            remote_fetch_requests_total: counter(SCANNER_REMOTE_FETCH_REQUESTS_TOTAL),
+            remote_fetch_bytes_total: counter(SCANNER_REMOTE_FETCH_BYTES_TOTAL),
+            remote_fetch_errors_total: counter(SCANNER_REMOTE_FETCH_ERRORS_TOTAL),
+        }
+    }
+
+    /// Sets the `SCANNER_TIME_BETWEEN_POLL_MS` gauge to `value`.
+    pub(crate) fn record_time_between_poll_ms(&self, value: f64) {
+        self.time_between_poll_ms.set(value);
+    }
+
+    /// Sets the `SCANNER_POLL_IDLE_RATIO` gauge to `value`.
+    pub(crate) fn record_poll_idle_ratio(&self, value: f64) {
+        self.poll_idle_ratio.set(value);
+    }
+
+    /// Adds one to the `SCANNER_FETCH_REQUESTS_TOTAL` counter.
+    pub(crate) fn record_fetch_request(&self) {
+        self.fetch_requests_total.increment(1);
+    }
+
+    /// Records `value` into the `SCANNER_FETCH_LATENCY_MS` histogram.
+    pub(crate) fn record_fetch_latency_ms(&self, value: f64) {
+        self.fetch_latency_ms.record(value);
+    }
+
+    /// Records `value` into the `SCANNER_BYTES_PER_REQUEST` histogram.
+    pub(crate) fn record_bytes_per_request(&self, value: f64) {
+        self.bytes_per_request.record(value);
+    }
+
+    /// Adds one to the `SCANNER_REMOTE_FETCH_REQUESTS_TOTAL` counter.
+    pub(crate) fn record_remote_fetch_request(&self) {
+        self.remote_fetch_requests_total.increment(1);
+    }
+
+    /// Adds `bytes` to the `SCANNER_REMOTE_FETCH_BYTES_TOTAL` counter.
+    pub(crate) fn record_remote_fetch_bytes(&self, bytes: u64) {
+        self.remote_fetch_bytes_total.increment(bytes);
+    }
+
+    /// Adds one to the `SCANNER_REMOTE_FETCH_ERRORS_TOTAL` counter.
+    pub(crate) fn record_remote_fetch_error(&self) {
+        self.remote_fetch_errors_total.increment(1);
+    }
+}
+
+// Per-table scanner handle factories. These centralize the
+// `(database, table)` label set so a future schema change (renaming a
+// label, adding `cluster_id`, etc.) is a one-line edit instead of
+// touching every callsite in `ScannerMetrics::new`.
+
+/// Builds the gauge handle for `name`, labeled with the given `database`
+/// and `table` (both are copied into owned label values).
+fn scanner_gauge(name: &'static str, database: &str, table: &str) -> metrics::Gauge {
+    let database = database.to_owned();
+    let table = table.to_owned();
+    metrics::gauge!(name, LABEL_DATABASE => database, LABEL_TABLE => table)
+}
+
+/// Builds the counter handle for `name`, labeled with the given `database`
+/// and `table` (both are copied into owned label values).
+fn scanner_counter(name: &'static str, database: &str, table: &str) -> metrics::Counter {
+    let database = database.to_owned();
+    let table = table.to_owned();
+    metrics::counter!(name, LABEL_DATABASE => database, LABEL_TABLE => table)
+}
+
+/// Builds the histogram handle for `name`, labeled with the given `database`
+/// and `table` (both are copied into owned label values).
+fn scanner_histogram(name: &'static str, database: &str, table: &str) -> metrics::Histogram {
+    let database = database.to_owned();
+    let table = table.to_owned();
+    metrics::histogram!(name, LABEL_DATABASE => database, LABEL_TABLE => table)
+}
+
+/// Maps an `ApiKey` to the value used for the `api_key` metric label.
+///
+/// Only the keys in Java's `ConnectionMetrics.REPORT_API_KEYS` filter
+/// (`ProduceLog`, `FetchLog`, `PutKv`, `Lookup`) are reportable. Every other
+/// key (admin, metadata, auth, ...) yields `None` so that those calls do not
+/// inflate metric cardinality.
+pub(crate) fn api_key_label(api_key: ApiKey) -> Option<&'static str> {
+    let label = match api_key {
+        ApiKey::ProduceLog => "produce_log",
+        ApiKey::FetchLog => "fetch_log",
+        ApiKey::PutKv => "put_kv",
+        ApiKey::Lookup => "lookup",
+        // Everything else is deliberately left without a label.
+        _ => return None,
+    };
+    Some(label)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::test_utils::assert_scanner_entries_labeled;
+    use metrics_util::debugging::DebuggingRecorder;
+
+    // Yields `Some(value)` for the first snapshot entry that is named `$name`
+    // and holds a counter; `None` when there is no such entry.
+    macro_rules! find_counter {
+        ($entries:expr, $name:expr) => {
+            $entries.iter().find_map(|(key, _, _, val)| match val {
+                metrics_util::debugging::DebugValue::Counter(count)
+                    if key.key().name() == $name =>
+                {
+                    Some(*count)
+                }
+                _ => None,
+            })
+        };
+    }
+
+    // Yields `Some(samples)` (as plain floats) for the first snapshot entry
+    // that is named `$name` and holds a histogram; `None` otherwise.
+    macro_rules! find_histogram {
+        ($entries:expr, $name:expr) => {
+            $entries.iter().find_map(|(key, _, _, val)| match val {
+                metrics_util::debugging::DebugValue::Histogram(samples)
+                    if key.key().name() == $name =>
+                {
+                    Some(samples.iter().map(|s| s.into_inner()).collect::<Vec<_>>())
+                }
+                _ => None,
+            })
+        };
+    }
+
+    // Yields `Some(value)` (as a plain float) for the first snapshot entry
+    // that is named `$name` and holds a gauge; `None` otherwise.
+    macro_rules! find_gauge {
+        ($entries:expr, $name:expr) => {
+            $entries.iter().find_map(|(key, _, _, val)| match val {
+                metrics_util::debugging::DebugValue::Gauge(level)
+                    if key.key().name() == $name =>
+                {
+                    Some(level.into_inner())
+                }
+                _ => None,
+            })
+        };
+    }
+
+    #[test]
+    fn reportable_api_keys_return_label() {
+        // Each reportable key paired with the label it must map to.
+        let cases = [
+            (ApiKey::ProduceLog, "produce_log"),
+            (ApiKey::FetchLog, "fetch_log"),
+            (ApiKey::PutKv, "put_kv"),
+            (ApiKey::Lookup, "lookup"),
+        ];
+        for (api_key, expected) in cases {
+            assert_eq!(api_key_label(api_key), Some(expected));
+        }
+    }
+
+    #[test]
+    fn non_reportable_api_keys_return_none() {
+        // A sample of keys that are outside the reportable set.
+        let unlabeled = [
+            ApiKey::MetaData,
+            ApiKey::CreateTable,
+            ApiKey::Authenticate,
+            ApiKey::ListDatabases,
+            ApiKey::GetTable,
+        ];
+        for api_key in unlabeled {
+            assert_eq!(api_key_label(api_key), None);
+        }
+    }
+
+    #[test]
+    fn reportable_request_records_all_connection_metrics() {
+        let recorder = DebuggingRecorder::new();
+        let snapshotter = recorder.snapshotter();
+
+        metrics::with_local_recorder(&recorder, || {
+            let label = api_key_label(ApiKey::ProduceLog).unwrap();
+
+            // Request side: one 256-byte request goes in flight.
+            metrics::counter!(CLIENT_REQUESTS_TOTAL, LABEL_API_KEY => label).increment(1);
+            metrics::counter!(CLIENT_BYTES_SENT_TOTAL, LABEL_API_KEY => label).increment(256);
+            metrics::gauge!(CLIENT_REQUESTS_IN_FLIGHT, LABEL_API_KEY => label).increment(1.0);
+
+            // Response side: 128 bytes come back after 42.5 ms.
+            metrics::counter!(CLIENT_RESPONSES_TOTAL, LABEL_API_KEY => label).increment(1);
+            metrics::counter!(CLIENT_BYTES_RECEIVED_TOTAL, LABEL_API_KEY => label).increment(128);
+            metrics::histogram!(CLIENT_REQUEST_LATENCY_MS, LABEL_API_KEY => label).record(42.5);
+            metrics::gauge!(CLIENT_REQUESTS_IN_FLIGHT, LABEL_API_KEY => label).decrement(1.0);
+        });
+
+        let entries = snapshotter.snapshot().into_vec();
+
+        // Counters, checked as (metric name, expected total) pairs.
+        for (name, expected) in [
+            (CLIENT_REQUESTS_TOTAL, 1),
+            (CLIENT_RESPONSES_TOTAL, 1),
+            (CLIENT_BYTES_SENT_TOTAL, 256),
+            (CLIENT_BYTES_RECEIVED_TOTAL, 128),
+        ] {
+            assert_eq!(find_counter!(entries, name), Some(expected));
+        }
+
+        assert_eq!(
+            find_histogram!(entries, CLIENT_REQUEST_LATENCY_MS),
+            Some(vec![42.5])
+        );
+        // The in-flight gauge went up once and down once.
+        assert_eq!(find_gauge!(entries, CLIENT_REQUESTS_IN_FLIGHT), Some(0.0));
+
+        for (key, _, _, _) in &entries {
+            let labeled = key
+                .key()
+                .labels()
+                .any(|l| l.key() == LABEL_API_KEY && l.value() == "produce_log");
+            assert!(labeled, "all metrics must carry the api_key label");
+        }
+    }
+
+    #[test]
+    fn non_reportable_request_records_no_metrics() {
+        let recorder = DebuggingRecorder::new();
+        let snapshotter = recorder.snapshotter();
+
+        metrics::with_local_recorder(&recorder, || {
+            // A `None` label means no metrics calls are made (matching
+            // request() logic), so nothing is emitted inside this scope.
+            assert!(api_key_label(ApiKey::MetaData).is_none());
+        });
+
+        let entries = snapshotter.snapshot().into_vec();
+        assert!(
+            entries.is_empty(),
+            "non-reportable API keys must not produce metrics"
+        );
+    }
+
+    #[test]
+    fn inflight_gauge_nets_to_zero_after_balanced_calls() {
+        // Number of simulated concurrent requests.
+        const REQUESTS: usize = 3;
+
+        let recorder = DebuggingRecorder::new();
+        let snapshotter = recorder.snapshotter();
+
+        metrics::with_local_recorder(&recorder, || {
+            let label = api_key_label(ApiKey::FetchLog).unwrap();
+
+            // All requests start ...
+            for _ in 0..REQUESTS {
+                metrics::gauge!(CLIENT_REQUESTS_IN_FLIGHT, LABEL_API_KEY => label).increment(1.0);
+            }
+            // ... and then all of them complete.
+            for _ in 0..REQUESTS {
+                metrics::gauge!(CLIENT_REQUESTS_IN_FLIGHT, LABEL_API_KEY => label).decrement(1.0);
+            }
+        });
+
+        let entries = snapshotter.snapshot().into_vec();
+        assert_eq!(
+            find_gauge!(entries, CLIENT_REQUESTS_IN_FLIGHT),
+            Some(0.0),
+            "in-flight gauge should be 0 after balanced inc/dec"
+        );
+    }
+
+    #[test]
+    fn different_api_keys_produce_separate_metric_series() {
+        use std::collections::HashMap;
+
+        let recorder = DebuggingRecorder::new();
+        let snapshotter = recorder.snapshotter();
+
+        metrics::with_local_recorder(&recorder, || {
+            // Same metric name, two different `api_key` label values.
+            let produce_label = api_key_label(ApiKey::ProduceLog).unwrap();
+            let fetch_label = api_key_label(ApiKey::FetchLog).unwrap();
+
+            metrics::counter!(CLIENT_REQUESTS_TOTAL, LABEL_API_KEY => produce_label).increment(5);
+            metrics::counter!(CLIENT_REQUESTS_TOTAL, LABEL_API_KEY => fetch_label).increment(3);
+        });
+
+        let entries = snapshotter.snapshot().into_vec();
+        let request_entries: Vec<_> = entries
+            .iter()
+            .filter(|(key, ..)| key.key().name() == CLIENT_REQUESTS_TOTAL)
+            .collect();
+
+        assert_eq!(
+            request_entries.len(),
+            2,
+            "produce_log and fetch_log should be separate metric series"
+        );
+
+        // Index the counter values by their `api_key` label value.
+        let counter_by_api_key: HashMap<String, u64> = request_entries
+            .into_iter()
+            .map(|(key, _, _, val)| {
+                let api_key = key
+                    .key()
+                    .labels()
+                    .find(|label| label.key() == LABEL_API_KEY)
+                    .map(|label| label.value())
+                    .expect("requests total metric must include api_key label");
+
+                let counter_value = match val {
+                    metrics_util::debugging::DebugValue::Counter(v) => *v,
+                    other => panic!("expected Counter, got {other:?}"),
+                };
+
+                (api_key.to_string(), counter_value)
+            })
+            .collect();
+
+        assert_eq!(counter_by_api_key.get("produce_log"), Some(&5));
+        assert_eq!(counter_by_api_key.get("fetch_log"), Some(&3));
+    }
+
+    #[test]
+    fn scanner_poll_timing_metrics_emit_correctly() {
+        let recorder = DebuggingRecorder::new();
+        let snapshotter = recorder.snapshotter();
+
+        metrics::with_local_recorder(&recorder, || {
+            // Handles must be created while the local recorder is active.
+            let scanner_metrics = ScannerMetrics::new(&TablePath::new("db", "tbl"));
+            scanner_metrics.record_time_between_poll_ms(200.0);
+            scanner_metrics.record_poll_idle_ratio(0.8);
+        });
+
+        let entries = snapshotter.snapshot().into_vec();
+
+        let time_between_poll = find_gauge!(entries, SCANNER_TIME_BETWEEN_POLL_MS);
+        assert_eq!(time_between_poll, Some(200.0));
+        let idle_ratio = find_gauge!(entries, SCANNER_POLL_IDLE_RATIO);
+        assert_eq!(idle_ratio, Some(0.8));
+        assert_scanner_entries_labeled(&entries, "db", "tbl");
+    }
+
+    #[test]
+    fn scanner_fetch_metrics_emit_correctly() {
+        let recorder = DebuggingRecorder::new();
+        let snapshotter = recorder.snapshotter();
+
+        metrics::with_local_recorder(&recorder, || {
+            // Handles must be created while the local recorder is active.
+            let scanner_metrics = ScannerMetrics::new(&TablePath::new("db", "tbl"));
+            scanner_metrics.record_fetch_request();
+            scanner_metrics.record_fetch_latency_ms(15.5);
+            scanner_metrics.record_bytes_per_request(4096.0);
+        });
+
+        let entries = snapshotter.snapshot().into_vec();
+
+        let fetch_requests = find_counter!(entries, SCANNER_FETCH_REQUESTS_TOTAL);
+        assert_eq!(fetch_requests, Some(1));
+        let fetch_latency = find_histogram!(entries, SCANNER_FETCH_LATENCY_MS);
+        assert_eq!(fetch_latency, Some(vec![15.5]));
+        let bytes_per_request = find_histogram!(entries, SCANNER_BYTES_PER_REQUEST);
+        assert_eq!(bytes_per_request, Some(vec![4096.0]));
+        assert_scanner_entries_labeled(&entries, "db", "tbl");
+    }
+
+    #[test]
+    fn scanner_remote_fetch_metrics_emit_correctly() {
+        let recorder = DebuggingRecorder::new();
+        let snapshotter = recorder.snapshotter();
+
+        metrics::with_local_recorder(&recorder, || {
+            // Handles must be created while the local recorder is active.
+            let scanner_metrics = ScannerMetrics::new(&TablePath::new("db", "tbl"));
+            // Three download attempts, 1024 bytes fetched, one failure.
+            for _ in 0..3 {
+                scanner_metrics.record_remote_fetch_request();
+            }
+            scanner_metrics.record_remote_fetch_bytes(1024);
+            scanner_metrics.record_remote_fetch_error();
+        });
+
+        let entries = snapshotter.snapshot().into_vec();
+
+        let requests = find_counter!(entries, SCANNER_REMOTE_FETCH_REQUESTS_TOTAL);
+        assert_eq!(requests, Some(3));
+        let bytes = find_counter!(entries, SCANNER_REMOTE_FETCH_BYTES_TOTAL);
+        assert_eq!(bytes, Some(1024));
+        let errors = find_counter!(entries, SCANNER_REMOTE_FETCH_ERRORS_TOTAL);
+        assert_eq!(errors, Some(1));
+        assert_scanner_entries_labeled(&entries, "db", "tbl");
+    }
+
+    /// Scanner metrics created for two different table paths must end up in
+    /// two independent metric series, each with its own counter value.
+    #[test]
+    fn different_table_paths_produce_separate_metric_series() {
+        use std::collections::HashMap;
+
+        let recorder = DebuggingRecorder::new();
+        let snapshotter = recorder.snapshotter();
+
+        metrics::with_local_recorder(&recorder, || {
+            let m1 = ScannerMetrics::new(&TablePath::new("db1", "t1"));
+            let m2 = ScannerMetrics::new(&TablePath::new("db2", "t2"));
+
+            // 5 fetch requests on (db1, t1), 3 on (db2, t2).
+            for _ in 0..5 {
+                m1.record_fetch_request();
+            }
+            for _ in 0..3 {
+                m2.record_fetch_request();
+            }
+        });
+
+        let entries = snapshotter.snapshot().into_vec();
+        let request_entries: Vec<_> = entries
+            .iter()
+            .filter(|(key, ..)| key.key().name() == SCANNER_FETCH_REQUESTS_TOTAL)
+            .collect();
+
+        assert_eq!(
+            request_entries.len(),
+            2,
+            "(db1,t1) and (db2,t2) must be separate metric series"
+        );
+
+        // Index the counter values by their (database, table) label pair.
+        let counter_by_table: HashMap<(String, String), u64> = request_entries
+            .into_iter()
+            .map(|(key, _, _, val)| {
+                let mut database = None;
+                let mut table = None;
+                for label in key.key().labels() {
+                    match label.key() {
+                        k if k == LABEL_DATABASE => database = Some(label.value().to_string()),
+                        k if k == LABEL_TABLE => table = Some(label.value().to_string()),
+                        _ => {}
+                    }
+                }
+                let database = database.expect("scanner metric must include database label");
+                let table = table.expect("scanner metric must include table label");
+                let counter_value = match val {
+                    metrics_util::debugging::DebugValue::Counter(v) => *v,
+                    other => panic!("expected Counter, got {other:?}"),
+                };
+                ((database, table), counter_value)
+            })
+            .collect();
+
+        let count_for = |database: &str, table: &str| {
+            counter_by_table.get(&(database.to_string(), table.to_string()))
+        };
+        assert_eq!(count_for("db1", "t1"), Some(&5));
+        assert_eq!(count_for("db2", "t2"), Some(&3));
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/record/arrow.rs b/fluss-rust/crates/fluss/src/record/arrow.rs
new file mode 100644
index 0000000000..b97fc120de
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/record/arrow.rs
@@ -0,0 +1,2320 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::client::{LogWriteRecord, Record, WriteRecord};
+use crate::compression::{
+    ArrowCompressionInfo, ArrowCompressionRatioEstimator, ArrowCompressionType,
+};
+use crate::error::{Error, Result};
+use crate::metadata::{DataField, DataType, RowType};
+use crate::record::{ChangeType, ScanRecord};
+use crate::row::column_writer::{ColumnWriter, round_up_to_8};
+use crate::row::{ColumnarRow, InternalRow, arrow_row_column_indices, fluss_row_column_indices};
+use arrow::array::{ArrayBuilder, ArrayRef};
+use arrow::{
+    array::RecordBatch,
+    buffer::Buffer,
+    ipc::{
+        CompressionType,
+        reader::{StreamReader, read_record_batch},
+        root_as_message,
+        writer::StreamWriter,
+    },
+};
+use arrow_schema::ArrowError::ParseError;
+use arrow_schema::SchemaRef;
+use arrow_schema::{DataType as ArrowDataType, Field};
+use byteorder::WriteBytesExt;
+use byteorder::{ByteOrder, LittleEndian};
+use bytes::Bytes;
+use crc32c::crc32c;
+use std::{
+    cell::Cell,
+    collections::HashMap,
+    fs::File,
+    io::{Cursor, Read, Seek, SeekFrom, Write},
+    path::PathBuf,
+    sync::Arc,
+};
+
+use crate::error::Error::IllegalArgument;
+use arrow::ipc::writer::IpcWriteOptions;
+/// Widths, in bytes, of the fixed fields of a log record batch header.
+pub const BASE_OFFSET_LENGTH: usize = 8;
+pub const LENGTH_LENGTH: usize = 4;
+pub const MAGIC_LENGTH: usize = 1;
+pub const COMMIT_TIMESTAMP_LENGTH: usize = 8;
+pub const CRC_LENGTH: usize = 4;
+pub const SCHEMA_ID_LENGTH: usize = 2;
+pub const ATTRIBUTE_LENGTH: usize = 1;
+pub const LAST_OFFSET_DELTA_LENGTH: usize = 4;
+pub const WRITE_CLIENT_ID_LENGTH: usize = 8;
+pub const BATCH_SEQUENCE_LENGTH: usize = 4;
+pub const RECORDS_COUNT_LENGTH: usize = 4;
+
+// Starting offset of each header field. Every offset is the previous field's
+// offset plus that field's width, so the fields are laid out back to back.
+pub const BASE_OFFSET_OFFSET: usize = 0;
+pub const LENGTH_OFFSET: usize = BASE_OFFSET_OFFSET + BASE_OFFSET_LENGTH;
+pub const MAGIC_OFFSET: usize = LENGTH_OFFSET + LENGTH_LENGTH;
+pub const COMMIT_TIMESTAMP_OFFSET: usize = MAGIC_OFFSET + MAGIC_LENGTH;
+pub const CRC_OFFSET: usize = COMMIT_TIMESTAMP_OFFSET + COMMIT_TIMESTAMP_LENGTH;
+pub const SCHEMA_ID_OFFSET: usize = CRC_OFFSET + CRC_LENGTH;
+pub const ATTRIBUTES_OFFSET: usize = SCHEMA_ID_OFFSET + SCHEMA_ID_LENGTH;
+pub const LAST_OFFSET_DELTA_OFFSET: usize = ATTRIBUTES_OFFSET + ATTRIBUTE_LENGTH;
+pub const WRITE_CLIENT_ID_OFFSET: usize = LAST_OFFSET_DELTA_OFFSET + LAST_OFFSET_DELTA_LENGTH;
+pub const BATCH_SEQUENCE_OFFSET: usize = WRITE_CLIENT_ID_OFFSET + WRITE_CLIENT_ID_LENGTH;
+pub const RECORDS_COUNT_OFFSET: usize = BATCH_SEQUENCE_OFFSET + BATCH_SEQUENCE_LENGTH;
+pub const RECORDS_OFFSET: usize = RECORDS_COUNT_OFFSET + RECORDS_COUNT_LENGTH;
+
+/// Size of the fixed header: equal to `RECORDS_OFFSET`, i.e. the sum of all
+/// field widths above.
+pub const RECORD_BATCH_HEADER_SIZE: usize = RECORDS_OFFSET;
+// NOTE(review): presumably where the Arrow change-type data begins, directly
+// after the header — confirm against the reader/writer code.
+pub const ARROW_CHANGETYPE_OFFSET: usize = RECORD_BATCH_HEADER_SIZE;
+/// Combined width of the base-offset and length fields. `validate_batch_size`
+/// adds it to a batch's declared size to obtain the total batch size.
+pub const LOG_OVERHEAD: usize = LENGTH_OFFSET + LENGTH_LENGTH;
+
+/// Maximum batch size matches Java's Integer.MAX_VALUE limit.
+/// Java uses int type for batch size, so max value is 2^31 - 1 = 2,147,483,647 bytes (~2GB).
+/// This is the implicit limit in FileLogRecords.java and other Java components.
+pub const MAX_BATCH_SIZE: usize = i32::MAX as usize; // 2,147,483,647 bytes (~2GB)
+
+/// Log record batch format versions (the "magic" values).
+///
+/// Only `V0` is defined here; its discriminant (`0`) is what
+/// `CURRENT_LOG_MAGIC_VALUE` is set to.
+#[derive(Debug, Clone, Copy)]
+pub enum LogMagicValue {
+    V0 = 0,
+}
+
+/// Validates a declared batch size and returns the total batch size.
+///
+/// The returned value is `batch_size_bytes + LOG_OVERHEAD`, as a `usize`.
+///
+/// # Errors
+///
+/// Returns `Error::UnexpectedError` when
+/// - `batch_size_bytes` is negative (corrupted data),
+/// - adding `LOG_OVERHEAD` would overflow `usize`, or
+/// - the resulting total exceeds `MAX_BATCH_SIZE`.
+fn validate_batch_size(batch_size_bytes: i32) -> Result<usize> {
+    // All three failure modes produce the same error variant without a source.
+    let invalid = |message: String| Error::UnexpectedError {
+        message,
+        source: None,
+    };
+
+    // Guard: a negative size can only come from corrupted data.
+    if batch_size_bytes < 0 {
+        return Err(invalid(format!(
+            "Invalid negative batch size: {batch_size_bytes}"
+        )));
+    }
+    let batch_size_u = batch_size_bytes as usize;
+
+    match batch_size_u.checked_add(LOG_OVERHEAD) {
+        // The addition itself does not fit into a `usize`.
+        None => Err(invalid(format!(
+            "Batch size {batch_size_u} + LOG_OVERHEAD {LOG_OVERHEAD} would overflow"
+        ))),
+        // Sanity check: reject unreasonably large batches.
+        Some(total_size) if total_size > MAX_BATCH_SIZE => Err(invalid(format!(
+            "Batch size {total_size} exceeds maximum allowed size {MAX_BATCH_SIZE}"
+        ))),
+        Some(total_size) => Ok(total_size),
+    }
+}
+
+// NOTE: Rust layout/offsets currently match Java only for V0.
+// TODO: Add V1 layout/offsets to keep parity with Java's V1 format.
+/// Magic value of the current log format: the discriminant of `LogMagicValue::V0`.
+pub const CURRENT_LOG_MAGIC_VALUE: u8 = LogMagicValue::V0 as u8;
+
+/// Value used if writer ID is not available or non-idempotent.
+pub const NO_WRITER_ID: i64 = -1;
+
+/// Value used if batch sequence is not available.
+pub const NO_BATCH_SEQUENCE: i32 = -1;
+
+// NOTE(review): presumably the base log offset a builder starts from when the
+// caller has no real offset yet — confirm at the use sites.
+pub const BUILDER_DEFAULT_OFFSET: i64 = 0;
+
+/// Initial capacity for Arrow column vectors (pre-allocation hint, not a record cap).
+/// Matching Java's `ArrowWriter.INITIAL_CAPACITY`.
+const INITIAL_ROW_CAPACITY: usize = 1024;
+
+/// Fraction of the allocated buffer used as the effective write limit.
+/// Matching Java's `ArrowWriter.BUFFER_USAGE_RATIO`.
+const BUFFER_USAGE_RATIO: f32 = 0.95;
+
+/// Builder state for an Arrow-encoded in-memory log record batch.
+///
+/// NOTE(review): the methods of this type are outside the reviewed excerpt.
+/// Field notes marked "presumably" are inferred from field names and the
+/// header layout constants only, and should be confirmed against the
+/// implementation.
+pub struct MemoryLogRecordsArrowBuilder {
+    /// Presumably the batch's base offset header field — confirm.
+    base_log_offset: i64,
+    /// Presumably the schema id header field — confirm.
+    schema_id: i32,
+    /// Presumably the magic value of the batch (see `CURRENT_LOG_MAGIC_VALUE`) — confirm.
+    magic: u8,
+    /// Presumably the writer id header field (`NO_WRITER_ID` when absent) — confirm.
+    writer_id: i64,
+    /// Presumably the batch sequence header field (`NO_BATCH_SEQUENCE` when absent) — confirm.
+    batch_sequence: i32,
+    /// Boxed inner builder that accumulates the data and produces the Arrow record batch.
+    arrow_record_batch_builder: Box<dyn ArrowRecordBatchInnerBuilder>,
+    /// Presumably set once the builder has been closed for further appends — confirm.
+    is_closed: bool,
+    /// Arrow compression settings for this batch.
+    arrow_compression_info: ArrowCompressionInfo,
+    /// Effective write limit in bytes (after applying BUFFER_USAGE_RATIO).
+    write_limit: usize,
+    /// Pre-computed Arrow IPC overhead (metadata + body framing) for this schema.
+    /// Constant per schema+compression combination.
+    ipc_overhead: usize,
+    /// Estimated record count at which the next byte-size check should occur.
+    /// -1 means "unknown — check on the next append". Updated dynamically to
+    /// skip expensive `estimated_size_in_bytes()` calls on every append.
+    /// Matching Java's `ArrowWriter.estimatedMaxRecordsCount`.
+    estimated_max_records_count: Cell<i32>,
+    /// Compression ratio estimator shared across batches for the same table.
+    compression_ratio_estimator: Arc<ArrowCompressionRatioEstimator>,
+    /// Snapshot of the compression ratio at batch creation time.
+    /// Matching Java's `ArrowWriter.estimatedCompressionRatio` which is
+    /// cached per batch and only refreshed on `reset()`.
+    estimated_compression_ratio: f32,
+}
+
+/// Strategy used by `MemoryLogRecordsArrowBuilder` to collect the Arrow data of one batch.
+///
+/// Implementations either collect individual rows or wrap a batch that was built elsewhere.
+pub trait ArrowRecordBatchInnerBuilder: Send {
+    /// Produces the Arrow `RecordBatch` holding everything appended so far.
+    fn build_arrow_record_batch(&mut self) -> Result<Arc<RecordBatch>>;
+
+    /// Appends a single row. Returns `Ok(true)` when the row was taken and
+    /// `Ok(false)` when this builder does not accept it.
+    fn append(&mut self, row: &dyn InternalRow) -> Result<bool>;
+
+    /// Appends a whole prebuilt batch. Returns `Ok(true)` when the batch was taken
+    /// and `Ok(false)` when this builder does not accept it.
+    fn append_batch(&mut self, record_batch: Arc<RecordBatch>) -> Result<bool>;
+
+    /// Arrow schema of the data held by this builder.
+    fn schema(&self) -> SchemaRef;
+
+    /// Number of records appended so far.
+    fn records_count(&self) -> i32;
+
+    /// Whether the builder itself considers the batch complete.
+    fn is_full(&self) -> bool;
+
+    /// Get an estimate of the size in bytes of the arrow data.
+    fn estimated_size_in_bytes(&self) -> usize;
+}
+
+/// Inner builder that wraps one already materialized Arrow `RecordBatch`.
+///
+/// It takes exactly one batch through `append_batch`, reports itself as full from
+/// then on, and rejects row-by-row appends.
+#[derive(Default)]
+pub struct PrebuiltRecordBatchBuilder {
+    /// The batch handed over through `append_batch`, if any.
+    arrow_record_batch: Option<Arc<RecordBatch>>,
+    /// Row count of the stored batch (0 until a batch has been appended).
+    records_count: i32,
+}
+
+impl ArrowRecordBatchInnerBuilder for PrebuiltRecordBatchBuilder {
+    /// Returns the stored batch.
+    ///
+    /// # Errors
+    /// Returns `Error::UnexpectedError` when no batch has been appended yet
+    /// (instead of panicking on the missing value).
+    fn build_arrow_record_batch(&mut self) -> Result<Arc<RecordBatch>> {
+        self.arrow_record_batch
+            .clone()
+            .ok_or_else(|| Error::UnexpectedError {
+                message: "No record batch has been appended to PrebuiltRecordBatchBuilder"
+                    .to_string(),
+                source: None,
+            })
+    }
+
+    /// Single rows are not supported by this builder; always returns `Ok(false)`.
+    fn append(&mut self, _row: &dyn InternalRow) -> Result<bool> {
+        Ok(false)
+    }
+
+    /// Stores `record_batch` if the builder is still empty.
+    ///
+    /// Returns `Ok(false)` when a batch is already stored.
+    ///
+    /// # Errors
+    /// Returns `Error::IllegalArgument` when the row count does not fit into the
+    /// 32-bit record count used by the batch header.
+    fn append_batch(&mut self, record_batch: Arc<RecordBatch>) -> Result<bool> {
+        if self.arrow_record_batch.is_some() {
+            return Ok(false);
+        }
+        // A plain `as i32` cast would silently wrap for oversized batches.
+        let rows = record_batch.num_rows();
+        self.records_count = i32::try_from(rows).map_err(|_| Error::IllegalArgument {
+            message: format!("Record batch row count {rows} exceeds the supported maximum"),
+        })?;
+        self.arrow_record_batch = Some(record_batch);
+        Ok(true)
+    }
+
+    /// Schema of the stored batch.
+    ///
+    /// # Panics
+    /// Panics when no batch has been appended yet; the signature leaves no way to
+    /// report that as an error.
+    fn schema(&self) -> SchemaRef {
+        self.arrow_record_batch
+            .as_ref()
+            .expect("schema() called before a record batch was appended")
+            .schema()
+    }
+
+    fn records_count(&self) -> i32 {
+        self.records_count
+    }
+
+    /// Full as soon as one record batch is stored.
+    fn is_full(&self) -> bool {
+        self.arrow_record_batch.is_some()
+    }
+
+    /// In-memory size of the stored batch's arrays, or 0 while empty.
+    fn estimated_size_in_bytes(&self) -> usize {
+        self.arrow_record_batch
+            .as_ref()
+            .map(|batch| batch.get_array_memory_size())
+            .unwrap_or(0)
+    }
+}
+
+/// Inner builder that collects rows one at a time into per-column writers.
+pub struct RowAppendRecordBatchBuilder {
+    /// Arrow schema derived from the row type.
+    table_schema: SchemaRef,
+    /// One writer per column, in field order.
+    column_writers: Vec<ColumnWriter>,
+    /// Number of rows appended so far.
+    records_count: i32,
+}
+
+impl RowAppendRecordBatchBuilder {
+    /// Creates a builder for `row_type`, with one column writer per field
+    /// pre-sized to `INITIAL_ROW_CAPACITY`.
+    ///
+    /// Fails if the Arrow schema cannot be derived or a column writer cannot be created.
+    pub fn new(row_type: &RowType) -> Result<Self> {
+        let table_schema = to_arrow_schema(row_type)?;
+
+        let mut column_writers = Vec::new();
+        for (pos, field) in row_type.fields().iter().enumerate() {
+            let arrow_type = table_schema.field(pos).data_type();
+            let writer =
+                ColumnWriter::create(field.data_type(), arrow_type, pos, INITIAL_ROW_CAPACITY)?;
+            column_writers.push(writer);
+        }
+
+        Ok(Self {
+            table_schema,
+            column_writers,
+            records_count: 0,
+        })
+    }
+
+    /// Appends a row to the builder (forwards to the trait implementation).
+    pub fn append(&mut self, row: &dyn InternalRow) -> Result<bool> {
+        <Self as ArrowRecordBatchInnerBuilder>::append(self, row)
+    }
+
+    /// Builds the final Arrow RecordBatch (forwards to the trait implementation).
+    pub fn build_arrow_record_batch(&mut self) -> Result<Arc<RecordBatch>> {
+        <Self as ArrowRecordBatchInnerBuilder>::build_arrow_record_batch(self)
+    }
+}
+
+impl ArrowRecordBatchInnerBuilder for RowAppendRecordBatchBuilder {
+    /// Finishes every column writer and assembles the arrays into a `RecordBatch`.
+    ///
+    /// Stops at the first column whose array type differs from the schema and
+    /// reports it as `Error::IllegalArgument`.
+    fn build_arrow_record_batch(&mut self) -> Result<Arc<RecordBatch>> {
+        let mut columns: Vec<ArrayRef> = Vec::with_capacity(self.column_writers.len());
+
+        for (idx, writer) in self.column_writers.iter_mut().enumerate() {
+            let array = writer.finish();
+            let expected_type = self.table_schema.field(idx).data_type();
+
+            // The finished array must carry exactly the type the schema declares.
+            if array.data_type() != expected_type {
+                return Err(Error::IllegalArgument {
+                    message: format!(
+                        "Builder type mismatch at column {}: expected {:?}, got {:?}",
+                        idx,
+                        expected_type,
+                        array.data_type()
+                    ),
+                });
+            }
+            columns.push(array);
+        }
+
+        let batch = RecordBatch::try_new(self.table_schema.clone(), columns)?;
+        Ok(Arc::new(batch))
+    }
+
+    /// Writes `row` through every column writer and bumps the record count.
+    fn append(&mut self, row: &dyn InternalRow) -> Result<bool> {
+        for column_writer in self.column_writers.iter_mut() {
+            column_writer.write_field(row)?;
+        }
+        self.records_count += 1;
+        Ok(true)
+    }
+
+    /// Prebuilt batches are not accepted by this builder; always returns `Ok(false)`.
+    fn append_batch(&mut self, _record_batch: Arc<RecordBatch>) -> Result<bool> {
+        Ok(false)
+    }
+
+    fn schema(&self) -> SchemaRef {
+        Arc::clone(&self.table_schema)
+    }
+
+    fn records_count(&self) -> i32 {
+        self.records_count
+    }
+
+    /// Never full on its own: size-based fullness is decided by
+    /// `MemoryLogRecordsArrowBuilder`, which accounts for metadata length and
+    /// compression ratio.
+    fn is_full(&self) -> bool {
+        false
+    }
+
+    /// Uncompressed Arrow IPC body size, obtained by adding up the buffer sizes
+    /// reported by the column writers. Analogous to Java's
+    /// `ArrowUtils.estimateArrowBodyLength()`; the IPC framing overhead is
+    /// accounted for separately by `ipc_overhead`.
+    fn estimated_size_in_bytes(&self) -> usize {
+        self.column_writers
+            .iter()
+            .fold(0usize, |total, writer| total + writer.buffer_size())
+    }
+}
+
+// TODO: Pool and reuse MemoryLogRecordsArrowBuilder instances per table/schema like
+// Java's ArrowWriterPool. Reused writers can seed `estimated_max_records_count` from
+// the previous batch (recordsCount / 2) for a warm start, avoiding the first-record
+// size check on every new batch.
+impl MemoryLogRecordsArrowBuilder {
+    /// Creates a builder for one log batch.
+    ///
+    /// `to_append_record_batch` selects the inner builder: `true` wraps a single
+    /// prebuilt Arrow batch, `false` collects rows one by one. `write_limit` is
+    /// scaled by `BUFFER_USAGE_RATIO` to obtain the effective limit, and the
+    /// compression ratio is read from the estimator once, at creation time.
+    pub fn new(
+        schema_id: i32,
+        row_type: &RowType,
+        to_append_record_batch: bool,
+        arrow_compression_info: ArrowCompressionInfo,
+        write_limit: usize,
+        compression_ratio_estimator: Arc<ArrowCompressionRatioEstimator>,
+    ) -> Result<Self> {
+        let inner_builder: Box<dyn ArrowRecordBatchInnerBuilder> = if to_append_record_batch {
+            Box::new(PrebuiltRecordBatchBuilder::default())
+        } else {
+            Box::new(RowAppendRecordBatchBuilder::new(row_type)?)
+        };
+
+        let arrow_schema = to_arrow_schema(row_type)?;
+        let compression = arrow_compression_info.get_compression_type();
+        let ipc_overhead = estimate_arrow_ipc_overhead(&arrow_schema, compression)?;
+
+        let ratio_snapshot = compression_ratio_estimator.estimation();
+
+        Ok(Self {
+            base_log_offset: BUILDER_DEFAULT_OFFSET,
+            schema_id,
+            magic: CURRENT_LOG_MAGIC_VALUE,
+            writer_id: NO_WRITER_ID,
+            batch_sequence: NO_BATCH_SEQUENCE,
+            arrow_record_batch_builder: inner_builder,
+            is_closed: false,
+            arrow_compression_info,
+            write_limit: (write_limit as f32 * BUFFER_USAGE_RATIO) as usize,
+            ipc_overhead,
+            estimated_max_records_count: Cell::new(-1),
+            compression_ratio_estimator,
+            estimated_compression_ratio: ratio_snapshot,
+        })
+    }
+
+    /// Hands a log record to the inner builder.
+    ///
+    /// Returns whatever the inner builder reports for the row or batch. KV
+    /// records are rejected with `Error::UnsupportedOperation`.
+    pub fn append(&mut self, record: &WriteRecord) -> Result<bool> {
+        // todo: consider write other change type
+        match &record.record() {
+            Record::Log(LogWriteRecord::InternalRow(row)) => {
+                self.arrow_record_batch_builder.append(*row)
+            }
+            Record::Log(LogWriteRecord::RecordBatch(record_batch)) => self
+                .arrow_record_batch_builder
+                .append_batch(record_batch.clone()),
+            Record::Kv(_) => Err(Error::UnsupportedOperation {
+                message: "Only LogRecord is supported to append".to_string(),
+            }),
+        }
+    }
+
+    /// Reports whether the batch should be considered full, based on its
+    /// estimated serialized size.
+    ///
+    /// The byte-size estimate is only computed once the record count has reached
+    /// the cached threshold; each check that does not report "full" moves the
+    /// threshold forward. Matching Java's `ArrowWriter.isFull()`.
+    pub fn is_full(&self) -> bool {
+        let inner = &self.arrow_record_batch_builder;
+
+        // The inner builder may be full on its own (e.g. PrebuiltRecordBatchBuilder
+        // is always full after one batch, regardless of size).
+        if inner.is_full() {
+            return true;
+        }
+
+        let count = inner.records_count();
+        let threshold = self.estimated_max_records_count.get();
+        if count <= 0 || count < threshold {
+            // Nothing appended yet, or the next size check is not due.
+            return false;
+        }
+
+        let compressed_body = self.estimated_compressed_size(inner.estimated_size_in_bytes());
+        if self.ipc_overhead + compressed_body >= self.write_limit {
+            return true;
+        }
+
+        let next_check = if compressed_body == 0 {
+            // No per-record cost to extrapolate from; look again after one more record.
+            count + 1
+        } else {
+            // Matching Java: subtract fixed metadata overhead from the limit,
+            // divide remaining body budget by per-record body cost.
+            let per_record = compressed_body as f64 / count as f64;
+            let body_budget = self.write_limit.saturating_sub(self.ipc_overhead) as f64;
+            let projected = (body_budget / per_record).ceil() as i32;
+            projected.max(count + 1)
+        };
+        self.estimated_max_records_count.set(next_check);
+        false
+    }
+
+    /// Scales an uncompressed body size by the compression ratio captured when the
+    /// builder was created; returns the size unchanged when compression is off.
+    /// Matching Java's `ArrowWriter.estimatedBytesWritten()`.
+    fn estimated_compressed_size(&self, uncompressed_body: usize) -> usize {
+        match self.arrow_compression_info.compression_type {
+            ArrowCompressionType::None => uncompressed_body,
+            _ => {
+                let ratio = self.estimated_compression_ratio as f64;
+                (uncompressed_body as f64 * ratio) as usize
+            }
+        }
+    }
+
+    /// Returns `true` once `close()` has been called on this builder.
+    pub fn is_closed(&self) -> bool {
+        self.is_closed
+    }
+
+    /// Marks the builder as closed. Only the flag is set; no data is touched.
+    pub fn close(&mut self) {
+        self.is_closed = true;
+    }
+
+    /// Serializes the collected records into one log batch.
+    ///
+    /// The result is the fixed-size batch header followed by the Arrow IPC record
+    /// batch message (the IPC schema message written by the stream writer is cut
+    /// off). The CRC32C stored at `CRC_OFFSET` covers every byte from
+    /// `SCHEMA_ID_OFFSET` to the end. When compression is enabled, the observed
+    /// compression ratio is fed back into the shared estimator.
+    ///
+    /// # Errors
+    /// Propagates failures from building or serializing the Arrow batch and from
+    /// writing the header.
+    pub fn build(&mut self) -> Result<Vec<u8>> {
+        // Capture uncompressed body size before serialization for compression ratio update.
+        let uncompressed_body_size = self.arrow_record_batch_builder.estimated_size_in_bytes();
+
+        // serialize arrow batch
+        let mut arrow_batch_bytes = vec![];
+        let table_schema = self.arrow_record_batch_builder.schema();
+        let compression_type = self.arrow_compression_info.get_compression_type();
+        let write_options =
+            IpcWriteOptions::try_with_compression(IpcWriteOptions::default(), compression_type)?;
+        let mut writer = StreamWriter::try_new_with_options(
+            &mut arrow_batch_bytes,
+            &table_schema,
+            write_options,
+        )?;
+
+        // Everything written so far is the schema message; remember where it ends.
+        let header = writer.get_ref().len();
+        let record_batch = self.arrow_record_batch_builder.build_arrow_record_batch()?;
+        writer.write(record_batch.as_ref())?;
+        // get real arrow batch bytes (metadata + body, potentially compressed)
+        let real_arrow_batch_bytes = &arrow_batch_bytes[header..];
+
+        // Update compression ratio estimator with actual ratio.
+        // The serialized bytes include metadata + compressed body. Subtract
+        // metadata to isolate the compressed body for an accurate ratio.
+        if uncompressed_body_size > 0
+            && self.arrow_compression_info.compression_type != ArrowCompressionType::None
+        {
+            let compressed_body_size = real_arrow_batch_bytes
+                .len()
+                .saturating_sub(self.ipc_overhead);
+            let actual_ratio = compressed_body_size as f32 / uncompressed_body_size as f32;
+            self.compression_ratio_estimator
+                .update_estimation(actual_ratio);
+        }
+
+        // now, write batch header and arrow batch
+        let mut batch_bytes = vec![0u8; RECORD_BATCH_HEADER_SIZE + real_arrow_batch_bytes.len()];
+        // write batch header
+        self.write_batch_header(&mut batch_bytes[..])?;
+
+        // write arrow batch bytes
+        let mut cursor = Cursor::new(&mut batch_bytes[..]);
+        cursor.set_position(RECORD_BATCH_HEADER_SIZE as u64);
+        cursor.write_all(real_arrow_batch_bytes)?;
+
+        // The checksum covers everything after the CRC field itself.
+        let crc = crc32c(&cursor.get_ref()[SCHEMA_ID_OFFSET..]);
+        cursor.set_position(CRC_OFFSET as u64);
+        cursor.write_u32::<LittleEndian>(crc)?;
+
+        // Hand the buffer back as-is; copying it again would only cost time and memory.
+        Ok(batch_bytes)
+    }
+
+    /// Writes the fixed-size batch header into the start of `buffer`.
+    ///
+    /// `buffer` must span the whole batch (header plus Arrow payload) because the
+    /// length field is derived from its size. Timestamp and CRC are written as
+    /// zero placeholders.
+    ///
+    /// # Errors
+    /// Returns `Error::IllegalArgument` when the batch length does not fit the
+    /// 32-bit length field or the schema id does not fit the 16-bit schema id
+    /// field; silently truncating either would produce a corrupt header.
+    fn write_batch_header(&self, buffer: &mut [u8]) -> Result<()> {
+        let total_len = buffer.len();
+        let length_field = i32::try_from(total_len - BASE_OFFSET_LENGTH - LENGTH_LENGTH)
+            .map_err(|_| Error::IllegalArgument {
+                message: format!("Batch of {total_len} bytes exceeds the 32-bit length field"),
+            })?;
+        let schema_id = i16::try_from(self.schema_id).map_err(|_| Error::IllegalArgument {
+            message: format!(
+                "Schema id {} does not fit the 16-bit header field",
+                self.schema_id
+            ),
+        })?;
+
+        let mut cursor = Cursor::new(buffer);
+        cursor.write_i64::<LittleEndian>(self.base_log_offset)?;
+        cursor.write_i32::<LittleEndian>(length_field)?;
+        cursor.write_u8(self.magic)?;
+        cursor.write_i64::<LittleEndian>(0)?; // timestamp placeholder
+        cursor.write_u32::<LittleEndian>(0)?; // crc placeholder
+        cursor.write_i16::<LittleEndian>(schema_id)?;
+
+        let record_count = self.arrow_record_batch_builder.records_count();
+        // todo: currently, always is append only
+        let append_only = true;
+        cursor.write_u8(if append_only { 1 } else { 0 })?;
+        // Last offset delta: index of the last record, or 0 for an empty batch.
+        cursor.write_i32::<LittleEndian>(if record_count > 0 {
+            record_count - 1
+        } else {
+            0
+        })?;
+
+        cursor.write_i64::<LittleEndian>(self.writer_id)?;
+        cursor.write_i32::<LittleEndian>(self.batch_sequence)?;
+        cursor.write_i32::<LittleEndian>(record_count)?;
+        Ok(())
+    }
+
+    /// Sets the writer id and batch sequence that will be written into the batch header.
+    pub fn set_writer_state(&mut self, writer_id: i64, batch_base_sequence: i32) {
+        self.writer_id = writer_id;
+        self.batch_sequence = batch_base_sequence;
+    }
+
+    /// Estimated size of the serialized batch in bytes: record batch header,
+    /// Arrow IPC overhead and the estimated (possibly compressed) body.
+    pub fn estimated_size_in_bytes(&self) -> usize {
+        let uncompressed = self.arrow_record_batch_builder.estimated_size_in_bytes();
+        RECORD_BATCH_HEADER_SIZE + self.ipc_overhead + self.estimated_compressed_size(uncompressed)
+    }
+}
+
+/// Estimate the Arrow IPC overhead (metadata + body framing) for a given schema.
+///
+/// Serializes a 1-row RecordBatch with known data sizes, then subtracts the
+/// raw data contribution to isolate the fixed overhead: IPC message header,
+/// RecordBatch flatbuffer, and per-buffer alignment padding within the body.
+/// This overhead is constant for a given schema+compression combination.
+///
+/// The probe batch is built against an all-nullable copy of `schema`, so
+/// schemas containing non-nullable fields can be measured too. Only the record
+/// batch message is measured (the schema message is excluded), so field
+/// nullability does not enter the result.
+///
+/// Note: called once per batch creation. With writer pooling (see the TODO on
+/// `MemoryLogRecordsArrowBuilder`), this would be computed once per pooled
+/// writer and reused across batches.
+/// Analogous to Java's `ArrowUtils.estimateArrowMetadataLength()`.
+fn estimate_arrow_ipc_overhead(
+    schema: &SchemaRef,
+    compression: Option<CompressionType>,
+) -> Result<usize> {
+    use arrow::array::new_null_array;
+
+    // `RecordBatch::try_new` rejects null values in a column whose field is
+    // declared non-nullable, so the all-null probe needs nullable fields.
+    let probe_fields: Vec<arrow::datatypes::Field> = schema
+        .fields()
+        .iter()
+        .map(|field| arrow::datatypes::Field::clone(field).with_nullable(true))
+        .collect();
+    let probe_schema: SchemaRef = Arc::new(arrow::datatypes::Schema::new(probe_fields));
+
+    // Create a 1-row batch of nulls. Such arrays carry no variable-length
+    // payload, only small fixed-size buffers (plus a validity bitmap where the
+    // type has one), so their raw data size can be computed exactly.
+    let null_arrays: Vec<ArrayRef> = probe_schema
+        .fields()
+        .iter()
+        .map(|field| new_null_array(field.data_type(), 1))
+        .collect();
+    let batch = RecordBatch::try_new(probe_schema.clone(), null_arrays)?;
+
+    // Sum the raw buffer sizes — this is what buffer_size() would report.
+    let raw_data: usize = batch
+        .columns()
+        .iter()
+        .map(|col| {
+            col.to_data()
+                .buffers()
+                .iter()
+                .map(|buf| round_up_to_8(buf.len()))
+                .sum::<usize>()
+                // Validity buffer (null bitmap)
+                + col
+                    .nulls()
+                    .map_or(0, |n| round_up_to_8(n.buffer().len()))
+        })
+        .sum();
+
+    // Serialize the batch via IPC and measure total output.
+    let mut buf = vec![];
+    let write_options =
+        IpcWriteOptions::try_with_compression(IpcWriteOptions::default(), compression)?;
+    let mut writer = StreamWriter::try_new_with_options(&mut buf, &probe_schema, write_options)?;
+    // Bytes written so far belong to the schema message and are excluded below.
+    let header_len = writer.get_ref().len();
+    writer.write(&batch)?;
+    let total_len = writer.get_ref().len();
+
+    // IPC overhead = total message size - raw data we put in.
+    let ipc_message_len = total_len - header_len;
+    Ok(ipc_message_len.saturating_sub(raw_data))
+}
+
+/// Implemented by values that can append themselves to an Arrow array builder.
+pub trait ToArrow {
+    /// Appends `self` to `builder`, reporting failure through the returned `Result`.
+    fn append_to(&self, builder: &mut dyn ArrayBuilder) -> Result<()>;
+}
+
+/// Log record source backed by a buffer held entirely in memory.
+/// Used for local tablet server fetches (existing path).
+struct MemorySource {
+    data: Bytes,
+}
+
+impl MemorySource {
+    /// Takes ownership of `data` and wraps it in a `Bytes` buffer.
+    fn new(data: Vec<u8>) -> Self {
+        let data = Bytes::from(data);
+        Self { data }
+    }
+
+    /// Reads the base offset and the validated batch size from the batch header at `pos`.
+    ///
+    /// Fails when fewer than `LOG_OVERHEAD` bytes are available at `pos` or when
+    /// the stored length is rejected by `validate_batch_size`.
+    fn read_batch_header(&mut self, pos: usize) -> Result<(i64, usize)> {
+        let available = self.data.len();
+        if pos + LOG_OVERHEAD > available {
+            let message = format!(
+                "Position {} + LOG_OVERHEAD {} exceeds data size {}",
+                pos, LOG_OVERHEAD, available
+            );
+            return Err(Error::UnexpectedError {
+                message,
+                source: None,
+            });
+        }
+
+        let header = &self.data[pos..];
+        let base_offset = LittleEndian::read_i64(&header[BASE_OFFSET_OFFSET..]);
+        let raw_length = LittleEndian::read_i32(&header[LENGTH_OFFSET..]);
+
+        // Validate batch size to prevent integer overflow and corruption
+        validate_batch_size(raw_length).map(|batch_size| (base_offset, batch_size))
+    }
+
+    /// Returns `size` bytes starting at `pos`, or an error when the range runs
+    /// past the end of the buffer.
+    fn read_batch_data(&mut self, pos: usize, size: usize) -> Result<Bytes> {
+        let end = pos + size;
+        if end <= self.data.len() {
+            // Zero-copy slice (Bytes is Arc-based)
+            return Ok(self.data.slice(pos..end));
+        }
+        Err(Error::UnexpectedError {
+            message: format!(
+                "Read beyond data size: {} + {} > {}",
+                pos,
+                size,
+                self.data.len()
+            ),
+            source: None,
+        })
+    }
+
+    /// Total number of bytes held by this source.
+    fn total_size(&self) -> usize {
+        self.data.len()
+    }
+}
+
+/// Guard that removes the file at `file_path` when it is dropped.
+/// Used to ensure file deletion happens AFTER the file handle is closed.
+struct FileCleanupGuard {
+    file_path: PathBuf,
+}
+
+impl Drop for FileCleanupGuard {
+    /// Best-effort removal: a failure is logged as a warning and otherwise ignored.
+    fn drop(&mut self) {
+        // File handle is already closed (this guard drops after the file field)
+        match std::fs::remove_file(&self.file_path) {
+            Ok(()) => {
+                log::debug!("Deleted remote log file: {}", self.file_path.display());
+            }
+            Err(e) => {
+                log::warn!(
+                    "Failed to delete remote log file {}: {}",
+                    self.file_path.display(),
+                    e
+                );
+            }
+        }
+    }
+}
+
+/// Log record source backed by a file on local disk.
+/// Used for remote log segments downloaded to local disk.
+/// Data is read on demand instead of loading the entire file into memory.
+///
+/// Uses seek + read_exact for cross-platform compatibility.
+/// Access pattern is sequential iteration (single consumer).
+struct FileSource {
+    file: File,
+    file_size: usize,
+    base_offset: usize,
+    _cleanup: Option<FileCleanupGuard>, // Drops AFTER file (field order matters!)
+}
+
+impl FileSource {
+    /// Create a new FileSource.
+    ///
+    /// The file at `file_path` will be deleted when this FileSource is dropped.
+    /// Fails when the file metadata cannot be read or `base_offset` lies beyond
+    /// the end of the file.
+    fn new(file: File, base_offset: usize, file_path: PathBuf) -> Result<Self> {
+        let file_size = file.metadata()?.len() as usize;
+
+        // base_offset must not exceed file_size, otherwise total_size() would underflow.
+        if base_offset <= file_size {
+            let guard = FileCleanupGuard { file_path };
+            Ok(Self {
+                file,
+                file_size,
+                base_offset,
+                _cleanup: Some(guard),
+            })
+        } else {
+            Err(Error::UnexpectedError {
+                message: format!("base_offset ({base_offset}) exceeds file_size ({file_size})"),
+                source: None,
+            })
+        }
+    }
+
+    /// Fills `buf` with the bytes found at absolute file position `pos`,
+    /// using seek + read_exact.
+    fn read_at(&mut self, pos: u64, buf: &mut [u8]) -> Result<()> {
+        self.file.seek(SeekFrom::Start(pos))?;
+        Ok(self.file.read_exact(buf)?)
+    }
+
+    /// Reads the base offset and the validated batch size from the header located
+    /// `pos` bytes after `base_offset`.
+    fn read_batch_header(&mut self, pos: usize) -> Result<(i64, usize)> {
+        let file_pos = self.base_offset + pos;
+        if file_pos + LOG_OVERHEAD > self.file_size {
+            let message = format!(
+                "Position {} exceeds file size {}",
+                file_pos, self.file_size
+            );
+            return Err(Error::UnexpectedError {
+                message,
+                source: None,
+            });
+        }
+
+        // Only the first LOG_OVERHEAD bytes are needed for offset and length.
+        let mut header = vec![0u8; LOG_OVERHEAD];
+        self.read_at(file_pos as u64, &mut header)?;
+
+        let base_offset = LittleEndian::read_i64(&header[BASE_OFFSET_OFFSET..]);
+        let raw_length = LittleEndian::read_i32(&header[LENGTH_OFFSET..]);
+
+        // Validate batch size to prevent integer overflow and corruption
+        validate_batch_size(raw_length).map(|batch_size| (base_offset, batch_size))
+    }
+
+    /// Reads `size` bytes located `pos` bytes after `base_offset` into a fresh buffer.
+    fn read_batch_data(&mut self, pos: usize, size: usize) -> Result<Bytes> {
+        let file_pos = self.base_offset + pos;
+        if file_pos + size > self.file_size {
+            let message = format!(
+                "Read beyond file size: {} + {} > {}",
+                file_pos, size, self.file_size
+            );
+            return Err(Error::UnexpectedError {
+                message,
+                source: None,
+            });
+        }
+
+        let mut payload = vec![0u8; size];
+        self.read_at(file_pos as u64, &mut payload)?;
+        Ok(Bytes::from(payload))
+    }
+
+    /// Number of bytes between `base_offset` and the end of the file.
+    fn total_size(&self) -> usize {
+        self.file_size - self.base_offset
+    }
+}
+
+/// The places log record batches can be read from.
+enum LogRecordsSource {
+    Memory(MemorySource),
+    File(FileSource),
+}
+
+impl LogRecordsSource {
+    /// Forwards to the header reader of the underlying source.
+    fn read_batch_header(&mut self, pos: usize) -> Result<(i64, usize)> {
+        match self {
+            LogRecordsSource::Memory(memory) => memory.read_batch_header(pos),
+            LogRecordsSource::File(file) => file.read_batch_header(pos),
+        }
+    }
+
+    /// Forwards to the data reader of the underlying source.
+    fn read_batch_data(&mut self, pos: usize, size: usize) -> Result<Bytes> {
+        match self {
+            LogRecordsSource::Memory(memory) => memory.read_batch_data(pos, size),
+            LogRecordsSource::File(file) => file.read_batch_data(pos, size),
+        }
+    }
+
+    /// Number of readable bytes reported by the underlying source.
+    fn total_size(&self) -> usize {
+        match self {
+            LogRecordsSource::Memory(memory) => memory.total_size(),
+            LogRecordsSource::File(file) => file.total_size(),
+        }
+    }
+}
+
+/// Iterator state over the log record batches of one source.
+pub struct LogRecordsBatches {
+    /// Where the bytes come from.
+    source: LogRecordsSource,
+    /// Position of the next batch, relative to the start of the source.
+    current_pos: usize,
+    /// Bytes not yet consumed.
+    remaining_bytes: usize,
+}
+
+impl LogRecordsBatches {
+    /// Create from in-memory Vec (existing path - backward compatible).
+    pub fn new(data: Vec<u8>) -> Self {
+        let memory = MemorySource::new(data);
+        let remaining_bytes = memory.total_size();
+        Self {
+            source: LogRecordsSource::Memory(memory),
+            current_pos: 0,
+            remaining_bytes,
+        }
+    }
+
+    /// Create from file.
+    /// Batches are read on demand without loading the entire file into memory.
+    ///
+    /// The file at `file_path` will be deleted when dropped.
+    /// This ensures the file is closed before deletion.
+    pub fn from_file(file: File, base_offset: usize, file_path: PathBuf) -> Result<Self> {
+        let file_source = FileSource::new(file, base_offset, file_path)?;
+        let remaining_bytes = file_source.total_size();
+        Ok(Self {
+            source: LogRecordsSource::File(file_source),
+            current_pos: 0,
+            remaining_bytes,
+        })
+    }
+
+    /// Size of the next batch, or `None` when the remaining bytes hold neither a
+    /// full header nor the complete batch the header announces.
+    fn next_batch_size(&mut self) -> Result<Option<usize>> {
+        if self.remaining_bytes < LOG_OVERHEAD {
+            return Ok(None);
+        }
+
+        // Only the header is read here; the payload is fetched by the caller.
+        let (_base_offset, batch_size) = self.source.read_batch_header(self.current_pos)?;
+        let remaining = self.remaining_bytes;
+        Ok(Some(batch_size).filter(|size| *size <= remaining))
+    }
+}
+
+impl Iterator for LogRecordsBatches {
+    type Item = Result<LogRecordBatch>;
+
+    /// Yields the next complete batch, `None` when no complete batch is left, or
+    /// the error hit while reading.
+    ///
+    /// After an error the iterator is exhausted: the read position cannot move
+    /// past the failing batch, so retrying would only return the same error
+    /// again, and a consumer that skips errors would never terminate.
+    fn next(&mut self) -> Option<Self::Item> {
+        let read = match self.next_batch_size() {
+            // Read full batch data on-demand
+            Ok(Some(batch_size)) => self
+                .source
+                .read_batch_data(self.current_pos, batch_size)
+                .map(|data| (data, batch_size)),
+            Ok(None) => return None,
+            Err(e) => Err(e),
+        };
+
+        match read {
+            Ok((data, batch_size)) => {
+                self.current_pos += batch_size;
+                self.remaining_bytes -= batch_size;
+                Some(Ok(LogRecordBatch::new(data)))
+            }
+            Err(e) => {
+                // Stop the iteration; subsequent calls return `None`.
+                self.remaining_bytes = 0;
+                Some(Err(e))
+            }
+        }
+    }
+}
+
+/// One serialized log record batch: the fixed-size batch header followed by the
+/// Arrow record batch payload starting at `RECORDS_OFFSET`.
+pub struct LogRecordBatch {
+    /// Raw bytes of the batch, beginning with the header.
+    data: Bytes,
+}
+
+#[allow(dead_code)]
+impl LogRecordBatch {
+    /// Wraps the raw bytes of one batch. No validation is performed here.
+    pub fn new(data: Bytes) -> Self {
+        LogRecordBatch { data }
+    }
+
+    /// Magic byte stored in the header.
+    pub fn magic(&self) -> u8 {
+        self.data[MAGIC_OFFSET]
+    }
+
+    /// Commit timestamp stored in the header.
+    pub fn commit_timestamp(&self) -> i64 {
+        let offset = COMMIT_TIMESTAMP_OFFSET;
+        LittleEndian::read_i64(&self.data[offset..offset + COMMIT_TIMESTAMP_LENGTH])
+    }
+
+    /// Writer id stored in the header.
+    pub fn writer_id(&self) -> i64 {
+        let offset = WRITE_CLIENT_ID_OFFSET;
+        LittleEndian::read_i64(&self.data[offset..offset + WRITE_CLIENT_ID_LENGTH])
+    }
+
+    /// Batch sequence stored in the header.
+    pub fn batch_sequence(&self) -> i32 {
+        let offset = BATCH_SEQUENCE_OFFSET;
+        LittleEndian::read_i32(&self.data[offset..offset + BATCH_SEQUENCE_LENGTH])
+    }
+
+    /// Currently a no-op that always succeeds.
+    pub fn ensure_valid(&self) -> Result<()> {
+        // TODO enable validation once checksum handling is corrected.
+        Ok(())
+    }
+
+    /// Returns `true` when the batch holds at least a full header, the declared
+    /// size covers the header, and the stored checksum matches the computed one.
+    ///
+    /// Data shorter than a header is reported as invalid rather than indexed into.
+    pub fn is_valid(&self) -> bool {
+        self.data.len() >= RECORD_BATCH_HEADER_SIZE
+            && self.size_in_bytes() >= RECORD_BATCH_HEADER_SIZE
+            && self.checksum() == self.compute_checksum()
+    }
+
+    /// CRC32C over every byte from `SCHEMA_ID_OFFSET` to the end of the batch.
+    fn compute_checksum(&self) -> u32 {
+        let start = SCHEMA_ID_OFFSET;
+        crc32c(&self.data[start..])
+    }
+
+    /// Attributes byte stored in the header.
+    fn attributes(&self) -> u8 {
+        self.data[ATTRIBUTES_OFFSET]
+    }
+
+    /// Offset following the last record of this batch.
+    pub fn next_log_offset(&self) -> i64 {
+        self.last_log_offset() + 1
+    }
+
+    /// Checksum stored in the header.
+    pub fn checksum(&self) -> u32 {
+        let offset = CRC_OFFSET;
+        LittleEndian::read_u32(&self.data[offset..offset + CRC_LENGTH])
+    }
+
+    /// Schema id stored in the header.
+    pub fn schema_id(&self) -> i16 {
+        let offset = SCHEMA_ID_OFFSET;
+        LittleEndian::read_i16(&self.data[offset..offset + SCHEMA_ID_LENGTH])
+    }
+
+    /// Base log offset stored in the header.
+    pub fn base_log_offset(&self) -> i64 {
+        let offset = BASE_OFFSET_OFFSET;
+        LittleEndian::read_i64(&self.data[offset..offset + BASE_OFFSET_LENGTH])
+    }
+
+    /// Base log offset plus the last offset delta.
+    pub fn last_log_offset(&self) -> i64 {
+        self.base_log_offset() + self.last_offset_delta() as i64
+    }
+
+    /// Last offset delta stored in the header.
+    fn last_offset_delta(&self) -> i32 {
+        let offset = LAST_OFFSET_DELTA_OFFSET;
+        LittleEndian::read_i32(&self.data[offset..offset + LAST_OFFSET_DELTA_LENGTH])
+    }
+
+    /// Size declared by the header's length field plus `LOG_OVERHEAD`.
+    pub fn size_in_bytes(&self) -> usize {
+        let offset = LENGTH_OFFSET;
+        LittleEndian::read_i32(&self.data[offset..offset + LENGTH_LENGTH]) as usize + LOG_OVERHEAD
+    }
+
+    /// Record count stored in the header.
+    pub fn record_count(&self) -> i32 {
+        let offset = RECORDS_COUNT_OFFSET;
+        LittleEndian::read_i32(&self.data[offset..offset + RECORDS_COUNT_LENGTH])
+    }
+
+    /// Bytes following the header, i.e. the Arrow payload.
+    ///
+    /// Returns an error instead of panicking when the batch is shorter than
+    /// `RECORDS_OFFSET`.
+    fn records_data(&self) -> Result<&[u8]> {
+        self.data
+            .get(RECORDS_OFFSET..)
+            .ok_or_else(|| Error::UnexpectedError {
+                message: format!(
+                    "Corrupt log record batch: data length {} is less than RECORDS_OFFSET {}",
+                    self.data.len(),
+                    RECORDS_OFFSET
+                ),
+                source: None,
+            })
+    }
+
+    /// Wraps a decoded Arrow batch in an append-only record iterator that starts
+    /// at this batch's base offset and carries its commit timestamp.
+    fn arrow_iterator(
+        &self,
+        record_batch: RecordBatch,
+        read_context: &ReadContext,
+    ) -> LogRecordIterator {
+        let reader = ArrowReader::new_with_fluss_row_type(
+            Arc::new(record_batch),
+            read_context.row_type.clone(),
+            read_context.fluss_row_type().cloned(),
+        );
+        LogRecordIterator::Arrow(ArrowLogRecordIterator {
+            reader,
+            base_offset: self.base_log_offset(),
+            timestamp: self.commit_timestamp(),
+            row_id: 0,
+            change_type: ChangeType::AppendOnly,
+        })
+    }
+
+    /// Decodes the payload and returns an iterator over its records.
+    ///
+    /// An empty batch yields an empty iterator. A truncated batch or a payload
+    /// that cannot be decoded is reported as an error.
+    pub fn records(&self, read_context: &ReadContext) -> Result<LogRecordIterator> {
+        if self.record_count() == 0 {
+            return Ok(LogRecordIterator::empty());
+        }
+
+        let record_batch = read_context.record_batch(self.records_data()?)?;
+        Ok(self.arrow_iterator(record_batch, read_context))
+    }
+
+    /// Same as `records`, but decodes through
+    /// `ReadContext::record_batch_for_remote_log`; when that returns no batch
+    /// the iterator is empty.
+    pub fn records_for_remote_log(&self, read_context: &ReadContext) -> Result<LogRecordIterator> {
+        if self.record_count() == 0 {
+            return Ok(LogRecordIterator::empty());
+        }
+
+        let decoded = read_context.record_batch_for_remote_log(self.records_data()?)?;
+        Ok(match decoded {
+            Some(record_batch) => self.arrow_iterator(record_batch, read_context),
+            None => LogRecordIterator::empty(),
+        })
+    }
+
+    /// Returns the record batch directly without creating an iterator.
+    /// This is more efficient when you need the entire batch rather than
+    /// iterating row-by-row.
+    pub fn record_batch(&self, read_context: &ReadContext) -> Result<RecordBatch> {
+        if self.record_count() == 0 {
+            // Return empty batch with correct schema
+            return Ok(RecordBatch::new_empty(read_context.target_schema.clone()));
+        }
+
+        read_context.record_batch(self.records_data()?)
+    }
+}
+
+/// Parse an Arrow IPC message from a byte slice.
+///
+/// Server returns RecordBatch message (without Schema message) in the encapsulated message format.
+/// Format: [continuation: 4 bytes (0xFFFFFFFF)][metadata_size: 4 bytes][RecordBatch metadata][body]
+///
+/// This format is documented at:
+/// https://arrow.apache.org/docs/format/Columnar.html#encapsulated-message-format
+///
+/// The body is expected to start at the metadata offset rounded up to the
+/// next multiple of 8 bytes. Every slice taken from `data` is bounds-checked,
+/// so malformed or truncated input is reported as an error rather than a panic.
+///
+/// # Arguments
+/// * `data` - The byte slice containing the IPC message.
+///
+/// # Returns
+/// Returns `Ok((batch_metadata, body_buffer, version))` on success:
+/// - `batch_metadata`: The RecordBatch metadata from the IPC message.
+/// - `body_buffer`: The buffer containing the record batch body data.
+/// - `version`: The Arrow IPC metadata version.
+///
+/// Returns `Err(arrow_error)` on errors
+/// - `arrow_error`: Error details e.g. malformed, too short, bad continuation
+///   marker, or a body offset that lies beyond the end of `data`.
+fn parse_ipc_message(
+    data: &[u8],
+) -> Result<(
+    arrow::ipc::RecordBatch<'_>,
+    Buffer,
+    arrow::ipc::MetadataVersion,
+)> {
+    const CONTINUATION_MARKER: u32 = 0xFFFFFFFF;
+    // Continuation marker (4 bytes) followed by the metadata size (4 bytes).
+    const PREFIX_LEN: usize = 8;
+
+    if data.len() < PREFIX_LEN {
+        return Err(ParseError(format!("Invalid data length: {}", data.len())).into());
+    }
+
+    let continuation = LittleEndian::read_u32(&data[0..4]);
+    let metadata_size = LittleEndian::read_u32(&data[4..8]) as usize;
+
+    if continuation != CONTINUATION_MARKER {
+        return Err(ParseError(format!("Invalid continuation marker: {continuation}")).into());
+    }
+
+    // Compare against the remaining length (instead of `8 + metadata_size`)
+    // so the check cannot overflow on targets with a 32-bit `usize`.
+    let remaining = data.len() - PREFIX_LEN;
+    if remaining < metadata_size {
+        return Err(ParseError(format!(
+            "Invalid data length. Remaining data length {} is shorter than specified size {}",
+            remaining, metadata_size
+        ))
+        .into());
+    }
+
+    let metadata_bytes = &data[PREFIX_LEN..PREFIX_LEN + metadata_size];
+    let message = root_as_message(metadata_bytes).map_err(|err| ParseError(err.to_string()))?;
+    let batch_metadata = message
+        .header_as_record_batch()
+        .ok_or_else(|| ParseError(String::from("Not a record batch")))?;
+
+    // The body begins after the metadata padded to an 8-byte boundary. When
+    // `metadata_size` is not already aligned, the padded offset can lie past
+    // the end of `data`; report that instead of panicking on the slice.
+    let metadata_padded_size = (metadata_size + 7) & !7;
+    let body_start = PREFIX_LEN + metadata_padded_size;
+    let body_data = data.get(body_start..).ok_or_else(|| {
+        ParseError(format!(
+            "Invalid data length. Body offset {body_start} exceeds data length {}",
+            data.len()
+        ))
+    })?;
+    let body_buffer = Buffer::from(body_data);
+
+    Ok((batch_metadata, body_buffer, message.version()))
+}
+
+/// Converts a Fluss row type into an Arrow schema.
+///
+/// Each Fluss field becomes one Arrow field with the same name, the Arrow
+/// type produced by `to_arrow_type`, and the nullability of the Fluss type.
+/// The first field that cannot be converted aborts the conversion with its error.
+pub fn to_arrow_schema(fluss_schema: &RowType) -> Result<SchemaRef> {
+    let mut arrow_fields: Vec<Field> = Vec::new();
+    for column in fluss_schema.fields().iter() {
+        let column_type = column.data_type();
+        let arrow_field = Field::new(
+            column.name(),
+            to_arrow_type(column_type)?,
+            column_type.is_nullable(),
+        );
+        arrow_fields.push(arrow_field);
+    }
+
+    let schema = arrow_schema::Schema::new(arrow_fields);
+    Ok(SchemaRef::new(schema))
+}
+
+/// Maps a Fluss `DataType` to the Arrow data type used to represent it.
+///
+/// Notes on the mapping:
+/// - `Char` and `String` both become `Utf8`.
+/// - `Time` precision 0 / 1-3 / 4-6 / 7-9 selects second / millisecond /
+///   microsecond / nanosecond units (`Time32` for the first two, `Time64`
+///   for the others).
+/// - `Timestamp` and `TimestampLTz` use the same precision buckets and are
+///   both emitted as `Timestamp(unit, None)`.
+/// - `Array`, `Map` and `Row` are converted recursively; nullability of
+///   nested fields is taken from the nested Fluss type.
+///
+/// # Errors
+/// Returns `Error::IllegalArgument` when a decimal precision/scale or a binary
+/// length does not fit Arrow's integer width, or when a time/timestamp
+/// precision is outside 0-9.
+pub fn to_arrow_type(fluss_type: &DataType) -> Result<ArrowDataType> {
+    use arrow_schema::TimeUnit;
+
+    let arrow_type = match fluss_type {
+        // Fixed-width primitives.
+        DataType::Boolean(_) => ArrowDataType::Boolean,
+        DataType::TinyInt(_) => ArrowDataType::Int8,
+        DataType::SmallInt(_) => ArrowDataType::Int16,
+        DataType::Int(_) => ArrowDataType::Int32,
+        DataType::BigInt(_) => ArrowDataType::Int64,
+        DataType::Float(_) => ArrowDataType::Float32,
+        DataType::Double(_) => ArrowDataType::Float64,
+        DataType::Date(_) => ArrowDataType::Date32,
+
+        // Character and byte data.
+        DataType::Char(_) | DataType::String(_) => ArrowDataType::Utf8,
+        DataType::Bytes(_) => ArrowDataType::Binary,
+        DataType::Binary(binary_type) => {
+            let declared_length = binary_type.length();
+            match i32::try_from(declared_length) {
+                Ok(length) => ArrowDataType::FixedSizeBinary(length),
+                Err(_) => {
+                    return Err(Error::IllegalArgument {
+                        message: format!(
+                            "Binary length {} exceeds Arrow's maximum (i32::MAX)",
+                            declared_length
+                        ),
+                    });
+                }
+            }
+        }
+
+        DataType::Decimal(decimal_type) => {
+            let declared_precision = decimal_type.precision();
+            let precision =
+                u8::try_from(declared_precision).map_err(|_| Error::IllegalArgument {
+                    message: format!(
+                        "Decimal precision {} exceeds Arrow's maximum (u8::MAX)",
+                        declared_precision
+                    ),
+                })?;
+            let declared_scale = decimal_type.scale();
+            let scale = i8::try_from(declared_scale).map_err(|_| Error::IllegalArgument {
+                message: format!(
+                    "Decimal scale {} exceeds Arrow's maximum (i8::MAX)",
+                    declared_scale
+                ),
+            })?;
+            ArrowDataType::Decimal128(precision, scale)
+        }
+
+        // Temporal types: the precision picks the Arrow time unit.
+        DataType::Time(time_type) => match time_type.precision() {
+            0 => ArrowDataType::Time32(TimeUnit::Second),
+            1..=3 => ArrowDataType::Time32(TimeUnit::Millisecond),
+            4..=6 => ArrowDataType::Time64(TimeUnit::Microsecond),
+            7..=9 => ArrowDataType::Time64(TimeUnit::Nanosecond),
+            invalid => {
+                return Err(Error::IllegalArgument {
+                    message: format!("Invalid precision {invalid} for TimeType (must be 0-9)"),
+                });
+            }
+        },
+        DataType::Timestamp(timestamp_type) => match timestamp_type.precision() {
+            0 => ArrowDataType::Timestamp(TimeUnit::Second, None),
+            1..=3 => ArrowDataType::Timestamp(TimeUnit::Millisecond, None),
+            4..=6 => ArrowDataType::Timestamp(TimeUnit::Microsecond, None),
+            7..=9 => ArrowDataType::Timestamp(TimeUnit::Nanosecond, None),
+            invalid => {
+                return Err(Error::IllegalArgument {
+                    message: format!("Invalid precision {invalid} for TimestampType (must be 0-9)"),
+                });
+            }
+        },
+        DataType::TimestampLTz(timestamp_ltz_type) => match timestamp_ltz_type.precision() {
+            0 => ArrowDataType::Timestamp(TimeUnit::Second, None),
+            1..=3 => ArrowDataType::Timestamp(TimeUnit::Millisecond, None),
+            4..=6 => ArrowDataType::Timestamp(TimeUnit::Microsecond, None),
+            7..=9 => ArrowDataType::Timestamp(TimeUnit::Nanosecond, None),
+            invalid => {
+                return Err(Error::IllegalArgument {
+                    message: format!(
+                        "Invalid precision {invalid} for TimestampLTzType (must be 0-9)"
+                    ),
+                });
+            }
+        },
+
+        // Nested types are converted recursively.
+        DataType::Array(array_type) => {
+            let element_type = array_type.get_element_type();
+            let item_field =
+                Field::new_list_field(to_arrow_type(element_type)?, element_type.is_nullable());
+            ArrowDataType::List(Arc::new(item_field))
+        }
+        DataType::Map(map_type) => {
+            let fluss_key = map_type.key_type();
+            let fluss_value = map_type.value_type();
+            let arrow_key = to_arrow_type(fluss_key)?;
+            let arrow_value = to_arrow_type(fluss_value)?;
+
+            let entries_struct = ArrowDataType::Struct(arrow_schema::Fields::from(vec![
+                Field::new("key", arrow_key, fluss_key.is_nullable()),
+                Field::new("value", arrow_value, fluss_value.is_nullable()),
+            ]));
+            // The "entries" field itself is declared non-nullable and the
+            // map is declared as not sorted.
+            let entries_field = Field::new("entries", entries_struct, false);
+            ArrowDataType::Map(Arc::new(entries_field), false)
+        }
+        DataType::Row(row_type) => {
+            let mut struct_fields: Vec<Field> = Vec::new();
+            for member in row_type.fields().iter() {
+                let member_type = member.data_type();
+                struct_fields.push(Field::new(
+                    member.name(),
+                    to_arrow_type(member_type)?,
+                    member_type.is_nullable(),
+                ));
+            }
+            ArrowDataType::Struct(arrow_schema::Fields::from(struct_fields))
+        }
+    };
+
+    Ok(arrow_type)
+}
+
+/// Converts an Arrow `Field` to a Fluss `DataType`, carrying over nullability.
+///
+/// Works like `from_arrow_type`, except that a non-nullable Arrow field
+/// yields the non-nullable variant of the converted type. Arrow keeps
+/// nullability on the `Field` wrapper rather than on the data type itself.
+pub(crate) fn from_arrow_field(field: &arrow_schema::Field) -> Result<DataType> {
+    let converted = from_arrow_type(field.data_type())?;
+    let fluss_type = if field.is_nullable() {
+        converted
+    } else {
+        converted.as_non_nullable()
+    };
+    Ok(fluss_type)
+}
+
+/// Converts an Arrow data type back to a Fluss `DataType`.
+/// Used for reading array elements from Arrow ListArray back into Fluss types.
+///
+/// Time and timestamp units map to precisions 0 / 3 / 6 / 9. A timestamp
+/// with a time zone becomes a Fluss local-time-zone timestamp, one without
+/// becomes a plain timestamp. `List`, `Map` and `Struct` are converted
+/// recursively through `from_arrow_field`, so nested nullability is kept.
+///
+/// # Errors
+/// Returns `Error::IllegalArgument` for a negative `FixedSizeBinary` length,
+/// a negative decimal scale, a `Map` whose entries are not a two-field
+/// `Struct`, or any Arrow type without a mapping here.
+pub(crate) fn from_arrow_type(arrow_type: &ArrowDataType) -> Result<DataType> {
+    use crate::metadata::DataTypes;
+    use arrow_schema::TimeUnit;
+
+    let fluss_type = match arrow_type {
+        ArrowDataType::Boolean => DataTypes::boolean(),
+        ArrowDataType::Int8 => DataTypes::tinyint(),
+        ArrowDataType::Int16 => DataTypes::smallint(),
+        ArrowDataType::Int32 => DataTypes::int(),
+        ArrowDataType::Int64 => DataTypes::bigint(),
+        ArrowDataType::Float32 => DataTypes::float(),
+        ArrowDataType::Float64 => DataTypes::double(),
+        ArrowDataType::Utf8 => DataTypes::string(),
+        ArrowDataType::Binary => DataTypes::bytes(),
+        ArrowDataType::Date32 => DataTypes::date(),
+
+        // Arrow stores these as signed integers; reject negative values
+        // before casting to the unsigned Fluss representation.
+        ArrowDataType::FixedSizeBinary(len) if *len < 0 => {
+            return Err(Error::IllegalArgument {
+                message: format!("FixedSizeBinary length must be >= 0, got {len}"),
+            });
+        }
+        ArrowDataType::FixedSizeBinary(len) => DataTypes::binary(*len as usize),
+        ArrowDataType::Decimal128(_, scale) if *scale < 0 => {
+            return Err(Error::IllegalArgument {
+                message: format!("Decimal scale must be >= 0, got {scale}"),
+            });
+        }
+        ArrowDataType::Decimal128(precision, scale) => {
+            DataTypes::decimal(*precision as u32, *scale as u32)
+        }
+
+        // Only these unit/width pairs are mapped; any other pairing falls
+        // through to the catch-all error arm below.
+        ArrowDataType::Time32(TimeUnit::Second) => DataTypes::time_with_precision(0),
+        ArrowDataType::Time32(TimeUnit::Millisecond) => DataTypes::time_with_precision(3),
+        ArrowDataType::Time64(TimeUnit::Microsecond) => DataTypes::time_with_precision(6),
+        ArrowDataType::Time64(TimeUnit::Nanosecond) => DataTypes::time_with_precision(9),
+
+        ArrowDataType::Timestamp(unit, tz) => {
+            let precision = match unit {
+                TimeUnit::Second => 0,
+                TimeUnit::Millisecond => 3,
+                TimeUnit::Microsecond => 6,
+                TimeUnit::Nanosecond => 9,
+            };
+            match tz {
+                Some(_) => DataTypes::timestamp_ltz_with_precision(precision),
+                None => DataTypes::timestamp_with_precision(precision),
+            }
+        }
+
+        ArrowDataType::List(item_field) => DataTypes::array(from_arrow_field(item_field)?),
+        ArrowDataType::Map(entries_field, _sorted) => match entries_field.data_type() {
+            ArrowDataType::Struct(entry_fields) if entry_fields.len() == 2 => DataTypes::map(
+                from_arrow_field(&entry_fields[0])?,
+                from_arrow_field(&entry_fields[1])?,
+            ),
+            ArrowDataType::Struct(entry_fields) => {
+                return Err(Error::IllegalArgument {
+                    message: format!(
+                        "Map entries Struct must have 2 fields (key, value), got {}",
+                        entry_fields.len()
+                    ),
+                });
+            }
+            other => {
+                return Err(Error::IllegalArgument {
+                    message: format!("Map entries must be Struct, got {other:?}"),
+                });
+            }
+        },
+        ArrowDataType::Struct(struct_fields) => {
+            let mut row_fields: Vec<DataField> = Vec::new();
+            for arrow_field in struct_fields.iter() {
+                let member_type = from_arrow_field(arrow_field)?;
+                row_fields.push(DataField::new(arrow_field.name(), member_type, None));
+            }
+            DataTypes::row(row_fields)
+        }
+
+        other => {
+            return Err(Error::IllegalArgument {
+                message: format!("Cannot convert Arrow type to Fluss type: {other:?}"),
+            });
+        }
+    };
+
+    Ok(fluss_type)
+}
+
+/// Everything needed to decode Arrow-encoded record batches: the schemas to
+/// decode with, an optional column projection, and whether the data comes
+/// from the remote read path.
+#[derive(Clone)]
+pub struct ReadContext {
+    /// Schema of the batches returned to callers: the full schema when there
+    /// is no projection, otherwise the projected columns in requested order.
+    target_schema: SchemaRef,
+    /// The unprojected Arrow schema; used to decode batches on the remote path.
+    full_schema: SchemaRef,
+    /// Row type handed to `ArrowReader` when rows are read from a batch.
+    row_type: Arc<RowType>,
+    /// Column projection, or `None` when all columns are read.
+    projection: Option<Projection>,
+    /// When true, batches are decoded with `full_schema` and any projection
+    /// is applied on the client side after decoding.
+    is_from_remote: bool,
+    /// Optional row type set through `with_fluss_row_type` and forwarded to
+    /// `ArrowReader::new_with_fluss_row_type`.
+    fluss_row_type: Option<Arc<RowType>>,
+}
+
+/// Column projection state built by `ReadContext::with_projection_pushdown`.
+#[derive(Clone)]
+struct Projection {
+    /// Full schema projected by `ordered_fields`; used to decode batches on
+    /// the non-remote path.
+    ordered_schema: SchemaRef,
+    /// Column indexes in the order the caller requested them.
+    projected_fields: Vec<usize>,
+    /// `projected_fields` sorted ascending when reordering is needed,
+    /// otherwise a copy of `projected_fields`.
+    ordered_fields: Vec<usize>,
+
+    /// For each requested column, its position within `ordered_fields`.
+    /// Empty when no reordering is needed.
+    reordering_indexes: Vec<usize>,
+    /// True when the requested order differs from ascending index order
+    /// (only ever set for non-remote reads).
+    reordering_needed: bool,
+}
+
+impl ReadContext {
+    /// Creates a context that reads every column of `arrow_schema`.
+    ///
+    /// No projection and no Fluss row type are set; the target schema is the
+    /// same as the full schema.
+    pub fn new(
+        arrow_schema: SchemaRef,
+        row_type: Arc<RowType>,
+        is_from_remote: bool,
+    ) -> ReadContext {
+        let full_schema = arrow_schema;
+        Self {
+            target_schema: full_schema.clone(),
+            full_schema,
+            row_type,
+            projection: None,
+            is_from_remote,
+            fluss_row_type: None,
+        }
+    }
+
+    /// Builder-style setter: returns this context with `fluss_row_type` set
+    /// and every other field unchanged.
+    pub fn with_fluss_row_type(self, fluss_row_type: Arc<RowType>) -> ReadContext {
+        Self {
+            fluss_row_type: Some(fluss_row_type),
+            ..self
+        }
+    }
+
+    /// Returns the Fluss row type set via `with_fluss_row_type`, if any.
+    pub fn fluss_row_type(&self) -> Option<&Arc<RowType>> {
+        Option::as_ref(&self.fluss_row_type)
+    }
+
+    /// Creates a context that returns only `projected_fields`, in the order given.
+    ///
+    /// For non-remote reads the record batch from the server is ordered by
+    /// field position, so when the requested order is not ascending the
+    /// context records how to rearrange the decoded columns. For remote
+    /// reads projection is not pushed down, so no reordering is prepared.
+    ///
+    /// # Errors
+    /// Returns `IllegalArgument` when a projection index is out of bounds for
+    /// `arrow_schema` or a projected schema cannot be built.
+    pub fn with_projection_pushdown(
+        arrow_schema: SchemaRef,
+        row_type: Arc<RowType>,
+        projected_fields: Vec<usize>,
+        is_from_remote: bool,
+    ) -> Result<ReadContext> {
+        Self::validate_projection(&arrow_schema, &projected_fields)?;
+        let target_schema = Self::project_schema(arrow_schema.clone(), &projected_fields)?;
+
+        // Field order used for decoding: ascending for non-remote reads,
+        // untouched for remote reads (no pushdown there, hence no reordering).
+        let mut ordered_fields = projected_fields.clone();
+        if !is_from_remote {
+            ordered_fields.sort_unstable();
+        }
+        let reordering_needed = ordered_fields != projected_fields;
+
+        // For every requested column, find where it sits in the sorted list;
+        // that position is the column index inside the decoded batch.
+        let reordering_indexes = if reordering_needed {
+            projected_fields
+                .iter()
+                .map(|&requested| {
+                    ordered_fields
+                        .binary_search(&requested)
+                        .map_err(|_| IllegalArgument {
+                            message: format!(
+                                "Projection index {requested} is invalid for the current schema."
+                            ),
+                        })
+                })
+                .collect::<Result<Vec<usize>>>()?
+        } else {
+            Vec::new()
+        };
+
+        let ordered_schema = Self::project_schema(arrow_schema.clone(), &ordered_fields)?;
+
+        Ok(ReadContext {
+            target_schema,
+            full_schema: arrow_schema,
+            row_type,
+            projection: Some(Projection {
+                ordered_schema,
+                projected_fields,
+                ordered_fields,
+                reordering_indexes,
+                reordering_needed,
+            }),
+            is_from_remote,
+            fluss_row_type: None,
+        })
+    }
+
+    /// Checks that every projection index refers to an existing field of
+    /// `schema`; the first offending index is reported as `IllegalArgument`.
+    fn validate_projection(schema: &SchemaRef, projected_fields: &[usize]) -> Result<()> {
+        let field_count = schema.fields().len();
+        let first_invalid = projected_fields
+            .iter()
+            .find(|&&index| index >= field_count);
+
+        match first_invalid {
+            Some(index) => Err(IllegalArgument {
+                message: format!(
+                    "Projection index {index} is out of bounds for schema with {field_count} fields."
+                ),
+            }),
+            None => Ok(()),
+        }
+    }
+
+    /// Projects `schema` onto `projected_fields`, keeping the given order.
+    ///
+    /// A failure reported by Arrow is wrapped in `IllegalArgument`.
+    pub fn project_schema(schema: SchemaRef, projected_fields: &[usize]) -> Result<SchemaRef> {
+        match schema.project(projected_fields) {
+            Ok(projected) => Ok(SchemaRef::new(projected)),
+            Err(e) => Err(IllegalArgument {
+                message: format!("Invalid projection: {e}"),
+            }),
+        }
+    }
+
+    /// Returns the projected column indexes in the order they were requested,
+    /// or `None` when no projection is set.
+    pub fn project_fields(&self) -> Option<&[usize]> {
+        let projection = self.projection.as_ref()?;
+        Some(projection.projected_fields.as_slice())
+    }
+
+    /// Returns the projection's `ordered_fields` (sorted ascending when
+    /// reordering is needed), or `None` when no projection is set.
+    pub fn project_fields_in_order(&self) -> Option<&[usize]> {
+        let projection = self.projection.as_ref()?;
+        Some(projection.ordered_fields.as_slice())
+    }
+
+    /// Decodes one encapsulated Arrow IPC record batch from `data` and
+    /// returns it with its columns arranged according to the target schema.
+    ///
+    /// Schema used for decoding:
+    /// - remote reads: the full schema (no projection is applied server-side);
+    /// - non-remote reads with a projection: the projected schema ordered by
+    ///   field position, because the server returns columns in that order;
+    /// - otherwise: the target schema.
+    ///
+    /// Afterwards the columns are picked/reordered when a projection requires it.
+    pub fn record_batch(&self, data: &[u8]) -> Result<RecordBatch> {
+        let (batch_metadata, body_buffer, version) = parse_ipc_message(data)?;
+
+        let decode_schema = match (&self.projection, self.is_from_remote) {
+            (_, true) => self.full_schema.clone(),
+            (Some(projection), false) => projection.ordered_schema.clone(),
+            (None, false) => self.target_schema.clone(),
+        };
+
+        let decoded = read_record_batch(
+            &body_buffer,
+            batch_metadata,
+            decode_schema,
+            &HashMap::new(),
+            None,
+            &version,
+        )?;
+
+        // Indexes into the decoded batch, listed in output order. `None`
+        // means the decoded batch already has the requested layout.
+        let column_order: Option<&Vec<usize>> = match &self.projection {
+            // Remote: select the projected columns out of the full batch.
+            Some(projection) if self.is_from_remote => Some(&projection.projected_fields),
+            // Non-remote: restore the requested order from the sorted order.
+            Some(projection) if projection.reordering_needed => {
+                Some(&projection.reordering_indexes)
+            }
+            _ => None,
+        };
+
+        match column_order {
+            None => Ok(decoded),
+            Some(order) => {
+                let columns = order
+                    .iter()
+                    .map(|&idx| decoded.column(idx).clone())
+                    .collect();
+                Ok(RecordBatch::try_new(self.target_schema.clone(), columns)?)
+            }
+        }
+    }
+
+    /// Decodes one encapsulated Arrow IPC record batch using the full schema
+    /// and, when a projection is set, keeps only the projected columns in
+    /// the requested order.
+    ///
+    /// The result is always wrapped in `Some`.
+    pub fn record_batch_for_remote_log(&self, data: &[u8]) -> Result<Option<RecordBatch>> {
+        let (batch_metadata, body_buffer, version) = parse_ipc_message(data)?;
+
+        let full_batch = read_record_batch(
+            &body_buffer,
+            batch_metadata,
+            self.full_schema.clone(),
+            &HashMap::new(),
+            None,
+            &version,
+        )?;
+
+        let output = if let Some(projection) = self.projection.as_ref() {
+            let mut columns = Vec::with_capacity(projection.projected_fields.len());
+            for &idx in &projection.projected_fields {
+                columns.push(full_batch.column(idx).clone());
+            }
+            RecordBatch::try_new(self.target_schema.clone(), columns)?
+        } else {
+            full_batch
+        };
+
+        Ok(Some(output))
+    }
+}
+
+/// Iterator over the records of one log record batch.
+pub enum LogRecordIterator {
+    /// Yields no records.
+    Empty,
+    /// Yields one `ScanRecord` per row of an Arrow record batch.
+    Arrow(ArrowLogRecordIterator),
+}
+
+impl LogRecordIterator {
+    /// Returns an iterator that yields no records.
+    pub fn empty() -> Self {
+        Self::Empty
+    }
+}
+
+impl Iterator for LogRecordIterator {
+    type Item = ScanRecord;
+
+    /// Delegates to the wrapped Arrow iterator; the `Empty` variant is
+    /// always exhausted.
+    fn next(&mut self) -> Option<Self::Item> {
+        if let LogRecordIterator::Arrow(inner) = self {
+            inner.next()
+        } else {
+            None
+        }
+    }
+}
+
+/// Walks the rows of an Arrow record batch and turns each into a `ScanRecord`.
+pub struct ArrowLogRecordIterator {
+    /// Source of the rows.
+    reader: ArrowReader,
+    /// Offset of the first row; row `i` is reported at `base_offset + i`.
+    base_offset: i64,
+    /// Timestamp attached to every produced record.
+    timestamp: i64,
+    /// Index of the next row to produce.
+    row_id: usize,
+    /// Change type attached to every produced record.
+    change_type: ChangeType,
+}
+
+#[allow(dead_code)]
+impl ArrowLogRecordIterator {
+    /// Creates an iterator positioned at the first row of `reader`.
+    fn new(reader: ArrowReader, base_offset: i64, timestamp: i64, change_type: ChangeType) -> Self {
+        ArrowLogRecordIterator {
+            row_id: 0,
+            reader,
+            base_offset,
+            timestamp,
+            change_type,
+        }
+    }
+}
+
+impl Iterator for ArrowLogRecordIterator {
+    type Item = ScanRecord;
+
+    /// Produces the record for the current row and advances by one row.
+    /// Returns `None` once every row of the batch has been produced.
+    fn next(&mut self) -> Option<Self::Item> {
+        let current = self.row_id;
+        if current < self.reader.row_count() {
+            let record = ScanRecord::new(
+                self.reader.read(current),
+                // Offsets are consecutive, starting at the batch base offset.
+                self.base_offset + current as i64,
+                self.timestamp,
+                self.change_type,
+            );
+            self.row_id = current + 1;
+            Some(record)
+        } else {
+            None
+        }
+    }
+}
+
+/// Gives row-level access to a shared Arrow record batch.
+pub struct ArrowReader {
+    /// The batch rows are read from; shared with every `ColumnarRow` produced.
+    record_batch: Arc<RecordBatch>,
+    /// Row type passed on to each `ColumnarRow`.
+    row_type: Arc<RowType>,
+    /// Optional Fluss row type passed on to each `ColumnarRow`.
+    fluss_row_type: Option<Arc<RowType>>,
+    /// Column indices computed once at construction (from `fluss_row_type`
+    /// when present, otherwise from the record batch) and shared by all rows.
+    row_column_indices: Arc<[usize]>,
+}
+
+impl ArrowReader {
+    /// Creates a reader without a Fluss row type; the row column indices are
+    /// derived from the record batch.
+    pub fn new(record_batch: Arc<RecordBatch>, row_type: Arc<RowType>) -> Self {
+        Self::new_with_fluss_row_type(record_batch, row_type, None)
+    }
+
+    /// Creates a reader, deriving the row column indices from
+    /// `fluss_row_type` when it is given and from the record batch otherwise.
+    pub fn new_with_fluss_row_type(
+        record_batch: Arc<RecordBatch>,
+        row_type: Arc<RowType>,
+        fluss_row_type: Option<Arc<RowType>>,
+    ) -> Self {
+        let row_column_indices = if let Some(rt) = &fluss_row_type {
+            fluss_row_column_indices(rt)
+        } else {
+            arrow_row_column_indices(&record_batch)
+        };
+
+        Self {
+            record_batch,
+            row_type,
+            fluss_row_type,
+            row_column_indices,
+        }
+    }
+
+    /// Number of rows in the underlying record batch.
+    pub fn row_count(&self) -> usize {
+        self.record_batch.num_rows()
+    }
+
+    /// Returns a view of row `row_id`. Only reference-counted handles are
+    /// cloned; the batch data itself is shared.
+    pub fn read(&self, row_id: usize) -> ColumnarRow {
+        let batch = Arc::clone(&self.record_batch);
+        let row_type = Arc::clone(&self.row_type);
+        let fluss_row_type = self.fluss_row_type.clone();
+        let column_indices = Arc::clone(&self.row_column_indices);
+
+        ColumnarRow::with_indices(batch, row_type, row_id, fluss_row_type, column_indices)
+    }
+}
+/// Public tuple struct whose single public field is a `StreamReader<T>`.
+// NOTE(review): nothing in the visible part of this file refers to `MyVec`,
+// and the name does not describe what it wraps; confirm it is intentional.
+pub struct MyVec<T>(pub StreamReader<T>);
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::metadata::{DataField, DataTypes, RowType};
+    use crate::test_utils::build_table_info;
+
+    /// Table-driven check of `to_arrow_type`: each entry pairs a Fluss type
+    /// with the Arrow type it must convert to.
+    #[test]
+    fn test_to_array_type() {
+        use arrow_schema::TimeUnit;
+
+        let cases: Vec<(DataType, ArrowDataType)> = vec![
+            // Primitives and strings.
+            (DataTypes::boolean(), ArrowDataType::Boolean),
+            (DataTypes::tinyint(), ArrowDataType::Int8),
+            (DataTypes::smallint(), ArrowDataType::Int16),
+            (DataTypes::bigint(), ArrowDataType::Int64),
+            (DataTypes::int(), ArrowDataType::Int32),
+            (DataTypes::float(), ArrowDataType::Float32),
+            (DataTypes::double(), ArrowDataType::Float64),
+            (DataTypes::char(16), ArrowDataType::Utf8),
+            (DataTypes::string(), ArrowDataType::Utf8),
+            (DataTypes::decimal(10, 2), ArrowDataType::Decimal128(10, 2)),
+            (DataTypes::date(), ArrowDataType::Date32),
+            // Time: the precision selects the unit.
+            (DataTypes::time(), ArrowDataType::Time32(TimeUnit::Second)),
+            (
+                DataTypes::time_with_precision(3),
+                ArrowDataType::Time32(TimeUnit::Millisecond),
+            ),
+            (
+                DataTypes::time_with_precision(6),
+                ArrowDataType::Time64(TimeUnit::Microsecond),
+            ),
+            (
+                DataTypes::time_with_precision(9),
+                ArrowDataType::Time64(TimeUnit::Nanosecond),
+            ),
+            // Timestamps without time zone.
+            (
+                DataTypes::timestamp_with_precision(0),
+                ArrowDataType::Timestamp(TimeUnit::Second, None),
+            ),
+            (
+                DataTypes::timestamp_with_precision(3),
+                ArrowDataType::Timestamp(TimeUnit::Millisecond, None),
+            ),
+            (
+                DataTypes::timestamp_with_precision(6),
+                ArrowDataType::Timestamp(TimeUnit::Microsecond, None),
+            ),
+            (
+                DataTypes::timestamp_with_precision(9),
+                ArrowDataType::Timestamp(TimeUnit::Nanosecond, None),
+            ),
+            // Local-time-zone timestamps map to the same Arrow types.
+            (
+                DataTypes::timestamp_ltz_with_precision(0),
+                ArrowDataType::Timestamp(TimeUnit::Second, None),
+            ),
+            (
+                DataTypes::timestamp_ltz_with_precision(3),
+                ArrowDataType::Timestamp(TimeUnit::Millisecond, None),
+            ),
+            (
+                DataTypes::timestamp_ltz_with_precision(6),
+                ArrowDataType::Timestamp(TimeUnit::Microsecond, None),
+            ),
+            (
+                DataTypes::timestamp_ltz_with_precision(9),
+                ArrowDataType::Timestamp(TimeUnit::Nanosecond, None),
+            ),
+            // Binary data.
+            (DataTypes::bytes(), ArrowDataType::Binary),
+            (DataTypes::binary(16), ArrowDataType::FixedSizeBinary(16)),
+            // Nested types.
+            (
+                DataTypes::array(DataTypes::int()),
+                ArrowDataType::List(Field::new_list_field(ArrowDataType::Int32, true).into()),
+            ),
+            (
+                DataTypes::map(DataTypes::string(), DataTypes::int()),
+                ArrowDataType::Map(
+                    Arc::new(Field::new(
+                        "entries",
+                        ArrowDataType::Struct(arrow_schema::Fields::from(vec![
+                            Field::new("key", ArrowDataType::Utf8, false),
+                            Field::new("value", ArrowDataType::Int32, true),
+                        ])),
+                        false,
+                    )),
+                    false,
+                ),
+            ),
+            (
+                DataTypes::row(vec![
+                    DataTypes::field("f1", DataTypes::int()),
+                    DataTypes::field("f2", DataTypes::string()),
+                ]),
+                ArrowDataType::Struct(arrow_schema::Fields::from(vec![
+                    Field::new("f1", ArrowDataType::Int32, true),
+                    Field::new("f2", ArrowDataType::Utf8, true),
+                ])),
+            ),
+        ];
+
+        for (fluss_type, expected) in cases {
+            assert_eq!(to_arrow_type(&fluss_type).unwrap(), expected);
+        }
+    }
+
+    /// The "entries" field of a converted map type must be non-nullable.
+    #[test]
+    fn test_arrow_map_schema_strictness() {
+        let fluss_map = DataTypes::map(DataTypes::string(), DataTypes::int());
+
+        match to_arrow_type(&fluss_map).unwrap() {
+            ArrowDataType::Map(entries_field, _) => assert!(
+                !entries_field.is_nullable(),
+                "Arrow Map 'entries' field must be strictly non-nullable"
+            ),
+            other => panic!("Expected ArrowDataType::Map, got {:?}", other),
+        }
+    }
+
+    /// Non-nullable element/key/value fields of Arrow containers must stay
+    /// non-nullable after conversion to Fluss types.
+    #[test]
+    fn test_from_arrow_type_preserves_container_field_nullability() {
+        // List with a non-nullable item field.
+        let item_field = arrow_schema::Field::new("item", ArrowDataType::Int32, false);
+        let converted_list = from_arrow_type(&ArrowDataType::List(Arc::new(item_field))).unwrap();
+        if let DataType::Array(array_type) = &converted_list {
+            assert!(!array_type.get_element_type().is_nullable());
+        } else {
+            panic!("expected Array, got {converted_list:?}");
+        }
+
+        // Map whose key and value fields are both non-nullable.
+        let key_field = arrow_schema::Field::new("key", ArrowDataType::Utf8, false);
+        let value_field = arrow_schema::Field::new("value", ArrowDataType::Int32, false);
+        let entries_field = arrow_schema::Field::new(
+            "entries",
+            ArrowDataType::Struct(arrow_schema::Fields::from(vec![key_field, value_field])),
+            false,
+        );
+        let converted_map =
+            from_arrow_type(&ArrowDataType::Map(Arc::new(entries_field), false)).unwrap();
+        if let DataType::Map(map_type) = &converted_map {
+            assert!(!map_type.key_type().is_nullable());
+            assert!(!map_type.value_type().is_nullable());
+        } else {
+            panic!("expected Map, got {converted_map:?}");
+        }
+    }
+
+    #[test]
+    fn test_parse_ipc_message() {
+        let empty_body: &[u8] = &le_bytes(&[0xFFFFFFFF, 0x00000000]);
+        let result = parse_ipc_message(empty_body);
+        assert_eq!(
+            result.unwrap_err().to_string(),
+            String::from(
+                "Fluss hitting Arrow error Parser error: Range [0, 4) is out of bounds.\n\n: ParseError(\"Range [0, 4) is out of bounds.\\n\\n\")."
+            )
+        );
+
+        let invalid_data = &[];
+        assert_eq!(
+            parse_ipc_message(invalid_data).unwrap_err().to_string(),
+            String::from(
+                "Fluss hitting Arrow error Parser error: Invalid data length: 0: ParseError(\"Invalid data length: 0\")."
+            )
+        );
+
+        let data_with_invalid_continuation: &[u8] = &le_bytes(&[0x00000001, 0x00000000]);
+        assert_eq!(
+            parse_ipc_message(data_with_invalid_continuation)
+                .unwrap_err()
+                .to_string(),
+            String::from(
+                "Fluss hitting Arrow error Parser error: Invalid continuation marker: 1: ParseError(\"Invalid continuation marker: 1\")."
+            )
+        );
+
+        let data_with_invalid_length: &[u8] = &le_bytes(&[0xFFFFFFFF, 0x00000001]);
+        assert_eq!(
+            parse_ipc_message(data_with_invalid_length)
+                .unwrap_err()
+                .to_string(),
+            String::from(
+                "Fluss hitting Arrow error Parser error: Invalid data length. Remaining data length 0 is shorter than specified size 1: ParseError(\"Invalid data length. Remaining data length 0 is shorter than specified size 1\")."
+            )
+        );
+
+        let data_with_invalid_length = &le_bytes(&[0xFFFFFFFF, 0x00000004, 0x00000000]);
+        assert_eq!(
+            parse_ipc_message(data_with_invalid_length)
+                .unwrap_err()
+                .to_string(),
+            String::from(
+                "Fluss hitting Arrow error Parser error: Not a record batch: ParseError(\"Not a record batch\")."
+            )
+        );
+    }
+
+    /// A projection that names a column index outside the schema must yield
+    /// an `IllegalArgument` error.
+    #[test]
+    fn projection_rejects_out_of_bounds_index() {
+        let fields = vec![
+            DataField::new("id", DataTypes::int(), None),
+            DataField::new("name", DataTypes::string(), None),
+        ];
+        let row_type = RowType::new(fields);
+        let two_column_schema = to_arrow_schema(&row_type).unwrap();
+
+        // Index 2 does not exist in a two-column schema.
+        let projected_fields = vec![0, 2];
+        let outcome = ReadContext::with_projection_pushdown(
+            two_column_schema,
+            Arc::new(row_type),
+            projected_fields,
+            false,
+        );
+
+        assert!(matches!(outcome, Err(IllegalArgument { .. })));
+    }
+
+    /// `checksum()` and `schema_id()` must work on a buffer that ends right
+    /// after the schema id field.
+    #[test]
+    fn checksum_and_schema_id_read_minimum_header() {
+        // Header-only batches with record_count == 0 are valid; this covers the minimal bytes
+        // needed for checksum/schema_id access.
+        let header_len = SCHEMA_ID_OFFSET + SCHEMA_ID_LENGTH;
+        let expected_crc = 0xA1B2C3D4u32;
+        let expected_schema_id = 42i16;
+
+        let mut header = vec![0u8; header_len];
+        LittleEndian::write_u32(
+            &mut header[CRC_OFFSET..CRC_OFFSET + CRC_LENGTH],
+            expected_crc,
+        );
+        LittleEndian::write_i16(
+            &mut header[SCHEMA_ID_OFFSET..header_len],
+            expected_schema_id,
+        );
+
+        let batch = LogRecordBatch::new(Bytes::from(header));
+        assert_eq!(batch.checksum(), expected_crc);
+        assert_eq!(batch.schema_id(), expected_schema_id);
+
+        // The recomputed checksum is expected to cover the bytes from the
+        // schema id to the end of the buffer.
+        assert_eq!(
+            batch.compute_checksum(),
+            crc32c(&batch.data[SCHEMA_ID_OFFSET..])
+        );
+    }
+
+    /// Serializes `vals` into one contiguous buffer: each value becomes four
+    /// little-endian bytes, in input order.
+    fn le_bytes(vals: &[u32]) -> Vec<u8> {
+        vals.iter().flat_map(|word| word.to_le_bytes()).collect()
+    }
+
+    /// `ColumnWriter::create` must accept a valid Decimal128 target and reject
+    /// an invalid one. Despite the name, only the decimal path is exercised here.
+    #[test]
+    fn test_temporal_and_decimal_builder_validation() {
+        use crate::row::column_writer::ColumnWriter;
+        use arrow::array::Array;
+
+        let fluss_type = DataTypes::decimal(10, 2);
+
+        // Valid case: precision=10, scale=2 on both sides.
+        let valid_arrow_type = ArrowDataType::Decimal128(10, 2);
+        let mut writer = ColumnWriter::create(&fluss_type, &valid_arrow_type, 0, 256).unwrap();
+        let finished = writer.finish();
+        assert_eq!(finished.data_type(), &valid_arrow_type);
+
+        // Error case: invalid Arrow precision/scale (exceeds Arrow's limit).
+        let oversized_arrow_type = ArrowDataType::Decimal128(100, 50);
+        let outcome = ColumnWriter::create(&fluss_type, &oversized_arrow_type, 0, 256);
+        assert!(outcome.is_err());
+    }
+
+    /// Appending decimals through `RowAppendRecordBatchBuilder` must rescale to
+    /// the column's scale and fail when the value no longer fits the precision.
+    #[test]
+    fn test_decimal_rescaling_and_validation() -> Result<()> {
+        use crate::row::{Datum, Decimal, GenericRow};
+        use arrow::array::Decimal128Array;
+        use bigdecimal::BigDecimal;
+        use std::str::FromStr;
+
+        // Case 1: a scale-3 value written into a decimal(10, 2) column.
+        let rescale_fields = vec![DataField::new("amount", DataTypes::decimal(10, 2), None)];
+        let rescale_row_type = RowType::new(rescale_fields);
+        let mut rescale_builder = RowAppendRecordBatchBuilder::new(&rescale_row_type)?;
+        let scale_three_value =
+            Decimal::from_big_decimal(BigDecimal::from_str("123.456").unwrap(), 10, 3)?;
+        let rescale_row = GenericRow {
+            values: vec![Datum::Decimal(scale_three_value)],
+        };
+        rescale_builder.append(&rescale_row)?;
+        let rescaled_batch = rescale_builder.build_arrow_record_batch()?;
+        let amounts = rescaled_batch
+            .column(0)
+            .as_any()
+            .downcast_ref::<Decimal128Array>()
+            .unwrap();
+        assert_eq!(amounts.value(0), 12346); // 123.456 rounded to 2 decimal places
+        assert_eq!(amounts.scale(), 2);
+
+        // Case 2: the value does not fit a decimal(5, 2) column and must be rejected.
+        let narrow_fields = vec![DataField::new("amount", DataTypes::decimal(5, 2), None)];
+        let narrow_row_type = RowType::new(narrow_fields);
+        let mut narrow_builder = RowAppendRecordBatchBuilder::new(&narrow_row_type)?;
+        let oversized_value =
+            Decimal::from_big_decimal(BigDecimal::from_str("123456.78").unwrap(), 10, 2)?;
+        let oversized_row = GenericRow {
+            values: vec![Datum::Decimal(oversized_value)],
+        };
+        let append_outcome = narrow_builder.append(&oversized_row);
+        assert!(append_outcome.is_err());
+        let message = append_outcome.unwrap_err().to_string();
+        assert!(message.contains("precision overflow"));
+
+        Ok(())
+    }
+
+    // Tests for file-backed streaming
+
+    /// `FileSource` must serve full and partial reads, and must treat its base
+    /// offset as position zero of the readable data.
+    #[test]
+    fn test_file_source_streaming() -> Result<()> {
+        use tempfile::NamedTempFile;
+
+        // Part 1: plain reads from a file opened with base offset 0.
+        let payload: Vec<u8> = (1..=10).collect();
+        let mut plain_file = NamedTempFile::new()?;
+        plain_file.write_all(&payload)?;
+        plain_file.flush()?;
+
+        let plain_path = plain_file.path().to_path_buf();
+        let plain_handle = File::open(&plain_path)?;
+        let mut plain_source = FileSource::new(plain_handle, 0, plain_path)?;
+
+        // Whole payload.
+        let everything = plain_source.read_batch_data(0, 10)?;
+        assert_eq!(everything.to_vec(), payload);
+
+        // Five bytes starting at position 2.
+        let window = plain_source.read_batch_data(2, 5)?;
+        assert_eq!(window.to_vec(), payload[2..7].to_vec());
+
+        // Part 2: base_offset works (critical for remote logs with pos_in_log_segment).
+        let skipped_prefix = vec![0xFF; 100];
+        let visible_data = vec![1, 2, 3, 4, 5];
+        let mut prefixed_file = NamedTempFile::new()?;
+        prefixed_file.write_all(&skipped_prefix)?;
+        prefixed_file.write_all(&visible_data)?;
+        prefixed_file.flush()?;
+
+        let prefixed_path = prefixed_file.path().to_path_buf();
+        let prefixed_handle = File::open(&prefixed_path)?;
+        // Skip the first 100 bytes.
+        let mut prefixed_source = FileSource::new(prefixed_handle, 100, prefixed_path)?;
+
+        // Only the bytes after the offset are counted.
+        assert_eq!(prefixed_source.total_size(), 5);
+        let after_offset = prefixed_source.read_batch_data(0, 5)?;
+        assert_eq!(after_offset.to_vec(), visible_data);
+
+        Ok(())
+    }
+
+    /// Writes one row covering int, decimal, date, time and timestamp columns
+    /// through `RowAppendRecordBatchBuilder` and checks every resulting Arrow value.
+    #[test]
+    fn test_all_types_end_to_end() -> Result<()> {
+        use crate::row::{Date, Datum, Decimal, GenericRow, Time, TimestampLtz, TimestampNtz};
+        use arrow::array::{
+            Date32Array, Decimal128Array, Int32Array, Time32MillisecondArray,
+            Time64NanosecondArray, TimestampMicrosecondArray, TimestampNanosecondArray,
+        };
+        use bigdecimal::BigDecimal;
+        use std::str::FromStr;
+
+        // Downcasts column `$index` of `$batch` to the concrete Arrow array `$array`.
+        macro_rules! column_as {
+            ($batch:ident, $index:expr, $array:ty) => {
+                $batch
+                    .column($index)
+                    .as_any()
+                    .downcast_ref::<$array>()
+                    .unwrap()
+            };
+        }
+
+        // Schema with int, decimal, date, time (ms + ns), timestamps (μs + ns)
+        let fields = vec![
+            DataField::new("id".to_string(), DataTypes::int(), None),
+            DataField::new("amount".to_string(), DataTypes::decimal(10, 2), None),
+            DataField::new("date".to_string(), DataTypes::date(), None),
+            DataField::new(
+                "time_ms".to_string(),
+                DataTypes::time_with_precision(3),
+                None,
+            ),
+            DataField::new(
+                "time_ns".to_string(),
+                DataTypes::time_with_precision(9),
+                None,
+            ),
+            DataField::new(
+                "ts_us".to_string(),
+                DataTypes::timestamp_with_precision(6),
+                None,
+            ),
+            DataField::new(
+                "ts_ltz_ns".to_string(),
+                DataTypes::timestamp_ltz_with_precision(9),
+                None,
+            ),
+        ];
+        let row_type = RowType::new(fields);
+        let mut builder = RowAppendRecordBatchBuilder::new(&row_type)?;
+
+        // Fallible inputs are built first so the row literal below stays flat.
+        let amount = Decimal::from_big_decimal(BigDecimal::from_str("123.456").unwrap(), 10, 3)?;
+        // 1609459200000 ms = 2021-01-01 00:00:00 UTC
+        let epoch_millis_2021 = 1609459200000;
+        // ... with 123456 additional nanoseconds
+        let ts_ntz = TimestampNtz::from_millis_nanos(epoch_millis_2021, 123456)?;
+        // ... with 987654 additional nanoseconds
+        let ts_ltz = TimestampLtz::from_millis_nanos(epoch_millis_2021, 987654)?;
+
+        let values = vec![
+            Datum::Int32(1),
+            Datum::Decimal(amount),
+            // 18000 days since epoch = 2019-04-14
+            Datum::Date(Date::new(18000)),
+            // 43200000 ms = 12:00:00.000 (noon)
+            Datum::Time(Time::new(43200000)),
+            // 12345 ms = 00:00:12.345
+            Datum::Time(Time::new(12345)),
+            Datum::TimestampNtz(ts_ntz),
+            Datum::TimestampLtz(ts_ltz),
+        ];
+        let row = GenericRow { values };
+        builder.append(&row)?;
+
+        let batch = builder.build_arrow_record_batch()?;
+
+        // Verify all conversions, column by column.
+        assert_eq!(column_as!(batch, 0, Int32Array).value(0), 1);
+
+        // 123.456 rounded to 2 decimal places
+        assert_eq!(column_as!(batch, 1, Decimal128Array).value(0), 12346);
+
+        assert_eq!(column_as!(batch, 2, Date32Array).value(0), 18000);
+
+        assert_eq!(
+            column_as!(batch, 3, Time32MillisecondArray).value(0),
+            43200000
+        );
+
+        // 12345 ms expressed in nanoseconds
+        assert_eq!(
+            column_as!(batch, 4, Time64NanosecondArray).value(0),
+            12345000000
+        );
+
+        // Timestamp with sub-millisecond nanos preserved
+        assert_eq!(
+            column_as!(batch, 5, TimestampMicrosecondArray).value(0),
+            1609459200000123
+        );
+
+        assert_eq!(
+            column_as!(batch, 6, TimestampNanosecondArray).value(0),
+            1609459200000987654
+        );
+
+        Ok(())
+    }
+
+    /// Integration test: builds a real two-record log batch in memory, writes it
+    /// to a temporary file and reads it back via `LogRecordsBatches::from_file`.
+    #[test]
+    fn test_log_records_batches_from_file() -> Result<()> {
+        use crate::client::WriteRecord;
+        use crate::compression::{
+            ArrowCompressionInfo, ArrowCompressionType, DEFAULT_NON_ZSTD_COMPRESSION_LEVEL,
+        };
+        use crate::metadata::{PhysicalTablePath, TablePath};
+        use crate::row::GenericRow;
+        use tempfile::NamedTempFile;
+
+        // Table description shared by both records.
+        let fields = vec![
+            DataField::new("id".to_string(), DataTypes::int(), None),
+            DataField::new("name".to_string(), DataTypes::string(), None),
+        ];
+        let row_type = RowType::new(fields);
+        let table_path = TablePath::new("db".to_string(), "tbl".to_string());
+        let table_info = Arc::new(build_table_info(table_path.clone(), 1, 1));
+        let physical_table_path = Arc::new(PhysicalTablePath::of(Arc::new(table_path)));
+
+        // Uncompressed builder.
+        let compression = ArrowCompressionInfo {
+            compression_type: ArrowCompressionType::None,
+            compression_level: DEFAULT_NON_ZSTD_COMPRESSION_LEVEL,
+        };
+        let ratio_estimator = Arc::new(ArrowCompressionRatioEstimator::default());
+        let mut builder = MemoryLogRecordsArrowBuilder::new(
+            1,
+            &row_type,
+            false,
+            compression,
+            usize::MAX,
+            ratio_estimator,
+        )?;
+
+        let mut alice_row = GenericRow::new(2);
+        alice_row.set_field(0, 1_i32);
+        alice_row.set_field(1, "alice");
+        let alice_record = WriteRecord::for_append(
+            Arc::clone(&table_info),
+            Arc::clone(&physical_table_path),
+            1,
+            &alice_row,
+        );
+        builder.append(&alice_record)?;
+
+        let mut bob_row = GenericRow::new(2);
+        bob_row.set_field(0, 2_i32);
+        bob_row.set_field(1, "bob");
+        let bob_record =
+            WriteRecord::for_append(Arc::clone(&table_info), physical_table_path, 2, &bob_row);
+        builder.append(&bob_record)?;
+
+        let serialized = builder.build()?;
+
+        // Persist the serialized batch.
+        let mut tmp_file = NamedTempFile::new()?;
+        tmp_file.write_all(&serialized)?;
+        tmp_file.flush()?;
+
+        // Create file-backed LogRecordsBatches (should stream, not load all into memory)
+        let batch_path = tmp_file.path().to_path_buf();
+        let batch_file = File::open(&batch_path)?;
+        let mut batches = LogRecordsBatches::from_file(batch_file, 0, batch_path)?;
+
+        // Iterate through batches (should work just like in-memory)
+        let first_batch = batches.next().expect("Should have at least one batch")?;
+        assert!(first_batch.size_in_bytes() > 0);
+        assert_eq!(first_batch.record_count(), 2);
+
+        Ok(())
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/record/error.rs b/fluss-rust/crates/fluss/src/record/error.rs
new file mode 100644
index 0000000000..22704a0cdf
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/record/error.rs
@@ -0,0 +1,27 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::io;
+use thiserror::Error;
+
+/// Error type declared for the `record` module.
+///
+/// Marked `#[non_exhaustive]`, so code outside this crate must keep a wildcard
+/// arm when matching on it; variants can then be added without breaking it.
+#[derive(Error, Debug)]
+#[non_exhaustive]
+// NOTE(review): `dead_code` is presumably allowed because nothing constructs
+// this type yet — confirm, and drop the allow once it is used.
+#[allow(dead_code)]
+pub enum Error {
+    /// An underlying I/O failure. `transparent` forwards `Display` and
+    /// `source()` to the wrapped error; `#[from]` generates
+    /// `From<io::Error>` so `?` converts automatically.
+    #[error(transparent)]
+    Io(#[from] io::Error),
+}
diff --git a/fluss-rust/crates/fluss/src/record/kv/kv_record.rs b/fluss-rust/crates/fluss/src/record/kv/kv_record.rs
new file mode 100644
index 0000000000..ed67aa0e24
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/record/kv/kv_record.rs
@@ -0,0 +1,351 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Key-Value record implementation.
+//!
+//! This module provides the KvRecord struct which represents an immutable key-value record.
+//! The record format is:
+//! - Length => Int32
+//! - KeyLength => Unsigned VarInt
+//! - Key => bytes
+//! - Row => BinaryRow (optional, if null then this is a deletion record)
+
+use bytes::{BufMut, Bytes, BytesMut};
+use std::io;
+
+use crate::row::RowDecoder;
+use crate::row::compacted::CompactedRow;
+use crate::util::varint::{
+    read_unsigned_varint_bytes, size_of_unsigned_varint, write_unsigned_varint_buf,
+};
+
+/// Length field size in bytes (the record starts with a little-endian `Int32`
+/// length prefix).
+pub const LENGTH_LENGTH: usize = 4;
+
+/// A key-value record containing raw key and value bytes.
+///
+/// The schema is:
+/// - Length => Int32
+/// - KeyLength => Unsigned VarInt
+/// - Key => bytes
+/// - Value => bytes (BinaryRow, written directly without length prefix)
+///
+/// When the value is None (deletion), no Value bytes are present.
+///
+/// This struct stores only raw bytes. To decode the value into a typed row,
+/// use the `row()` method with a RowDecoder (typically obtained from the iterator).
+///
+/// Reference implementation:
+/// <https://github.com/apache/fluss/blob/main/fluss-common/src/main/java/org/apache/fluss/record/KvRecord.java>
+#[derive(Debug, Clone)]
+pub struct KvRecord {
+    // Raw key bytes, sliced out of the buffer the record was read from.
+    key: Bytes,
+    // Raw value bytes; `None` marks a deletion record.
+    value_bytes: Option<Bytes>,
+    // Total serialized size of the record, including the length prefix.
+    size_in_bytes: usize,
+}
+
+impl KvRecord {
+    /// Get the key bytes.
+    pub fn key(&self) -> &Bytes {
+        &self.key
+    }
+
+    /// Get the raw value bytes (for testing); `None` for a deletion record.
+    #[cfg(test)]
+    pub(crate) fn value_bytes(&self) -> Option<&Bytes> {
+        self.value_bytes.as_ref()
+    }
+
+    /// Decode the value bytes into a typed row using the provided decoder.
+    /// This creates a lightweight CompactedRow view over the raw bytes.
+    /// Actual field parsing is lazy (on first access).
+    ///
+    /// Returns `None` for a deletion record. The returned row's lifetime is
+    /// tied to `&'a self`.
+    pub fn row<'a>(&'a self, decoder: &dyn RowDecoder) -> Option<CompactedRow<'a>> {
+        self.value_bytes
+            .as_ref()
+            .map(|bytes| decoder.decode(bytes.as_ref()))
+    }
+
+    /// Calculate the total size of the record when serialized (including length prefix).
+    ///
+    /// The result saturates at `usize::MAX` instead of overflowing.
+    pub fn size_of(key: &[u8], value: Option<&[u8]>) -> usize {
+        // `size_without_length` saturates, so adding the prefix must saturate
+        // too; a plain `+` could overflow on a saturated value.
+        Self::size_without_length(key, value).saturating_add(LENGTH_LENGTH)
+    }
+
+    /// Calculate the size without the length prefix:
+    /// varint(key length) + key bytes + value bytes (none for a deletion).
+    fn size_without_length(key: &[u8], value: Option<&[u8]>) -> usize {
+        let key_len = key.len();
+        // Saturate rather than truncate: a key longer than `u32::MAX` must not
+        // be measured as if it were a short key.
+        let key_len_size = size_of_unsigned_varint(u32::try_from(key_len).unwrap_or(u32::MAX));
+        // Deletion: no value bytes
+        let value_len = value.map_or(0, |v| v.len());
+
+        key_len_size
+            .saturating_add(key_len)
+            .saturating_add(value_len)
+    }
+
+    /// Write a KV record to a buffer.
+    ///
+    /// Returns the number of bytes written (including the length prefix).
+    ///
+    /// Note: `Some(&[])` writes no value bytes, so it is indistinguishable from
+    /// `None` on the wire and `read_from` reports it as a deletion record.
+    ///
+    /// # Errors
+    /// Returns `InvalidInput` if the record size (without the length prefix)
+    /// does not fit in an `i32`. Nothing is written to `buf` in that case.
+    pub fn write_to_buf(buf: &mut BytesMut, key: &[u8], value: Option<&[u8]>) -> io::Result<usize> {
+        let size_in_bytes = Self::size_without_length(key, value);
+
+        // The length prefix is an Int32; validate before touching `buf`.
+        let size_i32 = i32::try_from(size_in_bytes).map_err(|_| {
+            io::Error::new(
+                io::ErrorKind::InvalidInput,
+                format!("Record size {size_in_bytes} exceeds i32::MAX"),
+            )
+        })?;
+        buf.put_i32_le(size_i32);
+
+        // Lossless cast: the size check above bounds `key.len()` by `i32::MAX`.
+        let key_len = key.len() as u32;
+        write_unsigned_varint_buf(key_len, buf);
+
+        buf.put_slice(key);
+
+        if let Some(v) = value {
+            buf.put_slice(v);
+        }
+        // For None (deletion), don't write any value bytes
+
+        Ok(size_in_bytes + LENGTH_LENGTH)
+    }
+
+    /// Read a KV record from bytes at the given position.
+    ///
+    /// Returns the KvRecord and the number of bytes consumed.
+    /// The record contains only raw bytes; use `row()` with a RowDecoder to decode the value.
+    ///
+    /// # Errors
+    /// - `UnexpectedEof` if fewer bytes are available than the length prefix
+    ///   or the declared record size requires.
+    /// - `InvalidData` if the declared length is negative or the key length
+    ///   does not fit inside the record.
+    /// - Any error returned by `read_unsigned_varint_bytes` for the key length.
+    pub fn read_from(bytes: &Bytes, position: usize) -> io::Result<(Self, usize)> {
+        if bytes.len() < position.saturating_add(LENGTH_LENGTH) {
+            return Err(io::Error::new(
+                io::ErrorKind::UnexpectedEof,
+                "Not enough bytes to read record length",
+            ));
+        }
+
+        let size_in_bytes_i32 = i32::from_le_bytes([
+            bytes[position],
+            bytes[position + 1],
+            bytes[position + 2],
+            bytes[position + 3],
+        ]);
+
+        if size_in_bytes_i32 < 0 {
+            return Err(io::Error::new(
+                io::ErrorKind::InvalidData,
+                format!("Invalid record length: {size_in_bytes_i32}"),
+            ));
+        }
+
+        let size_in_bytes = size_in_bytes_i32 as usize;
+
+        let total_size = size_in_bytes.checked_add(LENGTH_LENGTH).ok_or_else(|| {
+            io::Error::new(
+                io::ErrorKind::InvalidData,
+                format!("Record size overflow: {size_in_bytes} + {LENGTH_LENGTH}"),
+            )
+        })?;
+
+        let available = bytes.len().saturating_sub(position);
+        if available < total_size {
+            return Err(io::Error::new(
+                io::ErrorKind::UnexpectedEof,
+                format!(
+                    "Not enough bytes to read record: expected {total_size}, available {available}"
+                ),
+            ));
+        }
+
+        // Cannot overflow: `position + total_size <= bytes.len()` was just checked.
+        let mut current_offset = position + LENGTH_LENGTH;
+        let record_end = position + total_size;
+
+        // Read key length as unsigned varint (bounded by record end)
+        let (key_len, varint_size) =
+            read_unsigned_varint_bytes(&bytes[current_offset..record_end])?;
+        current_offset += varint_size;
+
+        // The key length comes from the wire, so compute the key end with
+        // checked arithmetic: a wrapped sum would pass the bound check and make
+        // `Bytes::slice` panic instead of returning an error.
+        let key_end = usize::try_from(key_len)
+            .ok()
+            .and_then(|len| current_offset.checked_add(len))
+            .filter(|&end| end <= record_end)
+            .ok_or_else(|| {
+                io::Error::new(
+                    io::ErrorKind::InvalidData,
+                    "Key length exceeds record size",
+                )
+            })?;
+        let key = bytes.slice(current_offset..key_end);
+        current_offset = key_end;
+
+        // Read value bytes directly (don't decode yet - will decode on-demand)
+        let value_bytes = if current_offset < record_end {
+            // Value is present: all remaining bytes are the value
+            Some(bytes.slice(current_offset..record_end))
+        } else {
+            // No remaining bytes: this is a deletion record
+            None
+        };
+
+        Ok((
+            Self {
+                key,
+                value_bytes,
+                size_in_bytes: total_size,
+            },
+            total_size,
+        ))
+    }
+
+    /// Get the total size in bytes of this record (including the length prefix).
+    pub fn get_size_in_bytes(&self) -> usize {
+        self.size_in_bytes
+    }
+
+    /// Check if this is a deletion record (no value).
+    pub fn is_deletion(&self) -> bool {
+        self.value_bytes.is_none()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Size calculation plus write/read round trips for a regular record and
+    /// for a deletion record.
+    #[test]
+    fn test_kv_record_basic_operations() {
+        let key: &[u8] = b"test_key";
+        let value: &[u8] = b"test_value";
+        let key_len_size = size_of_unsigned_varint(key.len() as u32);
+
+        // Size calculation with value
+        assert_eq!(
+            KvRecord::size_of(key, Some(value)),
+            LENGTH_LENGTH + key_len_size + key.len() + value.len()
+        );
+
+        // Size calculation without value (deletion)
+        assert_eq!(
+            KvRecord::size_of(key, None),
+            LENGTH_LENGTH + key_len_size + key.len()
+        );
+
+        // Write/read round trip with value
+        let mut upsert_buf = BytesMut::new();
+        let upsert_written = KvRecord::write_to_buf(&mut upsert_buf, key, Some(value)).unwrap();
+        let upsert_bytes = upsert_buf.freeze();
+        let (upsert, upsert_read) = KvRecord::read_from(&upsert_bytes, 0).unwrap();
+
+        assert_eq!(upsert_written, upsert_read);
+        assert_eq!(upsert.key().as_ref(), key);
+        assert_eq!(upsert.value_bytes().unwrap().as_ref(), value);
+        assert_eq!(upsert.get_size_in_bytes(), upsert_written);
+        assert!(!upsert.is_deletion());
+
+        // Deletion record (no value)
+        let delete_key: &[u8] = b"delete_me";
+        let mut delete_buf = BytesMut::new();
+        let delete_written = KvRecord::write_to_buf(&mut delete_buf, delete_key, None).unwrap();
+        let delete_bytes = delete_buf.freeze();
+        let (deletion, delete_read) = KvRecord::read_from(&delete_bytes, 0).unwrap();
+
+        assert_eq!(delete_written, delete_read);
+        assert_eq!(deletion.key().as_ref(), delete_key);
+        assert!(deletion.is_deletion());
+        assert!(deletion.value_bytes().is_none());
+    }
+
+    /// Several records written back to back must be readable in sequence, and
+    /// large keys/values must round-trip.
+    #[test]
+    fn test_kv_record_multiple_records() {
+        // Multiple regular-sized records in one buffer
+        let records: [(&[u8], Option<&[u8]>); 3] = [
+            (&b"key1"[..], Some(&b"value1"[..])),
+            (&b"key2"[..], None), // Deletion
+            (&b"key3"[..], Some(&b"value3"[..])),
+        ];
+
+        let mut buf = BytesMut::new();
+        for &(key, value) in &records {
+            KvRecord::write_to_buf(&mut buf, key, value).unwrap();
+        }
+
+        let bytes = buf.freeze();
+        let mut offset = 0;
+        for &(expected_key, expected_value) in &records {
+            let (record, size) = KvRecord::read_from(&bytes, offset).unwrap();
+            assert_eq!(record.key().as_ref(), expected_key);
+            // A record is a deletion exactly when no value was written for it.
+            assert_eq!(record.is_deletion(), expected_value.is_none());
+            assert_eq!(record.value_bytes().map(|v| &v[..]), expected_value);
+            offset += size;
+        }
+        assert_eq!(offset, bytes.len());
+
+        // Large keys and values
+        let large_key = vec![0u8; 1024];
+        let large_value = vec![1u8; 4096];
+
+        let mut large_buf = BytesMut::new();
+        let written =
+            KvRecord::write_to_buf(&mut large_buf, &large_key, Some(&large_value)).unwrap();
+
+        let large_bytes = large_buf.freeze();
+        let (large_record, read_size) = KvRecord::read_from(&large_bytes, 0).unwrap();
+
+        assert_eq!(written, read_size);
+        assert_eq!(large_record.key().len(), large_key.len());
+        assert_eq!(large_record.value_bytes().unwrap().len(), large_value.len());
+    }
+
+    /// Corrupt length prefixes must be rejected with an error.
+    #[test]
+    fn test_invalid_record_lengths() {
+        // Negative length
+        let mut negative = BytesMut::new();
+        negative.put_i32_le(-1);
+        negative.put_u8(1); // Some dummy data
+        negative.put_slice(b"key");
+        let negative_kind = KvRecord::read_from(&negative.freeze(), 0)
+            .err()
+            .map(|e| e.kind());
+        assert_eq!(negative_kind, Some(io::ErrorKind::InvalidData));
+
+        // Overflow length
+        let mut huge = BytesMut::new();
+        huge.put_i32_le(i32::MAX); // Very large length
+        huge.put_u8(1); // Some dummy data
+        assert!(KvRecord::read_from(&huge.freeze(), 0).is_err());
+
+        // Impossibly large but non-negative length
+        let mut truncated = BytesMut::new();
+        truncated.put_i32_le(1_000_000);
+        let truncated_kind = KvRecord::read_from(&truncated.freeze(), 0)
+            .err()
+            .map(|e| e.kind());
+        assert_eq!(truncated_kind, Some(io::ErrorKind::UnexpectedEof));
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/record/kv/kv_record_batch.rs b/fluss-rust/crates/fluss/src/record/kv/kv_record_batch.rs
new file mode 100644
index 0000000000..14ff2e91b4
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/record/kv/kv_record_batch.rs
@@ -0,0 +1,456 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! KV record batch implementation.
+//!
+//! The schema of a KvRecordBatch is:
+//! - Length => Int32
+//! - Magic => Int8
+//! - CRC => Uint32
+//! - SchemaId => Int16
+//! - Attributes => Int8
+//! - WriterId => Int64
+//! - BatchSequence => Int32
+//! - RecordCount => Int32
+//! - Records => [Record]
+//!
+//! The CRC covers data from the SchemaId to the end of the batch.
+
+use bytes::Bytes;
+use std::io;
+use std::sync::Arc;
+
+use crate::error::Result;
+use crate::record::kv::{KvRecord, ReadContext};
+use crate::row::RowDecoder;
+
+// Sizes, in bytes, of the fixed-width header fields (in wire order).
+pub const LENGTH_LENGTH: usize = 4;
+pub const MAGIC_LENGTH: usize = 1;
+pub const CRC_LENGTH: usize = 4;
+pub const SCHEMA_ID_LENGTH: usize = 2;
+pub const ATTRIBUTE_LENGTH: usize = 1;
+pub const WRITE_CLIENT_ID_LENGTH: usize = 8;
+pub const BATCH_SEQUENCE_LENGTH: usize = 4;
+pub const RECORDS_COUNT_LENGTH: usize = 4;
+
+// Byte offsets of each header field, relative to the start of the batch.
+pub const LENGTH_OFFSET: usize = 0;
+pub const MAGIC_OFFSET: usize = LENGTH_OFFSET + LENGTH_LENGTH;
+pub const CRC_OFFSET: usize = MAGIC_OFFSET + MAGIC_LENGTH;
+pub const SCHEMA_ID_OFFSET: usize = CRC_OFFSET + CRC_LENGTH;
+pub const ATTRIBUTES_OFFSET: usize = SCHEMA_ID_OFFSET + SCHEMA_ID_LENGTH;
+pub const WRITE_CLIENT_ID_OFFSET: usize = ATTRIBUTES_OFFSET + ATTRIBUTE_LENGTH;
+pub const BATCH_SEQUENCE_OFFSET: usize = WRITE_CLIENT_ID_OFFSET + WRITE_CLIENT_ID_LENGTH;
+pub const RECORDS_COUNT_OFFSET: usize = BATCH_SEQUENCE_OFFSET + BATCH_SEQUENCE_LENGTH;
+pub const RECORDS_OFFSET: usize = RECORDS_COUNT_OFFSET + RECORDS_COUNT_LENGTH;
+
+/// Total size of the fixed batch header; the first record starts at this offset.
+pub const RECORD_BATCH_HEADER_SIZE: usize = RECORDS_OFFSET;
+
+/// Bytes of a batch not counted by its `Length` field, i.e. the `Length` field itself.
+pub const KV_OVERHEAD: usize = LENGTH_OFFSET + LENGTH_LENGTH;
+
+/// Read-only view over one serialized KV record batch inside a shared buffer.
+///
+/// Header fields are decoded on demand from `data`, starting at byte `position`.
+// Reference implementation:
+// https://github.com/apache/fluss/blob/main/fluss-common/src/main/java/org/apache/fluss/record/KvRecordBatch.java
+pub struct KvRecordBatch {
+    data: Bytes, // buffer that contains the batch
+    position: usize, // offset in `data` of the batch's Length field
+}
+
+impl KvRecordBatch {
+    /// Wrap `data` as a batch whose first byte is at `position`; nothing is validated here.
+    pub fn new(data: Bytes, position: usize) -> Self {
+        KvRecordBatch { data, position }
+    }
+
+    /// Total size of this batch in bytes: the stored `Length` value plus `KV_OVERHEAD`.
+    ///
+    /// # Errors
+    /// * `UnexpectedEof` if fewer than `LENGTH_LENGTH` bytes are available at `position`.
+    /// * `InvalidData` if the stored length is negative.
+    pub fn size_in_bytes(&self) -> io::Result<usize> {
+        let field_end = self.position.saturating_add(LENGTH_LENGTH);
+        if field_end > self.data.len() {
+            return Err(io::Error::new(
+                io::ErrorKind::UnexpectedEof,
+                "Not enough bytes to read batch length",
+            ));
+        }
+
+        // The length prefix is a little-endian i32 at the very start of the batch.
+        let mut raw = [0u8; LENGTH_LENGTH];
+        raw.copy_from_slice(&self.data[self.position..field_end]);
+        let stored = i32::from_le_bytes(raw);
+        if stored < 0 {
+            return Err(io::Error::new(
+                io::ErrorKind::InvalidData,
+                format!("Invalid batch length: {stored}"),
+            ));
+        }
+        Ok((stored as usize).saturating_add(KV_OVERHEAD))
+    }
+
+    /// Whether the batch is structurally sound: its declared size covers the header and
+    /// the stored CRC equals the CRC computed over the batch body; false on any error.
+    pub fn is_valid(&self) -> bool {
+        let Ok(declared_size) = self.size_in_bytes() else {
+            return false;
+        };
+        if declared_size < RECORD_BATCH_HEADER_SIZE {
+            return false;
+        }
+        matches!((self.checksum(), self.compute_checksum()), (Ok(a), Ok(b)) if a == b)
+    }
+
+    /// Magic byte of the batch, stored at `MAGIC_OFFSET`.
+    /// Fails with `UnexpectedEof` when the buffer ends before that byte.
+    pub fn magic(&self) -> io::Result<u8> {
+        let index = self.position.saturating_add(MAGIC_OFFSET);
+        if let Some(byte) = self.data.get(index) {
+            return Ok(*byte);
+        }
+        let reason = "Not enough bytes to read magic byte";
+        Err(io::Error::new(io::ErrorKind::UnexpectedEof, reason))
+    }
+
+    /// Stored CRC of the batch: a little-endian `u32` read from `CRC_OFFSET`.
+    /// Fails with `UnexpectedEof` when the buffer ends before the whole field.
+    pub fn checksum(&self) -> io::Result<u32> {
+        let from = self.position.saturating_add(CRC_OFFSET);
+        let to = from.saturating_add(CRC_LENGTH);
+        if to > self.data.len() {
+            return Err(io::Error::new(
+                io::ErrorKind::UnexpectedEof,
+                "Not enough bytes to read checksum",
+            ));
+        }
+        let mut field = [0u8; CRC_LENGTH];
+        field.copy_from_slice(&self.data[from..to]);
+        Ok(u32::from_le_bytes(field))
+    }
+
+    /// CRC32C over the batch body, from `SCHEMA_ID_OFFSET` to the declared end of the batch.
+    /// Fails if the size cannot be read, with `InvalidData` if it is smaller than the header,
+    /// and with `UnexpectedEof` if the buffer does not hold the whole declared batch.
+    pub fn compute_checksum(&self) -> io::Result<u32> {
+        let declared = self.size_in_bytes()?;
+        if declared < RECORD_BATCH_HEADER_SIZE {
+            let detail = format!(
+                "Batch size {declared} is less than header size {RECORD_BATCH_HEADER_SIZE}"
+            );
+            return Err(io::Error::new(io::ErrorKind::InvalidData, detail));
+        }
+
+        let body_start = self.position.saturating_add(SCHEMA_ID_OFFSET);
+        let body_end = self.position.saturating_add(declared);
+        match self.data.get(body_start..body_end) {
+            Some(body) if !body.is_empty() => Ok(crc32c::crc32c(body)),
+            _ => Err(io::Error::new(
+                io::ErrorKind::UnexpectedEof,
+                "Not enough bytes to compute checksum",
+            )),
+        }
+    }
+
+    /// Schema ID of the batch: a little-endian `i16` read from `SCHEMA_ID_OFFSET`.
+    ///
+    /// Fails with `UnexpectedEof` when the buffer ends before the whole field.
+    pub fn schema_id(&self) -> io::Result<i16> {
+        let from = self.position.saturating_add(SCHEMA_ID_OFFSET);
+        let to = from.saturating_add(SCHEMA_ID_LENGTH);
+        if to > self.data.len() {
+            return Err(io::Error::new(
+                io::ErrorKind::UnexpectedEof,
+                "Not enough bytes to read schema ID",
+            ));
+        }
+
+        // Bounds were checked above, so this slice is exactly SCHEMA_ID_LENGTH bytes.
+        let mut field = [0u8; SCHEMA_ID_LENGTH];
+        field.copy_from_slice(&self.data[from..to]);
+        Ok(i16::from_le_bytes(field))
+    }
+
+    /// Writer ID of the batch: a little-endian `i64` read from `WRITE_CLIENT_ID_OFFSET`.
+    ///
+    /// # Errors
+    /// Returns `UnexpectedEof` when the buffer ends before the whole field, i.e. when
+    /// fewer than `WRITE_CLIENT_ID_OFFSET + WRITE_CLIENT_ID_LENGTH` bytes follow
+    /// `position`.
+    pub fn writer_id(&self) -> io::Result<i64> {
+        let from = self.position.saturating_add(WRITE_CLIENT_ID_OFFSET);
+        let to = from.saturating_add(WRITE_CLIENT_ID_LENGTH);
+        if to > self.data.len() {
+            return Err(io::Error::new(
+                io::ErrorKind::UnexpectedEof,
+                "Not enough bytes to read writer ID",
+            ));
+        }
+
+        // Bounds were checked above, so `from..to` is in range and exactly
+        // WRITE_CLIENT_ID_LENGTH bytes long; copy it into a fixed-size array
+        // because `from_le_bytes` takes the bytes by value.
+        let mut field = [0u8; WRITE_CLIENT_ID_LENGTH];
+        field.copy_from_slice(&self.data[from..to]);
+
+        Ok(i64::from_le_bytes(field))
+    }
+
+    /// Batch sequence number: a little-endian `i32` read from `BATCH_SEQUENCE_OFFSET`.
+    ///
+    /// # Errors
+    /// Returns `UnexpectedEof` when the buffer ends before the whole field.
+    pub fn batch_sequence(&self) -> io::Result<i32> {
+        let from = self.position.saturating_add(BATCH_SEQUENCE_OFFSET);
+        let to = from.saturating_add(BATCH_SEQUENCE_LENGTH);
+        if to > self.data.len() {
+            return Err(io::Error::new(
+                io::ErrorKind::UnexpectedEof,
+                "Not enough bytes to read batch sequence",
+            ));
+        }
+
+        // Bounds were checked above, so this slice is exactly
+        // BATCH_SEQUENCE_LENGTH bytes long.
+        let mut field = [0u8; BATCH_SEQUENCE_LENGTH];
+        field.copy_from_slice(&self.data[from..to]);
+        Ok(i32::from_le_bytes(field))
+    }
+
+    /// Record count declared in the header: a little-endian `i32` at `RECORDS_COUNT_OFFSET`.
+    ///
+    /// # Errors
+    /// Returns `UnexpectedEof` when the buffer ends before the whole field.
+    pub fn record_count(&self) -> io::Result<i32> {
+        let from = self.position.saturating_add(RECORDS_COUNT_OFFSET);
+        let to = from.saturating_add(RECORDS_COUNT_LENGTH);
+        if to > self.data.len() {
+            return Err(io::Error::new(
+                io::ErrorKind::UnexpectedEof,
+                "Not enough bytes to read record count",
+            ));
+        }
+
+        // The value is returned exactly as stored; this method does not reject
+        // negative counts.
+        let mut field = [0u8; RECORDS_COUNT_LENGTH];
+        field.copy_from_slice(&self.data[from..to]);
+        Ok(i32::from_le_bytes(field))
+    }
+
+    /// Checksum-verified access to the records of this batch.
+    ///
+    /// Fails with an "Invalid batch checksum" error unless `is_valid()` holds; otherwise
+    /// this is exactly `records_unchecked()`.
+    /// Mirrors: KvRecordBatch.records(ReadContext)
+    pub fn records(&self, read_context: &dyn ReadContext) -> Result<KvRecords> {
+        if self.is_valid() {
+            return self.records_unchecked(read_context);
+        }
+        let reason = "Invalid batch checksum";
+        Err(crate::error::Error::IoUnexpectedError {
+            message: reason.to_string(),
+            source: io::Error::new(io::ErrorKind::InvalidData, reason),
+        })
+    }
+
+    /// Iterate the records of this batch while skipping checksum verification.
+    ///
+    /// Only the header fields needed for iteration are read: a negative record count is
+    /// rejected, but the declared size is not compared against the buffer length here.
+    pub fn records_unchecked(&self, read_context: &dyn ReadContext) -> Result<KvRecords> {
+        let size = self.size_in_bytes()?;
+        let count = self.record_count()?;
+        let schema_id = self.schema_id()?;
+        if count < 0 {
+            let message = format!("Invalid record count: {count}");
+            let source = io::Error::new(io::ErrorKind::InvalidData, "Invalid record count");
+            return Err(crate::error::Error::IoUnexpectedError { message, source });
+        }
+
+        // Get row decoder for this schema from context (cached)
+        let row_decoder = read_context.get_row_decoder(schema_id)?;
+
+        // Records start right after the fixed header and stop at the declared batch end.
+        let iter = KvRecordIterator {
+            data: self.data.clone(),
+            position: self.position + RECORDS_OFFSET,
+            end: self.position + size,
+            remaining_count: count,
+        };
+        Ok(KvRecords { iter, row_decoder })
+    }
+}
+
+/// The records of one batch, paired with the decoder for their values.
+///
+/// Consume it with `into_iter()` to walk the records; `decoder()` / `decoder_arc()`
+/// expose the row decoder that turns a record's value bytes into a typed row.
+pub struct KvRecords {
+    iter: KvRecordIterator,
+    row_decoder: Arc<dyn RowDecoder>,
+}
+
+impl KvRecords {
+    /// Borrow the row decoder used to decode the values of these records.
+    ///
+    /// The borrow lasts only as long as `&self`; because iterating consumes the
+    /// collection, call `decoder_arc()` if the decoder is needed while iterating.
+    pub fn decoder(&self) -> &dyn RowDecoder {
+        self.row_decoder.as_ref()
+    }
+
+    /// Get an owned handle to the row decoder.
+    ///
+    /// The returned `Arc` is a clone of the shared decoder, so it can outlive this
+    /// `KvRecords`. Take it before calling `into_iter()` when the decoder must stay
+    /// available after the collection is consumed (collect-then-decode style).
+    pub fn decoder_arc(&self) -> Arc<dyn RowDecoder> {
+        self.row_decoder.clone()
+    }
+}
+
+impl IntoIterator for KvRecords {
+    type Item = io::Result<KvRecord>;
+    type IntoIter = KvRecordIterator;
+    // Consumes the collection; grab `decoder_arc()` first if the decoder is still needed.
+    fn into_iter(self) -> KvRecordIterator {
+        self.iter
+    }
+}
+
+/// Iterator over the records of one KV record batch, yielding `io::Result<KvRecord>`.
+pub struct KvRecordIterator {
+    data: Bytes, // buffer that contains the batch
+    position: usize, // offset in `data` of the next record to read
+    end: usize, // offset in `data` at which the batch ends
+    remaining_count: i32, // records still to be yielded
+}
+
+impl Iterator for KvRecordIterator {
+    type Item = io::Result<KvRecord>;
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.remaining_count <= 0 || self.position >= self.end {
+            return None;
+        }
+        match KvRecord::read_from(&self.data, self.position) {
+            // A record must end inside the batch (`position < end` here, so no underflow).
+            Ok((record, size)) if size <= self.end - self.position => {
+                self.position += size;
+                self.remaining_count -= 1;
+                Some(Ok(record))
+            }
+            failed => {
+                self.remaining_count = 0; // Stop iteration on error or overrun
+                let overrun = io::Error::new(io::ErrorKind::InvalidData, "Record overruns batch");
+                Some(Err(failed.err().unwrap_or(overrun)))
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::metadata::{DataTypes, KvFormat};
+    use crate::record::kv::test_util::TestReadContext;
+    use crate::record::kv::{CURRENT_KV_MAGIC_VALUE, KvRecordBatchBuilder};
+    use crate::row::InternalRow;
+    use crate::row::binary::BinaryWriter;
+
+    use bytes::{BufMut, BytesMut};
+
+    #[test]
+    fn test_invalid_batch_lengths() {
+        // Each case uses a buffer that holds nothing but the 4-byte length prefix, so no
+        // header (and therefore no checksum) is available to back the declared length.
+        let length_only = |length: i32| {
+            let mut buf = BytesMut::new();
+            buf.put_i32_le(length);
+            KvRecordBatch::new(buf.freeze(), 0)
+        };
+
+        // Test negative length
+        let negative = length_only(-1);
+        assert!(negative.size_in_bytes().is_err()); // Should error for invalid
+        assert!(!negative.is_valid());
+
+        // Test overflow length
+        // (i32::MAX is the largest length the prefix can encode)
+        let oversized = length_only(i32::MAX);
+        assert!(!oversized.is_valid());
+
+        // Test too-short buffer
+        let short = length_only(100); // Claims 100 bytes but buffer is tiny
+        assert!(!short.is_valid());
+    }
+
+    #[test]
+    fn test_kv_record_batch_build_and_read() {
+        use crate::row::compacted::CompactedRowWriter;
+
+        // Round trip: build a batch with one value record and one value-less record.
+        let schema_id = 42;
+        let (writer_id, batch_sequence) = (100, 5);
+        let payload: [u8; 5] = [1, 2, 3, 4, 5];
+        let (key1, key2) = (b"key1", b"key2");
+
+        let mut value1_writer = CompactedRowWriter::new(1);
+        value1_writer.write_bytes(&payload);
+        let row_bytes = value1_writer.buffer();
+
+        let mut builder = KvRecordBatchBuilder::new(schema_id, 4096, KvFormat::COMPACTED);
+        builder.set_writer_state(writer_id, batch_sequence);
+        builder.append_row(key1, Some(row_bytes)).unwrap();
+        builder.append_row(key2, None).unwrap();
+        let bytes = builder.build().unwrap();
+
+        // Header fields must come back exactly as they were written.
+        let batch = KvRecordBatch::new(bytes, 0);
+        assert!(batch.is_valid());
+        assert_eq!(batch.magic().unwrap(), CURRENT_KV_MAGIC_VALUE);
+        assert_eq!(batch.schema_id().unwrap(), schema_id as i16);
+        assert_eq!(batch.writer_id().unwrap(), writer_id);
+        assert_eq!(batch.batch_sequence().unwrap(), batch_sequence);
+        assert_eq!(batch.record_count().unwrap(), 2);
+
+        // Create ReadContext for reading
+        let read_context = TestReadContext::compacted(vec![DataTypes::bytes()]);
+
+        // Iterate and verify records using typed API
+        let records = batch.records(&read_context).unwrap();
+        let decoder = records.decoder_arc(); // Get Arc before consuming
+        let mut iter = records.into_iter();
+
+        // First record: carries the row written above.
+        let record1 = iter.next().unwrap().unwrap();
+        assert_eq!(record1.key().as_ref(), key1);
+        assert!(!record1.is_deletion());
+        let row1 = record1.row(&*decoder).unwrap();
+        assert_eq!(row1.get_bytes(0).unwrap(), &payload);
+
+        // Second record: appended without a row, so it reads back as a deletion.
+        let record2 = iter.next().unwrap().unwrap();
+        assert_eq!(record2.key().as_ref(), key2);
+        assert!(record2.is_deletion());
+        assert!(iter.next().is_none());
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/record/kv/kv_record_batch_builder.rs b/fluss-rust/crates/fluss/src/record/kv/kv_record_batch_builder.rs
new file mode 100644
index 0000000000..0e806337fd
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/record/kv/kv_record_batch_builder.rs
@@ -0,0 +1,578 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! KV record batch builder implementation.
+//!
+//! This module provides the KvRecordBatchBuilder for building batches of KV records.
+
+use crate::error::{Error, Result};
+use crate::metadata::KvFormat;
+use crate::record::kv::kv_record::KvRecord;
+use crate::record::kv::kv_record_batch::{
+    ATTRIBUTES_OFFSET, BATCH_SEQUENCE_OFFSET, CRC_OFFSET, LENGTH_LENGTH, LENGTH_OFFSET,
+    MAGIC_OFFSET, RECORD_BATCH_HEADER_SIZE, RECORDS_COUNT_OFFSET, SCHEMA_ID_OFFSET,
+    WRITE_CLIENT_ID_OFFSET,
+};
+use crate::record::kv::{CURRENT_KV_MAGIC_VALUE, NO_BATCH_SEQUENCE, NO_WRITER_ID};
+use bytes::{Bytes, BytesMut};
+use log::warn;
+use std::io;
+
+/// Builder for KvRecordBatch.
+///
+/// Records are appended behind a reserved, zero-filled header; `build` then fills in
+/// the header fields and the CRC32C checksum and returns the serialized batch.
+// Reference implementation:
+// https://github.com/apache/fluss/blob/main/fluss-common/src/main/java/org/apache/fluss/record/KvRecordBatchBuilder.java
+pub struct KvRecordBatchBuilder {
+    schema_id: i32, // validated in `new`, stored in the header as an i16
+    magic: u8, // always CURRENT_KV_MAGIC_VALUE
+    write_limit: usize, // maximum batch size in bytes, header included
+    buffer: BytesMut, // header placeholder followed by the appended records
+    writer_id: i64, // NO_WRITER_ID until `set_writer_state` is called
+    batch_sequence: i32, // NO_BATCH_SEQUENCE until `set_writer_state` is called
+    current_record_number: i32, // number of records appended so far
+    size_in_bytes: usize, // header size plus the size of all appended records
+    is_closed: bool, // set by `close`; rejects further appends
+    kv_format: KvFormat, // `append_row` requires KvFormat::COMPACTED
+    aborted: bool, // set by `abort`; rejects appends, `close` and `build`
+    built_buffer: Option<Bytes>, // cached result of the last `build`, if still current
+}
+
+impl KvRecordBatchBuilder {
+    /// Create a new KvRecordBatchBuilder.
+    ///
+    /// # Arguments
+    /// * `schema_id` - The schema ID for records in this batch (must fit in i16)
+    /// * `write_limit` - Maximum size of the batch in bytes, header included
+    /// * `kv_format` - The KV format (Compacted, Indexed, or Aligned)
+    /// # Panics
+    /// Panics if `schema_id` does not fit in i16, since the header stores it as an i16.
+    pub fn new(schema_id: i32, write_limit: usize, kv_format: KvFormat) -> Self {
+        assert!(
+            i16::try_from(schema_id).is_ok(),
+            "schema_id shouldn't be greater than the max value of i16: {} (or less than {})",
+            i16::MAX,
+            i16::MIN
+        );
+        // Reserve zero-filled space for the header; `build` writes the real values later.
+        let mut buffer = BytesMut::with_capacity(write_limit.max(RECORD_BATCH_HEADER_SIZE));
+        buffer.resize(RECORD_BATCH_HEADER_SIZE, 0);
+        Self {
+            schema_id,
+            magic: CURRENT_KV_MAGIC_VALUE,
+            write_limit,
+            buffer,
+            writer_id: NO_WRITER_ID,
+            batch_sequence: NO_BATCH_SEQUENCE,
+            current_record_number: 0,
+            size_in_bytes: RECORD_BATCH_HEADER_SIZE,
+            is_closed: false,
+            kv_format,
+            aborted: false,
+            built_buffer: None,
+        }
+    }
+
+    /// Whether one more record (key + row) fits under `write_limit`. The header always
+    /// counts towards the limit, so this can be false even before any record is appended.
+    pub fn has_room_for_row(&self, key: &[u8], row_bytes: Option<&[u8]>) -> bool {
+        let needed = self.size_in_bytes.checked_add(KvRecord::size_of(key, row_bytes));
+        needed.is_some_and(|total| total <= self.write_limit) }
+
+    /// Append one record, made of `key` and optional `row_bytes`, to the batch.
+    ///
+    /// Nothing is written unless every check passes.
+    ///
+    /// # Errors
+    /// Checks run in this order and fail if:
+    /// - The KV format is not COMPACTED (`InvalidInput`)
+    /// - The builder has been aborted
+    /// - The builder is closed
+    /// - The batch already holds `i32::MAX` records (`InvalidInput`)
+    /// - Adding this record would exceed the write limit (`WriteZero`)
+    pub fn append_row(&mut self, key: &[u8], row_bytes: Option<&[u8]>) -> io::Result<()> {
+        // Only the compacted format is supported by this append path.
+        if self.kv_format != KvFormat::COMPACTED {
+            let reason = "append_row can only be used with KvFormat::COMPACTED";
+            return Err(io::Error::new(io::ErrorKind::InvalidInput, reason));
+        }
+
+        // Lifecycle guards: an aborted builder is reported before a closed one.
+        let lifecycle_error = if self.aborted {
+            Some("Tried to append a record, but KvRecordBatchBuilder has already been aborted")
+        } else if self.is_closed {
+            Some("Tried to append a record, but KvRecordBatchBuilder is closed for record appends")
+        } else {
+            None
+        };
+        if let Some(reason) = lifecycle_error {
+            return Err(io::Error::other(reason));
+        }
+
+        // Check record count limit before mutation
+        if self.current_record_number == i32::MAX {
+            let reason = format!(
+                "Maximum number of records per batch exceeded, max records: {}",
+                i32::MAX
+            );
+            return Err(io::Error::new(io::ErrorKind::InvalidInput, reason));
+        }
+
+        // Enforce the write limit using the size reported by `KvRecord::size_of`.
+        let record_size = KvRecord::size_of(key, row_bytes);
+        if self.size_in_bytes + record_size > self.write_limit {
+            let reason = format!(
+                "Adding record would exceed write limit: {} + {} > {}",
+                self.size_in_bytes, record_size, self.write_limit
+            );
+            return Err(io::Error::new(io::ErrorKind::WriteZero, reason));
+        }
+
+        // All checks passed: serialize the record into the buffer.
+        let written = KvRecord::write_to_buf(&mut self.buffer, key, row_bytes)?;
+        debug_assert_eq!(written, record_size, "Record size mismatch");
+
+        self.size_in_bytes += written;
+        self.current_record_number += 1;
+
+        // Invalidate cached buffer since we modified the batch
+        self.built_buffer = None;
+
+        Ok(())
+    }
+
+    /// Record the writer ID and batch base sequence to be written into the header.
+    ///
+    /// Any cached result of [`build`](Self::build) is discarded so the next build
+    /// re-serializes the header with these values.
+    pub fn set_writer_state(&mut self, writer_id: i64, batch_base_sequence: i32) {
+        // Drop the cache first: it was serialized with the previous writer state.
+        self.built_buffer = None;
+        self.batch_sequence = batch_base_sequence;
+        self.writer_id = writer_id;
+    }
+
+    /// Serialize the batch and return its bytes.
+    ///
+    /// The header (including the CRC) is written into the buffer and a copy of the
+    /// buffer is returned and cached; repeated calls return the cached bytes.
+    ///
+    /// # Caching and Mutations
+    /// The cache is dropped, so the next call re-serializes the batch, after:
+    /// - [`append_row`](Self::append_row) successfully adds a record
+    /// - [`set_writer_state`](Self::set_writer_state) changes the writer metadata
+    ///
+    /// Bytes returned by an earlier call are not updated by later mutations.
+    /// Note: [`close`](Self::close) prevents further appends but does not prevent
+    /// writer state modifications.
+    ///
+    /// # Errors
+    /// Fails if the builder has been aborted or the batch size does not fit in an i32.
+    pub fn build(&mut self) -> Result<Bytes> {
+        if self.aborted {
+            return Err(Error::UnexpectedError {
+                message: "Attempting to build an aborted record batch".to_string(),
+                source: None,
+            });
+        }
+        // Serve the cached batch while no mutation has invalidated it.
+        if let Some(cached) = self.built_buffer.clone() {
+            return Ok(cached);
+        }
+        // Fill in the header (including the CRC), then snapshot the buffer.
+        self.write_batch_header()?;
+        let serialized = self.buffer.clone().freeze();
+        self.built_buffer = Some(serialized.clone());
+        Ok(serialized)
+    }
+
+    /// Writer ID that `build` writes into the header (`NO_WRITER_ID` until it is set).
+    pub fn writer_id(&self) -> i64 {
+        self.writer_id
+    }
+
+    /// Batch sequence that `build` writes into the header (`NO_BATCH_SEQUENCE` until set).
+    pub fn batch_sequence(&self) -> i32 {
+        self.batch_sequence
+    }
+
+    /// Whether `close` has succeeded on this builder; a closed builder rejects appends.
+    pub fn is_closed(&self) -> bool {
+        self.is_closed
+    }
+
+    /// Abort the builder.
+    /// Afterwards `append_row`, `close` and `build` all return errors.
+    pub fn abort(&mut self) {
+        self.aborted = true;
+    }
+
+    /// Mark the builder as closed: later `append_row` calls fail, `build` still works.
+    /// Fails, leaving the builder unchanged, if it has already been aborted.
+    pub fn close(&mut self) -> Result<()> {
+        if !self.aborted {
+            self.is_closed = true;
+            return Ok(());
+        }
+        let message = "Cannot close KvRecordBatchBuilder as it has already been aborted";
+        Err(Error::UnexpectedError {
+            message: message.to_string(),
+            source: None,
+        })
+    }
+
+    /// Current size of the batch in bytes: the header plus all records appended so far.
+    pub fn get_size_in_bytes(&self) -> usize {
+        self.size_in_bytes
+    }
+
+    // ----------------------- Internal methods -------------------------------
+
+    /// Fill in the reserved header at the start of the buffer, CRC included.
+    ///
+    /// Fields written, all little-endian: length (i32), magic (u8), CRC (u32),
+    /// schema ID (i16), attributes (u8), writer ID (i64), batch sequence (i32) and
+    /// record count (i32). The CRC is a CRC32C over the bytes from the schema ID to
+    /// the end of the batch, which is why it is computed last.
+    ///
+    /// # Errors
+    /// Returns `InvalidInput` if the batch length does not fit in an i32.
+    fn write_batch_header(&mut self) -> io::Result<()> {
+        // The length field counts every byte of the batch after the field itself.
+        let length = self.size_in_bytes - LENGTH_LENGTH;
+        let Ok(length_field) = i32::try_from(length) else {
+            let reason = format!("Batch size {length} exceeds i32::MAX");
+            return Err(io::Error::new(io::ErrorKind::InvalidInput, reason));
+        };
+
+        // Copy every value out of `self` before borrowing the buffer mutably.
+        let attributes = self.compute_attributes();
+        let schema_id = self.schema_id as i16;
+        let magic = self.magic;
+        let writer_id = self.writer_id;
+        let batch_sequence = self.batch_sequence;
+        let record_count = self.current_record_number;
+
+        // Write to the beginning of the buffer; `[OFFSET..][..N]` selects N bytes at OFFSET.
+        let header = &mut self.buffer[..RECORD_BATCH_HEADER_SIZE];
+
+        // Length and magic
+        header[LENGTH_OFFSET..][..LENGTH_LENGTH].copy_from_slice(&length_field.to_le_bytes());
+        header[MAGIC_OFFSET] = magic;
+
+        // Write empty CRC first (will update later)
+        header[CRC_OFFSET..][..4].copy_from_slice(&0u32.to_le_bytes());
+
+        // Schema ID and attributes
+        header[SCHEMA_ID_OFFSET..][..2].copy_from_slice(&schema_id.to_le_bytes());
+        header[ATTRIBUTES_OFFSET] = attributes;
+
+        // Writer ID, batch sequence and record count
+        header[WRITE_CLIENT_ID_OFFSET..][..8].copy_from_slice(&writer_id.to_le_bytes());
+        header[BATCH_SEQUENCE_OFFSET..][..4].copy_from_slice(&batch_sequence.to_le_bytes());
+        header[RECORDS_COUNT_OFFSET..][..4].copy_from_slice(&record_count.to_le_bytes());
+
+        // Compute the CRC over everything from the schema ID onwards, then patch it into
+        // the placeholder written above.
+        let crc = crc32c::crc32c(&self.buffer[SCHEMA_ID_OFFSET..self.size_in_bytes]);
+        self.buffer[CRC_OFFSET..][..4].copy_from_slice(&crc.to_le_bytes());
+
+        Ok(())
+    }
+
+    /// Attributes byte that `write_batch_header` stores in the header; always 0 for now.
+    fn compute_attributes(&self) -> u8 {
+        // Currently no attributes are used
+        0
+    }
+}
+
+impl Drop for KvRecordBatchBuilder {
+    fn drop(&mut self) {
+        // Warn when a non-aborted builder holds records and has no cached build result.
+        if self.built_buffer.is_none() && !self.aborted && self.current_record_number > 0 {
+            warn!(
+                "Warning: KvRecordBatchBuilder dropped with {} record(s) that were never built. \
+                 Call build() to serialize the batch before dropping.",
+                self.current_record_number
+            );
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::metadata::{DataTypes, RowType};
+    use crate::row::binary::BinaryWriter;
+    use crate::row::compacted::{CompactedRow, CompactedRowWriter};
+    use std::sync::LazyLock;
+    static TEST_ROW_TYPE: LazyLock<RowType> =
+        LazyLock::new(|| RowType::with_data_types(vec![DataTypes::bytes()]));
+
+    // Helper: view `data` as a CompactedRow of TEST_ROW_TYPE (a single bytes column).
+    fn create_test_row(data: &[u8]) -> CompactedRow<'_> {
+        CompactedRow::from_bytes(&TEST_ROW_TYPE, data)
+    }
+
+    #[test]
+    fn test_builder_basic_operations() {
+        // Test basic workflow: initial state, writer state, append, close, build
+        let schema_id = 42;
+        let write_limit = 4096;
+        let mut builder = KvRecordBatchBuilder::new(schema_id, write_limit, KvFormat::COMPACTED);
+
+        assert!(!builder.is_closed());
+        assert_eq!(builder.writer_id(), NO_WRITER_ID);
+        assert_eq!(builder.batch_sequence(), NO_BATCH_SEQUENCE);
+
+        builder.set_writer_state(100, 5);
+        assert_eq!(builder.writer_id(), 100);
+        assert_eq!(builder.batch_sequence(), 5);
+
+        let key1 = b"key1";
+        let value1 = create_test_row(b"value1");
+        assert!(builder.has_room_for_row(key1, Some(value1.as_bytes())));
+        builder.append_row(key1, Some(value1.as_bytes())).unwrap();
+
+        let key2 = b"key2";
+        assert!(builder.has_room_for_row(key2, None));
+        builder.append_row(key2, None).unwrap();
+
+        builder.close().unwrap();
+        assert!(builder.is_closed());
+
+        let bytes = builder.build().unwrap();
+        assert!(bytes.len() > RECORD_BATCH_HEADER_SIZE);
+
+        // Building again should return the cached result, byte for byte (not just same length)
+        let bytes2 = builder.build().unwrap();
+        assert_eq!(bytes, bytes2);
+
+        // Test lifecycle: abort behavior
+        let mut builder = KvRecordBatchBuilder::new(1, 4096, KvFormat::COMPACTED);
+        let value = create_test_row(b"value");
+        builder.append_row(b"key", Some(value.as_bytes())).unwrap();
+        builder.abort();
+        assert!(builder.append_row(b"key2", None).is_err());
+        assert!(builder.build().is_err());
+        assert!(builder.close().is_err());
+
+        // Test lifecycle: close behavior
+        let mut builder = KvRecordBatchBuilder::new(1, 4096, KvFormat::COMPACTED);
+        let value = create_test_row(b"value");
+        builder.append_row(b"key", Some(value.as_bytes())).unwrap();
+        builder.close().unwrap();
+        assert!(builder.append_row(b"key2", None).is_err());
+        assert!(builder.build().is_ok());
+
+        // Test KvFormat validation
+        let mut row_writer = CompactedRowWriter::new(1);
+        row_writer.write_int(42);
+        let row_bytes = row_writer.buffer();
+
+        // INDEXED format should reject append_row
+        let mut indexed_builder = KvRecordBatchBuilder::new(1, 4096, KvFormat::INDEXED);
+        let result = indexed_builder.append_row(b"key", Some(row_bytes));
+        assert!(result.is_err());
+        assert_eq!(result.unwrap_err().kind(), io::ErrorKind::InvalidInput);
+
+        // COMPACTED format should accept append_row
+        let mut compacted_builder = KvRecordBatchBuilder::new(1, 4096, KvFormat::COMPACTED);
+        let result = compacted_builder.append_row(b"key", Some(row_bytes));
+        assert!(result.is_ok());
+    }
+
+    #[test]
+    fn test_write_limit_enforcement() {
+        // A 100-byte write limit (header included) leaves room for a small record but
+        // not for one built from a 1000-byte key and a 1000-byte row.
+        let mut builder = KvRecordBatchBuilder::new(1, 100, KvFormat::COMPACTED);
+        let oversized_key = vec![0u8; 1000];
+        let oversized_bytes = vec![1u8; 1000];
+        let oversized_row = create_test_row(&oversized_bytes);
+        let small_row = create_test_row(b"value");
+
+        // Test has_room_for_row helper
+        assert!(builder.has_room_for_row(b"key", Some(small_row.as_bytes())));
+        assert!(!builder.has_room_for_row(&oversized_key, Some(oversized_row.as_bytes())));
+
+        // Test append enforcement - add small record first
+        let appended = builder.append_row(b"key", Some(small_row.as_bytes()));
+        appended.unwrap();
+
+        // Try to add large record that exceeds limit
+        let rejected = builder.append_row(b"key2", Some(oversized_row.as_bytes()));
+        assert!(rejected.is_err());
+        assert_eq!(rejected.unwrap_err().kind(), io::ErrorKind::WriteZero);
+    }
+
+    /// Once the record counter has reached its maximum, further appends must
+    /// fail with `InvalidInput`.
+    #[test]
+    fn test_append_checks_record_count_limit() {
+        let mut builder = KvRecordBatchBuilder::new(1, 100000, KvFormat::COMPACTED);
+        // Start one below the maximum so that exactly one more append succeeds.
+        builder.current_record_number = i32::MAX - 1;
+
+        let first_row = create_test_row(b"value1");
+        let second_row = create_test_row(b"value2");
+
+        builder
+            .append_row(b"key1", Some(first_row.as_bytes()))
+            .expect("one more record should still be accepted");
+
+        let err = builder
+            .append_row(b"key2", Some(second_row.as_bytes()))
+            .expect_err("record count limit should reject this append");
+        assert_eq!(err.kind(), io::ErrorKind::InvalidInput);
+    }
+
+    /// Constructing a builder with a schema ID just above `i16::MAX` must panic.
+    #[test]
+    #[should_panic(expected = "schema_id shouldn't be greater than")]
+    fn test_builder_invalid_schema_id() {
+        let too_large = i32::from(i16::MAX) + 1;
+        let _ = KvRecordBatchBuilder::new(too_large, 4096, KvFormat::COMPACTED);
+    }
+
+    /// The bytes returned by `build()` must be regenerated whenever the batch
+    /// content or the writer state changes after a previous build.
+    #[test]
+    fn test_builder_cache_invalidation() {
+        use crate::record::kv::KvRecordBatch;
+
+        // Scenario 1: an append after `build()` must be visible in the next build.
+        {
+            let mut builder = KvRecordBatchBuilder::new(1, 4096, KvFormat::COMPACTED);
+            builder.set_writer_state(100, 5);
+
+            let first_row = create_test_row(b"value1");
+            builder
+                .append_row(b"key1", Some(first_row.as_bytes()))
+                .unwrap();
+            let one_record = builder.build().unwrap();
+            let one_record_len = one_record.len();
+
+            let second_row = create_test_row(b"value2");
+            builder
+                .append_row(b"key2", Some(second_row.as_bytes()))
+                .unwrap();
+            let two_records = builder.build().unwrap();
+            let two_records_len = two_records.len();
+
+            // The rebuilt bytes are longer and decode to both records.
+            assert!(two_records_len > one_record_len);
+            let batch = KvRecordBatch::new(two_records, 0);
+            assert!(batch.is_valid());
+            assert_eq!(batch.record_count().unwrap(), 2);
+        }
+
+        // Scenario 2: changing the writer state must be visible in the next build.
+        {
+            let mut builder = KvRecordBatchBuilder::new(1, 4096, KvFormat::COMPACTED);
+            builder.set_writer_state(100, 5);
+            let row = create_test_row(b"value");
+            builder.append_row(b"key", Some(row.as_bytes())).unwrap();
+            let before = builder.build().unwrap();
+
+            builder.set_writer_state(200, 10);
+            let after = builder.build().unwrap();
+
+            assert_ne!(before, after);
+
+            // Each build carries the writer state that was current at build time.
+            let batch_before = KvRecordBatch::new(before, 0);
+            let batch_after = KvRecordBatch::new(after, 0);
+
+            assert_eq!(batch_before.writer_id().unwrap(), 100);
+            assert_eq!(batch_before.batch_sequence().unwrap(), 5);
+            assert_eq!(batch_after.writer_id().unwrap(), 200);
+            assert_eq!(batch_after.batch_sequence().unwrap(), 10);
+        }
+    }
+
+    /// Round-trip test: rows encoded with `CompactedRowWriter` are appended to
+    /// a builder, serialized, and read back through `KvRecordBatch`, checking
+    /// keys, typed column values and the deletion marker.
+    #[test]
+    fn test_builder_with_compacted_row_writer() -> crate::error::Result<()> {
+        use crate::record::kv::KvRecordBatch;
+        use crate::row::InternalRow;
+
+        let mut builder = KvRecordBatchBuilder::new(1, 100000, KvFormat::COMPACTED);
+        builder.set_writer_state(100, 5);
+
+        // Create and append first record with CompactedRowWriter
+        let mut row_writer1 = CompactedRowWriter::new(2);
+        row_writer1.write_int(42);
+        row_writer1.write_string("hello");
+
+        let row_bytes1 = row_writer1.buffer();
+
+        let key1 = b"key1";
+        assert!(builder.has_room_for_row(key1, Some(row_bytes1)));
+        builder.append_row(key1, Some(row_bytes1))?;
+
+        // Create and append second record
+        let mut row_writer2 = CompactedRowWriter::new(2);
+        row_writer2.write_int(100);
+        row_writer2.write_string("world");
+
+        let row_bytes2 = row_writer2.buffer();
+
+        let key2 = b"key2";
+        builder.append_row(key2, Some(row_bytes2))?;
+
+        // Append a deletion record (a key with no value)
+        let key3 = b"key3";
+        builder.append_row(key3, None)?;
+
+        // Build and verify batch-level metadata
+        builder.close()?;
+        let bytes = builder.build()?;
+
+        let batch = KvRecordBatch::new(bytes, 0);
+        assert!(batch.is_valid());
+        assert_eq!(batch.record_count()?, 3);
+        assert_eq!(batch.writer_id()?, 100);
+        assert_eq!(batch.batch_sequence()?, 5);
+
+        // Create ReadContext for reading typed rows; the column types must
+        // match what the row writers above wrote (int, string).
+        let types = vec![DataTypes::int(), DataTypes::string()];
+        let read_context = crate::record::kv::test_util::TestReadContext::compacted(types);
+
+        // Read back and verify records using idiomatic for-loop
+        let records = batch.records(&read_context)?;
+        // Grab the decoder before the loop consumes `records`.
+        let decoder = records.decoder_arc();
+        let mut record_count = 0;
+
+        for rec in records {
+            let rec = rec?;
+            record_count += 1;
+
+            // `record_count` is 1-based here: it identifies which appended
+            // record we are looking at, in append order.
+            match record_count {
+                1 => {
+                    assert_eq!(rec.key().as_ref(), key1);
+                    let row = rec.row(&*decoder).unwrap();
+                    assert_eq!(row.get_int(0)?, 42);
+                    assert_eq!(row.get_string(1)?, "hello");
+                }
+                2 => {
+                    assert_eq!(rec.key().as_ref(), key2);
+                    let row = rec.row(&*decoder).unwrap();
+                    assert_eq!(row.get_int(0)?, 100);
+                    assert_eq!(row.get_string(1)?, "world");
+                }
+                3 => {
+                    assert_eq!(rec.key().as_ref(), key3);
+                    assert!(rec.is_deletion());
+                }
+                _ => panic!("Unexpected record count"),
+            }
+        }
+
+        // Guards against the iterator yielding fewer records than were appended.
+        assert_eq!(record_count, 3);
+        Ok(())
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/record/kv/kv_record_read_context.rs b/fluss-rust/crates/fluss/src/record/kv/kv_record_read_context.rs
new file mode 100644
index 0000000000..4200e044b3
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/record/kv/kv_record_read_context.rs
@@ -0,0 +1,165 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Default implementation of ReadContext with decoder caching.
+
+use super::ReadContext;
+use crate::error::Result;
+use crate::metadata::{KvFormat, Schema};
+use crate::row::{RowDecoder, RowDecoderFactory};
+use std::collections::HashMap;
+use std::sync::{Arc, Mutex};
+
+/// Trait for fetching schemas by ID.
+///
+/// This trait abstracts schema retrieval, allowing different implementations
+/// (e.g., from metadata store, cache, or test mocks).
+///
+/// Implementors must be `Send + Sync`; [`KvRecordReadContext`] holds the
+/// getter as a shared `Arc<dyn SchemaGetter>` trait object.
+pub trait SchemaGetter: Send + Sync {
+    /// Get the schema for the given schema ID.
+    ///
+    /// # Arguments
+    /// * `schema_id` - The schema ID to fetch
+    ///
+    /// # Returns
+    /// An Arc-wrapped Schema for the specified ID, or an error if the schema
+    /// cannot be fetched (missing ID, network error, etc.)
+    fn get_schema(&self, schema_id: i16) -> Result<Arc<Schema>>;
+}
+
+/// Default implementation of ReadContext with decoder caching.
+///
+/// This implementation caches RowDecoders by schema ID for performance,
+/// avoiding repeated schema lookups and decoder creation.
+///
+/// Reference: org.apache.fluss.record.KvRecordReadContext
+pub struct KvRecordReadContext {
+    // Row format handed to `RowDecoderFactory::create` for every decoder.
+    kv_format: KvFormat,
+    // Source of schemas for schema IDs that are not cached yet.
+    schema_getter: Arc<dyn SchemaGetter>,
+    // Decoders created so far, keyed by schema ID; the mutex allows the cache
+    // to be filled through `&self`.
+    row_decoder_cache: Mutex<HashMap<i16, Arc<dyn RowDecoder>>>,
+}
+
+impl KvRecordReadContext {
+    /// Builds a read context whose decoder cache starts out empty.
+    ///
+    /// # Arguments
+    /// * `kv_format` - Row format used when creating decoders
+    /// * `schema_getter` - Lookup used to resolve a schema ID to its schema
+    ///
+    /// # Returns
+    /// The newly created context
+    pub fn new(kv_format: KvFormat, schema_getter: Arc<dyn SchemaGetter>) -> Self {
+        let row_decoder_cache = Mutex::new(HashMap::new());
+        Self {
+            row_decoder_cache,
+            schema_getter,
+            kv_format,
+        }
+    }
+}
+
+impl ReadContext for KvRecordReadContext {
+    /// Returns the decoder for `schema_id`, creating and caching it on the
+    /// first request.
+    ///
+    /// The cache lock is held only for the lookup and for the final insert;
+    /// the schema fetch and decoder construction run without it. If two
+    /// callers build a decoder for the same ID at once, the one stored first
+    /// wins and is returned to both.
+    ///
+    /// A poisoned mutex is recovered rather than propagated.
+    ///
+    /// # Errors
+    /// Propagates failures from the schema getter and from
+    /// `RowDecoderFactory::create`.
+    fn get_row_decoder(&self, schema_id: i16) -> Result<Arc<dyn RowDecoder>> {
+        // Fast path: look the decoder up and release the lock right away.
+        let guard = self
+            .row_decoder_cache
+            .lock()
+            .unwrap_or_else(|poisoned| poisoned.into_inner());
+        let cached = guard.get(&schema_id).cloned();
+        drop(guard);
+        if let Some(decoder) = cached {
+            return Ok(decoder);
+        }
+
+        // Slow path: resolve the schema and build the decoder without the lock
+        // so other threads are not blocked meanwhile.
+        let schema = self.schema_getter.get_schema(schema_id)?;
+        let decoder = RowDecoderFactory::create(self.kv_format, schema.row_type().clone())?;
+
+        // Re-acquire the lock; keep whichever decoder reached the cache first.
+        let mut guard = self
+            .row_decoder_cache
+            .lock()
+            .unwrap_or_else(|poisoned| poisoned.into_inner());
+        let stored = guard.entry(schema_id).or_insert(decoder);
+        Ok(Arc::clone(stored))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::metadata::{DataTypes, Schema};
+
+    /// Schema getter that hands out one fixed schema for every schema ID.
+    struct MockSchemaGetter {
+        schema: Arc<Schema>,
+    }
+
+    impl MockSchemaGetter {
+        /// Builds a schema with one column per data type, named `field0`,
+        /// `field1`, and so on.
+        fn new(data_types: Vec<crate::metadata::DataType>) -> Self {
+            let schema = data_types
+                .into_iter()
+                .enumerate()
+                .fold(Schema::builder(), |builder, (i, dt)| {
+                    builder.column(format!("field{i}"), dt)
+                })
+                .build()
+                .expect("Failed to build schema");
+
+            Self {
+                schema: Arc::new(schema),
+            }
+        }
+    }
+
+    impl SchemaGetter for MockSchemaGetter {
+        fn get_schema(&self, _schema_id: i16) -> Result<Arc<Schema>> {
+            Ok(Arc::clone(&self.schema))
+        }
+    }
+
+    #[test]
+    fn test_kv_record_read_context() {
+        // Same schema ID twice: the second call must return the cached decoder.
+        {
+            let getter = Arc::new(MockSchemaGetter::new(vec![
+                DataTypes::int(),
+                DataTypes::string(),
+            ]));
+            let context = KvRecordReadContext::new(KvFormat::COMPACTED, getter);
+
+            let first = context.get_row_decoder(42).unwrap();
+            let second = context.get_row_decoder(42).unwrap();
+
+            // Pointer equality proves it is the very same instance.
+            assert!(Arc::ptr_eq(&first, &second));
+        }
+
+        // Two different schema IDs: each gets its own decoder instance.
+        {
+            let getter = Arc::new(MockSchemaGetter::new(vec![DataTypes::int()]));
+            let context = KvRecordReadContext::new(KvFormat::COMPACTED, getter);
+
+            let for_ten = context.get_row_decoder(10).unwrap();
+            let for_twenty = context.get_row_decoder(20).unwrap();
+
+            assert!(!Arc::ptr_eq(&for_ten, &for_twenty));
+        }
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/record/kv/mod.rs b/fluss-rust/crates/fluss/src/record/kv/mod.rs
new file mode 100644
index 0000000000..4d0f894638
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/record/kv/mod.rs
@@ -0,0 +1,44 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Key-Value record and batch implementations.
+
+mod kv_record;
+mod kv_record_batch;
+mod kv_record_batch_builder;
+mod kv_record_read_context;
+mod read_context;
+mod value_record_batch;
+
+#[cfg(test)]
+mod test_util;
+
+pub use kv_record::{KvRecord, LENGTH_LENGTH as KV_RECORD_LENGTH_LENGTH};
+pub use kv_record_batch::*;
+pub use kv_record_batch_builder::*;
+pub use kv_record_read_context::{KvRecordReadContext, SchemaGetter};
+pub use read_context::ReadContext;
+pub(crate) use value_record_batch::ValueRecordBatch;
+
+/// Current KV magic value (`0`).
+///
+/// NOTE(review): presumably the value written to / expected in the magic byte
+/// of a KV record batch header — confirm against `kv_record_batch`.
+pub const CURRENT_KV_MAGIC_VALUE: u8 = 0;
+
+/// No writer ID constant (`-1`).
+///
+/// NOTE(review): presumably the sentinel used when a batch carries no writer
+/// ID — confirm against the batch builder.
+pub const NO_WRITER_ID: i64 = -1;
+
+/// No batch sequence constant (`-1`).
+///
+/// NOTE(review): presumably the sentinel used when a batch carries no batch
+/// sequence — confirm against the batch builder.
+pub const NO_BATCH_SEQUENCE: i32 = -1;
diff --git a/fluss-rust/crates/fluss/src/record/kv/read_context.rs b/fluss-rust/crates/fluss/src/record/kv/read_context.rs
new file mode 100644
index 0000000000..63502613d1
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/record/kv/read_context.rs
@@ -0,0 +1,45 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Read context for KV record batches.
+//!
+//! Provides schema and decoder information needed for typed record reading.
+
+use crate::error::Result;
+use crate::row::RowDecoder;
+use std::sync::Arc;
+
+/// Context for reading KV records with type information.
+///
+/// The ReadContext provides access to RowDecoders based on schema IDs,
+/// enabling typed deserialization of KV record values.
+///
+/// Implementors must be `Send + Sync`.
+///
+/// Reference: org.apache.fluss.record.KvRecordBatch.ReadContext
+pub trait ReadContext: Send + Sync {
+    /// Get the row decoder for the given schema ID.
+    ///
+    /// The decoder is typically cached, so repeated calls with the same
+    /// schema ID should return the same decoder instance. Caching is not
+    /// enforced by this trait, so callers should not rely on instance
+    /// identity.
+    ///
+    /// # Arguments
+    /// * `schema_id` - The schema ID for which to get the decoder
+    ///
+    /// # Returns
+    /// An Arc-wrapped RowDecoder for the specified schema, or an error if
+    /// the schema is invalid or cannot be retrieved
+    fn get_row_decoder(&self, schema_id: i16) -> Result<Arc<dyn RowDecoder>>;
+}
diff --git a/fluss-rust/crates/fluss/src/record/kv/test_util.rs b/fluss-rust/crates/fluss/src/record/kv/test_util.rs
new file mode 100644
index 0000000000..54eaac8f3d
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/record/kv/test_util.rs
@@ -0,0 +1,50 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Test utilities for KV record reading.
+
+use super::ReadContext;
+use crate::error::Result;
+use crate::metadata::{DataType, KvFormat, RowType};
+use crate::row::{RowDecoder, RowDecoderFactory};
+use std::sync::Arc;
+
+/// Simple test-only ReadContext that creates decoders directly from data types.
+///
+/// This bypasses the production Schema/SchemaGetter machinery for simpler tests.
+pub(crate) struct TestReadContext {
+    // Row format passed to `RowDecoderFactory::create`.
+    kv_format: KvFormat,
+    // Column types, in order, used to build the `RowType` for every decoder.
+    data_types: Vec<DataType>,
+}
+
+impl TestReadContext {
+    /// Builds a context that decodes rows in the COMPACTED format using the
+    /// given column types.
+    pub(crate) fn compacted(data_types: Vec<DataType>) -> Self {
+        let kv_format = KvFormat::COMPACTED;
+        Self {
+            data_types,
+            kv_format,
+        }
+    }
+}
+
+impl ReadContext for TestReadContext {
+    /// Creates a fresh decoder from the stored data types on every call; the
+    /// schema ID is ignored and nothing is cached.
+    fn get_row_decoder(&self, _schema_id: i16) -> Result<Arc<dyn RowDecoder>> {
+        RowDecoderFactory::create(
+            self.kv_format,
+            RowType::with_data_types(self.data_types.clone()),
+        )
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/record/kv/value_record_batch.rs b/fluss-rust/crates/fluss/src/record/kv/value_record_batch.rs
new file mode 100644
index 0000000000..fdd6b0702c
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/record/kv/value_record_batch.rs
@@ -0,0 +1,188 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Reader for the value-record batch returned by a KV (primary-key) limit
+//! scan. This is a distinct wire format from [`super::KvRecordBatch`]: it
+//! carries value-only records (no keys, no CRC/writer-id header) and a schema
+//! id *per record* rather than per batch.
+//!
+//! Batch layout (little-endian):
+//! - Length      => Int32  (size of everything after this field)
+//! - Magic       => Int8
+//! - RecordCount => Int32
+//! - Records     => [ValueRecord]
+//!
+//! Each `ValueRecord`:
+//! - Length   => Int32  (size after this field: SchemaId + Value)
+//! - SchemaId => Int16
+//! - Value    => row bytes
+//!
+//! Reference: `org.apache.fluss.record.DefaultValueRecordBatch` and
+//! `org.apache.fluss.record.DefaultValueRecord`.
+
+use crate::error::{Error, Result};
+use byteorder::{ByteOrder, LittleEndian};
+use bytes::Bytes;
+use std::ops::Range;
+
+/// Size of the batch `Length` field (Int32).
+const LENGTH_LENGTH: usize = 4;
+/// Size of the batch `Magic` field (Int8).
+const MAGIC_LENGTH: usize = 1;
+/// Size of the batch `RecordCount` field (Int32).
+const RECORD_COUNT_LENGTH: usize = 4;
+/// Offset of the record count within the batch header.
+const RECORD_COUNT_OFFSET: usize = LENGTH_LENGTH + MAGIC_LENGTH;
+/// Size of the batch header (`Length + Magic + RecordCount`).
+const RECORD_BATCH_HEADER_SIZE: usize = LENGTH_LENGTH + MAGIC_LENGTH + RECORD_COUNT_LENGTH;
+/// Size of a `ValueRecord`'s leading length field.
+const RECORD_LENGTH_LENGTH: usize = 4;
+
+/// Read-only view over a serialized value-record batch.
+///
+/// Construction does not validate the bytes; the accessor methods check
+/// bounds and return an error for truncated or inconsistent input.
+pub(crate) struct ValueRecordBatch {
+    // Raw batch bytes, beginning with the batch header at offset 0.
+    data: Bytes,
+}
+
+impl ValueRecordBatch {
+    /// Wraps raw batch bytes. The batch is expected to start at offset 0.
+    ///
+    /// No validation is performed here; malformed input is reported by
+    /// [`Self::record_count`] and [`Self::value_ranges`].
+    pub(crate) fn new(data: Bytes) -> Self {
+        Self { data }
+    }
+
+    /// Number of records declared in the batch header.
+    ///
+    /// The value is returned exactly as stored, so it may be negative for
+    /// corrupt input; [`Self::value_ranges`] rejects that case.
+    ///
+    /// # Errors
+    /// Returns an error if the buffer is shorter than the batch header.
+    pub(crate) fn record_count(&self) -> Result<i32> {
+        if self.data.len() < RECORD_BATCH_HEADER_SIZE {
+            return Err(corrupt(format!(
+                "value-record batch too short: {} bytes, need {} for header",
+                self.data.len(),
+                RECORD_BATCH_HEADER_SIZE
+            )));
+        }
+        Ok(LittleEndian::read_i32(
+            &self.data[RECORD_COUNT_OFFSET..RECORD_COUNT_OFFSET + RECORD_COUNT_LENGTH],
+        ))
+    }
+
+    /// Returns one byte range per record, each spanning `[SchemaId | Value]`:
+    /// the payload [`crate::row::FixedSchemaDecoder::decode`] expects. Index
+    /// [`Self::data`] with a returned range to get it without copying.
+    ///
+    /// # Errors
+    /// Returns an error if the header is too short, the declared record count
+    /// or a record length is negative, or a record's length field or payload
+    /// extends past the end of the buffer.
+    pub(crate) fn value_ranges(&self) -> Result<Vec<Range<usize>>> {
+        let declared = self.record_count()?;
+        let count = usize::try_from(declared)
+            .map_err(|_| corrupt(format!("invalid record count {declared}")))?;
+
+        // The declared count comes from untrusted bytes, so it must not drive
+        // the allocation directly: a corrupt header could otherwise request a
+        // multi-gigabyte vector before any bounds check runs. Every record
+        // needs at least its length field, which bounds how many records the
+        // buffer can really contain. `record_count()` succeeded, so the
+        // subtraction cannot underflow.
+        let max_records = (self.data.len() - RECORD_BATCH_HEADER_SIZE) / RECORD_LENGTH_LENGTH;
+        let mut ranges = Vec::with_capacity(count.min(max_records));
+
+        let mut pos = RECORD_BATCH_HEADER_SIZE;
+        for i in 0..count {
+            if pos + RECORD_LENGTH_LENGTH > self.data.len() {
+                return Err(corrupt(format!(
+                    "truncated value-record batch: record {i} length field runs past end"
+                )));
+            }
+            let raw_len = LittleEndian::read_i32(&self.data[pos..pos + RECORD_LENGTH_LENGTH]);
+            let rec_len = usize::try_from(raw_len)
+                .map_err(|_| corrupt(format!("record {i} has negative length {raw_len}")))?;
+
+            // The range covers everything after the record's length field.
+            let start = pos + RECORD_LENGTH_LENGTH;
+            let end = start + rec_len;
+            if end > self.data.len() {
+                return Err(corrupt(format!(
+                    "truncated value-record batch: record {i} payload runs past end"
+                )));
+            }
+            ranges.push(start..end);
+            pos = end;
+        }
+        Ok(ranges)
+    }
+
+    /// The underlying batch bytes.
+    pub(crate) fn data(&self) -> &Bytes {
+        &self.data
+    }
+}
+
+fn corrupt(message: String) -> Error {
+    Error::UnexpectedError {
+        message,
+        source: None,
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::record::kv::SCHEMA_ID_LENGTH;
+
+    /// Serializes `(schema_id, row_bytes)` pairs into a value-record batch,
+    /// following the Java `DefaultValueRecordBatch.Builder` wire layout.
+    fn build_batch(records: &[(i16, &[u8])]) -> Vec<u8> {
+        // Records first, so the batch length can be derived from them.
+        let mut payload = Vec::new();
+        for &(schema_id, row) in records {
+            let record_len = (SCHEMA_ID_LENGTH + row.len()) as i32;
+            payload.extend_from_slice(&record_len.to_le_bytes());
+            payload.extend_from_slice(&schema_id.to_le_bytes());
+            payload.extend_from_slice(row);
+        }
+
+        // Length covers Magic + RecordCount + payload.
+        let batch_len = (MAGIC_LENGTH + RECORD_COUNT_LENGTH + payload.len()) as i32;
+        let record_count = records.len() as i32;
+
+        let mut batch = Vec::with_capacity(RECORD_BATCH_HEADER_SIZE + payload.len());
+        batch.extend_from_slice(&batch_len.to_le_bytes());
+        batch.push(0); // magic
+        batch.extend_from_slice(&record_count.to_le_bytes());
+        batch.extend_from_slice(&payload);
+        batch
+    }
+
+    #[test]
+    fn parses_record_count_and_ranges() {
+        let batch = ValueRecordBatch::new(Bytes::from(build_batch(&[
+            (7, &[1, 2, 3]),
+            (7, &[4, 5]),
+        ])));
+        assert_eq!(batch.record_count().unwrap(), 2);
+
+        let ranges = batch.value_ranges().unwrap();
+        assert_eq!(ranges.len(), 2);
+
+        // First record payload = [schema_id(2) | row(3)] = 5 bytes.
+        let first = &batch.data()[ranges[0].clone()];
+        assert_eq!(first.len(), 5);
+        assert_eq!(LittleEndian::read_i16(&first[..2]), 7);
+        assert_eq!(&first[2..], &[1, 2, 3]);
+
+        // Second record payload = [schema_id(2) | row(2)] = 4 bytes.
+        let second = &batch.data()[ranges[1].clone()];
+        assert_eq!(second.len(), 4);
+        assert_eq!(&second[2..], &[4, 5]);
+    }
+
+    #[test]
+    fn empty_batch_has_no_ranges() {
+        let batch = ValueRecordBatch::new(Bytes::from(build_batch(&[])));
+        assert_eq!(batch.record_count().unwrap(), 0);
+        assert!(batch.value_ranges().unwrap().is_empty());
+    }
+
+    #[test]
+    fn truncated_payload_errors() {
+        let mut bytes = build_batch(&[(7, &[1, 2, 3])]);
+        let shortened = bytes.len() - 2; // chop into the row payload
+        bytes.truncate(shortened);
+        let batch = ValueRecordBatch::new(Bytes::from(bytes));
+        assert!(batch.value_ranges().is_err());
+    }
+
+    #[test]
+    fn short_header_errors() {
+        // Three bytes cannot hold the batch header.
+        let batch = ValueRecordBatch::new(Bytes::from(vec![0u8, 1, 2]));
+        assert!(batch.record_count().is_err());
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/record/mod.rs b/fluss-rust/crates/fluss/src/record/mod.rs
new file mode 100644
index 0000000000..462bdebbc1
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/record/mod.rs
@@ -0,0 +1,327 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::metadata::TableBucket;
+use crate::row::ColumnarRow;
+use ::arrow::array::RecordBatch;
+use core::fmt;
+use std::collections::HashMap;
+
+mod arrow;
+mod error;
+pub mod kv;
+
+pub use arrow::*;
+
+/// The kind of change a record represents.
+///
+/// Each variant has a short textual form (see `short_string`) and a one-byte
+/// encoding (see `to_byte_value` / `from_byte_value`).
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub enum ChangeType {
+    /// Append-only operation
+    AppendOnly,
+    /// Insert operation
+    Insert,
+    /// Update operation containing the previous content of the updated row
+    UpdateBefore,
+    /// Update operation containing the new content of the updated row
+    UpdateAfter,
+    /// Delete operation
+    Delete,
+}
+
+impl ChangeType {
+    /// Two-character symbol for this change type, e.g. `+I` for an insert.
+    pub fn short_string(&self) -> &'static str {
+        match *self {
+            Self::AppendOnly => "+A",
+            Self::Insert => "+I",
+            Self::UpdateBefore => "-U",
+            Self::UpdateAfter => "+U",
+            Self::Delete => "-D",
+        }
+    }
+
+    /// Single-byte encoding of this change type used for serialization
+    /// (`0` through `4`).
+    pub fn to_byte_value(&self) -> u8 {
+        match *self {
+            Self::AppendOnly => 0,
+            Self::Insert => 1,
+            Self::UpdateBefore => 2,
+            Self::UpdateAfter => 3,
+            Self::Delete => 4,
+        }
+    }
+
+    /// Decodes a byte as produced by [`Self::to_byte_value`].
+    ///
+    /// # Errors
+    /// Returns a descriptive message when `value` does not map to any
+    /// ChangeType.
+    pub fn from_byte_value(value: u8) -> Result<Self, String> {
+        let decoded = match value {
+            0 => Self::AppendOnly,
+            1 => Self::Insert,
+            2 => Self::UpdateBefore,
+            3 => Self::UpdateAfter,
+            4 => Self::Delete,
+            _ => return Err(format!("Unsupported byte value '{value}' for change type")),
+        };
+        Ok(decoded)
+    }
+}
+
+impl fmt::Display for ChangeType {
+    /// Writes the short string form (see [`ChangeType::short_string`]).
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.write_str(self.short_string())
+    }
+}
+
+/// A single scanned row together with its log offset, timestamp and change
+/// type.
+#[derive(Clone)]
+pub struct ScanRecord {
+    /// The row data of this record.
+    pub row: ColumnarRow,
+    // Position in the log; `-1` when created through `new_default`.
+    offset: i64,
+    // Record timestamp; `-1` when created through `new_default`.
+    timestamp: i64,
+    // Kind of change this record represents.
+    change_type: ChangeType,
+}
+
+impl ScanRecord {
+    /// Placeholder stored in `offset` and `timestamp` by [`Self::new_default`].
+    const INVALID: i64 = -1;
+
+    /// Wraps `row` with placeholder metadata: offset and timestamp are `-1`
+    /// and the change type is [`ChangeType::Insert`].
+    pub fn new_default(row: ColumnarRow) -> Self {
+        Self::new(row, Self::INVALID, Self::INVALID, ChangeType::Insert)
+    }
+
+    /// Creates a record from a row and its explicit metadata.
+    pub fn new(row: ColumnarRow, offset: i64, timestamp: i64, change_type: ChangeType) -> Self {
+        Self {
+            row,
+            offset,
+            timestamp,
+            change_type,
+        }
+    }
+
+    /// Borrows the row data.
+    pub fn row(&self) -> &ColumnarRow {
+        &self.row
+    }
+
+    /// Position of this record in the log.
+    pub fn offset(&self) -> i64 {
+        self.offset
+    }
+
+    /// Timestamp attached to this record.
+    pub fn timestamp(&self) -> i64 {
+        self.timestamp
+    }
+
+    /// Kind of change this record represents.
+    pub fn change_type(&self) -> &ChangeType {
+        &self.change_type
+    }
+}
+
+/// A set of scanned records grouped by the bucket they belong to.
+pub struct ScanRecords {
+    // Records per bucket; a bucket may be present with an empty vector.
+    records: HashMap<TableBucket, Vec<ScanRecord>>,
+}
+
+impl ScanRecords {
+    /// Creates a result set that contains no buckets and no records.
+    pub fn empty() -> Self {
+        Self {
+            records: HashMap::new(),
+        }
+    }
+
+    /// Wraps an already grouped map of records.
+    pub fn new(records: HashMap<TableBucket, Vec<ScanRecord>>) -> Self {
+        Self { records }
+    }
+
+    /// Records of the given bucket, or an empty slice when the bucket is not
+    /// part of this result set.
+    pub fn records(&self, scan_bucket: &TableBucket) -> &[ScanRecord] {
+        match self.records.get(scan_bucket) {
+            Some(bucket_records) => bucket_records.as_slice(),
+            None => &[],
+        }
+    }
+
+    /// Total number of records across all buckets.
+    pub fn count(&self) -> usize {
+        self.records.values().map(Vec::len).sum()
+    }
+
+    /// Returns `true` when there is no record at all, i.e. exactly when
+    /// [`Self::count`] is zero.
+    ///
+    /// A bucket that is present but holds no records does not make the set
+    /// non-empty; checking only the map would report `false` for a set that
+    /// yields nothing when iterated.
+    pub fn is_empty(&self) -> bool {
+        self.records.values().all(Vec::is_empty)
+    }
+
+    /// Borrows the underlying per-bucket map.
+    pub fn records_by_buckets(&self) -> &HashMap<TableBucket, Vec<ScanRecord>> {
+        &self.records
+    }
+
+    /// Consumes the set and returns the underlying per-bucket map.
+    pub fn into_records_by_buckets(self) -> HashMap<TableBucket, Vec<ScanRecord>> {
+        self.records
+    }
+}
+
+/// A batch of records with metadata about bucket and offsets.
+///
+/// This is the batch-level equivalent of [`ScanRecord`], providing efficient
+/// access to Arrow RecordBatches while preserving the bucket and offset information
+/// needed for tracking consumption progress.
+///
+/// Only the base offset is stored; the last offset is derived from it and the
+/// number of rows in the batch.
+#[derive(Debug, Clone)]
+pub struct ScanBatch {
+    /// The bucket this batch belongs to
+    bucket: TableBucket,
+    /// The Arrow RecordBatch containing the data
+    batch: RecordBatch,
+    /// Offset of the first record in this batch
+    base_offset: i64,
+}
+
+impl ScanBatch {
+    pub fn new(bucket: TableBucket, batch: RecordBatch, base_offset: i64) -> Self {
+        Self {
+            bucket,
+            batch,
+            base_offset,
+        }
+    }
+
+    pub fn bucket(&self) -> &TableBucket {
+        &self.bucket
+    }
+
+    pub fn batch(&self) -> &RecordBatch {
+        &self.batch
+    }
+
+    pub fn into_batch(self) -> RecordBatch {
+        self.batch
+    }
+
+    pub fn base_offset(&self) -> i64 {
+        self.base_offset
+    }
+
+    pub fn num_records(&self) -> usize {
+        self.batch.num_rows()
+    }
+
+    /// Returns the offset of the last record in this batch.
+    pub fn last_offset(&self) -> i64 {
+        if self.batch.num_rows() == 0 {
+            self.base_offset - 1
+        } else {
+            self.base_offset + self.batch.num_rows() as i64 - 1
+        }
+    }
+}
+
+impl IntoIterator for ScanRecords {
+    type Item = ScanRecord;
+    type IntoIter = std::vec::IntoIter<ScanRecord>;
+
+    /// Flattens the records of every bucket into a single owning iterator.
+    ///
+    /// Buckets are visited in the `HashMap`'s (unspecified) iteration order.
+    fn into_iter(self) -> Self::IntoIter {
+        let flattened: Vec<ScanRecord> = self.records.into_values().flatten().collect();
+        flattened.into_iter()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use ::arrow::array::{Int32Array, RecordBatch};
+    use ::arrow::datatypes::{DataType, Field, Schema};
+    use std::sync::Arc;
+
+    fn make_row(values: Vec<i32>, row_id: usize) -> ColumnarRow {
+        use crate::metadata::{IntType, RowType};
+        let schema = Arc::new(Schema::new(vec![Field::new("v", DataType::Int32, false)]));
+        let batch = RecordBatch::try_new(schema, vec![Arc::new(Int32Array::from(values))])
+            .expect("record batch");
+        // `DataType` is Arrow's in this module, so the Fluss type is spelled with its full path.
+        let int_type = crate::metadata::DataType::Int(IntType::new());
+        let row_type = Arc::new(RowType::with_data_types(vec![int_type]));
+        ColumnarRow::new(Arc::new(batch), row_type, row_id, None)
+    }
+
+    #[test]
+    fn change_type_round_trip() {
+        let expectations = [
+            (ChangeType::AppendOnly, "+A", 0),
+            (ChangeType::Insert, "+I", 1),
+            (ChangeType::UpdateBefore, "-U", 2),
+            (ChangeType::UpdateAfter, "+U", 3),
+            (ChangeType::Delete, "-D", 4),
+        ];
+
+        for (variant, symbol, code) in expectations {
+            assert_eq!(variant.short_string(), symbol);
+            assert_eq!(variant.to_byte_value(), code);
+            assert_eq!(ChangeType::from_byte_value(code).unwrap(), variant);
+        }
+
+        let rejection = ChangeType::from_byte_value(9).unwrap_err();
+        assert!(rejection.contains("Unsupported byte value"));
+    }
+
+    #[test]
+    fn scan_records_counts_and_iterates() {
+        let populated = TableBucket::new(1, 0);
+        let absent = TableBucket::new(1, 1);
+        let first = ScanRecord::new(make_row(vec![10, 11], 0), 5, 7, ChangeType::Insert);
+        let second = ScanRecord::new(make_row(vec![10, 11], 1), 6, 8, ChangeType::Delete);
+
+        let mut by_bucket = HashMap::new();
+        by_bucket.insert(populated.clone(), vec![first.clone(), second.clone()]);
+
+        let scan_records = ScanRecords::new(by_bucket);
+        assert_eq!(scan_records.records(&populated).len(), 2);
+        assert!(scan_records.records(&absent).is_empty());
+        assert_eq!(scan_records.count(), 2);
+
+        // Consuming iteration yields every stored record.
+        assert_eq!(scan_records.into_iter().count(), 2);
+    }
+
+    #[test]
+    fn scan_record_default_values() {
+        let defaulted = ScanRecord::new_default(make_row(vec![1], 0));
+        assert_eq!(defaulted.offset(), -1);
+        assert_eq!(defaulted.timestamp(), -1);
+        assert_eq!(defaulted.change_type(), &ChangeType::Insert);
+    }
+
+    #[test]
+    fn scan_batch_last_offset() {
+        let schema = Arc::new(Schema::new(vec![Field::new("v", DataType::Int32, false)]));
+        let bucket = TableBucket::new(1, 0);
+
+        // Three rows with base offset 100 occupy offsets 100..=102.
+        let three_rows = RecordBatch::try_new(
+            schema.clone(),
+            vec![Arc::new(Int32Array::from(vec![1, 2, 3]))],
+        )
+        .unwrap();
+        let populated = ScanBatch::new(bucket.clone(), three_rows, 100);
+        assert_eq!(populated.num_records(), 3);
+        assert_eq!(populated.last_offset(), 102);
+
+        // With no rows, the last offset falls one before the base offset.
+        let no_rows = RecordBatch::new_empty(schema);
+        let drained = ScanBatch::new(bucket, no_rows, 100);
+        assert_eq!(drained.num_records(), 0);
+        assert_eq!(drained.last_offset(), 99);
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/row/binary/binary_writer.rs b/fluss-rust/crates/fluss/src/row/binary/binary_writer.rs
new file mode 100644
index 0000000000..3380629599
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/row/binary/binary_writer.rs
@@ -0,0 +1,318 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::error::Error::IllegalArgument;
+use crate::error::Result;
+use crate::metadata::{DataType, RowType};
+use crate::row::Decimal;
+use crate::row::binary::BinaryRowFormat;
+use crate::row::datum::{TimestampLtz, TimestampNtz};
+use crate::row::{Datum, FlussArray, FlussMap};
+
+/// Writer for a composite binary data format, such as a row or an array.
+#[allow(dead_code)]
+pub trait BinaryWriter {
+    /// Resets the writer so it is ready for the next write.
+    fn reset(&mut self);
+
+    /// Marks the field at position `pos` as null.
+    fn set_null_at(&mut self, pos: usize);
+
+    fn write_boolean(&mut self, value: bool);
+
+    fn write_byte(&mut self, value: u8);
+
+    fn write_bytes(&mut self, value: &[u8]);
+
+    fn write_char(&mut self, value: &str, length: usize);
+
+    fn write_string(&mut self, value: &str);
+
+    fn write_short(&mut self, value: i16);
+
+    fn write_int(&mut self, value: i32);
+
+    fn write_long(&mut self, value: i64);
+
+    fn write_float(&mut self, value: f32);
+
+    fn write_double(&mut self, value: f64);
+
+    fn write_binary(&mut self, bytes: &[u8], length: usize);
+
+    fn write_decimal(&mut self, value: &Decimal, precision: u32);
+
+    /// Writes a TIME value.
+    ///
+    /// Note: TIME is physically stored as an i32 (milliseconds since midnight).
+    /// This method exists for type safety and semantic clarity, even though it's
+    /// currently equivalent to `write_int()`. The precision parameter is accepted
+    /// for API consistency with TIMESTAMP types, though TIME encoding doesn't
+    /// currently vary by precision.
+    fn write_time(&mut self, value: i32, precision: u32);
+
+    fn write_timestamp_ntz(&mut self, value: &TimestampNtz, precision: u32);
+
+    fn write_timestamp_ltz(&mut self, value: &TimestampLtz, precision: u32);
+
+    fn write_array(&mut self, value: &FlussArray);
+
+    fn write_map(&mut self, value: &FlussMap);
+
+    // TODO Row serializer
+    // fn write_row(&mut self, pos: i32, value: &InternalRow);
+
+    /// Completes the write, letting the implementation finalize its output (e.g. the real size).
+    fn complete(&mut self);
+}
+
+/// Field writer that pairs an [`InnerValueWriter`] with the nullability of its type.
+pub enum ValueWriter {
+    Nullable(InnerValueWriter),
+    NonNullable(InnerValueWriter),
+}
+
+impl ValueWriter {
+    /// Builds the writer for `element_type`, picking the variant from its nullability.
+    pub fn create_value_writer(
+        element_type: &DataType,
+        binary_row_format: Option<&BinaryRowFormat>,
+    ) -> Result<ValueWriter> {
+        let inner = InnerValueWriter::create_inner_value_writer(element_type, binary_row_format)?;
+        Ok(match element_type.is_nullable() {
+            true => Self::Nullable(inner),
+            false => Self::NonNullable(inner),
+        })
+    }
+
+    /// Writes `value` through `writer`.
+    ///
+    /// A `Datum::Null` given to a `Nullable` writer becomes `set_null_at(pos)`; every
+    /// other combination is delegated to the inner writer.
+    pub fn write_value<W: BinaryWriter>(
+        &self,
+        writer: &mut W,
+        pos: usize,
+        value: &Datum,
+    ) -> Result<()> {
+        match (self, value) {
+            (Self::Nullable(_), Datum::Null) => {
+                writer.set_null_at(pos);
+                Ok(())
+            }
+            (Self::Nullable(inner) | Self::NonNullable(inner), _) => {
+                inner.write_value(writer, pos, value)
+            }
+        }
+    }
+}
+
+#[derive(Debug)]
+pub enum InnerValueWriter {
+    Char,
+    String,
+    Boolean,
+    Binary,
+    Bytes,
+    TinyInt,
+    SmallInt,
+    Int,
+    BigInt,
+    Float,
+    Double,
+    Decimal(u32, u32), // precision, scale
+    Date,
+    Time(u32),         // precision (not used in wire format, but kept for consistency)
+    TimestampNtz(u32), // precision
+    TimestampLtz(u32), // precision
+    Array,
+    Map,
+    Row(NestedRowWriter), // per-field writers and nullability flags for the nested row
+}
+
+/// Pre-built field writers and nullability flags for encoding a nested row value.
+#[derive(Debug)]
+pub struct NestedRowWriter {
+    field_writers: Vec<InnerValueWriter>,
+    field_nullable: Vec<bool>,
+}
+
+impl NestedRowWriter {
+    fn from_row_type(row_type: &RowType) -> Result<Self> {
+        let fields = row_type.fields();
+        let mut writers = Vec::with_capacity(fields.len());
+        let mut nullability = Vec::with_capacity(fields.len());
+        for field in fields {
+            let data_type = field.data_type();
+            let writer = InnerValueWriter::create_inner_value_writer(data_type, None)?;
+            writers.push(writer);
+            nullability.push(data_type.is_nullable());
+        }
+        Ok(Self {
+            field_writers: writers,
+            field_nullable: nullability,
+        })
+    }
+
+    fn field_count(&self) -> usize {
+        self.field_writers.len()
+    }
+}
+
+/// Accessor for writing the fields/elements of a binary writer at runtime; the
+/// fields/elements must be written in order.
+impl InnerValueWriter {
+    /// Picks the writer variant that corresponds to `data_type`.
+    ///
+    /// DECIMAL carries its precision and scale into the variant, TIME and both
+    /// TIMESTAMP flavours carry their precision, and ROW builds a
+    /// `NestedRowWriter` from the row type's fields.
+    ///
+    /// The second argument (the binary row format) is currently ignored.
+    ///
+    /// Only the ROW arm is fallible: it propagates any error returned by
+    /// `NestedRowWriter::from_row_type`.
+    pub fn create_inner_value_writer(
+        data_type: &DataType,
+        _: Option<&BinaryRowFormat>,
+    ) -> Result<InnerValueWriter> {
+        let inner = match data_type {
+            DataType::Char(_) => Self::Char,
+            DataType::String(_) => Self::String,
+            DataType::Boolean(_) => Self::Boolean,
+            DataType::Binary(_) => Self::Binary,
+            DataType::Bytes(_) => Self::Bytes,
+            DataType::TinyInt(_) => Self::TinyInt,
+            DataType::SmallInt(_) => Self::SmallInt,
+            DataType::Int(_) => Self::Int,
+            DataType::BigInt(_) => Self::BigInt,
+            DataType::Float(_) => Self::Float,
+            DataType::Double(_) => Self::Double,
+            DataType::Date(_) => Self::Date,
+            DataType::Array(_) => Self::Array,
+            DataType::Map(_) => Self::Map,
+            // Precision/scale are validated when the type itself is constructed,
+            // so the parameterized arms below take them as-is.
+            DataType::Decimal(d) => Self::Decimal(d.precision(), d.scale()),
+            DataType::Time(t) => Self::Time(t.precision()),
+            DataType::Timestamp(t) => Self::TimestampNtz(t.precision()),
+            DataType::TimestampLTz(t) => Self::TimestampLtz(t.precision()),
+            // ROW is the only arm that can fail.
+            DataType::Row(row_type) => Self::Row(NestedRowWriter::from_row_type(row_type)?),
+        };
+        Ok(inner)
+    }
+    /// Encodes `value` with `writer` according to this writer's variant.
+    ///
+    /// Each variant accepts exactly one `Datum` shape (for example `Int` takes
+    /// `Datum::Int32`, while `Binary` and `Bytes` both take `Datum::Blob`). Any
+    /// other pairing, including a bare `Datum::Null`, is rejected, not coerced.
+    ///
+    /// # Arguments
+    /// * `writer` - destination that receives the encoded value.
+    /// * `_pos` - field position; not used by this method.
+    /// * `value` - the datum to encode.
+    ///
+    /// # Errors
+    /// Returns `IllegalArgument` when
+    /// * `value` does not match this writer's variant,
+    /// * a nested row's value count differs from the schema's field count, or
+    /// * a nested row holds a null in a field whose type is not nullable.
+    ///
+    /// Errors raised while writing a nested row's fields are propagated unchanged.
+    pub fn write_value<W: BinaryWriter>(
+        &self,
+        writer: &mut W,
+        _pos: usize,
+        value: &Datum,
+    ) -> Result<()> {
+        match (self, value) {
+            // Text. CHAR passes the string's own `len()` as the `length` argument.
+            (Self::Char, Datum::String(v)) => writer.write_char(v, v.len()),
+            (Self::String, Datum::String(v)) => writer.write_string(v),
+            // Boolean flag.
+            (Self::Boolean, Datum::Bool(v)) => writer.write_boolean(*v),
+            // Binary payloads. BINARY passes the blob's full length, mirroring CHAR.
+            (Self::Binary, Datum::Blob(v)) => {
+                let b = v.as_ref();
+                writer.write_binary(b, b.len());
+            }
+            (Self::Bytes, Datum::Blob(v)) => writer.write_bytes(v.as_ref()),
+            // Integers. TINYINT is handed over as `u8` through an `as` cast.
+            (Self::TinyInt, Datum::Int8(v)) => writer.write_byte(*v as u8),
+            (Self::SmallInt, Datum::Int16(v)) => writer.write_short(*v),
+            (Self::Int, Datum::Int32(v)) => writer.write_int(*v),
+            (Self::BigInt, Datum::Int64(v)) => writer.write_long(*v),
+            // Floating point: `into_inner()` yields the primitive to write.
+            (Self::Float, Datum::Float32(v)) => writer.write_float(v.into_inner()),
+            (Self::Double, Datum::Float64(v)) => writer.write_double(v.into_inner()),
+            // DECIMAL forwards its precision; the scale held by the variant is unused here.
+            (Self::Decimal(p, _s), Datum::Decimal(v)) => writer.write_decimal(v, *p),
+            // Temporal types. DATE goes through `write_int`; the others forward their
+            // precision to the dedicated writer methods.
+            (Self::Date, Datum::Date(d)) => writer.write_int(d.get_inner()),
+            (Self::Time(p), Datum::Time(t)) => writer.write_time(t.get_inner(), *p),
+            (Self::TimestampNtz(p), Datum::TimestampNtz(ts)) => writer.write_timestamp_ntz(ts, *p),
+            (Self::TimestampLtz(p), Datum::TimestampLtz(ts)) => writer.write_timestamp_ltz(ts, *p),
+            // Collections are handed to the writer as-is.
+            (Self::Array, Datum::Array(arr)) => writer.write_array(arr),
+            (Self::Map, Datum::Map(map)) => writer.write_map(map),
+            // ROW: encode the fields with a fresh `CompactedRowWriter`, then emit its
+            // buffer through `write_bytes`.
+            (Self::Row(schema), Datum::Row(nested_row)) => {
+                use crate::row::compacted::CompactedRowWriter;
+                let expected = schema.field_count();
+                let actual = nested_row.values.len();
+                // The value must supply exactly one datum per schema field.
+                if actual != expected {
+                    return Err(IllegalArgument {
+                        message: format!(
+                            "nested row arity mismatch: schema has {} fields, got {}",
+                            expected,
+                            actual,
+                        ),
+                    });
+                }
+                // Encode field by field; `i` doubles as the position inside the nested row.
+                let mut encoder = CompactedRowWriter::new(expected);
+                for (i, datum) in nested_row.values.iter().enumerate() {
+                    if !datum.is_null() {
+                        schema.field_writers[i].write_value(&mut encoder, i, datum)?;
+                        continue;
+                    }
+                    // A null datum is only legal when the field's type is nullable.
+                    if !schema.field_nullable[i] {
+                        return Err(IllegalArgument {
+                            message: format!(
+                                "nested row field {i} is non-nullable but received null",
+                            ),
+                        });
+                    }
+                    encoder.set_null_at(i);
+                }
+                writer.write_bytes(encoder.buffer());
+            }
+            // Anything else is a writer/datum mismatch.
+            _ => {
+                return Err(IllegalArgument {
+                    message: format!("{self:?} used to write value {value:?}"),
+                });
+            }
+        }
+        Ok(())
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/row/binary/iceberg_binary_row_writer.rs b/fluss-rust/crates/fluss/src/row/binary/iceberg_binary_row_writer.rs
new file mode 100644
index 0000000000..82a61928ae
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/row/binary/iceberg_binary_row_writer.rs
@@ -0,0 +1,564 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use bytes::{Bytes, BytesMut};
+
+use crate::error::{Error, Result};
+use crate::metadata::DataType;
+use crate::row::Decimal;
+use crate::row::binary::{BinaryWriter, ValueWriter};
+use crate::row::binary_array::FlussArray;
+use crate::row::binary_map::FlussMap;
+
+const MICROS_PER_MILLI: i64 = 1_000; // also the divisor used to turn nanoseconds into microseconds
+
+/// Iceberg-specific binary writer for encoding key columns.
+///
+/// Unlike [`CompactedRowWriter`] which uses varint encoding and length-prefixed
+/// variable-length fields, this writer follows Iceberg's encoding conventions:
+/// - Integers (int, date) are written as i64 (8 bytes, little-endian)
+/// - Time values are converted from milliseconds to microseconds
+/// - Timestamps are converted to microseconds
+/// - Floats/doubles use fixed-width little-endian encoding
+/// - Variable-length types (string, binary) are written without length prefixes
+/// - Decimals are written as unscaled big-endian bytes without length prefixes
+///
+/// The encoded bytes feed directly into `IcebergBucketingFunction`'s MurmurHash
+/// for bucket assignment and must match the Java Fluss server's encoding exactly.
+///
+/// [`CompactedRowWriter`]: crate::row::compacted::CompactedRowWriter
+pub struct IcebergBinaryRowWriter {
+    position: usize,  // number of bytes written so far; `buffer[..position]` is the output
+    buffer: BytesMut, // zero-filled backing storage, grown on demand
+}
+
+impl Default for IcebergBinaryRowWriter {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl IcebergBinaryRowWriter {
+    /// Creates an empty writer backed by a zero-filled 64-byte buffer.
+    pub fn new() -> Self {
+        Self {
+            position: 0,
+            buffer: BytesMut::zeroed(64),
+        }
+    }
+
+    // Dependency order note:
+    // 1) Keep this PR scoped to writer-level Java parity.
+    // 2) Wire the writer through IcebergKeyEncoder in follow-up #308.
+    // TODO(#308): add end-to-end key-encoding tests via IcebergKeyEncoder
+    // (similar to CompactedKeyEncoder tests for CompactedKeyWriter).
+    /// Builds the [`ValueWriter`] for `field_type`; other types yield `UnsupportedOperation`.
+    pub fn create_value_writer(field_type: &DataType) -> Result<ValueWriter> {
+        let message = match field_type {
+            // Match Java IcebergBinaryRowWriter.createFieldWriter() supported types exactly.
+            DataType::Int(_)
+            | DataType::Date(_)
+            | DataType::Time(_)
+            | DataType::BigInt(_)
+            | DataType::Float(_)
+            | DataType::Double(_)
+            | DataType::Timestamp(_)
+            | DataType::Decimal(_)
+            | DataType::String(_)
+            | DataType::Char(_)
+            | DataType::Binary(_)
+            | DataType::Bytes(_) => return ValueWriter::create_value_writer(field_type, None),
+
+            // Keep Java's explicit scalar-only rejection messaging for ARRAY/MAP.
+            DataType::Array(_) => {
+                "Array types cannot be used as bucket keys. Bucket keys must be scalar types."
+                    .to_string()
+            }
+            DataType::Map(_) => {
+                "Map types cannot be used as bucket keys. Bucket keys must be scalar types."
+                    .to_string()
+            }
+
+            // BOOLEAN, TINYINT, SMALLINT, TIMESTAMP_LTZ, ROW and any future types.
+            _ => format!(
+                "Unsupported type for Iceberg binary row writer: {:?}",
+                field_type
+            ),
+        };
+        Err(Error::UnsupportedOperation { message })
+    }
+
+    #[allow(dead_code)]
+    pub fn position(&self) -> usize {
+        self.position
+    }
+
+    #[allow(dead_code)]
+    pub fn buffer(&self) -> &[u8] {
+        &self.buffer[..self.position]
+    }
+
+    pub fn to_bytes(&self) -> Bytes {
+        Bytes::copy_from_slice(self.buffer())
+    }
+
+    /// Grows the buffer, at least doubling it, when fewer than `need_len` bytes remain.
+    fn ensure_capacity(&mut self, need_len: usize) {
+        let len = self.buffer.len();
+        if len - self.position < need_len {
+            self.buffer.resize((len * 2).max(len + need_len), 0);
+        }
+    }
+
+    /// Appends `src` verbatim at the current position and advances the position.
+    fn write_raw(&mut self, src: &[u8]) {
+        self.ensure_capacity(src.len());
+        let start = self.position;
+        self.position = start + src.len();
+        self.buffer[start..self.position].copy_from_slice(src);
+    }
+}
+
+impl BinaryWriter for IcebergBinaryRowWriter {
+    fn reset(&mut self) {
+        // Zero the bytes written so far (a no-op when there are none), then rewind.
+        self.buffer[..self.position].fill(0);
+        self.position = 0;
+    }
+
+    fn set_null_at(&mut self, _pos: usize) {
+        // This trait method returns `()`, so the unsupported null is reported by panicking.
+        panic!("Iceberg key columns do not support null values");
+    }
+
+    fn write_boolean(&mut self, value: bool) {
+        self.write_raw(&[u8::from(value)]);
+    }
+
+    fn write_byte(&mut self, value: u8) {
+        self.write_raw(&[value]);
+    }
+
+    fn write_bytes(&mut self, value: &[u8]) {
+        // Iceberg: the bytes go out verbatim, without a length prefix.
+        self.write_raw(value);
+    }
+
+    fn write_char(&mut self, value: &str, _length: usize) {
+        // Iceberg: encoded exactly like a string; `_length` is ignored.
+        self.write_string(value);
+    }
+
+    fn write_string(&mut self, value: &str) {
+        // Iceberg: raw UTF-8 bytes, without a length prefix.
+        self.write_raw(value.as_bytes());
+    }
+
+    fn write_short(&mut self, value: i16) {
+        // Two little-endian bytes; unlike `write_int`, there is no widening to i64.
+        self.write_raw(&value.to_le_bytes());
+    }
+
+    fn write_int(&mut self, value: i32) {
+        // Iceberg: widen the i32 to i64 and write 8 little-endian bytes.
+        self.write_raw(&i64::from(value).to_le_bytes());
+    }
+
+    fn write_long(&mut self, value: i64) {
+        self.write_raw(&value.to_le_bytes());
+    }
+
+    fn write_float(&mut self, value: f32) {
+        self.write_raw(&value.to_le_bytes());
+    }
+
+    fn write_double(&mut self, value: f64) {
+        self.write_raw(&value.to_le_bytes());
+    }
+
+    fn write_binary(&mut self, bytes: &[u8], length: usize) {
+        // Iceberg: at most `length` raw bytes, without a length prefix.
+        self.write_raw(&bytes[..bytes.len().min(length)]);
+    }
+
+    fn write_decimal(&mut self, value: &Decimal, _precision: u32) {
+        // Iceberg: unscaled big-endian bytes, without a length prefix.
+        let unscaled = value.to_unscaled_bytes();
+        self.write_raw(&unscaled);
+    }
+
+    fn write_time(&mut self, value: i32, _precision: u32) {
+        // Milliseconds to microseconds. Any i32 times 1_000 fits in an i64, so this
+        // wrapping multiply can never actually wrap.
+        let micros = i64::from(value).wrapping_mul(MICROS_PER_MILLI);
+        self.write_raw(&micros.to_le_bytes());
+    }
+
+    fn write_timestamp_ntz(&mut self, value: &crate::row::datum::TimestampNtz, _precision: u32) {
+        // NOTE: the wrapping arithmetic mirrors how Java's `long` math wraps on overflow.
+        // Nanoseconds within the millisecond are truncated to whole microseconds.
+        let millis = value.get_millisecond();
+        let frac = (value.get_nano_of_millisecond() as i64) / MICROS_PER_MILLI;
+        let micros = millis.wrapping_mul(MICROS_PER_MILLI).wrapping_add(frac);
+        self.write_raw(&micros.to_le_bytes());
+    }
+
+    fn write_timestamp_ltz(&mut self, value: &crate::row::datum::TimestampLtz, _precision: u32) {
+        // NOTE: the wrapping arithmetic mirrors how Java's `long` math wraps on overflow.
+        // Nanoseconds within the millisecond are truncated to whole microseconds.
+        let millis = value.get_epoch_millisecond();
+        let frac = (value.get_nano_of_millisecond() as i64) / MICROS_PER_MILLI;
+        let micros = millis.wrapping_mul(MICROS_PER_MILLI).wrapping_add(frac);
+        self.write_raw(&micros.to_le_bytes());
+    }
+
+    fn write_array(&mut self, _value: &FlussArray) {
+        unreachable!("Array/Map types are rejected during value writer creation");
+    }
+
+    fn write_map(&mut self, _value: &FlussMap) {
+        unreachable!("Array/Map types are rejected during value writer creation");
+    }
+
+    fn complete(&mut self) {
+        // No finalization needed for Iceberg key encoding
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::metadata::{DataTypes, SmallIntType, TinyIntType};
+    use crate::row::datum::{TimestampLtz, TimestampNtz};
+    use bigdecimal::{BigDecimal, num_bigint::BigInt};
+
+    fn assert_unsupported_type(dt: DataType, expected_fragment: &str) {
+        match IcebergBinaryRowWriter::create_value_writer(&dt) {
+            Err(e) => assert!(
+                e.to_string().contains(expected_fragment),
+                "unexpected error for {dt:?}: {e}"
+            ),
+            Ok(_) => panic!("expected error for unsupported type {dt:?}, got Ok"),
+        }
+    }
+
+    #[test]
+    fn test_write_int_as_i64_le() {
+        let mut w = IcebergBinaryRowWriter::new();
+        w.write_int(42);
+        assert_eq!(w.buffer(), &42i64.to_le_bytes());
+    }
+
+    #[test]
+    fn test_write_int_negative() {
+        let mut w = IcebergBinaryRowWriter::new();
+        w.write_int(-1);
+        assert_eq!(w.buffer(), &(-1i64).to_le_bytes());
+    }
+
+    #[test]
+    fn test_write_long() {
+        let mut w = IcebergBinaryRowWriter::new();
+        w.write_long(123456789012345i64);
+        assert_eq!(w.buffer(), &123456789012345i64.to_le_bytes());
+    }
+
+    #[test]
+    fn test_write_float() {
+        let mut w = IcebergBinaryRowWriter::new();
+        let val = 1.23f32;
+        w.write_float(val);
+        assert_eq!(w.buffer(), &val.to_le_bytes());
+    }
+
+    #[test]
+    fn test_write_double() {
+        let mut w = IcebergBinaryRowWriter::new();
+        let val = 9.876543210f64;
+        w.write_double(val);
+        assert_eq!(w.buffer(), &val.to_le_bytes());
+    }
+
+    #[test]
+    fn test_write_string_no_length_prefix() {
+        let mut w = IcebergBinaryRowWriter::new();
+        w.write_string("hello");
+        assert_eq!(w.buffer(), b"hello");
+    }
+
+    #[test]
+    fn test_write_bytes_no_length_prefix() {
+        let mut w = IcebergBinaryRowWriter::new();
+        let data = &[0xDE, 0xAD, 0xBE, 0xEF];
+        w.write_bytes(data);
+        assert_eq!(w.buffer(), data);
+    }
+
+    #[test]
+    fn test_write_binary_no_length_prefix() {
+        let mut w = IcebergBinaryRowWriter::new();
+        let data = &[1, 2, 3, 4, 5];
+        w.write_binary(data, 3);
+        assert_eq!(w.buffer(), &[1, 2, 3]);
+    }
+
+    #[test]
+    fn test_write_time_millis_to_micros() {
+        let mut w = IcebergBinaryRowWriter::new();
+        // 1000 ms = 1_000_000 µs
+        w.write_time(1000, 0);
+        assert_eq!(w.buffer(), &1_000_000i64.to_le_bytes());
+    }
+
+    #[test]
+    fn test_write_timestamp_ntz_compact() {
+        let mut w = IcebergBinaryRowWriter::new();
+        let ts = TimestampNtz::new(1672531200000); // 2023-01-01 00:00:00 UTC
+        w.write_timestamp_ntz(&ts, 3);
+        let expected_micros = 1672531200000i64 * 1000;
+        assert_eq!(w.buffer(), &expected_micros.to_le_bytes());
+    }
+
+    #[test]
+    fn test_write_timestamp_ntz_with_nanos() {
+        let mut w = IcebergBinaryRowWriter::new();
+        let ts = TimestampNtz::from_millis_nanos(1000, 500_000).unwrap();
+        w.write_timestamp_ntz(&ts, 6);
+        // 1000ms * 1000 + 500_000ns / 1000 = 1_000_000 + 500 = 1_000_500 µs
+        assert_eq!(w.buffer(), &1_000_500i64.to_le_bytes());
+    }
+
+    #[test]
+    fn test_write_timestamp_ltz() {
+        let mut w = IcebergBinaryRowWriter::new();
+        let ts = TimestampLtz::from_millis_nanos(2000, 300_000).unwrap();
+        w.write_timestamp_ltz(&ts, 6);
+        // 2000ms * 1000 + 300_000ns / 1000 = 2_000_000 + 300 = 2_000_300 µs
+        assert_eq!(w.buffer(), &2_000_300i64.to_le_bytes());
+    }
+
+    #[test]
+    fn test_write_timestamp_ntz_overflow_wraps_like_java() {
+        let mut w = IcebergBinaryRowWriter::new();
+        let ts = TimestampNtz::from_millis_nanos(i64::MAX, 999_999).unwrap();
+        w.write_timestamp_ntz(&ts, 9);
+
+        let expected = i64::MAX.wrapping_mul(MICROS_PER_MILLI).wrapping_add(999);
+        assert_eq!(w.buffer(), &expected.to_le_bytes());
+    }
+
+    #[test]
+    fn test_write_timestamp_ltz_overflow_wraps_like_java() {
+        let mut w = IcebergBinaryRowWriter::new();
+        let ts = TimestampLtz::from_millis_nanos(i64::MIN, 999_999).unwrap();
+        w.write_timestamp_ltz(&ts, 9);
+
+        let expected = i64::MIN.wrapping_mul(MICROS_PER_MILLI).wrapping_add(999);
+        assert_eq!(w.buffer(), &expected.to_le_bytes());
+    }
+
+    #[test]
+    fn test_write_decimal_compact() {
+        let mut w = IcebergBinaryRowWriter::new();
+        let bd = BigDecimal::new(BigInt::from(12345), 2); // 123.45
+        let decimal = Decimal::from_big_decimal(bd, 10, 2).unwrap();
+        w.write_decimal(&decimal, 10);
+
+        let expected = BigInt::from(12345).to_signed_bytes_be();
+        assert_eq!(w.buffer(), expected.as_slice());
+    }
+
+    #[test]
+    fn test_write_decimal_non_compact() {
+        let mut w = IcebergBinaryRowWriter::new();
+        let bd = BigDecimal::new(BigInt::from(12345), 0);
+        let decimal = Decimal::from_big_decimal(bd, 28, 0).unwrap();
+        w.write_decimal(&decimal, 28);
+
+        let expected = BigInt::from(12345).to_signed_bytes_be();
+        assert_eq!(w.buffer(), expected.as_slice());
+    }
+
+    #[test]
+    fn test_write_boolean() {
+        let mut w = IcebergBinaryRowWriter::new();
+        w.write_boolean(true);
+        assert_eq!(w.buffer(), &[1u8]);
+
+        w.reset();
+        w.write_boolean(false);
+        assert_eq!(w.buffer(), &[0u8]);
+    }
+
+    #[test]
+    #[should_panic(expected = "Iceberg key columns do not support null values")]
+    fn test_set_null_panics() {
+        let mut w = IcebergBinaryRowWriter::new();
+        w.set_null_at(0);
+    }
+
+    #[test]
+    fn test_reset_clears_position() {
+        let mut w = IcebergBinaryRowWriter::new();
+        w.write_int(42);
+        assert_eq!(w.position(), 8);
+        w.reset();
+        assert_eq!(w.position(), 0);
+        assert_eq!(w.buffer().len(), 0);
+    }
+
+    #[test]
+    fn test_to_bytes() {
+        // `to_bytes` must expose exactly the bytes written so far.
+        let mut writer = IcebergBinaryRowWriter::new();
+        writer.write_string("test");
+        assert_eq!(writer.to_bytes().as_ref(), b"test");
+    }
+
+    #[test]
+    fn test_multiple_writes() {
+        let mut writer = IcebergBinaryRowWriter::new();
+        writer.write_int(1);
+        writer.write_string("ab");
+
+        // Expected layout: the int as an 8-byte little-endian i64, then the 2 bytes of "ab".
+        let written = writer.buffer().to_vec();
+        assert_eq!(written.len(), 10);
+
+        let (int_part, string_part) = written.split_at(8);
+        assert_eq!(int_part, &1i64.to_le_bytes());
+        assert_eq!(string_part, b"ab");
+    }
+
+    #[test]
+    fn test_buffer_growth() {
+        // A 128-byte payload is larger than 64 bytes, which should force the buffer to grow.
+        let payload = vec![0xAAu8; 128];
+
+        let mut writer = IcebergBinaryRowWriter::new();
+        writer.write_bytes(&payload);
+
+        assert_eq!(writer.buffer(), payload.as_slice());
+    }
+
+    #[test]
+    fn test_create_value_writer_rejects_tinyint() {
+        // TINYINT must be refused with the generic unsupported-type error.
+        let dt = DataType::TinyInt(TinyIntType::new());
+        let Err(e) = IcebergBinaryRowWriter::create_value_writer(&dt) else {
+            panic!("expected error for TinyInt, got Ok");
+        };
+        assert!(
+            e.to_string()
+                .contains("Unsupported type for Iceberg binary row writer"),
+            "unexpected error: {e}",
+        );
+    }
+
+    #[test]
+    fn test_create_value_writer_rejects_smallint() {
+        // SMALLINT must be refused with the generic unsupported-type error.
+        let dt = DataType::SmallInt(SmallIntType::new());
+        let Err(e) = IcebergBinaryRowWriter::create_value_writer(&dt) else {
+            panic!("expected error for SmallInt, got Ok");
+        };
+        assert!(
+            e.to_string()
+                .contains("Unsupported type for Iceberg binary row writer"),
+            "unexpected error: {e}",
+        );
+    }
+
+    #[test]
+    fn test_create_value_writer_rejects_boolean() {
+        // BOOLEAN is expected to be rejected with the generic unsupported-type message.
+        let expected_message = "Unsupported type for Iceberg binary row writer";
+        assert_unsupported_type(DataTypes::boolean(), expected_message);
+    }
+
+    #[test]
+    fn test_create_value_writer_rejects_timestamp_ltz() {
+        // TIMESTAMP_LTZ is expected to be rejected with the generic unsupported-type message.
+        let expected_message = "Unsupported type for Iceberg binary row writer";
+        assert_unsupported_type(DataTypes::timestamp_ltz(), expected_message);
+    }
+
+    #[test]
+    fn test_create_value_writer_rejects_array() {
+        // ARRAY has its own, more specific rejection message.
+        let array_of_int = DataTypes::array(DataTypes::int());
+        assert_unsupported_type(array_of_int, "Array types cannot be used as bucket keys");
+    }
+
+    #[test]
+    fn test_create_value_writer_rejects_map() {
+        // MAP has its own, more specific rejection message.
+        let string_to_int = DataTypes::map(DataTypes::string(), DataTypes::int());
+        assert_unsupported_type(string_to_int, "Map types cannot be used as bucket keys");
+    }
+
+    #[test]
+    fn test_create_value_writer_rejects_row() {
+        // A single-field ROW is expected to be rejected with the generic unsupported-type message.
+        let single_field_row = DataTypes::row(vec![DataTypes::field("f0", DataTypes::int())]);
+        assert_unsupported_type(
+            single_field_row,
+            "Unsupported type for Iceberg binary row writer",
+        );
+    }
+
+    #[test]
+    fn test_create_value_writer_accepts_java_supported_scalar_types() {
+        // (label, type) pairs; the label is only used in the failure message.
+        let cases = [
+            ("int", DataTypes::int()),
+            ("date", DataTypes::date()),
+            ("time", DataTypes::time()),
+            ("bigint", DataTypes::bigint()),
+            ("float", DataTypes::float()),
+            ("double", DataTypes::double()),
+            ("timestamp_ntz", DataTypes::timestamp()),
+            ("decimal", DataTypes::decimal(10, 2)),
+            ("string", DataTypes::string()),
+            ("char", DataTypes::char(16)),
+            ("binary", DataTypes::binary(8)),
+            ("bytes", DataTypes::bytes()),
+        ];
+
+        // Every listed type must yield a value writer; the first failure aborts the test.
+        for (name, data_type) in cases.iter() {
+            if let Err(e) = IcebergBinaryRowWriter::create_value_writer(data_type) {
+                panic!("expected {name} to be supported, got error: {e}");
+            }
+        }
+    }
+
+    #[test]
+    fn test_write_char_same_as_string() {
+        // Writing the same text as CHAR(10) and as STRING must produce identical bytes.
+        let mut char_writer = IcebergBinaryRowWriter::new();
+        let mut string_writer = IcebergBinaryRowWriter::new();
+
+        char_writer.write_char("hello", 10);
+        string_writer.write_string("hello");
+
+        assert_eq!(char_writer.buffer(), string_writer.buffer());
+    }
+
+    #[test]
+    fn test_write_date_as_int() {
+        // Dates are encoded through write_int (via InnerValueWriter::Date), which emits
+        // the value widened to an i64 in little-endian order.
+        let days_since_epoch: i32 = 19000; // ~2022-01-06
+        let widened = i64::from(days_since_epoch);
+
+        let mut writer = IcebergBinaryRowWriter::new();
+        writer.write_int(days_since_epoch);
+
+        assert_eq!(writer.buffer(), &widened.to_le_bytes());
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/row/binary/mod.rs b/fluss-rust/crates/fluss/src/row/binary/mod.rs
new file mode 100644
index 0000000000..d6248dc515
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/row/binary/mod.rs
@@ -0,0 +1,30 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+mod binary_writer;
+mod iceberg_binary_row_writer;
+
+pub use binary_writer::*;
+pub use iceberg_binary_row_writer::IcebergBinaryRowWriter;
+
+/// The binary row format types. Each variant identifies the kind of row layout
+/// that a [`BinaryWriter`] generates.
+#[allow(dead_code)]
+pub enum BinaryRowFormat {
+    Compacted,
+    Aligned,
+    Indexed,
+}
diff --git a/fluss-rust/crates/fluss/src/row/binary_array.rs b/fluss-rust/crates/fluss/src/row/binary_array.rs
new file mode 100644
index 0000000000..b987cec8b7
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/row/binary_array.rs
@@ -0,0 +1,1288 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Binary array format matching Java's `BinaryArray.java` layout.
+//!
+//! Binary layout:
+//! ```text
+//! [size(4B)] + [null bits (4-byte word aligned)] + [fixed-length part] + [variable-length part]
+//! ```
+//!
+//! Java reference: `BinaryArray.java`, `BinaryArrayWriter.java`
+
+use crate::error::Error::IllegalArgument;
+use crate::error::Result;
+use crate::metadata::{DataType, RowType};
+use crate::row::Decimal;
+use crate::row::InternalRow;
+use crate::row::binary::{BinaryRowFormat, ValueWriter};
+use crate::row::binary_map::FlussMap;
+use crate::row::compacted::{CompactedRow, CompactedRowWriter, calculate_bit_set_width_in_bytes};
+use crate::row::datum::{Date, Time, TimestampLtz, TimestampNtz};
+use crate::row::field_getter::FieldGetter;
+use bytes::Bytes;
+use serde::Serialize;
+use std::fmt;
+use std::hash::{Hash, Hasher};
+
+// Largest payload (in bytes) that is stored inline inside an element's 8-byte slot
+// instead of being appended to the variable-length part.
+const MAX_FIX_PART_DATA_SIZE: usize = 7;
+// Top bit of a packed 64-bit slot. The readers in this file treat a set bit as
+// "payload is stored inline in the slot" and a clear bit as "slot holds offset and length".
+const HIGHEST_FIRST_BIT: u64 = 0x80_u64 << 56;
+// Mask for the remaining seven bits of the top byte, which carry the inline payload length.
+const HIGHEST_SECOND_TO_EIGHTH_BIT: u64 = 0x7F_u64 << 56;
+
+/// Returns the header size in bytes for an array of `num_elements` elements.
+///
+/// The header is a 4-byte element count followed by a null bitmap that is padded to
+/// whole 4-byte words (one bit per element, 32 elements per word).
+/// Matches Java's `BinaryArray.calculateHeaderInBytes(numFields)`.
+pub fn calculate_header_in_bytes(num_elements: usize) -> usize {
+    let null_bitmap_words = num_elements.div_ceil(32);
+    let null_bitmap_bytes = null_bitmap_words * 4;
+    4 + null_bitmap_bytes
+}
+
+/// Returns the width in bytes of one fixed-length slot for elements of `element_type`.
+///
+/// One- and two-byte primitives use their natural width, 32-bit values use 4 bytes, and
+/// every other type (including all variable-length and nested types) uses an 8-byte slot.
+/// Matches Java's `BinaryArray.calculateFixLengthPartSize(DataType)`.
+pub fn calculate_fix_length_part_size(element_type: &DataType) -> usize {
+    let slot_width = match element_type {
+        DataType::BigInt(_)
+        | DataType::Double(_)
+        | DataType::Decimal(_)
+        | DataType::Timestamp(_)
+        | DataType::TimestampLTz(_)
+        | DataType::Char(_)
+        | DataType::String(_)
+        | DataType::Binary(_)
+        | DataType::Bytes(_)
+        | DataType::Array(_)
+        | DataType::Map(_)
+        | DataType::Row(_) => 8,
+        DataType::Int(_) | DataType::Float(_) | DataType::Date(_) | DataType::Time(_) => 4,
+        DataType::SmallInt(_) => 2,
+        DataType::Boolean(_) | DataType::TinyInt(_) => 1,
+    };
+    slot_width
+}
+
+/// Rounds `num_bytes` up to the next multiple of 8 (values already on a boundary are unchanged).
+/// Matches Java's `roundNumberOfBytesToNearestWord`.
+fn round_to_nearest_word(num_bytes: usize) -> usize {
+    let bumped = num_bytes + 7;
+    // Dropping the remainder is the same as clearing the three low bits.
+    bumped - bumped % 8
+}
+
+/// Returns true when elements of `dt` may keep their payload in the variable-length part.
+///
+/// Strings, binaries and nested types always qualify. Decimals and timestamps qualify only
+/// when their precision is not "compact"; every other type is purely fixed-length.
+fn is_variable_length_type(dt: &DataType) -> bool {
+    match dt {
+        DataType::Decimal(d) => !Decimal::is_compact_precision(d.precision()),
+        DataType::Timestamp(t) => !TimestampNtz::is_compact(t.precision()),
+        DataType::TimestampLTz(t) => !TimestampLtz::is_compact(t.precision()),
+        other => matches!(
+            other,
+            DataType::Char(_)
+                | DataType::String(_)
+                | DataType::Binary(_)
+                | DataType::Bytes(_)
+                | DataType::Array(_)
+                | DataType::Map(_)
+                | DataType::Row(_)
+        ),
+    }
+}
+
+/// A Fluss binary array, wire-compatible with Java's `BinaryArray`.
+///
+/// Stores elements in a flat byte buffer with a header (element count + null bitmap)
+/// followed by fixed-length slots and an optional variable-length section.
+///
+/// Uses `Bytes` internally so cloning is O(1) reference-counted.
+///
+/// Equality, ordering, hashing and serialization are all defined over the raw bytes.
+// TODO: FlussArray currently exposes only fallible getters. Infallible
+// fast-path variants may be added later as non-breaking extensions.
+#[derive(Clone)]
+pub struct FlussArray {
+    // Complete binary representation of the array (header, fixed slots, variable part).
+    data: Bytes,
+    // Number of elements, decoded from the first 4 bytes of `data` during validation.
+    size: usize,
+    // Byte offset of the first fixed-length slot, i.e. the header size for `size` elements.
+    element_offset: usize,
+}
+
+/// Debug output shows the element count and buffer length rather than the raw bytes.
+impl fmt::Debug for FlussArray {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let data_len = self.data.len();
+        let mut builder = f.debug_struct("FlussArray");
+        builder.field("size", &self.size);
+        builder.field("data_len", &data_len);
+        builder.finish()
+    }
+}
+
+/// Display output is a short summary containing only the element count.
+impl fmt::Display for FlussArray {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.write_fmt(format_args!("FlussArray[size={}]", self.size))
+    }
+}
+
+/// Two arrays are equal when their raw byte buffers are equal.
+impl PartialEq for FlussArray {
+    fn eq(&self, other: &Self) -> bool {
+        let (lhs, rhs) = (&self.data, &other.data);
+        lhs == rhs
+    }
+}
+
+impl Eq for FlussArray {}
+
+/// Partial ordering always delegates to the total order defined by `Ord`.
+impl PartialOrd for FlussArray {
+    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+        Some(Ord::cmp(self, other))
+    }
+}
+
+/// Arrays are ordered by comparing their raw byte buffers.
+impl Ord for FlussArray {
+    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+        Ord::cmp(&self.data, &other.data)
+    }
+}
+
+/// Hashes the raw byte buffer, consistent with the byte-wise `PartialEq` implementation.
+impl Hash for FlussArray {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        Hash::hash(&self.data, state);
+    }
+}
+
+/// Serializes the array as one opaque byte string holding its binary representation.
+impl Serialize for FlussArray {
+    fn serialize<S: serde::Serializer>(
+        &self,
+        serializer: S,
+    ) -> std::result::Result<S::Ok, S::Error> {
+        let raw: &[u8] = &self.data;
+        serializer.serialize_bytes(raw)
+    }
+}
+
+impl FlussArray {
+    /// Checks that `data` starts with a well-formed header and derives `(size, element_offset)`.
+    ///
+    /// Fails with `IllegalArgument` when the buffer is shorter than the 4-byte element count,
+    /// when the count is negative, or when the header implied by the count does not fit in
+    /// `data`. Only the header is checked here.
+    fn validate(data: &[u8]) -> Result<(usize, usize)> {
+        let Some(count_bytes) = data.get(..4) else {
+            return Err(IllegalArgument {
+                message: format!(
+                    "FlussArray data too short: need at least 4 bytes, got {}",
+                    data.len()
+                ),
+            });
+        };
+        let raw_size = i32::from_le_bytes([
+            count_bytes[0],
+            count_bytes[1],
+            count_bytes[2],
+            count_bytes[3],
+        ]);
+        if raw_size < 0 {
+            return Err(IllegalArgument {
+                message: format!("FlussArray size must be non-negative, got {raw_size}"),
+            });
+        }
+
+        let size = raw_size as usize;
+        let element_offset = calculate_header_in_bytes(size);
+        if element_offset <= data.len() {
+            Ok((size, element_offset))
+        } else {
+            Err(IllegalArgument {
+                message: format!(
+                    "FlussArray header exceeds payload: header={}, payload={}",
+                    element_offset,
+                    data.len()
+                ),
+            })
+        }
+    }
+
+    /// Builds a `FlussArray` by copying `data` into a new buffer.
+    ///
+    /// Returns an error if the header in `data` fails validation.
+    pub fn from_bytes(data: &[u8]) -> Result<Self> {
+        Self::validate(data).map(|(size, element_offset)| FlussArray {
+            data: Bytes::copy_from_slice(data),
+            size,
+            element_offset,
+        })
+    }
+
+    /// Builds a `FlussArray` that takes ownership of `data` without copying it.
+    ///
+    /// Returns an error if the header in `data` fails validation.
+    pub fn from_vec(data: Vec<u8>) -> Result<Self> {
+        Self::validate(&data).map(|(size, element_offset)| FlussArray {
+            data: Bytes::from(data),
+            size,
+            element_offset,
+        })
+    }
+
+    /// Builds a `FlussArray` around an existing `Bytes` handle without copying it.
+    ///
+    /// Returns an error if the header in `data` fails validation.
+    fn from_owned_bytes(data: Bytes) -> Result<Self> {
+        Self::validate(&data).map(|(size, element_offset)| FlussArray {
+            data,
+            size,
+            element_offset,
+        })
+    }
+
+    /// Returns the number of elements, as decoded from the array header at construction time.
+    pub fn size(&self) -> usize {
+        self.size
+    }
+
+    /// Borrows the complete binary representation of this array (header included).
+    pub fn as_bytes(&self) -> &[u8] {
+        &self.data[..]
+    }
+
+    /// Returns true if the null bit for the element at position `pos` is set.
+    ///
+    /// The bitmap starts right after the 4-byte element count, one bit per element with the
+    /// least-significant bit of each byte first. `pos` is not checked against `size()`;
+    /// indexing panics if the addressed bitmap byte lies beyond the buffer.
+    pub fn is_null_at(&self, pos: usize) -> bool {
+        let bitmap_byte = self.data[4 + pos / 8];
+        let mask = 1u8 << (pos % 8);
+        (bitmap_byte & mask) != 0
+    }
+
+    /// Returns the logically occupied bytes of this array, including the variable-length part.
+    /// This is used to detect trailing garbage in binary containers.
+    ///
+    /// For fixed-length element types this is the word-aligned size of header plus slots.
+    /// For variable-length element types every non-null, non-inline slot is inspected and the
+    /// furthest payload end (rounded up to an 8-byte word) is returned.
+    ///
+    /// Returns an error if a slot cannot be read or if a slot's offset plus length overflows.
+    pub fn extent(&self, element_type: &DataType) -> Result<usize> {
+        let header_size = calculate_header_in_bytes(self.size);
+        let element_size = calculate_fix_length_part_size(element_type);
+        let fixed_part_size = round_to_nearest_word(header_size + self.size * element_size);
+
+        if !is_variable_length_type(element_type) {
+            return Ok(fixed_part_size);
+        }
+
+        // Non-compact timestamps store an 8-byte millisecond value in the variable part and
+        // reuse the low 32 bits of the slot for nanos-of-millisecond (see the timestamp
+        // getters), so those bits must not be interpreted as a payload length.
+        let low_bits_are_nanos = matches!(
+            element_type,
+            DataType::Timestamp(_) | DataType::TimestampLTz(_)
+        );
+
+        let mut max_extent = fixed_part_size;
+        for i in 0..self.size {
+            if self.is_null_at(i) {
+                continue;
+            }
+            let packed = self.read_i64(i, "extent calculation")? as u64;
+            if packed & HIGHEST_FIRST_BIT != 0 {
+                // Inline payload lives inside the fixed slot and adds nothing to the extent.
+                continue;
+            }
+            let offset = (packed >> 32) as usize;
+            let len = if low_bits_are_nanos {
+                8
+            } else {
+                (packed & 0xFFFF_FFFF) as usize
+            };
+            let end = offset.checked_add(len).ok_or_else(|| IllegalArgument {
+                message: format!(
+                    "Overflow while computing array extent: offset={offset}, len={len}"
+                ),
+            })?;
+            max_extent = max_extent.max(end);
+        }
+
+        Ok(round_to_nearest_word(max_extent))
+    }
+
+    /// Borrows `len` bytes of the buffer starting at `start`.
+    ///
+    /// Fails with `IllegalArgument` (mentioning `context`) when `start + len` overflows or
+    /// reaches past the end of the buffer.
+    fn checked_slice(&self, start: usize, len: usize, context: &str) -> Result<&[u8]> {
+        match start.checked_add(len) {
+            Some(end) if end <= self.data.len() => Ok(&self.data[start..end]),
+            Some(_) => Err(IllegalArgument {
+                message: format!(
+                    "Out-of-bounds while reading {context}: start={start}, len={len}, payload={}",
+                    self.data.len()
+                ),
+            }),
+            None => Err(IllegalArgument {
+                message: format!("Overflow while reading {context}: start={start}, len={len}"),
+            }),
+        }
+    }
+
+    /// Computes the absolute byte offset of the fixed slot for element `pos`, assuming slots
+    /// of `element_size` bytes.
+    ///
+    /// Fails with `IllegalArgument` (mentioning `context`) when `pos` is not below `size`, or
+    /// when the offset arithmetic overflows. The result is not checked against the buffer
+    /// length.
+    fn checked_element_offset(
+        &self,
+        pos: usize,
+        element_size: usize,
+        context: &str,
+    ) -> Result<usize> {
+        if pos >= self.size {
+            return Err(IllegalArgument {
+                message: format!(
+                    "Array element index out of bounds while reading {context}: pos={pos}, size={}",
+                    self.size
+                ),
+            });
+        }
+
+        let Some(rel) = pos.checked_mul(element_size) else {
+            return Err(IllegalArgument {
+                message: format!(
+                    "Overflow while calculating array element offset for {context}: pos={pos}, element_size={element_size}"
+                ),
+            });
+        };
+
+        match self.element_offset.checked_add(rel) {
+            Some(absolute) => Ok(absolute),
+            None => Err(IllegalArgument {
+                message: format!(
+                    "Overflow while adding base offset for {context}: base={}, rel={rel}",
+                    self.element_offset
+                ),
+            }),
+        }
+    }
+
+    /// Borrows the `len`-byte fixed slot of element `pos`, with both the index and the
+    /// resulting byte range bounds-checked.
+    fn read_fixed_bytes(&self, pos: usize, len: usize, context: &str) -> Result<&[u8]> {
+        self.checked_element_offset(pos, len, context)
+            .and_then(|slot_start| self.checked_slice(slot_start, len, context))
+    }
+
+    /// Decodes the 2-byte slot of element `pos` as a little-endian `i16`.
+    fn read_i16(&self, pos: usize, context: &str) -> Result<i16> {
+        self.read_fixed_bytes(pos, 2, context)
+            .map(|slot| i16::from_le_bytes([slot[0], slot[1]]))
+    }
+
+    /// Decodes the 4-byte slot of element `pos` as a little-endian `i32`.
+    fn read_i32(&self, pos: usize, context: &str) -> Result<i32> {
+        self.read_fixed_bytes(pos, 4, context)
+            .map(|slot| i32::from_le_bytes([slot[0], slot[1], slot[2], slot[3]]))
+    }
+
+    /// Decodes the 8-byte slot of element `pos` as a little-endian `i64`.
+    fn read_i64(&self, pos: usize, context: &str) -> Result<i64> {
+        let slot = self.read_fixed_bytes(pos, 8, context)?;
+        Ok(i64::from_le_bytes([
+            slot[0], slot[1], slot[2], slot[3], slot[4], slot[5], slot[6], slot[7],
+        ]))
+    }
+
+    /// Decodes 8 bytes at the absolute byte `offset` as a little-endian `i64`.
+    fn read_i64_at_offset(&self, offset: usize, context: &str) -> Result<i64> {
+        let raw = self.checked_slice(offset, 8, context)?;
+        Ok(i64::from_le_bytes([
+            raw[0], raw[1], raw[2], raw[3], raw[4], raw[5], raw[6], raw[7],
+        ]))
+    }
+
+    /// Resolves the `(start, len)` byte span of the variable-length element at `pos`.
+    ///
+    /// The 8-byte slot is decoded as a little-endian `u64`:
+    /// - top bit clear: the high 32 bits are the payload offset and the low 32 bits its length;
+    /// - top bit set: the payload is stored inline in the slot and the remaining seven bits of
+    ///   the top byte hold its length (at most `MAX_FIX_PART_DATA_SIZE`).
+    ///
+    /// The returned span is guaranteed to lie inside the buffer. Fails with `IllegalArgument`
+    /// when `pos` is out of range, the span is out of bounds, or the inline length is too big.
+    fn read_var_len_span(&self, pos: usize) -> Result<(usize, usize)> {
+        const CONTEXT: &str = "variable-length array element";
+
+        let field_offset = self.checked_element_offset(pos, 8, CONTEXT)?;
+        let packed = self.read_i64_at_offset(field_offset, CONTEXT)? as u64;
+
+        if packed & HIGHEST_FIRST_BIT == 0 {
+            let offset = (packed >> 32) as usize;
+            let len = (packed & 0xFFFF_FFFF) as usize;
+            self.checked_slice(offset, len, CONTEXT)?;
+            return Ok((offset, len));
+        }
+
+        let len = ((packed & HIGHEST_SECOND_TO_EIGHTH_BIT) >> 56) as usize;
+        if len > MAX_FIX_PART_DATA_SIZE {
+            return Err(IllegalArgument {
+                message: format!(
+                    "Inline array element length must be <= {MAX_FIX_PART_DATA_SIZE}, got {len}"
+                ),
+            });
+        }
+        // Slots are always decoded with `from_le_bytes`, so in the buffer the marker/length
+        // byte is the LAST byte of the slot and the inline payload starts at the slot's first
+        // byte. This holds regardless of the host's native endianness.
+        self.checked_slice(field_offset, len, "inline array element")?;
+        Ok((field_offset, len))
+    }
+
+    /// Borrows the payload bytes of the variable-length element at `pos`.
+    fn read_var_len_bytes(&self, pos: usize) -> Result<&[u8]> {
+        self.read_var_len_span(pos)
+            .map(|(start, len)| &self.data[start..start + len])
+    }
+
+    /// Reads element `pos` from a 1-byte slot; any non-zero byte is `true`.
+    pub fn get_boolean(&self, pos: usize) -> Result<bool> {
+        self.read_fixed_bytes(pos, 1, "boolean array element")
+            .map(|slot| slot[0] != 0)
+    }
+
+    /// Reads element `pos` from a 1-byte slot, reinterpreted as a signed byte.
+    pub fn get_byte(&self, pos: usize) -> Result<i8> {
+        self.read_fixed_bytes(pos, 1, "byte array element")
+            .map(|slot| slot[0] as i8)
+    }
+
+    /// Reads element `pos` from a 2-byte slot as a little-endian `i16`.
+    pub fn get_short(&self, pos: usize) -> Result<i16> {
+        let context = "short array element";
+        self.read_i16(pos, context)
+    }
+
+    /// Reads element `pos` from a 4-byte slot as a little-endian `i32`.
+    pub fn get_int(&self, pos: usize) -> Result<i32> {
+        let context = "int array element";
+        self.read_i32(pos, context)
+    }
+
+    /// Reads element `pos` from an 8-byte slot as a little-endian `i64`.
+    pub fn get_long(&self, pos: usize) -> Result<i64> {
+        let context = "long array element";
+        self.read_i64(pos, context)
+    }
+
+    /// Reads element `pos` from a 4-byte slot and reinterprets the bits as an `f32`.
+    pub fn get_float(&self, pos: usize) -> Result<f32> {
+        self.read_i32(pos, "float array element")
+            .map(|raw| f32::from_bits(raw as u32))
+    }
+
+    /// Reads element `pos` from an 8-byte slot and reinterprets the bits as an `f64`.
+    pub fn get_double(&self, pos: usize) -> Result<f64> {
+        self.read_i64(pos, "double array element")
+            .map(|raw| f64::from_bits(raw as u64))
+    }
+
+    /// Splits the packed 8-byte slot of element `pos` into `(high 32 bits, low 32 bits)`,
+    /// i.e. the offset and the size of a variable-length element.
+    ///
+    /// The inline marker bit is not examined here; the two halves are returned unvalidated.
+    fn get_offset_and_size(&self, pos: usize) -> Result<(usize, usize)> {
+        self.get_long(pos).map(|raw| {
+            let packed = raw as u64;
+            let high = (packed >> 32) as usize;
+            let low = (packed & 0xFFFF_FFFF) as usize;
+            (high, low)
+        })
+    }
+
+    /// Reads element `pos` as a UTF-8 string borrowed from the array buffer.
+    ///
+    /// Fails with `IllegalArgument` if the span is invalid or the bytes are not valid UTF-8.
+    pub fn get_string(&self, pos: usize) -> Result<&str> {
+        let bytes = self.read_var_len_bytes(pos)?;
+        match std::str::from_utf8(bytes) {
+            Ok(text) => Ok(text),
+            Err(e) => Err(IllegalArgument {
+                message: format!("Invalid UTF-8 in array element at position {pos}: {e}"),
+            }),
+        }
+    }
+
+    /// Reads element `pos` as raw bytes borrowed from the array buffer.
+    pub fn get_binary(&self, pos: usize) -> Result<&[u8]> {
+        let payload = self.read_var_len_bytes(pos)?;
+        Ok(payload)
+    }
+
+    /// Reads element `pos` as a decimal with the given `precision` and `scale`.
+    ///
+    /// Compact precisions keep the unscaled value directly in the 8-byte slot; otherwise the
+    /// slot holds the offset and size of the unscaled bytes in the variable-length part.
+    pub fn get_decimal(&self, pos: usize, precision: u32, scale: u32) -> Result<Decimal> {
+        if !Decimal::is_compact_precision(precision) {
+            let (offset, size) = self.get_offset_and_size(pos)?;
+            let unscaled_bytes = self.checked_slice(offset, size, "decimal bytes")?;
+            return Decimal::from_unscaled_bytes(unscaled_bytes, precision, scale);
+        }
+
+        let unscaled = self.get_long(pos)?;
+        Decimal::from_unscaled_long(unscaled, precision, scale)
+    }
+
+    /// Reads element `pos` from a 4-byte slot and wraps the `i32` in a `Date`.
+    pub fn get_date(&self, pos: usize) -> Result<Date> {
+        self.get_int(pos).map(Date::new)
+    }
+
+    /// Reads element `pos` from a 4-byte slot and wraps the `i32` in a `Time`.
+    pub fn get_time(&self, pos: usize) -> Result<Time> {
+        self.get_int(pos).map(Time::new)
+    }
+
+    /// Reads element `pos` as a `TimestampNtz` of the given `precision`.
+    ///
+    /// Compact precisions keep the millisecond value directly in the slot. Otherwise the high
+    /// 32 bits of the slot are the offset of an 8-byte millisecond value and the low 32 bits
+    /// are the nanos-of-millisecond.
+    pub fn get_timestamp_ntz(&self, pos: usize, precision: u32) -> Result<TimestampNtz> {
+        if !TimestampNtz::is_compact(precision) {
+            let (offset, nanos_of_millis) = self.get_offset_and_size(pos)?;
+            let millis = self.read_i64_at_offset(offset, "timestamp ntz millis")?;
+            return TimestampNtz::from_millis_nanos(millis, nanos_of_millis as i32);
+        }
+        self.get_long(pos).map(TimestampNtz::new)
+    }
+
+    /// Reads element `pos` as a `TimestampLtz` of the given `precision`.
+    ///
+    /// Compact precisions keep the millisecond value directly in the slot. Otherwise the high
+    /// 32 bits of the slot are the offset of an 8-byte millisecond value and the low 32 bits
+    /// are the nanos-of-millisecond.
+    pub fn get_timestamp_ltz(&self, pos: usize, precision: u32) -> Result<TimestampLtz> {
+        if !TimestampLtz::is_compact(precision) {
+            let (offset, nanos_of_millis) = self.get_offset_and_size(pos)?;
+            let millis = self.read_i64_at_offset(offset, "timestamp ltz millis")?;
+            return TimestampLtz::from_millis_nanos(millis, nanos_of_millis as i32);
+        }
+        self.get_long(pos).map(TimestampLtz::new)
+    }
+
+    /// Reads element `pos` as a nested array built from a slice of this array's buffer.
+    ///
+    /// The nested bytes are validated like any other `FlussArray` payload.
+    pub fn get_array(&self, pos: usize) -> Result<FlussArray> {
+        let (start, len) = self.read_var_len_span(pos)?;
+        let nested = self.data.slice(start..start + len);
+        FlussArray::from_owned_bytes(nested)
+    }
+
+    /// Reads element `pos` as a nested map built from a slice of this array's buffer.
+    ///
+    /// `key_type` and `value_type` are passed through to `FlussMap::from_owned_bytes`.
+    pub fn get_map(
+        &self,
+        pos: usize,
+        key_type: &DataType,
+        value_type: &DataType,
+    ) -> Result<FlussMap> {
+        let (start, len) = self.read_var_len_span(pos)?;
+        let nested = self.data.slice(start..start + len);
+        FlussMap::from_owned_bytes(nested, key_type, value_type)
+    }
+
+    /// Reads element `pos` as a `CompactedRow` of `row_type`, borrowing from this array.
+    ///
+    /// Fails with `IllegalArgument` if the element's bytes are shorter than the bit-set
+    /// header that a row with this many fields requires.
+    pub fn get_row<'a>(&'a self, pos: usize, row_type: &'a RowType) -> Result<CompactedRow<'a>> {
+        let bytes = self.read_var_len_bytes(pos)?;
+        let field_count = row_type.fields().len();
+        let header_size = calculate_bit_set_width_in_bytes(field_count);
+
+        if bytes.len() >= header_size {
+            return Ok(CompactedRow::from_bytes(row_type, bytes));
+        }
+
+        Err(IllegalArgument {
+            message: format!(
+                "FlussArray row bytes at position {} are too short for row type with {} fields: \
+                 need at least {} header bytes, got {}",
+                pos,
+                row_type.fields().len(),
+                header_size,
+                bytes.len()
+            ),
+        })
+    }
+}
+
+/// Per-field helpers for one field of a ROW element type; built by `build_row_accessors`.
+struct RowFieldAccessor {
+    // Getter created for the field's data type and its index within the row type.
+    getter: FieldGetter,
+    // Value writer created for the field's data type with the compacted row format.
+    writer: ValueWriter,
+    // Whether the field's data type reports itself as nullable.
+    nullable: bool,
+}
+
+/// Builds one `RowFieldAccessor` per field of `row_type`, in field order.
+///
+/// Returns the first error reported by `ValueWriter::create_value_writer`, if any.
+fn build_row_accessors(row_type: &RowType) -> Result<Vec<RowFieldAccessor>> {
+    let fields = row_type.fields();
+    let mut accessors = Vec::with_capacity(fields.len());
+    for (index, field) in fields.iter().enumerate() {
+        let data_type = field.data_type();
+        let getter = FieldGetter::create(data_type, index);
+        let writer =
+            ValueWriter::create_value_writer(data_type, Some(&BinaryRowFormat::Compacted))?;
+        accessors.push(RowFieldAccessor {
+            getter,
+            writer,
+            nullable: data_type.is_nullable(),
+        });
+    }
+    Ok(accessors)
+}
+
+/// Writer for building a `FlussArray` element by element.
+/// Matches Java's `BinaryArrayWriter`.
+///
+/// The buffer is pre-sized for the header and all fixed-length slots; variable-length
+/// payloads are appended behind them as elements are written.
+pub struct FlussArrayWriter {
+    // Output buffer: header, fixed-length slots, then the growing variable-length part.
+    data: Vec<u8>,
+    // Byte offset of the null bitmap inside `data` (set to 4, right after the element count).
+    null_bits_offset: usize,
+    // Byte offset of the first fixed-length slot, i.e. the header size.
+    element_offset: usize,
+    // Width in bytes of one fixed-length slot.
+    element_size: usize,
+    // Byte offset at which the next variable-length payload will be written.
+    cursor: usize,
+    // Element count the writer was created for.
+    num_elements: usize,
+    // Some(_) only when constructed with a DataType::Row(_) element type.
+    row_accessors: Option<Vec<RowFieldAccessor>>,
+}
+
+impl FlussArrayWriter {
+    /// Creates a writer for an array of `num_elements` elements of type `element_type`.
+    ///
+    /// The slot width is derived from the element type. For ROW element types the per-field
+    /// accessors needed by `write_row` are prepared up front.
+    ///
+    /// # Panics
+    /// Panics if a field of a ROW element type has no value writer.
+    pub fn new(num_elements: usize, element_type: &DataType) -> Self {
+        let row_accessors = if let DataType::Row(row_type) = element_type {
+            let accessors = build_row_accessors(row_type)
+                .expect("ROW element type contains a field with no ValueWriter");
+            Some(accessors)
+        } else {
+            None
+        };
+        let element_size = calculate_fix_length_part_size(element_type);
+        Self::with_state(num_elements, element_size, row_accessors)
+    }
+
+    /// Creates a writer whose fixed-length slots are `element_size` bytes wide.
+    ///
+    /// No row accessors are set up, so `write_row` is not supported on such a writer.
+    pub fn with_element_size(num_elements: usize, element_size: usize) -> Self {
+        let no_row_accessors = None;
+        Self::with_state(num_elements, element_size, no_row_accessors)
+    }
+
+    /// Shared constructor: allocates the zeroed header and fixed-length part and records
+    /// the element count in the first 4 bytes.
+    fn with_state(
+        num_elements: usize,
+        element_size: usize,
+        row_accessors: Option<Vec<RowFieldAccessor>>,
+    ) -> Self {
+        let header_in_bytes = calculate_header_in_bytes(num_elements);
+        let slots_in_bytes = element_size * num_elements;
+        let fixed_size = round_to_nearest_word(header_in_bytes + slots_in_bytes);
+
+        // The element count is stored as a little-endian i32, which is how
+        // `FlussArray::validate` decodes it.
+        let count_le = (num_elements as i32).to_le_bytes();
+        let mut data = vec![0u8; fixed_size];
+        data[..4].copy_from_slice(&count_le);
+
+        Self {
+            data,
+            null_bits_offset: 4,
+            element_offset: header_in_bytes,
+            element_size,
+            // Variable-length payloads start right behind the fixed part.
+            cursor: fixed_size,
+            num_elements,
+            row_accessors,
+        }
+    }
+
+    /// Returns the byte offset of the fixed-length slot for element `pos` (no bounds check).
+    fn get_element_offset(&self, pos: usize) -> usize {
+        let relative = self.element_size * pos;
+        self.element_offset + relative
+    }
+
+    /// Marks the element at position `pos` as null by setting its bit in the null bitmap.
+    ///
+    /// The element's fixed-length slot is left untouched.
+    pub fn set_null_at(&mut self, pos: usize) {
+        let mask = 1u8 << (pos % 8);
+        let bitmap_index = self.null_bits_offset + pos / 8;
+        self.data[bitmap_index] |= mask;
+    }
+
+    /// Writes `value` into the first byte of slot `pos` as 1 (`true`) or 0 (`false`).
+    pub fn write_boolean(&mut self, pos: usize, value: bool) {
+        let slot_start = self.get_element_offset(pos);
+        self.data[slot_start] = u8::from(value);
+    }
+
+    /// Writes the signed byte `value` into the first byte of slot `pos`.
+    pub fn write_byte(&mut self, pos: usize, value: i8) {
+        let slot_start = self.get_element_offset(pos);
+        let [encoded] = value.to_le_bytes();
+        self.data[slot_start] = encoded;
+    }
+
+    /// Writes `value` into slot `pos` as 2 little-endian bytes.
+    pub fn write_short(&mut self, pos: usize, value: i16) {
+        let slot_start = self.get_element_offset(pos);
+        let encoded = value.to_le_bytes();
+        self.data[slot_start..slot_start + encoded.len()].copy_from_slice(&encoded);
+    }
+
+    /// Writes `value` into slot `pos` as 4 little-endian bytes.
+    pub fn write_int(&mut self, pos: usize, value: i32) {
+        let slot_start = self.get_element_offset(pos);
+        let encoded = value.to_le_bytes();
+        self.data[slot_start..slot_start + encoded.len()].copy_from_slice(&encoded);
+    }
+
+    /// Writes `value` into slot `pos` as 8 little-endian bytes.
+    pub fn write_long(&mut self, pos: usize, value: i64) {
+        let slot_start = self.get_element_offset(pos);
+        let encoded = value.to_le_bytes();
+        self.data[slot_start..slot_start + encoded.len()].copy_from_slice(&encoded);
+    }
+
+    /// Writes the bit pattern of `value` into slot `pos` as 4 little-endian bytes.
+    pub fn write_float(&mut self, pos: usize, value: f32) {
+        let slot_start = self.get_element_offset(pos);
+        let encoded = value.to_bits().to_le_bytes();
+        self.data[slot_start..slot_start + encoded.len()].copy_from_slice(&encoded);
+    }
+
+    /// Writes the bit pattern of `value` into slot `pos` as 8 little-endian bytes.
+    pub fn write_double(&mut self, pos: usize, value: f64) {
+        let slot_start = self.get_element_offset(pos);
+        let encoded = value.to_bits().to_le_bytes();
+        self.data[slot_start..slot_start + encoded.len()].copy_from_slice(&encoded);
+    }
+
+    /// Appends `bytes` to the variable-length part and records `(offset, length)` in the
+    /// fixed slot of element `pos`.
+    ///
+    /// The buffer grows by the payload length rounded up to an 8-byte word; the padding
+    /// bytes are zero.
+    fn write_bytes_to_var_len_part(&mut self, pos: usize, bytes: &[u8]) {
+        let payload_start = self.cursor;
+        let padded_len = round_to_nearest_word(bytes.len());
+
+        let grown_len = self.data.len() + padded_len;
+        self.data.resize(grown_len, 0);
+        self.data[payload_start..payload_start + bytes.len()].copy_from_slice(bytes);
+
+        self.set_offset_and_size(pos, payload_start, bytes.len());
+        self.cursor = payload_start + padded_len;
+    }
+
+    /// Packs `offset` into the high 32 bits and `size` into the low bits of one `i64` and
+    /// stores it in the fixed slot of element `pos`.
+    fn set_offset_and_size(&mut self, pos: usize, offset: usize, size: usize) {
+        let high = (offset as i64) << 32;
+        let low = size as i64;
+        self.write_long(pos, high | low);
+    }
+
+    /// Stores a short payload (at most `MAX_FIX_PART_DATA_SIZE` bytes) inline in the 8-byte
+    /// slot of element `pos`.
+    ///
+    /// Slot layout in the buffer: the payload occupies the leading bytes, unused bytes are
+    /// zero, and the last byte is `0x80 | len` (inline marker plus payload length). Because
+    /// the slot is stored via `write_long` (little-endian), this last byte becomes the most
+    /// significant byte of the packed value, which is where the readers look for the marker.
+    /// The layout is the same on every host, independent of native endianness.
+    fn write_bytes_to_fix_len_part(&mut self, pos: usize, bytes: &[u8]) {
+        let len = bytes.len();
+        debug_assert!(len <= MAX_FIX_PART_DATA_SIZE);
+
+        let mut slot = [0_u8; 8];
+        slot[..len].copy_from_slice(bytes);
+        slot[7] = 0x80 | (len as u8);
+
+        self.write_long(pos, i64::from_le_bytes(slot));
+    }
+
+    /// Writes the UTF-8 bytes of `value` as element `pos`.
+    ///
+    /// Strings of at most `MAX_FIX_PART_DATA_SIZE` bytes are stored inline in the slot;
+    /// longer ones go to the variable-length part.
+    pub fn write_string(&mut self, pos: usize, value: &str) {
+        let utf8 = value.as_bytes();
+        match utf8.len() {
+            n if n <= MAX_FIX_PART_DATA_SIZE => self.write_bytes_to_fix_len_part(pos, utf8),
+            _ => self.write_bytes_to_var_len_part(pos, utf8),
+        }
+    }
+
+    /// Writes raw bytes as element `pos`.
+    ///
+    /// Payloads of at most `MAX_FIX_PART_DATA_SIZE` bytes are stored inline in the slot;
+    /// longer ones go to the variable-length part.
+    pub fn write_binary_bytes(&mut self, pos: usize, value: &[u8]) {
+        let fits_inline = value.len() <= MAX_FIX_PART_DATA_SIZE;
+        if !fits_inline {
+            self.write_bytes_to_var_len_part(pos, value);
+            return;
+        }
+        self.write_bytes_to_fix_len_part(pos, value);
+    }
+
+    /// Writes a decimal as element `pos`.
+    ///
+    /// For compact precisions the unscaled value is stored directly in the 8-byte slot;
+    /// otherwise the unscaled bytes are appended to the variable-length part.
+    ///
+    /// # Panics
+    /// Panics if `precision` is compact but the unscaled value cannot be obtained as an `i64`.
+    pub fn write_decimal(&mut self, pos: usize, value: &Decimal, precision: u32) {
+        if !Decimal::is_compact_precision(precision) {
+            let unscaled_bytes = value.to_unscaled_bytes();
+            self.write_bytes_to_var_len_part(pos, &unscaled_bytes);
+            return;
+        }
+
+        let unscaled = value
+            .to_unscaled_long()
+            .expect("Decimal should fit in i64 for compact precision");
+        self.write_long(pos, unscaled);
+    }
+
+    /// Writes the inner `i32` of `value` into the 4-byte slot of element `pos`.
+    pub fn write_date(&mut self, pos: usize, value: Date) {
+        let inner = value.get_inner();
+        self.write_int(pos, inner);
+    }
+
+    /// Writes the inner `i32` of `value` into the 4-byte slot of element `pos`.
+    pub fn write_time(&mut self, pos: usize, value: Time) {
+        let inner = value.get_inner();
+        self.write_int(pos, inner);
+    }
+
+    /// Writes a `TimestampNtz` as element `pos`.
+    ///
+    /// Compact precisions store the millisecond value directly in the slot. Otherwise the
+    /// 8-byte millisecond value is appended to the variable-length part, and the slot packs
+    /// its offset (high 32 bits) together with the nanos-of-millisecond (low 32 bits).
+    pub fn write_timestamp_ntz(&mut self, pos: usize, value: &TimestampNtz, precision: u32) {
+        if TimestampNtz::is_compact(precision) {
+            self.write_long(pos, value.get_millisecond());
+            return;
+        }
+
+        let millis_le = value.get_millisecond().to_le_bytes();
+        let payload_start = self.cursor;
+        let padded_len = round_to_nearest_word(millis_le.len());
+
+        let grown_len = self.data.len() + padded_len;
+        self.data.resize(grown_len, 0);
+        self.data[payload_start..payload_start + millis_le.len()].copy_from_slice(&millis_le);
+
+        // The "size" half of the slot carries the nanos, not a byte length.
+        let nanos = value.get_nano_of_millisecond() as usize;
+        self.set_offset_and_size(pos, payload_start, nanos);
+        self.cursor = payload_start + padded_len;
+    }
+
+    /// Writes a timestamp-with-local-time-zone element at position `pos`.
+    ///
+    /// Compact precisions store only the epoch-millisecond value as a long. For
+    /// other precisions the epoch milliseconds are appended to the
+    /// variable-length part as 8 little-endian bytes, and the element slot
+    /// records that offset together with the nano-of-millisecond value (passed
+    /// in the "size" position of `set_offset_and_size`).
+    pub fn write_timestamp_ltz(&mut self, pos: usize, value: &TimestampLtz, precision: u32) {
+        if TimestampLtz::is_compact(precision) {
+            self.write_long(pos, value.get_epoch_millisecond());
+            return;
+        }
+
+        // Reserve a word-aligned region at the current cursor.
+        let start = self.cursor;
+        let reserved = round_to_nearest_word(8);
+        let grown_len = self.data.len() + reserved;
+        self.data.resize(grown_len, 0);
+
+        let encoded_millis = value.get_epoch_millisecond().to_le_bytes();
+        self.data[start..start + 8].copy_from_slice(&encoded_millis);
+
+        let nanos = value.get_nano_of_millisecond() as usize;
+        self.set_offset_and_size(pos, start, nanos);
+        self.cursor += reserved;
+    }
+
+    /// Stores a nested `FlussArray` as the element at `pos`.
+    ///
+    /// The nested array's raw bytes are copied into the variable-length part.
+    pub fn write_array(&mut self, pos: usize, value: &FlussArray) {
+        let payload = value.as_bytes();
+        self.write_bytes_to_var_len_part(pos, payload);
+    }
+
+    /// Stores a nested `FlussMap` as the element at `pos`.
+    ///
+    /// The map's raw bytes are copied into the variable-length part.
+    pub fn write_map(&mut self, pos: usize, value: &FlussMap) {
+        let payload = value.as_bytes();
+        self.write_bytes_to_var_len_part(pos, payload);
+    }
+
+    /// Serializes `row` as a nested compacted row and stores it at `pos`.
+    ///
+    /// The writer must have been constructed via [`new`](Self::new) with a
+    /// `DataType::Row(_)` element type, so that per-field accessors are
+    /// available.
+    ///
+    /// # Errors
+    ///
+    /// Returns `IllegalArgument` when no row accessors are configured, or when
+    /// a field declared non-nullable is null in `row`. Errors from reading or
+    /// writing an individual field are propagated unchanged.
+    pub fn write_row(&mut self, pos: usize, row: &dyn InternalRow) -> Result<()> {
+        let field_accessors = match self.row_accessors.as_ref() {
+            Some(list) => list,
+            None => {
+                return Err(IllegalArgument {
+                    message: "write_row requires a DataType::Row element type".to_string(),
+                });
+            }
+        };
+
+        let mut row_writer = CompactedRowWriter::new(field_accessors.len());
+        for (i, field) in field_accessors.iter().enumerate() {
+            // Null-ness is only probed for fields that forbid nulls.
+            let null_in_strict_field = !field.nullable && row.is_null_at(i)?;
+            if null_in_strict_field {
+                return Err(IllegalArgument {
+                    message: format!("nested row field {i} is non-nullable but received null"),
+                });
+            }
+            let datum = field.getter.get_field(row)?;
+            field.writer.write_value(&mut row_writer, i, &datum)?;
+        }
+
+        self.write_bytes_to_var_len_part(pos, row_writer.buffer());
+        Ok(())
+    }
+
+    /// Consumes the writer and builds the finished `FlussArray`.
+    ///
+    /// The backing buffer is cut down to the current cursor before being handed
+    /// to `FlussArray::from_vec`, whose result is returned as-is.
+    pub fn complete(self) -> Result<FlussArray> {
+        let Self {
+            data: mut buffer,
+            cursor,
+            ..
+        } = self;
+        buffer.truncate(cursor);
+        FlussArray::from_vec(buffer)
+    }
+
+    /// Returns the number of elements this writer was initialized with.
+    ///
+    /// This is the stored `num_elements` field; it does not reflect how many
+    /// positions have actually been written so far.
+    pub fn num_elements(&self) -> usize {
+        self.num_elements
+    }
+}
+
+/// Exposes a `FlussArray` through the `InternalRow` interface, treating each
+/// array element as one field.
+///
+/// Every accessor delegates to the `FlussArray` accessor for the same kind of
+/// value; the calls are written with explicit `FlussArray::` paths so it is
+/// obvious which type's method is being invoked.
+impl InternalRow for FlussArray {
+    /// The field count is the number of array elements.
+    fn get_field_count(&self) -> usize {
+        FlussArray::size(self)
+    }
+
+    fn is_null_at(&self, pos: usize) -> Result<bool> {
+        // The array-level check returns a plain bool, so wrap it in `Ok`.
+        let is_null = FlussArray::is_null_at(self, pos);
+        Ok(is_null)
+    }
+
+    fn get_boolean(&self, pos: usize) -> Result<bool> {
+        FlussArray::get_boolean(self, pos)
+    }
+
+    fn get_byte(&self, pos: usize) -> Result<i8> {
+        FlussArray::get_byte(self, pos)
+    }
+
+    fn get_short(&self, pos: usize) -> Result<i16> {
+        FlussArray::get_short(self, pos)
+    }
+
+    fn get_int(&self, pos: usize) -> Result<i32> {
+        FlussArray::get_int(self, pos)
+    }
+
+    fn get_long(&self, pos: usize) -> Result<i64> {
+        FlussArray::get_long(self, pos)
+    }
+
+    fn get_float(&self, pos: usize) -> Result<f32> {
+        FlussArray::get_float(self, pos)
+    }
+
+    fn get_double(&self, pos: usize) -> Result<f64> {
+        FlussArray::get_double(self, pos)
+    }
+
+    /// CHAR values are read exactly like strings; the declared length is not used.
+    fn get_char(&self, pos: usize, _length: usize) -> Result<&str> {
+        FlussArray::get_string(self, pos)
+    }
+
+    fn get_string(&self, pos: usize) -> Result<&str> {
+        FlussArray::get_string(self, pos)
+    }
+
+    /// Precision and scale are narrowed to `u32` before delegating.
+    fn get_decimal(&self, pos: usize, precision: usize, scale: usize) -> Result<Decimal> {
+        let precision = precision as u32;
+        let scale = scale as u32;
+        FlussArray::get_decimal(self, pos, precision, scale)
+    }
+
+    fn get_date(&self, pos: usize) -> Result<Date> {
+        FlussArray::get_date(self, pos)
+    }
+
+    fn get_time(&self, pos: usize) -> Result<Time> {
+        FlussArray::get_time(self, pos)
+    }
+
+    fn get_timestamp_ntz(&self, pos: usize, precision: u32) -> Result<TimestampNtz> {
+        FlussArray::get_timestamp_ntz(self, pos, precision)
+    }
+
+    fn get_timestamp_ltz(&self, pos: usize, precision: u32) -> Result<TimestampLtz> {
+        FlussArray::get_timestamp_ltz(self, pos, precision)
+    }
+
+    /// BINARY values are read like variable-length bytes; the declared length
+    /// is not used.
+    fn get_binary(&self, pos: usize, _length: usize) -> Result<&[u8]> {
+        FlussArray::get_binary(self, pos)
+    }
+
+    fn get_bytes(&self, pos: usize) -> Result<&[u8]> {
+        FlussArray::get_binary(self, pos)
+    }
+
+    fn get_array(&self, pos: usize) -> Result<FlussArray> {
+        FlussArray::get_array(self, pos)
+    }
+
+    /// Always fails: decoding a map needs key and value types, which this
+    /// trait method does not receive.
+    fn get_map(&self, pos: usize) -> Result<FlussMap> {
+        // FlussArray carries no schema; nested map reads must go through the
+        // inherent FlussArray::get_map(pos, key_type, value_type).
+        let message = format!(
+            "InternalRow::get_map is not supported on FlussArray (pos {pos}); \
+                 use FlussArray::get_map(pos, key_type, value_type) directly"
+        );
+        Err(IllegalArgument { message })
+    }
+}
+
+/// Unit tests for `FlussArray` / `FlussArrayWriter`: header and slot sizing,
+/// round trips for primitive, string, nested array and nested row elements,
+/// error reporting for bad reads, and the exact byte layout.
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::metadata::{DataField, DataTypes};
+    use crate::row::binary::BinaryWriter as BinaryWriterTrait;
+    use crate::row::compacted::CompactedRowWriter;
+    use crate::row::{Datum, GenericRow};
+
+    /// The header is 4 bytes for the element count plus 4 bytes of null bits
+    /// for every started group of 32 elements.
+    #[test]
+    fn test_header_calculation() {
+        assert_eq!(calculate_header_in_bytes(0), 4);
+        assert_eq!(calculate_header_in_bytes(1), 8);
+        assert_eq!(calculate_header_in_bytes(31), 8);
+        assert_eq!(calculate_header_in_bytes(32), 8);
+        assert_eq!(calculate_header_in_bytes(33), 12);
+        assert_eq!(calculate_header_in_bytes(64), 12);
+        assert_eq!(calculate_header_in_bytes(65), 16);
+    }
+
+    /// Primitive elements use their natural width; string and array elements
+    /// use an 8-byte slot.
+    #[test]
+    fn test_fix_length_part_size() {
+        assert_eq!(calculate_fix_length_part_size(&DataTypes::boolean()), 1);
+        assert_eq!(calculate_fix_length_part_size(&DataTypes::tinyint()), 1);
+        assert_eq!(calculate_fix_length_part_size(&DataTypes::smallint()), 2);
+        assert_eq!(calculate_fix_length_part_size(&DataTypes::int()), 4);
+        assert_eq!(calculate_fix_length_part_size(&DataTypes::bigint()), 8);
+        assert_eq!(calculate_fix_length_part_size(&DataTypes::float()), 4);
+        assert_eq!(calculate_fix_length_part_size(&DataTypes::double()), 8);
+        assert_eq!(calculate_fix_length_part_size(&DataTypes::string()), 8);
+        assert_eq!(
+            calculate_fix_length_part_size(&DataTypes::array(DataTypes::int())),
+            8
+        );
+    }
+
+    /// Ints written through the writer are read back unchanged.
+    #[test]
+    fn test_round_trip_int_array() {
+        let elem_type = DataTypes::int();
+        let mut writer = FlussArrayWriter::new(3, &elem_type);
+        writer.write_int(0, 10);
+        writer.write_int(1, 20);
+        writer.write_int(2, 30);
+        let array = writer.complete().unwrap();
+
+        assert_eq!(array.size(), 3);
+        assert!(!array.is_null_at(0));
+        assert_eq!(array.get_int(0).unwrap(), 10);
+        assert_eq!(array.get_int(1).unwrap(), 20);
+        assert_eq!(array.get_int(2).unwrap(), 30);
+    }
+
+    /// A null marker on one element does not disturb its neighbours.
+    #[test]
+    fn test_round_trip_with_nulls() {
+        let elem_type = DataTypes::int();
+        let mut writer = FlussArrayWriter::new(3, &elem_type);
+        writer.write_int(0, 1);
+        writer.set_null_at(1);
+        writer.write_int(2, 3);
+        let array = writer.complete().unwrap();
+
+        assert_eq!(array.size(), 3);
+        assert!(!array.is_null_at(0));
+        assert!(array.is_null_at(1));
+        assert!(!array.is_null_at(2));
+        assert_eq!(array.get_int(0).unwrap(), 1);
+        assert_eq!(array.get_int(2).unwrap(), 3);
+    }
+
+    /// Short strings written through the writer are read back unchanged.
+    #[test]
+    fn test_round_trip_string_array() {
+        let elem_type = DataTypes::string();
+        let mut writer = FlussArrayWriter::new(3, &elem_type);
+        writer.write_string(0, "hello");
+        writer.write_string(1, "world");
+        writer.write_string(2, "!");
+        let array = writer.complete().unwrap();
+
+        assert_eq!(array.size(), 3);
+        assert_eq!(array.get_string(0).unwrap(), "hello");
+        assert_eq!(array.get_string(1).unwrap(), "world");
+        assert_eq!(array.get_string(2).unwrap(), "!");
+    }
+
+    /// A hand-built buffer using the Java inline short-string encoding is
+    /// decoded correctly by the reader.
+    #[test]
+    fn test_java_inline_short_string_decoding() {
+        // Manually construct Java-style inline encoded short string ("abc")
+        // slot payload: [len|0x80 in top byte] + [bytes in low 7 bytes on little-endian]
+        let mut data = vec![0_u8; 16];
+        data[0..4].copy_from_slice(&(1_i32).to_le_bytes());
+        // null bits remain 0
+        let first_byte = (3_u64 | 0x80) << 56;
+        let seven_bytes = (b'a' as u64) | ((b'b' as u64) << 8) | ((b'c' as u64) << 16);
+        let packed = first_byte | seven_bytes;
+        data[8..16].copy_from_slice(&packed.to_le_bytes());
+
+        let arr = FlussArray::from_bytes(&data).unwrap();
+        assert_eq!(arr.size(), 1);
+        assert_eq!(arr.get_string(0).unwrap(), "abc");
+    }
+
+    /// A short binary value written by the writer is read back unchanged.
+    /// Unlike the string variant above, this goes through the writer rather
+    /// than a hand-built buffer.
+    #[test]
+    fn test_java_inline_short_binary_decoding() {
+        let elem_type = DataTypes::bytes();
+        let mut writer = FlussArrayWriter::new(1, &elem_type);
+        writer.write_binary_bytes(0, b"abc");
+        let arr = writer.complete().unwrap();
+        assert_eq!(arr.get_binary(0).unwrap(), b"abc");
+    }
+
+    /// A writer with zero elements still produces a valid, empty array.
+    #[test]
+    fn test_round_trip_empty_array() {
+        let elem_type = DataTypes::int();
+        let writer = FlussArrayWriter::new(0, &elem_type);
+        let array = writer.complete().unwrap();
+        assert_eq!(array.size(), 0);
+    }
+
+    /// Booleans written through the writer are read back unchanged.
+    #[test]
+    fn test_round_trip_boolean_array() {
+        let elem_type = DataTypes::boolean();
+        let mut writer = FlussArrayWriter::new(3, &elem_type);
+        writer.write_boolean(0, true);
+        writer.write_boolean(1, false);
+        writer.write_boolean(2, true);
+        let array = writer.complete().unwrap();
+
+        assert_eq!(array.size(), 3);
+        assert!(array.get_boolean(0).unwrap());
+        assert!(!array.get_boolean(1).unwrap());
+        assert!(array.get_boolean(2).unwrap());
+    }
+
+    /// The extreme `i64` values survive a round trip.
+    #[test]
+    fn test_round_trip_long_array() {
+        let elem_type = DataTypes::bigint();
+        let mut writer = FlussArrayWriter::new(2, &elem_type);
+        writer.write_long(0, i64::MAX);
+        writer.write_long(1, i64::MIN);
+        let array = writer.complete().unwrap();
+
+        assert_eq!(array.get_long(0).unwrap(), i64::MAX);
+        assert_eq!(array.get_long(1).unwrap(), i64::MIN);
+    }
+
+    /// Doubles survive a round trip bit-for-bit (exact equality is intended).
+    #[test]
+    fn test_round_trip_double_array() {
+        let elem_type = DataTypes::double();
+        let mut writer = FlussArrayWriter::new(2, &elem_type);
+        writer.write_double(0, 1.23);
+        writer.write_double(1, -4.56);
+        let array = writer.complete().unwrap();
+
+        assert_eq!(array.get_double(0).unwrap(), 1.23);
+        assert_eq!(array.get_double(1).unwrap(), -4.56);
+    }
+
+    /// Rows written with `write_row` can be read back with `get_row`,
+    /// including a null field inside a row.
+    #[test]
+    fn test_round_trip_array_of_row() {
+        let row_type_owned = DataTypes::row(vec![
+            DataField::new("x", DataTypes::int(), None),
+            DataField::new("label", DataTypes::string(), None),
+        ]);
+        let element_type = row_type_owned.clone();
+        let row_type = match &row_type_owned {
+            DataType::Row(rt) => rt,
+            _ => unreachable!(),
+        };
+
+        // Build array<row<int, string>> with two rows: (42, "hello"), (-1, null)
+        let mut writer = FlussArrayWriter::new(2, &element_type);
+
+        let mut r0 = GenericRow::new(2);
+        r0.set_field(0, 42_i32);
+        r0.set_field(1, "hello");
+        writer.write_row(0, &r0).expect("write row 0");
+
+        let mut r1 = GenericRow::new(2);
+        r1.set_field(0, -1_i32);
+        r1.set_field(1, Datum::Null);
+        writer.write_row(1, &r1).expect("write row 1");
+
+        let array = writer.complete().unwrap();
+        assert_eq!(array.size(), 2);
+
+        let row0 = array.get_row(0, row_type).expect("get row 0");
+        assert_eq!(row0.get_int(0).unwrap(), 42);
+        assert_eq!(row0.get_string(1).unwrap(), "hello");
+
+        let row1 = array.get_row(1, row_type).expect("get row 1");
+        assert_eq!(row1.get_int(0).unwrap(), -1);
+        assert!(row1.is_null_at(1).unwrap());
+    }
+
+    /// Reading a stored one-field row with a much wider row type is rejected,
+    /// while reading it with the matching row type still works.
+    #[test]
+    fn test_get_row_rejects_oversized_row_type() {
+        let small_row_type_owned =
+            DataTypes::row(vec![DataField::new("n", DataTypes::int(), None)]);
+        let small_row_type = match &small_row_type_owned {
+            DataType::Row(rt) => rt,
+            _ => unreachable!(),
+        };
+        let mut writer = FlussArrayWriter::new(1, &small_row_type_owned);
+        let mut row = GenericRow::new(1);
+        row.set_field(0, 7_i32);
+        writer.write_row(0, &row).unwrap();
+        let array = writer.complete().unwrap();
+
+        // A 100-field row type cannot possibly fit in the stored payload.
+        let huge_owned = DataTypes::row(
+            (0..100)
+                .map(|i| DataField::new(format!("f{i}"), DataTypes::int(), None))
+                .collect(),
+        );
+        let huge_row_type = match &huge_owned {
+            DataType::Row(rt) => rt,
+            _ => unreachable!(),
+        };
+        match array.get_row(0, huge_row_type) {
+            Err(e) => assert!(
+                e.to_string().contains("too short for row type"),
+                "unexpected error: {e}"
+            ),
+            Ok(_) => panic!("expected oversized row_type to be rejected"),
+        }
+
+        let recovered = array.get_row(0, small_row_type).unwrap();
+        assert_eq!(recovered.get_int(0).unwrap(), 7);
+    }
+
+    /// A null array element between two rows is preserved, and a null value in
+    /// a non-nullable nested field is rejected by `write_row`.
+    #[test]
+    fn test_round_trip_array_of_row_with_nullable_element() {
+        let row_type_owned = DataTypes::row(vec![DataField::new("n", DataTypes::int(), None)]);
+        let element_type = row_type_owned.clone();
+        let row_type = match &row_type_owned {
+            DataType::Row(rt) => rt,
+            _ => unreachable!(),
+        };
+
+        let mut writer = FlussArrayWriter::new(3, &element_type);
+
+        let mut r0 = GenericRow::new(1);
+        r0.set_field(0, 7_i32);
+        writer.write_row(0, &r0).expect("write row 0");
+
+        writer.set_null_at(1);
+
+        let mut r2 = GenericRow::new(1);
+        r2.set_field(0, 8_i32);
+        writer.write_row(2, &r2).expect("write row 2");
+
+        let array = writer.complete().unwrap();
+
+        let row0 = array.get_row(0, row_type).unwrap();
+        assert_eq!(row0.get_int(0).unwrap(), 7);
+        assert!(array.is_null_at(1));
+        let row2 = array.get_row(2, row_type).unwrap();
+        assert_eq!(row2.get_int(0).unwrap(), 8);
+
+        // Same shape, but the field is declared non-nullable.
+        let strict_row_type_owned = DataTypes::row(vec![DataField::new(
+            "n",
+            DataTypes::int().as_non_nullable(),
+            None,
+        )]);
+        let mut bad_writer = FlussArrayWriter::new(1, &strict_row_type_owned);
+        let mut bad = GenericRow::new(1);
+        bad.set_field(0, Datum::Null);
+        let err = bad_writer.write_row(0, &bad).unwrap_err();
+        assert!(
+            err.to_string().contains("non-nullable"),
+            "unexpected error: {err}"
+        );
+    }
+
+    /// Three levels of nesting: array<row<array<string>>>.
+    #[test]
+    fn test_round_trip_array_of_row_of_array_of_string() {
+        let inner_array_type = DataTypes::array(DataTypes::string());
+        let inner_row_type_owned =
+            DataTypes::row(vec![DataField::new("tags", inner_array_type.clone(), None)]);
+        let inner_row_type = match &inner_row_type_owned {
+            DataType::Row(rt) => rt,
+            _ => unreachable!(),
+        };
+
+        let mut tags1 = FlussArrayWriter::new(2, &DataTypes::string());
+        tags1.write_string(0, "alpha");
+        tags1.write_string(1, "beta");
+        let tags1 = tags1.complete().unwrap();
+        let mut row1 = GenericRow::new(1);
+        row1.set_field(0, tags1);
+
+        let mut tags2 = FlussArrayWriter::new(3, &DataTypes::string());
+        tags2.write_string(0, "x");
+        tags2.set_null_at(1);
+        tags2.write_string(2, "z");
+        let tags2 = tags2.complete().unwrap();
+        let mut row2 = GenericRow::new(1);
+        row2.set_field(0, tags2);
+
+        let mut outer_writer = FlussArrayWriter::new(2, &inner_row_type_owned);
+        outer_writer.write_row(0, &row1).unwrap();
+        outer_writer.write_row(1, &row2).unwrap();
+        let outer = outer_writer.complete().unwrap();
+
+        assert_eq!(outer.size(), 2);
+
+        let r0 = outer.get_row(0, inner_row_type).unwrap();
+        let r0_tags = r0.get_array(0).unwrap();
+        assert_eq!(r0_tags.size(), 2);
+        assert_eq!(r0_tags.get_string(0).unwrap(), "alpha");
+        assert_eq!(r0_tags.get_string(1).unwrap(), "beta");
+
+        let r1 = outer.get_row(1, inner_row_type).unwrap();
+        let r1_tags = r1.get_array(0).unwrap();
+        assert_eq!(r1_tags.size(), 3);
+        assert_eq!(r1_tags.get_string(0).unwrap(), "x");
+        assert!(r1_tags.is_null_at(1));
+        assert_eq!(r1_tags.get_string(2).unwrap(), "z");
+    }
+
+    /// An array of rows embedded in a compacted outer row can be recovered
+    /// field by field.
+    #[test]
+    fn test_round_trip_row_of_array_of_row() {
+        let inner_row_type_owned =
+            DataTypes::row(vec![DataField::new("n", DataTypes::int(), None)]);
+        let inner_array_type = DataTypes::array(inner_row_type_owned.clone());
+        let outer_row_type_owned =
+            DataTypes::row(vec![DataField::new("arr", inner_array_type.clone(), None)]);
+
+        let outer_row_type = match &outer_row_type_owned {
+            DataType::Row(rt) => rt,
+            _ => unreachable!(),
+        };
+        let inner_row_type = match &inner_row_type_owned {
+            DataType::Row(rt) => rt,
+            _ => unreachable!(),
+        };
+
+        let mut arr_writer = FlussArrayWriter::new(2, &inner_row_type_owned);
+        let mut r0 = GenericRow::new(1);
+        r0.set_field(0, 1_i32);
+        arr_writer.write_row(0, &r0).unwrap();
+        let mut r1 = GenericRow::new(1);
+        r1.set_field(0, 2_i32);
+        arr_writer.write_row(1, &r1).unwrap();
+        let inner_arr = arr_writer.complete().unwrap();
+
+        // The outer row is written directly with the compacted row writer.
+        let mut writer = CompactedRowWriter::new(1);
+        writer.write_array(&inner_arr);
+        let bytes = writer.to_bytes();
+
+        let outer_compacted = CompactedRow::from_bytes(outer_row_type, &bytes);
+        let recovered_arr = outer_compacted.get_array(0).unwrap();
+        assert_eq!(recovered_arr.size(), 2);
+
+        let recovered_r0 = recovered_arr.get_row(0, inner_row_type).unwrap();
+        assert_eq!(recovered_r0.get_int(0).unwrap(), 1);
+        let recovered_r1 = recovered_arr.get_row(1, inner_row_type).unwrap();
+        assert_eq!(recovered_r1.get_int(0).unwrap(), 2);
+    }
+
+    /// An int array nested inside an outer array is read back unchanged.
+    #[test]
+    fn test_round_trip_nested_array() {
+        let inner_type = DataTypes::int();
+        let outer_type = DataTypes::array(DataTypes::int());
+
+        let mut inner_writer = FlussArrayWriter::new(2, &inner_type);
+        inner_writer.write_int(0, 1);
+        inner_writer.write_int(1, 2);
+        let inner_array = inner_writer.complete().unwrap();
+
+        let mut outer_writer = FlussArrayWriter::new(1, &outer_type);
+        outer_writer.write_array(0, &inner_array);
+        let outer_array = outer_writer.complete().unwrap();
+
+        assert_eq!(outer_array.size(), 1);
+        let nested = outer_array.get_array(0).unwrap();
+        assert_eq!(nested.size(), 2);
+        assert_eq!(nested.get_int(0).unwrap(), 1);
+        assert_eq!(nested.get_int(1).unwrap(), 2);
+    }
+
+    /// Reading past the last element yields an error instead of panicking.
+    #[test]
+    fn test_primitive_getter_out_of_bounds_returns_error() {
+        let elem_type = DataTypes::int();
+        let mut writer = FlussArrayWriter::new(1, &elem_type);
+        writer.write_int(0, 10);
+        let array = writer.complete().unwrap();
+
+        let err = array.get_int(1).unwrap_err();
+        assert!(
+            err.to_string().contains("out of bounds"),
+            "unexpected error: {err}"
+        );
+    }
+
+    /// A buffer whose declared element count exceeds its payload yields an
+    /// error on read instead of panicking.
+    #[test]
+    fn test_primitive_getter_on_malformed_payload_returns_error() {
+        // Size says 1, but payload only contains header (no element bytes).
+        let mut data = vec![0_u8; 8];
+        data[0..4].copy_from_slice(&(1_i32).to_le_bytes());
+        let arr = FlussArray::from_bytes(&data).unwrap();
+
+        let err = arr.get_int(0).unwrap_err();
+        assert!(
+            err.to_string().contains("Out-of-bounds"),
+            "unexpected error: {err}"
+        );
+    }
+
+    /// Pins the exact serialized bytes of a small int array.
+    #[test]
+    fn test_binary_layout_matches_java() {
+        // Verify exact byte layout for a simple [1, 2, 3] int array
+        let elem_type = DataTypes::int();
+        let mut writer = FlussArrayWriter::new(3, &elem_type);
+        writer.write_int(0, 1);
+        writer.write_int(1, 2);
+        writer.write_int(2, 3);
+        let array = writer.complete().unwrap();
+        let bytes = array.as_bytes();
+
+        // size = 3 at offset 0 (4 bytes, little-endian per Java MemorySegment.putInt)
+        assert_eq!(i32::from_le_bytes(bytes[0..4].try_into().unwrap()), 3);
+        // null bits: 4 bytes starting at offset 4, should be all zeros
+        assert_eq!(&bytes[4..8], &[0, 0, 0, 0]);
+        // elements start at offset 8 (header = 4 + 4), each 4 bytes (little-endian)
+        assert_eq!(i32::from_le_bytes(bytes[8..12].try_into().unwrap()), 1);
+        assert_eq!(i32::from_le_bytes(bytes[12..16].try_into().unwrap()), 2);
+        assert_eq!(i32::from_le_bytes(bytes[16..20].try_into().unwrap()), 3);
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/row/binary_map.rs b/fluss-rust/crates/fluss/src/row/binary_map.rs
new file mode 100644
index 0000000000..02425129d3
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/row/binary_map.rs
@@ -0,0 +1,703 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Binary map format matching Java's `BinaryMap.java` layout.
+//!
+//! Binary layout:
+//! ```text
+//! [4 bytes: keyArraySizeInBytes] + [Key BinaryArray bytes] + [Value BinaryArray bytes]
+//! ```
+
+use crate::error::Error::IllegalArgument;
+use crate::error::Result;
+use crate::metadata::DataType;
+use crate::row::binary_array::{FlussArray, FlussArrayWriter};
+use crate::row::datum::{Datum, read_datum_from_fluss_array};
+use bytes::Bytes;
+use serde::Serialize;
+use std::fmt;
+use std::hash::{Hash, Hasher};
+
+/// A Fluss binary map, wire-compatible with Java's `BinaryMap`.
+///
+/// Stores entries as two parallel binary arrays (keys and values) within a single
+/// byte buffer.
+///
+/// Equality, ordering, hashing and serialization are all defined on the raw
+/// byte buffer only; the key/value types do not take part.
+#[derive(Clone)]
+pub struct FlussMap {
+    // Complete binary representation of the map, as returned by `as_bytes`.
+    data: Bytes,
+    // Array of keys; the key of entry `i` is element `i`.
+    key_array: FlussArray,
+    // Array of values; the value of entry `i` is element `i`.
+    value_array: FlussArray,
+    // Element type used to decode `key_array`.
+    key_type: DataType,
+    // Element type used to decode `value_array`.
+    value_type: DataType,
+}
+
+/// Debug output shows only the entry count and the buffer length, not the
+/// entries themselves.
+impl fmt::Debug for FlussMap {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let entry_count = self.size();
+        let byte_len = self.data.len();
+
+        let mut builder = f.debug_struct("FlussMap");
+        builder.field("size", &entry_count);
+        builder.field("data_len", &byte_len);
+        builder.finish()
+    }
+}
+
+/// Displays the map as `FlussMap[size=N]`, where `N` is the entry count.
+impl fmt::Display for FlussMap {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let entry_count = self.size();
+        f.write_fmt(format_args!("FlussMap[size={}]", entry_count))
+    }
+}
+
+/// Two maps are equal when their raw byte buffers are equal.
+impl PartialEq for FlussMap {
+    fn eq(&self, other: &Self) -> bool {
+        let (lhs, rhs) = (&self.data, &other.data);
+        PartialEq::eq(lhs, rhs)
+    }
+}
+
+// Equality is plain byte-buffer equality (see `PartialEq`), so it is total.
+impl Eq for FlussMap {}
+
+/// Partial ordering always agrees with the total ordering from `Ord`.
+impl PartialOrd for FlussMap {
+    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+        let ordering = Ord::cmp(self, other);
+        Some(ordering)
+    }
+}
+
+/// Maps are ordered by comparing their raw byte buffers.
+impl Ord for FlussMap {
+    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+        let (lhs, rhs) = (&self.data, &other.data);
+        Ord::cmp(lhs, rhs)
+    }
+}
+
+/// Hashes the raw byte buffer only, which keeps `Hash` consistent with `Eq`.
+impl Hash for FlussMap {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        Hash::hash(&self.data, state);
+    }
+}
+
+/// Serializes the map as a single byte string containing its raw binary form.
+impl Serialize for FlussMap {
+    fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        let raw = self.as_bytes();
+        serializer.serialize_bytes(raw)
+    }
+}
+
+/// Ensures that no element of `key_array` is null.
+///
+/// # Errors
+///
+/// Returns `IllegalArgument` if any key position is marked null.
+fn check_no_null_keys(key_array: &FlussArray) -> Result<()> {
+    let has_null_key = (0..key_array.size()).any(|idx| key_array.is_null_at(idx));
+    if has_null_key {
+        return Err(IllegalArgument {
+            message: "FlussMap keys cannot be null".to_string(),
+        });
+    }
+    Ok(())
+}
+
+impl FlussMap {
+    /// Validates the raw bytes and extracts the sub-arrays.
+    ///
+    /// Expected layout: a 4-byte little-endian key-array length, followed by
+    /// the key array bytes, followed by the value array bytes.
+    ///
+    /// # Errors
+    ///
+    /// Returns `IllegalArgument` when:
+    /// - `data` is shorter than the 4-byte length prefix,
+    /// - the length prefix is negative or larger than the remaining payload,
+    /// - either sub-array fails `FlussArray::from_bytes`,
+    /// - the key and value arrays have different element counts,
+    /// - the extents of both arrays plus the prefix do not add up to
+    ///   `data.len()` exactly, or
+    /// - any key is null.
+    fn validate(
+        data: &[u8],
+        key_type: &DataType,
+        value_type: &DataType,
+    ) -> Result<(FlussArray, FlussArray)> {
+        if data.len() < 4 {
+            return Err(IllegalArgument {
+                message: format!(
+                    "FlussMap data too short: need at least 4 bytes, got {}",
+                    data.len()
+                ),
+            });
+        }
+        let raw_key_size = i32::from_le_bytes(data[0..4].try_into().unwrap());
+        if raw_key_size < 0 {
+            return Err(IllegalArgument {
+                message: format!(
+                    "FlussMap key array size must be non-negative, got {}",
+                    raw_key_size
+                ),
+            });
+        }
+        let key_size = raw_key_size as usize;
+        if 4 + key_size > data.len() {
+            return Err(IllegalArgument {
+                message: format!(
+                    "FlussMap key array size {} exceeds remaining payload {}",
+                    key_size,
+                    data.len() - 4
+                ),
+            });
+        }
+
+        // Everything after the key array belongs to the value array.
+        let key_bytes = &data[4..4 + key_size];
+        let value_bytes = &data[4 + key_size..];
+
+        let key_array = FlussArray::from_bytes(key_bytes).map_err(|e| IllegalArgument {
+            message: format!("Invalid key array in FlussMap: {}", e),
+        })?;
+
+        let value_array = FlussArray::from_bytes(value_bytes).map_err(|e| IllegalArgument {
+            message: format!("Invalid value array in FlussMap: {}", e),
+        })?;
+
+        if key_array.size() != value_array.size() {
+            return Err(IllegalArgument {
+                message: format!(
+                    "FlussMap key array size ({}) does not match value array size ({})",
+                    key_array.size(),
+                    value_array.size()
+                ),
+            });
+        }
+
+        // Strict trailing byte check: ensure the total reach of key and value arrays
+        // plus the 4-byte header matches the provided data length exactly.
+        let key_extent = key_array.extent(key_type)?;
+        let value_extent = value_array.extent(value_type)?;
+        let expected_len = 4 + key_extent + value_extent;
+        if expected_len != data.len() {
+            return Err(IllegalArgument {
+                message: format!(
+                    "FlussMap binary validation failed: expected {expected_len} bytes, got {}",
+                    data.len()
+                ),
+            });
+        }
+
+        check_no_null_keys(&key_array)?;
+
+        Ok((key_array, value_array))
+    }
+
+    /// Creates a FlussMap from a byte slice (copies data).
+    ///
+    /// # Errors
+    ///
+    /// Returns whatever error `validate` reports for `data`.
+    pub(crate) fn from_bytes(
+        data: &[u8],
+        key_type: &DataType,
+        value_type: &DataType,
+    ) -> Result<Self> {
+        let (key_array, value_array) = Self::validate(data, key_type, value_type)?;
+        Ok(FlussMap {
+            data: Bytes::copy_from_slice(data),
+            key_array,
+            value_array,
+            key_type: key_type.clone(),
+            value_type: value_type.clone(),
+        })
+    }
+
+    /// Creates a FlussMap from owned bytes without copying.
+    ///
+    /// # Errors
+    ///
+    /// Returns whatever error `validate` reports for `data`.
+    pub(crate) fn from_owned_bytes(
+        data: Bytes,
+        key_type: &DataType,
+        value_type: &DataType,
+    ) -> Result<Self> {
+        let (key_array, value_array) = Self::validate(&data, key_type, value_type)?;
+        Ok(FlussMap {
+            data,
+            key_array,
+            value_array,
+            key_type: key_type.clone(),
+            value_type: value_type.clone(),
+        })
+    }
+
+    /// Creates a FlussMap by combining a key array and a value array.
+    ///
+    /// Copies both arrays into a new contiguous buffer.
+    ///
+    /// # Errors
+    ///
+    /// Returns `IllegalArgument` when the two arrays have different element
+    /// counts, when any key is null, or when the key array is too large for
+    /// its byte length to fit in the 4-byte `i32` length prefix.
+    pub fn from_arrays(
+        key_array: &FlussArray,
+        value_array: &FlussArray,
+        key_type: &DataType,
+        value_type: &DataType,
+    ) -> Result<Self> {
+        if key_array.size() != value_array.size() {
+            return Err(IllegalArgument {
+                message: format!(
+                    "FlussMap key array size ({}) does not match value array size ({})",
+                    key_array.size(),
+                    value_array.size()
+                ),
+            });
+        }
+        check_no_null_keys(key_array)?;
+
+        let key_bytes = key_array.as_bytes();
+        let value_bytes = value_array.as_bytes();
+
+        // The length prefix is a signed 32-bit integer; a plain `as i32` cast
+        // would silently wrap for oversized key arrays and corrupt the header.
+        let key_len_prefix = i32::try_from(key_bytes.len()).map_err(|_| IllegalArgument {
+            message: format!(
+                "FlussMap key array is too large: {} bytes does not fit in the i32 length prefix",
+                key_bytes.len()
+            ),
+        })?;
+
+        let mut data = Vec::with_capacity(4 + key_bytes.len() + value_bytes.len());
+        // Write the key array size (4 bytes)
+        // Java's BinaryMap uses memory segment methods which write in LE
+        data.extend_from_slice(&key_len_prefix.to_le_bytes());
+        // Write key array bytes
+        data.extend_from_slice(key_bytes);
+        // Write value array bytes
+        data.extend_from_slice(value_bytes);
+
+        let data = Bytes::from(data);
+        Ok(FlussMap {
+            data,
+            key_array: key_array.clone(),
+            value_array: value_array.clone(),
+            key_type: key_type.clone(),
+            value_type: value_type.clone(),
+        })
+    }
+
+    /// Returns the number of entries in the map.
+    pub fn size(&self) -> usize {
+        self.key_array.size()
+    }
+
+    /// Returns the raw bytes of this map (the complete binary representation).
+    pub fn as_bytes(&self) -> &[u8] {
+        &self.data
+    }
+
+    /// Returns the key array.
+    pub fn key_array(&self) -> &FlussArray {
+        &self.key_array
+    }
+
+    /// Returns the value array.
+    pub fn value_array(&self) -> &FlussArray {
+        &self.value_array
+    }
+
+    /// Returns the data type of the map's keys.
+    pub fn key_type(&self) -> &DataType {
+        &self.key_type
+    }
+
+    /// Returns the data type of the map's values.
+    pub fn value_type(&self) -> &DataType {
+        &self.value_type
+    }
+
+    /// Returns an iterator over the `(key, value)` entries in stored order.
+    ///
+    /// Each item is a `Result`, because decoding an individual key or value
+    /// can fail.
+    pub fn entries(&self) -> Entries<'_> {
+        Entries {
+            map: self,
+            index: 0,
+        }
+    }
+
+    /// Looks up the value stored for `key`.
+    ///
+    /// O(n) linear scan; the binary format carries no key index. Returns the
+    /// value of the first entry whose key equals `key`, or `Ok(None)` when no
+    /// entry matches.
+    ///
+    /// # Errors
+    ///
+    /// Propagates the first decoding error hit while scanning.
+    pub fn get<'a>(&'a self, key: &Datum<'_>) -> Result<Option<Datum<'a>>> {
+        for entry in self.entries() {
+            let (k, v) = entry?;
+            if &k == key {
+                return Ok(Some(v));
+            }
+        }
+        Ok(None)
+    }
+}
+
+/// Iterator over the `(key, value)` pairs of a [`FlussMap`].
+///
+/// Created by [`FlussMap::entries`].
+pub struct Entries<'a> {
+    // Map whose key and value arrays are being read.
+    map: &'a FlussMap,
+    // Index of the next entry to decode.
+    index: usize,
+}
+
+impl<'a> Iterator for Entries<'a> {
+    type Item = Result<(Datum<'a>, Datum<'a>)>;
+
+    /// Decodes the next `(key, value)` pair, or returns `None` once every
+    /// entry has been visited.
+    ///
+    /// The cursor advances even when decoding fails, so an error does not
+    /// stall the iteration.
+    fn next(&mut self) -> Option<Self::Item> {
+        let map = self.map;
+        let pos = self.index;
+        if pos >= map.size() {
+            return None;
+        }
+        self.index = pos + 1;
+
+        // Both sides are decoded before being combined; when both fail, the
+        // key's error is the one reported.
+        let key = read_datum_from_fluss_array(&map.key_array, pos, &map.key_type);
+        let value = read_datum_from_fluss_array(&map.value_array, pos, &map.value_type);
+        Some(match (key, value) {
+            (Ok(k), Ok(v)) => Ok((k, v)),
+            (Err(key_err), _) => Err(key_err),
+            (_, Err(value_err)) => Err(value_err),
+        })
+    }
+
+    /// Reports the exact number of entries not yet visited.
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        let left = self.map.size() - self.index;
+        (left, Some(left))
+    }
+}
+
+// `Entries::size_hint` returns the same exact count for both bounds, which
+// is what `ExactSizeIterator` relies on for `len()`.
+impl ExactSizeIterator for Entries<'_> {}
+
+/// Writer for building a `FlussMap` entry by entry.
+///
+/// Keys and values are written into two separate array writers at the same
+/// position; `complete` turns both into arrays and combines them into a map.
+pub struct FlussMapWriter {
+    // Array writer receiving the keys.
+    key_writer: FlussArrayWriter,
+    // Array writer receiving the values.
+    value_writer: FlussArrayWriter,
+    // Declared type that every key datum is checked against.
+    key_type: DataType,
+    // Declared type that every non-null value datum is checked against.
+    value_type: DataType,
+    // Position at which the next entry will be written.
+    current_index: usize,
+}
+
+impl FlussMapWriter {
+    /// Creates a new writer for a map with the given capacity and key/value types.
+    ///
+    /// `capacity` is forwarded unchanged to both underlying array writers.
+    pub fn new(capacity: usize, key_type: &DataType, value_type: &DataType) -> Self {
+        let key_writer = FlussArrayWriter::new(capacity, key_type);
+        let value_writer = FlussArrayWriter::new(capacity, value_type);
+        Self {
+            key_writer,
+            value_writer,
+            key_type: key_type.clone(),
+            value_type: value_type.clone(),
+            current_index: 0,
+        }
+    }
+
+    /// Writes every `(key, value)` pair produced by `entries`, in order.
+    ///
+    /// # Errors
+    /// Stops at, and returns, the first error reported by `write_entry`;
+    /// pairs written before the failing one stay written.
+    pub fn extend<'a, I, K, V>(&mut self, entries: I) -> Result<()>
+    where
+        I: IntoIterator<Item = (K, V)>,
+        K: Into<Datum<'a>>,
+        V: Into<Datum<'a>>,
+    {
+        entries
+            .into_iter()
+            .try_for_each(|(k, v)| self.write_entry(k.into(), v.into()))
+    }
+
+    /// Writes a key-value entry into the map.
+    ///
+    /// The key is written first, then the value; the write position only
+    /// advances once both have succeeded.
+    ///
+    /// # Errors
+    /// Returns an error if the key is null or if there's a type mismatch.
+    pub fn write_entry(&mut self, key: Datum, value: Datum) -> Result<()> {
+        if key.is_null() {
+            return Err(IllegalArgument {
+                message: String::from("FlussMap keys cannot be null"),
+            });
+        }
+
+        let pos = self.current_index;
+        Self::write_datum(&mut self.key_writer, pos, key, &self.key_type)?;
+        Self::write_datum(&mut self.value_writer, pos, value, &self.value_type)?;
+        self.current_index = pos + 1;
+        Ok(())
+    }
+
+    /// Finalizes the writer and returns the completed `FlussMap`.
+    ///
+    /// # Errors
+    /// Propagates failures from completing either array writer and from
+    /// `FlussMap::from_arrays`.
+    pub fn complete(self) -> Result<FlussMap> {
+        let Self {
+            key_writer,
+            value_writer,
+            key_type,
+            value_type,
+            ..
+        } = self;
+        let keys = key_writer.complete()?;
+        let values = value_writer.complete()?;
+        FlussMap::from_arrays(&keys, &values, &key_type, &value_type)
+    }
+
+    /// Writes `datum` at `pos` of `writer`, choosing the write method from
+    /// the declared type `dt`.
+    ///
+    /// A null datum is recorded as a null slot regardless of `dt`. Any other
+    /// datum must be the variant that corresponds to `dt`; otherwise an
+    /// `IllegalArgument` "Type mismatch" error is returned.
+    fn write_datum(
+        writer: &mut FlussArrayWriter,
+        pos: usize,
+        datum: Datum,
+        dt: &DataType,
+    ) -> Result<()> {
+        if datum.is_null() {
+            writer.set_null_at(pos);
+            return Ok(());
+        }
+
+        match (dt, &datum) {
+            (DataType::Boolean(_), Datum::Bool(flag)) => writer.write_boolean(pos, *flag),
+            (DataType::TinyInt(_), Datum::Int8(n)) => writer.write_byte(pos, *n),
+            (DataType::SmallInt(_), Datum::Int16(n)) => writer.write_short(pos, *n),
+            (DataType::Int(_), Datum::Int32(n)) => writer.write_int(pos, *n),
+            (DataType::BigInt(_), Datum::Int64(n)) => writer.write_long(pos, *n),
+            (DataType::Float(_), Datum::Float32(f)) => writer.write_float(pos, f.into_inner()),
+            (DataType::Double(_), Datum::Float64(f)) => writer.write_double(pos, f.into_inner()),
+            // CHAR and STRING share the string encoding.
+            (DataType::Char(_) | DataType::String(_), Datum::String(text)) => {
+                writer.write_string(pos, text)
+            }
+            // BINARY and BYTES share the binary encoding.
+            (DataType::Binary(_) | DataType::Bytes(_), Datum::Blob(blob)) => {
+                writer.write_binary_bytes(pos, blob)
+            }
+            (DataType::Decimal(decimal_type), Datum::Decimal(dec)) => {
+                writer.write_decimal(pos, dec, decimal_type.precision())
+            }
+            (DataType::Date(_), Datum::Date(date)) => writer.write_date(pos, *date),
+            (DataType::Time(_), Datum::Time(time)) => writer.write_time(pos, *time),
+            (DataType::Timestamp(ts_type), Datum::TimestampNtz(ts)) => {
+                writer.write_timestamp_ntz(pos, ts, ts_type.precision())
+            }
+            (DataType::TimestampLTz(ts_type), Datum::TimestampLtz(ts)) => {
+                writer.write_timestamp_ltz(pos, ts, ts_type.precision())
+            }
+            (DataType::Array(_), Datum::Array(arr)) => writer.write_array(pos, arr),
+            (DataType::Map(_), Datum::Map(map)) => writer.write_map(pos, map),
+            (DataType::Row(_), Datum::Row(row)) => writer.write_row(pos, row.as_ref())?,
+            _ => {
+                return Err(IllegalArgument {
+                    message: format!("Type mismatch: expected {:?}, got {:?}", dt, datum),
+                });
+            }
+        }
+        Ok(())
+    }
+}
+
+// Unit tests for `FlussMap`, its `Entries` iterator and `FlussMapWriter`.
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::metadata::DataTypes;
+    use crate::row::binary_array::FlussArrayWriter;
+
+    // Writes an INT -> STRING map, pins its exact binary form, then decodes
+    // it again and checks keys and values.
+    #[test]
+    fn test_round_trip_int_to_string_map() {
+        let mut writer = FlussMapWriter::new(2, &DataTypes::int(), &DataTypes::string());
+        writer.write_entry(1.into(), "a".into()).unwrap();
+        writer.write_entry(2.into(), "b".into()).unwrap();
+        let map = writer.complete().unwrap();
+        assert_eq!(map.size(), 2);
+
+        // The first four bytes are the little-endian byte length of the key
+        // array (16); the key array bytes and then the value array bytes follow.
+        assert_eq!(
+            map.as_bytes(),
+            &[
+                16, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0,
+                0, 97, 0, 0, 0, 0, 0, 0, 129, 98, 0, 0, 0, 0, 0, 0, 129
+            ]
+        );
+
+        let bytes = map.as_bytes();
+        let decoded = FlussMap::from_bytes(bytes, &DataTypes::int(), &DataTypes::string()).unwrap();
+
+        assert_eq!(decoded.size(), 2);
+        let decoded_keys = decoded.key_array();
+        let decoded_values = decoded.value_array();
+
+        assert_eq!(decoded_keys.get_int(0).unwrap(), 1);
+        assert_eq!(decoded_keys.get_int(1).unwrap(), 2);
+        assert_eq!(decoded_values.get_string(0).unwrap(), "a");
+        assert_eq!(decoded_values.get_string(1).unwrap(), "b");
+    }
+
+    // A writer with capacity 0 and no entries yields a map of size 0 that
+    // survives a bytes round trip.
+    #[test]
+    fn test_empty_map() {
+        let writer = FlussMapWriter::new(0, &DataTypes::int(), &DataTypes::string());
+        let map = writer.complete().unwrap();
+        assert_eq!(map.size(), 0);
+
+        let decoded =
+            FlussMap::from_bytes(map.as_bytes(), &DataTypes::int(), &DataTypes::string()).unwrap();
+        assert_eq!(decoded.size(), 0);
+    }
+
+    // Null values (unlike null keys) are accepted and survive a round trip.
+    #[test]
+    fn test_map_with_null_values() {
+        let key_type = DataTypes::string();
+        let value_type = DataTypes::int();
+        let mut writer = FlussMapWriter::new(3, &key_type, &value_type);
+        writer.write_entry("k1".into(), 10.into()).unwrap();
+        writer.write_entry("k2".into(), Datum::Null).unwrap();
+        writer.write_entry("k3".into(), 30.into()).unwrap();
+        let map = writer.complete().unwrap();
+
+        // Leading 32 is the little-endian byte length of the key array.
+        assert_eq!(
+            map.as_bytes(),
+            &[
+                32, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 107, 49, 0, 0, 0, 0, 0, 130, 107, 50, 0, 0, 0,
+                0, 0, 130, 107, 51, 0, 0, 0, 0, 0, 130, 3, 0, 0, 0, 2, 0, 0, 0, 10, 0, 0, 0, 0, 0,
+                0, 0, 30, 0, 0, 0, 0, 0, 0, 0
+            ]
+        );
+
+        let decoded = FlussMap::from_bytes(map.as_bytes(), &key_type, &value_type).unwrap();
+
+        let values = decoded.value_array();
+        assert_eq!(values.size(), 3);
+        assert!(!values.is_null_at(0));
+        assert!(values.is_null_at(1));
+        assert!(!values.is_null_at(2));
+        assert_eq!(values.get_int(0).unwrap(), 10);
+        assert_eq!(values.get_int(2).unwrap(), 30);
+    }
+
+    // `from_bytes` rejects malformed headers with specific error messages.
+    #[test]
+    fn test_invalid_data() {
+        // Too short
+        let err =
+            FlussMap::from_bytes(&[1, 2, 3], &DataTypes::int(), &DataTypes::int()).unwrap_err();
+        assert!(err.to_string().contains("FlussMap data too short"));
+
+        // Negative size
+        let neg_size = (-1i32).to_le_bytes();
+        let mut bad_data = vec![];
+        bad_data.extend_from_slice(&neg_size);
+        bad_data.extend_from_slice(&[0, 0, 0, 0]);
+        let err2 =
+            FlussMap::from_bytes(&bad_data, &DataTypes::int(), &DataTypes::int()).unwrap_err();
+        assert!(
+            err2.to_string()
+                .contains("FlussMap key array size must be non-negative")
+        );
+
+        // Key array length exceeds payload
+        let large_size = 100i32.to_le_bytes();
+        let mut bad_data2 = vec![];
+        bad_data2.extend_from_slice(&large_size);
+        bad_data2.extend_from_slice(&[0, 0, 0, 0]);
+        let err3 =
+            FlussMap::from_bytes(&bad_data2, &DataTypes::int(), &DataTypes::int()).unwrap_err();
+        assert!(
+            err3.to_string()
+                .contains("FlussMap key array size 100 exceeds remaining payload 4")
+        );
+    }
+
+    // `from_arrays` rejects key and value arrays with different element counts.
+    #[test]
+    fn test_mismatched_array_sizes() {
+        let key_writer = FlussArrayWriter::new(1, &DataTypes::int());
+        let key_array = key_writer.complete().unwrap();
+
+        let value_writer = FlussArrayWriter::new(2, &DataTypes::string());
+        let value_array = value_writer.complete().unwrap();
+
+        let err = FlussMap::from_arrays(
+            &key_array,
+            &value_array,
+            &DataTypes::int(),
+            &DataTypes::string(),
+        )
+        .unwrap_err();
+        assert!(err.to_string().contains("does not match value array size"));
+    }
+
+    // A map can be stored as the value of another map and read back.
+    #[test]
+    fn test_nested_map() {
+        let map_type = DataTypes::map(DataTypes::int(), DataTypes::string());
+        let mut inner_writer = FlussMapWriter::new(1, &DataTypes::int(), &DataTypes::string());
+        inner_writer.write_entry(1.into(), "b".into()).unwrap();
+        let inner_map = inner_writer.complete().unwrap();
+
+        let mut writer = FlussMapWriter::new(1, &DataTypes::string(), &map_type);
+        writer
+            .write_entry("a".into(), Datum::Map(inner_map))
+            .unwrap();
+        let map = writer.complete().unwrap();
+
+        let decoded =
+            FlussMap::from_bytes(map.as_bytes(), &DataTypes::string(), &map_type).unwrap();
+        let decoded_keys = decoded.key_array();
+        let decoded_values = decoded.value_array();
+
+        assert_eq!(decoded_keys.get_string(0).unwrap(), "a");
+        let decoded_inner_map = decoded_values
+            .get_map(0, &DataTypes::int(), &DataTypes::string())
+            .unwrap();
+        assert_eq!(decoded_inner_map.key_array().get_int(0).unwrap(), 1);
+        assert_eq!(decoded_inner_map.value_array().get_string(0).unwrap(), "b");
+    }
+
+    // Extra bytes after a valid encoding make `from_bytes` fail validation.
+    #[test]
+    fn test_trailing_garbage() {
+        let mut key_writer = FlussArrayWriter::new(1, &DataTypes::int());
+        key_writer.write_int(0, 1);
+        let key_array = key_writer.complete().unwrap();
+
+        let mut value_writer = FlussArrayWriter::new(1, &DataTypes::int());
+        value_writer.write_int(0, 100);
+        let value_array = value_writer.complete().unwrap();
+
+        let map = FlussMap::from_arrays(
+            &key_array,
+            &value_array,
+            &DataTypes::int(),
+            &DataTypes::int(),
+        )
+        .unwrap();
+        let bytes = map.as_bytes();
+
+        // Valid bytes should pass
+        assert!(FlussMap::from_bytes(bytes, &DataTypes::int(), &DataTypes::int()).is_ok());
+
+        // Append trailing garbage
+        let mut bad_bytes = bytes.to_vec();
+        bad_bytes.push(0);
+        let err =
+            FlussMap::from_bytes(&bad_bytes, &DataTypes::int(), &DataTypes::int()).unwrap_err();
+        assert!(err.to_string().contains("binary validation failed"));
+        assert!(err.to_string().contains("expected"));
+    }
+
+    // Null keys are rejected both by `from_arrays` and by `from_bytes`.
+    #[test]
+    fn test_null_keys_fail_validation() {
+        let mut key_writer = FlussArrayWriter::new(1, &DataTypes::int());
+        key_writer.set_null_at(0);
+        let key_array = key_writer.complete().unwrap();
+
+        let mut value_writer = FlussArrayWriter::new(1, &DataTypes::int());
+        value_writer.write_int(0, 100);
+        let value_array = value_writer.complete().unwrap();
+
+        let err = FlussMap::from_arrays(
+            &key_array,
+            &value_array,
+            &DataTypes::int(),
+            &DataTypes::int(),
+        )
+        .unwrap_err();
+        assert!(err.to_string().contains("keys cannot be null"));
+
+        // Hand-assemble the map bytes (length prefix, keys, values) so that
+        // the `from_bytes` path sees the same null key.
+        let key_bytes = key_array.as_bytes();
+        let value_bytes = value_array.as_bytes();
+        let mut data = vec![];
+        data.extend_from_slice(&(key_bytes.len() as i32).to_le_bytes());
+        data.extend_from_slice(key_bytes);
+        data.extend_from_slice(value_bytes);
+
+        let err = FlussMap::from_bytes(&data, &DataTypes::int(), &DataTypes::int()).unwrap_err();
+        assert!(err.to_string().contains("keys cannot be null"));
+    }
+
+    // `entries()` yields decoded pairs in insertion order, with a null value
+    // surfacing as `Datum::Null`.
+    #[test]
+    fn entries_yields_typed_pairs_including_nulls() {
+        let mut writer = FlussMapWriter::new(3, &DataTypes::string(), &DataTypes::int());
+        writer.write_entry("a".into(), 1.into()).unwrap();
+        writer.write_entry("b".into(), Datum::Null).unwrap();
+        writer.write_entry("c".into(), 3.into()).unwrap();
+        let map = writer.complete().unwrap();
+
+        let collected: Vec<(Datum, Datum)> = map
+            .entries()
+            .collect::<Result<Vec<_>>>()
+            .expect("entries should decode cleanly");
+
+        assert_eq!(collected.len(), 3);
+        assert_eq!(collected[0], (Datum::from("a"), Datum::from(1i32)));
+        assert_eq!(collected[1].0, Datum::from("b"));
+        assert_eq!(collected[1].1, Datum::Null);
+        assert_eq!(collected[2], (Datum::from("c"), Datum::from(3i32)));
+    }
+
+    // `get` returns the value for a present key and `None` for an absent one.
+    #[test]
+    fn get_finds_present_key_and_returns_none_for_absent() {
+        let mut writer = FlussMapWriter::new(2, &DataTypes::string(), &DataTypes::int());
+        writer.write_entry("a".into(), 10.into()).unwrap();
+        writer.write_entry("b".into(), 20.into()).unwrap();
+        let map = writer.complete().unwrap();
+
+        let v = map.get(&Datum::from("b")).unwrap();
+        assert_eq!(v, Some(Datum::from(20i32)));
+
+        let missing = map.get(&Datum::from("z")).unwrap();
+        assert!(missing.is_none());
+    }
+
+    // `extend` accepts any iterator of pairs convertible into `Datum`s.
+    #[test]
+    fn writer_extend_from_iterator_round_trips() {
+        let src: Vec<(&str, i32)> = vec![("a", 1), ("b", 2), ("c", 3)];
+        let mut writer = FlussMapWriter::new(src.len(), &DataTypes::string(), &DataTypes::int());
+        writer.extend(src).unwrap();
+        let map = writer.complete().unwrap();
+
+        assert_eq!(map.size(), 3);
+        assert_eq!(map.get(&Datum::from("b")).unwrap(), Some(Datum::from(2i32)));
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/row/column.rs b/fluss-rust/crates/fluss/src/row/column.rs
new file mode 100644
index 0000000000..9dbdd947dd
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/row/column.rs
@@ -0,0 +1,1719 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::error::Error::IllegalArgument;
+use crate::error::Result;
+use crate::metadata::{DataType, RowType};
+use crate::record::from_arrow_field;
+use crate::row::binary_array::FlussArrayWriter;
+use crate::row::binary_map::FlussMap;
+use crate::row::datum::{Date, Datum, Time, TimestampLtz, TimestampNtz};
+use crate::row::{Decimal, FlussArray, GenericRow, InternalRow};
+use arrow::array::{
+    Array, AsArray, BinaryArray, BooleanArray, Date32Array, Decimal128Array, FixedSizeBinaryArray,
+    Float32Array, Float64Array, Int8Array, Int16Array, Int32Array, Int64Array, ListArray, MapArray,
+    RecordBatch, StringArray, StructArray, Time32MillisecondArray, Time32SecondArray,
+    Time64MicrosecondArray, Time64NanosecondArray, TimestampMicrosecondArray,
+    TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray,
+};
+use arrow::datatypes::{
+    DataType as ArrowDataType, Date32Type, Decimal128Type, Float32Type, Float64Type, Int8Type,
+    Int16Type, Int32Type, Int64Type, Time32MillisecondType, Time32SecondType,
+    Time64MicrosecondType, Time64NanosecondType, TimeUnit, TimestampMicrosecondType,
+    TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType,
+};
+use std::sync::Arc;
+
+/// A view of a single row of an Arrow `RecordBatch`.
+///
+/// The row is identified by `row_id`; the batch itself is shared through an
+/// `Arc`, so cloning the view does not copy column data.
+#[derive(Clone)]
+pub struct ColumnarRow {
+    // Arrow batch the row is read from.
+    record_batch: Arc<RecordBatch>,
+    // NOTE(review): how this differs from `fluss_row_type` is not visible in
+    // this part of the file — confirm against the callers.
+    row_type: Arc<RowType>,
+    // Index of the current row within `record_batch`.
+    row_id: usize,
+    // Optional Fluss schema; when present, `new` derives `row_column_indices`
+    // from it instead of from the Arrow column types.
+    fluss_row_type: Option<Arc<RowType>>,
+    // Positions of the columns that hold nested rows.
+    row_column_indices: Arc<[usize]>,
+    // One lazily initialised slot per entry of `row_column_indices`; all
+    // slots are reset whenever `set_row_id` is called.
+    row_caches: Box<[std::sync::OnceLock<GenericRow<'static>>]>,
+}
+
+/// Returns the positions, in ascending order, of the fields of `row_type`
+/// whose Fluss data type is `DataType::Row`.
+pub(crate) fn fluss_row_column_indices(row_type: &RowType) -> Arc<[usize]> {
+    row_type
+        .fields()
+        .iter()
+        .enumerate()
+        .filter(|(_, field)| matches!(field.data_type, DataType::Row(_)))
+        .map(|(idx, _)| idx)
+        .collect()
+}
+
+/// Returns the positions, in ascending order, of the columns of `batch`
+/// whose Arrow data type is `Struct`.
+pub(crate) fn arrow_row_column_indices(batch: &RecordBatch) -> Arc<[usize]> {
+    batch
+        .columns()
+        .iter()
+        .enumerate()
+        .filter(|(_, col)| matches!(col.data_type(), ArrowDataType::Struct(_)))
+        .map(|(idx, _)| idx)
+        .collect()
+}
+
+/// Builds one empty cache slot per entry of `indices`.
+///
+/// Only the length of `indices` matters; the index values themselves are
+/// not read.
+fn make_row_caches(indices: &[usize]) -> Box<[std::sync::OnceLock<GenericRow<'static>>]> {
+    (0..indices.len())
+        .map(|_| std::sync::OnceLock::new())
+        .collect()
+}
+
+impl ColumnarRow {
+    /// Creates a row view over `batch` positioned at `row_id`.
+    ///
+    /// The positions of nested-row columns are derived from `fluss_row_type`
+    /// when it is provided (fields of type `DataType::Row`), and otherwise
+    /// from the Arrow batch (columns of type `Struct`).
+    pub fn new(
+        batch: Arc<RecordBatch>,
+        row_type: Arc<RowType>,
+        row_id: usize,
+        fluss_row_type: Option<Arc<RowType>>,
+    ) -> Self {
+        let row_column_indices = match &fluss_row_type {
+            Some(rt) => fluss_row_column_indices(rt),
+            None => arrow_row_column_indices(&batch),
+        };
+        Self::with_indices(batch, row_type, row_id, fluss_row_type, row_column_indices)
+    }
+
+    /// Like [`Self::new`], but uses the caller-supplied `row_column_indices`
+    /// instead of computing them.
+    ///
+    /// One empty cache slot is allocated per index.
+    pub(crate) fn with_indices(
+        batch: Arc<RecordBatch>,
+        row_type: Arc<RowType>,
+        row_id: usize,
+        fluss_row_type: Option<Arc<RowType>>,
+        row_column_indices: Arc<[usize]>,
+    ) -> Self {
+        let row_caches = make_row_caches(&row_column_indices);
+        ColumnarRow {
+            record_batch: batch,
+            row_type,
+            row_id,
+            fluss_row_type,
+            row_column_indices,
+            row_caches,
+        }
+    }
+
+    /// Returns the optional Fluss row type this view was created with.
+    pub fn fluss_row_type(&self) -> Option<&Arc<RowType>> {
+        self.fluss_row_type.as_ref()
+    }
+
+    /// Repositions the view on another row of the same batch.
+    ///
+    /// Every nested-row cache slot is reset, because cached rows belong to
+    /// the previous position.
+    pub fn set_row_id(&mut self, row_id: usize) {
+        self.row_id = row_id;
+        for lock in self.row_caches.iter_mut() {
+            *lock = std::sync::OnceLock::new();
+        }
+    }
+
+    /// Returns the index of the current row within the batch.
+    pub fn get_row_id(&self) -> usize {
+        self.row_id
+    }
+
+    /// Returns the Arrow batch this view reads from.
+    pub fn get_record_batch(&self) -> &RecordBatch {
+        &self.record_batch
+    }
+
+    /// Returns the column at `pos`, or an `IllegalArgument` error when `pos`
+    /// is not a valid column index of the batch.
+    fn column(&self, pos: usize) -> Result<&Arc<dyn Array>> {
+        self.record_batch
+            .columns()
+            .get(pos)
+            .ok_or_else(|| IllegalArgument {
+                message: format!(
+                    "column index {pos} out of bounds (batch has {} columns)",
+                    self.record_batch.num_columns()
+                ),
+            })
+    }
+
+    /// Generic helper to read timestamp from Arrow, handling all TimeUnit conversions.
+    /// Like Java, the precision parameter is ignored - conversion is determined by Arrow TimeUnit.
+    ///
+    /// The raw value is split into epoch milliseconds plus a sub-millisecond
+    /// nanosecond remainder. `construct_compact` is called with the
+    /// milliseconds when the remainder is zero; otherwise
+    /// `construct_with_nanos` is called with both parts.
+    ///
+    /// # Errors
+    /// Returns `IllegalArgument` when `pos` is out of bounds, when the column
+    /// is not an Arrow timestamp column, or when a second-resolution value
+    /// cannot be expressed in milliseconds without overflowing `i64`.
+    fn read_timestamp_from_arrow<T>(
+        &self,
+        pos: usize,
+        _precision: u32,
+        construct_compact: impl FnOnce(i64) -> T,
+        construct_with_nanos: impl FnOnce(i64, i32) -> Result<T>,
+    ) -> Result<T> {
+        let column = self.column(pos)?;
+
+        // Read value and time unit based on the actual Arrow timestamp type
+        let (value, time_unit) = match column.data_type() {
+            ArrowDataType::Timestamp(TimeUnit::Second, _) => (
+                column
+                    .as_primitive_opt::<TimestampSecondType>()
+                    .ok_or_else(|| IllegalArgument {
+                        message: format!("expected TimestampSecondArray at position {pos}"),
+                    })?
+                    .value(self.row_id),
+                TimeUnit::Second,
+            ),
+            ArrowDataType::Timestamp(TimeUnit::Millisecond, _) => (
+                column
+                    .as_primitive_opt::<TimestampMillisecondType>()
+                    .ok_or_else(|| IllegalArgument {
+                        message: format!("expected TimestampMillisecondArray at position {pos}"),
+                    })?
+                    .value(self.row_id),
+                TimeUnit::Millisecond,
+            ),
+            ArrowDataType::Timestamp(TimeUnit::Microsecond, _) => (
+                column
+                    .as_primitive_opt::<TimestampMicrosecondType>()
+                    .ok_or_else(|| IllegalArgument {
+                        message: format!("expected TimestampMicrosecondArray at position {pos}"),
+                    })?
+                    .value(self.row_id),
+                TimeUnit::Microsecond,
+            ),
+            ArrowDataType::Timestamp(TimeUnit::Nanosecond, _) => (
+                column
+                    .as_primitive_opt::<TimestampNanosecondType>()
+                    .ok_or_else(|| IllegalArgument {
+                        message: format!("expected TimestampNanosecondArray at position {pos}"),
+                    })?
+                    .value(self.row_id),
+                TimeUnit::Nanosecond,
+            ),
+            other => {
+                return Err(IllegalArgument {
+                    message: format!("expected Timestamp column at position {pos}, got {other:?}"),
+                });
+            }
+        };
+
+        // Convert based on Arrow TimeUnit
+        let (millis, nanos) = match time_unit {
+            TimeUnit::Second => {
+                // Seconds near the i64 limits cannot be scaled to
+                // milliseconds; report that instead of wrapping or panicking.
+                let millis = value.checked_mul(1000).ok_or_else(|| IllegalArgument {
+                    message: format!(
+                        "timestamp value {value}s at position {pos} overflows i64 milliseconds"
+                    ),
+                })?;
+                (millis, 0)
+            }
+            TimeUnit::Millisecond => (value, 0),
+            TimeUnit::Microsecond => {
+                // Use Euclidean division so that nanos is always non-negative,
+                // even for timestamps before the Unix epoch.
+                let millis = value.div_euclid(1000);
+                let nanos = (value.rem_euclid(1000) * 1000) as i32;
+                (millis, nanos)
+            }
+            TimeUnit::Nanosecond => {
+                // Use Euclidean division so that nanos is always in [0, 999_999].
+                let millis = value.div_euclid(1_000_000);
+                let nanos = value.rem_euclid(1_000_000) as i32;
+                (millis, nanos)
+            }
+        };
+
+        if nanos == 0 {
+            Ok(construct_compact(millis))
+        } else {
+            construct_with_nanos(millis, nanos)
+        }
+    }
+
+    /// Read date value from Arrow Date32Array
+    ///
+    /// # Errors
+    /// Returns `IllegalArgument` when `pos` is out of bounds or the column is
+    /// not a `Date32` column.
+    fn read_date_from_arrow(&self, pos: usize) -> Result<i32> {
+        Ok(self
+            .column(pos)?
+            .as_primitive_opt::<Date32Type>()
+            .ok_or_else(|| IllegalArgument {
+                message: format!("expected Date32Array at position {pos}"),
+            })?
+            .value(self.row_id))
+    }
+
+    /// Read time value from Arrow Time32/Time64 arrays, converting to milliseconds
+    ///
+    /// Microsecond and nanosecond values are divided down to milliseconds
+    /// (truncating toward zero).
+    ///
+    /// # Errors
+    /// Returns `IllegalArgument` when `pos` is out of bounds, when the column
+    /// is not an Arrow time column, or when the converted value does not fit
+    /// in an `i32` number of milliseconds.
+    fn read_time_from_arrow(&self, pos: usize) -> Result<i32> {
+        let column = self.column(pos)?;
+
+        match column.data_type() {
+            ArrowDataType::Time32(TimeUnit::Second) => {
+                let seconds = column
+                    .as_primitive_opt::<Time32SecondType>()
+                    .ok_or_else(|| IllegalArgument {
+                        message: format!("expected Time32SecondArray at position {pos}"),
+                    })?
+                    .value(self.row_id);
+                // Convert seconds to milliseconds, rejecting values that
+                // would overflow i32.
+                seconds.checked_mul(1000).ok_or_else(|| IllegalArgument {
+                    message: format!(
+                        "time value {seconds}s at position {pos} overflows i32 milliseconds"
+                    ),
+                })
+            }
+            ArrowDataType::Time32(TimeUnit::Millisecond) => Ok(column
+                .as_primitive_opt::<Time32MillisecondType>()
+                .ok_or_else(|| IllegalArgument {
+                    message: format!("expected Time32MillisecondArray at position {pos}"),
+                })?
+                .value(self.row_id)),
+            ArrowDataType::Time64(TimeUnit::Microsecond) => {
+                let micros = column
+                    .as_primitive_opt::<Time64MicrosecondType>()
+                    .ok_or_else(|| IllegalArgument {
+                        message: format!("expected Time64MicrosecondArray at position {pos}"),
+                    })?
+                    .value(self.row_id);
+                // Convert microseconds to milliseconds; a checked narrowing
+                // replaces the silent truncation of an `as i32` cast.
+                i32::try_from(micros / 1000).map_err(|_| IllegalArgument {
+                    message: format!(
+                        "time value {micros}us at position {pos} does not fit in i32 milliseconds"
+                    ),
+                })
+            }
+            ArrowDataType::Time64(TimeUnit::Nanosecond) => {
+                let nanos = column
+                    .as_primitive_opt::<Time64NanosecondType>()
+                    .ok_or_else(|| IllegalArgument {
+                        message: format!("expected Time64NanosecondArray at position {pos}"),
+                    })?
+                    .value(self.row_id);
+                // Convert nanoseconds to milliseconds with the same checked
+                // narrowing as the microsecond case.
+                i32::try_from(nanos / 1_000_000).map_err(|_| IllegalArgument {
+                    message: format!(
+                        "time value {nanos}ns at position {pos} does not fit in i32 milliseconds"
+                    ),
+                })
+            }
+            other => Err(IllegalArgument {
+                message: format!("expected Time column at position {pos}, got {other:?}"),
+            }),
+        }
+    }
+}
+
+/// Converts row `row_id` of an Arrow `StructArray` into an owned `GenericRow`.
+///
+/// Each child column is converted with `arrow_value_to_datum`. When
+/// `row_type` is given, the data type of its field at the same position is
+/// passed along as the Fluss type of that child.
+///
+/// # Errors
+/// Returns `IllegalArgument` when `array` is not a `StructArray` or when
+/// `row_type` has a different number of fields than the struct has columns,
+/// and propagates the first error produced while converting a child value.
+fn extract_struct_from_array(
+    array: &dyn Array,
+    row_id: usize,
+    row_type: Option<&RowType>,
+) -> Result<GenericRow<'static>> {
+    let Some(struct_array) = array.as_any().downcast_ref::<StructArray>() else {
+        return Err(IllegalArgument {
+            message: format!("expected StructArray, got {:?}", array.data_type()),
+        });
+    };
+    let column_count = struct_array.num_columns();
+
+    // The Fluss schema, when present, must describe exactly the struct's children.
+    match row_type {
+        Some(rt) if rt.fields().len() != column_count => {
+            return Err(IllegalArgument {
+                message: format!(
+                    "Fluss RowType has {} fields but Arrow StructArray has {}",
+                    rt.fields().len(),
+                    column_count,
+                ),
+            });
+        }
+        _ => {}
+    }
+
+    let values = (0..column_count)
+        .map(|i| {
+            let fluss_type = row_type.map(|rt| &rt.fields()[i].data_type);
+            arrow_value_to_datum(struct_array.column(i).as_ref(), row_id, fluss_type)
+        })
+        .collect::<Result<Vec<_>>>()?;
+    Ok(GenericRow { values })
+}
+
+fn arrow_value_to_datum(
+    array: &dyn Array,
+    row_id: usize,
+    fluss_type: Option<&DataType>,
+) -> Result<Datum<'static>> {
+    if array.is_null(row_id) {
+        return Ok(Datum::Null);
+    }
+
+    macro_rules! downcast {
+        ($ty:ty) => {
+            array
+                .as_any()
+                .downcast_ref::<$ty>()
+                .ok_or_else(|| IllegalArgument {
+                    message: format!(
+                        "expected {} for arrow type {:?}",
+                        stringify!($ty),
+                        array.data_type()
+                    ),
+                })?
+        };
+    }
+
+    match array.data_type() {
+        ArrowDataType::Boolean => Ok(Datum::Bool(downcast!(BooleanArray).value(row_id))),
+        ArrowDataType::Int8 => Ok(Datum::Int8(downcast!(Int8Array).value(row_id))),
+        ArrowDataType::Int16 => Ok(Datum::Int16(downcast!(Int16Array).value(row_id))),
+        ArrowDataType::Int32 => Ok(Datum::Int32(downcast!(Int32Array).value(row_id))),
+        ArrowDataType::Int64 => Ok(Datum::Int64(downcast!(Int64Array).value(row_id))),
+        ArrowDataType::Float32 => Ok(Datum::Float32(downcast!(Float32Array).value(row_id).into())),
+        ArrowDataType::Float64 => Ok(Datum::Float64(downcast!(Float64Array).value(row_id).into())),
+        ArrowDataType::Utf8 => Ok(Datum::String(std::borrow::Cow::Owned(
+            downcast!(StringArray).value(row_id).to_owned(),
+        ))),
+        ArrowDataType::Binary => Ok(Datum::Blob(std::borrow::Cow::Owned(
+            downcast!(BinaryArray).value(row_id).to_vec(),
+        ))),
+        ArrowDataType::FixedSizeBinary(_) => Ok(Datum::Blob(std::borrow::Cow::Owned(
+            downcast!(FixedSizeBinaryArray).value(row_id).to_vec(),
+        ))),
+        ArrowDataType::Decimal128(p, s) => {
+            let (p, s) = (*p, *s);
+            let i128_val = downcast!(Decimal128Array).value(row_id);
+            Ok(Datum::Decimal(Decimal::from_arrow_decimal128(
+                i128_val, s as i64, p as u32, s as u32,
+            )?))
+        }
+        ArrowDataType::Date32 => Ok(Datum::Date(Date::new(downcast!(Date32Array).value(row_id)))),
+        ArrowDataType::Time32(TimeUnit::Second) => Ok(Datum::Time(Time::new(
+            downcast!(Time32SecondArray).value(row_id) * 1000,
+        ))),
+        ArrowDataType::Time32(TimeUnit::Millisecond) => Ok(Datum::Time(Time::new(
+            downcast!(Time32MillisecondArray).value(row_id),
+        ))),
+        ArrowDataType::Time64(TimeUnit::Microsecond) => Ok(Datum::Time(Time::new(
+            (downcast!(Time64MicrosecondArray).value(row_id) / 1000) as i32,
+        ))),
+        ArrowDataType::Time64(TimeUnit::Nanosecond) => Ok(Datum::Time(Time::new(
+            (downcast!(Time64NanosecondArray).value(row_id) / 1_000_000) as i32,
+        ))),
+        ArrowDataType::Timestamp(time_unit, _tz) => {
+            let value: i64 = match time_unit {
+                TimeUnit::Second => downcast!(TimestampSecondArray).value(row_id),
+                TimeUnit::Millisecond => downcast!(TimestampMillisecondArray).value(row_id),
+                TimeUnit::Microsecond => downcast!(TimestampMicrosecondArray).value(row_id),
+                TimeUnit::Nanosecond => downcast!(TimestampNanosecondArray).value(row_id),
+            };
+            let (millis, nanos) = match time_unit {
+                TimeUnit::Second => (value * 1000, 0i32),
+                TimeUnit::Millisecond => (value, 0i32),
+                TimeUnit::Microsecond => {
+                    let millis = value.div_euclid(1000);
+                    let nanos = (value.rem_euclid(1000) * 1000) as i32;
+                    (millis, nanos)
+                }
+                TimeUnit::Nanosecond => {
+                    let millis = value.div_euclid(1_000_000);
+                    let nanos = value.rem_euclid(1_000_000) as i32;
+                    (millis, nanos)
+                }
+            };
+            // TIMESTAMP and TIMESTAMP_LTZ both map to `Timestamp(unit, None)` in Arrow.
+            let is_ltz = matches!(fluss_type, Some(DataType::TimestampLTz(_)));
+            if is_ltz {
+                if nanos == 0 {
+                    Ok(Datum::TimestampLtz(TimestampLtz::new(millis)))
+                } else {
+                    Ok(Datum::TimestampLtz(TimestampLtz::from_millis_nanos(
+                        millis, nanos,
+                    )?))
+                }
+            } else if nanos == 0 {
+                Ok(Datum::TimestampNtz(TimestampNtz::new(millis)))
+            } else {
+                Ok(Datum::TimestampNtz(TimestampNtz::from_millis_nanos(
+                    millis, nanos,
+                )?))
+            }
+        }
+        ArrowDataType::Struct(_) => {
+            let nested_row_type = fluss_type.and_then(|t| match t {
+                DataType::Row(rt) => Some(rt),
+                _ => None,
+            });
+            let nested = extract_struct_from_array(array, row_id, nested_row_type)?;
+            Ok(Datum::Row(Box::new(nested)))
+        }
+        ArrowDataType::List(field) => {
+            let list_arr = downcast!(ListArray);
+            let values = list_arr.value(row_id);
+            // Infer via from_arrow_field so the inferred element type
+            // matches what `arrow_map_entry_to_fluss_map` / strict `==`
+            // expect when there's no upstream Fluss schema.
+            let element_fluss_type = match fluss_type {
+                Some(DataType::Array(at)) => at.get_element_type().clone(),
+                _ => from_arrow_field(field)?,
+            };
+            let mut writer = FlussArrayWriter::new(values.len(), &element_fluss_type);
+            write_arrow_values_to_fluss_array(&*values, &element_fluss_type, &mut writer)?;
+            Ok(Datum::Array(writer.complete()?))
+        }
+        ArrowDataType::Map(entries_field, _) => {
+            let map_arr = downcast!(MapArray);
+            let entries = map_arr.value(row_id);
+            let (key_type, value_type) = match fluss_type {
+                Some(DataType::Map(m)) => (m.key_type().clone(), m.value_type().clone()),
+                _ => {
+                    let fields = match entries_field.data_type() {
+                        ArrowDataType::Struct(f) => f,
+                        other => {
+                            return Err(IllegalArgument {
+                                message: format!("expected Struct for Map entries, got {other:?}"),
+                            });
+                        }
+                    };
+                    if fields.len() != 2 {
+                        return Err(IllegalArgument {
+                            message: format!(
+                                "Map entries Struct must have 2 fields, got {}",
+                                fields.len()
+                            ),
+                        });
+                    }
+                    (from_arrow_field(&fields[0])?, from_arrow_field(&fields[1])?)
+                }
+            };
+            Ok(Datum::Map(arrow_map_entry_to_fluss_map(
+                &entries,
+                &key_type,
+                &value_type,
+            )?))
+        }
+        other => Err(IllegalArgument {
+            message: format!("unsupported Arrow data type for nested row extraction: {other:?}"),
+        }),
+    }
+}
+
+/// `InternalRow` view over a single row (`self.row_id`) of an Arrow record batch.
+///
+/// Every typed getter downcasts the column at `pos` to the Arrow array type it
+/// expects and returns `IllegalArgument` when the column has a different type.
+/// Getters read the raw slot value and do not check for null themselves (only
+/// `get_row` rejects a null cell), so callers should consult `is_null_at` first.
+impl InternalRow for ColumnarRow {
+    /// Number of fields, taken from the column count of the backing record batch.
+    fn get_field_count(&self) -> usize {
+        self.record_batch.num_columns()
+    }
+
+    /// Whether the cell at column `pos` of the current row is null.
+    fn is_null_at(&self, pos: usize) -> Result<bool> {
+        Ok(self.column(pos)?.is_null(self.row_id))
+    }
+
+    /// Reads the cell at `pos` from a boolean array.
+    fn get_boolean(&self, pos: usize) -> Result<bool> {
+        Ok(self
+            .column(pos)?
+            .as_boolean_opt()
+            .ok_or_else(|| IllegalArgument {
+                message: format!("expected boolean array at position {pos}"),
+            })?
+            .value(self.row_id))
+    }
+
+    /// Reads the cell at `pos` from an `Int8` array.
+    fn get_byte(&self, pos: usize) -> Result<i8> {
+        Ok(self
+            .column(pos)?
+            .as_primitive_opt::<Int8Type>()
+            .ok_or_else(|| IllegalArgument {
+                message: format!("expected byte array at position {pos}"),
+            })?
+            .value(self.row_id))
+    }
+
+    /// Reads the cell at `pos` from an `Int16` array.
+    fn get_short(&self, pos: usize) -> Result<i16> {
+        Ok(self
+            .column(pos)?
+            .as_primitive_opt::<Int16Type>()
+            .ok_or_else(|| IllegalArgument {
+                message: format!("expected short array at position {pos}"),
+            })?
+            .value(self.row_id))
+    }
+
+    /// Reads the cell at `pos` from an `Int32` array.
+    fn get_int(&self, pos: usize) -> Result<i32> {
+        Ok(self
+            .column(pos)?
+            .as_primitive_opt::<Int32Type>()
+            .ok_or_else(|| IllegalArgument {
+                message: format!("expected int array at position {pos}"),
+            })?
+            .value(self.row_id))
+    }
+
+    /// Reads the cell at `pos` from an `Int64` array.
+    fn get_long(&self, pos: usize) -> Result<i64> {
+        Ok(self
+            .column(pos)?
+            .as_primitive_opt::<Int64Type>()
+            .ok_or_else(|| IllegalArgument {
+                message: format!("expected long array at position {pos}"),
+            })?
+            .value(self.row_id))
+    }
+
+    /// Reads the cell at `pos` from a `Float32` array.
+    fn get_float(&self, pos: usize) -> Result<f32> {
+        Ok(self
+            .column(pos)?
+            .as_primitive_opt::<Float32Type>()
+            .ok_or_else(|| IllegalArgument {
+                message: format!("expected float32 array at position {pos}"),
+            })?
+            .value(self.row_id))
+    }
+
+    /// Reads the cell at `pos` from a `Float64` array.
+    fn get_double(&self, pos: usize) -> Result<f64> {
+        Ok(self
+            .column(pos)?
+            .as_primitive_opt::<Float64Type>()
+            .ok_or_else(|| IllegalArgument {
+                message: format!("expected float64 array at position {pos}"),
+            })?
+            .value(self.row_id))
+    }
+
+    /// Reads a CHAR cell at `pos` from a `StringArray`. The declared length is
+    /// not used: the stored string is returned as-is.
+    fn get_char(&self, pos: usize, _length: usize) -> Result<&str> {
+        Ok(self
+            .column(pos)?
+            .as_any()
+            .downcast_ref::<StringArray>()
+            .ok_or_else(|| IllegalArgument {
+                message: format!("expected String array for char type at position {pos}"),
+            })?
+            .value(self.row_id))
+    }
+
+    /// Reads a STRING cell at `pos` from a `StringArray`.
+    fn get_string(&self, pos: usize) -> Result<&str> {
+        Ok(self
+            .column(pos)?
+            .as_any()
+            .downcast_ref::<StringArray>()
+            .ok_or_else(|| IllegalArgument {
+                message: format!("expected String array at position {pos}"),
+            })?
+            .value(self.row_id))
+    }
+
+    /// Reads a DECIMAL cell at `pos` from a `Decimal128` array.
+    ///
+    /// The scale stored in the Arrow data type is handed to
+    /// `Decimal::from_arrow_decimal128` together with the requested
+    /// `precision` and `scale`; any error from that conversion is returned.
+    fn get_decimal(&self, pos: usize, precision: usize, scale: usize) -> Result<Decimal> {
+        let column = self.column(pos)?;
+        let array = column
+            .as_primitive_opt::<Decimal128Type>()
+            .ok_or_else(|| IllegalArgument {
+                message: format!(
+                    "expected Decimal128Array at column {pos}, found: {:?}",
+                    column.data_type()
+                ),
+            })?;
+
+        // Contract: caller must check is_null_at() before calling get_decimal.
+        debug_assert!(
+            !array.is_null(self.row_id),
+            "get_decimal called on null value at pos {} row {}",
+            pos,
+            self.row_id
+        );
+
+        let arrow_scale = match column.data_type() {
+            ArrowDataType::Decimal128(_p, s) => *s as i64,
+            dt => {
+                return Err(IllegalArgument {
+                    message: format!(
+                        "expected Decimal128 data type at column {pos}, found: {dt:?}"
+                    ),
+                });
+            }
+        };
+
+        Decimal::from_arrow_decimal128(
+            array.value(self.row_id),
+            arrow_scale,
+            precision as u32,
+            scale as u32,
+        )
+    }
+
+    /// Reads a DATE cell at `pos` via `read_date_from_arrow`.
+    fn get_date(&self, pos: usize) -> Result<Date> {
+        Ok(Date::new(self.read_date_from_arrow(pos)?))
+    }
+
+    /// Reads a TIME cell at `pos` via `read_time_from_arrow`.
+    fn get_time(&self, pos: usize) -> Result<Time> {
+        Ok(Time::new(self.read_time_from_arrow(pos)?))
+    }
+
+    /// Reads a TIMESTAMP (without time zone) cell at `pos`, delegating to
+    /// `read_timestamp_from_arrow` with the `TimestampNtz` constructors.
+    fn get_timestamp_ntz(&self, pos: usize, precision: u32) -> Result<TimestampNtz> {
+        self.read_timestamp_from_arrow(
+            pos,
+            precision,
+            TimestampNtz::new,
+            TimestampNtz::from_millis_nanos,
+        )
+    }
+
+    /// Reads a TIMESTAMP_LTZ cell at `pos`, delegating to
+    /// `read_timestamp_from_arrow` with the `TimestampLtz` constructors.
+    fn get_timestamp_ltz(&self, pos: usize, precision: u32) -> Result<TimestampLtz> {
+        self.read_timestamp_from_arrow(
+            pos,
+            precision,
+            TimestampLtz::new,
+            TimestampLtz::from_millis_nanos,
+        )
+    }
+
+    /// Reads a BINARY cell at `pos` from a fixed-size binary array. The
+    /// declared length is not used.
+    fn get_binary(&self, pos: usize, _length: usize) -> Result<&[u8]> {
+        Ok(self
+            .column(pos)?
+            .as_fixed_size_binary_opt()
+            .ok_or_else(|| IllegalArgument {
+                message: format!("expected binary array at position {pos}"),
+            })?
+            .value(self.row_id))
+    }
+
+    /// Reads a BYTES cell at `pos` from a `BinaryArray`.
+    fn get_bytes(&self, pos: usize) -> Result<&[u8]> {
+        Ok(self
+            .column(pos)?
+            .as_any()
+            .downcast_ref::<BinaryArray>()
+            .ok_or_else(|| IllegalArgument {
+                message: format!("expected bytes array at position {pos}"),
+            })?
+            .value(self.row_id))
+    }
+
+    /// Converts the Arrow list cell at `pos` into a `FlussArray`.
+    ///
+    /// The element type comes from the Fluss row type (`self.row_type`), not
+    /// from the Arrow schema. Returns `IllegalArgument` when `pos` has no entry
+    /// in the row type, when that entry is not an ARRAY, or when the Arrow
+    /// column is not a `List`.
+    fn get_array(&self, pos: usize) -> Result<FlussArray> {
+        // Look the field up with `get` so a `pos` that the row type does not
+        // cover is reported as an error instead of panicking on the index.
+        let fields = self.row_type.fields();
+        let expected_type = fields
+            .get(pos)
+            .ok_or_else(|| IllegalArgument {
+                message: format!(
+                    "position {pos} is out of bounds for row type with {} fields",
+                    fields.len()
+                ),
+            })?
+            .data_type();
+        let element_fluss_type = match expected_type {
+            DataType::Array(a) => a.get_element_type(),
+            _ => {
+                return Err(IllegalArgument {
+                    message: format!(
+                        "expected Array type at position {pos}, got {expected_type:?}"
+                    ),
+                });
+            }
+        };
+
+        let column = self.column(pos)?;
+        match column.data_type() {
+            ArrowDataType::List(_) => {}
+            other => {
+                return Err(IllegalArgument {
+                    message: format!("expected List array at position {pos}, got {other:?}"),
+                });
+            }
+        }
+
+        // `to_arrow_type` is lossy (e.g. TIMESTAMP_LTZ → plain Arrow Timestamp);
+        // trust the Fluss schema and let the per-element conversion below catch
+        // real shape mismatches.
+
+        let list_arr = column
+            .as_any()
+            .downcast_ref::<ListArray>()
+            .expect("data_type matched List but downcast failed; arrow-rs invariant violated");
+        let values = list_arr.value(self.row_id);
+        let mut writer = FlussArrayWriter::new(values.len(), element_fluss_type);
+        write_arrow_values_to_fluss_array(&*values, element_fluss_type, &mut writer)?;
+        writer.complete()
+    }
+
+    /// Converts the Arrow map cell at `pos` into a `FlussMap`.
+    ///
+    /// Key and value types come from the Fluss row type (`self.row_type`).
+    /// Returns `IllegalArgument` when `pos` has no entry in the row type, when
+    /// that entry is not a MAP, or when the Arrow column is not a `MapArray`.
+    fn get_map(&self, pos: usize) -> Result<FlussMap> {
+        // Same bounds-checked lookup as `get_array`: error out, do not panic.
+        let fields = self.row_type.fields();
+        let expected_type = fields
+            .get(pos)
+            .ok_or_else(|| IllegalArgument {
+                message: format!(
+                    "position {pos} is out of bounds for row type with {} fields",
+                    fields.len()
+                ),
+            })?
+            .data_type();
+        let map_type = match expected_type {
+            DataType::Map(m) => m,
+            _ => {
+                return Err(IllegalArgument {
+                    message: format!("expected Map type at position {pos}, got {expected_type:?}"),
+                });
+            }
+        };
+
+        let column = self.column(pos)?;
+        let map_arr =
+            column
+                .as_any()
+                .downcast_ref::<MapArray>()
+                .ok_or_else(|| IllegalArgument {
+                    message: format!(
+                        "expected Map array at position {pos}, got {:?}",
+                        column.data_type()
+                    ),
+                })?;
+
+        arrow_map_entry_to_fluss_map(
+            &map_arr.value(self.row_id),
+            map_type.key_type(),
+            map_type.value_type(),
+        )
+    }
+
+    /// Returns the nested ROW at `pos`, extracting it on first access and
+    /// serving later calls from the per-column cache.
+    ///
+    /// Returns `IllegalArgument` when `pos` is not one of the ROW columns
+    /// recorded in `row_column_indices`, or when the cell is null.
+    fn get_row(&self, pos: usize) -> Result<&GenericRow<'_>> {
+        // `row_caches` is indexed by the column's position within
+        // `row_column_indices`, not by `pos` itself.
+        let cache_idx = self
+            .row_column_indices
+            .iter()
+            .position(|&i| i == pos)
+            .ok_or_else(|| IllegalArgument {
+                message: format!("get_row called on non-ROW column at position {pos}"),
+            })?;
+        let column = self.record_batch.column(pos);
+        // Children of a null parent may carry stale bytes; caller must
+        // check is_null_at first rather than rely on what we'd read.
+        if column.is_null(self.row_id) {
+            return Err(IllegalArgument {
+                message: format!(
+                    "get_row called on null ROW cell at position {pos}, row {}; \
+                     check is_null_at({pos}) first",
+                    self.row_id
+                ),
+            });
+        }
+        let lock = &self.row_caches[cache_idx];
+        if let Some(row) = lock.get() {
+            return Ok(row);
+        }
+        // Pass the nested Fluss row type along when one is known; otherwise the
+        // extraction runs with `None`.
+        let nested_row_type = self.fluss_row_type.as_ref().and_then(|rt| {
+            rt.fields().get(pos).and_then(|f| match &f.data_type {
+                DataType::Row(inner) => Some(inner),
+                _ => None,
+            })
+        });
+        // Extract before touching the cache so a failed extraction leaves it unset.
+        let extracted = extract_struct_from_array(column.as_ref(), self.row_id, nested_row_type)?;
+        Ok(lock.get_or_init(|| extracted))
+    }
+}
+
+/// Builds a `FlussMap` from the entries struct of one Arrow map cell.
+///
+/// `struct_arr` must be a two-column struct: column 0 holds the keys and
+/// column 1 the values. Each column is converted to a `FlussArray` using the
+/// supplied Fluss `key_type` / `value_type`, and the two arrays are combined
+/// with `FlussMap::from_arrays`. Returns `IllegalArgument` when the entries
+/// are not a two-field struct; conversion errors are propagated.
+#[inline]
+fn arrow_map_entry_to_fluss_map(
+    struct_arr: &arrow::array::StructArray,
+    key_type: &DataType,
+    value_type: &DataType,
+) -> Result<FlussMap> {
+    /// Converts one entries column into a `FlussArray` with `len` slots.
+    fn convert_column(
+        column: &dyn Array,
+        len: usize,
+        element_type: &DataType,
+    ) -> Result<FlussArray> {
+        let mut column_writer = FlussArrayWriter::new(len, element_type);
+        write_arrow_values_to_fluss_array(column, element_type, &mut column_writer)?;
+        column_writer.complete()
+    }
+
+    let entry_fields = match struct_arr.data_type() {
+        ArrowDataType::Struct(struct_fields) => struct_fields,
+        other => {
+            return Err(IllegalArgument {
+                message: format!("expected Struct for Map entries, got {other:?}"),
+            });
+        }
+    };
+    let field_count = entry_fields.len();
+    if field_count != 2 {
+        return Err(IllegalArgument {
+            message: format!(
+                "Expected 2 columns in Map entries struct, got {}",
+                field_count
+            ),
+        });
+    }
+
+    // `to_arrow_type` is lossy (e.g. TIMESTAMP_LTZ → plain Arrow Timestamp);
+    // trust the Fluss schema and let the per-element conversion below catch
+    // real shape mismatches.
+
+    // Both writers are sized from the key column's length.
+    let entry_count = struct_arr.column(0).len();
+
+    // Keys first, then values.
+    let key_array = convert_column(&**struct_arr.column(0), entry_count, key_type)?;
+    let value_array = convert_column(&**struct_arr.column(1), entry_count, value_type)?;
+
+    FlussMap::from_arrays(&key_array, &value_array, key_type, value_type)
+}
+
+/// Copies every slot of a primitive Arrow array into a writer.
+///
+/// `$values` is downcast once with `as_primitive_opt::<$arrow_type>()`; if that
+/// fails, the enclosing function returns `IllegalArgument`. Null slots are
+/// recorded with `set_null_at`, all others go through `$writer.$write_method`.
+macro_rules! write_primitive_elements {
+    ($values:expr, $arrow_type:ty, $element_type:expr, $writer:expr, $write_method:ident) => {{
+        let typed = match $values.as_primitive_opt::<$arrow_type>() {
+            Some(typed) => typed,
+            None => {
+                return Err(IllegalArgument {
+                    message: format!(
+                        "expected {} for {:?} element",
+                        stringify!($arrow_type),
+                        $element_type
+                    ),
+                });
+            }
+        };
+        for idx in 0..typed.len() {
+            if typed.is_null(idx) {
+                $writer.set_null_at(idx);
+                continue;
+            }
+            $writer.$write_method(idx, typed.value(idx));
+        }
+    }};
+}
+
+/// Copies every slot of a concrete Arrow array type into a writer.
+///
+/// `$values` is downcast once with `as_any().downcast_ref::<$array_type>()`; if
+/// that fails, the enclosing function returns `IllegalArgument`. Null slots are
+/// recorded with `set_null_at`, all others go through `$writer.$write_method`.
+macro_rules! write_downcast_elements {
+    ($values:expr, $array_type:ty, $element_type:expr, $writer:expr, $write_method:ident) => {{
+        let typed = match $values.as_any().downcast_ref::<$array_type>() {
+            Some(typed) => typed,
+            None => {
+                return Err(IllegalArgument {
+                    message: format!(
+                        "expected {} for {:?} element",
+                        stringify!($array_type),
+                        $element_type
+                    ),
+                });
+            }
+        };
+        for idx in 0..typed.len() {
+            if typed.is_null(idx) {
+                $writer.set_null_at(idx);
+                continue;
+            }
+            $writer.$write_method(idx, typed.value(idx));
+        }
+    }};
+}
+
+/// Copies every slot of an Arrow list array into a writer as nested arrays.
+///
+/// `$values` is downcast once to `$list_array_type`; if that fails, the
+/// enclosing function returns `IllegalArgument`. `$element_type` must be
+/// `DataType::Array` (anything else hits `unreachable!`). Each non-null slot is
+/// converted recursively with `write_arrow_values_to_fluss_array` and stored
+/// via `write_array`; null slots are recorded with `set_null_at`.
+macro_rules! write_list_elements {
+    ($values:expr, $list_array_type:ty, $len:expr, $element_type:expr, $writer:expr) => {{
+        let lists = match $values.as_any().downcast_ref::<$list_array_type>() {
+            Some(lists) => lists,
+            None => {
+                return Err(IllegalArgument {
+                    message: format!(
+                        "expected {} for {:?} element",
+                        stringify!($list_array_type),
+                        $element_type
+                    ),
+                });
+            }
+        };
+        // Element type of the inner arrays, shared by every slot.
+        let inner_type = match $element_type {
+            DataType::Array(array_type) => array_type.get_element_type(),
+            _ => unreachable!("Expected Array type for write_list_elements"),
+        };
+        for idx in 0..$len {
+            if lists.is_null(idx) {
+                $writer.set_null_at(idx);
+                continue;
+            }
+            let inner_values = lists.value(idx);
+            let mut inner_writer = FlussArrayWriter::new(inner_values.len(), inner_type);
+            write_arrow_values_to_fluss_array(&*inner_values, inner_type, &mut inner_writer)?;
+            let inner_array = inner_writer.complete()?;
+            $writer.write_array(idx, &inner_array);
+        }
+    }};
+}
+
+/// Converts all elements of an Arrow array into a `FlussArrayWriter`, downcasting
+/// the Arrow array once per call rather than per element.
+///
+/// The Fluss `element_type` decides which Arrow array type `values` is expected
+/// to be and which writer method receives each element. Null slots are written
+/// with `set_null_at`. Nested ARRAY, MAP and ROW elements are converted
+/// recursively.
+///
+/// Returns `IllegalArgument` when `values` is not the Arrow array type that
+/// `element_type` requires; errors from nested conversions are propagated.
+fn write_arrow_values_to_fluss_array(
+    values: &dyn Array,
+    element_type: &DataType,
+    writer: &mut FlussArrayWriter,
+) -> Result<()> {
+    let len = values.len();
+
+    match element_type {
+        DataType::Boolean(_) => {
+            write_downcast_elements!(values, BooleanArray, element_type, writer, write_boolean)
+        }
+        DataType::TinyInt(_) => {
+            write_primitive_elements!(values, Int8Type, element_type, writer, write_byte)
+        }
+        DataType::SmallInt(_) => {
+            write_primitive_elements!(values, Int16Type, element_type, writer, write_short)
+        }
+        DataType::Int(_) => {
+            write_primitive_elements!(values, Int32Type, element_type, writer, write_int)
+        }
+        DataType::BigInt(_) => {
+            write_primitive_elements!(values, Int64Type, element_type, writer, write_long)
+        }
+        DataType::Float(_) => {
+            write_primitive_elements!(values, Float32Type, element_type, writer, write_float)
+        }
+        DataType::Double(_) => {
+            write_primitive_elements!(values, Float64Type, element_type, writer, write_double)
+        }
+        // CHAR and STRING are both read from an Arrow `StringArray`.
+        DataType::Char(_) | DataType::String(_) => {
+            write_downcast_elements!(values, StringArray, element_type, writer, write_string)
+        }
+        DataType::Binary(_) => {
+            write_downcast_elements!(
+                values,
+                FixedSizeBinaryArray,
+                element_type,
+                writer,
+                write_binary_bytes
+            )
+        }
+        DataType::Bytes(_) => {
+            write_downcast_elements!(
+                values,
+                BinaryArray,
+                element_type,
+                writer,
+                write_binary_bytes
+            )
+        }
+        DataType::Decimal(dt) => {
+            let arr =
+                values
+                    .as_primitive_opt::<Decimal128Type>()
+                    .ok_or_else(|| IllegalArgument {
+                        message: format!("expected Decimal128Array for {element_type:?} element"),
+                    })?;
+            // The scale of the stored values comes from the Arrow type; the
+            // target precision/scale come from the Fluss type.
+            let arrow_scale = match values.data_type() {
+                ArrowDataType::Decimal128(_p, s) => *s as i64,
+                other => {
+                    return Err(IllegalArgument {
+                        message: format!(
+                            "expected Decimal128 data type for {element_type:?} element, got {other:?}"
+                        ),
+                    });
+                }
+            };
+            let precision = dt.precision();
+            let scale = dt.scale();
+            for i in 0..len {
+                if arr.is_null(i) {
+                    writer.set_null_at(i);
+                } else {
+                    let d = Decimal::from_arrow_decimal128(
+                        arr.value(i),
+                        arrow_scale,
+                        precision,
+                        scale,
+                    )?;
+                    writer.write_decimal(i, &d, precision);
+                }
+            }
+        }
+        DataType::Date(_) => {
+            let arr = values
+                .as_primitive_opt::<Date32Type>()
+                .ok_or_else(|| IllegalArgument {
+                    message: format!("expected Date32Array for {element_type:?} element"),
+                })?;
+            for i in 0..len {
+                if arr.is_null(i) {
+                    writer.set_null_at(i);
+                } else {
+                    writer.write_date(i, Date::new(arr.value(i)));
+                }
+            }
+        }
+        DataType::Time(_) => {
+            write_time_elements(values, element_type, writer)?;
+        }
+        DataType::Timestamp(ts_type) => {
+            write_timestamp_elements(
+                values,
+                element_type,
+                writer,
+                ts_type.precision(),
+                TimestampNtz::new,
+                TimestampNtz::from_millis_nanos,
+                |w, i, ts, p| w.write_timestamp_ntz(i, &ts, p),
+            )?;
+        }
+        DataType::TimestampLTz(ts_type) => {
+            write_timestamp_elements(
+                values,
+                element_type,
+                writer,
+                ts_type.precision(),
+                TimestampLtz::new,
+                TimestampLtz::from_millis_nanos,
+                |w, i, ts, p| w.write_timestamp_ltz(i, &ts, p),
+            )?;
+        }
+        DataType::Array(_) => {
+            // Check the concrete type up front so the error can report the
+            // actual Arrow data type.
+            if values.as_any().is::<ListArray>() {
+                write_list_elements!(values, ListArray, len, element_type, writer);
+            } else {
+                return Err(IllegalArgument {
+                    message: format!(
+                        "expected ListArray for {element_type:?} element, got {:?}",
+                        values.data_type()
+                    ),
+                });
+            }
+        }
+        // The map type is bound here once, so the loop below does not have to
+        // re-match `element_type` for every element.
+        DataType::Map(map_type) => {
+            let map_arr =
+                values
+                    .as_any()
+                    .downcast_ref::<MapArray>()
+                    .ok_or_else(|| IllegalArgument {
+                        message: format!(
+                            "Expected MapArray for {element_type:?} element, got {:?}",
+                            values.data_type()
+                        ),
+                    })?;
+            for i in 0..len {
+                if map_arr.is_null(i) {
+                    writer.set_null_at(i);
+                } else {
+                    let fluss_map = arrow_map_entry_to_fluss_map(
+                        &map_arr.value(i),
+                        map_type.key_type(),
+                        map_type.value_type(),
+                    )?;
+                    writer.write_map(i, &fluss_map);
+                }
+            }
+        }
+        DataType::Row(row_type) => {
+            let struct_arr = values
+                .as_any()
+                .downcast_ref::<StructArray>()
+                .ok_or_else(|| IllegalArgument {
+                    message: format!(
+                        "expected StructArray for {element_type:?} element, got {:?}",
+                        values.data_type()
+                    ),
+                })?;
+            for i in 0..len {
+                if struct_arr.is_null(i) {
+                    writer.set_null_at(i);
+                } else {
+                    let nested = extract_struct_from_array(struct_arr, i, Some(row_type))?;
+                    writer.write_row(i, &nested)?;
+                }
+            }
+        }
+    }
+    Ok(())
+}
+
+/// Writes every element of an Arrow time array into `writer` as a `Time`
+/// holding milliseconds.
+///
+/// The unit is taken from the Arrow data type of `values`: seconds are
+/// multiplied by 1000, milliseconds are used as-is, and micro/nanoseconds are
+/// divided down to milliseconds. Null slots are written with `set_null_at`.
+/// Returns `IllegalArgument` when `values` is not a `Time32`/`Time64` column of
+/// a handled unit.
+fn write_time_elements(
+    values: &dyn Array,
+    element_type: &DataType,
+    writer: &mut FlussArrayWriter,
+) -> Result<()> {
+    // Downcasts `values` to the given primitive type and writes each slot,
+    // converting the raw value to milliseconds with the supplied closure.
+    macro_rules! process_time {
+        ($arrow_type:ty, $to_millis:expr) => {{
+            let typed = match values.as_primitive_opt::<$arrow_type>() {
+                Some(typed) => typed,
+                None => {
+                    return Err(IllegalArgument {
+                        message: format!(
+                            "expected {} for {:?} element",
+                            stringify!($arrow_type),
+                            element_type
+                        ),
+                    });
+                }
+            };
+            let as_millis = $to_millis;
+            for idx in 0..typed.len() {
+                if typed.is_null(idx) {
+                    writer.set_null_at(idx);
+                    continue;
+                }
+                writer.write_time(idx, Time::new(as_millis(typed.value(idx))));
+            }
+        }};
+    }
+
+    match values.data_type() {
+        ArrowDataType::Time32(TimeUnit::Second) => {
+            process_time!(Time32SecondType, |secs: i32| secs * 1000);
+        }
+        ArrowDataType::Time32(TimeUnit::Millisecond) => {
+            process_time!(Time32MillisecondType, |millis: i32| millis);
+        }
+        ArrowDataType::Time64(TimeUnit::Microsecond) => {
+            process_time!(Time64MicrosecondType, |micros: i64| (micros / 1000) as i32);
+        }
+        ArrowDataType::Time64(TimeUnit::Nanosecond) => {
+            process_time!(Time64NanosecondType, |nanos: i64| (nanos / 1_000_000) as i32);
+        }
+        other => {
+            return Err(IllegalArgument {
+                message: format!(
+                    "expected Time column for {element_type:?} element, got {other:?}"
+                ),
+            });
+        }
+    }
+    Ok(())
+}
+
+/// Splits a raw Arrow timestamp value into `(milliseconds, nanoseconds within
+/// that millisecond)` according to its `unit`.
+///
+/// Second and millisecond inputs always yield `0` nanoseconds. Micro- and
+/// nanosecond inputs use Euclidean division, so the nanosecond part is
+/// non-negative even for negative `raw` values.
+fn convert_timestamp_raw(raw: i64, unit: &TimeUnit) -> (i64, i32) {
+    // For sub-millisecond units: (units per millisecond, nanoseconds per unit).
+    let (units_per_milli, nanos_per_unit): (i64, i64) = match unit {
+        TimeUnit::Second => return (raw * 1000, 0),
+        TimeUnit::Millisecond => return (raw, 0),
+        TimeUnit::Microsecond => (1000, 1000),
+        TimeUnit::Nanosecond => (1_000_000, 1),
+    };
+    let millis = raw.div_euclid(units_per_milli);
+    let nanos = (raw.rem_euclid(units_per_milli) * nanos_per_unit) as i32;
+    (millis, nanos)
+}
+
+/// Writes every element of an Arrow timestamp array into `writer`.
+///
+/// Each raw value is split by `convert_timestamp_raw` into milliseconds and
+/// nanoseconds-of-millisecond. `construct_compact` builds the timestamp when
+/// the nanosecond part is zero, `construct_with_nanos` otherwise, and
+/// `write_fn` stores the result at the element index with `precision`. Null
+/// slots are written with `set_null_at`.
+///
+/// Returns `IllegalArgument` when `values` is not an Arrow `Timestamp` column;
+/// errors from `construct_with_nanos` are propagated.
+fn write_timestamp_elements<T>(
+    values: &dyn Array,
+    element_type: &DataType,
+    writer: &mut FlussArrayWriter,
+    precision: u32,
+    construct_compact: impl Fn(i64) -> T,
+    construct_with_nanos: impl Fn(i64, i32) -> Result<T>,
+    write_fn: impl Fn(&mut FlussArrayWriter, usize, T, u32),
+) -> Result<()> {
+    let unit = match values.data_type() {
+        ArrowDataType::Timestamp(ts_unit, _) => ts_unit,
+        other => {
+            return Err(IllegalArgument {
+                message: format!(
+                    "expected Timestamp column for {element_type:?} element, got {other:?}"
+                ),
+            });
+        }
+    };
+
+    // Downcasts `values` to the given timestamp primitive type and writes
+    // each slot.
+    macro_rules! process_ts {
+        ($arrow_type:ty) => {{
+            let typed = match values.as_primitive_opt::<$arrow_type>() {
+                Some(typed) => typed,
+                None => {
+                    return Err(IllegalArgument {
+                        message: format!(
+                            "expected {} for {:?} element",
+                            stringify!($arrow_type),
+                            element_type
+                        ),
+                    });
+                }
+            };
+            for idx in 0..typed.len() {
+                if typed.is_null(idx) {
+                    writer.set_null_at(idx);
+                } else {
+                    let (millis, nanos) = convert_timestamp_raw(typed.value(idx), unit);
+                    let ts = match nanos {
+                        0 => construct_compact(millis),
+                        _ => construct_with_nanos(millis, nanos)?,
+                    };
+                    write_fn(writer, idx, ts, precision);
+                }
+            }
+        }};
+    }
+
+    match unit {
+        TimeUnit::Second => process_ts!(TimestampSecondType),
+        TimeUnit::Millisecond => process_ts!(TimestampMillisecondType),
+        TimeUnit::Microsecond => process_ts!(TimestampMicrosecondType),
+        TimeUnit::Nanosecond => process_ts!(TimestampNanosecondType),
+    }
+    Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::metadata::{DataField, RowType};
+    use arrow::array::{
+        ArrayRef, BinaryArray, BooleanArray, Decimal128Array, Float32Array, Float64Array,
+        Int8Array, Int16Array, Int32Array, Int32Builder, Int64Array, ListBuilder, StringArray,
+        StructArray, UInt32Builder,
+    };
+    use arrow::datatypes::{DataType, Field, Fields, Schema};
+
+    fn infer_fluss_type(arrow_dt: &arrow_schema::DataType) -> crate::metadata::DataType {
+        match arrow_dt {
+            arrow_schema::DataType::Int32 => {
+                crate::metadata::DataType::Int(crate::metadata::IntType::new())
+            }
+            arrow_schema::DataType::List(f) => crate::metadata::DataType::Array(
+                crate::metadata::ArrayType::new(infer_fluss_type(f.data_type())),
+            ),
+            _ => crate::metadata::DataType::Int(crate::metadata::IntType::new()),
+        }
+    }
+
+    fn single_column_row(array: ArrayRef) -> ColumnarRow {
+        let dt = infer_fluss_type(array.data_type());
+        let batch =
+            RecordBatch::try_from_iter(vec![("arr", array)]).expect("record batch with one column");
+        let row_type = Arc::new(RowType::with_data_types(vec![dt]));
+        ColumnarRow::new(Arc::new(batch), row_type, 0, None)
+    }
+
+    #[test]
+    fn columnar_row_reads_values() {
+        // One single-row column per scalar getter exercised below; the field
+        // order fixes the column indices used by the assertions.
+        let fields = vec![
+            Field::new("b", ArrowDataType::Boolean, false),
+            Field::new("i8", ArrowDataType::Int8, false),
+            Field::new("i16", ArrowDataType::Int16, false),
+            Field::new("i32", ArrowDataType::Int32, false),
+            Field::new("i64", ArrowDataType::Int64, false),
+            Field::new("f32", ArrowDataType::Float32, false),
+            Field::new("f64", ArrowDataType::Float64, false),
+            Field::new("s", ArrowDataType::Utf8, false),
+            Field::new("bin", ArrowDataType::Binary, false),
+            Field::new("char", ArrowDataType::Utf8, false),
+        ];
+        let columns: Vec<ArrayRef> = vec![
+            Arc::new(BooleanArray::from(vec![true])),
+            Arc::new(Int8Array::from(vec![1])),
+            Arc::new(Int16Array::from(vec![2])),
+            Arc::new(Int32Array::from(vec![3])),
+            Arc::new(Int64Array::from(vec![4])),
+            Arc::new(Float32Array::from(vec![1.25])),
+            Arc::new(Float64Array::from(vec![2.5])),
+            Arc::new(StringArray::from(vec!["hello"])),
+            Arc::new(BinaryArray::from(vec![b"data".as_slice()])),
+            Arc::new(StringArray::from(vec!["ab"])),
+        ];
+        let batch =
+            RecordBatch::try_new(Arc::new(Schema::new(fields)), columns).expect("record batch");
+
+        let mut row = ColumnarRow::new(Arc::new(batch), Arc::new(RowType::new(vec![])), 0, None);
+        assert_eq!(row.get_field_count(), 10);
+
+        // Boolean and integer getters.
+        assert!(row.get_boolean(0).unwrap());
+        assert_eq!(row.get_byte(1).unwrap(), 1);
+        assert_eq!(row.get_short(2).unwrap(), 2);
+        assert_eq!(row.get_int(3).unwrap(), 3);
+        assert_eq!(row.get_long(4).unwrap(), 4);
+
+        // Floating-point getters (values are exactly representable).
+        assert_eq!(row.get_float(5).unwrap(), 1.25);
+        assert_eq!(row.get_double(6).unwrap(), 2.5);
+
+        // String, binary and fixed-length char getters.
+        assert_eq!(row.get_string(7).unwrap(), "hello");
+        assert_eq!(row.get_bytes(8).unwrap(), b"data");
+        assert_eq!(row.get_char(9, 2).unwrap(), "ab");
+
+        // The row id set on the row is the one reported back.
+        row.set_row_id(0);
+        assert_eq!(row.get_row_id(), 0);
+    }
+
+    #[test]
+    fn columnar_row_reads_decimal() {
+        use bigdecimal::{BigDecimal, num_bigint::BigInt};
+
+        // Unscaled Decimal128 payloads, one per column.
+        let unscaled_a = 12345i128; // 123.45 at scale 2
+        let unscaled_b = 1234567890i128; // 12345.67890 at scale 5
+        let unscaled_c = 999999999999999999i128; // 18 nines, stored at scale 10
+
+        let fields = vec![
+            Field::new("dec1", ArrowDataType::Decimal128(10, 2), false),
+            Field::new("dec2", ArrowDataType::Decimal128(20, 5), false),
+            Field::new("dec3", ArrowDataType::Decimal128(38, 10), false),
+        ];
+        let columns: Vec<ArrayRef> = vec![
+            Arc::new(
+                Decimal128Array::from(vec![unscaled_a])
+                    .with_precision_and_scale(10, 2)
+                    .unwrap(),
+            ),
+            Arc::new(
+                Decimal128Array::from(vec![unscaled_b])
+                    .with_precision_and_scale(20, 5)
+                    .unwrap(),
+            ),
+            Arc::new(
+                Decimal128Array::from(vec![unscaled_c])
+                    .with_precision_and_scale(38, 10)
+                    .unwrap(),
+            ),
+        ];
+        let batch =
+            RecordBatch::try_new(Arc::new(Schema::new(fields)), columns).expect("record batch");
+
+        let row = ColumnarRow::new(Arc::new(batch), Arc::new(RowType::new(vec![])), 0, None);
+        assert_eq!(row.get_field_count(), 3);
+
+        // Each column must decode to the decimal built from the same unscaled
+        // value with the column's precision and scale.
+        let expected_a =
+            Decimal::from_big_decimal(BigDecimal::new(BigInt::from(unscaled_a), 2), 10, 2).unwrap();
+        assert_eq!(row.get_decimal(0, 10, 2).unwrap(), expected_a);
+
+        let expected_b =
+            Decimal::from_big_decimal(BigDecimal::new(BigInt::from(unscaled_b), 5), 20, 5).unwrap();
+        assert_eq!(row.get_decimal(1, 20, 5).unwrap(), expected_b);
+
+        let expected_c =
+            Decimal::from_big_decimal(BigDecimal::new(BigInt::from(unscaled_c), 10), 38, 10)
+                .unwrap();
+        assert_eq!(row.get_decimal(2, 38, 10).unwrap(), expected_c);
+    }
+
+    #[test]
+    fn columnar_row_get_array_int_roundtrip() {
+        // A single list row holding [1, 2, 3].
+        let mut list_builder = ListBuilder::new(Int32Builder::new());
+        for element in [1, 2, 3] {
+            list_builder.values().append_value(element);
+        }
+        list_builder.append(true);
+        let column: ArrayRef = Arc::new(list_builder.finish());
+
+        let row = single_column_row(column);
+        let elements = row.get_array(0).unwrap();
+
+        // Every element comes back in insertion order.
+        assert_eq!(elements.size(), 3);
+        assert_eq!(elements.get_int(0).unwrap(), 1);
+        assert_eq!(elements.get_int(1).unwrap(), 2);
+        assert_eq!(elements.get_int(2).unwrap(), 3);
+    }
+
+    #[test]
+    fn columnar_row_get_array_with_nulls() {
+        // A single list row holding [1, NULL, 3].
+        let mut list_builder = ListBuilder::new(Int32Builder::new());
+        for element in [Some(1), None, Some(3)] {
+            list_builder.values().append_option(element);
+        }
+        list_builder.append(true);
+        let column: ArrayRef = Arc::new(list_builder.finish());
+
+        let row = single_column_row(column);
+        let elements = row.get_array(0).unwrap();
+
+        // The null slot still counts towards the size and is reported as null.
+        assert_eq!(elements.size(), 3);
+        assert_eq!(elements.get_int(0).unwrap(), 1);
+        assert!(elements.is_null_at(1));
+        assert_eq!(elements.get_int(2).unwrap(), 3);
+    }
+
+    #[test]
+    fn columnar_row_get_array_nested_array() {
+        // One row whose value is [[1, 2], [99]].
+        let mut outer_builder = ListBuilder::new(ListBuilder::new(Int32Builder::new()));
+        {
+            let inner_lists = outer_builder.values();
+
+            // First inner list: [1, 2].
+            inner_lists.values().append_value(1);
+            inner_lists.values().append_value(2);
+            inner_lists.append(true);
+
+            // Second inner list: [99].
+            inner_lists.values().append_value(99);
+            inner_lists.append(true);
+        }
+        // Close the single outer row that owns both inner lists.
+        outer_builder.append(true);
+        let column: ArrayRef = Arc::new(outer_builder.finish());
+
+        let row = single_column_row(column);
+        let outer = row.get_array(0).unwrap();
+        assert_eq!(outer.size(), 2);
+
+        let first = outer.get_array(0).unwrap();
+        assert_eq!(first.size(), 2);
+        assert_eq!(first.get_int(0).unwrap(), 1);
+        assert_eq!(first.get_int(1).unwrap(), 2);
+
+        let second = outer.get_array(1).unwrap();
+        assert_eq!(second.size(), 1);
+        assert_eq!(second.get_int(0).unwrap(), 99);
+    }
+
+    #[test]
+    fn columnar_row_get_array_non_list_column_returns_error() {
+        // A plain Int32 column is not a list, so get_array must refuse it.
+        let column: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3]));
+        let row = single_column_row(column);
+
+        let err = row.get_array(0).unwrap_err();
+        let mentions_array_type = err.to_string().contains("expected Array type");
+        assert!(mentions_array_type, "unexpected error: {err}");
+    }
+
+    #[test]
+    fn columnar_row_get_array_unsupported_element_type_returns_error() {
+        use crate::metadata::DataTypes;
+
+        // Arrow side: a List(UInt32) column with the single row [7].
+        let mut list_builder = ListBuilder::new(UInt32Builder::new());
+        list_builder.values().append_value(7);
+        list_builder.append(true);
+        let column: ArrayRef = Arc::new(list_builder.finish());
+        let batch = RecordBatch::try_from_iter(vec![("arr", column)]).expect("record batch");
+
+        // Fluss side: deliberately declare the column as Array(Int) so the
+        // declared element type disagrees with the Arrow data and get_array's
+        // validation is exercised.
+        let declared_type = DataTypes::array(DataTypes::int());
+        let row_type = Arc::new(RowType::new(vec![DataField::new(
+            "arr",
+            declared_type,
+            None,
+        )]));
+        let row = ColumnarRow::new(Arc::new(batch), row_type, 0, None);
+
+        let err = row.get_array(0).unwrap_err();
+        let mentions_int32 = err.to_string().contains("expected Int32Type");
+        assert!(mentions_int32, "unexpected error: {err}");
+    }
+
+    /// Test helper: wraps `child_arrays` in a non-null struct column named
+    /// `field_name` and returns a record batch containing only that column.
+    ///
+    /// `_num_rows` is accepted for call-site readability but is not used; the
+    /// row count comes from the child arrays themselves.
+    fn make_struct_batch(
+        field_name: &str,
+        child_fields: Fields,
+        child_arrays: Vec<Arc<dyn Array>>,
+        _num_rows: usize,
+    ) -> Arc<RecordBatch> {
+        let struct_type = DataType::Struct(child_fields.clone());
+        let struct_column: ArrayRef = Arc::new(StructArray::new(child_fields, child_arrays, None));
+
+        let schema = Schema::new(vec![Field::new(field_name, struct_type, false)]);
+        let batch = RecordBatch::try_new(Arc::new(schema), vec![struct_column])
+            .expect("record batch");
+        Arc::new(batch)
+    }
+
+    #[test]
+    fn columnar_row_reads_nested_row() {
+        // Struct column { x: Int32, s: Utf8 } with rows (42, "hello") and (99, "world").
+        let struct_fields = Fields::from(vec![
+            Field::new("x", DataType::Int32, false),
+            Field::new("s", DataType::Utf8, false),
+        ]);
+        let struct_columns: Vec<Arc<dyn Array>> = vec![
+            Arc::new(Int32Array::from(vec![42, 99])),
+            Arc::new(StringArray::from(vec!["hello", "world"])),
+        ];
+        let batch = make_struct_batch("nested", struct_fields, struct_columns, 2);
+        let mut row = ColumnarRow::new(batch, Arc::new(RowType::new(vec![])), 0, None);
+
+        // The row is created at row_id 0, so the first struct entry is visible.
+        {
+            let first = row.get_row(0).unwrap();
+            assert_eq!(first.get_field_count(), 2);
+            assert_eq!(first.get_int(0).unwrap(), 42);
+            assert_eq!(first.get_string(1).unwrap(), "hello");
+        }
+
+        // Moving to row_id 1 must surface the second struct entry.
+        row.set_row_id(1);
+        {
+            let second = row.get_row(0).unwrap();
+            assert_eq!(second.get_int(0).unwrap(), 99);
+            assert_eq!(second.get_string(1).unwrap(), "world");
+        }
+    }
+
+    #[test]
+    fn columnar_row_reads_deeply_nested_row() {
+        // Layout under test: outer struct { n: Int32, inner: struct { s: Utf8 } },
+        // two rows: (1, ("deep")) and (2, ("deeper")).
+
+        // Innermost level: struct { s }.
+        let inner_fields = Fields::from(vec![Field::new("s", DataType::Utf8, false)]);
+        let leaf_strings: ArrayRef = Arc::new(StringArray::from(vec!["deep", "deeper"]));
+        let inner_column: ArrayRef = Arc::new(StructArray::new(
+            inner_fields.clone(),
+            vec![leaf_strings],
+            None,
+        ));
+
+        // Outer level: struct { n, inner }.
+        let outer_fields = Fields::from(vec![
+            Field::new("n", DataType::Int32, false),
+            Field::new("inner", DataType::Struct(inner_fields), false),
+        ]);
+        let numbers: ArrayRef = Arc::new(Int32Array::from(vec![1, 2]));
+        let outer_column: ArrayRef = Arc::new(StructArray::new(
+            outer_fields.clone(),
+            vec![numbers, inner_column],
+            None,
+        ));
+
+        let schema = Schema::new(vec![Field::new(
+            "outer",
+            DataType::Struct(outer_fields),
+            false,
+        )]);
+        let batch = Arc::new(
+            RecordBatch::try_new(Arc::new(schema), vec![outer_column]).expect("record batch"),
+        );
+
+        let mut row = ColumnarRow::new(batch, Arc::new(RowType::new(vec![])), 0, None);
+
+        // First row (row_id = 0).
+        {
+            let level1 = row.get_row(0).unwrap();
+            assert_eq!(level1.get_int(0).unwrap(), 1);
+            let level2 = level1.get_row(1).unwrap();
+            assert_eq!(level2.get_string(0).unwrap(), "deep");
+        }
+
+        // Second row (row_id = 1).
+        row.set_row_id(1);
+        {
+            let level1 = row.get_row(0).unwrap();
+            assert_eq!(level1.get_int(0).unwrap(), 2);
+            let level2 = level1.get_row(1).unwrap();
+            assert_eq!(level2.get_string(0).unwrap(), "deeper");
+        }
+    }
+
+    #[test]
+    fn columnar_row_get_row_cache_invalidated_on_set_row_id() {
+        // Struct column { x: Int32 } with x = 10 in row 0 and x = 20 in row 1.
+        let struct_fields = Fields::from(vec![Field::new("x", DataType::Int32, false)]);
+        let struct_columns: Vec<Arc<dyn Array>> = vec![Arc::new(Int32Array::from(vec![10, 20]))];
+        let batch = make_struct_batch("s", struct_fields, struct_columns, 2);
+
+        let mut row = ColumnarRow::new(batch, Arc::new(RowType::new(vec![])), 0, None);
+
+        // Read the nested row once at row_id 0.
+        let before_move = row.get_row(0).unwrap().get_int(0).unwrap();
+        assert_eq!(before_move, 10);
+
+        // A stale nested row from row 0 must not be served after moving on:
+        // the read at row_id 1 has to reflect the second struct entry.
+        row.set_row_id(1);
+        let after_move = row.get_row(0).unwrap().get_int(0).unwrap();
+        assert_eq!(after_move, 20);
+    }
+
+    #[test]
+    fn columnar_row_get_map_accepts_non_nullable_key_from_map_type() {
+        use crate::metadata::DataTypes;
+        use arrow::array::{MapBuilder, StringBuilder};
+
+        // Arrow side: a map column (INT keys, STRING values) whose only row is {1 -> "a"}.
+        let mut map_builder = MapBuilder::new(None, Int32Builder::new(), StringBuilder::new());
+        map_builder.keys().append_value(1);
+        map_builder.values().append_value("a");
+        map_builder.append(true).unwrap();
+        let map_column = map_builder.finish();
+
+        // Reuse the Arrow type produced by the builder for the schema field.
+        let schema = Schema::new(vec![Field::new(
+            "m",
+            map_column.data_type().clone(),
+            true,
+        )]);
+        let columns: Vec<ArrayRef> = vec![Arc::new(map_column)];
+        let batch =
+            Arc::new(RecordBatch::try_new(Arc::new(schema), columns).expect("record batch"));
+
+        // Fluss side: MAP<INT, STRING> built through the DataTypes factory.
+        let declared_map = DataTypes::map(DataTypes::int(), DataTypes::string());
+        let row_type = Arc::new(RowType::with_data_types(vec![declared_map]));
+        let row = ColumnarRow::new(batch, row_type, 0, None);
+
+        let map = row
+            .get_map(0)
+            .expect("get_map should succeed on ColumnarRow");
+        assert_eq!(map.size(), 1);
+        assert_eq!(map.key_array().get_int(0).unwrap(), 1);
+        assert_eq!(map.value_array().get_string(0).unwrap(), "a");
+    }
+
+    #[test]
+    fn columnar_row_reads_row_containing_map() {
+        use crate::metadata::DataTypes;
+        use arrow::array::{MapBuilder, StringBuilder};
+
+        // Child Map<String, Int> column: row 0 = {"k1" -> 42}, row 1 = {"k2" -> 7}.
+        let mut map_builder = MapBuilder::new(None, StringBuilder::new(), Int32Builder::new());
+        for (key, value) in [("k1", 42), ("k2", 7)] {
+            map_builder.keys().append_value(key);
+            map_builder.values().append_value(value);
+            map_builder.append(true).unwrap();
+        }
+        let map_column = map_builder.finish();
+
+        // Arrow struct { id: Int32, m: Map<String, Int> } wrapping both children.
+        let struct_fields = Fields::from(vec![
+            Field::new("id", DataType::Int32, false),
+            Field::new("m", map_column.data_type().clone(), false),
+        ]);
+        let children: Vec<ArrayRef> = vec![
+            Arc::new(Int32Array::from(vec![10, 20])),
+            Arc::new(map_column),
+        ];
+        let struct_column: ArrayRef =
+            Arc::new(StructArray::new(struct_fields.clone(), children, None));
+        let schema = Schema::new(vec![Field::new(
+            "outer",
+            DataType::Struct(struct_fields),
+            false,
+        )]);
+        let batch = Arc::new(
+            RecordBatch::try_new(Arc::new(schema), vec![struct_column]).expect("record batch"),
+        );
+
+        // Fluss view of the same data: ROW<id INT, m MAP<STRING, INT>> as the only column.
+        let nested_row_type = RowType::with_data_types(vec![
+            DataTypes::int(),
+            DataTypes::map(DataTypes::string(), DataTypes::int()),
+        ]);
+        let outer_row_type = Arc::new(RowType::with_data_types(vec![
+            crate::metadata::DataType::Row(nested_row_type),
+        ]));
+
+        let mut row = ColumnarRow::new(
+            batch,
+            outer_row_type.clone(),
+            0,
+            Some(outer_row_type.clone()),
+        );
+
+        // Row 0: id = 10, map = {"k1" -> 42}.
+        let struct_row = row
+            .get_row(0)
+            .expect("reading row with Map field must succeed");
+        assert_eq!(struct_row.get_int(0).unwrap(), 10);
+        let map = struct_row
+            .get_map(1)
+            .expect("nested map should be accessible");
+        assert_eq!(map.size(), 1);
+        assert_eq!(map.key_array().get_string(0).unwrap(), "k1");
+        assert_eq!(map.value_array().get_int(0).unwrap(), 42);
+
+        // Row 1: id = 20, map = {"k2" -> 7}. Also checks that moving the row id
+        // refreshes the nested row when it contains a map.
+        row.set_row_id(1);
+        let struct_row = row.get_row(0).expect("row 1 must read");
+        assert_eq!(struct_row.get_int(0).unwrap(), 20);
+        let map = struct_row.get_map(1).unwrap();
+        assert_eq!(map.key_array().get_string(0).unwrap(), "k2");
+        assert_eq!(map.value_array().get_int(0).unwrap(), 7);
+    }
+
+    #[test]
+    fn columnar_row_reads_array_of_maps() {
+        use crate::metadata::DataTypes;
+        use arrow::array::{ListBuilder, MapBuilder, StringBuilder};
+
+        // Single row of type ARRAY<MAP<STRING, INT>> holding
+        // [{"k1" -> 1}, {"k2" -> 2, "k3" -> 3}].
+        let map_builder = MapBuilder::new(None, StringBuilder::new(), Int32Builder::new());
+        let mut list_builder = ListBuilder::new(map_builder);
+
+        // First map: {"k1" -> 1}.
+        {
+            let maps = list_builder.values();
+            maps.keys().append_value("k1");
+            maps.values().append_value(1);
+            maps.append(true).unwrap();
+        }
+        // Second map: {"k2" -> 2, "k3" -> 3}.
+        {
+            let maps = list_builder.values();
+            maps.keys().append_value("k2");
+            maps.values().append_value(2);
+            maps.keys().append_value("k3");
+            maps.values().append_value(3);
+            maps.append(true).unwrap();
+        }
+        // Close the one list row that owns both maps.
+        list_builder.append(true);
+        let list_column = list_builder.finish();
+
+        let schema = Schema::new(vec![Field::new(
+            "a",
+            list_column.data_type().clone(),
+            false,
+        )]);
+        let columns: Vec<ArrayRef> = vec![Arc::new(list_column)];
+        let batch =
+            Arc::new(RecordBatch::try_new(Arc::new(schema), columns).expect("record batch"));
+
+        let declared_type = DataTypes::array(DataTypes::map(DataTypes::string(), DataTypes::int()));
+        let row_type = Arc::new(RowType::with_data_types(vec![declared_type]));
+        let row = ColumnarRow::new(batch, row_type, 0, None);
+
+        let maps = row.get_array(0).expect("get_array on ARRAY<MAP> must work");
+        assert_eq!(maps.size(), 2);
+
+        // Key/value types handed to every get_map call below.
+        let key_type = DataTypes::string();
+        let value_type = DataTypes::int();
+
+        let first = maps.get_map(0, &key_type, &value_type).unwrap();
+        assert_eq!(first.size(), 1);
+        assert_eq!(first.key_array().get_string(0).unwrap(), "k1");
+        assert_eq!(first.value_array().get_int(0).unwrap(), 1);
+
+        let second = maps.get_map(1, &key_type, &value_type).unwrap();
+        assert_eq!(second.size(), 2);
+        assert_eq!(second.key_array().get_string(0).unwrap(), "k2");
+        assert_eq!(second.value_array().get_int(0).unwrap(), 2);
+        assert_eq!(second.key_array().get_string(1).unwrap(), "k3");
+        assert_eq!(second.value_array().get_int(1).unwrap(), 3);
+    }
+
+    #[test]
+    fn columnar_row_get_map_rejects_real_type_mismatch() {
+        use crate::metadata::DataTypes;
+        use arrow::array::{MapBuilder, StringBuilder};
+
+        // Arrow side: Map<String, Int> with the single entry {"k" -> 1}.
+        let mut map_builder = MapBuilder::new(None, StringBuilder::new(), Int32Builder::new());
+        map_builder.keys().append_value("k");
+        map_builder.values().append_value(1);
+        map_builder.append(true).unwrap();
+        let map_column = map_builder.finish();
+
+        let schema = Schema::new(vec![Field::new(
+            "m",
+            map_column.data_type().clone(),
+            true,
+        )]);
+        let columns: Vec<ArrayRef> = vec![Arc::new(map_column)];
+        let batch =
+            Arc::new(RecordBatch::try_new(Arc::new(schema), columns).expect("record batch"));
+
+        // Fluss side: the value type is wrongly declared as STRING while the
+        // Arrow values are Int32.
+        let declared_map = DataTypes::map(DataTypes::string(), DataTypes::string());
+        let row_type = Arc::new(RowType::with_data_types(vec![declared_map]));
+        let row = ColumnarRow::new(batch, row_type, 0, None);
+
+        let msg = row
+            .get_map(0)
+            .expect_err("type mismatch must error")
+            .to_string();
+        let mentions_string_array = msg.contains("expected StringArray");
+        assert!(mentions_string_array, "unexpected error: {msg}");
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/row/column_writer.rs b/fluss-rust/crates/fluss/src/row/column_writer.rs
new file mode 100644
index 0000000000..85776bc53e
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/row/column_writer.rs
@@ -0,0 +1,1402 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Typed column writers that write directly from [`InternalRow`] to concrete
+//! Arrow builders, bypassing the intermediate [`Datum`] enum and runtime
+//! `downcast_mut` dispatch.
+
+use crate::error::Error::RowConvertError;
+use crate::error::{Error, Result};
+use crate::metadata::{DataType, RowType};
+use crate::row::datum::{
+    MICROS_PER_MILLI, MILLIS_PER_SECOND, NANOS_PER_MILLI, append_decimal_to_builder,
+    millis_nanos_to_micros, millis_nanos_to_nanos,
+};
+use crate::row::{FlussArray, FlussMap, InternalRow};
+use arrow::array::{
+    ArrayBuilder, ArrayRef, BinaryBuilder, BooleanBuilder, Date32Builder, Decimal128Builder,
+    FixedSizeBinaryBuilder, Float32Builder, Float64Builder, Int8Builder, Int16Builder,
+    Int32Builder, Int64Builder, StringBuilder, Time32MillisecondBuilder, Time32SecondBuilder,
+    Time64MicrosecondBuilder, Time64NanosecondBuilder, TimestampMicrosecondBuilder,
+    TimestampMillisecondBuilder, TimestampNanosecondBuilder, TimestampSecondBuilder,
+};
+use arrow_schema::DataType as ArrowDataType;
+
+/// Round `n` up to the next multiple of 8 (Arrow IPC buffer alignment).
+///
+/// Values that are already a multiple of 8 are returned unchanged.
+///
+/// The addition saturates instead of overflowing: for `n > usize::MAX - 7`
+/// the result is the largest multiple of 8 representable in `usize`, rather
+/// than a panic (debug builds) or a silent wrap-around to `0` (release
+/// builds).
+#[inline]
+pub(crate) fn round_up_to_8(n: usize) -> usize {
+    // Adding 7 and clearing the low three bits rounds up to a multiple of 8.
+    n.saturating_add(7) & !7
+}
+
+/// Estimated average byte size for variable-width columns (Utf8, Binary).
+/// Used to pre-allocate data buffers and avoid reallocations during batch building.
+/// Matches Java Arrow's `BaseVariableWidthVector.DEFAULT_RECORD_BYTE_COUNT`.
+///
+/// `ColumnWriter::create` multiplies this by the requested row capacity
+/// (saturating) to size the data buffer of the `Char`, `String` and `Bytes`
+/// builders. It is only an initial capacity for those builders, not a limit
+/// enforced by this module.
+const VARIABLE_WIDTH_AVG_BYTES: usize = 8;
+
+/// A typed column writer that reads one column from an [`InternalRow`] and
+/// appends directly to a concrete Arrow builder — no intermediate [`Datum`],
+/// no `as_any_mut().downcast_mut()`.
+///
+/// Instances are built with [`ColumnWriter::create`], which selects the
+/// [`TypedWriter`] variant from the Fluss type and, where several Arrow
+/// encodings exist (decimal, time, timestamp), from the Arrow type.
+pub struct ColumnWriter {
+    /// Column position this writer is responsible for (the `pos` argument of
+    /// `create`).
+    pos: usize,
+    /// Nullability of the column; `create` derives it from
+    /// `fluss_type.is_nullable()`.
+    nullable: bool,
+    /// The concrete Arrow builder (plus any per-type parameters) values are
+    /// appended to.
+    inner: TypedWriter,
+}
+
+/// The concrete Arrow builder behind a [`ColumnWriter`], one variant per
+/// supported (Fluss type, Arrow encoding) combination.
+///
+/// Scalar variants wrap exactly one Arrow builder; variants that need extra
+/// per-column parameters (lengths, precisions, scales) carry them next to the
+/// builder. The nested variants (`List`, `Map`, `Struct`) hold child writers
+/// instead of a single builder and are therefore not reachable through
+/// `with_builder!`.
+enum TypedWriter {
+    /// Fluss `Boolean`.
+    Bool(BooleanBuilder),
+    /// Fluss `TinyInt`.
+    Int8(Int8Builder),
+    /// Fluss `SmallInt`.
+    Int16(Int16Builder),
+    /// Fluss `Int`.
+    Int32(Int32Builder),
+    /// Fluss `BigInt`.
+    Int64(Int64Builder),
+    /// Fluss `Float`.
+    Float32(Float32Builder),
+    /// Fluss `Double`.
+    Float64(Float64Builder),
+    /// Fluss `Char`, stored as an Arrow string; `len` is the declared length
+    /// of the Fluss type.
+    Char {
+        len: usize,
+        builder: StringBuilder,
+    },
+    /// Fluss `String`.
+    String(StringBuilder),
+    /// Fluss `Bytes` (variable-length binary).
+    Bytes(BinaryBuilder),
+    /// Fluss `Binary` (fixed-length binary); `len` is the declared length of
+    /// the Fluss type, which `create` also checks fits in an `i32` for Arrow.
+    Binary {
+        len: usize,
+        builder: FixedSizeBinaryBuilder,
+    },
+    /// Fluss `Decimal` written to an Arrow `Decimal128` column.
+    ///
+    /// `src_*` come from the Fluss decimal type, `target_*` from the Arrow
+    /// `Decimal128(precision, scale)` type; `create` rejects a negative
+    /// target scale.
+    Decimal128 {
+        src_precision: usize,
+        src_scale: usize,
+        target_precision: u32,
+        target_scale: i64,
+        builder: Decimal128Builder,
+    },
+    /// Fluss `Date`.
+    Date32(Date32Builder),
+    // Fluss `Time`: the variant is chosen by the unit of the Arrow
+    // `Time32`/`Time64` type.
+    Time32Second(Time32SecondBuilder),
+    Time32Millisecond(Time32MillisecondBuilder),
+    Time64Microsecond(Time64MicrosecondBuilder),
+    Time64Nanosecond(Time64NanosecondBuilder),
+    // Fluss `Timestamp` (no time zone): the variant is chosen by the unit of
+    // the Arrow `Timestamp` type; `precision` is the precision of the Fluss
+    // type.
+    TimestampNtzSecond {
+        precision: u32,
+        builder: TimestampSecondBuilder,
+    },
+    TimestampNtzMillisecond {
+        precision: u32,
+        builder: TimestampMillisecondBuilder,
+    },
+    TimestampNtzMicrosecond {
+        precision: u32,
+        builder: TimestampMicrosecondBuilder,
+    },
+    TimestampNtzNanosecond {
+        precision: u32,
+        builder: TimestampNanosecondBuilder,
+    },
+    // Fluss `TimestampLTz` (local time zone): same layout as the `Ntz`
+    // variants, selected by the Arrow `Timestamp` unit.
+    TimestampLtzSecond {
+        precision: u32,
+        builder: TimestampSecondBuilder,
+    },
+    TimestampLtzMillisecond {
+        precision: u32,
+        builder: TimestampMillisecondBuilder,
+    },
+    TimestampLtzMicrosecond {
+        precision: u32,
+        builder: TimestampMicrosecondBuilder,
+    },
+    TimestampLtzNanosecond {
+        precision: u32,
+        builder: TimestampNanosecondBuilder,
+    },
+    /// Array column: elements go through `element_writer`.
+    // NOTE(review): `offsets` / `validity` presumably accumulate the Arrow
+    // list offsets and per-row null flags until the column is finished —
+    // confirm against the write/finish code.
+    List {
+        element_writer: Box<ColumnWriter>,
+        offsets: Vec<i32>,
+        validity: Vec<bool>,
+    },
+    /// Map column: keys and values go through their own child writers;
+    /// the Fluss key/value types are kept alongside them.
+    // NOTE(review): `offsets` / `validity` presumably play the same role as
+    // in `List` — confirm against the write/finish code.
+    Map {
+        key_writer: Box<ColumnWriter>,
+        value_writer: Box<ColumnWriter>,
+        key_type: DataType,
+        value_type: DataType,
+        offsets: Vec<i32>,
+        validity: Vec<bool>,
+    },
+    /// Row (struct) column: one child writer per field, together with the
+    /// Arrow `Fields` and the Fluss `RowType` describing those fields.
+    // NOTE(review): `validity` presumably records per-row null flags for the
+    // struct itself — confirm against the write/finish code.
+    Struct {
+        field_writers: Vec<ColumnWriter>,
+        validity: Vec<bool>,
+        fields: arrow_schema::Fields,
+        row_type: RowType,
+    },
+}
+
+/// Dispatch to the inner builder across all `TypedWriter` variants.
+/// Exhaustive matching ensures new variants won't compile without an arm.
+///
+/// Usage: `with_builder!(writer, b => expr)` binds the variant's builder to
+/// `b` and evaluates `expr`; the same expression is expanded once per arm, so
+/// it must type-check against every builder type listed below.
+///
+/// # Panics
+///
+/// The nested variants (`List`, `Map`, `Struct`) have no single builder, so
+/// reaching one of them through this macro panics. Callers must handle those
+/// variants before falling back to `with_builder!`.
+macro_rules! with_builder {
+    ($self:expr, $b:ident => $body:expr) => {
+        match $self {
+            // Tuple variants: the builder is the only payload.
+            TypedWriter::Bool($b) => $body,
+            TypedWriter::Int8($b) => $body,
+            TypedWriter::Int16($b) => $body,
+            TypedWriter::Int32($b) => $body,
+            TypedWriter::Int64($b) => $body,
+            TypedWriter::Float32($b) => $body,
+            TypedWriter::Float64($b) => $body,
+            // Struct-like variants: bind only the `builder` field and ignore
+            // the per-type parameters.
+            TypedWriter::Char { builder: $b, .. } => $body,
+            TypedWriter::String($b) => $body,
+            TypedWriter::Bytes($b) => $body,
+            TypedWriter::Binary { builder: $b, .. } => $body,
+            TypedWriter::Decimal128 { builder: $b, .. } => $body,
+            TypedWriter::Date32($b) => $body,
+            TypedWriter::Time32Second($b) => $body,
+            TypedWriter::Time32Millisecond($b) => $body,
+            TypedWriter::Time64Microsecond($b) => $body,
+            TypedWriter::Time64Nanosecond($b) => $body,
+            TypedWriter::TimestampNtzSecond { builder: $b, .. } => $body,
+            TypedWriter::TimestampNtzMillisecond { builder: $b, .. } => $body,
+            TypedWriter::TimestampNtzMicrosecond { builder: $b, .. } => $body,
+            TypedWriter::TimestampNtzNanosecond { builder: $b, .. } => $body,
+            TypedWriter::TimestampLtzSecond { builder: $b, .. } => $body,
+            TypedWriter::TimestampLtzMillisecond { builder: $b, .. } => $body,
+            TypedWriter::TimestampLtzMicrosecond { builder: $b, .. } => $body,
+            TypedWriter::TimestampLtzNanosecond { builder: $b, .. } => $body,
+            // Nested variants are composed of child writers, not one builder.
+            TypedWriter::List { .. } => panic!("List variant not supported in with_builder!"),
+            TypedWriter::Map { .. } => panic!("Map variant not supported in with_builder!"),
+            TypedWriter::Struct { .. } => panic!("Struct variant not supported in with_builder!"),
+        }
+    };
+}
+
+impl ColumnWriter {
+    /// Create a column writer for the given Fluss `DataType` and Arrow
+    /// `ArrowDataType` at position `pos` with the given pre-allocation
+    /// `capacity`.
+    pub fn create(
+        fluss_type: &DataType,
+        arrow_type: &ArrowDataType,
+        pos: usize,
+        capacity: usize,
+    ) -> Result<Self> {
+        let nullable = fluss_type.is_nullable();
+
+        let inner = match fluss_type {
+            DataType::Boolean(_) => TypedWriter::Bool(BooleanBuilder::with_capacity(capacity)),
+            DataType::TinyInt(_) => TypedWriter::Int8(Int8Builder::with_capacity(capacity)),
+            DataType::SmallInt(_) => TypedWriter::Int16(Int16Builder::with_capacity(capacity)),
+            DataType::Int(_) => TypedWriter::Int32(Int32Builder::with_capacity(capacity)),
+            DataType::BigInt(_) => TypedWriter::Int64(Int64Builder::with_capacity(capacity)),
+            DataType::Float(_) => TypedWriter::Float32(Float32Builder::with_capacity(capacity)),
+            DataType::Double(_) => TypedWriter::Float64(Float64Builder::with_capacity(capacity)),
+            DataType::Char(t) => TypedWriter::Char {
+                len: t.length() as usize,
+                builder: StringBuilder::with_capacity(
+                    capacity,
+                    capacity.saturating_mul(VARIABLE_WIDTH_AVG_BYTES),
+                ),
+            },
+            DataType::String(_) => TypedWriter::String(StringBuilder::with_capacity(
+                capacity,
+                capacity.saturating_mul(VARIABLE_WIDTH_AVG_BYTES),
+            )),
+            DataType::Bytes(_) => TypedWriter::Bytes(BinaryBuilder::with_capacity(
+                capacity,
+                capacity.saturating_mul(VARIABLE_WIDTH_AVG_BYTES),
+            )),
+            DataType::Binary(t) => {
+                let arrow_len: i32 = t.length().try_into().map_err(|_| Error::IllegalArgument {
+                    message: format!(
+                        "Binary length {} exceeds Arrow's maximum (i32::MAX)",
+                        t.length()
+                    ),
+                })?;
+                TypedWriter::Binary {
+                    len: t.length(),
+                    builder: FixedSizeBinaryBuilder::with_capacity(capacity, arrow_len),
+                }
+            }
+            DataType::Decimal(dt) => {
+                let (target_p, target_s) = match arrow_type {
+                    ArrowDataType::Decimal128(p, s) => (*p, *s),
+                    _ => {
+                        return Err(Error::IllegalArgument {
+                            message: format!(
+                                "Expected Decimal128 Arrow type for Decimal, got: {arrow_type:?}"
+                            ),
+                        });
+                    }
+                };
+                if target_s < 0 {
+                    return Err(Error::IllegalArgument {
+                        message: format!("Negative decimal scale {target_s} is not supported"),
+                    });
+                }
+                let builder = Decimal128Builder::with_capacity(capacity)
+                    .with_precision_and_scale(target_p, target_s)
+                    .map_err(|e| Error::IllegalArgument {
+                        message: format!(
+                            "Invalid decimal precision {target_p} or scale {target_s}: {e}"
+                        ),
+                    })?;
+                TypedWriter::Decimal128 {
+                    src_precision: dt.precision() as usize,
+                    src_scale: dt.scale() as usize,
+                    target_precision: target_p as u32,
+                    target_scale: target_s as i64,
+                    builder,
+                }
+            }
+            DataType::Date(_) => TypedWriter::Date32(Date32Builder::with_capacity(capacity)),
+            DataType::Time(_) => match arrow_type {
+                ArrowDataType::Time32(arrow_schema::TimeUnit::Second) => {
+                    TypedWriter::Time32Second(Time32SecondBuilder::with_capacity(capacity))
+                }
+                ArrowDataType::Time32(arrow_schema::TimeUnit::Millisecond) => {
+                    TypedWriter::Time32Millisecond(Time32MillisecondBuilder::with_capacity(
+                        capacity,
+                    ))
+                }
+                ArrowDataType::Time64(arrow_schema::TimeUnit::Microsecond) => {
+                    TypedWriter::Time64Microsecond(Time64MicrosecondBuilder::with_capacity(
+                        capacity,
+                    ))
+                }
+                ArrowDataType::Time64(arrow_schema::TimeUnit::Nanosecond) => {
+                    TypedWriter::Time64Nanosecond(Time64NanosecondBuilder::with_capacity(capacity))
+                }
+                _ => {
+                    return Err(Error::IllegalArgument {
+                        message: format!("Unsupported Arrow type for Time: {arrow_type:?}"),
+                    });
+                }
+            },
+            DataType::Timestamp(t) => {
+                let precision = t.precision();
+                match arrow_type {
+                    ArrowDataType::Timestamp(arrow_schema::TimeUnit::Second, _) => {
+                        TypedWriter::TimestampNtzSecond {
+                            precision,
+                            builder: TimestampSecondBuilder::with_capacity(capacity),
+                        }
+                    }
+                    ArrowDataType::Timestamp(arrow_schema::TimeUnit::Millisecond, _) => {
+                        TypedWriter::TimestampNtzMillisecond {
+                            precision,
+                            builder: TimestampMillisecondBuilder::with_capacity(capacity),
+                        }
+                    }
+                    ArrowDataType::Timestamp(arrow_schema::TimeUnit::Microsecond, _) => {
+                        TypedWriter::TimestampNtzMicrosecond {
+                            precision,
+                            builder: TimestampMicrosecondBuilder::with_capacity(capacity),
+                        }
+                    }
+                    ArrowDataType::Timestamp(arrow_schema::TimeUnit::Nanosecond, _) => {
+                        TypedWriter::TimestampNtzNanosecond {
+                            precision,
+                            builder: TimestampNanosecondBuilder::with_capacity(capacity),
+                        }
+                    }
+                    _ => {
+                        return Err(Error::IllegalArgument {
+                            message: format!(
+                                "Unsupported Arrow type for Timestamp: {arrow_type:?}"
+                            ),
+                        });
+                    }
+                }
+            }
+            DataType::TimestampLTz(t) => {
+                let precision = t.precision();
+                match arrow_type {
+                    ArrowDataType::Timestamp(arrow_schema::TimeUnit::Second, _) => {
+                        TypedWriter::TimestampLtzSecond {
+                            precision,
+                            builder: TimestampSecondBuilder::with_capacity(capacity),
+                        }
+                    }
+                    ArrowDataType::Timestamp(arrow_schema::TimeUnit::Millisecond, _) => {
+                        TypedWriter::TimestampLtzMillisecond {
+                            precision,
+                            builder: TimestampMillisecondBuilder::with_capacity(capacity),
+                        }
+                    }
+                    ArrowDataType::Timestamp(arrow_schema::TimeUnit::Microsecond, _) => {
+                        TypedWriter::TimestampLtzMicrosecond {
+                            precision,
+                            builder: TimestampMicrosecondBuilder::with_capacity(capacity),
+                        }
+                    }
+                    ArrowDataType::Timestamp(arrow_schema::TimeUnit::Nanosecond, _) => {
+                        TypedWriter::TimestampLtzNanosecond {
+                            precision,
+                            builder: TimestampNanosecondBuilder::with_capacity(capacity),
+                        }
+                    }
+                    _ => {
+                        return Err(Error::IllegalArgument {
+                            message: format!(
+                                "Unsupported Arrow type for TimestampLTz: {arrow_type:?}"
+                            ),
+                        });
+                    }
+                }
+            }
+            DataType::Array(array_type) => {
+                let element_type = array_type.get_element_type();
+                let arrow_element_type = match arrow_type {
+                    ArrowDataType::List(field) => field.data_type(),
+                    _ => {
+                        return Err(Error::IllegalArgument {
+                            message: format!(
+                                "Expected List Arrow type for Array, got: {arrow_type:?}"
+                            ),
+                        });
+                    }
+                };
+                let element_writer =
+                    ColumnWriter::create(element_type, arrow_element_type, 0, capacity)?;
+                TypedWriter::List {
+                    element_writer: Box::new(element_writer),
+                    offsets: vec![0],
+                    validity: Vec::with_capacity(capacity),
+                }
+            }
+            DataType::Map(m) => {
+                let (key_arrow_type, value_arrow_type) = match arrow_type {
+                    ArrowDataType::Map(field, _) => match field.data_type() {
+                        ArrowDataType::Struct(fields) => {
+                            if fields.len() != 2 {
+                                return Err(Error::IllegalArgument {
+                                    message: format!(
+                                        "Expected Struct with 2 fields for Map, got {}",
+                                        fields.len()
+                                    ),
+                                });
+                            }
+                            (fields[0].data_type().clone(), fields[1].data_type().clone())
+                        }
+                        struct_type => {
+                            return Err(Error::IllegalArgument {
+                                message: format!(
+                                    "Expected Struct within Map Arrow type, got {:?}",
+                                    struct_type
+                                ),
+                            });
+                        }
+                    },
+                    _ => {
+                        return Err(Error::IllegalArgument {
+                            message: format!(
+                                "Expected Map Arrow type for Map, got: {arrow_type:?}"
+                            ),
+                        });
+                    }
+                };
+
+                let key_writer = ColumnWriter::create(m.key_type(), &key_arrow_type, 0, capacity)?;
+                let value_writer =
+                    ColumnWriter::create(m.value_type(), &value_arrow_type, 1, capacity)?;
+                TypedWriter::Map {
+                    key_writer: Box::new(key_writer),
+                    value_writer: Box::new(value_writer),
+                    key_type: m.key_type().clone(),
+                    value_type: m.value_type().clone(),
+                    offsets: vec![0],
+                    validity: Vec::with_capacity(capacity),
+                }
+            }
+            DataType::Row(row_type) => {
+                let arrow_fields = match arrow_type {
+                    ArrowDataType::Struct(fields) => fields.clone(),
+                    _ => {
+                        return Err(Error::IllegalArgument {
+                            message: format!(
+                                "Expected Struct Arrow type for Row, got: {arrow_type:?}"
+                            ),
+                        });
+                    }
+                };
+                if arrow_fields.len() != row_type.fields().len() {
+                    return Err(Error::IllegalArgument {
+                        message: format!(
+                            "Row arity mismatch: Fluss type has {} fields, Arrow type has {}",
+                            row_type.fields().len(),
+                            arrow_fields.len(),
+                        ),
+                    });
+                }
+                let field_writers: Result<Vec<_>> = row_type
+                    .fields()
+                    .iter()
+                    .zip(arrow_fields.iter())
+                    .map(|(f, af)| ColumnWriter::create(&f.data_type, af.data_type(), 0, capacity))
+                    .collect();
+                TypedWriter::Struct {
+                    field_writers: field_writers?,
+                    validity: Vec::with_capacity(capacity),
+                    fields: arrow_fields,
+                    row_type: row_type.clone(),
+                }
+            }
+        };
+
+        Ok(Self {
+            pos,
+            nullable,
+            inner,
+        })
+    }
+
+    /// Append the value found at this writer's own column position in `row`.
+    ///
+    /// Thin wrapper over `write_field_at` that supplies the position stored
+    /// on the writer (`self.pos`).
+    #[inline]
+    pub fn write_field(&mut self, row: &dyn InternalRow) -> Result<()> {
+        let column = self.pos;
+        self.write_field_at(row, column)
+    }
+
+    /// Append the value stored at position `pos` of `row` to the underlying
+    /// Arrow builder.
+    ///
+    /// When the writer is nullable and the source slot is null, a null entry
+    /// is appended instead. The null check is skipped entirely for writers
+    /// that are not nullable; those always take the non-null path.
+    #[inline]
+    pub fn write_field_at(&mut self, row: &dyn InternalRow, pos: usize) -> Result<()> {
+        // `&&` short-circuits, so `is_null_at` is only consulted for nullable writers.
+        let source_is_null = self.nullable && row.is_null_at(pos)?;
+        if source_is_null {
+            self.append_null();
+            Ok(())
+        } else {
+            self.write_non_null_at(row, pos)
+        }
+    }
+
+    /// Finalize the accumulated values and return them as an Arrow array.
+    ///
+    /// Nested writers (struct, map, list) finish their child writers and hand
+    /// the locally tracked offsets/validity to the matching `finish_*` helper.
+    /// Afterwards list and map offsets are reset to `[0]` and the validity
+    /// vector is left empty. Every other variant delegates to its builder's
+    /// `finish`.
+    pub fn finish(&mut self) -> ArrayRef {
+        match &mut self.inner {
+            TypedWriter::Struct {
+                field_writers,
+                validity,
+                fields,
+                ..
+            } => {
+                let struct_validity = std::mem::take(validity);
+                let mut columns: Vec<ArrayRef> = Vec::with_capacity(field_writers.len());
+                for writer in field_writers.iter_mut() {
+                    columns.push(writer.finish());
+                }
+                finish_struct_array(fields.clone(), columns, &struct_validity)
+            }
+            TypedWriter::Map {
+                key_writer,
+                value_writer,
+                offsets,
+                validity,
+                ..
+            } => {
+                // Capture nullability before the child is finished; it feeds
+                // the "value" field of the resulting map entries.
+                let values_nullable = value_writer.nullable;
+                let key_column = key_writer.finish();
+                let value_column = value_writer.finish();
+                let map_offsets = std::mem::replace(offsets, vec![0]);
+                let map_validity = std::mem::take(validity);
+                finish_map_array(
+                    key_column,
+                    value_column,
+                    values_nullable,
+                    &map_offsets,
+                    &map_validity,
+                )
+            }
+            TypedWriter::List {
+                element_writer,
+                offsets,
+                validity,
+            } => {
+                let items_nullable = element_writer.nullable;
+                let items = element_writer.finish();
+                let list_offsets = std::mem::replace(offsets, vec![0]);
+                let list_validity = std::mem::take(validity);
+                finish_list_array(items, items_nullable, &list_offsets, &list_validity)
+            }
+            _ => with_builder!(&mut self.inner, b => (b as &mut dyn ArrayBuilder).finish()),
+        }
+    }
+
+    /// Estimate the Arrow body size of the data buffered so far, in bytes.
+    ///
+    /// Every buffer length is rounded up to a multiple of 8 before being
+    /// summed. Lengths are read straight from the builders (recursing into
+    /// child writers for nested types), so nothing is allocated. This mirrors
+    /// Java's `ArrowUtils.estimateArrowBodyLength()`, which adds up
+    /// `buf.readableBytes()` with the same per-buffer rounding. IPC framing
+    /// overhead is not part of the result; `estimate_arrow_ipc_overhead()`
+    /// accounts for it separately.
+    pub fn buffer_size(&self) -> usize {
+        /// Rounded size of a builder's validity bitmap. A bitmap that the
+        /// builder has not materialized (`None`) is counted as length 0.
+        #[inline]
+        fn bitmap_bytes(bitmap: Option<&[u8]>) -> usize {
+            match bitmap {
+                Some(bytes) => round_up_to_8(bytes.len()),
+                None => round_up_to_8(0),
+            }
+        }
+
+        /// Rounded size of the `bool`-per-entry validity kept by nested
+        /// writers, counted as one bit per entry.
+        #[inline]
+        fn nested_validity_bytes(validity: &[bool]) -> usize {
+            round_up_to_8(validity.len().div_ceil(8))
+        }
+
+        /// Rounded size of the `i32` offsets kept by list and map writers.
+        #[inline]
+        fn nested_offsets_bytes(offsets: &[i32]) -> usize {
+            round_up_to_8(offsets.len() * std::mem::size_of::<i32>())
+        }
+
+        /// Fixed-width builder: validity bitmap plus the native value slice.
+        macro_rules! fixed_width_bytes {
+            ($builder:expr) => {
+                bitmap_bytes($builder.validity_slice())
+                    + round_up_to_8(std::mem::size_of_val($builder.values_slice()))
+            };
+        }
+
+        /// Offset-based builder: validity bitmap, offsets, and value bytes.
+        macro_rules! offset_based_bytes {
+            ($builder:expr) => {
+                bitmap_bytes($builder.validity_slice())
+                    + round_up_to_8(std::mem::size_of_val($builder.offsets_slice()))
+                    + round_up_to_8($builder.values_slice().len())
+            };
+        }
+
+        /// Builder sized by `values_slice().len()` directly: validity bitmap
+        /// plus that length.
+        macro_rules! slice_len_bytes {
+            ($builder:expr) => {
+                bitmap_bytes($builder.validity_slice())
+                    + round_up_to_8($builder.values_slice().len())
+            };
+        }
+
+        match &self.inner {
+            // Sized by the length of `values_slice()` itself.
+            TypedWriter::Bool(builder) => slice_len_bytes!(builder),
+            TypedWriter::Binary { builder, .. } => slice_len_bytes!(builder),
+            // Fixed-width values.
+            TypedWriter::Int8(builder) => fixed_width_bytes!(builder),
+            TypedWriter::Int16(builder) => fixed_width_bytes!(builder),
+            TypedWriter::Int32(builder) => fixed_width_bytes!(builder),
+            TypedWriter::Int64(builder) => fixed_width_bytes!(builder),
+            TypedWriter::Float32(builder) => fixed_width_bytes!(builder),
+            TypedWriter::Float64(builder) => fixed_width_bytes!(builder),
+            TypedWriter::Decimal128 { builder, .. } => fixed_width_bytes!(builder),
+            TypedWriter::Date32(builder) => fixed_width_bytes!(builder),
+            TypedWriter::Time32Second(builder) => fixed_width_bytes!(builder),
+            TypedWriter::Time32Millisecond(builder) => fixed_width_bytes!(builder),
+            TypedWriter::Time64Microsecond(builder) => fixed_width_bytes!(builder),
+            TypedWriter::Time64Nanosecond(builder) => fixed_width_bytes!(builder),
+            TypedWriter::TimestampNtzSecond { builder, .. } => fixed_width_bytes!(builder),
+            TypedWriter::TimestampNtzMillisecond { builder, .. } => fixed_width_bytes!(builder),
+            TypedWriter::TimestampNtzMicrosecond { builder, .. } => fixed_width_bytes!(builder),
+            TypedWriter::TimestampNtzNanosecond { builder, .. } => fixed_width_bytes!(builder),
+            TypedWriter::TimestampLtzSecond { builder, .. } => fixed_width_bytes!(builder),
+            TypedWriter::TimestampLtzMillisecond { builder, .. } => fixed_width_bytes!(builder),
+            TypedWriter::TimestampLtzMicrosecond { builder, .. } => fixed_width_bytes!(builder),
+            TypedWriter::TimestampLtzNanosecond { builder, .. } => fixed_width_bytes!(builder),
+            // Variable-width values: validity + offsets + value bytes.
+            TypedWriter::Char { builder, .. } => offset_based_bytes!(builder),
+            TypedWriter::String(builder) => offset_based_bytes!(builder),
+            TypedWriter::Bytes(builder) => offset_based_bytes!(builder),
+            // Nested values: own validity/offsets plus whatever the children hold.
+            TypedWriter::List {
+                element_writer,
+                offsets,
+                validity,
+            } => {
+                nested_validity_bytes(validity)
+                    + nested_offsets_bytes(offsets)
+                    + element_writer.buffer_size()
+            }
+            TypedWriter::Map {
+                key_writer,
+                value_writer,
+                offsets,
+                validity,
+                ..
+            } => {
+                nested_validity_bytes(validity)
+                    + nested_offsets_bytes(offsets)
+                    + key_writer.buffer_size()
+                    + value_writer.buffer_size()
+            }
+            TypedWriter::Struct {
+                field_writers,
+                validity,
+                ..
+            } => {
+                let mut total = nested_validity_bytes(validity);
+                for child in field_writers.iter() {
+                    total += child.buffer_size();
+                }
+                total
+            }
+        }
+    }
+
+    /// Append a null entry to this writer.
+    ///
+    /// * Struct writers append a null to every child writer so the children
+    ///   stay the same length as the parent, then mark the slot invalid.
+    /// * List and map writers repeat the previous offset (an empty range) and
+    ///   mark the slot invalid.
+    /// * All other writers forward to their builder's `append_null`.
+    fn append_null(&mut self) {
+        match &mut self.inner {
+            TypedWriter::Struct {
+                field_writers,
+                validity,
+                ..
+            } => {
+                // Arrow StructArray children must match the parent length.
+                field_writers
+                    .iter_mut()
+                    .for_each(|child| child.append_null());
+                validity.push(false);
+            }
+            TypedWriter::List {
+                offsets, validity, ..
+            }
+            | TypedWriter::Map {
+                offsets, validity, ..
+            } => {
+                let repeated = offsets.last().copied().unwrap_or(0);
+                offsets.push(repeated);
+                validity.push(false);
+            }
+            _ => with_builder!(&mut self.inner, b => b.append_null()),
+        }
+    }
+
+    /// Read the value at position `pos` of `row` with the accessor matching
+    /// this writer's variant and append it to the underlying builder.
+    ///
+    /// This path performs no null check of its own; `write_field_at` handles
+    /// nulls for nullable writers before calling it.
+    ///
+    /// # Errors
+    /// Propagates errors from the row accessors and returns `RowConvertError`
+    /// when a value cannot be stored in the target Arrow type: a failed
+    /// fixed-size binary append, a `Time32Second` value with a sub-second
+    /// part, an overflowing time unit conversion, or list/map offsets that do
+    /// not fit in `i32`.
+    ///
+    /// # Panics
+    /// Panics if a list writer's `offsets` vector is empty.
+    #[inline]
+    fn write_non_null_at(&mut self, row: &dyn InternalRow, pos: usize) -> Result<()> {
+        match &mut self.inner {
+            TypedWriter::Bool(b) => {
+                b.append_value(row.get_boolean(pos)?);
+                Ok(())
+            }
+            TypedWriter::Int8(b) => {
+                b.append_value(row.get_byte(pos)?);
+                Ok(())
+            }
+            TypedWriter::Int16(b) => {
+                b.append_value(row.get_short(pos)?);
+                Ok(())
+            }
+            TypedWriter::Int32(b) => {
+                b.append_value(row.get_int(pos)?);
+                Ok(())
+            }
+            TypedWriter::Int64(b) => {
+                b.append_value(row.get_long(pos)?);
+                Ok(())
+            }
+            TypedWriter::Float32(b) => {
+                b.append_value(row.get_float(pos)?);
+                Ok(())
+            }
+            TypedWriter::Float64(b) => {
+                b.append_value(row.get_double(pos)?);
+                Ok(())
+            }
+            TypedWriter::Char { len, builder } => {
+                let v = row.get_char(pos, *len)?;
+                builder.append_value(v);
+                Ok(())
+            }
+            TypedWriter::String(b) => {
+                let v = row.get_string(pos)?;
+                b.append_value(v);
+                Ok(())
+            }
+            TypedWriter::Bytes(b) => {
+                let v = row.get_bytes(pos)?;
+                b.append_value(v);
+                Ok(())
+            }
+            TypedWriter::Binary { len, builder } => {
+                let v = row.get_binary(pos, *len)?;
+                builder.append_value(v).map_err(|e| RowConvertError {
+                    message: format!("Failed to append binary value: {e}"),
+                })?;
+                Ok(())
+            }
+            TypedWriter::Decimal128 {
+                src_precision,
+                src_scale,
+                target_precision,
+                target_scale,
+                builder,
+            } => {
+                let decimal = row.get_decimal(pos, *src_precision, *src_scale)?;
+                append_decimal_to_builder(&decimal, *target_precision, *target_scale, builder)
+            }
+            TypedWriter::Date32(b) => {
+                let date = row.get_date(pos)?;
+                b.append_value(date.get_inner());
+                Ok(())
+            }
+            TypedWriter::Time32Second(b) => {
+                let millis = row.get_time(pos)?.get_inner();
+                // Reject values that would silently lose their sub-second part.
+                if millis % MILLIS_PER_SECOND as i32 != 0 {
+                    return Err(RowConvertError {
+                        message: format!(
+                            "Time value {millis} ms has sub-second precision but schema expects seconds only"
+                        ),
+                    });
+                }
+                b.append_value(millis / MILLIS_PER_SECOND as i32);
+                Ok(())
+            }
+            TypedWriter::Time32Millisecond(b) => {
+                b.append_value(row.get_time(pos)?.get_inner());
+                Ok(())
+            }
+            TypedWriter::Time64Microsecond(b) => {
+                let millis = row.get_time(pos)?.get_inner();
+                let micros = (millis as i64)
+                    .checked_mul(MICROS_PER_MILLI)
+                    .ok_or_else(|| RowConvertError {
+                        message: format!(
+                            "Time value {millis} ms overflows when converting to microseconds"
+                        ),
+                    })?;
+                b.append_value(micros);
+                Ok(())
+            }
+            TypedWriter::Time64Nanosecond(b) => {
+                let millis = row.get_time(pos)?.get_inner();
+                let nanos = (millis as i64)
+                    .checked_mul(NANOS_PER_MILLI)
+                    .ok_or_else(|| RowConvertError {
+                        message: format!(
+                            "Time value {millis} ms overflows when converting to nanoseconds"
+                        ),
+                    })?;
+                b.append_value(nanos);
+                Ok(())
+            }
+            // --- TimestampNtz variants ---
+            TypedWriter::TimestampNtzSecond {
+                precision, builder, ..
+            } => {
+                let ts = row.get_timestamp_ntz(pos, *precision)?;
+                // NOTE(review): integer `/` truncates toward zero, so a negative
+                // millisecond value with a sub-second part rounds up rather than
+                // down; confirm this is the intended behaviour.
+                builder.append_value(ts.get_millisecond() / MILLIS_PER_SECOND);
+                Ok(())
+            }
+            TypedWriter::TimestampNtzMillisecond {
+                precision, builder, ..
+            } => {
+                let ts = row.get_timestamp_ntz(pos, *precision)?;
+                builder.append_value(ts.get_millisecond());
+                Ok(())
+            }
+            TypedWriter::TimestampNtzMicrosecond {
+                precision, builder, ..
+            } => {
+                let ts = row.get_timestamp_ntz(pos, *precision)?;
+                builder.append_value(millis_nanos_to_micros(
+                    ts.get_millisecond(),
+                    ts.get_nano_of_millisecond(),
+                )?);
+                Ok(())
+            }
+            TypedWriter::TimestampNtzNanosecond {
+                precision, builder, ..
+            } => {
+                let ts = row.get_timestamp_ntz(pos, *precision)?;
+                builder.append_value(millis_nanos_to_nanos(
+                    ts.get_millisecond(),
+                    ts.get_nano_of_millisecond(),
+                )?);
+                Ok(())
+            }
+            // --- TimestampLtz variants ---
+            TypedWriter::TimestampLtzSecond {
+                precision, builder, ..
+            } => {
+                let ts = row.get_timestamp_ltz(pos, *precision)?;
+                // NOTE(review): same truncation-toward-zero caveat as the Ntz
+                // seconds variant above.
+                builder.append_value(ts.get_epoch_millisecond() / MILLIS_PER_SECOND);
+                Ok(())
+            }
+            TypedWriter::TimestampLtzMillisecond {
+                precision, builder, ..
+            } => {
+                let ts = row.get_timestamp_ltz(pos, *precision)?;
+                builder.append_value(ts.get_epoch_millisecond());
+                Ok(())
+            }
+            TypedWriter::TimestampLtzMicrosecond {
+                precision, builder, ..
+            } => {
+                let ts = row.get_timestamp_ltz(pos, *precision)?;
+                builder.append_value(millis_nanos_to_micros(
+                    ts.get_epoch_millisecond(),
+                    ts.get_nano_of_millisecond(),
+                )?);
+                Ok(())
+            }
+            TypedWriter::TimestampLtzNanosecond {
+                precision, builder, ..
+            } => {
+                let ts = row.get_timestamp_ltz(pos, *precision)?;
+                builder.append_value(millis_nanos_to_nanos(
+                    ts.get_epoch_millisecond(),
+                    ts.get_nano_of_millisecond(),
+                )?);
+                Ok(())
+            }
+            TypedWriter::List {
+                element_writer,
+                offsets,
+                validity,
+            } => {
+                let array = row.get_array(pos)?;
+                let size = array.size();
+                // Work out the new end offset before touching the child writer,
+                // so an unrepresentable list is rejected without leaving stray
+                // elements behind. `checked_add` guards the running offset,
+                // which a plain `+` would let wrap in release builds.
+                let added = i32::try_from(size).map_err(|_| RowConvertError {
+                    message: format!("Array size {size} exceeds i32 range"),
+                })?;
+                let last = *offsets.last().unwrap();
+                let end = last.checked_add(added).ok_or_else(|| RowConvertError {
+                    message: format!(
+                        "List offset {last} overflows i32 range when adding {size} elements"
+                    ),
+                })?;
+                // Row and map elements need the typed array accessors; the
+                // shared helper handles those as well as the plain case.
+                for i in 0..size {
+                    write_array_element_into_column(&mut **element_writer, &array, i)?;
+                }
+                offsets.push(end);
+                validity.push(true);
+                Ok(())
+            }
+            TypedWriter::Map {
+                key_writer,
+                value_writer,
+                offsets,
+                validity,
+                ..
+            } => {
+                let map = row.get_map(pos)?;
+                write_map_into(map, key_writer, value_writer, offsets)?;
+                validity.push(true);
+                Ok(())
+            }
+            TypedWriter::Struct {
+                field_writers,
+                validity,
+                ..
+            } => {
+                let nested = row.get_row(pos)?;
+                // Child `i` reads field `i` of the nested row.
+                for (i, child) in field_writers.iter_mut().enumerate() {
+                    child.write_field_at(nested, i)?;
+                }
+                validity.push(true);
+                Ok(())
+            }
+        }
+    }
+}
+
+/// Append every entry of `map` to the key and value child writers and record
+/// the map's end offset in `offsets`.
+///
+/// The new end offset is validated before anything is written, so a map whose
+/// size does not fit in `i32`, or that would push the running offset past
+/// `i32::MAX`, is rejected without appending any entry. The caller is
+/// responsible for pushing the matching validity flag.
+///
+/// # Errors
+/// Returns `RowConvertError` when the offset cannot be represented, and
+/// propagates any error raised while writing an individual key or value.
+///
+/// # Panics
+/// Panics if `offsets` is empty; the writers keep it seeded with a leading `0`.
+fn write_map_into(
+    map: FlussMap,
+    key_writer: &mut ColumnWriter,
+    value_writer: &mut ColumnWriter,
+    offsets: &mut Vec<i32>,
+) -> Result<()> {
+    let entry_count = map.size();
+    let added = i32::try_from(entry_count).map_err(|_| RowConvertError {
+        message: format!("Map size {entry_count} exceeds i32 range"),
+    })?;
+    let last = *offsets.last().unwrap();
+    // A plain `+` would wrap silently in release builds once the accumulated
+    // entry count passes `i32::MAX`, producing corrupt offsets.
+    let end = last.checked_add(added).ok_or_else(|| RowConvertError {
+        message: format!(
+            "Map offset {last} overflows i32 range when adding {entry_count} entries"
+        ),
+    })?;
+
+    let key_array = map.key_array();
+    let value_array = map.value_array();
+    for i in 0..entry_count {
+        write_array_element_into_column(key_writer, key_array, i)?;
+        write_array_element_into_column(value_writer, value_array, i)?;
+    }
+    offsets.push(end);
+    Ok(())
+}
+
+// A `FlussArray` does not carry its element schema, so row and map elements
+// are read through the inherent typed accessors (`get_row` / `get_map`) using
+// the types stored on the writer. Every other element kind goes through the
+// generic `write_field_at` path.
+fn write_array_element_into_column(
+    writer: &mut ColumnWriter,
+    array: &FlussArray,
+    index: usize,
+) -> Result<()> {
+    match &mut writer.inner {
+        TypedWriter::Map {
+            key_writer,
+            value_writer,
+            key_type,
+            value_type,
+            offsets,
+            validity,
+        } => {
+            if !array.is_null_at(index) {
+                let entry_map = array.get_map(index, key_type, value_type)?;
+                write_map_into(entry_map, key_writer, value_writer, offsets)?;
+                validity.push(true);
+                return Ok(());
+            }
+            // Null map element: invalid slot covering an empty offset range.
+            validity.push(false);
+            let unchanged = *offsets.last().unwrap();
+            offsets.push(unchanged);
+            Ok(())
+        }
+        TypedWriter::Struct {
+            field_writers,
+            validity,
+            row_type,
+            ..
+        } => {
+            if !array.is_null_at(index) {
+                let element_row = array.get_row(index, row_type)?;
+                for (field_idx, field_writer) in field_writers.iter_mut().enumerate() {
+                    field_writer.write_field_at(&element_row, field_idx)?;
+                }
+                validity.push(true);
+                return Ok(());
+            }
+            // Null struct element: every child still gets one (null) entry so
+            // the children stay aligned with the struct's length.
+            field_writers
+                .iter_mut()
+                .for_each(|field_writer| field_writer.append_null());
+            validity.push(false);
+            Ok(())
+        }
+        _ => writer.write_field_at(array, index),
+    }
+}
+
+/// Assemble a `StructArray` from already-finished child arrays.
+///
+/// A null buffer is attached only when `validity` contains at least one
+/// `false`; otherwise the struct is built without one.
+fn finish_struct_array(
+    fields: arrow_schema::Fields,
+    child_arrays: Vec<ArrayRef>,
+    validity: &[bool],
+) -> ArrayRef {
+    use arrow::array::StructArray;
+    use arrow::buffer::NullBuffer;
+    use std::sync::Arc;
+
+    let has_null = validity.contains(&false);
+    let nulls = has_null.then(|| NullBuffer::from(validity.to_vec()));
+    Arc::new(StructArray::new(fields, child_arrays, nulls))
+}
+
+/// Assemble a `ListArray` from finished element values plus the offsets and
+/// validity tracked by the list writer.
+///
+/// The element field is always named `"item"`, takes its data type from
+/// `values`, and its nullability from `item_nullable`. A null buffer built
+/// from `validity` is always attached, even when every entry is valid.
+fn finish_list_array(
+    values: ArrayRef,
+    item_nullable: bool,
+    offsets: &[i32],
+    validity: &[bool],
+) -> ArrayRef {
+    use arrow::array::ListArray;
+    use arrow::buffer::{NullBuffer, OffsetBuffer, ScalarBuffer};
+    use arrow::datatypes::{Field, FieldRef};
+    use std::sync::Arc;
+
+    let list_offsets = OffsetBuffer::new(ScalarBuffer::from(offsets.to_vec()));
+    let list_nulls = NullBuffer::from(validity.to_vec());
+    let item_type = values.data_type().clone();
+    let item_field: FieldRef = Arc::new(Field::new("item", item_type, item_nullable));
+
+    let list = ListArray::new(item_field, list_offsets, values, Some(list_nulls));
+    Arc::new(list)
+}
+
+/// Assemble a `MapArray` from finished key/value arrays plus the offsets and
+/// validity tracked by the map writer.
+///
+/// Keys and values are wrapped in a non-nullable `"entries"` struct with a
+/// non-nullable `"key"` field and a `"value"` field whose nullability is
+/// `value_nullable`. A null buffer built from `validity` is always attached.
+fn finish_map_array(
+    keys: ArrayRef,
+    values: ArrayRef,
+    value_nullable: bool,
+    offsets: &[i32],
+    validity: &[bool],
+) -> ArrayRef {
+    use arrow::array::{Array, MapArray, StructArray};
+    use arrow::buffer::{NullBuffer, OffsetBuffer, ScalarBuffer};
+    use arrow::datatypes::Field;
+    use std::sync::Arc;
+
+    let map_offsets = OffsetBuffer::new(ScalarBuffer::from(offsets.to_vec()));
+    let map_nulls = NullBuffer::from(validity.to_vec());
+
+    let key_type = keys.data_type().clone();
+    let value_type = values.data_type().clone();
+    let key_field = Arc::new(Field::new("key", key_type, false));
+    let value_field = Arc::new(Field::new("value", value_type, value_nullable));
+
+    let entries = StructArray::from(vec![(key_field, keys), (value_field, values)]);
+    let entries_type = entries.data_type().clone();
+    let entries_field = Arc::new(Field::new("entries", entries_type, false));
+
+    // The trailing `false` is the last argument of `MapArray::new`, passed
+    // unchanged.
+    let map = MapArray::new(entries_field, map_offsets, entries, Some(map_nulls), false);
+    Arc::new(map)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::metadata::DataTypes;
+    use crate::record::to_arrow_type;
+    use crate::row::binary_array::FlussArrayWriter;
+    use crate::row::binary_map::FlussMapWriter;
+    use crate::row::{Date, Datum, Decimal, GenericRow, Time, TimestampLtz, TimestampNtz};
+    use arrow::array::*;
+    use bigdecimal::BigDecimal;
+    use std::str::FromStr;
+
+    /// Test helper: build a `ColumnWriter` for `fluss_type`, using the Arrow
+    /// type that `to_arrow_type` derives from it. Passes `0` and `capacity`
+    /// through to `ColumnWriter::create` and panics if either step fails.
+    fn writer_for(fluss_type: &DataType, capacity: usize) -> ColumnWriter {
+        let derived_arrow_type = to_arrow_type(fluss_type).unwrap();
+        let created = ColumnWriter::create(fluss_type, &derived_arrow_type, 0, capacity);
+        created.unwrap()
+    }
+
+    /// Test helper: write `datum` as the only field of a one-column row and
+    /// return the finished Arrow array. Panics if the write fails.
+    fn write_one(fluss_type: &DataType, datum: Datum) -> ArrayRef {
+        let mut writer = writer_for(fluss_type, 4);
+        let single_field_row = GenericRow::from_data(vec![datum]);
+        writer.write_field(&single_field_row).unwrap();
+        writer.finish()
+    }
+
+    /// Writes one value of each scalar Fluss type through `write_one` and checks both the
+    /// concrete Arrow array type (via downcast) and the value stored at index 0.
+    #[test]
+    fn write_all_scalar_types() {
+        // Boolean
+        let arr = write_one(&DataTypes::boolean(), Datum::Bool(true));
+        assert!(
+            arr.as_any()
+                .downcast_ref::<BooleanArray>()
+                .unwrap()
+                .value(0)
+        );
+
+        // Integer types: tinyint/smallint/int/bigint map to Int8/Int16/Int32/Int64 arrays.
+        let arr = write_one(&DataTypes::tinyint(), Datum::Int8(42));
+        assert_eq!(
+            arr.as_any().downcast_ref::<Int8Array>().unwrap().value(0),
+            42
+        );
+
+        let arr = write_one(&DataTypes::smallint(), Datum::Int16(1000));
+        assert_eq!(
+            arr.as_any().downcast_ref::<Int16Array>().unwrap().value(0),
+            1000
+        );
+
+        let arr = write_one(&DataTypes::int(), Datum::Int32(100_000));
+        assert_eq!(
+            arr.as_any().downcast_ref::<Int32Array>().unwrap().value(0),
+            100_000
+        );
+
+        let arr = write_one(&DataTypes::bigint(), Datum::Int64(9_000_000_000));
+        assert_eq!(
+            arr.as_any().downcast_ref::<Int64Array>().unwrap().value(0),
+            9_000_000_000
+        );
+
+        // Float types (compared with a small tolerance rather than exact equality)
+        let arr = write_one(&DataTypes::float(), Datum::Float32(1.5.into()));
+        assert!(
+            (arr.as_any()
+                .downcast_ref::<Float32Array>()
+                .unwrap()
+                .value(0)
+                - 1.5)
+                .abs()
+                < 0.001
+        );
+
+        let arr = write_one(&DataTypes::double(), Datum::Float64(1.125.into()));
+        assert!(
+            (arr.as_any()
+                .downcast_ref::<Float64Array>()
+                .unwrap()
+                .value(0)
+                - 1.125)
+                .abs()
+                < 0.001
+        );
+
+        // String / Char: both are read back from a StringArray.
+        let arr = write_one(&DataTypes::string(), Datum::String("hello".into()));
+        assert_eq!(
+            arr.as_any().downcast_ref::<StringArray>().unwrap().value(0),
+            "hello"
+        );
+
+        let arr = write_one(&DataTypes::char(10), Datum::String("world".into()));
+        assert_eq!(
+            arr.as_any().downcast_ref::<StringArray>().unwrap().value(0),
+            "world"
+        );
+
+        // Bytes / Binary: bytes use a BinaryArray, binary(4) a FixedSizeBinaryArray.
+        let arr = write_one(&DataTypes::bytes(), Datum::Blob(vec![1, 2, 3].into()));
+        assert_eq!(
+            arr.as_any().downcast_ref::<BinaryArray>().unwrap().value(0),
+            &[1, 2, 3]
+        );
+
+        let arr = write_one(
+            &DataTypes::binary(4),
+            Datum::Blob(vec![10, 20, 30, 40].into()),
+        );
+        assert_eq!(
+            arr.as_any()
+                .downcast_ref::<FixedSizeBinaryArray>()
+                .unwrap()
+                .value(0),
+            &[10, 20, 30, 40]
+        );
+
+        // Date: the raw value passed to `Date::new` is read back unchanged from a Date32Array.
+        let arr = write_one(&DataTypes::date(), Datum::Date(Date::new(19000)));
+        assert_eq!(
+            arr.as_any().downcast_ref::<Date32Array>().unwrap().value(0),
+            19000
+        );
+
+        // Time (precision 3 → Millisecond)
+        let arr = write_one(
+            &DataTypes::time_with_precision(3),
+            Datum::Time(Time::new(45_000)),
+        );
+        assert_eq!(
+            arr.as_any()
+                .downcast_ref::<Time32MillisecondArray>()
+                .unwrap()
+                .value(0),
+            45_000
+        );
+
+        // Decimal: 123.45 at scale 2 is read back as the unscaled value 12345.
+        let decimal =
+            Decimal::from_big_decimal(BigDecimal::from_str("123.45").unwrap(), 10, 2).unwrap();
+        let arr = write_one(&DataTypes::decimal(10, 2), Datum::Decimal(decimal));
+        assert_eq!(
+            arr.as_any()
+                .downcast_ref::<Decimal128Array>()
+                .unwrap()
+                .value(0),
+            12345
+        );
+
+        // Timestamp NTZ (precision 3 → Millisecond)
+        let arr = write_one(
+            &DataTypes::timestamp_with_precision(3),
+            Datum::TimestampNtz(TimestampNtz::new(1_700_000_000_000)),
+        );
+        assert_eq!(
+            arr.as_any()
+                .downcast_ref::<TimestampMillisecondArray>()
+                .unwrap()
+                .value(0),
+            1_700_000_000_000
+        );
+
+        // Timestamp LTZ (precision 3 → Millisecond); read back through the same Arrow array
+        // type as the NTZ case above.
+        let arr = write_one(
+            &DataTypes::timestamp_ltz_with_precision(3),
+            Datum::TimestampLtz(TimestampLtz::new(1_700_000_000_000)),
+        );
+        assert_eq!(
+            arr.as_any()
+                .downcast_ref::<TimestampMillisecondArray>()
+                .unwrap()
+                .value(0),
+            1_700_000_000_000
+        );
+    }
+
+    /// Covers three writer behaviours on an INT column: a null datum, several rows written to
+    /// one writer, and calling `buffer_size` between writes.
+    #[test]
+    fn write_null_and_multiple_rows() {
+        let int_type = DataTypes::int();
+
+        // A null datum produces a null slot.
+        let null_column = write_one(&int_type, Datum::Null);
+        assert!(null_column.is_null(0));
+
+        // Three rows written to the same writer come back in insertion order.
+        let mut multi = writer_for(&int_type, 8);
+        for val in [10, 20, 30] {
+            let row = GenericRow::from_data(vec![val]);
+            multi.write_field(&row).unwrap();
+        }
+        let finished = multi.finish();
+        let ints = finished.as_any().downcast_ref::<Int32Array>().unwrap();
+        assert_eq!(ints.len(), 3);
+        assert_eq!(
+            (ints.value(0), ints.value(1), ints.value(2)),
+            (10, 20, 30)
+        );
+
+        // buffer_size is non-zero after a write, and querying it must not discard what was
+        // already appended.
+        let mut sized = writer_for(&int_type, 4);
+        sized
+            .write_field(&GenericRow::from_data(vec![42_i32]))
+            .unwrap();
+        assert!(sized.buffer_size() > 0);
+        sized
+            .write_field(&GenericRow::from_data(vec![99_i32]))
+            .unwrap();
+        let finished = sized.finish();
+        let ints = finished.as_any().downcast_ref::<Int32Array>().unwrap();
+        assert_eq!((ints.value(0), ints.value(1)), (42, 99));
+    }
+
+    /// Writes the Fluss array `[10, NULL, 30]` as an ARRAY<INT> column and checks the resulting
+    /// Arrow list, including the null element.
+    #[test]
+    fn write_array_type() {
+        let int_type = DataTypes::int();
+
+        // Build the source array: [10, NULL, 30].
+        let mut builder = FlussArrayWriter::new(3, &int_type);
+        builder.write_int(0, 10);
+        builder.set_null_at(1);
+        builder.write_int(2, 30);
+        let source = builder.complete().unwrap();
+
+        let array_type = DataTypes::array(int_type);
+        let column = write_one(&array_type, Datum::Array(source));
+
+        // One row, whose list value holds the three elements.
+        let lists = column.as_any().downcast_ref::<ListArray>().unwrap();
+        assert_eq!(lists.len(), 1);
+
+        let first_row = lists.value(0);
+        let ints = first_row.as_any().downcast_ref::<Int32Array>().unwrap();
+        assert_eq!(ints.len(), 3);
+        assert_eq!(ints.value(0), 10);
+        assert!(ints.is_null(1));
+        assert_eq!(ints.value(2), 30);
+    }
+
+    #[test]
+    fn unsupported_type_returns_error() {
+        // NOTE(review): Map columns are written successfully in `test_write_map_type`, so the
+        // error asserted here presumably comes from pairing the Map type with an Arrow type it
+        // cannot be written as (Boolean), not from Map being unsupported — confirm.
+        let fluss_type = DataTypes::map(DataTypes::int(), DataTypes::string());
+        let arrow_type = ArrowDataType::Boolean; // deliberately not a Map Arrow type
+        assert!(ColumnWriter::create(&fluss_type, &arrow_type, 0, 4).is_err());
+    }
+
+    /// Checks that an ARRAY whose element type is non-nullable produces an Arrow list whose
+    /// item field is also non-nullable.
+    #[test]
+    fn write_non_nullable_array_type() {
+        // ARRAY<INT NOT NULL>
+        let array_type = DataTypes::array(DataTypes::int().as_non_nullable());
+        let mut writer = writer_for(&array_type, 4);
+
+        // Write one row holding an empty array so the writer has processed at least one value.
+        let empty = FlussArrayWriter::new(0, &DataTypes::int().as_non_nullable())
+            .complete()
+            .unwrap();
+        let row = GenericRow::from_data(vec![Datum::Array(empty)]);
+        writer.write_field(&row).unwrap();
+
+        // Inspect the Arrow type of the finished array rather than the requested one.
+        let finished = writer.finish();
+        let list_array = finished
+            .as_any()
+            .downcast_ref::<ListArray>()
+            .expect("Expected ListArray");
+        let ArrowDataType::List(item_field) = list_array.data_type() else {
+            panic!("Expected List type")
+        };
+
+        assert!(
+            !item_field.is_nullable(),
+            "Arrow field inside the list should be non-nullable"
+        );
+    }
+
+    /// Writes the Fluss map `{1: "a", 2: "b"}` as a MAP<INT, STRING> column and checks the keys
+    /// and values of the resulting Arrow map entry.
+    #[test]
+    fn test_write_map_type() {
+        use crate::metadata::DataTypes;
+        let key_type = DataTypes::int();
+        let value_type = DataTypes::string();
+        let map_type = DataTypes::map(key_type.clone(), value_type.clone());
+
+        // Build the source map with two entries.
+        let mut builder = FlussMapWriter::new(2, &key_type, &value_type);
+        builder.write_entry(1.into(), "a".into()).unwrap();
+        builder.write_entry(2.into(), "b".into()).unwrap();
+        let source = builder.complete().unwrap();
+
+        let column = write_one(&map_type, Datum::Map(source));
+        let maps = column.as_any().downcast_ref::<MapArray>().unwrap();
+        assert_eq!(maps.len(), 1);
+
+        // The single row's entries are a struct of (key column, value column).
+        let entries = maps.value(0);
+        let entry_struct = entries.as_any().downcast_ref::<StructArray>().unwrap();
+        assert_eq!(entry_struct.num_columns(), 2);
+
+        let key_column = entry_struct.column(0);
+        let keys = key_column.as_any().downcast_ref::<Int32Array>().unwrap();
+        let value_column = entry_struct.column(1);
+        let values = value_column
+            .as_any()
+            .downcast_ref::<StringArray>()
+            .unwrap();
+
+        assert_eq!(keys.len(), 2);
+        assert_eq!((keys.value(0), keys.value(1)), (1, 2));
+
+        assert_eq!(values.len(), 2);
+        assert_eq!((values.value(0), values.value(1)), ("a", "b"));
+    }
+
+    /// A null datum written to a MAP<INT, STRING> column yields one null map slot.
+    #[test]
+    fn test_write_null_map_type() {
+        use crate::metadata::DataTypes;
+
+        let map_type = DataTypes::map(DataTypes::int(), DataTypes::string());
+        let column = write_one(&map_type, Datum::Null);
+
+        let maps = column.as_any().downcast_ref::<MapArray>().unwrap();
+        assert_eq!(maps.len(), 1);
+        assert!(maps.is_null(0));
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/row/compacted/compacted_key_writer.rs b/fluss-rust/crates/fluss/src/row/compacted/compacted_key_writer.rs
new file mode 100644
index 0000000000..d5f7c2f1d8
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/row/compacted/compacted_key_writer.rs
@@ -0,0 +1,123 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::row::compacted::compacted_row_writer::CompactedRowWriter;
+use bytes::Bytes;
+
+use crate::error::Error::IllegalArgument;
+use crate::error::Result;
+use crate::metadata::DataType;
+use crate::row::Decimal;
+use crate::row::binary::{BinaryRowFormat, BinaryWriter, ValueWriter};
+use crate::row::binary_array::FlussArray;
+use crate::row::binary_map::FlussMap;
+use crate::row::datum::{TimestampLtz, TimestampNtz};
+use delegate::delegate;
+
+/// A wrapper around [`CompactedRowWriter`] used to encode key columns.
+///
+/// The encoding is the same as that of [`CompactedRowWriter`], except that it has no header of
+/// null bits: key columns must not be null, so there is no per-field null flag to record.
+pub struct CompactedKeyWriter {
+    // Underlying row writer that all write calls are forwarded to.
+    delegate: CompactedRowWriter,
+}
+
+impl Default for CompactedKeyWriter {
+    /// Same as [`CompactedKeyWriter::new`].
+    fn default() -> Self {
+        CompactedKeyWriter::new()
+    }
+}
+
+impl CompactedKeyWriter {
+    /// Creates an empty key writer.
+    ///
+    /// The wrapped [`CompactedRowWriter`] is initialised with a field count of 0 so that the
+    /// null-bit header is empty; key columns must not be null, so no null bits are needed.
+    pub fn new() -> CompactedKeyWriter {
+        let delegate = CompactedRowWriter::new(0);
+        CompactedKeyWriter { delegate }
+    }
+
+    /// Builds the [`ValueWriter`] for a key column of `field_type` in the compacted format.
+    ///
+    /// Returns an `IllegalArgument` error when `field_type` is a map, which is rejected as a
+    /// key column type; every other type is handed to `ValueWriter::create_value_writer`.
+    pub fn create_value_writer(field_type: &DataType) -> Result<ValueWriter> {
+        match field_type {
+            DataType::Map(_) => Err(IllegalArgument {
+                message: format!("Cannot use {field_type:?} as a key column type"),
+            }),
+            _ => ValueWriter::create_value_writer(field_type, Some(&BinaryRowFormat::Compacted)),
+        }
+    }
+
+    // Inherent accessors forwarded verbatim to the wrapped row writer.
+    delegate! {
+        to self.delegate {
+            pub fn reset(&mut self);
+
+            #[allow(dead_code)]
+            pub fn position(&self) -> usize;
+
+            #[allow(dead_code)]
+            pub fn buffer(&self) -> &[u8];
+
+            pub fn to_bytes(&self) -> Bytes;
+        }
+    }
+}
+
+// `BinaryWriter` implementation for key encoding: every method listed in the `delegate!` block
+// is forwarded unchanged to the wrapped `CompactedRowWriter`; only `complete` is written out
+// by hand, as a no-op.
+impl BinaryWriter for CompactedKeyWriter {
+    delegate! {
+        to self.delegate {
+            fn reset(&mut self);
+
+            // Forwarded like the other methods, even though key columns must not be null.
+            fn set_null_at(&mut self, pos: usize);
+
+            fn write_boolean(&mut self, value: bool);
+
+            fn write_byte(&mut self, value: u8);
+
+            fn write_binary(&mut self, bytes: &[u8], length: usize);
+
+            fn write_bytes(&mut self, value: &[u8]);
+
+            fn write_char(&mut self, value: &str, _length: usize);
+
+            fn write_string(&mut self, value: &str);
+
+            fn write_short(&mut self, value: i16);
+
+            fn write_int(&mut self, value: i32);
+
+            fn write_long(&mut self, value: i64);
+
+            fn write_float(&mut self, value: f32);
+
+            fn write_double(&mut self, value: f64);
+
+            fn write_decimal(&mut self, value: &Decimal, precision: u32);
+
+            fn write_time(&mut self, value: i32, precision: u32);
+
+            fn write_timestamp_ntz(&mut self, value: &TimestampNtz, precision: u32);
+
+            fn write_timestamp_ltz(&mut self, value: &TimestampLtz, precision: u32);
+
+            fn write_array(&mut self, value: &FlussArray);
+
+            // NOTE(review): `create_value_writer` rejects map-typed key columns, so this is
+            // presumably only here to satisfy the trait — confirm.
+            fn write_map(&mut self, value: &FlussMap);
+        }
+    }
+
+    fn complete(&mut self) {
+        // Intentionally empty: the key writer performs no finalisation step.
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/row/compacted/compacted_row.rs b/fluss-rust/crates/fluss/src/row/compacted/compacted_row.rs
new file mode 100644
index 0000000000..7f2b5c0429
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/row/compacted/compacted_row.rs
@@ -0,0 +1,624 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::client::WriteFormat;
+use crate::error::Error::IllegalArgument;
+use crate::error::Result;
+use crate::metadata::RowType;
+use crate::row::binary_array::FlussArray;
+use crate::row::binary_map::FlussMap;
+use crate::row::compacted::compacted_row_reader::{CompactedRowDeserializer, CompactedRowReader};
+use crate::row::datum::{Date, Time, TimestampLtz, TimestampNtz};
+use crate::row::{Decimal, GenericRow, InternalRow};
+use std::sync::{Arc, OnceLock};
+
+/// Number of bytes needed to hold one bit per field for `arity` fields, i.e. `arity / 8`
+/// rounded up.
+pub fn calculate_bit_set_width_in_bytes(arity: usize) -> usize {
+    let full_bytes = arity / 8;
+    let has_partial_byte = arity % 8 != 0;
+    full_bytes + usize::from(has_partial_byte)
+}
+
+// Reference implementation:
+// https://github.com/apache/fluss/blob/main/fluss-common/src/main/java/org/apache/fluss/row/compacted/CompactedRow.java
+//
+// A row view over compacted-encoded bytes. Field values are decoded on first access and the
+// decoded row is cached in `decoded_row` for later reads.
+#[allow(dead_code)]
+pub struct CompactedRow<'a> {
+    // Number of fields in the row.
+    arity: usize,
+    // Length of `data` in bytes, captured at construction.
+    size_in_bytes: usize,
+    // Lazily populated cache of the fully decoded row.
+    decoded_row: OnceLock<GenericRow<'a>>,
+    // Decoder for this row's type; held in an `Arc` so callers can pass in a shared instance.
+    deserializer: Arc<CompactedRowDeserializer<'a>>,
+    // Reader positioned over `data`; also used for per-field null checks.
+    reader: CompactedRowReader<'a>,
+    // The raw encoded bytes backing this row.
+    data: &'a [u8],
+}
+
+#[allow(dead_code)]
+impl<'a> CompactedRow<'a> {
+    /// Builds a row over `data`, creating a new deserializer for `row_type`.
+    ///
+    /// The arity is the number of fields in `row_type`.
+    pub fn from_bytes(row_type: &'a RowType, data: &'a [u8]) -> Self {
+        Self::deserialize(
+            Arc::new(CompactedRowDeserializer::new(row_type)),
+            row_type.fields().len(),
+            data,
+        )
+    }
+
+    /// Builds a row over `data` using an existing deserializer.
+    ///
+    /// `arity` is the number of fields in the row. Field values are not decoded here; that
+    /// happens on first access through `decoded_row`.
+    pub fn deserialize(
+        deserializer: Arc<CompactedRowDeserializer<'a>>,
+        arity: usize,
+        data: &'a [u8],
+    ) -> Self {
+        Self {
+            arity,
+            size_in_bytes: data.len(),
+            decoded_row: OnceLock::new(),
+            // The `Arc` is received by value, so move it in rather than cloning it and then
+            // dropping the original (which only bumps and releases the reference count).
+            deserializer,
+            // The reader is given the whole of `data` (trailing arguments 0 and `data.len()`).
+            reader: CompactedRowReader::new(arity, data, 0, data.len()),
+            data,
+        }
+    }
+
+    /// Returns the length in bytes of the encoded data this row was built from.
+    pub fn get_size_in_bytes(&self) -> usize {
+        self.size_in_bytes
+    }
+
+    /// Returns the decoded row, decoding and caching it on first use.
+    ///
+    /// A decode error is returned to the caller and nothing is cached, so a later call will
+    /// attempt the decode again.
+    fn decoded_row(&self) -> Result<&GenericRow<'_>> {
+        if let Some(row) = self.decoded_row.get() {
+            return Ok(row);
+        }
+
+        // `OnceLock::get_or_try_init` is still unstable on our toolchain.
+        // Keep the same semantics by performing the fallible decode first,
+        // then atomically installing it via `get_or_init`. If a value was installed in the
+        // meantime, `get_or_init` returns that one and `decoded` is simply dropped.
+        let decoded = self.deserializer.deserialize(&self.reader)?;
+        Ok(self.decoded_row.get_or_init(|| decoded))
+    }
+
+    /// Returns the raw encoded bytes backing this row.
+    pub fn as_bytes(&self) -> &[u8] {
+        self.data
+    }
+}
+
+// `InternalRow` view over a compacted row. Null checks are answered from the reader and the
+// row type; every value getter goes through the lazily decoded `GenericRow`.
+impl<'a> InternalRow for CompactedRow<'a> {
+    fn get_field_count(&self) -> usize {
+        self.arity
+    }
+
+    /// Reports whether the field at `pos` is null.
+    ///
+    /// Returns `IllegalArgument` when `pos` is not a valid field index. A field whose type is
+    /// not nullable is always reported as non-null, without consulting the reader.
+    fn is_null_at(&self, pos: usize) -> Result<bool> {
+        let fields = self.deserializer.get_row_type().fields();
+        match fields.as_slice().get(pos) {
+            Some(field) => Ok(field.data_type.is_nullable() && self.reader.is_null_at(pos)),
+            None => Err(IllegalArgument {
+                message: format!(
+                    "position {pos} out of bounds (row has {} fields)",
+                    fields.len()
+                ),
+            }),
+        }
+    }
+
+    fn get_boolean(&self, pos: usize) -> Result<bool> {
+        let row = self.decoded_row()?;
+        row.get_boolean(pos)
+    }
+
+    fn get_byte(&self, pos: usize) -> Result<i8> {
+        let row = self.decoded_row()?;
+        row.get_byte(pos)
+    }
+
+    fn get_short(&self, pos: usize) -> Result<i16> {
+        let row = self.decoded_row()?;
+        row.get_short(pos)
+    }
+
+    fn get_int(&self, pos: usize) -> Result<i32> {
+        let row = self.decoded_row()?;
+        row.get_int(pos)
+    }
+
+    fn get_long(&self, pos: usize) -> Result<i64> {
+        let row = self.decoded_row()?;
+        row.get_long(pos)
+    }
+
+    fn get_float(&self, pos: usize) -> Result<f32> {
+        let row = self.decoded_row()?;
+        row.get_float(pos)
+    }
+
+    fn get_double(&self, pos: usize) -> Result<f64> {
+        let row = self.decoded_row()?;
+        row.get_double(pos)
+    }
+
+    fn get_char(&self, pos: usize, length: usize) -> Result<&str> {
+        let row = self.decoded_row()?;
+        row.get_char(pos, length)
+    }
+
+    fn get_string(&self, pos: usize) -> Result<&str> {
+        let row = self.decoded_row()?;
+        row.get_string(pos)
+    }
+
+    fn get_decimal(&self, pos: usize, precision: usize, scale: usize) -> Result<Decimal> {
+        let row = self.decoded_row()?;
+        row.get_decimal(pos, precision, scale)
+    }
+
+    fn get_date(&self, pos: usize) -> Result<Date> {
+        let row = self.decoded_row()?;
+        row.get_date(pos)
+    }
+
+    fn get_time(&self, pos: usize) -> Result<Time> {
+        let row = self.decoded_row()?;
+        row.get_time(pos)
+    }
+
+    fn get_timestamp_ntz(&self, pos: usize, precision: u32) -> Result<TimestampNtz> {
+        let row = self.decoded_row()?;
+        row.get_timestamp_ntz(pos, precision)
+    }
+
+    fn get_timestamp_ltz(&self, pos: usize, precision: u32) -> Result<TimestampLtz> {
+        let row = self.decoded_row()?;
+        row.get_timestamp_ltz(pos, precision)
+    }
+
+    fn get_binary(&self, pos: usize, length: usize) -> Result<&[u8]> {
+        let row = self.decoded_row()?;
+        row.get_binary(pos, length)
+    }
+
+    fn get_bytes(&self, pos: usize) -> Result<&[u8]> {
+        let row = self.decoded_row()?;
+        row.get_bytes(pos)
+    }
+
+    fn get_array(&self, pos: usize) -> Result<FlussArray> {
+        let row = self.decoded_row()?;
+        row.get_array(pos)
+    }
+
+    fn get_map(&self, pos: usize) -> Result<FlussMap> {
+        let row = self.decoded_row()?;
+        row.get_map(pos)
+    }
+
+    fn get_row(&self, pos: usize) -> Result<&GenericRow<'_>> {
+        let row = self.decoded_row()?;
+        row.get_row(pos)
+    }
+
+    /// Exposes the raw bytes only for the `CompactedKv` write format; the other formats get
+    /// `None`. The match stays exhaustive so a new `WriteFormat` variant forces a decision here.
+    fn as_encoded_bytes(&self, write_format: WriteFormat) -> Option<&[u8]> {
+        match write_format {
+            WriteFormat::CompactedKv => Some(self.as_bytes()),
+            WriteFormat::ArrowLog | WriteFormat::CompactedLog => None,
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::metadata::DataTypes;
+    use crate::row::binary::BinaryWriter;
+    use crate::row::binary_array::FlussArrayWriter;
+    use crate::row::binary_map::FlussMapWriter;
+
+    use crate::metadata::{
+        BigIntType, BooleanType, BytesType, DataType, DoubleType, FloatType, IntType, SmallIntType,
+        StringType, TinyIntType,
+    };
+    use crate::row::Datum;
+    use crate::row::compacted::compacted_row_writer::CompactedRowWriter;
+
+    /// Round-trips rows through `CompactedRowWriter` and `CompactedRow`: first one field of
+    /// each primitive type, then a row containing a null and negative values.
+    #[test]
+    fn test_compacted_row() {
+        // Part 1: one field per primitive type, written in declaration order.
+        let primitive_types = RowType::with_data_types(vec![
+            DataType::Boolean(BooleanType::new()),
+            DataType::TinyInt(TinyIntType::new()),
+            DataType::SmallInt(SmallIntType::new()),
+            DataType::Int(IntType::new()),
+            DataType::BigInt(BigIntType::new()),
+            DataType::Float(FloatType::new()),
+            DataType::Double(DoubleType::new()),
+            DataType::String(StringType::new()),
+            DataType::Bytes(BytesType::new()),
+        ]);
+
+        let mut primitive_writer = CompactedRowWriter::new(primitive_types.fields().len());
+        primitive_writer.write_boolean(true);
+        primitive_writer.write_byte(1);
+        primitive_writer.write_short(100);
+        primitive_writer.write_int(1000);
+        primitive_writer.write_long(10000);
+        primitive_writer.write_float(1.5);
+        primitive_writer.write_double(2.5);
+        primitive_writer.write_string("Hello World");
+        primitive_writer.write_bytes(&[1, 2, 3, 4, 5]);
+
+        let primitive_bytes = primitive_writer.to_bytes();
+        let primitives = CompactedRow::from_bytes(&primitive_types, primitive_bytes.as_ref());
+
+        assert_eq!(primitives.get_field_count(), 9);
+        assert!(primitives.get_boolean(0).unwrap());
+        assert_eq!(primitives.get_byte(1).unwrap(), 1);
+        assert_eq!(primitives.get_short(2).unwrap(), 100);
+        assert_eq!(primitives.get_int(3).unwrap(), 1000);
+        assert_eq!(primitives.get_long(4).unwrap(), 10000);
+        assert_eq!(primitives.get_float(5).unwrap(), 1.5);
+        assert_eq!(primitives.get_double(6).unwrap(), 2.5);
+        assert_eq!(primitives.get_string(7).unwrap(), "Hello World");
+        assert_eq!(primitives.get_bytes(8).unwrap(), &[1, 2, 3, 4, 5]);
+
+        // Part 2: a negative int, a null string and a double.
+        let sparse_types = RowType::with_data_types(vec![
+            DataType::Int(IntType::new()),
+            DataType::String(StringType::new()),
+            DataType::Double(DoubleType::new()),
+        ]);
+
+        let mut sparse_writer = CompactedRowWriter::new(sparse_types.fields().len());
+        sparse_writer.write_int(-42);
+        sparse_writer.set_null_at(1);
+        sparse_writer.write_double(2.71);
+
+        let sparse_bytes = sparse_writer.to_bytes();
+        let sparse = CompactedRow::from_bytes(&sparse_types, sparse_bytes.as_ref());
+
+        assert!(!sparse.is_null_at(0).unwrap());
+        assert!(sparse.is_null_at(1).unwrap());
+        assert!(!sparse.is_null_at(2).unwrap());
+        assert_eq!(sparse.get_int(0).unwrap(), -42);
+        assert_eq!(sparse.get_double(2).unwrap(), 2.71);
+        // A repeated read is served from the cached decoded row and must give the same value.
+        assert_eq!(sparse.get_int(0).unwrap(), -42);
+    }
+
+    /// Round-trips temporal and decimal fields. The values are written in exactly the order of
+    /// the fields declared in `row_type`, so the write calls and the type list must stay aligned.
+    #[test]
+    fn test_compacted_row_temporal_and_decimal_types() {
+        // Comprehensive test covering DATE, TIME, TIMESTAMP (compact/non-compact), and DECIMAL (compact/non-compact)
+        use crate::metadata::{DecimalType, TimestampLTzType, TimestampType};
+        use crate::row::Decimal;
+        use crate::row::datum::{TimestampLtz, TimestampNtz};
+        use bigdecimal::{BigDecimal, num_bigint::BigInt};
+
+        let row_type = RowType::with_data_types(vec![
+            DataTypes::date(),
+            DataTypes::time(),
+            DataType::Timestamp(TimestampType::with_nullable(true, 3).unwrap()), // Compact (precision <= 3)
+            DataType::TimestampLTz(TimestampLTzType::with_nullable(true, 3).unwrap()), // Compact
+            DataType::Timestamp(TimestampType::with_nullable(true, 6).unwrap()), // Non-compact (precision > 3)
+            DataType::TimestampLTz(TimestampLTzType::with_nullable(true, 9).unwrap()), // Non-compact
+            DataType::Decimal(DecimalType::new(10, 2).unwrap()), // Compact (precision <= 18)
+            DataType::Decimal(DecimalType::new(28, 10).unwrap()), // Non-compact (precision > 18)
+        ]);
+
+        let mut writer = CompactedRowWriter::new(row_type.fields().len());
+
+        // Write values
+        writer.write_int(19651); // Date as days since the Unix epoch: 2023-10-21
+        writer.write_time(34200000, 0); // Time as milliseconds since midnight: 09:30:00.0
+        writer.write_timestamp_ntz(&TimestampNtz::new(1698235273182), 3); // Compact timestamp
+        writer.write_timestamp_ltz(&TimestampLtz::new(1698235273182), 3); // Compact timestamp ltz
+        let ts_ntz_high = TimestampNtz::from_millis_nanos(1698235273182, 123456).unwrap();
+        let ts_ltz_high = TimestampLtz::from_millis_nanos(1698235273182, 987654).unwrap();
+        writer.write_timestamp_ntz(&ts_ntz_high, 6); // Non-compact timestamp with nanos
+        writer.write_timestamp_ltz(&ts_ltz_high, 9); // Non-compact timestamp ltz with nanos
+
+        // Create Decimal values for testing
+        let small_decimal =
+            Decimal::from_big_decimal(BigDecimal::new(BigInt::from(12345), 2), 10, 2).unwrap(); // Compact decimal: 123.45
+        let large_decimal = Decimal::from_big_decimal(
+            BigDecimal::new(BigInt::from(999999999999999999i128), 10),
+            28,
+            10,
+        )
+        .unwrap(); // Non-compact decimal
+
+        writer.write_decimal(&small_decimal, 10);
+        writer.write_decimal(&large_decimal, 28);
+
+        let bytes = writer.to_bytes();
+        let row = CompactedRow::from_bytes(&row_type, bytes.as_ref());
+
+        // Verify all values
+        assert_eq!(row.get_date(0).unwrap().get_inner(), 19651);
+        assert_eq!(row.get_time(1).unwrap().get_inner(), 34200000);
+        assert_eq!(
+            row.get_timestamp_ntz(2, 3).unwrap().get_millisecond(),
+            1698235273182
+        );
+        assert_eq!(
+            row.get_timestamp_ltz(3, 3).unwrap().get_epoch_millisecond(),
+            1698235273182
+        );
+        // The non-compact timestamps must also preserve the nano-of-millisecond part.
+        let read_ts_ntz = row.get_timestamp_ntz(4, 6).unwrap();
+        assert_eq!(read_ts_ntz.get_millisecond(), 1698235273182);
+        assert_eq!(read_ts_ntz.get_nano_of_millisecond(), 123456);
+        let read_ts_ltz = row.get_timestamp_ltz(5, 9).unwrap();
+        assert_eq!(read_ts_ltz.get_epoch_millisecond(), 1698235273182);
+        assert_eq!(read_ts_ltz.get_nano_of_millisecond(), 987654);
+        // Assert on Decimal equality
+        assert_eq!(row.get_decimal(6, 10, 2).unwrap(), small_decimal);
+        assert_eq!(row.get_decimal(7, 28, 10).unwrap(), large_decimal);
+
+        // Assert on Decimal components to catch any regressions
+        let read_small_decimal = row.get_decimal(6, 10, 2).unwrap();
+        assert_eq!(read_small_decimal.precision(), 10);
+        assert_eq!(read_small_decimal.scale(), 2);
+        assert_eq!(read_small_decimal.to_unscaled_long().unwrap(), 12345);
+
+        let read_large_decimal = row.get_decimal(7, 28, 10).unwrap();
+        assert_eq!(read_large_decimal.precision(), 28);
+        assert_eq!(read_large_decimal.scale(), 10);
+        assert_eq!(
+            read_large_decimal.to_unscaled_long().unwrap(),
+            999999999999999999i64
+        );
+    }
+
+    /// Round-trips a row of (INT, ARRAY<INT>) and reads the array elements back.
+    #[test]
+    fn test_compacted_row_int_array() {
+        let int_array_type = DataTypes::array(DataTypes::int());
+        let row_type = RowType::with_data_types(vec![DataTypes::int(), int_array_type]);
+
+        let mut row_writer = CompactedRowWriter::new(row_type.fields().len());
+        row_writer.write_int(42);
+
+        // Second field: the array [1, 2, 3].
+        let element_type = DataTypes::int();
+        let mut elements = FlussArrayWriter::new(3, &element_type);
+        elements.write_int(0, 1);
+        elements.write_int(1, 2);
+        elements.write_int(2, 3);
+        let source = elements.complete().unwrap();
+        row_writer.write_array(&source);
+
+        let encoded = row_writer.to_bytes();
+        let decoded = CompactedRow::from_bytes(&row_type, encoded.as_ref());
+
+        assert_eq!(decoded.get_int(0).unwrap(), 42);
+        let ints = decoded.get_array(1).unwrap();
+        assert_eq!(ints.size(), 3);
+        assert_eq!(ints.get_int(0).unwrap(), 1);
+        assert_eq!(ints.get_int(1).unwrap(), 2);
+        assert_eq!(ints.get_int(2).unwrap(), 3);
+    }
+
+    /// Round-trips a row whose only field is an ARRAY<STRING>.
+    #[test]
+    fn test_compacted_row_string_array() {
+        let string_array_type = DataTypes::array(DataTypes::string());
+        let row_type = RowType::with_data_types(vec![string_array_type]);
+
+        let mut row_writer = CompactedRowWriter::new(row_type.fields().len());
+
+        // The array ["hello", "fluss", "rust"].
+        let element_type = DataTypes::string();
+        let mut elements = FlussArrayWriter::new(3, &element_type);
+        elements.write_string(0, "hello");
+        elements.write_string(1, "fluss");
+        elements.write_string(2, "rust");
+        let source = elements.complete().unwrap();
+        row_writer.write_array(&source);
+
+        let encoded = row_writer.to_bytes();
+        let decoded = CompactedRow::from_bytes(&row_type, encoded.as_ref());
+
+        let strings = decoded.get_array(0).unwrap();
+        assert_eq!(strings.size(), 3);
+        assert_eq!(strings.get_string(0).unwrap(), "hello");
+        assert_eq!(strings.get_string(1).unwrap(), "fluss");
+        assert_eq!(strings.get_string(2).unwrap(), "rust");
+    }
+
+    /// Round-trips the array `[10, NULL, 30]` and checks that the null element survives.
+    #[test]
+    fn test_compacted_row_array_with_nulls() {
+        let int_array_type = DataTypes::array(DataTypes::int());
+        let row_type = RowType::with_data_types(vec![int_array_type]);
+
+        let mut row_writer = CompactedRowWriter::new(row_type.fields().len());
+
+        let element_type = DataTypes::int();
+        let mut elements = FlussArrayWriter::new(3, &element_type);
+        elements.write_int(0, 10);
+        elements.set_null_at(1);
+        elements.write_int(2, 30);
+        let source = elements.complete().unwrap();
+        row_writer.write_array(&source);
+
+        let encoded = row_writer.to_bytes();
+        let decoded = CompactedRow::from_bytes(&row_type, encoded.as_ref());
+
+        let ints = decoded.get_array(0).unwrap();
+        assert_eq!(ints.size(), 3);
+
+        // Elements 0 and 2 carry values; element 1 is null.
+        assert!(!ints.is_null_at(0));
+        assert_eq!(ints.get_int(0).unwrap(), 10);
+        assert!(ints.is_null_at(1));
+        assert!(!ints.is_null_at(2));
+        assert_eq!(ints.get_int(2).unwrap(), 30);
+    }
+
+    /// Round-trips a zero-length ARRAY<INT> and checks it reads back with size 0.
+    #[test]
+    fn test_compacted_row_empty_array() {
+        let int_array_type = DataTypes::array(DataTypes::int());
+        let row_type = RowType::with_data_types(vec![int_array_type]);
+
+        let mut row_writer = CompactedRowWriter::new(row_type.fields().len());
+
+        // Complete the array writer without writing any element.
+        let element_type = DataTypes::int();
+        let empty = FlussArrayWriter::new(0, &element_type).complete().unwrap();
+        row_writer.write_array(&empty);
+
+        let encoded = row_writer.to_bytes();
+        let decoded = CompactedRow::from_bytes(&row_type, encoded.as_ref());
+
+        let read_back = decoded.get_array(0).unwrap();
+        assert_eq!(read_back.size(), 0);
+    }
+
+    /// Round-trips the nested array `[[1, 2], [99]]` stored as ARRAY<ARRAY<INT>>.
+    #[test]
+    fn test_compacted_row_nested_array() {
+        let nested_type = DataTypes::array(DataTypes::array(DataTypes::int()));
+        let row_type = RowType::with_data_types(vec![nested_type]);
+
+        let mut row_writer = CompactedRowWriter::new(row_type.fields().len());
+
+        // Inner arrays: [1, 2] and [99].
+        let int_type = DataTypes::int();
+        let mut first_writer = FlussArrayWriter::new(2, &int_type);
+        first_writer.write_int(0, 1);
+        first_writer.write_int(1, 2);
+        let first_inner = first_writer.complete().unwrap();
+
+        let mut second_writer = FlussArrayWriter::new(1, &int_type);
+        second_writer.write_int(0, 99);
+        let second_inner = second_writer.complete().unwrap();
+
+        // Outer array holding the two inner arrays.
+        let inner_array_type = DataTypes::array(DataTypes::int());
+        let mut outer_writer = FlussArrayWriter::new(2, &inner_array_type);
+        outer_writer.write_array(0, &first_inner);
+        outer_writer.write_array(1, &second_inner);
+        let outer = outer_writer.complete().unwrap();
+
+        row_writer.write_array(&outer);
+
+        let encoded = row_writer.to_bytes();
+        let decoded = CompactedRow::from_bytes(&row_type, encoded.as_ref());
+
+        let read_outer = decoded.get_array(0).unwrap();
+        assert_eq!(read_outer.size(), 2);
+
+        let read_first = read_outer.get_array(0).unwrap();
+        assert_eq!(read_first.size(), 2);
+        assert_eq!(read_first.get_int(0).unwrap(), 1);
+        assert_eq!(read_first.get_int(1).unwrap(), 2);
+
+        let read_second = read_outer.get_array(1).unwrap();
+        assert_eq!(read_second.size(), 1);
+        assert_eq!(read_second.get_int(0).unwrap(), 99);
+    }
+
+    /// Round-trips a `MAP<INT, STRING>` column holding `{1: "a", 2: "b"}` and
+    /// checks every key/value pair of the decoded map.
+    #[test]
+    fn test_compacted_row_map() {
+        let row_type =
+            RowType::with_data_types(vec![DataTypes::map(DataTypes::int(), DataTypes::string())]);
+
+        let mut writer = CompactedRowWriter::new(row_type.fields().len());
+
+        let mut map_writer = FlussMapWriter::new(2, &DataTypes::int(), &DataTypes::string());
+        map_writer.write_entry(1.into(), "a".into()).unwrap();
+        map_writer.write_entry(2.into(), "b".into()).unwrap();
+        let map = map_writer.complete().unwrap();
+        writer.write_map(&map);
+
+        let bytes = writer.to_bytes();
+        let row = CompactedRow::from_bytes(&row_type, bytes.as_ref());
+
+        let read_map = row.get_map(0).unwrap();
+        assert_eq!(read_map.size(), 2);
+        assert_eq!(read_map.key_array().get_int(0).unwrap(), 1);
+        assert_eq!(read_map.value_array().get_string(0).unwrap(), "a");
+        // Also verify the second entry, so that a codec which corrupts or drops
+        // everything after the first pair cannot pass this test.
+        assert_eq!(read_map.key_array().get_int(1).unwrap(), 2);
+        assert_eq!(read_map.value_array().get_string(1).unwrap(), "b");
+    }
+
+    /// Writes two rows of type `(INT, MAP<INT, STRING>)` with the same writer:
+    /// first with the map column set to null, then (after `reset`) with a
+    /// one-entry map, and checks each decoded row.
+    #[test]
+    fn test_compacted_row_map_with_nulls() {
+        let schema = RowType::with_data_types(vec![
+            DataTypes::int(),
+            DataTypes::map(DataTypes::int(), DataTypes::string()),
+        ]);
+        let mut row_writer = CompactedRowWriter::new(schema.fields().len());
+
+        // First row: (42, NULL).
+        row_writer.write_int(42);
+        row_writer.set_null_at(1);
+        row_writer.complete();
+
+        let null_map_bytes = row_writer.to_bytes();
+        let null_map_row = CompactedRow::from_bytes(&schema, null_map_bytes.as_ref());
+
+        assert_eq!(null_map_row.get_int(0).unwrap(), 42);
+        assert!(null_map_row.is_null_at(1).unwrap());
+
+        // Second row, reusing the writer: (99, {7: "hello"}).
+        row_writer.reset();
+        row_writer.write_int(99);
+        let mut map_builder = FlussMapWriter::new(1, &DataTypes::int(), &DataTypes::string());
+        map_builder.write_entry(7.into(), "hello".into()).unwrap();
+        let source_map = map_builder.complete().unwrap();
+        row_writer.write_map(&source_map);
+        row_writer.complete();
+
+        let filled_bytes = row_writer.to_bytes();
+        let filled_row = CompactedRow::from_bytes(&schema, filled_bytes.as_ref());
+        assert_eq!(filled_row.get_int(0).unwrap(), 99);
+        assert!(!filled_row.is_null_at(1).unwrap());
+        let decoded_map = filled_row.get_map(1).unwrap();
+        assert_eq!(decoded_map.size(), 1);
+        assert_eq!(decoded_map.key_array().get_int(0).unwrap(), 7);
+        assert_eq!(decoded_map.value_array().get_string(0).unwrap(), "hello");
+    }
+
+    /// Round-trips a `MAP<STRING, ARRAY<INT>>` column holding
+    /// `{"a": [1, 2], "b": [3]}` and checks keys and nested arrays.
+    #[test]
+    fn test_compacted_row_nested_map() {
+        let schema = RowType::with_data_types(vec![DataTypes::map(
+            DataTypes::string(),
+            DataTypes::array(DataTypes::int()),
+        )]);
+        let mut row_writer = CompactedRowWriter::new(schema.fields().len());
+
+        // Map values: [1, 2] and [3].
+        let int_type = DataTypes::int();
+        let mut first_builder = FlussArrayWriter::new(2, &int_type);
+        first_builder.write_int(0, 1);
+        first_builder.write_int(1, 2);
+        let first_value = first_builder.complete().unwrap();
+
+        let mut second_builder = FlussArrayWriter::new(1, &int_type);
+        second_builder.write_int(0, 3);
+        let second_value = second_builder.complete().unwrap();
+
+        // Map entries "a" -> [1, 2] and "b" -> [3], written in that order.
+        let value_type = DataTypes::array(DataTypes::int());
+        let mut map_builder = FlussMapWriter::new(2, &DataTypes::string(), &value_type);
+        map_builder
+            .write_entry("a".into(), Datum::Array(first_value))
+            .unwrap();
+        map_builder
+            .write_entry("b".into(), Datum::Array(second_value))
+            .unwrap();
+        let source_map = map_builder.complete().unwrap();
+        row_writer.write_map(&source_map);
+
+        let encoded = row_writer.to_bytes();
+        let decoded_row = CompactedRow::from_bytes(&schema, encoded.as_ref());
+
+        let decoded_map = decoded_row.get_map(0).unwrap();
+        assert_eq!(decoded_map.size(), 2);
+        assert_eq!(decoded_map.key_array().get_string(0).unwrap(), "a");
+        assert_eq!(decoded_map.key_array().get_string(1).unwrap(), "b");
+
+        let decoded_first = decoded_map.value_array().get_array(0).unwrap();
+        assert_eq!(decoded_first.size(), 2);
+        assert_eq!(decoded_first.get_int(0).unwrap(), 1);
+        assert_eq!(decoded_first.get_int(1).unwrap(), 2);
+
+        let decoded_second = decoded_map.value_array().get_array(1).unwrap();
+        assert_eq!(decoded_second.size(), 1);
+        assert_eq!(decoded_second.get_int(0).unwrap(), 3);
+    }
+
+    /// Round-trips a `MAP<INT, STRING>` column with no entries and checks
+    /// that the decoded map reports size 0.
+    #[test]
+    fn test_compacted_row_empty_map() {
+        let schema =
+            RowType::with_data_types(vec![DataTypes::map(DataTypes::int(), DataTypes::string())]);
+        let mut row_writer = CompactedRowWriter::new(schema.fields().len());
+
+        // Complete the map writer without writing a single entry.
+        let map_builder = FlussMapWriter::new(0, &DataTypes::int(), &DataTypes::string());
+        let empty_map = map_builder.complete().unwrap();
+        row_writer.write_map(&empty_map);
+
+        let encoded = row_writer.to_bytes();
+        let decoded_row = CompactedRow::from_bytes(&schema, encoded.as_ref());
+
+        let decoded_map = decoded_row.get_map(0).unwrap();
+        assert_eq!(decoded_map.size(), 0);
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/row/compacted/compacted_row_reader.rs b/fluss-rust/crates/fluss/src/row/compacted/compacted_row_reader.rs
new file mode 100644
index 0000000000..3f2eb65352
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/row/compacted/compacted_row_reader.rs
@@ -0,0 +1,569 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::error::Error::IllegalArgument;
+use crate::error::Result;
+use crate::metadata::{DataType, RowType};
+use crate::row::compacted::compacted_row::calculate_bit_set_width_in_bytes;
+use crate::row::compacted::compacted_row_writer::CompactedRowWriter;
+use crate::row::datum::{Date, Time, TimestampLtz, TimestampNtz};
+use crate::row::{Datum, Decimal, FlussArray, GenericRow};
+use crate::util::varint::{read_unsigned_varint_at, read_unsigned_varint_u64_at};
+use std::borrow::Cow;
+use std::str::from_utf8;
+use std::sync::Arc;
+
+/// Decodes the fields of a compacted row into a [`GenericRow`] according to a
+/// [`RowType`].
+///
+/// The row type is held in a [`Cow`], so a deserializer can either borrow a
+/// caller-owned schema (`new`) or own its schema (`new_from_owned`).
+#[allow(dead_code)]
+#[derive(Clone)]
+pub struct CompactedRowDeserializer<'a> {
+    /// Schema of the rows to decode, borrowed or owned.
+    row_type: Cow<'a, RowType>,
+    /// Index-parallel to `row_type.fields()`; `Some(_)` only for ROW-typed fields.
+    nested: Vec<Option<Arc<CompactedRowDeserializer<'a>>>>,
+}
+
+/// Builds one slot per field of `row_type`: a shared deserializer owning a
+/// clone of the inner row type for ROW-typed fields, `None` for every other
+/// field.
+fn build_nested_deserializers<'a>(
+    row_type: &RowType,
+) -> Vec<Option<Arc<CompactedRowDeserializer<'a>>>> {
+    let fields = row_type.fields();
+    let mut slots = Vec::with_capacity(fields.len());
+    for field in fields.iter() {
+        let slot = match &field.data_type {
+            DataType::Row(inner) => Some(Arc::new(CompactedRowDeserializer::new_from_owned(
+                inner.clone(),
+            ))),
+            _ => None,
+        };
+        slots.push(slot);
+    }
+    slots
+}
+
+#[allow(dead_code)]
+impl<'a> CompactedRowDeserializer<'a> {
+    /// Creates a deserializer that borrows `row_type`.
+    ///
+    /// A nested deserializer is built up front for every ROW-typed field.
+    pub fn new(row_type: &'a RowType) -> Self {
+        let nested = build_nested_deserializers(row_type);
+        Self {
+            row_type: Cow::Borrowed(row_type),
+            nested,
+        }
+    }
+
+    /// Creates a deserializer that takes ownership of `row_type`.
+    ///
+    /// A nested deserializer is built up front for every ROW-typed field.
+    pub fn new_from_owned(row_type: RowType) -> Self {
+        let nested = build_nested_deserializers(&row_type);
+        Self {
+            row_type: Cow::Owned(row_type),
+            nested,
+        }
+    }
+
+    /// Returns the row type this deserializer decodes against.
+    pub fn get_row_type(&self) -> &RowType {
+        self.row_type.as_ref()
+    }
+
+    /// Decodes every field of the row exposed by `reader` into a [`GenericRow`].
+    ///
+    /// Fields are read in schema order, starting at the reader's initial
+    /// position (the first byte after the null bitmap). A nullable field whose
+    /// null bit is set becomes [`Datum::Null`] and consumes no payload bytes;
+    /// every other field is decoded according to its [`DataType`] and moves the
+    /// cursor to the position returned by the read.
+    ///
+    /// # Errors
+    ///
+    /// Returns `IllegalArgument` when a read is out of bounds, when a varint,
+    /// string, decimal or timestamp payload is invalid, or when a nested ROW
+    /// payload is shorter than its own null bitmap. Errors raised while
+    /// decoding ARRAY and MAP payloads are propagated unchanged.
+    pub fn deserialize(&self, reader: &CompactedRowReader<'a>) -> Result<GenericRow<'a>> {
+        let mut row = GenericRow::new(self.row_type.fields().len());
+        let mut cursor = reader.initial_position();
+        for (col_pos, data_field) in self.row_type.fields().iter().enumerate() {
+            let dtype = &data_field.data_type;
+            // Null fields carry no payload, so the cursor stays where it is.
+            if dtype.is_nullable() && reader.is_null_at(col_pos) {
+                row.set_field(col_pos, Datum::Null);
+                continue;
+            }
+            let (datum, next_cursor) = match dtype {
+                DataType::Boolean(_) => {
+                    let (val, next) = reader.read_boolean(cursor)?;
+                    (Datum::Bool(val), next)
+                }
+                DataType::TinyInt(_) => {
+                    let (val, next) = reader.read_byte(cursor)?;
+                    (Datum::Int8(val as i8), next)
+                }
+                DataType::SmallInt(_) => {
+                    let (val, next) = reader.read_short(cursor)?;
+                    (Datum::Int16(val), next)
+                }
+                DataType::Int(_) => {
+                    let (val, next) = reader.read_int(cursor)?;
+                    (Datum::Int32(val), next)
+                }
+                DataType::BigInt(_) => {
+                    let (val, next) = reader.read_long(cursor)?;
+                    (Datum::Int64(val), next)
+                }
+                DataType::Float(_) => {
+                    let (val, next) = reader.read_float(cursor)?;
+                    (Datum::Float32(val.into()), next)
+                }
+                DataType::Double(_) => {
+                    let (val, next) = reader.read_double(cursor)?;
+                    (Datum::Float64(val.into()), next)
+                }
+                // TODO: use read_char(length) in the future, but need to keep compatibility
+                DataType::Char(_) | DataType::String(_) => {
+                    let (val, next) = reader.read_string(cursor)?;
+                    (Datum::String(val.into()), next)
+                }
+                // TODO: use read_binary(length) in the future, but need to keep compatibility
+                DataType::Bytes(_) | DataType::Binary(_) => {
+                    let (val, next) = reader.read_bytes(cursor)?;
+                    (Datum::Blob(val.into()), next)
+                }
+                DataType::Decimal(decimal_type) => {
+                    let precision = decimal_type.precision();
+                    let scale = decimal_type.scale();
+                    if Decimal::is_compact_precision(precision) {
+                        // Compact: stored as i64
+                        let (val, next) = reader.read_long(cursor)?;
+                        let decimal =
+                            Decimal::from_unscaled_long(val, precision, scale).map_err(|e| {
+                                IllegalArgument {
+                                    message: format!(
+                                        "Failed to create decimal from unscaled long: {e}"
+                                    ),
+                                }
+                            })?;
+                        (Datum::Decimal(decimal), next)
+                    } else {
+                        // Non-compact: stored as minimal big-endian bytes
+                        let (bytes, next) = reader.read_bytes(cursor)?;
+                        let decimal = Decimal::from_unscaled_bytes(bytes, precision, scale)
+                            .map_err(|e| IllegalArgument {
+                                message: format!(
+                                    "Failed to create decimal from unscaled bytes: {e}"
+                                ),
+                            })?;
+                        (Datum::Decimal(decimal), next)
+                    }
+                }
+                DataType::Date(_) => {
+                    let (val, next) = reader.read_int(cursor)?;
+                    (Datum::Date(Date::new(val)), next)
+                }
+                DataType::Time(_) => {
+                    let (val, next) = reader.read_int(cursor)?;
+                    (Datum::Time(Time::new(val)), next)
+                }
+                DataType::Timestamp(timestamp_type) => {
+                    let precision = timestamp_type.precision();
+                    if TimestampNtz::is_compact(precision) {
+                        // Compact: a single long holding the milliseconds.
+                        let (millis, next) = reader.read_long(cursor)?;
+                        (Datum::TimestampNtz(TimestampNtz::new(millis)), next)
+                    } else {
+                        // Non-compact: milliseconds (long) followed by nanos (int).
+                        let (millis, mid) = reader.read_long(cursor)?;
+                        let (nanos, next) = reader.read_int(mid)?;
+                        let timestamp = TimestampNtz::from_millis_nanos(millis, nanos).map_err(
+                            |e| IllegalArgument {
+                                message: format!(
+                                    "Invalid nano_of_millisecond value in compacted row timestamp: {e}"
+                                ),
+                            },
+                        )?;
+                        (Datum::TimestampNtz(timestamp), next)
+                    }
+                }
+                DataType::TimestampLTz(timestamp_ltz_type) => {
+                    let precision = timestamp_ltz_type.precision();
+                    if TimestampLtz::is_compact(precision) {
+                        // Compact: a single long holding the epoch milliseconds.
+                        let (epoch_millis, next) = reader.read_long(cursor)?;
+                        (Datum::TimestampLtz(TimestampLtz::new(epoch_millis)), next)
+                    } else {
+                        // Non-compact: epoch milliseconds (long) followed by nanos (int).
+                        let (epoch_millis, mid) = reader.read_long(cursor)?;
+                        let (nanos, next) = reader.read_int(mid)?;
+                        let timestamp_ltz =
+                            TimestampLtz::from_millis_nanos(epoch_millis, nanos).map_err(|e| {
+                                IllegalArgument {
+                                    message: format!(
+                                        "Invalid nano_of_millisecond value in compacted row timestamp_ltz: {e}"
+                                    ),
+                                }
+                            })?;
+                        (Datum::TimestampLtz(timestamp_ltz), next)
+                    }
+                }
+                DataType::Array(_) => {
+                    let (bytes, next) = reader.read_bytes(cursor)?;
+                    (Datum::Array(FlussArray::from_bytes(bytes)?), next)
+                }
+                DataType::Row(row_type) => {
+                    // A nested row is a length-prefixed payload that is itself a
+                    // compacted row (null bitmap followed by its fields).
+                    let (nested_bytes, next) = reader.read_bytes(cursor)?;
+                    let nested_field_count = row_type.fields().len();
+                    // `CompactedRowReader::new` only debug-asserts its bounds and
+                    // `is_null_at` indexes the bitmap directly, so a payload that
+                    // is too short to hold the bitmap must be rejected here to
+                    // fail with an error instead of a panic.
+                    let nested_header_len = calculate_bit_set_width_in_bytes(nested_field_count);
+                    if nested_bytes.len() < nested_header_len {
+                        return Err(IllegalArgument {
+                            message: format!(
+                                "Nested row payload at column {col_pos} is too short: {} bytes, null bitmap needs {nested_header_len}",
+                                nested_bytes.len()
+                            ),
+                        });
+                    }
+                    let nested_reader = CompactedRowReader::new(
+                        nested_field_count,
+                        nested_bytes,
+                        0,
+                        nested_bytes.len(),
+                    );
+                    // Built by `build_nested_deserializers` for every ROW field.
+                    let nested_deser = self.nested[col_pos]
+                        .as_ref()
+                        .expect("ROW field must have nested deserializer");
+                    let nested_row = nested_deser.deserialize(&nested_reader)?;
+                    (Datum::Row(Box::new(nested_row)), next)
+                }
+                DataType::Map(map_type) => {
+                    let (bytes, next) = reader.read_bytes(cursor)?;
+                    let map = crate::row::binary_map::FlussMap::from_bytes(
+                        bytes,
+                        map_type.key_type(),
+                        map_type.value_type(),
+                    )?;
+                    (Datum::Map(map), next)
+                }
+            };
+            cursor = next_cursor;
+            row.set_field(col_pos, datum);
+        }
+        Ok(row)
+    }
+}
+
+// Reference implementation:
+// https://github.com/apache/fluss/blob/main/fluss-common/src/main/java/org/apache/fluss/row/compacted/CompactedRowReader.java
+/// Reader over the bytes of one serialized compacted row.
+///
+/// The reader holds no read cursor of its own: every `read_*` method takes the
+/// position to read from and returns the decoded value together with the
+/// position that follows it.
+#[allow(dead_code)]
+pub struct CompactedRowReader<'a> {
+    /// Backing bytes that contain the row.
+    segment: &'a [u8],
+    /// Index in `segment` where the row, and therefore its null bitmap, starts.
+    offset: usize,
+    /// Exclusive end of the row in `segment` (`offset + length`).
+    limit: usize,
+    /// Size in bytes of the null bitmap at the start of the row.
+    header_size_in_bytes: usize,
+}
+
+#[allow(dead_code)]
+impl<'a> CompactedRowReader<'a> {
+    /// Creates a reader over the row stored in `data[offset..offset + length]`.
+    ///
+    /// `field_count` determines the size of the null bitmap at the start of the
+    /// row. The bounds are only verified with `debug_assert!`, so callers must
+    /// pass a range that lies inside `data` and is at least as long as the null
+    /// bitmap.
+    pub fn new(field_count: usize, data: &'a [u8], offset: usize, length: usize) -> Self {
+        let header_size_in_bytes = calculate_bit_set_width_in_bytes(field_count);
+        let limit = offset + length;
+        let position = offset + header_size_in_bytes;
+        debug_assert!(limit <= data.len());
+        debug_assert!(position <= limit);
+
+        CompactedRowReader {
+            segment: data,
+            offset,
+            limit,
+            header_size_in_bytes,
+        }
+    }
+
+    /// Position of the first field payload byte, right after the null bitmap.
+    fn initial_position(&self) -> usize {
+        self.offset + self.header_size_in_bytes
+    }
+
+    /// Returns `pos + width` after verifying that it neither overflows nor
+    /// exceeds the end of the row; `context` names the value being read in the
+    /// error message.
+    fn checked_pos(&self, pos: usize, width: usize, context: &str) -> Result<usize> {
+        let next = pos.checked_add(width).ok_or_else(|| IllegalArgument {
+            message: format!("Overflow while reading {context}: pos={pos}, width={width}"),
+        })?;
+        if next > self.limit {
+            return Err(IllegalArgument {
+                message: format!(
+                    "Out-of-bounds while reading {context}: pos={pos}, width={width}, limit={}",
+                    self.limit
+                ),
+            });
+        }
+        Ok(next)
+    }
+
+    /// Returns whether the null bit of column `col_pos` is set.
+    ///
+    /// The bit lives in byte `col_pos >> 3` of the null bitmap, at bit index
+    /// `col_pos & 7`. The column index is only checked with `debug_assert!`;
+    /// indexing panics if the addressed byte lies outside the backing slice.
+    pub fn is_null_at(&self, col_pos: usize) -> bool {
+        let byte_index = col_pos >> 3;
+        let bit = col_pos & 7;
+        debug_assert!(byte_index < self.header_size_in_bytes);
+        let idx = self.offset + byte_index;
+        (self.segment[idx] & (1u8 << bit)) != 0
+    }
+
+    /// Reads one byte at `pos` and interprets any non-zero value as `true`.
+    pub fn read_boolean(&self, pos: usize) -> Result<(bool, usize)> {
+        let (val, next) = self.read_byte(pos)?;
+        Ok((val != 0, next))
+    }
+
+    /// Reads the single byte at `pos`.
+    pub fn read_byte(&self, pos: usize) -> Result<(u8, usize)> {
+        let next = self.checked_pos(pos, 1, "byte")?;
+        Ok((self.segment[pos], next))
+    }
+
+    /// Reads a 2-byte `i16` at `pos`, decoded in native byte order.
+    pub fn read_short(&self, pos: usize) -> Result<(i16, usize)> {
+        let next_pos = self.checked_pos(pos, 2, "short")?;
+        let mut arr = [0u8; 2];
+        arr.copy_from_slice(&self.segment[pos..next_pos]);
+        Ok((i16::from_ne_bytes(arr), next_pos))
+    }
+
+    /// Reads a varint-encoded `i32` at `pos` (at most
+    /// `CompactedRowWriter::MAX_INT_SIZE` bytes).
+    ///
+    /// # Errors
+    ///
+    /// Returns `IllegalArgument` when the varint cannot be decoded or when its
+    /// encoding extends past the end of the row.
+    pub fn read_int(&self, pos: usize) -> Result<(i32, usize)> {
+        let (value, next_pos) =
+            read_unsigned_varint_at(self.segment, pos, CompactedRowWriter::MAX_INT_SIZE).map_err(
+                |e| IllegalArgument {
+                    message: format!("Invalid VarInt32 input stream at pos {pos}: {e}"),
+                },
+            )?;
+        // The decoder is handed the whole backing slice, not this row's range,
+        // so the row's own end has to be enforced here, as the fixed-width
+        // reads already do.
+        self.checked_pos(pos, next_pos.saturating_sub(pos), "VarInt32")?;
+        Ok((value as i32, next_pos))
+    }
+
+    /// Reads a varint-encoded `i64` at `pos` (at most
+    /// `CompactedRowWriter::MAX_LONG_SIZE` bytes).
+    ///
+    /// # Errors
+    ///
+    /// Returns `IllegalArgument` when the varint cannot be decoded or when its
+    /// encoding extends past the end of the row.
+    pub fn read_long(&self, pos: usize) -> Result<(i64, usize)> {
+        let (value, next_pos) =
+            read_unsigned_varint_u64_at(self.segment, pos, CompactedRowWriter::MAX_LONG_SIZE)
+                .map_err(|e| IllegalArgument {
+                    message: format!("Invalid VarInt64 input stream at pos {pos}: {e}"),
+                })?;
+        // Same bounds enforcement as in `read_int`: the decoder is not aware
+        // of this row's limit.
+        self.checked_pos(pos, next_pos.saturating_sub(pos), "VarInt64")?;
+        Ok((value as i64, next_pos))
+    }
+
+    /// Reads a 4-byte `f32` at `pos`, decoded in native byte order.
+    pub fn read_float(&self, pos: usize) -> Result<(f32, usize)> {
+        let next_pos = self.checked_pos(pos, 4, "float")?;
+        let mut arr = [0u8; 4];
+        arr.copy_from_slice(&self.segment[pos..next_pos]);
+        Ok((f32::from_ne_bytes(arr), next_pos))
+    }
+
+    /// Reads an 8-byte `f64` at `pos`, decoded in native byte order.
+    pub fn read_double(&self, pos: usize) -> Result<(f64, usize)> {
+        let next_pos = self.checked_pos(pos, 8, "double")?;
+        let mut arr = [0u8; 8];
+        arr.copy_from_slice(&self.segment[pos..next_pos]);
+        Ok((f64::from_ne_bytes(arr), next_pos))
+    }
+
+    /// Alias for [`Self::read_bytes`].
+    pub fn read_binary(&self, pos: usize) -> Result<(&'a [u8], usize)> {
+        self.read_bytes(pos)
+    }
+
+    /// Reads a length-prefixed byte payload at `pos`: a varint length followed
+    /// by that many bytes. The returned slice borrows from the backing data.
+    ///
+    /// # Errors
+    ///
+    /// Returns `IllegalArgument` when the length is negative or the payload
+    /// would extend past the end of the row.
+    pub fn read_bytes(&self, pos: usize) -> Result<(&'a [u8], usize)> {
+        let (len, data_pos) = self.read_int(pos)?;
+        let len = usize::try_from(len).map_err(|_| IllegalArgument {
+            message: format!("Negative length while reading bytes at pos {pos}: {len}"),
+        })?;
+        let next_pos = self.checked_pos(data_pos, len, "bytes payload")?;
+        Ok((&self.segment[data_pos..next_pos], next_pos))
+    }
+
+    /// Reads a length-prefixed payload at `pos` and validates it as UTF-8.
+    pub fn read_string(&self, pos: usize) -> Result<(&'a str, usize)> {
+        let (bytes, next_pos) = self.read_bytes(pos)?;
+        let s = from_utf8(bytes).map_err(|e| IllegalArgument {
+            message: format!("Invalid UTF-8 when reading string at pos {pos}: {e}"),
+        })?;
+        Ok((s, next_pos))
+    }
+}
+
+#[cfg(test)]
+mod row_type_tests {
+    use crate::metadata::{DataType, DataTypes, RowType};
+    use crate::row::binary::ValueWriter;
+    use crate::row::compacted::compacted_row_reader::{
+        CompactedRowDeserializer, CompactedRowReader,
+    };
+    use crate::row::compacted::compacted_row_writer::CompactedRowWriter;
+    use crate::row::datum::{Date, Time, TimestampLtz, TimestampNtz};
+    use crate::row::field_getter::FieldGetter;
+    use crate::row::{Datum, GenericRow, InternalRow};
+
+    /// Serializes `outer_row` with a `CompactedRowWriter`, decodes the bytes
+    /// through `CompactedRowDeserializer`, and hands the decoded row to `verify`.
+    fn round_trip<F>(outer_row_type: &RowType, outer_row: &GenericRow, verify: F)
+    where
+        F: FnOnce(&GenericRow),
+    {
+        let field_count = outer_row_type.fields().len();
+
+        // Encode: one getter/value-writer pair per column, applied in column order.
+        let getters = FieldGetter::create_field_getters(outer_row_type);
+        let encoders: Vec<ValueWriter> = outer_row_type
+            .fields()
+            .iter()
+            .map(|field| ValueWriter::create_value_writer(field.data_type(), None).unwrap())
+            .collect();
+        let mut row_writer = CompactedRowWriter::new(field_count);
+        for (col, (getter, encoder)) in getters.iter().zip(encoders.iter()).enumerate() {
+            let datum = getter.get_field(outer_row as &dyn InternalRow).unwrap();
+            encoder.write_value(&mut row_writer, col, &datum).unwrap();
+        }
+        let encoded = row_writer.to_bytes();
+
+        // Decode the whole buffer as a single row.
+        let deserializer = CompactedRowDeserializer::new(outer_row_type);
+        let reader = CompactedRowReader::new(field_count, encoded.as_ref(), 0, encoded.len());
+        let decoded = deserializer.deserialize(&reader).expect("deserialize");
+        verify(&decoded);
+    }
+
+    /// A ROW column nested one level deep keeps its INT and STRING fields.
+    #[test]
+    fn test_row_simple_nesting() {
+        let nested_type = RowType::with_data_types_and_field_names(
+            vec![DataTypes::int(), DataTypes::string()],
+            vec!["x", "label"],
+        );
+        let top_type = RowType::with_data_types_and_field_names(
+            vec![DataTypes::int(), DataType::Row(nested_type.clone())],
+            vec!["id", "nested"],
+        );
+
+        let mut nested_row = GenericRow::new(2);
+        nested_row.set_field(0, 42_i32);
+        nested_row.set_field(1, "hello");
+
+        let mut top_row = GenericRow::new(2);
+        top_row.set_field(0, 1_i32);
+        top_row.set_field(1, Datum::Row(Box::new(nested_row)));
+
+        round_trip(&top_type, &top_row, |decoded| {
+            assert_eq!(decoded.get_int(0).unwrap(), 1);
+            let decoded_nested = decoded.get_row(1).unwrap();
+            assert_eq!(decoded_nested.get_int(0).unwrap(), 42);
+            assert_eq!(decoded_nested.get_string(1).unwrap(), "hello");
+        });
+    }
+
+    /// A ROW inside a ROW inside a ROW round-trips down to the innermost INT.
+    #[test]
+    fn test_row_deep_nesting() {
+        let level3_type =
+            RowType::with_data_types_and_field_names(vec![DataTypes::int()], vec!["n"]);
+        let level2_type = RowType::with_data_types_and_field_names(
+            vec![DataType::Row(level3_type.clone())],
+            vec!["inner"],
+        );
+        let level1_type = RowType::with_data_types_and_field_names(
+            vec![DataType::Row(level2_type.clone())],
+            vec!["outer"],
+        );
+
+        let mut level3_row = GenericRow::new(1);
+        level3_row.set_field(0, 99_i32);
+
+        let mut level2_row = GenericRow::new(1);
+        level2_row.set_field(0, Datum::Row(Box::new(level3_row)));
+
+        let mut level1_row = GenericRow::new(1);
+        level1_row.set_field(0, Datum::Row(Box::new(level2_row)));
+
+        round_trip(&level1_type, &level1_row, |decoded| {
+            let decoded_level2 = decoded.get_row(0).unwrap();
+            let decoded_level3 = decoded_level2.get_row(0).unwrap();
+            assert_eq!(decoded_level3.get_int(0).unwrap(), 99);
+        });
+    }
+
+    /// Covers null handling at both levels: a null field inside a non-null
+    /// nested row, and a nested ROW column that is itself null.
+    #[test]
+    fn test_row_with_nullable_fields() {
+        let nested_type = RowType::with_data_types_and_field_names(
+            vec![DataTypes::int(), DataTypes::string()],
+            vec!["id", "optional_name"],
+        );
+        let top_type = RowType::with_data_types_and_field_names(
+            vec![DataTypes::int(), DataType::Row(nested_type.clone())],
+            vec!["k", "nested"],
+        );
+
+        // Case 1: the nested row is present, but its STRING field is null.
+        let mut nested_row = GenericRow::new(2);
+        nested_row.set_field(0, 7_i32);
+        nested_row.set_field(1, Datum::Null);
+
+        let mut row_with_nested = GenericRow::new(2);
+        row_with_nested.set_field(0, 10_i32);
+        row_with_nested.set_field(1, Datum::Row(Box::new(nested_row)));
+
+        round_trip(&top_type, &row_with_nested, |decoded| {
+            assert_eq!(decoded.get_int(0).unwrap(), 10);
+            let decoded_nested = decoded.get_row(1).unwrap();
+            assert_eq!(decoded_nested.get_int(0).unwrap(), 7);
+            assert!(decoded_nested.is_null_at(1).unwrap());
+        });
+
+        // Case 2: the ROW column of the outer row is null.
+        let mut row_without_nested = GenericRow::new(2);
+        row_without_nested.set_field(0, 20_i32);
+        row_without_nested.set_field(1, Datum::Null);
+
+        round_trip(&top_type, &row_without_nested, |decoded| {
+            assert_eq!(decoded.get_int(0).unwrap(), 20);
+            assert!(decoded.is_null_at(1).unwrap());
+        });
+    }
+
+    /// A nested row with one field of each listed primitive type round-trips
+    /// with every value intact.
+    #[test]
+    fn test_row_all_primitives_round_trip() {
+        let nested_type = RowType::with_data_types_and_field_names(
+            vec![
+                DataTypes::boolean(),
+                DataTypes::tinyint(),
+                DataTypes::smallint(),
+                DataTypes::int(),
+                DataTypes::bigint(),
+                DataTypes::float(),
+                DataTypes::double(),
+                DataTypes::string(),
+                DataTypes::bytes(),
+                DataTypes::date(),
+                DataTypes::time(),
+                DataTypes::timestamp(),
+                DataTypes::timestamp_ltz(),
+            ],
+            vec![
+                "b", "tin", "sm", "i", "lo", "fl", "db", "str", "by", "dt", "ti", "tsn", "tsl",
+            ],
+        );
+        let top_type = RowType::with_data_types_and_field_names(
+            vec![DataType::Row(nested_type.clone())],
+            vec!["nested"],
+        );
+
+        // One value per field, in the same order as the type list above.
+        let mut nested_row = GenericRow::new(13);
+        nested_row.set_field(0, true);
+        nested_row.set_field(1, 7_i8);
+        nested_row.set_field(2, -42_i16);
+        nested_row.set_field(3, 100_000_i32);
+        nested_row.set_field(4, 9_876_543_210_i64);
+        nested_row.set_field(5, std::f32::consts::PI);
+        nested_row.set_field(6, std::f64::consts::E);
+        nested_row.set_field(7, "hello world");
+        nested_row.set_field(8, b"binary".as_slice());
+        nested_row.set_field(9, Datum::Date(Date::new(20476)));
+        nested_row.set_field(10, Datum::Time(Time::new(36_827_123)));
+        nested_row.set_field(
+            11,
+            Datum::TimestampNtz(TimestampNtz::new(1_769_163_227_123)),
+        );
+        nested_row.set_field(
+            12,
+            Datum::TimestampLtz(TimestampLtz::new(1_769_163_227_123)),
+        );
+
+        let mut top_row = GenericRow::new(1);
+        top_row.set_field(0, Datum::Row(Box::new(nested_row)));
+
+        round_trip(&top_type, &top_row, |decoded| {
+            let values = decoded.get_row(0).unwrap();
+            assert!(values.get_boolean(0).unwrap());
+            assert_eq!(values.get_byte(1).unwrap(), 7);
+            assert_eq!(values.get_short(2).unwrap(), -42);
+            assert_eq!(values.get_int(3).unwrap(), 100_000);
+            assert_eq!(values.get_long(4).unwrap(), 9_876_543_210);
+            assert!((values.get_float(5).unwrap() - std::f32::consts::PI).abs() < f32::EPSILON);
+            assert!((values.get_double(6).unwrap() - std::f64::consts::E).abs() < f64::EPSILON);
+            assert_eq!(values.get_string(7).unwrap(), "hello world");
+            assert_eq!(values.get_bytes(8).unwrap(), b"binary");
+            assert_eq!(values.get_date(9).unwrap().get_inner(), 20476);
+            assert_eq!(values.get_time(10).unwrap().get_inner(), 36_827_123);
+            assert_eq!(
+                values.get_timestamp_ntz(11, 6).unwrap().get_millisecond(),
+                1_769_163_227_123,
+            );
+            assert_eq!(
+                values
+                    .get_timestamp_ltz(12, 6)
+                    .unwrap()
+                    .get_epoch_millisecond(),
+                1_769_163_227_123,
+            );
+        });
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/row/compacted/compacted_row_writer.rs b/fluss-rust/crates/fluss/src/row/compacted/compacted_row_writer.rs
new file mode 100644
index 0000000000..2af8767f0c
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/row/compacted/compacted_row_writer.rs
@@ -0,0 +1,277 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::row::Decimal;
+use crate::row::binary::BinaryWriter;
+use crate::row::binary_array::FlussArray;
+use crate::row::binary_map::FlussMap;
+use crate::row::compacted::compacted_row::calculate_bit_set_width_in_bytes;
+use crate::util::varint::{write_unsigned_varint_to_slice, write_unsigned_varint_u64_to_slice};
+use bytes::{Bytes, BytesMut};
+use std::cmp;
+
+// Writer for CompactedRow
+// Reference implementation:
+// https://github.com/apache/fluss/blob/d4a72fad240d4b81563aaf83fa3b09b5058674ed/fluss-common/src/main/java/org/apache/fluss/row/compacted/CompactedRowWriter.java#L71
+/// Growable byte buffer that serializes one compacted row: a null-bit header
+/// followed by the field payloads appended in write order.
+#[allow(dead_code)]
+pub struct CompactedRowWriter {
+    // Width of the leading null-bit header, in bytes; fixed at construction.
+    header_size_in_bytes: usize,
+    // Offset in `buffer` where the next write lands; starts right after the header.
+    position: usize,
+    // Zero-filled backing storage; only `buffer[..position]` is row content.
+    buffer: BytesMut,
+}
+
+#[allow(dead_code)]
+impl CompactedRowWriter {
+    /// Bytes reserved in the buffer before a varint-encoded `i32` is written.
+    pub const MAX_INT_SIZE: usize = 5;
+    /// Bytes reserved in the buffer before a varint-encoded `i64` is written.
+    pub const MAX_LONG_SIZE: usize = 10;
+
+    /// Creates a writer for rows of `field_count` fields.
+    ///
+    /// The buffer starts zero-filled with at least 64 bytes (or the header
+    /// size, if larger) and the cursor sits immediately after the header.
+    pub fn new(field_count: usize) -> Self {
+        let header_size_in_bytes = calculate_bit_set_width_in_bytes(field_count);
+        let initial_len = header_size_in_bytes.max(64);
+
+        let mut buffer = BytesMut::with_capacity(initial_len);
+        buffer.resize(initial_len, 0);
+
+        Self {
+            header_size_in_bytes,
+            position: header_size_in_bytes,
+            buffer,
+        }
+    }
+
+    /// Current write offset, i.e. the number of bytes of row content so far
+    /// (header included).
+    pub fn position(&self) -> usize {
+        self.position
+    }
+
+    /// Borrows the bytes written so far (header included).
+    pub fn buffer(&self) -> &[u8] {
+        let written = self.position;
+        &self.buffer[..written]
+    }
+
+    /// Copies the bytes written so far into a new `Bytes`; the writer is left untouched.
+    pub fn to_bytes(&self) -> Bytes {
+        Bytes::copy_from_slice(self.buffer())
+    }
+
+    /// Splits the written bytes off the internal `BytesMut` and returns them as
+    /// frozen `Bytes`, then resets the writer so it can encode another row.
+    pub fn flush_bytes(&mut self) -> Bytes {
+        let header_len = self.header_size_in_bytes;
+        let flushed = self.buffer.split_to(self.position);
+        self.position = header_len;
+        if self.buffer.len() >= header_len {
+            // Enough bytes remain for a header: just clear the null bits.
+            self.buffer[..header_len].fill(0);
+        } else {
+            // The remainder cannot even hold the header: grow it back, zero-filled.
+            self.buffer.resize(header_len.max(64), 0);
+        }
+        flushed.freeze()
+    }
+
+    /// Grows the buffer (zero-filled) until at least `need_len` bytes are free
+    /// past the cursor; the length at least doubles whenever it grows.
+    fn ensure_capacity(&mut self, need_len: usize) {
+        let current_len = self.buffer.len();
+        let available = current_len - self.position;
+        if available >= need_len {
+            return;
+        }
+        let grown_len = cmp::max(current_len * 2, current_len + need_len);
+        self.buffer.resize(grown_len, 0);
+    }
+
+    /// Appends `src` verbatim at the cursor and advances the cursor past it.
+    fn write_raw(&mut self, src: &[u8]) {
+        let start = self.position;
+        let end = start + src.len();
+        self.ensure_capacity(src.len());
+        self.buffer[start..end].copy_from_slice(src);
+        self.position = end;
+    }
+}
+
+impl BinaryWriter for CompactedRowWriter {
+    /// Clears the null-bit header and rewinds the cursor to just past it.
+    fn reset(&mut self) {
+        let header_len = self.header_size_in_bytes;
+        self.buffer[..header_len].fill(0);
+        self.position = header_len;
+    }
+
+    /// Marks field `pos` as null by setting its bit in the header.
+    fn set_null_at(&mut self, pos: usize) {
+        let (byte_index, bit) = (pos / 8, pos % 8);
+        debug_assert!(byte_index < self.header_size_in_bytes);
+        self.buffer[byte_index] |= 1u8 << bit;
+    }
+
+    /// Writes a boolean as a single byte: 1 for `true`, 0 for `false`.
+    fn write_boolean(&mut self, value: bool) {
+        self.write_raw(&[u8::from(value)])
+    }
+
+    /// Writes a single raw byte.
+    fn write_byte(&mut self, value: u8) {
+        self.write_raw(&[value])
+    }
+
+    /// Writes a varint length prefix followed by the raw bytes.
+    ///
+    /// Panics if the slice length does not fit in an `i32`.
+    fn write_bytes(&mut self, value: &[u8]) {
+        let encoded_len = i32::try_from(value.len())
+            .expect("Byte slice too large to encode length as i32: exceeds i32::MAX");
+        self.write_int(encoded_len);
+        self.write_raw(value)
+    }
+
+    fn write_char(&mut self, value: &str, _length: usize) {
+        // TODO: CHAR(length) is currently encoded exactly like STRING. The length
+        //  information could be omitted, and the byte length should be enforced
+        //  in the future.
+        self.write_string(value)
+    }
+
+    /// Writes the UTF-8 bytes of `value` with a varint length prefix.
+    fn write_string(&mut self, value: &str) {
+        self.write_bytes(value.as_bytes())
+    }
+
+    fn write_short(&mut self, value: i16) {
+        // Use native endianness to match Java's UnsafeUtils.putShort behavior
+        // Java uses sun.misc.Unsafe which writes in native byte order (typically LE on x86/ARM)
+        let raw = value.to_ne_bytes();
+        self.write_raw(&raw)
+    }
+
+    /// Writes `value` as an unsigned varint of its `u32` bit pattern.
+    fn write_int(&mut self, value: i32) {
+        self.ensure_capacity(Self::MAX_INT_SIZE);
+        let start = self.position;
+        let written = write_unsigned_varint_to_slice(value as u32, &mut self.buffer[start..]);
+        self.position = start + written;
+    }
+
+    /// Writes `value` as an unsigned varint of its `u64` bit pattern.
+    fn write_long(&mut self, value: i64) {
+        self.ensure_capacity(Self::MAX_LONG_SIZE);
+        let start = self.position;
+        let written = write_unsigned_varint_u64_to_slice(value as u64, &mut self.buffer[start..]);
+        self.position = start + written;
+    }
+
+    fn write_float(&mut self, value: f32) {
+        // Use native endianness to match Java's UnsafeUtils.putFloat behavior
+        let raw = value.to_ne_bytes();
+        self.write_raw(&raw)
+    }
+
+    fn write_double(&mut self, value: f64) {
+        // Use native endianness to match Java's UnsafeUtils.putDouble behavior
+        let raw = value.to_ne_bytes();
+        self.write_raw(&raw)
+    }
+
+    fn write_binary(&mut self, bytes: &[u8], length: usize) {
+        // TODO: BINARY(length) is currently encoded exactly like BYTES. The length
+        //  information could be omitted, and the byte length should be enforced
+        //  in the future.
+        // At most `length` bytes are written; shorter inputs are written whole.
+        let take = length.min(bytes.len());
+        self.write_bytes(&bytes[..take])
+    }
+
+    /// Writes the array's serialized bytes with a varint length prefix.
+    fn write_array(&mut self, value: &FlussArray) {
+        self.write_bytes(value.as_bytes())
+    }
+
+    /// Writes the map's serialized bytes with a varint length prefix.
+    fn write_map(&mut self, value: &FlussMap) {
+        self.write_bytes(value.as_bytes())
+    }
+
+    fn complete(&mut self) {
+        // do nothing
+    }
+
+    /// Writes a decimal either as a varint long (compact precision) or as
+    /// length-prefixed unscaled bytes.
+    fn write_decimal(&mut self, value: &Decimal, precision: u32) {
+        // Decimal is already validated and rescaled during construction.
+        // Just serialize the precomputed unscaled representation.
+        if !Decimal::is_compact_precision(precision) {
+            self.write_bytes(&value.to_unscaled_bytes());
+            return;
+        }
+        let unscaled = value
+            .to_unscaled_long()
+            .expect("Decimal should fit in i64 for compact precision");
+        self.write_long(unscaled)
+    }
+
+    fn write_time(&mut self, value: i32, _precision: u32) {
+        // TIME is always encoded as i32 (milliseconds since midnight) regardless of precision
+        self.write_int(value)
+    }
+
+    /// Writes the millisecond part always; the nano-of-millisecond part is
+    /// appended only for non-compact precisions.
+    fn write_timestamp_ntz(&mut self, value: &crate::row::datum::TimestampNtz, precision: u32) {
+        let compact = crate::row::datum::TimestampNtz::is_compact(precision);
+        self.write_long(value.get_millisecond());
+        if !compact {
+            // Non-compact: milliseconds are followed by nanoOfMillisecond
+            self.write_int(value.get_nano_of_millisecond());
+        }
+    }
+
+    /// Writes the epoch-millisecond part always; the nano-of-millisecond part
+    /// is appended only for non-compact precisions.
+    fn write_timestamp_ltz(&mut self, value: &crate::row::datum::TimestampLtz, precision: u32) {
+        let compact = crate::row::datum::TimestampLtz::is_compact(precision);
+        self.write_long(value.get_epoch_millisecond());
+        if !compact {
+            // Non-compact: epoch milliseconds are followed by nanoOfMillisecond
+            self.write_int(value.get_nano_of_millisecond());
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use bigdecimal::{BigDecimal, num_bigint::BigInt};
+
+    /// Decodes the varint-encoded long stored immediately after the header.
+    fn first_long(writer: &CompactedRowWriter) -> i64 {
+        let (raw, _) = crate::util::varint::read_unsigned_varint_u64_at(
+            writer.buffer(),
+            writer.header_size_in_bytes,
+            CompactedRowWriter::MAX_LONG_SIZE,
+        )
+        .unwrap();
+        raw as i64
+    }
+
+    #[test]
+    fn test_write_decimal_compact() {
+        // Compact decimal (precision <= 18): 123.45 is stored as unscaled 12345
+        let source = BigDecimal::new(BigInt::from(12345), 2);
+        let decimal = Decimal::from_big_decimal(source, 10, 2).unwrap();
+
+        let mut writer = CompactedRowWriter::new(1);
+        writer.write_decimal(&decimal, 10);
+
+        assert_eq!(first_long(&writer), 12345);
+    }
+
+    #[test]
+    fn test_write_decimal_rounding() {
+        // Test HALF_UP rounding: 12.345 → 12.35
+        let source = BigDecimal::new(BigInt::from(12345), 3);
+        let decimal = Decimal::from_big_decimal(source, 10, 2).unwrap();
+
+        let mut writer = CompactedRowWriter::new(1);
+        writer.write_decimal(&decimal, 10);
+
+        // 12.35 with scale 2
+        assert_eq!(first_long(&writer), 1235);
+    }
+
+    #[test]
+    fn test_write_decimal_non_compact() {
+        // Non-compact (precision > 18): uses byte array
+        let source = BigDecimal::new(BigInt::from(12345), 0);
+        let decimal = Decimal::from_big_decimal(source, 28, 0).unwrap();
+
+        let mut writer = CompactedRowWriter::new(1);
+        writer.write_decimal(&decimal, 28);
+
+        // Verify something was written (at least length varint + some bytes)
+        let header_len = writer.header_size_in_bytes;
+        assert!(writer.position() > header_len);
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/row/compacted/mod.rs b/fluss-rust/crates/fluss/src/row/compacted/mod.rs
new file mode 100644
index 0000000000..fa603d23b2
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/row/compacted/mod.rs
@@ -0,0 +1,30 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+mod compacted_key_writer;
+
+mod compacted_row;
+mod compacted_row_reader;
+mod compacted_row_writer;
+
+pub use compacted_key_writer::CompactedKeyWriter;
+#[allow(unused_imports)]
+pub use compacted_row::{CompactedRow, calculate_bit_set_width_in_bytes};
+#[allow(unused_imports)]
+pub use compacted_row_reader::{CompactedRowDeserializer, CompactedRowReader};
+#[allow(unused_imports)]
+pub use compacted_row_writer::CompactedRowWriter;
diff --git a/fluss-rust/crates/fluss/src/row/datum.rs b/fluss-rust/crates/fluss/src/row/datum.rs
new file mode 100644
index 0000000000..e6a67394b1
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/row/datum.rs
@@ -0,0 +1,1588 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::error::Error::{IllegalArgument, RowConvertError};
+use crate::error::Result;
+use crate::metadata::{DataType, RowType};
+use crate::row::Decimal;
+use crate::row::GenericRow;
+use crate::row::InternalRow;
+use crate::row::binary_array::FlussArray;
+use crate::row::binary_map::FlussMap;
+use crate::row::field_getter::FieldGetter;
+use arrow::array::{
+    ArrayBuilder, BinaryBuilder, BooleanBuilder, Date32Builder, Decimal128Builder,
+    FixedSizeBinaryBuilder, Float32Builder, Float64Builder, Int8Builder, Int16Builder,
+    Int32Builder, Int64Builder, ListBuilder, MapBuilder, StringBuilder, StructBuilder,
+    Time32MillisecondBuilder, Time32SecondBuilder, Time64MicrosecondBuilder,
+    Time64NanosecondBuilder, TimestampMicrosecondBuilder, TimestampMillisecondBuilder,
+    TimestampNanosecondBuilder, TimestampSecondBuilder,
+};
+use arrow::datatypes as arrow_schema;
+use arrow::error::ArrowError;
+use jiff::ToSpan;
+use ordered_float::OrderedFloat;
+use parse_display::Display;
+use serde::Serialize;
+use std::borrow::Cow;
+
+// 946_684_800 seconds (30 * 365 days plus 7 leap days = 10_957 days) expressed
+// in microseconds.
+#[allow(dead_code)]
+const THIRTY_YEARS_MICROSECONDS: i64 = 946_684_800_000_000;
+
+/// A single field value of a row.
+///
+/// `String` and `Blob` wrap `Cow`s that may borrow for `'a`; `Row` nests a
+/// boxed `GenericRow<'a>`. The `#[display(..)]` attributes define the text
+/// produced by the derived `Display` implementation for each variant.
+#[derive(Debug, Clone, Display, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize)]
+pub enum Datum<'a> {
+    /// Absent value; displayed as `null`.
+    #[display("null")]
+    Null,
+    #[display("{0}")]
+    Bool(bool),
+    #[display("{0}")]
+    Int8(i8),
+    #[display("{0}")]
+    Int16(i16),
+    #[display("{0}")]
+    Int32(i32),
+    #[display("{0}")]
+    Int64(i64),
+    #[display("{0}")]
+    Float32(F32),
+    #[display("{0}")]
+    Float64(F64),
+    /// Text value; displayed wrapped in single quotes.
+    #[display("'{0}'")]
+    String(Str<'a>),
+    /// Binary value; displayed through the payload's `Debug` output.
+    // Use the positional `{0:?}` form, as `Row` does, so that the payload
+    // (field 0) is what gets formatted.
+    #[display("{0:?}")]
+    Blob(Blob<'a>),
+    #[display("{0}")]
+    Decimal(Decimal),
+    #[display("{0}")]
+    Date(Date),
+    #[display("{0}")]
+    Time(Time),
+    #[display("{0}")]
+    TimestampNtz(TimestampNtz),
+    #[display("{0}")]
+    TimestampLtz(TimestampLtz),
+    #[display("{0}")]
+    Array(FlussArray),
+    #[display("{0}")]
+    Map(FlussMap),
+    /// Nested row; displayed through the row's `Debug` output.
+    #[display("{0:?}")]
+    Row(Box<GenericRow<'a>>),
+}
+
+/// Variant checks and panicking accessors: every `as_*` method returns the
+/// payload of the matching variant and panics for any other variant.
+impl Datum<'_> {
+    /// Returns `true` only for `Datum::Null`.
+    pub fn is_null(&self) -> bool {
+        match self {
+            Datum::Null => true,
+            _ => false,
+        }
+    }
+
+    /// Borrows the text of a `String` datum; panics otherwise.
+    pub fn as_str(&self) -> &str {
+        if let Self::String(text) = self {
+            return text;
+        }
+        panic!("not a string: {self:?}")
+    }
+
+    /// Borrows the bytes of a `Blob` datum; panics otherwise.
+    pub fn as_blob(&self) -> &[u8] {
+        if let Self::Blob(bytes) = self {
+            return bytes.as_ref();
+        }
+        panic!("not a blob: {self:?}")
+    }
+
+    /// Borrows the payload of a `Decimal` datum; panics otherwise.
+    pub fn as_decimal(&self) -> &Decimal {
+        if let Self::Decimal(decimal) = self {
+            return decimal;
+        }
+        panic!("not a decimal: {self:?}")
+    }
+
+    /// Copies out the payload of a `Date` datum; panics otherwise.
+    pub fn as_date(&self) -> Date {
+        if let Self::Date(date) = self {
+            return *date;
+        }
+        panic!("not a date: {self:?}")
+    }
+
+    /// Copies out the payload of a `Time` datum; panics otherwise.
+    pub fn as_time(&self) -> Time {
+        if let Self::Time(time) = self {
+            return *time;
+        }
+        panic!("not a time: {self:?}")
+    }
+
+    /// Copies out the payload of a `TimestampNtz` datum; panics otherwise.
+    pub fn as_timestamp_ntz(&self) -> TimestampNtz {
+        if let Self::TimestampNtz(timestamp) = self {
+            return *timestamp;
+        }
+        panic!("not a timestamp ntz: {self:?}")
+    }
+
+    /// Copies out the payload of a `TimestampLtz` datum; panics otherwise.
+    pub fn as_timestamp_ltz(&self) -> TimestampLtz {
+        if let Self::TimestampLtz(timestamp) = self {
+            return *timestamp;
+        }
+        panic!("not a timestamp ltz: {self:?}")
+    }
+
+    /// Borrows the payload of an `Array` datum; panics otherwise.
+    pub fn as_array(&self) -> &FlussArray {
+        if let Self::Array(array) = self {
+            return array;
+        }
+        panic!("not an array: {self:?}")
+    }
+
+    /// Returns `true` only for `Datum::Map`.
+    pub fn is_map(&self) -> bool {
+        match self {
+            Datum::Map(_) => true,
+            _ => false,
+        }
+    }
+
+    /// Borrows the payload of a `Map` datum; panics otherwise.
+    pub fn as_map(&self) -> &FlussMap {
+        if let Self::Map(map) = self {
+            return map;
+        }
+        panic!("not a map: {self:?}")
+    }
+
+    /// Borrows the nested row of a `Row` datum; panics otherwise.
+    pub fn as_row(&self) -> &GenericRow<'_> {
+        if let Self::Row(boxed) = self {
+            return boxed.as_ref();
+        }
+        panic!("not a row: {self:?}")
+    }
+}
+
+impl<'a> Datum<'a> {
+    /// Converts this datum into one that borrows nothing (`Datum<'static>`).
+    ///
+    /// `String` and `Blob` payloads are turned into owned `Cow`s, nested rows
+    /// are converted through their own `into_owned`, and every other variant
+    /// is rebuilt around the same payload.
+    pub fn into_owned(self) -> Datum<'static> {
+        match self {
+            // Borrow-capable payloads: force ownership.
+            Self::String(text) => Datum::String(Cow::Owned(text.into_owned())),
+            Self::Blob(bytes) => Datum::Blob(Cow::Owned(bytes.into_owned())),
+            Self::Row(row) => Datum::Row(Box::new(row.into_owned())),
+
+            // Everything else is moved across unchanged.
+            Self::Null => Datum::Null,
+            Self::Bool(value) => Datum::Bool(value),
+            Self::Int8(value) => Datum::Int8(value),
+            Self::Int16(value) => Datum::Int16(value),
+            Self::Int32(value) => Datum::Int32(value),
+            Self::Int64(value) => Datum::Int64(value),
+            Self::Float32(value) => Datum::Float32(value),
+            Self::Float64(value) => Datum::Float64(value),
+            Self::Decimal(value) => Datum::Decimal(value),
+            Self::Date(value) => Datum::Date(value),
+            Self::Time(value) => Datum::Time(value),
+            Self::TimestampNtz(value) => Datum::TimestampNtz(value),
+            Self::TimestampLtz(value) => Datum::TimestampLtz(value),
+            Self::Array(value) => Datum::Array(value),
+            Self::Map(value) => Datum::Map(value),
+        }
+    }
+}
+
+// ----------- implement from
+impl<'a> From<i32> for Datum<'a> {
+    /// Wraps an `i32` as `Datum::Int32`.
+    #[inline]
+    fn from(value: i32) -> Self {
+        Self::Int32(value)
+    }
+}
+
+impl<'a> From<i64> for Datum<'a> {
+    /// Wraps an `i64` as `Datum::Int64`.
+    #[inline]
+    fn from(value: i64) -> Self {
+        Self::Int64(value)
+    }
+}
+
+impl<'a> From<i8> for Datum<'a> {
+    /// Wraps an `i8` as `Datum::Int8`.
+    #[inline]
+    fn from(value: i8) -> Self {
+        Self::Int8(value)
+    }
+}
+
+impl<'a> From<i16> for Datum<'a> {
+    /// Wraps an `i16` as `Datum::Int16`.
+    #[inline]
+    fn from(value: i16) -> Self {
+        Self::Int16(value)
+    }
+}
+
+pub type Str<'a> = Cow<'a, str>;
+
+impl<'a> From<String> for Datum<'a> {
+    /// Takes ownership of the `String` and stores it as an owned `Datum::String`.
+    #[inline]
+    fn from(text: String) -> Self {
+        let owned: Str<'a> = Cow::Owned(text);
+        Self::String(owned)
+    }
+}
+
+impl<'a> From<&'a str> for Datum<'a> {
+    /// Stores the string slice as a borrowed `Datum::String` without copying.
+    #[inline]
+    fn from(text: &'a str) -> Self {
+        let borrowed: Str<'a> = Cow::Borrowed(text);
+        Self::String(borrowed)
+    }
+}
+
+impl From<Option<&()>> for Datum<'_> {
+    /// Always yields `Datum::Null`; the argument is ignored.
+    fn from(_ignored: Option<&()>) -> Self {
+        Datum::Null
+    }
+}
+
+impl<'a> From<f32> for Datum<'a> {
+    /// Converts the `f32` into the `F32` wrapper and stores it as `Datum::Float32`.
+    #[inline]
+    fn from(value: f32) -> Self {
+        let wrapped = F32::from(value);
+        Self::Float32(wrapped)
+    }
+}
+
+impl<'a> From<f64> for Datum<'a> {
+    /// Converts the `f64` into the `F64` wrapper and stores it as `Datum::Float64`.
+    #[inline]
+    fn from(value: f64) -> Self {
+        let wrapped = F64::from(value);
+        Self::Float64(wrapped)
+    }
+}
+
+impl TryFrom<&Datum<'_>> for i32 {
+    type Error = ();
+
+    /// Extracts the payload of `Datum::Int32`; any other variant is `Err(())`.
+    #[inline]
+    fn try_from(from: &Datum) -> std::result::Result<Self, Self::Error> {
+        if let Datum::Int32(value) = from {
+            Ok(*value)
+        } else {
+            Err(())
+        }
+    }
+}
+
+impl TryFrom<&Datum<'_>> for i16 {
+    type Error = ();
+
+    /// Extracts the payload of `Datum::Int16`; any other variant is `Err(())`.
+    #[inline]
+    fn try_from(from: &Datum) -> std::result::Result<Self, Self::Error> {
+        if let Datum::Int16(value) = from {
+            Ok(*value)
+        } else {
+            Err(())
+        }
+    }
+}
+
+impl TryFrom<&Datum<'_>> for i64 {
+    type Error = ();
+
+    /// Extracts the payload of `Datum::Int64`; any other variant is `Err(())`.
+    #[inline]
+    fn try_from(from: &Datum) -> std::result::Result<Self, Self::Error> {
+        if let Datum::Int64(value) = from {
+            Ok(*value)
+        } else {
+            Err(())
+        }
+    }
+}
+
+impl TryFrom<&Datum<'_>> for f32 {
+    type Error = ();
+
+    /// Unwraps the float held by `Datum::Float32`; any other variant is `Err(())`.
+    #[inline]
+    fn try_from(from: &Datum) -> std::result::Result<Self, Self::Error> {
+        if let Datum::Float32(wrapped) = from {
+            Ok(wrapped.into_inner())
+        } else {
+            Err(())
+        }
+    }
+}
+
+impl TryFrom<&Datum<'_>> for f64 {
+    type Error = ();
+
+    /// Unwraps the float held by `Datum::Float64`; any other variant is `Err(())`.
+    #[inline]
+    fn try_from(from: &Datum) -> std::result::Result<Self, Self::Error> {
+        if let Datum::Float64(wrapped) = from {
+            Ok(wrapped.into_inner())
+        } else {
+            Err(())
+        }
+    }
+}
+
+impl TryFrom<&Datum<'_>> for bool {
+    type Error = ();
+
+    /// Extracts the payload of `Datum::Bool`; any other variant is `Err(())`.
+    #[inline]
+    fn try_from(from: &Datum) -> std::result::Result<Self, Self::Error> {
+        if let Datum::Bool(flag) = from {
+            Ok(*flag)
+        } else {
+            Err(())
+        }
+    }
+}
+
+impl<'b, 'a: 'b> TryFrom<&'b Datum<'a>> for &'b str {
+    type Error = ();
+
+    /// Borrows the text of `Datum::String`; any other variant is `Err(())`.
+    #[inline]
+    fn try_from(from: &'b Datum<'a>) -> std::result::Result<Self, Self::Error> {
+        if let Datum::String(text) = from {
+            Ok(text.as_ref())
+        } else {
+            Err(())
+        }
+    }
+}
+
+impl TryFrom<&Datum<'_>> for i8 {
+    type Error = ();
+
+    /// Extracts the payload of `Datum::Int8`; any other variant is `Err(())`.
+    #[inline]
+    fn try_from(from: &Datum) -> std::result::Result<Self, Self::Error> {
+        if let Datum::Int8(value) = from {
+            Ok(*value)
+        } else {
+            Err(())
+        }
+    }
+}
+
+impl TryFrom<&Datum<'_>> for Decimal {
+    type Error = ();
+
+    /// Clones the payload of `Datum::Decimal`; any other variant is `Err(())`.
+    #[inline]
+    fn try_from(from: &Datum) -> std::result::Result<Self, Self::Error> {
+        if let Datum::Decimal(decimal) = from {
+            Ok(decimal.clone())
+        } else {
+            Err(())
+        }
+    }
+}
+
+impl TryFrom<&Datum<'_>> for Date {
+    type Error = ();
+
+    /// Copies out the payload of `Datum::Date`; any other variant is `Err(())`.
+    #[inline]
+    fn try_from(from: &Datum) -> std::result::Result<Self, Self::Error> {
+        if let Datum::Date(date) = from {
+            Ok(*date)
+        } else {
+            Err(())
+        }
+    }
+}
+
+impl TryFrom<&Datum<'_>> for Time {
+    type Error = ();
+
+    /// Copies out the payload of `Datum::Time`; any other variant is `Err(())`.
+    #[inline]
+    fn try_from(from: &Datum) -> std::result::Result<Self, Self::Error> {
+        if let Datum::Time(time) = from {
+            Ok(*time)
+        } else {
+            Err(())
+        }
+    }
+}
+
+impl TryFrom<&Datum<'_>> for TimestampNtz {
+    type Error = ();
+
+    /// Copies out the payload of `Datum::TimestampNtz`; any other variant is `Err(())`.
+    #[inline]
+    fn try_from(from: &Datum) -> std::result::Result<Self, Self::Error> {
+        if let Datum::TimestampNtz(timestamp) = from {
+            Ok(*timestamp)
+        } else {
+            Err(())
+        }
+    }
+}
+
+impl TryFrom<&Datum<'_>> for TimestampLtz {
+    type Error = ();
+
+    /// Copies out the payload of `Datum::TimestampLtz`; any other variant is `Err(())`.
+    #[inline]
+    fn try_from(from: &Datum) -> std::result::Result<Self, Self::Error> {
+        if let Datum::TimestampLtz(timestamp) = from {
+            Ok(*timestamp)
+        } else {
+            Err(())
+        }
+    }
+}
+
+impl<'a> From<bool> for Datum<'a> {
+    /// Wraps a `bool` as `Datum::Bool`.
+    #[inline]
+    fn from(flag: bool) -> Self {
+        Self::Bool(flag)
+    }
+}
+
+impl<'a> From<Decimal> for Datum<'a> {
+    /// Wraps a `Decimal` as `Datum::Decimal`.
+    #[inline]
+    fn from(decimal: Decimal) -> Self {
+        Self::Decimal(decimal)
+    }
+}
+
+impl<'a> From<Date> for Datum<'a> {
+    /// Wraps a `Date` as `Datum::Date`.
+    #[inline]
+    fn from(date: Date) -> Self {
+        Self::Date(date)
+    }
+}
+
+impl<'a> From<Time> for Datum<'a> {
+    /// Wraps a `Time` as `Datum::Time`.
+    #[inline]
+    fn from(time: Time) -> Self {
+        Self::Time(time)
+    }
+}
+
+impl<'a> From<TimestampNtz> for Datum<'a> {
+    /// Wraps a `TimestampNtz` as `Datum::TimestampNtz`.
+    #[inline]
+    fn from(timestamp: TimestampNtz) -> Self {
+        Self::TimestampNtz(timestamp)
+    }
+}
+
+impl<'a> From<TimestampLtz> for Datum<'a> {
+    /// Wraps a `TimestampLtz` as `Datum::TimestampLtz`.
+    #[inline]
+    fn from(timestamp: TimestampLtz) -> Self {
+        Self::TimestampLtz(timestamp)
+    }
+}
+
+impl<'a> From<FlussArray> for Datum<'a> {
+    /// Wraps a `FlussArray` as `Datum::Array`.
+    #[inline]
+    fn from(array: FlussArray) -> Self {
+        Self::Array(array)
+    }
+}
+
+impl<'a> From<FlussMap> for Datum<'a> {
+    /// Wraps a `FlussMap` as `Datum::Map`.
+    #[inline]
+    fn from(entries: FlussMap) -> Self {
+        Self::Map(entries)
+    }
+}
+
+/// Values that can append themselves to an Arrow array builder.
+pub trait ToArrow {
+    /// Appends `self` to `builder`.
+    ///
+    /// Callers in this file pass the Fluss type of the value as `fluss_type`
+    /// and the Arrow type of the target column/element as `arrow_type`.
+    /// Returns an error when the value cannot be appended.
+    fn append_to(
+        &self,
+        builder: &mut dyn ArrayBuilder,
+        fluss_type: &crate::metadata::DataType,
+        arrow_type: &arrow_schema::DataType,
+    ) -> Result<()>;
+}
+
+// Time unit conversion constants
+pub(crate) const MILLIS_PER_SECOND: i64 = 1_000;
+pub(crate) const MICROS_PER_MILLI: i64 = 1_000;
+pub(crate) const NANOS_PER_MILLI: i64 = 1_000_000;
+
+/// Combines a millisecond count and a nanosecond-within-millisecond remainder
+/// into a single microsecond count.
+///
+/// Sub-microsecond nanoseconds are dropped by integer division. Returns a
+/// `RowConvertError` if either the multiplication or the addition overflows `i64`.
+pub(crate) fn millis_nanos_to_micros(millis: i64, nanos: i32) -> Result<i64> {
+    let millis_micros = match millis.checked_mul(MICROS_PER_MILLI) {
+        Some(scaled) => scaled,
+        None => {
+            return Err(RowConvertError {
+                message: format!(
+                    "Timestamp milliseconds {millis} overflows when converting to microseconds"
+                ),
+            });
+        }
+    };
+
+    // 1_000 is both the micros-per-milli and the nanos-per-micro factor, so the
+    // same constant scales the nanosecond part down to microseconds.
+    let nanos_micros = i64::from(nanos) / MICROS_PER_MILLI;
+
+    match millis_micros.checked_add(nanos_micros) {
+        Some(total) => Ok(total),
+        None => Err(RowConvertError {
+            message: format!(
+                "Timestamp overflow when adding microseconds: {millis_micros} + {nanos_micros}"
+            ),
+        }),
+    }
+}
+
+/// Combines a millisecond count and a nanosecond-within-millisecond remainder
+/// into a single nanosecond count.
+///
+/// Returns a `RowConvertError` if either the multiplication or the addition
+/// overflows `i64`.
+pub(crate) fn millis_nanos_to_nanos(millis: i64, nanos: i32) -> Result<i64> {
+    let millis_nanos = match millis.checked_mul(NANOS_PER_MILLI) {
+        Some(scaled) => scaled,
+        None => {
+            return Err(RowConvertError {
+                message: format!(
+                    "Timestamp milliseconds {millis} overflows when converting to nanoseconds"
+                ),
+            });
+        }
+    };
+
+    match millis_nanos.checked_add(i64::from(nanos)) {
+        Some(total) => Ok(total),
+        None => Err(RowConvertError {
+            message: format!(
+                "Timestamp overflow when adding nanoseconds: {millis_nanos} + {nanos}"
+            ),
+        }),
+    }
+}
+
+/// Rounds a [`Decimal`] (half-up) to `target_scale` and appends its unscaled
+/// value to `builder` as an `i128`.
+///
+/// Fails with a `RowConvertError` when the rescaled value has more digits than
+/// `target_precision`, or when its unscaled value does not fit in an `i128`.
+pub(crate) fn append_decimal_to_builder(
+    decimal: &Decimal,
+    target_precision: u32,
+    target_scale: i64,
+    builder: &mut Decimal128Builder,
+) -> Result<()> {
+    use bigdecimal::RoundingMode;
+
+    let rescaled = decimal
+        .to_big_decimal()
+        .with_scale_round(target_scale, RoundingMode::HalfUp);
+    let (unscaled, _) = rescaled.as_bigint_and_exponent();
+
+    // Reject values whose digit count exceeds what the Arrow column declares.
+    let actual_precision = Decimal::compute_precision(&unscaled);
+    if actual_precision > target_precision as usize {
+        return Err(RowConvertError {
+            message: format!(
+                "Decimal precision overflow: value has {actual_precision} digits but Arrow expects {target_precision} (value: {rescaled})"
+            ),
+        });
+    }
+
+    let i128_val = i128::try_from(unscaled).map_err(|_| RowConvertError {
+        message: format!("Decimal value exceeds i128 range: {rescaled}"),
+    })?;
+
+    builder.append_value(i128_val);
+    Ok(())
+}
+
+/// Normalizes the return value of a builder append call into this crate's
+/// `Result<()>`; implemented below for `()` and for `Result<(), ArrowError>`.
+trait AppendResult {
+    /// Consumes the append outcome and returns it as a crate-level result.
+    fn into_append_result(self) -> Result<()>;
+}
+
+impl AppendResult for () {
+    /// An append that returns nothing cannot fail, so this is always `Ok`.
+    fn into_append_result(self) -> Result<()> {
+        Ok(self)
+    }
+}
+
+impl AppendResult for std::result::Result<(), ArrowError> {
+    /// Passes success through and wraps an Arrow error in a `RowConvertError`.
+    fn into_append_result(self) -> Result<()> {
+        match self {
+            Ok(()) => Ok(()),
+            Err(e) => Err(RowConvertError {
+                message: format!("Failed to append value: {e}"),
+            }),
+        }
+    }
+}
+
+/// Appends every element of `arr` to the child builder of a `ListBuilder` and
+/// then closes the list entry as non-null.
+///
+/// Errors (in this order of checking) when `builder` is not a
+/// `ListBuilder<Box<dyn ArrayBuilder>>`, when `fluss_type` is not an array
+/// type, when `arrow_type` is not a `List`, or when appending an element fails.
+fn append_fluss_array_to_list_builder(
+    arr: &FlussArray,
+    builder: &mut dyn ArrayBuilder,
+    fluss_type: &crate::metadata::DataType,
+    arrow_type: &arrow_schema::DataType,
+) -> Result<()> {
+    let list_builder = match builder
+        .as_any_mut()
+        .downcast_mut::<ListBuilder<Box<dyn ArrayBuilder>>>()
+    {
+        Some(downcast) => downcast,
+        None => {
+            return Err(RowConvertError {
+                message: "Builder type mismatch for Array: expected ListBuilder".to_string(),
+            });
+        }
+    };
+
+    let element_fluss_type = if let crate::metadata::DataType::Array(array_type) = fluss_type {
+        array_type.get_element_type()
+    } else {
+        return Err(RowConvertError {
+            message: format!("Expected Array Fluss type for Array datum, got: {fluss_type:?}"),
+        });
+    };
+
+    let element_arrow_type = if let arrow_schema::DataType::List(field) = arrow_type {
+        field.data_type().clone()
+    } else {
+        return Err(RowConvertError {
+            message: format!("Expected List Arrow type for Array datum, got: {arrow_type:?}"),
+        });
+    };
+
+    let values_builder = list_builder.values();
+
+    for index in 0..arr.size() {
+        if !arr.is_null_at(index) {
+            // Materialize the element as a Datum and let it append itself.
+            let datum = read_datum_from_fluss_array(arr, index, element_fluss_type)?;
+            datum.append_to(values_builder, element_fluss_type, &element_arrow_type)?;
+        } else {
+            append_null_for_type(values_builder, &element_arrow_type)?;
+        }
+    }
+
+    // Close the list entry that the elements above belong to.
+    list_builder.append(true);
+    Ok(())
+}
+
+/// Appends every key/value pair of `map` to the key and value builders of a
+/// `MapBuilder` and then closes the map entry as non-null.
+///
+/// Errors (in this order of checking) when `builder` is not a
+/// `MapBuilder<Box<dyn ArrayBuilder>, Box<dyn ArrayBuilder>>`, when
+/// `fluss_type` is not a map type, when `arrow_type` is not a `Map` whose
+/// entries are a two-field `Struct`, or when appending a key/value fails.
+fn append_fluss_map_to_map_builder(
+    map: &crate::row::FlussMap,
+    builder: &mut dyn ArrayBuilder,
+    fluss_type: &crate::metadata::DataType,
+    arrow_type: &arrow_schema::DataType,
+) -> Result<()> {
+    let map_builder = match builder
+        .as_any_mut()
+        .downcast_mut::<MapBuilder<Box<dyn ArrayBuilder>, Box<dyn ArrayBuilder>>>()
+    {
+        Some(downcast) => downcast,
+        None => {
+            return Err(RowConvertError {
+                message: "Builder type mismatch for Map: expected MapBuilder".to_string(),
+            });
+        }
+    };
+
+    let expected_map_type = if let crate::metadata::DataType::Map(map_type) = fluss_type {
+        map_type
+    } else {
+        return Err(RowConvertError {
+            message: format!("Expected Map Fluss type for Map datum, got: {fluss_type:?}"),
+        });
+    };
+
+    // The Arrow map type wraps an "entries" field; its struct carries the key
+    // type first and the value type second.
+    let entries_type = if let arrow_schema::DataType::Map(entries_field, _) = arrow_type {
+        entries_field.data_type()
+    } else {
+        return Err(RowConvertError {
+            message: format!("Expected Map Arrow type for Map datum, got: {arrow_type:?}"),
+        });
+    };
+    let (key_arrow_type, value_arrow_type) = match entries_type {
+        arrow_schema::DataType::Struct(fields) if fields.len() == 2 => {
+            (fields[0].data_type().clone(), fields[1].data_type().clone())
+        }
+        other => {
+            return Err(RowConvertError {
+                message: format!("Expected Struct with 2 fields for Map entries, got: {other:?}"),
+            });
+        }
+    };
+
+    let key_fluss_type = expected_map_type.key_type();
+    let value_fluss_type = expected_map_type.value_type();
+    let key_array = map.key_array();
+    let value_array = map.value_array();
+
+    for index in 0..map.size() {
+        // Keys are appended without a null check.
+        let key_datum = read_datum_from_fluss_array(key_array, index, key_fluss_type)?;
+        key_datum.append_to(map_builder.keys(), key_fluss_type, &key_arrow_type)?;
+
+        if !value_array.is_null_at(index) {
+            let val_datum = read_datum_from_fluss_array(value_array, index, value_fluss_type)?;
+            val_datum.append_to(map_builder.values(), value_fluss_type, &value_arrow_type)?;
+        } else {
+            append_null_for_type(map_builder.values(), &value_arrow_type)?;
+        }
+    }
+
+    // Close the map entry that the pairs above belong to.
+    map_builder.append(true).map_err(|e| RowConvertError {
+        message: format!("Failed to append Map entries: {e}"),
+    })
+}
+
+/// Reads the element at `pos` of `arr` as an owned `Datum`, interpreting it
+/// according to `element_type`.
+///
+/// Row and map elements go through the array's typed accessors; every other
+/// type is read through a `FieldGetter`.
+pub(crate) fn read_datum_from_fluss_array<'a>(
+    arr: &FlussArray,
+    pos: usize,
+    element_type: &crate::metadata::DataType,
+) -> Result<Datum<'a>> {
+    match element_type {
+        DataType::Row(row_type) => {
+            // Copy the nested row out into an owned GenericRow.
+            let compacted = arr.get_row(pos, row_type)?;
+            let owned_row = internal_row_to_owned_generic(&compacted, row_type)?;
+            Ok(Datum::Row(Box::new(owned_row)))
+        }
+        // FlussArray has no attached schema; use the typed inherent accessor.
+        DataType::Map(map_type) => {
+            let nested = arr.get_map(pos, map_type.key_type(), map_type.value_type())?;
+            Ok(Datum::Map(nested))
+        }
+        _ => {
+            let getter = FieldGetter::create(element_type, pos);
+            Ok(getter.get_field(arr)?.into_owned())
+        }
+    }
+}
+
+/// Copies every field of `row` (as described by `row_type`) into a new
+/// `GenericRow` that owns all of its data.
+fn internal_row_to_owned_generic(
+    row: &dyn InternalRow,
+    row_type: &RowType,
+) -> Result<GenericRow<'static>> {
+    let fields = row_type.fields();
+    let mut owned = GenericRow::new(fields.len());
+    for (index, field) in fields.iter().enumerate() {
+        let getter = FieldGetter::create(field.data_type(), index);
+        let value = getter.get_field(row)?;
+        owned.set_field(index, value.into_owned());
+    }
+    Ok(owned)
+}
+
+/// Appends a single null slot to `builder`.
+///
+/// `data_type` is the Arrow type the builder was created for; it selects the
+/// concrete builder type that `builder` is downcast to.
+///
+/// - Scalar and `List` builders receive `append_null()`.
+/// - `Map` builders receive `append(false)`.
+/// - `Struct` builders get a null appended recursively to every child builder
+///   before the parent slot is marked invalid.
+///
+/// # Errors
+/// Returns `RowConvertError` when the builder is not of the type implied by
+/// `data_type`, when a struct builder's child count differs from the number of
+/// struct fields, when the map builder rejects the append, or when `data_type`
+/// is not one of the supported Arrow types.
+fn append_null_for_type(
+    builder: &mut dyn ArrayBuilder,
+    data_type: &arrow_schema::DataType,
+) -> Result<()> {
+    // Downcasts `builder` to the given concrete builder type and appends a null.
+    macro_rules! downcast_null {
+        ($builder_type:ty) => {{
+            let b = builder
+                .as_any_mut()
+                .downcast_mut::<$builder_type>()
+                .ok_or_else(|| RowConvertError {
+                    message: format!(
+                        "Builder type mismatch: expected {} for {data_type:?}",
+                        stringify!($builder_type),
+                    ),
+                })?;
+            b.append_null();
+            Ok(())
+        }};
+    }
+
+    match data_type {
+        arrow_schema::DataType::Boolean => downcast_null!(BooleanBuilder),
+        arrow_schema::DataType::Int8 => downcast_null!(Int8Builder),
+        arrow_schema::DataType::Int16 => downcast_null!(Int16Builder),
+        arrow_schema::DataType::Int32 => downcast_null!(Int32Builder),
+        arrow_schema::DataType::Int64 => downcast_null!(Int64Builder),
+        arrow_schema::DataType::Float32 => downcast_null!(Float32Builder),
+        arrow_schema::DataType::Float64 => downcast_null!(Float64Builder),
+        arrow_schema::DataType::Utf8 => downcast_null!(StringBuilder),
+        arrow_schema::DataType::Binary => downcast_null!(BinaryBuilder),
+        arrow_schema::DataType::FixedSizeBinary(_) => downcast_null!(FixedSizeBinaryBuilder),
+        arrow_schema::DataType::Decimal128(_, _) => downcast_null!(Decimal128Builder),
+        arrow_schema::DataType::Date32 => downcast_null!(Date32Builder),
+        arrow_schema::DataType::Time32(arrow_schema::TimeUnit::Second) => {
+            downcast_null!(Time32SecondBuilder)
+        }
+        arrow_schema::DataType::Time32(arrow_schema::TimeUnit::Millisecond) => {
+            downcast_null!(Time32MillisecondBuilder)
+        }
+        arrow_schema::DataType::Time64(arrow_schema::TimeUnit::Microsecond) => {
+            downcast_null!(Time64MicrosecondBuilder)
+        }
+        arrow_schema::DataType::Time64(arrow_schema::TimeUnit::Nanosecond) => {
+            downcast_null!(Time64NanosecondBuilder)
+        }
+        arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Second, _) => {
+            downcast_null!(TimestampSecondBuilder)
+        }
+        arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, _) => {
+            downcast_null!(TimestampMillisecondBuilder)
+        }
+        arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, _) => {
+            downcast_null!(TimestampMicrosecondBuilder)
+        }
+        arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Nanosecond, _) => {
+            downcast_null!(TimestampNanosecondBuilder)
+        }
+        arrow_schema::DataType::List(_) => {
+            downcast_null!(ListBuilder<Box<dyn ArrayBuilder>>)
+        }
+        arrow_schema::DataType::Map(_, _) => {
+            let b = builder
+                .as_any_mut()
+                .downcast_mut::<MapBuilder<Box<dyn ArrayBuilder>, Box<dyn ArrayBuilder>>>()
+                .ok_or_else(|| RowConvertError {
+                    message: format!(
+                        "Builder type mismatch: expected MapBuilder for {data_type:?}",
+                    ),
+                })?;
+            // MapBuilder has no `append_null`; `append(false)` closes the slot as null.
+            b.append(false).map_err(|e| RowConvertError {
+                message: format!("Failed to append null Map entries: {e}"),
+            })?;
+            Ok(())
+        }
+        arrow_schema::DataType::Struct(fields) => {
+            // StructBuilder::append_null only flips parent validity; children must each get a null too.
+            let struct_builder = builder
+                .as_any_mut()
+                .downcast_mut::<StructBuilder>()
+                .ok_or_else(|| RowConvertError {
+                    message: format!(
+                        "Builder type mismatch: expected StructBuilder for {data_type:?}",
+                    ),
+                })?;
+            let field_builders = struct_builder.field_builders_mut();
+            // Validate arity before touching any child so a schema/builder
+            // mismatch surfaces as an error instead of an index panic, and no
+            // child is left with a partially appended slot.
+            if field_builders.len() != fields.len() {
+                return Err(RowConvertError {
+                    message: format!(
+                        "Struct arity mismatch: Arrow type has {} fields, builder has {}",
+                        fields.len(),
+                        field_builders.len(),
+                    ),
+                });
+            }
+            // `fields` borrows from `data_type`, not from `builder`, so it can
+            // be iterated directly while the child builders are borrowed mutably.
+            for (child, field) in field_builders.iter_mut().zip(fields.iter()) {
+                append_null_for_type(child.as_mut(), field.data_type())?;
+            }
+            struct_builder.append(false);
+            Ok(())
+        }
+        _ => Err(RowConvertError {
+            message: format!("Unsupported Arrow data type for null append: {data_type:?}"),
+        }),
+    }
+}
+
+/// Appends `row` as one non-null struct value to `builder`.
+///
+/// `builder` must be a `StructBuilder`, `fluss_type` must be a Fluss `Row`
+/// type and `arrow_type` must be an Arrow `Struct` type. Each datum in
+/// `row.values` is appended to the child builder at the same index using the
+/// matching Fluss and Arrow field types, then the parent slot is marked valid.
+///
+/// # Errors
+/// Returns `RowConvertError` when the builder or either type has the wrong
+/// kind, when the row, the Fluss row type, the Arrow struct type and the
+/// builder do not all have the same number of fields, or when appending any
+/// child value fails.
+fn append_generic_row_to_struct_builder(
+    row: &GenericRow<'_>,
+    builder: &mut dyn ArrayBuilder,
+    fluss_type: &crate::metadata::DataType,
+    arrow_type: &arrow_schema::DataType,
+) -> Result<()> {
+    let struct_builder = builder
+        .as_any_mut()
+        .downcast_mut::<StructBuilder>()
+        .ok_or_else(|| RowConvertError {
+            message: "Builder type mismatch for Row: expected StructBuilder".to_string(),
+        })?;
+
+    let row_type = match fluss_type {
+        crate::metadata::DataType::Row(rt) => rt,
+        _ => {
+            return Err(RowConvertError {
+                message: format!("Expected Row Fluss type for Row datum, got: {fluss_type:?}"),
+            });
+        }
+    };
+
+    // `arrow_type` is borrowed independently of `builder`, so no clone is needed.
+    let fields = match arrow_type {
+        arrow_schema::DataType::Struct(fields) => fields,
+        _ => {
+            return Err(RowConvertError {
+                message: format!("Expected Struct Arrow type for Row datum, got: {arrow_type:?}"),
+            });
+        }
+    };
+
+    if row.values.len() != fields.len() {
+        return Err(RowConvertError {
+            message: format!(
+                "Row arity mismatch: schema has {} fields, got {}",
+                fields.len(),
+                row.values.len(),
+            ),
+        });
+    }
+
+    // The loop below indexes the Fluss field list and the child builders by
+    // the same position; verify both up front so a mismatch is reported as an
+    // error rather than an out-of-bounds panic.
+    let fluss_fields = row_type.fields();
+    if fluss_fields.len() != fields.len() {
+        return Err(RowConvertError {
+            message: format!(
+                "Row arity mismatch: Fluss type has {} fields, Arrow struct has {}",
+                fluss_fields.len(),
+                fields.len(),
+            ),
+        });
+    }
+
+    let field_builders = struct_builder.field_builders_mut();
+    if field_builders.len() != fields.len() {
+        return Err(RowConvertError {
+            message: format!(
+                "Row arity mismatch: Arrow struct has {} fields, builder has {}",
+                fields.len(),
+                field_builders.len(),
+            ),
+        });
+    }
+
+    for (i, datum) in row.values.iter().enumerate() {
+        let child = field_builders[i].as_mut();
+        let child_fluss_type = fluss_fields[i].data_type();
+        datum.append_to(child, child_fluss_type, fields[i].data_type())?;
+    }
+    struct_builder.append(true);
+    Ok(())
+}
+
+impl Datum<'_> {
+    /// Appends this datum as one value to the Arrow `builder`.
+    ///
+    /// * `builder` - target builder; it is downcast to the concrete builder
+    ///   type that matches the datum variant (and, for some variants, the
+    ///   Arrow type).
+    /// * `fluss_type` - Fluss type of the value; only forwarded to the nested
+    ///   `Array`/`Map`/`Row` appenders.
+    /// * `arrow_type` - Arrow type of the target column; used for nulls and to
+    ///   pick the representation of `Blob`, `Decimal` and `Time` values.
+    ///
+    /// Dispatch summary:
+    /// - `Null` delegates to `append_null_for_type`.
+    /// - Scalar variants append directly to the matching primitive builder.
+    /// - `Blob`, `Decimal` and `Time` choose the builder from `arrow_type`.
+    /// - `TimestampNtz`/`TimestampLtz` choose the unit from the builder type.
+    /// - `Array`, `Map` and `Row` delegate to their dedicated appenders.
+    ///
+    /// # Errors
+    /// Returns `RowConvertError` when the builder or `arrow_type` does not
+    /// match the datum variant, when a `Time` value would lose sub-second
+    /// precision or overflow during unit conversion, when a decimal scale is
+    /// negative, or when a nested appender fails.
+    pub fn append_to(
+        &self,
+        builder: &mut dyn ArrayBuilder,
+        fluss_type: &crate::metadata::DataType,
+        arrow_type: &arrow_schema::DataType,
+    ) -> Result<()> {
+        // Shared by the TimestampNtz and TimestampLtz arms: appends a
+        // (milliseconds, nanos-of-millisecond) pair to whichever timestamp
+        // builder `builder` turns out to be. `kind` is only used in the
+        // mismatch error message.
+        fn append_timestamp_parts(
+            builder: &mut dyn ArrayBuilder,
+            millis: i64,
+            nanos: i32,
+            kind: &str,
+        ) -> Result<()> {
+            if let Some(b) = builder
+                .as_any_mut()
+                .downcast_mut::<TimestampSecondBuilder>()
+            {
+                // NOTE(review): integer division drops any sub-second part
+                // without an error, unlike the Time32(Second) path which
+                // rejects it — confirm this is intended.
+                b.append_value(millis / MILLIS_PER_SECOND);
+                return Ok(());
+            }
+            if let Some(b) = builder
+                .as_any_mut()
+                .downcast_mut::<TimestampMillisecondBuilder>()
+            {
+                b.append_value(millis);
+                return Ok(());
+            }
+            if let Some(b) = builder
+                .as_any_mut()
+                .downcast_mut::<TimestampMicrosecondBuilder>()
+            {
+                b.append_value(millis_nanos_to_micros(millis, nanos)?);
+                return Ok(());
+            }
+            if let Some(b) = builder
+                .as_any_mut()
+                .downcast_mut::<TimestampNanosecondBuilder>()
+            {
+                b.append_value(millis_nanos_to_nanos(millis, nanos)?);
+                return Ok(());
+            }
+
+            Err(RowConvertError {
+                message: format!("Builder type mismatch for {kind}"),
+            })
+        }
+
+        // Appends `$value` and returns when `builder` is a `$builder_type`;
+        // otherwise falls through to the generic error at the end of the method.
+        macro_rules! append_value_to_arrow {
+            ($builder_type:ty, $value:expr) => {
+                if let Some(b) = builder.as_any_mut().downcast_mut::<$builder_type>() {
+                    b.append_value($value).into_append_result()?;
+                    return Ok(());
+                }
+            };
+        }
+
+        match self {
+            Datum::Null => return append_null_for_type(builder, arrow_type),
+            Datum::Bool(v) => append_value_to_arrow!(BooleanBuilder, *v),
+            Datum::Int8(v) => append_value_to_arrow!(Int8Builder, *v),
+            Datum::Int16(v) => append_value_to_arrow!(Int16Builder, *v),
+            Datum::Int32(v) => append_value_to_arrow!(Int32Builder, *v),
+            Datum::Int64(v) => append_value_to_arrow!(Int64Builder, *v),
+            Datum::Float32(v) => append_value_to_arrow!(Float32Builder, v.into_inner()),
+            Datum::Float64(v) => append_value_to_arrow!(Float64Builder, v.into_inner()),
+            Datum::String(v) => append_value_to_arrow!(StringBuilder, v.as_ref()),
+            Datum::Blob(v) => match arrow_type {
+                arrow_schema::DataType::Binary => {
+                    append_value_to_arrow!(BinaryBuilder, v.as_ref());
+                }
+                arrow_schema::DataType::FixedSizeBinary(_) => {
+                    append_value_to_arrow!(FixedSizeBinaryBuilder, v.as_ref());
+                }
+                _ => {
+                    return Err(RowConvertError {
+                        message: format!(
+                            "Expected Binary or FixedSizeBinary Arrow type, got: {arrow_type:?}"
+                        ),
+                    });
+                }
+            },
+            Datum::Decimal(decimal) => {
+                // Extract target precision and scale from Arrow schema
+                let (p, s) = match arrow_type {
+                    arrow_schema::DataType::Decimal128(p, s) => (*p, *s),
+                    _ => {
+                        return Err(RowConvertError {
+                            message: format!("Expected Decimal128 Arrow type, got: {arrow_type:?}"),
+                        });
+                    }
+                };
+
+                if s < 0 {
+                    return Err(RowConvertError {
+                        message: format!("Negative decimal scale {s} is not supported"),
+                    });
+                }
+
+                if let Some(b) = builder.as_any_mut().downcast_mut::<Decimal128Builder>() {
+                    append_decimal_to_builder(decimal, p as u32, s as i64, b)?;
+                    return Ok(());
+                }
+
+                return Err(RowConvertError {
+                    message: "Builder type mismatch for Decimal128".to_string(),
+                });
+            }
+            Datum::Date(date) => {
+                append_value_to_arrow!(Date32Builder, date.get_inner());
+            }
+            Datum::Time(time) => {
+                // Time is stored as milliseconds since midnight in Fluss
+                // Convert to Arrow's time unit based on schema
+                let millis = time.get_inner();
+
+                match arrow_type {
+                    arrow_schema::DataType::Time32(arrow_schema::TimeUnit::Second) => {
+                        if let Some(b) = builder.as_any_mut().downcast_mut::<Time32SecondBuilder>()
+                        {
+                            // Validate no sub-second precision is lost
+                            if millis % MILLIS_PER_SECOND as i32 != 0 {
+                                return Err(RowConvertError {
+                                    message: format!(
+                                        "Time value {millis} ms has sub-second precision but schema expects seconds only"
+                                    ),
+                                });
+                            }
+                            b.append_value(millis / MILLIS_PER_SECOND as i32);
+                            return Ok(());
+                        }
+                    }
+                    arrow_schema::DataType::Time32(arrow_schema::TimeUnit::Millisecond) => {
+                        if let Some(b) = builder
+                            .as_any_mut()
+                            .downcast_mut::<Time32MillisecondBuilder>()
+                        {
+                            b.append_value(millis);
+                            return Ok(());
+                        }
+                    }
+                    arrow_schema::DataType::Time64(arrow_schema::TimeUnit::Microsecond) => {
+                        if let Some(b) = builder
+                            .as_any_mut()
+                            .downcast_mut::<Time64MicrosecondBuilder>()
+                        {
+                            let micros = (millis as i64)
+                                .checked_mul(MICROS_PER_MILLI)
+                                .ok_or_else(|| RowConvertError {
+                                    message: format!(
+                                        "Time value {millis} ms overflows when converting to microseconds"
+                                    ),
+                                })?;
+                            b.append_value(micros);
+                            return Ok(());
+                        }
+                    }
+                    arrow_schema::DataType::Time64(arrow_schema::TimeUnit::Nanosecond) => {
+                        if let Some(b) = builder
+                            .as_any_mut()
+                            .downcast_mut::<Time64NanosecondBuilder>()
+                        {
+                            let nanos = (millis as i64).checked_mul(NANOS_PER_MILLI).ok_or_else(
+                                || RowConvertError {
+                                    message: format!(
+                                        "Time value {millis} ms overflows when converting to nanoseconds"
+                                    ),
+                                },
+                            )?;
+                            b.append_value(nanos);
+                            return Ok(());
+                        }
+                    }
+                    _ => {
+                        return Err(RowConvertError {
+                            message: format!(
+                                "Expected Time32/Time64 Arrow type, got: {arrow_type:?}"
+                            ),
+                        });
+                    }
+                }
+
+                // The Arrow type was a supported time type but the builder was
+                // not the matching concrete builder.
+                return Err(RowConvertError {
+                    message: "Builder type mismatch for Time".to_string(),
+                });
+            }
+            Datum::TimestampNtz(ts) => {
+                return append_timestamp_parts(
+                    builder,
+                    ts.get_millisecond(),
+                    ts.get_nano_of_millisecond(),
+                    "TimestampNtz",
+                );
+            }
+            Datum::TimestampLtz(ts) => {
+                return append_timestamp_parts(
+                    builder,
+                    ts.get_epoch_millisecond(),
+                    ts.get_nano_of_millisecond(),
+                    "TimestampLtz",
+                );
+            }
+            Datum::Array(arr) => {
+                return append_fluss_array_to_list_builder(arr, builder, fluss_type, arrow_type);
+            }
+            Datum::Map(map) => {
+                return append_fluss_map_to_map_builder(map, builder, fluss_type, arrow_type);
+            }
+            Datum::Row(row) => {
+                return append_generic_row_to_struct_builder(row, builder, fluss_type, arrow_type);
+            }
+        }
+
+        // Reached only when a scalar arm's downcast did not match the builder.
+        // NOTE(review): `type_name_of_val` reports the static type of its
+        // argument, so for a `dyn ArrayBuilder` it presumably names the trait
+        // object rather than the concrete builder — confirm and consider
+        // reporting `arrow_type` instead.
+        Err(RowConvertError {
+            message: format!(
+                "Cannot append {:?} to builder of type {}",
+                self,
+                std::any::type_name_of_val(builder)
+            ),
+        })
+    }
+}
+
+// Generates a `ToArrow` impl for `$ty` that appends `*self` to a `$variant`
+// builder. The Fluss and Arrow type arguments are ignored; the only check is
+// that `builder` downcasts to `$variant`.
+macro_rules! impl_to_arrow {
+    ($ty:ty, $variant:ident) => {
+        impl ToArrow for $ty {
+            fn append_to(
+                &self,
+                builder: &mut dyn ArrayBuilder,
+                _fluss_type: &crate::metadata::DataType,
+                _arrow_type: &arrow_schema::DataType,
+            ) -> Result<()> {
+                match builder.as_any_mut().downcast_mut::<$variant>() {
+                    Some(typed) => {
+                        typed.append_value(*self);
+                        Ok(())
+                    }
+                    None => Err(RowConvertError {
+                        message: format!(
+                            "Cannot cast {} to {} builder",
+                            stringify!($ty),
+                            stringify!($variant)
+                        ),
+                    }),
+                }
+            }
+        }
+    };
+}
+
+impl_to_arrow!(i8, Int8Builder);
+impl_to_arrow!(i16, Int16Builder);
+impl_to_arrow!(i32, Int32Builder);
+impl_to_arrow!(f32, Float32Builder);
+impl_to_arrow!(f64, Float64Builder);
+impl_to_arrow!(&str, StringBuilder);
+
+/// `f32` wrapped in `OrderedFloat`.
+pub type F32 = OrderedFloat<f32>;
+/// `f64` wrapped in `OrderedFloat`.
+pub type F64 = OrderedFloat<f64>;
+
+/// Date value; the wrapped `i32` is a day offset from 1970-01-01.
+#[derive(PartialOrd, Ord, Display, PartialEq, Eq, Debug, Copy, Clone, Default, Hash, Serialize)]
+pub struct Date(i32);
+
+/// Time-of-day value; the wrapped `i32` is milliseconds since midnight.
+#[derive(PartialOrd, Ord, Display, PartialEq, Eq, Debug, Copy, Clone, Default, Hash, Serialize)]
+pub struct Time(i32);
+
+impl Time {
+    /// Wraps a raw milliseconds-since-midnight value without validation.
+    pub const fn new(inner: i32) -> Self {
+        Self(inner)
+    }
+
+    /// Get the inner value of time type (milliseconds since midnight)
+    pub fn get_inner(&self) -> i32 {
+        let Time(millis) = *self;
+        millis
+    }
+}
+
+/// Maximum timestamp precision that can be stored compactly (milliseconds only).
+/// Values with precision > MAX_COMPACT_TIMESTAMP_PRECISION require additional nanosecond storage.
+pub const MAX_COMPACT_TIMESTAMP_PRECISION: u32 = 3;
+
+/// Maximum valid value for nanoseconds within a millisecond (0 to 999,999 inclusive).
+/// A millisecond contains 1,000,000 nanoseconds, so the fractional part ranges from 0 to 999,999.
+pub const MAX_NANO_OF_MILLISECOND: i32 = 999_999;
+
+/// Timestamp stored as a millisecond value plus a nanosecond-of-millisecond
+/// fraction. `Display` prints only the millisecond component.
+#[derive(PartialOrd, Ord, Display, PartialEq, Eq, Debug, Copy, Clone, Default, Hash, Serialize)]
+#[display("{millisecond}")]
+pub struct TimestampNtz {
+    millisecond: i64,
+    nano_of_millisecond: i32,
+}
+
+impl TimestampNtz {
+    /// Creates a timestamp from a millisecond value with a zero nanosecond fraction.
+    pub const fn new(millisecond: i64) -> Self {
+        Self {
+            millisecond,
+            nano_of_millisecond: 0,
+        }
+    }
+
+    /// Creates a timestamp from a millisecond value and a nanosecond fraction.
+    ///
+    /// Returns `IllegalArgument` unless `nano_of_millisecond` lies in
+    /// `0..=MAX_NANO_OF_MILLISECOND`.
+    pub fn from_millis_nanos(millisecond: i64, nano_of_millisecond: i32) -> Result<Self> {
+        let nanos_in_range = (0..=MAX_NANO_OF_MILLISECOND).contains(&nano_of_millisecond);
+        if nanos_in_range {
+            Ok(Self {
+                millisecond,
+                nano_of_millisecond,
+            })
+        } else {
+            Err(IllegalArgument {
+                message: format!(
+                    "nanoOfMillisecond must be in range [0, {MAX_NANO_OF_MILLISECOND}], got: {nano_of_millisecond}"
+                ),
+            })
+        }
+    }
+
+    /// Returns the millisecond component.
+    pub fn get_millisecond(&self) -> i64 {
+        let Self { millisecond, .. } = *self;
+        millisecond
+    }
+
+    /// Returns the nanosecond-of-millisecond component.
+    pub fn get_nano_of_millisecond(&self) -> i32 {
+        let Self {
+            nano_of_millisecond,
+            ..
+        } = *self;
+        nano_of_millisecond
+    }
+
+    /// Check if the timestamp is compact based on precision.
+    /// Precision <= MAX_COMPACT_TIMESTAMP_PRECISION means millisecond precision, no need for nanos.
+    pub fn is_compact(precision: u32) -> bool {
+        precision <= MAX_COMPACT_TIMESTAMP_PRECISION
+    }
+}
+
+/// Timestamp stored as an epoch-millisecond value plus a
+/// nanosecond-of-millisecond fraction. `Display` prints only the
+/// epoch-millisecond component.
+#[derive(PartialOrd, Ord, Display, PartialEq, Eq, Debug, Copy, Clone, Default, Hash, Serialize)]
+#[display("{epoch_millisecond}")]
+pub struct TimestampLtz {
+    epoch_millisecond: i64,
+    nano_of_millisecond: i32,
+}
+
+impl TimestampLtz {
+    /// Creates a timestamp from epoch milliseconds with a zero nanosecond fraction.
+    pub const fn new(epoch_millisecond: i64) -> Self {
+        Self {
+            epoch_millisecond,
+            nano_of_millisecond: 0,
+        }
+    }
+
+    /// Creates a timestamp from epoch milliseconds and a nanosecond fraction.
+    ///
+    /// Returns `IllegalArgument` unless `nano_of_millisecond` lies in
+    /// `0..=MAX_NANO_OF_MILLISECOND`.
+    pub fn from_millis_nanos(epoch_millisecond: i64, nano_of_millisecond: i32) -> Result<Self> {
+        let nanos_in_range = (0..=MAX_NANO_OF_MILLISECOND).contains(&nano_of_millisecond);
+        if nanos_in_range {
+            Ok(Self {
+                epoch_millisecond,
+                nano_of_millisecond,
+            })
+        } else {
+            Err(IllegalArgument {
+                message: format!(
+                    "nanoOfMillisecond must be in range [0, {MAX_NANO_OF_MILLISECOND}], got: {nano_of_millisecond}"
+                ),
+            })
+        }
+    }
+
+    /// Returns the epoch-millisecond component.
+    pub fn get_epoch_millisecond(&self) -> i64 {
+        let Self {
+            epoch_millisecond, ..
+        } = *self;
+        epoch_millisecond
+    }
+
+    /// Returns the nanosecond-of-millisecond component.
+    pub fn get_nano_of_millisecond(&self) -> i32 {
+        let Self {
+            nano_of_millisecond,
+            ..
+        } = *self;
+        nano_of_millisecond
+    }
+
+    /// Check if the timestamp is compact based on precision.
+    /// Precision <= MAX_COMPACT_TIMESTAMP_PRECISION means millisecond precision, no need for nanos.
+    pub fn is_compact(precision: u32) -> bool {
+        precision <= MAX_COMPACT_TIMESTAMP_PRECISION
+    }
+}
+
+/// Byte payload carried by `Datum::Blob`; either borrowed or owned.
+pub type Blob<'a> = Cow<'a, [u8]>;
+
+impl<'a> From<Vec<u8>> for Datum<'a> {
+    /// Wraps an owned byte vector as a blob datum.
+    fn from(vec: Vec<u8>) -> Self {
+        Datum::Blob(Cow::Owned(vec))
+    }
+}
+
+impl<'a> From<&'a [u8]> for Datum<'a> {
+    /// Wraps a borrowed byte slice as a blob datum without copying.
+    fn from(bytes: &'a [u8]) -> Self {
+        Datum::Blob(Cow::Borrowed(bytes))
+    }
+}
+
+/// Civil date that a `Date` with inner value 0 maps to.
+const UNIX_EPOCH_DAY: jiff::civil::Date = jiff::civil::date(1970, 1, 1);
+
+impl Date {
+    /// Wraps a raw day offset from 1970-01-01 without validation.
+    pub const fn new(inner: i32) -> Self {
+        Self(inner)
+    }
+
+    /// Get the inner value of date type
+    pub fn get_inner(&self) -> i32 {
+        let Date(days) = *self;
+        days
+    }
+
+    /// Calendar year of this date.
+    pub fn year(&self) -> i16 {
+        self.civil_date().year()
+    }
+
+    /// Calendar month of this date.
+    pub fn month(&self) -> i8 {
+        self.civil_date().month()
+    }
+
+    /// Day of the month of this date.
+    pub fn day(&self) -> i8 {
+        self.civil_date().day()
+    }
+
+    // Converts the day offset into a jiff civil date by adding it to the epoch day.
+    // NOTE(review): jiff's `+` on a civil date presumably panics when the
+    // result is outside its supported range — confirm for extreme offsets.
+    fn civil_date(&self) -> jiff::civil::Date {
+        UNIX_EPOCH_DAY + self.0.days()
+    }
+}
+
+// Unit tests for `Datum` accessors, conversions and Arrow appends.
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use arrow::array::{Array, Int32Builder, StringBuilder};
+
+    // Exercises the `as_*` accessors, `is_null`, and the `TryInto`/`Into`
+    // conversions for several datum variants.
+    #[test]
+    fn datum_accessors_and_conversions() {
+        let datum = Datum::String("value".into());
+        assert_eq!(datum.as_str(), "value");
+        assert!(!datum.is_null());
+
+        let blob = Blob::from(vec![1, 2, 3]);
+        let datum = Datum::Blob(blob);
+        assert_eq!(datum.as_blob(), &[1, 2, 3]);
+
+        assert!(Datum::Null.is_null());
+
+        let datum = Datum::Int32(42);
+        let value: i32 = (&datum).try_into().unwrap();
+        assert_eq!(value, 42);
+        // Converting an Int32 datum to i16 must be rejected.
+        let value: std::result::Result<i16, _> = (&datum).try_into();
+        assert!(value.is_err());
+
+        // Test decimal and temporal types
+        let decimal = Decimal::from_unscaled_long(12345, 10, 2).unwrap();
+        let datum: Datum = decimal.clone().into();
+        assert_eq!(datum.as_decimal(), &decimal);
+        let extracted: Decimal = (&datum).try_into().unwrap();
+        assert_eq!(extracted, decimal);
+
+        let date = Date::new(19000);
+        let datum: Datum = date.into();
+        assert_eq!(datum.as_date(), date);
+
+        let ts_ltz = TimestampLtz::new(1672531200000);
+        let datum: Datum = ts_ltz.into();
+        assert_eq!(datum.as_timestamp_ltz(), ts_ltz);
+    }
+
+    // A null and an Int32 append to an Int32Builder; an Int32 appended to a
+    // StringBuilder must fail with RowConvertError.
+    #[test]
+    fn datum_append_to_builder() {
+        use crate::metadata::DataTypes;
+        let mut builder = Int32Builder::new();
+        let int_type = DataTypes::int();
+        Datum::Null
+            .append_to(&mut builder, &int_type, &arrow_schema::DataType::Int32)
+            .unwrap();
+        Datum::Int32(5)
+            .append_to(&mut builder, &int_type, &arrow_schema::DataType::Int32)
+            .unwrap();
+        let array = builder.finish();
+        assert!(array.is_null(0));
+        assert_eq!(array.value(1), 5);
+
+        let mut builder = StringBuilder::new();
+        let string_type = DataTypes::string();
+        let err = Datum::Int32(1)
+            .append_to(&mut builder, &string_type, &arrow_schema::DataType::Utf8)
+            .unwrap_err();
+        assert!(matches!(err, RowConvertError { .. }));
+    }
+
+    // `as_str` on a non-string datum is expected to panic.
+    #[test]
+    #[should_panic]
+    fn datum_as_str_panics_on_non_string() {
+        let _ = Datum::Int32(1).as_str();
+    }
+
+    // `as_blob` on a non-blob datum is expected to panic.
+    #[test]
+    #[should_panic]
+    fn datum_as_blob_panics_on_non_blob() {
+        let _ = Datum::Int16(1).as_blob();
+    }
+
+    // Day offset 0 must decode to 1970-01-01.
+    #[test]
+    fn date_components() {
+        let date = Date::new(0);
+        assert_eq!(date.get_inner(), 0);
+        assert_eq!(date.year(), 1970);
+        assert_eq!(date.month(), 1);
+        assert_eq!(date.day(), 1);
+    }
+    // A single-entry (Int32 -> Utf8) map datum appends as one non-null map slot.
+    #[test]
+    fn test_datum_map_appends_to_arrow() {
+        use crate::metadata::DataTypes;
+        use crate::row::binary_map::FlussMapWriter;
+        use arrow::array::MapBuilder;
+        use std::sync::Arc;
+
+        let mut writer = FlussMapWriter::new(1, &DataTypes::int(), &DataTypes::string());
+        writer.write_entry(99.into(), "arrow_test".into()).unwrap();
+        let map = writer.complete().unwrap();
+
+        let arrow_type = arrow_schema::DataType::Map(
+            Arc::new(arrow_schema::Field::new(
+                "entries",
+                arrow_schema::DataType::Struct(arrow_schema::Fields::from(vec![
+                    arrow_schema::Field::new("key", arrow_schema::DataType::Int32, false),
+                    arrow_schema::Field::new("value", arrow_schema::DataType::Utf8, true),
+                ])),
+                false,
+            )),
+            false,
+        );
+
+        // Boxed child builders match the `MapBuilder<Box<dyn ArrayBuilder>, ...>`
+        // type that the append path downcasts to.
+        let mut map_builder: MapBuilder<
+            Box<dyn arrow::array::ArrayBuilder>,
+            Box<dyn arrow::array::ArrayBuilder>,
+        > = MapBuilder::new(
+            None,
+            Box::new(Int32Builder::new()),
+            Box::new(StringBuilder::new()),
+        );
+
+        let map_type = DataTypes::map(DataTypes::int(), DataTypes::string());
+        Datum::Map(map)
+            .append_to(&mut map_builder, &map_type, &arrow_type)
+            .unwrap();
+
+        let array = map_builder.finish();
+        assert_eq!(array.len(), 1);
+        assert!(!array.is_null(0));
+    }
+
+    // Appending a map datum with a non-Map Fluss type must return an error
+    // that names the offending type.
+    #[test]
+    fn test_datum_map_append_type_mismatch() {
+        use crate::metadata::DataTypes;
+        use crate::row::binary_map::FlussMapWriter;
+        use arrow::array::{Float64Builder, MapBuilder, StringBuilder};
+        use std::sync::Arc;
+
+        // 1. Construct a Map with Keys: String, Values: Float64
+        let mut writer = FlussMapWriter::new(1, &DataTypes::string(), &DataTypes::double());
+        writer.write_entry("key1".into(), 1.23.into()).unwrap();
+        let map = writer.complete().unwrap();
+
+        // 2. Define an Arrow Map builder for (String, Float64) using Boxed builders
+        let mut map_builder: MapBuilder<
+            Box<dyn arrow::array::ArrayBuilder>,
+            Box<dyn arrow::array::ArrayBuilder>,
+        > = MapBuilder::new(
+            None,
+            Box::new(StringBuilder::new()),
+            Box::new(Float64Builder::new()),
+        );
+
+        // 3. Define an INCOMPATIBLE expected Fluss type (Int32 instead of Map)
+        let mismatched_type = DataTypes::int();
+
+        // 4. Define the Arrow type (must match the builder structure)
+        let arrow_type = arrow_schema::DataType::Map(
+            Arc::new(arrow_schema::Field::new(
+                "entries",
+                arrow_schema::DataType::Struct(arrow_schema::Fields::from(vec![
+                    arrow_schema::Field::new("key", arrow_schema::DataType::Utf8, false),
+                    arrow_schema::Field::new("value", arrow_schema::DataType::Float64, true),
+                ])),
+                false,
+            )),
+            false,
+        );
+
+        // 5. Assert that append_to returns an error
+        let result = Datum::Map(map).append_to(&mut map_builder, &mismatched_type, &arrow_type);
+
+        assert!(result.is_err());
+        let err = result.unwrap_err().to_string();
+        assert!(err.contains("row convert error Expected Map Fluss type for Map datum"));
+        assert!(err.contains("Int(IntType { nullable: true })"));
+    }
+}
+
+// Tests for timestamp construction limits and for the Row -> Arrow struct
+// append path read back through `ColumnarRow`.
+#[cfg(test)]
+mod timestamp_tests {
+    use super::*;
+    use crate::metadata::{DataField, DataTypes};
+    use crate::record::to_arrow_type;
+    use crate::row::InternalRow;
+    use crate::row::column::ColumnarRow;
+    use arrow::array::{RecordBatch, StructArray, StructBuilder};
+    use arrow::datatypes::{Field, Fields, Schema};
+    use std::sync::Arc;
+
+    // Both timestamp types accept the lower bound, the upper bound and a
+    // mid-range nanosecond fraction.
+    #[test]
+    fn test_timestamp_valid_nanos() {
+        // Valid range: 0 to MAX_NANO_OF_MILLISECOND for both TimestampNtz and TimestampLtz
+        let ntz1 = TimestampNtz::from_millis_nanos(1000, 0).unwrap();
+        assert_eq!(ntz1.get_nano_of_millisecond(), 0);
+
+        let ntz2 = TimestampNtz::from_millis_nanos(1000, MAX_NANO_OF_MILLISECOND).unwrap();
+        assert_eq!(ntz2.get_nano_of_millisecond(), MAX_NANO_OF_MILLISECOND);
+
+        let ntz3 = TimestampNtz::from_millis_nanos(1000, 500_000).unwrap();
+        assert_eq!(ntz3.get_nano_of_millisecond(), 500_000);
+
+        let ltz1 = TimestampLtz::from_millis_nanos(1000, 0).unwrap();
+        assert_eq!(ltz1.get_nano_of_millisecond(), 0);
+
+        let ltz2 = TimestampLtz::from_millis_nanos(1000, MAX_NANO_OF_MILLISECOND).unwrap();
+        assert_eq!(ltz2.get_nano_of_millisecond(), MAX_NANO_OF_MILLISECOND);
+    }
+
+    // Both timestamp types reject a fraction one past the maximum and a
+    // negative fraction, with the documented range in the message.
+    #[test]
+    fn test_timestamp_nanos_out_of_range() {
+        // Test that both TimestampNtz and TimestampLtz reject invalid nanos
+        let expected_msg =
+            format!("nanoOfMillisecond must be in range [0, {MAX_NANO_OF_MILLISECOND}]");
+
+        // Too large (1,000,000 is just beyond the valid range)
+        let result_ntz = TimestampNtz::from_millis_nanos(1000, MAX_NANO_OF_MILLISECOND + 1);
+        assert!(result_ntz.is_err());
+        assert!(result_ntz.unwrap_err().to_string().contains(&expected_msg));
+
+        let result_ltz = TimestampLtz::from_millis_nanos(1000, MAX_NANO_OF_MILLISECOND + 1);
+        assert!(result_ltz.is_err());
+        assert!(result_ltz.unwrap_err().to_string().contains(&expected_msg));
+
+        // Negative
+        let result_ntz = TimestampNtz::from_millis_nanos(1000, -1);
+        assert!(result_ntz.is_err());
+        assert!(result_ntz.unwrap_err().to_string().contains(&expected_msg));
+
+        let result_ltz = TimestampLtz::from_millis_nanos(1000, -1);
+        assert!(result_ltz.is_err());
+        assert!(result_ltz.unwrap_err().to_string().contains(&expected_msg));
+    }
+
+    // Appends a full row, a null row and a row with a null field to a
+    // StructBuilder, then reads all three back through ColumnarRow.
+    #[test]
+    fn test_row_arrow_struct_round_trip() {
+        let row_type = crate::metadata::RowType::new(vec![
+            DataField::new("x", DataTypes::int(), None),
+            DataField::new("label", DataTypes::string(), None),
+        ]);
+        let row_type_owned = DataType::Row(row_type.clone());
+        let arrow_struct_dt = to_arrow_type(&row_type_owned).unwrap();
+        let struct_fields: Fields = match &arrow_struct_dt {
+            arrow_schema::DataType::Struct(f) => f.clone(),
+            _ => unreachable!(),
+        };
+
+        let mut struct_builder = StructBuilder::from_fields(struct_fields.clone(), 3);
+
+        // Row 0: both fields set.
+        let mut r0 = GenericRow::new(2);
+        r0.set_field(0, 42_i32);
+        r0.set_field(1, "hello");
+        Datum::Row(Box::new(r0))
+            .append_to(&mut struct_builder, &row_type_owned, &arrow_struct_dt)
+            .expect("append row 0");
+
+        // Row 1: the whole struct is null.
+        Datum::Null
+            .append_to(&mut struct_builder, &row_type_owned, &arrow_struct_dt)
+            .expect("append null row");
+
+        // Row 2: struct present, second field null.
+        let mut r2 = GenericRow::new(2);
+        r2.set_field(0, -7_i32);
+        r2.set_field(1, Datum::Null);
+        Datum::Row(Box::new(r2))
+            .append_to(&mut struct_builder, &row_type_owned, &arrow_struct_dt)
+            .expect("append row 2");
+
+        let struct_array: StructArray = struct_builder.finish();
+
+        // Wrap the struct array as the single column of a record batch.
+        let schema = Arc::new(Schema::new(vec![Field::new(
+            "nested",
+            arrow_struct_dt.clone(),
+            true,
+        )]));
+        let batch = Arc::new(
+            RecordBatch::try_new(schema, vec![Arc::new(struct_array)]).expect("record batch"),
+        );
+
+        let mut columnar = ColumnarRow::new(batch, Arc::new(row_type), 0, None);
+
+        let nested = columnar.get_row(0).expect("get_row 0");
+        assert_eq!(nested.get_int(0).unwrap(), 42);
+        assert_eq!(nested.get_string(1).unwrap(), "hello");
+
+        columnar.set_row_id(1);
+        assert!(columnar.is_null_at(0).unwrap(), "row 1 should be null");
+
+        columnar.set_row_id(2);
+        let nested = columnar.get_row(0).expect("get_row 2");
+        assert_eq!(nested.get_int(0).unwrap(), -7);
+        assert!(nested.is_null_at(1).unwrap(), "label should be null");
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/row/decimal.rs b/fluss-rust/crates/fluss/src/row/decimal.rs
new file mode 100644
index 0000000000..fd21b82968
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/row/decimal.rs
@@ -0,0 +1,472 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::error::{Error, Result};
+use bigdecimal::num_bigint::BigInt;
+use bigdecimal::num_traits::Zero;
+use bigdecimal::{BigDecimal, RoundingMode};
+use std::fmt;
+
+#[cfg(test)]
+use std::str::FromStr;
+
+/// Maximum decimal precision that can be stored compactly as a single i64.
+/// Values with precision > MAX_COMPACT_PRECISION require byte array storage.
+pub const MAX_COMPACT_PRECISION: u32 = 18;
+
+/// An internal data structure representing a decimal value with fixed precision and scale.
+///
+/// This data structure is immutable and stores decimal values in a compact representation
+/// (as a long value) if values are small enough (precision ≤ 18).
+///
+/// Equality, ordering and hashing are implemented manually further down in this
+/// file and are based on the numeric value rather than on a field-by-field
+/// comparison, so the derived traits here are limited to `Debug`, `Clone` and
+/// `serde::Serialize`.
+///
+/// Matches Java's org.apache.fluss.row.Decimal class.
+#[derive(Debug, Clone, serde::Serialize)]
+pub struct Decimal {
+    // Declared precision, stored exactly as passed to the constructor.
+    precision: u32,
+    // Declared scale, stored exactly as passed to the constructor.
+    scale: u32,
+    // Unscaled value. Both constructors set this to `Some` when
+    // precision <= MAX_COMPACT_PRECISION; `from_big_decimal` leaves it `None`
+    // for larger precisions.
+    long_val: Option<i64>,
+    // BigDecimal representation. Set by `from_big_decimal`; left `None` by
+    // `from_unscaled_long`, in which case `to_big_decimal` rebuilds the value
+    // from `long_val` and `scale` on every call.
+    decimal_val: Option<BigDecimal>,
+}
+
+impl Decimal {
+    /// Declared precision of this decimal, exactly as supplied at construction.
+    pub fn precision(&self) -> u32 {
+        self.precision
+    }
+
+    /// Declared scale of this decimal, exactly as supplied at construction.
+    pub fn scale(&self) -> u32 {
+        self.scale
+    }
+
+    /// `true` when this decimal's precision is small enough for the unscaled
+    /// value to be kept in a long.
+    pub fn is_compact(&self) -> bool {
+        Self::is_compact_precision(self.precision)
+    }
+
+    /// `true` when `precision` does not exceed [`MAX_COMPACT_PRECISION`].
+    pub fn is_compact_precision(precision: u32) -> bool {
+        precision <= MAX_COMPACT_PRECISION
+    }
+
+    /// Returns this value as a `BigDecimal`.
+    ///
+    /// The stored `BigDecimal` is cloned when present; otherwise one is built
+    /// from the stored unscaled long and the scale.
+    pub fn to_big_decimal(&self) -> BigDecimal {
+        match (&self.decimal_val, self.long_val) {
+            (Some(stored), _) => stored.clone(),
+            (None, Some(unscaled_long)) => {
+                BigDecimal::new(BigInt::from(unscaled_long), self.scale as i64)
+            }
+            // Should never happen - every constructor fills at least one representation.
+            (None, None) => BigDecimal::new(BigInt::from(0), self.scale as i64),
+        }
+    }
+
+    /// Returns the unscaled value as an `i64`.
+    ///
+    /// # Errors
+    /// Returns `Error::IllegalArgument` when no long is stored and the unscaled
+    /// value taken from the `BigDecimal` form does not fit in an `i64`.
+    pub fn to_unscaled_long(&self) -> Result<i64> {
+        match self.long_val {
+            Some(stored) => Ok(stored),
+            None => {
+                // No compact value available: take the mantissa from the BigDecimal form.
+                let (mantissa, _) = self.to_big_decimal().as_bigint_and_exponent();
+                i64::try_from(mantissa).map_err(|_| Error::IllegalArgument {
+                    message: format!(
+                        "Decimal unscaled value does not fit in i64: precision={}",
+                        self.precision
+                    ),
+                })
+            }
+        }
+    }
+
+    /// Returns the unscaled value as big-endian two's-complement bytes.
+    pub fn to_unscaled_bytes(&self) -> Vec<u8> {
+        let (mantissa, _) = self.to_big_decimal().as_bigint_and_exponent();
+        mantissa.to_signed_bytes_be()
+    }
+
+    /// Creates a Decimal from Arrow's Decimal128 representation.
+    ///
+    /// `i128_val` and `arrow_scale` describe the incoming value; `precision`
+    /// and `scale` are the target type and are enforced by
+    /// [`Decimal::from_big_decimal`].
+    // TODO: For compact decimals with matching scale we may call from_unscaled_long
+    pub fn from_arrow_decimal128(
+        i128_val: i128,
+        arrow_scale: i64,
+        precision: u32,
+        scale: u32,
+    ) -> Result<Self> {
+        let incoming = BigDecimal::new(BigInt::from(i128_val), arrow_scale);
+        Self::from_big_decimal(incoming, precision, scale)
+    }
+
+    /// Builds a Decimal from a `BigDecimal`, targeting the given precision and scale.
+    ///
+    /// The input is rescaled to `scale` with half-up rounding, so the stored
+    /// value may differ from the input.
+    ///
+    /// # Errors
+    /// Returns `Error::IllegalArgument` when the rescaled unscaled value has
+    /// more digits than `precision`, or when a compact precision was requested
+    /// but the unscaled value cannot be converted to an `i64`.
+    pub fn from_big_decimal(bd: BigDecimal, precision: u32, scale: u32) -> Result<Self> {
+        // HALF_UP rounding to the target scale (matches Java).
+        let scaled = bd.with_scale_round(scale as i64, RoundingMode::HalfUp);
+        let (unscaled, exp) = scaled.as_bigint_and_exponent();
+
+        // The rescale above is expected to yield exactly the requested scale.
+        debug_assert_eq!(
+            exp, scale as i64,
+            "Scaled decimal exponent ({exp}) != expected scale ({scale})"
+        );
+
+        let actual_precision = Self::compute_precision(&unscaled);
+        if actual_precision > precision as usize {
+            return Err(Error::IllegalArgument {
+                message: format!(
+                    "Decimal precision overflow: value has {actual_precision} digits but precision is {precision} (value: {scaled})"
+                ),
+            });
+        }
+
+        // Keep the unscaled long alongside the BigDecimal when the precision is compact.
+        let long_val = if Self::is_compact_precision(precision) {
+            match i64::try_from(&unscaled) {
+                Ok(compact) => Some(compact),
+                Err(_) => {
+                    return Err(Error::IllegalArgument {
+                        message: format!(
+                            "Decimal mantissa exceeds i64 range for compact precision {precision}: unscaled={unscaled} (value={scaled})"
+                        ),
+                    });
+                }
+            }
+        } else {
+            None
+        };
+
+        Ok(Decimal {
+            precision,
+            scale,
+            long_val,
+            decimal_val: Some(scaled),
+        })
+    }
+
+    /// Builds a compact Decimal directly from its unscaled long value.
+    ///
+    /// # Errors
+    /// Returns `Error::IllegalArgument` when `precision` is larger than
+    /// [`MAX_COMPACT_PRECISION`], or when `unscaled_long` has more digits than
+    /// `precision`.
+    pub fn from_unscaled_long(unscaled_long: i64, precision: u32, scale: u32) -> Result<Self> {
+        if !Self::is_compact_precision(precision) {
+            return Err(Error::IllegalArgument {
+                message: format!(
+                    "Precision {precision} exceeds MAX_COMPACT_PRECISION ({MAX_COMPACT_PRECISION})"
+                ),
+            });
+        }
+
+        let actual_precision = Self::compute_precision(&BigInt::from(unscaled_long));
+        if actual_precision > precision as usize {
+            return Err(Error::IllegalArgument {
+                message: format!(
+                    "Decimal precision overflow: unscaled value has {actual_precision} digits but precision is {precision}"
+                ),
+            });
+        }
+
+        // Only the long is stored here; the BigDecimal form is produced on demand.
+        Ok(Decimal {
+            precision,
+            scale,
+            long_val: Some(unscaled_long),
+            decimal_val: None,
+        })
+    }
+
+    /// Builds a Decimal from big-endian two's-complement bytes of the unscaled value.
+    ///
+    /// Validation and rounding are delegated to [`Decimal::from_big_decimal`].
+    pub fn from_unscaled_bytes(unscaled_bytes: &[u8], precision: u32, scale: u32) -> Result<Self> {
+        let mantissa = BigInt::from_signed_bytes_be(unscaled_bytes);
+        Self::from_big_decimal(BigDecimal::new(mantissa, scale as i64), precision, scale)
+    }
+
+    /// Number of decimal digits in `unscaled`, ignoring its sign; zero counts as one digit.
+    ///
+    /// Intended to match Java's BigDecimal.precision(): every digit counts,
+    /// trailing zeros included.
+    pub fn compute_precision(unscaled: &BigInt) -> usize {
+        if unscaled.is_zero() {
+            1
+        } else {
+            // For bounded precision (≤ 38 digits), string conversion is cheap and simple.
+            unscaled.magnitude().to_str_radix(10).len()
+        }
+    }
+}
+
+impl fmt::Display for Decimal {
+    /// Writes the `BigDecimal` form of this value using a plain `{}` format.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.write_fmt(format_args!("{}", self.to_big_decimal()))
+    }
+}
+
+// Comparison traits are written by hand so that the cached fields do not take part.
+impl PartialEq for Decimal {
+    /// Numeric equality, following Java's Decimal.equals() which delegates to
+    /// compareTo: 1.0 (scale=1) is equal to 1.00 (scale=2).
+    fn eq(&self, other: &Self) -> bool {
+        matches!(self.cmp(other), std::cmp::Ordering::Equal)
+    }
+}
+
+impl Eq for Decimal {}
+
+impl PartialOrd for Decimal {
+    /// Always `Some`: the ordering is total and comes from the `Ord` impl.
+    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+        Some(Ord::cmp(self, other))
+    }
+}
+
+impl Ord for Decimal {
+    /// Orders decimals by numeric value.
+    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+        let directly_comparable =
+            self.scale == other.scale && self.is_compact() && other.is_compact();
+        if directly_comparable {
+            // Same scale and both compact: compare the stored unscaled values.
+            return self.long_val.cmp(&other.long_val);
+        }
+
+        // Any other combination is compared through the BigDecimal form.
+        self.to_big_decimal().cmp(&other.to_big_decimal())
+    }
+}
+
+impl std::hash::Hash for Decimal {
+    /// Hashes the `BigDecimal` form of this value.
+    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
+        // IMPORTANT: this relies on the bigdecimal crate normalizing before
+        // hashing, so that hash(1.0) == hash(1.00). Together with the numeric
+        // equality above (1.0 == 1.00) that keeps the hash/equals contract
+        // intact; `test_hash_equals_contract` exercises exactly this pair.
+        //
+        // Per the original author's note this differs from Java, where
+        // equals(1.0, 1.00) is true but hashCode(1.0) != hashCode(1.00).
+        //
+        // Result: HashMap/HashSet lookups work even when the same numeric value
+        // was created with different scales (rare in practice, since decimals
+        // are schema-driven with fixed precision/scale).
+        std::hash::Hash::hash(&self.to_big_decimal(), state);
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_precision_calculation() {
+        // (unscaled value, expected digit count). Zero is the special case; all
+        // other rows check that trailing zeros are counted too, matching Java's
+        // BigDecimal.precision().
+        let expectations: [(i64, usize); 5] = [
+            (0, 1),
+            (10, 2),
+            (100, 3),
+            (12300, 5),
+            (10000000000, 11),
+        ];
+        for (unscaled, digits) in expectations {
+            assert_eq!(Decimal::compute_precision(&BigInt::from(unscaled)), digits);
+        }
+
+        // value=1 at scale=10 becomes unscaled=10000000000, i.e. 11 digits.
+        let one = BigDecimal::new(BigInt::from(1), 0);
+
+        let too_narrow = Decimal::from_big_decimal(one.clone(), 1, 10);
+        assert!(
+            too_narrow.is_err(),
+            "Should reject: unscaled 10000000000 has 11 digits, precision=1 is too small"
+        );
+
+        let wide_enough = Decimal::from_big_decimal(one, 11, 10);
+        assert!(
+            wide_enough.is_ok(),
+            "Should accept with correct precision=11"
+        );
+    }
+
+    /// Precision checks at the boundary: one digit too few fails, the exact digit count passes.
+    #[test]
+    fn test_precision_validation() {
+        // (unscaled value, scale, smallest precision that can hold it)
+        let cases: [(i64, u32, u32); 3] = [
+            (10, 1, 2),            // 1.0
+            (100, 2, 3),           // 1.00
+            (10000000000, 10, 11), // 1.0000000000
+        ];
+
+        for (unscaled, scale, min_precision) in cases {
+            let make_input = || BigDecimal::new(BigInt::from(unscaled), scale as i64);
+
+            let one_digit_short = Decimal::from_big_decimal(make_input(), min_precision - 1, scale);
+            assert!(one_digit_short.is_err());
+
+            let exact_fit = Decimal::from_big_decimal(make_input(), min_precision, scale);
+            assert!(exact_fit.is_ok());
+        }
+
+        // i64::MAX has 19 digits, so precision=5 must be rejected.
+        let nineteen_digits = BigDecimal::new(BigInt::from(i64::MAX), 0);
+        assert!(Decimal::from_big_decimal(nineteen_digits, 5, 0).is_err());
+    }
+
+    /// Construction paths and accessors for compact and non-compact decimals.
+    #[test]
+    fn test_creation_and_representation() {
+        // Compact (precision ≤ 18): built straight from an unscaled long.
+        let from_long = Decimal::from_unscaled_long(12345, 10, 2).unwrap();
+        assert_eq!(from_long.precision(), 10);
+        assert_eq!(from_long.scale(), 2);
+        assert!(from_long.is_compact());
+        assert_eq!(from_long.to_unscaled_long().unwrap(), 12345);
+        assert_eq!(from_long.to_big_decimal().to_string(), "123.45");
+
+        // Non-compact (precision > 18): built from a BigDecimal.
+        let mantissa = BigInt::from(12345);
+        let wide_input = BigDecimal::new(mantissa.clone(), 0);
+        let wide = Decimal::from_big_decimal(wide_input, 28, 0).unwrap();
+        assert_eq!(wide.precision(), 28);
+        assert!(!wide.is_compact());
+        assert_eq!(wide.to_unscaled_bytes(), mantissa.to_signed_bytes_be());
+
+        // The compact boundary sits between 18 and 19.
+        assert!(Decimal::is_compact_precision(18));
+        assert!(!Decimal::is_compact_precision(19));
+
+        // 12.345 rescaled to two fractional digits rounds to 12.35.
+        let three_places = BigDecimal::new(BigInt::from(12345), 3);
+        let rounded = Decimal::from_big_decimal(three_places, 10, 2).unwrap();
+        assert_eq!(rounded.to_unscaled_long().unwrap(), 1235);
+    }
+
+    /// Unscaled-bytes round trip for one compact and one non-compact decimal.
+    #[test]
+    fn test_serialization_roundtrip() {
+        // (unscaled value, scale, precision)
+        let cases: [(BigInt, u32, u32); 2] = [
+            // Compact: 13145678.90123
+            (BigInt::from(1314567890123i64), 5, 15),
+            // Non-compact: 20-digit unscaled value
+            (BigInt::from(12345678900987654321i128), 10, 23),
+        ];
+
+        for (unscaled, scale, precision) in cases {
+            let expected_bytes = unscaled.to_signed_bytes_be();
+
+            let source = BigDecimal::new(unscaled, scale as i64);
+            let direct = Decimal::from_big_decimal(source, precision, scale).unwrap();
+            let from_bytes =
+                Decimal::from_unscaled_bytes(&expected_bytes, precision, scale).unwrap();
+
+            assert_eq!(from_bytes, direct);
+            assert_eq!(from_bytes.to_unscaled_bytes(), expected_bytes);
+        }
+    }
+
+    /// Numeric equality and ordering (intended to match Java semantics).
+    #[test]
+    fn test_equality_and_ordering() {
+        use std::cmp::Ordering;
+
+        // Builds a compact decimal with precision 5 and scale 0.
+        let whole = |unscaled: i64| Decimal::from_unscaled_long(unscaled, 5, 0).unwrap();
+
+        // 1.0 and 1.00 differ in precision/scale but are numerically equal.
+        let one_dp =
+            Decimal::from_big_decimal(BigDecimal::new(BigInt::from(10), 1), 2, 1).unwrap();
+        let two_dp =
+            Decimal::from_big_decimal(BigDecimal::new(BigInt::from(100), 2), 3, 2).unwrap();
+        assert_eq!(one_dp, two_dp, "Numeric equality: 1.0 == 1.00");
+        assert_eq!(one_dp.cmp(&two_dp), Ordering::Equal);
+
+        // Positive values.
+        let (ten, fifteen) = (whole(10), whole(15));
+        assert!(ten < fifteen);
+        assert_eq!(ten.cmp(&fifteen), Ordering::Less);
+
+        // Negative values: -15 < -10.
+        let (minus_ten, minus_fifteen) = (whole(-10), whole(-15));
+        assert!(minus_fifteen < minus_ten);
+        assert_eq!(minus_fifteen.cmp(&minus_ten), Ordering::Less);
+
+        // Mixed signs.
+        let (plus_five, minus_five) = (whole(5), whole(-5));
+        assert!(minus_five < plus_five);
+        assert_eq!(minus_five.cmp(&plus_five), Ordering::Less);
+
+        // A clone and a rebuild from the unscaled long both equal the original.
+        let original = whole(10);
+        assert_eq!(original.clone(), original);
+        let rebuilt = whole(original.to_unscaled_long().unwrap());
+        assert_eq!(rebuilt, original);
+    }
+
+    /// Hash/equals contract: numerically equal decimals must hash identically.
+    #[test]
+    fn test_hash_equals_contract() {
+        use std::collections::hash_map::DefaultHasher;
+        use std::hash::{Hash, Hasher};
+
+        // Feeds one decimal through a fresh DefaultHasher.
+        let digest = |value: &Decimal| -> u64 {
+            let mut hasher = DefaultHasher::new();
+            value.hash(&mut hasher);
+            hasher.finish()
+        };
+
+        // 1.0 and 1.00
+        let one_dp =
+            Decimal::from_big_decimal(BigDecimal::new(BigInt::from(10), 1), 2, 1).unwrap();
+        let two_dp =
+            Decimal::from_big_decimal(BigDecimal::new(BigInt::from(100), 2), 3, 2).unwrap();
+
+        // Numeric equality
+        assert_eq!(one_dp, two_dp);
+
+        // Hash contract: if a == b, then hash(a) == hash(b)
+        assert_eq!(
+            digest(&one_dp),
+            digest(&two_dp),
+            "Equal decimals must have equal hashes"
+        );
+
+        // A map keyed by 1.0 must be searchable with 1.00.
+        let mut map = std::collections::HashMap::new();
+        map.insert(one_dp.clone(), "value");
+        assert_eq!(map.get(&two_dp), Some(&"value"));
+    }
+
+    /// Edge cases: zero values, a 39-digit number, and rescaling.
+    #[test]
+    fn test_edge_cases() {
+        // Zero, both compact and non-compact, keeps scale 2.
+        let zero_scale_two = BigDecimal::new(BigInt::from(0), 2);
+
+        let zero_compact = Decimal::from_unscaled_long(0, 5, 2).unwrap();
+        assert_eq!(zero_compact.to_big_decimal(), zero_scale_two);
+
+        let zero_non_compact = Decimal::from_big_decimal(zero_scale_two.clone(), 20, 2).unwrap();
+        assert_eq!(zero_non_compact.to_big_decimal(), zero_scale_two);
+
+        // Large number (39 digits), checked through its f64 rendering.
+        let thirty_nine_digits =
+            BigDecimal::from_str("123456789012345678901234567890123456789").unwrap();
+        let large = Decimal::from_big_decimal(thirty_nine_digits, 39, 0).unwrap();
+        let rendered = large.to_big_decimal().to_string();
+        let double_val = rendered.parse::<f64>().unwrap();
+        assert!((double_val - 1.2345678901234568E38).abs() < 0.01);
+
+        // Rescaling: 5.0 (scale=1) → 5.00 (scale=2)
+        let five_one_dp =
+            Decimal::from_big_decimal(BigDecimal::new(BigInt::from(50), 1), 10, 1).unwrap();
+        let five_two_dp = Decimal::from_big_decimal(five_one_dp.to_big_decimal(), 10, 2).unwrap();
+        assert_eq!(five_two_dp.to_big_decimal().to_string(), "5.00");
+        assert_eq!(five_two_dp.scale(), 2);
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/row/encode/compacted_key_encoder.rs b/fluss-rust/crates/fluss/src/row/encode/compacted_key_encoder.rs
new file mode 100644
index 0000000000..81cd96fa42
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/row/encode/compacted_key_encoder.rs
@@ -0,0 +1,556 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::error::Error::IllegalArgument;
+use crate::error::Result;
+use crate::metadata::RowType;
+use crate::row::binary::ValueWriter;
+use crate::row::compacted::CompactedKeyWriter;
+use crate::row::encode::KeyEncoder;
+use crate::row::field_getter::FieldGetter;
+use crate::row::{Datum, InternalRow};
+use bytes::Bytes;
+
+/// Encodes selected key fields of a row into bytes through a [`CompactedKeyWriter`].
+///
+/// `field_getters` and `field_encoders` are filled pairwise by
+/// `CompactedKeyEncoder::new` (one entry of each per key position) and are
+/// walked in lockstep by `encode_key`.
+#[allow(dead_code)]
+pub struct CompactedKeyEncoder {
+    // Reads the key field value out of a row, one getter per key position.
+    field_getters: Vec<FieldGetter>,
+    // Writes the value obtained from the getter at the same index.
+    field_encoders: Vec<ValueWriter>,
+    // Output writer; reset at the start of every `encode_key` call.
+    compacted_encoder: CompactedKeyWriter,
+}
+
+impl CompactedKeyEncoder {
+    /// Create a key encoder to encode the key of the input row.
+    ///
+    /// # Arguments
+    /// * `row_type` - the row type of the input row
+    /// * `keys` - the key fields to encode
+    ///
+    /// # Returns
+    /// * key_encoder - the [`KeyEncoder`]
+    ///
+    /// # Errors
+    /// Returns `IllegalArgument` when a name in `keys` is not a field of
+    /// `row_type`, and forwards any error from [`CompactedKeyEncoder::new`].
+    pub fn create_key_encoder(row_type: &RowType, keys: &[String]) -> Result<CompactedKeyEncoder> {
+        let mut encode_col_indexes = Vec::with_capacity(keys.len());
+
+        for key in keys {
+            match row_type.get_field_index(key) {
+                Some(idx) => encode_col_indexes.push(idx),
+                None => {
+                    return Err(IllegalArgument {
+                        message: format!("Field {key:?} not found in input row type {row_type:?}"),
+                    });
+                }
+            }
+        }
+
+        Self::new(row_type, encode_col_indexes)
+    }
+
+    /// Create a key encoder for the fields of `row_type` at the given positions.
+    ///
+    /// # Arguments
+    /// * `row_type` - the row type of the input row
+    /// * `encode_field_pos` - positions of the key fields, in encoding order
+    ///
+    /// # Errors
+    /// Returns `IllegalArgument` when a position is out of bounds for
+    /// `row_type`, and forwards the error from
+    /// `CompactedKeyWriter::create_value_writer` when a field type is not
+    /// supported as a key.
+    pub fn new(row_type: &RowType, encode_field_pos: Vec<usize>) -> Result<CompactedKeyEncoder> {
+        let fields = row_type.fields();
+        let mut field_getters: Vec<FieldGetter> = Vec::with_capacity(encode_field_pos.len());
+        let mut field_encoders: Vec<ValueWriter> = Vec::with_capacity(encode_field_pos.len());
+
+        for &pos in &encode_field_pos {
+            // An out-of-range position is a caller error; report it as a typed
+            // error like every other failure here instead of panicking.
+            let field = fields.get(pos).ok_or_else(|| IllegalArgument {
+                message: format!(
+                    "Key field position {pos} is out of bounds for row type with {} fields",
+                    fields.len()
+                ),
+            })?;
+            let data_type = field.data_type();
+            // Validate key type support first, so unsupported types return a
+            // typed error instead of panicking in FieldGetter::create.
+            let field_encoder = CompactedKeyWriter::create_value_writer(data_type)?;
+            let field_getter = FieldGetter::create(data_type, pos);
+            field_getters.push(field_getter);
+            field_encoders.push(field_encoder);
+        }
+
+        Ok(CompactedKeyEncoder {
+            field_encoders,
+            field_getters,
+            compacted_encoder: CompactedKeyWriter::new(),
+        })
+    }
+}
+
+#[allow(dead_code)]
+impl KeyEncoder for CompactedKeyEncoder {
+    /// Encodes the configured key fields of `row` and returns the resulting bytes.
+    ///
+    /// # Errors
+    /// Returns `IllegalArgument` when a key field is null; errors from reading
+    /// a field or writing its value are forwarded unchanged.
+    fn encode_key(&mut self, row: &dyn InternalRow) -> Result<Bytes> {
+        // Start from a clean writer on every call.
+        self.compacted_encoder.reset();
+
+        let getters_and_encoders = self.field_getters.iter().zip(self.field_encoders.iter());
+        for (pos, (field_getter, field_encoder)) in getters_and_encoders.enumerate() {
+            let value = field_getter.get_field(row)?;
+            // `pos` is the index within the key fields, not within the row.
+            if let Datum::Null = value {
+                return Err(IllegalArgument {
+                    message: format!("Cannot encode key with null value at position: {pos:?}"),
+                });
+            }
+            field_encoder.write_value(&mut self.compacted_encoder, pos, &value)?;
+        }
+
+        Ok(self.compacted_encoder.to_bytes())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::metadata::{DataType, DataTypes};
+    use crate::row::binary_array::FlussArrayWriter;
+    use crate::row::datum::{Date, Time, TimestampLtz, TimestampNtz};
+    use crate::row::{Datum, Decimal, FlussArray, GenericRow};
+
+    /// Builds a `FlussArray` of ints holding `values` in order.
+    fn build_int_array(values: &[i32]) -> FlussArray {
+        let element_type = DataTypes::int();
+        let mut writer = FlussArrayWriter::new(values.len(), &element_type);
+        for (idx, &value) in values.iter().enumerate() {
+            writer.write_int(idx, value);
+        }
+        writer.complete().unwrap()
+    }
+
+    /// Builds a `FlussArray` of ints where every `None` entry becomes a null element.
+    fn build_nullable_int_array(values: &[Option<i32>]) -> FlussArray {
+        let element_type = DataTypes::int();
+        let mut writer = FlussArrayWriter::new(values.len(), &element_type);
+        for (idx, entry) in values.iter().enumerate() {
+            if let Some(value) = entry {
+                writer.write_int(idx, *value);
+            } else {
+                writer.set_null_at(idx);
+            }
+        }
+        writer.complete().unwrap()
+    }
+
+    /// Builds a `FlussArray` of non-nullable floats holding `values` in order.
+    fn build_float_array(values: &[f32]) -> FlussArray {
+        let element_type = DataTypes::float().as_non_nullable();
+        let mut writer = FlussArrayWriter::new(values.len(), &element_type);
+        for (idx, &value) in values.iter().enumerate() {
+            writer.write_float(idx, value);
+        }
+        writer.complete().unwrap()
+    }
+
+    /// Builds the nested array `[["a", null, "c"], null, ["hello", "world"]]`.
+    fn build_nested_string_array() -> FlussArray {
+        // Builds one inner string array; `None` entries become null elements.
+        let string_array = |items: &[Option<&str>]| -> FlussArray {
+            let mut writer = FlussArrayWriter::new(items.len(), &DataTypes::string());
+            for (idx, item) in items.iter().copied().enumerate() {
+                if let Some(text) = item {
+                    writer.write_string(idx, text);
+                } else {
+                    writer.set_null_at(idx);
+                }
+            }
+            writer.complete().unwrap()
+        };
+
+        let first = string_array(&[Some("a"), None, Some("c")]);
+        let last = string_array(&[Some("hello"), Some("world")]);
+
+        let mut outer = FlussArrayWriter::new(3, &DataTypes::array(DataTypes::string()));
+        outer.write_array(0, &first);
+        outer.set_null_at(1);
+        outer.write_array(2, &last);
+        outer.complete().unwrap()
+    }
+
+    /// Builds an encoder that uses every field of `row_type`, in order, as the key.
+    pub fn for_test_row_type(row_type: &RowType) -> CompactedKeyEncoder {
+        let all_positions: Vec<usize> = (0..row_type.fields().len()).collect();
+        CompactedKeyEncoder::new(row_type, all_positions)
+            .expect("CompactedKeyEncoder initialization failed")
+    }
+
+    #[test]
+    fn test_encode_map_rejected() {
+        // A Map column must be refused when the encoder is constructed.
+        let map_type = DataTypes::map(DataTypes::string(), DataTypes::int());
+        let row_type = RowType::with_data_types(vec![map_type]);
+
+        let res = CompactedKeyEncoder::new(&row_type, vec![0]);
+        assert!(res.is_err());
+
+        let rendered = res.err().map(|e| e.to_string()).unwrap_or_default();
+        assert!(
+            rendered.contains("Cannot use Map"),
+            "Expected error to contain 'Cannot use Map', got '{}'",
+            rendered
+        );
+    }
+
+    #[test]
+    fn test_encode_key() {
+        let row_type = RowType::with_data_types(vec![
+            DataTypes::int(),
+            DataTypes::bigint(),
+            DataTypes::int(),
+        ]);
+        // The same encoder instance is reused for both rows.
+        let mut encoder = for_test_row_type(&row_type);
+
+        // ((int, bigint, int) field values, expected encoded bytes)
+        let cases = [
+            ((1i32, 3i64, 2i32), [1u8, 3u8, 2u8]),
+            ((2i32, 5i64, 6i32), [2u8, 5u8, 6u8]),
+        ];
+
+        for ((first, second, third), expected) in cases {
+            let row = GenericRow::from_data(vec![
+                Datum::from(first),
+                Datum::from(second),
+                Datum::from(third),
+            ]);
+
+            assert_eq!(
+                encoder.encode_key(&row).unwrap().iter().as_slice(),
+                expected
+            );
+        }
+    }
+
+    #[test]
+    fn test_encode_key_with_key_names() {
+        let row_type = RowType::with_data_types_and_field_names(
+            vec![
+                DataTypes::string(),
+                DataTypes::bigint(),
+                DataTypes::string(),
+            ],
+            vec!["partition", "f1", "f2"],
+        );
+
+        // Only "f2" is part of the key.
+        let primary_keys = ["f2".to_string()];
+        let mut encoder =
+            CompactedKeyEncoder::create_key_encoder(&row_type, &primary_keys).unwrap();
+
+        let row = GenericRow::from_data(vec![
+            Datum::from("p1"),
+            Datum::from(1i64),
+            Datum::from("a2"),
+        ]);
+
+        // Only the "f2" value ("a2") is encoded: a leading 2 (presumably the
+        // byte length of the string - confirm against CompactedKeyWriter),
+        // then 97 (the letter a) and 50 (the digit 2).
+        let encoded = encoder.encode_key(&row).unwrap();
+        assert_eq!(encoded.iter().as_slice(), [2u8, 97u8, 50u8]);
+    }
+
+    #[test]
+    #[should_panic(expected = "Cannot encode key with null value at position: 2")]
+    fn test_null_primary_key() {
+        let row_type = RowType::with_data_types(vec![
+            DataTypes::int(),
+            DataTypes::bigint(),
+            DataTypes::int(),
+            DataTypes::string(),
+        ]);
+
+        // The first three columns form the key; the string column is not part of it.
+        let mut encoder = CompactedKeyEncoder::new(&row_type, vec![0, 1, 2])
+            .expect("CompactedKeyEncoder initialization failed");
+
+        // A fully populated key encodes normally.
+        let complete_row = GenericRow::from_data(vec![
+            Datum::from(1i32),
+            Datum::from(3i64),
+            Datum::from(2i32),
+            Datum::from("a2"),
+        ]);
+        let encoded = encoder.encode_key(&complete_row).unwrap();
+        assert_eq!(encoded.iter().as_slice(), [1u8, 3u8, 2u8]);
+
+        // A null in the third key column makes encode_key fail; the unwrap
+        // turns that error into the panic this test expects.
+        let row_with_null_key = GenericRow::from_data(vec![
+            Datum::from(1i32),
+            Datum::from(3i64),
+            Datum::Null,
+            Datum::from("a2"),
+        ]);
+        encoder.encode_key(&row_with_null_key).unwrap();
+    }
+
+    #[test]
+    fn test_int_string_as_primary_key() {
+        let row_type = RowType::with_data_types(vec![
+            DataTypes::string(),
+            DataTypes::int(),
+            DataTypes::string(),
+            DataTypes::string(),
+        ]);
+
+        // Key = the int column followed by the first string column after it.
+        let key_positions = vec![1, 2];
+        let mut encoder = CompactedKeyEncoder::new(&row_type, key_positions)
+            .expect("CompactedKeyEncoder initialization failed");
+
+        let row = GenericRow::from_data(vec![
+            Datum::from("a1"),
+            Datum::from(1i32),
+            Datum::from("a2"),
+            Datum::from("a3"),
+        ]);
+
+        // 1 (the int 1), then "a2" as 2 (presumably the byte length of the
+        // string - confirm against CompactedKeyWriter), 97 (the letter a) and
+        // 50 (the digit 2).
+        let encoded = encoder.encode_key(&row).unwrap();
+        assert_eq!(encoded.iter().as_slice(), [1u8, 2u8, 97u8, 50u8]);
+    }
+
+    #[test]
+    fn test_array_type_allowed_as_key() {
+        // Java's CompactedKeyEncoder allows Array as a key column type
+        // (the server rejects unsupported key types at table-creation time).
+        let column_types = vec![DataTypes::int(), DataTypes::array(DataTypes::int())];
+        let row_type = RowType::with_data_types(column_types);
+        let mut encoder = CompactedKeyEncoder::new(&row_type, vec![0, 1]).unwrap();
+
+        // Two rows that differ only in the second array element.
+        let row_a = GenericRow::from_data(vec![
+            Datum::Int32(42),
+            Datum::Array(build_int_array(&[10, 20])),
+        ]);
+        let encoded_a = encoder.encode_key(&row_a).unwrap();
+
+        let row_b = GenericRow::from_data(vec![
+            Datum::Int32(42),
+            Datum::Array(build_int_array(&[10, 30])),
+        ]);
+        let encoded_b = encoder.encode_key(&row_b).unwrap();
+
+        assert!(!encoded_a.is_empty());
+        assert_ne!(
+            encoded_a.iter().as_slice(),
+            encoded_b.iter().as_slice(),
+            "Array key payload should affect compacted key encoding"
+        );
+    }
+
+    #[test]
+    fn test_map_type_rejected_as_key() {
+        let column_types = vec![
+            DataTypes::int(),
+            DataTypes::map(DataTypes::int(), DataTypes::string()),
+        ];
+        let row_type = RowType::with_data_types(column_types);
+
+        // Constructing the encoder must fail because the second key column is a Map.
+        if let Err(err) = CompactedKeyEncoder::new(&row_type, vec![0, 1]) {
+            assert!(
+                err.to_string().contains("Cannot use"),
+                "Expected 'Cannot use' error, got: {err}"
+            );
+        } else {
+            panic!("Expected error when using Map as key type");
+        }
+    }
+
+    #[test]
+    fn test_all_data_types_java_compatible() {
+        // Byte-for-byte check of the key encoding against the Java reference file:
+        // https://github.com/apache/fluss/blob/main/fluss-common/src/test/resources/encoding/encoded_key.hex
+        use crate::metadata::{DataType, TimestampLTzType, TimestampType};
+
+        let row_type = RowType::with_data_types(vec![
+            DataTypes::boolean(),                                                 // BOOLEAN
+            DataTypes::tinyint(),                                                 // TINYINT
+            DataTypes::smallint(),                                                // SMALLINT
+            DataTypes::int(),                                                     // INT
+            DataTypes::bigint(),                                                  // BIGINT
+            DataTypes::float(),                                                   // FLOAT
+            DataTypes::double(),                                                  // DOUBLE
+            DataTypes::date(),                                                    // DATE
+            DataTypes::time(),                                                    // TIME
+            DataTypes::binary(20),                                                // BINARY(20)
+            DataTypes::bytes(),                                                   // BYTES
+            DataTypes::char(2),                                                   // CHAR(2)
+            DataTypes::string(),                                                  // STRING
+            DataTypes::decimal(5, 2),                                             // DECIMAL(5,2)
+            DataTypes::decimal(20, 0),                                            // DECIMAL(20,0)
+            DataType::Timestamp(TimestampType::with_nullable(false, 1).unwrap()), // TIMESTAMP(1)
+            DataType::Timestamp(TimestampType::with_nullable(false, 5).unwrap()), // TIMESTAMP(5)
+            DataType::TimestampLTz(TimestampLTzType::with_nullable(false, 1).unwrap()), // TIMESTAMP_LTZ(1)
+            DataType::TimestampLTz(TimestampLTzType::with_nullable(false, 5).unwrap()), // TIMESTAMP_LTZ(5)
+            DataTypes::array(DataTypes::int()), // ARRAY<INT>
+            DataTypes::array(DataTypes::float().as_non_nullable()), // ARRAY<FLOAT NOT NULL>
+            DataTypes::array(DataTypes::array(DataTypes::string())), // ARRAY<ARRAY<STRING>>
+                                                // Note: MAP is rejected as a key type (see test_map_type_rejected_as_key)
+                                                // TODO: Add support for ROW type
+        ]);
+
+        // Exact values from Java's IndexedRowTest.genRecordForAllTypes()
+        let row = GenericRow::from_data(vec![
+            Datum::from(true),                                             // BOOLEAN: true
+            Datum::from(2i8),                                              // TINYINT: 2
+            Datum::from(10i16),                                            // SMALLINT: 10
+            Datum::from(100i32),                                           // INT: 100
+            Datum::from(-6101065172474983726i64),                          // BIGINT
+            Datum::from(13.2f32),                                          // FLOAT: 13.2
+            Datum::from(15.21f64),                                         // DOUBLE: 15.21
+            Datum::Date(Date::new(19655)), // DATE: 2023-10-25 (19655 days since epoch)
+            Datum::Time(Time::new(34200000)), // TIME: 09:30:00.0
+            Datum::from("1234567890".as_bytes()), // BINARY(20)
+            Datum::from("20".as_bytes()),  // BYTES
+            Datum::from("1"),              // CHAR(2): "1"
+            Datum::from("hello"),          // STRING: "hello"
+            Datum::Decimal(Decimal::from_unscaled_long(9, 5, 2).unwrap()), // DECIMAL(5,2)
+            Datum::Decimal(
+                Decimal::from_big_decimal(
+                    bigdecimal::BigDecimal::new(bigdecimal::num_bigint::BigInt::from(10), 0),
+                    20,
+                    0,
+                )
+                .unwrap(),
+            ), // DECIMAL(20,0)
+            Datum::TimestampNtz(TimestampNtz::new(1698235273182)), // TIMESTAMP(1)
+            Datum::TimestampNtz(TimestampNtz::new(1698235273182)), // TIMESTAMP(5)
+            Datum::TimestampLtz(TimestampLtz::new(1698235273182)), // TIMESTAMP_LTZ(1)
+            Datum::TimestampLtz(TimestampLtz::new(1698235273182)), // TIMESTAMP_LTZ(5)
+            Datum::Array(build_nullable_int_array(&[
+                Some(1),
+                Some(2),
+                Some(3),
+                Some(4),
+                Some(5),
+                Some(-11),
+                None,
+                Some(444),
+                Some(102234),
+            ])), // ARRAY<INT>: GenericArray.of(1, 2, 3, 4, 5, -11, null, 444, 102234)
+            Datum::Array(build_float_array(&[
+                0.1_f32,
+                1.1_f32,
+                -0.5_f32,
+                6.6_f32,
+                f32::MAX,
+                f32::from_bits(1),
+            ])), // ARRAY<FLOAT NOT NULL>: GenericArray.of(0.1f, 1.1f, -0.5f, 6.6f, MAX, MIN); MIN is the value with bit pattern 1
+            Datum::Array(build_nested_string_array()), // ARRAY<ARRAY<STRING>>
+        ]);
+
+        // Expected bytes from Java's encoded_key.hex reference file
+        #[rustfmt::skip]
+        let expected: Vec<u8> = vec![
+            // BOOLEAN: true
+            0x01,
+            // TINYINT: 2
+            0x02,
+            // SMALLINT: 10 -- NOTE(review): probably spans 0x0A plus the 0x00 on the INT line below (TODO confirm)
+            0x0A,
+            // INT: 100 -- NOTE(review): a varint of 100 is the single byte 0x64, so the leading 0x00 likely belongs to SMALLINT
+            0x00, 0x64,
+            // BIGINT: -6101065172474983726
+            0xD2, 0x95, 0xFC, 0xD8, 0xCE, 0xB1, 0xAA, 0xAA, 0xAB, 0x01,
+            // FLOAT: 13.2
+            0x33, 0x33, 0x53, 0x41,
+            // DOUBLE: 15.21
+            0xEC, 0x51, 0xB8, 0x1E, 0x85, 0x6B, 0x2E, 0x40,
+            // DATE: 2023-10-25
+            0xC7, 0x99, 0x01,
+            // TIME: 09:30:00.0
+            0xC0, 0xB3, 0xA7, 0x10,
+            // BINARY(20): "1234567890"
+            0x0A, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x30,
+            // BYTES: "20"
+            0x02, 0x32, 0x30,
+            // CHAR(2): "1"
+            0x01, 0x31,
+            // STRING: "hello"
+            0x05, 0x68, 0x65, 0x6C, 0x6C, 0x6F,
+            // DECIMAL(5,2): 9
+            0x09,
+            // DECIMAL(20,0): 10
+            0x01, 0x0A,
+            // TIMESTAMP(1): 1698235273182
+            0xDE, 0x9F, 0xD7, 0xB5, 0xB6, 0x31,
+            // TIMESTAMP(5): 1698235273182 (one extra trailing 0x00 vs TIMESTAMP(1); presumably the sub-millisecond part -- TODO confirm)
+            0xDE, 0x9F, 0xD7, 0xB5, 0xB6, 0x31, 0x00,
+            // TIMESTAMP_LTZ(1): 1698235273182
+            0xDE, 0x9F, 0xD7, 0xB5, 0xB6, 0x31,
+            // TIMESTAMP_LTZ(5): 1698235273182
+            0xDE, 0x9F, 0xD7, 0xB5, 0xB6, 0x31, 0x00,
+            // ARRAY<INT>: GenericArray.of(1, 2, 3, 4, 5, -11, null, 444, 102234)
+            0x30, 0x09, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00,
+            0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00,
+            0x00, 0x05, 0x00, 0x00, 0x00, 0xF5, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00,
+            0x00, 0xBC, 0x01, 0x00, 0x00, 0x5A, 0x8F, 0x01, 0x00, 0x00, 0x00, 0x00,
+            0x00,
+            // ARRAY<FLOAT NOT NULL>: GenericArray.of(0.1f, 1.1f, -0.5f, 6.6f, MAX, MIN)
+            0x20, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xCD, 0xCC, 0xCC,
+            0x3D, 0xCD, 0xCC, 0x8C, 0x3F, 0x00, 0x00, 0x00, 0xBF, 0x33, 0x33, 0xD3,
+            0x40, 0xFF, 0xFF, 0x7F, 0x7F, 0x01, 0x00, 0x00, 0x00,
+            // ARRAY<ARRAY<STRING>>
+            0x58, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00,
+            0x00, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x18, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00,
+            0x00, 0x02, 0x00, 0x00, 0x00, 0x61, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x81, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x63, 0x00, 0x00,
+            0x00, 0x00, 0x00, 0x00, 0x81, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+            0x00, 0x68, 0x65, 0x6C, 0x6C, 0x6F, 0x00, 0x00, 0x85, 0x77, 0x6F, 0x72,
+            0x6C, 0x64, 0x00, 0x00, 0x85,
+        ];
+
+        let mut encoder = for_test_row_type(&row_type);
+        let encoded = encoder.encode_key(&row).unwrap();
+
+        // Assert byte-for-byte compatibility with Java's encoded_key.hex
+        assert_eq!(
+            encoded.iter().as_slice(),
+            expected.as_slice(),
+            "\n\nRust encoding does not match Java reference from encoded_key.hex\n\
+             Expected: {:02X?}\n\
+             Actual:   {:02X?}\n",
+            expected,
+            encoded.iter().as_slice()
+        );
+    }
+
+    #[test]
+    fn test_row_as_primary_key() {
+        // Key layout under test: (id INT, nested ROW<x INT, label STRING>),
+        // with both columns taking part in the encoded key.
+        let nested_type = RowType::with_data_types_and_field_names(
+            vec![DataTypes::int(), DataTypes::string()],
+            vec!["x", "label"],
+        );
+        let row_type = RowType::with_data_types_and_field_names(
+            vec![DataTypes::int(), DataType::Row(nested_type.clone())],
+            vec!["id", "nested"],
+        );
+
+        // Builds an outer row with id = 1 that wraps a nested (x, label) row;
+        // only the nested payload varies between the rows used below.
+        let make_row = |x: i32, label: &'static str| {
+            let mut nested = GenericRow::new(2);
+            nested.set_field(0, x);
+            nested.set_field(1, label);
+
+            let mut outer = GenericRow::new(2);
+            outer.set_field(0, 1_i32);
+            outer.set_field(1, Datum::Row(Box::new(nested)));
+            outer
+        };
+
+        let mut encoder = for_test_row_type(&row_type);
+        let first_row = make_row(42, "hello");
+        let first_key = encoder.encode_key(&first_row).unwrap();
+
+        // Encoding succeeds and yields a non-empty key.
+        assert!(!first_key.is_empty());
+
+        // Re-encoding the same row must be deterministic.
+        let repeated_key = encoder.encode_key(&first_row).unwrap();
+        assert_eq!(first_key, repeated_key);
+
+        // A different nested payload must change the encoded key.
+        let other_row = make_row(99, "world");
+        let other_key = encoder.encode_key(&other_row).unwrap();
+        assert_ne!(first_key, other_key);
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/row/encode/compacted_row_encoder.rs b/fluss-rust/crates/fluss/src/row/encode/compacted_row_encoder.rs
new file mode 100644
index 0000000000..20f28820cf
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/row/encode/compacted_row_encoder.rs
@@ -0,0 +1,77 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::error::Error::IllegalArgument;
+use crate::error::Result;
+use crate::metadata::RowType;
+use crate::row::Datum;
+use crate::row::binary::{BinaryRowFormat, BinaryWriter, ValueWriter};
+use crate::row::compacted::{CompactedRowDeserializer, CompactedRowWriter};
+use crate::row::encode::RowEncoder;
+use bytes::Bytes;
+use std::sync::Arc;
+
+#[allow(dead_code)] // `arity` and `compacted_row_deserializer` are stored but not read in this file
+pub struct CompactedRowEncoder<'a> {
+    arity: usize, // number of fields; set from `field_writers.len()` in `new`
+    writer: CompactedRowWriter, // row writer: reset in `start_new_row`, flushed in `finish_row`
+    field_writers: Vec<ValueWriter>, // one value writer per field, indexed by field position
+    compacted_row_deserializer: Arc<CompactedRowDeserializer<'a>>, // built from the owned row type in `new`
+}
+
+impl<'a> CompactedRowEncoder<'a> {
+    /// Builds an encoder for `row_type` with one compacted-format value writer per
+    /// field; returns the first error raised while creating those writers.
+    pub fn new(row_type: RowType) -> Result<Self> {
+        let mut field_writers = Vec::new();
+        for data_type in row_type.field_types() {
+            let created =
+                ValueWriter::create_value_writer(data_type, Some(&BinaryRowFormat::Compacted));
+            field_writers.push(created?);
+        }
+        let arity = field_writers.len();
+        let writer = CompactedRowWriter::new(arity);
+        let compacted_row_deserializer =
+            Arc::new(CompactedRowDeserializer::new_from_owned(row_type));
+        Ok(Self { arity, writer, field_writers, compacted_row_deserializer })
+    }
+}
+
+impl RowEncoder for CompactedRowEncoder<'_> {
+    fn start_new_row(&mut self) -> Result<()> {
+        self.writer.reset(); // every row starts from a reset writer
+        Ok(())
+    }
+
+    /// Writes `value` at `pos` with that position's writer; a `pos` without a writer is `IllegalArgument`.
+    fn encode_field(&mut self, pos: usize, value: Datum) -> Result<()> {
+        match self.field_writers.get(pos) {
+            Some(field_writer) => field_writer.write_value(&mut self.writer, pos, &value),
+            None => Err(IllegalArgument {
+                message: format!("invalid position {pos} when attempting to encode value {value}"),
+            }),
+        }
+    }
+
+    fn finish_row(&mut self) -> Result<Bytes> {
+        Ok(self.writer.flush_bytes()) // hands back what the writer flushed for this row
+    }
+
+    fn close(&mut self) -> Result<()> {
+        Ok(()) // nothing to release
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/row/encode/mod.rs b/fluss-rust/crates/fluss/src/row/encode/mod.rs
new file mode 100644
index 0000000000..16a540ebf8
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/row/encode/mod.rs
@@ -0,0 +1,127 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+mod compacted_key_encoder;
+mod compacted_row_encoder;
+
+use crate::error::{Error, Result};
+use crate::metadata::{DataLakeFormat, KvFormat, RowType};
+use crate::row::encode::compacted_key_encoder::CompactedKeyEncoder;
+use crate::row::encode::compacted_row_encoder::CompactedRowEncoder;
+use crate::row::{Datum, InternalRow};
+use bytes::Bytes;
+
+/// An interface for encoding the key of a row into bytes.
+#[allow(dead_code)]
+pub trait KeyEncoder: Send + Sync {
+    fn encode_key(&mut self, row: &dyn InternalRow) -> Result<Bytes>; // `&mut self`: encoding needs exclusive access to the encoder
+}
+
+pub struct KeyEncoderFactory;
+
+impl KeyEncoderFactory {
+    /// Creates the key encoder used to encode the key bytes of an input row.
+    /// # Arguments
+    /// * `row_type` - the row type of the input row
+    /// * `key_fields` - the key fields to encode
+    /// * `data_lake_format` - the data lake format, if any
+    ///
+    /// # Returns
+    /// A compacted key encoder for `Lance` or `None`; an `UnsupportedOperation`
+    /// error for `Paimon` and `Iceberg`, which are not implemented yet.
+    pub fn of(
+        row_type: &RowType,
+        key_fields: &[String],
+        data_lake_format: &Option<DataLakeFormat>,
+    ) -> Result<Box<dyn KeyEncoder>> {
+        match data_lake_format {
+            // Lance and the no-lake case both use the compacted key encoding.
+            Some(DataLakeFormat::Lance) | None => {
+                let encoder = CompactedKeyEncoder::create_key_encoder(row_type, key_fields)?;
+                Ok(Box::new(encoder))
+            }
+            Some(DataLakeFormat::Paimon) => Err(Error::UnsupportedOperation {
+                message: "KeyEncoder for Paimon format is not yet implemented".to_string(),
+            }),
+            Some(DataLakeFormat::Iceberg) => Err(Error::UnsupportedOperation {
+                message: "KeyEncoder for Iceberg format is not yet implemented".to_string(),
+            }),
+        }
+    }
+}
+
+/// An encoder that writes binary row data, one row at a time.
+/// To write a new row:
+///
+/// 1. call [`RowEncoder::start_new_row()`] to start the writing.
+/// 2. call [`RowEncoder::encode_field()`] once per field to write the row's fields.
+/// 3. call [`RowEncoder::finish_row()`] to finish the writing and get the written row.
+#[allow(dead_code)]
+pub trait RowEncoder: Send + Sync {
+    /// Starts writing a new row.
+    ///
+    /// # Returns
+    /// * Ok(()) if successful
+    fn start_new_row(&mut self) -> Result<()>;
+
+    /// Writes the row's field at the given position with the given value.
+    ///
+    /// # Arguments
+    /// * pos - the position of the field to write.
+    /// * value - the value of the field to write.
+    ///
+    /// # Returns
+    /// * Ok(()) if successful
+    fn encode_field(&mut self, pos: usize, value: Datum) -> Result<()>;
+
+    /// Finishes writing the row and returns the written row.
+    ///
+    /// The internal buffer is reused for subsequent rows: call [`RowEncoder::start_new_row()`] only once the returned row is no longer needed.
+    /// NOTE(review): the return type is an owned `Bytes`, not a borrow -- confirm this caution still applies.
+    ///
+    /// # Returns
+    /// * the written row
+    fn finish_row(&mut self) -> Result<Bytes>;
+
+    /// Closes the row encoder.
+    ///
+    /// # Returns
+    /// * Ok(()) if successful
+    fn close(&mut self) -> Result<()>;
+}
+
+#[allow(dead_code)]
+pub struct RowEncoderFactory {}
+#[allow(dead_code)]
+impl RowEncoderFactory {
+    pub fn create(kv_format: KvFormat, row_type: RowType) -> Result<impl RowEncoder> {
+        Self::create_for_field_types(kv_format, row_type) // plain delegation, same result
+    }
+
+    /// Builds the encoder for `kv_format`; `INDEXED` has no encoder yet and yields `UnsupportedOperation`.
+    pub fn create_for_field_types(
+        kv_format: KvFormat,
+        row_type: RowType,
+    ) -> Result<impl RowEncoder> {
+        match kv_format {
+            KvFormat::INDEXED => Err(Error::UnsupportedOperation {
+                message: "RowEncoder for INDEXED format is not yet implemented".to_string(),
+            }),
+            KvFormat::COMPACTED => CompactedRowEncoder::new(row_type),
+        }
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/row/field_getter.rs b/fluss-rust/crates/fluss/src/row/field_getter.rs
new file mode 100644
index 0000000000..3c2c7ce1ca
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/row/field_getter.rs
@@ -0,0 +1,305 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::error::Result;
+use crate::metadata::{DataType, RowType};
+use crate::row::{Datum, InternalRow};
+
+#[derive(Clone)]
+pub enum FieldGetter {
+    Nullable(InnerFieldGetter), // `get_field` checks `is_null_at` first and yields `Datum::Null` for a null field
+    NonNullable(InnerFieldGetter), // `get_field` reads the field directly, with no null check
+}
+impl FieldGetter {
+    /// Reads this getter's field from `row`; a nullable getter returns `Datum::Null` for a null field.
+    pub fn get_field<'a>(&self, row: &'a dyn InternalRow) -> Result<Datum<'a>> {
+        if let Self::Nullable(inner) = self {
+            if row.is_null_at(inner.pos())? {
+                return Ok(Datum::Null);
+            }
+        }
+        let inner = match self {
+            Self::Nullable(inner) | Self::NonNullable(inner) => inner,
+        };
+        inner.get_field(row)
+    }
+
+    /// Creates one getter per field of `row_type`, in field order.
+    #[allow(dead_code)]
+    pub fn create_field_getters(row_type: &RowType) -> Box<[FieldGetter]> {
+        let mut getters = Vec::new();
+        for (pos, field) in row_type.fields().iter().enumerate() {
+            getters.push(Self::create(field.data_type(), pos));
+        }
+        getters.into_boxed_slice()
+    }
+
+    /// Creates the getter for a field of `data_type` at `pos`, wrapped according to its nullability.
+    pub fn create(data_type: &DataType, pos: usize) -> FieldGetter {
+        let inner = match data_type {
+            DataType::Char(char_type) => InnerFieldGetter::Char {
+                pos,
+                len: char_type.length() as usize,
+            },
+            DataType::String(_) => InnerFieldGetter::String { pos },
+            DataType::Boolean(_) => InnerFieldGetter::Bool { pos },
+            DataType::Binary(binary_type) => InnerFieldGetter::Binary {
+                pos,
+                len: binary_type.length(),
+            },
+            DataType::Bytes(_) => InnerFieldGetter::Bytes { pos },
+            DataType::TinyInt(_) => InnerFieldGetter::TinyInt { pos },
+            DataType::SmallInt(_) => InnerFieldGetter::SmallInt { pos },
+            DataType::Int(_) => InnerFieldGetter::Int { pos },
+            DataType::BigInt(_) => InnerFieldGetter::BigInt { pos },
+            DataType::Float(_) => InnerFieldGetter::Float { pos },
+            DataType::Double(_) => InnerFieldGetter::Double { pos },
+            DataType::Decimal(dec) => InnerFieldGetter::Decimal {
+                pos,
+                precision: dec.precision() as usize,
+                scale: dec.scale() as usize,
+            },
+            DataType::Date(_) => InnerFieldGetter::Date { pos },
+            DataType::Time(_) => InnerFieldGetter::Time { pos },
+            DataType::Timestamp(ts) => InnerFieldGetter::Timestamp {
+                pos,
+                precision: ts.precision(),
+            },
+            DataType::TimestampLTz(ts) => InnerFieldGetter::TimestampLtz {
+                pos,
+                precision: ts.precision(),
+            },
+            DataType::Array(_) => InnerFieldGetter::Array { pos },
+            DataType::Map(map_type) => InnerFieldGetter::Map {
+                pos,
+                key_type: map_type.key_type().clone(),
+                value_type: map_type.value_type().clone(),
+            },
+            DataType::Row(_) => InnerFieldGetter::Row { pos },
+        };
+
+        match data_type.is_nullable() {
+            true => Self::Nullable(inner),
+            false => Self::NonNullable(inner),
+        }
+    }
+}
+
+#[derive(Clone)] // one variant per supported data type; every variant carries the field position `pos`
+pub enum InnerFieldGetter {
+    Char {
+        pos: usize,
+        len: usize, // forwarded to `row.get_char(pos, len)`
+    },
+    String {
+        pos: usize,
+    },
+    Bool {
+        pos: usize,
+    },
+    Binary {
+        pos: usize,
+        len: usize, // forwarded to `row.get_binary(pos, len)`
+    },
+    Bytes {
+        pos: usize,
+    },
+    TinyInt {
+        pos: usize, // read with `row.get_byte`
+    },
+    SmallInt {
+        pos: usize, // read with `row.get_short`
+    },
+    Int {
+        pos: usize,
+    },
+    BigInt {
+        pos: usize, // read with `row.get_long`
+    },
+    Float {
+        pos: usize,
+    },
+    Double {
+        pos: usize,
+    },
+    Decimal {
+        pos: usize,
+        precision: usize, // forwarded to `row.get_decimal` together with `scale`
+        scale: usize,
+    },
+    Date {
+        pos: usize,
+    },
+    Time {
+        pos: usize,
+    },
+    Timestamp {
+        pos: usize,
+        precision: u32, // forwarded to `row.get_timestamp_ntz`
+    },
+    TimestampLtz {
+        pos: usize,
+        precision: u32, // forwarded to `row.get_timestamp_ltz`
+    },
+    Array {
+        pos: usize,
+    },
+    Map {
+        pos: usize,
+        key_type: DataType, // stored at creation; not read by `get_field` in this file
+        value_type: DataType, // stored at creation; not read by `get_field` in this file
+    },
+    Row {
+        pos: usize,
+    },
+}
+
+impl InnerFieldGetter {
+    /// Reads the field at this getter's position with the row accessor matching the variant.
+    pub fn get_field<'a>(&self, row: &'a dyn InternalRow) -> Result<Datum<'a>> {
+        let datum = match self {
+            Self::Char { pos, len } => Datum::from(row.get_char(*pos, *len)?),
+            Self::String { pos } => Datum::from(row.get_string(*pos)?),
+            Self::Bool { pos } => Datum::from(row.get_boolean(*pos)?),
+            Self::Binary { pos, len } => Datum::from(row.get_binary(*pos, *len)?),
+            Self::Bytes { pos } => Datum::from(row.get_bytes(*pos)?),
+            Self::TinyInt { pos } => Datum::from(row.get_byte(*pos)?),
+            Self::SmallInt { pos } => Datum::from(row.get_short(*pos)?),
+            Self::Int { pos } => Datum::from(row.get_int(*pos)?),
+            Self::BigInt { pos } => Datum::from(row.get_long(*pos)?),
+            Self::Float { pos } => Datum::from(row.get_float(*pos)?),
+            Self::Double { pos } => Datum::from(row.get_double(*pos)?),
+            Self::Decimal { pos, precision, scale } => {
+                Datum::Decimal(row.get_decimal(*pos, *precision, *scale)?)
+            }
+            Self::Date { pos } => Datum::Date(row.get_date(*pos)?),
+            Self::Time { pos } => Datum::Time(row.get_time(*pos)?),
+            Self::Timestamp { pos, precision } => {
+                Datum::TimestampNtz(row.get_timestamp_ntz(*pos, *precision)?)
+            }
+            Self::TimestampLtz { pos, precision } => {
+                Datum::TimestampLtz(row.get_timestamp_ltz(*pos, *precision)?)
+            }
+            Self::Array { pos } => Datum::Array(row.get_array(*pos)?),
+            Self::Map { pos, .. } => Datum::Map(row.get_map(*pos)?),
+            Self::Row { pos } => Datum::Row(Box::new(row.get_row(*pos)?.clone())),
+        };
+        Ok(datum)
+    }
+
+    pub fn pos(&self) -> usize {
+        match self { // every variant stores its field position as `pos`
+            Self::Char { pos, .. }
+            | Self::String { pos, .. }
+            | Self::Bool { pos, .. }
+            | Self::Binary { pos, .. }
+            | Self::Bytes { pos, .. }
+            | Self::TinyInt { pos, .. }
+            | Self::SmallInt { pos, .. }
+            | Self::Int { pos, .. }
+            | Self::BigInt { pos, .. }
+            | Self::Float { pos, .. }
+            | Self::Double { pos, .. }
+            | Self::Decimal { pos, .. }
+            | Self::Date { pos, .. }
+            | Self::Time { pos, .. }
+            | Self::Timestamp { pos, .. }
+            | Self::TimestampLtz { pos, .. }
+            | Self::Array { pos, .. }
+            | Self::Map { pos, .. }
+            | Self::Row { pos, .. } => *pos,
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::metadata::DataTypes;
+    use crate::row::GenericRow;
+    use crate::row::binary_array::FlussArrayWriter;
+    use crate::row::binary_map::FlussMapWriter;
+
+    #[test]
+    fn test_field_getter_array() {
+        // Build the two-element INT array [10, 20] and place it at position 1.
+        let int_type = DataTypes::int();
+        let mut writer = FlussArrayWriter::new(2, &int_type);
+        writer.write_int(0, 10);
+        writer.write_int(1, 20);
+        let array = writer.complete().unwrap();
+
+        let mut row = GenericRow::new(2);
+        row.set_field(0, Datum::Int32(42));
+        row.set_field(1, Datum::Array(array));
+
+        let array_type = DataTypes::array(DataTypes::int());
+        let fetched = FieldGetter::create(&array_type, 1).get_field(&row).unwrap();
+
+        if let Datum::Array(a) = fetched {
+            assert_eq!(a.size(), 2);
+            assert_eq!(a.get_int(0).unwrap(), 10);
+            assert_eq!(a.get_int(1).unwrap(), 20);
+        } else {
+            panic!("Expected Array datum");
+        }
+    }
+
+    #[test]
+    fn test_field_getter_nullable_array() {
+        // A null field read through a getter created for ARRAY<INT> must come back as null.
+        let array_type = DataTypes::array(DataTypes::int());
+        let null_row = GenericRow::from_data(vec![Datum::Null]);
+
+        let fetched = FieldGetter::create(&array_type, 0).get_field(&null_row).unwrap();
+        assert!(fetched.is_null());
+    }
+
+    #[test]
+    fn test_field_getter_map() {
+        // Single-entry map 42 -> "value", stored at position 1 of a two-field row.
+        let mut writer = FlussMapWriter::new(1, &DataTypes::int(), &DataTypes::string());
+        writer.write_entry(42.into(), "value".into()).unwrap();
+        let map = writer.complete().unwrap();
+
+        let mut row = GenericRow::new(2);
+        row.set_field(0, Datum::Int32(1));
+        row.set_field(1, Datum::Map(map));
+
+        // Read the map back through a getter created for MAP<INT, STRING>.
+        let map_type = DataTypes::map(DataTypes::int(), DataTypes::string());
+        let fetched = FieldGetter::create(&map_type, 1).get_field(&row).unwrap();
+
+        if let Datum::Map(m) = fetched {
+            assert_eq!(m.size(), 1);
+            assert_eq!(m.key_array().get_int(0).unwrap(), 42);
+            assert_eq!(m.value_array().get_string(0).unwrap(), "value");
+        } else {
+            panic!("Expected Map datum");
+        }
+    }
+
+    #[test]
+    fn test_field_getter_nullable_map() {
+        // A null field read through a getter created for MAP<INT, STRING> must come back as null.
+        let map_type = DataTypes::map(DataTypes::int(), DataTypes::string());
+        let null_row = GenericRow::from_data(vec![Datum::Null]);
+
+        let fetched = FieldGetter::create(&map_type, 0).get_field(&null_row).unwrap();
+        assert!(fetched.is_null());
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/row/fixed_schema_decoder.rs b/fluss-rust/crates/fluss/src/row/fixed_schema_decoder.rs
new file mode 100644
index 0000000000..eec83f3d39
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/row/fixed_schema_decoder.rs
@@ -0,0 +1,196 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Decode a `[schema_id (2 bytes) | row]` value into an [`InternalRow`]
+//! conforming to a fixed target schema, projecting across schema
+//! versions when needed.
+
+use crate::error::{Error, Result};
+use crate::metadata::{KvFormat, Schema, index_mapping};
+use crate::record::kv::SCHEMA_ID_LENGTH;
+use crate::row::{LookupRow, ProjectedRow, RowDecoder, RowDecoderFactory};
+use std::sync::Arc;
+
+/// Decodes stored `[schema_id | row]` values and, when built with a target
+/// schema, exposes the decoded row through an index-mapping projection.
+pub(crate) struct FixedSchemaDecoder {
+    /// Decoder applied to the bytes that follow the schema-id prefix.
+    row_decoder: Arc<dyn RowDecoder>,
+    /// Mapping computed by `index_mapping(source, target)`; `None` means
+    /// decoded rows are returned as-is, without projection.
+    index_mapping: Option<Arc<[i32]>>,
+}
+
+impl FixedSchemaDecoder {
+    /// Creates a decoder that returns rows exactly as laid out by `schema`,
+    /// with no projection step.
+    ///
+    /// Any error from `RowDecoderFactory::create` is propagated.
+    pub fn new_no_projection(kv_format: KvFormat, schema: &Schema) -> Result<Self> {
+        let row_type = schema.row_type().clone();
+        let row_decoder = RowDecoderFactory::create(kv_format, row_type)?;
+        Ok(Self {
+            row_decoder,
+            index_mapping: None,
+        })
+    }
+
+    /// Creates a decoder that reads bytes written under `source_schema` and
+    /// presents them through the mapping computed for `target_schema`.
+    ///
+    /// Errors from `index_mapping` are propagated before the row decoder is
+    /// built; errors from `RowDecoderFactory::create` are propagated after.
+    pub fn new(
+        kv_format: KvFormat,
+        source_schema: &Schema,
+        target_schema: &Schema,
+    ) -> Result<Self> {
+        let target_to_source = index_mapping(source_schema, target_schema)?;
+        let row_decoder = RowDecoderFactory::create(kv_format, source_schema.row_type().clone())?;
+        // Shared so that every decoded row can hold the mapping without copying it.
+        let shared_mapping: Arc<[i32]> = Arc::from(target_to_source.into_boxed_slice());
+        Ok(Self {
+            row_decoder,
+            index_mapping: Some(shared_mapping),
+        })
+    }
+
+    /// Decodes one stored value into a [`LookupRow`].
+    ///
+    /// The first `SCHEMA_ID_LENGTH` bytes are skipped without being inspected;
+    /// the remainder is handed to the row decoder. Returns
+    /// `Error::RowConvertError` when `value_bytes` is shorter than the prefix.
+    pub fn decode<'a>(&self, value_bytes: &'a [u8]) -> Result<LookupRow<'a>> {
+        let payload = match value_bytes.get(SCHEMA_ID_LENGTH..) {
+            Some(rest) => rest,
+            None => {
+                return Err(Error::RowConvertError {
+                    message: format!(
+                        "Row payload too short: {} bytes, need at least {} for schema id",
+                        value_bytes.len(),
+                        SCHEMA_ID_LENGTH
+                    ),
+                });
+            }
+        };
+
+        let decoded = self.row_decoder.decode(payload);
+        let lookup_row = match self.index_mapping.as_ref() {
+            Some(mapping) => LookupRow::projected(ProjectedRow::new(decoded, Arc::clone(mapping))),
+            None => LookupRow::raw(decoded),
+        };
+        Ok(lookup_row)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::metadata::{Column, DataTypes, Schema};
+    use crate::record::kv::SCHEMA_ID_LENGTH;
+    use crate::row::InternalRow;
+    use crate::row::binary::BinaryWriter;
+    use crate::row::compacted::CompactedRowWriter;
+
+    /// Builds a schema from `(column id, name, type)` triples.
+    fn schema_with_ids(columns: &[(i32, &str, crate::metadata::DataType)]) -> Schema {
+        let mut cols: Vec<Column> = Vec::with_capacity(columns.len());
+        for (id, name, dt) in columns {
+            cols.push(Column::new(*name, dt.clone()).with_id(*id));
+        }
+        Schema::builder().with_columns(cols).build().unwrap()
+    }
+
+    /// Produces a stored value: little-endian schema id followed by the encoded row.
+    fn write_value(schema_id: i16, writer: CompactedRowWriter) -> Vec<u8> {
+        let encoded_row = writer.to_bytes();
+        let mut value = Vec::with_capacity(SCHEMA_ID_LENGTH + encoded_row.len());
+        value.extend_from_slice(&schema_id.to_le_bytes());
+        value.extend_from_slice(encoded_row.as_ref());
+        value
+    }
+
+    #[test]
+    fn decode_no_projection_strips_schema_id_and_returns_row() {
+        let schema = schema_with_ids(&[(0, "a", DataTypes::int()), (1, "b", DataTypes::string())]);
+
+        let mut row_writer = CompactedRowWriter::new(2);
+        row_writer.write_int(42);
+        row_writer.write_string("hi");
+        let stored = write_value(7, row_writer);
+
+        let decoder = FixedSchemaDecoder::new_no_projection(KvFormat::COMPACTED, &schema).unwrap();
+        let decoded = decoder.decode(&stored).unwrap();
+
+        assert_eq!(decoded.get_field_count(), 2);
+        assert_eq!(decoded.get_int(0).unwrap(), 42);
+        assert_eq!(decoded.get_string(1).unwrap(), "hi");
+    }
+
+    #[test]
+    fn decode_with_projection_pads_missing_field_with_null() {
+        // Older source schema: [a:int, b:string].
+        let source = schema_with_ids(&[(0, "a", DataTypes::int()), (1, "b", DataTypes::string())]);
+        // Newer target schema: same columns plus c at id=2.
+        let target = schema_with_ids(&[
+            (0, "a", DataTypes::int()),
+            (1, "b", DataTypes::string()),
+            (2, "c", DataTypes::bigint()),
+        ]);
+
+        let mut row_writer = CompactedRowWriter::new(2);
+        row_writer.write_int(7);
+        row_writer.write_string("seven");
+        let stored = write_value(0, row_writer);
+
+        let decoder = FixedSchemaDecoder::new(KvFormat::COMPACTED, &source, &target).unwrap();
+        let decoded = decoder.decode(&stored).unwrap();
+
+        assert_eq!(decoded.get_field_count(), 3);
+        assert_eq!(decoded.get_int(0).unwrap(), 7);
+        assert_eq!(decoded.get_string(1).unwrap(), "seven");
+        let added_column_is_null = decoded.is_null_at(2).unwrap();
+        assert!(
+            added_column_is_null,
+            "added-but-missing column must read as null"
+        );
+    }
+
+    #[test]
+    fn decode_with_projection_drops_removed_field() {
+        // Older source schema: [a, b, c].
+        let source = schema_with_ids(&[
+            (0, "a", DataTypes::int()),
+            (1, "b", DataTypes::string()),
+            (2, "c", DataTypes::bigint()),
+        ]);
+        // Newer target schema: b has been dropped.
+        let target = schema_with_ids(&[(0, "a", DataTypes::int()), (2, "c", DataTypes::bigint())]);
+
+        let mut row_writer = CompactedRowWriter::new(3);
+        row_writer.write_int(1);
+        row_writer.write_string("dropped");
+        row_writer.write_long(99);
+        let stored = write_value(0, row_writer);
+
+        let decoder = FixedSchemaDecoder::new(KvFormat::COMPACTED, &source, &target).unwrap();
+        let decoded = decoder.decode(&stored).unwrap();
+
+        assert_eq!(decoded.get_field_count(), 2);
+        assert_eq!(decoded.get_int(0).unwrap(), 1);
+        assert_eq!(decoded.get_long(1).unwrap(), 99);
+    }
+
+    #[test]
+    fn decode_with_projection_reorders_fields() {
+        let source = schema_with_ids(&[(0, "a", DataTypes::int()), (1, "b", DataTypes::string())]);
+        // Target lists b before a.
+        let target = schema_with_ids(&[(1, "b", DataTypes::string()), (0, "a", DataTypes::int())]);
+
+        let mut row_writer = CompactedRowWriter::new(2);
+        row_writer.write_int(123);
+        row_writer.write_string("xyz");
+        let stored = write_value(0, row_writer);
+
+        let decoder = FixedSchemaDecoder::new(KvFormat::COMPACTED, &source, &target).unwrap();
+        let decoded = decoder.decode(&stored).unwrap();
+
+        assert_eq!(decoded.get_string(0).unwrap(), "xyz");
+        assert_eq!(decoded.get_int(1).unwrap(), 123);
+    }
+
+    #[test]
+    fn decode_payload_too_short_errors() {
+        let schema = schema_with_ids(&[(0, "a", DataTypes::int())]);
+        let decoder = FixedSchemaDecoder::new_no_projection(KvFormat::COMPACTED, &schema).unwrap();
+        // A single byte cannot even hold the schema id.
+        let truncated = [0u8];
+        match decoder.decode(&truncated) {
+            Err(e) => assert!(e.to_string().contains("too short"), "got: {e}"),
+            Ok(_) => panic!("expected error for short payload"),
+        }
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/row/lookup_row.rs b/fluss-rust/crates/fluss/src/row/lookup_row.rs
new file mode 100644
index 0000000000..6271a7ebff
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/row/lookup_row.rs
@@ -0,0 +1,128 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Return type of [`crate::client::table::LookupResult`] getters: a row
+//! decoded under the table's current schema, possibly via projection
+//! over an older schema's bytes.
+
+use crate::client::WriteFormat;
+use crate::error::Result;
+use crate::row::compacted::CompactedRow;
+use crate::row::datum::{Date, Time, TimestampLtz, TimestampNtz};
+use crate::row::projected_row::ProjectedRow;
+use crate::row::{Decimal, FlussArray, FlussMap, GenericRow, InternalRow};
+
+/// Row returned from a lookup; implements [`InternalRow`] by forwarding
+/// every accessor to the wrapped raw or projected row.
+pub struct LookupRow<'a> {
+    // Which concrete representation backs this row; never exposed publicly.
+    inner: Inner<'a>,
+}
+
+/// The two concrete row representations a [`LookupRow`] can wrap.
+enum Inner<'a> {
+    /// The decoded row used directly, with no index mapping applied.
+    Raw(CompactedRow<'a>),
+    /// The decoded row viewed through a [`ProjectedRow`] index mapping.
+    Projected(ProjectedRow<CompactedRow<'a>>),
+}
+
+impl<'a> LookupRow<'a> {
+    /// Wraps a decoded row that needs no projection.
+    pub(crate) fn raw(row: CompactedRow<'a>) -> Self {
+        let inner = Inner::Raw(row);
+        Self { inner }
+    }
+
+    /// Wraps a decoded row that is viewed through a projection.
+    pub(crate) fn projected(row: ProjectedRow<CompactedRow<'a>>) -> Self {
+        let inner = Inner::Projected(row);
+        Self { inner }
+    }
+}
+
+// Forwards `$method($arg, ...)` to whichever row variant `$self.inner` holds.
+// Both arms make the same call; the match only selects the concrete type.
+macro_rules! delegate {
+    ($self:ident, $method:ident $(, $arg:expr)*) => {
+        match &$self.inner {
+            Inner::Raw(r) => r.$method($($arg),*),
+            Inner::Projected(r) => r.$method($($arg),*),
+        }
+    };
+}
+
+// Every accessor is a pure pass-through: `delegate!` calls the method of the
+// same name, with the same arguments, on the wrapped raw or projected row.
+impl<'a> InternalRow for LookupRow<'a> {
+    fn get_field_count(&self) -> usize {
+        delegate!(self, get_field_count)
+    }
+    fn is_null_at(&self, pos: usize) -> Result<bool> {
+        delegate!(self, is_null_at, pos)
+    }
+    fn get_boolean(&self, pos: usize) -> Result<bool> {
+        delegate!(self, get_boolean, pos)
+    }
+    fn get_byte(&self, pos: usize) -> Result<i8> {
+        delegate!(self, get_byte, pos)
+    }
+    fn get_short(&self, pos: usize) -> Result<i16> {
+        delegate!(self, get_short, pos)
+    }
+    fn get_int(&self, pos: usize) -> Result<i32> {
+        delegate!(self, get_int, pos)
+    }
+    fn get_long(&self, pos: usize) -> Result<i64> {
+        delegate!(self, get_long, pos)
+    }
+    fn get_float(&self, pos: usize) -> Result<f32> {
+        delegate!(self, get_float, pos)
+    }
+    fn get_double(&self, pos: usize) -> Result<f64> {
+        delegate!(self, get_double, pos)
+    }
+    fn get_char(&self, pos: usize, length: usize) -> Result<&str> {
+        delegate!(self, get_char, pos, length)
+    }
+    fn get_string(&self, pos: usize) -> Result<&str> {
+        delegate!(self, get_string, pos)
+    }
+    fn get_decimal(&self, pos: usize, precision: usize, scale: usize) -> Result<Decimal> {
+        delegate!(self, get_decimal, pos, precision, scale)
+    }
+    fn get_date(&self, pos: usize) -> Result<Date> {
+        delegate!(self, get_date, pos)
+    }
+    fn get_time(&self, pos: usize) -> Result<Time> {
+        delegate!(self, get_time, pos)
+    }
+    fn get_timestamp_ntz(&self, pos: usize, precision: u32) -> Result<TimestampNtz> {
+        delegate!(self, get_timestamp_ntz, pos, precision)
+    }
+    fn get_timestamp_ltz(&self, pos: usize, precision: u32) -> Result<TimestampLtz> {
+        delegate!(self, get_timestamp_ltz, pos, precision)
+    }
+    fn get_binary(&self, pos: usize, length: usize) -> Result<&[u8]> {
+        delegate!(self, get_binary, pos, length)
+    }
+    fn get_bytes(&self, pos: usize) -> Result<&[u8]> {
+        delegate!(self, get_bytes, pos)
+    }
+    fn get_array(&self, pos: usize) -> Result<FlussArray> {
+        delegate!(self, get_array, pos)
+    }
+    fn get_map(&self, pos: usize) -> Result<FlussMap> {
+        delegate!(self, get_map, pos)
+    }
+    // The two methods below have trait defaults; they are overridden so the
+    // wrapped row's own implementation is used instead of the default.
+    fn get_row(&self, pos: usize) -> Result<&GenericRow<'_>> {
+        delegate!(self, get_row, pos)
+    }
+    fn as_encoded_bytes(&self, write_format: WriteFormat) -> Option<&[u8]> {
+        delegate!(self, as_encoded_bytes, write_format)
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/row/mod.rs b/fluss-rust/crates/fluss/src/row/mod.rs
new file mode 100644
index 0000000000..1e045b2d14
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/row/mod.rs
@@ -0,0 +1,441 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+pub mod binary_array;
+pub mod binary_map;
+mod column;
+
+pub(crate) mod datum;
+mod decimal;
+
+pub mod binary;
+pub(crate) mod column_writer;
+pub mod compacted;
+pub mod encode;
+pub mod field_getter;
+mod fixed_schema_decoder;
+mod lookup_row;
+mod projected_row;
+mod row_decoder;
+
+use crate::client::WriteFormat;
+pub use binary_array::{FlussArray, FlussArrayWriter};
+pub use binary_map::{FlussMap, FlussMapWriter};
+use bytes::Bytes;
+pub use column::*;
+pub use compacted::CompactedRow;
+pub use datum::*;
+pub use decimal::{Decimal, MAX_COMPACT_PRECISION};
+pub use encode::KeyEncoder;
+pub(crate) use fixed_schema_decoder::FixedSchemaDecoder;
+pub use lookup_row::LookupRow;
+pub(crate) use projected_row::ProjectedRow;
+pub use row_decoder::{CompactedRowDecoder, RowDecoder, RowDecoderFactory};
+use serde::Serialize;
+
+/// A row held in its binary form, either owning its bytes or borrowing them.
+pub struct BinaryRow<'a> {
+    // Backing storage; see `BinaryDataWrapper` for the two supported forms.
+    data: BinaryDataWrapper<'a>,
+}
+
+/// Backing storage for a [`BinaryRow`].
+pub enum BinaryDataWrapper<'a> {
+    /// Bytes held in a `bytes::Bytes` buffer.
+    Bytes(Bytes),
+    /// Bytes borrowed from a slice that lives for `'a`.
+    Ref(&'a [u8]),
+}
+
+impl<'a> BinaryRow<'a> {
+    /// Exposes the row's binary form as a byte slice, regardless of whether
+    /// the bytes are held in a `Bytes` buffer or borrowed.
+    pub fn as_bytes(&'a self) -> &'a [u8] {
+        match self.data {
+            BinaryDataWrapper::Bytes(ref owned) => owned.as_ref(),
+            BinaryDataWrapper::Ref(borrowed) => borrowed,
+        }
+    }
+}
+
+use crate::error::Error::IllegalArgument;
+use crate::error::Result;
+
+/// Read-only, position-addressed access to the fields of a row.
+///
+/// Typed accessors return a `Result`; for example, `GenericRow` reports an
+/// out-of-range position or a type mismatch as an `IllegalArgument` error.
+/// Implementations must be `Send + Sync`.
+pub trait InternalRow: Send + Sync {
+    /// Returns the number of fields in this row
+    fn get_field_count(&self) -> usize;
+
+    /// Returns true if the element is null at the given position
+    fn is_null_at(&self, pos: usize) -> Result<bool>;
+
+    /// Returns the boolean value at the given position
+    fn get_boolean(&self, pos: usize) -> Result<bool>;
+
+    /// Returns the byte value at the given position
+    fn get_byte(&self, pos: usize) -> Result<i8>;
+
+    /// Returns the short value at the given position
+    fn get_short(&self, pos: usize) -> Result<i16>;
+
+    /// Returns the integer value at the given position
+    fn get_int(&self, pos: usize) -> Result<i32>;
+
+    /// Returns the long value at the given position
+    fn get_long(&self, pos: usize) -> Result<i64>;
+
+    /// Returns the float value at the given position
+    fn get_float(&self, pos: usize) -> Result<f32>;
+
+    /// Returns the double value at the given position
+    fn get_double(&self, pos: usize) -> Result<f64>;
+
+    /// Returns the string value at the given position with fixed length
+    fn get_char(&self, pos: usize, length: usize) -> Result<&str>;
+
+    /// Returns the string value at the given position
+    fn get_string(&self, pos: usize) -> Result<&str>;
+
+    /// Returns the decimal value at the given position
+    fn get_decimal(&self, pos: usize, precision: usize, scale: usize) -> Result<Decimal>;
+
+    /// Returns the date value at the given position (date as days since epoch)
+    fn get_date(&self, pos: usize) -> Result<Date>;
+
+    /// Returns the time value at the given position (time as milliseconds since midnight)
+    fn get_time(&self, pos: usize) -> Result<Time>;
+
+    /// Returns the timestamp value at the given position (timestamp without timezone)
+    ///
+    /// The precision is required to determine whether the timestamp value was stored
+    /// in a compact representation (precision <= 3) or with nanosecond precision.
+    fn get_timestamp_ntz(&self, pos: usize, precision: u32) -> Result<TimestampNtz>;
+
+    /// Returns the timestamp value at the given position (timestamp with local timezone)
+    ///
+    /// The precision is required to determine whether the timestamp value was stored
+    /// in a compact representation (precision <= 3) or with nanosecond precision.
+    fn get_timestamp_ltz(&self, pos: usize, precision: u32) -> Result<TimestampLtz>;
+
+    /// Returns the binary value at the given position with fixed length
+    fn get_binary(&self, pos: usize, length: usize) -> Result<&[u8]>;
+
+    /// Returns the binary value at the given position
+    fn get_bytes(&self, pos: usize) -> Result<&[u8]>;
+
+    /// Returns the array value at the given position
+    fn get_array(&self, pos: usize) -> Result<FlussArray>;
+
+    /// Returns the map value at the given position
+    fn get_map(&self, pos: usize) -> Result<FlussMap>;
+
+    /// Returns the nested row value at the given position
+    ///
+    /// The default implementation always fails with an `IllegalArgument`
+    /// error; implementations that can hold nested rows override it.
+    fn get_row(&self, pos: usize) -> Result<&GenericRow<'_>> {
+        Err(IllegalArgument {
+            message: format!("get_row not supported at position {pos}"),
+        })
+    }
+
+    /// Returns encoded bytes if already encoded
+    ///
+    /// The default implementation returns `None` for every write format.
+    fn as_encoded_bytes(&self, _write_format: WriteFormat) -> Option<&[u8]> {
+        None
+    }
+}
+
+/// A row stored as a plain vector of [`Datum`] values, one per field.
+#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize)]
+pub struct GenericRow<'a> {
+    /// Field values in positional order; the vector length is the field count.
+    pub values: Vec<Datum<'a>>,
+}
+
+impl<'a> GenericRow<'a> {
+    /// Looks up the datum stored at `pos`, reporting an `IllegalArgument`
+    /// error when `pos` is past the last field.
+    fn get_value(&self, pos: usize) -> Result<&Datum<'a>> {
+        let found = self.values.get(pos);
+        found.ok_or_else(|| {
+            let message = format!(
+                "position {pos} out of bounds (row has {} fields)",
+                self.values.len()
+            );
+            IllegalArgument { message }
+        })
+    }
+
+    /// Fetches the datum at `pos` and converts it to `T` via `TryFrom`.
+    ///
+    /// `expected_type` is only used to build the error message when the
+    /// conversion fails.
+    fn try_convert<T>(&'a self, pos: usize, expected_type: &str) -> Result<T>
+    where
+        T: TryFrom<&'a Datum<'a>>,
+    {
+        let datum = self.get_value(pos)?;
+        T::try_from(datum).map_err(|_| {
+            let message = format!(
+                "type mismatch at position {pos}: expected {expected_type}, got {datum:?}"
+            );
+            IllegalArgument { message }
+        })
+    }
+}
+
+// Accessors backed by the `values` vector. Primitive and string getters go
+// through `try_convert`; the rest inspect the datum variant directly and
+// fall through to a type-mismatch error when the variant is not accepted.
+impl<'a> InternalRow for GenericRow<'a> {
+    fn get_field_count(&self) -> usize {
+        self.values.len()
+    }
+
+    fn is_null_at(&self, pos: usize) -> Result<bool> {
+        let datum = self.get_value(pos)?;
+        Ok(datum.is_null())
+    }
+
+    fn get_boolean(&self, pos: usize) -> Result<bool> {
+        self.try_convert(pos, "Boolean")
+    }
+
+    fn get_byte(&self, pos: usize) -> Result<i8> {
+        self.try_convert(pos, "TinyInt")
+    }
+
+    fn get_short(&self, pos: usize) -> Result<i16> {
+        self.try_convert(pos, "SmallInt")
+    }
+
+    fn get_int(&self, pos: usize) -> Result<i32> {
+        self.try_convert(pos, "Int")
+    }
+
+    fn get_long(&self, pos: usize) -> Result<i64> {
+        self.try_convert(pos, "BigInt")
+    }
+
+    fn get_float(&self, pos: usize) -> Result<f32> {
+        self.try_convert(pos, "Float")
+    }
+
+    fn get_double(&self, pos: usize) -> Result<f64> {
+        self.try_convert(pos, "Double")
+    }
+
+    fn get_char(&self, pos: usize, _length: usize) -> Result<&str> {
+        // The declared length is deliberately not enforced (same as the Java client).
+        self.get_string(pos)
+    }
+
+    fn get_string(&self, pos: usize) -> Result<&str> {
+        self.try_convert(pos, "String")
+    }
+
+    fn get_decimal(&self, pos: usize, _precision: usize, _scale: usize) -> Result<Decimal> {
+        let other = self.get_value(pos)?;
+        if let Datum::Decimal(d) = other {
+            return Ok(d.clone());
+        }
+        Err(IllegalArgument {
+            message: format!("type mismatch at position {pos}: expected Decimal, got {other:?}"),
+        })
+    }
+
+    fn get_date(&self, pos: usize) -> Result<Date> {
+        let other = self.get_value(pos)?;
+        // A plain Int32 is accepted and wrapped via `Date::new`.
+        match other {
+            Datum::Date(d) => return Ok(*d),
+            Datum::Int32(i) => return Ok(Date::new(*i)),
+            _ => {}
+        }
+        Err(IllegalArgument {
+            message: format!(
+                "type mismatch at position {pos}: expected Date or Int32, got {other:?}"
+            ),
+        })
+    }
+
+    fn get_time(&self, pos: usize) -> Result<Time> {
+        let other = self.get_value(pos)?;
+        // A plain Int32 is accepted and wrapped via `Time::new`.
+        match other {
+            Datum::Time(t) => return Ok(*t),
+            Datum::Int32(i) => return Ok(Time::new(*i)),
+            _ => {}
+        }
+        Err(IllegalArgument {
+            message: format!(
+                "type mismatch at position {pos}: expected Time or Int32, got {other:?}"
+            ),
+        })
+    }
+
+    fn get_timestamp_ntz(&self, pos: usize, _precision: u32) -> Result<TimestampNtz> {
+        let other = self.get_value(pos)?;
+        if let Datum::TimestampNtz(t) = other {
+            return Ok(*t);
+        }
+        Err(IllegalArgument {
+            message: format!(
+                "type mismatch at position {pos}: expected TimestampNtz, got {other:?}"
+            ),
+        })
+    }
+
+    fn get_timestamp_ltz(&self, pos: usize, _precision: u32) -> Result<TimestampLtz> {
+        let other = self.get_value(pos)?;
+        if let Datum::TimestampLtz(t) = other {
+            return Ok(*t);
+        }
+        Err(IllegalArgument {
+            message: format!(
+                "type mismatch at position {pos}: expected TimestampLtz, got {other:?}"
+            ),
+        })
+    }
+
+    fn get_binary(&self, pos: usize, _length: usize) -> Result<&[u8]> {
+        let other = self.get_value(pos)?;
+        if let Datum::Blob(b) = other {
+            return Ok(b.as_ref());
+        }
+        Err(IllegalArgument {
+            message: format!("type mismatch at position {pos}: expected Binary, got {other:?}"),
+        })
+    }
+
+    fn get_bytes(&self, pos: usize) -> Result<&[u8]> {
+        let other = self.get_value(pos)?;
+        if let Datum::Blob(b) = other {
+            return Ok(b.as_ref());
+        }
+        Err(IllegalArgument {
+            message: format!("type mismatch at position {pos}: expected Bytes, got {other:?}"),
+        })
+    }
+
+    fn get_array(&self, pos: usize) -> Result<FlussArray> {
+        let other = self.get_value(pos)?;
+        if let Datum::Array(a) = other {
+            return Ok(a.clone());
+        }
+        Err(IllegalArgument {
+            message: format!("type mismatch at position {pos}: expected Array, got {other:?}"),
+        })
+    }
+
+    fn get_map(&self, pos: usize) -> Result<FlussMap> {
+        let other = self.get_value(pos)?;
+        if let Datum::Map(m) = other {
+            return Ok(m.clone());
+        }
+        Err(IllegalArgument {
+            message: format!("type mismatch at position {pos}: expected Map, got {other:?}"),
+        })
+    }
+
+    fn get_row(&self, pos: usize) -> Result<&GenericRow<'_>> {
+        let other = self.get_value(pos)?;
+        if let Datum::Row(r) = other {
+            return Ok(r.as_ref());
+        }
+        Err(IllegalArgument {
+            message: format!("type mismatch at position {pos}: expected Row, got {other:?}"),
+        })
+    }
+}
+
+impl<'a> GenericRow<'a> {
+    /// Converts this row into one that borrows nothing: each value is passed
+    /// through `Datum::into_owned`, yielding `'static` datums.
+    /// The result can therefore outlive the bytes the row was decoded from.
+    pub fn into_owned(self) -> GenericRow<'static> {
+        let owned_values: Vec<Datum<'static>> =
+            self.values.into_iter().map(Datum::into_owned).collect();
+        GenericRow {
+            values: owned_values,
+        }
+    }
+}
+
+impl<'a> GenericRow<'a> {
+    /// Builds a row from already-known values, converting each element into
+    /// a [`Datum`] and keeping the original order.
+    pub fn from_data(data: Vec<impl Into<Datum<'a>>>) -> GenericRow<'a> {
+        let values: Vec<Datum<'a>> = data.into_iter().map(|item| item.into()).collect();
+        GenericRow { values }
+    }
+
+    /// Creates a GenericRow with `field_count` fields, every one of them null.
+    ///
+    /// Handy when a row needs a specific width but only some fields will be
+    /// filled in (e.g., KV delete operations, where only the primary key
+    /// fields are set).
+    ///
+    /// # Example
+    /// ```
+    /// use fluss::row::GenericRow;
+    ///
+    /// let mut row = GenericRow::new(3);
+    /// row.set_field(0, 42); // Only set the primary key
+    /// // Fields 1 and 2 remain null
+    /// ```
+    pub fn new(field_count: usize) -> GenericRow<'a> {
+        let values = vec![Datum::Null; field_count];
+        GenericRow { values }
+    }
+
+    /// Overwrites the field at `pos` with `value`.
+    ///
+    /// # Panics
+    /// Panics if `pos` is out of bounds (>= field count).
+    pub fn set_field(&mut self, pos: usize, value: impl Into<Datum<'a>>) {
+        let datum: Datum<'a> = value.into();
+        self.values[pos] = datum;
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn is_null_at_checks_datum_nullity() {
+        let mut row = GenericRow::new(2);
+        row.set_field(0, Datum::Null);
+        row.set_field(1, 42_i32);
+
+        let first_is_null = row.is_null_at(0).unwrap();
+        let second_is_null = row.is_null_at(1).unwrap();
+        assert!(first_is_null);
+        assert!(!second_is_null);
+    }
+
+    #[test]
+    fn is_null_at_out_of_bounds_returns_error() {
+        // One field only, so position 5 is past the end.
+        let row = GenericRow::from_data(vec![42_i32]);
+        let err = row.is_null_at(5).unwrap_err();
+        let rendered = err.to_string();
+        assert!(
+            rendered.contains("out of bounds"),
+            "Expected out of bounds error, got: {err}"
+        );
+    }
+
+    #[test]
+    fn new_initializes_nulls() {
+        let row = GenericRow::new(3);
+        assert_eq!(row.get_field_count(), 3);
+        for pos in 0..3 {
+            assert!(row.is_null_at(pos).unwrap());
+        }
+    }
+
+    #[test]
+    fn partial_row_for_delete() {
+        // Delete scenario: only the primary key (field 0) gets a value,
+        // the remaining fields stay null.
+        let mut row = GenericRow::new(3);
+        row.set_field(0, 123_i32);
+
+        assert_eq!(row.get_field_count(), 3);
+        assert_eq!(row.get_int(0).unwrap(), 123);
+        for pos in 1..3 {
+            assert!(row.is_null_at(pos).unwrap());
+        }
+    }
+
+    #[test]
+    fn type_mismatch_returns_error() {
+        // The field holds an Int64, so reading it as a string must fail.
+        let row = GenericRow::from_data(vec![Datum::Int64(999)]);
+        let err = row.get_string(0).unwrap_err();
+        let rendered = err.to_string();
+        assert!(
+            rendered.contains("type mismatch"),
+            "Expected type mismatch error, got: {err}"
+        );
+    }
+
+    #[test]
+    fn out_of_bounds_returns_error() {
+        let row = GenericRow::from_data(vec![42_i32]);
+        let err = row.get_int(5).unwrap_err();
+        let rendered = err.to_string();
+        assert!(
+            rendered.contains("out of bounds"),
+            "Expected out of bounds error, got: {err}"
+        );
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/row/projected_row.rs b/fluss-rust/crates/fluss/src/row/projected_row.rs
new file mode 100644
index 0000000000..f08778ccda
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/row/projected_row.rs
@@ -0,0 +1,284 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! View over an [`InternalRow`] that re-orders, drops, and null-pads
+//! fields according to a target→source index mapping.
+
+use crate::client::WriteFormat;
+use crate::error::Error::IllegalArgument;
+use crate::error::Result;
+use crate::metadata::UNEXIST_MAPPING;
+use crate::row::datum::{Date, Time, TimestampLtz, TimestampNtz};
+use crate::row::{Decimal, FlussArray, FlussMap, GenericRow, InternalRow};
+use std::sync::Arc;
+
+/// Row view that exposes the fields of a wrapped row `R` through a
+/// target→source index mapping.
+pub(crate) struct ProjectedRow<R> {
+    /// One entry per projected field: the index of the backing field in
+    /// `inner`, or `UNEXIST_MAPPING` when the field has no source.
+    index_mapping: Arc<[i32]>,
+    /// The wrapped row that field reads are forwarded to.
+    inner: R,
+}
+
+impl<R> ProjectedRow<R> {
+    /// Wraps `inner` so that its fields are read through `index_mapping`.
+    ///
+    /// Entry `i` of `index_mapping` is the index in `inner` that backs
+    /// projected position `i`; `UNEXIST_MAPPING` marks a position with no
+    /// backing field.
+    pub fn new(inner: R, index_mapping: Arc<[i32]>) -> Self {
+        ProjectedRow {
+            inner,
+            index_mapping,
+        }
+    }
+
+    /// Translates projected position `pos` into an index of the wrapped row.
+    ///
+    /// Returns `IllegalArgument` when `pos` is past the end of the mapping or
+    /// when the slot is `UNEXIST_MAPPING` (no source field to read from).
+    fn source_index(&self, pos: usize) -> Result<usize> {
+        let slot = match self.index_mapping.get(pos) {
+            Some(&slot) => slot,
+            None => {
+                return Err(IllegalArgument {
+                    message: format!(
+                        "position {pos} out of bounds (projected row has {} fields)",
+                        self.index_mapping.len()
+                    ),
+                });
+            }
+        };
+
+        if slot != UNEXIST_MAPPING {
+            return Ok(slot as usize);
+        }
+
+        // Typed getters cannot produce a value for a missing field; only
+        // `is_null_at` treats this slot as a legitimate (null) answer.
+        Err(IllegalArgument {
+            message: format!(
+                "field at position {pos} does not exist in the source row \
+                     (caller should check is_null_at first)"
+            ),
+        })
+    }
+}
+
+// Forwards an accessor call to the wrapped row: `$pos` is translated with
+// `source_index` (its error is propagated via `?`), and any extra arguments
+// are passed through unchanged after the translated index.
+macro_rules! project {
+    ($self:ident, $method:ident, $pos:expr $(, $arg:expr)*) => {
+        $self.inner.$method($self.source_index($pos)?, $($arg),*)
+    };
+}
+
+impl<R: InternalRow> InternalRow for ProjectedRow<R> {
+    /// The projected row has exactly one field per mapping entry.
+    fn get_field_count(&self) -> usize {
+        self.index_mapping.len()
+    }
+
+    /// Reports whether the projected field at `pos` is null.
+    ///
+    /// A slot mapped to `UNEXIST_MAPPING` is null by definition; any other
+    /// slot defers to the wrapped row. A `pos` outside the mapping is an
+    /// `IllegalArgument` error.
+    fn is_null_at(&self, pos: usize) -> Result<bool> {
+        match self.index_mapping.get(pos) {
+            None => Err(IllegalArgument {
+                message: format!(
+                    "position {pos} out of bounds (projected row has {} fields)",
+                    self.index_mapping.len()
+                ),
+            }),
+            Some(&slot) if slot == UNEXIST_MAPPING => Ok(true),
+            Some(&slot) => self.inner.is_null_at(slot as usize),
+        }
+    }
+
+    // Every typed getter below translates `pos` through `source_index` and
+    // then calls the accessor of the same name on the wrapped row.
+
+    fn get_boolean(&self, pos: usize) -> Result<bool> {
+        project!(self, get_boolean, pos)
+    }
+
+    fn get_byte(&self, pos: usize) -> Result<i8> {
+        project!(self, get_byte, pos)
+    }
+
+    fn get_short(&self, pos: usize) -> Result<i16> {
+        project!(self, get_short, pos)
+    }
+
+    fn get_int(&self, pos: usize) -> Result<i32> {
+        project!(self, get_int, pos)
+    }
+
+    fn get_long(&self, pos: usize) -> Result<i64> {
+        project!(self, get_long, pos)
+    }
+
+    fn get_float(&self, pos: usize) -> Result<f32> {
+        project!(self, get_float, pos)
+    }
+
+    fn get_double(&self, pos: usize) -> Result<f64> {
+        project!(self, get_double, pos)
+    }
+
+    fn get_char(&self, pos: usize, length: usize) -> Result<&str> {
+        project!(self, get_char, pos, length)
+    }
+
+    fn get_string(&self, pos: usize) -> Result<&str> {
+        project!(self, get_string, pos)
+    }
+
+    fn get_decimal(&self, pos: usize, precision: usize, scale: usize) -> Result<Decimal> {
+        project!(self, get_decimal, pos, precision, scale)
+    }
+
+    fn get_date(&self, pos: usize) -> Result<Date> {
+        project!(self, get_date, pos)
+    }
+
+    fn get_time(&self, pos: usize) -> Result<Time> {
+        project!(self, get_time, pos)
+    }
+
+    fn get_timestamp_ntz(&self, pos: usize, precision: u32) -> Result<TimestampNtz> {
+        project!(self, get_timestamp_ntz, pos, precision)
+    }
+
+    fn get_timestamp_ltz(&self, pos: usize, precision: u32) -> Result<TimestampLtz> {
+        project!(self, get_timestamp_ltz, pos, precision)
+    }
+
+    fn get_binary(&self, pos: usize, length: usize) -> Result<&[u8]> {
+        project!(self, get_binary, pos, length)
+    }
+
+    fn get_bytes(&self, pos: usize) -> Result<&[u8]> {
+        project!(self, get_bytes, pos)
+    }
+
+    fn get_array(&self, pos: usize) -> Result<FlussArray> {
+        project!(self, get_array, pos)
+    }
+
+    fn get_map(&self, pos: usize) -> Result<FlussMap> {
+        project!(self, get_map, pos)
+    }
+
+    fn get_row(&self, pos: usize) -> Result<&GenericRow<'_>> {
+        project!(self, get_row, pos)
+    }
+
+    /// Always `None`: the projection changes the field layout, so the
+    /// wrapped row's encoded bytes no longer describe this row.
+    fn as_encoded_bytes(&self, _write_format: WriteFormat) -> Option<&[u8]> {
+        None
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::row::{Datum, GenericRow};
+
+    /// Builds a shareable index mapping from a slice of slots.
+    fn mapping(slots: &[i32]) -> Arc<[i32]> {
+        Arc::from(slots)
+    }
+
+    /// Builds a `GenericRow` directly from its datum values.
+    fn row_of<'a>(values: Vec<Datum<'a>>) -> GenericRow<'a> {
+        GenericRow { values }
+    }
+
+    #[test]
+    fn projects_and_reorders_longs() {
+        // Source field i holds the value i, so each projected value equals
+        // the source index it was mapped from.
+        let source = row_of((0..5).map(Datum::Int64).collect());
+        let projected = ProjectedRow::new(source, mapping(&[2, 0, 1, 4]));
+
+        assert_eq!(projected.get_field_count(), 4);
+        for (pos, expected) in [2_i64, 0, 1, 4].iter().enumerate() {
+            assert_eq!(projected.get_long(pos).unwrap(), *expected);
+        }
+    }
+
+    #[test]
+    fn projects_strings_and_doubles() {
+        let shared = mapping(&[2, 0, 1, 4]);
+
+        let text_row = row_of(
+            ["0", "1", "2", "3", "4"]
+                .iter()
+                .map(|text| Datum::String((*text).into()))
+                .collect(),
+        );
+        let projected_text = ProjectedRow::new(text_row, Arc::clone(&shared));
+        assert_eq!(projected_text.get_string(0).unwrap(), "2");
+        assert_eq!(projected_text.get_string(1).unwrap(), "0");
+        assert_eq!(projected_text.get_string(3).unwrap(), "4");
+
+        let double_row = row_of(
+            [0.5_f64, 0.6, 0.7, 0.8, 0.9, 1.0]
+                .iter()
+                .map(|value| Datum::Float64((*value).into()))
+                .collect(),
+        );
+        let projected_doubles = ProjectedRow::new(double_row, Arc::clone(&shared));
+        assert_eq!(projected_doubles.get_double(0).unwrap(), 0.7);
+        assert_eq!(projected_doubles.get_double(1).unwrap(), 0.5);
+        assert_eq!(projected_doubles.get_double(3).unwrap(), 0.9);
+    }
+
+    #[test]
+    fn null_handling_passes_through_inner_nulls() {
+        // Source indexes 2 and 4 are null; the mapping places them at
+        // projected positions 0 and 3.
+        let source = row_of(vec![
+            Datum::Int64(5),
+            Datum::Int64(6),
+            Datum::Null,
+            Datum::Int64(8),
+            Datum::Null,
+            Datum::Int64(10),
+        ]);
+        let projected = ProjectedRow::new(source, mapping(&[2, 0, 1, 4]));
+
+        for (pos, expected_null) in [true, false, false, true].iter().enumerate() {
+            assert_eq!(projected.is_null_at(pos).unwrap(), *expected_null);
+        }
+    }
+
+    #[test]
+    fn unexist_mapping_reports_null_and_errors_on_get() {
+        let projected = ProjectedRow::new(
+            row_of(vec![Datum::Int64(10), Datum::Int64(20), Datum::Int64(30)]),
+            mapping(&[0, 1, UNEXIST_MAPPING, 2]),
+        );
+
+        assert_eq!(projected.get_field_count(), 4);
+        // Positions backed by a real source field read through normally.
+        for (pos, expected) in [(0_usize, 10_i64), (1, 20), (3, 30)] {
+            assert_eq!(projected.get_long(pos).unwrap(), expected);
+        }
+
+        // The unmapped position is null, and a typed read of it is an error.
+        assert!(projected.is_null_at(2).unwrap());
+        let missing = projected.get_long(2).unwrap_err();
+        assert!(
+            missing.to_string().contains("does not exist"),
+            "got: {missing}"
+        );
+    }
+
+    #[test]
+    fn out_of_bounds_position_returns_error() {
+        let projected = ProjectedRow::new(
+            row_of(vec![Datum::Int64(1), Datum::Int64(2)]),
+            mapping(&[0, 1]),
+        );
+
+        let null_check = projected.is_null_at(5).unwrap_err();
+        assert!(
+            null_check.to_string().contains("out of bounds"),
+            "got: {null_check}"
+        );
+
+        let typed_read = projected.get_long(5).unwrap_err();
+        assert!(
+            typed_read.to_string().contains("out of bounds"),
+            "got: {typed_read}"
+        );
+    }
+
+    #[test]
+    fn shared_mapping_can_back_many_rows() {
+        // One swapped mapping is shared by two independent rows.
+        let swapped = mapping(&[1, 0]);
+        let first = ProjectedRow::new(
+            row_of(vec![Datum::Int64(10), Datum::Int64(20)]),
+            Arc::clone(&swapped),
+        );
+        let second = ProjectedRow::new(
+            row_of(vec![Datum::Int64(30), Datum::Int64(40)]),
+            Arc::clone(&swapped),
+        );
+
+        assert_eq!(first.get_long(0).unwrap(), 20);
+        assert_eq!(first.get_long(1).unwrap(), 10);
+        assert_eq!(second.get_long(0).unwrap(), 40);
+        assert_eq!(second.get_long(1).unwrap(), 30);
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/row/row_decoder.rs b/fluss-rust/crates/fluss/src/row/row_decoder.rs
new file mode 100644
index 0000000000..aea8c86e94
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/row/row_decoder.rs
@@ -0,0 +1,137 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Row decoder for deserializing binary row formats.
+//!
+//! Mirrors the Java org.apache.fluss.row.decode package.
+
+use crate::error::{Error, Result};
+use crate::metadata::{KvFormat, RowType};
+use crate::row::compacted::{CompactedRow, CompactedRowDeserializer};
+use std::sync::Arc;
+
+/// Decoder that turns a byte slice into a row view.
+///
+/// This trait provides an abstraction for decoding different row formats
+/// (COMPACTED, INDEXED, etc.) from binary data. Note that `decode` is
+/// currently typed to return a [`CompactedRow`] for every implementation.
+///
+/// Implementations must be `Send + Sync`.
+///
+/// Reference: org.apache.fluss.row.decode.RowDecoder
+pub trait RowDecoder: Send + Sync {
+    /// Decode bytes into a CompactedRow.
+    ///
+    /// The lifetime 'a ties the returned row to the input data, ensuring
+    /// the data remains valid as long as the row is used.
+    fn decode<'a>(&self, data: &'a [u8]) -> CompactedRow<'a>;
+}
+
+/// Decoder for CompactedRow format.
+///
+/// Uses the existing CompactedRow infrastructure for decoding.
+/// This is a thin wrapper that implements the RowDecoder trait.
+///
+/// Reference: org.apache.fluss.row.decode.CompactedRowDecoder
+pub struct CompactedRowDecoder {
+    /// Number of fields in the row type, captured at construction.
+    field_count: usize,
+    /// Deserializer built from the row type; a clone of this `Arc` is
+    /// passed to `CompactedRow::deserialize` on every `decode` call.
+    deserializer: Arc<CompactedRowDeserializer<'static>>,
+}
+
+impl CompactedRowDecoder {
+    /// Builds a decoder for rows of the given `row_type`.
+    ///
+    /// The field count is read before `row_type` is moved into the
+    /// deserializer.
+    pub fn new(row_type: RowType) -> Self {
+        Self {
+            field_count: row_type.fields().len(),
+            deserializer: Arc::new(CompactedRowDeserializer::new_from_owned(row_type)),
+        }
+    }
+}
+
+impl RowDecoder for CompactedRowDecoder {
+    /// Wraps `data` in a `CompactedRow` via `CompactedRow::deserialize`,
+    /// handing it a clone of the shared deserializer handle.
+    fn decode<'a>(&self, data: &'a [u8]) -> CompactedRow<'a> {
+        let deserializer = Arc::clone(&self.deserializer);
+        CompactedRow::deserialize(deserializer, self.field_count, data)
+    }
+}
+
+/// Factory that picks a [`RowDecoder`] implementation for a [`KvFormat`].
+///
+/// Reference: org.apache.fluss.row.decode.RowDecoder.create()
+pub struct RowDecoderFactory;
+
+impl RowDecoderFactory {
+    /// Returns a decoder for rows of `row_type` stored in `kv_format`.
+    ///
+    /// Only `KvFormat::COMPACTED` is implemented; `KvFormat::INDEXED`
+    /// yields `Error::UnsupportedOperation`.
+    pub fn create(kv_format: KvFormat, row_type: RowType) -> Result<Arc<dyn RowDecoder>> {
+        match kv_format {
+            KvFormat::INDEXED => {
+                let message = "INDEXED format is not yet supported".to_string();
+                Err(Error::UnsupportedOperation { message })
+            }
+            KvFormat::COMPACTED => {
+                let decoder: Arc<dyn RowDecoder> = Arc::new(CompactedRowDecoder::new(row_type));
+                Ok(decoder)
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::metadata::DataTypes;
+    use crate::row::InternalRow;
+    use crate::row::binary::BinaryWriter;
+    use crate::row::compacted::CompactedRowWriter;
+
+    #[test]
+    fn test_compacted_row_decoder() {
+        // Decoder for an (int, string) row type.
+        let schema = RowType::with_data_types(vec![DataTypes::int(), DataTypes::string()]);
+        let decoder = CompactedRowDecoder::new(schema);
+
+        // Encode one row with the compacted writer.
+        let mut encoder = CompactedRowWriter::new(2);
+        encoder.write_int(42);
+        encoder.write_string("hello");
+        let encoded = encoder.to_bytes();
+
+        // Decoding must give back the same field count and values.
+        let decoded = decoder.decode(&encoded);
+        assert_eq!(decoded.get_field_count(), 2);
+        assert_eq!(decoded.get_int(0).unwrap(), 42);
+        assert_eq!(decoded.get_string(1).unwrap(), "hello");
+    }
+
+    #[test]
+    fn test_row_decoder_factory() {
+        // Obtain the decoder through the factory instead of constructing it.
+        let schema = RowType::with_data_types(vec![DataTypes::int(), DataTypes::string()]);
+        let decoder = RowDecoderFactory::create(KvFormat::COMPACTED, schema).unwrap();
+
+        let mut encoder = CompactedRowWriter::new(2);
+        encoder.write_int(100);
+        encoder.write_string("world");
+        let encoded = encoder.to_bytes();
+
+        let decoded = decoder.decode(&encoded);
+        assert_eq!(decoded.get_int(0).unwrap(), 100);
+        assert_eq!(decoded.get_string(1).unwrap(), "world");
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/rpc/api_key.rs b/fluss-rust/crates/fluss/src/rpc/api_key.rs
new file mode 100644
index 0000000000..977b69d1d6
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/rpc/api_key.rs
@@ -0,0 +1,211 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::rpc::api_key::ApiKey::Unknown;
+use crate::rpc::api_version::{ApiVersion, ApiVersionRange};
+
+/// Identifier of an RPC API.
+///
+/// Each named variant corresponds to the numeric key shown in the trailing
+/// comment; the `From<i16>` / `From<ApiKey> for i16` impls in this file
+/// perform the actual conversion. Any other numeric key is carried
+/// verbatim in [`ApiKey::Unknown`].
+///
+/// `PartialOrd`/`Ord` are derived, so ordering follows declaration order;
+/// keep that in mind before reordering variants.
+#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Clone, Copy)]
+pub enum ApiKey {
+    ApiVersion,                 // 1000
+    CreateDatabase,             // 1001
+    DropDatabase,               // 1002
+    ListDatabases,              // 1003
+    DatabaseExists,             // 1004
+    CreateTable,                // 1005
+    DropTable,                  // 1006
+    GetTable,                   // 1007
+    ListTables,                 // 1008
+    ListPartitionInfos,         // 1009
+    TableExists,                // 1010
+    GetTableSchema,             // 1011
+    MetaData,                   // 1012
+    ProduceLog,                 // 1014
+    FetchLog,                   // 1015
+    PutKv,                      // 1016
+    Lookup,                     // 1017
+    ListOffsets,                // 1021
+    GetFileSystemSecurityToken, // 1025
+    InitWriter,                 // 1026
+    GetLatestLakeSnapshot,      // 1032
+    LimitScan,                  // 1033
+    PrefixLookup,               // 1034
+    GetDatabaseInfo,            // 1035
+    CreatePartition,            // 1036
+    DropPartition,              // 1037
+    Authenticate,               // 1038
+    /// A numeric key with no named variant; holds the raw value.
+    Unknown(i16),
+}
+
+impl ApiKey {
+    /// Reports which protocol versions this client can speak for the key.
+    ///
+    /// Every named key currently maps to the single version `0`; an
+    /// [`ApiKey::Unknown`] key yields `None`. The variants are listed
+    /// explicitly (no wildcard arm) so that adding a variant forces a
+    /// decision here.
+    pub fn supported_versions(&self) -> Option<ApiVersionRange> {
+        match *self {
+            Unknown(_) => None,
+            // Most APIs only support v0.
+            Self::ApiVersion
+            | Self::CreateDatabase
+            | Self::DropDatabase
+            | Self::ListDatabases
+            | Self::DatabaseExists
+            | Self::CreateTable
+            | Self::DropTable
+            | Self::GetTable
+            | Self::ListTables
+            | Self::ListPartitionInfos
+            | Self::TableExists
+            | Self::GetTableSchema
+            | Self::MetaData
+            | Self::ProduceLog
+            | Self::FetchLog
+            | Self::ListOffsets
+            | Self::GetFileSystemSecurityToken
+            | Self::InitWriter
+            | Self::GetLatestLakeSnapshot
+            | Self::LimitScan
+            | Self::GetDatabaseInfo
+            | Self::CreatePartition
+            | Self::DropPartition
+            | Self::Authenticate
+            // TODO(key-encoding-v1): The Java server supports v0..v1 for these
+            // APIs, but the Rust client has not yet implemented the v1 key
+            // encoding format. Pinned to v0 until that is done.
+            | Self::PutKv
+            | Self::Lookup
+            | Self::PrefixLookup => Some(ApiVersionRange::new(ApiVersion(0), ApiVersion(0))),
+        }
+    }
+}
+
+impl From<i16> for ApiKey {
+    /// Maps a numeric API key to its named variant; any number without a
+    /// named variant is preserved in `Unknown`.
+    fn from(key: i16) -> Self {
+        match key {
+            1000 => Self::ApiVersion,
+            1001 => Self::CreateDatabase,
+            1002 => Self::DropDatabase,
+            1003 => Self::ListDatabases,
+            1004 => Self::DatabaseExists,
+            1005 => Self::CreateTable,
+            1006 => Self::DropTable,
+            1007 => Self::GetTable,
+            1008 => Self::ListTables,
+            1009 => Self::ListPartitionInfos,
+            1010 => Self::TableExists,
+            1011 => Self::GetTableSchema,
+            1012 => Self::MetaData,
+            1014 => Self::ProduceLog,
+            1015 => Self::FetchLog,
+            1016 => Self::PutKv,
+            1017 => Self::Lookup,
+            1021 => Self::ListOffsets,
+            1025 => Self::GetFileSystemSecurityToken,
+            1026 => Self::InitWriter,
+            1032 => Self::GetLatestLakeSnapshot,
+            1033 => Self::LimitScan,
+            1034 => Self::PrefixLookup,
+            1035 => Self::GetDatabaseInfo,
+            1036 => Self::CreatePartition,
+            1037 => Self::DropPartition,
+            1038 => Self::Authenticate,
+            unrecognized => Unknown(unrecognized),
+        }
+    }
+}
+
+impl From<ApiKey> for i16 {
+    /// Maps an `ApiKey` back to its numeric key; `Unknown` returns the raw
+    /// value it carries.
+    fn from(api_key: ApiKey) -> Self {
+        match api_key {
+            // Values that were never given a named variant pass through.
+            Unknown(raw) => raw,
+
+            // 1000..=1012
+            ApiKey::ApiVersion => 1000,
+            ApiKey::CreateDatabase => 1001,
+            ApiKey::DropDatabase => 1002,
+            ApiKey::ListDatabases => 1003,
+            ApiKey::DatabaseExists => 1004,
+            ApiKey::CreateTable => 1005,
+            ApiKey::DropTable => 1006,
+            ApiKey::GetTable => 1007,
+            ApiKey::ListTables => 1008,
+            ApiKey::ListPartitionInfos => 1009,
+            ApiKey::TableExists => 1010,
+            ApiKey::GetTableSchema => 1011,
+            ApiKey::MetaData => 1012,
+
+            // 1014..=1026 (with gaps)
+            ApiKey::ProduceLog => 1014,
+            ApiKey::FetchLog => 1015,
+            ApiKey::PutKv => 1016,
+            ApiKey::Lookup => 1017,
+            ApiKey::ListOffsets => 1021,
+            ApiKey::GetFileSystemSecurityToken => 1025,
+            ApiKey::InitWriter => 1026,
+
+            // 1032..=1038
+            ApiKey::GetLatestLakeSnapshot => 1032,
+            ApiKey::LimitScan => 1033,
+            ApiKey::PrefixLookup => 1034,
+            ApiKey::GetDatabaseInfo => 1035,
+            ApiKey::CreatePartition => 1036,
+            ApiKey::DropPartition => 1037,
+            ApiKey::Authenticate => 1038,
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn api_key_round_trip() {
+        // (numeric key, named variant) pairs checked in both directions.
+        let known = [
+            (1000, ApiKey::ApiVersion),
+            (1001, ApiKey::CreateDatabase),
+            (1002, ApiKey::DropDatabase),
+            (1003, ApiKey::ListDatabases),
+            (1004, ApiKey::DatabaseExists),
+            (1005, ApiKey::CreateTable),
+            (1006, ApiKey::DropTable),
+            (1007, ApiKey::GetTable),
+            (1008, ApiKey::ListTables),
+            (1009, ApiKey::ListPartitionInfos),
+            (1010, ApiKey::TableExists),
+            (1011, ApiKey::GetTableSchema),
+            (1012, ApiKey::MetaData),
+            (1014, ApiKey::ProduceLog),
+            (1015, ApiKey::FetchLog),
+            (1016, ApiKey::PutKv),
+            (1017, ApiKey::Lookup),
+            (1021, ApiKey::ListOffsets),
+            (1025, ApiKey::GetFileSystemSecurityToken),
+            (1026, ApiKey::InitWriter),
+            (1032, ApiKey::GetLatestLakeSnapshot),
+            (1033, ApiKey::LimitScan),
+            (1034, ApiKey::PrefixLookup),
+            (1035, ApiKey::GetDatabaseInfo),
+            (1036, ApiKey::CreatePartition),
+            (1037, ApiKey::DropPartition),
+            (1038, ApiKey::Authenticate),
+        ];
+
+        for (code, variant) in known {
+            assert_eq!(ApiKey::from(code), variant);
+            assert_eq!(i16::from(variant), code);
+        }
+
+        // A key with no named variant survives the round trip unchanged.
+        let unrecognized = ApiKey::from(9999);
+        assert_eq!(unrecognized, ApiKey::Unknown(9999));
+        assert_eq!(i16::from(unrecognized), 9999);
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/rpc/api_version.rs b/fluss-rust/crates/fluss/src/rpc/api_version.rs
new file mode 100644
index 0000000000..f009d6914f
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/rpc/api_version.rs
@@ -0,0 +1,79 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+/// Version number of an RPC API, stored as a raw `i16`.
+///
+/// Displays as the bare number.
+#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Clone, Copy)]
+pub struct ApiVersion(pub i16);
+
+/// A pair of API version bounds with `min <= max` (enforced by
+/// `ApiVersionRange::new`).
+///
+/// Displays as `min:max`.
+// NOTE(review): presumably inclusive on both ends — confirm with callers.
+#[derive(Debug, PartialEq, Eq, Clone, Copy)]
+pub struct ApiVersionRange {
+    // Lower bound of the range.
+    min: ApiVersion,
+    // Upper bound of the range.
+    max: ApiVersion,
+}
+
+impl std::fmt::Display for ApiVersion {
+    /// Writes the wrapped number with default formatting.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let Self(raw) = self;
+        write!(f, "{raw}")
+    }
+}
+
+#[allow(dead_code)]
+impl ApiVersionRange {
+    /// Creates a range from its two bounds.
+    ///
+    /// # Panics
+    ///
+    /// Panics when `min` is greater than `max`.
+    pub const fn new(min: ApiVersion, max: ApiVersion) -> Self {
+        // Reject inverted bounds up front so a range never holds min > max.
+        assert!(min.0 <= max.0);
+
+        Self { min, max }
+    }
+
+    /// Returns the lower bound.
+    pub fn min(&self) -> ApiVersion {
+        self.min
+    }
+
+    /// Returns the upper bound.
+    pub fn max(&self) -> ApiVersion {
+        self.max
+    }
+}
+
+impl std::fmt::Display for ApiVersionRange {
+    /// Writes the range as `min:max`.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let Self { min, max } = self;
+        write!(f, "{min}:{max}")
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn api_version_display() {
+        // A version renders as its bare number.
+        assert_eq!(ApiVersion(3).to_string(), "3");
+    }
+
+    #[test]
+    fn api_version_range_accessors() {
+        let (low, high) = (ApiVersion(1), ApiVersion(4));
+        let range = ApiVersionRange::new(low, high);
+
+        assert_eq!(range.min(), low);
+        assert_eq!(range.max(), high);
+        assert_eq!(range.to_string(), "1:4");
+    }
+
+    #[test]
+    #[should_panic]
+    fn api_version_range_panics_on_invalid_bounds() {
+        // min > max must trip the assertion in `new`.
+        let _ = ApiVersionRange::new(ApiVersion(4), ApiVersion(1));
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/rpc/convert.rs b/fluss-rust/crates/fluss/src/rpc/convert.rs
new file mode 100644
index 0000000000..441645c2e6
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/rpc/convert.rs
@@ -0,0 +1,93 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::cluster::{ServerNode, ServerType};
+use crate::metadata::TablePath;
+use crate::proto::{PbServerNode, PbTablePath};
+
+/// Converts a [`TablePath`] into its protobuf form by copying the database
+/// and table names.
+pub fn to_table_path(table_path: &TablePath) -> PbTablePath {
+    let database_name = table_path.database().to_string();
+    let table_name = table_path.table().to_string();
+    PbTablePath {
+        database_name,
+        table_name,
+    }
+}
+
+/// Builds a [`ServerNode`] of the given `server_type` from a protobuf node.
+///
+/// Only the id, host and port are used; the port is cast to `u32` with `as`.
+pub fn from_pb_server_node(pb_server_node: PbServerNode, server_type: ServerType) -> ServerNode {
+    let PbServerNode {
+        node_id,
+        host,
+        port,
+        ..
+    } = pb_server_node;
+    ServerNode::new(node_id, host, port as u32, server_type)
+}
+
+/// Converts a protobuf table path into a [`TablePath`] by copying the
+/// database and table names.
+pub fn from_pb_table_path(pb_table_path: &PbTablePath) -> TablePath {
+    let database = pb_table_path.database_name.clone();
+    let table = pb_table_path.table_name.clone();
+    TablePath::new(database, table)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::proto::{PbServerNode, PbTablePath};
+
+    #[test]
+    fn table_path_round_trip() {
+        // TablePath -> protobuf -> TablePath must give back the original.
+        let original = TablePath::new("db".to_string(), "table".to_string());
+        let encoded = to_table_path(&original);
+        assert_eq!(encoded.database_name, "db");
+        assert_eq!(encoded.table_name, "table");
+        assert_eq!(from_pb_table_path(&encoded), original);
+
+        // A hand-built protobuf value converts as well.
+        let hand_built = PbTablePath {
+            database_name: "db2".to_string(),
+            table_name: "table2".to_string(),
+        };
+        let decoded = from_pb_table_path(&hand_built);
+        assert_eq!(decoded.database(), "db2");
+        assert_eq!(decoded.table(), "table2");
+    }
+
+    #[test]
+    fn server_node_from_pb() {
+        let tablet_pb = PbServerNode {
+            node_id: 7,
+            host: "127.0.0.1".to_string(),
+            port: 9092,
+            listeners: None,
+            rack: None,
+        };
+        let tablet = from_pb_server_node(tablet_pb, ServerType::TabletServer);
+        assert_eq!(tablet.id(), 7);
+        assert_eq!(tablet.url(), "127.0.0.1:9092");
+        assert_eq!(tablet.uid(), "ts-7");
+
+        let coordinator_pb = PbServerNode {
+            node_id: 3,
+            host: "localhost".to_string(),
+            port: 8123,
+            listeners: None,
+            rack: None,
+        };
+        let coordinator = from_pb_server_node(coordinator_pb, ServerType::CoordinatorServer);
+        assert_eq!(coordinator.uid(), "cs-3");
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/rpc/error.rs b/fluss-rust/crates/fluss/src/rpc/error.rs
new file mode 100644
index 0000000000..da3a11e295
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/rpc/error.rs
@@ -0,0 +1,54 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::rpc::api_key::ApiKey;
+use crate::rpc::api_version::ApiVersion;
+use prost::DecodeError;
+use std::sync::Arc;
+use thiserror::Error;
+
+/// Errors produced by the RPC layer.
+///
+/// The enum is `#[non_exhaustive]`, so matches outside this crate need a
+/// wildcard arm.
+#[derive(Error, Debug)]
+#[non_exhaustive]
+pub enum RpcError {
+    /// Writing a message frame failed.
+    #[error("Cannot write message: {0}")]
+    WriteMessageError(#[from] crate::rpc::frame::WriteError),
+
+    /// Reading a message frame failed.
+    #[error("Cannot read framed message: {0}")]
+    ReadMessageError(#[from] crate::rpc::frame::ReadError),
+
+    /// A protobuf payload could not be decoded.
+    #[error("Rpc Decode Error: {0}")]
+    RpcDecodeError(#[from] DecodeError),
+
+    /// A connection-level failure; the string carries the detail.
+    // The detail is interpolated so it is not lost when the error is shown
+    // with `Display` (previously only `Debug` exposed it).
+    #[error("connection error: {0}")]
+    ConnectionError(String),
+
+    /// An underlying I/O error.
+    #[error("IO Error: {0}")]
+    IO(#[from] std::io::Error),
+
+    /// The connection is poisoned; carries a shared `RpcError` that is
+    /// rendered after the prefix.
+    #[error("Connection is poisoned: {0}")]
+    Poisoned(Arc<RpcError>),
+
+    /// Fewer bytes were read (`read`) than the message contained
+    /// (`message_size`) for the given API key and version.
+    #[error(
+        "Data left at the end of the message. Got {message_size} bytes but only read {read} bytes. api_key={api_key:?} api_version={api_version}"
+    )]
+    TooMuchData {
+        message_size: u64,
+        read: u64,
+        api_key: ApiKey,
+        api_version: ApiVersion,
+    },
+}
diff --git a/fluss-rust/crates/fluss/src/rpc/fluss_api_error.rs b/fluss-rust/crates/fluss/src/rpc/fluss_api_error.rs
new file mode 100644
index 0000000000..418f5443dc
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/rpc/fluss_api_error.rs
@@ -0,0 +1,510 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::proto::ErrorResponse;
+use std::fmt::{Debug, Display, Formatter};
+
+/// API error response from Fluss server
+pub struct ApiError {
+    pub code: i32,
+    pub message: String,
+}
+
+impl Debug for ApiError {
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("ApiError")
+            .field("code", &self.code)
+            .field("message", &self.message)
+            .finish()
+    }
+}
+
+impl Display for ApiError {
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        Debug::fmt(self, f)
+    }
+}
+
+impl ApiError {
+    /// Whether a retry may succeed; the answer comes from the [`FlussError`] that `code` maps to.
+    pub fn is_retriable(&self) -> bool {
+        FlussError::is_retriable(&FlussError::for_code(self.code))
+    }
+}
+
+/// Fluss protocol errors. These errors are part of the client-server protocol.
+/// The error codes cannot be changed, but the names can be.
+///
+/// Do not add exceptions that occur only on the client or only on the server here.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+#[repr(i32)]
+pub enum FlussError {
+    /// The server experienced an unexpected error when processing the request.
+    UnknownServerError = -1,
+    /// No error occurred.
+    None = 0,
+    /// The server disconnected before a response was received.
+    NetworkException = 1,
+    /// The version of the API is not supported.
+    UnsupportedVersion = 2,
+    /// This message has failed its CRC checksum, exceeds the valid size, has a null key for a primary key table, or is otherwise corrupt.
+    CorruptMessage = 3,
+    /// The database does not exist.
+    DatabaseNotExist = 4,
+    /// The database is not empty.
+    DatabaseNotEmpty = 5,
+    /// The database already exists.
+    DatabaseAlreadyExist = 6,
+    /// The table does not exist.
+    TableNotExist = 7,
+    /// The table already exists.
+    TableAlreadyExist = 8,
+    /// The schema does not exist.
+    SchemaNotExist = 9,
+    /// An exception occurred while storing log data on the server.
+    LogStorageException = 10,
+    /// An exception occurred while storing kv data on the server.
+    KvStorageException = 11,
+    /// Not leader or follower.
+    NotLeaderOrFollower = 12,
+    /// The record is too large.
+    RecordTooLargeException = 13,
+    /// The record is corrupt.
+    CorruptRecordException = 14,
+    /// The client has attempted to perform an operation on an invalid table.
+    InvalidTableException = 15,
+    /// The client has attempted to perform an operation on an invalid database.
+    InvalidDatabaseException = 16,
+    /// The replication factor is larger than the number of available tablet servers.
+    InvalidReplicationFactor = 17,
+    /// Produce request specified an invalid value for required acks.
+    InvalidRequiredAcks = 18,
+    /// The log offset is out of range.
+    LogOffsetOutOfRangeException = 19,
+    /// The table is not a primary key table.
+    NonPrimaryKeyTableException = 20,
+    /// The table or bucket does not exist.
+    UnknownTableOrBucketException = 21,
+    /// The update version is invalid.
+    InvalidUpdateVersionException = 22,
+    /// The coordinator is invalid.
+    InvalidCoordinatorException = 23,
+    /// The leader epoch is invalid.
+    FencedLeaderEpochException = 24,
+    /// The request timed out.
+    RequestTimeOut = 25,
+    /// The general storage exception.
+    StorageException = 26,
+    /// The server did not attempt to execute this operation.
+    OperationNotAttemptedException = 27,
+    /// Records are written to the server already, but to fewer in-sync replicas than required.
+    NotEnoughReplicasAfterAppendException = 28,
+    /// Messages are rejected since there are fewer in-sync replicas than required.
+    NotEnoughReplicasException = 29,
+    /// An exception occurred while getting the file access security token.
+    SecurityTokenException = 30,
+    /// The tablet server received an out of order sequence batch.
+    OutOfOrderSequenceException = 31,
+    /// The tablet server received a duplicate sequence batch.
+    DuplicateSequenceException = 32,
+    /// This exception is raised by the tablet server if it could not locate the writer metadata.
+    UnknownWriterIdException = 33,
+    /// The requested column projection is invalid.
+    InvalidColumnProjection = 34,
+    /// The requested target column to write is invalid.
+    InvalidTargetColumn = 35,
+    /// The partition does not exist.
+    PartitionNotExists = 36,
+    /// The table is not partitioned.
+    TableNotPartitionedException = 37,
+    /// The timestamp is invalid.
+    InvalidTimestampException = 38,
+    /// The config is invalid.
+    InvalidConfigException = 39,
+    /// The lake storage is not configured.
+    LakeStorageNotConfiguredException = 40,
+    /// The kv snapshot does not exist.
+    KvSnapshotNotExist = 41,
+    /// The partition already exists.
+    PartitionAlreadyExists = 42,
+    /// The partition spec is invalid.
+    PartitionSpecInvalidException = 43,
+    /// There is no currently available leader for the given partition.
+    LeaderNotAvailableException = 44,
+    /// The maximum number of partitions is exceeded.
+    PartitionMaxNumException = 45,
+    /// Authentication failed.
+    AuthenticateException = 46,
+    /// Security is disabled.
+    SecurityDisabledException = 47,
+    /// Authorization failed.
+    AuthorizationException = 48,
+    /// The maximum number of buckets is exceeded.
+    BucketMaxNumException = 49,
+    /// The tiering epoch is invalid.
+    FencedTieringEpochException = 50,
+    /// Authentication failed with retriable exception.
+    RetriableAuthenticateException = 51,
+    /// The server rack info is invalid.
+    InvalidServerRackInfoException = 52,
+    /// The lake snapshot does not exist.
+    LakeSnapshotNotExist = 53,
+    /// The lake table already exists.
+    LakeTableAlreadyExist = 54,
+    /// The new ISR contains at least one ineligible replica.
+    IneligibleReplicaException = 55,
+    /// The alter table is invalid.
+    InvalidAlterTableException = 56,
+    /// Deletion operations are disabled on this table.
+    DeletionDisabledException = 57,
+}
+
+impl FlussError {
+    /// Returns the numeric wire code, i.e. this variant's `i32` discriminant.
+    pub fn code(&self) -> i32 {
+        *self as i32
+    }
+
+    /// Returns `true` only for the variants listed below; every other error is treated as permanent.
+    pub fn is_retriable(&self) -> bool {
+        matches!(
+            self,
+            FlussError::NetworkException
+                | FlussError::CorruptMessage
+                | FlussError::SchemaNotExist
+                | FlussError::LogStorageException
+                | FlussError::KvStorageException
+                | FlussError::NotLeaderOrFollower
+                | FlussError::CorruptRecordException
+                | FlussError::UnknownTableOrBucketException
+                | FlussError::RequestTimeOut
+                | FlussError::StorageException
+                | FlussError::NotEnoughReplicasAfterAppendException
+                | FlussError::NotEnoughReplicasException
+                | FlussError::LeaderNotAvailableException
+        )
+    }
+
+    /// Returns the static default description for this error.
+    pub fn message(&self) -> &'static str {
+        match self {
+            FlussError::UnknownServerError => {
+                "The server experienced an unexpected error when processing the request."
+            }
+            FlussError::None => "No error",
+            FlussError::NetworkException => {
+                "The server disconnected before a response was received."
+            }
+            FlussError::UnsupportedVersion => "The version of API is not supported.",
+            FlussError::CorruptMessage => {
+                "This message has failed its CRC checksum, exceeds the valid size, has a null key for a primary key table, or is otherwise corrupt."
+            }
+            FlussError::DatabaseNotExist => "The database does not exist.",
+            FlussError::DatabaseNotEmpty => "The database is not empty.",
+            FlussError::DatabaseAlreadyExist => "The database already exists.",
+            FlussError::TableNotExist => "The table does not exist.",
+            FlussError::TableAlreadyExist => "The table already exists.",
+            FlussError::SchemaNotExist => "The schema does not exist.",
+            FlussError::LogStorageException => {
+                "Exception occur while storage data for log in server."
+            }
+            FlussError::KvStorageException => {
+                "Exception occur while storage data for kv in server."
+            }
+            FlussError::NotLeaderOrFollower => "Not leader or follower.",
+            FlussError::RecordTooLargeException => "The record is too large.",
+            FlussError::CorruptRecordException => "The record is corrupt.",
+            FlussError::InvalidTableException => {
+                "The client has attempted to perform an operation on an invalid table."
+            }
+            FlussError::InvalidDatabaseException => {
+                "The client has attempted to perform an operation on an invalid database."
+            }
+            FlussError::InvalidReplicationFactor => {
+                "The replication factor is larger then the number of available tablet servers."
+            }
+            FlussError::InvalidRequiredAcks => {
+                "Produce request specified an invalid value for required acks."
+            }
+            FlussError::LogOffsetOutOfRangeException => "The log offset is out of range.",
+            FlussError::NonPrimaryKeyTableException => "The table is not primary key table.",
+            FlussError::UnknownTableOrBucketException => "The table or bucket does not exist.",
+            FlussError::InvalidUpdateVersionException => "The update version is invalid.",
+            FlussError::InvalidCoordinatorException => "The coordinator is invalid.",
+            FlussError::FencedLeaderEpochException => "The leader epoch is invalid.",
+            FlussError::RequestTimeOut => "The request time out.",
+            FlussError::StorageException => "The general storage exception.",
+            FlussError::OperationNotAttemptedException => {
+                "The server did not attempt to execute this operation."
+            }
+            FlussError::NotEnoughReplicasAfterAppendException => {
+                "Records are written to the server already, but to fewer in-sync replicas than required."
+            }
+            FlussError::NotEnoughReplicasException => {
+                "Messages are rejected since there are fewer in-sync replicas than required."
+            }
+            FlussError::SecurityTokenException => "Get file access security token exception.",
+            FlussError::OutOfOrderSequenceException => {
+                "The tablet server received an out of order sequence batch."
+            }
+            FlussError::DuplicateSequenceException => {
+                "The tablet server received a duplicate sequence batch."
+            }
+            FlussError::UnknownWriterIdException => {
+                "This exception is raised by the tablet server if it could not locate the writer metadata."
+            }
+            FlussError::InvalidColumnProjection => "The requested column projection is invalid.",
+            FlussError::InvalidTargetColumn => "The requested target column to write is invalid.",
+            FlussError::PartitionNotExists => "The partition does not exist.",
+            FlussError::TableNotPartitionedException => "The table is not partitioned.",
+            FlussError::InvalidTimestampException => "The timestamp is invalid.",
+            FlussError::InvalidConfigException => "The config is invalid.",
+            FlussError::LakeStorageNotConfiguredException => "The lake storage is not configured.",
+            FlussError::KvSnapshotNotExist => "The kv snapshot does not exist.",
+            FlussError::PartitionAlreadyExists => "The partition already exists.",
+            FlussError::PartitionSpecInvalidException => "The partition spec is invalid.",
+            FlussError::LeaderNotAvailableException => {
+                "There is no currently available leader for the given partition."
+            }
+            FlussError::PartitionMaxNumException => "Exceed the maximum number of partitions.",
+            FlussError::AuthenticateException => "Authentication failed.",
+            FlussError::SecurityDisabledException => "Security is disabled.",
+            FlussError::AuthorizationException => "Authorization failed.",
+            FlussError::BucketMaxNumException => "Exceed the maximum number of buckets.",
+            FlussError::FencedTieringEpochException => "The tiering epoch is invalid.",
+            FlussError::RetriableAuthenticateException => {
+                "Authentication failed with retriable exception."
+            }
+            FlussError::InvalidServerRackInfoException => "The server rack info is invalid.",
+            FlussError::LakeSnapshotNotExist => "The lake snapshot does not exist.",
+            FlussError::LakeTableAlreadyExist => "The lake table already exists.",
+            FlussError::IneligibleReplicaException => {
+                "The new ISR contains at least one ineligible replica."
+            }
+            FlussError::InvalidAlterTableException => "The alter table is invalid.",
+            FlussError::DeletionDisabledException => {
+                "Deletion operations are disabled on this table."
+            }
+        }
+    }
+
+    /// Builds an `ApiError` carrying this code; a `Some(message)` overrides the default text.
+    pub fn to_api_error(&self, message: Option<String>) -> ApiError {
+        ApiError {
+            code: self.code(),
+            message: message.unwrap_or_else(|| self.message().to_string()),
+        }
+    }
+
+    /// Maps a wire code to its variant; any unrecognized code yields `UnknownServerError`.
+    pub fn for_code(code: i32) -> Self {
+        match code {
+            -1 => FlussError::UnknownServerError,
+            0 => FlussError::None,
+            1 => FlussError::NetworkException,
+            2 => FlussError::UnsupportedVersion,
+            3 => FlussError::CorruptMessage,
+            4 => FlussError::DatabaseNotExist,
+            5 => FlussError::DatabaseNotEmpty,
+            6 => FlussError::DatabaseAlreadyExist,
+            7 => FlussError::TableNotExist,
+            8 => FlussError::TableAlreadyExist,
+            9 => FlussError::SchemaNotExist,
+            10 => FlussError::LogStorageException,
+            11 => FlussError::KvStorageException,
+            12 => FlussError::NotLeaderOrFollower,
+            13 => FlussError::RecordTooLargeException,
+            14 => FlussError::CorruptRecordException,
+            15 => FlussError::InvalidTableException,
+            16 => FlussError::InvalidDatabaseException,
+            17 => FlussError::InvalidReplicationFactor,
+            18 => FlussError::InvalidRequiredAcks,
+            19 => FlussError::LogOffsetOutOfRangeException,
+            20 => FlussError::NonPrimaryKeyTableException,
+            21 => FlussError::UnknownTableOrBucketException,
+            22 => FlussError::InvalidUpdateVersionException,
+            23 => FlussError::InvalidCoordinatorException,
+            24 => FlussError::FencedLeaderEpochException,
+            25 => FlussError::RequestTimeOut,
+            26 => FlussError::StorageException,
+            27 => FlussError::OperationNotAttemptedException,
+            28 => FlussError::NotEnoughReplicasAfterAppendException,
+            29 => FlussError::NotEnoughReplicasException,
+            30 => FlussError::SecurityTokenException,
+            31 => FlussError::OutOfOrderSequenceException,
+            32 => FlussError::DuplicateSequenceException,
+            33 => FlussError::UnknownWriterIdException,
+            34 => FlussError::InvalidColumnProjection,
+            35 => FlussError::InvalidTargetColumn,
+            36 => FlussError::PartitionNotExists,
+            37 => FlussError::TableNotPartitionedException,
+            38 => FlussError::InvalidTimestampException,
+            39 => FlussError::InvalidConfigException,
+            40 => FlussError::LakeStorageNotConfiguredException,
+            41 => FlussError::KvSnapshotNotExist,
+            42 => FlussError::PartitionAlreadyExists,
+            43 => FlussError::PartitionSpecInvalidException,
+            44 => FlussError::LeaderNotAvailableException,
+            45 => FlussError::PartitionMaxNumException,
+            46 => FlussError::AuthenticateException,
+            47 => FlussError::SecurityDisabledException,
+            48 => FlussError::AuthorizationException,
+            49 => FlussError::BucketMaxNumException,
+            50 => FlussError::FencedTieringEpochException,
+            51 => FlussError::RetriableAuthenticateException,
+            52 => FlussError::InvalidServerRackInfoException,
+            53 => FlussError::LakeSnapshotNotExist,
+            54 => FlussError::LakeTableAlreadyExist,
+            55 => FlussError::IneligibleReplicaException,
+            56 => FlussError::InvalidAlterTableException,
+            57 => FlussError::DeletionDisabledException,
+            _ => FlussError::UnknownServerError,
+        }
+    }
+}
+
+impl Display for FlussError {
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        f.write_str(self.message()) // the default description, written verbatim
+    }
+}
+
+impl From<ErrorResponse> for ApiError {
+    fn from(error_response: ErrorResponse) -> Self {
+        let code = error_response.error_code;
+        // Without server-supplied text, use the default description of the mapped error.
+        let default_text = || FlussError::for_code(code).message().to_string();
+        let message = error_response.error_message.unwrap_or_else(default_text);
+        Self { code, message }
+    }
+}
+
+impl From<ApiError> for FlussError {
+    fn from(api_error: ApiError) -> Self {
+        Self::for_code(api_error.code) // only the code is used; the message is dropped
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn for_code_maps_known_and_unknown() {
+        assert_eq!(FlussError::for_code(0), FlussError::None);
+        // A known variant must survive a code() -> for_code() round trip.
+        let known = FlussError::AuthorizationException;
+        assert_eq!(FlussError::for_code(known.code()), known);
+        // Codes outside the table collapse to the catch-all variant.
+        assert_eq!(FlussError::for_code(9999), FlussError::UnknownServerError);
+    }
+
+    #[test]
+    fn to_api_error_uses_message() {
+        let api = FlussError::InvalidTableException.to_api_error(None);
+        assert_eq!(api.code, FlussError::InvalidTableException.code());
+        assert!(api.message.contains("invalid table"));
+    }
+
+    #[test]
+    fn error_response_conversion_round_trip() {
+        let expected = FlussError::TableNotExist;
+        let wire = ErrorResponse {
+            error_code: expected.code(),
+            error_message: Some(String::from("missing")),
+        };
+        let converted: ApiError = wire.into();
+        assert_eq!(converted.code, expected.code());
+        assert_eq!(converted.message, "missing");
+        assert_eq!(FlussError::from(converted), expected);
+    }
+
+    #[test]
+    fn error_response_preserves_unknown_wire_code() {
+        let wire = ErrorResponse {
+            error_code: 9999,
+            error_message: Some(String::from("NewException: forward compat")),
+        };
+        let converted: ApiError = wire.into();
+        assert_eq!(converted.code, 9999);
+        assert_eq!(converted.message, "NewException: forward compat");
+        assert_eq!(FlussError::from(converted), FlussError::UnknownServerError);
+    }
+
+    #[test]
+    fn error_response_falls_back_to_default_message_for_unknown_code() {
+        let wire = ErrorResponse {
+            error_code: 9999,
+            error_message: None,
+        };
+        let converted: ApiError = wire.into();
+        assert_eq!(converted.code, 9999);
+        assert_eq!(converted.message, FlussError::UnknownServerError.message());
+        assert!(!converted.is_retriable());
+    }
+
+    #[test]
+    fn is_retriable_known_retriable_errors() {
+        let expect_retry = [
+            FlussError::NetworkException,
+            FlussError::CorruptMessage,
+            FlussError::SchemaNotExist,
+            FlussError::LogStorageException,
+            FlussError::KvStorageException,
+            FlussError::NotLeaderOrFollower,
+            FlussError::CorruptRecordException,
+            FlussError::UnknownTableOrBucketException,
+            FlussError::RequestTimeOut,
+            FlussError::StorageException,
+            FlussError::NotEnoughReplicasAfterAppendException,
+            FlussError::NotEnoughReplicasException,
+            FlussError::LeaderNotAvailableException,
+        ];
+        for err in expect_retry.iter() {
+            assert!(err.is_retriable(), "{err:?} should be retriable");
+        }
+    }
+
+    #[test]
+    fn is_retriable_known_non_retriable_errors() {
+        let expect_permanent = [
+            FlussError::UnknownServerError,
+            FlussError::None,
+            FlussError::TableNotExist,
+            FlussError::AuthenticateException,
+            FlussError::AuthorizationException,
+            FlussError::RecordTooLargeException,
+            FlussError::DeletionDisabledException,
+            FlussError::InvalidCoordinatorException,
+            FlussError::FencedLeaderEpochException,
+            FlussError::FencedTieringEpochException,
+            FlussError::RetriableAuthenticateException,
+        ];
+        for err in expect_permanent.iter() {
+            assert!(!err.is_retriable(), "{err:?} should not be retriable");
+        }
+    }
+
+    #[test]
+    fn api_error_is_retriable_delegates_to_fluss_error() {
+        // A retriable kind stays retriable once wrapped in an ApiError...
+        assert!(FlussError::RequestTimeOut.to_api_error(None).is_retriable());
+
+        // ...and a permanent kind stays permanent.
+        assert!(!FlussError::TableNotExist.to_api_error(None).is_retriable());
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/rpc/frame.rs b/fluss-rust/crates/fluss/src/rpc/frame.rs
new file mode 100644
index 0000000000..81cc0946d7
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/rpc/frame.rs
@@ -0,0 +1,110 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use prost::DecodeError;
+use thiserror::Error;
+use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt};
+
+#[derive(Error, Debug)]
+#[non_exhaustive]
+pub enum ReadError {
+    #[error("Cannot read data: {0}")]
+    IO(#[from] std::io::Error),
+    /// The 4-byte length prefix decoded to a negative `i32`; `size` is that raw value.
+    #[error("Negative message size: {size}")]
+    NegativeMessageSize { size: i32 },
+    /// The declared length `actual` is greater than the caller's `limit` (both in bytes).
+    #[error("Message too large, limit is {limit} bytes but got {actual} bytes")]
+    MessageTooLarge { limit: usize, actual: usize },
+    /// Wraps a prost `DecodeError`. NOTE(review): presumably raised by callers decoding an error response — confirm.
+    #[error("Fail to decode error response: {0}")]
+    ProtoErrorResponseDecodeError(#[from] DecodeError),
+}
+
+pub trait AsyncMessageRead {
+    fn read_message(
+        &mut self,
+        max_message_size: usize,
+    ) -> impl Future<Output = Result<Vec<u8>, ReadError>> + Send;
+}
+// Blanket impl: any `AsyncRead + Send + Unpin` can read frames of `[i32 big-endian length][payload]`.
+impl<R> AsyncMessageRead for R
+where
+    R: AsyncRead + Send + Unpin,
+{
+    async fn read_message(&mut self, max_message_size: usize) -> Result<Vec<u8>, ReadError> {
+        let mut len_buf = [0u8; 4];
+        self.read_exact(&mut len_buf).await?;
+        let len = i32::from_be_bytes(len_buf);
+        // A negative prefix cannot be a length; report it together with the raw value.
+        let len = usize::try_from(len).map_err(|_| ReadError::NegativeMessageSize { size: len })?;
+        // check max message size to not blow up memory
+        if len > max_message_size {
+            // We need to seek so that next message is readable. However `self.seek` would require `R: AsyncSeek` which
+            // doesn't hold for many types we want to work with. So do some manual seeking.
+            let mut to_read = len;
+            let mut buf = vec![]; // allocate empty buffer
+            while to_read > 0 {
+                // Discard at most `max_message_size` bytes per read, but always at least one:
+                // with a limit of 0 the step would be 0 and this loop would never finish.
+                let step = max_message_size.clamp(1, to_read);
+                buf.resize(step, 0);
+
+                self.read_exact(&mut buf).await?;
+                to_read -= step;
+            }
+
+            return Err(ReadError::MessageTooLarge {
+                limit: max_message_size,
+                actual: len,
+            });
+        }
+
+        let mut buf = vec![0u8; len];
+        self.read_exact(&mut buf).await?;
+        Ok(buf)
+    }
+}
+
+#[derive(Error, Debug)]
+#[non_exhaustive]
+pub enum WriteError {
+    #[error("Cannot write data: {0}")]
+    IO(#[from] std::io::Error),
+    /// The payload length does not fit the `i32` length prefix; `size` is that length in bytes.
+    #[error("Message too large: {size}")]
+    TooLarge { size: usize },
+}
+
+pub trait AsyncMessageWrite {
+    fn write_message(&mut self, msg: &[u8]) -> impl Future<Output = Result<(), WriteError>> + Send;
+}
+// Blanket impl: writes `msg` as a 4-byte big-endian `i32` length followed by the payload bytes.
+impl<W> AsyncMessageWrite for W
+where
+    W: AsyncWrite + Send + Unpin,
+{
+    async fn write_message(&mut self, msg: &[u8]) -> Result<(), WriteError> {
+        let size = msg.len();
+        let prefix = i32::try_from(size).map_err(|_| WriteError::TooLarge { size })?.to_be_bytes();
+        self.write_all(&prefix).await?;
+        if !msg.is_empty() {
+            self.write_all(msg).await?;
+        }
+        Ok(())
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/rpc/message/api_versions.rs b/fluss-rust/crates/fluss/src/rpc/message/api_versions.rs
new file mode 100644
index 0000000000..579c66a7b1
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/rpc/message/api_versions.rs
@@ -0,0 +1,50 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::proto::{
+    ApiVersionsRequest as ProtoApiVersionsRequest, ApiVersionsResponse as ProtoApiVersionsResponse,
+};
+use crate::rpc::api_key::ApiKey;
+use crate::rpc::frame::{ReadError, WriteError};
+use crate::rpc::message::{ReadType, RequestBody, WriteType};
+use crate::{impl_read_type, impl_write_type};
+use bytes::{Buf, BufMut};
+use prost::Message;
+
+#[derive(Debug, Clone)]
+pub struct ApiVersionsRequest {
+    pub inner_request: ProtoApiVersionsRequest,
+}
+
+impl ApiVersionsRequest {
+    /// Wraps a protobuf request holding owned copies of the client's name and version.
+    pub fn new(client_name: &str, client_version: &str) -> Self {
+        let inner_request = ProtoApiVersionsRequest {
+            client_software_name: String::from(client_name),
+            client_software_version: String::from(client_version),
+        };
+        Self { inner_request }
+    }
+}
+
+impl RequestBody for ApiVersionsRequest {
+    type ResponseBody = ProtoApiVersionsResponse;
+    const API_KEY: ApiKey = ApiKey::ApiVersion;
+}
+// NOTE(review): presumably these macros emit the `WriteType`/`ReadType` impls for the pair — confirm.
+impl_write_type!(ApiVersionsRequest);
+impl_read_type!(ProtoApiVersionsResponse);
diff --git a/fluss-rust/crates/fluss/src/rpc/message/authenticate.rs b/fluss-rust/crates/fluss/src/rpc/message/authenticate.rs
new file mode 100644
index 0000000000..1874b30463
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/rpc/message/authenticate.rs
@@ -0,0 +1,84 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::proto::{AuthenticateRequest as ProtoAuthenticateRequest, AuthenticateResponse};
+use crate::rpc::api_key::ApiKey;
+use crate::rpc::frame::{ReadError, WriteError};
+use crate::rpc::message::{ReadType, RequestBody, WriteType};
+use crate::{impl_read_type, impl_write_type};
+use bytes::{Buf, BufMut};
+use prost::Message;
+
+#[derive(Debug, Clone)]
+pub struct AuthenticateRequest {
+    pub inner_request: ProtoAuthenticateRequest,
+}
+
+impl AuthenticateRequest {
+    /// Creates a SASL/PLAIN authenticate request from a username/password pair.
+    ///
+    /// The token is the byte string `\0<username>\0<password>`: a NUL byte precedes
+    /// each field, and both fields are the UTF-8 bytes of the given strings.
+    pub fn new_plain(username: &str, password: &str) -> Self {
+        const NUL: &[u8] = b"\0";
+        // `concat` joins the four pieces in order into one freshly allocated buffer.
+        let token = [NUL, username.as_bytes(), NUL, password.as_bytes()].concat();
+
+        Self {
+            inner_request: ProtoAuthenticateRequest {
+                protocol: String::from("PLAIN"),
+                token,
+            },
+        }
+    }
+
+    /// Creates a request whose token is `challenge`, tagged with the given `protocol` name;
+    /// meant for building a request from a server challenge during multi-round authentication.
+    pub fn from_challenge(protocol: &str, challenge: Vec<u8>) -> Self {
+        let inner_request = ProtoAuthenticateRequest {
+            protocol: protocol.to_owned(),
+            token: challenge,
+        };
+        Self { inner_request }
+    }
+}
+
+impl RequestBody for AuthenticateRequest {
+    type ResponseBody = AuthenticateResponse;
+    const API_KEY: ApiKey = ApiKey::Authenticate;
+}
+// NOTE(review): presumably these macros emit the `WriteType`/`ReadType` impls for the pair — confirm.
+impl_write_type!(AuthenticateRequest);
+impl_read_type!(AuthenticateResponse);
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_new_plain_token_format() {
+        let proto = AuthenticateRequest::new_plain("admin", "secret").inner_request;
+        assert_eq!(proto.protocol, "PLAIN");
+        assert_eq!(proto.token, b"\0admin\0secret");
+    }
+
+    #[test]
+    fn test_new_plain_empty_credentials() {
+        let proto = AuthenticateRequest::new_plain("", "").inner_request;
+        assert_eq!(proto.token, b"\0\0");
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/rpc/message/create_database.rs b/fluss-rust/crates/fluss/src/rpc/message/create_database.rs
new file mode 100644
index 0000000000..ed0868da6f
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/rpc/message/create_database.rs
@@ -0,0 +1,65 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::metadata::DatabaseDescriptor;
+use crate::{impl_read_type, impl_write_type, proto};
+
+use crate::error::Result as FlussResult;
+use crate::proto::CreateDatabaseResponse;
+use crate::rpc::api_key::ApiKey;
+use crate::rpc::frame::ReadError;
+use crate::rpc::frame::WriteError;
+use crate::rpc::message::{ReadType, RequestBody, WriteType};
+
+use bytes::{Buf, BufMut};
+use prost::Message;
+
+/// Request wrapper around the protobuf `proto::CreateDatabaseRequest` message.
+#[derive(Debug)]
+pub struct CreateDatabaseRequest {
+    /// Underlying protobuf message.
+    pub inner_request: proto::CreateDatabaseRequest,
+}
+
+impl CreateDatabaseRequest {
+    /// Builds a create-database request for `database_name`.
+    ///
+    /// When `database_descriptor` is `Some`, it is serialized with
+    /// `to_json_bytes` and stored as `database_json`; when it is `None`, the
+    /// field stays `None`. `ignore_if_exists` is forwarded unchanged.
+    ///
+    /// # Errors
+    ///
+    /// Propagates the error returned by `to_json_bytes`.
+    pub fn new(
+        database_name: &str,
+        database_descriptor: Option<&DatabaseDescriptor>,
+        ignore_if_exists: bool,
+    ) -> FlussResult<Self> {
+        // Option<Result<_>> -> Result<Option<_>> so `?` can surface the error.
+        let database_json = database_descriptor
+            .map(|descriptor| descriptor.to_json_bytes())
+            .transpose()?;
+
+        let inner_request = proto::CreateDatabaseRequest {
+            database_name: database_name.to_owned(),
+            ignore_if_exists,
+            database_json,
+        };
+        Ok(Self { inner_request })
+    }
+}
+
+// Declares the response type that answers a create-database request and the
+// `ApiKey` variant associated with it.
+impl RequestBody for CreateDatabaseRequest {
+    const API_KEY: ApiKey = ApiKey::CreateDatabase;
+    type ResponseBody = CreateDatabaseResponse;
+}
+
+// `impl_write_type!` / `impl_read_type!` are crate macros defined outside this
+// file; presumably they provide the `WriteType` / `ReadType` impls for the
+// request and response -- confirm against the macro definitions.
+impl_write_type!(CreateDatabaseRequest);
+impl_read_type!(CreateDatabaseResponse);
diff --git a/fluss-rust/crates/fluss/src/rpc/message/create_partition.rs b/fluss-rust/crates/fluss/src/rpc/message/create_partition.rs
new file mode 100644
index 0000000000..68595cfc5c
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/rpc/message/create_partition.rs
@@ -0,0 +1,57 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::metadata::{PartitionSpec, TablePath};
+use crate::proto::CreatePartitionResponse;
+use crate::rpc::api_key::ApiKey;
+use crate::rpc::convert::to_table_path;
+use crate::rpc::frame::{ReadError, WriteError};
+use crate::rpc::message::{ReadType, RequestBody, WriteType};
+use crate::{impl_read_type, impl_write_type, proto};
+use bytes::{Buf, BufMut};
+use prost::Message;
+
+/// Request wrapper around the protobuf `proto::CreatePartitionRequest` message.
+#[derive(Debug)]
+pub struct CreatePartitionRequest {
+    /// Underlying protobuf message.
+    pub inner_request: proto::CreatePartitionRequest,
+}
+
+impl CreatePartitionRequest {
+    /// Builds a request to create the partition described by `partition_spec`
+    /// in the table at `table_path`.
+    ///
+    /// `ignore_if_exists` is forwarded through the proto field named
+    /// `ignore_if_not_exists` (see the inline note).
+    pub fn new(
+        table_path: &TablePath,
+        partition_spec: &PartitionSpec,
+        ignore_if_exists: bool,
+    ) -> Self {
+        let inner_request = proto::CreatePartitionRequest {
+            table_path: to_table_path(table_path),
+            partition_spec: partition_spec.to_pb(),
+            // The canonical proto field is misnamed: it is the slot that
+            // carries the "ignore if exists" flag.
+            ignore_if_not_exists: ignore_if_exists,
+        };
+        Self { inner_request }
+    }
+}
+
+// Declares the response type that answers a create-partition request and the
+// `ApiKey` variant associated with it.
+impl RequestBody for CreatePartitionRequest {
+    const API_KEY: ApiKey = ApiKey::CreatePartition;
+    type ResponseBody = CreatePartitionResponse;
+}
+
+// `impl_write_type!` / `impl_read_type!` are crate macros defined outside this
+// file; presumably they provide the `WriteType` / `ReadType` impls for the
+// request and response -- confirm against the macro definitions.
+impl_write_type!(CreatePartitionRequest);
+impl_read_type!(CreatePartitionResponse);
diff --git a/fluss-rust/crates/fluss/src/rpc/message/create_table.rs b/fluss-rust/crates/fluss/src/rpc/message/create_table.rs
new file mode 100644
index 0000000000..4647fec686
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/rpc/message/create_table.rs
@@ -0,0 +1,60 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::metadata::{JsonSerde, TableDescriptor, TablePath};
+use crate::{impl_read_type, impl_write_type, proto};
+
+use crate::error::Result as FlussResult;
+use crate::proto::CreateTableResponse;
+use crate::rpc::api_key::ApiKey;
+use crate::rpc::convert::to_table_path;
+use crate::rpc::frame::ReadError;
+use crate::rpc::frame::WriteError;
+use crate::rpc::message::{ReadType, RequestBody, WriteType};
+
+use bytes::{Buf, BufMut};
+use prost::Message;
+
+/// Request wrapper around the protobuf `proto::CreateTableRequest` message.
+#[derive(Debug)]
+pub struct CreateTableRequest {
+    /// Underlying protobuf message.
+    pub inner_request: proto::CreateTableRequest,
+}
+
+impl CreateTableRequest {
+    /// Builds a create-table request for the table at `table_path`.
+    ///
+    /// `table_descriptor` is converted with `serialize_json`, the result is
+    /// encoded with `serde_json::to_vec`, and the bytes are stored as
+    /// `table_json`. `ignore_if_exists` is forwarded unchanged.
+    ///
+    /// # Errors
+    ///
+    /// Propagates the error returned by `serialize_json`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `serde_json::to_vec` fails, because that result is unwrapped.
+    pub fn new(
+        table_path: &TablePath,
+        table_descriptor: &TableDescriptor,
+        ignore_if_exists: bool,
+    ) -> FlussResult<Self> {
+        let pb_table_path = to_table_path(table_path);
+        let descriptor_json = table_descriptor.serialize_json()?;
+        // NOTE(review): presumably this cannot fail for an in-memory JSON
+        // value -- confirm, or map the error into `FlussResult` instead.
+        let table_json = serde_json::to_vec(&descriptor_json).unwrap();
+
+        let inner_request = proto::CreateTableRequest {
+            table_path: pb_table_path,
+            table_json,
+            ignore_if_exists,
+        };
+        Ok(Self { inner_request })
+    }
+}
+
+// Declares the response type that answers a create-table request and the
+// `ApiKey` variant associated with it.
+impl RequestBody for CreateTableRequest {
+    const API_KEY: ApiKey = ApiKey::CreateTable;
+    type ResponseBody = CreateTableResponse;
+}
+
+// `impl_write_type!` / `impl_read_type!` are crate macros defined outside this
+// file; presumably they provide the `WriteType` / `ReadType` impls for the
+// request and response -- confirm against the macro definitions.
+impl_write_type!(CreateTableRequest);
+impl_read_type!(CreateTableResponse);
diff --git a/fluss-rust/crates/fluss/src/rpc/message/database_exists.rs b/fluss-rust/crates/fluss/src/rpc/message/database_exists.rs
new file mode 100644
index 0000000000..4a9588a209
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/rpc/message/database_exists.rs
@@ -0,0 +1,49 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::rpc::frame::ReadError;
+
+use crate::rpc::api_key::ApiKey;
+use crate::rpc::frame::WriteError;
+use crate::rpc::message::{ReadType, RequestBody, WriteType};
+use crate::{impl_read_type, impl_write_type, proto};
+use bytes::{Buf, BufMut};
+use prost::Message;
+
+/// Request wrapper around the protobuf `proto::DatabaseExistsRequest` message.
+#[derive(Debug)]
+pub struct DatabaseExistsRequest {
+    /// Underlying protobuf message.
+    pub inner_request: proto::DatabaseExistsRequest,
+}
+
+impl DatabaseExistsRequest {
+    /// Builds an existence query for the database called `database_name`.
+    pub fn new(database_name: &str) -> Self {
+        let inner_request = proto::DatabaseExistsRequest {
+            database_name: database_name.to_owned(),
+        };
+        Self { inner_request }
+    }
+}
+
+// Declares the response type that answers a database-exists request and the
+// `ApiKey` variant associated with it.
+impl RequestBody for DatabaseExistsRequest {
+    const API_KEY: ApiKey = ApiKey::DatabaseExists;
+    type ResponseBody = proto::DatabaseExistsResponse;
+}
+
+// `impl_write_type!` / `impl_read_type!` are crate macros defined outside this
+// file; presumably they provide the `WriteType` / `ReadType` impls for the
+// request and response -- confirm against the macro definitions.
+impl_write_type!(DatabaseExistsRequest);
+impl_read_type!(proto::DatabaseExistsResponse);
diff --git a/fluss-rust/crates/fluss/src/rpc/message/drop_database.rs b/fluss-rust/crates/fluss/src/rpc/message/drop_database.rs
new file mode 100644
index 0000000000..bf7477f311
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/rpc/message/drop_database.rs
@@ -0,0 +1,51 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::rpc::frame::ReadError;
+
+use crate::rpc::api_key::ApiKey;
+use crate::rpc::frame::WriteError;
+use crate::rpc::message::{ReadType, RequestBody, WriteType};
+use crate::{impl_read_type, impl_write_type, proto};
+use bytes::{Buf, BufMut};
+use prost::Message;
+
+/// Request wrapper around the protobuf `proto::DropDatabaseRequest` message.
+#[derive(Debug)]
+pub struct DropDatabaseRequest {
+    /// Underlying protobuf message.
+    pub inner_request: proto::DropDatabaseRequest,
+}
+
+impl DropDatabaseRequest {
+    /// Builds a drop request for the database called `database_name`.
+    ///
+    /// `ignore_if_not_exists` and `cascade` are copied into the message
+    /// fields of the same name.
+    pub fn new(database_name: &str, ignore_if_not_exists: bool, cascade: bool) -> Self {
+        let inner_request = proto::DropDatabaseRequest {
+            database_name: database_name.to_owned(),
+            ignore_if_not_exists,
+            cascade,
+        };
+        Self { inner_request }
+    }
+}
+
+// Declares the response type that answers a drop-database request and the
+// `ApiKey` variant associated with it.
+impl RequestBody for DropDatabaseRequest {
+    const API_KEY: ApiKey = ApiKey::DropDatabase;
+    type ResponseBody = proto::DropDatabaseResponse;
+}
+
+// `impl_write_type!` / `impl_read_type!` are crate macros defined outside this
+// file; presumably they provide the `WriteType` / `ReadType` impls for the
+// request and response -- confirm against the macro definitions.
+impl_write_type!(DropDatabaseRequest);
+impl_read_type!(proto::DropDatabaseResponse);
diff --git a/fluss-rust/crates/fluss/src/rpc/message/drop_partition.rs b/fluss-rust/crates/fluss/src/rpc/message/drop_partition.rs
new file mode 100644
index 0000000000..c7494acbac
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/rpc/message/drop_partition.rs
@@ -0,0 +1,56 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::metadata::{PartitionSpec, TablePath};
+use crate::proto::DropPartitionResponse;
+use crate::rpc::api_key::ApiKey;
+use crate::rpc::convert::to_table_path;
+use crate::rpc::frame::{ReadError, WriteError};
+use crate::rpc::message::{ReadType, RequestBody, WriteType};
+use crate::{impl_read_type, impl_write_type, proto};
+use bytes::{Buf, BufMut};
+use prost::Message;
+
+/// Request wrapper around the protobuf `proto::DropPartitionRequest` message.
+#[derive(Debug)]
+pub struct DropPartitionRequest {
+    /// Underlying protobuf message.
+    pub inner_request: proto::DropPartitionRequest,
+}
+
+impl DropPartitionRequest {
+    /// Builds a request to drop the partition described by `partition_spec`
+    /// from the table at `table_path`.
+    ///
+    /// `ignore_if_not_exists` is copied into the message field of the same
+    /// name.
+    pub fn new(
+        table_path: &TablePath,
+        partition_spec: &PartitionSpec,
+        ignore_if_not_exists: bool,
+    ) -> Self {
+        let inner_request = proto::DropPartitionRequest {
+            table_path: to_table_path(table_path),
+            partition_spec: partition_spec.to_pb(),
+            ignore_if_not_exists,
+        };
+        Self { inner_request }
+    }
+}
+
+// Declares the response type that answers a drop-partition request and the
+// `ApiKey` variant associated with it.
+impl RequestBody for DropPartitionRequest {
+    const API_KEY: ApiKey = ApiKey::DropPartition;
+    type ResponseBody = DropPartitionResponse;
+}
+
+// `impl_write_type!` / `impl_read_type!` are crate macros defined outside this
+// file; presumably they provide the `WriteType` / `ReadType` impls for the
+// request and response -- confirm against the macro definitions.
+impl_write_type!(DropPartitionRequest);
+impl_read_type!(DropPartitionResponse);
diff --git a/fluss-rust/crates/fluss/src/rpc/message/drop_table.rs b/fluss-rust/crates/fluss/src/rpc/message/drop_table.rs
new file mode 100644
index 0000000000..b452cf075e
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/rpc/message/drop_table.rs
@@ -0,0 +1,55 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::metadata::TablePath;
+use crate::{impl_read_type, impl_write_type, proto};
+
+use crate::proto::DropTableResponse;
+use crate::rpc::frame::ReadError;
+
+use crate::rpc::api_key::ApiKey;
+use crate::rpc::convert::to_table_path;
+use crate::rpc::frame::WriteError;
+use crate::rpc::message::{ReadType, RequestBody, WriteType};
+
+use bytes::{Buf, BufMut};
+use prost::Message;
+
+/// Request wrapper around the protobuf `proto::DropTableRequest` message.
+#[derive(Debug)]
+pub struct DropTableRequest {
+    /// Underlying protobuf message.
+    pub inner_request: proto::DropTableRequest,
+}
+
+impl DropTableRequest {
+    /// Builds a drop request for the table at `table_path`.
+    ///
+    /// `ignore_if_not_exists` is copied into the message field of the same
+    /// name.
+    pub fn new(table_path: &TablePath, ignore_if_not_exists: bool) -> Self {
+        let inner_request = proto::DropTableRequest {
+            table_path: to_table_path(table_path),
+            ignore_if_not_exists,
+        };
+        Self { inner_request }
+    }
+}
+
+// Declares the response type that answers a drop-table request and the
+// `ApiKey` variant associated with it.
+impl RequestBody for DropTableRequest {
+    const API_KEY: ApiKey = ApiKey::DropTable;
+    type ResponseBody = DropTableResponse;
+}
+
+// `impl_write_type!` / `impl_read_type!` are crate macros defined outside this
+// file; presumably they provide the `WriteType` / `ReadType` impls for the
+// request and response -- confirm against the macro definitions.
+impl_write_type!(DropTableRequest);
+impl_read_type!(DropTableResponse);
diff --git a/fluss-rust/crates/fluss/src/rpc/message/fetch.rs b/fluss-rust/crates/fluss/src/rpc/message/fetch.rs
new file mode 100644
index 0000000000..67930f844f
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/rpc/message/fetch.rs
@@ -0,0 +1,55 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::proto::FetchLogResponse;
+use crate::rpc::frame::ReadError;
+
+use crate::rpc::api_key::ApiKey;
+use crate::rpc::frame::WriteError;
+use crate::rpc::message::{ReadType, RequestBody, WriteType};
+use crate::{impl_read_type, impl_write_type, proto};
+use prost::Message;
+
+use bytes::{Buf, BufMut};
+
+// Fetch-related constants. None of them is referenced in this file, hence
+// the `dead_code` allowances.
+
+/// 16 * 1024 * 1024 (16 MiB, going by the `_BYTES` suffix).
+#[allow(dead_code)]
+const LOG_FETCH_MAX_BYTES: i32 = 16 * 1024 * 1024;
+/// Lower bound counterpart of `LOG_FETCH_MAX_BYTES`.
+#[allow(dead_code)]
+const LOG_FETCH_MIN_BYTES: i32 = 1;
+/// NOTE(review): the unit is not stated here -- presumably milliseconds; confirm.
+#[allow(dead_code)]
+const LOG_FETCH_WAIT_MAX_TIME: i32 = 500;
+
+/// Request wrapper around the protobuf `proto::FetchLogRequest` message.
+pub struct FetchLogRequest {
+    /// Underlying protobuf message.
+    pub inner_request: proto::FetchLogRequest,
+}
+
+impl FetchLogRequest {
+    /// Wraps an already-built protobuf message; it is stored as given.
+    pub fn new(fetch_log_request: proto::FetchLogRequest) -> Self {
+        let inner_request = fetch_log_request;
+        FetchLogRequest { inner_request }
+    }
+}
+
+// Declares the response type that answers a fetch-log request and the
+// `ApiKey` variant associated with it.
+impl RequestBody for FetchLogRequest {
+    const API_KEY: ApiKey = ApiKey::FetchLog;
+    type ResponseBody = FetchLogResponse;
+}
+
+// `impl_write_type!` / `impl_read_type!` are crate macros defined outside this
+// file; presumably they provide the `WriteType` / `ReadType` impls for the
+// request and response -- confirm against the macro definitions.
+impl_write_type!(FetchLogRequest);
+impl_read_type!(FetchLogResponse);
diff --git a/fluss-rust/crates/fluss/src/rpc/message/get_database_info.rs b/fluss-rust/crates/fluss/src/rpc/message/get_database_info.rs
new file mode 100644
index 0000000000..63647d5266
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/rpc/message/get_database_info.rs
@@ -0,0 +1,49 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::rpc::frame::ReadError;
+
+use crate::rpc::api_key::ApiKey;
+use crate::rpc::frame::WriteError;
+use crate::rpc::message::{ReadType, RequestBody, WriteType};
+use crate::{impl_read_type, impl_write_type, proto};
+use bytes::{Buf, BufMut};
+use prost::Message;
+
+/// Request wrapper around the protobuf `proto::GetDatabaseInfoRequest` message.
+#[derive(Debug)]
+pub struct GetDatabaseInfoRequest {
+    /// Underlying protobuf message.
+    pub inner_request: proto::GetDatabaseInfoRequest,
+}
+
+impl GetDatabaseInfoRequest {
+    /// Builds an info request for the database called `database_name`.
+    pub fn new(database_name: &str) -> Self {
+        let inner_request = proto::GetDatabaseInfoRequest {
+            database_name: database_name.to_owned(),
+        };
+        Self { inner_request }
+    }
+}
+
+// Declares the response type that answers a get-database-info request and
+// the `ApiKey` variant associated with it.
+impl RequestBody for GetDatabaseInfoRequest {
+    const API_KEY: ApiKey = ApiKey::GetDatabaseInfo;
+    type ResponseBody = proto::GetDatabaseInfoResponse;
+}
+
+// `impl_write_type!` / `impl_read_type!` are crate macros defined outside this
+// file; presumably they provide the `WriteType` / `ReadType` impls for the
+// request and response -- confirm against the macro definitions.
+impl_write_type!(GetDatabaseInfoRequest);
+impl_read_type!(proto::GetDatabaseInfoResponse);
diff --git a/fluss-rust/crates/fluss/src/rpc/message/get_latest_lake_snapshot.rs b/fluss-rust/crates/fluss/src/rpc/message/get_latest_lake_snapshot.rs
new file mode 100644
index 0000000000..5138fe7298
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/rpc/message/get_latest_lake_snapshot.rs
@@ -0,0 +1,57 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::proto;
+use crate::proto::PbTablePath;
+use crate::rpc::api_key::ApiKey;
+use crate::rpc::frame::WriteError;
+use crate::rpc::message::{ReadType, RequestBody, WriteType};
+
+use crate::metadata::TablePath;
+use crate::rpc::frame::ReadError;
+
+use crate::{impl_read_type, impl_write_type};
+use bytes::{Buf, BufMut};
+use prost::Message;
+
+/// Request wrapper around the protobuf `proto::GetLakeSnapshotRequest` message.
+#[derive(Debug)]
+pub struct GetLatestLakeSnapshotRequest {
+    /// Underlying protobuf message.
+    pub inner_request: proto::GetLakeSnapshotRequest,
+}
+
+impl GetLatestLakeSnapshotRequest {
+    /// Builds a lake-snapshot request for the table at `table_path`.
+    ///
+    /// Both optional fields, `snapshot_id` and `readable`, are left as `None`.
+    pub fn new(table_path: &TablePath) -> Self {
+        let pb_table_path = PbTablePath {
+            database_name: table_path.database().to_owned(),
+            table_name: table_path.table().to_owned(),
+        };
+
+        GetLatestLakeSnapshotRequest {
+            inner_request: proto::GetLakeSnapshotRequest {
+                table_path: pb_table_path,
+                snapshot_id: None,
+                readable: None,
+            },
+        }
+    }
+}
+
+// Declares the response type that answers a latest-lake-snapshot request and
+// the `ApiKey` variant associated with it.
+impl RequestBody for GetLatestLakeSnapshotRequest {
+    const API_KEY: ApiKey = ApiKey::GetLatestLakeSnapshot;
+    type ResponseBody = proto::GetLakeSnapshotResponse;
+}
+
+// `impl_write_type!` / `impl_read_type!` are crate macros defined outside this
+// file; presumably they provide the `WriteType` / `ReadType` impls for the
+// request and response -- confirm against the macro definitions.
+impl_write_type!(GetLatestLakeSnapshotRequest);
+impl_read_type!(proto::GetLakeSnapshotResponse);
diff --git a/fluss-rust/crates/fluss/src/rpc/message/get_security_token.rs b/fluss-rust/crates/fluss/src/rpc/message/get_security_token.rs
new file mode 100644
index 0000000000..741c8482f4
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/rpc/message/get_security_token.rs
@@ -0,0 +1,51 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::proto::{GetFileSystemSecurityTokenRequest, GetFileSystemSecurityTokenResponse};
+use crate::rpc::api_key::ApiKey;
+use crate::rpc::frame::{ReadError, WriteError};
+use crate::rpc::message::{ReadType, RequestBody, WriteType};
+use crate::{impl_read_type, impl_write_type};
+use bytes::{Buf, BufMut};
+use prost::Message;
+
+/// Request wrapper around the protobuf `GetFileSystemSecurityTokenRequest` message.
+#[derive(Debug)]
+pub struct GetSecurityTokenRequest {
+    /// Underlying protobuf message.
+    pub inner_request: GetFileSystemSecurityTokenRequest,
+}
+
+impl GetSecurityTokenRequest {
+    /// Creates the request; the protobuf message is built with no fields.
+    pub fn new() -> Self {
+        let inner_request = GetFileSystemSecurityTokenRequest {};
+        GetSecurityTokenRequest { inner_request }
+    }
+}
+
+impl Default for GetSecurityTokenRequest {
+    /// Equivalent to calling `GetSecurityTokenRequest::new()`.
+    fn default() -> Self {
+        GetSecurityTokenRequest::new()
+    }
+}
+
+// Declares the response type that answers a security-token request and the
+// `ApiKey` variant associated with it.
+impl RequestBody for GetSecurityTokenRequest {
+    const API_KEY: ApiKey = ApiKey::GetFileSystemSecurityToken;
+    type ResponseBody = GetFileSystemSecurityTokenResponse;
+}
+
+// `impl_write_type!` / `impl_read_type!` are crate macros defined outside this
+// file; presumably they provide the `WriteType` / `ReadType` impls for the
+// request and response -- confirm against the macro definitions.
+impl_write_type!(GetSecurityTokenRequest);
+impl_read_type!(GetFileSystemSecurityTokenResponse);
diff --git a/fluss-rust/crates/fluss/src/rpc/message/get_table.rs b/fluss-rust/crates/fluss/src/rpc/message/get_table.rs
new file mode 100644
index 0000000000..a7562f92ea
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/rpc/message/get_table.rs
@@ -0,0 +1,54 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::proto::{GetTableInfoRequest, GetTableInfoResponse, PbTablePath};
+use crate::rpc::api_key::ApiKey;
+use crate::rpc::frame::WriteError;
+use crate::rpc::message::{ReadType, RequestBody, WriteType};
+
+use crate::metadata::TablePath;
+use crate::rpc::frame::ReadError;
+
+use crate::{impl_read_type, impl_write_type};
+use bytes::{Buf, BufMut};
+use prost::Message;
+
+/// Request wrapper around the protobuf `GetTableInfoRequest` message.
+#[derive(Debug)]
+pub struct GetTableRequest {
+    /// Underlying protobuf message.
+    pub inner_request: GetTableInfoRequest,
+}
+
+impl GetTableRequest {
+    /// Builds a table-info request for the table at `table_path`, copying its
+    /// database and table names into a `PbTablePath`.
+    pub fn new(table_path: &TablePath) -> Self {
+        let pb_table_path = PbTablePath {
+            database_name: table_path.database().to_string(),
+            table_name: table_path.table().to_string(),
+        };
+
+        GetTableRequest {
+            inner_request: GetTableInfoRequest {
+                table_path: pb_table_path,
+            },
+        }
+    }
+}
+
+// Declares the response type that answers a get-table request and the
+// `ApiKey` variant associated with it.
+impl RequestBody for GetTableRequest {
+    const API_KEY: ApiKey = ApiKey::GetTable;
+    type ResponseBody = GetTableInfoResponse;
+}
+
+// `impl_write_type!` / `impl_read_type!` are crate macros defined outside this
+// file; presumably they provide the `WriteType` / `ReadType` impls for the
+// request and response -- confirm against the macro definitions.
+impl_write_type!(GetTableRequest);
+impl_read_type!(GetTableInfoResponse);
diff --git a/fluss-rust/crates/fluss/src/rpc/message/get_table_schema.rs b/fluss-rust/crates/fluss/src/rpc/message/get_table_schema.rs
new file mode 100644
index 0000000000..1c7c00b7bd
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/rpc/message/get_table_schema.rs
@@ -0,0 +1,55 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::proto::{GetTableSchemaRequest, GetTableSchemaResponse, PbTablePath};
+use crate::rpc::api_key::ApiKey;
+use crate::rpc::frame::WriteError;
+use crate::rpc::message::{ReadType, RequestBody, WriteType};
+
+use crate::metadata::TablePath;
+use crate::rpc::frame::ReadError;
+
+use crate::{impl_read_type, impl_write_type};
+use bytes::{Buf, BufMut};
+use prost::Message;
+
+/// Request wrapper around the protobuf `GetTableSchemaRequest` message.
+///
+/// A `schema_id` of `None` asks for the latest schema.
+#[derive(Debug)]
+pub struct GetTableSchemaRequestMsg {
+    /// Underlying protobuf message.
+    pub inner_request: GetTableSchemaRequest,
+}
+
+impl GetTableSchemaRequestMsg {
+    /// Builds a schema request for the table at `table_path`.
+    ///
+    /// `schema_id` is stored as given; pass `None` for the latest schema.
+    pub fn new(table_path: &TablePath, schema_id: Option<i32>) -> Self {
+        let pb_table_path = PbTablePath {
+            database_name: table_path.database().to_string(),
+            table_name: table_path.table().to_string(),
+        };
+
+        GetTableSchemaRequestMsg {
+            inner_request: GetTableSchemaRequest {
+                table_path: pb_table_path,
+                schema_id,
+            },
+        }
+    }
+}
+
+// Declares the response type that answers a get-table-schema request and the
+// `ApiKey` variant associated with it.
+impl RequestBody for GetTableSchemaRequestMsg {
+    const API_KEY: ApiKey = ApiKey::GetTableSchema;
+    type ResponseBody = GetTableSchemaResponse;
+}
+
+// `impl_write_type!` / `impl_read_type!` are crate macros defined outside this
+// file; presumably they provide the `WriteType` / `ReadType` impls for the
+// request and response -- confirm against the macro definitions.
+impl_write_type!(GetTableSchemaRequestMsg);
+impl_read_type!(GetTableSchemaResponse);
diff --git a/fluss-rust/crates/fluss/src/rpc/message/header.rs b/fluss-rust/crates/fluss/src/rpc/message/header.rs
new file mode 100644
index 0000000000..11155f6833
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/rpc/message/header.rs
@@ -0,0 +1,82 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::proto::ErrorResponse;
+use crate::rpc::api_key::ApiKey;
+use crate::rpc::api_version::ApiVersion;
+use crate::rpc::frame::{ReadError, WriteError};
+use crate::rpc::message::{ReadType, WriteType};
+use bytes::{Buf, BufMut};
+use prost::Message;
+
+/// Request header size in bytes. Equals what `RequestHeader::write` emits:
+/// two `i16` values followed by one `i32` (2 + 2 + 4).
+pub(crate) const REQUEST_HEADER_LENGTH: usize = 8;
+/// Response type byte for which `ResponseHeader::read` decodes no error payload.
+const SUCCESS_RESPONSE: u8 = 0;
+// The two remaining response type bytes are never compared against in this
+// file (hence `dead_code`): `ResponseHeader::read` treats every value other
+// than `SUCCESS_RESPONSE` the same way.
+#[allow(dead_code)]
+const ERROR_RESPONSE: u8 = 1;
+#[allow(dead_code)]
+const SERVER_FAILURE: u8 = 2;
+
+/// Header placed in front of a request body.
+#[derive(Debug, PartialEq, Eq)]
+pub struct RequestHeader {
+    /// The API key of this request.
+    pub request_api_key: ApiKey,
+
+    /// The API version of this request.
+    pub request_api_version: ApiVersion,
+
+    /// Identifier of this request.
+    pub request_id: i32,
+
+    /// Optional client identifier. It is not written by `WriteType::write`.
+    pub client_id: Option<String>,
+}
+
+impl<W> WriteType<W> for RequestHeader
+where
+    W: BufMut,
+{
+    /// Writes the fixed 8-byte header: API key (`i16`), API version (`i16`)
+    /// and request id (`i32`), in that order. `client_id` is not serialized.
+    ///
+    /// Always returns `Ok(())`.
+    fn write(&self, writer: &mut W) -> Result<(), WriteError> {
+        let api_key: i16 = self.request_api_key.into();
+        let api_version = self.request_api_version.0;
+
+        writer.put_i16(api_key);
+        writer.put_i16(api_version);
+        writer.put_i32(self.request_id);
+        Ok(())
+    }
+}
+
+/// Header decoded from the front of a response.
+#[derive(Debug, PartialEq)]
+pub struct ResponseHeader {
+    /// Request id read from the response.
+    pub request_id: i32,
+    /// Decoded error payload; `None` when the response type byte equals
+    /// `SUCCESS_RESPONSE`.
+    pub error_response: Option<ErrorResponse>,
+}
+
+impl<R> ReadType<R> for ResponseHeader
+where
+    R: Buf,
+{
+    /// Reads the response type byte and the `i32` request id. For any type
+    /// other than `SUCCESS_RESPONSE`, the rest of `reader` is decoded as an
+    /// `ErrorResponse`.
+    ///
+    /// NOTE(review): every non-success type (including `SERVER_FAILURE`) is
+    /// decoded identically -- confirm the server uses the same frame layout
+    /// for all of them.
+    ///
+    /// # Errors
+    ///
+    /// Returns the converted decode error if the `ErrorResponse` payload is
+    /// malformed.
+    ///
+    /// # Panics
+    ///
+    /// `Buf::get_u8` and `Buf::get_i32` panic when `reader` has fewer bytes
+    /// remaining than they need, so at least 5 bytes must be available.
+    fn read(reader: &mut R) -> Result<Self, ReadError> {
+        let resp_type = reader.get_u8();
+        let request_id = reader.get_i32();
+
+        let error_response = if resp_type == SUCCESS_RESPONSE {
+            None
+        } else {
+            Some(ErrorResponse::decode(reader)?)
+        };
+
+        Ok(ResponseHeader {
+            request_id,
+            error_response,
+        })
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/rpc/message/init_writer.rs b/fluss-rust/crates/fluss/src/rpc/message/init_writer.rs
new file mode 100644
index 0000000000..b2e64a5f89
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/rpc/message/init_writer.rs
@@ -0,0 +1,47 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::proto::{InitWriterResponse, PbTablePath};
+use crate::rpc::api_key::ApiKey;
+use crate::rpc::frame::{ReadError, WriteError};
+use crate::rpc::message::{ReadType, RequestBody, WriteType};
+use crate::{impl_read_type, impl_write_type, proto};
+use bytes::{Buf, BufMut};
+use prost::Message;
+
+/// Request wrapper around `proto::InitWriterRequest`.
+pub struct InitWriterRequest {
+    /// Protobuf message that is encoded as the request body.
+    pub inner_request: proto::InitWriterRequest,
+}
+
+impl InitWriterRequest {
+    /// Builds a request carrying the given table paths.
+    pub fn new(table_paths: Vec<PbTablePath>) -> Self {
+        let inner_request = proto::InitWriterRequest {
+            table_path: table_paths,
+        };
+        Self { inner_request }
+    }
+}
+
+impl RequestBody for InitWriterRequest {
+    const API_KEY: ApiKey = ApiKey::InitWriter;
+
+    type ResponseBody = InitWriterResponse;
+}
+
+// Wire (de)serialization is generated by the shared macros.
+impl_read_type!(InitWriterResponse);
+impl_write_type!(InitWriterRequest);
diff --git a/fluss-rust/crates/fluss/src/rpc/message/limit_scan.rs b/fluss-rust/crates/fluss/src/rpc/message/limit_scan.rs
new file mode 100644
index 0000000000..c71b03c350
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/rpc/message/limit_scan.rs
@@ -0,0 +1,55 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::proto::LimitScanResponse;
+use crate::rpc::frame::ReadError;
+
+use crate::rpc::api_key::ApiKey;
+use crate::rpc::frame::WriteError;
+use crate::rpc::message::{ReadType, RequestBody, WriteType};
+use crate::{impl_read_type, impl_write_type, proto};
+use prost::Message;
+
+use bytes::{Buf, BufMut};
+
+/// Request wrapper around `proto::LimitScanRequest`.
+pub struct LimitScanRequest {
+    /// Protobuf message that is encoded as the request body.
+    pub inner_request: proto::LimitScanRequest,
+}
+
+impl LimitScanRequest {
+    /// Builds a limit-scan request for one bucket of `table_id` (optionally
+    /// within `partition_id`); all arguments are passed through unchanged.
+    pub fn new(table_id: i64, partition_id: Option<i64>, bucket_id: i32, limit: i32) -> Self {
+        Self {
+            inner_request: proto::LimitScanRequest {
+                table_id,
+                partition_id,
+                bucket_id,
+                limit,
+            },
+        }
+    }
+}
+
+impl RequestBody for LimitScanRequest {
+    const API_KEY: ApiKey = ApiKey::LimitScan;
+
+    type ResponseBody = LimitScanResponse;
+}
+
+// Wire (de)serialization is generated by the shared macros.
+impl_read_type!(LimitScanResponse);
+impl_write_type!(LimitScanRequest);
diff --git a/fluss-rust/crates/fluss/src/rpc/message/list_databases.rs b/fluss-rust/crates/fluss/src/rpc/message/list_databases.rs
new file mode 100644
index 0000000000..74ca494464
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/rpc/message/list_databases.rs
@@ -0,0 +1,49 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::rpc::frame::ReadError;
+
+use crate::rpc::api_key::ApiKey;
+use crate::rpc::frame::WriteError;
+use crate::rpc::message::{ReadType, RequestBody, WriteType};
+use crate::{impl_read_type, impl_write_type, proto};
+use bytes::{Buf, BufMut};
+use prost::Message;
+
+/// Request wrapper around `proto::ListDatabasesRequest`.
+#[derive(Debug, Default)]
+pub struct ListDatabasesRequest {
+    /// Protobuf message that is encoded as the request body.
+    pub inner_request: proto::ListDatabasesRequest,
+}
+
+impl ListDatabasesRequest {
+    /// Builds a request with `include_summary` left unset.
+    pub fn new() -> Self {
+        let inner_request = proto::ListDatabasesRequest {
+            include_summary: None,
+        };
+        Self { inner_request }
+    }
+}
+
+impl RequestBody for ListDatabasesRequest {
+    const API_KEY: ApiKey = ApiKey::ListDatabases;
+
+    type ResponseBody = proto::ListDatabasesResponse;
+}
+
+// Wire (de)serialization is generated by the shared macros.
+impl_read_type!(proto::ListDatabasesResponse);
+impl_write_type!(ListDatabasesRequest);
diff --git a/fluss-rust/crates/fluss/src/rpc/message/list_offsets.rs b/fluss-rust/crates/fluss/src/rpc/message/list_offsets.rs
new file mode 100644
index 0000000000..2ec1437093
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/rpc/message/list_offsets.rs
@@ -0,0 +1,152 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::{BucketId, PartitionId, TableId, impl_read_type, impl_write_type, proto};
+
+use crate::error::Result as FlussResult;
+use crate::error::{Error, FlussError};
+use crate::proto::{ErrorResponse, ListOffsetsResponse};
+use crate::rpc::frame::ReadError;
+
+use crate::rpc::api_key::ApiKey;
+use crate::rpc::frame::WriteError;
+use crate::rpc::message::{ReadType, RequestBody, WriteType};
+use std::collections::HashMap;
+
+use bytes::{Buf, BufMut};
+use prost::Message;
+
+/// Offset type constants as per proto comments.
+pub const LIST_EARLIEST_OFFSET: i32 = 0;
+pub const LIST_LATEST_OFFSET: i32 = 1;
+pub const LIST_OFFSET_FROM_TIMESTAMP: i32 = 2;
+
+/// Client follower server id constant.
+pub const CLIENT_FOLLOWER_SERVER_ID: i32 = -1;
+
+/// Offset specification for a list offsets request.
+#[derive(Debug, Clone)]
+pub enum OffsetSpec {
+    /// Earliest offset spec.
+    Earliest,
+    /// Latest offset spec.
+    Latest,
+    /// Timestamp offset spec, carrying the start timestamp.
+    Timestamp(i64),
+}
+
+impl OffsetSpec {
+    /// Returns the `LIST_*` offset type constant matching this spec.
+    pub fn offset_type(&self) -> i32 {
+        match *self {
+            Self::Timestamp(_) => LIST_OFFSET_FROM_TIMESTAMP,
+            Self::Latest => LIST_LATEST_OFFSET,
+            Self::Earliest => LIST_EARLIEST_OFFSET,
+        }
+    }
+
+    /// Returns the timestamp of a `Timestamp` spec and `None` for the other
+    /// variants.
+    pub fn start_timestamp(&self) -> Option<i64> {
+        if let Self::Timestamp(ts) = self {
+            Some(*ts)
+        } else {
+            None
+        }
+    }
+}
+
+/// Request wrapper around `proto::ListOffsetsRequest`.
+#[derive(Debug)]
+pub struct ListOffsetsRequest {
+    /// Protobuf message that is encoded as the request body.
+    pub inner_request: proto::ListOffsetsRequest,
+}
+
+impl ListOffsetsRequest {
+    /// Builds a list-offsets request for the given buckets of a table
+    /// (optionally within a partition).
+    ///
+    /// `follower_server_id` is always `CLIENT_FOLLOWER_SERVER_ID`; the offset
+    /// type and start timestamp are derived from `offset_spec`.
+    pub fn new(
+        table_id: TableId,
+        partition_id: Option<PartitionId>,
+        bucket_ids: Vec<BucketId>,
+        offset_spec: OffsetSpec,
+    ) -> Self {
+        let offset_type = offset_spec.offset_type();
+        let start_timestamp = offset_spec.start_timestamp();
+
+        let inner_request = proto::ListOffsetsRequest {
+            follower_server_id: CLIENT_FOLLOWER_SERVER_ID,
+            offset_type,
+            table_id,
+            partition_id,
+            bucket_id: bucket_ids,
+            start_timestamp,
+        };
+        Self { inner_request }
+    }
+}
+
+impl RequestBody for ListOffsetsRequest {
+    const API_KEY: ApiKey = ApiKey::ListOffsets;
+
+    type ResponseBody = ListOffsetsResponse;
+}
+
+// Wire (de)serialization is generated by the shared macros.
+impl_read_type!(ListOffsetsResponse);
+impl_write_type!(ListOffsetsRequest);
+
+impl ListOffsetsResponse {
+    /// Collects the per-bucket offsets into a `bucket_id -> offset` map.
+    ///
+    /// Stops at the first bucket entry that cannot be used:
+    /// - an error code other than `FlussError::None` yields
+    ///   `Error::FlussAPIError` built from that code and message;
+    /// - a missing offset without such an error code yields
+    ///   `Error::UnexpectedError`.
+    pub fn offsets(&self) -> FlussResult<HashMap<i32, i64>> {
+        let mut offsets = HashMap::with_capacity(self.buckets_resp.len());
+
+        for resp in &self.buckets_resp {
+            let failed_code = resp
+                .error_code
+                .filter(|code| *code != FlussError::None.code());
+            if let Some(error_code) = failed_code {
+                let api_error = ErrorResponse {
+                    error_code,
+                    error_message: resp.error_message.clone(),
+                }
+                .into();
+                return Err(Error::FlussAPIError { api_error });
+            }
+
+            // Without an error code, the offset must exist.
+            let Some(offset) = resp.offset else {
+                return Err(Error::UnexpectedError {
+                    message: format!(
+                        "Missing offset for bucket {} without error code.",
+                        resp.bucket_id
+                    ),
+                    source: None,
+                });
+            };
+            offsets.insert(resp.bucket_id, offset);
+        }
+
+        Ok(offsets)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::proto::{ListOffsetsResponse, PbListOffsetsRespForBucket};
+
+    /// A bucket entry carrying the `TableNotExist` error code must surface as
+    /// `Error::FlussAPIError`.
+    #[test]
+    fn offsets_returns_api_error_on_error_code() {
+        let failing_bucket = PbListOffsetsRespForBucket {
+            bucket_id: 1,
+            error_code: Some(FlussError::TableNotExist.code()),
+            error_message: Some("missing".to_string()),
+            offset: None,
+        };
+        let response = ListOffsetsResponse {
+            buckets_resp: vec![failing_bucket],
+        };
+
+        let outcome = response.offsets();
+
+        assert!(matches!(outcome, Err(Error::FlussAPIError { .. })));
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/rpc/message/list_partition_infos.rs b/fluss-rust/crates/fluss/src/rpc/message/list_partition_infos.rs
new file mode 100644
index 0000000000..cf24f466a0
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/rpc/message/list_partition_infos.rs
@@ -0,0 +1,60 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::metadata::{PartitionInfo, PartitionSpec, TablePath};
+use crate::proto::ListPartitionInfosResponse;
+use crate::rpc::api_key::ApiKey;
+use crate::rpc::convert::to_table_path;
+use crate::rpc::frame::{ReadError, WriteError};
+use crate::rpc::message::{ReadType, RequestBody, WriteType};
+use crate::{impl_read_type, impl_write_type, proto};
+use bytes::{Buf, BufMut};
+use prost::Message;
+
+/// Request wrapper around `proto::ListPartitionInfosRequest`.
+#[derive(Debug)]
+pub struct ListPartitionInfosRequest {
+    /// Protobuf message that is encoded as the request body.
+    pub inner_request: proto::ListPartitionInfosRequest,
+}
+
+impl ListPartitionInfosRequest {
+    /// Builds a request for `table_path`, converting the path and the optional
+    /// partial partition spec to their protobuf forms.
+    pub fn new(table_path: &TablePath, partial_partition_spec: Option<&PartitionSpec>) -> Self {
+        let pb_table_path = to_table_path(table_path);
+        let pb_partition_spec = partial_partition_spec.map(|spec| spec.to_pb());
+
+        Self {
+            inner_request: proto::ListPartitionInfosRequest {
+                table_path: pb_table_path,
+                partial_partition_spec: pb_partition_spec,
+            },
+        }
+    }
+}
+
+impl RequestBody for ListPartitionInfosRequest {
+    const API_KEY: ApiKey = ApiKey::ListPartitionInfos;
+
+    type ResponseBody = ListPartitionInfosResponse;
+}
+
+// Wire (de)serialization is generated by the shared macros.
+impl_read_type!(ListPartitionInfosResponse);
+impl_write_type!(ListPartitionInfosRequest);
+
+impl ListPartitionInfosResponse {
+    /// Converts every protobuf entry in `partitions_info` via
+    /// `PartitionInfo::from_pb`, preserving the response order.
+    pub fn get_partitions_info(&self) -> Vec<PartitionInfo> {
+        let mut partitions = Vec::with_capacity(self.partitions_info.len());
+        for pb_partition in &self.partitions_info {
+            partitions.push(PartitionInfo::from_pb(pb_partition));
+        }
+        partitions
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/rpc/message/list_tables.rs b/fluss-rust/crates/fluss/src/rpc/message/list_tables.rs
new file mode 100644
index 0000000000..8ff72141cb
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/rpc/message/list_tables.rs
@@ -0,0 +1,52 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::{impl_read_type, impl_write_type, proto};
+
+use crate::proto::ListTablesResponse;
+use crate::rpc::frame::ReadError;
+
+use crate::rpc::api_key::ApiKey;
+use crate::rpc::frame::WriteError;
+use crate::rpc::message::{ReadType, RequestBody, WriteType};
+
+use bytes::{Buf, BufMut};
+use prost::Message;
+
+/// Request wrapper around `proto::ListTablesRequest`.
+#[derive(Debug)]
+pub struct ListTablesRequest {
+    /// Protobuf message that is encoded as the request body.
+    pub inner_request: proto::ListTablesRequest,
+}
+
+impl ListTablesRequest {
+    /// Builds a request for `database_name` (copied into the message).
+    pub fn new(database_name: &str) -> Self {
+        let inner_request = proto::ListTablesRequest {
+            database_name: database_name.to_owned(),
+        };
+        Self { inner_request }
+    }
+}
+
+impl RequestBody for ListTablesRequest {
+    const API_KEY: ApiKey = ApiKey::ListTables;
+
+    type ResponseBody = ListTablesResponse;
+}
+
+// Wire (de)serialization is generated by the shared macros.
+impl_read_type!(ListTablesResponse);
+impl_write_type!(ListTablesRequest);
diff --git a/fluss-rust/crates/fluss/src/rpc/message/lookup.rs b/fluss-rust/crates/fluss/src/rpc/message/lookup.rs
new file mode 100644
index 0000000000..e205fa6b67
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/rpc/message/lookup.rs
@@ -0,0 +1,71 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::proto::LookupResponse;
+use crate::rpc::frame::ReadError;
+
+use crate::rpc::api_key::ApiKey;
+use crate::rpc::frame::WriteError;
+use crate::rpc::message::{ReadType, RequestBody, WriteType};
+use crate::{BucketId, PartitionId, TableId, impl_read_type, impl_write_type, proto};
+use bytes::Bytes;
+use prost::Message;
+
+use bytes::{Buf, BufMut};
+
+/// Request wrapper around `proto::LookupRequest`.
+pub struct LookupRequest {
+    /// Protobuf message that is encoded as the request body.
+    pub inner_request: proto::LookupRequest,
+}
+
+impl LookupRequest {
+    /// Builds one lookup request covering several buckets of `table_id`.
+    ///
+    /// Each `(bucket_id, partition_id, keys)` tuple becomes one
+    /// `PbLookupReqForBucket`, in input order. `insert_if_not_exists`, `acks`
+    /// and `timeout_ms` are left unset.
+    pub fn new_batched(
+        table_id: TableId,
+        buckets: Vec<(BucketId, Option<PartitionId>, Vec<Bytes>)>,
+    ) -> Self {
+        let mut buckets_req = Vec::with_capacity(buckets.len());
+        for (bucket_id, partition_id, keys) in buckets {
+            buckets_req.push(proto::PbLookupReqForBucket {
+                partition_id,
+                bucket_id,
+                keys,
+            });
+        }
+
+        Self {
+            inner_request: proto::LookupRequest {
+                table_id,
+                buckets_req,
+                insert_if_not_exists: None,
+                acks: None,
+                timeout_ms: None,
+            },
+        }
+    }
+}
+
+impl RequestBody for LookupRequest {
+    const API_KEY: ApiKey = ApiKey::Lookup;
+
+    type ResponseBody = LookupResponse;
+}
+
+// Wire (de)serialization is generated by the shared macros.
+impl_read_type!(LookupResponse);
+impl_write_type!(LookupRequest);
diff --git a/fluss-rust/crates/fluss/src/rpc/message/mod.rs b/fluss-rust/crates/fluss/src/rpc/message/mod.rs
new file mode 100644
index 0000000000..096066ed43
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/rpc/message/mod.rs
@@ -0,0 +1,133 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::rpc::api_key::ApiKey;
+use crate::rpc::frame::{ReadError, WriteError};
+use bytes::{Buf, BufMut};
+
+mod api_versions;
+mod authenticate;
+mod create_database;
+mod create_partition;
+mod create_table;
+mod database_exists;
+mod drop_database;
+mod drop_partition;
+mod drop_table;
+mod fetch;
+mod get_database_info;
+mod get_latest_lake_snapshot;
+mod get_security_token;
+mod get_table;
+mod get_table_schema;
+mod header;
+mod init_writer;
+mod limit_scan;
+mod list_databases;
+mod list_offsets;
+mod list_partition_infos;
+mod list_tables;
+mod lookup;
+mod prefix_lookup;
+mod produce_log;
+mod put_kv;
+mod table_exists;
+mod update_metadata;
+
+pub use crate::rpc::RpcError;
+pub use api_versions::*;
+pub use authenticate::*;
+pub use create_database::*;
+pub use create_partition::*;
+pub use create_table::*;
+pub use database_exists::*;
+pub use drop_database::*;
+pub use drop_partition::*;
+pub use drop_table::*;
+pub use fetch::*;
+pub use get_database_info::*;
+pub use get_latest_lake_snapshot::*;
+pub use get_security_token::*;
+pub use get_table::*;
+pub use get_table_schema::*;
+pub use header::*;
+pub use init_writer::*;
+pub use limit_scan::*;
+pub use list_databases::*;
+pub use list_offsets::*;
+pub use list_partition_infos::*;
+pub use list_tables::*;
+pub use lookup::*;
+pub use prefix_lookup::*;
+pub use produce_log::*;
+pub use put_kv::*;
+pub use table_exists::*;
+pub use update_metadata::*;
+
+/// Ties a request type to its API key and to the type of the response body
+/// it expects.
+pub trait RequestBody {
+    /// API key identifying this kind of request.
+    const API_KEY: ApiKey;
+
+    /// Type of the response body returned for this request.
+    type ResponseBody;
+}
+
+/// A shared reference to a request has the same API key and response body
+/// type as the request itself.
+impl<T> RequestBody for &T
+where
+    T: RequestBody,
+{
+    const API_KEY: ApiKey = T::API_KEY;
+
+    type ResponseBody = T::ResponseBody;
+}
+
+/// A value that can serialize itself into a `BufMut` writer.
+pub trait WriteType<W: BufMut>: Sized {
+    /// Appends the encoded form of `self` to `writer`.
+    fn write(&self, writer: &mut W) -> Result<(), WriteError>;
+}
+
+/// A value that can be deserialized from a `Buf` reader.
+pub trait ReadType<R: Buf>: Sized {
+    /// Decodes a value of this type from `reader`.
+    fn read(reader: &mut R) -> Result<Self, ReadError>;
+}
+
+/// Implements `WriteType` for a request wrapper by encoding its
+/// `inner_request` field into the writer.
+///
+/// The invoking module must have `WriteType`, `WriteError`, `BufMut` and the
+/// trait providing `encode` in scope.
+///
+/// NOTE(review): an encode failure is unwrapped and therefore panics instead
+/// of being returned as a `WriteError` — confirm this is intended.
+#[macro_export]
+macro_rules! impl_write_type {
+    ($type:ty) => {
+        impl<W: BufMut> WriteType<W> for $type {
+            fn write(&self, writer: &mut W) -> Result<(), WriteError> {
+                self.inner_request.encode(writer).unwrap();
+                Ok(())
+            }
+        }
+    };
+}
+
+/// Implements `ReadType` for a response message by decoding it from the
+/// reader with the type's `decode` function.
+///
+/// The invoking module must have `ReadType`, `ReadError`, `Buf` and the trait
+/// providing `decode` in scope.
+///
+/// A decode failure is returned to the caller as a `ReadError` rather than
+/// panicking, so malformed response bytes cannot abort the client.
+#[macro_export]
+macro_rules! impl_read_type {
+    ($type:ty) => {
+        impl<R> ReadType<R> for $type
+        where
+            R: Buf,
+        {
+            fn read(reader: &mut R) -> Result<Self, ReadError> {
+                // Same decode-error-to-`ReadError` conversion that
+                // `ResponseHeader::read` relies on for `ErrorResponse`.
+                <$type>::decode(reader).map_err(ReadError::from)
+            }
+        }
+    };
+}
diff --git a/fluss-rust/crates/fluss/src/rpc/message/prefix_lookup.rs b/fluss-rust/crates/fluss/src/rpc/message/prefix_lookup.rs
new file mode 100644
index 0000000000..e71ffe7c7b
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/rpc/message/prefix_lookup.rs
@@ -0,0 +1,68 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::proto::PrefixLookupResponse;
+use crate::rpc::frame::ReadError;
+
+use crate::rpc::api_key::ApiKey;
+use crate::rpc::frame::WriteError;
+use crate::rpc::message::{ReadType, RequestBody, WriteType};
+use crate::{BucketId, PartitionId, TableId, impl_read_type, impl_write_type, proto};
+use bytes::Bytes;
+use prost::Message;
+
+use bytes::{Buf, BufMut};
+
+/// Request wrapper around `proto::PrefixLookupRequest`.
+pub struct PrefixLookupRequest {
+    /// Protobuf message that is encoded as the request body.
+    pub inner_request: proto::PrefixLookupRequest,
+}
+
+impl PrefixLookupRequest {
+    /// Builds one prefix-lookup request covering several buckets of
+    /// `table_id`.
+    ///
+    /// Each `(bucket_id, partition_id, keys)` tuple becomes one
+    /// `PbPrefixLookupReqForBucket`, in input order.
+    pub fn new_batched(
+        table_id: TableId,
+        buckets: Vec<(BucketId, Option<PartitionId>, Vec<Bytes>)>,
+    ) -> Self {
+        let mut buckets_req = Vec::with_capacity(buckets.len());
+        for (bucket_id, partition_id, keys) in buckets {
+            buckets_req.push(proto::PbPrefixLookupReqForBucket {
+                partition_id,
+                bucket_id,
+                keys,
+            });
+        }
+
+        Self {
+            inner_request: proto::PrefixLookupRequest {
+                table_id,
+                buckets_req,
+            },
+        }
+    }
+}
+
+impl RequestBody for PrefixLookupRequest {
+    const API_KEY: ApiKey = ApiKey::PrefixLookup;
+
+    type ResponseBody = PrefixLookupResponse;
+}
+
+// Wire (de)serialization is generated by the shared macros.
+impl_read_type!(PrefixLookupResponse);
+impl_write_type!(PrefixLookupRequest);
diff --git a/fluss-rust/crates/fluss/src/rpc/message/produce_log.rs b/fluss-rust/crates/fluss/src/rpc/message/produce_log.rs
new file mode 100644
index 0000000000..8be2463832
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/rpc/message/produce_log.rs
@@ -0,0 +1,68 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::error::Result as FlussResult;
+use crate::proto::{PbProduceLogReqForBucket, ProduceLogResponse};
+use crate::rpc::frame::ReadError;
+
+use crate::client::ReadyWriteBatch;
+use crate::rpc::api_key::ApiKey;
+use crate::rpc::frame::WriteError;
+use crate::rpc::message::{ReadType, RequestBody, WriteType};
+use crate::{impl_read_type, impl_write_type, proto};
+use bytes::{Buf, BufMut};
+use prost::Message;
+
+/// Request wrapper around `proto::ProduceLogRequest`.
+pub struct ProduceLogRequest {
+    /// Protobuf message that is encoded as the request body.
+    pub inner_request: proto::ProduceLogRequest,
+}
+
+impl ProduceLogRequest {
+    /// Builds a produce-log request for `table_id` with one bucket entry per
+    /// ready batch, in input order.
+    ///
+    /// `ack` is widened to `i32`, `max_request_timeout_ms` becomes
+    /// `timeout_ms`, and all other message fields start from their defaults.
+    /// Each batch is built via `write_batch.build()`; the first failure is
+    /// returned as the error.
+    pub fn new(
+        table_id: i64,
+        ack: i16,
+        max_request_timeout_ms: i32,
+        ready_batches: &mut [ReadyWriteBatch],
+    ) -> FlussResult<Self> {
+        let mut inner_request = proto::ProduceLogRequest {
+            table_id,
+            acks: i32::from(ack),
+            timeout_ms: max_request_timeout_ms,
+            ..Default::default()
+        };
+        inner_request.buckets_req = ready_batches
+            .iter_mut()
+            .map(|batch| -> FlussResult<PbProduceLogReqForBucket> {
+                Ok(PbProduceLogReqForBucket {
+                    partition_id: batch.table_bucket.partition_id(),
+                    bucket_id: batch.table_bucket.bucket_id(),
+                    records: batch.write_batch.build()?,
+                })
+            })
+            .collect::<FlussResult<Vec<_>>>()?;
+
+        Ok(Self { inner_request })
+    }
+}
+
+impl RequestBody for ProduceLogRequest {
+    const API_KEY: ApiKey = ApiKey::ProduceLog;
+
+    type ResponseBody = ProduceLogResponse;
+}
+
+// Wire (de)serialization is generated by the shared macros.
+impl_read_type!(ProduceLogResponse);
+impl_write_type!(ProduceLogRequest);
diff --git a/fluss-rust/crates/fluss/src/rpc/message/put_kv.rs b/fluss-rust/crates/fluss/src/rpc/message/put_kv.rs
new file mode 100644
index 0000000000..e76496d123
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/rpc/message/put_kv.rs
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+use crate::client::ReadyWriteBatch;
+use crate::proto::{PbPutKvReqForBucket, PutKvResponse};
+use crate::rpc::api_key::ApiKey;
+use crate::rpc::frame::ReadError;
+use crate::rpc::frame::WriteError;
+use crate::rpc::message::{ReadType, RequestBody, WriteType};
+use crate::{impl_read_type, impl_write_type, proto};
+use bytes::{Buf, BufMut};
+use prost::Message;
+
+/// Request wrapper around `proto::PutKvRequest`.
+#[allow(dead_code)]
+pub struct PutKvRequest {
+    /// Protobuf message that is encoded as the request body.
+    pub inner_request: proto::PutKvRequest,
+}
+
+#[allow(dead_code)]
+impl PutKvRequest {
+    /// Builds a put-kv request for `table_id` with one bucket entry per ready
+    /// batch, in input order.
+    ///
+    /// `ack` is widened to `i32`, `max_request_timeout_ms` becomes
+    /// `timeout_ms`, `target_columns` is passed through, and all other message
+    /// fields start from their defaults. Each batch is built via
+    /// `write_batch.build()`; the first failure is returned as the error.
+    pub fn new(
+        table_id: i64,
+        ack: i16,
+        max_request_timeout_ms: i32,
+        target_columns: Vec<i32>,
+        ready_batches: &mut [ReadyWriteBatch],
+    ) -> crate::error::Result<Self> {
+        let mut inner_request = proto::PutKvRequest {
+            table_id,
+            acks: i32::from(ack),
+            timeout_ms: max_request_timeout_ms,
+            target_columns,
+            ..Default::default()
+        };
+        inner_request.buckets_req = ready_batches
+            .iter_mut()
+            .map(|batch| -> crate::error::Result<PbPutKvReqForBucket> {
+                Ok(PbPutKvReqForBucket {
+                    partition_id: batch.table_bucket.partition_id(),
+                    bucket_id: batch.table_bucket.bucket_id(),
+                    records: batch.write_batch.build()?,
+                })
+            })
+            .collect::<crate::error::Result<Vec<_>>>()?;
+
+        Ok(Self { inner_request })
+    }
+}
+
+impl RequestBody for PutKvRequest {
+    const API_KEY: ApiKey = ApiKey::PutKv;
+
+    type ResponseBody = PutKvResponse;
+}
+
+// Wire (de)serialization is generated by the shared macros.
+impl_read_type!(PutKvResponse);
+impl_write_type!(PutKvRequest);
diff --git a/fluss-rust/crates/fluss/src/rpc/message/table_exists.rs b/fluss-rust/crates/fluss/src/rpc/message/table_exists.rs
new file mode 100644
index 0000000000..5bc848e32a
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/rpc/message/table_exists.rs
@@ -0,0 +1,53 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::metadata::TablePath;
+use crate::{impl_read_type, impl_write_type, proto};
+
+use crate::proto::TableExistsResponse;
+use crate::rpc::api_key::ApiKey;
+use crate::rpc::convert::to_table_path;
+use crate::rpc::frame::WriteError;
+use crate::rpc::message::{ReadType, RequestBody, WriteType};
+
+use crate::rpc::frame::ReadError;
+
+use bytes::{Buf, BufMut};
+use prost::Message;
+/// Request wrapper around `proto::TableExistsRequest`.
+#[derive(Debug)]
+pub struct TableExistsRequest {
+    /// Protobuf message that is encoded as the request body.
+    pub inner_request: proto::TableExistsRequest,
+}
+
+impl TableExistsRequest {
+    /// Builds a request for `table_path`, converted to its protobuf form.
+    pub fn new(table_path: &TablePath) -> Self {
+        let inner_request = proto::TableExistsRequest {
+            table_path: to_table_path(table_path),
+        };
+        Self { inner_request }
+    }
+}
+
+impl RequestBody for TableExistsRequest {
+    const API_KEY: ApiKey = ApiKey::TableExists;
+
+    type ResponseBody = TableExistsResponse;
+}
+
+// Wire (de)serialization is generated by the shared macros.
+impl_read_type!(TableExistsResponse);
+impl_write_type!(TableExistsRequest);
diff --git a/fluss-rust/crates/fluss/src/rpc/message/update_metadata.rs b/fluss-rust/crates/fluss/src/rpc/message/update_metadata.rs
new file mode 100644
index 0000000000..fd96ca5e67
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/rpc/message/update_metadata.rs
@@ -0,0 +1,71 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::metadata::{PhysicalTablePath, TablePath};
+use crate::proto::{MetadataResponse, PbPhysicalTablePath, PbTablePath};
+use crate::rpc::api_key::ApiKey;
+use crate::rpc::frame::ReadError;
+use crate::rpc::frame::WriteError;
+use crate::rpc::message::{ReadType, RequestBody, WriteType};
+use std::collections::HashSet;
+use std::sync::Arc;
+
+use crate::{impl_read_type, impl_write_type, proto};
+use bytes::{Buf, BufMut};
+use prost::Message;
+
+/// RPC request wrapping the protobuf `MetadataRequest` message.
+///
+/// Sent under `ApiKey::MetaData`; the matching response type is `MetadataResponse`.
+/// Derives `Debug` for consistency with the other request wrappers
+/// (e.g. `TableExistsRequest`).
+#[derive(Debug)]
+pub struct UpdateMetadataRequest {
+    /// The protobuf payload carried by this request.
+    pub inner_request: proto::MetadataRequest,
+}
+
+impl UpdateMetadataRequest {
+    /// Builds a metadata request.
+    ///
+    /// * `table_paths` - tables to include; each becomes a `PbTablePath`.
+    /// * `physical_table_paths` - physical (optionally partitioned) table paths;
+    ///   each becomes a `PbPhysicalTablePath`.
+    /// * `partition_ids` - partition ids, forwarded unchanged.
+    ///
+    /// Both sets are `HashSet`s, so the order of the resulting lists follows the
+    /// sets' iteration order.
+    pub fn new(
+        table_paths: &HashSet<&TablePath>,
+        physical_table_paths: &HashSet<&Arc<PhysicalTablePath>>,
+        partition_ids: Vec<i64>,
+    ) -> Self {
+        let table_path = table_paths
+            .iter()
+            .map(|tp| PbTablePath {
+                database_name: tp.database().to_string(),
+                table_name: tp.table().to_string(),
+            })
+            .collect();
+
+        let partitions_path = physical_table_paths
+            .iter()
+            .map(|ptp| PbPhysicalTablePath {
+                database_name: ptp.get_database_name().to_string(),
+                table_name: ptp.get_table_name().to_string(),
+                partition_name: ptp.get_partition_name().map(|name| name.to_string()),
+            })
+            .collect();
+
+        let inner_request = proto::MetadataRequest {
+            table_path,
+            partitions_path,
+            partitions_id: partition_ids,
+        };
+        Self { inner_request }
+    }
+}
+
+/// Binds the request to its response type and to the API key placed in the
+/// request header.
+impl RequestBody for UpdateMetadataRequest {
+    /// Response decoded for this request.
+    type ResponseBody = MetadataResponse;
+
+    /// API key identifying this request kind.
+    const API_KEY: ApiKey = ApiKey::MetaData;
+}
+
+// Crate macros; presumably they generate the `WriteType` impl for the request and the
+// `ReadType` impl for the response — confirm against the macro definitions.
+impl_write_type!(UpdateMetadataRequest);
+impl_read_type!(MetadataResponse);
diff --git a/fluss-rust/crates/fluss/src/rpc/mod.rs b/fluss-rust/crates/fluss/src/rpc/mod.rs
new file mode 100644
index 0000000000..6f3a88d1ba
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/rpc/mod.rs
@@ -0,0 +1,32 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+mod api_key;
+pub(crate) use api_key::ApiKey;
+mod api_version;
+pub mod error;
+mod fluss_api_error;
+pub use fluss_api_error::{ApiError, FlussError};
+mod frame;
+pub mod message;
+pub use error::*;
+mod server_connection;
+pub use server_connection::*;
+mod convert;
+mod transport;
+
+pub use convert::*;
diff --git a/fluss-rust/crates/fluss/src/rpc/server_connection.rs b/fluss-rust/crates/fluss/src/rpc/server_connection.rs
new file mode 100644
index 0000000000..e1148f9d06
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/rpc/server_connection.rs
@@ -0,0 +1,1300 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::cluster::{ServerNode, ServerType};
+use crate::error::Error;
+use crate::metrics::{
+    CLIENT_BYTES_RECEIVED_TOTAL, CLIENT_BYTES_SENT_TOTAL, CLIENT_REQUEST_LATENCY_MS,
+    CLIENT_REQUESTS_IN_FLIGHT, CLIENT_REQUESTS_TOTAL, CLIENT_RESPONSES_TOTAL, LABEL_API_KEY,
+    api_key_label,
+};
+use crate::proto::PbApiVersion;
+use crate::rpc::api_key::ApiKey;
+use crate::rpc::api_version::ApiVersion;
+use crate::rpc::error::RpcError;
+use crate::rpc::error::RpcError::ConnectionError;
+use crate::rpc::frame::{AsyncMessageRead, AsyncMessageWrite};
+use crate::rpc::message::{
+    ApiVersionsRequest, REQUEST_HEADER_LENGTH, ReadType, RequestBody, RequestHeader,
+    ResponseHeader, WriteType,
+};
+use crate::rpc::transport::Transport;
+use futures::future::BoxFuture;
+use log::warn;
+use parking_lot::{Mutex, RwLock};
+use std::collections::HashMap;
+use std::fmt;
+use std::io::Cursor;
+use std::ops::DerefMut;
+use std::sync::Arc;
+use std::sync::atomic::{AtomicI32, Ordering};
+use std::task::Poll;
+use std::time::{Duration, Instant};
+use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt, BufStream, WriteHalf};
+use tokio::sync::Mutex as AsyncMutex;
+use tokio::sync::oneshot::{Sender, channel};
+use tokio::task::JoinHandle;
+
+/// Connection state machine specialised to a buffered `Transport` stream.
+pub type MessengerTransport = ServerConnectionInner<BufStream<Transport>>;
+
+/// Shared, reference-counted handle to a server connection.
+pub type ServerConnection = Arc<MessengerTransport>;
+
+// Matches Java's ExponentialBackoff(100ms initial, 2x multiplier, 5000ms max, 0.2 jitter).
+// Delay before the first authentication retry, in milliseconds.
+const AUTH_INITIAL_BACKOFF_MS: f64 = 100.0;
+// Upper bound of the un-jittered retry delay, in milliseconds.
+const AUTH_MAX_BACKOFF_MS: f64 = 5000.0;
+// Factor by which the delay grows with each retry.
+const AUTH_BACKOFF_MULTIPLIER: f64 = 2.0;
+// Relative jitter: the delay is scaled by a random factor in [1 - jitter, 1 + jitter).
+const AUTH_JITTER: f64 = 0.2;
+
+/// Username/password credentials used for the SASL/PLAIN authentication handshake.
+///
+/// `Debug` is implemented manually so the password is never printed.
+#[derive(Clone)]
+pub struct SaslConfig {
+    /// Account name sent to the server.
+    pub username: String,
+    /// Secret sent to the server; redacted in `Debug` output.
+    pub password: String,
+}
+
+/// Debug formatting that shows the username but never the password.
+impl fmt::Debug for SaslConfig {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let mut out = f.debug_struct("SaslConfig");
+        out.field("username", &self.username);
+        // Keep the secret out of logs and panic messages.
+        out.field("password", &"[REDACTED]");
+        out.finish()
+    }
+}
+
+/// Represents the negotiated API versions between the client and a server node.
+/// Built from the server's `ApiVersionsResponse` by intersecting each API's
+/// client-supported range with the server-supported range, keeping the highest
+/// usable version.
+#[derive(Clone, Debug)]
+pub struct ServerApiVersions {
+    /// Negotiation outcome per API key: `Ok` holds the highest version both sides
+    /// support, `Err` holds a message explaining that the ranges do not overlap.
+    /// Keys missing from the map were either not advertised by the server or are
+    /// not known to the client.
+    versions: HashMap<ApiKey, Result<ApiVersion, String>>,
+}
+
+impl ServerApiVersions {
+    /// Build from the server's advertised API version list.
+    ///
+    /// For every advertised API the client also knows, the client and server
+    /// version ranges are intersected and the highest common version is recorded.
+    /// When the ranges do not overlap, an explanatory message is recorded instead.
+    ///
+    /// Malformed advertisements never panic: an API key that does not fit in
+    /// `i16` is skipped (the client cannot know it), and the range intersection
+    /// is computed on the raw server values so out-of-range bounds are handled
+    /// without truncation.
+    pub fn new(server_versions: &[PbApiVersion]) -> Self {
+        let mut versions = HashMap::new();
+        for sv in server_versions {
+            let Ok(raw_key) = i16::try_from(sv.api_key) else {
+                warn!(
+                    "Ignoring advertised API key {} outside the i16 range",
+                    sv.api_key
+                );
+                continue;
+            };
+            let api_key = ApiKey::from(raw_key);
+            // Skip unknown API keys — the client does not support them.
+            let Some(client_range) = api_key.supported_versions() else {
+                continue;
+            };
+            // Intersect in i32 so server bounds outside the i16 range cannot
+            // wrap or panic.
+            let min_version = i32::from(client_range.min().0).max(sv.min_version);
+            let max_version = i32::from(client_range.max().0).min(sv.max_version);
+            if min_version > max_version {
+                versions.insert(
+                    api_key,
+                    Err(format!(
+                        "The server does not support {:?} with version in range [{},{}]. \
+                         The supported range is [{},{}].",
+                        api_key,
+                        client_range.min(),
+                        client_range.max(),
+                        sv.min_version,
+                        sv.max_version,
+                    )),
+                );
+            } else {
+                // `max_version` lies within the client's i16 range here
+                // (client min <= max_version <= client max), so the cast is lossless.
+                versions.insert(api_key, Ok(ApiVersion(max_version as i16)));
+            }
+        }
+        Self { versions }
+    }
+
+    /// Get the negotiated (highest usable) version for a given API key.
+    ///
+    /// Returns `Error::UnsupportedVersion` when the ranges did not overlap or
+    /// when no entry was recorded for `api_key`.
+    pub fn highest_available_version(&self, api_key: ApiKey) -> Result<ApiVersion, Error> {
+        match self.versions.get(&api_key) {
+            Some(Ok(version)) => Ok(*version),
+            Some(Err(msg)) => Err(Error::UnsupportedVersion {
+                message: msg.clone(),
+            }),
+            None => Err(Error::UnsupportedVersion {
+                message: format!("The server does not support {:?}", api_key),
+            }),
+        }
+    }
+}
+
+/// Resolve the API version to use for a given API key.
+///
+/// * With negotiated versions available, returns the negotiated version for
+///   `api_key` (or `Error::UnsupportedVersion`).
+/// * Without them (the ApiVersions handshake has not completed yet), falls back
+///   to the highest version the client supports for `api_key`.
+///
+/// Returns `Error::UnsupportedVersion` instead of panicking when the client has
+/// no supported range for `api_key`.
+fn resolve_api_version_for(
+    api_versions: Option<&ServerApiVersions>,
+    api_key: ApiKey,
+) -> Result<ApiVersion, Error> {
+    if let Some(versions) = api_versions {
+        return versions.highest_available_version(api_key);
+    }
+    // version equals highestSupportedVersion might happen when requesting api version check
+    // before serverApiVersions is initialized. We always use the highest version for api
+    // version checking.
+    api_key
+        .supported_versions()
+        .map(|range| range.max())
+        .ok_or_else(|| Error::UnsupportedVersion {
+            message: format!("The client does not support {:?}", api_key),
+        })
+}
+
+/// Check the `server_type` advertised by the server against the type expected
+/// for the target `ServerNode`.
+///
+/// Returns `Ok(())` when the types match or when the server sent no type at all;
+/// otherwise returns `Error::InvalidServerType` describing the mismatch.
+fn validate_server_type(
+    expected: &ServerType,
+    response_server_type: Option<i32>,
+) -> Result<(), Error> {
+    let type_id = match response_server_type {
+        Some(id) => id,
+        // For forward-compat with servers that do not populate `server_type`, validation is skipped.
+        None => return Ok(()),
+    };
+
+    let advertised = ServerType::from_type_id(type_id);
+    let is_match = advertised.as_ref() == Some(expected);
+    if !is_match {
+        let actual_desc = match advertised {
+            Some(server_type) => server_type.to_string(),
+            None => format!("Unknown(type_id={type_id})"),
+        };
+        return Err(Error::InvalidServerType {
+            message: format!(
+                "Expected server type {expected} but the server advertised {actual_desc}. \
+             The client may be talking to the wrong endpoint \
+             (e.g. coordinator vs tablet server)."
+            ),
+        });
+    }
+    Ok(())
+}
+
+/// Client-side connection manager: creates, caches and hands out connections
+/// to server nodes.
+#[derive(Debug)]
+pub struct RpcClient {
+    /// Cached connections keyed by the server node's uid.
+    connections: RwLock<HashMap<String, ServerConnection>>,
+    /// Client id placed in every request header.
+    client_id: Arc<str>,
+    /// Timeout handed to `Transport::connect`; `None` means no timeout is passed.
+    timeout: Option<Duration>,
+    /// Size limit handed to the connection's message reader.
+    max_message_size: usize,
+    /// Credentials for the SASL handshake; `None` skips authentication.
+    sasl_config: Option<SaslConfig>,
+}
+
+/// `Default` delegates to [`RpcClient::new`].
+///
+/// A derived `Default` would set `max_message_size` to `0`, which disagrees with
+/// the `usize::MAX` that `new()` uses.
+impl Default for RpcClient {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl RpcClient {
+    /// Creates a client with no cached connections, an empty client id, no
+    /// connect timeout, no message size limit and no SASL credentials.
+    pub fn new() -> Self {
+        Self {
+            connections: RwLock::new(HashMap::new()),
+            client_id: Arc::from(""),
+            timeout: None,
+            max_message_size: usize::MAX,
+            sasl_config: None,
+        }
+    }
+
+    /// Builder-style setter for the timeout passed to `Transport::connect`.
+    pub fn with_timeout(self, timeout: Duration) -> Self {
+        Self {
+            timeout: Some(timeout),
+            ..self
+        }
+    }
+
+    /// Builder-style setter that enables SASL authentication with the given
+    /// credentials for every new connection.
+    pub fn with_sasl(self, username: String, password: String) -> Self {
+        let credentials = SaslConfig { username, password };
+        Self {
+            sasl_config: Some(credentials),
+            ..self
+        }
+    }
+
+    /// Returns a usable connection to `server_node`, reusing a cached one when
+    /// possible.
+    ///
+    /// A cached connection is reused only if it is not poisoned. Otherwise a new
+    /// connection is established and stored in the cache, replacing any poisoned
+    /// entry. Errors from establishing the connection are returned to the caller.
+    pub async fn get_connection(
+        &self,
+        server_node: &ServerNode,
+    ) -> Result<ServerConnection, Error> {
+        let server_id = server_node.uid();
+        {
+            // Fast path under the read lock; the guard is scoped so it is released
+            // before the `.await` below.
+            let connections = self.connections.read();
+            if let Some(conn) = connections.get(server_id).cloned() {
+                if !conn.is_poisoned() {
+                    return Ok(conn);
+                }
+            }
+        }
+        // No lock is held while connecting, so concurrent callers may each connect.
+        let new_server = self.connect(server_node).await?;
+        {
+            let mut connections = self.connections.write();
+            // Another caller may have inserted a healthy connection in the meantime;
+            // prefer it and let `new_server` be dropped.
+            if let Some(race_conn) = connections.get(server_id) {
+                if !race_conn.is_poisoned() {
+                    return Ok(race_conn.clone());
+                }
+            }
+
+            connections.insert(server_id.to_owned(), new_server.clone());
+        }
+        Ok(new_server)
+    }
+
+    /// Opens a fresh connection to `server_node`: connects the transport,
+    /// negotiates API versions, then authenticates if SASL credentials are set.
+    async fn connect(&self, server_node: &ServerNode) -> Result<ServerConnection, Error> {
+        let endpoint = server_node.url();
+        let stream = match Transport::connect(&endpoint, self.timeout).await {
+            Ok(stream) => stream,
+            Err(error) => return Err(ConnectionError(error.to_string()).into()),
+        };
+
+        let connection: ServerConnection = Arc::new(ServerConnectionInner::new(
+            BufStream::new(stream),
+            self.max_message_size,
+            self.client_id.clone(),
+        ));
+
+        // Negotiate API versions (must happen before authentication).
+        Self::check_api_versions(&connection, server_node.server_type()).await?;
+
+        if let Some(SaslConfig { username, password }) = self.sasl_config.as_ref() {
+            Self::authenticate(&connection, username, password).await?;
+        }
+
+        Ok(connection)
+    }
+
+    /// Performs the ApiVersions handshake on `connection`.
+    ///
+    /// Sends an `ApiVersionsRequest`, checks the advertised `server_type` against
+    /// `expected_server_type`, and records the negotiated versions on the
+    /// connection.
+    async fn check_api_versions(
+        connection: &ServerConnection,
+        expected_server_type: &ServerType,
+    ) -> Result<(), Error> {
+        let response = connection
+            .request(ApiVersionsRequest::new(
+                "fluss-rust",
+                env!("CARGO_PKG_VERSION"),
+            ))
+            .await?;
+        validate_server_type(expected_server_type, response.server_type)?;
+
+        let negotiated = ServerApiVersions::new(&response.api_versions);
+        let mut slot = connection.api_versions.lock();
+        *slot = Some(negotiated);
+        Ok(())
+    }
+
+    /// Perform SASL/PLAIN authentication handshake.
+    ///
+    /// Retries on `RetriableAuthenticateException` with exponential backoff
+    /// (matching Java's unbounded retry behaviour). Non-retriable errors
+    /// (wrong password, unknown user) propagate immediately as
+    /// `Error::FlussAPIError` with the original error code.
+    ///
+    /// The retry loop has no attempt limit: it ends only on success or on a
+    /// non-retriable error.
+    async fn authenticate(
+        connection: &ServerConnection,
+        username: &str,
+        password: &str,
+    ) -> Result<(), Error> {
+        use crate::rpc::fluss_api_error::FlussError;
+        use crate::rpc::message::AuthenticateRequest;
+        use rand::Rng;
+
+        // Built once and cloned per attempt, so every retry sends the same request.
+        let initial_request = AuthenticateRequest::new_plain(username, password);
+        let mut retry_count: u32 = 0;
+
+        loop {
+            let request = initial_request.clone();
+            let result = connection.request(request).await;
+
+            match result {
+                Ok(response) => {
+                    // Check for server challenge (multi-round auth).
+                    // PLAIN mechanism never sends a challenge, but we handle it
+                    // for protocol correctness matching Java's handleAuthenticateResponse.
+                    if let Some(challenge) = response.challenge {
+                        let challenge_req = AuthenticateRequest::from_challenge("PLAIN", challenge);
+                        // Only one challenge round is answered; the reply's payload is not inspected.
+                        connection.request(challenge_req).await?;
+                    }
+                    return Ok(());
+                }
+                Err(Error::FlussAPIError { ref api_error })
+                    if FlussError::for_code(api_error.code)
+                        == FlussError::RetriableAuthenticateException =>
+                {
+                    retry_count += 1;
+                    // Cap the exponent like Java's ExponentialBackoff.expMax so that
+                    // jitter still produces a range at steady state instead of being
+                    // clamped to AUTH_MAX_BACKOFF_MS.
+                    let exp_max = (AUTH_MAX_BACKOFF_MS / AUTH_INITIAL_BACKOFF_MS).log2();
+                    // First retry uses exponent 0, i.e. the initial backoff.
+                    let exp = ((retry_count as f64) - 1.0).min(exp_max);
+                    let term = AUTH_INITIAL_BACKOFF_MS * AUTH_BACKOFF_MULTIPLIER.powf(exp);
+                    // Random scale factor in [1 - AUTH_JITTER, 1 + AUTH_JITTER).
+                    let jitter_factor =
+                        1.0 - AUTH_JITTER + rand::rng().random::<f64>() * (2.0 * AUTH_JITTER);
+                    let backoff_ms = (term * jitter_factor) as u64;
+                    log::warn!(
+                        "SASL authentication retriable failure (attempt {retry_count}), \
+                         retrying in {backoff_ms}ms: {}",
+                        api_error.message
+                    );
+                    tokio::time::sleep(Duration::from_millis(backoff_ms)).await;
+                }
+                // Server-side auth errors (wrong password, unknown user, etc.)
+                // propagate with their original error code preserved.
+                Err(e) => return Err(e),
+            }
+        }
+    }
+}
+
+/// A raw response frame handed from the reader task to the waiting request.
+#[derive(Debug)]
+struct Response {
+    /// Decoded response header.
+    #[allow(dead_code)]
+    header: ResponseHeader,
+    /// Whole message buffer; the cursor is positioned just past the header, so
+    /// the remaining bytes are the response body.
+    data: Cursor<Vec<u8>>,
+}
+
+/// Bookkeeping entry for a request that is waiting for its response.
+#[derive(Debug)]
+struct ActiveRequest {
+    /// One-shot channel used to deliver either the response or the error that
+    /// poisoned the connection.
+    channel: Sender<Result<Response, RpcError>>,
+}
+
+/// Tracks per-request connection metrics and ensures in-flight gauge cleanup on drop.
+struct RequestMetricsLifecycle {
+    /// Metric label for the request's API key; `None` means no metrics are
+    /// emitted for this request.
+    label: Option<&'static str>,
+    /// Time at which tracking began; used for the latency histogram.
+    start: Instant,
+    /// Set once the in-flight gauge has been decremented, so it happens only once.
+    completed: bool,
+}
+
+impl RequestMetricsLifecycle {
+    /// Starts tracking a request.
+    ///
+    /// When `api_key` has a metric label, bumps the request counter, adds
+    /// `request_bytes` to the bytes-sent counter and increments the in-flight
+    /// gauge. The start time is taken after those updates.
+    fn begin(api_key: crate::rpc::ApiKey, request_bytes: u64) -> Self {
+        let label = api_key_label(api_key);
+        if let Some(name) = label {
+            metrics::counter!(CLIENT_REQUESTS_TOTAL, LABEL_API_KEY => name).increment(1);
+            metrics::counter!(CLIENT_BYTES_SENT_TOTAL, LABEL_API_KEY => name)
+                .increment(request_bytes);
+            metrics::gauge!(CLIENT_REQUESTS_IN_FLIGHT, LABEL_API_KEY => name).increment(1.0);
+        }
+        let start = Instant::now();
+        Self {
+            label,
+            start,
+            completed: false,
+        }
+    }
+
+    /// Finishes tracking: bumps the response counter, adds `response_bytes` to the
+    /// bytes-received counter, decrements the in-flight gauge and records the
+    /// latency in milliseconds.
+    ///
+    /// Does nothing when there is no label or when tracking already finished.
+    fn complete(&mut self, response_bytes: u64) {
+        let name = match (self.completed, self.label) {
+            (false, Some(name)) => name,
+            _ => return,
+        };
+
+        metrics::counter!(CLIENT_RESPONSES_TOTAL, LABEL_API_KEY => name).increment(1);
+        metrics::counter!(CLIENT_BYTES_RECEIVED_TOTAL, LABEL_API_KEY => name)
+            .increment(response_bytes);
+        metrics::gauge!(CLIENT_REQUESTS_IN_FLIGHT, LABEL_API_KEY => name).decrement(1.0);
+        metrics::histogram!(CLIENT_REQUEST_LATENCY_MS, LABEL_API_KEY => name)
+            .record(self.start.elapsed().as_secs_f64() * 1000.0);
+        self.completed = true;
+    }
+}
+
+/// Releases the in-flight gauge if the request never reached `complete`
+/// (for example because the request future was dropped).
+impl Drop for RequestMetricsLifecycle {
+    fn drop(&mut self) {
+        if let (false, Some(name)) = (self.completed, self.label) {
+            metrics::gauge!(CLIENT_REQUESTS_IN_FLIGHT, LABEL_API_KEY => name).decrement(1.0);
+            self.completed = true;
+        }
+    }
+}
+
+/// Shared state of a connection: either a map of requests awaiting responses or
+/// a terminal poisoned state.
+#[derive(Debug)]
+enum ConnectionState {
+    /// Currently active requests by request ID.
+    ///
+    /// An active request is one that got prepared or sent but the response wasn't received yet.
+    RequestMap(HashMap<i32, ActiveRequest>),
+
+    /// One of our streams died and we are unable to process any more requests.
+    Poison(Arc<RpcError>),
+}
+
+impl ConnectionState {
+    /// Moves the connection into the poisoned state and returns the poisoning error.
+    ///
+    /// On the first call every pending request is failed with
+    /// `RpcError::Poisoned` wrapping `err`. If the state is already poisoned,
+    /// `err` is discarded and the original error is returned.
+    fn poison(&mut self, err: RpcError) -> Arc<RpcError> {
+        if let Self::Poison(existing) = &*self {
+            // already poisoned, used existing error
+            return Arc::clone(existing);
+        }
+
+        let shared = Arc::new(err);
+        if let Self::RequestMap(pending) = &mut *self {
+            // inform all active requests
+            for (_request_id, waiting) in pending.drain() {
+                let notice = Err(RpcError::Poisoned(Arc::clone(&shared)));
+                // it's OK if the other side is gone
+                waiting.channel.send(notice).ok();
+            }
+        }
+        *self = Self::Poison(Arc::clone(&shared));
+        shared
+    }
+}
+
+/// A single multiplexed connection to a server.
+///
+/// Requests are written through `stream_write`; a background task reads
+/// responses and routes them to the waiting request by request id.
+#[derive(Debug)]
+pub struct ServerConnectionInner<RW> {
+    /// The half of the stream that we use to send data TO the broker.
+    ///
+    /// This will be used by [`request`](Self::request) to queue up messages.
+    stream_write: Arc<AsyncMutex<WriteHalf<RW>>>,
+
+    /// Client id placed in every request header.
+    client_id: Arc<str>,
+
+    /// Counter used to allocate request ids.
+    request_id: AtomicI32,
+
+    /// Pending-request map or poison marker, shared with the reader task.
+    state: Arc<Mutex<ConnectionState>>,
+
+    /// Negotiated API versions for this connection.
+    /// `None` until the ApiVersions handshake completes.
+    api_versions: Mutex<Option<ServerApiVersions>>,
+
+    /// Handle of the background reader task; aborted when the connection is dropped.
+    join_handle: JoinHandle<()>,
+}
+
+impl<RW> ServerConnectionInner<RW>
+where
+    RW: AsyncRead + AsyncWrite + Send + 'static,
+{
+    /// Creates a connection over `stream` and spawns its background reader task.
+    ///
+    /// * `stream` - bidirectional stream; it is split into a read half (owned by
+    ///   the reader task) and a write half (used by `request`).
+    /// * `max_message_size` - limit passed to `read_message` for each incoming frame.
+    /// * `client_id` - value placed in every request header.
+    ///
+    /// The reader task is started with `tokio::spawn`, so this is expected to be
+    /// called from within a Tokio runtime.
+    pub fn new(stream: RW, max_message_size: usize, client_id: Arc<str>) -> Self {
+        let (stream_read, stream_write) = tokio::io::split(stream);
+        let state = Arc::new(Mutex::new(ConnectionState::RequestMap(HashMap::default())));
+        let state_captured = Arc::clone(&state);
+
+        // Reader loop: read one frame, decode its header, and hand the frame to the
+        // request registered under that request id.
+        let join_handle = tokio::spawn(async move {
+            let mut stream_read = stream_read;
+            loop {
+                match stream_read.read_message(max_message_size).await {
+                    Ok(msg) => {
+                        // message was read, so all subsequent errors should not poison the whole stream
+                        let mut cursor = Cursor::new(msg);
+                        let header = match ResponseHeader::read(&mut cursor) {
+                            Ok(header) => header,
+                            Err(err) => {
+                                log::warn!("Cannot read message header, ignoring message: {err:?}");
+                                continue;
+                            }
+                        };
+
+                        // Take the pending entry out of the map; the lock guard is a
+                        // temporary of this `match` and is released before sending.
+                        let active_request = match state_captured.lock().deref_mut() {
+                            ConnectionState::RequestMap(map) => {
+                                match map.remove(&header.request_id) {
+                                    Some(active_request) => active_request,
+                                    _ => {
+                                        log::warn!(
+                                            request_id:% = header.request_id;
+                                            "Got response for unknown request",
+                                        );
+                                        continue;
+                                    }
+                                }
+                            }
+                            ConnectionState::Poison(_) => {
+                                // stream is poisoned, no need to do anything
+                                return;
+                            }
+                        };
+
+                        // we don't care if the other side is gone
+                        active_request
+                            .channel
+                            .send(Ok(Response {
+                                header,
+                                data: cursor,
+                            }))
+                            .ok();
+                    }
+                    Err(e) => {
+                        // A framing-level read failure is fatal: fail all pending
+                        // requests and stop the reader.
+                        state_captured.lock().poison(RpcError::ReadMessageError(e));
+                        return;
+                    }
+                }
+            }
+        });
+
+        Self {
+            stream_write: Arc::new(AsyncMutex::new(stream_write)),
+            client_id,
+            request_id: AtomicI32::new(0),
+            state,
+            api_versions: Mutex::new(None),
+            join_handle,
+        }
+    }
+
+    /// Picks the API version for `api_key` from this connection's negotiated
+    /// versions, deferring to `resolve_api_version_for`.
+    fn resolve_api_version(&self, api_key: ApiKey) -> Result<ApiVersion, Error> {
+        resolve_api_version_for(self.api_versions.lock().as_ref(), api_key)
+    }
+
+    /// Returns `true` once the connection has entered the poisoned state.
+    fn is_poisoned(&self) -> bool {
+        matches!(*self.state.lock(), ConnectionState::Poison(_))
+    }
+
+    /// Sends `msg` to the server and waits for the matching response.
+    ///
+    /// Steps: resolve the API version, allocate a request id, serialize header and
+    /// body, register the request in the shared state, write the frame, wait for
+    /// the reader task to deliver the response, then decode the body.
+    ///
+    /// # Errors
+    /// * `Error::UnsupportedVersion` when no usable API version can be resolved.
+    /// * `RpcError::WriteMessageError` when serializing the header or body fails,
+    ///   `RpcError::Poisoned` when the connection is (or becomes) poisoned,
+    ///   `RpcError::ReadMessageError` when the body cannot be decoded, and
+    ///   `RpcError::TooMuchData` when bytes remain after decoding; all are
+    ///   converted into `Error`.
+    /// * `Error::UnexpectedError` when the response channel is closed without a value.
+    /// * `Error::FlussAPIError` when the response header carries an error response.
+    pub async fn request<R>(&self, msg: R) -> Result<R::ResponseBody, Error>
+    where
+        R: RequestBody + Send + WriteType<Vec<u8>>,
+        R::ResponseBody: ReadType<Cursor<Vec<u8>>>,
+    {
+        let api_version = self.resolve_api_version(R::API_KEY)?;
+        // Mask off the sign bit so the id stays non-negative after the counter wraps.
+        let request_id = self.request_id.fetch_add(1, Ordering::SeqCst) & 0x7FFFFFFF;
+        let header = RequestHeader {
+            request_api_key: R::API_KEY,
+            request_api_version: api_version,
+            request_id,
+            client_id: Some(String::from(self.client_id.as_ref())),
+        };
+
+        let mut buf = Vec::new();
+        // write header
+        header
+            .write(&mut buf)
+            .map_err(RpcError::WriteMessageError)?;
+        // write message body
+        msg.write(&mut buf).map_err(RpcError::WriteMessageError)?;
+
+        let (tx, rx) = channel();
+
+        // to prevent stale data in inner state, ensure that we would remove the request again if we are cancelled while
+        // sending the request
+        let _cleanup_on_cancel =
+            CleanupRequestStateOnCancel::new(Arc::clone(&self.state), request_id);
+
+        // Register before sending so the reader task can route the response.
+        match self.state.lock().deref_mut() {
+            ConnectionState::RequestMap(map) => {
+                map.insert(request_id, ActiveRequest { channel: tx });
+            }
+            ConnectionState::Poison(e) => return Err(RpcError::Poisoned(Arc::clone(e)).into()),
+        }
+
+        // count only the API message body, excluding the protocol header.
+        let request_body_bytes = buf.len().saturating_sub(REQUEST_HEADER_LENGTH) as u64;
+        let mut request_metrics = RequestMetricsLifecycle::begin(R::API_KEY, request_body_bytes);
+
+        self.send_message(buf)
+            .await
+            .inspect_err(|_| request_metrics.complete(0))?;
+        // Consumes the guard: from here on the map entry is no longer removed on cancellation.
+        _cleanup_on_cancel.message_sent();
+        let mut response = rx
+            .await
+            .map_err(|e| Error::UnexpectedError {
+                message: "Receive error: response channel closed".to_string(),
+                source: Some(Box::new(e)),
+            })
+            .and_then(|r| r.map_err(Error::from))
+            .inspect_err(|_| request_metrics.complete(0))?;
+
+        // count only the API message body, excluding the response header.
+        let response_bytes =
+            (response.data.get_ref().len() as u64).saturating_sub(response.data.position());
+        request_metrics.complete(response_bytes);
+
+        if let Some(error_response) = response.header.error_response {
+            return Err(Error::FlussAPIError {
+                api_error: crate::rpc::ApiError::from(error_response),
+            });
+        }
+
+        let body = R::ResponseBody::read(&mut response.data).map_err(RpcError::ReadMessageError)?;
+
+        // The decoder must consume the whole message; leftover bytes are reported as an error.
+        let read_bytes = response.data.position();
+        let message_bytes = response.data.into_inner().len() as u64;
+        if read_bytes != message_bytes {
+            return Err(RpcError::TooMuchData {
+                message_size: message_bytes,
+                read: read_bytes,
+                api_key: R::API_KEY,
+                api_version,
+            }
+            .into());
+        }
+        Ok(body)
+    }
+
+    /// Writes one framed message to the server.
+    ///
+    /// Any write failure poisons the connection and is reported as
+    /// `RpcError::Poisoned`.
+    async fn send_message(&self, msg: Vec<u8>) -> Result<(), RpcError> {
+        self.send_message_inner(msg).await.map_err(|write_err| {
+            // need to poison the stream because message framing might be out-of-sync
+            RpcError::Poisoned(self.state.lock().poison(write_err))
+        })
+    }
+
+    /// Acquires the write half and writes `msg` followed by a flush.
+    ///
+    /// The write itself runs inside a `CancellationSafeFuture`, so dropping this
+    /// future after the lock was acquired does not abandon a partially written frame.
+    async fn send_message_inner(&self, msg: Vec<u8>) -> Result<(), RpcError> {
+        // Owned guard so it can move into the `'static` future below.
+        let mut stream_write = Arc::clone(&self.stream_write).lock_owned().await;
+
+        // use a wrapper so that cancellation doesn't cancel the send operation and leave half-sent messages on the wire
+        let fut = CancellationSafeFuture::new(async move {
+            stream_write.write_message(&msg).await?;
+            stream_write.flush().await?;
+            Ok(())
+        });
+
+        fut.await
+    }
+}
+
+/// Stops the background reader task when the connection is dropped.
+impl<RW> Drop for ServerConnectionInner<RW> {
+    fn drop(&mut self) {
+        // todo: should remove from server_connections map?
+        self.join_handle.abort();
+    }
+}
+
+/// Wrapper that keeps driving the inner future in the background if the wrapper
+/// is dropped before the inner future completes.
+struct CancellationSafeFuture<F>
+where
+    F: Future + Send + 'static,
+{
+    /// Set to `true` once the inner future has finished.
+    ///
+    /// NOTE(review): the `Drop` impl in this file decides based on `inner` being
+    /// `Some`, not on this flag — confirm whether the flag is still needed.
+    done: bool,
+
+    /// Inner future.
+    ///
+    /// Wrapped in an `Option` so we can extract it during drop. Inside that option however we also need a pinned
+    /// box because once this wrapper is polled, it will be pinned in memory -- even during drop. Now the inner
+    /// future does not necessarily implement `Unpin`, so we need a heap allocation to pin it in memory even when we
+    /// move it out of this option.
+    inner: Option<BoxFuture<'static, F::Output>>,
+}
+
+impl<F> CancellationSafeFuture<F>
+where
+    F: Future + Send,
+{
+    /// Wraps `fut`, boxing and pinning it on the heap.
+    fn new(fut: F) -> Self {
+        let boxed: BoxFuture<'static, F::Output> = Box::pin(fut);
+        Self {
+            inner: Some(boxed),
+            done: false,
+        }
+    }
+}
+
+impl<F> Future for CancellationSafeFuture<F>
+where
+    F: Future + Send,
+{
+    type Output = F::Output;
+
+    /// Polls the inner future; once it is ready, marks the wrapper as done and
+    /// drops the inner future.
+    ///
+    /// Panics if polled again after completion.
+    fn poll(
+        self: std::pin::Pin<&mut Self>,
+        cx: &mut std::task::Context<'_>,
+    ) -> Poll<Self::Output> {
+        let this = self.get_mut();
+        let outcome = this
+            .inner
+            .as_mut()
+            .expect("CancellationSafeFuture polled after completion")
+            .as_mut()
+            .poll(cx);
+
+        if outcome.is_ready() {
+            this.done = true;
+            this.inner = None; // Prevent re-polling
+        }
+        outcome
+    }
+}
+
+impl<F> Drop for CancellationSafeFuture<F>
+where
+    F: Future + Send + 'static,
+{
+    /// Hands an unfinished inner future to the current Tokio runtime so it still
+    /// runs to completion; without a runtime it is dropped with a warning.
+    fn drop(&mut self) {
+        // Nothing left to do when the future already completed.
+        let Some(fut) = self.inner.take() else {
+            return;
+        };
+
+        // Finishing in the background prevents leaving half-sent messages on the
+        // wire if the caller cancels the request. `try_current` is used so that a
+        // missing runtime does not cause a panic.
+        match tokio::runtime::Handle::try_current() {
+            Ok(handle) => {
+                handle.spawn(async move {
+                    let _ = fut.await;
+                });
+            }
+            Err(_) => {
+                // No runtime is active, so we cannot spawn and `fut` is dropped.
+                // Since the runtime is likely shutting down anyway, the underlying
+                // connection is probably being closed.
+                warn!("Tokio runtime not found during drop; background task cancelled.");
+            }
+        }
+    }
+}
+
+/// Helper that ensures that a request is removed when a request is cancelled before it was actually sent out.
+struct CleanupRequestStateOnCancel {
+    /// Shared connection state holding the pending-request map.
+    state: Arc<Mutex<ConnectionState>>,
+    /// Id of the request this guard is responsible for.
+    request_id: i32,
+    /// When `true`, dropping the guard leaves the map untouched.
+    message_sent: bool,
+}
+
+impl CleanupRequestStateOnCancel {
+    /// Create new helper.
+    ///
+    /// You must call [`message_sent`](Self::message_sent) when the request was sent.
+    fn new(state: Arc<Mutex<ConnectionState>>, request_id: i32) -> Self {
+        Self {
+            state,
+            request_id,
+            message_sent: false,
+        }
+    }
+
+    /// Request was sent. Do NOT clean the state any longer.
+    ///
+    /// Takes `self` by value: the guard is dropped at the end of this call with
+    /// the flag set, so its `Drop` impl skips the cleanup.
+    fn message_sent(mut self) {
+        self.message_sent = true;
+    }
+}
+
+impl Drop for CleanupRequestStateOnCancel {
+    /// Removes the pending request entry unless the request was marked as sent.
+    fn drop(&mut self) {
+        if self.message_sent {
+            return;
+        }
+
+        // Only the request-map state holds per-request entries; other states are left alone.
+        let mut state = self.state.lock();
+        if let ConnectionState::RequestMap(pending) = state.deref_mut() {
+            pending.remove(&self.request_id);
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::error::Error;
+    use crate::rpc::ApiKey;
+    use crate::rpc::api_version::ApiVersion;
+    use crate::rpc::frame::{ReadError, WriteError};
+    use crate::rpc::message::{ReadType, RequestBody, WriteType};
+    use metrics::{SharedString, Unit};
+    use metrics_util::CompositeKey;
+    use metrics_util::debugging::{DebugValue, DebuggingRecorder};
+    use std::sync::OnceLock;
+    use tokio::io::{AsyncReadExt, AsyncWriteExt, BufStream};
+    use tokio::sync::Mutex as AsyncMutex;
+
+    // -- Test-only request/response types --------------------------------
+
+    /// Empty stand-in request that reports itself as `ApiKey::ProduceLog`.
+    struct TestProduceRequest;
+    /// Empty stand-in response paired with [`TestProduceRequest`].
+    struct TestProduceResponse;
+
+    impl RequestBody for TestProduceRequest {
+        type ResponseBody = TestProduceResponse;
+        const API_KEY: ApiKey = ApiKey::ProduceLog;
+    }
+
+    // Serializes to an empty body: nothing is written to the buffer.
+    impl WriteType<Vec<u8>> for TestProduceRequest {
+        fn write(&self, _w: &mut Vec<u8>) -> Result<(), WriteError> {
+            Ok(())
+        }
+    }
+
+    // Deserialization ignores the reader and always succeeds.
+    impl ReadType<Cursor<Vec<u8>>> for TestProduceResponse {
+        fn read(_r: &mut Cursor<Vec<u8>>) -> Result<Self, ReadError> {
+            Ok(TestProduceResponse)
+        }
+    }
+
+    /// Empty stand-in request that reports itself as `ApiKey::MetaData`.
+    struct TestMetadataRequest;
+    /// Empty stand-in response paired with [`TestMetadataRequest`].
+    struct TestMetadataResponse;
+
+    impl RequestBody for TestMetadataRequest {
+        type ResponseBody = TestMetadataResponse;
+        const API_KEY: ApiKey = ApiKey::MetaData;
+    }
+
+    // Serializes to an empty body: nothing is written to the buffer.
+    impl WriteType<Vec<u8>> for TestMetadataRequest {
+        fn write(&self, _w: &mut Vec<u8>) -> Result<(), WriteError> {
+            Ok(())
+        }
+    }
+
+    // Deserialization ignores the reader and always succeeds.
+    impl ReadType<Cursor<Vec<u8>>> for TestMetadataResponse {
+        fn read(_r: &mut Cursor<Vec<u8>>) -> Result<Self, ReadError> {
+            Ok(TestMetadataResponse)
+        }
+    }
+
+    // -- Mock server -----------------------------------------------------
+
+    /// Minimal mock server: for each framed request it reads, it writes back a success
+    /// response carrying the same request id. Returns on the first I/O error.
+    async fn mock_echo_server(mut stream: tokio::io::DuplexStream) {
+        loop {
+            // Frame prefix: 4-byte big-endian payload length.
+            let mut frame_len = [0u8; 4];
+            if stream.read_exact(&mut frame_len).await.is_err() {
+                return;
+            }
+
+            let mut payload = vec![0u8; i32::from_be_bytes(frame_len) as usize];
+            if stream.read_exact(&mut payload).await.is_err() {
+                return;
+            }
+
+            // Header layout: api_key(2) + api_version(2) + request_id(4)
+            let request_id = i32::from_be_bytes([payload[4], payload[5], payload[6], payload[7]]);
+
+            // Response: resp_type(1, 0=success) + request_id(4)
+            let id = request_id.to_be_bytes();
+            let resp = [0u8, id[0], id[1], id[2], id[3]];
+            let resp_len = (resp.len() as i32).to_be_bytes();
+
+            // Length prefix, then body, then flush; stop at the first step that fails.
+            if stream.write_all(&resp_len).await.is_err() {
+                return;
+            }
+            if stream.write_all(&resp).await.is_err() {
+                return;
+            }
+            if stream.flush().await.is_err() {
+                return;
+            }
+        }
+    }
+
+    /// Reads framed requests and echoes back error responses (resp_type=1).
+    async fn mock_error_server(mut stream: tokio::io::DuplexStream) {
+        use prost::Message;
+
+        loop {
+            let mut len_buf = [0u8; 4];
+            if stream.read_exact(&mut len_buf).await.is_err() {
+                return;
+            }
+            let len = i32::from_be_bytes(len_buf) as usize;
+
+            let mut payload = vec![0u8; len];
+            if stream.read_exact(&mut payload).await.is_err() {
+                return;
+            }
+
+            let request_id = i32::from_be_bytes([payload[4], payload[5], payload[6], payload[7]]);
+
+            let err = crate::proto::ErrorResponse {
+                error_code: 1,
+                error_message: Some("test error".to_string()),
+            };
+            let mut err_buf = Vec::new();
+            err.encode(&mut err_buf).expect("ErrorResponse encode");
+
+            let mut resp = Vec::with_capacity(5 + err_buf.len());
+            resp.push(1u8); // ERROR_RESPONSE
+            resp.extend_from_slice(&request_id.to_be_bytes());
+            resp.extend(err_buf);
+
+            let resp_len = (resp.len() as i32).to_be_bytes();
+            if stream.write_all(&resp_len).await.is_err()
+                || stream.write_all(&resp).await.is_err()
+                || stream.flush().await.is_err()
+            {
+                return;
+            }
+        }
+    }
+
+    // -- Recorder setup --------------------------------------------------
+
+    /// Shared test recorder (installed once per test binary).
+    static TEST_SNAPSHOTTER: OnceLock<metrics_util::debugging::Snapshotter> = OnceLock::new();
+    /// Async mutex the metrics tests acquire before touching the shared recorder;
+    /// presumably this keeps their before/after snapshots from interleaving.
+    static TEST_LOCK: OnceLock<AsyncMutex<()>> = OnceLock::new();
+
+    /// Returns the shared snapshotter, installing the debugging recorder on first use.
+    fn test_snapshotter() -> &'static metrics_util::debugging::Snapshotter {
+        /// Creates the recorder, installs it and hands back its snapshotter.
+        fn install_recorder() -> metrics_util::debugging::Snapshotter {
+            let recorder = DebuggingRecorder::new();
+            // Grab the snapshotter before `install` consumes the recorder.
+            let snapshotter = recorder.snapshotter();
+            recorder
+                .install()
+                .expect("debugging recorder install should succeed in this test binary");
+            snapshotter
+        }
+
+        TEST_SNAPSHOTTER.get_or_init(install_recorder)
+    }
+
+    /// Returns the shared async test mutex, creating it on first use.
+    fn test_lock() -> &'static AsyncMutex<()> {
+        let new_lock = || AsyncMutex::new(());
+        TEST_LOCK.get_or_init(new_lock)
+    }
+
+    /// One element of the vector returned by `Snapshotter::snapshot().into_vec()`:
+    /// metric key, optional unit, optional description, and the recorded value.
+    type SnapshotEntry = (CompositeKey, Option<Unit>, Option<SharedString>, DebugValue);
+
+    /// Returns `true` when `key` carries a `LABEL_API_KEY` label whose value is `label`.
+    fn has_api_label(key: &CompositeKey, label: &str) -> bool {
+        for entry in key.key().labels() {
+            if entry.key() == LABEL_API_KEY && entry.value() == label {
+                return true;
+            }
+        }
+        false
+    }
+
+    /// Returns the value of the first counter named `metric_name` that carries the API
+    /// label `label`, or `0` when no such counter is in `entries`.
+    fn counter_for_label(entries: &[SnapshotEntry], metric_name: &str, label: &str) -> u64 {
+        for (key, _, _, value) in entries {
+            let same_series = key.key().name() == metric_name && has_api_label(key, label);
+            if !same_series {
+                continue;
+            }
+            // A matching key with a non-counter value is skipped, not treated as zero.
+            if let DebugValue::Counter(count) = value {
+                return *count;
+            }
+        }
+        0
+    }
+
+    /// Returns the value of the first gauge named `metric_name` that carries the API
+    /// label `label`, or `0.0` when no such gauge is in `entries`.
+    fn gauge_for_label(entries: &[SnapshotEntry], metric_name: &str, label: &str) -> f64 {
+        for (key, _, _, value) in entries {
+            let same_series = key.key().name() == metric_name && has_api_label(key, label);
+            if !same_series {
+                continue;
+            }
+            // A matching key with a non-gauge value is skipped, not treated as zero.
+            if let DebugValue::Gauge(reading) = value {
+                return reading.into_inner();
+            }
+        }
+        0.0
+    }
+
+    /// Adds up every counter named `metric_name` in `entries`, regardless of labels.
+    fn counter_sum(entries: &[SnapshotEntry], metric_name: &str) -> u64 {
+        let mut total: u64 = 0;
+        for (key, _, _, value) in entries {
+            if key.key().name() != metric_name {
+                continue;
+            }
+            if let DebugValue::Counter(count) = value {
+                total += *count;
+            }
+        }
+        total
+    }
+
+    /// Returns how many samples the first histogram named `metric_name` with the API
+    /// label `label` holds, or `0` when no such histogram is in `entries`.
+    fn histogram_sample_count_for_label(
+        entries: &[SnapshotEntry],
+        metric_name: &str,
+        label: &str,
+    ) -> usize {
+        for (key, _, _, value) in entries {
+            let same_series = key.key().name() == metric_name && has_api_label(key, label);
+            if !same_series {
+                continue;
+            }
+            // A matching key with a non-histogram value is skipped, not treated as empty.
+            if let DebugValue::Histogram(samples) = value {
+                return samples.len();
+            }
+        }
+        0
+    }
+
+    // -- Tests -----------------------------------------------------------
+
+    /// A successful `produce_log` request must bump the request counter, the response
+    /// counter and the latency histogram sample count by exactly one each.
+    #[tokio::test]
+    async fn request_records_metrics_for_reportable_api_key() {
+        // Hold the shared test lock for the whole test: the recorder is shared, and the
+        // assertions below compare before/after deltas.
+        let _test_guard = test_lock().lock().await;
+        let snapshotter = test_snapshotter();
+
+        let (client, server) = tokio::io::duplex(4096);
+        tokio::spawn(mock_echo_server(server));
+
+        let conn = ServerConnectionInner::new(BufStream::new(client), usize::MAX, Arc::from("t"));
+        // Pre-seed the negotiated versions with a single entry (api key 1014, v0 only).
+        *conn.api_versions.lock() = Some(ServerApiVersions::new(&[PbApiVersion {
+            api_key: 1014,
+            min_version: 0,
+            max_version: 0,
+        }]));
+
+        // Baseline readings taken before the request is issued.
+        let before: Vec<_> = snapshotter.snapshot().into_vec();
+        let request_before = counter_for_label(&before, CLIENT_REQUESTS_TOTAL, "produce_log");
+        let response_before = counter_for_label(&before, CLIENT_RESPONSES_TOTAL, "produce_log");
+        let latency_samples_before =
+            histogram_sample_count_for_label(&before, CLIENT_REQUEST_LATENCY_MS, "produce_log");
+
+        conn.request(TestProduceRequest).await.unwrap();
+
+        let after: Vec<_> = snapshotter.snapshot().into_vec();
+        let request_after = counter_for_label(&after, CLIENT_REQUESTS_TOTAL, "produce_log");
+        let response_after = counter_for_label(&after, CLIENT_RESPONSES_TOTAL, "produce_log");
+        let latency_samples_after =
+            histogram_sample_count_for_label(&after, CLIENT_REQUEST_LATENCY_MS, "produce_log");
+        assert_eq!(
+            request_after - request_before,
+            1,
+            "produce_log request counter should increment by 1"
+        );
+        assert_eq!(
+            response_after - response_before,
+            1,
+            "produce_log completion counter should increment by 1"
+        );
+        assert_eq!(
+            latency_samples_after - latency_samples_before,
+            1,
+            "request latency histogram sample count should increment by 1 for produce_log"
+        );
+    }
+
+    /// A successful metadata request must leave the request/response counter totals
+    /// unchanged and must not produce any metric labeled `metadata`.
+    #[tokio::test]
+    async fn request_skips_metrics_for_non_reportable_api_key() {
+        // Hold the shared test lock for the whole test: the recorder is shared, and the
+        // assertions below compare before/after totals.
+        let _test_guard = test_lock().lock().await;
+        let snapshotter = test_snapshotter();
+
+        let (client, server) = tokio::io::duplex(4096);
+        tokio::spawn(mock_echo_server(server));
+
+        let conn = ServerConnectionInner::new(BufStream::new(client), usize::MAX, Arc::from("t"));
+        // Pre-seed the negotiated versions with a single entry (api key 1012, v0 only);
+        // presumably 1012 is the wire id of `ApiKey::MetaData` — confirm against `ApiKey`.
+        *conn.api_versions.lock() = Some(ServerApiVersions::new(&[PbApiVersion {
+            api_key: 1012,
+            min_version: 0,
+            max_version: 0,
+        }]));
+        // Baseline totals across all labels, taken before the request is issued.
+        let before: Vec<_> = snapshotter.snapshot().into_vec();
+        let request_sum_before = counter_sum(&before, CLIENT_REQUESTS_TOTAL);
+        let response_sum_before = counter_sum(&before, CLIENT_RESPONSES_TOTAL);
+
+        conn.request(TestMetadataRequest).await.unwrap();
+
+        let snapshot: Vec<_> = snapshotter.snapshot().into_vec();
+        let request_sum_after = counter_sum(&snapshot, CLIENT_REQUESTS_TOTAL);
+        let response_sum_after = counter_sum(&snapshot, CLIENT_RESPONSES_TOTAL);
+        assert_eq!(
+            request_sum_after, request_sum_before,
+            "non-reportable API keys must not change request counters"
+        );
+        assert_eq!(
+            response_sum_after, response_sum_before,
+            "non-reportable API keys must not change response counters"
+        );
+
+        // No metric entry should carry a non-reportable API key label.
+        let non_reportable = snapshot
+            .iter()
+            .any(|(key, _, _, _)| has_api_label(key, "metadata"));
+        assert!(
+            !non_reportable,
+            "non-reportable API keys must not appear in metrics"
+        );
+    }
+
+    /// A request whose send fails must still count as one request and one completion,
+    /// record no received bytes, and leave the in-flight gauge at zero.
+    #[tokio::test]
+    async fn request_records_completion_metrics_when_send_fails() {
+        // Hold the shared test lock for the whole test: the recorder is shared, and the
+        // assertions below compare before/after deltas.
+        let _test_guard = test_lock().lock().await;
+        let snapshotter = test_snapshotter();
+
+        let (client, server) = tokio::io::duplex(64);
+        drop(server); // force write failure on request path
+        let conn = ServerConnectionInner::new(BufStream::new(client), usize::MAX, Arc::from("t"));
+        // Pre-seed the negotiated versions with a single entry (api key 1014, v0 only).
+        *conn.api_versions.lock() = Some(ServerApiVersions::new(&[PbApiVersion {
+            api_key: 1014,
+            min_version: 0,
+            max_version: 0,
+        }]));
+
+        // Baseline readings taken before the request is issued.
+        let before: Vec<_> = snapshotter.snapshot().into_vec();
+        let request_before = counter_for_label(&before, CLIENT_REQUESTS_TOTAL, "produce_log");
+        let response_before = counter_for_label(&before, CLIENT_RESPONSES_TOTAL, "produce_log");
+        let bytes_received_before =
+            counter_for_label(&before, CLIENT_BYTES_RECEIVED_TOTAL, "produce_log");
+        let result = conn.request(TestProduceRequest).await;
+        assert!(
+            result.is_err(),
+            "request should fail when transport is closed"
+        );
+        let after: Vec<_> = snapshotter.snapshot().into_vec();
+        let request_after = counter_for_label(&after, CLIENT_REQUESTS_TOTAL, "produce_log");
+        let response_after = counter_for_label(&after, CLIENT_RESPONSES_TOTAL, "produce_log");
+        let bytes_received_after =
+            counter_for_label(&after, CLIENT_BYTES_RECEIVED_TOTAL, "produce_log");
+        // The gauge is read as an absolute value (not a delta): it must be back at zero.
+        let inflight_after = gauge_for_label(&after, CLIENT_REQUESTS_IN_FLIGHT, "produce_log");
+
+        assert_eq!(
+            request_after - request_before,
+            1,
+            "failed request should still count as request"
+        );
+        assert_eq!(
+            response_after - response_before,
+            1,
+            "failed request should still count as a completion like Java ConnectionMetrics"
+        );
+        assert_eq!(
+            bytes_received_after - bytes_received_before,
+            0,
+            "failed send should record zero received bytes"
+        );
+        assert_eq!(
+            inflight_after, 0.0,
+            "in-flight gauge must return to zero after failure"
+        );
+    }
+
+    /// A request answered with an error response must surface `Error::FlussAPIError`,
+    /// count as one completion, record no received bytes, and leave the in-flight
+    /// gauge at zero.
+    #[tokio::test]
+    async fn request_records_completion_metrics_when_server_returns_api_error() {
+        // Hold the shared test lock for the whole test: the recorder is shared, and the
+        // assertions below compare before/after deltas.
+        let _test_guard = test_lock().lock().await;
+        let snapshotter = test_snapshotter();
+
+        let (client, server) = tokio::io::duplex(4096);
+        tokio::spawn(mock_error_server(server));
+
+        let conn = ServerConnectionInner::new(BufStream::new(client), usize::MAX, Arc::from("t"));
+        // Pre-seed the negotiated versions with a single entry (api key 1014, v0 only).
+        *conn.api_versions.lock() = Some(ServerApiVersions::new(&[PbApiVersion {
+            api_key: 1014,
+            min_version: 0,
+            max_version: 0,
+        }]));
+
+        // Baseline readings taken before the request is issued.
+        let before: Vec<_> = snapshotter.snapshot().into_vec();
+        let response_before = counter_for_label(&before, CLIENT_RESPONSES_TOTAL, "produce_log");
+        let bytes_received_before =
+            counter_for_label(&before, CLIENT_BYTES_RECEIVED_TOTAL, "produce_log");
+
+        let result = conn.request(TestProduceRequest).await;
+        assert!(
+            matches!(result, Err(Error::FlussAPIError { .. })),
+            "request should fail with FlussAPIError when server returns error_response"
+        );
+
+        let after: Vec<_> = snapshotter.snapshot().into_vec();
+        let response_after = counter_for_label(&after, CLIENT_RESPONSES_TOTAL, "produce_log");
+        let bytes_received_after =
+            counter_for_label(&after, CLIENT_BYTES_RECEIVED_TOTAL, "produce_log");
+        // The gauge is read as an absolute value (not a delta): it must be back at zero.
+        let inflight_after = gauge_for_label(&after, CLIENT_REQUESTS_IN_FLIGHT, "produce_log");
+
+        assert_eq!(
+            response_after - response_before,
+            1,
+            "API error response should count as completion like Java"
+        );
+        assert_eq!(
+            bytes_received_after - bytes_received_before,
+            0,
+            "API error response should record zero body bytes like Java onRequestFailure"
+        );
+        assert_eq!(
+            inflight_after, 0.0,
+            "in-flight gauge must return to zero after API error"
+        );
+    }
+
+    /// Checks API version resolution without and with server-advertised versions.
+    ///
+    /// Nothing in this test awaits, so it is a plain synchronous `#[test]` and does not
+    /// spin up a Tokio runtime.
+    #[test]
+    fn server_api_versions_negotiation() {
+        // Without negotiated versions, resolution yields v0.
+        assert_eq!(
+            resolve_api_version_for(None, ApiKey::ApiVersion).unwrap(),
+            ApiVersion(0)
+        );
+
+        assert_eq!(
+            resolve_api_version_for(None, ApiKey::PutKv).unwrap(),
+            ApiVersion(0)
+        );
+
+        let server_versions = vec![
+            // PutKv: server v0..v3, client v0 only (v1 key encoding not yet implemented) → negotiated v0
+            PbApiVersion {
+                api_key: 1016,
+                min_version: 0,
+                max_version: 3,
+            },
+            // ProduceLog: server v0..v2, client v0 only → negotiated v0
+            PbApiVersion {
+                api_key: 1014,
+                min_version: 0,
+                max_version: 2,
+            },
+            // Disjoint: server v5..v7, client v0 only → error
+            PbApiVersion {
+                api_key: 1015,
+                min_version: 5,
+                max_version: 7,
+            },
+            // Unknown key (9999) → skipped
+            PbApiVersion {
+                api_key: 9999,
+                min_version: 0,
+                max_version: 5,
+            },
+        ];
+        let negotiated = ServerApiVersions::new(&server_versions);
+
+        // Successful negotiation cases
+        assert_eq!(
+            negotiated.highest_available_version(ApiKey::PutKv).unwrap(),
+            ApiVersion(0)
+        );
+        assert_eq!(
+            negotiated
+                .highest_available_version(ApiKey::ProduceLog)
+                .unwrap(),
+            ApiVersion(0)
+        );
+
+        // Disjoint range → error
+        assert!(
+            negotiated
+                .highest_available_version(ApiKey::FetchLog)
+                .unwrap_err()
+                .to_string()
+                .contains(&format!(
+                    "The server does not support {:?}",
+                    ApiKey::FetchLog
+                ))
+        );
+
+        // Unknown key is skipped → not in map → error
+        assert!(
+            negotiated
+                .highest_available_version(ApiKey::Unknown(9999))
+                .is_err()
+        );
+
+        // Key not advertised by server → error
+        assert!(
+            ServerApiVersions::new(&[])
+                .highest_available_version(ApiKey::FetchLog)
+                .is_err()
+        );
+    }
+
+    /// Exercises `validate_server_type` with matching, mismatching, missing and unknown
+    /// advertised server types.
+    #[test]
+    fn server_type_validation() {
+        // Happy path: server advertises the expected type.
+        assert!(
+            validate_server_type(
+                &ServerType::CoordinatorServer,
+                Some(ServerType::CoordinatorServer.to_type_id()),
+            )
+            .is_ok()
+        );
+        assert!(
+            validate_server_type(
+                &ServerType::TabletServer,
+                Some(ServerType::TabletServer.to_type_id()),
+            )
+            .is_ok()
+        );
+
+        // Mismatch: connected to a coordinator while expecting a tablet server
+        // (and vice versa).
+        let err = validate_server_type(
+            &ServerType::TabletServer,
+            Some(ServerType::CoordinatorServer.to_type_id()),
+        )
+        .unwrap_err();
+        assert!(
+            matches!(err, Error::InvalidServerType { .. }),
+            "expected InvalidServerType, got: {err:?}"
+        );
+
+        assert!(matches!(
+            validate_server_type(
+                &ServerType::CoordinatorServer,
+                Some(ServerType::TabletServer.to_type_id()),
+            ),
+            Err(Error::InvalidServerType { .. })
+        ));
+
+        // Missing server type. The result used to be discarded with `.ok()`, so this
+        // case asserted nothing. It now at least pins that the outcome is either
+        // acceptance or `InvalidServerType`, never an unrelated error.
+        // NOTE(review): tighten this to the exact expected outcome once the intended
+        // contract for a server that omits its type is confirmed.
+        let missing = validate_server_type(&ServerType::TabletServer, None);
+        assert!(
+            matches!(missing, Ok(_) | Err(Error::InvalidServerType { .. })),
+            "unexpected error for missing server type: {:?}",
+            missing.as_ref().err()
+        );
+
+        // Unknown / unmapped type id still fails, with the raw id surfaced so
+        // operators can diagnose protocol drift.
+        assert!(matches!(
+            validate_server_type(&ServerType::CoordinatorServer, Some(99)),
+            Err(Error::InvalidServerType { .. })
+        ));
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/rpc/transport.rs b/fluss-rust/crates/fluss/src/rpc/transport.rs
new file mode 100644
index 0000000000..a6f721f6aa
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/rpc/transport.rs
@@ -0,0 +1,83 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::rpc::error::RpcError;
+use std::ops::DerefMut;
+use std::pin::Pin;
+use std::task::{Context, Poll};
+use std::time::Duration;
+use tokio::io::{AsyncRead, AsyncWrite, ReadBuf};
+use tokio::net::TcpStream;
+
+/// Byte stream that `Transport::connect` hands back to callers.
+///
+/// Implements `AsyncRead` and `AsyncWrite` by forwarding to the wrapped stream.
+#[derive(Debug)]
+pub enum Transport {
+    /// A Tokio TCP stream used as-is.
+    Plain { inner: TcpStream },
+}
+
+impl AsyncRead for Transport {
+    /// Forwards the read to the wrapped stream.
+    fn poll_read(
+        mut self: Pin<&mut Self>,
+        cx: &mut Context<'_>,
+        buf: &mut ReadBuf<'_>,
+    ) -> Poll<std::io::Result<()>> {
+        // `Plain` is the only variant, so the pattern is irrefutable.
+        let Self::Plain { inner } = self.deref_mut();
+        Pin::new(inner).poll_read(cx, buf)
+    }
+}
+
+impl AsyncWrite for Transport {
+    /// Forwards the write to the wrapped stream.
+    fn poll_write(
+        mut self: Pin<&mut Self>,
+        cx: &mut Context<'_>,
+        buf: &[u8],
+    ) -> Poll<std::io::Result<usize>> {
+        // `Plain` is the only variant, so the pattern is irrefutable.
+        let Self::Plain { inner } = self.deref_mut();
+        Pin::new(inner).poll_write(cx, buf)
+    }
+
+    /// Forwards the flush to the wrapped stream.
+    fn poll_flush(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<std::io::Result<()>> {
+        let Self::Plain { inner } = self.deref_mut();
+        Pin::new(inner).poll_flush(cx)
+    }
+
+    /// Forwards the shutdown to the wrapped stream.
+    fn poll_shutdown(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<std::io::Result<()>> {
+        let Self::Plain { inner } = self.deref_mut();
+        Pin::new(inner).poll_shutdown(cx)
+    }
+}
+
+impl Transport {
+    /// Opens a TCP connection to `server` and wraps it as [`Transport::Plain`].
+    ///
+    /// With `timeout` set, the connection attempt is abandoned after that duration.
+    pub async fn connect(server: &str, timeout: Option<Duration>) -> Result<Self, RpcError> {
+        Self::connect_timeout(server, timeout)
+            .await
+            .map(|inner| Transport::Plain { inner })
+    }
+
+    /// Connects to `host`, optionally bounded by `timeout`.
+    ///
+    /// An elapsed timeout is reported as `RpcError::ConnectionError`; I/O errors from
+    /// the connect itself are converted into `RpcError` by `?`.
+    async fn connect_timeout(host: &str, timeout: Option<Duration>) -> Result<TcpStream, RpcError> {
+        // Building the future does nothing until it is awaited below.
+        let connecting = TcpStream::connect(host);
+        let stream = match timeout {
+            None => connecting.await?,
+            Some(limit) => match tokio::time::timeout(limit, connecting).await {
+                Ok(outcome) => outcome?,
+                Err(_) => {
+                    return Err(RpcError::ConnectionError(format!(
+                        "Timeout connecting to host {host}"
+                    )));
+                }
+            },
+        };
+        Ok(stream)
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/test_utils.rs b/fluss-rust/crates/fluss/src/test_utils.rs
new file mode 100644
index 0000000000..f1e17e5f58
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/test_utils.rs
@@ -0,0 +1,146 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::cluster::{BucketLocation, Cluster, ServerNode, ServerType};
+use crate::metadata::{
+    DataField, DataTypes, PhysicalTablePath, Schema, TableBucket, TableDescriptor, TableInfo,
+    TablePath,
+};
+use crate::metrics::{LABEL_DATABASE, LABEL_TABLE, ScannerMetrics};
+use std::collections::HashMap;
+use std::sync::Arc;
+
+/// Builds a `TableInfo` for tests: a single `id` int column, distributed over
+/// `buckets` buckets with no bucket keys.
+pub(crate) fn build_table_info(table_path: TablePath, table_id: i64, buckets: i32) -> TableInfo {
+    let id_column = DataField::new("id", DataTypes::int(), None);
+    let row_type = DataTypes::row(vec![id_column]);
+    let schema = Schema::builder()
+        .with_row_type(&row_type)
+        .build()
+        .expect("schema build");
+
+    let descriptor = TableDescriptor::builder()
+        .schema(schema)
+        .distributed_by(Some(buckets), vec![])
+        .build()
+        .expect("descriptor build");
+
+    TableInfo::of(table_path, table_id, 1, descriptor, 0, 0)
+}
+
+/// Builds a test `Cluster` with one tablet server (id 1, 127.0.0.1:9092) and one
+/// bucket location per bucket id in `0..buckets`, each associated with that server.
+pub(crate) fn build_cluster(table_path: &TablePath, table_id: i64, buckets: i32) -> Cluster {
+    let tablet_server = ServerNode::new(1, "127.0.0.1".to_string(), 9092, ServerType::TabletServer);
+    // Every use gets its own freshly allocated physical path handle.
+    let physical_path = || Arc::new(PhysicalTablePath::of(Arc::new(table_path.clone())));
+
+    let mut servers = HashMap::new();
+    servers.insert(tablet_server.id(), tablet_server.clone());
+
+    let mut bucket_locations = Vec::new();
+    let mut locations_by_bucket = HashMap::new();
+    for bucket_id in 0..buckets {
+        let bucket = TableBucket::new(table_id, bucket_id);
+        let location = BucketLocation::new(
+            bucket.clone(),
+            Some(tablet_server.clone()),
+            physical_path(),
+        );
+        locations_by_bucket.insert(bucket, location.clone());
+        bucket_locations.push(location);
+    }
+
+    let mut locations_by_path = HashMap::new();
+    locations_by_path.insert(physical_path(), bucket_locations);
+
+    let table_id_by_path = HashMap::from([(table_path.clone(), table_id)]);
+    let table_info_by_path = HashMap::from([(
+        table_path.clone(),
+        build_table_info(table_path.clone(), table_id, buckets),
+    )]);
+
+    Cluster::new(
+        None,
+        servers,
+        locations_by_path,
+        locations_by_bucket,
+        table_id_by_path,
+        table_info_by_path,
+        HashMap::new(),
+    )
+}
+
+/// Same as [`build_cluster`], with the result wrapped in an `Arc`.
+pub(crate) fn build_cluster_arc(
+    table_path: &TablePath,
+    table_id: i64,
+    buckets: i32,
+) -> Arc<Cluster> {
+    let cluster = build_cluster(table_path, table_id, buckets);
+    Arc::new(cluster)
+}
+
+/// Creates a shared `ScannerMetrics` for `table_path` for use in tests.
+///
+/// Most callers don't install a recorder, so the cached handles are no-ops.
+/// Tests that *do* install `metrics::with_local_recorder(...)` must call this
+/// *inside* the recorder closure for the cached handles to bind to that recorder.
+pub(crate) fn test_scanner_metrics(table_path: &TablePath) -> Arc<ScannerMetrics> {
+    let scanner_metrics = ScannerMetrics::new(table_path);
+    Arc::new(scanner_metrics)
+}
+
+/// Checks every snapshot entry whose metric name starts with `fluss.client.scanner.`:
+/// it must carry both the `database` and the `table` label, and their values must
+/// equal `expected_database` / `expected_table`. Entries with other names are ignored.
+///
+/// Feed it the result of `Snapshotter::snapshot().into_vec()` to verify all emitted
+/// scanner metrics in one shot — this protects against future scanner metrics that
+/// bypass [`ScannerMetrics`].
+///
+/// Panics when a label is missing or has an unexpected value.
+pub(crate) fn assert_scanner_entries_labeled(
+    entries: &[(
+        metrics_util::CompositeKey,
+        Option<metrics::Unit>,
+        Option<metrics::SharedString>,
+        metrics_util::debugging::DebugValue,
+    )],
+    expected_database: &str,
+    expected_table: &str,
+) {
+    let scanner_entries = entries
+        .iter()
+        .filter(|(key, _, _, _)| key.key().name().starts_with("fluss.client.scanner."));
+
+    for (key, _, _, _) in scanner_entries {
+        let name = key.key().name();
+        let labels: Vec<(String, String)> = key
+            .key()
+            .labels()
+            .map(|l| (l.key().to_string(), l.value().to_string()))
+            .collect();
+
+        // Presence of both labels is checked before either value is compared.
+        let database = match labels.iter().find(|(k, _)| k == LABEL_DATABASE) {
+            Some((_, value)) => value,
+            None => {
+                panic!("scanner metric `{name}` is missing the database label; labels={labels:?}")
+            }
+        };
+        let table = match labels.iter().find(|(k, _)| k == LABEL_TABLE) {
+            Some((_, value)) => value,
+            None => {
+                panic!("scanner metric `{name}` is missing the table label; labels={labels:?}")
+            }
+        };
+
+        assert_eq!(
+            database, expected_database,
+            "scanner metric `{name}` has unexpected database label"
+        );
+        assert_eq!(
+            table, expected_table,
+            "scanner metric `{name}` has unexpected table label"
+        );
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/util/mod.rs b/fluss-rust/crates/fluss/src/util/mod.rs
new file mode 100644
index 0000000000..ee8dde4a41
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/util/mod.rs
@@ -0,0 +1,239 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+pub mod murmur_hash;
+pub mod partition;
+pub mod varint;
+
+use crate::TableId;
+use crate::metadata::TableBucket;
+use linked_hash_map::LinkedHashMap;
+use std::collections::{HashMap, HashSet};
+use std::hash::Hash;
+use std::sync::Arc;
+use std::time::{SystemTime, UNIX_EPOCH};
+
+/// Returns the current wall-clock time in milliseconds since the Unix epoch.
+///
+/// If the system clock reports a time before the epoch, `0` is returned.
+pub fn current_time_ms() -> i64 {
+    match SystemTime::now().duration_since(UNIX_EPOCH) {
+        Ok(since_epoch) => since_epoch.as_millis() as i64,
+        // Clock set before 1970: treat as zero elapsed time.
+        Err(_) => 0,
+    }
+}
+
+// Removed: delete_file() is no longer used.
+// File cleanup is now handled via RAII with FileCleanupGuard in arrow.rs
+// which uses Rust's drop order to ensure files are closed before deletion.
+
+/// Insertion-ordered map from [`TableBucket`] to a shared (`Arc`) status value.
+///
+/// `size` is a cached copy of `map.len()`; the methods that can change the
+/// number of entries refresh it.
+pub struct FairBucketStatusMap<S> {
+    map: LinkedHashMap<TableBucket, Arc<S>>,
+    size: usize,
+}
+
+#[allow(dead_code)]
+impl<S> FairBucketStatusMap<S> {
+    /// Creates an empty map.
+    pub fn new() -> Self {
+        Self {
+            map: LinkedHashMap::new(),
+            size: 0,
+        }
+    }
+
+    /// Moves the bucket to the end of the iteration order.
+    ///
+    /// Does nothing if the bucket is not present.
+    pub fn move_to_end(&mut self, table_bucket: TableBucket)
+    where
+        TableBucket: Eq + Hash,
+    {
+        // Removing and re-inserting re-links the entry at the back.
+        if let Some(status) = self.map.remove(&table_bucket) {
+            self.map.insert(table_bucket, status);
+        }
+    }
+
+    /// Updates the status and moves the bucket to the end.
+    ///
+    /// A bucket that is not present yet is appended.
+    pub fn update_and_move_to_end(&mut self, table_bucket: TableBucket, status: S)
+    where
+        TableBucket: Eq + Hash,
+    {
+        self.map.remove(&table_bucket);
+        self.map.insert(table_bucket, Arc::new(status));
+        self.update_size();
+    }
+
+    /// Updates the status without changing the order.
+    ///
+    /// An existing entry is overwritten in place and keeps its position; a
+    /// bucket that is not present yet is appended at the end.
+    pub fn update(&mut self, table_bucket: TableBucket, status: Arc<S>)
+    where
+        TableBucket: Eq + Hash,
+    {
+        // `LinkedHashMap::insert` re-links an already present key at the back,
+        // which would reorder the map; overwrite through `get_mut` instead.
+        match self.map.get_mut(&table_bucket) {
+            Some(slot) => *slot = status,
+            None => {
+                self.map.insert(table_bucket, status);
+            }
+        }
+        self.update_size();
+    }
+
+    /// Removes a bucket; does nothing if it is not present.
+    pub fn remove(&mut self, table_bucket: &TableBucket)
+    where
+        TableBucket: Eq + Hash,
+    {
+        self.map.remove(table_bucket);
+        self.update_size();
+    }
+
+    /// Returns a newly built set holding references to all stored buckets.
+    pub fn bucket_set(&self) -> HashSet<&TableBucket>
+    where
+        TableBucket: Eq + Hash,
+    {
+        self.map.keys().collect()
+    }
+
+    /// Clears all buckets.
+    pub fn clear(&mut self) {
+        self.map.clear();
+        self.update_size();
+    }
+
+    /// Checks if a bucket exists.
+    pub fn contains(&self, table_bucket: &TableBucket) -> bool
+    where
+        TableBucket: Eq + Hash,
+    {
+        self.map.contains_key(table_bucket)
+    }
+
+    /// Returns a shared reference to the underlying ordered bucket-status map.
+    pub fn bucket_status_map(&self) -> &LinkedHashMap<TableBucket, Arc<S>> {
+        &self.map
+    }
+
+    /// Returns the status values in the current iteration order.
+    pub fn bucket_status_values(&self) -> Vec<&Arc<S>> {
+        self.map.values().collect()
+    }
+
+    /// Gets the status for a bucket, or `None` if the bucket is not present.
+    pub fn status_value(&self, table_bucket: &TableBucket) -> Option<&Arc<S>>
+    where
+        TableBucket: Eq + Hash,
+    {
+        self.map.get(table_bucket)
+    }
+
+    /// Applies a function to each bucket-status pair in iteration order.
+    pub fn for_each<F>(&self, mut f: F)
+    where
+        F: FnMut(&TableBucket, &S),
+    {
+        for (bucket, status) in &self.map {
+            f(bucket, status);
+        }
+    }
+
+    /// Gets the current bucket count.
+    ///
+    /// This reads the cached `size` field, which mirrors `map.len()`.
+    pub fn size(&self) -> usize {
+        self.size
+    }
+
+    /// Replaces the whole content with `bucket_to_status`.
+    ///
+    /// Buckets are inserted grouped by table id, so buckets of the same table
+    /// end up adjacent. The order of tables, and of buckets within a table,
+    /// follows the iteration order of the input `HashMap`, which is unspecified.
+    pub fn set(&mut self, bucket_to_status: HashMap<TableBucket, Arc<S>>)
+    where
+        TableBucket: Eq + Hash + Clone,
+        S: Clone,
+    {
+        self.map.clear();
+
+        // Group the entries by table id, keeping first-seen table order. The
+        // input is consumed, so neither keys nor `Arc`s need to be cloned.
+        let mut table_to_entries: LinkedHashMap<TableId, Vec<(TableBucket, Arc<S>)>> =
+            LinkedHashMap::new();
+        for (bucket, status) in bucket_to_status {
+            table_to_entries
+                .entry(bucket.table_id())
+                .or_default()
+                .push((bucket, status));
+        }
+
+        // Insert table by table so each table's buckets are contiguous.
+        for (_, entries) in table_to_entries {
+            for (bucket, status) in entries {
+                self.map.insert(bucket, status);
+            }
+        }
+
+        self.update_size();
+    }
+
+    /// Re-synchronizes the cached `size` with the map length.
+    fn update_size(&mut self) {
+        self.size = self.map.len()
+    }
+}
+
+/// The default value is an empty map with a cached size of zero.
+impl<S> Default for FairBucketStatusMap<S> {
+    fn default() -> Self {
+        Self {
+            map: LinkedHashMap::new(),
+            size: 0,
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::sync::Arc;
+
+    /// Copies the stored statuses out of `map` in iteration order.
+    fn statuses_in_order(map: &FairBucketStatusMap<i32>) -> Vec<i32> {
+        map.bucket_status_values()
+            .into_iter()
+            .map(|status| **status)
+            .collect()
+    }
+
+    /// Entries keep insertion order until `move_to_end` re-links one of them.
+    #[test]
+    fn fair_bucket_status_map_tracks_order_and_size() {
+        let first = TableBucket::new(1, 0);
+        let second = TableBucket::new(1, 1);
+
+        let mut map = FairBucketStatusMap::new();
+        map.update_and_move_to_end(first.clone(), 10);
+        map.update_and_move_to_end(second.clone(), 20);
+        assert_eq!(map.size(), 2);
+        assert_eq!(statuses_in_order(&map), vec![10, 20]);
+
+        map.move_to_end(first.clone());
+        assert_eq!(statuses_in_order(&map), vec![20, 10]);
+    }
+
+    /// `set`, `remove` and `clear` keep membership and the cached size in sync.
+    #[test]
+    fn fair_bucket_status_map_mutations() {
+        let first = TableBucket::new(1, 0);
+        let second = TableBucket::new(2, 1);
+
+        let input: HashMap<_, _> = [
+            (first.clone(), Arc::new(1)),
+            (second.clone(), Arc::new(2)),
+        ]
+        .into_iter()
+        .collect();
+
+        let mut map = FairBucketStatusMap::new();
+        map.set(input);
+
+        assert!(map.contains(&first));
+        assert!(map.contains(&second));
+        assert_eq!(map.bucket_set().len(), 2);
+
+        map.remove(&second);
+        assert_eq!(map.size(), 1);
+
+        map.clear();
+        assert_eq!(map.size(), 0);
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/util/murmur_hash.rs b/fluss-rust/crates/fluss/src/util/murmur_hash.rs
new file mode 100644
index 0000000000..12229c717d
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/util/murmur_hash.rs
@@ -0,0 +1,222 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+/* This file is based on source code of Apache Flink Project (https://flink.apache.org/), licensed by the Apache
+ * Software Foundation (ASF) under the Apache License, Version 2.0. See the NOTICE file distributed with this work for
+ * additional information regarding copyright ownership. */
+use crate::error::Error::IllegalArgument;
+use crate::error::Result;
+
+/// Seed used by [`hash_bytes`].
+pub const MURMUR3_DEFAULT_SEED: u32 = 0;
+/// Seed used by [`fluss_hash_bytes`].
+pub const FLINK_MURMUR3_DEFAULT_SEED: i32 = 42;
+
+// Multipliers applied to a block in `mix_k1` (before and after the rotation).
+const C1: u32 = 0xCC9E_2D51;
+const C2: u32 = 0x1B87_3593;
+// Left-rotation amounts: `R1` for a block (`mix_k1`), `R2` for the state (`mix_h1`).
+const R1: u32 = 15;
+const R2: u32 = 13;
+// `mix_h1` updates the state as `rotated * M + N`.
+const M: u32 = 5;
+const N: u32 = 0xE654_6B64;
+// Number of bytes consumed per full block.
+const CHUNK_SIZE: usize = 4;
+
+/// Computes the 32-bit Murmur3 hash of `data`, seeded with
+/// [`MURMUR3_DEFAULT_SEED`] (0).
+///
+/// # Arguments
+/// * `data` - bytes to be hashed
+///
+/// # Returns
+/// The hash value.
+pub fn hash_bytes(data: &[u8]) -> u32 {
+    let seed = MURMUR3_DEFAULT_SEED;
+    hash_bytes_with_seed(data, seed)
+}
+
+/// 32-bit Murmur3 over `data`, starting from `seed`.
+///
+/// All full 4-byte blocks are folded into the state first. The 0-3 trailing
+/// bytes are packed little-endian into a single word, which is mixed into the
+/// state before the finalization step.
+#[inline(always)]
+fn hash_bytes_with_seed(data: &[u8], seed: u32) -> u32 {
+    let total_len = data.len();
+    let tail_start = total_len - total_len % CHUNK_SIZE;
+
+    // Pack the trailing bytes: the first tail byte lands in the lowest 8 bits.
+    let packed_tail = data[tail_start..]
+        .iter()
+        .enumerate()
+        .fold(0u32, |word, (index, &byte)| {
+            word | ((byte as u32) << (8 * index))
+        });
+
+    let mixed_tail = packed_tail
+        .wrapping_mul(C1)
+        .rotate_left(R1)
+        .wrapping_mul(C2);
+    let state = hash_full_chunks(data, seed) ^ mixed_tail;
+
+    fmix(state, total_len)
+}
+
+/// Hashes `data` with the Fluss/Flink variant of 32-bit Murmur, seeded with
+/// [`FLINK_MURMUR3_DEFAULT_SEED`] (42). In this variant the trailing bytes
+/// that do not fill a 4-byte block are mixed into the hash one byte at a time.
+///
+/// # Arguments
+/// * `data` - bytes to be hashed
+///
+/// # Returns
+/// * `Ok(hash_value)` on success
+///
+/// # Error
+/// Returns `Err(IllegalArgument)` if `data.len()` is `i32::MAX` or more.
+pub fn fluss_hash_bytes(data: &[u8]) -> Result<i32> {
+    let seed = FLINK_MURMUR3_DEFAULT_SEED;
+    fluss_hash_bytes_with_seed(data, seed)
+}
+/// Fluss/Flink Murmur variant over `data`, starting from `seed`.
+///
+/// Full 4-byte blocks are folded in first; each remaining tail byte is then
+/// mixed into the state on its own, followed by the finalization step.
+///
+/// # Error
+/// Returns `Err(IllegalArgument)` if `data.len()` is `i32::MAX` or more.
+#[inline(always)]
+fn fluss_hash_bytes_with_seed(data: &[u8], seed: i32) -> Result<i32> {
+    let length = data.len();
+
+    if length >= i32::MAX as usize {
+        return Err(IllegalArgument {
+            // `format!` is needed so that `{length}` is actually interpolated;
+            // a plain string literal would emit the placeholder verbatim.
+            message: format!("data array size {length} is bigger than supported"),
+        });
+    }
+
+    let length_aligned = length - length % CHUNK_SIZE;
+
+    let mut h1 = hash_full_chunks(data, seed as u32);
+
+    // NOTE(review): tail bytes are zero-extended here. The JVM implementation
+    // presumably reads them as signed `byte`s; confirm both sides agree for
+    // byte values >= 0x80.
+    for &byte in &data[length_aligned..] {
+        let k1 = mix_k1(byte as u32);
+        h1 = mix_h1(h1, k1);
+    }
+
+    Ok(fmix(h1, length) as i32)
+}
+
+/// Folds every complete 4-byte block of `data` into the hash state, starting
+/// from `seed`. Blocks are read as little-endian `u32`s; trailing bytes that
+/// do not fill a block are left to the caller.
+#[inline(always)]
+fn hash_full_chunks(data: &[u8], seed: u32) -> u32 {
+    let mut state = seed;
+    for block in data.chunks_exact(CHUNK_SIZE) {
+        let word = u32::from_le_bytes(block.try_into().unwrap());
+        state = mix_h1(state, mix_k1(word));
+    }
+    state
+}
+
+/// Scrambles one input block: multiply by `C1`, rotate left by `R1`, multiply
+/// by `C2` (all wrapping).
+#[inline(always)]
+fn mix_k1(k1: u32) -> u32 {
+    let scaled = k1.wrapping_mul(C1);
+    let rotated = scaled.rotate_left(R1);
+    rotated.wrapping_mul(C2)
+}
+
+/// Folds a scrambled block `k1` into the state `h1`: XOR, rotate left by `R2`,
+/// then `* M + N` (wrapping).
+#[inline(always)]
+fn mix_h1(h1: u32, k1: u32) -> u32 {
+    let combined = h1 ^ k1;
+    let rotated = combined.rotate_left(R2);
+    rotated.wrapping_mul(M).wrapping_add(N)
+}
+
+// Finalization: XOR the input length (truncated to 32 bits) into the state,
+// then run `bit_mix` so that every bit of the hash avalanches.
+#[inline(always)]
+fn fmix(h1: u32, length: usize) -> u32 {
+    bit_mix(h1 ^ (length as u32))
+}
+
+/// Hashes an `i32` with the Fluss/Flink variant of Murmur.
+///
+/// The value is treated as a single 4-byte block and the finalized hash is
+/// folded into the non-negative range: negative results are negated, and
+/// `i32::MIN` (which has no positive counterpart) maps to `0`.
+///
+/// # Arguments
+/// * `input` - i32 value to be hashed
+///
+/// # Returns
+/// A non-negative hash value.
+pub fn fluss_hash_i32(input: i32) -> i32 {
+    let block = (input as u32)
+        .wrapping_mul(C1)
+        .rotate_left(R1)
+        .wrapping_mul(C2);
+    let state = block.rotate_left(R2).wrapping_mul(M).wrapping_add(N);
+
+    // The "length" mixed in is the size of one block, i.e. 4 bytes.
+    let finalized = bit_mix(state ^ (CHUNK_SIZE as u32)) as i32;
+
+    // `checked_abs` is `None` only for `i32::MIN`.
+    finalized.checked_abs().unwrap_or(0)
+}
+
+// Multipliers used by the two multiplication rounds of `bit_mix`.
+const BIT_MIX_A: u32 = 0x85EB_CA6B;
+const BIT_MIX_B: u32 = 0xC2B2_AE35;
+
+/// Avalanche step: alternates xor-shifts (16, 13, 16) with wrapping
+/// multiplications by `BIT_MIX_A` and `BIT_MIX_B`.
+#[inline(always)]
+fn bit_mix(input: u32) -> u32 {
+    let mut mixed = input;
+    mixed ^= mixed >> 16;
+    mixed = mixed.wrapping_mul(BIT_MIX_A);
+    mixed ^= mixed >> 13;
+    mixed = mixed.wrapping_mul(BIT_MIX_B);
+    mixed ^= mixed >> 16;
+    mixed
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Sample text shared by the non-empty test vectors.
+    const PANGRAM: &str = "The quick brown fox jumps over the lazy dog";
+
+    /// Checks `hash_bytes` / `hash_bytes_with_seed` against fixed vectors.
+    #[test]
+    fn test_murmur3() {
+        let text = PANGRAM.as_bytes();
+
+        // Empty input with different seeds.
+        assert_eq!(hash_bytes(&[]), 0);
+        assert_eq!(0x514E_28B7, hash_bytes_with_seed(&[], 1));
+        assert_eq!(0x81F1_6F39, hash_bytes_with_seed(&[], 0xFFFF_FFFF));
+
+        // Non-empty input with the default and a custom seed.
+        assert_eq!(0x2E4F_F723, hash_bytes(text));
+        assert_eq!(0x2FA8_26CD, hash_bytes_with_seed(text, 0x9747_B28C));
+    }
+
+    /// Checks the Fluss/Flink variants against fixed vectors.
+    #[test]
+    fn test_flink_murmur() {
+        let text = PANGRAM.as_bytes();
+        let seeded = |data: &[u8], seed: i32| {
+            fluss_hash_bytes_with_seed(data, seed).expect("Failed to hash")
+        };
+        let default_seeded = |data: &[u8]| fluss_hash_bytes(data).expect("Failed to hash");
+
+        // Empty input.
+        assert_eq!(seeded(&[], 0), 0);
+        assert_eq!(0x087F_CD5C, default_seeded(&[]));
+        assert_eq!(0x81F1_6F39u32 as i32, seeded(&[], 0xFFFF_FFFFu32 as i32));
+
+        // Non-empty input.
+        assert_eq!(0x5FD2_0A20, seeded(text, 0));
+        assert_eq!(0x1BC6_F880, default_seeded(text));
+
+        // Integer hashing.
+        assert_eq!(0x2362_F9DE, fluss_hash_i32(0));
+        assert_eq!(0x43A4_6E1D, fluss_hash_i32(42));
+        assert_eq!(0x2EEB_27DE, fluss_hash_i32(-77));
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/util/partition.rs b/fluss-rust/crates/fluss/src/util/partition.rs
new file mode 100644
index 0000000000..ccc71a6b08
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/util/partition.rs
@@ -0,0 +1,532 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Utilities for rendering partition values as partition-name strings.
+use crate::error::Error::IllegalArgument;
+use crate::error::Result;
+use crate::metadata::DataType;
+use crate::row::{Date, Datum, Time, TimestampLtz, TimestampNtz};
+use jiff::ToSpan;
+use std::fmt::Write;
+
+/// Renders `bytes` as lowercase hexadecimal, two digits per byte.
+fn hex_string(bytes: &[u8]) -> String {
+    bytes
+        .iter()
+        .fold(String::with_capacity(bytes.len() * 2), |mut hex, byte| {
+            // Writing into a `String` cannot fail.
+            write!(hex, "{byte:02x}").unwrap();
+            hex
+        })
+}
+
+/// Renders an `f32` for use in a partition name.
+///
+/// NaN becomes `"NaN"`, the infinities become `"Inf"` / `"-Inf"`, and any
+/// other value is its `to_string()` form with `.` replaced by `_`.
+fn reformat_float(value: f32) -> String {
+    match value {
+        v if v.is_nan() => "NaN".to_string(),
+        v if v.is_infinite() && v > 0.0 => "Inf".to_string(),
+        v if v.is_infinite() => "-Inf".to_string(),
+        v => v.to_string().replace('.', "_"),
+    }
+}
+
+/// Renders an `f64` for use in a partition name.
+///
+/// NaN becomes `"NaN"`, the infinities become `"Inf"` / `"-Inf"`, and any
+/// other value is its `to_string()` form with `.` replaced by `_`.
+fn reformat_double(value: f64) -> String {
+    match value {
+        v if v.is_nan() => "NaN".to_string(),
+        v if v.is_infinite() && v > 0.0 => "Inf".to_string(),
+        v if v.is_infinite() => "-Inf".to_string(),
+        v => v.to_string().replace('.', "_"),
+    }
+}
+
+/// Calendar date that day offset `0` refers to.
+const UNIX_EPOCH_DATE: jiff::civil::Date = jiff::civil::date(1970, 1, 1);
+
+/// Formats a day offset from 1970-01-01 as `YYYY-MM-DD`.
+fn day_to_string(days: i32) -> String {
+    let date = UNIX_EPOCH_DATE + days.days();
+    let (year, month, day) = (date.year(), date.month(), date.day());
+    format!("{:04}-{:02}-{:02}", year, month, day)
+}
+
+/// Formats a [`Date`] as `YYYY-MM-DD` by passing its inner value to
+/// `day_to_string`.
+fn date_to_string(date: Date) -> String {
+    let days_since_epoch = date.get_inner();
+    day_to_string(days_since_epoch)
+}
+
+// Millisecond counts for the time units used when splitting a time value.
+const MILLIS_PER_SECOND: i64 = 1_000;
+const MILLIS_PER_MINUTE: i64 = 60 * MILLIS_PER_SECOND;
+const MILLIS_PER_HOUR: i64 = 60 * MILLIS_PER_MINUTE;
+
+/// Formats a millisecond count as `HH-MM-SS_mmm`.
+fn milli_to_string(milli: i32) -> String {
+    let per_hour = MILLIS_PER_HOUR as i32;
+    let per_minute = MILLIS_PER_MINUTE as i32;
+    let per_second = MILLIS_PER_SECOND as i32;
+
+    let hour = milli.div_euclid(per_hour);
+    let min = milli.rem_euclid(per_hour).div_euclid(per_minute);
+    let sec = milli.rem_euclid(per_minute).div_euclid(per_second);
+    let ms = milli.rem_euclid(per_second);
+
+    format!("{hour:02}-{min:02}-{sec:02}_{ms:03}")
+}
+
+/// Formats a [`Time`] as `HH-MM-SS_mmm` by passing its inner value to
+/// `milli_to_string`.
+fn time_to_string(time: Time) -> String {
+    let millis = time.get_inner();
+    milli_to_string(millis)
+}
+
+/// Common accessors over the two timestamp row types, so that
+/// `timestamp_to_string` can format either of them.
+trait Timestamp {
+    /// Millisecond value; `timestamp_to_string` treats it as an offset from
+    /// the Unix epoch.
+    fn get_milli(&self) -> i64;
+    /// Nanosecond part below the millisecond (presumably 0..=999_999 -
+    /// confirm against the row types).
+    fn get_nano_of_milli(&self) -> i32;
+}
+
+/// Delegates to the `TimestampNtz` getters.
+impl Timestamp for TimestampNtz {
+    fn get_milli(&self) -> i64 {
+        self.get_millisecond()
+    }
+
+    fn get_nano_of_milli(&self) -> i32 {
+        self.get_nano_of_millisecond()
+    }
+}
+
+/// Delegates to the `TimestampLtz` getters.
+impl Timestamp for TimestampLtz {
+    fn get_milli(&self) -> i64 {
+        self.get_epoch_millisecond()
+    }
+
+    fn get_nano_of_milli(&self) -> i32 {
+        self.get_nano_of_millisecond()
+    }
+}
+
+/// Formats a timestamp as `YYYY-MM-DD-HH-MM-SS_` followed by an optional
+/// sub-second part, adhering to the Java side's behaviour.
+///
+/// The date-time fields are taken in UTC. The sub-second part is:
+/// * milliseconds (3 digits) plus nanos-of-milli (6 digits) when the
+///   nanos-of-milli are positive,
+/// * milliseconds (3 digits) only when the nanos are not positive but the
+///   milliseconds are,
+/// * empty otherwise (the trailing `_` stays).
+fn timestamp_to_string<T: Timestamp>(ts: T) -> String {
+    let millis = ts.get_milli();
+    let nanos = ts.get_nano_of_milli();
+
+    // Euclidean division keeps the sub-second part non-negative for
+    // timestamps before the epoch.
+    let total_secs = millis.div_euclid(MILLIS_PER_SECOND);
+    let millis_of_second = millis.rem_euclid(MILLIS_PER_SECOND);
+
+    let instant = jiff::Timestamp::UNIX_EPOCH + jiff::Span::new().seconds(total_secs);
+    let dt = instant.to_zoned(jiff::tz::TimeZone::UTC).datetime();
+    let (year, month, day) = (dt.year(), dt.month(), dt.day());
+    let (hour, minute, second) = (dt.hour(), dt.minute(), dt.second());
+
+    match (nanos > 0, millis_of_second > 0) {
+        (true, _) => format!(
+            "{:04}-{:02}-{:02}-{:02}-{:02}-{:02}_{:03}{:06}",
+            year, month, day, hour, minute, second, millis_of_second, nanos
+        ),
+        (false, true) => format!(
+            "{:04}-{:02}-{:02}-{:02}-{:02}-{:02}_{:03}",
+            year, month, day, hour, minute, second, millis_of_second
+        ),
+        (false, false) => format!(
+            "{:04}-{:02}-{:02}-{:02}-{:02}-{:02}_",
+            year, month, day, hour, minute, second,
+        ),
+    }
+}
+
+/// Converts a `Datum` to the string used for it in a partition name.
+///
+/// The datum variant must match `data_type`; the supported pairings are the
+/// arms below.
+///
+/// # Error
+/// Returns `Err(IllegalArgument)` for any other `(value, data_type)` pair.
+pub fn convert_value_of_type(value: &Datum, data_type: &DataType) -> Result<String> {
+    let rendered = match (value, data_type) {
+        // Text, boolean and binary values.
+        (Datum::String(text), DataType::Char(_) | DataType::String(_)) => text.to_string(),
+        (Datum::Bool(flag), DataType::Boolean(_)) => flag.to_string(),
+        (Datum::Blob(bytes), DataType::Binary(_) | DataType::Bytes(_)) => hex_string(bytes),
+
+        // Integers use their plain decimal form.
+        (Datum::Int8(number), DataType::TinyInt(_)) => number.to_string(),
+        (Datum::Int16(number), DataType::SmallInt(_)) => number.to_string(),
+        (Datum::Int32(number), DataType::Int(_)) => number.to_string(),
+        (Datum::Int64(number), DataType::BigInt(_)) => number.to_string(),
+
+        // Floating point values.
+        (Datum::Float32(number), DataType::Float(_)) => reformat_float(number.into_inner()),
+        (Datum::Float64(number), DataType::Double(_)) => reformat_double(number.into_inner()),
+
+        // Temporal values.
+        (Datum::Date(date), DataType::Date(_)) => date_to_string(*date),
+        (Datum::Time(time), DataType::Time(_)) => time_to_string(*time),
+        (Datum::TimestampLtz(ts), DataType::TimestampLTz(_)) => timestamp_to_string(*ts),
+        (Datum::TimestampNtz(ts), DataType::Timestamp(_)) => timestamp_to_string(*ts),
+
+        _ => {
+            return Err(IllegalArgument {
+                message: format!(
+                    "Unsupported conversion to partition key from data type: {data_type:?}, value: {value:?}"
+                ),
+            });
+        }
+    };
+
+    Ok(rendered)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::metadata::{
+        BigIntType, BinaryType, BooleanType, BytesType, CharType, DateType, DoubleType, FloatType,
+        IntType, SmallIntType, StringType, TimeType, TimestampLTzType, TimestampType, TinyIntType,
+    };
+    use crate::row::{Date, Time, TimestampLtz, TimestampNtz};
+    use std::borrow::Cow;
+
+    use crate::metadata::TablePath;
+
+    /// Converts `datum` as `data_type`, panicking when the conversion fails.
+    fn render(datum: &Datum, data_type: &DataType) -> String {
+        convert_value_of_type(datum, data_type)
+            .expect("datum conversion to partition string failed")
+    }
+
+    /// Asserts that `datum` renders as `expected` and that
+    /// `TablePath::detect_invalid_name` reports nothing for the rendered text.
+    fn assert_partition_string(datum: Datum, data_type: DataType, expected: &str) {
+        let rendered = render(&datum, &data_type);
+        assert_eq!(rendered, expected);
+        assert!(TablePath::detect_invalid_name(&rendered).is_none());
+    }
+
+    #[test]
+    fn test_string() {
+        assert_partition_string(
+            Datum::String(Cow::Borrowed("Fluss")),
+            DataType::String(StringType::new()),
+            "Fluss",
+        );
+    }
+
+    #[test]
+    fn test_char() {
+        assert_partition_string(
+            Datum::String(Cow::Borrowed("F")),
+            DataType::Char(CharType::new(1)),
+            "F",
+        );
+    }
+
+    #[test]
+    fn test_boolean() {
+        assert_partition_string(
+            Datum::Bool(true),
+            DataType::Boolean(BooleanType::new()),
+            "true",
+        );
+    }
+
+    #[test]
+    fn test_byte() {
+        assert_partition_string(
+            Datum::Blob(Cow::Borrowed(&[0x10, 0x20, 0x30, 0x40, 0x50, 0xFF])),
+            DataType::Bytes(BytesType::new()),
+            "1020304050ff",
+        );
+    }
+
+    #[test]
+    fn test_binary() {
+        assert_partition_string(
+            Datum::Blob(Cow::Borrowed(&[0x10, 0x20, 0x30, 0x40, 0x50, 0xFF])),
+            DataType::Binary(BinaryType::new(6)),
+            "1020304050ff",
+        );
+    }
+
+    #[test]
+    fn test_tiny_int() {
+        assert_partition_string(
+            Datum::Int8(100),
+            DataType::TinyInt(TinyIntType::new()),
+            "100",
+        );
+    }
+
+    #[test]
+    fn test_small_int() {
+        assert_partition_string(
+            Datum::Int16(-32760),
+            DataType::SmallInt(SmallIntType::new()),
+            "-32760",
+        );
+    }
+
+    #[test]
+    fn test_int() {
+        assert_partition_string(
+            Datum::Int32(299000),
+            DataType::Int(IntType::new()),
+            "299000",
+        );
+    }
+
+    #[test]
+    fn test_big_int() {
+        assert_partition_string(
+            Datum::Int64(1748662955428),
+            DataType::BigInt(BigIntType::new()),
+            "1748662955428",
+        );
+    }
+
+    #[test]
+    fn test_date() {
+        assert_partition_string(
+            Datum::Date(Date::new(20235)),
+            DataType::Date(DateType::new()),
+            "2025-05-27",
+        );
+    }
+
+    #[test]
+    fn test_time() {
+        assert_partition_string(
+            Datum::Time(Time::new(5402199)),
+            DataType::Time(TimeType::new(3).unwrap()),
+            "01-30-02_199",
+        );
+    }
+
+    #[test]
+    fn test_float() {
+        let float_type = || DataType::Float(FloatType::new());
+
+        assert_partition_string(Datum::Float32(5.73.into()), float_type(), "5_73");
+
+        // Special values: only the rendered text is checked.
+        assert_eq!(
+            render(&Datum::Float32(f32::NAN.into()), &float_type()),
+            "NaN"
+        );
+        assert_eq!(
+            render(&Datum::Float32(f32::INFINITY.into()), &float_type()),
+            "Inf"
+        );
+        assert_eq!(
+            render(&Datum::Float32(f32::NEG_INFINITY.into()), &float_type()),
+            "-Inf"
+        );
+    }
+
+    #[test]
+    fn test_double() {
+        let double_type = || DataType::Double(DoubleType::new());
+
+        assert_partition_string(Datum::Float64(5.73737.into()), double_type(), "5_73737");
+
+        // Special values: only the rendered text is checked.
+        assert_eq!(
+            render(&Datum::Float64(f64::NAN.into()), &double_type()),
+            "NaN"
+        );
+        assert_eq!(
+            render(&Datum::Float64(f64::INFINITY.into()), &double_type()),
+            "Inf"
+        );
+        assert_eq!(
+            render(&Datum::Float64(f64::NEG_INFINITY.into()), &double_type()),
+            "-Inf"
+        );
+    }
+
+    #[test]
+    fn test_timestamp_ntz() {
+        let ntz = |millis, nanos| {
+            Datum::TimestampNtz(
+                TimestampNtz::from_millis_nanos(millis, nanos).expect("TimestampNtz init failed"),
+            )
+        };
+        let ntz_type = || DataType::Timestamp(TimestampType::new(9).unwrap());
+
+        // Millis and nanos of millis both set
+        assert_partition_string(
+            ntz(1748662955428, 99988),
+            ntz_type(),
+            "2025-05-31-03-42-35_428099988",
+        );
+
+        // Zero nanos of millis
+        assert_partition_string(
+            ntz(1748662955428, 0),
+            ntz_type(),
+            "2025-05-31-03-42-35_428",
+        );
+
+        // Zero millis
+        assert_partition_string(
+            ntz(1748662955000, 99988),
+            ntz_type(),
+            "2025-05-31-03-42-35_000099988",
+        );
+
+        // Zero millis and zero nanos
+        assert_partition_string(ntz(1748662955000, 0), ntz_type(), "2025-05-31-03-42-35_");
+
+        // Negative millis
+        assert_partition_string(
+            ntz(-1748662955428, 99988),
+            ntz_type(),
+            "1914-08-03-20-17-24_572099988",
+        );
+    }
+
+    #[test]
+    fn test_timestamp_ltz() {
+        let ltz = |millis, nanos| {
+            Datum::TimestampLtz(
+                TimestampLtz::from_millis_nanos(millis, nanos).expect("TimestampLtz init failed"),
+            )
+        };
+        let ltz_type = || DataType::TimestampLTz(TimestampLTzType::new(9).unwrap());
+
+        // Millis and nanos of millis both set
+        assert_partition_string(
+            ltz(1748662955428, 99988),
+            ltz_type(),
+            "2025-05-31-03-42-35_428099988",
+        );
+
+        // Zero nanos of millis
+        assert_partition_string(
+            ltz(1748662955428, 0),
+            ltz_type(),
+            "2025-05-31-03-42-35_428",
+        );
+
+        // Zero millis
+        assert_partition_string(
+            ltz(1748662955000, 99988),
+            ltz_type(),
+            "2025-05-31-03-42-35_000099988",
+        );
+
+        // Zero millis and zero nanos
+        assert_partition_string(ltz(1748662955000, 0), ltz_type(), "2025-05-31-03-42-35_");
+
+        // Negative millis
+        assert_partition_string(
+            ltz(-1748662955428, 99988),
+            ltz_type(),
+            "1914-08-03-20-17-24_572099988",
+        );
+    }
+}
diff --git a/fluss-rust/crates/fluss/src/util/varint.rs b/fluss-rust/crates/fluss/src/util/varint.rs
new file mode 100644
index 0000000000..83a75f6c37
--- /dev/null
+++ b/fluss-rust/crates/fluss/src/util/varint.rs
@@ -0,0 +1,498 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Variable-length integer encoding utilities.
+//!
+//! This module provides utilities for encoding integers in variable-length format,
+//! which can save space when encoding small integers. The encoding uses 7 bits per byte
+//! with the most significant bit as a continuation flag.
+
+use bytes::BufMut;
+use std::io::{self, Read, Write};
+
+/// Write an unsigned integer in variable-length format.
+///
+/// Each output byte carries 7 bits of `value`, least-significant group first, and has its
+/// MSB set to 1 if more bytes follow. This matches the encoding used in Google Protocol Buffers.
+///
+/// The encoding is assembled in a stack buffer and handed to `writer` with a single
+/// `write_all` call, so an unbuffered writer performs one write instead of one per byte.
+///
+/// Returns the number of bytes written (1 to 5), or the error reported by `writer`.
+#[allow(dead_code)]
+pub fn write_unsigned_varint<W: Write>(value: u32, writer: &mut W) -> io::Result<usize> {
+    // A u32 splits into at most ceil(32 / 7) = 5 groups of 7 bits.
+    let mut encoded = [0u8; 5];
+    let mut len = 0;
+    let mut v = value;
+
+    while (v & !0x7F) != 0 {
+        // Low 7 bits plus the continuation flag.
+        encoded[len] = ((v as u8) & 0x7F) | 0x80;
+        len += 1;
+        v >>= 7;
+    }
+    // Final group: continuation flag stays clear.
+    encoded[len] = v as u8;
+    len += 1;
+
+    writer.write_all(&encoded[..len])?;
+    Ok(len)
+}
+
+/// Append `value` to `buf` as an unsigned varint.
+///
+/// Emits 7 bits per byte, least-significant group first; every byte except the last
+/// has its most significant bit set as a continuation flag.
+pub fn write_unsigned_varint_buf(value: u32, buf: &mut impl BufMut) {
+    let mut remaining = value;
+
+    loop {
+        let low_bits = (remaining & 0x7F) as u8;
+        remaining >>= 7;
+        if remaining == 0 {
+            // Nothing left above this group: emit it without the continuation flag.
+            buf.put_u8(low_bits);
+            return;
+        }
+        buf.put_u8(low_bits | 0x80);
+    }
+}
+
+/// Read an unsigned integer stored in variable-length format.
+///
+/// Consumes one byte at a time from `reader` until a byte with a clear most significant
+/// bit is seen, reading at most 5 bytes. Each byte contributes its low 7 bits, starting
+/// with the least-significant group. Bits of the fifth byte that do not fit into a `u32`
+/// are discarded.
+///
+/// # Errors
+/// - Any error returned by `reader.read_exact` (for example when the input ends early).
+/// - `InvalidData` if the fifth byte still has its continuation bit set.
+#[allow(dead_code)]
+pub fn read_unsigned_varint<R: Read>(reader: &mut R) -> io::Result<u32> {
+    let mut decoded: u32 = 0;
+    let mut one = [0u8; 1];
+
+    // Shifts 0, 7, 14, 21 and 28 address the five possible 7-bit groups.
+    for shift in (0u32..35).step_by(7) {
+        reader.read_exact(&mut one)?;
+        let group = one[0];
+        decoded |= u32::from(group & 0x7F) << shift;
+        if group & 0x80 == 0 {
+            return Ok(decoded);
+        }
+    }
+
+    Err(io::Error::new(
+        io::ErrorKind::InvalidData,
+        "Invalid u32 varint encoding: too many bytes (most significant bit in the 5th byte is set)",
+    ))
+}
+
+/// Read an unsigned integer from a byte slice in variable-length format.
+///
+/// Decoding starts at `bytes[0]` and uses at most 5 bytes. Returns the decoded value
+/// together with the number of bytes consumed. Bits of the fifth byte that do not fit
+/// into a `u32` are discarded.
+///
+/// # Errors
+/// - `UnexpectedEof` if `bytes` is empty or ends while the continuation bit is still set.
+/// - `InvalidData` if the fifth byte still has its continuation bit set.
+pub fn read_unsigned_varint_bytes(bytes: &[u8]) -> io::Result<(u32, usize)> {
+    let mut decoded: u32 = 0;
+
+    // `consumed` counts bytes already decoded; `shift` is the bit offset of the next group.
+    for (consumed, shift) in (0u32..35).step_by(7).enumerate() {
+        let group = match bytes.get(consumed) {
+            Some(&b) => b,
+            None => {
+                let reason = if consumed == 0 {
+                    "Cannot read varint from empty buffer"
+                } else {
+                    "Incomplete varint"
+                };
+                return Err(io::Error::new(io::ErrorKind::UnexpectedEof, reason));
+            }
+        };
+
+        decoded |= u32::from(group & 0x7F) << shift;
+        if group & 0x80 == 0 {
+            return Ok((decoded, consumed + 1));
+        }
+    }
+
+    Err(io::Error::new(
+        io::ErrorKind::InvalidData,
+        "Invalid u32 varint encoding: too many bytes (most significant bit in the 5th byte is set)",
+    ))
+}
+
+/// Calculate the number of bytes needed to encode a u32 in variable-length format.
+///
+/// Varint encoding stores 7 bits per byte, so the answer is `ceil(bits_used / 7)`,
+/// with a minimum of one byte for the value 0. It is computed without a division as:
+///
+/// size = ((38 - leading_zeros) * 74899) >> 19  +  (leading_zeros >> 5)
+///
+/// Where:
+/// - `38 = 32 + 6` (adding 6 before dividing by 7 rounds the quotient up)
+/// - `74899 = ceil(2^19 / 7)` (multiply, then shift right by 19, to divide by 7)
+/// - `leading_zeros >> 5` is 1 only when `value` is 0 (all 32 bits are zero)
+pub fn size_of_unsigned_varint(value: u32) -> usize {
+    // ceil(2^19 / 7); same constant as 0b10010010010010011.
+    const SEVENTH_SCALED: u32 = 74_899;
+
+    let unused_bits = value.leading_zeros();
+    let bytes_for_bits = ((38 - unused_bits) * SEVENTH_SCALED) >> 19;
+    // The product above is 0 for value == 0; this term lifts it to the 1-byte minimum.
+    let zero_value_byte = unused_bits >> 5;
+
+    (bytes_for_bits + zero_value_byte) as usize
+}
+
+/// Calculate the number of bytes needed to encode a u64 in variable-length format.
+///
+/// Varint encoding stores 7 bits per byte, so the answer is `ceil(bits_used / 7)`,
+/// with a minimum of one byte for the value 0. It is computed without a division as:
+///
+/// size = ((70 - leading_zeros) * 74899) >> 19  +  (leading_zeros >> 6)
+///
+/// Where:
+/// - `70 = 64 + 6` (adding 6 before dividing by 7 rounds the quotient up)
+/// - `74899 = ceil(2^19 / 7)` (multiply, then shift right by 19, to divide by 7)
+/// - `leading_zeros >> 6` is 1 only when `value` is 0 (all 64 bits are zero)
+#[allow(dead_code)]
+pub fn size_of_unsigned_varint_u64(value: u64) -> usize {
+    // ceil(2^19 / 7); same constant as 0b10010010010010011.
+    const SEVENTH_SCALED: u32 = 74_899;
+
+    let unused_bits = value.leading_zeros();
+    let bytes_for_bits = ((70 - unused_bits) * SEVENTH_SCALED) >> 19;
+    // The product above is 0 for value == 0; this term lifts it to the 1-byte minimum.
+    let zero_value_byte = unused_bits >> 6;
+
+    (bytes_for_bits + zero_value_byte) as usize
+}
+
+/// Append a 64-bit `value` to `buf` as an unsigned varint.
+///
+/// Emits 7 bits per byte, least-significant group first; every byte except the last
+/// has its most significant bit set as a continuation flag.
+#[allow(dead_code)]
+pub fn write_unsigned_varint_u64_buf(value: u64, buf: &mut impl BufMut) {
+    let mut remaining = value;
+
+    loop {
+        let low_bits = (remaining & 0x7F) as u8;
+        remaining >>= 7;
+        if remaining == 0 {
+            // Nothing left above this group: emit it without the continuation flag.
+            buf.put_u8(low_bits);
+            return;
+        }
+        buf.put_u8(low_bits | 0x80);
+    }
+}
+
+/// Encode `value` as an unsigned varint at the start of `slice` and return the number
+/// of bytes written.
+/// Used by CompactedRowWriter which manages its own position.
+///
+/// # Panics
+/// Panics if the slice is too small to hold the encoded varint; bytes written before
+/// the out-of-bounds index stay in the slice.
+/// A slice of at least 5 bytes (the maximum size for a u32 varint) is always large enough.
+/// Use [`size_of_unsigned_varint`] to calculate the exact required size beforehand.
+pub fn write_unsigned_varint_to_slice(value: u32, slice: &mut [u8]) -> usize {
+    let mut remaining = value;
+    let mut cursor = 0;
+
+    loop {
+        let low_bits = (remaining & 0x7F) as u8;
+        remaining >>= 7;
+        if remaining == 0 {
+            // Last group: no continuation flag.
+            slice[cursor] = low_bits;
+            return cursor + 1;
+        }
+        slice[cursor] = low_bits | 0x80;
+        cursor += 1;
+    }
+}
+
+/// Encode a 64-bit `value` as an unsigned varint at the start of `slice` and return
+/// the number of bytes written.
+///
+/// # Panics
+/// Panics if the slice is too small to hold the encoded varint; bytes written before
+/// the out-of-bounds index stay in the slice.
+/// A slice of at least 10 bytes (the maximum size for a u64 varint) is always large enough.
+pub fn write_unsigned_varint_u64_to_slice(value: u64, slice: &mut [u8]) -> usize {
+    let mut remaining = value;
+    let mut cursor = 0;
+
+    loop {
+        let low_bits = (remaining & 0x7F) as u8;
+        remaining >>= 7;
+        if remaining == 0 {
+            // Last group: no continuation flag.
+            slice[cursor] = low_bits;
+            return cursor + 1;
+        }
+        slice[cursor] = low_bits | 0x80;
+        cursor += 1;
+    }
+}
+
+/// Read unsigned varint from a slice starting at given position.
+/// Returns (value, next_position).
+/// Used by CompactedRowReader which manages positions.
+///
+/// At most `max_bytes` bytes are decoded, and never more than 5: a `u32` holds at most
+/// five 7-bit groups, and a sixth group would need a shift of 35, which overflows
+/// `u32 << shift` (a panic in debug builds). Bits of the fifth byte that do not fit
+/// into a `u32` are discarded.
+///
+/// # Errors
+/// - `UnexpectedEof` if the slice ends (or `pos` is out of range) before the varint terminates.
+/// - `InvalidData` if no terminating byte is found within the allowed number of bytes.
+pub fn read_unsigned_varint_at(
+    slice: &[u8],
+    mut pos: usize,
+    max_bytes: usize,
+) -> io::Result<(u32, usize)> {
+    // Upper bound on the encoded length of a u32: ceil(32 / 7).
+    const MAX_VARINT32_BYTES: usize = 5;
+
+    let mut result: u32 = 0;
+
+    for group_index in 0..max_bytes.min(MAX_VARINT32_BYTES) {
+        let b = match slice.get(pos) {
+            Some(&b) => b,
+            None => {
+                return Err(io::Error::new(
+                    io::ErrorKind::UnexpectedEof,
+                    "Unexpected end of varint",
+                ));
+            }
+        };
+        pos += 1;
+
+        // group_index <= 4, so the shift is at most 28 and cannot overflow.
+        result |= u32::from(b & 0x7F) << (7 * group_index);
+        if (b & 0x80) == 0 {
+            return Ok((result, pos));
+        }
+    }
+
+    Err(io::Error::new(
+        io::ErrorKind::InvalidData,
+        "Invalid VarInt32 input stream",
+    ))
+}
+
+/// Read unsigned 64-bit varint from a slice starting at given position.
+/// Returns (value, next_position).
+///
+/// At most `max_bytes` bytes are decoded, and never more than 10: a `u64` holds at most
+/// ten 7-bit groups, and an eleventh group would need a shift of 70, which overflows
+/// `u64 << shift` (a panic in debug builds). Bits of the tenth byte that do not fit
+/// into a `u64` are discarded.
+///
+/// # Errors
+/// - `UnexpectedEof` if the slice ends (or `pos` is out of range) before the varint terminates.
+/// - `InvalidData` if no terminating byte is found within the allowed number of bytes.
+pub fn read_unsigned_varint_u64_at(
+    slice: &[u8],
+    mut pos: usize,
+    max_bytes: usize,
+) -> io::Result<(u64, usize)> {
+    // Upper bound on the encoded length of a u64: ceil(64 / 7).
+    const MAX_VARINT64_BYTES: usize = 10;
+
+    let mut result: u64 = 0;
+
+    for group_index in 0..max_bytes.min(MAX_VARINT64_BYTES) {
+        let b = match slice.get(pos) {
+            Some(&b) => b,
+            None => {
+                return Err(io::Error::new(
+                    io::ErrorKind::UnexpectedEof,
+                    "Unexpected end of varint",
+                ));
+            }
+        };
+        pos += 1;
+
+        // group_index <= 9, so the shift is at most 63 and cannot overflow.
+        result |= u64::from(b & 0x7F) << (7 * group_index);
+        if (b & 0x80) == 0 {
+            return Ok((result, pos));
+        }
+    }
+
+    Err(io::Error::new(
+        io::ErrorKind::InvalidData,
+        "Invalid VarInt64 input stream",
+    ))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::io::Cursor;
+
+    /// Encodes each sample with every u32 writer and checks that the readers and the
+    /// size calculation agree with the bytes produced.
+    #[test]
+    fn test_unsigned_varint_round_trip() {
+        // Values on both sides of each 7-bit length boundary, plus the extremes.
+        let samples: [u32; 13] = [
+            0,
+            1,
+            127,
+            128,
+            255,
+            256,
+            16383,
+            16384,
+            2097151,
+            2097152,
+            268435455,
+            268435456,
+            u32::MAX,
+        ];
+
+        for value in samples {
+            // `Write`-based encoder followed by the `Read`-based decoder.
+            let mut encoded = Vec::new();
+            let written = write_unsigned_varint(value, &mut encoded).unwrap();
+            let read_value = read_unsigned_varint(&mut Cursor::new(&encoded)).unwrap();
+
+            assert_eq!(value, read_value, "Round trip failed for value {value}");
+            assert_eq!(
+                written,
+                encoded.len(),
+                "Bytes written mismatch for value {value}"
+            );
+
+            // `BufMut`-based encoder must produce the same length.
+            let mut buf_mut_out = bytes::BytesMut::new();
+            write_unsigned_varint_buf(value, &mut buf_mut_out);
+            assert_eq!(buf_mut_out.len(), written, "BufMut write length mismatch");
+
+            // Predicted size must match the real encoding.
+            assert_eq!(
+                size_of_unsigned_varint(value),
+                encoded.len(),
+                "Size calculation failed for value {value}"
+            );
+
+            // Slice-based decoder: same value, and it consumes the whole encoding.
+            let (read_value_bytes, bytes_read) = read_unsigned_varint_bytes(&encoded).unwrap();
+            assert_eq!(
+                value, read_value_bytes,
+                "Bytes read failed for value {value}"
+            );
+            assert_eq!(
+                bytes_read,
+                encoded.len(),
+                "Bytes read count mismatch for value {value}"
+            );
+        }
+    }
+
+    /// Checks the encoded size on both sides of every u32 length boundary.
+    #[test]
+    fn test_size_of_unsigned_varint() {
+        // (value, expected encoded length in bytes)
+        let cases: &[(u32, usize)] = &[
+            (0, 1),
+            (127, 1),
+            (128, 2),
+            (16383, 2),
+            (16384, 3),
+            (2097151, 3),
+            (2097152, 4),
+            (268435455, 4),
+            (268435456, 5),
+            (u32::MAX, 5),
+        ];
+
+        for &(value, expected) in cases {
+            assert_eq!(size_of_unsigned_varint(value), expected);
+        }
+    }
+
+    /// Checks the encoded size on both sides of every u64 length boundary.
+    #[test]
+    fn test_size_of_unsigned_varint_u64() {
+        // (value, expected encoded length in bytes)
+        let cases: &[(u64, usize)] = &[
+            (0, 1),
+            (127, 1),
+            (128, 2),
+            (16383, 2),
+            (16384, 3),
+            (2097151, 3),
+            (2097152, 4),
+            (268435455, 4),
+            (268435456, 5),
+            (u32::MAX as u64, 5),
+            (34359738367, 5),
+            (34359738368, 6),
+            (4398046511103, 6),
+            (4398046511104, 7),
+            (562949953421311, 7),
+            (562949953421312, 8),
+            (72057594037927935, 8),
+            (72057594037927936, 9),
+            (9223372036854775807, 9),
+            (9223372036854775808, 10),
+            (u64::MAX, 10),
+        ];
+
+        for &(value, expected) in cases {
+            assert_eq!(size_of_unsigned_varint_u64(value), expected);
+        }
+    }
+
+    /// Malformed inputs must be reported as errors by the slice-based reader.
+    #[test]
+    fn test_read_unsigned_varint_bytes_error_handling() {
+        let malformed: [&[u8]; 3] = [
+            // Empty buffer
+            &[],
+            // Incomplete varint (continuation bit set but no next byte)
+            &[0x80],
+            &[0xFF, 0x80],
+        ];
+
+        for input in malformed {
+            assert!(read_unsigned_varint_bytes(input).is_err());
+        }
+    }
+
+    /// Round-trips values through the slice writers and the positional readers.
+    #[test]
+    fn test_write_read_to_slice() {
+        // u32: write at the start of a scratch buffer, then decode from position 0.
+        for value in [0u32, 127, 128, 16384, u32::MAX] {
+            let mut scratch = vec![0u8; 10];
+            let written = write_unsigned_varint_to_slice(value, &mut scratch);
+
+            let (decoded, next_pos) = read_unsigned_varint_at(&scratch, 0, 5).unwrap();
+            assert_eq!(value, decoded);
+            assert_eq!(written, next_pos);
+        }
+
+        // u64: same procedure with the 64-bit writer and reader.
+        for value in [0u64, 127, 128, 16384, u32::MAX as u64, u64::MAX] {
+            let mut scratch = vec![0u8; 10];
+            let written = write_unsigned_varint_u64_to_slice(value, &mut scratch);
+
+            let (decoded, next_pos) = read_unsigned_varint_u64_at(&scratch, 0, 10).unwrap();
+            assert_eq!(value, decoded);
+            assert_eq!(written, next_pos);
+        }
+    }
+
+    /// Writes several varints back to back and reads them again, feeding each
+    /// returned position into the next read.
+    #[test]
+    fn test_read_at_with_offset() {
+        let values = [127u32, 16384, u32::MAX];
+
+        // Pack the values one after another; `end_pos` ends up just past the last one.
+        let mut buffer = vec![0u8; 20];
+        let mut end_pos = 0;
+        for value in values {
+            end_pos += write_unsigned_varint_to_slice(value, &mut buffer[end_pos..]);
+        }
+
+        // Read back, starting each read where the previous one stopped.
+        let mut read_pos = 0;
+        for expected in values {
+            let (decoded, next_pos) = read_unsigned_varint_at(&buffer, read_pos, 5).unwrap();
+            assert_eq!(decoded, expected);
+            read_pos = next_pos;
+        }
+        assert_eq!(read_pos, end_pos);
+    }
+}
diff --git a/fluss-rust/crates/fluss/tests/integration/admin.rs b/fluss-rust/crates/fluss/tests/integration/admin.rs
new file mode 100644
index 0000000000..0860cbef97
--- /dev/null
+++ b/fluss-rust/crates/fluss/tests/integration/admin.rs
@@ -0,0 +1,566 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#[cfg(test)]
+mod admin_test {
+    use crate::integration::utils::get_shared_cluster;
+    use fluss::error::FlussError;
+    use fluss::metadata::{
+        DataTypes, DatabaseDescriptorBuilder, KvFormat, LogFormat, PartitionSpec, Schema,
+        TableDescriptor, TablePath,
+    };
+    use std::collections::HashMap;
+
+    /// Creates a database with a comment and custom properties, reads it back,
+    /// and drops it again, checking existence before and after each step.
+    #[tokio::test]
+    async fn test_create_database() {
+        // Database owned by this test on the shared cluster.
+        const DB_NAME: &str = "test_create_database";
+
+        let shared_cluster = get_shared_cluster();
+        let conn = shared_cluster.get_fluss_connection().await;
+        let admin_client = conn.get_admin().expect("should get admin");
+
+        let descriptor = DatabaseDescriptorBuilder::default()
+            .comment("test_db")
+            .custom_properties([("k1", "v1"), ("k2", "v2")].into())
+            .build();
+
+        // the database must not exist before the test creates it
+        let exists_before = admin_client.database_exists(DB_NAME).await.unwrap();
+        assert!(!exists_before);
+
+        // create database
+        admin_client
+            .create_database(DB_NAME, Some(&descriptor), false)
+            .await
+            .expect("should create database");
+
+        // database should exist
+        let exists_after_create = admin_client.database_exists(DB_NAME).await.unwrap();
+        assert!(exists_after_create);
+
+        // get database and compare it with what was requested
+        let info = admin_client
+            .get_database_info(DB_NAME)
+            .await
+            .expect("should get database info");
+        assert_eq!(info.database_name(), DB_NAME);
+        assert_eq!(info.database_descriptor(), &descriptor);
+
+        // drop database
+        admin_client
+            .drop_database(DB_NAME, false, true)
+            .await
+            .expect("should drop_database");
+
+        // database shouldn't exist now
+        let exists_after_drop = admin_client.database_exists(DB_NAME).await.unwrap();
+        assert!(!exists_after_drop);
+    }
+
+    /// Table lifecycle test against the shared cluster.
+    ///
+    /// Creates a dedicated database, creates a primary-key table in it, checks that
+    /// `table_exists`, `list_tables` and `get_table_info` reflect the descriptor that
+    /// was submitted, then drops the table and the database and checks both are gone.
+    #[tokio::test]
+    async fn test_create_table() {
+        let cluster = get_shared_cluster();
+        let connection = cluster.get_fluss_connection().await;
+        let admin = connection.get_admin().expect("Failed to get admin client");
+
+        let test_db_name = "test_create_table_db";
+        let db_descriptor = DatabaseDescriptorBuilder::default()
+            .comment("Database for test_create_table")
+            .build();
+
+        // The database must not exist yet; it is created below without the ignore flag.
+        assert!(!admin.database_exists(test_db_name).await.unwrap());
+        admin
+            .create_database(test_db_name, Some(&db_descriptor), false)
+            .await
+            .expect("Failed to create test database");
+
+        let test_table_name = "test_user_table";
+        let table_path = TablePath::new(test_db_name, test_table_name);
+
+        // build table schema
+        let table_schema = Schema::builder()
+            .column("id", DataTypes::int())
+            .column("name", DataTypes::string())
+            .column("age", DataTypes::int())
+            .with_comment("User's age (optional)")
+            .column("email", DataTypes::string())
+            .primary_key(vec!["id".to_string()])
+            .build()
+            .expect("Failed to build table schema");
+
+        // build table descriptor
+        let table_descriptor = TableDescriptor::builder()
+            .schema(table_schema.clone())
+            .comment("Test table for user data (id, name, age, email)")
+            .distributed_by(Some(3), vec!["id".to_string()])
+            .property("table.replication.factor", "1")
+            .log_format(LogFormat::ARROW)
+            .kv_format(KvFormat::INDEXED)
+            .build()
+            .expect("Failed to build table descriptor");
+
+        // create test table
+        admin
+            .create_table(&table_path, &table_descriptor, false)
+            .await
+            .expect("Failed to create test table");
+
+        assert!(
+            admin.table_exists(&table_path).await.unwrap(),
+            "Table {:?} should exist after creation",
+            table_path
+        );
+
+        // The database was created by this test, so the new table must be its only table.
+        let tables = admin.list_tables(test_db_name).await.unwrap();
+        assert_eq!(
+            tables.len(),
+            1,
+            "There should be exactly one table in the database"
+        );
+        assert!(
+            tables.contains(&test_table_name.to_string()),
+            "Table list should contain the created table"
+        );
+
+        let table_info = admin
+            .get_table_info(&table_path)
+            .await
+            .expect("Failed to get table info");
+
+        // verify table comment
+        assert_eq!(
+            table_info.get_comment(),
+            Some("Test table for user data (id, name, age, email)"),
+            "Table comment mismatch"
+        );
+
+        // verify schema columns
+        let actual_schema = table_info.get_schema();
+        assert_eq!(actual_schema, table_descriptor.schema(), "Schema mismatch");
+
+        // verify primary key
+        assert_eq!(
+            table_info.get_primary_keys(),
+            &vec!["id".to_string()],
+            "Primary key columns mismatch"
+        );
+
+        // verify distribution and properties
+        assert_eq!(table_info.get_num_buckets(), 3, "Bucket count mismatch");
+        assert_eq!(
+            table_info.get_bucket_keys(),
+            &vec!["id".to_string()],
+            "Bucket keys mismatch"
+        );
+
+        // The server may add extra default properties, so verify that all
+        // expected properties are present rather than requiring an exact match.
+        let actual_props = table_info.get_properties();
+        for (key, value) in table_descriptor.properties() {
+            assert_eq!(
+                actual_props.get(key),
+                Some(value),
+                "Property mismatch for key '{}'",
+                key
+            );
+        }
+
+        // drop table
+        admin
+            .drop_table(&table_path, false)
+            .await
+            .expect("Failed to drop table");
+        // table shouldn't exist now
+        assert!(!admin.table_exists(&table_path).await.unwrap());
+
+        // drop database
+        admin
+            .drop_database(test_db_name, false, true)
+            .await
+            .expect("Should drop database");
+
+        // database shouldn't exist now
+        assert!(!admin.database_exists(test_db_name).await.unwrap());
+    }
+
+    /// Partition lifecycle test against the shared cluster.
+    ///
+    /// Creates a table partitioned by `dt` and `region`, then checks that a partition
+    /// can be created, listed (unfiltered, with a matching partial spec, and with a
+    /// non-matching spec) and dropped. The table and database are removed at the end.
+    #[tokio::test]
+    async fn test_partition_apis() {
+        let cluster = get_shared_cluster();
+        let connection = cluster.get_fluss_connection().await;
+        let admin = connection.get_admin().expect("Failed to get admin client");
+
+        let test_db_name = "test_partition_apis_db";
+        let db_descriptor = DatabaseDescriptorBuilder::default()
+            .comment("Database for test_partition_apis")
+            .build();
+
+        // Unlike test_create_table, database and table are created with the last
+        // argument set to true.
+        // NOTE(review): presumably the ignore-if-exists flag - confirm against the admin API.
+        admin
+            .create_database(test_db_name, Some(&db_descriptor), true)
+            .await
+            .expect("Failed to create test database");
+
+        let test_table_name = "partitioned_table";
+        let table_path = TablePath::new(test_db_name, test_table_name);
+
+        // Both partition columns (dt, region) are also part of the primary key.
+        let table_schema = Schema::builder()
+            .column("id", DataTypes::int())
+            .column("name", DataTypes::string())
+            .column("dt", DataTypes::string())
+            .column("region", DataTypes::string())
+            .primary_key(vec!["id", "dt", "region"])
+            .build()
+            .expect("Failed to build table schema");
+
+        let table_descriptor = TableDescriptor::builder()
+            .schema(table_schema)
+            .distributed_by(Some(3), vec!["id".to_string()])
+            .partitioned_by(vec!["dt", "region"])
+            .property("table.replication.factor", "1")
+            .log_format(LogFormat::ARROW)
+            .kv_format(KvFormat::COMPACTED)
+            .build()
+            .expect("Failed to build table descriptor");
+
+        admin
+            .create_table(&table_path, &table_descriptor, true)
+            .await
+            .expect("Failed to create partitioned table");
+
+        // A freshly created table is expected to have no partitions.
+        let partitions = admin
+            .list_partition_infos(&table_path)
+            .await
+            .expect("Failed to list partitions");
+        assert!(
+            partitions.is_empty(),
+            "Expected no partitions initially, found {}",
+            partitions.len()
+        );
+
+        // Full spec: one value for every partition column.
+        let mut partition_values = HashMap::new();
+        partition_values.insert("dt", "2024-01-15");
+        partition_values.insert("region", "EMEA");
+        let partition_spec = PartitionSpec::new(partition_values);
+
+        admin
+            .create_partition(&table_path, &partition_spec, false)
+            .await
+            .expect("Failed to create partition");
+
+        let partitions = admin
+            .list_partition_infos(&table_path)
+            .await
+            .expect("Failed to list partitions");
+        assert_eq!(
+            partitions.len(),
+            1,
+            "Expected exactly one partition after creation"
+        );
+        // The expected name is the dt value and the region value joined by '$'.
+        assert_eq!(
+            partitions[0].get_partition_name(),
+            "2024-01-15$EMEA",
+            "Partition name mismatch"
+        );
+
+        // list with partial spec filter - should find the partition
+        let mut partition_values = HashMap::new();
+        partition_values.insert("dt", "2024-01-15");
+        let partial_partition_spec = PartitionSpec::new(partition_values);
+
+        let partitions_with_spec = admin
+            .list_partition_infos_with_spec(&table_path, Some(&partial_partition_spec))
+            .await
+            .expect("Failed to list partitions with spec");
+        assert_eq!(
+            partitions_with_spec.len(),
+            1,
+            "Expected one partition matching the spec"
+        );
+        assert_eq!(
+            partitions_with_spec[0].get_partition_name(),
+            "2024-01-15$EMEA",
+            "Partition name mismatch with spec filter"
+        );
+
+        // list with non-matching spec - should find no partitions
+        let mut non_matching_values = HashMap::new();
+        non_matching_values.insert("dt", "2024-01-16");
+        let non_matching_spec = PartitionSpec::new(non_matching_values);
+        let partitions_non_matching = admin
+            .list_partition_infos_with_spec(&table_path, Some(&non_matching_spec))
+            .await
+            .expect("Failed to list partitions with non-matching spec");
+        assert!(
+            partitions_non_matching.is_empty(),
+            "Expected no partitions for non-matching spec"
+        );
+
+        // Drop the partition using the same full spec it was created with.
+        admin
+            .drop_partition(&table_path, &partition_spec, false)
+            .await
+            .expect("Failed to drop partition");
+
+        let partitions = admin
+            .list_partition_infos(&table_path)
+            .await
+            .expect("Failed to list partitions");
+        assert!(
+            partitions.is_empty(),
+            "Expected no partitions after drop, found {}",
+            partitions.len()
+        );
+
+        // Cleanup: remove the table and the database created by this test.
+        admin
+            .drop_table(&table_path, true)
+            .await
+            .expect("Failed to drop table");
+        admin
+            .drop_database(test_db_name, true, true)
+            .await
+            .expect("Should drop database");
+    }
+
+    /// Requesting info for the table path ("fluss", "not_exist") must fail with the
+    /// `TableNotExist` API error.
+    #[tokio::test]
+    async fn test_fluss_error_response() {
+        let shared_cluster = get_shared_cluster();
+        let conn = shared_cluster.get_fluss_connection().await;
+        let admin_client = conn.get_admin().expect("Failed to get admin client");
+
+        let missing_table = TablePath::new("fluss", "not_exist");
+
+        // The lookup has to fail; a successful response is a test failure.
+        let error = match admin_client.get_table_info(&missing_table).await {
+            Ok(_) => panic!("Expected error but got Ok"),
+            Err(e) => e,
+        };
+
+        let reported_code = error.api_error();
+        assert_eq!(
+            reported_code,
+            Some(FlussError::TableNotExist),
+            "Expected TableNotExist error, got {:?}",
+            error
+        );
+    }
+
+    /// Asserts that `error.api_error()` reports the `expected` API error code;
+    /// on mismatch the panic message shows the expected code and the full error.
+    fn assert_api_error(error: fluss::error::Error, expected: FlussError) {
+        let reported_code = error.api_error();
+        assert_eq!(
+            reported_code,
+            Some(expected),
+            "Expected {:?}, got {:?}",
+            expected,
+            error
+        );
+    }
+
+    /// Every admin call that targets the database "no_such_db" must fail with
+    /// `DatabaseNotExist`.
+    #[tokio::test]
+    async fn test_error_database_not_exist() {
+        const MISSING_DB: &str = "no_such_db";
+
+        let shared_cluster = get_shared_cluster();
+        let conn = shared_cluster.get_fluss_connection().await;
+        let admin_client = conn.get_admin().unwrap();
+
+        // get_database_info for non-existent database
+        let info_error = admin_client
+            .get_database_info(MISSING_DB)
+            .await
+            .unwrap_err();
+        assert_api_error(info_error, FlussError::DatabaseNotExist);
+
+        // drop_database without ignore flag
+        let drop_error = admin_client
+            .drop_database(MISSING_DB, false, false)
+            .await
+            .unwrap_err();
+        assert_api_error(drop_error, FlussError::DatabaseNotExist);
+
+        // list_tables for non-existent database
+        let list_error = admin_client.list_tables(MISSING_DB).await.unwrap_err();
+        assert_api_error(list_error, FlussError::DatabaseNotExist);
+    }
+
+    /// Creating the same database twice fails with `DatabaseAlreadyExist` unless the
+    /// ignore flag is passed to `create_database`.
+    #[tokio::test]
+    async fn test_error_database_already_exist() {
+        const DB_NAME: &str = "test_error_db_already_exist";
+
+        let shared_cluster = get_shared_cluster();
+        let conn = shared_cluster.get_fluss_connection().await;
+        let admin_client = conn.get_admin().unwrap();
+
+        let db_descriptor = DatabaseDescriptorBuilder::default().build();
+
+        // first creation sets up the database for the duplicate attempts below
+        admin_client
+            .create_database(DB_NAME, Some(&db_descriptor), false)
+            .await
+            .unwrap();
+
+        // create same database again without ignore flag
+        let duplicate_error = admin_client
+            .create_database(DB_NAME, Some(&db_descriptor), false)
+            .await
+            .unwrap_err();
+        assert_api_error(duplicate_error, FlussError::DatabaseAlreadyExist);
+
+        // with ignore flag should succeed
+        admin_client
+            .create_database(DB_NAME, Some(&db_descriptor), true)
+            .await
+            .expect("create_database with ignore_if_exists should succeed");
+
+        // cleanup
+        admin_client
+            .drop_database(DB_NAME, true, true)
+            .await
+            .unwrap();
+    }
+
+    /// Re-creating an existing table fails with `FlussError::TableAlreadyExist`
+    /// unless the ignore flag is set.
+    #[tokio::test]
+    async fn test_error_table_already_exist() {
+        let shared = get_shared_cluster();
+        let conn = shared.get_fluss_connection().await;
+        let admin = conn.get_admin().unwrap();
+
+        // Database used only by this test.
+        let database = "test_error_tbl_already_exist_db";
+        let db_descriptor = DatabaseDescriptorBuilder::default().build();
+        let db_created = admin
+            .create_database(database, Some(&db_descriptor), true)
+            .await;
+        db_created.unwrap();
+
+        // Two-column table definition.
+        let path = TablePath::new(database, "my_table");
+        let columns = Schema::builder()
+            .column("id", DataTypes::int())
+            .column("name", DataTypes::string())
+            .build()
+            .unwrap();
+        let tbl_descriptor = TableDescriptor::builder()
+            .schema(columns)
+            .distributed_by(Some(1), vec![])
+            .property("table.replication.factor", "1")
+            .build()
+            .unwrap();
+
+        // Initial creation, issued without the ignore flag.
+        let first = admin.create_table(&path, &tbl_descriptor, false).await;
+        first.unwrap();
+
+        // A second creation without the ignore flag must be rejected.
+        let duplicate_err = admin
+            .create_table(&path, &tbl_descriptor, false)
+            .await
+            .unwrap_err();
+        assert_api_error(duplicate_err, FlussError::TableAlreadyExist);
+
+        // With the ignore flag the same call is accepted.
+        let ignored = admin.create_table(&path, &tbl_descriptor, true).await;
+        ignored.expect("create_table with ignore_if_exists should succeed");
+
+        // Remove the table first, then its database.
+        let table_dropped = admin.drop_table(&path, true).await;
+        table_dropped.unwrap();
+        let db_dropped = admin.drop_database(database, true, true).await;
+        db_dropped.unwrap();
+    }
+
+    /// Dropping a table that does not exist fails with
+    /// `FlussError::TableNotExist` unless the ignore flag is set.
+    #[tokio::test]
+    async fn test_error_table_not_exist() {
+        let shared = get_shared_cluster();
+        let conn = shared.get_fluss_connection().await;
+        let admin = conn.get_admin().unwrap();
+
+        let missing = TablePath::new("fluss", "no_such_table");
+
+        // Without the ignore flag the drop must be rejected.
+        let strict_err = admin.drop_table(&missing, false).await.unwrap_err();
+        assert_api_error(strict_err, FlussError::TableNotExist);
+
+        // With the ignore flag the same drop is accepted.
+        let lenient = admin.drop_table(&missing, true).await;
+        lenient.expect("drop_table with ignore_if_not_exists should succeed");
+    }
+
+    /// The shared cluster must report a coordinator and at least one tablet
+    /// server, and every reported node must have a host, a port and a uid.
+    #[tokio::test]
+    async fn test_get_server_nodes() {
+        let shared = get_shared_cluster();
+        let conn = shared.get_fluss_connection().await;
+        let admin = conn.get_admin().unwrap();
+
+        let servers = admin
+            .get_server_nodes()
+            .await
+            .expect("should get server nodes");
+        assert!(
+            !servers.is_empty(),
+            "Expected at least one server node in the cluster"
+        );
+
+        // A coordinator must be among the reported nodes ...
+        let coordinator_found = servers
+            .iter()
+            .any(|server| *server.server_type() == fluss::ServerType::CoordinatorServer);
+        assert!(coordinator_found, "Expected a coordinator server node");
+
+        // ... together with one or more tablet servers.
+        let tablet_servers = servers
+            .iter()
+            .filter(|server| *server.server_type() == fluss::ServerType::TabletServer)
+            .count();
+        assert!(
+            tablet_servers >= 1,
+            "Expected at least one tablet server node"
+        );
+
+        // Per-node checks on host, port and uid.
+        servers.iter().for_each(|server| {
+            assert!(
+                !server.host().is_empty(),
+                "Server node host should not be empty"
+            );
+            assert!(server.port() > 0, "Server node port should be > 0");
+            assert!(
+                !server.uid().is_empty(),
+                "Server node uid should not be empty"
+            );
+        });
+    }
+
+    /// Listing partitions of a table created without partitioning must fail
+    /// with `FlussError::TableNotPartitionedException`.
+    #[tokio::test]
+    async fn test_error_table_not_partitioned() {
+        let shared = get_shared_cluster();
+        let conn = shared.get_fluss_connection().await;
+        let admin = conn.get_admin().unwrap();
+
+        // Database used only by this test.
+        let database = "test_error_not_partitioned_db";
+        let db_descriptor = DatabaseDescriptorBuilder::default().build();
+        let db_created = admin
+            .create_database(database, Some(&db_descriptor), true)
+            .await;
+        db_created.unwrap();
+
+        // Two-column table definition.
+        let path = TablePath::new(database, "non_partitioned_table");
+        let columns = Schema::builder()
+            .column("id", DataTypes::int())
+            .column("name", DataTypes::string())
+            .build()
+            .unwrap();
+        let tbl_descriptor = TableDescriptor::builder()
+            .schema(columns)
+            .distributed_by(Some(1), vec![])
+            .property("table.replication.factor", "1")
+            .build()
+            .unwrap();
+
+        let table_created = admin.create_table(&path, &tbl_descriptor, false).await;
+        table_created.unwrap();
+
+        // Asking for partition infos of this table must be rejected.
+        let partition_err = admin.list_partition_infos(&path).await.unwrap_err();
+        assert_api_error(partition_err, FlussError::TableNotPartitionedException);
+
+        // Remove the table first, then its database.
+        let table_dropped = admin.drop_table(&path, true).await;
+        table_dropped.unwrap();
+        let db_dropped = admin.drop_database(database, true, true).await;
+        db_dropped.unwrap();
+    }
+}
diff --git a/fluss-rust/crates/fluss/tests/integration/batch_scanner.rs b/fluss-rust/crates/fluss/tests/integration/batch_scanner.rs
new file mode 100644
index 0000000000..0b484a8c66
--- /dev/null
+++ b/fluss-rust/crates/fluss/tests/integration/batch_scanner.rs
@@ -0,0 +1,338 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#[cfg(test)]
+mod batch_scanner_test {
+    use crate::integration::utils::{create_table, get_shared_cluster};
+    use arrow::array::{Int32Array, StringArray, record_batch};
+    use fluss::metadata::{DataTypes, LogFormat, Schema, TableBucket, TableDescriptor, TablePath};
+    use fluss::row::GenericRow;
+    use std::collections::HashMap;
+
+    /// End-to-end check on a log table: after appending five rows, a scanner
+    /// configured with limit 3 returns one non-empty batch of at most three
+    /// records and then `None`.
+    #[tokio::test]
+    async fn batch_scanner_returns_appended_rows_then_none() {
+        let shared = get_shared_cluster();
+        let conn = shared.get_fluss_connection().await;
+        let admin = conn.get_admin().expect("admin");
+
+        let path = TablePath::new("fluss", "test_batch_scanner_log");
+        let schema = Schema::builder()
+            .column("c1", DataTypes::int())
+            .column("c2", DataTypes::string())
+            .build()
+            .expect("schema");
+        // One bucket only, so a single BatchScanner sees every row.
+        let descriptor = TableDescriptor::builder()
+            .schema(schema)
+            .distributed_by(Some(1), vec!["c1".to_string()])
+            .build()
+            .expect("descriptor");
+        create_table(&admin, &path, &descriptor).await;
+
+        let table = conn.get_table(&path).await.expect("table");
+        let append = table.new_append().expect("append");
+        let writer = append.create_writer().expect("writer");
+
+        // Append five rows as one Arrow batch and flush them.
+        let rows = record_batch!(
+            ("c1", Int32, [1, 2, 3, 4, 5]),
+            ("c2", Utf8, ["a", "b", "c", "d", "e"])
+        )
+        .unwrap();
+        writer.append_arrow_batch(rows).expect("append batch");
+        writer.flush().await.expect("flush");
+
+        let table_id = table.get_table_info().table_id;
+        let bucket = TableBucket::new(table_id, 0);
+
+        let limited_scan = table.new_scan().limit(3).expect("limit");
+        let mut scanner = limited_scan
+            .create_bucket_batch_scanner(bucket.clone())
+            .expect("create batch scanner");
+
+        let polled = scanner.next_batch().await.expect("poll");
+        let first = polled.expect("first batch should be Some");
+
+        assert_eq!(first.bucket(), &bucket);
+        // The first response may hold fewer rows than the limit, but never
+        // more.
+        let count = first.num_records();
+        assert!(
+            count > 0 && count <= 3,
+            "expected 1..=3 records, got {}",
+            count
+        );
+
+        let after_first = scanner.next_batch().await.expect("poll");
+        assert!(after_first.is_none(), "scanner must end after one batch");
+    }
+
+    /// Limit scan over a primary-key table: the returned batch has the two
+    /// schema columns, holds at most `limit` rows, and every decoded row matches
+    /// an upserted one. Exercises the KV wire path (distinct from the log one).
+    #[tokio::test]
+    async fn batch_scanner_reads_primary_key_table() {
+        let shared = get_shared_cluster();
+        let conn = shared.get_fluss_connection().await;
+        let admin = conn.get_admin().expect("admin");
+
+        let path = TablePath::new("fluss", "test_batch_scanner_pk");
+        let schema = Schema::builder()
+            .column("id", DataTypes::int())
+            .column("name", DataTypes::string())
+            .primary_key(vec!["id"])
+            .build()
+            .expect("schema");
+        // One bucket only, so one BatchScanner sees every row.
+        let descriptor = TableDescriptor::builder()
+            .schema(schema)
+            .distributed_by(Some(1), vec!["id".to_string()])
+            .build()
+            .expect("descriptor");
+        create_table(&admin, &path, &descriptor).await;
+
+        let table = conn.get_table(&path).await.expect("table");
+        let upsert = table.new_upsert().expect("upsert");
+        let writer = upsert.create_writer().expect("writer");
+
+        // Upsert five (id, name) pairs and flush them.
+        let expected: HashMap<i32, &str> =
+            HashMap::from([(1, "a"), (2, "b"), (3, "c"), (4, "d"), (5, "e")]);
+        for (&key, &value) in &expected {
+            let mut row = GenericRow::new(2);
+            row.set_field(0, key);
+            row.set_field(1, value);
+            writer.upsert(&row).expect("upsert row");
+        }
+        writer.flush().await.expect("flush");
+
+        let table_id = table.get_table_info().table_id;
+        let bucket = TableBucket::new(table_id, 0);
+
+        let limited_scan = table.new_scan().limit(3).expect("limit");
+        let mut scanner = limited_scan
+            .create_bucket_batch_scanner(bucket.clone())
+            .expect("create batch scanner");
+
+        let polled = scanner.next_batch().await.expect("poll");
+        let first = polled.expect("first batch should be Some");
+
+        assert_eq!(first.bucket(), &bucket);
+        let record_batch = first.batch();
+        assert_eq!(record_batch.num_columns(), 2, "id + name");
+        let row_count = record_batch.num_rows();
+        assert!(
+            row_count > 0 && row_count <= 3,
+            "expected 1..=3 records, got {}",
+            row_count
+        );
+
+        // Each decoded (id, name) pair has to be one of the upserted pairs.
+        let id_col = record_batch
+            .column(0)
+            .as_any()
+            .downcast_ref::<Int32Array>()
+            .expect("id column Int32");
+        let name_col = record_batch
+            .column(1)
+            .as_any()
+            .downcast_ref::<StringArray>()
+            .expect("name column Utf8");
+        for idx in 0..row_count {
+            let id = id_col.value(idx);
+            let name = name_col.value(idx);
+            assert_eq!(
+                expected.get(&id),
+                Some(&name),
+                "decoded row ({id}, {name}) does not match upserted data"
+            );
+        }
+
+        let after_first = scanner.next_batch().await.expect("poll");
+        assert!(after_first.is_none(), "scanner must end after one batch");
+    }
+
+    /// Creating a bucket batch scanner must fail for a bucket whose table_id
+    /// differs from the table's, and for a bucket_id beyond the table's single
+    /// bucket.
+    #[tokio::test]
+    async fn batch_scanner_rejects_invalid_bucket() {
+        let shared = get_shared_cluster();
+        let conn = shared.get_fluss_connection().await;
+        let admin = conn.get_admin().expect("admin");
+
+        let path = TablePath::new("fluss", "test_batch_scanner_table_id");
+        let schema = Schema::builder()
+            .column("c1", DataTypes::int())
+            .build()
+            .expect("schema");
+        let descriptor = TableDescriptor::builder()
+            .schema(schema)
+            .distributed_by(Some(1), vec!["c1".to_string()])
+            .build()
+            .expect("descriptor");
+        create_table(&admin, &path, &descriptor).await;
+
+        let table = conn.get_table(&path).await.expect("table");
+        let table_id = table.get_table_info().table_id;
+
+        // Bucket that names a different table.
+        let foreign_table_rejected = table
+            .new_scan()
+            .limit(1)
+            .expect("limit")
+            .create_bucket_batch_scanner(TableBucket::new(table_id + 9999, 0))
+            .is_err();
+        assert!(foreign_table_rejected, "must reject mismatched table_id");
+
+        // Bucket id beyond the only bucket this table has.
+        let out_of_range_rejected = table
+            .new_scan()
+            .limit(1)
+            .expect("limit")
+            .create_bucket_batch_scanner(TableBucket::new(table_id, 99))
+            .is_err();
+        assert!(out_of_range_rejected, "must reject out-of-range bucket_id");
+    }
+
+    /// Creating a bucket batch scanner on a log table whose format is
+    /// `LogFormat::INDEXED` must fail (the log path decodes Arrow IPC).
+    #[tokio::test]
+    async fn batch_scanner_rejects_non_arrow_log_format() {
+        let shared = get_shared_cluster();
+        let conn = shared.get_fluss_connection().await;
+        let admin = conn.get_admin().expect("admin");
+
+        let path = TablePath::new("fluss", "test_batch_scanner_indexed");
+        let schema = Schema::builder()
+            .column("c1", DataTypes::int())
+            .build()
+            .expect("schema");
+        let descriptor = TableDescriptor::builder()
+            .schema(schema)
+            .log_format(LogFormat::INDEXED)
+            .distributed_by(Some(1), vec!["c1".to_string()])
+            .build()
+            .expect("descriptor");
+        create_table(&admin, &path, &descriptor).await;
+
+        let table = conn.get_table(&path).await.expect("table");
+        let table_id = table.get_table_info().table_id;
+        let bucket = TableBucket::new(table_id, 0);
+
+        let rejected = table
+            .new_scan()
+            .limit(1)
+            .expect("limit")
+            .create_bucket_batch_scanner(bucket)
+            .is_err();
+        assert!(rejected, "must reject INDEXED log format");
+    }
+
+    /// `.limit(n)` must reject non-positive values before any scanner is built.
+    #[tokio::test]
+    async fn batch_scanner_rejects_non_positive_limit() {
+        let cluster = get_shared_cluster();
+        let connection = cluster.get_fluss_connection().await;
+        let admin = connection.get_admin().expect("admin");
+
+        let table_path = TablePath::new("fluss", "test_batch_scanner_bad_limit");
+        let descriptor = TableDescriptor::builder()
+            .schema(
+                Schema::builder()
+                    .column("c1", DataTypes::int())
+                    .build()
+                    .expect("schema"),
+            )
+            .distributed_by(Some(1), vec!["c1".to_string()])
+            .build()
+            .expect("descriptor");
+        create_table(&admin, &table_path, &descriptor).await;
+
+        let table = connection.get_table(&table_path).await.expect("table");
+        assert!(table.new_scan().limit(0).is_err());
+        assert!(table.new_scan().limit(-5).is_err());
+    }
+
+    /// A configured limit must be rejected by the log scanners rather than
+    /// silently ignored.
+    #[tokio::test]
+    async fn limit_is_rejected_by_log_scanners() {
+        let cluster = get_shared_cluster();
+        let connection = cluster.get_fluss_connection().await;
+        let admin = connection.get_admin().expect("admin");
+
+        let table_path = TablePath::new("fluss", "test_batch_scanner_limit_logscan");
+        let descriptor = TableDescriptor::builder()
+            .schema(
+                Schema::builder()
+                    .column("c1", DataTypes::int())
+                    .build()
+                    .expect("schema"),
+            )
+            .distributed_by(Some(1), vec!["c1".to_string()])
+            .build()
+            .expect("descriptor");
+        create_table(&admin, &table_path, &descriptor).await;
+
+        let table = connection.get_table(&table_path).await.expect("table");
+        assert!(
+            table
+                .new_scan()
+                .limit(5)
+                .expect("limit")
+                .create_log_scanner()
+                .is_err(),
+            "create_log_scanner must reject a configured limit"
+        );
+        assert!(
+            table
+                .new_scan()
+                .limit(5)
+                .expect("limit")
+                .create_record_batch_log_scanner()
+                .is_err(),
+            "create_record_batch_log_scanner must reject a configured limit"
+        );
+    }
+}
diff --git a/fluss-rust/crates/fluss/tests/integration/fluss_cluster.rs b/fluss-rust/crates/fluss/tests/integration/fluss_cluster.rs
new file mode 100644
index 0000000000..0860be5d74
--- /dev/null
+++ b/fluss-rust/crates/fluss/tests/integration/fluss_cluster.rs
@@ -0,0 +1,18 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+pub use fluss_test_cluster::{FlussTestingCluster, FlussTestingClusterBuilder};
diff --git a/fluss-rust/crates/fluss/tests/integration/kv_table.rs b/fluss-rust/crates/fluss/tests/integration/kv_table.rs
new file mode 100644
index 0000000000..4da7c75d5b
--- /dev/null
+++ b/fluss-rust/crates/fluss/tests/integration/kv_table.rs
@@ -0,0 +1,1772 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#[cfg(test)]
+mod kv_table_test {
+    use crate::integration::utils::{
+        ColumnPlan, array_dt_basics_columns, as_row_type, create_partitions, create_table,
+        dt_array_int, dt_map_string_int, dt_row_seq_label, get_shared_cluster, make_int_array,
+        make_string_array, map_dt_basics_columns, row_dt_basics_columns, scalar_dt_columns,
+    };
+    use fluss::client::TableUpsert;
+    use fluss::metadata::{DataField, DataTypes, Schema, TableDescriptor, TablePath};
+    use fluss::row::binary_array::FlussArrayWriter;
+    use fluss::row::binary_map::FlussMapWriter;
+    use fluss::row::{
+        Date, Datum, Decimal, GenericRow, InternalRow, Time, TimestampLtz, TimestampNtz,
+    };
+    use futures::stream::{FuturesUnordered, StreamExt};
+
+    /// Builds a three-field row whose field 0 is `id`; the remaining fields
+    /// are left as `GenericRow::new` created them.
+    fn make_key(id: i32) -> GenericRow<'static> {
+        const FIELD_COUNT: usize = 3;
+        make_key_with_field_count(id, FIELD_COUNT)
+    }
+
+    /// Builds a row of `field_count` fields whose field 0 is `id`; no other
+    /// field is written.
+    fn make_key_with_field_count(id: i32, field_count: usize) -> GenericRow<'static> {
+        let mut key = GenericRow::new(field_count);
+        key.set_field(0, id);
+        key
+    }
+
+    /// Walks one primary-key table through upsert, lookup, update, delete and
+    /// a lookup of a key that was never written, then drops the table.
+    #[tokio::test]
+    async fn upsert_delete_and_lookup() {
+        let shared = get_shared_cluster();
+        let conn = shared.get_fluss_connection().await;
+        let admin = conn.get_admin().unwrap();
+
+        let path = TablePath::new("fluss", "test_upsert_and_lookup");
+        let schema = Schema::builder()
+            .column("id", DataTypes::int())
+            .column("name", DataTypes::string())
+            .column("age", DataTypes::bigint())
+            .primary_key(vec!["id"])
+            .build()
+            .expect("Failed to build schema");
+        let descriptor = TableDescriptor::builder()
+            .schema(schema)
+            .build()
+            .expect("Failed to build table");
+        create_table(&admin, &path, &descriptor).await;
+
+        let table = conn.get_table(&path).await.unwrap();
+        let upsert = table.new_upsert().expect("Failed to create upsert");
+        let writer = upsert.create_writer().expect("Failed to create writer");
+
+        let seed_rows = [(1, "Verso", 32i64), (2, "Noco", 25), (3, "Esquie", 35)];
+
+        // Queue every row without awaiting each one, then flush once.
+        for &(id, name, age) in &seed_rows {
+            let mut row = GenericRow::new(3);
+            row.set_field(0, id);
+            row.set_field(1, name);
+            row.set_field(2, age);
+            writer.upsert(&row).expect("Failed to upsert row");
+        }
+        writer.flush().await.expect("Failed to flush");
+
+        // One lookuper serves every lookup below.
+        let lookup = table.new_lookup().expect("Failed to create lookup");
+        let mut lookuper = lookup
+            .create_lookuper()
+            .expect("Failed to create lookuper");
+
+        // Each seeded row must come back with all three fields intact.
+        for &(id, expected_name, expected_age) in &seed_rows {
+            let key = make_key(id);
+            let found = lookuper.lookup(&key).await.expect("Failed to lookup");
+            let row = found.get_single_row().unwrap().expect("Row should exist");
+
+            assert_eq!(row.get_int(0).unwrap(), id, "id mismatch");
+            assert_eq!(row.get_string(1).unwrap(), expected_name, "name mismatch");
+            assert_eq!(row.get_long(2).unwrap(), expected_age, "age mismatch");
+        }
+
+        // Rewrite row 1 with a new age and wait for the acknowledgment.
+        let mut newer = GenericRow::new(3);
+        newer.set_field(0, 1);
+        newer.set_field(1, "Verso");
+        newer.set_field(2, 33i64);
+        let update_ack = writer
+            .upsert(&newer)
+            .expect("Failed to upsert updated row");
+        update_ack
+            .await
+            .expect("Failed to wait for upsert acknowledgment");
+
+        // The stored row must now match the rewritten one.
+        let after_update = lookuper
+            .lookup(&make_key(1))
+            .await
+            .expect("Failed to lookup after update");
+        let stored = after_update
+            .get_single_row()
+            .unwrap()
+            .expect("Row should exist");
+        assert_eq!(
+            stored.get_long(2).unwrap(),
+            newer.get_long(2).unwrap(),
+            "Age should be updated"
+        );
+        assert_eq!(
+            stored.get_string(1).unwrap(),
+            newer.get_string(1).unwrap(),
+            "Name should remain unchanged"
+        );
+
+        // Delete row 1 (only the key field is set) and wait for the
+        // acknowledgment.
+        let mut doomed = GenericRow::new(3);
+        doomed.set_field(0, 1);
+        let delete_ack = writer.delete(&doomed).expect("Failed to delete");
+        delete_ack
+            .await
+            .expect("Failed to wait for delete acknowledgment");
+
+        // Row 1 must be gone ...
+        let after_delete = lookuper
+            .lookup(&make_key(1))
+            .await
+            .expect("Failed to lookup deleted record");
+        let deleted_is_gone = after_delete.get_single_row().unwrap().is_none();
+        assert!(deleted_is_gone, "Record 1 should not exist after delete");
+
+        // ... while rows 2 and 3 are untouched.
+        for survivor_id in [2, 3] {
+            let found = lookuper
+                .lookup(&make_key(survivor_id))
+                .await
+                .expect("Failed to lookup");
+            let still_there = found.get_single_row().unwrap().is_some();
+            assert!(
+                still_there,
+                "Record {} should still exist after deleting record 1",
+                survivor_id
+            );
+        }
+
+        // A key that was never written yields no row.
+        let unknown = lookuper
+            .lookup(&make_key(999))
+            .await
+            .expect("Failed to lookup non-existent key");
+        let unknown_is_absent = unknown.get_single_row().unwrap().is_none();
+        assert!(unknown_is_absent, "Non-existent key should return None");
+
+        let dropped = admin.drop_table(&path, false).await;
+        dropped.expect("Failed to drop table");
+    }
+
+    /// Upserts, looks up and updates rows in a table whose primary key is the
+    /// pair (region, user_id), then drops the table.
+    #[tokio::test]
+    async fn composite_primary_keys() {
+        /// Builds a three-field row with only the two key fields set.
+        fn composite_key(region: &'static str, user_id: i32) -> GenericRow<'static> {
+            let mut key = GenericRow::new(3);
+            key.set_field(0, region);
+            key.set_field(1, user_id);
+            key
+        }
+
+        let shared = get_shared_cluster();
+        let conn = shared.get_fluss_connection().await;
+        let admin = conn.get_admin().unwrap();
+
+        let path = TablePath::new("fluss", "test_composite_pk");
+        let schema = Schema::builder()
+            .column("region", DataTypes::string())
+            .column("user_id", DataTypes::int())
+            .column("score", DataTypes::bigint())
+            .primary_key(vec!["region", "user_id"])
+            .build()
+            .expect("Failed to build schema");
+        let descriptor = TableDescriptor::builder()
+            .schema(schema)
+            .build()
+            .expect("Failed to build table");
+        create_table(&admin, &path, &descriptor).await;
+
+        let table = conn.get_table(&path).await.unwrap();
+        let upsert = table.new_upsert().expect("Failed to create upsert");
+        let writer = upsert.create_writer().expect("Failed to create writer");
+
+        // Four rows covering two regions and two user ids.
+        let seed_rows = [
+            ("US", 1, 100i64),
+            ("US", 2, 200i64),
+            ("EU", 1, 150i64),
+            ("EU", 2, 250i64),
+        ];
+        for &(region, user_id, score) in &seed_rows {
+            let mut row = GenericRow::new(3);
+            row.set_field(0, region);
+            row.set_field(1, user_id);
+            row.set_field(2, score);
+            writer.upsert(&row).expect("Failed to upsert");
+        }
+        writer.flush().await.expect("Failed to flush");
+
+        // One lookuper serves every lookup below.
+        let lookup = table.new_lookup().expect("Failed to create lookup");
+        let mut lookuper = lookup
+            .create_lookuper()
+            .expect("Failed to create lookuper");
+
+        // (US, 1) was written with score 100.
+        let us_1 = composite_key("US", 1);
+        let found = lookuper.lookup(&us_1).await.expect("Failed to lookup");
+        let row = found.get_single_row().unwrap().expect("Row should exist");
+        assert_eq!(
+            row.get_long(2).unwrap(),
+            100,
+            "Score for (US, 1) should be 100"
+        );
+
+        // (EU, 2) was written with score 250.
+        let eu_2 = composite_key("EU", 2);
+        let found = lookuper.lookup(&eu_2).await.expect("Failed to lookup");
+        let row = found.get_single_row().unwrap().expect("Row should exist");
+        assert_eq!(
+            row.get_long(2).unwrap(),
+            250,
+            "Score for (EU, 2) should be 250"
+        );
+
+        // Rewrite (US, 1) with a new score and wait for the acknowledgment.
+        let mut rescored = GenericRow::new(3);
+        rescored.set_field(0, "US");
+        rescored.set_field(1, 1);
+        rescored.set_field(2, 500i64);
+        let update_ack = writer.upsert(&rescored).expect("Failed to update");
+        update_ack
+            .await
+            .expect("Failed to wait for update acknowledgment");
+
+        // The stored score must now match the rewritten row.
+        let us_1_again = composite_key("US", 1);
+        let found = lookuper
+            .lookup(&us_1_again)
+            .await
+            .expect("Failed to lookup");
+        let row = found.get_single_row().unwrap().expect("Row should exist");
+        assert_eq!(
+            row.get_long(2).unwrap(),
+            rescored.get_long(2).unwrap(),
+            "Row score should be updated"
+        );
+
+        let dropped = admin.drop_table(&path, false).await;
+        dropped.expect("Failed to drop table");
+    }
+
+    /// Partial update overwrites only the targeted columns; all other columns keep their stored values.
+    #[tokio::test]
+    async fn partial_update() {
+        let cluster = get_shared_cluster();
+        let connection = cluster.get_fluss_connection().await;
+        let admin = connection.get_admin().expect("Failed to get admin");
+
+        let table_path = TablePath::new("fluss", "test_partial_update");
+
+        let nested_type = DataTypes::row(vec![
+            DataField::new("seq", DataTypes::int(), None),
+            DataField::new("label", DataTypes::string(), None),
+        ]);
+
+        let table_descriptor = TableDescriptor::builder()
+            .schema(
+                Schema::builder()
+                    .column("id", DataTypes::int())
+                    .column("name", DataTypes::string())
+                    .column("score", DataTypes::bigint())
+                    .column("nested", nested_type)
+                    .column(
+                        "attrs",
+                        DataTypes::map(DataTypes::string(), DataTypes::int()),
+                    )
+                    .column("tags", DataTypes::array(DataTypes::string()))
+                    .primary_key(vec!["id"])
+                    .build()
+                    .expect("schema"),
+            )
+            .build()
+            .expect("table descriptor");
+
+        create_table(&admin, &table_path, &table_descriptor).await;
+
+        let table = connection.get_table(&table_path).await.expect("table");
+        let table_upsert = table.new_upsert().expect("upsert");
+        let upsert_writer = table_upsert.create_writer().expect("writer");
+
+        let mut nested0 = GenericRow::new(2); // initial value of the `nested` ROW column
+        nested0.set_field(0, 10_i32);
+        nested0.set_field(1, "alpha");
+        let attrs0 = { // initial two-entry MAP: {"a": 1, "b": 2}
+            let mut w = FlussMapWriter::new(2, &DataTypes::string(), &DataTypes::int());
+            w.write_entry("a".into(), 1.into()).unwrap();
+            w.write_entry("b".into(), 2.into()).unwrap();
+            w.complete().expect("attrs0")
+        };
+        let tags0 = make_string_array(&[Some("alpha-tag"), Some("beta-tag")]); // initial two-element ARRAY
+
+        let mut row = GenericRow::new(6); // full row: id, name, score, nested, attrs, tags
+        row.set_field(0, 1);
+        row.set_field(1, "Verso");
+        row.set_field(2, 100i64);
+        row.set_field(3, Datum::Row(Box::new(nested0)));
+        row.set_field(4, Datum::Map(attrs0));
+        row.set_field(5, tags0);
+        upsert_writer
+            .upsert(&row)
+            .expect("upsert initial")
+            .await
+            .expect("ack initial");
+
+        let mut lookuper = table
+            .new_lookup()
+            .expect("lookup")
+            .create_lookuper()
+            .expect("lookuper");
+
+        // Helper: creates a writer restricted to `cols`, upserts `row` through it and awaits the result.
+        async fn partial_upsert(table_upsert: &TableUpsert, cols: &[&str], row: GenericRow<'_>) {
+            let pu = table_upsert
+                .partial_update_with_column_names(cols)
+                .expect("partial upsert");
+            let pw = pu.create_writer().expect("partial writer");
+            pw.upsert(&row)
+                .expect("partial upsert")
+                .await
+                .expect("partial ack");
+        }
+
+        // === Partial update on the scalar `score` column — name and compound columns preserved ===
+        let mut p1 = GenericRow::new(6);
+        p1.set_field(0, 1);
+        p1.set_field(1, Datum::Null); // columns outside the target set are left Null in the partial row
+        p1.set_field(2, 420i64);
+        p1.set_field(3, Datum::Null);
+        p1.set_field(4, Datum::Null);
+        p1.set_field(5, Datum::Null);
+        partial_upsert(&table_upsert, &["id", "score"], p1).await;
+
+        let result = lookuper.lookup(&make_key(1)).await.expect("lookup");
+        let r = result
+            .get_single_row()
+            .expect("get row")
+            .expect("row exists");
+        assert_eq!(r.get_string(1).unwrap(), "Verso", "name preserved");
+        assert_eq!(r.get_long(2).unwrap(), 420, "score updated");
+        let n = r.get_row(3).unwrap();
+        assert_eq!(n.get_int(0).unwrap(), 10, "ROW preserved");
+        assert_eq!(r.get_map(4).unwrap().size(), 2, "MAP preserved");
+        assert_eq!(r.get_array(5).unwrap().size(), 2, "ARRAY preserved");
+
+        // === Partial update on the ROW column `nested` — score from the previous step preserved ===
+        let mut new_nested = GenericRow::new(2);
+        new_nested.set_field(0, 99_i32);
+        new_nested.set_field(1, "omega");
+        let mut p2 = GenericRow::new(6);
+        p2.set_field(0, 1);
+        p2.set_field(1, Datum::Null);
+        p2.set_field(2, Datum::Null);
+        p2.set_field(3, Datum::Row(Box::new(new_nested)));
+        p2.set_field(4, Datum::Null);
+        p2.set_field(5, Datum::Null);
+        partial_upsert(&table_upsert, &["id", "nested"], p2).await;
+
+        let result = lookuper.lookup(&make_key(1)).await.expect("lookup");
+        let r = result
+            .get_single_row()
+            .expect("get row")
+            .expect("row exists");
+        assert_eq!(r.get_string(1).unwrap(), "Verso", "name preserved");
+        assert_eq!(r.get_long(2).unwrap(), 420, "score preserved");
+        let n = r.get_row(3).unwrap();
+        assert_eq!(n.get_int(0).unwrap(), 99);
+        assert_eq!(n.get_string(1).unwrap(), "omega");
+        assert_eq!(r.get_map(4).unwrap().size(), 2, "MAP preserved");
+        assert_eq!(r.get_array(5).unwrap().size(), 2, "ARRAY preserved");
+
+        // === Partial update on the MAP column `attrs` — expected to hold only the new entry afterwards ===
+        let new_attrs = {
+            let mut w = FlussMapWriter::new(1, &DataTypes::string(), &DataTypes::int());
+            w.write_entry("z".into(), 99.into()).unwrap();
+            w.complete().expect("new_attrs")
+        };
+        let mut p3 = GenericRow::new(6);
+        p3.set_field(0, 1);
+        p3.set_field(1, Datum::Null);
+        p3.set_field(2, Datum::Null);
+        p3.set_field(3, Datum::Null);
+        p3.set_field(4, Datum::Map(new_attrs));
+        p3.set_field(5, Datum::Null);
+        partial_upsert(&table_upsert, &["id", "attrs"], p3).await;
+
+        let result = lookuper.lookup(&make_key(1)).await.expect("lookup");
+        let r = result
+            .get_single_row()
+            .expect("get row")
+            .expect("row exists");
+        assert_eq!(r.get_string(1).unwrap(), "Verso", "name preserved");
+        let n = r.get_row(3).unwrap();
+        assert_eq!(n.get_int(0).unwrap(), 99, "ROW preserved");
+        let m = r.get_map(4).unwrap();
+        assert_eq!(m.size(), 1);
+        assert_eq!(m.get(&Datum::from("z")).unwrap(), Some(Datum::from(99_i32)));
+        assert_eq!(r.get_array(5).unwrap().size(), 2, "ARRAY preserved");
+
+        // === Partial update on the ARRAY column `tags` — expected to hold only the new element afterwards ===
+        let new_tags = make_string_array(&[Some("gamma-tag")]);
+        let mut p4 = GenericRow::new(6);
+        p4.set_field(0, 1);
+        p4.set_field(1, Datum::Null);
+        p4.set_field(2, Datum::Null);
+        p4.set_field(3, Datum::Null);
+        p4.set_field(4, Datum::Null);
+        p4.set_field(5, new_tags);
+        partial_upsert(&table_upsert, &["id", "tags"], p4).await;
+
+        let result = lookuper.lookup(&make_key(1)).await.expect("lookup");
+        let r = result
+            .get_single_row()
+            .expect("get row")
+            .expect("row exists");
+        assert_eq!(r.get_string(1).unwrap(), "Verso", "name preserved");
+        let n = r.get_row(3).unwrap();
+        assert_eq!(n.get_int(0).unwrap(), 99, "ROW preserved");
+        assert_eq!(r.get_map(4).unwrap().size(), 1, "MAP preserved");
+        let a = r.get_array(5).unwrap();
+        assert_eq!(a.size(), 1);
+        assert_eq!(a.get_string(0).unwrap(), "gamma-tag");
+
+        admin.drop_table(&table_path, false).await.expect("drop");
+    }
+
+    /// Partitioned KV table: upsert, lookup, update and delete rows that carry ROW, MAP and ARRAY columns.
+    #[tokio::test]
+    async fn partitioned_table_upsert_and_lookup() {
+        let cluster = get_shared_cluster();
+        let connection = cluster.get_fluss_connection().await;
+        let admin = connection.get_admin().expect("Failed to get admin");
+
+        let table_path = TablePath::new("fluss", "test_partitioned_kv_table");
+
+        let nested_type = DataTypes::row(vec![
+            DataField::new("seq", DataTypes::int(), None),
+            DataField::new("label", DataTypes::string(), None),
+        ]);
+
+        let table_descriptor = TableDescriptor::builder()
+            .schema(
+                Schema::builder()
+                    .column("region", DataTypes::string())
+                    .column("user_id", DataTypes::int())
+                    .column("name", DataTypes::string())
+                    .column("score", DataTypes::bigint())
+                    .column("nested", nested_type)
+                    .column(
+                        "attrs",
+                        DataTypes::map(DataTypes::string(), DataTypes::int()),
+                    )
+                    .column("tags", DataTypes::array(DataTypes::string()))
+                    .primary_key(vec!["region", "user_id"])
+                    .build()
+                    .expect("schema"),
+            )
+            .partitioned_by(vec!["region"])
+            .build()
+            .expect("table descriptor");
+
+        create_table(&admin, &table_path, &table_descriptor).await;
+        create_partitions(&admin, &table_path, "region", &["US", "EU", "APAC"]).await;
+
+        let table = connection.get_table(&table_path).await.expect("table");
+        let table_upsert = table.new_upsert().expect("upsert");
+        let upsert_writer = table_upsert.create_writer().expect("writer");
+
+        let test_data = [
+            ("US", 1_i32, "Gustave", 100_i64, 11_i32, "a", 1_i32, "alpha"),
+            ("US", 2, "Lune", 200, 22, "b", 2, "beta"),
+            ("EU", 1, "Sciel", 150, 33, "c", 3, "gamma"),
+            ("EU", 2, "Maelle", 250, 44, "d", 4, "delta"),
+            ("APAC", 1, "Noco", 300, 55, "e", 5, "epsilon"),
+        ];
+
+        for (region, user_id, name, score, seq, label, attr_v, tag) in &test_data {
+            let mut nested = GenericRow::new(2);
+            nested.set_field(0, *seq);
+            nested.set_field(1, *label);
+            let attrs = {
+                let mut w = FlussMapWriter::new(1, &DataTypes::string(), &DataTypes::int());
+                w.write_entry((*label).into(), (*attr_v).into()).unwrap(); // label doubles as the map key
+                w.complete().expect("attrs")
+            };
+            let tags = make_string_array(&[Some(*tag)]);
+
+            let mut row = GenericRow::new(7);
+            row.set_field(0, *region);
+            row.set_field(1, *user_id);
+            row.set_field(2, *name);
+            row.set_field(3, *score);
+            row.set_field(4, Datum::Row(Box::new(nested)));
+            row.set_field(5, Datum::Map(attrs));
+            row.set_field(6, tags);
+            upsert_writer.upsert(&row).expect("upsert"); // result not awaited here; flush() after the loop is awaited instead
+        }
+        upsert_writer.flush().await.expect("flush");
+
+        let mut lookuper = table
+            .new_lookup()
+            .expect("lookup")
+            .create_lookuper()
+            .expect("lookuper");
+
+        // === Look up every inserted row by primary key and verify scalar and compound columns ===
+        for (region, user_id, name, score, seq, label, attr_v, tag) in &test_data {
+            let mut key = GenericRow::new(7); // only the primary-key fields (0 and 1) are set
+            key.set_field(0, *region);
+            key.set_field(1, *user_id);
+
+            let result = lookuper.lookup(&key).await.expect("lookup");
+            let row = result
+                .get_single_row()
+                .expect("get row")
+                .expect("row exists");
+
+            assert_eq!(row.get_string(0).unwrap(), *region);
+            assert_eq!(row.get_int(1).unwrap(), *user_id);
+            assert_eq!(row.get_string(2).unwrap(), *name);
+            assert_eq!(row.get_long(3).unwrap(), *score);
+            let nested = row.get_row(4).unwrap();
+            assert_eq!(nested.get_int(0).unwrap(), *seq);
+            assert_eq!(nested.get_string(1).unwrap(), *label);
+            let attrs = row.get_map(5).unwrap();
+            assert_eq!(attrs.size(), 1);
+            assert_eq!(
+                attrs.get(&Datum::from(*label)).unwrap(),
+                Some(Datum::from(*attr_v))
+            );
+            let tags = row.get_array(6).unwrap();
+            assert_eq!(tags.size(), 1);
+            assert_eq!(tags.get_string(0).unwrap(), *tag);
+        }
+
+        // === Overwrite row ("US", 1) with new values in every non-key column ===
+        let mut updated_nested = GenericRow::new(2);
+        updated_nested.set_field(0, 999_i32);
+        updated_nested.set_field(1, "updated");
+        let updated_attrs = {
+            let mut w = FlussMapWriter::new(1, &DataTypes::string(), &DataTypes::int());
+            w.write_entry("u".into(), 999.into()).unwrap();
+            w.complete().expect("updated_attrs")
+        };
+        let updated_tags = make_string_array(&[Some("renamed")]);
+        let mut updated_row = GenericRow::new(7);
+        updated_row.set_field(0, "US");
+        updated_row.set_field(1, 1);
+        updated_row.set_field(2, "Gustave Updated");
+        updated_row.set_field(3, 999_i64);
+        updated_row.set_field(4, Datum::Row(Box::new(updated_nested)));
+        updated_row.set_field(5, Datum::Map(updated_attrs));
+        updated_row.set_field(6, updated_tags);
+        upsert_writer
+            .upsert(&updated_row)
+            .expect("upsert updated")
+            .await
+            .expect("ack updated");
+
+        let mut key = GenericRow::new(7);
+        key.set_field(0, "US");
+        key.set_field(1, 1);
+        let result = lookuper.lookup(&key).await.expect("lookup");
+        let row = result
+            .get_single_row()
+            .expect("get row")
+            .expect("row exists");
+        assert_eq!(row.get_string(2).unwrap(), "Gustave Updated");
+        assert_eq!(row.get_long(3).unwrap(), 999);
+        let nested = row.get_row(4).unwrap();
+        assert_eq!(nested.get_int(0).unwrap(), 999);
+        let attrs = row.get_map(5).unwrap();
+        assert_eq!(
+            attrs.get(&Datum::from("u")).unwrap(),
+            Some(Datum::from(999_i32))
+        );
+        let tags = row.get_array(6).unwrap();
+        assert_eq!(tags.get_string(0).unwrap(), "renamed");
+
+        // === Lookup in a partition that was never created ("UNKNOWN_REGION") returns None ===
+        let mut missing = GenericRow::new(7);
+        missing.set_field(0, "UNKNOWN_REGION");
+        missing.set_field(1, 1);
+        let result = lookuper
+            .lookup(&missing)
+            .await
+            .expect("lookup unknown partition");
+        assert!(result.get_single_row().expect("get").is_none());
+
+        // === Delete row ("EU", 1); a subsequent lookup of that key must return None ===
+        let mut delete_key = GenericRow::new(7);
+        delete_key.set_field(0, "EU");
+        delete_key.set_field(1, 1);
+        upsert_writer
+            .delete(&delete_key)
+            .expect("delete")
+            .await
+            .expect("ack delete");
+        let mut key = GenericRow::new(7);
+        key.set_field(0, "EU");
+        key.set_field(1, 1);
+        let result = lookuper.lookup(&key).await.expect("lookup");
+        assert!(result.get_single_row().expect("get").is_none());
+
+        // === Sibling row ("EU", 2) in the same partition is untouched by the delete ===
+        let mut key = GenericRow::new(7);
+        key.set_field(0, "EU");
+        key.set_field(1, 2);
+        let result = lookuper.lookup(&key).await.expect("lookup");
+        let row = result
+            .get_single_row()
+            .expect("get row")
+            .expect("row exists");
+        assert_eq!(row.get_string(2).unwrap(), "Maelle");
+        assert_eq!(row.get_array(6).unwrap().get_string(0).unwrap(), "delta");
+
+        admin.drop_table(&table_path, false).await.expect("drop");
+    }
+
+    /// Integration test for concurrent batched lookups across partitions.
+    ///
+    /// Writes `ROWS_PER_REGION` rows into each of the `US`, `EU` and `APAC`
+    /// partitions, then drives one lookuper per row through a single
+    /// `FuturesUnordered` and checks that every lookup returns the row that
+    /// was written for its key.
+    #[tokio::test]
+    async fn batched_concurrent_lookups_partitioned() {
+        // Number of ids (0..ROWS_PER_REGION) written to, and looked up in, each partition.
+        const ROWS_PER_REGION: usize = 5;
+
+        let cluster = get_shared_cluster();
+        let connection = cluster.get_fluss_connection().await;
+        let admin = connection.get_admin().expect("Failed to get admin");
+        let table_path = TablePath::new("fluss", "test_batched_lookups_partitioned");
+
+        let schema = Schema::builder()
+            .column("region", DataTypes::string())
+            .column("id", DataTypes::int())
+            .column("name", DataTypes::string())
+            .primary_key(vec!["region", "id"])
+            .build()
+            .expect("Failed to build schema");
+        let table_descriptor = TableDescriptor::builder()
+            .schema(schema)
+            .partitioned_by(vec!["region"])
+            .build()
+            .expect("Failed to build table");
+
+        create_table(&admin, &table_path, &table_descriptor).await;
+        create_partitions(&admin, &table_path, "region", &["US", "EU", "APAC"]).await;
+
+        // A connection is fetched again after the partitions exist, as in the original flow.
+        let connection = cluster.get_fluss_connection().await;
+        let table = connection
+            .get_table(&table_path)
+            .await
+            .expect("Failed to get table");
+
+        // Populate every partition; the per-row results are not awaited, the flush is.
+        let writer = table
+            .new_upsert()
+            .expect("Failed to create upsert")
+            .create_writer()
+            .expect("Failed to create writer");
+
+        let regions = ["US", "EU", "APAC"];
+        for &region in &regions {
+            for id in 0..ROWS_PER_REGION as i32 {
+                let mut record = GenericRow::new(3);
+                record.set_field(0, region);
+                record.set_field(1, id);
+                record.set_field(2, format!("{}-{}", region, id));
+                writer.upsert(&record).expect("Failed to upsert");
+            }
+        }
+        writer.flush().await.expect("Failed to flush");
+
+        // One dedicated lookuper per (region, id) pair so all lookups can be in flight together.
+        let total_lookups = regions.len() * ROWS_PER_REGION;
+        let mut lookupers = Vec::with_capacity(total_lookups);
+        for _ in 0..total_lookups {
+            lookupers.push(
+                table
+                    .new_lookup()
+                    .expect("Failed to create lookup")
+                    .create_lookuper()
+                    .expect("Failed to create lookuper"),
+            );
+        }
+
+        let mut in_flight = FuturesUnordered::new();
+        for (slot, lookuper) in lookupers.iter_mut().enumerate() {
+            // Slot index maps to a partition (quotient) and an id within it (remainder).
+            let region = regions[slot / ROWS_PER_REGION];
+            let id = (slot % ROWS_PER_REGION) as i32;
+
+            in_flight.push(async move {
+                let mut key = GenericRow::new(3);
+                key.set_field(0, region);
+                key.set_field(1, id);
+
+                let found = lookuper.lookup(&key).await.expect("Failed to lookup");
+                let row = found
+                    .get_single_row()
+                    .expect("Failed to get row")
+                    .expect("Row should exist");
+
+                assert_eq!(row.get_string(0).unwrap(), region, "region mismatch");
+                assert_eq!(row.get_int(1).unwrap(), id, "id mismatch");
+                assert_eq!(
+                    row.get_string(2).unwrap(),
+                    format!("{}-{}", region, id),
+                    "name mismatch"
+                );
+
+                (region.to_string(), id)
+            });
+        }
+
+        // Drain the futures in completion order.
+        let mut completed = Vec::new();
+        while let Some(outcome) = in_flight.next().await {
+            completed.push(outcome);
+        }
+
+        assert_eq!(completed.len(), total_lookups, "Not all lookups completed");
+
+        // Every partition must have contributed exactly its share of results.
+        for region in &regions {
+            let hits = completed.iter().filter(|(r, _)| r == region).count();
+            assert_eq!(
+                hits, ROWS_PER_REGION,
+                "Expected 5 results for region {}",
+                region
+            );
+        }
+
+        admin
+            .drop_table(&table_path, false)
+            .await
+            .expect("Failed to drop table");
+    }
+
+    /// Prefix lookup on a non-partitioned table with primary key `(a, b, c)`
+    /// and bucket key `(a, b)`.
+    ///
+    /// The values used for `b` are longer than 7 characters to force the
+    /// encoder's variable-length area, which is where prefix-key and
+    /// primary-key byte layouts diverge.
+    #[tokio::test]
+    async fn prefix_lookup() {
+        // Value for column `b`, deliberately longer than 7 characters.
+        const LONG_B: &str = "aaaaaaaaa";
+
+        /// Builds the two-field `(a, b)` prefix key.
+        fn prefix_key(a: i32, b: &str) -> GenericRow<'_> {
+            let mut key = GenericRow::new(2);
+            key.set_field(0, a);
+            key.set_field(1, b);
+            key
+        }
+
+        let cluster = get_shared_cluster();
+        let connection = cluster.get_fluss_connection().await;
+        let admin = connection.get_admin().expect("Failed to get admin");
+        let table_path = TablePath::new("fluss", "test_prefix_lookup");
+
+        let schema = Schema::builder()
+            .column("a", DataTypes::int())
+            .column("b", DataTypes::string())
+            .column("c", DataTypes::bigint())
+            .column("d", DataTypes::string())
+            .primary_key(vec!["a", "b", "c"])
+            .build()
+            .expect("Failed to build schema");
+        let table_descriptor = TableDescriptor::builder()
+            .schema(schema)
+            .distributed_by(Some(3), vec!["a".to_string(), "b".to_string()])
+            .build()
+            .expect("Failed to build table");
+
+        create_table(&admin, &table_path, &table_descriptor).await;
+
+        let table = connection
+            .get_table(&table_path)
+            .await
+            .expect("Failed to get table");
+
+        let writer = table
+            .new_upsert()
+            .expect("Failed to create upsert")
+            .create_writer()
+            .expect("Failed to create writer");
+
+        // Three rows share the prefix (1, LONG_B); one row has the prefix (2, LONG_B).
+        let seed_rows: &[(i32, &str, i64, &str)] = &[
+            (1, LONG_B, 1, "value1"),
+            (1, LONG_B, 2, "value2"),
+            (1, LONG_B, 3, "value3"),
+            (2, LONG_B, 4, "value4"),
+        ];
+        for &(a, b, c, d) in seed_rows {
+            let mut record = GenericRow::new(4);
+            record.set_field(0, a);
+            record.set_field(1, b);
+            record.set_field(2, c);
+            record.set_field(3, d);
+            writer.upsert(&record).expect("Failed to upsert");
+        }
+        writer.flush().await.expect("Failed to flush");
+
+        let mut prefix_lookuper = table
+            .new_lookup()
+            .expect("Failed to create lookup")
+            .lookup_by(vec!["a".to_string(), "b".to_string()])
+            .create_lookuper()
+            .expect("Failed to create prefix lookuper");
+
+        // Prefix (1, LONG_B): three rows, asserted in ascending `c` order.
+        let three_row_prefix = prefix_key(1, LONG_B);
+        let three_row_result = prefix_lookuper
+            .lookup(&three_row_prefix)
+            .await
+            .expect("Failed to prefix lookup");
+        let three_rows = three_row_result.get_rows().expect("Failed to decode rows");
+        assert_eq!(
+            three_rows.len(),
+            3,
+            "Prefix (1, 'aaaaaaaaa') should match 3 rows"
+        );
+        for (row, expected_c) in three_rows.iter().zip(1_i64..) {
+            assert_eq!(row.get_int(0).unwrap(), 1);
+            assert_eq!(row.get_string(1).unwrap(), LONG_B);
+            assert_eq!(row.get_long(2).unwrap(), expected_c);
+            assert_eq!(row.get_string(3).unwrap(), format!("value{}", expected_c));
+        }
+
+        // Prefix (2, LONG_B): exactly one row.
+        let single_row_prefix = prefix_key(2, LONG_B);
+        let single_row_result = prefix_lookuper
+            .lookup(&single_row_prefix)
+            .await
+            .expect("Failed to prefix lookup");
+        let single_rows = single_row_result.get_rows().expect("Failed to decode rows");
+        assert_eq!(single_rows.len(), 1);
+        let only = &single_rows[0];
+        assert_eq!(only.get_int(0).unwrap(), 2);
+        assert_eq!(only.get_string(1).unwrap(), LONG_B);
+        assert_eq!(only.get_long(2).unwrap(), 4);
+        assert_eq!(only.get_string(3).unwrap(), "value4");
+
+        // Prefix (3, "a"): nothing was written under it, so no rows come back.
+        let absent_prefix = prefix_key(3, "a");
+        let absent_result = prefix_lookuper
+            .lookup(&absent_prefix)
+            .await
+            .expect("Failed to prefix lookup");
+        let absent_rows = absent_result.get_rows().expect("Failed to decode rows");
+        assert_eq!(absent_rows.len(), 0);
+
+        admin
+            .drop_table(&table_path, false)
+            .await
+            .expect("Failed to drop table");
+    }
+
+    /// Prefix lookup on a table partitioned by `region` with bucket key `(a, b)`.
+    ///
+    /// The lookup columns are `(region, a, b)`. Covers a prefix matching two
+    /// rows, a prefix matching one row, and a prefix in a partition (`APAC`)
+    /// that this test never creates, which must yield no rows.
+    #[tokio::test]
+    async fn prefix_lookup_partitioned() {
+        /// Builds the three-field `(region, a, b)` prefix key.
+        fn prefix_key<'a>(region: &'a str, a: i32, b: &'a str) -> GenericRow<'a> {
+            let mut key = GenericRow::new(3);
+            key.set_field(0, region);
+            key.set_field(1, a);
+            key.set_field(2, b);
+            key
+        }
+
+        let cluster = get_shared_cluster();
+        let connection = cluster.get_fluss_connection().await;
+        let admin = connection.get_admin().expect("Failed to get admin");
+        let table_path = TablePath::new("fluss", "test_prefix_lookup_partitioned");
+
+        let schema = Schema::builder()
+            .column("region", DataTypes::string())
+            .column("a", DataTypes::int())
+            .column("b", DataTypes::string())
+            .column("c", DataTypes::bigint())
+            .column("d", DataTypes::string())
+            .primary_key(vec!["region", "a", "b", "c"])
+            .build()
+            .expect("Failed to build schema");
+        let table_descriptor = TableDescriptor::builder()
+            .schema(schema)
+            .partitioned_by(vec!["region"])
+            .distributed_by(Some(3), vec!["a".to_string(), "b".to_string()])
+            .build()
+            .expect("Failed to build table");
+
+        create_table(&admin, &table_path, &table_descriptor).await;
+        create_partitions(&admin, &table_path, "region", &["US", "EU"]).await;
+
+        // A connection is fetched again after the partitions exist, as in the original flow.
+        let connection = cluster.get_fluss_connection().await;
+        let table = connection
+            .get_table(&table_path)
+            .await
+            .expect("Failed to get table");
+
+        let writer = table
+            .new_upsert()
+            .expect("Failed to create upsert")
+            .create_writer()
+            .expect("Failed to create writer");
+
+        let seed_rows: &[(&str, i32, &str, i64, &str)] = &[
+            ("US", 1, "aaaaaaaaa", 1, "us-1"),
+            ("US", 1, "aaaaaaaaa", 2, "us-2"),
+            ("US", 2, "aaaaaaaaa", 3, "us-3"),
+            ("EU", 1, "aaaaaaaaa", 4, "eu-1"),
+            ("EU", 1, "bbbbbbbbb", 5, "eu-2"),
+        ];
+        for &(region, a, b, c, d) in seed_rows {
+            let mut record = GenericRow::new(5);
+            record.set_field(0, region);
+            record.set_field(1, a);
+            record.set_field(2, b);
+            record.set_field(3, c);
+            record.set_field(4, d);
+            writer.upsert(&record).expect("Failed to upsert");
+        }
+        writer.flush().await.expect("Failed to flush");
+
+        let lookup_columns = vec!["region".to_string(), "a".to_string(), "b".to_string()];
+        let mut prefix_lookuper = table
+            .new_lookup()
+            .expect("Failed to create lookup")
+            .lookup_by(lookup_columns)
+            .create_lookuper()
+            .expect("Failed to create prefix lookuper");
+
+        // Prefix (US, 1, "aaaaaaaaa") — 2 rows, each carrying the prefix values.
+        let us_prefix = prefix_key("US", 1, "aaaaaaaaa");
+        let us_result = prefix_lookuper
+            .lookup(&us_prefix)
+            .await
+            .expect("Failed to prefix lookup");
+        let us_rows = us_result.get_rows().expect("Failed to decode rows");
+        assert_eq!(us_rows.len(), 2);
+        for row in &us_rows {
+            assert_eq!(row.get_string(0).unwrap(), "US");
+            assert_eq!(row.get_int(1).unwrap(), 1);
+            assert_eq!(row.get_string(2).unwrap(), "aaaaaaaaa");
+        }
+
+        // Prefix (EU, 1, "bbbbbbbbb") — 1 row.
+        let eu_prefix = prefix_key("EU", 1, "bbbbbbbbb");
+        let eu_result = prefix_lookuper
+            .lookup(&eu_prefix)
+            .await
+            .expect("Failed to prefix lookup");
+        let eu_rows = eu_result.get_rows().expect("Failed to decode rows");
+        assert_eq!(eu_rows.len(), 1);
+        assert_eq!(eu_rows[0].get_string(4).unwrap(), "eu-2");
+
+        // Prefix (APAC, 1, "aaaaaaaaa") — partition not created here, so 0 rows.
+        let apac_prefix = prefix_key("APAC", 1, "aaaaaaaaa");
+        let apac_result = prefix_lookuper
+            .lookup(&apac_prefix)
+            .await
+            .expect("Failed to prefix lookup");
+        let apac_rows = apac_result.get_rows().expect("Failed to decode rows");
+        assert_eq!(apac_rows.len(), 0);
+
+        admin
+            .drop_table(&table_path, false)
+            .await
+            .expect("Failed to drop table");
+    }
+
+    /// Creating a prefix lookuper must fail validation when the lookup columns
+    /// are the bucket keys `(a, b)` in the wrong order, or when they include an
+    /// extra column beyond the bucket keys.
+    #[tokio::test]
+    async fn prefix_lookup_validation_errors() {
+        let cluster = get_shared_cluster();
+        let connection = cluster.get_fluss_connection().await;
+        let admin = connection.get_admin().expect("Failed to get admin");
+        let table_path = TablePath::new("fluss", "test_prefix_lookup_validation");
+
+        let schema = Schema::builder()
+            .column("a", DataTypes::int())
+            .column("b", DataTypes::string())
+            .column("c", DataTypes::bigint())
+            .primary_key(vec!["a", "b", "c"])
+            .build()
+            .expect("Failed to build schema");
+        let table_descriptor = TableDescriptor::builder()
+            .schema(schema)
+            .distributed_by(Some(3), vec!["a".to_string(), "b".to_string()])
+            .build()
+            .expect("Failed to build table");
+
+        create_table(&admin, &table_path, &table_descriptor).await;
+
+        let table = connection
+            .get_table(&table_path)
+            .await
+            .expect("Failed to get table");
+
+        // Tries to build a lookuper over `columns` and returns the error it produced;
+        // panics with `no_error_msg` if the lookuper was created successfully.
+        let validation_error = |columns: &[&str], no_error_msg: &str| {
+            let columns: Vec<String> = columns.iter().map(|c| c.to_string()).collect();
+            table
+                .new_lookup()
+                .expect("Failed to create lookup")
+                .lookup_by(columns)
+                .create_lookuper()
+                .err()
+                .expect(no_error_msg)
+        };
+
+        // Bucket keys supplied in reverse order.
+        let wrong_order = validation_error(&["b", "a"], "Expected validation error for wrong order");
+        assert!(wrong_order
+            .to_string()
+            .contains("must contain all bucket keys"));
+
+        // Bucket keys followed by an additional column.
+        let extra_column = validation_error(
+            &["a", "b", "c"],
+            "Expected validation error for extra lookup columns",
+        );
+        assert!(extra_column
+            .to_string()
+            .contains("must contain all bucket keys"));
+
+        admin
+            .drop_table(&table_path, false)
+            .await
+            .expect("Failed to drop table");
+    }
+
+    /// Integration test for concurrent batched lookups.
+    #[tokio::test]
+    async fn batched_concurrent_lookups() {
+        let cluster = get_shared_cluster();
+        let connection = cluster.get_fluss_connection().await;
+
+        let admin = connection.get_admin().expect("Failed to get admin");
+
+        let table_path = TablePath::new("fluss".to_string(), "test_batched_lookups".to_string());
+
+        let table_descriptor = TableDescriptor::builder()
+            .schema(
+                Schema::builder()
+                    .column("id", DataTypes::int())
+                    .column("name", DataTypes::string())
+                    .column("value", DataTypes::bigint())
+                    .primary_key(vec!["id".to_string()])
+                    .build()
+                    .expect("Failed to build schema"),
+            )
+            .build()
+            .expect("Failed to build table");
+
+        create_table(&admin, &table_path, &table_descriptor).await;
+
+        let table = connection
+            .get_table(&table_path)
+            .await
+            .expect("Failed to get table");
+
+        let table_upsert = table.new_upsert().expect("Failed to create upsert");
+
+        // Insert only even-numbered records (0, 2, 4, ..., 98) in parallel
+        let num_records = 100i32;
+        let mut upsert_futures = FuturesUnordered::new();
+        for i in (0..num_records).step_by(2) {
+            let writer = table_upsert
+                .create_writer()
+                .expect("Failed to create writer");
+            upsert_futures.push(async move {
+                let mut row = GenericRow::new(3);
+                row.set_field(0, i);
+                row.set_field(1, format!("name_{}", i));
+                row.set_field(2, (i * 100) as i64);
+                writer
+                    .upsert(&row)
+                    .expect("Failed to upsert")
+                    .await
+                    .expect("Failed to await upsert ack");
+            });
+        }
+        // Wait for all upserts to be acknowledged
+        while upsert_futures.next().await.is_some() {}
+
+        // Create multiple lookupers for concurrent lookups
+        let num_lookupers = 50i32;
+        let mut lookupers: Vec<_> = (0..num_lookupers)
+            .map(|_| {
+                table
+                    .new_lookup()
+                    .expect("Failed to create lookup")
+                    .create_lookuper()
+                    .expect("Failed to create lookuper")
+            })
+            .collect();
+
+        // Run concurrent lookups
+        let mut futures = FuturesUnordered::new();
+        for (i, lookuper) in lookupers.iter_mut().enumerate() {
+            // First 10 lookupers all lookup id=0 (same key multiple times)
+            let id = if i < 10 { 0 } else { i as i32 };
+            let expects_result = id % 2 == 0; // Even IDs exist
+
+            futures.push(async move {
+                let key = make_key(id);
+                let result = lookuper.lookup(&key).await.expect("Failed to lookup");
+                let row_opt = result.get_single_row().expect("Failed to get row");
+
+                if expects_result {
+                    let row = row_opt.unwrap_or_else(|| panic!("Row {} should exist", id));
+                    assert_eq!(row.get_int(0).unwrap(), id, "id mismatch for key {}", id);
+                    assert_eq!(
+                        row.get_string(1).unwrap(),
+                        format!("name_{}", id),
+                        "name mismatch for key {}",
+                        id
+                    );
+                    assert_eq!(
+                        row.get_long(2).unwrap(),
+                        (id * 100) as i64,
+                        "value mismatch for key {}",
+                        id
+                    );
+                } else {
+                    assert!(row_opt.is_none(), "Row {} should not exist", id);
+                }
+                (id, expects_result)
+            });
+        }
+
+        // Collect all results and verify
+        let mut results = Vec::with_capacity(num_lookupers as usize);
+        while let Some(result) = futures.next().await {
+            results.push(result);
+        }
+
+        // Verify all lookups completed successfully
+        assert_eq!(
+            results.len(),
+            num_lookupers as usize,
+            "Not all lookups completed"
+        );
+
+        // Verify we had the expected mix of scenarios
+        let same_key_lookups = results.iter().filter(|(id, _)| *id == 0).count();
+        assert_eq!(same_key_lookups, 10, "Should have 10 lookups for same key");
+
+        let non_existing_lookups = results.iter().filter(|(_, exists)| !exists).count();
+        assert!(
+            non_existing_lookups > 0,
+            "Should have some non-existing key lookups"
+        );
+
+        admin
+            .drop_table(&table_path, false)
+            .await
+            .expect("Failed to drop table");
+    }
+
+    /// Checks that a v0 client is turned away by a KV format v2 table that uses a
+    /// non-default bucket key.
+    ///
+    /// Today the Rust client only speaks API version 0 for PutKv/Lookup/PrefixLookup.
+    /// A table created with kv_format_version=2 plus a non-default bucket key needs
+    /// CompactedKeyEncoder (v1), so the server refuses requests coming from v0 clients.
+    // TODO(key-encoding-v1): Once v1 key encoding is implemented and the client advertises
+    //  PutKv/Lookup/PrefixLookup v1, this test should be updated to verify that v1 clients
+    //  can successfully write to and read from kv_format_v2 tables with non-default bucket keys.
+    #[tokio::test]
+    async fn kv_format_v2_table_rejects_v0_client() {
+        let shared_cluster = get_shared_cluster();
+        let conn = shared_cluster.get_fluss_connection().await;
+        let admin = conn.get_admin().unwrap();
+        let path = TablePath::new("fluss", "test_kv_format_v2_reject_v0");
+
+        // The table under test is created with:
+        //   1. kv_format_version = 2
+        //   2. a non-default bucket key ("a" is a subset of the pk ("a", "b"))
+        //   3. a datalake format configured
+        let pk_schema = Schema::builder()
+            .column("a", DataTypes::int())
+            .column("b", DataTypes::string())
+            .column("c", DataTypes::string())
+            .primary_key(vec!["a", "b"])
+            .build()
+            .expect("Failed to build schema");
+        let descriptor = TableDescriptor::builder()
+            .schema(pk_schema)
+            .distributed_by(Some(2), vec!["a".to_string()])
+            .property("table.kv.format-version", "2")
+            .property("table.datalake.format", "lance")
+            .build()
+            .expect("Failed to build table");
+        create_table(&admin, &path, &descriptor).await;
+
+        let table = conn.get_table(&path).await.unwrap();
+        let writer = table
+            .new_upsert()
+            .expect("Failed to create upsert")
+            .create_writer()
+            .expect("Failed to create writer");
+
+        let mut record = GenericRow::new(3);
+        record.set_field(0, 1);
+        record.set_field(1, "a");
+        record.set_field(2, "value1");
+
+        // PutKv issued by a v0 client must come back as an error (UNSUPPORTED_VERSION).
+        let outcome = writer
+            .upsert(&record)
+            .expect("Failed to upsert row")
+            .await;
+        let err_msg = match outcome {
+            Err(e) => e.to_string(),
+            Ok(_) => panic!(
+                "PutKv with v0 client should be rejected for kv_format_v2 table with non-default bucket key"
+            ),
+        };
+
+        // The error text must name the unsupported client API version.
+        let expected_fragment = "Client API version 0 is not supported";
+        assert!(
+            err_msg.contains(expected_fragment),
+            "Expected 'Client API version 0 is not supported' error, got: {}",
+            err_msg
+        );
+
+        admin
+            .drop_table(&path, false)
+            .await
+            .expect("Failed to drop table");
+    }
+
+    /// KV upsert + lookup against a schema covering every supported data type.
+    ///
+    /// The schema is assembled from a `ColumnPlan` (an `id` column followed by the
+    /// array, row, map and scalar column sections) with `id` handed to `build_schema`
+    /// as the key column. Three rows are upserted and then read back via a lookuper:
+    /// * id=1 has every column populated and is verified field by field;
+    /// * id=2 holds an empty map in `map_string_int` and NULL in all other non-key columns;
+    /// * id=3 holds NULL in every non-key column.
+    ///
+    /// Compound columns are addressed by the fixed positions 1..=12, scalar columns
+    /// are resolved by name through `plan.idx(..)`.
+    // NOTE(review): the fixed positions 1..=12 presumably mirror the column order produced by
+    //  the `*_dt_basics_columns()` helpers; confirm against those helpers if they change.
+    #[tokio::test]
+    async fn all_supported_datatypes() {
+        let cluster = get_shared_cluster();
+        let connection = cluster.get_fluss_connection().await;
+        let admin = connection.get_admin().expect("Failed to get admin");
+
+        let table_path = TablePath::new("fluss", "test_kv_complex_types");
+
+        // Element/value types reused by the nested array and map writers below.
+        let row_seq_label_owned = dt_row_seq_label();
+        let row_seq_label = as_row_type(&row_seq_label_owned);
+        let inner_array_int = dt_array_int();
+        let inner_map_string_int = dt_map_string_int();
+
+        let plan = ColumnPlan::new()
+            .add("id", DataTypes::int())
+            .start_section("array_basics")
+            .extend(array_dt_basics_columns())
+            .start_section("row_basics")
+            .extend(row_dt_basics_columns())
+            .start_section("map_basics")
+            .extend(map_dt_basics_columns())
+            .start_section("scalars")
+            .extend(scalar_dt_columns());
+        let table_descriptor = TableDescriptor::builder()
+            .schema(plan.build_schema(Some(&["id"])))
+            .build()
+            .expect("table descriptor");
+
+        create_table(&admin, &table_path, &table_descriptor).await;
+
+        let table = connection.get_table(&table_path).await.expect("table");
+        let upsert_writer = table
+            .new_upsert()
+            .expect("upsert")
+            .create_writer()
+            .expect("writer");
+
+        // Row 1 (id=1) — comprehensive: every column populated.
+        let column_count = plan.len();
+        let mut row1 = GenericRow::new(column_count);
+        row1.set_field(0, 1_i32);
+        row1.set_field(1, make_int_array(&[Some(10), Some(20), Some(30)]));
+        row1.set_field(2, make_string_array(&[Some("hello"), Some("world")]));
+        let arr_of_arr_1 = {
+            let mut w = FlussArrayWriter::new(2, &inner_array_int);
+            w.write_array(0, &make_int_array(&[Some(1), Some(2)]));
+            w.write_array(1, &make_int_array(&[Some(3), Some(4)]));
+            w.complete().expect("arr_of_arr_1")
+        };
+        row1.set_field(3, arr_of_arr_1);
+        let arr_of_row_1 = {
+            let mut w = FlussArrayWriter::new(2, &row_seq_label_owned);
+            let mut e0 = GenericRow::new(2);
+            e0.set_field(0, 1_i32);
+            e0.set_field(1, "open");
+            w.write_row(0, &e0).expect("e0");
+            let mut e1 = GenericRow::new(2);
+            e1.set_field(0, 2_i32);
+            e1.set_field(1, "close");
+            w.write_row(1, &e1).expect("e1");
+            w.complete().expect("arr_of_row_1")
+        };
+        row1.set_field(4, arr_of_row_1);
+        let mut row_basic_1 = GenericRow::new(2);
+        row_basic_1.set_field(0, 42_i32);
+        row_basic_1.set_field(1, "hello");
+        row1.set_field(5, Datum::Row(Box::new(row_basic_1)));
+        // Two levels of nesting: ROW<ROW<INT>>.
+        let mut deep_inner_1 = GenericRow::new(1);
+        deep_inner_1.set_field(0, 99_i32);
+        let mut row_deep_1 = GenericRow::new(1);
+        row_deep_1.set_field(0, Datum::Row(Box::new(deep_inner_1)));
+        row1.set_field(6, Datum::Row(Box::new(row_deep_1)));
+        // A nested row with 14 fields mixing scalar kinds and a nullable-element array.
+        let mut row_rich_1 = GenericRow::new(14);
+        row_rich_1.set_field(0, true);
+        row_rich_1.set_field(1, 100_000_i32);
+        row_rich_1.set_field(2, 9_876_543_210_i64);
+        row_rich_1.set_field(3, f32::INFINITY);
+        row_rich_1.set_field(4, std::f64::consts::PI);
+        row_rich_1.set_field(5, "hello world");
+        row_rich_1.set_field(6, b"binary".as_slice());
+        row_rich_1.set_field(7, Decimal::from_unscaled_long(12345, 10, 2).unwrap());
+        row_rich_1.set_field(8, Datum::Date(Date::new(20476)));
+        row_rich_1.set_field(9, Datum::Time(Time::new(36_827_123)));
+        row_rich_1.set_field(
+            10,
+            Datum::TimestampNtz(TimestampNtz::new(1_769_163_227_123)),
+        );
+        row_rich_1.set_field(
+            11,
+            Datum::TimestampLtz(TimestampLtz::new(1_769_163_227_456)),
+        );
+        row_rich_1.set_field(12, b"\x01\x02\x03\x04".as_slice());
+        row_rich_1.set_field(13, make_int_array(&[Some(7), None, Some(11)]));
+        row1.set_field(7, Datum::Row(Box::new(row_rich_1)));
+        // MAP<STRING, INT> with one NULL value.
+        let map_string_int_1 = {
+            let mut w = FlussMapWriter::new(3, &DataTypes::string(), &DataTypes::int());
+            w.write_entry("a".into(), 1.into()).unwrap();
+            w.write_entry("b".into(), Datum::Null).unwrap();
+            w.write_entry("c".into(), 3.into()).unwrap();
+            w.complete().expect("map_string_int_1")
+        };
+        row1.set_field(8, Datum::Map(map_string_int_1));
+        let map_of_row_1 = {
+            let mut e0 = GenericRow::new(2);
+            e0.set_field(0, 1_i32);
+            e0.set_field(1, "open");
+            let mut e1 = GenericRow::new(2);
+            e1.set_field(0, 2_i32);
+            e1.set_field(1, "close");
+            let mut w = FlussMapWriter::new(2, &DataTypes::string(), &row_seq_label_owned);
+            w.write_entry("e0".into(), Datum::Row(Box::new(e0)))
+                .unwrap();
+            w.write_entry("e1".into(), Datum::Row(Box::new(e1)))
+                .unwrap();
+            w.complete().expect("map_of_row_1")
+        };
+        row1.set_field(9, Datum::Map(map_of_row_1));
+        let map_of_map_1 = {
+            let g1 = {
+                let mut w = FlussMapWriter::new(2, &DataTypes::string(), &DataTypes::int());
+                w.write_entry("a".into(), 1.into()).unwrap();
+                w.write_entry("b".into(), 2.into()).unwrap();
+                w.complete().expect("g1")
+            };
+            let g2 = {
+                let mut w = FlussMapWriter::new(1, &DataTypes::string(), &DataTypes::int());
+                w.write_entry("c".into(), 3.into()).unwrap();
+                w.complete().expect("g2")
+            };
+            let mut w = FlussMapWriter::new(2, &DataTypes::string(), &inner_map_string_int);
+            w.write_entry("g1".into(), Datum::Map(g1)).unwrap();
+            w.write_entry("g2".into(), Datum::Map(g2)).unwrap();
+            w.complete().expect("map_of_map_1")
+        };
+        row1.set_field(10, Datum::Map(map_of_map_1));
+        let map_of_array_1 = {
+            let primes = make_int_array(&[Some(2), Some(3), Some(5)]);
+            let squares = make_int_array(&[Some(1), Some(4)]);
+            let mut w = FlussMapWriter::new(2, &DataTypes::string(), &inner_array_int);
+            w.write_entry("primes".into(), Datum::Array(primes))
+                .unwrap();
+            w.write_entry("squares".into(), Datum::Array(squares))
+                .unwrap();
+            w.complete().expect("map_of_array_1")
+        };
+        row1.set_field(11, Datum::Map(map_of_array_1));
+        let array_of_map_1 = {
+            let m0 = {
+                let mut w = FlussMapWriter::new(2, &DataTypes::string(), &DataTypes::int());
+                w.write_entry("x".into(), 1.into()).unwrap();
+                w.write_entry("y".into(), 2.into()).unwrap();
+                w.complete().expect("m0")
+            };
+            let m1 = {
+                let mut w = FlussMapWriter::new(1, &DataTypes::string(), &DataTypes::int());
+                w.write_entry("z".into(), 9.into()).unwrap();
+                w.complete().expect("m1")
+            };
+            let mut w = FlussArrayWriter::new(2, &inner_map_string_int);
+            w.write_map(0, &m0);
+            w.write_map(1, &m1);
+            w.complete().expect("array_of_map_1")
+        };
+        row1.set_field(12, array_of_map_1);
+
+        // Scalar values for row 1.
+        let s_tinyint = 127_i8;
+        let s_smallint = 32_767_i16;
+        let s_bigint = 9_223_372_036_854_775_807_i64;
+        let s_float = std::f32::consts::PI;
+        let s_double = std::f64::consts::E;
+        let s_char = "hello";
+        let s_string = "world of fluss rust client";
+        let s_decimal = Decimal::from_unscaled_long(12345, 10, 2).unwrap();
+        let s_date = Date::new(20476);
+        let s_time_s = Time::new(36_827_000);
+        let s_time_ms = Time::new(36_827_123);
+        let s_time_us = Time::new(86_399_999);
+        let s_time_ns = Time::new(1);
+        let s_ts_s = TimestampNtz::new(1_769_163_227_000);
+        let s_ts_ms = TimestampNtz::new(1_769_163_227_123);
+        let s_ts_us = TimestampNtz::from_millis_nanos(1_769_163_227_123, 456_000).unwrap();
+        let s_ts_ns = TimestampNtz::from_millis_nanos(1_769_163_227_123, 999_999).unwrap();
+        let s_ts_ltz_s = TimestampLtz::new(1_769_163_227_000);
+        let s_ts_ltz_ms = TimestampLtz::new(1_769_163_227_123);
+        let s_ts_ltz_us = TimestampLtz::from_millis_nanos(1_769_163_227_123, 456_000).unwrap();
+        let s_ts_ltz_ns = TimestampLtz::from_millis_nanos(1_769_163_227_123, 999_999).unwrap();
+        let s_bytes_top: Vec<u8> = b"binary data".to_vec();
+        let s_binary_top: Vec<u8> = vec![0xDE, 0xAD, 0xBE, 0xEF];
+        let s_ts_us_neg = TimestampNtz::from_millis_nanos(-301_234_154_877, 456_000).unwrap();
+        let s_ts_ns_neg = TimestampNtz::from_millis_nanos(-301_234_154_877, 999_999).unwrap();
+        let s_ts_ltz_us_neg = TimestampLtz::from_millis_nanos(-301_234_154_877, 456_000).unwrap();
+        let s_ts_ltz_ns_neg = TimestampLtz::from_millis_nanos(-301_234_154_877, 999_999).unwrap();
+
+        row1.set_field(plan.idx("col_tinyint"), s_tinyint);
+        row1.set_field(plan.idx("col_smallint"), s_smallint);
+        row1.set_field(plan.idx("col_bigint"), s_bigint);
+        row1.set_field(plan.idx("col_float"), s_float);
+        row1.set_field(plan.idx("col_double"), s_double);
+        row1.set_field(plan.idx("col_boolean"), true);
+        row1.set_field(plan.idx("col_char"), s_char);
+        row1.set_field(plan.idx("col_string"), s_string);
+        row1.set_field(plan.idx("col_decimal"), s_decimal.clone());
+        row1.set_field(plan.idx("col_date"), Datum::Date(s_date));
+        row1.set_field(plan.idx("col_time_s"), s_time_s);
+        row1.set_field(plan.idx("col_time_ms"), s_time_ms);
+        row1.set_field(plan.idx("col_time_us"), s_time_us);
+        row1.set_field(plan.idx("col_time_ns"), s_time_ns);
+        row1.set_field(plan.idx("col_ts_s"), s_ts_s);
+        row1.set_field(plan.idx("col_ts_ms"), s_ts_ms);
+        row1.set_field(plan.idx("col_ts_us"), s_ts_us);
+        row1.set_field(plan.idx("col_ts_ns"), s_ts_ns);
+        row1.set_field(plan.idx("col_ts_ltz_s"), s_ts_ltz_s);
+        row1.set_field(plan.idx("col_ts_ltz_ms"), s_ts_ltz_ms);
+        row1.set_field(plan.idx("col_ts_ltz_us"), s_ts_ltz_us);
+        row1.set_field(plan.idx("col_ts_ltz_ns"), s_ts_ltz_ns);
+        row1.set_field(plan.idx("col_bytes_top"), s_bytes_top.as_slice());
+        row1.set_field(plan.idx("col_binary_top"), s_binary_top.as_slice());
+        row1.set_field(plan.idx("col_ts_us_neg"), s_ts_us_neg);
+        row1.set_field(plan.idx("col_ts_ns_neg"), s_ts_ns_neg);
+        row1.set_field(plan.idx("col_ts_ltz_us_neg"), s_ts_ltz_us_neg);
+        row1.set_field(plan.idx("col_ts_ltz_ns_neg"), s_ts_ltz_ns_neg);
+
+        upsert_writer
+            .upsert(&row1)
+            .expect("upsert row1")
+            .await
+            .expect("ack row1");
+
+        // Row 2 (id=2) — empty MAP, all other compound + scalar columns NULL.
+        let mut row2 = GenericRow::new(column_count);
+        row2.set_field(0, 2_i32);
+        for i in 1..column_count {
+            row2.set_field(i, Datum::Null);
+        }
+        let empty_map = FlussMapWriter::new(0, &DataTypes::string(), &DataTypes::int())
+            .complete()
+            .expect("empty_map");
+        row2.set_field(plan.idx("map_string_int"), Datum::Map(empty_map));
+        upsert_writer
+            .upsert(&row2)
+            .expect("upsert row2")
+            .await
+            .expect("ack row2");
+
+        // Row 3 (id=3) — every compound + scalar column NULL.
+        let mut row3 = GenericRow::new(column_count);
+        row3.set_field(0, 3_i32);
+        for i in 1..column_count {
+            row3.set_field(i, Datum::Null);
+        }
+        upsert_writer
+            .upsert(&row3)
+            .expect("upsert row3")
+            .await
+            .expect("ack row3");
+
+        let mut lookuper = table
+            .new_lookup()
+            .expect("lookup")
+            .create_lookuper()
+            .expect("lookuper");
+
+        // === Row 1 lookup — every column is checked against what was written ===
+        let result1 = lookuper.lookup(&make_key(1)).await.expect("lookup row1");
+        let r1 = result1
+            .get_single_row()
+            .expect("row1")
+            .expect("row1 exists");
+        assert_eq!(r1.get_int(0).unwrap(), 1);
+
+        // === ARRAY: basic shapes ===
+        let arr_int = r1.get_array(1).unwrap();
+        assert_eq!(arr_int.size(), 3);
+        assert_eq!(arr_int.get_int(2).unwrap(), 30);
+        let arr_string = r1.get_array(2).unwrap();
+        assert_eq!(arr_string.size(), 2);
+        assert_eq!(arr_string.get_string(0).unwrap(), "hello");
+        let arr_of_arr = r1.get_array(3).unwrap();
+        assert_eq!(arr_of_arr.size(), 2);
+        assert_eq!(arr_of_arr.get_array(1).unwrap().get_int(1).unwrap(), 4);
+
+        // === ARRAY<ROW> ===
+        let aor = r1.get_array(4).unwrap();
+        assert_eq!(aor.size(), 2);
+        let e0 = aor.get_row(0, &row_seq_label).unwrap();
+        assert_eq!(e0.get_int(0).unwrap(), 1);
+        assert_eq!(e0.get_string(1).unwrap(), "open");
+
+        // === ROW: basic + deep + rich ===
+        let rb = r1.get_row(5).unwrap();
+        assert_eq!(rb.get_int(0).unwrap(), 42);
+        assert_eq!(rb.get_string(1).unwrap(), "hello");
+        let rd = r1.get_row(6).unwrap();
+        let rd_inner = rd.get_row(0).unwrap();
+        assert_eq!(rd_inner.get_int(0).unwrap(), 99);
+        let rr = r1.get_row(7).unwrap();
+        assert!(rr.get_boolean(0).unwrap());
+        assert_eq!(rr.get_int(1).unwrap(), 100_000);
+        assert_eq!(rr.get_long(2).unwrap(), 9_876_543_210);
+        assert!(rr.get_float(3).unwrap().is_infinite());
+        assert!((rr.get_double(4).unwrap() - std::f64::consts::PI).abs() < f64::EPSILON);
+        assert_eq!(rr.get_string(5).unwrap(), "hello world");
+        assert_eq!(rr.get_bytes(6).unwrap(), b"binary");
+        assert_eq!(
+            rr.get_decimal(7, 10, 2).unwrap(),
+            Decimal::from_unscaled_long(12345, 10, 2).unwrap()
+        );
+        assert_eq!(rr.get_date(8).unwrap().get_inner(), 20476);
+        assert_eq!(rr.get_time(9).unwrap().get_inner(), 36_827_123);
+        assert_eq!(
+            rr.get_timestamp_ntz(10, 6).unwrap().get_millisecond(),
+            1_769_163_227_123
+        );
+        assert_eq!(
+            rr.get_timestamp_ltz(11, 6).unwrap().get_epoch_millisecond(),
+            1_769_163_227_456
+        );
+        assert_eq!(rr.get_binary(12, 4).unwrap(), b"\x01\x02\x03\x04");
+        let f_arr = rr.get_array(13).unwrap();
+        assert_eq!(f_arr.size(), 3);
+        assert!(f_arr.is_null_at(1));
+
+        // === MAP: basic ===
+        let m = r1.get_map(8).unwrap();
+        assert_eq!(m.size(), 3);
+        assert_eq!(m.get(&Datum::from("a")).unwrap(), Some(Datum::from(1_i32)));
+        assert_eq!(m.get(&Datum::from("b")).unwrap(), Some(Datum::Null));
+        assert_eq!(m.get(&Datum::from("c")).unwrap(), Some(Datum::from(3_i32)));
+
+        // === MAP<K, ROW> ===
+        let m = r1.get_map(9).unwrap();
+        let v0 = m.value_array().get_row(0, &row_seq_label).unwrap();
+        assert_eq!(v0.get_int(0).unwrap(), 1);
+        assert_eq!(v0.get_string(1).unwrap(), "open");
+
+        // === MAP<K, MAP> ===
+        let m = r1.get_map(10).unwrap();
+        let g1 = m
+            .value_array()
+            .get_map(0, &DataTypes::string(), &DataTypes::int())
+            .unwrap();
+        assert_eq!(g1.size(), 2);
+
+        // === MAP<K, ARRAY> + ARRAY<MAP> ===
+        let m = r1.get_map(11).unwrap();
+        assert_eq!(m.value_array().get_array(0).unwrap().size(), 3);
+        let am = r1.get_array(12).unwrap();
+        assert_eq!(am.size(), 2);
+        let am0 = am
+            .get_map(0, &DataTypes::string(), &DataTypes::int())
+            .unwrap();
+        assert_eq!(am0.size(), 2);
+
+        // === Scalars: integers + floating point ===
+        assert_eq!(r1.get_byte(plan.idx("col_tinyint")).unwrap(), s_tinyint);
+        assert_eq!(r1.get_short(plan.idx("col_smallint")).unwrap(), s_smallint);
+        assert_eq!(r1.get_long(plan.idx("col_bigint")).unwrap(), s_bigint);
+        assert!((r1.get_float(plan.idx("col_float")).unwrap() - s_float).abs() < f32::EPSILON);
+        assert!((r1.get_double(plan.idx("col_double")).unwrap() - s_double).abs() < f64::EPSILON);
+
+        // === Scalars: boolean / char / string / decimal / date ===
+        assert!(r1.get_boolean(plan.idx("col_boolean")).unwrap());
+        assert_eq!(r1.get_char(plan.idx("col_char"), 10).unwrap(), s_char);
+        assert_eq!(r1.get_string(plan.idx("col_string")).unwrap(), s_string);
+        assert_eq!(
+            r1.get_decimal(plan.idx("col_decimal"), 10, 2).unwrap(),
+            s_decimal
+        );
+        assert_eq!(
+            r1.get_date(plan.idx("col_date")).unwrap().get_inner(),
+            s_date.get_inner()
+        );
+
+        // === Scalars: time across all four precisions ===
+        assert_eq!(
+            r1.get_time(plan.idx("col_time_s")).unwrap().get_inner(),
+            s_time_s.get_inner()
+        );
+        assert_eq!(
+            r1.get_time(plan.idx("col_time_ms")).unwrap().get_inner(),
+            s_time_ms.get_inner()
+        );
+        assert_eq!(
+            r1.get_time(plan.idx("col_time_us")).unwrap().get_inner(),
+            s_time_us.get_inner()
+        );
+        assert_eq!(
+            r1.get_time(plan.idx("col_time_ns")).unwrap().get_inner(),
+            s_time_ns.get_inner()
+        );
+
+        // === Scalars: timestamp across all four precisions ===
+        assert_eq!(
+            r1.get_timestamp_ntz(plan.idx("col_ts_s"), 0)
+                .unwrap()
+                .get_millisecond(),
+            s_ts_s.get_millisecond()
+        );
+        assert_eq!(
+            r1.get_timestamp_ntz(plan.idx("col_ts_ms"), 3)
+                .unwrap()
+                .get_millisecond(),
+            s_ts_ms.get_millisecond()
+        );
+        let read_ts_us = r1.get_timestamp_ntz(plan.idx("col_ts_us"), 6).unwrap();
+        assert_eq!(read_ts_us.get_millisecond(), s_ts_us.get_millisecond());
+        assert_eq!(
+            read_ts_us.get_nano_of_millisecond(),
+            s_ts_us.get_nano_of_millisecond()
+        );
+        let read_ts_ns = r1.get_timestamp_ntz(plan.idx("col_ts_ns"), 9).unwrap();
+        assert_eq!(read_ts_ns.get_millisecond(), s_ts_ns.get_millisecond());
+        assert_eq!(
+            read_ts_ns.get_nano_of_millisecond(),
+            s_ts_ns.get_nano_of_millisecond()
+        );
+
+        // === Scalars: timestamp_ltz across all four precisions ===
+        assert_eq!(
+            r1.get_timestamp_ltz(plan.idx("col_ts_ltz_s"), 0)
+                .unwrap()
+                .get_epoch_millisecond(),
+            s_ts_ltz_s.get_epoch_millisecond()
+        );
+        assert_eq!(
+            r1.get_timestamp_ltz(plan.idx("col_ts_ltz_ms"), 3)
+                .unwrap()
+                .get_epoch_millisecond(),
+            s_ts_ltz_ms.get_epoch_millisecond()
+        );
+        let read_ltz_us = r1.get_timestamp_ltz(plan.idx("col_ts_ltz_us"), 6).unwrap();
+        assert_eq!(
+            read_ltz_us.get_epoch_millisecond(),
+            s_ts_ltz_us.get_epoch_millisecond()
+        );
+        assert_eq!(
+            read_ltz_us.get_nano_of_millisecond(),
+            s_ts_ltz_us.get_nano_of_millisecond()
+        );
+        let read_ltz_ns = r1.get_timestamp_ltz(plan.idx("col_ts_ltz_ns"), 9).unwrap();
+        assert_eq!(
+            read_ltz_ns.get_epoch_millisecond(),
+            s_ts_ltz_ns.get_epoch_millisecond()
+        );
+        assert_eq!(
+            read_ltz_ns.get_nano_of_millisecond(),
+            s_ts_ltz_ns.get_nano_of_millisecond()
+        );
+
+        // === Scalars: bytes + fixed binary ===
+        assert_eq!(
+            r1.get_bytes(plan.idx("col_bytes_top")).unwrap(),
+            s_bytes_top.as_slice()
+        );
+        assert_eq!(
+            r1.get_binary(plan.idx("col_binary_top"), 4).unwrap(),
+            s_binary_top.as_slice()
+        );
+
+        // === Scalars: negative-epoch timestamps (pre-1970) ===
+        // Both the millisecond part and the sub-millisecond nanos are compared, for the
+        // NTZ and the LTZ columns alike, since all four values carry non-zero nanos.
+        let read_neg_us = r1.get_timestamp_ntz(plan.idx("col_ts_us_neg"), 6).unwrap();
+        assert_eq!(read_neg_us.get_millisecond(), s_ts_us_neg.get_millisecond());
+        assert_eq!(
+            read_neg_us.get_nano_of_millisecond(),
+            s_ts_us_neg.get_nano_of_millisecond()
+        );
+        let read_neg_ns = r1.get_timestamp_ntz(plan.idx("col_ts_ns_neg"), 9).unwrap();
+        assert_eq!(read_neg_ns.get_millisecond(), s_ts_ns_neg.get_millisecond());
+        assert_eq!(
+            read_neg_ns.get_nano_of_millisecond(),
+            s_ts_ns_neg.get_nano_of_millisecond()
+        );
+        let read_neg_ltz_us = r1
+            .get_timestamp_ltz(plan.idx("col_ts_ltz_us_neg"), 6)
+            .unwrap();
+        assert_eq!(
+            read_neg_ltz_us.get_epoch_millisecond(),
+            s_ts_ltz_us_neg.get_epoch_millisecond()
+        );
+        assert_eq!(
+            read_neg_ltz_us.get_nano_of_millisecond(),
+            s_ts_ltz_us_neg.get_nano_of_millisecond()
+        );
+        let read_neg_ltz_ns = r1
+            .get_timestamp_ltz(plan.idx("col_ts_ltz_ns_neg"), 9)
+            .unwrap();
+        assert_eq!(
+            read_neg_ltz_ns.get_epoch_millisecond(),
+            s_ts_ltz_ns_neg.get_epoch_millisecond()
+        );
+        assert_eq!(
+            read_neg_ltz_ns.get_nano_of_millisecond(),
+            s_ts_ltz_ns_neg.get_nano_of_millisecond()
+        );
+
+        // === Row 2 lookup — empty map, all other columns NULL ===
+        let result2 = lookuper.lookup(&make_key(2)).await.expect("lookup row2");
+        let r2 = result2
+            .get_single_row()
+            .expect("row2")
+            .expect("row2 exists");
+        assert_eq!(r2.get_int(0).unwrap(), 2);
+        let map_idx = plan.idx("map_string_int");
+        for i in 1..column_count {
+            if i == map_idx {
+                assert_eq!(r2.get_map(map_idx).unwrap().size(), 0);
+            } else {
+                assert!(r2.is_null_at(i).unwrap(), "field {i} should be null");
+            }
+        }
+
+        // === Row 3 lookup — every compound + scalar field NULL ===
+        let result3 = lookuper.lookup(&make_key(3)).await.expect("lookup row3");
+        let r3 = result3
+            .get_single_row()
+            .expect("row3")
+            .expect("row3 exists");
+        assert_eq!(r3.get_int(0).unwrap(), 3);
+        for i in 1..column_count {
+            assert!(r3.is_null_at(i).unwrap(), "field {i} should be null");
+        }
+
+        admin.drop_table(&table_path, false).await.expect("drop");
+    }
+}
diff --git a/fluss-rust/crates/fluss/tests/integration/log_table.rs b/fluss-rust/crates/fluss/tests/integration/log_table.rs
new file mode 100644
index 0000000000..f8323df7c2
--- /dev/null
+++ b/fluss-rust/crates/fluss/tests/integration/log_table.rs
@@ -0,0 +1,1948 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#[cfg(test)]
+mod table_test {
+    use crate::integration::utils::{
+        ColumnPlan, array_dt_basics_columns, as_row_type, create_partitions, create_table,
+        dt_array_int, dt_map_string_int, dt_row_seq_label, extract_ids_from_batches,
+        get_shared_cluster, make_int_array, make_string_array, map_dt_basics_columns,
+        row_dt_basics_columns, scalar_dt_columns, wait_for_partitions_ready, wait_for_table_ready,
+    };
+    use arrow::array::record_batch;
+    use fluss::client::{EARLIEST_OFFSET, FlussTable, TableScan};
+    use fluss::metadata::{DataField, DataTypes, Schema, TableDescriptor, TablePath};
+    use fluss::record::ScanRecord;
+    use fluss::row::binary_array::FlussArrayWriter;
+    use fluss::row::binary_map::FlussMapWriter;
+    use fluss::row::{
+        Date, Datum, Decimal, FlussArray, GenericRow, InternalRow, Time, TimestampLtz, TimestampNtz,
+    };
+    use fluss::rpc::message::OffsetSpec;
+    use std::collections::HashMap;
+    use std::time::Duration;
+
+    /// Round-trips two Arrow record batches through a three-bucket log table.
+    ///
+    /// Both batches are appended and flushed, every bucket is subscribed from
+    /// `EARLIEST_OFFSET`, and the six rows polled back are compared, after sorting, with the
+    /// values that were written. The test finishes by checking that `unsubscribe` succeeds for
+    /// bucket 0 and that `unsubscribe_partition` is rejected on this non-partitioned table.
+    #[tokio::test]
+    async fn append_record_batch_and_scan() {
+        const EXPECTED_ROWS: usize = 6;
+
+        let cluster = get_shared_cluster();
+        let connection = cluster.get_fluss_connection().await;
+        let admin = connection.get_admin().expect("Failed to get admin");
+
+        // Two columns, distributed over three buckets by `c1`.
+        let schema = Schema::builder()
+            .column("c1", DataTypes::int())
+            .column("c2", DataTypes::string())
+            .build()
+            .expect("Failed to build schema");
+        let descriptor = TableDescriptor::builder()
+            .schema(schema)
+            .distributed_by(Some(3), vec!["c1".to_string()])
+            .build()
+            .expect("Failed to build table");
+        let table_path = TablePath::new("fluss", "test_append_record_batch_and_scan");
+        create_table(&admin, &table_path, &descriptor).await;
+
+        // Write path: two batches of three rows each, acknowledged by a single flush.
+        let write_table = connection
+            .get_table(&table_path)
+            .await
+            .expect("Failed to get table");
+        let append_writer = write_table
+            .new_append()
+            .expect("Failed to create append")
+            .create_writer()
+            .expect("Failed to create writer");
+        let first_batch =
+            record_batch!(("c1", Int32, [1, 2, 3]), ("c2", Utf8, ["a1", "a2", "a3"])).unwrap();
+        let second_batch =
+            record_batch!(("c1", Int32, [4, 5, 6]), ("c2", Utf8, ["a4", "a5", "a6"])).unwrap();
+        for batch in [first_batch, second_batch] {
+            append_writer
+                .append_arrow_batch(batch)
+                .expect("Failed to append batch");
+        }
+        append_writer.flush().await.expect("Failed to flush");
+
+        // Read path: a fresh table handle, subscribed to every bucket from the beginning.
+        let read_table = connection
+            .get_table(&table_path)
+            .await
+            .expect("Failed to get table");
+        let bucket_count = read_table.get_table_info().get_num_buckets();
+        let log_scanner = read_table
+            .new_scan()
+            .create_log_scanner()
+            .expect("Failed to create log scanner");
+        for bucket in 0..bucket_count {
+            log_scanner
+                .subscribe(bucket, EARLIEST_OFFSET)
+                .await
+                .expect("Failed to subscribe with EARLIEST_OFFSET");
+        }
+
+        // Keep polling until all rows have arrived or ten seconds have passed.
+        let deadline = std::time::Instant::now() + Duration::from_secs(10);
+        let mut rows: Vec<(i32, String)> = Vec::new();
+        while rows.len() < EXPECTED_ROWS && std::time::Instant::now() < deadline {
+            let polled = log_scanner
+                .poll(Duration::from_millis(500))
+                .await
+                .expect("Failed to poll records");
+            rows.extend(polled.into_iter().map(|rec| {
+                let row = rec.row();
+                (
+                    row.get_int(0).unwrap(),
+                    row.get_string(1).unwrap().to_string(),
+                )
+            }));
+        }
+        assert_eq!(rows.len(), EXPECTED_ROWS, "Expected 6 records");
+
+        // Rows come from several buckets, so sort before comparing with what was written.
+        rows.sort();
+        let expected: Vec<(i32, String)> = (1..=6).map(|n| (n, format!("a{n}"))).collect();
+        assert_eq!(rows, expected);
+
+        // Unsubscribing a bucket of this table must succeed...
+        log_scanner
+            .unsubscribe(0)
+            .await
+            .expect("Failed to unsubscribe from bucket 0");
+
+        // ...whereas the partition-aware variant must be rejected.
+        let partition_unsubscribe = log_scanner.unsubscribe_partition(0, 0).await;
+        assert!(
+            partition_unsubscribe.is_err(),
+            "unsubscribe_partition should fail on a non-partitioned table"
+        );
+    }
+
+    /// Checks `list_offsets` on the admin client for bucket 0 of a log table.
+    ///
+    /// The earliest and latest offsets are queried on the empty table and again after three
+    /// rows have been appended. The rows are then scanned back to obtain their record
+    /// timestamps, which drive two `OffsetSpec::Timestamp` lookups: one just before the oldest
+    /// record and one just after the newest.
+    #[tokio::test]
+    async fn list_offsets() {
+        let cluster = get_shared_cluster();
+        let connection = cluster.get_fluss_connection().await;
+        let admin = connection.get_admin().expect("Failed to get admin");
+
+        let schema = Schema::builder()
+            .column("id", DataTypes::int())
+            .column("name", DataTypes::string())
+            .build()
+            .expect("Failed to build schema");
+        let descriptor = TableDescriptor::builder()
+            .schema(schema)
+            .build()
+            .expect("Failed to build table");
+        let table_path = TablePath::new("fluss", "test_list_offsets");
+        create_table(&admin, &table_path, &descriptor).await;
+        wait_for_table_ready(&admin, &table_path).await;
+
+        // Empty table: both ends of the log are at offset 0.
+        let empty_earliest = admin
+            .list_offsets(&table_path, &[0], OffsetSpec::Earliest)
+            .await
+            .expect("Failed to list earliest offsets");
+        assert_eq!(
+            empty_earliest.get(&0).copied(),
+            Some(0),
+            "Earliest offset should be 0 for bucket 0"
+        );
+
+        let empty_latest = admin
+            .list_offsets(&table_path, &[0], OffsetSpec::Latest)
+            .await
+            .expect("Failed to list latest offsets");
+        assert_eq!(
+            empty_latest.get(&0).copied(),
+            Some(0),
+            "Latest offset should be 0 for empty table"
+        );
+
+        // Write three rows and wait for them to be acknowledged.
+        let append_writer = connection
+            .get_table(&table_path)
+            .await
+            .expect("Failed to get table")
+            .new_append()
+            .expect("Failed to create append")
+            .create_writer()
+            .expect("Failed to create writer");
+        let people = record_batch!(
+            ("id", Int32, [1, 2, 3]),
+            ("name", Utf8, ["alice", "bob", "charlie"])
+        )
+        .unwrap();
+        append_writer
+            .append_arrow_batch(people)
+            .expect("Failed to append batch");
+        append_writer.flush().await.expect("Failed to flush");
+
+        // After the append the latest offset moves to 3 while the earliest stays at 0.
+        let filled_latest = admin
+            .list_offsets(&table_path, &[0], OffsetSpec::Latest)
+            .await
+            .expect("Failed to list latest offsets after append");
+        assert_eq!(
+            filled_latest.get(&0).copied(),
+            Some(3),
+            "Latest offset should be 3 after appending 3 records"
+        );
+
+        let filled_earliest = admin
+            .list_offsets(&table_path, &[0], OffsetSpec::Earliest)
+            .await
+            .expect("Failed to list earliest offsets after append");
+        assert_eq!(
+            filled_earliest.get(&0).copied(),
+            Some(0),
+            "Earliest offset should still be 0"
+        );
+
+        // Read the rows back and use the timestamps carried by the records themselves;
+        // timestamps taken on the test host are unreliable when its clock differs from the
+        // container's.
+        let read_table = connection
+            .get_table(&table_path)
+            .await
+            .expect("Failed to get table");
+        let log_scanner = read_table
+            .new_scan()
+            .create_log_scanner()
+            .expect("Failed to create log scanner");
+        log_scanner
+            .subscribe(0, EARLIEST_OFFSET)
+            .await
+            .expect("Failed to subscribe");
+
+        let mut timestamps: Vec<i64> = Vec::new();
+        let deadline = std::time::Instant::now() + Duration::from_secs(10);
+        while timestamps.len() < 3 && std::time::Instant::now() < deadline {
+            let polled = log_scanner
+                .poll(Duration::from_millis(500))
+                .await
+                .expect("Failed to poll records");
+            timestamps.extend(polled.into_iter().map(|rec| rec.timestamp()));
+        }
+        assert_eq!(timestamps.len(), 3, "Expected 3 record timestamps");
+
+        let oldest = timestamps.iter().copied().min().unwrap();
+        let newest = timestamps.iter().copied().max().unwrap();
+
+        // One tick before the oldest record: the lookup lands on the first offset.
+        let offsets_before = admin
+            .list_offsets(&table_path, &[0], OffsetSpec::Timestamp(oldest - 1))
+            .await
+            .expect("Failed to list offsets by timestamp (before)");
+        assert_eq!(
+            offsets_before.get(&0).copied(),
+            Some(0),
+            "Timestamp before first record should resolve to offset 0"
+        );
+
+        // One tick after the newest record: the lookup lands past the last offset.
+        let offsets_after = admin
+            .list_offsets(&table_path, &[0], OffsetSpec::Timestamp(newest + 1))
+            .await
+            .expect("Failed to list offsets by timestamp (after)");
+        assert_eq!(
+            offsets_after.get(&0).copied(),
+            Some(3),
+            "Timestamp after last record should resolve to offset 3"
+        );
+    }
+
+    /// Verifies column projection on a log scan, both by name and by index.
+    ///
+    /// Three rows are appended to a three-column table. `project_by_name(&["col_b", "col_c"])`
+    /// and `project(&[1, 0])` must each yield rows whose fields follow the requested order, and
+    /// `project_by_name` must return an error for an empty list and for an unknown column.
+    #[tokio::test]
+    async fn test_project() {
+        let cluster = get_shared_cluster();
+        let connection = cluster.get_fluss_connection().await;
+
+        let admin = connection.get_admin().expect("Failed to get admin");
+
+        let table_path = TablePath::new("fluss", "test_project");
+
+        let table_descriptor = TableDescriptor::builder()
+            .schema(
+                Schema::builder()
+                    .column("col_a", DataTypes::int())
+                    .column("col_b", DataTypes::string())
+                    .column("col_c", DataTypes::int())
+                    .build()
+                    .expect("Failed to build schema"),
+            )
+            .build()
+            .expect("Failed to build table");
+
+        create_table(&admin, &table_path, &table_descriptor).await;
+
+        let table = connection
+            .get_table(&table_path)
+            .await
+            .expect("Failed to get table");
+
+        // Append 3 records
+        let append_writer = table
+            .new_append()
+            .expect("Failed to create append")
+            .create_writer()
+            .expect("Failed to create writer");
+
+        let batch = record_batch!(
+            ("col_a", Int32, [1, 2, 3]),
+            ("col_b", Utf8, ["x", "y", "z"]),
+            ("col_c", Int32, [10, 20, 30])
+        )
+        .unwrap();
+        append_writer
+            .append_arrow_batch(batch)
+            .expect("Failed to append batch");
+        append_writer.flush().await.expect("Failed to flush");
+
+        // Test project_by_name: select col_b and col_c only
+        let records = scan_table(&table, |scan| {
+            scan.project_by_name(&["col_b", "col_c"])
+                .expect("Failed to project by name")
+        })
+        .await;
+
+        assert_eq!(
+            records.len(),
+            3,
+            "Should have 3 records with project_by_name"
+        );
+
+        // Verify projected columns are in the correct order (col_b, col_c)
+        let expected_col_b = ["x", "y", "z"];
+        let expected_col_c = [10, 20, 30];
+
+        for (i, record) in records.iter().enumerate() {
+            let row = record.row();
+            // col_b is now at index 0, col_c is at index 1
+            assert_eq!(
+                row.get_string(0).unwrap(),
+                expected_col_b[i],
+                "col_b mismatch at index {}",
+                i
+            );
+            assert_eq!(
+                row.get_int(1).unwrap(),
+                expected_col_c[i],
+                "col_c mismatch at index {}",
+                i
+            );
+        }
+
+        // Test project by column indices: [1, 0] selects col_b first, then col_a
+        let records = scan_table(&table, |scan| {
+            scan.project(&[1, 0]).expect("Failed to project by indices")
+        })
+        .await;
+
+        assert_eq!(records.len(), 3, "Should have 3 records with project");
+        // Verify projected columns are in the correct order (col_b, col_a)
+        let expected_col_b = ["x", "y", "z"];
+        let expected_col_a = [1, 2, 3];
+
+        for (i, record) in records.iter().enumerate() {
+            let row = record.row();
+            // col_b is now at index 0, col_a is at index 1
+            assert_eq!(
+                row.get_string(0).unwrap(),
+                expected_col_b[i],
+                "col_b mismatch at index {}",
+                i
+            );
+            assert_eq!(
+                row.get_int(1).unwrap(),
+                expected_col_a[i],
+                "col_a mismatch at index {}",
+                i
+            );
+        }
+
+        // Test error case: empty column names should fail
+        let result = table.new_scan().project_by_name(&[]);
+        assert!(
+            result.is_err(),
+            "project_by_name with empty names should fail"
+        );
+
+        // Test error case: non-existent column should fail
+        let result = table.new_scan().project_by_name(&["nonexistent_column"]);
+        assert!(
+            result.is_err(),
+            "project_by_name with non-existent column should fail"
+        );
+    }
+
+    /// Reads bucket 0 of `table` from offset 0 and returns what a single poll delivers.
+    ///
+    /// `setup_scan` receives the table's fresh `TableScan` and returns the scan to use, which
+    /// lets callers apply a projection before the log scanner is created. The scanner is polled
+    /// exactly once with a ten-second timeout, and the records from that poll are returned
+    /// sorted by offset.
+    ///
+    /// Panics if the scanner cannot be created, subscribed, or polled.
+    async fn scan_table<'a>(
+        table: &FlussTable<'a>,
+        setup_scan: impl FnOnce(TableScan) -> TableScan,
+    ) -> Vec<ScanRecord> {
+        // Let the caller customise the scan, then turn it into a log scanner.
+        let scan = setup_scan(table.new_scan());
+        let log_scanner = scan
+            .create_log_scanner()
+            .expect("Failed to create log scanner");
+
+        // Start bucket 0 at offset 0.
+        let start_offsets = HashMap::from([(0, 0)]);
+        log_scanner
+            .subscribe_buckets(&start_offsets)
+            .await
+            .expect("Failed to subscribe");
+
+        // A single poll; whatever it returns is the result.
+        let polled = log_scanner
+            .poll(Duration::from_secs(10))
+            .await
+            .expect("Failed to poll");
+
+        let mut ordered: Vec<ScanRecord> = polled.into_iter().collect();
+        ordered.sort_by_key(|record| record.offset());
+        ordered
+    }
+
+    /// Exercises the record-batch log scanner on bucket 0 of a two-column log table.
+    ///
+    /// Covers, in order: an empty poll on an empty table, ordering across several appended
+    /// batches, offset continuation between polls, subscribing from the middle of a batch, and
+    /// column projection.
+    #[tokio::test]
+    async fn test_poll_batches() {
+        let cluster = get_shared_cluster();
+        let connection = cluster.get_fluss_connection().await;
+        let admin = connection.get_admin().expect("Failed to get admin");
+
+        let table_path = TablePath::new("fluss", "test_poll_batches");
+        let schema = Schema::builder()
+            .column("id", DataTypes::int())
+            .column("name", DataTypes::string())
+            .build()
+            .unwrap();
+
+        create_table(
+            &admin,
+            &table_path,
+            &TableDescriptor::builder().schema(schema).build().unwrap(),
+        )
+        .await;
+        wait_for_table_ready(&admin, &table_path).await;
+
+        let table = connection.get_table(&table_path).await.unwrap();
+        let scanner = table.new_scan().create_record_batch_log_scanner().unwrap();
+        scanner.subscribe(0, 0).await.unwrap();
+
+        // Test 1: Empty table should return empty result
+        assert!(
+            scanner
+                .poll(Duration::from_millis(500))
+                .await
+                .unwrap()
+                .is_empty()
+        );
+
+        let writer = table.new_append().unwrap().create_writer().unwrap();
+        writer
+            .append_arrow_batch(
+                record_batch!(("id", Int32, [1, 2]), ("name", Utf8, ["a", "b"])).unwrap(),
+            )
+            .unwrap();
+        writer
+            .append_arrow_batch(
+                record_batch!(("id", Int32, [3, 4]), ("name", Utf8, ["c", "d"])).unwrap(),
+            )
+            .unwrap();
+        writer
+            .append_arrow_batch(
+                record_batch!(("id", Int32, [5, 6]), ("name", Utf8, ["e", "f"])).unwrap(),
+            )
+            .unwrap();
+        writer.flush().await.unwrap();
+
+        // poll may return partial results if not all batches are available yet,
+        // so we accumulate across multiple polls until we have the expected count.
+        let mut all_ids = Vec::new();
+        let deadline = tokio::time::Instant::now() + Duration::from_secs(10);
+        while all_ids.len() < 6 && tokio::time::Instant::now() < deadline {
+            let batches = scanner.poll(Duration::from_secs(5)).await.unwrap();
+            all_ids.extend(extract_ids_from_batches(&batches));
+        }
+
+        // Test 2: Order should be preserved across multiple batches
+        assert_eq!(all_ids, vec![1, 2, 3, 4, 5, 6]);
+
+        writer
+            .append_arrow_batch(
+                record_batch!(("id", Int32, [7, 8]), ("name", Utf8, ["g", "h"])).unwrap(),
+            )
+            .unwrap();
+        writer.flush().await.unwrap();
+
+        let mut new_ids = Vec::new();
+        let deadline = tokio::time::Instant::now() + Duration::from_secs(10);
+        while new_ids.len() < 2 && tokio::time::Instant::now() < deadline {
+            let more = scanner.poll(Duration::from_secs(5)).await.unwrap();
+            new_ids.extend(extract_ids_from_batches(&more));
+        }
+
+        // Test 3: Subsequent polls should not return duplicate data (offset continuation)
+        assert_eq!(new_ids, vec![7, 8]);
+
+        // Test 4: Subscribing from mid-offset should truncate batch (Arrow batch slicing)
+        // Server returns all records from start of batch, but client truncates to subscription offset
+        let trunc_scanner = table.new_scan().create_record_batch_log_scanner().unwrap();
+        trunc_scanner.subscribe(0, 3).await.unwrap();
+        let mut trunc_ids = Vec::new();
+        let deadline = tokio::time::Instant::now() + Duration::from_secs(10);
+        while trunc_ids.len() < 5 && tokio::time::Instant::now() < deadline {
+            let trunc_batches = trunc_scanner.poll(Duration::from_secs(5)).await.unwrap();
+            trunc_ids.extend(extract_ids_from_batches(&trunc_batches));
+        }
+
+        // Subscribing from offset 3 should return [4,5,6,7,8], not [1,2,3,4,5,6,7,8]
+        assert_eq!(trunc_ids, vec![4, 5, 6, 7, 8]);
+
+        // Test 5: Projection should only return requested columns
+        let proj = table
+            .new_scan()
+            .project_by_name(&["id"])
+            .unwrap()
+            .create_record_batch_log_scanner()
+            .unwrap();
+        proj.subscribe(0, 0).await.unwrap();
+        let proj_batches = proj.poll(Duration::from_secs(10)).await.unwrap();
+
+        // Fail with a readable message, rather than an index-out-of-bounds panic, if the poll
+        // timed out without delivering any batch.
+        assert!(
+            !proj_batches.is_empty(),
+            "Projected scan should return at least one batch"
+        );
+
+        // Projected batch should have 1 column (id), not 2 (id, name)
+        assert_eq!(proj_batches[0].batch().num_columns(), 1);
+    }
+
+    /// Appends to and scans from a log table partitioned by `region`.
+    ///
+    /// Covers row-by-row `append` and `append_arrow_batch` into the `US` and `EU` partitions,
+    /// `list_partition_offsets` (including the error for an unknown partition), and three ways
+    /// of consuming the data: `subscribe_partition` per partition, `unsubscribe_partition` to
+    /// stop reading one partition, and `subscribe_partition_buckets` for a batch subscription.
+    #[tokio::test]
+    async fn partitioned_table_append_scan() {
+        /// Decodes the `(id, region, value)` columns of one scanned record.
+        fn decode_record(record: &ScanRecord) -> (i32, String, i64) {
+            let row = record.row();
+            (
+                row.get_int(0).unwrap(),
+                row.get_string(1).unwrap().to_string(),
+                row.get_long(2).unwrap(),
+            )
+        }
+
+        let cluster = get_shared_cluster();
+        let connection = cluster.get_fluss_connection().await;
+
+        let admin = connection.get_admin().expect("Failed to get admin");
+
+        let table_path = TablePath::new("fluss", "test_partitioned_log_append");
+
+        // Create a partitioned log table
+        let table_descriptor = TableDescriptor::builder()
+            .schema(
+                Schema::builder()
+                    .column("id", DataTypes::int())
+                    .column("region", DataTypes::string())
+                    .column("value", DataTypes::bigint())
+                    .build()
+                    .expect("Failed to build schema"),
+            )
+            .partitioned_by(vec!["region"])
+            .build()
+            .expect("Failed to build table");
+
+        create_table(&admin, &table_path, &table_descriptor).await;
+
+        // Create partitions
+        create_partitions(&admin, &table_path, "region", &["US", "EU"]).await;
+
+        // Wait for partition bucket leaders to be available.
+        wait_for_partitions_ready(&admin, &table_path, &["US", "EU"]).await;
+
+        let table = connection
+            .get_table(&table_path)
+            .await
+            .expect("Failed to get table");
+
+        // Create append writer - this should now work for partitioned tables
+        let append_writer = table
+            .new_append()
+            .expect("Failed to create append")
+            .create_writer()
+            .expect("Failed to create writer");
+
+        // Append records with different partitions
+        let test_data = [
+            (1, "US", 100i64),
+            (2, "US", 200i64),
+            (3, "EU", 300i64),
+            (4, "EU", 400i64),
+        ];
+
+        for (id, region, value) in &test_data {
+            let mut row = GenericRow::new(3);
+            row.set_field(0, *id);
+            row.set_field(1, *region);
+            row.set_field(2, *value);
+            append_writer.append(&row).expect("Failed to append row");
+        }
+
+        append_writer.flush().await.expect("Failed to flush");
+
+        // Test append_arrow_batch for partitioned tables
+        // Each batch must contain rows from the same partition
+        let us_batch = record_batch!(
+            ("id", Int32, [5, 6]),
+            ("region", Utf8, ["US", "US"]),
+            ("value", Int64, [500, 600])
+        )
+        .unwrap();
+        append_writer
+            .append_arrow_batch(us_batch)
+            .expect("Failed to append US batch");
+
+        let eu_batch = record_batch!(
+            ("id", Int32, [7, 8]),
+            ("region", Utf8, ["EU", "EU"]),
+            ("value", Int64, [700, 800])
+        )
+        .unwrap();
+        append_writer
+            .append_arrow_batch(eu_batch)
+            .expect("Failed to append EU batch");
+
+        append_writer
+            .flush()
+            .await
+            .expect("Failed to flush batches");
+
+        // Test list_partition_offsets
+        // US partition has 4 records: 2 from row append + 2 from batch append
+        let us_offsets = admin
+            .list_partition_offsets(&table_path, "US", &[0], OffsetSpec::Latest)
+            .await
+            .expect("Failed to list offsets for US partition");
+        assert_eq!(
+            us_offsets.get(&0),
+            Some(&4),
+            "US partition should have 4 records"
+        );
+
+        // EU partition has 4 records: 2 from row append + 2 from batch append
+        let eu_offsets = admin
+            .list_partition_offsets(&table_path, "EU", &[0], OffsetSpec::Latest)
+            .await
+            .expect("Failed to list offsets for EU partition");
+        assert_eq!(
+            eu_offsets.get(&0),
+            Some(&4),
+            "EU partition should have 4 records"
+        );
+
+        // Listing offsets for a partition that does not exist should return an error
+        let result = admin
+            .list_partition_offsets(&table_path, "NOT Exists", &[0], OffsetSpec::Latest)
+            .await;
+        assert!(result.is_err());
+        assert!(result.unwrap_err().to_string().contains(
+            "Table partition 'fluss.test_partitioned_log_append(p=NOT Exists)' does not exist."
+        ));
+
+        // Look the partitions up once; every subscription below reuses this listing.
+        let partition_infos = admin
+            .list_partition_infos(&table_path)
+            .await
+            .expect("Failed to list partition infos");
+
+        // Test subscribe_partition: subscribe to bucket 0 of each partition in turn
+        let log_scanner = table
+            .new_scan()
+            .create_log_scanner()
+            .expect("Failed to create log scanner");
+        for info in &partition_infos {
+            log_scanner
+                .subscribe_partition(info.get_partition_id(), 0, 0)
+                .await
+                .expect("Failed to subscribe to partition");
+        }
+
+        let expected_records = vec![
+            (1, "US", 100i64),
+            (2, "US", 200i64),
+            (3, "EU", 300i64),
+            (4, "EU", 400i64),
+            (5, "US", 500i64),
+            (6, "US", 600i64),
+            (7, "EU", 700i64),
+            (8, "EU", 800i64),
+        ];
+        let expected_records: Vec<(i32, String, i64)> = expected_records
+            .into_iter()
+            .map(|(id, region, val)| (id, region.to_string(), val))
+            .collect();
+
+        let mut collected_records: Vec<(i32, String, i64)> = Vec::new();
+        let start_time = std::time::Instant::now();
+        while collected_records.len() < expected_records.len()
+            && start_time.elapsed() < Duration::from_secs(10)
+        {
+            let records = log_scanner
+                .poll(Duration::from_millis(500))
+                .await
+                .expect("Failed to poll log scanner");
+            for rec in records {
+                collected_records.push(decode_record(&rec));
+            }
+        }
+
+        assert_eq!(
+            collected_records.len(),
+            expected_records.len(),
+            "Did not receive all records in time, expect receive {} records, but got {} records",
+            expected_records.len(),
+            collected_records.len()
+        );
+        // Records arrive interleaved across partitions; ids are unique, so sort by id.
+        collected_records.sort_by_key(|r| r.0);
+        assert_eq!(
+            collected_records, expected_records,
+            "Data mismatch between sent and received"
+        );
+
+        // Test unsubscribe_partition: after unsubscribing from one partition,
+        // data from that partition should no longer be read.
+        let log_scanner_unsub = table
+            .new_scan()
+            .create_log_scanner()
+            .expect("Failed to create log scanner for unsubscribe test");
+        let eu_partition_id = partition_infos
+            .iter()
+            .find(|p| p.get_partition_name() == "EU")
+            .map(|p| p.get_partition_id())
+            .expect("EU partition should exist");
+        for info in &partition_infos {
+            log_scanner_unsub
+                .subscribe_partition(info.get_partition_id(), 0, 0)
+                .await
+                .expect("Failed to subscribe to partition");
+        }
+        log_scanner_unsub
+            .unsubscribe_partition(eu_partition_id, 0)
+            .await
+            .expect("Failed to unsubscribe from EU partition");
+
+        let mut records_after_unsubscribe: Vec<(i32, String, i64)> = Vec::new();
+        let unsub_deadline = std::time::Instant::now() + Duration::from_secs(5);
+        while records_after_unsubscribe.len() < 4 && std::time::Instant::now() < unsub_deadline {
+            let records = log_scanner_unsub
+                .poll(Duration::from_millis(300))
+                .await
+                .expect("Failed to poll after unsubscribe");
+            for rec in records {
+                records_after_unsubscribe.push(decode_record(&rec));
+            }
+        }
+
+        assert!(
+            records_after_unsubscribe.iter().all(|r| r.1 == "US"),
+            "After unsubscribe_partition(EU), only US partition data should be read; got regions: {:?}",
+            records_after_unsubscribe
+                .iter()
+                .map(|r| r.1.as_str())
+                .collect::<Vec<_>>()
+        );
+        assert_eq!(
+            records_after_unsubscribe.len(),
+            4,
+            "Should receive exactly 4 US records (ids 1,2,5,6); got {}",
+            records_after_unsubscribe.len()
+        );
+
+        // Test subscribe_partition_buckets: batch subscribe to all partitions at once
+        let log_scanner_batch = table
+            .new_scan()
+            .create_log_scanner()
+            .expect("Failed to create log scanner for batch partition subscribe test");
+        // Key is (partition id, bucket id); value is the start offset.
+        let partition_bucket_offsets: HashMap<(i64, i32), i64> = partition_infos
+            .iter()
+            .map(|p| ((p.get_partition_id(), 0), 0i64))
+            .collect();
+        log_scanner_batch
+            .subscribe_partition_buckets(&partition_bucket_offsets)
+            .await
+            .expect("Failed to batch subscribe to partitions");
+
+        let mut batch_collected: Vec<(i32, String, i64)> = Vec::new();
+        let batch_start = std::time::Instant::now();
+        while batch_collected.len() < expected_records.len()
+            && batch_start.elapsed() < Duration::from_secs(10)
+        {
+            let records = log_scanner_batch
+                .poll(Duration::from_millis(500))
+                .await
+                .expect("Failed to poll after batch partition subscribe");
+            for rec in records {
+                batch_collected.push(decode_record(&rec));
+            }
+        }
+        assert_eq!(
+            batch_collected.len(),
+            expected_records.len(),
+            "Did not receive all records in time, expect receive {} records, but got {} records",
+            expected_records.len(),
+            batch_collected.len()
+        );
+        batch_collected.sort_by_key(|r| r.0);
+        assert_eq!(
+            batch_collected, expected_records,
+            "subscribe_partition_buckets should receive the same records as subscribe_partition loop"
+        );
+
+        admin
+            .drop_table(&table_path, false)
+            .await
+            .expect("Failed to drop table");
+    }
+
+    /// Writes a single row containing ROW, MAP and ARRAY columns to a log table, then reads it
+    /// back through a reordered projection that leaves out the trailing `extra` column.
+    #[tokio::test]
+    async fn projection_with_compound_types() {
+        let cluster = get_shared_cluster();
+        let connection = cluster.get_fluss_connection().await;
+        let admin = connection.get_admin().expect("Failed to get admin");
+        let table_path = TablePath::new("fluss", "test_log_projection_compound");
+
+        // Declared column order: id, nested, attrs, tags, extra.
+        let schema = Schema::builder()
+            .column("id", DataTypes::int())
+            .column(
+                "nested",
+                DataTypes::row(vec![
+                    DataField::new("seq", DataTypes::int(), None),
+                    DataField::new("label", DataTypes::string(), None),
+                ]),
+            )
+            .column(
+                "attrs",
+                DataTypes::map(DataTypes::string(), DataTypes::int()),
+            )
+            .column("tags", DataTypes::array(DataTypes::string()))
+            .column("extra", DataTypes::string())
+            .build()
+            .expect("schema");
+
+        let descriptor = TableDescriptor::builder()
+            .schema(schema)
+            .build()
+            .expect("table descriptor");
+        create_table(&admin, &table_path, &descriptor).await;
+
+        let table = connection.get_table(&table_path).await.expect("table");
+        let writer = table
+            .new_append()
+            .expect("append")
+            .create_writer()
+            .expect("writer");
+
+        // Assemble the compound values that make up the one row being written.
+        let mut inner_row = GenericRow::new(2);
+        inner_row.set_field(0, 42_i32);
+        inner_row.set_field(1, "hello");
+        let attr_map = {
+            let mut map_writer = FlussMapWriter::new(2, &DataTypes::string(), &DataTypes::int());
+            map_writer.write_entry("x".into(), 1.into()).unwrap();
+            map_writer.write_entry("y".into(), 2.into()).unwrap();
+            map_writer.complete().expect("attrs")
+        };
+        let tag_array = make_string_array(&[Some("alpha"), Some("beta")]);
+
+        let mut payload = GenericRow::new(5);
+        payload.set_field(0, 7_i32);
+        payload.set_field(1, Datum::Row(Box::new(inner_row)));
+        payload.set_field(2, Datum::Map(attr_map));
+        payload.set_field(3, tag_array);
+        payload.set_field(4, "ignore-me");
+        writer.append(&payload).expect("append");
+        writer.flush().await.expect("flush");
+
+        // Scan with the columns reordered (nested, attrs, tags, id) and `extra` left out.
+        let records = scan_table(&table, |scan| {
+            scan.project_by_name(&["nested", "attrs", "tags", "id"])
+                .expect("project failed")
+        })
+        .await;
+        assert_eq!(records.len(), 1);
+        let projected = records[0].row();
+
+        // Projected position 0: the ROW column.
+        let nested_view = projected.get_row(0).expect("get_row over projection");
+        assert_eq!(nested_view.get_int(0).unwrap(), 42);
+        assert_eq!(nested_view.get_string(1).unwrap(), "hello");
+
+        // Projected position 1: the MAP column.
+        let attrs_view = projected.get_map(1).expect("get_map over projection");
+        assert_eq!(attrs_view.size(), 2);
+        for (key, value) in [("x", 1_i32), ("y", 2_i32)] {
+            assert_eq!(
+                attrs_view.get(&Datum::from(key)).unwrap(),
+                Some(Datum::from(value))
+            );
+        }
+
+        // Projected position 2: the ARRAY column.
+        let tags_view = projected.get_array(2).expect("get_array over projection");
+        assert_eq!(tags_view.size(), 2);
+        for (pos, expected_tag) in [(0, "alpha"), (1, "beta")] {
+            assert_eq!(tags_view.get_string(pos).unwrap(), expected_tag);
+        }
+
+        // Projected position 3: the scalar `id`, moved from first to last.
+        assert_eq!(projected.get_int(3).unwrap(), 7);
+
+        admin.drop_table(&table_path, false).await.expect("drop");
+    }
+
+    /// Log append + scan against a schema covering every supported data type.
+    #[tokio::test]
+    async fn all_supported_datatypes() {
+        /// Asserts that `actual` matches `expected`, with special handling for non-finite values:
+        /// a NaN expectation only requires `actual` to be NaN, an infinite expectation requires
+        /// an infinity of the same sign, and any other value must be within `f32::EPSILON`.
+        fn assert_f32_special(actual: f32, expected: f32) {
+            // NaN never compares equal to itself, so it is checked by classification.
+            if expected.is_nan() {
+                assert!(actual.is_nan(), "expected NaN");
+                return;
+            }
+            // `inf - inf` is NaN, so infinities are compared by class and sign, not by difference.
+            if expected.is_infinite() {
+                assert!(actual.is_infinite());
+                assert_eq!(actual.signum(), expected.signum());
+                return;
+            }
+            assert!((actual - expected).abs() < f32::EPSILON);
+        }
+        /// Asserts that `actual` matches `expected`, with special handling for non-finite values:
+        /// a NaN expectation only requires `actual` to be NaN, an infinite expectation requires
+        /// an infinity of the same sign, and any other value must be within `f64::EPSILON`.
+        fn assert_f64_special(actual: f64, expected: f64) {
+            // NaN never compares equal to itself, so it is checked by classification.
+            if expected.is_nan() {
+                assert!(actual.is_nan(), "expected NaN");
+                return;
+            }
+            // `inf - inf` is NaN, so infinities are compared by class and sign, not by difference.
+            if expected.is_infinite() {
+                assert!(actual.is_infinite());
+                assert_eq!(actual.signum(), expected.signum());
+                return;
+            }
+            assert!((actual - expected).abs() < f64::EPSILON);
+        }
+
+        let cluster = get_shared_cluster();
+        let connection = cluster.get_fluss_connection().await;
+        let admin = connection.get_admin().expect("Failed to get admin");
+
+        let table_path = TablePath::new("fluss", "test_log_complex_types");
+
+        let row_seq_label_owned = dt_row_seq_label();
+        let row_seq_label = as_row_type(&row_seq_label_owned);
+        let inner_array_int = dt_array_int();
+        let inner_map_string_int = dt_map_string_int();
+
+        let plan = ColumnPlan::new()
+            .add("id", DataTypes::int())
+            .start_section("array_basics")
+            .extend(array_dt_basics_columns())
+            .start_section("row_basics")
+            .extend(row_dt_basics_columns())
+            .start_section("map_basics")
+            .extend(map_dt_basics_columns())
+            // ARRAY rich types
+            .start_section("array_rich")
+            .add("arr_bytes", DataTypes::array(DataTypes::bytes()))
+            .add("arr_date", DataTypes::array(DataTypes::date()))
+            .add(
+                "arr_time",
+                DataTypes::array(DataTypes::time_with_precision(3)),
+            )
+            .add(
+                "arr_ts",
+                DataTypes::array(DataTypes::timestamp_with_precision(6)),
+            )
+            .add(
+                "arr_ts_ltz",
+                DataTypes::array(DataTypes::timestamp_ltz_with_precision(3)),
+            )
+            .add("arr_decimal", DataTypes::array(DataTypes::decimal(10, 2)))
+            .add(
+                "arr_decimal_big",
+                DataTypes::array(DataTypes::decimal(22, 5)),
+            )
+            .add("arr_float", DataTypes::array(DataTypes::float()))
+            .add("arr_double", DataTypes::array(DataTypes::double()))
+            .add("arr_binary", DataTypes::array(DataTypes::binary(4)))
+            // MAP rich types
+            .start_section("map_rich")
+            .add(
+                "map_bytes",
+                DataTypes::map(DataTypes::string(), DataTypes::bytes()),
+            )
+            .add(
+                "map_decimal",
+                DataTypes::map(DataTypes::string(), DataTypes::decimal(10, 2)),
+            )
+            .add(
+                "map_date",
+                DataTypes::map(DataTypes::string(), DataTypes::date()),
+            )
+            .add(
+                "map_time",
+                DataTypes::map(DataTypes::string(), DataTypes::time_with_precision(3)),
+            )
+            .add(
+                "map_ts",
+                DataTypes::map(DataTypes::string(), DataTypes::timestamp_with_precision(6)),
+            )
+            .add(
+                "map_ts_ltz",
+                DataTypes::map(
+                    DataTypes::string(),
+                    DataTypes::timestamp_ltz_with_precision(3),
+                ),
+            )
+            .add(
+                "map_float",
+                DataTypes::map(DataTypes::string(), DataTypes::float()),
+            )
+            .add(
+                "map_double",
+                DataTypes::map(DataTypes::string(), DataTypes::double()),
+            )
+            .add(
+                "map_bool",
+                DataTypes::map(DataTypes::string(), DataTypes::boolean()),
+            )
+            .add(
+                "map_binary",
+                DataTypes::map(DataTypes::string(), DataTypes::binary(4)),
+            )
+            .add(
+                "map_int_key",
+                DataTypes::map(DataTypes::int(), DataTypes::string()),
+            )
+            .start_section("scalars")
+            .extend(scalar_dt_columns());
+        let column_count = plan.len();
+
+        create_table(
+            &admin,
+            &table_path,
+            &TableDescriptor::builder()
+                .schema(plan.build_schema(None))
+                .build()
+                .expect("table descriptor"),
+        )
+        .await;
+
+        let table = connection.get_table(&table_path).await.expect("table");
+        let writer = table
+            .new_append()
+            .expect("append")
+            .create_writer()
+            .expect("writer");
+
+        // Shared scalar values
+        let dec = Decimal::from_unscaled_long(12345, 10, 2).unwrap();
+        let dec_big = Decimal::from_unscaled_bytes(&[66, 237, 18, 59, 11, 216, 31, 4, 244], 22, 5)
+            .expect("big decimal");
+        let date_v = Date::new(20476);
+        let time_v = Time::new(36_827_123);
+        let ts_v = TimestampNtz::from_millis_nanos(1_769_163_227_123, 456_000).unwrap();
+        let ts_ltz_v = TimestampLtz::new(1_769_163_227_123);
+        let bytes_v = vec![0xDE_u8, 0xAD, 0xBE, 0xEF];
+        let fixed_a = vec![0x01_u8, 0x02, 0x03, 0x04];
+        let fixed_b = vec![0xAA_u8, 0xBB, 0xCC, 0xDD];
+
+        // Row 0 — every column populated.
+        let mut row0 = GenericRow::new(column_count);
+        row0.set_field(0, 1_i32);
+
+        // ARRAY basics
+        row0.set_field(1, make_int_array(&[Some(10), Some(20), Some(30)]));
+        row0.set_field(2, make_string_array(&[Some("hello"), Some("world")]));
+        let arr_of_arr_0 = {
+            let mut w = FlussArrayWriter::new(2, &inner_array_int);
+            w.write_array(0, &make_int_array(&[Some(1), Some(2)]));
+            w.write_array(1, &make_int_array(&[Some(3), Some(4)]));
+            w.complete().expect("arr_of_arr_0")
+        };
+        row0.set_field(3, arr_of_arr_0);
+        let arr_of_row_0 = {
+            let mut w = FlussArrayWriter::new(2, &row_seq_label_owned);
+            let mut e0 = GenericRow::new(2);
+            e0.set_field(0, 1_i32);
+            e0.set_field(1, "open");
+            w.write_row(0, &e0).expect("e0");
+            let mut e1 = GenericRow::new(2);
+            e1.set_field(0, 2_i32);
+            e1.set_field(1, "close");
+            w.write_row(1, &e1).expect("e1");
+            w.complete().expect("arr_of_row_0")
+        };
+        row0.set_field(4, arr_of_row_0);
+
+        // ROW basics
+        let mut row_basic_0 = GenericRow::new(2);
+        row_basic_0.set_field(0, 42_i32);
+        row_basic_0.set_field(1, "hello");
+        row0.set_field(5, Datum::Row(Box::new(row_basic_0)));
+
+        let mut row_deep_inner_0 = GenericRow::new(1);
+        row_deep_inner_0.set_field(0, 99_i32);
+        let mut row_deep_0 = GenericRow::new(1);
+        row_deep_0.set_field(0, Datum::Row(Box::new(row_deep_inner_0)));
+        row0.set_field(6, Datum::Row(Box::new(row_deep_0)));
+
+        let mut row_rich_0 = GenericRow::new(14);
+        row_rich_0.set_field(0, true);
+        row_rich_0.set_field(1, 100_000_i32);
+        row_rich_0.set_field(2, 9_876_543_210_i64);
+        row_rich_0.set_field(3, f32::INFINITY);
+        row_rich_0.set_field(4, f64::NAN);
+        row_rich_0.set_field(5, "hello world");
+        row_rich_0.set_field(6, b"binary".as_slice());
+        row_rich_0.set_field(7, dec.clone());
+        row_rich_0.set_field(8, Datum::Date(Date::new(20476)));
+        row_rich_0.set_field(9, Datum::Time(Time::new(36_827_123)));
+        row_rich_0.set_field(
+            10,
+            Datum::TimestampNtz(TimestampNtz::new(1_769_163_227_123)),
+        );
+        row_rich_0.set_field(
+            11,
+            Datum::TimestampLtz(TimestampLtz::new(1_769_163_227_456)),
+        );
+        row_rich_0.set_field(12, b"\x01\x02\x03\x04".as_slice());
+        row_rich_0.set_field(13, make_int_array(&[Some(7), None, Some(11)]));
+        row0.set_field(7, Datum::Row(Box::new(row_rich_0)));
+
+        // MAP basics
+        let map_string_int_0 = {
+            let mut w = FlussMapWriter::new(3, &DataTypes::string(), &DataTypes::int());
+            w.write_entry("a".into(), 1.into()).unwrap();
+            w.write_entry("b".into(), Datum::Null).unwrap();
+            w.write_entry("c".into(), 3.into()).unwrap();
+            w.complete().expect("map_string_int_0")
+        };
+        row0.set_field(8, Datum::Map(map_string_int_0));
+
+        let map_of_row_0 = {
+            let mut e0 = GenericRow::new(2);
+            e0.set_field(0, 1_i32);
+            e0.set_field(1, "open");
+            let mut e1 = GenericRow::new(2);
+            e1.set_field(0, 2_i32);
+            e1.set_field(1, "close");
+            let mut w = FlussMapWriter::new(2, &DataTypes::string(), &row_seq_label_owned);
+            w.write_entry("e0".into(), Datum::Row(Box::new(e0)))
+                .unwrap();
+            w.write_entry("e1".into(), Datum::Row(Box::new(e1)))
+                .unwrap();
+            w.complete().expect("map_of_row_0")
+        };
+        row0.set_field(9, Datum::Map(map_of_row_0));
+
+        let map_of_map_0 = {
+            let g1 = {
+                let mut w = FlussMapWriter::new(2, &DataTypes::string(), &DataTypes::int());
+                w.write_entry("a".into(), 1.into()).unwrap();
+                w.write_entry("b".into(), 2.into()).unwrap();
+                w.complete().expect("g1")
+            };
+            let g2 = {
+                let mut w = FlussMapWriter::new(1, &DataTypes::string(), &DataTypes::int());
+                w.write_entry("c".into(), 3.into()).unwrap();
+                w.complete().expect("g2")
+            };
+            let mut w = FlussMapWriter::new(2, &DataTypes::string(), &inner_map_string_int);
+            w.write_entry("g1".into(), Datum::Map(g1)).unwrap();
+            w.write_entry("g2".into(), Datum::Map(g2)).unwrap();
+            w.complete().expect("map_of_map_0")
+        };
+        row0.set_field(10, Datum::Map(map_of_map_0));
+
+        let map_of_array_0 = {
+            let primes = make_int_array(&[Some(2), Some(3), Some(5)]);
+            let squares = make_int_array(&[Some(1), Some(4)]);
+            let mut w = FlussMapWriter::new(2, &DataTypes::string(), &inner_array_int);
+            w.write_entry("primes".into(), Datum::Array(primes))
+                .unwrap();
+            w.write_entry("squares".into(), Datum::Array(squares))
+                .unwrap();
+            w.complete().expect("map_of_array_0")
+        };
+        row0.set_field(11, Datum::Map(map_of_array_0));
+
+        let array_of_map_0 = {
+            let m0 = {
+                let mut w = FlussMapWriter::new(2, &DataTypes::string(), &DataTypes::int());
+                w.write_entry("x".into(), 1.into()).unwrap();
+                w.write_entry("y".into(), 2.into()).unwrap();
+                w.complete().expect("m0")
+            };
+            let m1 = {
+                let mut w = FlussMapWriter::new(1, &DataTypes::string(), &DataTypes::int());
+                w.write_entry("z".into(), 9.into()).unwrap();
+                w.complete().expect("m1")
+            };
+            let mut w = FlussArrayWriter::new(2, &inner_map_string_int);
+            w.write_map(0, &m0);
+            w.write_map(1, &m1);
+            w.complete().expect("array_of_map_0")
+        };
+        row0.set_field(12, array_of_map_0);
+
+        // ARRAY rich types
+        let arr_bytes_0 = {
+            let mut w = FlussArrayWriter::new(2, &DataTypes::bytes());
+            w.write_binary_bytes(0, &bytes_v);
+            w.set_null_at(1);
+            w.complete().expect("arr_bytes_0")
+        };
+        row0.set_field(13, arr_bytes_0);
+        let arr_date_0 = {
+            let mut w = FlussArrayWriter::new(2, &DataTypes::date());
+            w.write_date(0, date_v);
+            w.set_null_at(1);
+            w.complete().expect("arr_date_0")
+        };
+        row0.set_field(14, arr_date_0);
+        let arr_time_0 = {
+            let mut w = FlussArrayWriter::new(2, &DataTypes::time_with_precision(3));
+            w.write_time(0, time_v);
+            w.set_null_at(1);
+            w.complete().expect("arr_time_0")
+        };
+        row0.set_field(15, arr_time_0);
+        let arr_ts_0 = {
+            let mut w = FlussArrayWriter::new(2, &DataTypes::timestamp_with_precision(6));
+            w.write_timestamp_ntz(0, &ts_v, 6);
+            w.set_null_at(1);
+            w.complete().expect("arr_ts_0")
+        };
+        row0.set_field(16, arr_ts_0);
+        let arr_ts_ltz_0 = {
+            let mut w = FlussArrayWriter::new(2, &DataTypes::timestamp_ltz_with_precision(3));
+            w.write_timestamp_ltz(0, &ts_ltz_v, 3);
+            w.set_null_at(1);
+            w.complete().expect("arr_ts_ltz_0")
+        };
+        row0.set_field(17, arr_ts_ltz_0);
+        let arr_decimal_0 = {
+            let mut w = FlussArrayWriter::new(2, &DataTypes::decimal(10, 2));
+            w.write_decimal(0, &dec, 10);
+            w.set_null_at(1);
+            w.complete().expect("arr_decimal_0")
+        };
+        row0.set_field(18, arr_decimal_0);
+        let arr_decimal_big_0 = {
+            let mut w = FlussArrayWriter::new(1, &DataTypes::decimal(22, 5));
+            w.write_decimal(0, &dec_big, 22);
+            w.complete().expect("arr_decimal_big_0")
+        };
+        row0.set_field(19, arr_decimal_big_0);
+        let arr_float_0 = {
+            let mut w = FlussArrayWriter::new(3, &DataTypes::float());
+            w.write_float(0, f32::NAN);
+            w.write_float(1, f32::INFINITY);
+            w.write_float(2, f32::NEG_INFINITY);
+            w.complete().expect("arr_float_0")
+        };
+        row0.set_field(20, arr_float_0);
+        let arr_double_0 = {
+            let mut w = FlussArrayWriter::new(3, &DataTypes::double());
+            w.write_double(0, f64::NAN);
+            w.write_double(1, f64::INFINITY);
+            w.write_double(2, f64::NEG_INFINITY);
+            w.complete().expect("arr_double_0")
+        };
+        row0.set_field(21, arr_double_0);
+        let arr_binary_0 = {
+            let mut w = FlussArrayWriter::new(2, &DataTypes::binary(4));
+            w.write_binary_bytes(0, &fixed_a);
+            w.write_binary_bytes(1, &fixed_b);
+            w.complete().expect("arr_binary_0")
+        };
+        row0.set_field(22, arr_binary_0);
+
+        // MAP rich types
+        let map_bytes_0 = {
+            let mut w = FlussMapWriter::new(1, &DataTypes::string(), &DataTypes::bytes());
+            w.write_entry("blob".into(), bytes_v.as_slice().into())
+                .unwrap();
+            w.complete().expect("map_bytes_0")
+        };
+        row0.set_field(23, Datum::Map(map_bytes_0));
+        let map_decimal_0 = {
+            let mut w = FlussMapWriter::new(1, &DataTypes::string(), &DataTypes::decimal(10, 2));
+            w.write_entry("price".into(), Datum::Decimal(dec.clone()))
+                .unwrap();
+            w.complete().expect("map_decimal_0")
+        };
+        row0.set_field(24, Datum::Map(map_decimal_0));
+        let map_date_0 = {
+            let mut w = FlussMapWriter::new(1, &DataTypes::string(), &DataTypes::date());
+            w.write_entry("d".into(), Datum::Date(date_v)).unwrap();
+            w.complete().expect("map_date_0")
+        };
+        row0.set_field(25, Datum::Map(map_date_0));
+        let map_time_0 = {
+            let mut w =
+                FlussMapWriter::new(1, &DataTypes::string(), &DataTypes::time_with_precision(3));
+            w.write_entry("t".into(), Datum::Time(time_v)).unwrap();
+            w.complete().expect("map_time_0")
+        };
+        row0.set_field(26, Datum::Map(map_time_0));
+        let map_ts_0 = {
+            let mut w = FlussMapWriter::new(
+                1,
+                &DataTypes::string(),
+                &DataTypes::timestamp_with_precision(6),
+            );
+            w.write_entry("ts".into(), Datum::TimestampNtz(ts_v))
+                .unwrap();
+            w.complete().expect("map_ts_0")
+        };
+        row0.set_field(27, Datum::Map(map_ts_0));
+        let map_ts_ltz_0 = {
+            let mut w = FlussMapWriter::new(
+                1,
+                &DataTypes::string(),
+                &DataTypes::timestamp_ltz_with_precision(3),
+            );
+            w.write_entry("ts".into(), Datum::TimestampLtz(ts_ltz_v))
+                .unwrap();
+            w.complete().expect("map_ts_ltz_0")
+        };
+        row0.set_field(28, Datum::Map(map_ts_ltz_0));
+        let map_float_0 = {
+            let mut w = FlussMapWriter::new(2, &DataTypes::string(), &DataTypes::float());
+            w.write_entry("nan".into(), f32::NAN.into()).unwrap();
+            w.write_entry("inf".into(), f32::INFINITY.into()).unwrap();
+            w.complete().expect("map_float_0")
+        };
+        row0.set_field(29, Datum::Map(map_float_0));
+        let map_double_0 = {
+            let mut w = FlussMapWriter::new(1, &DataTypes::string(), &DataTypes::double());
+            w.write_entry("pi".into(), std::f64::consts::PI.into())
+                .unwrap();
+            w.complete().expect("map_double_0")
+        };
+        row0.set_field(30, Datum::Map(map_double_0));
+        let map_bool_0 = {
+            let mut w = FlussMapWriter::new(2, &DataTypes::string(), &DataTypes::boolean());
+            w.write_entry("t".into(), true.into()).unwrap();
+            w.write_entry("f".into(), false.into()).unwrap();
+            w.complete().expect("map_bool_0")
+        };
+        row0.set_field(31, Datum::Map(map_bool_0));
+        let map_binary_0 = {
+            let mut w = FlussMapWriter::new(1, &DataTypes::string(), &DataTypes::binary(4));
+            w.write_entry("k".into(), fixed_a.as_slice().into())
+                .unwrap();
+            w.complete().expect("map_binary_0")
+        };
+        row0.set_field(32, Datum::Map(map_binary_0));
+        let map_int_key_0 = {
+            let mut w = FlussMapWriter::new(2, &DataTypes::int(), &DataTypes::string());
+            w.write_entry(1.into(), "one".into()).unwrap();
+            w.write_entry(2.into(), "two".into()).unwrap();
+            w.complete().expect("map_int_key_0")
+        };
+        row0.set_field(33, Datum::Map(map_int_key_0));
+
+        // Scalar values
+        let scalar_tinyint = 127_i8;
+        let scalar_smallint = 32_767_i16;
+        let scalar_bigint = 9_223_372_036_854_775_807_i64;
+        let scalar_float = std::f32::consts::PI;
+        let scalar_double = std::f64::consts::E;
+        let scalar_char = "hello";
+        let scalar_string = "world of fluss rust client";
+        let scalar_time_s = Time::new(36_827_000);
+        let scalar_time_ms = Time::new(36_827_123);
+        let scalar_time_us = Time::new(86_399_999);
+        let scalar_time_ns = Time::new(1);
+        let scalar_ts_s = TimestampNtz::new(1_769_163_227_000);
+        let scalar_ts_ms = TimestampNtz::new(1_769_163_227_123);
+        let scalar_ts_us = TimestampNtz::from_millis_nanos(1_769_163_227_123, 456_000).unwrap();
+        let scalar_ts_ns = TimestampNtz::from_millis_nanos(1_769_163_227_123, 999_999).unwrap();
+        let scalar_ts_ltz_s = TimestampLtz::new(1_769_163_227_000);
+        let scalar_ts_ltz_ms = TimestampLtz::new(1_769_163_227_123);
+        let scalar_ts_ltz_us = TimestampLtz::from_millis_nanos(1_769_163_227_123, 456_000).unwrap();
+        let scalar_ts_ltz_ns = TimestampLtz::from_millis_nanos(1_769_163_227_123, 999_999).unwrap();
+        let scalar_bytes_top: Vec<u8> = b"binary data".to_vec();
+        let scalar_binary_top: Vec<u8> = vec![0xDE, 0xAD, 0xBE, 0xEF];
+        let scalar_ts_us_neg = TimestampNtz::from_millis_nanos(-301_234_154_877, 456_000).unwrap();
+        let scalar_ts_ns_neg = TimestampNtz::from_millis_nanos(-301_234_154_877, 999_999).unwrap();
+        let scalar_ts_ltz_us_neg =
+            TimestampLtz::from_millis_nanos(-301_234_154_877, 456_000).unwrap();
+        let scalar_ts_ltz_ns_neg =
+            TimestampLtz::from_millis_nanos(-301_234_154_877, 999_999).unwrap();
+
+        row0.set_field(34, scalar_tinyint);
+        row0.set_field(35, scalar_smallint);
+        row0.set_field(36, scalar_bigint);
+        row0.set_field(37, scalar_float);
+        row0.set_field(38, scalar_double);
+        row0.set_field(39, true);
+        row0.set_field(40, scalar_char);
+        row0.set_field(41, scalar_string);
+        row0.set_field(42, dec.clone());
+        row0.set_field(43, Datum::Date(date_v));
+        row0.set_field(44, scalar_time_s);
+        row0.set_field(45, scalar_time_ms);
+        row0.set_field(46, scalar_time_us);
+        row0.set_field(47, scalar_time_ns);
+        row0.set_field(48, scalar_ts_s);
+        row0.set_field(49, scalar_ts_ms);
+        row0.set_field(50, scalar_ts_us);
+        row0.set_field(51, scalar_ts_ns);
+        row0.set_field(52, scalar_ts_ltz_s);
+        row0.set_field(53, scalar_ts_ltz_ms);
+        row0.set_field(54, scalar_ts_ltz_us);
+        row0.set_field(55, scalar_ts_ltz_ns);
+        row0.set_field(56, scalar_bytes_top.as_slice());
+        row0.set_field(57, scalar_binary_top.as_slice());
+        row0.set_field(58, scalar_ts_us_neg);
+        row0.set_field(59, scalar_ts_ns_neg);
+        row0.set_field(60, scalar_ts_ltz_us_neg);
+        row0.set_field(61, scalar_ts_ltz_ns_neg);
+
+        // Row 1 — ARRAY/MAP basic-shape edge cases (empty, null elements).
+        let mut row1 = GenericRow::new(column_count);
+        row1.set_field(0, 2_i32);
+        row1.set_field(1, make_int_array(&[]));
+        row1.set_field(2, make_string_array(&[None]));
+        let arr_of_arr_1 = {
+            let mut w = FlussArrayWriter::new(3, &inner_array_int);
+            w.write_array(0, &make_int_array(&[Some(5)]));
+            w.set_null_at(1);
+            w.write_array(2, &make_int_array(&[]));
+            w.complete().expect("arr_of_arr_1")
+        };
+        row1.set_field(3, arr_of_arr_1);
+        let arr_of_row_1 = {
+            let mut w = FlussArrayWriter::new(3, &row_seq_label_owned);
+            let mut e0 = GenericRow::new(2);
+            e0.set_field(0, 7_i32);
+            e0.set_field(1, "x");
+            w.write_row(0, &e0).expect("e0");
+            w.set_null_at(1);
+            let mut e2 = GenericRow::new(2);
+            e2.set_field(0, 8_i32);
+            e2.set_field(1, "y");
+            w.write_row(2, &e2).expect("e2");
+            w.complete().expect("arr_of_row_1")
+        };
+        row1.set_field(4, arr_of_row_1);
+        for i in plan.section_range("row_basics") {
+            row1.set_field(i, Datum::Null);
+        }
+        // Empty MAP
+        let empty_map = FlussMapWriter::new(0, &DataTypes::string(), &DataTypes::int())
+            .complete()
+            .expect("empty_map");
+        row1.set_field(8, Datum::Map(empty_map));
+        for i in (plan.idx("map_string_int") + 1)..plan.len() {
+            row1.set_field(i, Datum::Null);
+        }
+
+        // Row 2 — every column NULL.
+        let mut row2 = GenericRow::new(column_count);
+        row2.set_field(0, 3_i32);
+        for i in 1..column_count {
+            row2.set_field(i, Datum::Null);
+        }
+
+        writer.append(&row0).expect("append row0");
+        writer.append(&row1).expect("append row1");
+        writer.append(&row2).expect("append row2");
+        writer.flush().await.expect("flush");
+
+        let records = scan_table(&table, |scan| scan).await;
+        assert_eq!(records.len(), 3);
+        let r0 = records[0].row();
+        let r1 = records[1].row();
+        let r2 = records[2].row();
+
+        assert_eq!(r0.get_int(0).unwrap(), 1);
+        assert_eq!(r1.get_int(0).unwrap(), 2);
+        assert_eq!(r2.get_int(0).unwrap(), 3);
+
+        // === ARRAY: basic shapes ===
+        let arr_int = r0.get_array(1).unwrap();
+        assert_eq!(arr_int.size(), 3);
+        assert_eq!(arr_int.get_int(0).unwrap(), 10);
+        assert_eq!(arr_int.get_int(2).unwrap(), 30);
+        let arr_string = r0.get_array(2).unwrap();
+        assert_eq!(arr_string.size(), 2);
+        assert_eq!(arr_string.get_string(0).unwrap(), "hello");
+        assert_eq!(arr_string.get_string(1).unwrap(), "world");
+        let arr_of_arr = r0.get_array(3).unwrap();
+        assert_eq!(arr_of_arr.size(), 2);
+        let inner = arr_of_arr.get_array(0).unwrap();
+        assert_eq!(inner.size(), 2);
+        assert_eq!(inner.get_int(0).unwrap(), 1);
+        assert_eq!(inner.get_int(1).unwrap(), 2);
+        let inner = arr_of_arr.get_array(1).unwrap();
+        assert_eq!(inner.get_int(0).unwrap(), 3);
+        assert_eq!(inner.get_int(1).unwrap(), 4);
+
+        // === ARRAY: edge cases on row 1 (empty + null elements + null inner) ===
+        assert_eq!(r1.get_array(1).unwrap().size(), 0);
+        let arr_string_r1 = r1.get_array(2).unwrap();
+        assert_eq!(arr_string_r1.size(), 1);
+        assert!(arr_string_r1.is_null_at(0));
+        let arr_of_arr_r1 = r1.get_array(3).unwrap();
+        assert_eq!(arr_of_arr_r1.size(), 3);
+        let aa0 = arr_of_arr_r1.get_array(0).unwrap();
+        assert_eq!(aa0.size(), 1);
+        assert_eq!(aa0.get_int(0).unwrap(), 5);
+        assert!(arr_of_arr_r1.is_null_at(1));
+        assert_eq!(arr_of_arr_r1.get_array(2).unwrap().size(), 0);
+
+        // === ARRAY: null whole column on row 2 ===
+        assert!(r2.is_null_at(1).unwrap());
+        assert!(r2.is_null_at(2).unwrap());
+        assert!(r2.is_null_at(3).unwrap());
+
+        // === ARRAY<ROW>: row 0 + row 1 with null element + row 2 null whole ===
+        let aor0 = r0.get_array(4).unwrap();
+        assert_eq!(aor0.size(), 2);
+        let e0 = aor0.get_row(0, &row_seq_label).unwrap();
+        assert_eq!(e0.get_int(0).unwrap(), 1);
+        assert_eq!(e0.get_string(1).unwrap(), "open");
+        let e1 = aor0.get_row(1, &row_seq_label).unwrap();
+        assert_eq!(e1.get_int(0).unwrap(), 2);
+        assert_eq!(e1.get_string(1).unwrap(), "close");
+        let aor1 = r1.get_array(4).unwrap();
+        assert_eq!(aor1.size(), 3);
+        let e0 = aor1.get_row(0, &row_seq_label).unwrap();
+        assert_eq!(e0.get_int(0).unwrap(), 7);
+        assert!(aor1.is_null_at(1));
+        let e2 = aor1.get_row(2, &row_seq_label).unwrap();
+        assert_eq!(e2.get_int(0).unwrap(), 8);
+        assert!(r2.is_null_at(4).unwrap());
+
+        // === ROW: basic + deep + rich types on row 0; row 2 null ===
+        let rb = r0.get_row(5).unwrap();
+        assert_eq!(rb.get_int(0).unwrap(), 42);
+        assert_eq!(rb.get_string(1).unwrap(), "hello");
+        let rd = r0.get_row(6).unwrap();
+        let rd_inner = rd.get_row(0).unwrap();
+        assert_eq!(rd_inner.get_int(0).unwrap(), 99);
+        let rr = r0.get_row(7).unwrap();
+        assert!(rr.get_boolean(0).unwrap());
+        assert_eq!(rr.get_int(1).unwrap(), 100_000);
+        assert_eq!(rr.get_long(2).unwrap(), 9_876_543_210);
+        assert_f32_special(rr.get_float(3).unwrap(), f32::INFINITY);
+        assert!(rr.get_double(4).unwrap().is_nan());
+        assert_eq!(rr.get_string(5).unwrap(), "hello world");
+        assert_eq!(rr.get_bytes(6).unwrap(), b"binary");
+        assert_eq!(rr.get_decimal(7, 10, 2).unwrap(), dec);
+        assert_eq!(rr.get_date(8).unwrap().get_inner(), 20476);
+        assert_eq!(rr.get_time(9).unwrap().get_inner(), 36_827_123);
+        assert_eq!(
+            rr.get_timestamp_ntz(10, 6).unwrap().get_millisecond(),
+            1_769_163_227_123
+        );
+        assert_eq!(
+            rr.get_timestamp_ltz(11, 6).unwrap().get_epoch_millisecond(),
+            1_769_163_227_456
+        );
+        assert_eq!(rr.get_binary(12, 4).unwrap(), b"\x01\x02\x03\x04");
+        let f_arr = rr.get_array(13).unwrap();
+        assert_eq!(f_arr.size(), 3);
+        assert_eq!(f_arr.get_int(0).unwrap(), 7);
+        assert!(f_arr.is_null_at(1));
+        assert!(r2.is_null_at(5).unwrap());
+        assert!(r2.is_null_at(6).unwrap());
+        assert!(r2.is_null_at(7).unwrap());
+
+        // === MAP: basic (with null value) + empty (row 1) + null (row 2) ===
+        let m = r0.get_map(8).unwrap();
+        assert_eq!(m.size(), 3);
+        assert_eq!(m.get(&Datum::from("a")).unwrap(), Some(Datum::from(1_i32)));
+        assert_eq!(m.get(&Datum::from("b")).unwrap(), Some(Datum::Null));
+        assert_eq!(m.get(&Datum::from("c")).unwrap(), Some(Datum::from(3_i32)));
+        assert_eq!(r1.get_map(8).unwrap().size(), 0);
+        assert!(r2.is_null_at(8).unwrap());
+
+        // === MAP<K, ROW> ===
+        let m = r0.get_map(9).unwrap();
+        assert_eq!(m.size(), 2);
+        let keys = m.key_array();
+        let values = m.value_array();
+        assert_eq!(keys.get_string(0).unwrap(), "e0");
+        let v0 = values.get_row(0, &row_seq_label).unwrap();
+        assert_eq!(v0.get_int(0).unwrap(), 1);
+        assert_eq!(v0.get_string(1).unwrap(), "open");
+        assert_eq!(keys.get_string(1).unwrap(), "e1");
+        let v1 = values.get_row(1, &row_seq_label).unwrap();
+        assert_eq!(v1.get_int(0).unwrap(), 2);
+        assert_eq!(v1.get_string(1).unwrap(), "close");
+
+        // === MAP<K, MAP> ===
+        let m = r0.get_map(10).unwrap();
+        assert_eq!(m.size(), 2);
+        let g1 = m
+            .value_array()
+            .get_map(0, &DataTypes::string(), &DataTypes::int())
+            .unwrap();
+        assert_eq!(g1.size(), 2);
+        assert_eq!(g1.get(&Datum::from("a")).unwrap(), Some(Datum::from(1_i32)));
+        let g2 = m
+            .value_array()
+            .get_map(1, &DataTypes::string(), &DataTypes::int())
+            .unwrap();
+        assert_eq!(g2.size(), 1);
+        assert_eq!(g2.get(&Datum::from("c")).unwrap(), Some(Datum::from(3_i32)));
+
+        // === MAP<K, ARRAY> + ARRAY<MAP> ===
+        let m = r0.get_map(11).unwrap();
+        assert_eq!(m.size(), 2);
+        let primes = m.value_array().get_array(0).unwrap();
+        assert_eq!(primes.size(), 3);
+        assert_eq!(primes.get_int(2).unwrap(), 5);
+        let am = r0.get_array(12).unwrap();
+        assert_eq!(am.size(), 2);
+        let am0 = am
+            .get_map(0, &DataTypes::string(), &DataTypes::int())
+            .unwrap();
+        assert_eq!(am0.size(), 2);
+        let am1 = am
+            .get_map(1, &DataTypes::string(), &DataTypes::int())
+            .unwrap();
+        assert_eq!(am1.size(), 1);
+        assert_eq!(
+            am1.get(&Datum::from("z")).unwrap(),
+            Some(Datum::from(9_i32))
+        );
+
+        // === ARRAY rich types ===
+        let ab = r0.get_array(13).unwrap();
+        assert_eq!(ab.size(), 2);
+        assert_eq!(ab.get_bytes(0).unwrap(), bytes_v.as_slice());
+        assert!(ab.is_null_at(1));
+        let ad = r0.get_array(14).unwrap();
+        assert_eq!(ad.get_date(0).unwrap().get_inner(), date_v.get_inner());
+        assert!(ad.is_null_at(1));
+        let at = r0.get_array(15).unwrap();
+        assert_eq!(at.get_time(0).unwrap().get_inner(), time_v.get_inner());
+        assert!(at.is_null_at(1));
+        let ats = r0.get_array(16).unwrap();
+        let read_ts = ats.get_timestamp_ntz(0, 6).unwrap();
+        assert_eq!(read_ts.get_millisecond(), ts_v.get_millisecond());
+        assert_eq!(
+            read_ts.get_nano_of_millisecond(),
+            ts_v.get_nano_of_millisecond()
+        );
+        assert!(ats.is_null_at(1));
+        let atl = r0.get_array(17).unwrap();
+        assert_eq!(
+            atl.get_timestamp_ltz(0, 3).unwrap().get_epoch_millisecond(),
+            ts_ltz_v.get_epoch_millisecond()
+        );
+        assert!(atl.is_null_at(1));
+        let adc = r0.get_array(18).unwrap();
+        assert_eq!(adc.get_decimal(0, 10, 2).unwrap(), dec);
+        assert!(adc.is_null_at(1));
+        let adb = r0.get_array(19).unwrap();
+        assert_eq!(adb.get_decimal(0, 22, 5).unwrap(), dec_big);
+        let af = r0.get_array(20).unwrap();
+        assert_eq!(af.size(), 3);
+        assert_f32_special(af.get_float(0).unwrap(), f32::NAN);
+        assert_f32_special(af.get_float(1).unwrap(), f32::INFINITY);
+        assert_f32_special(af.get_float(2).unwrap(), f32::NEG_INFINITY);
+        let adbl = r0.get_array(21).unwrap();
+        assert_f64_special(adbl.get_double(0).unwrap(), f64::NAN);
+        assert_f64_special(adbl.get_double(1).unwrap(), f64::INFINITY);
+        assert_f64_special(adbl.get_double(2).unwrap(), f64::NEG_INFINITY);
+        let fb: FlussArray = r0.get_array(22).unwrap();
+        assert_eq!(fb.get_binary(0).unwrap(), fixed_a.as_slice());
+        assert_eq!(fb.get_binary(1).unwrap(), fixed_b.as_slice());
+
+        // === MAP rich types ===
+        let m = r0.get_map(23).unwrap();
+        assert_eq!(m.value_array().get_bytes(0).unwrap(), bytes_v.as_slice());
+        let m = r0.get_map(24).unwrap();
+        assert_eq!(m.value_array().get_decimal(0, 10, 2).unwrap(), dec);
+        let m = r0.get_map(25).unwrap();
+        assert_eq!(
+            m.value_array().get_date(0).unwrap().get_inner(),
+            date_v.get_inner()
+        );
+        let m = r0.get_map(26).unwrap();
+        assert_eq!(
+            m.value_array().get_time(0).unwrap().get_inner(),
+            time_v.get_inner()
+        );
+        let m = r0.get_map(27).unwrap();
+        let read_ts = m.value_array().get_timestamp_ntz(0, 6).unwrap();
+        assert_eq!(read_ts.get_millisecond(), ts_v.get_millisecond());
+        let m = r0.get_map(28).unwrap();
+        let read_ltz = m.value_array().get_timestamp_ltz(0, 3).unwrap();
+        assert_eq!(
+            read_ltz.get_epoch_millisecond(),
+            ts_ltz_v.get_epoch_millisecond()
+        );
+        let m = r0.get_map(29).unwrap();
+        assert!(m.value_array().get_float(0).unwrap().is_nan());
+        assert!(m.value_array().get_float(1).unwrap().is_infinite());
+        let m = r0.get_map(30).unwrap();
+        assert!(
+            (m.value_array().get_double(0).unwrap() - std::f64::consts::PI).abs() < f64::EPSILON
+        );
+        let m = r0.get_map(31).unwrap();
+        assert!(m.value_array().get_boolean(0).unwrap());
+        assert!(!m.value_array().get_boolean(1).unwrap());
+        let m = r0.get_map(32).unwrap();
+        assert_eq!(m.value_array().get_binary(0).unwrap(), fixed_a.as_slice());
+        let m = r0.get_map(33).unwrap();
+        assert_eq!(m.size(), 2);
+        assert_eq!(m.key_array().get_int(0).unwrap(), 1);
+        assert_eq!(m.value_array().get_string(0).unwrap(), "one");
+
+        // === Convenience API: entries / get / key_type / value_type ===
+        // (exercised on row 0's map_string_int at index 8)
+        let m = r0.get_map(8).unwrap();
+        assert_eq!(m.key_type(), &DataTypes::string().as_non_nullable());
+        assert_eq!(m.value_type(), &DataTypes::int());
+        let mut got: HashMap<String, Option<i32>> = HashMap::with_capacity(m.size());
+        for entry in m.entries() {
+            let (k, v) = entry.expect("decode entry");
+            let key = match k {
+                Datum::String(s) => s.into_owned(),
+                other => panic!("unexpected key variant: {other:?}"),
+            };
+            let value = match v {
+                Datum::Int32(i) => Some(i),
+                Datum::Null => None,
+                other => panic!("unexpected value variant: {other:?}"),
+            };
+            got.insert(key, value);
+        }
+        let expected: HashMap<String, Option<i32>> = HashMap::from([
+            ("a".to_string(), Some(1)),
+            ("b".to_string(), None),
+            ("c".to_string(), Some(3)),
+        ]);
+        assert_eq!(got, expected);
+        assert_eq!(m.get(&Datum::from("a")).unwrap(), Some(Datum::from(1_i32)));
+        assert!(m.get(&Datum::from("missing")).unwrap().is_none());
+
+        // === Bulk write via FlussMapWriter::extend (covered with a fresh map) ===
+        let src: HashMap<&str, i32> = HashMap::from([("a", 1), ("b", 2), ("c", 3)]);
+        let extend_built = {
+            let mut w = FlussMapWriter::new(src.len(), &DataTypes::string(), &DataTypes::int());
+            w.extend(src.clone()).expect("extend");
+            w.complete().expect("extend-complete")
+        };
+        assert_eq!(extend_built.size(), src.len());
+        let extend_b = extend_built.get(&Datum::from("b")).unwrap();
+        assert_eq!(extend_b, Some(Datum::from(2_i32)));
+
+        // === Scalars: integer family ===
+        assert_eq!(r0.get_byte(34).unwrap(), scalar_tinyint);
+        assert_eq!(r0.get_short(35).unwrap(), scalar_smallint);
+        assert_eq!(r0.get_long(36).unwrap(), scalar_bigint);
+
+        // === Scalars: floating point ===
+        assert!((r0.get_float(37).unwrap() - scalar_float).abs() < f32::EPSILON);
+        assert!((r0.get_double(38).unwrap() - scalar_double).abs() < f64::EPSILON);
+
+        // === Scalars: boolean / char / string ===
+        assert!(r0.get_boolean(39).unwrap());
+        assert_eq!(r0.get_char(40, 10).unwrap(), scalar_char);
+        assert_eq!(r0.get_string(41).unwrap(), scalar_string);
+
+        // === Scalars: decimal / date ===
+        assert_eq!(r0.get_decimal(42, 10, 2).unwrap(), dec);
+        assert_eq!(r0.get_date(43).unwrap().get_inner(), date_v.get_inner());
+
+        // === Scalars: time across all four precisions ===
+        assert_eq!(
+            r0.get_time(44).unwrap().get_inner(),
+            scalar_time_s.get_inner()
+        );
+        assert_eq!(
+            r0.get_time(45).unwrap().get_inner(),
+            scalar_time_ms.get_inner()
+        );
+        assert_eq!(
+            r0.get_time(46).unwrap().get_inner(),
+            scalar_time_us.get_inner()
+        );
+        assert_eq!(
+            r0.get_time(47).unwrap().get_inner(),
+            scalar_time_ns.get_inner()
+        );
+
+        // === Scalars: timestamp across all four precisions ===
+        assert_eq!(
+            r0.get_timestamp_ntz(48, 0).unwrap().get_millisecond(),
+            scalar_ts_s.get_millisecond()
+        );
+        assert_eq!(
+            r0.get_timestamp_ntz(49, 3).unwrap().get_millisecond(),
+            scalar_ts_ms.get_millisecond()
+        );
+        let read_us = r0.get_timestamp_ntz(50, 6).unwrap();
+        assert_eq!(read_us.get_millisecond(), scalar_ts_us.get_millisecond());
+        assert_eq!(
+            read_us.get_nano_of_millisecond(),
+            scalar_ts_us.get_nano_of_millisecond()
+        );
+        let read_ns = r0.get_timestamp_ntz(51, 9).unwrap();
+        assert_eq!(read_ns.get_millisecond(), scalar_ts_ns.get_millisecond());
+        assert_eq!(
+            read_ns.get_nano_of_millisecond(),
+            scalar_ts_ns.get_nano_of_millisecond()
+        );
+
+        // === Scalars: timestamp_ltz across all four precisions ===
+        assert_eq!(
+            r0.get_timestamp_ltz(52, 0).unwrap().get_epoch_millisecond(),
+            scalar_ts_ltz_s.get_epoch_millisecond()
+        );
+        assert_eq!(
+            r0.get_timestamp_ltz(53, 3).unwrap().get_epoch_millisecond(),
+            scalar_ts_ltz_ms.get_epoch_millisecond()
+        );
+        let read_ltz_us = r0.get_timestamp_ltz(54, 6).unwrap();
+        assert_eq!(
+            read_ltz_us.get_epoch_millisecond(),
+            scalar_ts_ltz_us.get_epoch_millisecond()
+        );
+        assert_eq!(
+            read_ltz_us.get_nano_of_millisecond(),
+            scalar_ts_ltz_us.get_nano_of_millisecond()
+        );
+        let read_ltz_ns = r0.get_timestamp_ltz(55, 9).unwrap();
+        assert_eq!(
+            read_ltz_ns.get_epoch_millisecond(),
+            scalar_ts_ltz_ns.get_epoch_millisecond()
+        );
+        assert_eq!(
+            read_ltz_ns.get_nano_of_millisecond(),
+            scalar_ts_ltz_ns.get_nano_of_millisecond()
+        );
+
+        // === Scalars: bytes + fixed binary ===
+        assert_eq!(r0.get_bytes(56).unwrap(), scalar_bytes_top.as_slice());
+        assert_eq!(r0.get_binary(57, 4).unwrap(), scalar_binary_top.as_slice());
+
+        // === Scalars: negative-epoch timestamps (pre-1970) ===
+        let read_neg_us = r0.get_timestamp_ntz(58, 6).unwrap();
+        assert_eq!(
+            read_neg_us.get_millisecond(),
+            scalar_ts_us_neg.get_millisecond()
+        );
+        assert_eq!(
+            read_neg_us.get_nano_of_millisecond(),
+            scalar_ts_us_neg.get_nano_of_millisecond()
+        );
+        let read_neg_ns = r0.get_timestamp_ntz(59, 9).unwrap();
+        assert_eq!(
+            read_neg_ns.get_millisecond(),
+            scalar_ts_ns_neg.get_millisecond()
+        );
+        assert_eq!(
+            read_neg_ns.get_nano_of_millisecond(),
+            scalar_ts_ns_neg.get_nano_of_millisecond()
+        );
+        let read_neg_ltz_us = r0.get_timestamp_ltz(60, 6).unwrap();
+        assert_eq!(
+            read_neg_ltz_us.get_epoch_millisecond(),
+            scalar_ts_ltz_us_neg.get_epoch_millisecond()
+        );
+        let read_neg_ltz_ns = r0.get_timestamp_ltz(61, 9).unwrap();
+        assert_eq!(
+            read_neg_ltz_ns.get_epoch_millisecond(),
+            scalar_ts_ltz_ns_neg.get_epoch_millisecond()
+        );
+
+        // === Scalars: every column NULL on row 2 ===
+        for i in plan.section_range("scalars") {
+            assert!(
+                r2.is_null_at(i).unwrap(),
+                "scalar column {i} should be null"
+            );
+        }
+
+        // === Append-side validation: malformed rows are rejected client-side ===
+        // Field count mismatch — far fewer fields than the schema demands.
+        let mut undersized = GenericRow::new(2);
+        undersized.set_field(0, true);
+        let err = writer.append(&undersized).unwrap_err().to_string();
+        assert!(
+            err.contains(&format!("Expected: {column_count}")) && err.contains("Actual: 2"),
+            "expected field-count error, got: {err}"
+        );
+
+        // Type mismatch — correct field count but every cell is Bool, which
+        // satisfies none of the column types except col_boolean.
+        let wrong_types = GenericRow::from_data(
+            (0..column_count)
+                .map(|_| Datum::Bool(true))
+                .collect::<Vec<_>>(),
+        );
+        assert!(
+            writer.append(&wrong_types).is_err(),
+            "row with wrong types should be rejected, not panic"
+        );
+
+        admin.drop_table(&table_path, false).await.expect("drop");
+    }
+}
diff --git a/fluss-rust/crates/fluss/tests/integration/record_batch_log_reader.rs b/fluss-rust/crates/fluss/tests/integration/record_batch_log_reader.rs
new file mode 100644
index 0000000000..6c8d5392c1
--- /dev/null
+++ b/fluss-rust/crates/fluss/tests/integration/record_batch_log_reader.rs
@@ -0,0 +1,524 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#[cfg(test)]
+mod reader_test {
+    use crate::integration::utils::{
+        create_partitions, create_table, extract_ids_from_batches, get_shared_cluster,
+        wait_for_partitions_ready,
+    };
+    use arrow::array::record_batch;
+    use fluss::client::{EARLIEST_OFFSET, FlussConnection, RecordBatchLogReader};
+    use fluss::config::{Config, NoKeyAssigner};
+    use fluss::metadata::{DataTypes, Schema, TableBucket, TableDescriptor, TablePath};
+    use fluss::rpc::message::OffsetSpec;
+    use std::collections::HashMap;
+    use std::time::Duration;
+
+    #[tokio::test] // Bounded read: subscribe bucket 0 at offset 1, stop at offset 4, expect ids [2, 3, 4].
+    async fn until_offsets_stops_at_explicit_offset() {
+        let cluster = get_shared_cluster();
+        let connection = cluster.get_fluss_connection().await;
+        let admin = connection.get_admin().expect("Failed to get admin");
+
+        let table_path = TablePath::new("fluss", "test_reader_until_offsets");
+        let table_descriptor = TableDescriptor::builder() // schema: id INT, name STRING; no bucketing or partitioning configured here
+            .schema(
+                Schema::builder()
+                    .column("id", DataTypes::int())
+                    .column("name", DataTypes::string())
+                    .build()
+                    .expect("Failed to build schema"),
+            )
+            .build()
+            .expect("Failed to build table");
+        create_table(&admin, &table_path, &table_descriptor).await;
+
+        let table = connection
+            .get_table(&table_path)
+            .await
+            .expect("Failed to get table");
+        let writer = table
+            .new_append()
+            .expect("Failed to create append")
+            .create_writer()
+            .expect("Failed to create writer");
+        writer
+            .append_arrow_batch(
+                record_batch!(
+                    ("id", Int32, [1, 2, 3, 4, 5, 6]), // six rows appended as a single Arrow batch
+                    ("name", Utf8, ["a", "b", "c", "d", "e", "f"])
+                )
+                .unwrap(),
+            )
+            .expect("Failed to append batch");
+        writer.flush().await.expect("Failed to flush"); // NOTE(review): presumably makes the appended rows readable before scanning - confirm flush semantics
+
+        let scanner = table
+            .new_scan()
+            .create_record_batch_log_scanner()
+            .expect("Failed to create record batch scanner");
+        scanner
+            .subscribe(0, 1) // bucket 0, start offset 1 (see the expect message below)
+            .await
+            .expect("Failed to subscribe from offset 1");
+
+        let table_id = table.get_table_info().table_id;
+        let mut reader = RecordBatchLogReader::new_until_offsets(
+            scanner,
+            HashMap::from([(TableBucket::new(table_id, 0), 4)]), // stopping offset 4 for bucket 0; exclusive per the assertion message below
+        )
+        .expect("Failed to create record batch reader");
+
+        let batches = tokio::time::timeout(Duration::from_secs(10), reader.collect_all_batches()) // 10 s guard: a reader that never terminates fails the test instead of hanging
+            .await
+            .expect("Timed out collecting bounded reader batches") // outer Result: the timeout elapsed
+            .expect("Failed to collect bounded reader batches"); // inner Result: the reader itself failed
+
+        assert_eq!(
+            extract_ids_from_batches(&batches),
+            vec![2, 3, 4], // assumes the six rows occupy offsets 0..=5 in append order, so [1, 4) maps to ids 2..=4 - TODO confirm
+            "reader should include offsets [1, 4) and stop before offset 4"
+        );
+
+        admin // cleanup is only reached when every assertion above passed
+            .drop_table(&table_path, false) // NOTE(review): meaning of the `false` flag is not visible here - confirm against the admin API
+            .await
+            .expect("Failed to drop table");
+    }
+
+    #[tokio::test] // Empty range: start offset and stopping offset are both 1, so the reader must yield no batches.
+    async fn until_offsets_with_empty_range() {
+        let cluster = get_shared_cluster();
+        let connection = cluster.get_fluss_connection().await;
+        let admin = connection.get_admin().expect("Failed to get admin");
+
+        let table_path = TablePath::new("fluss", "test_reader_until_offsets_empty_range");
+        let table_descriptor = TableDescriptor::builder() // schema: id INT, name STRING; no bucketing or partitioning configured here
+            .schema(
+                Schema::builder()
+                    .column("id", DataTypes::int())
+                    .column("name", DataTypes::string())
+                    .build()
+                    .expect("Failed to build schema"),
+            )
+            .build()
+            .expect("Failed to build table");
+        create_table(&admin, &table_path, &table_descriptor).await;
+
+        let table = connection
+            .get_table(&table_path)
+            .await
+            .expect("Failed to get table");
+        let writer = table
+            .new_append()
+            .expect("Failed to create append")
+            .create_writer()
+            .expect("Failed to create writer");
+        writer
+            .append_arrow_batch(
+                record_batch!(("id", Int32, [1, 2, 3]), ("name", Utf8, ["a", "b", "c"])).unwrap(), // three rows, so data exists past the requested stop offset
+            )
+            .expect("Failed to append batch");
+        writer.flush().await.expect("Failed to flush"); // NOTE(review): presumably makes the appended rows readable before scanning - confirm flush semantics
+
+        let scanner = table
+            .new_scan()
+            .create_record_batch_log_scanner()
+            .expect("Failed to create record batch scanner");
+        scanner
+            .subscribe(0, 1) // bucket 0, start offset 1 (see the expect message below)
+            .await
+            .expect("Failed to subscribe from offset 1");
+
+        let table_id = table.get_table_info().table_id;
+        let mut reader = RecordBatchLogReader::new_until_offsets(
+            scanner,
+            HashMap::from([(TableBucket::new(table_id, 0), 1)]), // stopping offset equals the subscribed start offset
+        )
+        .expect("Failed to create record batch reader");
+
+        let batches = tokio::time::timeout(Duration::from_secs(10), reader.collect_all_batches()) // 10 s guard: the reader must finish promptly rather than wait for data
+            .await
+            .expect("Timed out collecting empty-range reader batches") // outer Result: the timeout elapsed
+            .expect("Failed to collect empty-range reader batches"); // inner Result: the reader itself failed
+
+        assert!(
+            batches.is_empty(), // nothing may be returned even though rows exist at and after offset 1
+            "reader should return no batches when start and stop offsets are equal"
+        );
+
+        admin // cleanup is only reached when the assertion above passed
+            .drop_table(&table_path, false) // NOTE(review): meaning of the `false` flag is not visible here - confirm against the admin API
+            .await
+            .expect("Failed to drop table");
+    }
+
+    #[tokio::test] // Stop offset (6) lies beyond the 3 rows initially written: the reader must wait, then finish once 3 more rows arrive.
+    async fn until_offsets_past_end_of_log() {
+        let cluster = get_shared_cluster();
+        let connection = cluster.get_fluss_connection().await;
+        let admin = connection.get_admin().expect("Failed to get admin");
+
+        let table_path = TablePath::new("fluss", "test_reader_until_offsets_past_end");
+        let table_descriptor = TableDescriptor::builder() // schema: id INT, name STRING; no bucketing or partitioning configured here
+            .schema(
+                Schema::builder()
+                    .column("id", DataTypes::int())
+                    .column("name", DataTypes::string())
+                    .build()
+                    .expect("Failed to build schema"),
+            )
+            .build()
+            .expect("Failed to build table");
+        create_table(&admin, &table_path, &table_descriptor).await;
+
+        let table = connection
+            .get_table(&table_path)
+            .await
+            .expect("Failed to get table");
+        let writer = table
+            .new_append()
+            .expect("Failed to create append")
+            .create_writer()
+            .expect("Failed to create writer");
+        writer
+            .append_arrow_batch(
+                record_batch!(("id", Int32, [1, 2, 3]), ("name", Utf8, ["a", "b", "c"])).unwrap(), // first half of the data: ids 1..=3
+            )
+            .expect("Failed to append initial batch");
+        writer.flush().await.expect("Failed to flush initial batch");
+
+        let scanner = table
+            .new_scan()
+            .create_record_batch_log_scanner()
+            .expect("Failed to create record batch scanner");
+        scanner
+            .subscribe(0, EARLIEST_OFFSET) // bucket 0, starting from the EARLIEST_OFFSET sentinel
+            .await
+            .expect("Failed to subscribe bucket");
+
+        let table_id = table.get_table_info().table_id;
+        let mut reader = RecordBatchLogReader::new_until_offsets(
+            scanner,
+            HashMap::from([(TableBucket::new(table_id, 0), 6)]), // stop offset 6, while only 3 rows have been written so far
+        )
+        .expect("Failed to create record batch reader");
+
+        let collect_task = tokio::spawn(async move { reader.collect_all_batches().await }); // run collection on its own task so this test can keep writing meanwhile
+        tokio::time::sleep(Duration::from_millis(750)).await; // NOTE(review): timing-based grace period for the reader to drain the existing rows
+        assert!(
+            !collect_task.is_finished(), // the task must still be pending: the stop offset has not been reached yet
+            "reader should wait when the stopping offset is beyond the current log end"
+        );
+
+        writer
+            .append_arrow_batch(
+                record_batch!(("id", Int32, [4, 5, 6]), ("name", Utf8, ["d", "e", "f"])).unwrap(), // second half: ids 4..=6, enough rows to reach the stop offset
+            )
+            .expect("Failed to append follow-up batch");
+        writer
+            .flush()
+            .await
+            .expect("Failed to flush follow-up batch");
+
+        let batches = tokio::time::timeout(Duration::from_secs(10), collect_task) // 10 s guard while awaiting the spawned task's JoinHandle
+            .await
+            .expect("Timed out collecting reader batches after appending past stop offset") // outer Result: the timeout elapsed
+            .expect("Reader task panicked") // middle Result: join error from the spawned task
+            .expect("Failed to collect reader batches"); // inner Result: the reader itself failed
+
+        assert_eq!(
+            extract_ids_from_batches(&batches),
+            vec![1, 2, 3, 4, 5, 6], // rows from both appends, in append order
+            "reader should resume after future records arrive and stop at the requested offset"
+        );
+
+        admin // cleanup is only reached when every assertion above passed
+            .drop_table(&table_path, false) // NOTE(review): meaning of the `false` flag is not visible here - confirm against the admin API
+            .await
+            .expect("Failed to drop table");
+    }
+
+    #[tokio::test] // Two-bucket table: stop each bucket at its latest offset and expect all eight ids back (order-insensitive).
+    async fn until_offsets_multi_bucket() {
+        let cluster = get_shared_cluster();
+        let connection = FlussConnection::new(Config { // dedicated connection (not cluster.get_fluss_connection()) so writer settings can be overridden
+            writer_acks: "all".to_string(),
+            bootstrap_servers: cluster.plaintext_bootstrap_servers().to_string(),
+            writer_bucket_no_key_assigner: NoKeyAssigner::RoundRobin, // round-robin bucket assignment, per the expect message below
+            ..Default::default()
+        })
+        .await
+        .expect("Failed to connect with round-robin bucket assignment");
+        let admin = connection.get_admin().expect("Failed to get admin");
+
+        let table_path = TablePath::new("fluss", "test_reader_until_offsets_multi_bucket");
+        let table_descriptor = TableDescriptor::builder()
+            .schema(
+                Schema::builder()
+                    .column("id", DataTypes::int())
+                    .column("name", DataTypes::string())
+                    .build()
+                    .expect("Failed to build schema"),
+            )
+            .distributed_by(Some(2), vec!["id".to_string()]) // NOTE(review): presumably 2 buckets keyed on "id"; a bucket key is set while the connection configures the no-key assigner - confirm which path the writer takes
+            .build()
+            .expect("Failed to build table");
+        create_table(&admin, &table_path, &table_descriptor).await;
+
+        let table = connection
+            .get_table(&table_path)
+            .await
+            .expect("Failed to get table");
+        let writer = table
+            .new_append()
+            .expect("Failed to create append")
+            .create_writer()
+            .expect("Failed to create writer");
+        writer
+            .append_arrow_batch(
+                record_batch!(
+                    ("id", Int32, [1, 2, 3, 4]), // first of two batches; the assertion further down requires both buckets to end up non-empty
+                    ("name", Utf8, ["a", "b", "c", "d"])
+                )
+                .unwrap(),
+            )
+            .expect("Failed to append first batch");
+        writer
+            .append_arrow_batch(
+                record_batch!(
+                    ("id", Int32, [5, 6, 7, 8]), // second batch: ids 5..=8
+                    ("name", Utf8, ["e", "f", "g", "h"])
+                )
+                .unwrap(),
+            )
+            .expect("Failed to append second batch");
+        writer.flush().await.expect("Failed to flush");
+
+        let latest_offsets = admin
+            .list_offsets(&table_path, &[0, 1], OffsetSpec::Latest) // latest offset of buckets 0 and 1, captured after the flush
+            .await
+            .expect("Failed to list latest offsets");
+        assert!(
+            latest_offsets.values().all(|offset| *offset > 0), // precondition: every listed bucket reports a latest offset above 0
+            "test records should be distributed across both buckets: {latest_offsets:?}"
+        );
+
+        let scanner = table
+            .new_scan()
+            .create_record_batch_log_scanner()
+            .expect("Failed to create record batch scanner");
+        scanner
+            .subscribe_buckets(&HashMap::from([(0, 0), (1, 0)])) // presumably bucket id -> start offset: buckets 0 and 1, both from offset 0 - confirm
+            .await
+            .expect("Failed to subscribe to multiple buckets");
+
+        let table_id = table.get_table_info().table_id;
+        let stopping_offsets: HashMap<TableBucket, i64> = latest_offsets // re-key the listed latest offsets by TableBucket for the reader
+            .into_iter()
+            .map(|(bucket, offset)| (TableBucket::new(table_id, bucket), offset))
+            .collect();
+        assert_eq!(
+            stopping_offsets.len(),
+            2, // one stopping offset per bucket
+            "reader should track two stopping offsets"
+        );
+
+        let mut reader = RecordBatchLogReader::new_until_offsets(scanner, stopping_offsets)
+            .expect("Failed to create record batch reader");
+        let batches = tokio::time::timeout(Duration::from_secs(10), reader.collect_all_batches()) // 10 s guard: a reader that never terminates fails the test instead of hanging
+            .await
+            .expect("Timed out collecting multi-bucket reader batches") // outer Result: the timeout elapsed
+            .expect("Failed to collect multi-bucket reader batches"); // inner Result: the reader itself failed
+
+        let mut ids = extract_ids_from_batches(&batches);
+        ids.sort(); // sort before comparing: this test does not assert any ordering across buckets
+        assert_eq!(ids, vec![1, 2, 3, 4, 5, 6, 7, 8]);
+
+        admin // cleanup is only reached when every assertion above passed
+            .drop_table(&table_path, false) // NOTE(review): meaning of the `false` flag is not visible here - confirm against the admin API
+            .await
+            .expect("Failed to drop table");
+    }
+
+    #[tokio::test] // `new_until_latest` on a non-partitioned table: expects exactly the four rows written before the reader was created.
+    async fn until_latest_reads_non_partitioned_table() {
+        let cluster = get_shared_cluster();
+        let connection = cluster.get_fluss_connection().await;
+        let admin = connection.get_admin().expect("Failed to get admin");
+
+        let table_path = TablePath::new("fluss", "test_reader_non_partitioned_latest");
+        let table_descriptor = TableDescriptor::builder() // schema: id INT, name STRING; no bucketing or partitioning configured here
+            .schema(
+                Schema::builder()
+                    .column("id", DataTypes::int())
+                    .column("name", DataTypes::string())
+                    .build()
+                    .expect("Failed to build schema"),
+            )
+            .build()
+            .expect("Failed to build table");
+
+        create_table(&admin, &table_path, &table_descriptor).await;
+
+        let table = connection
+            .get_table(&table_path)
+            .await
+            .expect("Failed to get table");
+        let writer = table
+            .new_append()
+            .expect("Failed to create append")
+            .create_writer()
+            .expect("Failed to create writer");
+        writer
+            .append_arrow_batch(
+                record_batch!(
+                    ("id", Int32, [1, 2, 3, 4]), // four rows appended as a single Arrow batch
+                    ("name", Utf8, ["a", "b", "c", "d"])
+                )
+                .unwrap(),
+            )
+            .expect("Failed to append batch");
+        writer.flush().await.expect("Failed to flush"); // NOTE(review): presumably makes the appended rows readable before scanning - confirm flush semantics
+
+        let scanner = table
+            .new_scan()
+            .create_record_batch_log_scanner()
+            .expect("Failed to create record batch scanner");
+        scanner
+            .subscribe(0, EARLIEST_OFFSET) // bucket 0, starting from the EARLIEST_OFFSET sentinel
+            .await
+            .expect("Failed to subscribe bucket");
+
+        let mut reader = RecordBatchLogReader::new_until_latest(scanner, &admin) // NOTE(review): presumably resolves the stopping offsets via `admin` at construction time - confirm
+            .await
+            .expect("Failed to create latest-offset reader");
+        let batches = tokio::time::timeout(Duration::from_secs(10), reader.collect_all_batches()) // 10 s guard: a reader that never terminates fails the test instead of hanging
+            .await
+            .expect("Timed out collecting non-partitioned reader batches") // outer Result: the timeout elapsed
+            .expect("Failed to collect non-partitioned reader batches"); // inner Result: the reader itself failed
+
+        assert_eq!(
+            extract_ids_from_batches(&batches),
+            vec![1, 2, 3, 4], // all rows written above, in append order
+            "latest-offset reader should read all records present in the non-partitioned table"
+        );
+
+        admin // cleanup is only reached when the assertion above passed
+            .drop_table(&table_path, false) // NOTE(review): meaning of the `false` flag is not visible here - confirm against the admin API
+            .await
+            .expect("Failed to drop table");
+    }
+
+    #[tokio::test]
+    async fn until_latest_reads_partitioned_table() {
+        let cluster = get_shared_cluster();
+        let conn = cluster.get_fluss_connection().await;
+        let admin = conn.get_admin().expect("Failed to get admin");
+        let path = TablePath::new("fluss", "test_reader_partitioned_latest");
+
+        let schema = Schema::builder()
+            .column("id", DataTypes::int())
+            .column("region", DataTypes::string())
+            .column("value", DataTypes::bigint())
+            .build()
+            .expect("Failed to build schema");
+        let descriptor = TableDescriptor::builder()
+            .schema(schema)
+            .partitioned_by(vec!["region"])
+            .build()
+            .expect("Failed to build table");
+
+        let regions = ["US", "EU"];
+        create_table(&admin, &path, &descriptor).await;
+        create_partitions(&admin, &path, "region", &regions).await;
+        wait_for_partitions_ready(&admin, &path, &regions).await;
+
+        let table = conn
+            .get_table(&path)
+            .await
+            .expect("Failed to get table");
+        let appender = table
+            .new_append()
+            .expect("Failed to create append")
+            .create_writer()
+            .expect("Failed to create writer");
+
+        let us_rows = record_batch!(
+            ("id", Int32, [1, 2]),
+            ("region", Utf8, ["US", "US"]),
+            ("value", Int64, [100, 200])
+        )
+        .unwrap();
+        let eu_rows = record_batch!(
+            ("id", Int32, [3, 4]),
+            ("region", Utf8, ["EU", "EU"]),
+            ("value", Int64, [300, 400])
+        )
+        .unwrap();
+        appender
+            .append_arrow_batch(us_rows)
+            .expect("Failed to append US batch");
+        appender
+            .append_arrow_batch(eu_rows)
+            .expect("Failed to append EU batch");
+        appender.flush().await.expect("Failed to flush");
+
+        let scanner = table
+            .new_scan()
+            .create_record_batch_log_scanner()
+            .expect("Failed to create record batch scanner");
+        let partitions = admin
+            .list_partition_infos(&path)
+            .await
+            .expect("Failed to list partition infos");
+        for info in partitions {
+            // Default layout has a single bucket, so bucket 0 covers each partition.
+            // A multi-bucket table would need every bucket subscribed here.
+            scanner
+                .subscribe_partition(info.get_partition_id(), 0, EARLIEST_OFFSET)
+                .await
+                .expect("Failed to subscribe partition bucket");
+        }
+
+        let mut reader = RecordBatchLogReader::new_until_latest(scanner, &admin)
+            .await
+            .expect("Failed to create latest-offset reader");
+        let collect = reader.collect_all_batches();
+        let batches = tokio::time::timeout(Duration::from_secs(10), collect)
+            .await
+            .expect("Timed out collecting partitioned reader batches")
+            .expect("Failed to collect partitioned reader batches");
+
+        // Compare sorted ids: order across the two partitions is not asserted.
+        let mut ids = extract_ids_from_batches(&batches);
+        ids.sort();
+        assert_eq!(
+            ids,
+            vec![1, 2, 3, 4],
+            "latest-offset reader should read all records present in subscribed partitions"
+        );
+
+        admin
+            .drop_table(&path, false)
+            .await
+            .expect("Failed to drop table");
+    }
+}
diff --git a/fluss-rust/crates/fluss/tests/integration/sasl_auth.rs b/fluss-rust/crates/fluss/tests/integration/sasl_auth.rs
new file mode 100644
index 0000000000..439d65fee7
--- /dev/null
+++ b/fluss-rust/crates/fluss/tests/integration/sasl_auth.rs
@@ -0,0 +1,147 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#[cfg(test)]
+mod sasl_auth_test {
+    use crate::integration::utils::get_shared_cluster;
+    use fluss::client::FlussConnection;
+    use fluss::config::Config;
+    use fluss::error::FlussError;
+    use fluss::metadata::DatabaseDescriptorBuilder;
+
+    const SASL_USERNAME: &str = "admin";
+    const SASL_PASSWORD: &str = "admin-secret";
+
+    /// A client presenting the correct SASL credentials can connect and run admin operations.
+    #[tokio::test]
+    async fn test_sasl_connect_with_valid_credentials() {
+        let db_name = "sasl_test_valid_db";
+        let descriptor = DatabaseDescriptorBuilder::default()
+            .comment("created via SASL auth")
+            .build();
+
+        let cluster = get_shared_cluster();
+        let conn = cluster
+            .get_fluss_connection_with_sasl(SASL_USERNAME, SASL_PASSWORD)
+            .await;
+        let admin = conn
+            .get_admin()
+            .expect("Should get admin with valid SASL credentials");
+
+        // Create a database and check that it exists, which confirms the
+        // authenticated connection is fully functional.
+        admin
+            .create_database(db_name, Some(&descriptor), true)
+            .await
+            .expect("Should create database with SASL auth");
+
+        assert!(admin.database_exists(db_name).await.unwrap());
+
+        // Cleanup: drop the database created above.
+        admin
+            .drop_database(db_name, true, true)
+            .await
+            .expect("Should drop database");
+    }
+
+    /// Verify that the second configured user (`alice`) can authenticate too.
+    #[tokio::test]
+    async fn test_sasl_connect_with_second_user() {
+        let cluster = get_shared_cluster();
+        let conn = cluster
+            .get_fluss_connection_with_sasl("alice", "alice-secret")
+            .await;
+        let admin = conn
+            .get_admin()
+            .expect("Should get admin with alice credentials");
+
+        // Only the success of the request is checked, not its boolean result.
+        // The request itself shows the connection is usable.
+        assert!(
+            admin
+                .database_exists("some_nonexistent_db_alice")
+                .await
+                .is_ok()
+        );
+    }
+
+    /// Verify that a wrong password for a known user is rejected with a typed
+    /// AuthenticateException error.
+    #[tokio::test]
+    async fn test_sasl_connect_with_wrong_password() {
+        let cluster = get_shared_cluster();
+        let attempt = cluster
+            .try_fluss_connection_with_sasl(SASL_USERNAME, "wrong-password")
+            .await;
+
+        let Err(err) = attempt else {
+            panic!("Connection with wrong password should fail");
+        };
+
+        // The server's error code must survive as a typed error, not be wrapped in
+        // a generic string. Code 46 = AuthenticateException; the C++ and Python
+        // bindings use it to tell authentication failures from network errors.
+        assert_eq!(
+            err.api_error(),
+            Some(FlussError::AuthenticateException),
+            "Wrong password should produce AuthenticateException, got: {err}"
+        );
+    }
+
+    /// Verify that a SASL-configured client fails when it connects to the plaintext listener.
+    #[tokio::test]
+    async fn test_sasl_client_to_plaintext_server() {
+        let cluster = get_shared_cluster();
+        // SASL/PLAIN settings with the test credentials, but pointed at the
+        // cluster's plaintext bootstrap address.
+        let config = Config {
+            bootstrap_servers: cluster.plaintext_bootstrap_servers().to_string(),
+            security_protocol: "sasl".to_string(),
+            security_sasl_mechanism: "PLAIN".to_string(),
+            security_sasl_username: SASL_USERNAME.to_string(),
+            security_sasl_password: SASL_PASSWORD.to_string(),
+            writer_acks: "all".to_string(),
+            ..Default::default()
+        };
+
+        let outcome = FlussConnection::new(config).await;
+        assert!(
+            outcome.is_err(),
+            "SASL client connecting to plaintext server should fail"
+        );
+    }
+
+    /// Verify that a nonexistent user is rejected with a typed error.
+    #[tokio::test]
+    async fn test_sasl_connect_with_unknown_user() {
+        let cluster = get_shared_cluster();
+        let result = cluster
+            .try_fluss_connection_with_sasl("nonexistent_user", "some-password")
+            .await;
+
+        let err = match result {
+            Err(e) => e,
+            Ok(_) => panic!("Connection with unknown user should fail"),
+        };
+
+        assert_eq!(
+            err.api_error(),
+            Some(FlussError::AuthenticateException),
+            "Unknown user should produce AuthenticateException, got: {err}"
+        );
+    }
+}
diff --git a/fluss-rust/crates/fluss/tests/integration/table_remote_scan.rs b/fluss-rust/crates/fluss/tests/integration/table_remote_scan.rs
new file mode 100644
index 0000000000..293c1f78c9
--- /dev/null
+++ b/fluss-rust/crates/fluss/tests/integration/table_remote_scan.rs
@@ -0,0 +1,124 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#[cfg(test)]
+mod table_remote_scan_test {
+    use crate::integration::utils::{create_table, get_shared_cluster};
+    use fluss::metadata::{DataTypes, Schema, TableDescriptor, TablePath};
+    use fluss::row::{GenericRow, InternalRow};
+    use std::time::Duration;
+
+    #[tokio::test]
+    async fn test_scan_remote_log() {
+        const RECORD_COUNT: usize = 20;
+        const MAX_WAIT_DURATION: Duration = Duration::from_secs(60);
+
+        let cluster = get_shared_cluster();
+        let conn = cluster.get_fluss_connection().await;
+        let admin = conn.get_admin().expect("Failed to get admin");
+
+        let path = TablePath::new("fluss", "test_scan_remote_log");
+        let schema = Schema::builder()
+            .column("c1", DataTypes::int())
+            .column("c2", DataTypes::string())
+            .build()
+            .expect("Failed to build schema");
+        let descriptor = TableDescriptor::builder()
+            .schema(schema)
+            .build()
+            .expect("Failed to build table");
+
+        create_table(&admin, &path, &descriptor).await;
+
+        let table = conn
+            .get_table(&path)
+            .await
+            .expect("Failed to get table");
+        let writer = table
+            .new_append()
+            .expect("Failed to create append")
+            .create_writer()
+            .expect("Failed to create writer");
+
+        // Append 20 rows (c1 = i, c2 = "v{i}"); the test expects some of them
+        // to be tiered to remote storage.
+        for i in 0..RECORD_COUNT {
+            let c2 = format!("v{}", i);
+            let mut row = GenericRow::new(2);
+            row.set_field(0, i as i32);
+            row.set_field(1, c2.as_str());
+            writer.append(&row).expect("Failed to append row");
+        }
+
+        writer.flush().await.expect("Failed to flush");
+
+        // Subscribe every bucket from offset 0. The projection swaps the
+        // columns, so each scanned row is laid out as (c2, c1).
+        let num_buckets = table.get_table_info().get_num_buckets();
+        let scanner = table
+            .new_scan()
+            .project(&[1, 0])
+            .unwrap()
+            .create_log_scanner()
+            .expect("Failed to create log scanner");
+        for bucket in 0..num_buckets {
+            scanner
+                .subscribe(bucket, 0)
+                .await
+                .expect("Failed to subscribe");
+        }
+
+        // Poll until all rows have arrived, giving up after MAX_WAIT_DURATION.
+        let started = std::time::Instant::now();
+        let mut records = Vec::with_capacity(RECORD_COUNT);
+        while records.len() < RECORD_COUNT {
+            if started.elapsed() > MAX_WAIT_DURATION {
+                panic!(
+                    "Timed out waiting for {} records; only got {} after {:?}",
+                    RECORD_COUNT,
+                    records.len(),
+                    started.elapsed()
+                );
+            }
+            let polled = scanner
+                .poll(Duration::from_secs(1))
+                .await
+                .expect("Failed to poll log scanner");
+            records.extend(polled);
+        }
+
+        // Rows are checked in arrival order against the values appended above.
+        for (i, record) in records.iter().enumerate() {
+            let row = record.row();
+            let want_c1 = i as i32;
+            let want_c2 = format!("v{}", i);
+            assert_eq!(
+                row.get_int(1).unwrap(),
+                want_c1,
+                "c1 mismatch at index {}",
+                i
+            );
+            assert_eq!(
+                row.get_string(0).unwrap(),
+                want_c2,
+                "c2 mismatch at index {}",
+                i
+            );
+        }
+    }
+}
diff --git a/fluss-rust/crates/fluss/tests/integration/utils.rs b/fluss-rust/crates/fluss/tests/integration/utils.rs
new file mode 100644
index 0000000000..2ebe31d963
--- /dev/null
+++ b/fluss-rust/crates/fluss/tests/integration/utils.rs
@@ -0,0 +1,499 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+use crate::integration::fluss_cluster::{FlussTestingCluster, FlussTestingClusterBuilder};
+use arrow::array::Int32Array;
+use fluss::client::FlussAdmin;
+use fluss::metadata::{
+    DataField, DataType, DataTypes, PartitionSpec, RowType, Schema, TableDescriptor, TablePath,
+};
+use fluss::record::ScanBatch;
+use fluss::row::FlussArray;
+use fluss::row::binary_array::FlussArrayWriter;
+use fluss::rpc::message::OffsetSpec;
+use std::collections::HashMap;
+use std::future::Future;
+use std::sync::Arc;
+use std::sync::LazyLock;
+use std::time::Duration;
+
+extern "C" fn cleanup_on_exit() {
+    SHARED_CLUSTER.stop(); // registered with `atexit` by the SHARED_CLUSTER initializer
+}
+
+/// Shared cluster with dual listeners: PLAIN_CLIENT (plaintext) on port 9223 and
+/// CLIENT (SASL) on port 9123, plus remote storage config so table_remote_scan can
+/// use it too. Built once, on first access, on a thread that owns a Tokio runtime.
+static SHARED_CLUSTER: LazyLock<FlussTestingCluster> = LazyLock::new(|| {
+    std::thread::spawn(|| {
+        let rt = tokio::runtime::Runtime::new().expect("Failed to create runtime");
+        rt.block_on(async {
+            let temp_dir = std::env::current_dir()
+                .unwrap_or_else(|_| std::path::PathBuf::from("."))
+                .join("target")
+                .join(format!("test-remote-data-{}", uuid::Uuid::new_v4()));
+            let _ = std::fs::remove_dir_all(&temp_dir); // best-effort; errors are ignored
+            std::fs::create_dir_all(&temp_dir)
+                .expect("Failed to create temporary directory for remote data");
+            let temp_dir = temp_dir
+                .canonicalize()
+                .expect("Failed to canonicalize remote data directory path");
+
+            let mut cluster_conf = HashMap::new();
+            cluster_conf.insert("log.segment.file-size".to_string(), "120b".to_string());
+            cluster_conf.insert(
+                "remote.log.task-interval-duration".to_string(),
+                "1s".to_string(),
+            );
+
+            let cluster =
+                FlussTestingClusterBuilder::new_with_cluster_conf("rust-test", &cluster_conf)
+                    .with_sasl(vec![
+                        ("admin".to_string(), "admin-secret".to_string()),
+                        ("alice".to_string(), "alice-secret".to_string()),
+                    ])
+                    .with_remote_data_dir(temp_dir)
+                    .build()
+                    .await;
+            wait_for_cluster_ready_with_sasl(&cluster).await;
+
+            // Register cleanup so containers are removed on process exit.
+            unsafe {
+                unsafe extern "C" {
+                    fn atexit(f: extern "C" fn()) -> std::os::raw::c_int;
+                }
+                atexit(cleanup_on_exit);
+            }
+
+            cluster
+        })
+    })
+    .join()
+    .expect("Failed to initialize shared cluster")
+});
+
+/// Returns a new `Arc` holding a clone of the shared test cluster.
+pub fn get_shared_cluster() -> Arc<FlussTestingCluster> {
+    Arc::new(SHARED_CLUSTER.clone())
+}
+
+/// Creates `table_path` with `table_descriptor` via `admin`; panics on failure.
+/// NOTE(review): the `false` flag is presumably `ignore_if_exists` — confirm.
+pub async fn create_table(
+    admin: &FlussAdmin,
+    table_path: &TablePath,
+    table_descriptor: &TableDescriptor,
+) {
+    let outcome = admin.create_table(table_path, table_descriptor, false).await;
+    outcome.expect("Failed to create table");
+}
+
+const READINESS_TIMEOUT: Duration = Duration::from_secs(30);
+const READINESS_POLL_INTERVAL: Duration = Duration::from_millis(200);
+
+/// Calls `probe` repeatedly, sleeping `interval` between attempts, until it
+/// returns `Ok(())`. If a probe fails once `timeout` has elapsed, panics with
+/// `timeout_message` and the last error.
+async fn poll_until<F, Fut>(
+    timeout: Duration,
+    interval: Duration,
+    timeout_message: String,
+    mut probe: F,
+) where
+    F: FnMut() -> Fut,
+    Fut: Future<Output = Result<(), String>>,
+{
+    let started = std::time::Instant::now();
+
+    // The deadline is only checked after a failed probe, so `probe` always
+    // runs at least once, even with a zero timeout.
+    while let Err(err) = probe().await {
+        if started.elapsed() >= timeout {
+            panic!(
+                "{timeout_message} after {} seconds. Last error: {err}",
+                timeout.as_secs()
+            );
+        }
+
+        tokio::time::sleep(interval).await;
+    }
+}
+
+/// Waits until bucket 0 of a non-partitioned table can serve offset requests.
+///
+/// A newly created table may not have bucket leaders yet. Polling list offsets
+/// replaces fixed sleeps, which are flaky on slow CI and wasteful on fast runs.
+pub async fn wait_for_table_ready(admin: &FlussAdmin, table_path: &TablePath) {
+    wait_for_table_buckets_ready(admin, table_path, &[0]).await;
+}
+
+/// Polls `list_offsets(.., OffsetSpec::Latest)` for `buckets` of a non-partitioned
+/// table until it succeeds; panics once `READINESS_TIMEOUT` passes without success.
+pub async fn wait_for_table_buckets_ready(
+    admin: &FlussAdmin,
+    table_path: &TablePath,
+    buckets: &[i32],
+) {
+    let timeout_message =
+        format!("Timed out waiting for table '{table_path}' buckets {buckets:?} to become ready");
+    let probe = || async {
+        let offsets = admin
+            .list_offsets(table_path, buckets, OffsetSpec::Latest)
+            .await;
+        match offsets {
+            Ok(_) => Ok(()),
+            Err(err) => Err(format!("{err:?}")),
+        }
+    };
+    poll_until(READINESS_TIMEOUT, READINESS_POLL_INTERVAL, timeout_message, probe).await;
+}
+
+/// Waits for each listed partition value in turn until its bucket 0 can serve offset requests.
+pub async fn wait_for_partitions_ready(
+    admin: &FlussAdmin,
+    table_path: &TablePath,
+    partition_values: &[&str],
+) {
+    for partition_value in partition_values {
+        wait_for_partition_ready(admin, table_path, partition_value).await;
+    }
+}
+
+/// Waits until bucket 0 of the given partition value can serve offset requests.
+pub async fn wait_for_partition_ready(
+    admin: &FlussAdmin,
+    table_path: &TablePath,
+    partition_value: &str,
+) {
+    wait_for_partition_buckets_ready(admin, table_path, partition_value, &[0]).await;
+}
+
+/// Polls `list_partition_offsets(.., OffsetSpec::Latest)` for `buckets` of one
+/// partition until it succeeds; panics once `READINESS_TIMEOUT` passes without success.
+pub async fn wait_for_partition_buckets_ready(
+    admin: &FlussAdmin,
+    table_path: &TablePath,
+    partition_value: &str,
+    buckets: &[i32],
+) {
+    let timeout_message = format!(
+        "Timed out waiting for table '{table_path}' partition '{partition_value}' buckets {buckets:?} to become ready"
+    );
+
+    let probe = || async {
+        let offsets = admin
+            .list_partition_offsets(table_path, partition_value, buckets, OffsetSpec::Latest)
+            .await;
+        match offsets {
+            Ok(_) => Ok(()),
+            Err(err) => Err(format!("{err:?}")),
+        }
+    };
+    poll_until(READINESS_TIMEOUT, READINESS_POLL_INTERVAL, timeout_message, probe).await;
+}
+
+/// Builds a string `FlussArray` from `values`, writing `None` entries as nulls.
+/// Panics if the writer fails to complete the array.
+pub fn make_string_array(values: &[Option<&str>]) -> FlussArray {
+    let mut builder = FlussArrayWriter::new(values.len(), &DataTypes::string());
+    values.iter().enumerate().for_each(|(pos, slot)| match slot {
+        Some(text) => builder.write_string(pos, text),
+        None => builder.set_null_at(pos),
+    });
+    builder.complete().expect("Failed to build string array")
+}
+
+/// Builds an int `FlussArray` from `values`, writing `None` entries as nulls.
+/// Panics if the writer fails to complete the array.
+pub fn make_int_array(values: &[Option<i32>]) -> FlussArray {
+    let mut builder = FlussArrayWriter::new(values.len(), &DataTypes::int());
+    values.iter().enumerate().for_each(|(pos, slot)| match slot {
+        Some(number) => builder.write_int(pos, *number),
+        None => builder.set_null_at(pos),
+    });
+    builder.complete().expect("Failed to build int array")
+}
+
+/// Collects the `Int32` values of column 0 from every batch, in batch and row order.
+pub fn extract_ids_from_batches(batches: &[ScanBatch]) -> Vec<i32> {
+    let mut ids = Vec::new();
+    for scan_batch in batches {
+        let batch = scan_batch.batch();
+        for row in 0..batch.num_rows() {
+            let id_column = batch
+                .column(0)
+                .as_any()
+                .downcast_ref::<Int32Array>()
+                .expect("id column should be Int32");
+            ids.push(id_column.value(row));
+        }
+    }
+    ids
+}
+
+/// Waits until a SASL-authenticated connection (using the cluster's first
+/// configured user) reports at least one available server in its metadata.
+pub async fn wait_for_cluster_ready_with_sasl(cluster: &FlussTestingCluster) {
+    let (username, password) = cluster
+        .sasl_users()
+        .first()
+        .expect("SASL cluster must have at least one user");
+
+    // Each attempt asks the cluster for a SASL connection and inspects its metadata.
+    let probe = || async {
+        let connection = cluster
+            .get_fluss_connection_with_sasl(username, password)
+            .await;
+        let has_server = connection
+            .get_metadata()
+            .get_cluster()
+            .get_one_available_server()
+            .is_some();
+        if !has_server {
+            return Err(
+                "CoordinatorEventProcessor may not be initialized or TabletServer may not be available"
+                    .to_string(),
+            );
+        }
+        Ok(())
+    };
+
+    let timeout = Duration::from_secs(30);
+    let interval = Duration::from_millis(500);
+    let timeout_message = "SASL server readiness check timed out".to_string();
+    poll_until(timeout, interval, timeout_message, probe).await;
+}
+
+/// Creates one partition per entry of `partition_values`; panics on the first failure.
+///
+/// # Arguments
+/// * `admin` - The FlussAdmin instance
+/// * `table_path` - The table path
+/// * `partition_column` - The partition column name
+/// * `partition_values` - The partition values to create
+pub async fn create_partitions(
+    admin: &FlussAdmin,
+    table_path: &TablePath,
+    partition_column: &str,
+    partition_values: &[&str],
+) {
+    for &value in partition_values {
+        // Single-entry spec: partition column -> value.
+        let spec = PartitionSpec::new(HashMap::from([(partition_column, value)]));
+        admin
+            .create_partition(table_path, &spec, true)
+            .await
+            .expect("Failed to create partition");
+    }
+}
+
+pub fn dt_array_int() -> DataType {
+    DataTypes::array(DataTypes::int()) // array type with `int` elements
+}
+
+pub fn dt_map_string_int() -> DataType {
+    DataTypes::map(DataTypes::string(), DataTypes::int()) // map type built from `string` and `int`
+}
+
+pub fn dt_row_seq_label() -> DataType {
+    // Row type with two fields, in this order: `seq` (int) and `label` (string).
+    let seq = DataField::new("seq", DataTypes::int(), None);
+    let label = DataField::new("label", DataTypes::string(), None);
+    DataTypes::row(vec![seq, label])
+}
+
+pub fn as_row_type(dt: &DataType) -> RowType {
+    let DataType::Row(row_type) = dt else {
+        panic!("expected DataType::Row, got {other:?}", other = dt);
+    };
+    row_type.clone() // owned copy of the row type borrowed from `dt`
+}
+
+pub fn dt_row_deep() -> DataType {
+    let inner = DataTypes::row(vec![DataField::new("n", DataTypes::int(), None)]); // row { n: int }
+    DataTypes::row(vec![DataField::new("inner", inner, None)]) // row { inner: row { n: int } }
+}
+
+pub fn dt_row_rich() -> DataType {
+    DataTypes::row(vec![ // 13 scalar-typed fields followed by one array-of-int field
+        DataField::new("f_bool", DataTypes::boolean(), None),
+        DataField::new("f_int", DataTypes::int(), None),
+        DataField::new("f_long", DataTypes::bigint(), None),
+        DataField::new("f_float", DataTypes::float(), None),
+        DataField::new("f_double", DataTypes::double(), None),
+        DataField::new("f_str", DataTypes::string(), None),
+        DataField::new("f_bytes", DataTypes::bytes(), None),
+        DataField::new("f_decimal", DataTypes::decimal(10, 2), None),
+        DataField::new("f_date", DataTypes::date(), None),
+        DataField::new("f_time", DataTypes::time_with_precision(3), None),
+        DataField::new("f_ts_ntz", DataTypes::timestamp_with_precision(6), None),
+        DataField::new("f_ts_ltz", DataTypes::timestamp_ltz_with_precision(6), None),
+        DataField::new("f_binary_fixed", DataTypes::binary(4), None),
+        DataField::new("f_array_int", DataTypes::array(DataTypes::int()), None),
+    ])
+}
+
+pub fn array_dt_basics_columns() -> Vec<(&'static str, DataType)> {
+    vec![ // (column name, type) pairs covering flat and nested array element types
+        ("arr_int", DataTypes::array(DataTypes::int())),
+        ("arr_string", DataTypes::array(DataTypes::string())),
+        ("arr_of_arr", DataTypes::array(dt_array_int())), // array of int arrays
+        ("arr_of_row", DataTypes::array(dt_row_seq_label())), // array of (seq, label) rows
+    ]
+}
+
+pub fn row_dt_basics_columns() -> Vec<(&'static str, DataType)> {
+    vec![ // (column name, type) pairs, one per row-type helper defined in this file
+        ("row_basic", dt_row_seq_label()),
+        ("row_deep", dt_row_deep()),
+        ("row_rich", dt_row_rich()),
+    ]
+}
+
+pub fn map_dt_basics_columns() -> Vec<(&'static str, DataType)> {
+    vec![ // (column name, type) pairs: four map columns plus one array-of-map column
+        ("map_string_int", dt_map_string_int()),
+        (
+            "map_of_row",
+            DataTypes::map(DataTypes::string(), dt_row_seq_label()),
+        ),
+        (
+            "map_of_map",
+            DataTypes::map(DataTypes::string(), dt_map_string_int()),
+        ),
+        (
+            "map_of_array",
+            DataTypes::map(DataTypes::string(), dt_array_int()),
+        ),
+        ("array_of_map", DataTypes::array(dt_map_string_int())),
+    ]
+}
+
+pub fn scalar_dt_columns() -> Vec<(&'static str, DataType)> {
+    vec![ // (column name, type) pairs; time and timestamp types appear at precisions 0, 3, 6, 9
+        ("col_tinyint", DataTypes::tinyint()),
+        ("col_smallint", DataTypes::smallint()),
+        ("col_bigint", DataTypes::bigint()),
+        ("col_float", DataTypes::float()),
+        ("col_double", DataTypes::double()),
+        ("col_boolean", DataTypes::boolean()),
+        ("col_char", DataTypes::char(10)),
+        ("col_string", DataTypes::string()),
+        ("col_decimal", DataTypes::decimal(10, 2)),
+        ("col_date", DataTypes::date()),
+        ("col_time_s", DataTypes::time_with_precision(0)),
+        ("col_time_ms", DataTypes::time_with_precision(3)),
+        ("col_time_us", DataTypes::time_with_precision(6)),
+        ("col_time_ns", DataTypes::time_with_precision(9)),
+        ("col_ts_s", DataTypes::timestamp_with_precision(0)),
+        ("col_ts_ms", DataTypes::timestamp_with_precision(3)),
+        ("col_ts_us", DataTypes::timestamp_with_precision(6)),
+        ("col_ts_ns", DataTypes::timestamp_with_precision(9)),
+        ("col_ts_ltz_s", DataTypes::timestamp_ltz_with_precision(0)),
+        ("col_ts_ltz_ms", DataTypes::timestamp_ltz_with_precision(3)),
+        ("col_ts_ltz_us", DataTypes::timestamp_ltz_with_precision(6)),
+        ("col_ts_ltz_ns", DataTypes::timestamp_ltz_with_precision(9)),
+        ("col_bytes_top", DataTypes::bytes()),
+        ("col_binary_top", DataTypes::binary(4)),
+        ("col_ts_us_neg", DataTypes::timestamp_with_precision(6)),
+        ("col_ts_ns_neg", DataTypes::timestamp_with_precision(9)),
+        (
+            "col_ts_ltz_us_neg",
+            DataTypes::timestamp_ltz_with_precision(6),
+        ),
+        (
+            "col_ts_ltz_ns_neg",
+            DataTypes::timestamp_ltz_with_precision(9),
+        ),
+    ]
+}
+
+#[derive(Default)]
+pub struct ColumnPlan {
+    cols: Vec<(&'static str, DataType)>, // columns in insertion order
+    index: HashMap<&'static str, usize>, // column name -> position in `cols`
+    sections: Vec<(&'static str, usize)>, // section name -> position of its first column
+}
+
+impl ColumnPlan {
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Appends column `name` of type `dt`; panics if `name` was already added.
+    pub fn add(mut self, name: &'static str, dt: DataType) -> Self {
+        let previous = self.index.insert(name, self.cols.len());
+        assert!(previous.is_none(), "duplicate column in plan: {name}");
+        self.cols.push((name, dt));
+        self
+    }
+
+    /// Adds every `(name, type)` pair of `it`, in order, through [`Self::add`].
+    pub fn extend<I: IntoIterator<Item = (&'static str, DataType)>>(self, it: I) -> Self {
+        it.into_iter().fold(self, |plan, (name, dt)| plan.add(name, dt))
+    }
+
+    /// Opens a section called `name` that starts at the next column to be added.
+    /// That also ends the previous section; the last section runs to the end of
+    /// the plan. Panics if a section with this name already exists.
+    pub fn start_section(mut self, name: &'static str) -> Self {
+        let is_new = self.sections.iter().all(|(existing, _)| *existing != name);
+        assert!(is_new, "duplicate section: {name}");
+        self.sections.push((name, self.cols.len()));
+        self
+    }
+
+    /// Builds a `Schema` from the columns in plan order, with `pk` as primary key if given.
+    pub fn build_schema(&self, pk: Option<&[&str]>) -> Schema {
+        let with_columns = self
+            .cols
+            .iter()
+            .fold(Schema::builder(), |sb, (name, dt)| sb.column(*name, dt.clone()));
+        let builder = match pk {
+            Some(keys) => with_columns.primary_key(keys.iter().copied()),
+            None => with_columns,
+        };
+        builder.build().expect("schema build")
+    }
+
+    /// Returns the position of column `name`; panics if the plan has no such column.
+    pub fn idx(&self, name: &str) -> usize {
+        match self.index.get(name) {
+            Some(&position) => position,
+            None => panic!("unknown column in plan: {name}"),
+        }
+    }
+
+    /// Number of columns in the plan.
+    pub fn len(&self) -> usize {
+        self.cols.len()
+    }
+
+    /// Half-open column range of section `name`: from its start to the next
+    /// section's start, or to the plan end for the last section. Panics if unknown.
+    pub fn section_range(&self, name: &str) -> std::ops::Range<usize> {
+        let Some(pos) = self.sections.iter().position(|(n, _)| *n == name) else {
+            panic!("unknown section: {name}");
+        };
+        let (_, start) = self.sections[pos];
+        let end = match self.sections.get(pos + 1) {
+            Some(&(_, next_start)) => next_start,
+            None => self.cols.len(),
+        };
+        start..end
+    }
+}
diff --git a/fluss-rust/crates/fluss/tests/test_fluss.rs b/fluss-rust/crates/fluss/tests/test_fluss.rs
new file mode 100644
index 0000000000..2d2bd152ec
--- /dev/null
+++ b/fluss-rust/crates/fluss/tests/test_fluss.rs
@@ -0,0 +1,34 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// The `fluss` crate is only linked when the `integration_tests` feature is on.
+#[cfg(feature = "integration_tests")]
+extern crate fluss;
+
+// Every integration test module is gated behind the `integration_tests`
+// feature; without it this test target contains no items at all.
+#[cfg(feature = "integration_tests")]
+mod integration {
+    mod admin;
+    mod batch_scanner;
+    mod fluss_cluster;
+    mod kv_table;
+    mod log_table;
+    mod record_batch_log_reader;
+    mod sasl_auth;
+
+    mod utils;
+
+    mod table_remote_scan;
+}
diff --git a/fluss-rust/deny.toml b/fluss-rust/deny.toml
new file mode 100644
index 0000000000..18ed544033
--- /dev/null
+++ b/fluss-rust/deny.toml
@@ -0,0 +1,36 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+[licenses]
+allow = [
+    "Apache-2.0",
+    "Apache-2.0 WITH LLVM-exception",
+    "BSD-2-Clause",
+    "BSD-3-Clause",
+    "CC0-1.0",
+    "ISC",
+    "MIT",
+    "Unicode-3.0",
+    "Zlib",
+]
+
+exceptions = [
+    # open data licenses that SHOULD be OK
+    { crate = "webpki-roots", allow = [
+        "CDLA-Permissive-2.0",
+    ] },
+]
\ No newline at end of file
diff --git a/fluss-rust/docs/cpp-bazel-usage.md b/fluss-rust/docs/cpp-bazel-usage.md
new file mode 100644
index 0000000000..61d861edcc
--- /dev/null
+++ b/fluss-rust/docs/cpp-bazel-usage.md
@@ -0,0 +1,291 @@
+# Fluss C++ Bazel Usage Guide (System / Build Modes)
+
+This guide is for:
+
+- C++ application teams consuming Fluss C++ bindings via Bazel
+- Maintainers evolving the Bazel integration
+
+For the CMake flow with the same `system` / `build` dependency modes, see
+`docs/cpp-cmake-usage.md`.
+
+Current simplification scope:
+
+- Keep only two dependency modes in the mainline guidance:
+  - `system`
+  - `build`
+- The strict internal-registry-only module flow is deferred and is not part of the mainline path
+
+## Scope
+
+- Dependency model: **root module mode**
+- Consumer dependency target: `@fluss-cpp//bindings/cpp:fluss_cpp`
+- Root `MODULE.bazel` is required for root module mode.
+- Build systems covered by this document: **Bazel**
+- Dependency modes covered by this document: **system/build**
+
+Version baseline references currently used by examples:
+
+- `protobuf/protoc`: `3.25.5`
+- `arrow-cpp`: `19.0.1`
+
+## Common Consumer `BUILD.bazel`
+
+Both modes use the same dependency target:
+
+```starlark
+load("@rules_cc//cc:defs.bzl", "cc_binary")
+
+cc_binary(
+    name = "fluss_reader",
+    srcs = ["reader.cc"],
+    deps = ["@fluss-cpp//bindings/cpp:fluss_cpp"],
+)
+```
+
+## Mode 1: `system` (Recommended in preinstalled environments)
+
+Use this mode when your environment already provides:
+
+- `protoc`
+- Arrow C++ (headers + shared libraries)
+
+### Consumer `MODULE.bazel` (pattern)
+
+```starlark
+module(name = "my_cpp_app")
+
+bazel_dep(name = "rules_cc", version = "0.2.14")
+bazel_dep(name = "fluss-cpp", version = "<released-version>")
+
+fluss_cpp = use_extension("@fluss-cpp//bindings/cpp/bazel/cpp:deps.bzl", "cpp_sdk")
+fluss_cpp.config(
+    mode = "system",
+    protobuf_version = "3.25.5",
+    arrow_cpp_version = "19.0.1",
+    # Adjust Arrow paths for your environment
+    system_arrow_prefix = "/usr",
+    system_arrow_include_dir = "include",
+    system_arrow_shared_library = "lib/x86_64-linux-gnu/libarrow.so",
+    system_arrow_runtime_glob = "lib/x86_64-linux-gnu/libarrow.so*",
+)
+use_repo(fluss_cpp, "apache_arrow_cpp")
+```
+
+### Build and run (consumer workspace pattern)
+
+Run from your consumer workspace root (the directory containing
+`MODULE.bazel` and your top-level `BUILD.bazel`).
+
+```bash
+PROTOC_BIN="$(command -v protoc)"
+CARGO_BIN="$(command -v cargo)"
+bazel run \
+  --action_env=PROTOC="$PROTOC_BIN" \
+  --action_env=CARGO="$CARGO_BIN" \
+  --action_env=PATH="$(dirname "$CARGO_BIN"):$PATH" \
+  //:fluss_reader
+```
+
+### Runnable example
+
+- `bindings/cpp/examples/bazel-consumer/system`
+
+```bash
+cd bindings/cpp/examples/bazel-consumer/system
+PROTOC_BIN="$(command -v protoc)"
+CARGO_BIN="$(command -v cargo)"
+bazel run \
+  --action_env=PROTOC="$PROTOC_BIN" \
+  --action_env=CARGO="$CARGO_BIN" \
+  --action_env=PATH="$(dirname "$CARGO_BIN"):$PATH" \
+  //:consumer_system
+```
+
+## Mode 2: `build` (No internal registry / no preinstalled Arrow)
+
+Use this mode when Arrow C++ is not preinstalled and you want Bazel to
+provision it from source.
+
+### Consumer `MODULE.bazel` (pattern)
+
+```starlark
+module(name = "my_cpp_app")
+
+bazel_dep(name = "rules_cc", version = "0.2.14")
+bazel_dep(name = "fluss-cpp", version = "<released-version>")
+
+fluss_cpp = use_extension("@fluss-cpp//bindings/cpp/bazel/cpp:deps.bzl", "cpp_sdk")
+fluss_cpp.config(
+    mode = "build",
+    protobuf_version = "3.25.5",
+    arrow_cpp_version = "19.0.1",
+)
+use_repo(fluss_cpp, "apache_arrow_cpp")
+```
+
+Notes:
+
+- `build` mode in the core Bazel integration still uses `PROTOC` (env / PATH).
+- To auto-download a pinned `protoc` for `build` mode, use
+  `bindings/cpp/scripts/ensure_protoc.sh` and pass the result via `--action_env=PROTOC=...`.
+- `ensure_protoc.sh` auto-detects host OS/arch (`linux`/`osx`, `x86_64`/`aarch_64`).
+- Some environments may require `ep_cmake_ar/ranlib/nm` overrides.
+
+### Build and run (consumer workspace pattern, with auto-downloaded `protoc`)
+
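+Run the `ensure_protoc.sh` step from the `fluss-rust` repository root (or adjust
+the script path if you copied the script elsewhere), then run `bazel run` in the
+same shell from your consumer workspace root.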
+
+```bash
+PROTOC_BIN="$(bash bindings/cpp/scripts/ensure_protoc.sh --print-path)"
+```
+
+```bash
+bazel run --action_env=PROTOC="$PROTOC_BIN" //:fluss_reader
+```
+
+If `cargo` is not on Bazel action `PATH`, also pass:
+
+```bash
+CARGO_BIN="$(command -v cargo)"
+bazel run \
+  --action_env=PROTOC="$PROTOC_BIN" \
+  --action_env=CARGO="$CARGO_BIN" \
+  --action_env=PATH="$(dirname "$CARGO_BIN"):$PATH" \
+  //:fluss_reader
+```
+
+### Runnable example
+
+- `bindings/cpp/examples/bazel-consumer/build`
+
+```bash
+cd bindings/cpp/examples/bazel-consumer/build
+PROTOC_BIN="$(bash ../../../scripts/ensure_protoc.sh --print-path)"
+CARGO_BIN="$(command -v cargo)"
+bazel run \
+  --action_env=PROTOC="$PROTOC_BIN" \
+  --action_env=CARGO="$CARGO_BIN" \
+  --action_env=PATH="$(dirname "$CARGO_BIN"):$PATH" \
+  //:consumer_build
+```
+
+## Local Development Override (Optional)
+
+For repository-local validation only:
+
+```starlark
+local_path_override(
+    module_name = "fluss-cpp",
+    path = "/path/to/fluss-rust",
+)
+```
+
+Do not keep local overrides in long-lived branches.
+
+Repository-local examples in this repo use `version = "0.1.0"` together with
+`local_path_override(...)` for local validation before publishing to the Bazel
+registry.
+
+## Repository-local Validation (Direct Commands)
+
+These commands validate the repository examples directly.
+If your environment requires a proxy for Bazel external downloads, export it
+before running (replace the placeholder URL with your actual proxy):
+
+```bash
+export BAZEL_PROXY_URL="http://proxy.example.com:3128"
+export http_proxy="$BAZEL_PROXY_URL"
+export https_proxy="$BAZEL_PROXY_URL"
+export HTTP_PROXY="$http_proxy"
+export HTTPS_PROXY="$https_proxy"
+unset all_proxy ALL_PROXY
+```
+
+### Validate `build` example
+
+```bash
+cd bindings/cpp/examples/bazel-consumer/build
+PROTOC_BIN="$(bash ../../../scripts/ensure_protoc.sh --print-path)"
+CARGO_BIN="$(command -v cargo)"
+bazel --ignore_all_rc_files run \
+  --registry=https://bcr.bazel.build \
+  --lockfile_mode=off \
+  --repo_env=http_proxy="${http_proxy:-}" \
+  --repo_env=https_proxy="${https_proxy:-}" \
+  --repo_env=HTTP_PROXY="${HTTP_PROXY:-}" \
+  --repo_env=HTTPS_PROXY="${HTTPS_PROXY:-}" \
+  --action_env=http_proxy="${http_proxy:-}" \
+  --action_env=https_proxy="${https_proxy:-}" \
+  --action_env=HTTP_PROXY="${HTTP_PROXY:-}" \
+  --action_env=HTTPS_PROXY="${HTTPS_PROXY:-}" \
+  --action_env=all_proxy= \
+  --action_env=ALL_PROXY= \
+  --action_env=PROTOC="$PROTOC_BIN" \
+  --action_env=CARGO="$CARGO_BIN" \
+  --action_env=PATH="$(dirname "$CARGO_BIN"):$PATH" \
+  --strategy=CcCmakeMakeRule=local \
+  --strategy=BootstrapGNUMake=local \
+  --strategy=BootstrapPkgConfig=local \
+  //:consumer_build
+```
+
+### Validate `system` example (using a local Arrow prefix)
+
+The `system` example defaults to `/usr`. If your Arrow prefix is elsewhere
+(for example a locally built prefix), copy the example to a temp directory and
+patch `MODULE.bazel` before running:
+
+```bash
+tmp_dir="$(mktemp -d /tmp/fluss-bazel-system-doc.XXXXXX)"
+FLUSS_RUST_ROOT="$(pwd)"
+cp -a bindings/cpp/examples/bazel-consumer/system/. "$tmp_dir/"
+sed -i \
+  -e "s|path = \"../../../../../\"|path = \"$FLUSS_RUST_ROOT\"|" \
+  -e 's|system_arrow_prefix = "/usr"|system_arrow_prefix = "/tmp/fluss-system-arrow-19.0.1"|' \
+  -e 's|system_arrow_shared_library = "lib/x86_64-linux-gnu/libarrow.so"|system_arrow_shared_library = "lib/libarrow.so"|' \
+  -e 's|system_arrow_runtime_glob = "lib/x86_64-linux-gnu/libarrow.so\*"|system_arrow_runtime_glob = "lib/libarrow.so*"|' \
+  "$tmp_dir/MODULE.bazel"
+cd "$tmp_dir"
+PROTOC_BIN="$(command -v protoc)"
+CARGO_BIN="$(command -v cargo)"
+bazel --ignore_all_rc_files run \
+  --registry=https://bcr.bazel.build \
+  --lockfile_mode=off \
+  --repo_env=http_proxy="${http_proxy:-}" \
+  --repo_env=https_proxy="${https_proxy:-}" \
+  --repo_env=HTTP_PROXY="${HTTP_PROXY:-}" \
+  --repo_env=HTTPS_PROXY="${HTTPS_PROXY:-}" \
+  --action_env=http_proxy="${http_proxy:-}" \
+  --action_env=https_proxy="${https_proxy:-}" \
+  --action_env=HTTP_PROXY="${HTTP_PROXY:-}" \
+  --action_env=HTTPS_PROXY="${HTTPS_PROXY:-}" \
+  --action_env=all_proxy= \
+  --action_env=ALL_PROXY= \
+  --action_env=PROTOC="$PROTOC_BIN" \
+  --action_env=CARGO="$CARGO_BIN" \
+  --action_env=PATH="$(dirname "$CARGO_BIN"):$PATH" \
+  //:consumer_system
+```
+
+On macOS (BSD `sed`), replace `sed -i` with `sed -i ''` in the patch step above.
+
+## Upgrade Procedure
+
+1. Update `bazel_dep(name = "fluss-cpp", version = "...")`
+2. Update mode version settings if needed (`protobuf_version`, `arrow_cpp_version`)
+3. Run `bazel mod tidy`
+4. Commit `MODULE.bazel` and `MODULE.bazel.lock`
+5. Run build + tests
+6. Verify dependency graph:
+
+```bash
+bazel mod graph | rg "fluss-cpp@"
+```
+
+## Examples and Non-Mainline References
+
+Mainline examples:
+
+- `bindings/cpp/examples/bazel-consumer/build`
+- `bindings/cpp/examples/bazel-consumer/system`
diff --git a/fluss-rust/docs/cpp-cmake-usage.md b/fluss-rust/docs/cpp-cmake-usage.md
new file mode 100644
index 0000000000..3002d1c4b4
--- /dev/null
+++ b/fluss-rust/docs/cpp-cmake-usage.md
@@ -0,0 +1,129 @@
+# Fluss C++ CMake Usage Guide (System / Build Modes)
+
+## Audience
+
+- C++ application teams building `bindings/cpp` with CMake
+- Maintainers evolving Fluss C++ dependency provisioning
+
+## Scope
+
+- Build system covered by this document: **CMake**
+- Dependency modes covered by this document: **system/build**
+
+Current tested baselines:
+
+- `protoc`: `3.25.5`
+- `arrow-cpp`: `19.0.1`
+
+Notes:
+
+- CMake currently warns (does not fail) when local `protoc`/Arrow versions differ from the baselines.
+- `protoc` is required because Rust `prost-build` runs during the C++ build.
+
+## Common Prerequisites
+
+- Rust toolchain (`cargo` in `PATH`, or set `CARGO=/path/to/cargo`)
+- `protoc` in `PATH` (required for `system` mode; `build` mode can auto-download via `bindings/cpp/scripts/ensure_protoc.sh`)
+- C++17 compiler
+- CMake 3.22+
+
+Examples below use `bindings/cpp` as the source directory.
+
+## Mode 1: `system`
+
+Use this mode when the environment already provides Arrow C++.
+
+### Configure
+
+```bash
+cmake -S bindings/cpp -B /tmp/fluss-cpp-cmake-system \
+  -DFLUSS_CPP_DEP_MODE=system \
+  -DFLUSS_CPP_ARROW_SYSTEM_ROOT=/path/to/arrow/prefix
+```
+
+Typical prefixes:
+
+- Ubuntu package install: `/usr`
+- Custom install prefix: `/usr/local` or `/opt/arrow`
+
+### Build
+
+```bash
+cmake --build /tmp/fluss-cpp-cmake-system --target fluss_cpp -j
+```
+
+## Mode 2: `build`
+
+Use this mode when Arrow C++ is not preinstalled and CMake should fetch/build it.
+
+### Configure (with auto-downloaded `protoc`)
+
+```bash
+PROTOC_BIN="$(bash bindings/cpp/scripts/ensure_protoc.sh --print-path)"
+export PATH="$(dirname "$PROTOC_BIN"):$PATH"
+```
+
+Then configure:
+
+```bash
+cmake -S bindings/cpp -B /tmp/fluss-cpp-cmake-build \
+  -DFLUSS_CPP_DEP_MODE=build
+```
+
+Optional overrides:
+
+- `-DFLUSS_CPP_ARROW_VERSION=19.0.1`
+- `-DFLUSS_CPP_ARROW_SOURCE_URL=...` (internal mirror or pinned archive)
+- `-DFLUSS_CPP_PROTOBUF_VERSION=3.25.5` (baseline warning only)
+
+If your environment needs a proxy for CMake/FetchContent downloads, export standard proxy vars before configure/build:
+
+```bash
+export http_proxy=http://host:port
+export https_proxy=http://host:port
+export HTTP_PROXY="$http_proxy"
+export HTTPS_PROXY="$https_proxy"
+```
+
+### Build
+
+```bash
+cmake --build /tmp/fluss-cpp-cmake-build --target fluss_cpp -j
+```
+
+This mode is slower on first build because it compiles Arrow C++ from source.
+
+## Repository-local Validation (Direct Commands)
+
+### Validate `system` mode
+
+```bash
+PROTOC_BIN="$(bash bindings/cpp/scripts/ensure_protoc.sh --print-path)"
+export PATH="$(dirname "$PROTOC_BIN"):$PATH"
+cmake -S bindings/cpp -B /tmp/fluss-cpp-cmake-system \
+  -DFLUSS_CPP_DEP_MODE=system \
+  -DFLUSS_CPP_ARROW_SYSTEM_ROOT=/tmp/fluss-system-arrow-19.0.1
+cmake --build /tmp/fluss-cpp-cmake-system --target fluss_cpp -j
+```
+
+### Validate `build` mode
+
+```bash
+PROTOC_BIN="$(bash bindings/cpp/scripts/ensure_protoc.sh --print-path)"
+export PATH="$(dirname "$PROTOC_BIN"):$PATH"
+cmake -S bindings/cpp -B /tmp/fluss-cpp-cmake-build \
+  -DFLUSS_CPP_DEP_MODE=build
+cmake --build /tmp/fluss-cpp-cmake-build --target fluss_cpp -j
+```
+
+## Troubleshooting
+
+- `cargo not found`
+  - Install Rust toolchain or set `CARGO=/path/to/cargo`.
+- `protoc not found`
+  - Install `protoc` and ensure it is in `PATH`.
+  - For `build` mode, use `bindings/cpp/scripts/ensure_protoc.sh` and prepend the returned path to `PATH`.
+- `arrow/c/bridge.h` not found (build mode)
+  - Reconfigure after updating to the latest `bindings/cpp/CMakeLists.txt`; build mode now adds Arrow source/build include dirs explicitly.
+- Long first build in `build` mode
+  - Expected. Arrow C++ source build dominates wall time.
diff --git a/fluss-rust/justfile b/fluss-rust/justfile
new file mode 100644
index 0000000000..c4e1a763ac
--- /dev/null
+++ b/fluss-rust/justfile
@@ -0,0 +1,28 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Create ASF source release artifacts under dist/.
+# Check out the release tag first (e.g. git checkout v0.1.0-rc1).
+# Usage: just release [version]
+#   If version is omitted, read from Cargo.toml.
+
+# version: optional; when omitted it is empty and the script reads the version
+# from Cargo.toml. `just` declares an optional parameter by giving it a default
+# value; a bracketed `[version]` is not valid parameter syntax.
+release version="":
+    ./scripts/release.sh {{version}}
+
+# Bump version on main for the next development cycle. Run from main after cutting the release branch.
+# Both arguments are required and are passed unchanged to scripts/bump-version.sh.
+# Usage: just bump-version <current> <next>   e.g. just bump-version 0.1.0 0.1.1
+bump-version from to:
+    ./scripts/bump-version.sh {{from}} {{to}}
diff --git a/fluss-rust/rust-toolchain.toml b/fluss-rust/rust-toolchain.toml
new file mode 100644
index 0000000000..870d7eb7af
--- /dev/null
+++ b/fluss-rust/rust-toolchain.toml
@@ -0,0 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+[toolchain]
+channel = "stable"
+components = ["rustfmt", "clippy"]
diff --git a/fluss-rust/rustfmt.toml b/fluss-rust/rustfmt.toml
new file mode 100644
index 0000000000..18d114826f
--- /dev/null
+++ b/fluss-rust/rustfmt.toml
@@ -0,0 +1,19 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+edition = "2024"
+reorder_imports = true
\ No newline at end of file
diff --git a/fluss-rust/scripts/bump-version.sh b/fluss-rust/scripts/bump-version.sh
new file mode 100755
index 0000000000..347b2863d9
--- /dev/null
+++ b/fluss-rust/scripts/bump-version.sh
@@ -0,0 +1,54 @@
+#!/usr/bin/env bash
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Bump version in root Cargo.toml ([workspace.package] and [workspace.dependencies] fluss-rs).
+# Run from repo root. Use after cutting a release branch so main is set to the next version.
+#
+# Usage: ./scripts/bump-version.sh <current_version> <next_version>
+#   e.g. ./scripts/bump-version.sh 0.1.0 0.1.1
+#   Or with env vars: ./scripts/bump-version.sh $RELEASE_VERSION $NEXT_VERSION
+
+set -e
+
+# Both the current and the next version are required.
+if [ -z "$1" ] || [ -z "$2" ]; then
+  echo "Usage: $0 <current_version> <next_version>"
+  echo "  e.g. $0 0.1.0 0.1.1"
+  exit 1
+fi
+
+FROM_VERSION="$1"
+TO_VERSION="$2"
+
+# Resolve the repo root from this script's location so the caller's working
+# directory does not matter.
+REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+cd "$REPO_ROOT"
+
+if [ ! -f Cargo.toml ]; then
+  echo "Cargo.toml not found. Run from repo root."
+  exit 1
+fi
+
+# Do not report success when there is nothing to replace (for example a typo
+# in the current version): fail instead of printing a misleading message.
+if ! grep -qF -e "version = \"${FROM_VERSION}\"" Cargo.toml; then
+  echo "No version = \"${FROM_VERSION}\" entry found in Cargo.toml; nothing to bump." >&2
+  exit 1
+fi
+
+# Escape the versions before interpolating them into the sed expression:
+# otherwise each "." in the current version matches any character, and a "/"
+# or "&" in the next version would corrupt the replacement.
+FROM_PATTERN="$(printf '%s' "$FROM_VERSION" | sed -e 's/[]\/$*.^[]/\\&/g')"
+TO_REPLACEMENT="$(printf '%s' "$TO_VERSION" | sed -e 's/[\/&]/\\&/g')"
+SED_EXPR="s/version = \"${FROM_PATTERN}\"/version = \"${TO_REPLACEMENT}\"/g"
+
+# Replace version = "FROM_VERSION" with version = "TO_VERSION" (all occurrences in root Cargo.toml).
+# BSD sed (macOS) requires an explicit, possibly empty, backup suffix after -i.
+case "$(uname -s)" in
+  Darwin)
+    sed -i '' "$SED_EXPR" Cargo.toml
+    ;;
+  *)
+    sed -i "$SED_EXPR" Cargo.toml
+    ;;
+esac
+
+echo "Bumped version from ${FROM_VERSION} to ${TO_VERSION} in Cargo.toml"
+echo "Review with: git diff Cargo.toml"
diff --git a/fluss-rust/scripts/constants.py b/fluss-rust/scripts/constants.py
new file mode 100644
index 0000000000..4a23e6a53e
--- /dev/null
+++ b/fluss-rust/scripts/constants.py
@@ -0,0 +1,45 @@
+#!/usr/bin/env python3
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import tomllib
+from pathlib import Path
+
+ROOT_DIR = Path(__file__).resolve().parent.parent
+
+
+def list_packages():
+    """Package directories from [workspace].members in root Cargo.toml, plus workspace root.
+    Each gets a DEPENDENCIES.rust.tsv. Avoids scanning target/, .git/, etc.
+    Requires Python 3.11+ (tomllib).
+    """
+    root_cargo = ROOT_DIR / "Cargo.toml"
+    if not root_cargo.exists():
+        return ["."]
+    with open(root_cargo, "rb") as f:
+        data = tomllib.load(f)
+    members = data.get("workspace", {}).get("members", [])
+    if not isinstance(members, list):
+        return ["."]
+    packages = ["."]
+    for m in members:
+        if isinstance(m, str) and m:
+            packages.append(m)
+    return packages
+
+
+PACKAGES = list_packages()
diff --git a/fluss-rust/scripts/dependencies.py b/fluss-rust/scripts/dependencies.py
new file mode 100644
index 0000000000..ec77469f15
--- /dev/null
+++ b/fluss-rust/scripts/dependencies.py
@@ -0,0 +1,96 @@
+#!/usr/bin/env python3
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# Release tooling: requires Python 3.11+ (constants.py uses tomllib).
+
+import sys
+
+# Fail early with a readable message: this check runs before `constants` is
+# imported below, and that module imports tomllib.
+if sys.version_info < (3, 11):
+    sys.exit(
+        "This script requires Python 3.11 or newer (uses tomllib). "
+        f"Current: {sys.version}. Use python3.11+ or see docs for release requirements."
+    )
+
+import subprocess
+from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
+
+from constants import PACKAGES, ROOT_DIR
+
+
+def check_single_package(root):
+    pkg_dir = ROOT_DIR / root if root != "." else ROOT_DIR
+    if (pkg_dir / "Cargo.toml").exists():
+        print(f"Checking dependencies of {root}")
+        subprocess.run(
+            ["cargo", "deny", "check", "license"],
+            cwd=pkg_dir,
+            check=True,
+        )
+    else:
+        print(f"Skipping {root} as Cargo.toml does not exist")
+
+
+def check_deps():
+    for d in PACKAGES:
+        check_single_package(d)
+
+
+def generate_single_package(root):
+    pkg_dir = ROOT_DIR / root if root != "." else ROOT_DIR
+    if (pkg_dir / "Cargo.toml").exists():
+        print(f"Generating dependencies {root}")
+        result = subprocess.run(
+            ["cargo", "deny", "list", "-f", "tsv", "-t", "0.6"],
+            cwd=pkg_dir,
+            capture_output=True,
+            text=True,
+        )
+        if result.returncode != 0:
+            raise RuntimeError(
+                f"cargo deny list failed in {root}: {result.stderr or result.stdout}"
+            )
+        out_file = pkg_dir / "DEPENDENCIES.rust.tsv"
+        out_file.write_text(result.stdout)
+    else:
+        print(f"Skipping {root} as Cargo.toml does not exist")
+
+
+def generate_deps():
+    for d in PACKAGES:
+        generate_single_package(d)
+
+
+# Command-line entry point: `check` runs the license check, `generate` rewrites
+# the DEPENDENCIES.rust.tsv files.
+if __name__ == "__main__":
+    parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
+    # With no subcommand, `func` stays at this default and the help text is printed.
+    parser.set_defaults(func=parser.print_help)
+    subparsers = parser.add_subparsers()
+
+    parser_check = subparsers.add_parser(
+        "check", description="Check dependencies", help="Check dependencies"
+    )
+    parser_check.set_defaults(func=check_deps)
+
+    parser_generate = subparsers.add_parser(
+        "generate", description="Generate dependencies", help="Generate dependencies"
+    )
+    parser_generate.set_defaults(func=generate_deps)
+
+    args = parser.parse_args()
+    # Forward every parsed attribute except the handler itself as keyword
+    # arguments; no other arguments are defined here, so handlers get none.
+    arg_dict = dict(vars(args))
+    del arg_dict["func"]
+    args.func(**arg_dict)
diff --git a/fluss-rust/scripts/release.sh b/fluss-rust/scripts/release.sh
new file mode 100755
index 0000000000..e4e6b07939
--- /dev/null
+++ b/fluss-rust/scripts/release.sh
@@ -0,0 +1,67 @@
+#!/usr/bin/env bash
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Create ASF source release artifacts under dist/ (aligned with Fluss release package format):
+#   fluss-rust-{version}-incubating.tgz
+#   fluss-rust-{version}-incubating.tgz.asc
+#   fluss-rust-{version}-incubating.tgz.sha512
+# (Incubator policy requires "incubating" in the artifact name.)
+# Run from repo root. Check out the release tag first (e.g. git checkout v0.1.0-rc1).
+# Usage: ./scripts/release.sh [version]
+#   If version is omitted, it is read from Cargo.toml (workspace.package.version).
+
+# Abort on the first failing command.
+set -e
+
+# Work from the repo root (the parent of this script's directory), whatever
+# the caller's working directory is.
+REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+cd "$REPO_ROOT"
+
+# Version: the first argument if given, otherwise the quoted value on the
+# first line of Cargo.toml that starts with `version =`.
+if [ -n "$1" ]; then
+  VERSION="$1"
+else
+  VERSION=$(grep -E '^version\s*=' Cargo.toml | head -1 | sed 's/.*"\([^"]*\)".*/\1/')
+  if [ -z "$VERSION" ]; then
+    echo "Could not read version from Cargo.toml. Pass version as argument: $0 <version>"
+    exit 1
+  fi
+fi
+
+# PREFIX is both the tarball's base name and the top-level directory inside it.
+PREFIX="fluss-rust-${VERSION}-incubating"
+DIST_DIR="${REPO_ROOT}/dist"
+TARBALL="${PREFIX}.tgz"
+
+echo "Creating ASF source release for fluss-rust ${VERSION}"
+mkdir -p "$DIST_DIR"
+
+# The archive is taken from the HEAD commit, so uncommitted changes in the
+# working tree are not included.
+echo "Creating source archive: ${TARBALL}"
+git archive --format=tar.gz --prefix="${PREFIX}/" -o "${DIST_DIR}/${TARBALL}" HEAD
+
+# Checksum, signing and verification each run in a subshell inside dist/, so
+# the tools are given the bare file name and this script's working directory
+# is left unchanged. `shasum` is preferred when present; `sha512sum` is the
+# fallback.
+echo "Generating SHA-512 checksum: ${TARBALL}.sha512"
+if command -v shasum >/dev/null 2>&1; then
+  (cd "$DIST_DIR" && shasum -a 512 "$TARBALL" > "${TARBALL}.sha512")
+else
+  (cd "$DIST_DIR" && sha512sum "$TARBALL" > "${TARBALL}.sha512")
+fi
+
+# Write an ASCII-armored detached signature next to the tarball.
+echo "Signing with GPG: ${TARBALL}.asc"
+(cd "$DIST_DIR" && gpg --armor --detach-sig "$TARBALL")
+
+# Check the fresh signature against the tarball before reporting success.
+echo "Verifying signature"
+(cd "$DIST_DIR" && gpg --verify "${TARBALL}.asc" "$TARBALL")
+
+echo "Done. Artifacts in dist/:"
+ls -la "${DIST_DIR}/"
+echo ""
+echo "Next: upload contents of dist/ to SVN (see docs/creating-a-release.md)."
diff --git a/fluss-rust/scripts/vendor-proto.sh b/fluss-rust/scripts/vendor-proto.sh
new file mode 100755
index 0000000000..64b328c894
--- /dev/null
+++ b/fluss-rust/scripts/vendor-proto.sh
@@ -0,0 +1,46 @@
+#!/usr/bin/env bash
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Vendor the canonical FlussApi.proto into the fluss-rs crate so `cargo publish`
+# produces a self-contained crate. build.rs prefers crates/fluss/proto/FlussApi.proto
+# when present, otherwise reads it from ../../../fluss-rpc in the monorepo.
+#
+# Usage:
+#   scripts/vendor-proto.sh           # copy canonical proto into the crate
+#   scripts/vendor-proto.sh --clean   # remove the vendored copy
+
+set -euo pipefail
+
+REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+CANONICAL="${REPO_ROOT}/../fluss-rpc/src/main/proto/FlussApi.proto"
+DEST_DIR="${REPO_ROOT}/crates/fluss/proto"
+DEST="${DEST_DIR}/FlussApi.proto"
+
+if [ "${1:-}" = "--clean" ]; then
+  rm -rf "$DEST_DIR"
+  echo "Removed vendored proto: ${DEST_DIR}"
+  exit 0
+fi
+
+if [ ! -f "$CANONICAL" ]; then
+  echo "Canonical proto not found: ${CANONICAL}" >&2
+  echo "Run from the consolidated repo (fluss-rpc must be a sibling of fluss-rust)." >&2
+  exit 1
+fi
+
+mkdir -p "$DEST_DIR"
+cp "$CANONICAL" "$DEST"
+echo "Vendored ${CANONICAL} -> ${DEST}"
diff --git a/fluss-rust/website/babel.config.js b/fluss-rust/website/babel.config.js
new file mode 100644
index 0000000000..e00595dae7
--- /dev/null
+++ b/fluss-rust/website/babel.config.js
@@ -0,0 +1,3 @@
+module.exports = {
+  presets: [require.resolve('@docusaurus/core/lib/babel/preset')],
+};
diff --git a/fluss-rust/website/docs/developer-guide/contributing.md b/fluss-rust/website/docs/developer-guide/contributing.md
new file mode 100644
index 0000000000..38b792e8e6
--- /dev/null
+++ b/fluss-rust/website/docs/developer-guide/contributing.md
@@ -0,0 +1,132 @@
+# Contributing
+
+Welcome to the development guide for `fluss-rust`! This project builds the Fluss Rust client and language-specific bindings (Python, C++).
+
+## Prerequisites
+
+- Rust 1.85+ (see [rust-toolchain.toml](https://github.com/apache/fluss-rust/blob/main/rust-toolchain.toml))
+- Protobuf compiler (`protoc`)
+
+Install using your preferred package/version manager:
+
+```bash
+# Using mise
+mise install protobuf
+mise install rust
+
+# Using Homebrew (macOS)
+brew install protobuf
+
+# Using apt (Ubuntu/Debian)
+sudo apt-get install protobuf-compiler
+```
+
+## IDE Setup
+
+We recommend [RustRover](https://www.jetbrains.com/rust/) IDE.
+
+### Importing the Project
+
+1. Clone the repository:
+   ```bash
+   git clone https://github.com/apache/fluss-rust.git
+   ```
+2. Open RustRover, go to the `Projects` tab, click `Open`, and navigate to the root directory.
+3. Click `Open`.
+
+### Copyright Profile
+
+Fluss is an Apache project, so every file needs an Apache license header. To automate this in RustRover:
+
+1. Go to `Settings` > `Editor` > `Copyright` > `Copyright Profiles`.
+2. Add a new profile named `Apache` with this text:
+   ```
+   Licensed to the Apache Software Foundation (ASF) under one
+   or more contributor license agreements.  See the NOTICE file
+   distributed with this work for additional information
+   regarding copyright ownership.  The ASF licenses this file
+   to you under the Apache License, Version 2.0 (the
+   "License"); you may not use this file except in compliance
+   with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied.  See the License for the
+   specific language governing permissions and limitations
+   under the License.
+   ```
+3. Go to `Editor` > `Copyright` and set `Apache` as the default profile.
+4. Go to `Editor` > `Copyright` > `Formatting` > `Rust`, choose `Use custom formatting`, then `Use line comment`.
+5. Click `Apply`.
+
+## Project Structure
+
+```
+crates/fluss        (Fluss Rust client crate)
+crates/examples     (Rust client examples)
+bindings/cpp        (C++ bindings)
+bindings/python     (Python bindings - PyO3)
+```
+
+## Building and Testing
+
+### Rust Client
+
+```bash
+# Build everything
+cargo build --workspace --all-targets
+
+# Run unit tests
+cargo test --workspace
+
+# Run integration tests (requires Docker)
+RUST_TEST_THREADS=1 cargo test --features integration_tests --workspace
+
+# Run a single test
+cargo test test_name
+```
+
+### Python Bindings
+
+```bash
+cd bindings/python
+
+# Install dev dependencies and build the extension
+uv sync --extra dev && uv run maturin develop
+
+# Run integration tests (requires Docker)
+uv run pytest test/ -v
+
+# To run against an existing cluster instead
+FLUSS_BOOTSTRAP_SERVERS=127.0.0.1:9123 uv run pytest test/ -v
+```
+
+### C++ Bindings
+
+```bash
+cd bindings/cpp
+mkdir -p build && cd build
+cmake ..
+cmake --build .
+```
+
+## License Check (cargo-deny)
+
+We use [cargo-deny](https://embarkstudios.github.io/cargo-deny/) to ensure all dependency licenses are Apache-compatible:
+
+```bash
+cargo install cargo-deny --locked
+cargo deny check licenses
+```
+
+## Formatting and Clippy
+
+CI runs formatting and clippy checks. Run these before submitting a PR:
+
+```bash
+cargo fmt --all
+cargo clippy --all-targets --fix --allow-dirty --allow-staged
+```
diff --git a/fluss-rust/website/docs/index.md b/fluss-rust/website/docs/index.md
new file mode 100644
index 0000000000..e7d5e08a87
--- /dev/null
+++ b/fluss-rust/website/docs/index.md
@@ -0,0 +1,45 @@
+---
+slug: /
+sidebar_position: 1
+title: Introduction
+---
+
+# Introduction
+
+[Apache Fluss](https://fluss.apache.org/) (incubating) is a streaming storage system built for real-time analytics, serving as the real-time data layer for Lakehouse architectures.
+
+This documentation covers the **Fluss client libraries** for Rust, Python, C++, and [Java](https://fluss.apache.org/docs/0.9/apis/java-client/), which are developed in the [fluss-rust](https://github.com/apache/fluss-rust) repository (Java client is part of the [main Fluss project](https://github.com/apache/fluss)). These clients allow you to:
+
+- **Create and manage** databases, tables, and partitions
+- **Write** data to log tables (append-only) and primary key tables (upsert/delete)
+- **Read** data via log scanning and key lookups
+- **Integrate** with the broader Fluss ecosystem including lakehouse snapshots
+
+## Prerequisites
+
+You need a running Fluss cluster to use any of the client libraries. See the [Deploying a Local Cluster](https://fluss.apache.org/docs/install-deploy/deploying-local-cluster/) guide to get started.
+
+## Key Concepts
+
+- **Log table** — an append-only table (no primary key). Records are immutable once written. Use for event streams, logs, and audit trails.
+  - **Offset** — the position of a record within a log table's bucket. Used to track reading progress. Start from `EARLIEST_OFFSET` to read all data, or resolve the current latest offset via `list_offsets` to only read new records.
+- **Primary key (PK) table** — a table with a primary key. Supports upsert, delete, and point lookups.
+- **Bucket** — the unit of parallelism within a table (similar to Kafka partitions). Each table has one or more buckets. Readers subscribe to individual buckets.
+- **Partition** — a way to organize data by column values (e.g. by date or region). Each partition contains its own set of buckets. Partitions must be created explicitly before writing.
+
+## Client Overview
+
+|                        | Rust                                                       | Python                   | C++                                            |
+|------------------------|------------------------------------------------------------|--------------------------|------------------------------------------------|
+| **Package**            | [fluss-rs](https://crates.io/crates/fluss-rs) on crates.io | Build from source (PyO3) | Build from source (CMake)                      |
+| **Async runtime**      | Tokio                                                      | asyncio                  | Synchronous (Tokio runtime managed internally) |
+| **Data format**        | Arrow RecordBatch / GenericRow                             | PyArrow / Pandas / dict  | Arrow RecordBatch / GenericRow                 |
+| **Log tables**         | Read + Write                                               | Read + Write             | Read + Write                                   |
+| **Primary key tables** | Upsert + Delete + Lookup                                   | Upsert + Delete + Lookup | Upsert + Delete + Lookup                       |
+| **Partitioned tables** | Read + Write                                               | Read + Write             | Read + Write                                   |
+
+## How This Guide Is Organised
+
+The **Clients** section walks through installation, configuration, and working with each table type across all three languages. Code examples are shown side by side under **Rust**, **Python**, and **C++** headings.
+
+The **Contributing** guide covers building from source, running tests, and the release process for contributors.
diff --git a/fluss-rust/website/docs/release/create-release.md b/fluss-rust/website/docs/release/create-release.md
new file mode 100644
index 0000000000..5df844325a
--- /dev/null
+++ b/fluss-rust/website/docs/release/create-release.md
@@ -0,0 +1,445 @@
+---
+sidebar_position: 4
+---
+
+# Creating a Fluss Rust Client Release
+
+This document describes in detail how to create a release of the **Fluss clients** (fluss-rust, fluss-python, fluss-cpp) from the [fluss-rust](https://github.com/apache/fluss-rust) repository. It is based on the [Creating a Fluss Release](https://fluss.apache.org/community/how-to-release/creating-a-fluss-release/) guide of the Apache Fluss project and the [release guide of Apache OpenDAL](https://nightlies.apache.org/opendal/opendal-docs-stable/community/release/). A release consists of a source archive plus the packages that CI publishes to crates.io and PyPI.
+
+Publishing software has legal consequences. This guide complements the foundation-wide [Product Release Policy](https://www.apache.org/legal/release-policy.html) and [Release Distribution Policy](https://infra.apache.org/release-distribution.html).
+
+## Overview
+
+![Release process overview](/img/release-guide.png)
+
+The release process consists of:
+
+1. [Decide to release](#decide-to-release)
+2. [Prepare for the release](#prepare-for-the-release)
+3. [Build a release candidate](#build-a-release-candidate)
+4. [Vote on the release candidate](#vote-on-the-release-candidate)
+5. [If necessary, fix any issues and go back to step 3](#fix-any-issues)
+6. [Finalize the release](#finalize-the-release)
+7. [Promote the release](#promote-the-release)
+
+## Decide to release
+
+Deciding to release and selecting a Release Manager is the first step. This is a consensus-based decision of the community.
+
+Anybody can propose a release (e.g. on the dev [mailing list](https://fluss.apache.org/community/welcome/)), giving a short rationale and nominating a committer as Release Manager (including themselves). Any objections should be resolved by consensus before starting.
+
+**Checklist to proceed**
+
+- [ ] Community agrees to release
+- [ ] A Release Manager is selected
+
+## Prepare for the release
+
+### 0. One-time Release Manager setup
+
+Before your first release, perform one-time configuration. See **[Release Manager Preparation](https://fluss.apache.org/community/how-to-release/release-manager-preparation/)** (GPG key, etc.). For fluss-rust you do **not** need Nexus/Maven; you only need GPG for signing the source archive and (optionally) git signing.
+
+For GitHub Actions publishing, configure the repository secret `CARGO_REGISTRY_TOKEN` with a crates.io API token from an account allowed to publish `fluss-rs`. The `Release Rust` workflow uses this secret directly when a release tag is pushed.
+
+**Checklist (one-time)**
+
+- [ ] GPG key set up and published to [KEYS](https://downloads.apache.org/incubator/fluss/KEYS) or Apache account
+- [ ] Git configured to use your GPG key for signing tags
+- [ ] GitHub Actions secret `CARGO_REGISTRY_TOKEN` configured for crates.io publishing
+
+### 1. Install Rust (and optional: just)
+
+The release script (`just release` or `./scripts/release.sh`) uses `git archive` and `gpg`; building or verifying the project locally requires **Rust**. Install the [Rust toolchain](https://rustup.rs/) (the version should match [rust-toolchain.toml](https://github.com/apache/fluss-rust/blob/main/rust-toolchain.toml) in the repo). The dependency list script (`scripts/dependencies.py`) requires **Python 3.11+**.
+
+```bash
+rustc --version
+cargo --version
+```
+
+To use `just release`, install [just](https://github.com/casey/just) (e.g. `cargo install just` or your system package manager). If you prefer not to use just, run `./scripts/release.sh $RELEASE_VERSION` instead.
+
+### 2. Optional: Create a new Milestone in GitHub
+
+If the project uses GitHub milestones for release tracking, create a new milestone for the **next** version (e.g. `v0.2` if you are releasing `0.1.x`). This helps contributors target issues to the correct release.
+
+### 3. Optional: Triage release-blocking issues
+
+Check open issues that might block the release. Resolve, defer to the next milestone, or mark as blocker and do not proceed until they are fixed.
+
+### 4. Clone fluss-rust into a fresh workspace
+
+Use a clean clone to avoid local changes affecting the release.
+
+```bash
+git clone https://github.com/apache/fluss-rust.git
+cd fluss-rust
+```
+
+### 5. Set up environment variables
+
+Set these once and use them in all following commands. (Bash syntax.)
+
+```bash
+RELEASE_VERSION="0.1.0"
+SHORT_RELEASE_VERSION="0.1"
+RELEASE_TAG="v${RELEASE_VERSION}"
+SVN_RELEASE_DIR="fluss-rust-${RELEASE_VERSION}"
+# Only set if there is a previous release (for compare link in DISCUSS / release notes)
+LAST_VERSION="0.0.9"
+NEXT_VERSION="0.2.0"
+```
+
+For the **first release** there is no previous version; leave `LAST_VERSION` unset or omit it when using the compare link in the DISCUSS thread and release notes.
+
+### 6. Generate dependencies list
+
+[ASF release policy](https://www.apache.org/legal/release-policy.html) requires that every release comply with [ASF licensing policy](https://www.apache.org/legal/resolved.html) and that an **audit be performed before a full release**. Generating and committing a dependency list (and using cargo-deny) documents third-party components and supports this requirement.
+
+Do this on `main` **before** creating the release branch. Then both the release branch (when created from `main`) and `main` will have the same dependency list.
+
+1. Download and set up [cargo-deny](https://embarkstudios.github.io/cargo-deny/cli/index.html) (see cargo-deny docs).
+2. Run the script to update the dependency list (requires **Python 3.11+** for the release tooling), then commit on `main`:
+
+```bash
+git checkout main
+git pull
+python3 scripts/dependencies.py generate
+# Bash: run `shopt -s globstar` first so that ** matches subdirectories
+git add **/DEPENDENCIES*.tsv
+git commit -m "chore: update dependency list for release ${RELEASE_VERSION}"
+git push origin main
+```
+
+To only check licenses (no file update): `python3 scripts/dependencies.py check`.
+
+### 7. Optional: Start a [DISCUSS] thread
+
+On [Fluss Discussions](https://github.com/apache/fluss-rust/discussions) or the dev list:
+
+- **Subject:** `[DISCUSS] Release Apache Fluss clients (fluss-rust, fluss-python, fluss-cpp) $RELEASE_VERSION`
+- **Body:** Short rationale; if there is a previous release, add compare link: `https://github.com/apache/fluss-rust/compare/v${LAST_VERSION}...main`. Ask for comments.
+
+### 8. Create a release branch
+
+From `main`, create a release branch. All release artifacts will be built from this branch. The tag (RC or release) is created later when building the release candidate.
+
+```bash
+git checkout main
+git pull
+git checkout -b release-${SHORT_RELEASE_VERSION}
+git push origin release-${SHORT_RELEASE_VERSION}
+```
+
+Do **not** create or push the release/RC tag yet; that happens in step 2 of [Build a release candidate](#build-a-release-candidate), before the source artifacts are built and staged.
+
+### 9. Bump version on main for the next development cycle
+
+So that `main` moves to the next version immediately after the release branch is cut, run the bump script and commit:
+
+```bash
+git checkout main
+git pull
+
+./scripts/bump-version.sh $RELEASE_VERSION $NEXT_VERSION
+
+git add Cargo.toml
+git commit -m "Bump version to ${NEXT_VERSION}"
+git push origin main
+```
+
+The script updates the root `Cargo.toml` ([workspace.package] and [workspace.dependencies] fluss-rs). crates/fluss and bindings inherit `version` from the workspace.
+
+### 10. Optional: Create PRs for release blog and download page
+
+You can open a pull request in the **Apache Fluss** repository for the release blog (announcement). If the project website has a download page, also create a PR to add the new version there. **Do not merge these PRs until the release is finalized.**
+
+---
+
+**Checklist to proceed to the next step**
+
+- [ ] Rust (and optionally just) installed and on PATH
+- [ ] Python 3.11+ for dependency list script
+- [ ] No release-blocking issues (or triaged)
+- [ ] Environment variables set
+- [ ] Release branch created and pushed
+- [ ] Main branch bumped to `NEXT_VERSION` and pushed
+- [ ] Dependencies list generated and committed on main
+- [ ] (Optional) DISCUSS thread and/or tracking issue created
+- [ ] (Optional) PRs for blog and download page created but not merged
+
+## Build a release candidate
+
+Each release candidate is built from the release branch, signed, and staged to the dev area of dist.apache.org. If an RC fails the vote, fix issues and repeat this section with an incremented `RC_NUM` (see [Fix any issues](#fix-any-issues)).
+
+### 1. Set RC environment variables
+
+Set these when building a **release candidate**. Start with `RC_NUM=1`; if the vote fails and you build a new candidate, increment to `2`, then `3`, etc.
+
+```bash
+export RC_NUM="1"
+export RC_TAG="v${RELEASE_VERSION}-rc${RC_NUM}"
+export SVN_RC_DIR="fluss-rust-${RELEASE_VERSION}-rc${RC_NUM}"
+```
+
+For a **direct release** (no RC), skip these and use `RELEASE_TAG` and `SVN_RELEASE_DIR` from the Prepare step instead.
+
+### 2. Check out the release branch and create the tag
+
+Check out the release branch at the commit you want to release, create the signed tag, then push it. Use `RC_TAG` for a release candidate or `RELEASE_TAG` for a direct release. Pushing the tag triggers GitHub Actions (for an RC tag, fluss-python is published to TestPyPI).
+
+```bash
+git checkout release-${SHORT_RELEASE_VERSION}
+git pull
+git tag -s $RC_TAG -m "${RC_TAG}"
+git push origin $RC_TAG
+```
+
+Check CI: [Actions](https://github.com/apache/fluss-rust/actions) (Release Rust, Release Python).
+
+### 3. Create source release artifacts
+
+From the repository root (on the release branch, at the commit you tagged):
+
+```bash
+just release $RELEASE_VERSION
+# Or: ./scripts/release.sh $RELEASE_VERSION
+```
+
+This creates under `dist/`:
+
+- `fluss-rust-${RELEASE_VERSION}-incubating.tgz`
+- `fluss-rust-${RELEASE_VERSION}-incubating.tgz.sha512`
+- `fluss-rust-${RELEASE_VERSION}-incubating.tgz.asc`
+
+(Incubator policy requires the word "incubating" in release artifact names.)
+
+Verify with: `gpg --verify dist/fluss-rust-${RELEASE_VERSION}-incubating.tgz.asc dist/fluss-rust-${RELEASE_VERSION}-incubating.tgz`
+
+### 4. Stage artifacts to SVN (dist.apache.org dev)
+
+From the **fluss-rust** repo root, check out the Fluss dev area and add the release artifacts.
+
+```bash
+svn checkout https://dist.apache.org/repos/dist/dev/incubator/fluss fluss-dist-dev --depth=immediates
+cd fluss-dist-dev
+mkdir $SVN_RC_DIR
+cp ../dist/fluss-rust-${RELEASE_VERSION}-incubating.* $SVN_RC_DIR/
+svn add $SVN_RC_DIR
+svn status
+svn commit -m "Add fluss-rust ${RELEASE_VERSION} RC${RC_NUM}"
+```
+
+Verify: [https://dist.apache.org/repos/dist/dev/incubator/fluss/](https://dist.apache.org/repos/dist/dev/incubator/fluss/)
+
+---
+
+**Checklist to proceed to the next step**
+
+- [ ] Source distribution built and signed under `dist/`
+- [ ] Artifacts staged to [dist.apache.org dev](https://dist.apache.org/repos/dist/dev/incubator/fluss/) under `$SVN_RC_DIR`
+- [ ] RC (or release) tag pushed to GitHub
+- [ ] CI for Release Rust / Release Python succeeded
+
+## Vote on the release candidate
+
+Share the release candidate for community review. If the project is in incubation, a [two-phase vote](https://incubator.apache.org/cookbook/#two_phase_vote_on_podling_releases) (Fluss community then Incubator PMC) may be required; otherwise one community vote is enough.
+
+### Fluss community vote
+
+Start the vote on the dev@ mailing list.
+
+**Subject:** `[VOTE] Release Apache Fluss clients (fluss-rust, fluss-python, fluss-cpp) ${RELEASE_VERSION} (RC${RC_NUM})`
+
+**Body template:**
+
+```
+Hi everyone,
+
+Please review and vote on release candidate #${RC_NUM} for Apache Fluss clients (fluss-rust, fluss-python, fluss-cpp) ${RELEASE_VERSION}.
+
+[ ] +1 Approve the release
+[ ] +0 No opinion
+[ ] -1 Do not approve (please provide specific comments)
+
+The release candidate (source distribution) is available at:
+* https://dist.apache.org/repos/dist/dev/incubator/fluss/$SVN_RC_DIR/
+
+KEYS for signature verification:
+* https://downloads.apache.org/incubator/fluss/KEYS
+
+Git tag:
+* https://github.com/apache/fluss-rust/releases/tag/$RC_TAG
+
+PyPI (release) / TestPyPI (RC):
+* https://pypi.org/project/pyfluss/
+* https://test.pypi.org/project/pyfluss/
+
+Please download, verify, and test. Verification steps are in [How to Verify a Release Candidate](verifying-a-release-candidate.md).
+
+The vote will be open for at least 72 hours. It is adopted by majority approval with at least 3 PPMC affirmative votes (or as per project policy).
+
+Thanks,
+Release Manager
+```
+
+If issues are found, cancel the vote and go to [Fix any issues](#fix-any-issues). If the vote passes, close it and tally the result in a follow-up:
+
+**Subject:** `[RESULT][VOTE] Release Apache Fluss clients ${RELEASE_VERSION} (RC${RC_NUM})`
+
+**Body:** Summarize binding and non-binding votes and link to the vote thread.
+
+### Incubator PMC vote (if applicable)
+
+If the project is in incubation, start a vote on general@incubator.apache.org after the Fluss community vote passes. Use the same structure: link to the community vote thread, release candidate URL, KEYS, tag, and ask IPMC to vote within 72 hours. Then send the result to the same list.
+
+---
+
+**Checklist to proceed to finalization**
+
+- [ ] Community vote passed (at least 3 binding +1, more +1 than -1)
+- [ ] (If incubating) Incubator PMC vote passed
+
+## Fix any issues
+
+If the vote revealed issues:
+
+1. Fix them on `main` (or the release branch) via normal PRs; cherry-pick fixes into the release branch as needed.
+2. Remove the old RC from dist.apache.org dev (optional but recommended):
+
+```bash
+svn checkout https://dist.apache.org/repos/dist/dev/incubator/fluss fluss-dist-dev --depth=immediates
+cd fluss-dist-dev
+svn remove $SVN_RC_DIR
+svn commit -m "Remove fluss-rust ${RELEASE_VERSION} RC${RC_NUM} (superseded)"
+```
+
+3. Increment `RC_NUM` (e.g. set `RC_NUM="2"`), recreate `RC_TAG` and `SVN_RC_DIR`, then go back to [Build a release candidate](#build-a-release-candidate) and repeat until a candidate is approved.
+
+**Checklist**
+
+- [ ] Issues resolved and changes merged/cherry-picked to the release branch
+- [ ] New RC built and voted on (or same RC re-voted if only minor fixes)
+
+## Finalize the release
+
+Once a release candidate has been approved, finalize the release.
+
+### 1. Push the release git tag (if the vote was on an RC)
+
+If the community voted on an RC tag, create and push the formal release tag so CI publishes to crates.io and PyPI:
+
+```bash
+git checkout $RC_TAG
+git tag -s $RELEASE_TAG -m "Release fluss-rust, fluss-python, fluss-cpp ${RELEASE_VERSION}"
+git push origin $RELEASE_TAG
+```
+
+### 2. Deploy source artifacts to the release repository
+
+Move the staged artifacts from dev to release:
+
+```bash
+svn mv -m "Release fluss-rust ${RELEASE_VERSION}" \
+  https://dist.apache.org/repos/dist/dev/incubator/fluss/$SVN_RC_DIR \
+  https://dist.apache.org/repos/dist/release/incubator/fluss/$SVN_RELEASE_DIR
+```
+
+(Only PPMC members may have write access to the release repository; if you get permission errors, ask on the mailing list.)
+
+### 3. Remove old RC(s) from dev (optional)
+
+Clean up the dev area so only the current RC or the moved release remains:
+
+```bash
+cd fluss-dist-dev
+svn remove $SVN_RC_DIR
+svn commit -m "Remove RC after release fluss-rust ${RELEASE_VERSION}"
+```
+
+### 4. Verify language artifacts
+
+- **fluss-rust:** [crates.io/crates/fluss-rs](https://crates.io/crates/fluss-rs) shows version `$RELEASE_VERSION`
+- **fluss-python:** [PyPI – pyfluss](https://pypi.org/project/pyfluss/) shows version `$RELEASE_VERSION`
+- **fluss-cpp:** Distributed via the source archive; no separate registry
+
+### 5. Create GitHub Release
+
+1. Go to [Releases → New release](https://github.com/apache/fluss-rust/releases/new).
+2. Choose tag `$RELEASE_TAG`.
+3. Set the target to the release branch `release-${SHORT_RELEASE_VERSION}` (i.e., the branch/commit used to create `$RELEASE_TAG`).
+4. Click **Generate release notes**, then add: notable changes, breaking changes (if any) from component upgrade docs, **official download link** (source archive and verification), and install instructions for fluss-rust, fluss-python, fluss-cpp.
+    - **Download link:** `https://downloads.apache.org/incubator/fluss/fluss-rust-${RELEASE_VERSION}/` (or the project download page). In the release description, include checksums and GPG verification steps.
+5. Click **Publish release**.
+
+### 6. Update CHANGELOG.md on main
+
+Add an entry for `$RELEASE_VERSION` with the list of changes (use [Generate Release Note](generate-release-note.md) from the release tag). Commit and push to `main`.
+
+---
+
+**Checklist to proceed to promotion**
+
+- [ ] Release tag pushed; CI published to crates.io and PyPI
+- [ ] Source artifacts in [dist release](https://dist.apache.org/repos/dist/release/incubator/fluss/)
+- [ ] GitHub Release created
+- [ ] CHANGELOG.md updated on main
+
+## Promote the release
+
+### Merge website PRs
+
+Merge the pull requests for the release blog and download page that were created in [Prepare for the release](#10-optional-create-prs-for-release-blog-and-download-page).
+
+### Announce the release
+
+Wait at least 24 hours after finalizing, per [ASF release policy](https://www.apache.org/legal/release-policy.html#release-announcements).
+
+- Announce on the dev mailing list that the release is complete.
+- Announce on [Fluss Discussions – Announcements](https://github.com/apache/fluss-rust/discussions) (if that category exists).
+- Send the release announcement to **announce@apache.org**.
+
+Use the `@apache.org` email address and **plain text** for the body; otherwise the list may reject the message.
+
+**Subject:** `[ANNOUNCE] Release Apache Fluss clients (fluss-rust, fluss-python, fluss-cpp) ${RELEASE_VERSION}`
+
+**Body template:**
+
+```
+The Apache Fluss community is pleased to announce the release of Apache Fluss clients (fluss-rust, fluss-python, fluss-cpp) ${RELEASE_VERSION}.
+
+This release includes ...
+(Notable changes; link to CHANGELOG or release notes.)
+
+Download and verification:
+* https://downloads.apache.org/incubator/fluss/$SVN_RELEASE_DIR/
+* KEYS: https://downloads.apache.org/incubator/fluss/KEYS (or https://downloads.apache.org/fluss/KEYS after graduation)
+
+Rust:    cargo add fluss-rs
+Python:  pip install pyfluss
+C++:     build from source (see project documentation)
+
+Release notes: https://github.com/apache/fluss-rust/releases/tag/$RELEASE_TAG
+
+Thanks to all contributors!
+
+Release Manager
+```
+
+---
+
+**Checklist to declare the process completed**
+
+- [ ] Release announced on dev list and (if applicable) user list
+- [ ] Release announced on announce@apache.org
+- [ ] Release blog published (if applicable)
+- [ ] Download page updated (if applicable)
+
+## Improve the process
+
+After finishing the release, consider what could be improved (simplifications, clearer steps, automation). Propose changes on the dev list or via a pull request to this guide.
+
+## See also
+
+- [Release Manager Preparation](https://fluss.apache.org/community/how-to-release/release-manager-preparation/) — GPG and one-time setup
+- [How to Verify a Release Candidate](verifying-a-release-candidate.md) — Verify signatures, checksums, build, and tests for a release candidate
+- [ASF Release Policy](https://www.apache.org/legal/release-policy.html)
diff --git a/fluss-rust/website/docs/release/generate-release-note.md b/fluss-rust/website/docs/release/generate-release-note.md
new file mode 100644
index 0000000000..9db04a5cab
--- /dev/null
+++ b/fluss-rust/website/docs/release/generate-release-note.md
@@ -0,0 +1,10 @@
+# Generate Release Note
+
+Use GitHub's **Generate release notes** to produce a draft from merged PRs between tags. Categories (Added, Fixed, Docs, etc.) are configured in [.github/release.yml](https://github.com/apache/fluss-rust/blob/main/.github/release.yml).
+
+1. Go to [Releases → New release](https://github.com/apache/fluss-rust/releases/new) on GitHub.
+2. In **Choose a tag**, pick the release tag (e.g. `v0.1.0`).
+3. Click **Generate release notes**.
+4. Copy the generated content for **CHANGELOG.md** or the GitHub Release description. When publishing the release, add the official download link, checksums/verification, and install instructions (see [Creating a Fluss Rust Client Release](create-release.md)).
+
+See [Creating a Fluss Rust Client Release](create-release.md) and [GitHub: Automatically generated release notes](https://docs.github.com/en/repositories/releasing-projects-on-github/automatically-generated-release-notes).
diff --git a/fluss-rust/website/docs/release/verifying-a-release-candidate.md b/fluss-rust/website/docs/release/verifying-a-release-candidate.md
new file mode 100644
index 0000000000..ebb4d97df5
--- /dev/null
+++ b/fluss-rust/website/docs/release/verifying-a-release-candidate.md
@@ -0,0 +1,124 @@
+# How to Verify a Release Candidate
+
+This document describes how to verify a release candidate (RC) of the **Fluss clients** (fluss-rust, fluss-python, fluss-cpp) from the [fluss-rust](https://github.com/apache/fluss-rust) repository. It is intended for anyone participating in the release vote (binding or non-binding) and is based on [Verifying a Fluss Release](https://fluss.apache.org/community/how-to-release/verifying-a-fluss-release/) of the Apache Fluss project, adapted for the fluss-rust source distribution and tooling (Rust, Python, C++).
+
+## Validating distributions
+
+The release vote email includes links to:
+
+- **Distribution archive:** source tarball (`fluss-rust-${RELEASE_VERSION}-incubating.tgz`) on [dist.apache.org dev](https://dist.apache.org/repos/dist/dev/incubator/fluss/)
+- **Signature file:** `fluss-rust-${RELEASE_VERSION}-incubating.tgz.asc`
+- **Checksum file:** `fluss-rust-${RELEASE_VERSION}-incubating.tgz.sha512`
+- **KEYS file:** [https://downloads.apache.org/incubator/fluss/KEYS](https://downloads.apache.org/incubator/fluss/KEYS)
+
+Download the archive (`.tgz`), `.asc`, and `.sha512` from the RC directory (e.g. `fluss-rust-0.1.0-rc1/`) and the KEYS file. Then follow the steps below to verify signatures and checksums.
+
+## Verifying signatures
+
+First, import the keys into your local keyring:
+
+```bash
+curl https://downloads.apache.org/incubator/fluss/KEYS -o KEYS
+gpg --import KEYS
+```
+
+Next, verify all `.asc` files:
+
+```bash
+for i in *.tgz; do echo $i; gpg --verify $i.asc $i; done
+```
+
+If verification succeeds, you will see a message like:
+
+```text
+gpg: Signature made ...
+gpg: using RSA key ...
+gpg: Good signature from "Release Manager Name (CODE SIGNING KEY) <...@apache.org>"
+```
+
+## Verifying checksums
+
+Next, verify the tarball(s) using the provided `.sha512` file(s). Each `.sha512` file lists the expected SHA-512 hash for the corresponding archive; `-c` reads that file and checks the archive.
+
+**On macOS (shasum):**
+
+```bash
+shasum -a 512 -c fluss-rust-${RELEASE_VERSION}-incubating.tgz.sha512
+```
+
+**On Linux (sha512sum):**
+
+```bash
+sha512sum -c fluss-rust-${RELEASE_VERSION}-incubating.tgz.sha512
+```
+
+If you have multiple archives, run `-c` on each `.sha512` file (or use `shasum -a 512 -c *.sha512` / `sha512sum -c *.sha512`).
+
+If the verification is successful, you will see a message like this:
+
+```text
+fluss-rust-0.1.0-incubating.tgz: OK
+```
+
+## Verifying build
+
+Extract the source release archive and verify that it builds (and optionally that tests pass). You need **Rust** (see [rust-toolchain.toml](https://github.com/apache/fluss-rust/blob/main/rust-toolchain.toml) for the expected version) and, for full builds, **protobuf** and **Python 3.9+** for bindings.
+
+```bash
+tar -xzf fluss-rust-${RELEASE_VERSION}-incubating.tgz
+cd fluss-rust-${RELEASE_VERSION}-incubating
+```
+
+Build the workspace:
+
+```bash
+cargo build --workspace --release
+```
+
+For Python bindings, see the project [README](https://github.com/apache/fluss-rust#readme) and [Development Guide](https://github.com/apache/fluss-rust/blob/main/DEVELOPMENT.md). For C++ bindings, see `bindings/cpp/`.
+
+## Verifying LICENSE and NOTICE
+
+Extract the source release archive and verify that:
+
+1. The **LICENSE** and **NOTICE** files in the root directory are correct and refer to dependencies in the source release (e.g. files in the repository such as fonts, CSS, JavaScript, images).
+2. All files that need it have ASF license headers.
+3. All dependencies have been checked for their license and the license is ASL 2.0 compatible ([ASF third-party license policy](http://www.apache.org/legal/resolved.html#category-x)).
+4. Compatible non-ASL 2.0 licenses are documented (e.g. in NOTICE or in dependency audit files such as `DEPENDENCIES*.tsv`).
+
+The project uses [cargo-deny](https://embarkstudios.github.io/cargo-deny/) for license checks; see [Creating a Fluss Rust Client Release](create-release.md) for how the dependency list is generated before a release.
+
+## Testing features
+
+For any user-facing feature included in a release, we aim to ensure it is functional, usable, and well-documented. Release managers may create testing issues that outline key scenarios to validate; these are open to all community members.
+
+**Per-language verification:** For **Rust** and **C++**, build from the source release and write your own test cases to verify. For **Python**, the RC is published to **TestPyPI**; install the client from TestPyPI and write your own test cases (e.g. connect, create table, read/write) to verify. Use the README in each component as the entry point:
+
+- **Rust client:** You can depend on the RC via its git tag (e.g. in your `Cargo.toml`: `fluss-rs = { git = "https://github.com/apache/fluss-rust", tag = "v${RELEASE_VERSION}-rc${RC_NUM}" }`) and build your own test project to verify. Alternatively, build from the source release; see [Rust Installation Guide](../user-guide/rust/installation.md).
+- **Python bindings:** See [Python Installation Guide](../user-guide/python/installation.md) for how to add the Python client (for an RC, install from **TestPyPI**: `pip install -i https://test.pypi.org/simple/ pyfluss==${RELEASE_VERSION}`); then write test cases to verify.
+- **C++ bindings:** See [C++ Installation Guide](../user-guide/cpp/installation.md) for how to build and link the C++ client; then write test cases to verify.
+
+## Incubator release checklist
+
+If the project is in incubation, the ASF Incubator provides a release checklist. You can refer to it when verifying the release:
+
+- [Incubator Release Checklist](https://cwiki.apache.org/confluence/display/INCUBATOR/Incubator+Release+Checklist)
+
+## Voting
+
+Votes are cast by replying to the vote email on the dev mailing list with **+1**, **0**, or **-1**.
+
+In addition to your vote, it is customary to state whether your vote is **binding** or **non-binding**. Only members of the PPMC and mentors have formally binding votes (and the IPMC on the Incubator general list). If unsure, you can state that your vote is non-binding. See [Apache Foundation Voting](https://www.apache.org/foundation/voting.html).
+
+It is recommended to include a short list of what you verified (e.g. signatures, checksums, build, tests, LICENSE/NOTICE). This helps the community see what has been checked and what might still be missing.
+
+**Checklist you can reference in your vote:**
+
+- [ ] [Validating distributions](#validating-distributions)
+- [ ] [Verifying signatures](#verifying-signatures)
+- [ ] [Verifying checksums](#verifying-checksums)
+- [ ] [Verifying build](#verifying-build)
+- [ ] [Verifying LICENSE and NOTICE](#verifying-license-and-notice)
+- [ ] [Testing features](#testing-features)
+- [ ] [Incubator release checklist](#incubator-release-checklist) (if applicable)
+
diff --git a/fluss-rust/website/docs/user-guide/_category_.json b/fluss-rust/website/docs/user-guide/_category_.json
new file mode 100644
index 0000000000..68ea78e784
--- /dev/null
+++ b/fluss-rust/website/docs/user-guide/_category_.json
@@ -0,0 +1,4 @@
+{
+  "label": "User Guide",
+  "position": 2
+}
diff --git a/fluss-rust/website/docs/user-guide/cpp/_category_.json b/fluss-rust/website/docs/user-guide/cpp/_category_.json
new file mode 100644
index 0000000000..fbdf7a264d
--- /dev/null
+++ b/fluss-rust/website/docs/user-guide/cpp/_category_.json
@@ -0,0 +1,4 @@
+{
+  "label": "C++",
+  "position": 3
+}
diff --git a/fluss-rust/website/docs/user-guide/cpp/api-reference.md b/fluss-rust/website/docs/user-guide/cpp/api-reference.md
new file mode 100644
index 0000000000..e9b94c9d9e
--- /dev/null
+++ b/fluss-rust/website/docs/user-guide/cpp/api-reference.md
@@ -0,0 +1,731 @@
+---
+sidebar_position: 2
+---
+# API Reference
+
+Complete API reference for the Fluss C++ client.
+
+## `Result`
+
+| Field / Method  | Type          | Description                                                    |
+|-----------------|---------------|----------------------------------------------------------------|
+| `error_code`    | `int32_t`     | 0 for success, non-zero for errors                             |
+| `error_message` | `std::string` | Human-readable error description                               |
+| `Ok()`          | `bool`        | Returns `true` if operation succeeded (`error_code == 0`)      |
+
+## `Configuration`
+
+| Field                                 | Type          | Default              | Description                                                                              |
+|---------------------------------------|---------------|----------------------|------------------------------------------------------------------------------------------|
+| `bootstrap_servers`                   | `std::string` | `"127.0.0.1:9123"`   | Coordinator server address                                                               |
+| `writer_request_max_size`             | `int32_t`     | `10485760` (10 MB)   | Maximum request size in bytes                                                            |
+| `writer_acks`                         | `std::string` | `"all"`              | Acknowledgment setting (`"all"`, `"0"`, `"1"`, or `"-1"`)                                |
+| `writer_retries`                      | `int32_t`     | `INT32_MAX`          | Number of retries on failure                                                             |
+| `writer_batch_size`                   | `int32_t`     | `2097152` (2 MB)     | Batch size for writes in bytes. Upper bound when dynamic sizing is on; fixed batch size when off |
+| `writer_dynamic_batch_size_enabled`   | `bool`        | `true`               | Enable per-table dynamic batch sizing: the target size grows by 10% when batch fill is above 80% and shrinks by 5% when it is below 50% |
+| `writer_dynamic_batch_size_min`       | `int32_t`     | `262144` (256 KB)    | Lower bound for the dynamic batch size estimator (ignored when disabled)                 |
+| `writer_batch_timeout_ms`             | `int64_t`     | `100`                | Maximum time in ms to wait for a writer batch to fill up before sending                  |
+| `writer_bucket_no_key_assigner`       | `std::string` | `"sticky"`           | Bucket assignment strategy for tables without bucket keys: `"sticky"` or `"round_robin"` |
+| `scanner_remote_log_prefetch_num`     | `size_t`      | `4`                  | Number of remote log segments to prefetch                                                |
+| `remote_file_download_thread_num`     | `size_t`      | `3`                  | Number of threads for remote log downloads                                               |
+| `scanner_remote_log_read_concurrency` | `size_t`      | `4`                  | Streaming read concurrency within a remote log file                                      |
+| `scanner_log_max_poll_records`        | `size_t`      | `500`                | Maximum number of records returned in a single Poll()                                    |
+| `scanner_log_fetch_max_bytes`         | `int32_t`     | `16777216` (16 MB)   | Maximum bytes per fetch response for LogScanner                                          |
+| `scanner_log_fetch_min_bytes`         | `int32_t`     | `1`                  | Minimum bytes the server must accumulate before returning a fetch response               |
+| `scanner_log_fetch_wait_max_time_ms`  | `int32_t`     | `500`                | Maximum time (ms) the server may wait to satisfy min-bytes                               |
+| `scanner_log_fetch_max_bytes_for_bucket`| `int32_t`   | `1048576` (1 MB)     | Maximum bytes per fetch response per bucket for LogScanner                               |
+| `connect_timeout_ms`                  | `uint64_t`    | `120000`             | TCP connect timeout in milliseconds                                                      |
+| `security_protocol`                   | `std::string` | `"PLAINTEXT"`        | `"PLAINTEXT"` (default) or `"sasl"` for SASL auth                                        |
+| `security_sasl_mechanism`             | `std::string` | `"PLAIN"`            | SASL mechanism (only `"PLAIN"` is supported)                                             |
+| `security_sasl_username`              | `std::string` | (empty)              | SASL username (required when protocol is `"sasl"`)                                       |
+| `security_sasl_password`              | `std::string` | (empty)              | SASL password (required when protocol is `"sasl"`)                                       |
+
+## `Connection`
+
+| Method                                                                  | Description                                       |
+|-------------------------------------------------------------------------|---------------------------------------------------|
+| `static Create(const Configuration& config, Connection& out) -> Result` | Create a connection to a Fluss cluster            |
+| `GetAdmin(Admin& out) -> Result`                                        | Get the admin interface                           |
+| `GetTable(const TablePath& table_path, Table& out) -> Result`           | Get a table for read/write operations             |
+| `Available() -> bool`                                                   | Check if the connection is valid and initialized  |
+
+## `Admin`
+
+### Database Operations
+
+| Method                                                                                                                    | Description              |
+|---------------------------------------------------------------------------------------------------------------------------|--------------------------|
+| `CreateDatabase(const std::string& database_name, const DatabaseDescriptor& descriptor, bool ignore_if_exists) -> Result` | Create a database        |
+| `DropDatabase(const std::string& name, bool ignore_if_not_exists, bool cascade) -> Result`                                | Drop a database          |
+| `ListDatabases(std::vector<std::string>& out) -> Result`                                                                  | List all databases       |
+| `DatabaseExists(const std::string& name, bool& out) -> Result`                                                            | Check if a database exists |
+| `GetDatabaseInfo(const std::string& name, DatabaseInfo& out) -> Result`                                                   | Get database metadata    |
+
+### Table Operations
+
+| Method                                                                                                     | Description                 |
+|------------------------------------------------------------------------------------------------------------|-----------------------------|
+| `CreateTable(const TablePath& path, const TableDescriptor& descriptor, bool ignore_if_exists) -> Result`   | Create a table              |
+| `DropTable(const TablePath& path, bool ignore_if_not_exists) -> Result`                                    | Drop a table                |
+| `GetTableInfo(const TablePath& path, TableInfo& out) -> Result`                                            | Get table metadata          |
+| `ListTables(const std::string& database_name, std::vector<std::string>& out) -> Result`                    | List tables in a database   |
+| `TableExists(const TablePath& path, bool& out) -> Result`                                                  | Check if a table exists     |
+
+### Partition Operations
+
+| Method                                                                                                                                          | Description              |
+|-------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------|
+| `CreatePartition(const TablePath& path, const std::unordered_map<std::string, std::string>& partition_spec, bool ignore_if_exists) -> Result`   | Create a partition       |
+| `DropPartition(const TablePath& path, const std::unordered_map<std::string, std::string>& partition_spec, bool ignore_if_not_exists) -> Result` | Drop a partition         |
+| `ListPartitionInfos(const TablePath& path, std::vector<PartitionInfo>& out) -> Result`                                                          | List partition metadata  |
+
+### Offset Operations
+
+| Method                                                                                                                                                                                                  | Description                             |
+|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------|
+| `ListOffsets(const TablePath& path, const std::vector<int32_t>& bucket_ids, const OffsetSpec& query, std::unordered_map<int32_t, int64_t>& out) -> Result`                                             | Get offsets for buckets                 |
+| `ListPartitionOffsets(const TablePath& path, const std::string& partition_name, const std::vector<int32_t>& bucket_ids, const OffsetSpec& query, std::unordered_map<int32_t, int64_t>& out) -> Result` | Get offsets for a partition's buckets   |
+
+### Lake Operations
+
+| Method                                                                      | Description                  |
+|-----------------------------------------------------------------------------|------------------------------|
+| `GetLatestLakeSnapshot(const TablePath& path, LakeSnapshot& out) -> Result` | Get the latest lake snapshot |
+
+### Cluster Operations
+
+| Method                                                    | Description                                        |
+|-----------------------------------------------------------|----------------------------------------------------|
+| `GetServerNodes(std::vector<ServerNode>& out) -> Result`  | Get all alive server nodes (coordinator + tablets) |
+
+## `ServerNode`
+
+| Field         | Type          | Description                                              |
+|---------------|---------------|----------------------------------------------------------|
+| `id`          | `int32_t`     | Server node ID                                           |
+| `host`        | `std::string` | Hostname of the server                                   |
+| `port`        | `uint32_t`    | Port number                                              |
+| `server_type` | `std::string` | Server type (`"CoordinatorServer"` or `"TabletServer"`)  |
+| `uid`         | `std::string` | Unique identifier (e.g. `"cs-0"`, `"ts-1"`)             |
+
+## `Table`
+
+| Method                        | Description                              |
+|-------------------------------|------------------------------------------|
+| `NewRow() -> GenericRow`      | Create a schema-aware row for this table |
+| `NewAppend() -> TableAppend`  | Create an append builder for log tables  |
+| `NewUpsert() -> TableUpsert`  | Create an upsert builder for PK tables   |
+| `NewLookup() -> TableLookup`  | Create a lookup builder for PK tables    |
+| `NewScan() -> TableScan`      | Create a scan builder                    |
+| `GetTableInfo() -> TableInfo` | Get table metadata                       |
+| `GetTablePath() -> TablePath` | Get the table path                       |
+| `HasPrimaryKey() -> bool`     | Check if the table has a primary key     |
+
+## `TableAppend`
+
+| Method                                       | Description             |
+|----------------------------------------------|-------------------------|
+| `CreateWriter(AppendWriter& out) -> Result`  | Create an append writer |
+
+## `TableUpsert`
+
+| Method                                                                       | Description                                |
+|------------------------------------------------------------------------------|--------------------------------------------|
+| `PartialUpdateByIndex(std::vector<size_t> column_indices) -> TableUpsert&`   | Configure partial update by column indices |
+| `PartialUpdateByName(std::vector<std::string> column_names) -> TableUpsert&` | Configure partial update by column names   |
+| `CreateWriter(UpsertWriter& out) -> Result`                                  | Create an upsert writer                    |
+
+## `TableLookup`
+
+| Method                                    | Description                         |
+|-------------------------------------------|-------------------------------------|
+| `CreateLookuper(Lookuper& out) -> Result` | Create a lookuper for point lookups |
+
+## `TableScan`
+
+| Method                                                               | Description                                   |
+|----------------------------------------------------------------------|-----------------------------------------------|
+| `ProjectByIndex(std::vector<size_t> column_indices) -> TableScan&`   | Project columns by index                      |
+| `ProjectByName(std::vector<std::string> column_names) -> TableScan&` | Project columns by name                       |
+| `CreateLogScanner(LogScanner& out) -> Result`                        | Create a record-based log scanner             |
+| `CreateRecordBatchLogScanner(LogScanner& out) -> Result`             | Create an Arrow RecordBatch-based log scanner |
+
+## `AppendWriter`
+
+| Method                                                      | Description                            |
+|-------------------------------------------------------------|----------------------------------------|
+| `Append(const GenericRow& row) -> Result`                   | Append a row (fire-and-forget)         |
+| `Append(const GenericRow& row, WriteResult& out) -> Result` | Append a row with write acknowledgment |
+| `Flush() -> Result`                                         | Flush all pending writes               |
+
+## `UpsertWriter`
+
+| Method                                                      | Description                                   |
+|-------------------------------------------------------------|-----------------------------------------------|
+| `Upsert(const GenericRow& row) -> Result`                   | Upsert a row (fire-and-forget)                |
+| `Upsert(const GenericRow& row, WriteResult& out) -> Result` | Upsert a row with write acknowledgment        |
+| `Delete(const GenericRow& row) -> Result`                   | Delete a row by primary key (fire-and-forget) |
+| `Delete(const GenericRow& row, WriteResult& out) -> Result` | Delete a row with write acknowledgment        |
+| `Flush() -> Result`                                         | Flush all pending operations                  |
+
+## `WriteResult`
+
+| Method             | Description                                 |
+|--------------------|---------------------------------------------|
+| `Wait() -> Result` | Wait for server acknowledgment of the write |
+
+## `Lookuper`
+
+| Method                                                        |  Description                |
+|---------------------------------------------------------------|-----------------------------|
+| `Lookup(const GenericRow& pk_row, LookupResult& out) -> Result` | Lookup a row by primary key |
+
+## `LogScanner`
+
+| Method                                                                                               |  Description                              |
+|------------------------------------------------------------------------------------------------------|-------------------------------------------|
+| `Subscribe(int32_t bucket_id, int64_t offset) -> Result`                                             | Subscribe to a single bucket at an offset |
+| `Subscribe(const std::vector<BucketSubscription>& bucket_offsets) -> Result`                         | Subscribe to multiple buckets             |
+| `SubscribePartitionBuckets(int64_t partition_id, int32_t bucket_id, int64_t start_offset) -> Result` | Subscribe to a single partition bucket    |
+| `SubscribePartitionBuckets(const std::vector<PartitionBucketSubscription>& subscriptions) -> Result` | Subscribe to multiple partition buckets   |
+| `Unsubscribe(int32_t bucket_id) -> Result`                                                           | Unsubscribe from a non-partitioned bucket |
+| `UnsubscribePartition(int64_t partition_id, int32_t bucket_id) -> Result`                            | Unsubscribe from a partition bucket       |
+| `Poll(int64_t timeout_ms, ScanRecords& out) -> Result`                                               | Poll individual records                   |
+| `PollRecordBatch(int64_t timeout_ms, ArrowRecordBatches& out) -> Result`                             | Poll Arrow RecordBatches                  |
+
+## `GenericRow`
+
+`GenericRow` is a **write-only** row used for append, upsert, delete, and lookup key construction. For reading field values from scan or lookup results, see [`RowView`](#rowview) and [`LookupResult`](#lookupresult).
+
+### Index-Based Setters
+
+| Method                                                    |  Description                   |
+|-----------------------------------------------------------|--------------------------------|
+| `SetNull(size_t idx)`                                     | Set field to null              |
+| `SetBool(size_t idx, bool value)`                         | Set boolean value              |
+| `SetInt32(size_t idx, int32_t value)`                     | Set 32-bit integer             |
+| `SetInt64(size_t idx, int64_t value)`                     | Set 64-bit integer             |
+| `SetFloat32(size_t idx, float value)`                     | Set 32-bit float               |
+| `SetFloat64(size_t idx, double value)`                    | Set 64-bit float               |
+| `SetString(size_t idx, const std::string& value)`         | Set string value               |
+| `SetBytes(size_t idx, const std::vector<uint8_t>& value)` | Set binary data                |
+| `SetDate(size_t idx, const Date& value)`                  | Set date value                 |
+| `SetTime(size_t idx, const Time& value)`                  | Set time value                 |
+| `SetTimestampNtz(size_t idx, const Timestamp& value)`     | Set timestamp without timezone |
+| `SetTimestampLtz(size_t idx, const Timestamp& value)`     | Set timestamp with local timezone |
+| `SetDecimal(size_t idx, const std::string& value)`        | Set decimal from string        |
+| `SetArray(size_t idx, ArrayWriter&& writer)`              | Set array value (consumes the writer) |
+
+### Name-Based Setters
+
+When using `table.NewRow()`, the `Set()` method auto-routes to the correct type based on the schema:
+
+| Method                                                   | Description                       |
+|----------------------------------------------------------|-----------------------------------|
+| `Set(const std::string& name, std::nullptr_t)`           | Set field to null by column name  |
+| `Set(const std::string& name, bool value)`               | Set boolean by column name        |
+| `Set(const std::string& name, int32_t value)`            | Set integer by column name        |
+| `Set(const std::string& name, int64_t value)`            | Set big integer by column name    |
+| `Set(const std::string& name, float value)`              | Set float by column name          |
+| `Set(const std::string& name, double value)`             | Set double by column name         |
+| `Set(const std::string& name, const std::string& value)` | Set string/decimal by column name |
+| `Set(const std::string& name, const Date& value)`        | Set date by column name           |
+| `Set(const std::string& name, const Time& value)`        | Set time by column name           |
+| `Set(const std::string& name, const Timestamp& value)`   | Set timestamp by column name      |
+
+## `RowView`
+
+Read-only row view for scan results. Provides zero-copy access to string and bytes data. `RowView` shares ownership of the underlying scan data via reference counting, so it can safely outlive the `ScanRecords` that produced it.
+
+:::note string_view Lifetime
+`GetString()` returns `std::string_view` that borrows from the underlying data. The `string_view` is valid as long as any `RowView` (or `ScanRecord`) referencing the same poll result is alive. Copy to `std::string` if you need the value after all references are gone.
+:::
+
+### Index-Based Getters
+
+| Method                                                     |  Description                   |
+|------------------------------------------------------------|--------------------------------|
+| `FieldCount() -> size_t`                                   | Get the number of fields       |
+| `GetType(size_t idx) -> TypeId`                            | Get the type at index          |
+| `IsNull(size_t idx) -> bool`                               | Check if field is null         |
+| `GetBool(size_t idx) -> bool`                              | Get boolean value at index     |
+| `GetInt32(size_t idx) -> int32_t`                          | Get 32-bit integer at index    |
+| `GetInt64(size_t idx) -> int64_t`                          | Get 64-bit integer at index    |
+| `GetFloat32(size_t idx) -> float`                          | Get 32-bit float at index      |
+| `GetFloat64(size_t idx) -> double`                         | Get 64-bit float at index      |
+| `GetString(size_t idx) -> std::string_view`                | Get string at index (zero-copy)|
+| `GetBytes(size_t idx) -> std::pair<const uint8_t*, size_t>`| Get binary data at index (zero-copy)|
+| `GetDate(size_t idx) -> Date`                              | Get date at index              |
+| `GetTime(size_t idx) -> Time`                              | Get time at index              |
+| `GetTimestamp(size_t idx) -> Timestamp`                    | Get timestamp at index         |
+| `IsDecimal(size_t idx) -> bool`                            | Check if field is a decimal type|
+| `GetDecimalString(size_t idx) -> std::string`              | Get decimal as string at index |
+
+### Array Getters (Index-Based)
+
+| Method                                                             |  Description                              |
+|--------------------------------------------------------------------|-------------------------------------------|
+| `GetArraySize(size_t idx) -> size_t`                               | Get element count of array at index       |
+| `GetArrayElementType(size_t idx) -> TypeId`                        | Get element type of array at index        |
+| `IsArrayElementNull(size_t idx, size_t element) -> bool`           | Check if array element is null            |
+| `GetArrayBool(size_t idx, size_t element) -> bool`                 | Get boolean array element                 |
+| `GetArrayInt32(size_t idx, size_t element) -> int32_t`             | Get 32-bit integer array element          |
+| `GetArrayInt64(size_t idx, size_t element) -> int64_t`             | Get 64-bit integer array element          |
+| `GetArrayFloat32(size_t idx, size_t element) -> float`             | Get 32-bit float array element            |
+| `GetArrayFloat64(size_t idx, size_t element) -> double`            | Get 64-bit float array element            |
+| `GetArrayString(size_t idx, size_t element) -> std::string`        | Get string array element                  |
+| `GetArrayBytes(size_t idx, size_t element) -> std::vector<uint8_t>`| Get binary array element                  |
+| `GetArrayDate(size_t idx, size_t element) -> Date`                 | Get date array element                    |
+| `GetArrayTime(size_t idx, size_t element) -> Time`                 | Get time array element                    |
+| `GetArrayTimestamp(size_t idx, size_t element) -> Timestamp`       | Get timestamp array element               |
+| `GetArrayDecimalString(size_t idx, size_t element) -> std::string` | Get decimal array element as string       |
+| `GetArrayView(size_t idx) -> ArrayView`                            | Get owning ArrayView for nested access    |
+
+All array getters are also available by column name (e.g., `GetArraySize("col")`, `GetArrayView("col")`).
+
+### Name-Based Getters
+
+| Method                                                  |  Description                       |
+|---------------------------------------------------------|------------------------------------|
+| `IsNull(const std::string& name) -> bool`               | Check if field is null by name     |
+| `GetBool(const std::string& name) -> bool`              | Get boolean by column name         |
+| `GetInt32(const std::string& name) -> int32_t`          | Get 32-bit integer by column name  |
+| `GetInt64(const std::string& name) -> int64_t`          | Get 64-bit integer by column name  |
+| `GetFloat32(const std::string& name) -> float`          | Get 32-bit float by column name    |
+| `GetFloat64(const std::string& name) -> double`         | Get 64-bit float by column name    |
+| `GetString(const std::string& name) -> std::string_view`| Get string by column name          |
+| `GetBytes(const std::string& name) -> std::pair<const uint8_t*, size_t>` | Get binary data by column name |
+| `GetDate(const std::string& name) -> Date`              | Get date by column name            |
+| `GetTime(const std::string& name) -> Time`              | Get time by column name            |
+| `GetTimestamp(const std::string& name) -> Timestamp`    | Get timestamp by column name       |
+| `GetDecimalString(const std::string& name) -> std::string` | Get decimal as string by column name |
+
+## `ScanRecord`
+
+`ScanRecord` is a value type that can be freely copied, stored, and accumulated across multiple `Poll()` calls. It shares ownership of the underlying scan data via reference counting.
+
+| Field         | Type         |  Description                                                        |
+|---------------|--------------|---------------------------------------------------------------------|
+| `offset`      | `int64_t`    | Record offset in the log                                            |
+| `timestamp`   | `int64_t`    | Record timestamp                                                    |
+| `change_type` | `ChangeType` | Change type (AppendOnly, Insert, UpdateBefore, UpdateAfter, Delete) |
+| `row`         | `RowView`    | Row data (value type, shares ownership via reference counting)      |
+
+## `ScanRecords`
+
+### Flat Access
+
+| Method                                  |  Description                               |
+|-----------------------------------------|--------------------------------------------|
+| `Count() -> size_t`                     | Total number of records across all buckets |
+| `IsEmpty() -> bool`                     | Check if empty                             |
+| `begin() / end()`                       | Iterator support for range-based for loops |
+
+Flat iteration over all records (regardless of bucket):
+
+```cpp
+for (const auto& rec : records) {
+    std::cout << "offset=" << rec.offset << std::endl;
+}
+```
+
+### Per-Bucket Access
+
+| Method                                                          |  Description                                                          |
+|-----------------------------------------------------------------|-----------------------------------------------------------------------|
+| `BucketCount() -> size_t`                                       | Number of distinct buckets                                            |
+| `Buckets() -> std::vector<TableBucket>`                         | List of distinct buckets                                              |
+| `Records(const TableBucket& bucket) -> BucketRecords`              | Records for a specific bucket (empty if bucket not present)           |
+| `BucketAt(size_t idx) -> BucketRecords`                            | Records by bucket index (0-based, O(1))                               |
+
+## `BucketRecords`
+
+A bundle of scan records belonging to a single bucket. Obtained from `ScanRecords::Records()` or `ScanRecords::BucketAt()`. `BucketRecords` is a value type — it shares ownership of the underlying scan data via reference counting, so it can safely outlive the `ScanRecords` that produced it.
+
+| Method                                         |  Description                               |
+|------------------------------------------------|--------------------------------------------|
+| `Size() -> size_t`                         | Number of records in this bucket           |
+| `Empty() -> bool`                          | Check if empty                             |
+| `Bucket() -> const TableBucket&`           | Get the bucket                             |
+| `operator[](size_t idx) -> ScanRecord`     | Access record by index within this bucket  |
+| `begin() / end()`                          | Iterator support for range-based for loops |
+
+## `TableBucket`
+
+| Field / Method                        |  Description                                    |
+|---------------------------------------|-------------------------------------------------|
+| `table_id -> int64_t`                    | Table ID                                        |
+| `bucket_id -> int32_t`                   | Bucket ID                                       |
+| `partition_id -> std::optional<int64_t>` | Partition ID (empty if non-partitioned)         |
+| `operator==(const TableBucket&) -> bool` | Equality comparison                             |
+
+## `LookupResult`
+
+Read-only result for lookup operations. Provides zero-copy access to field values.
+
+### Metadata
+
+| Method                      |  Description                   |
+|-----------------------------|--------------------------------|
+| `Found() -> bool`           | Whether a matching row was found |
+| `FieldCount() -> size_t`    | Get the number of fields       |
+
+### Index-Based Getters
+
+| Method                                                     |  Description                   |
+|------------------------------------------------------------|--------------------------------|
+| `GetType(size_t idx) -> TypeId`                            | Get the type at index          |
+| `IsNull(size_t idx) -> bool`                               | Check if field is null         |
+| `GetBool(size_t idx) -> bool`                              | Get boolean value at index     |
+| `GetInt32(size_t idx) -> int32_t`                          | Get 32-bit integer at index    |
+| `GetInt64(size_t idx) -> int64_t`                          | Get 64-bit integer at index    |
+| `GetFloat32(size_t idx) -> float`                          | Get 32-bit float at index      |
+| `GetFloat64(size_t idx) -> double`                         | Get 64-bit float at index      |
+| `GetString(size_t idx) -> std::string_view`                | Get string at index (zero-copy)|
+| `GetBytes(size_t idx) -> std::pair<const uint8_t*, size_t>`| Get binary data at index (zero-copy)|
+| `GetDate(size_t idx) -> Date`                              | Get date at index              |
+| `GetTime(size_t idx) -> Time`                              | Get time at index              |
+| `GetTimestamp(size_t idx) -> Timestamp`                    | Get timestamp at index         |
+| `IsDecimal(size_t idx) -> bool`                            | Check if field is a decimal type|
+| `GetDecimalString(size_t idx) -> std::string`              | Get decimal as string at index |
+
+### Array Getters (Index-Based)
+
+Same array getters as [`RowView`](#array-getters-index-based) — `GetArraySize`, `GetArrayInt32`, `GetArrayView`, etc. Also available by column name.
+
+### Name-Based Getters
+
+| Method                                                  |  Description                       |
+|---------------------------------------------------------|------------------------------------|
+| `IsNull(const std::string& name) -> bool`               | Check if field is null by name     |
+| `GetBool(const std::string& name) -> bool`              | Get boolean by column name         |
+| `GetInt32(const std::string& name) -> int32_t`          | Get 32-bit integer by column name  |
+| `GetInt64(const std::string& name) -> int64_t`          | Get 64-bit integer by column name  |
+| `GetFloat32(const std::string& name) -> float`          | Get 32-bit float by column name    |
+| `GetFloat64(const std::string& name) -> double`         | Get 64-bit float by column name    |
+| `GetString(const std::string& name) -> std::string_view`| Get string by column name          |
+| `GetBytes(const std::string& name) -> std::pair<const uint8_t*, size_t>` | Get binary data by column name |
+| `GetDate(const std::string& name) -> Date`              | Get date by column name            |
+| `GetTime(const std::string& name) -> Time`              | Get time by column name            |
+| `GetTimestamp(const std::string& name) -> Timestamp`    | Get timestamp by column name       |
+| `GetDecimalString(const std::string& name) -> std::string` | Get decimal as string by column name |
+
+## `ArrowRecordBatch`
+
+| Method                                                         | Description                          |
+|----------------------------------------------------------------|--------------------------------------|
+| `GetArrowRecordBatch() -> std::shared_ptr<arrow::RecordBatch>` | Get the underlying Arrow RecordBatch |
+| `Available() -> bool`                                          | Check if the batch is valid          |
+| `NumRows() -> int64_t`                                         | Number of rows in the batch          |
+| `GetTableId() -> int64_t`                                      | Table ID                             |
+| `GetPartitionId() -> int64_t`                                  | Partition ID                         |
+| `GetBucketId() -> int32_t`                                     | Bucket ID                            |
+| `GetBaseOffset() -> int64_t`                                   | First record offset                  |
+| `GetLastOffset() -> int64_t`                                   | Last record offset                   |
+
+## `ArrowRecordBatches`
+
+| Method                   |  Description                               |
+|--------------------------|--------------------------------------------|
+| `Size() -> size_t`       | Number of batches                          |
+| `Empty() -> bool`        | Check if empty                             |
+| `operator[](size_t idx)` | Access batch by index                      |
+| `begin() / end()`        | Iterator support for range-based for loops |
+
+## `Schema`
+
+| Method                            |  Description                |
+|-----------------------------------|-----------------------------|
+| `NewBuilder() -> Schema::Builder` | Create a new schema builder |
+
+## `Schema::Builder`
+
+| Method                                                                 |  Description            |
+|------------------------------------------------------------------------|-------------------------|
+| `AddColumn(const std::string& name, const DataType& type) -> Builder&` | Add a column            |
+| `SetPrimaryKeys(const std::vector<std::string>& keys) -> Builder&`     | Set primary key columns |
+| `Build() -> Schema`                                                    | Build the schema        |
+
+## `TableDescriptor`
+
+| Method                                     |  Description                          |
+|--------------------------------------------|---------------------------------------|
+| `NewBuilder() -> TableDescriptor::Builder` | Create a new table descriptor builder |
+
+## `TableDescriptor::Builder`
+
+| Method                                                                            | Description                |
+|-----------------------------------------------------------------------------------|----------------------------|
+| `SetSchema(const Schema& schema) -> Builder&`                                     | Set the table schema       |
+| `SetPartitionKeys(const std::vector<std::string>& keys) -> Builder&`              | Set partition key columns  |
+| `SetBucketCount(int32_t count) -> Builder&`                                       | Set the number of buckets  |
+| `SetBucketKeys(const std::vector<std::string>& keys) -> Builder&`                 | Set bucket key columns     |
+| `SetProperty(const std::string& key, const std::string& value) -> Builder&`       | Set a table property       |
+| `SetCustomProperty(const std::string& key, const std::string& value) -> Builder&` | Set a custom property      |
+| `SetComment(const std::string& comment) -> Builder&`                              | Set a table comment        |
+| `Build() -> TableDescriptor`                                                      | Build the table descriptor |
+
+## `DataType`
+
+### Factory Methods
+
+| Method                                        |  Description                       |
+|-----------------------------------------------|------------------------------------|
+| `DataType::Boolean()`                         | Boolean type                       |
+| `DataType::TinyInt()`                         | 8-bit signed integer               |
+| `DataType::SmallInt()`                        | 16-bit signed integer              |
+| `DataType::Int()`                             | 32-bit signed integer              |
+| `DataType::BigInt()`                          | 64-bit signed integer              |
+| `DataType::Float()`                           | 32-bit floating point              |
+| `DataType::Double()`                          | 64-bit floating point              |
+| `DataType::String()`                          | UTF-8 string                       |
+| `DataType::Bytes()`                           | Binary data                        |
+| `DataType::Date()`                            | Date (days since epoch)            |
+| `DataType::Time()`                            | Time (milliseconds since midnight) |
+| `DataType::Timestamp(int precision)`          | Timestamp without timezone         |
+| `DataType::TimestampLtz(int precision)`       | Timestamp with local time zone     |
+| `DataType::Decimal(int precision, int scale)` | Decimal with precision and scale   |
+| `DataType::Array(DataType element)`           | Array of the given element type    |
+
+### Accessors
+
+| Method                              |  Description                                |
+|-------------------------------------|---------------------------------------------|
+| `id() -> TypeId`                    | Get the type ID                             |
+| `precision() -> int`               | Get precision (for Decimal/Timestamp types) |
+| `scale() -> int`                   | Get scale (for Decimal type)                |
+| `nullable() -> bool`               | Returns `true` if this type is nullable (default), `false` if `NOT NULL` |
+| `element_type() -> const DataType*` | Get element type (for Array type, nullptr otherwise) |
+| `NotNull() -> DataType`            | Returns a copy of this type with nullable set to `false` |
+
+## `ArrayWriter`
+
+Write-only builder for array column values. Constructed with a fixed size and element type, then populated element-by-element. Move-only — consumed by `GenericRow::SetArray()` or `ArrayWriter::SetArray()` for nested arrays.
+
+| Method                                                    |  Description                              |
+|-----------------------------------------------------------|-------------------------------------------|
+| `ArrayWriter(size_t size, DataType element_type)`         | Create an array writer                    |
+| `SetNull(size_t idx)`                                     | Set element to null                       |
+| `SetBool(size_t idx, bool value)`                         | Set boolean element                       |
+| `SetInt32(size_t idx, int32_t value)`                     | Set 32-bit integer element                |
+| `SetInt64(size_t idx, int64_t value)`                     | Set 64-bit integer element                |
+| `SetFloat32(size_t idx, float value)`                     | Set 32-bit float element                  |
+| `SetFloat64(size_t idx, double value)`                    | Set 64-bit float element                  |
+| `SetString(size_t idx, const std::string& value)`         | Set string element                        |
+| `SetBytes(size_t idx, const std::vector<uint8_t>& value)` | Set binary element                        |
+| `SetDate(size_t idx, const Date& value)`                  | Set date element                          |
+| `SetTime(size_t idx, const Time& value)`                  | Set time element                          |
+| `SetTimestampNtz(size_t idx, const Timestamp& value)`     | Set timestamp without timezone element    |
+| `SetTimestampLtz(size_t idx, const Timestamp& value)`     | Set timestamp with local time zone element |
+| `SetDecimal(size_t idx, const std::string& value)`        | Set decimal element from string           |
+| `SetArray(size_t idx, ArrayWriter&& nested)`              | Set nested array element (consumes nested)|
+
+## `ArrayView`
+
+Read-only view over an array column value. Obtained from `RowView::GetArrayView()` or `LookupResult::GetArrayView()`, and recursively from `ArrayView::GetArray()` for nested `ARRAY<ARRAY<...>>` columns. Move-only.
+
+| Method                                                  |  Description                              |
+|---------------------------------------------------------|-------------------------------------------|
+| `Size() -> size_t`                                      | Get element count                         |
+| `ElementType() -> TypeId`                               | Get element type                          |
+| `IsNull(size_t element) -> bool`                        | Check if element is null                  |
+| `GetBool(size_t element) -> bool`                       | Get boolean element                       |
+| `GetInt32(size_t element) -> int32_t`                   | Get 32-bit integer element                |
+| `GetInt64(size_t element) -> int64_t`                   | Get 64-bit integer element                |
+| `GetFloat32(size_t element) -> float`                   | Get 32-bit float element                  |
+| `GetFloat64(size_t element) -> double`                  | Get 64-bit float element                  |
+| `GetString(size_t element) -> std::string`              | Get string element                        |
+| `GetBytes(size_t element) -> std::vector<uint8_t>`      | Get binary element                        |
+| `GetDate(size_t element) -> Date`                       | Get date element                          |
+| `GetTime(size_t element) -> Time`                       | Get time element                          |
+| `GetTimestamp(size_t element) -> Timestamp`              | Get timestamp element                     |
+| `GetTimestampLtz(size_t element) -> Timestamp`          | Get timestamp with local time zone element |
+| `GetDecimalString(size_t element) -> std::string`       | Get decimal element as string             |
+| `GetArray(size_t element) -> ArrayView`                 | Get nested array as child ArrayView       |
+
+## `TablePath`
+
+| Method / Field                                                     |  Description          |
+|--------------------------------------------------------------------|-----------------------|
+| `TablePath(const std::string& database, const std::string& table)` | Create a table path   |
+| `database_name -> std::string`                                     | Database name         |
+| `table_name -> std::string`                                        | Table name            |
+| `ToString() -> std::string`                                        | String representation |
+
+## `TableInfo`
+
+| Field               | Type                                           | Description                         |
+|---------------------|------------------------------------------------|-------------------------------------|
+| `table_id`          | `int64_t`                                      | Table ID                            |
+| `schema_id`         | `int32_t`                                      | Schema ID                           |
+| `table_path`        | `TablePath`                                    | Table path                          |
+| `created_time`      | `int64_t`                                      | Creation timestamp                  |
+| `modified_time`     | `int64_t`                                      | Last modification timestamp         |
+| `primary_keys`      | `std::vector<std::string>`                     | Primary key columns                 |
+| `bucket_keys`       | `std::vector<std::string>`                     | Bucket key columns                  |
+| `partition_keys`    | `std::vector<std::string>`                     | Partition key columns               |
+| `num_buckets`       | `int32_t`                                      | Number of buckets                   |
+| `has_primary_key`   | `bool`                                         | Whether the table has a primary key |
+| `is_partitioned`    | `bool`                                         | Whether the table is partitioned    |
+| `properties`        | `std::unordered_map<std::string, std::string>` | Table properties                    |
+| `custom_properties` | `std::unordered_map<std::string, std::string>` | Custom properties                   |
+| `comment`           | `std::string`                                  | Table comment                       |
+| `schema`            | `Schema`                                       | Table schema                        |
+
+## Temporal Types
+
+### `Date`
+
+| Method                                        |  Description                 |
+|-----------------------------------------------|------------------------------|
+| `Date::FromDays(int32_t days)`                | Create from days since epoch |
+| `Date::FromYMD(int year, int month, int day)` | Create from year, month, day |
+| `Year() -> int`                               | Get year                     |
+| `Month() -> int`                              | Get month                    |
+| `Day() -> int`                                | Get day                      |
+
+### `Time`
+
+| Method                                            |  Description                                 |
+|---------------------------------------------------|----------------------------------------------|
+| `Time::FromMillis(int32_t millis)`                | Create from milliseconds since midnight      |
+| `Time::FromHMS(int hour, int minute, int second)` | Create from hour, minute, second             |
+| `Hour() -> int`                                   | Get hour                                     |
+| `Minute() -> int`                                 | Get minute                                   |
+| `Second() -> int`                                 | Get second                                   |
+| `Millis() -> int64_t`                             | Get sub-second millisecond component (0-999) |
+
+### `Timestamp`
+
+| Method                                                               |  Description                             |
+|----------------------------------------------------------------------|------------------------------------------|
+| `Timestamp::FromMillis(int64_t millis)`                              | Create from milliseconds since epoch     |
+| `Timestamp::FromMillisNanos(int64_t millis, int32_t nanos)`          | Create from milliseconds and nanoseconds |
+| `Timestamp::FromTimePoint(std::chrono::system_clock::time_point tp)` | Create from a time point                 |
+
+## `PartitionInfo`
+
+| Field            | Type          |  Description   |
+|------------------|---------------|----------------|
+| `partition_id`   | `int64_t`     | Partition ID   |
+| `partition_name` | `std::string` | Partition name |
+
+## `DatabaseDescriptor`
+
+| Field        | Type                                           | Description       |
+|--------------|------------------------------------------------|-------------------|
+| `comment`    | `std::string`                                  | Database comment  |
+| `properties` | `std::unordered_map<std::string, std::string>` | Custom properties |
+
+## `DatabaseInfo`
+
+| Field           | Type                                           |  Description                |
+|-----------------|------------------------------------------------|-----------------------------|
+| `database_name` | `std::string`                                  | Database name               |
+| `comment`       | `std::string`                                  | Database comment            |
+| `properties`    | `std::unordered_map<std::string, std::string>` | Custom properties           |
+| `created_time`  | `int64_t`                                      | Creation timestamp          |
+| `modified_time` | `int64_t`                                      | Last modification timestamp |
+
+## `LakeSnapshot`
+
+| Field            | Type                        |  Description       |
+|------------------|-----------------------------|--------------------|
+| `snapshot_id`    | `int64_t`                   | Snapshot ID        |
+| `bucket_offsets` | `std::vector<BucketOffset>` | All bucket offsets |
+
+## `BucketOffset`
+
+| Field          | Type      | Description  |
+|----------------|-----------|--------------|
+| `table_id`     | `int64_t` | Table ID     |
+| `partition_id` | `int64_t` | Partition ID |
+| `bucket_id`    | `int32_t` | Bucket ID    |
+| `offset`       | `int64_t` | Offset value |
+
+## `OffsetSpec`
+
+| Method                                             | Description                             |
+|----------------------------------------------------|-----------------------------------------|
+| `OffsetSpec::Earliest()`                          | Query for the earliest available offset |
+| `OffsetSpec::Latest()`                            | Query for the latest offset             |
+| `OffsetSpec::Timestamp(int64_t timestamp_ms)`     | Query offset at a specific timestamp    |
+
+## Constants
+
+| Constant                 |  Value |  Description                                            |
+|--------------------------|--------|---------------------------------------------------------|
+| `fluss::EARLIEST_OFFSET` | `-2`   | Start reading from the earliest available offset        |
+
+To start reading from the latest offset (only new records), resolve the current offset via `ListOffsets` before subscribing:
+
+```cpp
+std::unordered_map<int32_t, int64_t> offsets;
+admin.ListOffsets(table_path, {0}, fluss::OffsetSpec::Latest(), offsets);
+scanner.Subscribe(0, offsets[0]);
+```
+
+## Enums
+
+### `ChangeType`
+
+| Value          | Short String | Description                      |
+|----------------|--------------|----------------------------------|
+| `AppendOnly`   | `+A`         | Append-only record               |
+| `Insert`       | `+I`         | Inserted row                     |
+| `UpdateBefore` | `-U`         | Previous value of an updated row |
+| `UpdateAfter`  | `+U`         | New value of an updated row      |
+| `Delete`       | `-D`         | Deleted row                      |
+
+Use a helper such as the following to convert a `ChangeType` value to its short string representation (`std::invalid_argument` requires `<stdexcept>`):
+
+```cpp
+inline const char* ChangeTypeShortString(ChangeType ct) {
+    switch (ct) {
+        case ChangeType::AppendOnly: return "+A";
+        case ChangeType::Insert: return "+I";
+        case ChangeType::UpdateBefore: return "-U";
+        case ChangeType::UpdateAfter: return "+U";
+        case ChangeType::Delete: return "-D";
+    }
+    throw std::invalid_argument("Unknown ChangeType");
+}
+```
+
+### `TypeId`
+
+| Value          |  Description               |
+|----------------|----------------------------|
+| `Boolean`      | Boolean type               |
+| `TinyInt`      | 8-bit signed integer       |
+| `SmallInt`     | 16-bit signed integer      |
+| `Int`          | 32-bit signed integer      |
+| `BigInt`       | 64-bit signed integer      |
+| `Float`        | 32-bit floating point      |
+| `Double`       | 64-bit floating point      |
+| `String`       | UTF-8 string               |
+| `Bytes`        | Binary data                |
+| `Date`         | Date                       |
+| `Time`         | Time                       |
+| `Timestamp`    | Timestamp without timezone |
+| `TimestampLtz` | Timestamp with local time zone |
+| `Decimal`      | Decimal                    |
+| `Array`        | Array of elements          |
+
+### `ChangeType` by Table Type
+
+| Value          |  Description                                |
+|----------------|---------------------------------------------|
+| `AppendOnly`   | Append-only record (log tables)             |
+| `Insert`       | Inserted row (PK tables)                    |
+| `UpdateBefore` | Row value before an update (PK tables)      |
+| `UpdateAfter`  | Row value after an update (PK tables)       |
+| `Delete`       | Deleted row (PK tables)                     |
+
+### `OffsetSpec`
+
+| Value       |  Description                   |
+|-------------|--------------------------------|
+| `Earliest`  | Earliest available offset      |
+| `Latest`    | Latest offset                  |
+| `Timestamp` | Offset at a specific timestamp |
diff --git a/fluss-rust/website/docs/user-guide/cpp/data-types.md b/fluss-rust/website/docs/user-guide/cpp/data-types.md
new file mode 100644
index 0000000000..cce40cefa1
--- /dev/null
+++ b/fluss-rust/website/docs/user-guide/cpp/data-types.md
@@ -0,0 +1,250 @@
+---
+sidebar_position: 3
+---
+# Data Types
+
+## Schema DataTypes
+
+| DataType                   | Description                                                    |
+|----------------------------|----------------------------------------------------------------|
+| `DataType::Boolean()`      | Boolean value                                                  |
+| `DataType::TinyInt()`      | 8-bit signed integer                                           |
+| `DataType::SmallInt()`     | 16-bit signed integer                                          |
+| `DataType::Int()`          | 32-bit signed integer                                          |
+| `DataType::BigInt()`       | 64-bit signed integer                                          |
+| `DataType::Float()`        | 32-bit floating point                                          |
+| `DataType::Double()`       | 64-bit floating point                                          |
+| `DataType::String()`       | UTF-8 string                                                   |
+| `DataType::Bytes()`        | Binary data                                                    |
+| `DataType::Date()`         | Date (days since epoch)                                        |
+| `DataType::Time()`         | Time (milliseconds since midnight)                             |
+| `DataType::Timestamp()`    | Timestamp without timezone (default precision 6, microseconds) |
+| `DataType::TimestampLtz()` | Timestamp with local timezone (default precision 6, microseconds) |
+| `DataType::Decimal(p, s)`  | Decimal with precision and scale                               |
+| `DataType::Array(element)` | Array of the given element type (supports nesting)             |
+
+## Nullability
+
+All DataTypes are nullable by default. Use `.NotNull()` to create a `NOT NULL` type:
+
+```cpp
+auto schema = fluss::Schema::NewBuilder()
+    .AddColumn("id", fluss::DataType::Int().NotNull())
+    .AddColumn("name", fluss::DataType::String())          // nullable by default
+    .Build();
+```
+
+Primary key columns are automatically forced `NOT NULL` regardless of the `DataType` setting.
+
+For nested types, nullability is preserved at each array level and at the leaf element:
+
+```cpp
+auto schema = fluss::Schema::NewBuilder()
+    .AddColumn("tags", fluss::DataType::Array(fluss::DataType::String().NotNull()))
+    .AddColumn("ids", fluss::DataType::Array(fluss::DataType::Int()).NotNull())
+    .AddColumn("nested", fluss::DataType::Array(
+        fluss::DataType::Array(fluss::DataType::Int()).NotNull()))
+    .Build();
+// "tags":   ARRAY<STRING NOT NULL>         (outer nullable, elements NOT NULL)
+// "ids":    ARRAY<INT> NOT NULL            (outer NOT NULL, elements nullable)
+// "nested": ARRAY<ARRAY<INT> NOT NULL>     (outer nullable, inner array NOT NULL)
+```
+
+You can query nullability at runtime:
+
+```cpp
+auto info = table.GetTableInfo();
+bool is_nullable = info.schema.columns[0].data_type.nullable();
+```
+
+## GenericRow Setters
+
+`SetInt32` is used for `TinyInt`, `SmallInt`, and `Int` columns. For `TinyInt` and `SmallInt`, the value is validated at write time — an error is returned if it overflows the column's range (e.g., \[-128, 127\] for `TinyInt`, \[-32768, 32767\] for `SmallInt`).
+
+```cpp
+fluss::GenericRow row;
+row.SetNull(0);
+row.SetBool(1, true);
+row.SetInt32(2, 42);
+row.SetInt64(3, 1234567890L);
+row.SetFloat32(4, 3.14f);
+row.SetFloat64(5, 2.71828);
+row.SetString(6, "hello");
+row.SetBytes(7, {0x01, 0x02, 0x03});
+```
+
+### Array Columns
+
+Array values are built element-by-element using `ArrayWriter`, then attached to the row via `SetArray`:
+
+```cpp
+fluss::ArrayWriter aw(3, fluss::DataType::Int());
+aw.SetInt32(0, 10);
+aw.SetInt32(1, 20);
+aw.SetNull(2);
+row.SetArray(8, std::move(aw));
+```
+
+For nested arrays (e.g., `ARRAY<ARRAY<INT>>`), build inner arrays first:
+
+```cpp
+fluss::ArrayWriter inner(2, fluss::DataType::Int());
+inner.SetInt32(0, 1);
+inner.SetInt32(1, 2);
+
+fluss::ArrayWriter outer(1, fluss::DataType::Array(fluss::DataType::Int()));
+outer.SetArray(0, std::move(inner));
+row.SetArray(9, std::move(outer));
+```
+
+## Name-Based Setters
+
+When using `table.NewRow()`, you can set fields by column name. The setter automatically routes to the correct type based on the schema:
+
+```cpp
+auto row = table.NewRow();
+row.Set("user_id", 1);
+row.Set("name", "Alice");
+row.Set("score", 95.5f);
+row.Set("balance", "1234.56");   // decimal as string
+row.Set("birth_date", fluss::Date::FromYMD(1990, 3, 15));
+row.Set("login_time", fluss::Time::FromHMS(9, 30, 0));
+row.Set("created_at", fluss::Timestamp::FromMillis(1700000000000));
+row.Set("nickname", nullptr);    // set to null
+```
+
+## Reading Field Values
+
+Field values are read through `RowView` (from scan results) and `LookupResult` (from lookups), not through `GenericRow`. Both provide the same getter interface with zero-copy access to string and bytes data.
+
+`ScanRecord` is a value type — it can be freely copied, stored, and accumulated across multiple `Poll()` calls via reference counting.
+
+:::note string_view Lifetime
+`GetString()` returns a `std::string_view` that borrows from the underlying data. The `string_view` is valid as long as any `ScanRecord` referencing the same poll result is alive. Copy it to a `std::string` if you need the value after all records are gone.
+:::
+
+```cpp
+// ScanRecord is a value type — safe to store and accumulate:
+std::vector<fluss::ScanRecord> all_records;
+fluss::ScanRecords records;
+scanner.Poll(5000, records);
+for (const auto& rec : records) {
+    all_records.push_back(rec);                    // safe! ref-counted
+    auto name = rec.row.GetString(0);              // zero-copy string_view
+    auto owned = std::string(rec.row.GetString(0)); // explicit copy when needed
+}
+
+// DON'T — string_view dangles after all records referencing the data are destroyed:
+std::string_view dangling;
+{
+    fluss::ScanRecords records;
+    scanner.Poll(5000, records);
+    dangling = records[0].row.GetString(0);
+}
+// reading `dangling` here is undefined behavior — no ScanRecord keeps the data alive!
+```
+
+### From Scan Results (RowView)
+
+```cpp
+for (const auto& rec : records) {
+    auto name = rec.row.GetString(1);          // zero-copy string_view
+    float score = rec.row.GetFloat32(3);
+    auto balance = rec.row.GetDecimalString(4); // std::string (already owned)
+    fluss::Date date = rec.row.GetDate(5);
+    fluss::Time time = rec.row.GetTime(6);
+    fluss::Timestamp ts = rec.row.GetTimestamp(7);
+}
+```
+
+### From Lookup Results (LookupResult)
+
+```cpp
+fluss::LookupResult result;
+lookuper.Lookup(pk_row, result);
+if (result.Found()) {
+    auto name = result.GetString(1);  // zero-copy string_view
+    int64_t age = result.GetInt64(2);
+}
+```
+
+### Reading Array Columns
+
+Array columns can be read element-by-element using index-based getters, or via an `ArrayView` for recursive access:
+
+```cpp
+// Element-by-element access (flat arrays)
+size_t len = rec.row.GetArraySize(8);
+for (size_t i = 0; i < len; i++) {
+    if (!rec.row.IsArrayElementNull(8, i)) {
+        int32_t val = rec.row.GetArrayInt32(8, i);
+    }
+}
+
+// ArrayView for nested arrays or when you need a standalone handle
+fluss::ArrayView av = rec.row.GetArrayView(8);
+for (size_t i = 0; i < av.Size(); i++) {
+    if (!av.IsNull(i)) {
+        int32_t val = av.GetInt32(i);
+    }
+}
+
+// Nested arrays: ArrayView::GetArray() returns a child ArrayView
+fluss::ArrayView outer = rec.row.GetArrayView(9);
+for (size_t i = 0; i < outer.Size(); i++) {
+    fluss::ArrayView inner = outer.GetArray(i);
+    for (size_t j = 0; j < inner.Size(); j++) {
+        int32_t val = inner.GetInt32(j);
+    }
+}
+```
+
+## TypeId Enum
+
+`TinyInt` and `SmallInt` values are widened to `int32_t` on read.
+
+| TypeId          | C++ Type                                    | Getter                    |
+|-----------------|---------------------------------------------|---------------------------|
+| `Boolean`       | `bool`                                      | `GetBool(idx)`            |
+| `TinyInt`       | `int32_t`                                   | `GetInt32(idx)`           |
+| `SmallInt`      | `int32_t`                                   | `GetInt32(idx)`           |
+| `Int`           | `int32_t`                                   | `GetInt32(idx)`           |
+| `BigInt`        | `int64_t`                                   | `GetInt64(idx)`           |
+| `Float`         | `float`                                     | `GetFloat32(idx)`         |
+| `Double`        | `double`                                    | `GetFloat64(idx)`         |
+| `String`        | `std::string_view`                          | `GetString(idx)`          |
+| `Bytes`         | `std::pair<const uint8_t*, size_t>`         | `GetBytes(idx)`           |
+| `Date`          | `Date`                                      | `GetDate(idx)`            |
+| `Time`          | `Time`                                      | `GetTime(idx)`            |
+| `Timestamp`     | `Timestamp`                                 | `GetTimestamp(idx)`       |
+| `TimestampLtz`  | `Timestamp`                                 | `GetTimestamp(idx)`       |
+| `Decimal`       | `std::string`                               | `GetDecimalString(idx)`   |
+| `Array`         | `ArrayView`                                 | `GetArrayView(idx)`       |
+
+## Type Checking
+
+```cpp
+if (rec.row.GetType(0) == fluss::TypeId::Int) {
+    int32_t value = rec.row.GetInt32(0);
+}
+if (rec.row.IsNull(1)) {
+    // field is null
+}
+if (rec.row.IsDecimal(2)) {
+    std::string decimal_str = rec.row.GetDecimalString(2);
+}
+```
+
+## Constants
+
+```cpp
+constexpr int64_t fluss::EARLIEST_OFFSET = -2;  // Start from earliest
+```
+
+To start reading from the latest offset, resolve the current offset via `ListOffsets` before subscribing:
+
+```cpp
+std::unordered_map<int32_t, int64_t> offsets;
+admin.ListOffsets(table_path, {0}, fluss::OffsetSpec::Latest(), offsets);
+scanner.Subscribe(0, offsets[0]);
+```
diff --git a/fluss-rust/website/docs/user-guide/cpp/error-handling.md b/fluss-rust/website/docs/user-guide/cpp/error-handling.md
new file mode 100644
index 0000000000..7447a264c7
--- /dev/null
+++ b/fluss-rust/website/docs/user-guide/cpp/error-handling.md
@@ -0,0 +1,238 @@
+---
+sidebar_position: 4
+---
+# Error Handling
+
+All C++ client operations return a `fluss::Result` struct instead of throwing exceptions. This gives you explicit control over error handling.
+
+## The `Result` Struct
+
+```cpp
+#include "fluss.hpp"
+
+// All operations return fluss::Result
+fluss::Result result = admin.CreateTable(path, descriptor);
+if (!result.Ok()) {
+    std::cerr << "Error code: " << result.error_code << std::endl;
+    std::cerr << "Error message: " << result.error_message << std::endl;
+}
+```
+
+| Field / Method   | Type          | Description                               |
+|------------------|---------------|-------------------------------------------|
+| `error_code`     | `int32_t`     | 0 for success, non-zero for errors        |
+| `error_message`  | `std::string` | Human-readable error description          |
+| `Ok()`           | `bool`        | Returns `true` if the operation succeeded |
+
+## Handling Errors
+
+Check the `Result` after each operation and decide how to respond, e.g. log and continue, retry, or abort:
+
+```cpp
+fluss::Connection conn;
+fluss::Result result = fluss::Connection::Create(config, conn);
+if (!result.Ok()) {
+    // Log, retry, or propagate the error as appropriate
+    std::cerr << "Connection failed (code " << result.error_code
+              << "): " << result.error_message << std::endl;
+    return 1;
+}
+```
+
+## Connection State Checking
+
+Use `Available()` to verify that a connection or object is valid before using it:
+
+```cpp
+fluss::Connection conn;
+if (!conn.Available()) {
+    // Connection not initialized or already moved
+}
+
+fluss::Configuration config;
+config.bootstrap_servers = "127.0.0.1:9123";
+fluss::Result result = fluss::Connection::Create(config, conn);
+if (result.Ok() && conn.Available()) {
+    // Connection is ready to use
+}
+```
+
+## Error Codes
+
+Server-side errors carry a specific error code (>0 or -1). Client-side errors (connection failures, type mismatches, etc.) use `ErrorCode::CLIENT_ERROR` (-2). Use `fluss::ErrorCode` to match on specific codes:
+
+```cpp
+fluss::Result result = admin.DropTable(table_path);
+if (!result.Ok()) {
+    if (result.error_code == fluss::ErrorCode::TABLE_NOT_EXIST) {
+        std::cerr << "Table does not exist" << std::endl;
+    } else if (result.error_code == fluss::ErrorCode::PARTITION_NOT_EXISTS) {
+        std::cerr << "Partition does not exist" << std::endl;
+    } else if (result.error_code == fluss::ErrorCode::CLIENT_ERROR) {
+        std::cerr << "Client-side error: " << result.error_message << std::endl;
+    } else {
+        std::cerr << "Server error (code " << result.error_code
+                  << "): " << result.error_message << std::endl;
+    }
+}
+```
+
+### Common Error Codes
+
+| Constant                                      | Code | Description                         |
+|-----------------------------------------------|------|-------------------------------------|
+| `ErrorCode::CLIENT_ERROR`                     | -2   | Client-side error (not from server) |
+| `ErrorCode::UNKNOWN_SERVER_ERROR`             | -1   | Unexpected server error             |
+| `ErrorCode::NETWORK_EXCEPTION`                | 1    | Server disconnected before response |
+| `ErrorCode::DATABASE_NOT_EXIST`               | 4    | Database does not exist             |
+| `ErrorCode::DATABASE_ALREADY_EXIST`           | 6    | Database already exists             |
+| `ErrorCode::TABLE_NOT_EXIST`                  | 7    | Table does not exist                |
+| `ErrorCode::TABLE_ALREADY_EXIST`              | 8    | Table already exists                |
+| `ErrorCode::INVALID_TABLE_EXCEPTION`          | 15   | Invalid table operation             |
+| `ErrorCode::REQUEST_TIME_OUT`                 | 25   | Request timed out                   |
+| `ErrorCode::PARTITION_NOT_EXISTS`             | 36   | Partition does not exist            |
+| `ErrorCode::PARTITION_ALREADY_EXISTS`         | 42   | Partition already exists            |
+| `ErrorCode::PARTITION_SPEC_INVALID_EXCEPTION` | 43   | Invalid partition spec              |
+| `ErrorCode::LEADER_NOT_AVAILABLE_EXCEPTION`   | 44   | No leader available for partition   |
+| `ErrorCode::AUTHENTICATE_EXCEPTION`           | 46   | Authentication failed (bad credentials) |
+
+See `fluss::ErrorCode` in `fluss.hpp` for the full list of named constants.
+
+## Retry Logic
+
+Some errors are transient: the server may be temporarily unavailable, mid-election, or under load. Use `IsRetriable()` to decide whether to retry an operation rather than treating the error as permanent.
+
+Call `IsRetriable()` directly on the `Result`; it wraps the static helper `ErrorCode::IsRetriable(int32_t code)`:
+
+```cpp
+fluss::Result result = writer.Append(row);
+if (!result.Ok()) {
+    if (result.IsRetriable()) {
+        // Transient failure — safe to retry 
+    } else {
+        // Permanent failure — log and abort
+        std::cerr << "Fatal error (code " << result.error_code
+                  << "): " << result.error_message << std::endl;
+    }
+}
+```
+
+`Result::IsRetriable()` delegates to `ErrorCode::IsRetriable()`, so you can also call it directly on the code:
+
+```cpp
+if (fluss::ErrorCode::IsRetriable(result.error_code)) {
+    // retry
+}
+```
+
+### Retriable Error Codes
+
+| Constant                                                    | Code | Reason                                    |
+|-------------------------------------------------------------|------|-------------------------------------------|
+| `ErrorCode::NETWORK_EXCEPTION`                          | 1    | Server disconnected                       |
+| `ErrorCode::CORRUPT_MESSAGE`                            | 3    | CRC or size error                         |
+| `ErrorCode::SCHEMA_NOT_EXIST`                           | 9    | Schema may not exist                      |
+| `ErrorCode::LOG_STORAGE_EXCEPTION`                      | 10   | Transient log storage error               |
+| `ErrorCode::KV_STORAGE_EXCEPTION`                       | 11   | Transient KV storage error                |
+| `ErrorCode::NOT_LEADER_OR_FOLLOWER`                     | 12   | Leader election in progress               |
+| `ErrorCode::CORRUPT_RECORD_EXCEPTION`                   | 14   | Corrupt record                            |
+| `ErrorCode::UNKNOWN_TABLE_OR_BUCKET_EXCEPTION`          | 21   | Metadata not yet available                |
+| `ErrorCode::REQUEST_TIME_OUT`                           | 25   | Request timed out                         |
+| `ErrorCode::STORAGE_EXCEPTION`                          | 26   | Transient storage error                   |
+| `ErrorCode::NOT_ENOUGH_REPLICAS_AFTER_APPEND_EXCEPTION` | 28   | Wrote to server but with low ISR size     |
+| `ErrorCode::NOT_ENOUGH_REPLICAS_EXCEPTION`              | 29   | Low ISR size at write time                |
+| `ErrorCode::LEADER_NOT_AVAILABLE_EXCEPTION`             | 44   | No leader available for partition         |
+
+Client-side errors (`ErrorCode::CLIENT_ERROR`, code -2) always return `false` from `IsRetriable()`.
+
+## Common Error Scenarios
+
+### Connection Refused
+
+The cluster is not running or the address is incorrect:
+
+```cpp
+fluss::Configuration config;
+config.bootstrap_servers = "127.0.0.1:9123";
+fluss::Connection conn;
+fluss::Result result = fluss::Connection::Create(config, conn);
+if (!result.Ok()) {
+    // "Connection refused" or timeout error
+    std::cerr << "Cannot connect to cluster: " << result.error_message << std::endl;
+}
+```
+
+### Table Not Found
+
+Attempting to access a table that does not exist:
+
+```cpp
+fluss::Table table;
+fluss::Result result = conn.GetTable(fluss::TablePath("fluss", "nonexistent"), table);
+if (!result.Ok()) {
+    if (result.error_code == fluss::ErrorCode::TABLE_NOT_EXIST) {
+        std::cerr << "Table not found" << std::endl;
+    }
+}
+```
+
+### Partition Not Found
+
+Writing to a partitioned primary key table before creating partitions:
+
+```cpp
+// This will fail if partitions are not created first
+auto row = table.NewRow();
+row.Set("user_id", 1);
+row.Set("region", "US");
+row.Set("score", static_cast<int64_t>(100));
+fluss::WriteResult wr;
+fluss::Result result = writer.Upsert(row, wr);
+if (!result.Ok()) {
+    if (result.error_code == fluss::ErrorCode::PARTITION_NOT_EXISTS) {
+        std::cerr << "Partition not found, create partitions before writing" << std::endl;
+    }
+}
+```
+
+### Authentication Failed
+
+SASL credentials are incorrect or the user does not exist:
+
+```cpp
+fluss::Configuration config;
+config.bootstrap_servers = "127.0.0.1:9123";
+config.security_protocol = "sasl";
+config.security_sasl_username = "admin";
+config.security_sasl_password = "wrong-password";
+
+fluss::Connection conn;
+fluss::Result result = fluss::Connection::Create(config, conn);
+if (!result.Ok()) {
+    if (result.error_code == fluss::ErrorCode::AUTHENTICATE_EXCEPTION) {
+        std::cerr << "Authentication failed: " << result.error_message << std::endl;
+    }
+}
+```
+
+### Schema Mismatch
+
+Using incorrect types or column indices when writing:
+
+```cpp
+fluss::GenericRow row;
+// Setting wrong type for a column will result in an error
+// when the row is sent to the server
+row.SetString(0, "not_an_integer");  // Column 0 expects Int
+fluss::Result result = writer.Append(row);
+if (!result.Ok()) {
+    std::cerr << "Schema mismatch: " << result.error_message << std::endl;
+}
+```
+
+## Best Practices
+
+1. **Always check `Result`**: Never ignore the return value of operations that return `Result`.
+2. **Handle errors gracefully**: Log errors and retry or fail gracefully rather than crashing.
+3. **Verify connection state**: Use `Available()` to check connection validity before operations.
+4. **Create partitions before writing**: For partitioned primary key tables, always create partitions before attempting upserts.
diff --git a/fluss-rust/website/docs/user-guide/cpp/example/_category_.json b/fluss-rust/website/docs/user-guide/cpp/example/_category_.json
new file mode 100644
index 0000000000..4d81ec12ae
--- /dev/null
+++ b/fluss-rust/website/docs/user-guide/cpp/example/_category_.json
@@ -0,0 +1,4 @@
+{
+  "label": "Examples",
+  "position": 5
+}
diff --git a/fluss-rust/website/docs/user-guide/cpp/example/admin-operations.md b/fluss-rust/website/docs/user-guide/cpp/example/admin-operations.md
new file mode 100644
index 0000000000..0f08549a0e
--- /dev/null
+++ b/fluss-rust/website/docs/user-guide/cpp/example/admin-operations.md
@@ -0,0 +1,158 @@
+---
+sidebar_position: 3
+---
+# Admin Operations
+
+## Get Admin Interface
+
+```cpp
+fluss::Admin admin;
+conn.GetAdmin(admin);
+```
+
+## Database Operations
+
+```cpp
+// Create database
+fluss::DatabaseDescriptor db_descriptor;
+db_descriptor.comment = "My database";
+admin.CreateDatabase("my_database", db_descriptor, true);
+
+// List all databases
+std::vector<std::string> databases;
+admin.ListDatabases(databases);
+for (const auto& db : databases) {
+    std::cout << "Database: " << db << std::endl;
+}
+
+// Check if database exists
+bool exists = false;
+admin.DatabaseExists("my_database", exists);
+
+// Get database information
+fluss::DatabaseInfo db_info;
+admin.GetDatabaseInfo("my_database", db_info);
+std::cout << "Database: " << db_info.database_name << std::endl;
+
+// Drop database
+admin.DropDatabase("my_database", true, false);
+```
+
+## Table Operations
+
+```cpp
+fluss::TablePath table_path("fluss", "my_table");
+
+auto schema = fluss::Schema::NewBuilder()
+    .AddColumn("id", fluss::DataType::Int())
+    .AddColumn("name", fluss::DataType::String())
+    .AddColumn("score", fluss::DataType::Float())
+    .AddColumn("age", fluss::DataType::Int())
+    .Build();
+
+auto descriptor = fluss::TableDescriptor::NewBuilder()
+    .SetSchema(schema)
+    .SetBucketCount(3)
+    .SetComment("Example table")
+    .Build();
+
+// Create table
+admin.CreateTable(table_path, descriptor, true);
+
+// Get table information
+fluss::TableInfo table_info;
+admin.GetTableInfo(table_path, table_info);
+std::cout << "Table ID: " << table_info.table_id << std::endl;
+std::cout << "Number of buckets: " << table_info.num_buckets << std::endl;
+std::cout << "Has primary key: " << table_info.has_primary_key << std::endl;
+std::cout << "Is partitioned: " << table_info.is_partitioned << std::endl;
+
+// Drop table
+admin.DropTable(table_path, true);
+```
+
+## Schema Builder Options
+
+```cpp
+// Schema with primary key
+auto pk_schema = fluss::Schema::NewBuilder()
+    .AddColumn("id", fluss::DataType::Int())
+    .AddColumn("name", fluss::DataType::String())
+    .AddColumn("value", fluss::DataType::Double())
+    .SetPrimaryKeys({"id"})
+    .Build();
+
+// Table descriptor with partitioning
+auto descriptor = fluss::TableDescriptor::NewBuilder()
+    .SetSchema(schema)
+    .SetPartitionKeys({"date"})
+    .SetBucketCount(3)
+    .SetBucketKeys({"user_id"})
+    .SetProperty("retention_days", "7")
+    .SetComment("Sample table")
+    .Build();
+```
+
+## Partition Operations
+
+```cpp
+// Create a partition
+std::unordered_map<std::string, std::string> partition_spec = {{"region", "US"}};
+admin.CreatePartition(table_path, partition_spec, true);
+
+// List all partitions
+std::vector<fluss::PartitionInfo> partitions;
+admin.ListPartitionInfos(table_path, partitions);
+for (const auto& p : partitions) {
+    std::cout << "Partition: id=" << p.partition_id
+              << ", name=" << p.partition_name << std::endl;
+}
+
+// Drop a partition
+admin.DropPartition(table_path, partition_spec, true);
+```
+
+## Offset Operations
+
+```cpp
+std::vector<int32_t> bucket_ids = {0, 1, 2};
+
+// Query earliest offsets
+std::unordered_map<int32_t, int64_t> earliest_offsets;
+admin.ListOffsets(table_path, bucket_ids,
+                  fluss::OffsetSpec::Earliest(), earliest_offsets);
+
+// Query latest offsets
+std::unordered_map<int32_t, int64_t> latest_offsets;
+admin.ListOffsets(table_path, bucket_ids,
+                  fluss::OffsetSpec::Latest(), latest_offsets);
+
+// Query offsets for a specific timestamp
+std::unordered_map<int32_t, int64_t> timestamp_offsets;
+admin.ListOffsets(table_path, bucket_ids,
+                  fluss::OffsetSpec::Timestamp(timestamp_ms),
+                  timestamp_offsets);
+
+// Query partition offsets
+std::unordered_map<int32_t, int64_t> partition_offsets;
+admin.ListPartitionOffsets(table_path, "partition_name",
+                           bucket_ids, fluss::OffsetSpec::Latest(),
+                           partition_offsets);
+```
+
+## Lake Snapshot
+
+:::note
+Lake snapshots require [lake integration](https://fluss.apache.org/docs/maintenance/tiered-storage/overview/) (e.g. Paimon or Iceberg) to be enabled on the server. Without it, `GetLatestLakeSnapshot` will return an error.
+:::
+
+```cpp
+fluss::LakeSnapshot snapshot;
+admin.GetLatestLakeSnapshot(table_path, snapshot);
+std::cout << "Snapshot ID: " << snapshot.snapshot_id << std::endl;
+for (const auto& bucket_offset : snapshot.bucket_offsets) {
+    std::cout << "  Table " << bucket_offset.table_id
+              << ", Bucket " << bucket_offset.bucket_id
+              << ": offset=" << bucket_offset.offset << std::endl;
+}
+```
diff --git a/fluss-rust/website/docs/user-guide/cpp/example/configuration.md b/fluss-rust/website/docs/user-guide/cpp/example/configuration.md
new file mode 100644
index 0000000000..38202618c9
--- /dev/null
+++ b/fluss-rust/website/docs/user-guide/cpp/example/configuration.md
@@ -0,0 +1,42 @@
+---
+sidebar_position: 2
+---
+# Configuration
+
+## Connection Setup
+
+```cpp
+#include "fluss.hpp"
+
+fluss::Configuration config;
+config.bootstrap_servers = "127.0.0.1:9123";
+
+fluss::Connection conn;
+fluss::Result result = fluss::Connection::Create(config, conn);
+
+if (!result.Ok()) {
+    std::cerr << "Connection failed: " << result.error_message << std::endl;
+}
+```
+
+## Connection Configurations
+
+All fields have sensible defaults. Only `bootstrap_servers` typically needs to be set.
+
+See the [`Configuration`](../api-reference.md#configuration) section in the API Reference for the full list of configuration fields, types, and defaults.
+
+## SASL Authentication
+
+To connect to a Fluss cluster with SASL/PLAIN authentication enabled:
+
+```cpp
+fluss::Configuration config;
+config.bootstrap_servers = "127.0.0.1:9123";
+config.security_protocol = "sasl";
+config.security_sasl_mechanism = "PLAIN";
+config.security_sasl_username = "admin";
+config.security_sasl_password = "admin-secret";
+
+fluss::Connection conn;
+fluss::Result result = fluss::Connection::Create(config, conn);
+```
diff --git a/fluss-rust/website/docs/user-guide/cpp/example/index.md b/fluss-rust/website/docs/user-guide/cpp/example/index.md
new file mode 100644
index 0000000000..51f60e4175
--- /dev/null
+++ b/fluss-rust/website/docs/user-guide/cpp/example/index.md
@@ -0,0 +1,63 @@
+---
+sidebar_position: 1
+---
+# Example
+
+Minimal working example: connect to Fluss, create a table, write data, and read it back.
+
+```cpp
+#include <iostream>
+#include "fluss.hpp"
+
+int main() {
+    // Connect
+    fluss::Configuration config;
+    config.bootstrap_servers = "127.0.0.1:9123";
+
+    fluss::Connection conn;
+    fluss::Connection::Create(config, conn);
+
+    fluss::Admin admin;
+    conn.GetAdmin(admin);
+
+    // Create a log table
+    fluss::TablePath table_path("fluss", "quickstart_cpp");
+    auto schema = fluss::Schema::NewBuilder()
+        .AddColumn("id", fluss::DataType::Int())
+        .AddColumn("name", fluss::DataType::String())
+        .Build();
+    auto descriptor = fluss::TableDescriptor::NewBuilder()
+        .SetSchema(schema)
+        .Build();
+    admin.CreateTable(table_path, descriptor, true);
+
+    // Write
+    fluss::Table table;
+    conn.GetTable(table_path, table);
+
+    fluss::AppendWriter writer;
+    table.NewAppend().CreateWriter(writer);
+
+    fluss::GenericRow row;
+    row.SetInt32(0, 1);
+    row.SetString(1, "hello");
+    writer.Append(row);
+    writer.Flush();
+
+    // Read
+    fluss::LogScanner scanner;
+    table.NewScan().CreateLogScanner(scanner);
+    auto info = table.GetTableInfo();
+    for (int b = 0; b < info.num_buckets; ++b) {
+        scanner.Subscribe(b, 0);
+    }
+    fluss::ScanRecords records;
+    scanner.Poll(5000, records);
+    for (const auto& rec : records) {
+        std::cout << "id=" << rec.row.GetInt32(0)
+                  << ", name=" << rec.row.GetString(1) << std::endl;
+    }
+
+    return 0;
+}
+```
diff --git a/fluss-rust/website/docs/user-guide/cpp/example/log-tables.md b/fluss-rust/website/docs/user-guide/cpp/example/log-tables.md
new file mode 100644
index 0000000000..0125a4ce29
--- /dev/null
+++ b/fluss-rust/website/docs/user-guide/cpp/example/log-tables.md
@@ -0,0 +1,161 @@
+---
+sidebar_position: 4
+---
+# Log Tables
+
+Log tables are append-only tables without primary keys, suitable for event streaming.
+
+## Creating a Log Table
+
+```cpp
+auto schema = fluss::Schema::NewBuilder()
+    .AddColumn("event_id", fluss::DataType::Int())
+    .AddColumn("event_type", fluss::DataType::String())
+    .AddColumn("timestamp", fluss::DataType::BigInt())
+    .Build();
+
+auto descriptor = fluss::TableDescriptor::NewBuilder()
+    .SetSchema(schema)
+    .Build();
+
+fluss::TablePath table_path("fluss", "events");
+admin.CreateTable(table_path, descriptor, true);
+```
+
+## Writing to Log Tables
+
+```cpp
+fluss::Table table;
+conn.GetTable(table_path, table);
+
+fluss::AppendWriter writer;
+table.NewAppend().CreateWriter(writer);
+
+fluss::GenericRow row;
+row.SetInt32(0, 1);           // event_id
+row.SetString(1, "user_login");  // event_type
+row.SetInt64(2, 1704067200000L); // timestamp
+writer.Append(row);
+
+writer.Flush();
+```
+
+## Reading from Log Tables
+
+```cpp
+fluss::LogScanner scanner;
+table.NewScan().CreateLogScanner(scanner);
+
+auto info = table.GetTableInfo();
+for (int b = 0; b < info.num_buckets; ++b) {
+    scanner.Subscribe(b, 0);
+}
+
+fluss::ScanRecords records;
+scanner.Poll(5000, records);  // timeout in ms
+
+for (const auto& rec : records) {
+    std::cout << "event_id=" << rec.row.GetInt32(0)
+              << " event_type=" << rec.row.GetString(1)
+              << " timestamp=" << rec.row.GetInt64(2)
+              << " @ offset=" << rec.offset << std::endl;
+}
+
+// Or per-bucket access
+for (const auto& bucket : records.Buckets()) {
+    auto view = records.Records(bucket);
+    std::cout << "Bucket " << bucket.bucket_id << ": "
+              << view.Size() << " records" << std::endl;
+    for (const auto& rec : view) {
+        std::cout << "  event_id=" << rec.row.GetInt32(0)
+                  << " event_type=" << rec.row.GetString(1)
+                  << " @ offset=" << rec.offset << std::endl;
+    }
+}
+```
+
+**Continuous polling:**
+
+```cpp
+while (running) {
+    fluss::ScanRecords records;
+    scanner.Poll(1000, records);
+    for (const auto& rec : records) {
+        process(rec);
+    }
+}
+```
+
+**Accumulating records across polls:**
+
+`ScanRecord` is a value type — it can be freely copied, stored, and accumulated. The underlying data stays alive via reference counting (zero-copy).
+
+```cpp
+std::vector<fluss::ScanRecord> all_records;
+while (all_records.size() < 1000) {
+    fluss::ScanRecords records;
+    scanner.Poll(1000, records);
+    for (const auto& rec : records) {
+        all_records.push_back(rec);  // ref-counted, no data copy
+    }
+}
+// all_records is valid — each record keeps its data alive
+```
+
+**Batch subscribe:**
+
+```cpp
+std::vector<fluss::BucketSubscription> subscriptions;
+subscriptions.push_back({0, 0});    // bucket 0, offset 0
+subscriptions.push_back({1, 100});  // bucket 1, offset 100
+scanner.Subscribe(subscriptions);
+```
+
+**Unsubscribe from a bucket:**
+
+```cpp
+// Stop receiving records from bucket 1
+scanner.Unsubscribe(1);
+```
+
+**Arrow RecordBatch polling (high performance):**
+
+```cpp
+#include <arrow/record_batch.h>
+
+fluss::LogScanner arrow_scanner;
+table.NewScan().CreateRecordBatchLogScanner(arrow_scanner);
+
+for (int b = 0; b < info.num_buckets; ++b) {
+    arrow_scanner.Subscribe(b, 0);
+}
+
+fluss::ArrowRecordBatches batches;
+arrow_scanner.PollRecordBatch(5000, batches);
+
+for (size_t i = 0; i < batches.Size(); ++i) {
+    const auto& batch = batches[i];
+    if (batch->Available()) {
+        auto arrow_batch = batch->GetArrowRecordBatch();
+        std::cout << "Batch " << i << ": " << arrow_batch->num_rows() << " rows"
+                  << ", partition_id=" << batch->GetPartitionId()
+                  << ", bucket_id=" << batch->GetBucketId() << std::endl;
+    }
+}
+```
+
+## Column Projection
+
+```cpp
+// Project by column index
+fluss::LogScanner projected_scanner;
+table.NewScan().ProjectByIndex({0, 2}).CreateLogScanner(projected_scanner);
+
+// Project by column name
+fluss::LogScanner name_projected_scanner;
+table.NewScan().ProjectByName({"event_id", "timestamp"}).CreateLogScanner(name_projected_scanner);
+
+// Arrow RecordBatch with projection
+fluss::LogScanner projected_arrow_scanner;
+table.NewScan().ProjectByIndex({0, 2}).CreateRecordBatchLogScanner(projected_arrow_scanner);
+```
diff --git a/fluss-rust/website/docs/user-guide/cpp/example/partitioned-tables.md b/fluss-rust/website/docs/user-guide/cpp/example/partitioned-tables.md
new file mode 100644
index 0000000000..17c1c2057d
--- /dev/null
+++ b/fluss-rust/website/docs/user-guide/cpp/example/partitioned-tables.md
@@ -0,0 +1,179 @@
+---
+sidebar_position: 6
+---
+# Partitioned Tables
+
+Partitioned tables distribute data across partitions based on partition column values, enabling efficient data organization and querying. Both log tables and primary key tables support partitioning.
+
+## Partitioned Log Tables
+
+### Creating a Partitioned Log Table
+
+```cpp
+auto schema = fluss::Schema::NewBuilder()
+    .AddColumn("event_id", fluss::DataType::Int())
+    .AddColumn("event_type", fluss::DataType::String())
+    .AddColumn("dt", fluss::DataType::String())
+    .AddColumn("region", fluss::DataType::String())
+    .Build();
+
+auto descriptor = fluss::TableDescriptor::NewBuilder()
+    .SetSchema(schema)
+    .SetPartitionKeys({"dt", "region"})
+    .SetBucketCount(3)
+    .Build();
+
+fluss::TablePath table_path("fluss", "partitioned_events");
+admin.CreateTable(table_path, descriptor, true);
+```
+
+### Writing to Partitioned Log Tables
+
+**Partitions must exist before writing data; otherwise, the client will retry indefinitely by default.** Include partition column values in each row; the client routes records to the correct partition automatically.
+
+```cpp
+fluss::Table table;
+conn.GetTable(table_path, table);
+
+fluss::AppendWriter writer;
+table.NewAppend().CreateWriter(writer);
+
+fluss::GenericRow row;
+row.SetInt32(0, 1);
+row.SetString(1, "user_login");
+row.SetString(2, "2024-01-15");
+row.SetString(3, "US");
+writer.Append(row);
+writer.Flush();
+```
+
+### Reading from Partitioned Log Tables
+
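+For partitioned tables, use the partition-aware subscribe methods. The examples below assume `partition_infos` has already been populated via `admin.ListPartitionInfos()` (see "Managing Partitions" below).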
+
+```cpp
+fluss::Table table;
+conn.GetTable(table_path, table);
+
+fluss::LogScanner scanner;
+table.NewScan().CreateLogScanner(scanner);
+
+// Subscribe to individual partitions
+for (const auto& pi : partition_infos) {
+    scanner.SubscribePartitionBuckets(pi.partition_id, 0, 0);
+}
+
+fluss::ScanRecords records;
+scanner.Poll(5000, records);
+
+for (const auto& rec : records) {
+    std::cout << "bucket_id=" << rec.bucket_id
+              << " offset=" << rec.offset << std::endl;
+}
+
+// Or batch-subscribe to all partitions at once
+fluss::LogScanner batch_scanner;
+table.NewScan().CreateLogScanner(batch_scanner);
+
+std::vector<fluss::PartitionBucketSubscription> subs;
+for (const auto& pi : partition_infos) {
+    subs.push_back({pi.partition_id, 0, 0});
+}
+batch_scanner.SubscribePartitionBuckets(subs);
+```
+
+**Unsubscribe from a partition bucket:**
+
+```cpp
+// Stop receiving records from a specific partition bucket
+scanner.UnsubscribePartition(partition_infos[0].partition_id, 0);
+```
+
+### Managing Partitions
+
+```cpp
+// Create a partition
+admin.CreatePartition(table_path, {{"dt", "2024-01-15"}, {"region", "EMEA"}}, true);
+
+// List partitions
+std::vector<fluss::PartitionInfo> partition_infos;
+admin.ListPartitionInfos(table_path, partition_infos);
+
+// Query partition offsets
+std::vector<int32_t> bucket_ids = {0, 1, 2};
+std::unordered_map<int32_t, int64_t> offsets;
+admin.ListPartitionOffsets(table_path, "2024-01-15$US",
+                           bucket_ids, fluss::OffsetSpec::Latest(), offsets);
+```
+
+## Partitioned Primary Key Tables
+
+Partitioned KV tables combine partitioning with primary key operations. Partition columns must be part of the primary key.
+
+### Creating a Partitioned Primary Key Table
+
+```cpp
+auto schema = fluss::Schema::NewBuilder()
+    .AddColumn("user_id", fluss::DataType::Int())
+    .AddColumn("region", fluss::DataType::String())
+    .AddColumn("zone", fluss::DataType::BigInt())
+    .AddColumn("score", fluss::DataType::BigInt())
+    .SetPrimaryKeys({"user_id", "region", "zone"})
+    .Build();
+
+auto descriptor = fluss::TableDescriptor::NewBuilder()
+    .SetSchema(schema)
+    .SetPartitionKeys({"region", "zone"})
+    .SetBucketCount(3)
+    .Build();
+
+fluss::TablePath table_path("fluss", "partitioned_users");
+admin.CreateTable(table_path, descriptor, true);
+```
+
+### Writing to Partitioned Primary Key Tables
+
+**Partitions must exist before upserting data; otherwise, the client will retry indefinitely by default.**
+
+```cpp
+fluss::Table table;
+conn.GetTable(table_path, table);
+
+// Create partitions first
+admin.CreatePartition(table_path, {{"region", "APAC"}, {"zone", "1"}}, true);
+admin.CreatePartition(table_path, {{"region", "EMEA"}, {"zone", "2"}}, true);
+admin.CreatePartition(table_path, {{"region", "US"}, {"zone", "3"}}, true);
+
+fluss::UpsertWriter writer;
+table.NewUpsert().CreateWriter(writer);
+
+auto row = table.NewRow();
+row.Set("user_id", 1001);
+row.Set("region", "APAC");
+row.Set("zone", static_cast<int64_t>(1));
+row.Set("score", static_cast<int64_t>(1234));
+writer.Upsert(row);
+writer.Flush();
+```
+
+### Looking Up Records in Partitioned Tables
+
+Lookup requires all primary key columns, including the partition columns.
+
+> **Note:** Scanning partitioned primary key tables is not supported. Use lookup operations instead.
+
+```cpp
+fluss::Lookuper lookuper;
+table.NewLookup().CreateLookuper(lookuper);
+
+auto pk = table.NewRow();
+pk.Set("user_id", 1001);
+pk.Set("region", "APAC");
+pk.Set("zone", static_cast<int64_t>(1));
+
+fluss::LookupResult result;
+lookuper.Lookup(pk, result);
+if (result.Found()) {
+    std::cout << "score=" << result.GetInt64(3) << std::endl;
+}
+```
diff --git a/fluss-rust/website/docs/user-guide/cpp/example/primary-key-tables.md b/fluss-rust/website/docs/user-guide/cpp/example/primary-key-tables.md
new file mode 100644
index 0000000000..f26b5477a7
--- /dev/null
+++ b/fluss-rust/website/docs/user-guide/cpp/example/primary-key-tables.md
@@ -0,0 +1,132 @@
+---
+sidebar_position: 5
+---
+# Primary Key Tables
+
+Primary key tables (KV tables) support upsert, delete, and lookup operations.
+
+## Creating a Primary Key Table
+
+```cpp
+auto schema = fluss::Schema::NewBuilder()
+    .AddColumn("id", fluss::DataType::Int())
+    .AddColumn("name", fluss::DataType::String())
+    .AddColumn("age", fluss::DataType::BigInt())
+    .SetPrimaryKeys({"id"})
+    .Build();
+
+auto descriptor = fluss::TableDescriptor::NewBuilder()
+    .SetSchema(schema)
+    .SetBucketCount(3)
+    .Build();
+
+fluss::TablePath table_path("fluss", "users");
+admin.CreateTable(table_path, descriptor, true);
+```
+
+## Upserting Records
+
+```cpp
+fluss::Table table;
+conn.GetTable(table_path, table);
+
+fluss::UpsertWriter upsert_writer;
+table.NewUpsert().CreateWriter(upsert_writer);
+
+// Fire-and-forget upserts
+{
+    auto row = table.NewRow();
+    row.Set("id", 1);
+    row.Set("name", "Alice");
+    row.Set("age", static_cast<int64_t>(25));
+    upsert_writer.Upsert(row);
+}
+{
+    auto row = table.NewRow();
+    row.Set("id", 2);
+    row.Set("name", "Bob");
+    row.Set("age", static_cast<int64_t>(30));
+    upsert_writer.Upsert(row);
+}
+upsert_writer.Flush();
+
+// Per-record acknowledgment
+{
+    auto row = table.NewRow();
+    row.Set("id", 3);
+    row.Set("name", "Charlie");
+    row.Set("age", static_cast<int64_t>(35));
+    fluss::WriteResult wr;
+    upsert_writer.Upsert(row, wr);
+    wr.Wait();
+}
+```
+
+## Updating Records
+
+Upsert with the same primary key to update an existing record.
+
+```cpp
+auto row = table.NewRow();
+row.Set("id", 1);
+row.Set("name", "Alice Updated");
+row.Set("age", static_cast<int64_t>(26));
+fluss::WriteResult wr;
+upsert_writer.Upsert(row, wr);
+wr.Wait();
+```
+
+## Deleting Records
+
+```cpp
+auto pk_row = table.NewRow();
+pk_row.Set("id", 2);
+fluss::WriteResult wr;
+upsert_writer.Delete(pk_row, wr);
+wr.Wait();
+```
+
+## Partial Updates
+
+Update only specific columns while preserving others.
+
+```cpp
+// By column names
+fluss::UpsertWriter partial_writer;
+table.NewUpsert()
+    .PartialUpdateByName({"id", "age"})
+    .CreateWriter(partial_writer);
+
+auto row = table.NewRow();
+row.Set("id", 1);
+row.Set("age", static_cast<int64_t>(27));
+fluss::WriteResult wr;
+partial_writer.Upsert(row, wr);
+wr.Wait();
+
+// By column indices
+fluss::UpsertWriter partial_writer_idx;
+table.NewUpsert()
+    .PartialUpdateByIndex({0, 2})
+    .CreateWriter(partial_writer_idx);
+```
+
+## Looking Up Records
+
+```cpp
+fluss::Lookuper lookuper;
+table.NewLookup().CreateLookuper(lookuper);
+
+auto pk_row = table.NewRow();
+pk_row.Set("id", 1);
+
+fluss::LookupResult result;
+lookuper.Lookup(pk_row, result);
+
+if (result.Found()) {
+    std::cout << "Found: name=" << result.GetString(1)
+              << ", age=" << result.GetInt64(2) << std::endl;
+} else {
+    std::cout << "Not found" << std::endl;
+}
+```
diff --git a/fluss-rust/website/docs/user-guide/cpp/installation.md b/fluss-rust/website/docs/user-guide/cpp/installation.md
new file mode 100644
index 0000000000..6360da4369
--- /dev/null
+++ b/fluss-rust/website/docs/user-guide/cpp/installation.md
@@ -0,0 +1,107 @@
+---
+sidebar_position: 1
+---
+# Installation
+
+The C++ bindings are not yet published as a package. You need to build from source.
+
+**Prerequisites:** CMake 3.22+, C++17 compiler, Rust 1.85+, Apache Arrow C++ library
+
+```bash
+git clone https://github.com/apache/fluss-rust.git
+cd fluss-rust
+```
+
+Install dependencies:
+
+```bash
+# macOS
+brew install cmake arrow
+
+# Ubuntu/Debian
+sudo apt-get install cmake libarrow-dev
+```
+
+If Arrow is not available via your package manager, build it from source:
+
+```bash
+git clone https://github.com/apache/arrow.git
+cd arrow/cpp
+cmake -B build -DARROW_BUILD_SHARED=ON
+cmake --build build
+sudo cmake --install build
+```
+
+Build the C++ bindings:
+
+```bash
+cd bindings/cpp
+mkdir -p build && cd build
+
+# Debug mode
+cmake ..
+
+# Or Release mode
+cmake -DCMAKE_BUILD_TYPE=Release ..
+
+# Build
+cmake --build .
+```
+
+This produces:
+- `libfluss_cpp.a` (Static library)
+- `fluss_cpp_example` (Example executable)
+- Header files in `include/`
+
+## Integrating into Your Project
+
+**Option 1: CMake FetchContent**
+
+```cmake
+include(FetchContent)
+FetchContent_Declare(
+    fluss-cpp
+    GIT_REPOSITORY https://github.com/apache/fluss-rust.git
+    SOURCE_SUBDIR bindings/cpp
+)
+FetchContent_MakeAvailable(fluss-cpp)
+
+target_link_libraries(your_target PRIVATE fluss_cpp)
+```
+
+**Option 2: Manual Integration**
+
+Copy the build artifacts and configure CMake:
+
+```cmake
+find_package(Arrow REQUIRED)
+
+add_library(fluss_cpp STATIC IMPORTED)
+set_target_properties(fluss_cpp PROPERTIES
+    IMPORTED_LOCATION ${CMAKE_SOURCE_DIR}/lib/libfluss_cpp.a
+    INTERFACE_INCLUDE_DIRECTORIES ${CMAKE_SOURCE_DIR}/include
+)
+
+target_link_libraries(your_target
+    PRIVATE
+    fluss_cpp
+    Arrow::arrow_shared
+    ${CMAKE_DL_LIBS}
+    Threads::Threads
+)
+
+# On macOS, also link these frameworks
+if(APPLE)
+    target_link_libraries(your_target PRIVATE
+        "-framework CoreFoundation"
+        "-framework Security"
+    )
+endif()
+```
+
+**Option 3: Subdirectory**
+
+```cmake
+add_subdirectory(vendor/fluss-rust/bindings/cpp)
+target_link_libraries(your_target PRIVATE fluss_cpp)
+```
diff --git a/fluss-rust/website/docs/user-guide/python/_category_.json b/fluss-rust/website/docs/user-guide/python/_category_.json
new file mode 100644
index 0000000000..a9f34b477f
--- /dev/null
+++ b/fluss-rust/website/docs/user-guide/python/_category_.json
@@ -0,0 +1,4 @@
+{
+  "label": "Python",
+  "position": 2
+}
diff --git a/fluss-rust/website/docs/user-guide/python/api-reference.md b/fluss-rust/website/docs/user-guide/python/api-reference.md
new file mode 100644
index 0000000000..9bf0b6902f
--- /dev/null
+++ b/fluss-rust/website/docs/user-guide/python/api-reference.md
@@ -0,0 +1,389 @@
+---
+sidebar_position: 2
+---
+# API Reference
+
+Complete API reference for the Fluss Python client.
+
+## `Config`
+
+| Method / Property                     | Config Key                            | Description                                                                             |
+|---------------------------------------|---------------------------------------|-----------------------------------------------------------------------------------------|
+| `Config(properties: dict = None)`     |                                       | Create config from a dict of key-value pairs                                            |
+| `bootstrap_servers`                   | `bootstrap.servers`                   | Get/set coordinator server address                                                      |
+| `writer_request_max_size`             | `writer.request-max-size`             | Get/set max request size in bytes                                                       |
+| `writer_acks`                         | `writer.acks`                         | Get/set acknowledgment setting (`"all"` for all replicas)                               |
+| `writer_retries`                      | `writer.retries`                      | Get/set number of retries on failure                                                    |
+| `writer_batch_size`                   | `writer.batch-size`                   | Get/set write batch size in bytes. Upper bound when dynamic sizing is on; fixed batch size when off |
+| `writer_dynamic_batch_size_enabled`   | `writer.dynamic-batch-size.enabled`   | Get/set whether the per-table dynamic batch size estimator is enabled (default `true`)  |
+| `writer_dynamic_batch_size_min`       | `writer.dynamic-batch-size-min`       | Get/set the lower bound for the dynamic batch size estimator (default 256 KB; ignored when disabled) |
+| `writer_batch_timeout_ms`             | `writer.batch-timeout-ms`             | Get/set max time in ms to wait for a writer batch to fill up before sending             |
+| `writer_bucket_no_key_assigner`       | `writer.bucket.no-key-assigner`       | Get/set bucket assignment strategy (`"sticky"` or `"round_robin"`)                      |
+| `scanner_remote_log_prefetch_num`     | `scanner.remote-log.prefetch-num`     | Get/set number of remote log segments to prefetch                                       |
+| `remote_file_download_thread_num`     | `remote-file.download-thread-num`     | Get/set number of threads for remote log downloads                                      |
+| `scanner_remote_log_read_concurrency` | `scanner.remote-log.read-concurrency` | Get/set streaming read concurrency within a remote log file                             |
+| `scanner_log_max_poll_records`        | `scanner.log.max-poll-records`        | Get/set max number of records returned in a single poll()                               |
+| `scanner_log_fetch_max_bytes`         | `scanner.log.fetch.max-bytes`         | Get/set maximum bytes per fetch response for LogScanner                                 |
+| `scanner_log_fetch_min_bytes`         | `scanner.log.fetch.min-bytes`         | Get/set minimum bytes the server must accumulate before returning a fetch response      |
+| `scanner_log_fetch_wait_max_time_ms`  | `scanner.log.fetch.wait-max-time-ms`  | Get/set maximum time (ms) the server may wait to satisfy min-bytes                      |
+| `scanner_log_fetch_max_bytes_for_bucket` | `scanner.log.fetch.max-bytes-for-bucket` | Get/set maximum bytes per fetch response per bucket for LogScanner                |
+| `connect_timeout_ms`                  | `connect-timeout`                     | Get/set TCP connect timeout in milliseconds                                             |
+| `security_protocol`                   | `security.protocol`                   | Get/set security protocol (`"PLAINTEXT"` or `"sasl"`)                                   |
+| `security_sasl_mechanism`             | `security.sasl.mechanism`             | Get/set SASL mechanism (only `"PLAIN"` is supported)                                    |
+| `security_sasl_username`              | `security.sasl.username`              | Get/set SASL username (required when protocol is `"sasl"`)                              |
+| `security_sasl_password`              | `security.sasl.password`              | Get/set SASL password (required when protocol is `"sasl"`)                              |
+
+## `FlussConnection`
+
+| Method                                                    |  Description                          |
+|-----------------------------------------------------------|---------------------------------------|
+| `await FlussConnection.create(config) -> FlussConnection` | Connect to a Fluss cluster            |
+| `conn.get_admin() -> FlussAdmin`                        | Get admin interface                   |
+| `await conn.get_table(table_path) -> FlussTable`          | Get a table for read/write operations |
+| `await conn.close()`                                      | Close the connection                  |
+
+Supports the `async with` statement (async context manager).
+
+## `FlussAdmin`
+
+| Method                                                                                                                |  Description                          |
+|-----------------------------------------------------------------------------------------------------------------------|---------------------------------------|
+| `await create_database(name, database_descriptor=None, ignore_if_exists=False)`                                       | Create a database                     |
+| `await drop_database(name, ignore_if_not_exists=False, cascade=True)`                                                 | Drop a database                       |
+| `await list_databases() -> list[str]`                                                                                 | List all databases                    |
+| `await database_exists(name) -> bool`                                                                                 | Check if a database exists            |
+| `await get_database_info(name) -> DatabaseInfo`                                                                       | Get database metadata                 |
+| `await create_table(table_path, table_descriptor, ignore_if_exists=False)`                                            | Create a table                        |
+| `await drop_table(table_path, ignore_if_not_exists=False)`                                                            | Drop a table                          |
+| `await get_table_info(table_path) -> TableInfo`                                                                       | Get table metadata                    |
+| `await list_tables(database_name) -> list[str]`                                                                       | List tables in a database             |
+| `await table_exists(table_path) -> bool`                                                                              | Check if a table exists               |
+| `await list_offsets(table_path, bucket_ids, offset_spec) -> dict[int, int]`                           | Get offsets for buckets               |
+| `await list_partition_offsets(table_path, partition_name, bucket_ids, offset_spec) -> dict[int, int]` | Get offsets for a partition's buckets |
+| `await create_partition(table_path, partition_spec, ignore_if_exists=False)`                                          | Create a partition                    |
+| `await drop_partition(table_path, partition_spec, ignore_if_not_exists=False)`                                        | Drop a partition                      |
+| `await list_partition_infos(table_path) -> list[PartitionInfo]`                                                       | List partitions                       |
+| `await get_latest_lake_snapshot(table_path) -> LakeSnapshot`                                                          | Get latest lake snapshot              |
+| `await get_server_nodes() -> list[ServerNode]`                                                                        | Get all alive server nodes            |
+
+## `ServerNode`
+
+| Property                 | Description                                                |
+|--------------------------|------------------------------------------------------------|
+| `.id -> int`             | Server node ID                                             |
+| `.host -> str`           | Hostname of the server                                     |
+| `.port -> int`           | Port number                                                |
+| `.server_type -> str`    | Server type (`"CoordinatorServer"` or `"TabletServer"`)    |
+| `.uid -> str`            | Unique identifier (e.g. `"cs-0"`, `"ts-1"`)               |
+
+## `FlussTable`
+
+| Method                          |  Description                            |
+|---------------------------------|-----------------------------------------|
+| `new_scan() -> TableScan`       | Create a scan builder                   |
+| `new_append() -> TableAppend`   | Create an append builder for log tables |
+| `new_upsert() -> TableUpsert`   | Create an upsert builder for PK tables  |
+| `new_lookup() -> TableLookup`   | Create a lookup builder for PK tables   |
+| `get_table_info() -> TableInfo` | Get table metadata                      |
+| `get_table_path() -> TablePath` | Get table path                          |
+| `has_primary_key() -> bool`     | Check if table has a primary key        |
+
+## `TableScan`
+
+| Method                                                   |  Description                                                        |
+|----------------------------------------------------------|---------------------------------------------------------------------|
+| `.project(indices) -> TableScan`                         | Project columns by index                                            |
+| `.project_by_name(names) -> TableScan`                   | Project columns by name                                             |
+| `await .create_log_scanner() -> LogScanner`              | Create record-based scanner (for `poll()`)                          |
+| `await .create_record_batch_log_scanner() -> LogScanner` | Create batch-based scanner (for `poll_arrow()`, `to_arrow()`, etc.) |
+
+## `TableAppend`
+
+Builder for creating an `AppendWriter`. Obtain via `FlussTable.new_append()`.
+
+| Method                             |  Description             |
+|------------------------------------|--------------------------|
+| `.create_writer() -> AppendWriter` | Create the append writer |
+
+## `TableUpsert`
+
+Builder for creating an `UpsertWriter`. Obtain via `FlussTable.new_upsert()`.
+
+| Method                                             |  Description                               |
+|----------------------------------------------------|--------------------------------------------|
+| `.partial_update_by_name(columns) -> TableUpsert`  | Configure partial update by column names   |
+| `.partial_update_by_index(indices) -> TableUpsert` | Configure partial update by column indices |
+| `.create_writer() -> UpsertWriter`                 | Create the upsert writer                   |
+
+## `TableLookup`
+
+Builder for creating a `Lookuper` or `PrefixLookuper`. Obtain via `FlussTable.new_lookup()`.
+
+| Method                                              |  Description                              |
+|-----------------------------------------------------|-------------------------------------------|
+| `.create_lookuper() -> Lookuper`                    | Create a primary key lookuper             |
+| `.lookup_by(column_names) -> TablePrefixLookup`     | Switch to prefix-scan mode for the given columns (partition keys + bucket keys) |
+
+## `TablePrefixLookup`
+
+Builder for creating a `PrefixLookuper`. Obtain via `TableLookup.lookup_by(columns)`.
+
+| Method                                     |  Description              |
+|--------------------------------------------|---------------------------|
+| `.create_lookuper() -> PrefixLookuper`     | Create the prefix lookuper |
+
+## `AppendWriter`
+
+| Method                                           |  Description                        |
+|--------------------------------------------------|-------------------------------------|
+| `.append(row) -> WriteResultHandle`              | Append a row (dict, list, or tuple) |
+| `.write_arrow(table)`                            | Write a PyArrow Table               |
+| `.write_arrow_batch(batch) -> WriteResultHandle` | Write a PyArrow RecordBatch         |
+| `.write_pandas(df)`                              | Write a Pandas DataFrame            |
+| `await .flush()`                                 | Flush all pending writes            |
+
+## `UpsertWriter`
+
+| Method                              |  Description                          |
+|-------------------------------------|---------------------------------------|
+| `.upsert(row) -> WriteResultHandle` | Upsert a row (insert or update by PK) |
+| `.delete(pk) -> WriteResultHandle`  | Delete a row by primary key           |
+| `await .flush()`                    | Flush all pending operations          |
+
+## `WriteResultHandle`
+
+| Method          |  Description                                 |
+|-----------------|----------------------------------------------|
+| `await .wait()` | Wait for server acknowledgment of this write |
+
+## `Lookuper`
+
+| Method                              |  Description                |
+|-------------------------------------|-----------------------------|
+| `await .lookup(pk) -> dict \| None` | Lookup a row by primary key |
+
+## `PrefixLookuper`
+
+| Method                                        |  Description                                |
+|-----------------------------------------------|---------------------------------------------|
+| `await .lookup(prefix) -> list[dict]`         | Lookup all rows matching a prefix key       |
+
+## `LogScanner`
+
+| Method                                                        |  Description                                                                     |
+|---------------------------------------------------------------|----------------------------------------------------------------------------------|
+| `.subscribe(bucket_id, start_offset)`                         | Subscribe to a bucket                                                            |
+| `.subscribe_buckets(bucket_offsets)`                          | Subscribe to multiple buckets (`{bucket_id: offset}`)                            |
+| `.subscribe_partition(partition_id, bucket_id, start_offset)` | Subscribe to a partition bucket                                                  |
+| `.subscribe_partition_buckets(partition_bucket_offsets)`      | Subscribe to multiple partition+bucket combos (`{(part_id, bucket_id): offset}`) |
+| `.unsubscribe(bucket_id)`                                     | Unsubscribe from a bucket (non-partitioned tables)                               |
+| `.unsubscribe_partition(partition_id, bucket_id)`             | Unsubscribe from a partition bucket                                              |
+| `await .poll(timeout_ms) -> ScanRecords`                      | Poll individual records (record scanner only)                                    |
+| `await .poll_arrow(timeout_ms) -> pa.Table`                   | Poll as Arrow Table (batch scanner only)                                         |
+| `await .poll_record_batch(timeout_ms) -> list[RecordBatch]`   | Poll batches with metadata (batch scanner only)                                  |
+| `.to_arrow_batch_reader() -> pa.RecordBatchReader`            | Lazy Arrow RecordBatchReader reading until latest offsets (batch scanner only)    |
+| `await .to_arrow() -> pa.Table`                               | Read all subscribed data as Arrow Table (batch scanner only)                     |
+| `await .to_pandas() -> pd.DataFrame`                          | Read all subscribed data as DataFrame (batch scanner only)                       |
+
+> **Note:** Overlapping `poll_*` / `to_arrow*` / `to_arrow_batch_reader` calls on the same underlying scanner are not supported. Use only one active polling/consumption path at a time.
+
+## `ScanRecords`
+
+Returned by `LogScanner.poll()`. Records are grouped by bucket.
+
+> **Note:** Flat iteration and integer indexing traverse buckets in an arbitrary order that is consistent within a single `ScanRecords` instance but may differ between `poll()` calls. Use per-bucket access (`.items()`, `.records(bucket)`) when bucket ordering matters.
+
+```python
+scan_records = await scanner.poll(timeout_ms=5000)
+
+# Sequence access
+scan_records[0]                              # first record
+scan_records[-1]                             # last record
+scan_records[:5]                             # first 5 records
+
+# Per-bucket access
+for bucket, records in scan_records.items():
+    for record in records:
+        print(f"bucket={bucket.bucket_id}, offset={record.offset}, row={record.row}")
+
+# Flat iteration
+for record in scan_records:
+    print(record.row)
+```
+
+### Methods
+
+| Method                                 |  Description                                                     |
+|----------------------------------------|------------------------------------------------------------------|
+| `.buckets() -> list[TableBucket]`      | List of distinct buckets                                         |
+| `.records(bucket) -> list[ScanRecord]` | Records for a specific bucket (empty list if bucket not present) |
+| `.count() -> int`                      | Total record count across all buckets                            |
+| `.is_empty() -> bool`                  | Check if empty                                                   |
+
+### Indexing
+
+| Expression                   | Returns              | Description                       |
+|------------------------------|----------------------|-----------------------------------|
+| `scan_records[0]`           | `ScanRecord`         | Record by flat index              |
+| `scan_records[-1]`          | `ScanRecord`         | Negative indexing                  |
+| `scan_records[1:5]`         | `list[ScanRecord]`   | Slice                             |
+| `scan_records[bucket]`      | `list[ScanRecord]`   | Records for a bucket              |
+
+### Mapping Protocol
+
+| Method / Protocol              | Description                                     |
+|--------------------------------|-------------------------------------------------|
+| `.keys()`                      | Same as `.buckets()`                            |
+| `.values()`                    | Lazy iterator over record lists, one per bucket |
+| `.items()`                     | Lazy iterator over `(bucket, records)` pairs    |
+| `len(scan_records)`           | Same as `.count()`                              |
+| `bucket in scan_records`      | Membership test                                 |
+| `for record in scan_records`  | Flat iteration over all records                 |
+
+## `ScanRecord`
+
+| Property                     |  Description                                                        |
+|------------------------------|---------------------------------------------------------------------|
+| `.offset -> int`             | Record offset in the log                                            |
+| `.timestamp -> int`          | Record timestamp                                                    |
+| `.change_type -> ChangeType` | Change type (AppendOnly, Insert, UpdateBefore, UpdateAfter, Delete) |
+| `.row -> dict`               | Row data as `{column_name: value}`                                  |
+
+## `RecordBatch`
+
+| Property                   | Description                  |
+|----------------------------|------------------------------|
+| `.batch -> pa.RecordBatch` | Arrow RecordBatch data       |
+| `.bucket -> TableBucket`   | Bucket this batch belongs to |
+| `.base_offset -> int`      | First record offset          |
+| `.last_offset -> int`      | Last record offset           |
+
+## `Schema`
+
+| Method                                         |  Description               |
+|------------------------------------------------|----------------------------|
+| `Schema(schema: pa.Schema, primary_keys=None)` | Create from PyArrow schema. Field nullability (`pa.field(..., nullable=False)`) is preserved. |
+| `.get_column_names() -> list[str]`             | Get column names           |
+| `.get_column_types() -> list[str]`             | Get column type names. Non-nullable types include a `" NOT NULL"` suffix (e.g., `"int NOT NULL"`). |
+| `.get_columns() -> list[tuple[str, str]]`      | Get `(name, type)` pairs. Type strings follow the same nullability formatting as `.get_column_types()`. |
+| `.get_primary_keys() -> list[str]`             | Get primary key columns    |
+
+## `TableDescriptor`
+
+| Method                                                                                                                                                                         | Description             |
+|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------|
+| `TableDescriptor(schema, *, partition_keys=None, bucket_count=None, bucket_keys=None, comment=None, log_format=None, kv_format=None, properties=None, custom_properties=None)` | Create table descriptor |
+| `.get_schema() -> Schema`                                                                                                                                                      | Get the schema          |
+
+## `TablePath`
+
+| Method / Property            | Description         |
+|------------------------------|---------------------|
+| `TablePath(database, table)` | Create a table path |
+| `.database_name -> str`      | Database name       |
+| `.table_name -> str`         | Table name          |
+
+## `TableInfo`
+
+| Property / Method                    |  Description                |
+|--------------------------------------|-----------------------------|
+| `.table_id -> int`                   | Table ID                    |
+| `.table_path -> TablePath`           | Table path                  |
+| `.num_buckets -> int`                | Number of buckets           |
+| `.schema_id -> int`                  | Schema ID                   |
+| `.comment -> str \| None`            | Table comment               |
+| `.created_time -> int`               | Creation timestamp          |
+| `.modified_time -> int`              | Last modification timestamp |
+| `.get_primary_keys() -> list[str]`   | Primary key columns         |
+| `.get_partition_keys() -> list[str]` | Partition columns           |
+| `.get_bucket_keys() -> list[str]`    | Bucket key columns          |
+| `.has_primary_key() -> bool`         | Has primary key?            |
+| `.is_partitioned() -> bool`          | Is partitioned?             |
+| `.get_schema() -> Schema`            | Get table schema            |
+| `.get_column_names() -> list[str]`   | Column names                |
+| `.get_column_count() -> int`         | Number of columns           |
+| `.get_properties() -> dict`          | All table properties        |
+| `.get_custom_properties() -> dict`   | Custom properties only      |
+
+## `PartitionInfo`
+
+| Property                 |  Description   |
+|--------------------------|----------------|
+| `.partition_id -> int`   | Partition ID   |
+| `.partition_name -> str` | Partition name |
+
+## `DatabaseDescriptor`
+
+| Method / Property                                          | Description       |
+|------------------------------------------------------------|-------------------|
+| `DatabaseDescriptor(comment=None, custom_properties=None)` | Create descriptor |
+| `.comment -> str \| None`                                  | Database comment  |
+| `.get_custom_properties() -> dict`                         | Custom properties |
+
+## `DatabaseInfo`
+
+| Property / Method                                  | Description                 |
+|----------------------------------------------------|-----------------------------|
+| `.database_name -> str`                            | Database name               |
+| `.created_time -> int`                             | Creation timestamp          |
+| `.modified_time -> int`                            | Last modification timestamp |
+| `.get_database_descriptor() -> DatabaseDescriptor` | Get descriptor              |
+
+## `LakeSnapshot`
+
+| Property / Method                                 | Description             |
+|---------------------------------------------------|-------------------------|
+| `.snapshot_id -> int`                             | Snapshot ID             |
+| `.table_buckets_offset -> dict[TableBucket, int]` | All bucket offsets      |
+| `.get_bucket_offset(bucket) -> int \| None`       | Get offset for a bucket |
+| `.get_table_buckets() -> list[TableBucket]`       | Get all buckets         |
+
+## `TableBucket`
+
+| Method / Property                                            | Description                            |
+|--------------------------------------------------------------|----------------------------------------|
+| `TableBucket(table_id, bucket)`                              | Create non-partitioned bucket          |
+| `TableBucket.with_partition(table_id, partition_id, bucket)` | Create partitioned bucket              |
+| `.table_id -> int`                                           | Table ID                               |
+| `.bucket_id -> int`                                          | Bucket ID                              |
+| `.partition_id -> int \| None`                               | Partition ID (None if non-partitioned) |
+
+## `FlussError`
+
+| Property             | Description                                                                         |
+|----------------------|-------------------------------------------------------------------------------------|
+| `.message -> str`    | Error message                                                                       |
+| `.error_code -> int` | Error code (`ErrorCode.CLIENT_ERROR` for client-side errors, server code otherwise) |
+
+Raised for all Fluss-specific errors (connection failures, table not found, schema mismatches, etc.). Inherits from `Exception`. See [Error Handling](./error-handling.md) for details on matching specific error codes.
+
+## Constants
+
+| Constant                     | Value         | Description                                         |
+|------------------------------|---------------|-----------------------------------------------------|
+| `fluss.EARLIEST_OFFSET`      | `-2`          | Start reading from earliest available offset        |
+
+## `OffsetSpec`
+
+| Method                      | Description                                      |
+|-----------------------------|--------------------------------------------------|
+| `OffsetSpec.earliest()`     | Earliest available offset                        |
+| `OffsetSpec.latest()`       | Latest offset                                    |
+| `OffsetSpec.timestamp(ts)`  | Offset at or after the given timestamp (millis)  |
+
+To start reading from the latest offset (only new records), resolve the current offset via `list_offsets` before subscribing:
+
+```python
+offsets = await admin.list_offsets(table_path, [0], fluss.OffsetSpec.latest())
+scanner.subscribe(bucket_id=0, start_offset=offsets[0])
+```
+
+## `ChangeType`
+
+| Value                         | Short String | Description                   |
+|-------------------------------|--------------|-------------------------------|
+| `ChangeType.AppendOnly` (0)   | `+A`         | Append-only                   |
+| `ChangeType.Insert` (1)       | `+I`         | Insert                        |
+| `ChangeType.UpdateBefore` (2) | `-U`         | Previous value of updated row |
+| `ChangeType.UpdateAfter` (3)  | `+U`         | New value of updated row      |
+| `ChangeType.Delete` (4)       | `-D`         | Delete                        |
diff --git a/fluss-rust/website/docs/user-guide/python/data-types.md b/fluss-rust/website/docs/user-guide/python/data-types.md
new file mode 100644
index 0000000000..8e4371e216
--- /dev/null
+++ b/fluss-rust/website/docs/user-guide/python/data-types.md
@@ -0,0 +1,95 @@
+---
+sidebar_position: 3
+---
+# Data Types
+
+The Python client uses PyArrow types for schema definitions:
+
+| PyArrow Type                                    | Fluss Type                        | Python Type         |
+|-------------------------------------------------|-----------------------------------|---------------------|
+| `pa.bool_()`                                    | Boolean                           | `bool`              |
+| `pa.int8()` / `int16()` / `int32()` / `int64()` | TinyInt / SmallInt / Int / BigInt | `int`               |
+| `pa.float32()` / `float64()`                    | Float / Double                    | `float`             |
+| `pa.string()`                                   | String                            | `str`               |
+| `pa.binary()`                                   | Bytes                             | `bytes`             |
+| `pa.binary(n)`                                  | Binary(n)                         | `bytes`             |
+| `pa.date32()`                                   | Date                              | `datetime.date`     |
+| `pa.time32("ms")`                               | Time                              | `datetime.time`     |
+| `pa.timestamp("us")`                            | Timestamp (NTZ)                   | `datetime.datetime` |
+| `pa.timestamp("us", tz="UTC")`                  | TimestampLTZ                      | `datetime.datetime` |
+| `pa.decimal128(precision, scale)`               | Decimal                           | `decimal.Decimal`   |
+| `pa.list_(type)`                                  | Array                             | `list`              |
+
+All Python native types (`date`, `time`, `datetime`, `Decimal`) work when appending rows via dicts.
+
+## Nullability
+
+PyArrow field nullability is preserved when constructing Fluss schemas. By default, fields are nullable. Use `nullable=False` on `pa.field()` to create a `NOT NULL` column:
+
+```python
+schema = pa.schema([
+    pa.field("id", pa.int32(), nullable=False),
+    pa.field("name", pa.string()),          # nullable by default
+])
+fluss_schema = fluss.Schema(schema)
+fluss_schema.get_column_types()  # ["int NOT NULL", "string"]
+```
+
+Primary key columns are automatically forced `NOT NULL` regardless of the PyArrow field setting.
+
+For nested types, element nullability is also preserved:
+
+```python
+schema = pa.schema([
+    pa.field("tags", pa.list_(pa.field("item", pa.string(), nullable=False))),
+])
+fluss_schema = fluss.Schema(schema)
+fluss_schema.get_column_types()  # ["array<string NOT NULL>"]
+```
+
+## Writing Data
+
+Rows can be dicts, lists, or tuples:
+
+```python
+from datetime import date, time, datetime
+from decimal import Decimal
+
+row = {
+    "user_id": 1,
+    "name": "Alice",
+    "active": True,
+    "score": 95.5,
+    "balance": Decimal("1234.56"),
+    "birth_date": date(1990, 3, 15),
+    "login_time": time(9, 30, 0),
+    "created_at": datetime(2024, 1, 1, 0, 0, 0),
+    "nickname": None,  # null value
+    "tags": ["active", "premium"],  # Array of strings
+    "scores": [10, None, 30],       # Array with null values
+}
+handle = writer.append(row)
+```
+
+Lists and tuples must have values in column order:
+
+```python
+row = [1, "Alice", True, 95.5, Decimal("1234.56"), date(1990, 3, 15), time(9, 30, 0), datetime(2024, 1, 1), None, ["active", "premium"], [10, None, 30]]
+handle = writer.append(row)
+```
+
+## Reading Data
+
+```python
+records = await scanner.poll(timeout_ms=1000)
+for record in records:
+    row = record.row  # dict[str, Any]
+    print(row["user_id"])     # int
+    print(row["name"])        # str
+    print(row["balance"])     # decimal.Decimal
+    print(row["birth_date"])  # datetime.date
+    print(row["created_at"])  # datetime.datetime
+
+    if row["nickname"] is None:
+        print("nickname is null")
+```
diff --git a/fluss-rust/website/docs/user-guide/python/error-handling.md b/fluss-rust/website/docs/user-guide/python/error-handling.md
new file mode 100644
index 0000000000..5bef366516
--- /dev/null
+++ b/fluss-rust/website/docs/user-guide/python/error-handling.md
@@ -0,0 +1,168 @@
+---
+sidebar_position: 4
+---
+# Error Handling
+
+The client raises `fluss.FlussError` for all Fluss-specific errors. Each error carries a `message` and an `error_code`.
+
+## Basic Usage
+
+```python
+import fluss
+
+try:
+    await admin.create_table(table_path, table_descriptor)
+except fluss.FlussError as e:
+    print(f"Error (code {e.error_code}): {e.message}")
+```
+
+## Error Codes
+
+Server-side errors carry a specific error code (>0 or -1). Client-side errors (connection failures, type mismatches, etc.) use `ErrorCode.CLIENT_ERROR` (-2). Use `fluss.ErrorCode` to match on specific codes:
+
+```python
+import fluss
+
+try:
+    await admin.drop_table(table_path)
+except fluss.FlussError as e:
+    if e.error_code == fluss.ErrorCode.TABLE_NOT_EXIST:
+        print("Table does not exist")
+    elif e.error_code == fluss.ErrorCode.PARTITION_NOT_EXISTS:
+        print("Partition does not exist")
+    elif e.error_code == fluss.ErrorCode.CLIENT_ERROR:
+        print(f"Client-side error: {e.message}")
+    else:
+        print(f"Server error (code {e.error_code}): {e.message}")
+```
+
+### Common Error Codes
+
+| Constant                                     | Code | Description                         |
+|----------------------------------------------|------|-------------------------------------|
+| `ErrorCode.CLIENT_ERROR`                     | -2   | Client-side error (not from server) |
+| `ErrorCode.UNKNOWN_SERVER_ERROR`             | -1   | Unexpected server error             |
+| `ErrorCode.NETWORK_EXCEPTION`                | 1    | Server disconnected before response |
+| `ErrorCode.DATABASE_NOT_EXIST`               | 4    | Database does not exist             |
+| `ErrorCode.DATABASE_ALREADY_EXIST`           | 6    | Database already exists             |
+| `ErrorCode.TABLE_NOT_EXIST`                  | 7    | Table does not exist                |
+| `ErrorCode.TABLE_ALREADY_EXIST`              | 8    | Table already exists                |
+| `ErrorCode.INVALID_TABLE_EXCEPTION`          | 15   | Invalid table operation             |
+| `ErrorCode.REQUEST_TIME_OUT`                 | 25   | Request timed out                   |
+| `ErrorCode.PARTITION_NOT_EXISTS`             | 36   | Partition does not exist            |
+| `ErrorCode.PARTITION_ALREADY_EXISTS`         | 42   | Partition already exists            |
+| `ErrorCode.PARTITION_SPEC_INVALID_EXCEPTION` | 43   | Invalid partition spec              |
+| `ErrorCode.LEADER_NOT_AVAILABLE_EXCEPTION`   | 44   | No leader available for partition   |
+| `ErrorCode.AUTHENTICATE_EXCEPTION`           | 46   | Authentication failed (bad credentials) |
+
+See `fluss.ErrorCode` for the full list of named constants.
+
+## Retry Logic
+
+Some errors are transient: the server may be temporarily unavailable, mid-election, or under load. Use `is_retriable` to decide whether to retry an operation instead of treating the error as permanent.
+
+`FlussError.is_retriable` is a property available directly on the exception:
+
+```python
+import fluss
+
+try:
+    await writer.append(row).wait()
+except fluss.FlussError as e:
+    if e.is_retriable:
+        # Transient failure — safe to retry
+        pass
+    else:
+        # Permanent failure — log and abort
+        print(f"Fatal error (code {e.error_code}): {e.message}")
+```
+
+### Retriable Error Codes
+
+| Constant                                                     | Code | Reason                                    |
+|--------------------------------------------------------------|------|-------------------------------------------|
+| `ErrorCode.NETWORK_EXCEPTION`                               | 1    | Server disconnected                       |
+| `ErrorCode.CORRUPT_MESSAGE`                                 | 3    | CRC or size error                         |
+| `ErrorCode.SCHEMA_NOT_EXIST`                                | 9    | Schema may not exist                      |
+| `ErrorCode.LOG_STORAGE_EXCEPTION`                           | 10   | Transient log storage error               |
+| `ErrorCode.KV_STORAGE_EXCEPTION`                            | 11   | Transient KV storage error                |
+| `ErrorCode.NOT_LEADER_OR_FOLLOWER`                          | 12   | Leader election in progress               |
+| `ErrorCode.CORRUPT_RECORD_EXCEPTION`                        | 14   | Corrupt record                            |
+| `ErrorCode.UNKNOWN_TABLE_OR_BUCKET_EXCEPTION`               | 21   | Metadata not yet available                |
+| `ErrorCode.REQUEST_TIME_OUT`                                | 25   | Request timed out                         |
+| `ErrorCode.STORAGE_EXCEPTION`                               | 26   | Transient storage error                   |
+| `ErrorCode.NOT_ENOUGH_REPLICAS_AFTER_APPEND_EXCEPTION`      | 28   | Wrote to server but with low ISR size     |
+| `ErrorCode.NOT_ENOUGH_REPLICAS_EXCEPTION`                   | 29   | Low ISR size at write time                |
+| `ErrorCode.LEADER_NOT_AVAILABLE_EXCEPTION`                  | 44   | No leader available for partition         |
+
+For client-side errors (`ErrorCode.CLIENT_ERROR`, code -2), `is_retriable` is always `False`.
+
+## Common Error Scenarios
+
+### Connection Refused
+
+The Fluss cluster is not running or the address is incorrect.
+
+```python
+try:
+    config = fluss.Config({"bootstrap.servers": "127.0.0.1:9123"})
+    conn = await fluss.FlussConnection.create(config)
+except fluss.FlussError as e:
+    # error_code == ErrorCode.CLIENT_ERROR for connection failures
+    print(f"Cannot connect to cluster: {e.message}")
+```
+
+### Table Not Found
+
+The table does not exist or has been dropped.
+
+```python
+try:
+    await admin.drop_table(table_path)
+except fluss.FlussError as e:
+    if e.error_code == fluss.ErrorCode.TABLE_NOT_EXIST:
+        print("Table not found")
+```
+
+### Partition Not Found
+
+An operation references a partition that does not exist, for example dropping a partition that was never created.
+
+```python
+try:
+    await admin.drop_partition(table_path, {"region": "US"})
+except fluss.FlussError as e:
+    if e.error_code == fluss.ErrorCode.PARTITION_NOT_EXISTS:
+        print("Partition does not exist, create it first")
+```
+
+### Authentication Failed
+
+SASL credentials are incorrect or the user does not exist.
+
+```python
+try:
+    config = fluss.Config({
+        "bootstrap.servers": "127.0.0.1:9123",
+        "security.protocol": "sasl",
+        "security.sasl.username": "admin",
+        "security.sasl.password": "wrong-password",
+    })
+    conn = await fluss.FlussConnection.create(config)
+except fluss.FlussError as e:
+    if e.error_code == fluss.ErrorCode.AUTHENTICATE_EXCEPTION:
+        print(f"Authentication failed: {e.message}")
+```
+
+### Schema Mismatch
+
+Row data doesn't match the table schema.
+
+```python
+try:
+    writer.append({"wrong_column": "value"})
+    await writer.flush()
+except fluss.FlussError as e:
+    # error_code == ErrorCode.CLIENT_ERROR for type/schema mismatches
+    print(f"Schema mismatch: {e.message}")
+```
diff --git a/fluss-rust/website/docs/user-guide/python/example/_category_.json b/fluss-rust/website/docs/user-guide/python/example/_category_.json
new file mode 100644
index 0000000000..4d81ec12ae
--- /dev/null
+++ b/fluss-rust/website/docs/user-guide/python/example/_category_.json
@@ -0,0 +1,4 @@
+{
+  "label": "Examples",
+  "position": 5
+}
diff --git a/fluss-rust/website/docs/user-guide/python/example/admin-operations.md b/fluss-rust/website/docs/user-guide/python/example/admin-operations.md
new file mode 100644
index 0000000000..2cda6c4abf
--- /dev/null
+++ b/fluss-rust/website/docs/user-guide/python/example/admin-operations.md
@@ -0,0 +1,81 @@
+---
+sidebar_position: 3
+---
+# Admin Operations
+
+```python
+admin = conn.get_admin()
+```
+
+## Databases
+
+```python
+await admin.create_database("my_database", ignore_if_exists=True)
+databases = await admin.list_databases()
+exists = await admin.database_exists("my_database")
+await admin.drop_database("my_database", ignore_if_not_exists=True, cascade=True)
+```
+
+## Tables
+
+Schemas are defined using PyArrow and wrapped in `fluss.Schema`:
+
+```python
+import pyarrow as pa
+
+schema = fluss.Schema(pa.schema([
+    pa.field("id", pa.int32()),
+    pa.field("name", pa.string()),
+    pa.field("amount", pa.int64()),
+]))
+
+table_path = fluss.TablePath("my_database", "my_table")
+await admin.create_table(table_path, fluss.TableDescriptor(schema), ignore_if_exists=True)
+
+table_info = await admin.get_table_info(table_path)
+tables = await admin.list_tables("my_database")
+await admin.drop_table(table_path, ignore_if_not_exists=True)
+```
+
+### TableDescriptor Options
+
+`TableDescriptor` accepts these optional parameters:
+
+| Parameter           | Description                                                                         |
+|---------------------|-------------------------------------------------------------------------------------|
+| `partition_keys`    | Column names to partition by (e.g. `["region"]`)                                    |
+| `bucket_count`      | Number of buckets (parallelism units) for the table                                 |
+| `bucket_keys`       | Columns used to determine bucket assignment                                         |
+| `comment`           | Table comment / description                                                         |
+| `log_format`        | Log storage format: `"ARROW"` or `"INDEXED"`                                        |
+| `kv_format`         | KV storage format for primary key tables: `"INDEXED"` or `"COMPACTED"`              |
+| `properties`        | Table configuration properties as a dict (e.g. `{"table.replication.factor": "1"}`) |
+| `custom_properties` | User-defined properties as a dict                                                   |
+
+## Offsets
+
+```python
+# Latest offsets for buckets
+offsets = await admin.list_offsets(table_path, bucket_ids=[0, 1], offset_spec=fluss.OffsetSpec.latest())
+
+# By timestamp
+offsets = await admin.list_offsets(table_path, bucket_ids=[0], offset_spec=fluss.OffsetSpec.timestamp(1704067200000))
+
+# Per-partition offsets
+offsets = await admin.list_partition_offsets(table_path, partition_name="US", bucket_ids=[0], offset_spec=fluss.OffsetSpec.latest())
+```
+
+## Lake Snapshot
+
+:::note
+Lake snapshots require [lake integration](https://fluss.apache.org/docs/maintenance/tiered-storage/overview/) (e.g. Paimon or Iceberg) to be enabled on the server. Without it, `get_latest_lake_snapshot` will raise an error.
+:::
+
+```python
+snapshot = await admin.get_latest_lake_snapshot(table_path)
+print(f"Snapshot ID: {snapshot.snapshot_id}")
+print(f"Table buckets: {snapshot.get_table_buckets()}")
+
+bucket = fluss.TableBucket(table_id=1, bucket=0)
+offset = snapshot.get_bucket_offset(bucket)
+```
diff --git a/fluss-rust/website/docs/user-guide/python/example/configuration.md b/fluss-rust/website/docs/user-guide/python/example/configuration.md
new file mode 100644
index 0000000000..448ae029ac
--- /dev/null
+++ b/fluss-rust/website/docs/user-guide/python/example/configuration.md
@@ -0,0 +1,49 @@
+---
+sidebar_position: 2
+---
+# Configuration
+
+## Connection Setup
+
+```python
+import fluss
+
+config = fluss.Config({"bootstrap.servers": "127.0.0.1:9123"})
+conn = await fluss.FlussConnection.create(config)
+```
+
+The connection can also be used as an async context manager:
+
+```python
+async with await fluss.FlussConnection.create(config) as conn:
+    ...
+```
+
+## Connection Configurations
+
+Configuration options can be set either via dict keys in the `Config()` constructor, or via Python property setters.
+
+See the [`Config`](../api-reference.md#config) section in the API Reference for the full list of options, their config keys, and descriptions.
+
+## SASL Authentication
+
+To connect to a Fluss cluster with SASL/PLAIN authentication enabled:
+
+```python
+config = fluss.Config({
+    "bootstrap.servers": "127.0.0.1:9123",
+    "security.protocol": "sasl",
+    "security.sasl.mechanism": "PLAIN",
+    "security.sasl.username": "admin",
+    "security.sasl.password": "admin-secret",
+})
+conn = await fluss.FlussConnection.create(config)
+```
+
+## Connection Lifecycle
+
+Remember to close the connection when done:
+
+```python
+await conn.close()
+```
diff --git a/fluss-rust/website/docs/user-guide/python/example/index.md b/fluss-rust/website/docs/user-guide/python/example/index.md
new file mode 100644
index 0000000000..ecbdc84685
--- /dev/null
+++ b/fluss-rust/website/docs/user-guide/python/example/index.md
@@ -0,0 +1,46 @@
+---
+sidebar_position: 1
+---
+# Example
+
+Minimal working example: connect to Fluss, create a table, write data, and read it back.
+
+```python
+import asyncio
+import pyarrow as pa
+import fluss
+
+async def main():
+    # Connect
+    config = fluss.Config({"bootstrap.servers": "127.0.0.1:9123"})
+    conn = await fluss.FlussConnection.create(config)
+    admin = conn.get_admin()
+
+    # Create a log table
+    schema = fluss.Schema(pa.schema([
+        pa.field("id", pa.int32()),
+        pa.field("name", pa.string()),
+        pa.field("score", pa.float32()),
+    ]))
+    table_path = fluss.TablePath("fluss", "quick_start")
+    await admin.create_table(table_path, fluss.TableDescriptor(schema), ignore_if_exists=True)
+
+    # Write
+    table = await conn.get_table(table_path)
+    writer = table.new_append().create_writer()
+    writer.append({"id": 1, "name": "Alice", "score": 95.5})
+    writer.append({"id": 2, "name": "Bob", "score": 87.0})
+    await writer.flush()
+
+    # Read
+    num_buckets = (await admin.get_table_info(table_path)).num_buckets
+    scanner = await table.new_scan().create_record_batch_log_scanner()
+    scanner.subscribe_buckets({i: fluss.EARLIEST_OFFSET for i in range(num_buckets)})
+    print(await scanner.to_pandas())
+
+    # Cleanup
+    await admin.drop_table(table_path, ignore_if_not_exists=True)
+    await conn.close()
+
+asyncio.run(main())
+```
diff --git a/fluss-rust/website/docs/user-guide/python/example/log-tables.md b/fluss-rust/website/docs/user-guide/python/example/log-tables.md
new file mode 100644
index 0000000000..4dbe256781
--- /dev/null
+++ b/fluss-rust/website/docs/user-guide/python/example/log-tables.md
@@ -0,0 +1,129 @@
+---
+sidebar_position: 4
+---
+# Log Tables
+
+Log tables are append-only tables without primary keys, suitable for event streaming.
+
+## Creating a Log Table
+
+```python
+import pyarrow as pa
+
+schema = fluss.Schema(pa.schema([
+    pa.field("id", pa.int32()),
+    pa.field("name", pa.string()),
+    pa.field("score", pa.float32()),
+]))
+
+table_path = fluss.TablePath("fluss", "events")
+await admin.create_table(table_path, fluss.TableDescriptor(schema), ignore_if_exists=True)
+```
+
+## Writing
+
+Rows can be appended as dicts, lists, or tuples. For bulk writes, use `write_arrow()`, `write_arrow_batch()`, or `write_pandas()`.
+
+Write methods like `append()` and `write_arrow_batch()` return a `WriteResultHandle`. You can ignore it for fire-and-forget semantics (flush at the end), or `await handle.wait()` to block until the server acknowledges that specific write.
+
+```python
+table = await conn.get_table(table_path)
+writer = table.new_append().create_writer()
+
+# Fire-and-forget: queue writes, flush at the end
+writer.append({"id": 1, "name": "Alice", "score": 95.5})
+writer.append([2, "Bob", 87.0])
+await writer.flush()
+
+# Per-record acknowledgment
+handle = writer.append({"id": 3, "name": "Charlie", "score": 91.0})
+await handle.wait()
+
+# Bulk writes
+writer.write_arrow(pa_table)          # PyArrow Table
+writer.write_arrow_batch(record_batch) # PyArrow RecordBatch
+writer.write_pandas(df)                # Pandas DataFrame
+await writer.flush()
+```
+
+## Reading
+
+There are two scanner types:
+- **Batch scanner** (`create_record_batch_log_scanner()`): returns Arrow Tables or DataFrames, best for analytics
+- **Record scanner** (`create_log_scanner()`): returns individual records with metadata (offset, timestamp, change type), best for streaming
+
+And two reading modes:
+- **`to_arrow()` / `to_pandas()`**: reads all data from subscribed buckets up to the current latest offset, then returns. Best for one-shot batch reads.
+- **`poll_arrow()` / `poll()` / `poll_record_batch()`**: returns whatever data becomes available within the timeout. Call in a loop for continuous streaming.
+
+### Batch Read (One-Shot)
+
+```python
+num_buckets = (await admin.get_table_info(table_path)).num_buckets
+
+scanner = await table.new_scan().create_record_batch_log_scanner()
+scanner.subscribe_buckets({i: fluss.EARLIEST_OFFSET for i in range(num_buckets)})
+
+# Reads everything up to current latest offset, then returns
+arrow_table = await scanner.to_arrow()
+df = await scanner.to_pandas()
+```
+
+### Continuous Polling
+
+Use `poll_arrow()` or `poll()` in a loop for streaming consumption:
+
+```python
+# Batch scanner: poll as Arrow Tables
+scanner = await table.new_scan().create_record_batch_log_scanner()
+scanner.subscribe(bucket_id=0, start_offset=fluss.EARLIEST_OFFSET)
+
+while True:
+    result = await scanner.poll_arrow(timeout_ms=5000)
+    if result.num_rows > 0:
+        print(result.to_pandas())
+
+# Record scanner: poll individual records
+scanner = await table.new_scan().create_log_scanner()
+scanner.subscribe_buckets({i: fluss.EARLIEST_OFFSET for i in range(num_buckets)})
+
+while True:
+    scan_records = await scanner.poll(timeout_ms=5000)
+
+    for record in scan_records:
+        print(f"offset={record.offset}, change={record.change_type.short_string()}, row={record.row}")
+
+    # Or per-bucket access (dict-like)
+    for bucket, records in scan_records.items():
+        for record in records:
+            print(f"bucket={bucket.bucket_id}, offset={record.offset}, row={record.row}")
+```
+
+### Unsubscribing
+
+To stop consuming from a bucket, use `unsubscribe()`:
+
+```python
+scanner.unsubscribe(bucket_id=0)
+```
+
+### Subscribe from Latest Offset
+
+To only consume new records (skip existing data), first resolve the current latest offset via `list_offsets`, then subscribe at that offset:
+
+```python
+admin = conn.get_admin()
+offsets = await admin.list_offsets(table_path, [0], fluss.OffsetSpec.latest())
+latest = offsets[0]
+
+scanner = await table.new_scan().create_record_batch_log_scanner()
+scanner.subscribe(bucket_id=0, start_offset=latest)
+```
+
+## Column Projection
+
+```python
+scanner = await table.new_scan().project([0, 2]).create_record_batch_log_scanner()
+# or by name
+scanner = await table.new_scan().project_by_name(["id", "score"]).create_record_batch_log_scanner()
+```
diff --git a/fluss-rust/website/docs/user-guide/python/example/partitioned-tables.md b/fluss-rust/website/docs/user-guide/python/example/partitioned-tables.md
new file mode 100644
index 0000000000..894bb519db
--- /dev/null
+++ b/fluss-rust/website/docs/user-guide/python/example/partitioned-tables.md
@@ -0,0 +1,104 @@
+---
+sidebar_position: 6
+---
+# Partitioned Tables
+
+Partitioned tables distribute data across partitions based on column values. Partitions must exist before writing data; otherwise, the client retries indefinitely by default.
+
+## Creating and Managing Partitions
+
+```python
+import pyarrow as pa
+
+schema = fluss.Schema(pa.schema([
+    pa.field("id", pa.int32()),
+    pa.field("region", pa.string()),
+    pa.field("value", pa.int64()),
+]))
+
+table_path = fluss.TablePath("fluss", "partitioned_events")
+await admin.create_table(
+    table_path,
+    fluss.TableDescriptor(schema, partition_keys=["region"], bucket_count=1),
+    ignore_if_exists=True,
+)
+
+# Create partitions
+await admin.create_partition(table_path, {"region": "US"}, ignore_if_exists=True)
+await admin.create_partition(table_path, {"region": "EU"}, ignore_if_exists=True)
+
+# List partitions
+partition_infos = await admin.list_partition_infos(table_path)
+```
+
+## Writing
+
+Writing works the same as for non-partitioned tables: include the partition column values in each row. **Partitions must exist before writing data; otherwise, the client retries indefinitely by default.**
+
+```python
+table = await conn.get_table(table_path)
+writer = table.new_append().create_writer()
+writer.append({"id": 1, "region": "US", "value": 100})
+writer.append({"id": 2, "region": "EU", "value": 200})
+await writer.flush()
+```
+
+## Reading
+
+Use `subscribe_partition()` or `subscribe_partition_buckets()` instead of `subscribe()`:
+
+```python
+scanner = await table.new_scan().create_record_batch_log_scanner()
+
+# Subscribe to individual partitions
+for p in partition_infos:
+    scanner.subscribe_partition(partition_id=p.partition_id, bucket_id=0, start_offset=fluss.EARLIEST_OFFSET)
+
+# Or batch-subscribe
+scanner.subscribe_partition_buckets({
+    (p.partition_id, 0): fluss.EARLIEST_OFFSET for p in partition_infos
+})
+
+print(await scanner.to_pandas())
+```
+
+### Unsubscribing
+
+To stop consuming from a specific partition bucket, use `unsubscribe_partition()`:
+
+```python
+scanner.unsubscribe_partition(partition_id=partition_infos[0].partition_id, bucket_id=0)
+```
+
+## Partitioned Primary Key Tables
+
+Partition columns must be part of the primary key. Partitions must exist before upserting data; otherwise, the client retries indefinitely by default.
+
+```python
+schema = fluss.Schema(
+    pa.schema([
+        pa.field("user_id", pa.int32()),
+        pa.field("region", pa.string()),
+        pa.field("score", pa.int64()),
+    ]),
+    primary_keys=["user_id", "region"],
+)
+
+table_path = fluss.TablePath("fluss", "partitioned_users")
+await admin.create_table(
+    table_path,
+    fluss.TableDescriptor(schema, partition_keys=["region"]),
+    ignore_if_exists=True,
+)
+
+await admin.create_partition(table_path, {"region": "US"}, ignore_if_exists=True)
+
+table = await conn.get_table(table_path)
+writer = table.new_upsert().create_writer()
+writer.upsert({"user_id": 1, "region": "US", "score": 1234})
+await writer.flush()
+
+# Lookup includes partition columns
+lookuper = table.new_lookup().create_lookuper()
+result = await lookuper.lookup({"user_id": 1, "region": "US"})
+```
diff --git a/fluss-rust/website/docs/user-guide/python/example/primary-key-tables.md b/fluss-rust/website/docs/user-guide/python/example/primary-key-tables.md
new file mode 100644
index 0000000000..cd61e5084c
--- /dev/null
+++ b/fluss-rust/website/docs/user-guide/python/example/primary-key-tables.md
@@ -0,0 +1,61 @@
+---
+sidebar_position: 5
+---
+# Primary Key Tables
+
+Primary key tables support upsert, delete, and point lookup operations.
+
+## Creating a Primary Key Table
+
+Pass `primary_keys` to `fluss.Schema`:
+
+```python
+import pyarrow as pa
+
+schema = fluss.Schema(
+    pa.schema([
+        pa.field("id", pa.int32()),
+        pa.field("name", pa.string()),
+        pa.field("age", pa.int64()),
+    ]),
+    primary_keys=["id"],
+)
+table_path = fluss.TablePath("fluss", "users")
+await admin.create_table(table_path, fluss.TableDescriptor(schema, bucket_count=3), ignore_if_exists=True)
+```
+
+## Upsert, Delete, Lookup
+
+```python
+table = await conn.get_table(table_path)
+
+# Upsert (fire-and-forget, flush at the end)
+writer = table.new_upsert().create_writer()
+writer.upsert({"id": 1, "name": "Alice", "age": 25})
+writer.upsert({"id": 2, "name": "Bob", "age": 30})
+await writer.flush()
+
+# Per-record acknowledgment (for read-after-write)
+handle = writer.upsert({"id": 3, "name": "Charlie", "age": 35})
+await handle.wait()
+
+# Delete by primary key
+handle = writer.delete({"id": 2})
+await handle.wait()
+
+# Lookup
+lookuper = table.new_lookup().create_lookuper()
+result = await lookuper.lookup({"id": 1})
+if result:
+    print(f"Found: name={result['name']}, age={result['age']}")
+```
+
+## Partial Updates
+
+Update specific columns while preserving others:
+
+```python
+partial_writer = table.new_upsert().partial_update_by_name(["id", "age"]).create_writer()
+partial_writer.upsert({"id": 1, "age": 27})  # only updates age
+await partial_writer.flush()
+```
diff --git a/fluss-rust/website/docs/user-guide/python/installation.md b/fluss-rust/website/docs/user-guide/python/installation.md
new file mode 100644
index 0000000000..4182dbb431
--- /dev/null
+++ b/fluss-rust/website/docs/user-guide/python/installation.md
@@ -0,0 +1,41 @@
+---
+sidebar_position: 1
+---
+# Installation
+
+```bash
+pip install pyfluss
+```
+
+## Building From Source (Optional)
+
+**Prerequisites:** Python 3.9+, Rust 1.85+
+
+```bash
+git clone https://github.com/apache/fluss-rust.git
+cd fluss-rust/bindings/python
+```
+
+Install [maturin](https://github.com/PyO3/maturin):
+
+```bash
+pip install maturin
+```
+
+Build and install:
+
+```bash
+# Development mode (editable)
+maturin develop
+
+# Or build a wheel
+maturin build --release
+pip install target/wheels/fluss-*.whl
+```
+
+Verify:
+
+```python
+import fluss
+print("Fluss Python bindings installed successfully!")
+```
diff --git a/fluss-rust/website/docs/user-guide/rust/_category_.json b/fluss-rust/website/docs/user-guide/rust/_category_.json
new file mode 100644
index 0000000000..cdec432dad
--- /dev/null
+++ b/fluss-rust/website/docs/user-guide/rust/_category_.json
@@ -0,0 +1,4 @@
+{
+  "label": "Rust",
+  "position": 1
+}
diff --git a/fluss-rust/website/docs/user-guide/rust/api-reference.md b/fluss-rust/website/docs/user-guide/rust/api-reference.md
new file mode 100644
index 0000000000..bb2ec3e8f1
--- /dev/null
+++ b/fluss-rust/website/docs/user-guide/rust/api-reference.md
@@ -0,0 +1,597 @@
+---
+sidebar_position: 2
+---
+# API Reference
+
+Complete API reference for the Fluss Rust client.
+
+## `Config`
+
+| Field                                 | Type            | Default            | Description                                                                          |
+|---------------------------------------|-----------------|--------------------|--------------------------------------------------------------------------------------|
+| `bootstrap_servers`                   | `String`        | `"127.0.0.1:9123"` | Coordinator server address                                                           |
+| `writer_request_max_size`             | `i32`           | `10485760` (10 MB) | Maximum request size in bytes                                                        |
+| `writer_acks`                         | `String`        | `"all"`            | Acknowledgment setting (`"all"` waits for all replicas)                              |
+| `writer_retries`                      | `i32`           | `i32::MAX`         | Number of retries on failure                                                         |
+| `writer_batch_size`                   | `i32`           | `2097152` (2 MB)   | Batch size for writes in bytes. Upper bound when dynamic sizing is on; fixed batch size when off. |
+| `writer_dynamic_batch_size_enabled`   | `bool`          | `true`             | Enable per-table dynamic batch sizing: target grows 10% above 80% fill, shrinks 5% below 50%, clamped to `[writer_dynamic_batch_size_min, writer_batch_size]` |
+| `writer_dynamic_batch_size_min`       | `i32`           | `262144` (256 KB)  | Lower bound for the dynamic batch size estimator (ignored when `writer_dynamic_batch_size_enabled` is `false`) |
+| `writer_batch_timeout_ms`             | `i64`           | `100`              | Maximum time in ms to wait for a writer batch to fill up before sending              |
+| `writer_bucket_no_key_assigner`       | `NoKeyAssigner` | `sticky`           | Bucket assignment strategy for tables without bucket keys: `sticky` or `round_robin` |
+| `scanner_remote_log_prefetch_num`     | `usize`         | `4`                | Number of remote log segments to prefetch                                            |
+| `remote_file_download_thread_num`     | `usize`         | `3`                | Number of threads for remote log downloads                                           |
+| `scanner_remote_log_read_concurrency` | `usize`         | `4`                | Streaming read concurrency within a remote log file                                  |
+| `scanner_log_max_poll_records`        | `usize`         | `500`              | Maximum number of records returned in a single poll()                                |
+| `scanner_log_fetch_max_bytes`         | `i32`           | `16777216` (16 MB) | Maximum bytes per fetch response for LogScanner                                      |
+| `scanner_log_fetch_min_bytes`         | `i32`           | `1`                | Minimum bytes the server must accumulate before returning a fetch response           |
+| `scanner_log_fetch_wait_max_time_ms`  | `i32`           | `500`              | Maximum time (ms) the server may wait to satisfy min-bytes                           |
+| `scanner_log_fetch_max_bytes_for_bucket`| `i32`         | `1048576` (1 MB)   | Maximum bytes per fetch response per bucket for LogScanner                           |
+| `connect_timeout_ms`                  | `u64`           | `120000`           | TCP connect timeout in milliseconds                                                  |
+| `security_protocol`                   | `String`        | `"PLAINTEXT"`      | `PLAINTEXT` (default) or `sasl` for SASL auth                                        |
+| `security_sasl_mechanism`             | `String`        | `"PLAIN"`          | SASL mechanism (only `PLAIN` is supported)                                           |
+| `security_sasl_username`              | `String`        | (empty)            | SASL username (required when protocol is `sasl`)                                     |
+| `security_sasl_password`              | `String`        | (empty)            | SASL password (required when protocol is `sasl`)                                     |
+
+## `FlussConnection`
+
+| Method                                                                        | Description                                    |
+|-------------------------------------------------------------------------------|------------------------------------------------|
+| `async fn new(config: Config) -> Result<Self>`                                | Create a new connection to a Fluss cluster     |
+| `fn get_admin(&self) -> Result<Arc<FlussAdmin>>`                              | Get the admin interface for cluster management |
+| `async fn get_table(&self, table_path: &TablePath) -> Result<FlussTable<'_>>` | Get a table for read/write operations          |
+| `fn config(&self) -> &Config`                                                 | Get a reference to the connection config       |
+
+## `FlussAdmin`
+
+### Database Operations
+
+| Method                                                                                                                       | Description                |
+|------------------------------------------------------------------------------------------------------------------------------|----------------------------|
+| `async fn create_database(&self, name: &str, descriptor: Option<&DatabaseDescriptor>, ignore_if_exists: bool) -> Result<()>` | Create a database          |
+| `async fn drop_database(&self, name: &str, ignore_if_not_exists: bool, cascade: bool) -> Result<()>`                         | Drop a database            |
+| `async fn list_databases(&self) -> Result<Vec<String>>`                                                                      | List all databases         |
+| `async fn database_exists(&self, name: &str) -> Result<bool>`                                                                | Check if a database exists |
+| `async fn get_database_info(&self, name: &str) -> Result<DatabaseInfo>`                                                      | Get database metadata      |
+
+### Table Operations
+
+| Method                                                                                                                     | Description                                                                 |
+|----------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------|
+| `async fn create_table(&self, table_path: &TablePath, descriptor: &TableDescriptor, ignore_if_exists: bool) -> Result<()>` | Create a table                                                              |
+| `async fn drop_table(&self, table_path: &TablePath, ignore_if_not_exists: bool) -> Result<()>`                             | Drop a table                                                                |
+| `async fn get_table_info(&self, table_path: &TablePath) -> Result<TableInfo>`                                              | Get table metadata                                                          |
+| `async fn get_table_schema(&self, table_path: &TablePath, schema_id: Option<i32>) -> Result<SchemaInfo>`                   | Get a table's schema by id, or the latest schema when `schema_id` is `None` |
+| `async fn list_tables(&self, database_name: &str) -> Result<Vec<String>>`                                                  | List tables in a database                                                   |
+| `async fn table_exists(&self, table_path: &TablePath) -> Result<bool>`                                                     | Check if a table exists                                                     |
+
+### Partition Operations
+
+| Method                                                                                                                               | Description                     |
+|--------------------------------------------------------------------------------------------------------------------------------------|---------------------------------|
+| `async fn list_partition_infos(&self, table_path: &TablePath) -> Result<Vec<PartitionInfo>>`                                         | List all partitions             |
+| `async fn list_partition_infos_with_spec(&self, table_path: &TablePath, spec: Option<&PartitionSpec>) -> Result<Vec<PartitionInfo>>` | List partitions matching a spec |
+| `async fn create_partition(&self, table_path: &TablePath, spec: &PartitionSpec, ignore_if_exists: bool) -> Result<()>`               | Create a partition              |
+| `async fn drop_partition(&self, table_path: &TablePath, spec: &PartitionSpec, ignore_if_not_exists: bool) -> Result<()>`             | Drop a partition                |
+
+### Offset Operations
+
+| Method                                                                                                                                                           |  Description                          |
+|------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------|
+| `async fn list_offsets(&self, table_path: &TablePath, bucket_ids: &[i32], offset_spec: OffsetSpec) -> Result<HashMap<i32, i64>>`                                 | Get offsets for buckets               |
+| `async fn list_partition_offsets(&self, table_path: &TablePath, partition_name: &str, bucket_ids: &[i32], offset_spec: OffsetSpec) -> Result<HashMap<i32, i64>>` | Get offsets for a partition's buckets |
+
+### Lake Operations
+
+| Method                                                                                     |  Description                 |
+|--------------------------------------------------------------------------------------------|------------------------------|
+| `async fn get_latest_lake_snapshot(&self, table_path: &TablePath) -> Result<LakeSnapshot>` | Get the latest lake snapshot |
+
+### Cluster Operations
+
+| Method                                                        | Description                                         |
+|---------------------------------------------------------------|-----------------------------------------------------|
+| `async fn get_server_nodes(&self) -> Result<Vec<ServerNode>>` | Get all alive server nodes (coordinator + tablets)  |
+
+## `ServerNode`
+
+| Method                            | Description                                          |
+|-----------------------------------|------------------------------------------------------|
+| `fn id(&self) -> i32`            | Server node ID                                       |
+| `fn host(&self) -> &str`         | Hostname of the server                               |
+| `fn port(&self) -> u32`          | Port number                                          |
+| `fn server_type(&self) -> &ServerType` | Server type (`CoordinatorServer` or `TabletServer`) |
+| `fn uid(&self) -> &str`          | Unique identifier (e.g. `"cs-0"`, `"ts-1"`)         |
+
+## `FlussTable<'a>`
+
+| Method                                        | Description                             |
+|-----------------------------------------------|-----------------------------------------|
+| `fn get_table_info(&self) -> &TableInfo`      | Get table metadata                      |
+| `fn new_append(&self) -> Result<TableAppend>` | Create an append builder for log tables |
+| `fn new_scan(&self) -> TableScan<'_>`         | Create a scan builder                   |
+| `fn new_lookup(&self) -> Result<TableLookup>` | Create a lookup builder for PK tables   |
+| `fn new_upsert(&self) -> Result<TableUpsert>` | Create an upsert builder for PK tables  |
+| `fn has_primary_key(&self) -> bool`           | Check if the table has a primary key    |
+| `fn table_path(&self) -> &TablePath`          | Get the table path                      |
+
+## `TableAppend`
+
+| Method                                            | Description             |
+|---------------------------------------------------|-------------------------|
+| `fn create_writer(&self) -> Result<AppendWriter>` | Create an append writer |
+
+## `AppendWriter`
+
+| Method                                                                          | Description                                       |
+|---------------------------------------------------------------------------------|---------------------------------------------------|
+| `fn append(&self, row: &impl InternalRow) -> Result<WriteResultFuture>`         | Append a row; returns a future for acknowledgment |
+| `fn append_arrow_batch(&self, batch: RecordBatch) -> Result<WriteResultFuture>` | Append an Arrow RecordBatch                       |
+| `async fn flush(&self) -> Result<()>`                                           | Flush all pending writes to the server            |
+
+## `TableScan<'a>`
+
+| Method                                                                      | Description                             |
+|-----------------------------------------------------------------------------|-----------------------------------------|
+| `fn project(self, indices: &[usize]) -> Result<Self>`                       | Project columns by index                |
+| `fn project_by_name(self, names: &[&str]) -> Result<Self>`                  | Project columns by name                 |
+| `fn limit(self, n: i32) -> Result<Self>`                                    | Set a row limit (enables `create_bucket_batch_scanner`; rejected by log scanners) |
+| `fn create_log_scanner(self) -> Result<LogScanner>`                         | Create a record-based log scanner       |
+| `fn create_record_batch_log_scanner(self) -> Result<RecordBatchLogScanner>` | Create an Arrow batch-based log scanner |
+| `fn create_bucket_batch_scanner(self, bucket: TableBucket) -> Result<LimitBatchScanner>` | Bounded scan of one bucket (requires `limit`; runs on first `next_batch`) |
+
+## `LogScanner`
+
+Single-consumer: do not call `poll` concurrently on the same scanner (e.g. from `tokio::join!` or two tasks sharing an `Arc`). Mirrors Java's `LogScannerImpl.acquire()` guard. Debug builds surface overlapping calls via a `debug_assert!`; release builds skip the check for performance and produce skewed poll-timing metrics (`fluss.client.scanner.time_between_poll_ms`, `fluss.client.scanner.poll_idle_ratio`) if the contract is violated.
+
+All `fluss.client.scanner.*` metrics carry `database` and `table` labels (matching Java's per-`TablePath` `ScannerMetricGroup`), so multi-table consumers get one time series per scanned table.
+
+| Method                                                                                                    | Description                                              |
+|-----------------------------------------------------------------------------------------------------------|----------------------------------------------------------|
+| `async fn subscribe(&self, bucket_id: i32, start_offset: i64) -> Result<()>`                              | Subscribe to a bucket                                    |
+| `async fn subscribe_buckets(&self, bucket_offsets: &HashMap<i32, i64>) -> Result<()>`                     | Subscribe to multiple buckets                            |
+| `async fn subscribe_partition(&self, partition_id: i64, bucket_id: i32, start_offset: i64) -> Result<()>` | Subscribe to a partition bucket                          |
+| `async fn subscribe_partition_buckets(&self, offsets: &HashMap<(i64, i32), i64>) -> Result<()>`           | Subscribe to multiple partition-bucket pairs             |
+| `async fn unsubscribe(&self, bucket_id: i32) -> Result<()>`                                               | Unsubscribe from a bucket (non-partitioned tables)       |
+| `async fn unsubscribe_partition(&self, partition_id: i64, bucket_id: i32) -> Result<()>`                  | Unsubscribe from a partition bucket (partitioned tables) |
+| `async fn poll(&self, timeout: Duration) -> Result<ScanRecords>`                                          | Poll for records                                         |
+
+## `RecordBatchLogScanner`
+
+Single-consumer: overlapping `poll` calls on handles that share state, or `poll` concurrent with `RecordBatchLogReader::next_batch`, are not supported — use one active polling/consumption call at a time per underlying scanner state. Mirrors Java's `LogScannerImpl.acquire()` guard. Debug builds surface overlapping calls via a `debug_assert!`; release builds skip the check for performance and produce skewed poll-timing metrics (`fluss.client.scanner.time_between_poll_ms`, `fluss.client.scanner.poll_idle_ratio`) if the contract is violated.
+
+| Method                                                                                                    | Description                                              |
+|-----------------------------------------------------------------------------------------------------------|----------------------------------------------------------|
+| `async fn subscribe(&self, bucket_id: i32, start_offset: i64) -> Result<()>`                              | Subscribe to a bucket                                    |
+| `async fn subscribe_buckets(&self, bucket_offsets: &HashMap<i32, i64>) -> Result<()>`                     | Subscribe to multiple buckets                            |
+| `async fn subscribe_partition(&self, partition_id: i64, bucket_id: i32, start_offset: i64) -> Result<()>` | Subscribe to a partition bucket                          |
+| `async fn subscribe_partition_buckets(&self, offsets: &HashMap<(i64, i32), i64>) -> Result<()>`           | Subscribe to multiple partition-bucket pairs             |
+| `async fn unsubscribe(&self, bucket_id: i32) -> Result<()>`                                               | Unsubscribe from a bucket (non-partitioned tables)       |
+| `async fn unsubscribe_partition(&self, partition_id: i64, bucket_id: i32) -> Result<()>`                  | Unsubscribe from a partition bucket (partitioned tables) |
+| `async fn poll(&self, timeout: Duration) -> Result<Vec<ScanBatch>>`                                       | Poll for Arrow record batches                            |
+| `fn is_partitioned(&self) -> bool`                                                                        | Check if the table is partitioned                        |
+| `fn get_subscribed_buckets(&self) -> Vec<(TableBucket, i64)>`                                             | Get all current subscriptions as (bucket, offset) pairs  |
+| `fn schema(&self) -> SchemaRef`                                                                           | Get the Arrow schema for batches produced by this scanner|
+| `fn table_path(&self) -> &TablePath`                                                                      | Get the table path                                       |
+| `fn table_id(&self) -> TableId`                                                                           | Get the table ID                                         |
+
+## `RecordBatchLogReader`
+
+Bounded log reader that consumes data up to specified stopping offsets, then terminates.
+Unlike `RecordBatchLogScanner` which polls indefinitely, this reader stops automatically.
+
+| Method                                                                                                      | Description                                              |
+|-------------------------------------------------------------------------------------------------------------|----------------------------------------------------------|
+| `async fn new_until_latest(scanner: RecordBatchLogScanner, admin: &FlussAdmin) -> Result<Self>`              | Read until the latest offsets at time of creation         |
+| `fn new_until_offsets(scanner: RecordBatchLogScanner, stopping_offsets: HashMap<TableBucket, i64>) -> Result<Self>` | Read until custom stopping offsets per bucket             |
+| `async fn next_batch(&mut self) -> Result<Option<ScanBatch>>`                                                | Get the next batch with bucket/offset metadata, or `None` when all buckets have caught up |
+| `async fn collect_all_batches(&mut self) -> Result<Vec<ScanBatch>>`                                          | Drain all batches (with metadata) until stopping offsets are satisfied |
+| `fn schema(&self) -> SchemaRef`                                                                              | Arrow schema for produced batches                        |
+| `fn to_record_batch_reader(self, handle: tokio::runtime::Handle) -> SyncRecordBatchLogReader`                | Sync adapter implementing `arrow::RecordBatchReader` (see below) |
+
+## `SyncRecordBatchLogReader`
+
+Synchronous adapter for `RecordBatchLogReader`. Created via
+`RecordBatchLogReader::to_record_batch_reader(handle)`.
+
+Implements both `Iterator` and `arrow::record_batch::RecordBatchReader`, so it
+plugs into the wider Arrow ecosystem — FFI, PyArrow's
+`pa.RecordBatchReader.from_batches`, the C++ Arrow `RecordBatchReader` interface,
+DataFusion sources, etc.
+
+Each `next()` call drives the underlying async reader via
+`tokio::runtime::Handle::block_on`. **Do not call from inside a Tokio worker
+thread that belongs to the same runtime** — nested `block_on` panics. Prefer
+`RecordBatchLogReader::next_batch` in async Rust code; use this adapter only at
+sync/FFI boundaries.
+
+Bucket and offset metadata carried by `ScanBatch` is **dropped** here, because
+the Arrow trait contract yields plain `RecordBatch`. If you need offsets or
+bucket identity per batch, use `next_batch` instead.
+
+| Method                                                          | Description                                      |
+|-----------------------------------------------------------------|--------------------------------------------------|
+| `fn next(&mut self) -> Option<Result<RecordBatch, ArrowError>>` | Iterator: next batch, or `None` when caught up   |
+| `fn schema(&self) -> SchemaRef`                                 | Arrow schema for produced batches                |
+
+## `LimitBatchScanner`
+
+One-shot bounded scanner from `TableScan::limit(n).create_bucket_batch_scanner(bucket)`.
+Poll it with `next_batch` until it returns `None` (mirrors `RecordBatchLogReader`).
+Supports both log and primary-key tables (the latter returns the current,
+server-deduplicated state); yields a single batch of at most `n` rows.
+
+| Method                                                        | Description                          |
+|---------------------------------------------------------------|--------------------------------------|
+| `async fn next_batch(&mut self) -> Result<Option<ScanBatch>>` | Rows on the first call, `None` after |
+| `async fn collect_all_batches(&mut self) -> Result<Vec<ScanBatch>>` | Drain all remaining batches          |
+| `fn bucket(&self) -> &TableBucket`                            | The scanned bucket                   |
+
+## `ScanRecord`
+
+| Method                                 | Description                            |
+|----------------------------------------|----------------------------------------|
+| `fn row(&self) -> &dyn InternalRow`    | Get the row data                       |
+| `fn offset(&self) -> i64`              | Record offset in the log               |
+| `fn timestamp(&self) -> i64`           | Record timestamp                       |
+| `fn change_type(&self) -> &ChangeType` | Change type (AppendOnly, Insert, etc.) |
+
+## `ScanRecords`
+
+| Method                                                                   | Description                       |
+|--------------------------------------------------------------------------|-----------------------------------|
+| `fn count(&self) -> usize`                                               | Number of records                 |
+| `fn is_empty(&self) -> bool`                                             | Whether the result set is empty   |
+| `fn records(&self, bucket: &TableBucket) -> &[ScanRecord]`               | Get records for a specific bucket |
+| `fn records_by_buckets(&self) -> &HashMap<TableBucket, Vec<ScanRecord>>` | Get all records grouped by bucket |
+
+`ScanRecords` also implements `IntoIterator`, so you can iterate over all records directly:
+
+```rust
+for record in records {
+    println!("offset={}", record.offset());
+}
+```
+
+## `ScanBatch`
+
+| Method                             | Description                    |
+|------------------------------------|--------------------------------|
+| `fn bucket(&self) -> &TableBucket` | Bucket this batch belongs to   |
+| `fn batch(&self) -> &RecordBatch`  | Arrow RecordBatch data         |
+| `fn base_offset(&self) -> i64`     | First record offset            |
+| `fn last_offset(&self) -> i64`     | Last record offset             |
+| `fn num_records(&self) -> usize`   | Number of records in the batch |
+
+## `TableUpsert`
+
+| Method                                                                                | Description                                       |
+|---------------------------------------------------------------------------------------|---------------------------------------------------|
+| `fn create_writer(&self) -> Result<UpsertWriter>`                                     | Create an upsert writer                           |
+| `fn partial_update(&self, column_indices: Option<Vec<usize>>) -> Result<TableUpsert>` | Create a partial update builder by column indices |
+| `fn partial_update_with_column_names(&self, names: &[&str]) -> Result<TableUpsert>`   | Create a partial update builder by column names   |
+
+## `UpsertWriter`
+
+| Method                                                                  | Description                           |
+|-------------------------------------------------------------------------|---------------------------------------|
+| `fn upsert(&self, row: &impl InternalRow) -> Result<WriteResultFuture>` | Upsert a row (insert or update by PK) |
+| `fn delete(&self, row: &impl InternalRow) -> Result<WriteResultFuture>` | Delete a row by primary key           |
+| `async fn flush(&self) -> Result<()>`                                   | Flush all pending operations          |
+
+## `TableLookup`
+
+| Method                                          |  Description                        |
+|-------------------------------------------------|-------------------------------------|
+| `fn create_lookuper(&self) -> Result<Lookuper>` | Create a lookuper for point lookups |
+
+## `Lookuper`
+
+| Method                                                                       |  Description                |
+|------------------------------------------------------------------------------|-----------------------------|
+| `async fn lookup(&mut self, key: &impl InternalRow) -> Result<LookupResult>` | Lookup a row by primary key |
+
+## `LookupResult`
+
+| Method                                                         |  Description                     |
+|----------------------------------------------------------------|----------------------------------|
+| `fn get_single_row(&self) -> Result<Option<impl InternalRow>>` | Get a single row from the result |
+| `fn get_rows(&self) -> Result<Vec<impl InternalRow>>`          | Get all rows from the result     |
+| `fn to_record_batch(&self) -> Result<RecordBatch>`             | Convert all rows to an Arrow `RecordBatch` for DataFusion or other Arrow-based tools    |
+
+## `WriteResultFuture`
+
+| Description                                                                                                                                   |
+|-----------------------------------------------------------------------------------------------------------------------------------------------|
+| Implements `Future<Output = Result<(), Error>>`. Await to wait for server acknowledgment. Returned by `append()`, `upsert()`, and `delete()`. |
+
+Usage:
+
+```rust
+// Fire-and-forget (batched)
+writer.append(&row)?;
+writer.flush().await?;
+
+// Per-record acknowledgment
+writer.append(&row)?.await?;
+```
+
+## `Schema`
+
+| Method                                         |  Description                             |
+|------------------------------------------------|------------------------------------------|
+| `fn builder() -> SchemaBuilder`                | Create a schema builder                  |
+| `fn columns(&self) -> &[Column]`               | Get all columns                          |
+| `fn primary_key(&self) -> Option<&PrimaryKey>` | Get primary key (None if no primary key) |
+| `fn column_names(&self) -> Vec<&str>`          | Get all column names                     |
+| `fn primary_key_indexes(&self) -> Vec<usize>`  | Get primary key column indices           |
+
+## `SchemaBuilder`
+
+| Method                                               |  Description            |
+|------------------------------------------------------|-------------------------|
+| `fn column(name: &str, data_type: DataType) -> Self` | Add a column            |
+| `fn primary_key(keys: Vec<&str>) -> Self`            | Set primary key columns |
+| `fn build() -> Result<Schema>`                       | Build the schema        |
+
+## `SchemaInfo`
+
+A schema together with its server-assigned version id. Returned by [`FlussAdmin::get_table_schema`](#flussadmin).
+
+| Method                                           | Description                              |
+|--------------------------------------------------|------------------------------------------|
+| `fn new(schema: Schema, schema_id: i32) -> Self` | Construct from a schema and id           |
+| `fn schema(&self) -> &Schema`                    | Borrow the schema                        |
+| `fn schema_id(&self) -> i32`                     | Get the server-assigned schema id        |
+| `fn into_parts(self) -> (Schema, i32)`           | Consume and return `(schema, schema_id)` |
+
+## `TableDescriptor`
+
+| Method                                                    | Description                          |
+|-----------------------------------------------------------|--------------------------------------|
+| `fn builder() -> TableDescriptorBuilder`                  | Create a table descriptor builder    |
+| `fn schema(&self) -> &Schema`                             | Get the table schema                 |
+| `fn partition_keys(&self) -> &[String]`                   | Get partition key column names       |
+| `fn has_primary_key(&self) -> bool`                       | Check if the table has a primary key |
+| `fn properties(&self) -> &HashMap<String, String>`        | Get all table properties             |
+| `fn custom_properties(&self) -> &HashMap<String, String>` | Get custom properties                |
+| `fn comment(&self) -> Option<&str>`                       | Get table comment                    |
+
+## `TableDescriptorBuilder`
+
+| Method                                                                                    | Description                                 |
+|-------------------------------------------------------------------------------------------|---------------------------------------------|
+| `fn schema(schema: Schema) -> Self`                                                       | Set the schema                              |
+| `fn log_format(format: LogFormat) -> Self`                                                | Set log format (e.g., `LogFormat::ARROW`)   |
+| `fn kv_format(format: KvFormat) -> Self`                                                  | Set KV format (e.g., `KvFormat::COMPACTED`) |
+| `fn property(key: &str, value: &str) -> Self`                                             | Set a table property                        |
+| `fn custom_property(key: impl Into<String>, value: impl Into<String>) -> Self`            | Set a single custom property                |
+| `fn custom_properties(properties: HashMap<impl Into<String>, impl Into<String>>) -> Self` | Set custom properties                       |
+| `fn partitioned_by(keys: Vec<&str>) -> Self`                                              | Set partition columns                       |
+| `fn distributed_by(bucket_count: Option<i32>, bucket_keys: Vec<String>) -> Self`          | Set bucket distribution                     |
+| `fn comment(comment: &str) -> Self`                                                       | Set table comment                           |
+| `fn build() -> Result<TableDescriptor>`                                                   | Build the table descriptor                  |
+
+## `TablePath`
+
+| Method                                                |  Description        |
+|-------------------------------------------------------|---------------------|
+| `TablePath::new(database: &str, table: &str) -> Self` | Create a table path |
+| `fn database(&self) -> &str`                          | Get database name   |
+| `fn table(&self) -> &str`                             | Get table name      |
+
+## `TableInfo`
+
+| Field / Method       | Description                                         |
+|----------------------|-----------------------------------------------------|
+| `.table_path`        | `TablePath` -- Table path                           |
+| `.table_id`          | `i64` -- Table ID                                   |
+| `.schema_id`         | `i32` -- Schema ID                                  |
+| `.schema`            | `Schema` -- Table schema                            |
+| `.primary_keys`      | `Vec<String>` -- Primary key column names           |
+| `.partition_keys`    | `Vec<String>` -- Partition key column names         |
+| `.num_buckets`       | `i32` -- Number of buckets                          |
+| `.properties`        | `HashMap<String, String>` -- All table properties   |
+| `.custom_properties` | `HashMap<String, String>` -- Custom properties only |
+| `.comment`           | `Option<String>` -- Table comment                   |
+| `.created_time`      | `i64` -- Creation timestamp                         |
+| `.modified_time`     | `i64` -- Last modification timestamp                |
+
+## `TableBucket`
+
+| Method                                                                                              | Description                                |
+|-----------------------------------------------------------------------------------------------------|--------------------------------------------|
+| `TableBucket::new(table_id: i64, bucket_id: i32) -> Self`                                           | Create a non-partitioned bucket            |
+| `TableBucket::new_with_partition(table_id: i64, partition_id: Option<i64>, bucket_id: i32) -> Self` | Create a partitioned bucket                |
+| `fn table_id(&self) -> i64`                                                                         | Get table ID                               |
+| `fn partition_id(&self) -> Option<i64>`                                                             | Get partition ID (None if non-partitioned) |
+| `fn bucket_id(&self) -> i32`                                                                        | Get bucket ID                              |
+
+## `PartitionSpec`
+
+| Method                                                      | Description                                           |
+|-------------------------------------------------------------|-------------------------------------------------------|
+| `PartitionSpec::new(spec_map: HashMap<&str, &str>) -> Self` | Create from a map of partition column names to values |
+| `fn get_spec_map(&self) -> &HashMap<String, String>`        | Get the partition spec map                            |
+
+## `PartitionInfo`
+
+| Method                                   |  Description       |
+|------------------------------------------|--------------------|
+| `fn get_partition_id(&self) -> i64`      | Get partition ID   |
+| `fn get_partition_name(&self) -> String` | Get partition name |
+
+## `DatabaseDescriptor`
+
+| Method                                                    | Description                          |
+|-----------------------------------------------------------|--------------------------------------|
+| `fn builder() -> DatabaseDescriptorBuilder`               | Create a database descriptor builder |
+| `fn comment(&self) -> Option<&str>`                       | Get database comment                 |
+| `fn custom_properties(&self) -> &HashMap<String, String>` | Get custom properties                |
+
+## `DatabaseDescriptorBuilder`
+
+| Method                                                                                    | Description                   |
+|-------------------------------------------------------------------------------------------|-------------------------------|
+| `fn comment(comment: impl Into<String>) -> Self`                                          | Set database comment          |
+| `fn custom_properties(properties: HashMap<impl Into<String>, impl Into<String>>) -> Self` | Set custom properties         |
+| `fn custom_property(key: impl Into<String>, value: impl Into<String>) -> Self`            | Set a single custom property  |
+| `fn build() -> DatabaseDescriptor`                                                        | Build the database descriptor |
+
+## `DatabaseInfo`
+
+| Method                                                 | Description                     |
+|--------------------------------------------------------|---------------------------------|
+| `fn database_name(&self) -> &str`                      | Get database name               |
+| `fn created_time(&self) -> i64`                        | Get creation timestamp          |
+| `fn modified_time(&self) -> i64`                       | Get last modification timestamp |
+| `fn database_descriptor(&self) -> &DatabaseDescriptor` | Get the database descriptor     |
+
+## `LakeSnapshot`
+
+| Field                   | Description                                       |
+|-------------------------|---------------------------------------------------|
+| `.snapshot_id`          | `i64` -- Snapshot ID                              |
+| `.table_buckets_offset` | `HashMap<TableBucket, i64>` -- All bucket offsets |
+
+## `GenericRow<'a>`
+
+| Method                                                             | Description                                      |
+|--------------------------------------------------------------------|--------------------------------------------------|
+| `GenericRow::new(field_count: usize) -> Self`                      | Create a new row with the given number of fields |
+| `fn set_field(&mut self, pos: usize, value: impl Into<Datum<'a>>)` | Set a field value by position                    |
+| `GenericRow::from_data(data: Vec<impl Into<Datum<'a>>>) -> Self`   | Create a row from existing field data            |
+
+Implements the `InternalRow` trait (see below).
+
+## `InternalRow` trait
+
+| Method                                                                                 | Description                             |
+|----------------------------------------------------------------------------------------|-----------------------------------------|
+| `fn is_null_at(&self, idx: usize) -> Result<bool>`                                     | Check if a field is null                |
+| `fn get_boolean(&self, idx: usize) -> Result<bool>`                                    | Get boolean value                       |
+| `fn get_byte(&self, idx: usize) -> Result<i8>`                                         | Get tinyint value                       |
+| `fn get_short(&self, idx: usize) -> Result<i16>`                                       | Get smallint value                      |
+| `fn get_int(&self, idx: usize) -> Result<i32>`                                         | Get int value                           |
+| `fn get_long(&self, idx: usize) -> Result<i64>`                                        | Get bigint value                        |
+| `fn get_float(&self, idx: usize) -> Result<f32>`                                       | Get float value                         |
+| `fn get_double(&self, idx: usize) -> Result<f64>`                                      | Get double value                        |
+| `fn get_string(&self, idx: usize) -> Result<&str>`                                     | Get string value                        |
+| `fn get_decimal(&self, idx: usize, precision: usize, scale: usize) -> Result<Decimal>` | Get decimal value                       |
+| `fn get_date(&self, idx: usize) -> Result<Date>`                                       | Get date value                          |
+| `fn get_time(&self, idx: usize) -> Result<Time>`                                       | Get time value                          |
+| `fn get_timestamp_ntz(&self, idx: usize, precision: u32) -> Result<TimestampNtz>`      | Get timestamp without timezone value    |
+| `fn get_timestamp_ltz(&self, idx: usize, precision: u32) -> Result<TimestampLtz>`      | Get timestamp with local timezone value |
+| `fn get_bytes(&self, idx: usize) -> Result<&[u8]>`                                     | Get bytes value                         |
+| `fn get_binary(&self, idx: usize, length: usize) -> Result<&[u8]>`                     | Get fixed-length binary value           |
+| `fn get_char(&self, idx: usize, length: usize) -> Result<&str>`                        | Get fixed-length char value             |
+| `fn get_array(&self, idx: usize) -> Result<FlussArray>`                                | Get array value                         |
+| `fn get_map(&self, idx: usize) -> Result<FlussMap>`                                    | Get map value                           |
+
+## `FlussArray`
+
+`FlussArray` is the Rust row representation for `ARRAY` values. You usually obtain it from `InternalRow::get_array()`.
+
+| Method | Description |
+|--------|-------------|
+| `fn size(&self) -> usize` | Number of elements in the array |
+| `fn is_null_at(&self, pos: usize) -> bool` | Check whether an element is null |
+| `fn as_bytes(&self) -> &[u8]` | Get encoded bytes of the array |
+
+Element getters mirror `InternalRow` typed getters and return `Result<T>`. For example, use `get_int()`, `get_long()`, and `get_double()` for primitive elements, and `get_string()`, `get_binary()`, `get_decimal()`, `get_timestamp_ntz()`, `get_timestamp_ltz()`, and `get_array()` for variable-length or nested elements.
+
+## `FlussMap`
+
+`FlussMap` is the Rust row representation for `MAP` values. You usually obtain it from `InternalRow::get_map()`.
+
+| Method | Description |
+|--------|-------------|
+| `fn size(&self) -> usize` | Number of entries in the map |
+| `fn as_bytes(&self) -> &[u8]` | Get encoded bytes of the map |
+| `fn key_type(&self) -> &DataType` | Schema-declared type of keys |
+| `fn value_type(&self) -> &DataType` | Schema-declared type of values |
+| `fn entries(&self) -> Entries<'_>` | Iterator yielding `Result<(Datum, Datum)>` pairs |
+| `fn get(&self, key: &Datum) -> Result<Option<Datum>>` | Linear-scan lookup by key (`O(n)`) |
+| `fn key_array(&self) -> &FlussArray` | Parallel keys array (zero-copy view) |
+| `fn value_array(&self) -> &FlussArray` | Parallel values array (zero-copy view) |
+
+Most user code should prefer `entries()` (iteration) and `get()` (lookup). The `key_array()` / `value_array()` views are for serdes and Arrow-adapter code that needs zero-copy access to the underlying parallel-array layout.
+
+## `FlussMapWriter`
+
+`FlussMapWriter` builds a `FlussMap` for write paths.
+
+| Method | Description |
+|--------|-------------|
+| `fn new(capacity: usize, key_type: &DataType, value_type: &DataType) -> Self` | Create a writer sized for `capacity` entries |
+| `fn write_entry(&mut self, key: Datum, value: Datum) -> Result<()>` | Append a single entry; rejects null keys and type mismatches |
+| `fn extend<I, K, V>(&mut self, entries: I) -> Result<()>` | Append every pair from `entries: IntoIterator<Item = (K, V)>` |
+| `fn complete(self) -> Result<FlussMap>` | Finalize the writer and produce the `FlussMap` |
+
+## `ChangeType`
+
+| Value                      | Short String  | Description                      |
+|----------------------------|---------------|----------------------------------|
+| `ChangeType::AppendOnly`   | `+A`          | Append-only record               |
+| `ChangeType::Insert`       | `+I`          | Inserted row                     |
+| `ChangeType::UpdateBefore` | `-U`          | Previous value of an updated row |
+| `ChangeType::UpdateAfter`  | `+U`          | New value of an updated row      |
+| `ChangeType::Delete`       | `-D`          | Deleted row                      |
+
+| Method                           | Description                         |
+|----------------------------------|-------------------------------------|
+| `fn short_string(&self) -> &str` | Get the short string representation |
+
+## `OffsetSpec`
+
+| Variant                      | Description                                     |
+|------------------------------|-------------------------------------------------|
+| `OffsetSpec::Earliest`       | Start from the earliest available offset        |
+| `OffsetSpec::Latest`         | Start from the latest offset (only new records) |
+| `OffsetSpec::Timestamp(i64)` | Start from a specific timestamp in milliseconds |
+
+## Constants
+
+| Constant                         | Value  | Description                                             |
+|----------------------------------|--------|---------------------------------------------------------|
+| `fluss::client::EARLIEST_OFFSET` | `-2`   | Start reading from the earliest available offset        |
+
+To start reading from the latest offset (only new records), resolve the current offset via `list_offsets` before subscribing:
+
+```rust
+use fluss::rpc::message::OffsetSpec;
+
+let offsets = admin.list_offsets(&table_path, &[0], OffsetSpec::Latest).await?;
+let latest = offsets[&0];
+log_scanner.subscribe(0, latest).await?;
+```
+
+## `DataTypes` factory
+
+| Method                                           | Returns    | Description                        |
+|--------------------------------------------------|------------|------------------------------------|
+| `DataTypes::boolean()`                           | `DataType` | Boolean type                       |
+| `DataTypes::tinyint()`                           | `DataType` | 8-bit signed integer               |
+| `DataTypes::smallint()`                          | `DataType` | 16-bit signed integer              |
+| `DataTypes::int()`                               | `DataType` | 32-bit signed integer              |
+| `DataTypes::bigint()`                            | `DataType` | 64-bit signed integer              |
+| `DataTypes::float()`                             | `DataType` | 32-bit floating point              |
+| `DataTypes::double()`                            | `DataType` | 64-bit floating point              |
+| `DataTypes::string()`                            | `DataType` | Variable-length string             |
+| `DataTypes::bytes()`                             | `DataType` | Variable-length byte array         |
+| `DataTypes::date()`                              | `DataType` | Date (days since epoch)            |
+| `DataTypes::time()`                              | `DataType` | Time (milliseconds since midnight) |
+| `DataTypes::timestamp()`                         | `DataType` | Timestamp without timezone         |
+| `DataTypes::timestamp_ltz()`                     | `DataType` | Timestamp with local timezone      |
+| `DataTypes::decimal(precision: u32, scale: u32)` | `DataType` | Fixed-point decimal                |
+| `DataTypes::char(length: u32)`                   | `DataType` | Fixed-length string                |
+| `DataTypes::binary(length: usize)`               | `DataType` | Fixed-length byte array            |
+| `DataTypes::array(element: DataType)`            | `DataType` | Array of elements                  |
+| `DataTypes::map(key: DataType, value: DataType)` | `DataType` | Map of key-value pairs             |
+| `DataTypes::row(fields: Vec<DataField>)`         | `DataType` | Nested row type                    |
+
+## `DataField`
+
+| Method                                                                                                   | Description         |
+|----------------------------------------------------------------------------------------------------------|---------------------|
+| `DataField::new(name: impl Into<String>, data_type: DataType, description: Option<String>) -> DataField` | Create a data field |
+| `fn name(&self) -> &str`                                                                                 | Get the field name  |
diff --git a/fluss-rust/website/docs/user-guide/rust/data-types.md b/fluss-rust/website/docs/user-guide/rust/data-types.md
new file mode 100644
index 0000000000..5418839184
--- /dev/null
+++ b/fluss-rust/website/docs/user-guide/rust/data-types.md
@@ -0,0 +1,179 @@
+---
+sidebar_position: 3
+---
+# Data Types
+
+| Fluss Type      | Rust Type      | Getter                               | Setter                         |
+|-----------------|----------------|--------------------------------------|--------------------------------|
+| `BOOLEAN`       | `bool`         | `get_boolean()`                      | `set_field(idx, bool)`         |
+| `TINYINT`       | `i8`           | `get_byte()`                         | `set_field(idx, i8)`           |
+| `SMALLINT`      | `i16`          | `get_short()`                        | `set_field(idx, i16)`          |
+| `INT`           | `i32`          | `get_int()`                          | `set_field(idx, i32)`          |
+| `BIGINT`        | `i64`          | `get_long()`                         | `set_field(idx, i64)`          |
+| `FLOAT`         | `f32`          | `get_float()`                        | `set_field(idx, f32)`          |
+| `DOUBLE`        | `f64`          | `get_double()`                       | `set_field(idx, f64)`          |
+| `CHAR`          | `&str`         | `get_char(idx, length)`              | `set_field(idx, &str)`         |
+| `STRING`        | `&str`         | `get_string()`                       | `set_field(idx, &str)`         |
+| `DECIMAL`       | `Decimal`      | `get_decimal(idx, precision, scale)` | `set_field(idx, Decimal)`      |
+| `DATE`          | `Date`         | `get_date()`                         | `set_field(idx, Date)`         |
+| `TIME`          | `Time`         | `get_time()`                         | `set_field(idx, Time)`         |
+| `TIMESTAMP`     | `TimestampNtz` | `get_timestamp_ntz(idx, precision)`  | `set_field(idx, TimestampNtz)` |
+| `TIMESTAMP_LTZ` | `TimestampLtz` | `get_timestamp_ltz(idx, precision)`  | `set_field(idx, TimestampLtz)` |
+| `BYTES`         | `&[u8]`        | `get_bytes()`                        | `set_field(idx, &[u8])`        |
+| `BINARY(n)`     | `&[u8]`        | `get_binary(idx, length)`            | `set_field(idx, &[u8])`        |
+| `ARRAY<T>`      | `FlussArray`   | `get_array()`                        | `set_field(idx, FlussArray)`   |
+| `MAP<K, V>`     | `FlussMap`     | `get_map()`                          | `set_field(idx, FlussMap)`     |
+
+## Constructing Special Types
+
+Primitive types (`bool`, `i8`, `i16`, `i32`, `i64`, `f32`, `f64`, `&str`, `&[u8]`) can be passed directly to `set_field`. The following types require explicit construction:
+
+```rust
+use fluss::row::{Date, Time, TimestampNtz, TimestampLtz, Decimal};
+
+// Date: days since Unix epoch
+let date = Date::new(19738);
+
+// Time: milliseconds since midnight
+let time = Time::new(43200000);
+
+// Timestamp without timezone: milliseconds since epoch
+// DataTypes::timestamp() defaults to precision 6 (microseconds).
+// Use DataTypes::timestamp_with_precision(p) for a different precision (0–9).
+let ts = TimestampNtz::new(1704067200000);
+
+// Timestamp with local timezone: milliseconds since epoch
+// DataTypes::timestamp_ltz() also defaults to precision 6.
+let ts_ltz = TimestampLtz::new(1704067200000);
+
+// Decimal: from an unscaled long value with precision and scale
+let decimal = Decimal::from_unscaled_long(12345, 10, 2)?; // represents 123.45
+```
+
+## Creating Rows from Data
+
+`GenericRow::from_data` accepts a `Vec` of values that convert into `Datum`. Because `.into()` has many possible target types (for example, many types implement `From<&str>`), Rust cannot infer `Datum` from `.into()` alone. Annotate the vector type explicitly:
+
+```rust
+use fluss::row::{Datum, GenericRow};
+
+let data: Vec<Datum> = vec![1i32.into(), "hello".into(), Datum::Null];
+let row = GenericRow::from_data(data);
+```
+
+## Arrays
+
+Use `DataTypes::array(element_type)` in schema definitions. At runtime, read arrays with `row.get_array(idx)?`.
+
+To construct array values for writes, build a `FlussArray` and wrap it with `Datum::Array`:
+
+```rust
+use fluss::metadata::DataTypes;
+use fluss::row::binary_array::FlussArrayWriter;
+use fluss::row::{Datum, GenericRow};
+
+let mut writer = FlussArrayWriter::new(3, &DataTypes::int());
+writer.write_int(0, 10);
+writer.write_int(1, 20);
+writer.set_null_at(2);
+let arr = writer.complete()?;
+
+let mut row = GenericRow::new(1);
+row.set_field(0, Datum::Array(arr));
+```
+
+`ARRAY` is supported for row values and nested row fields. For key encoding, the Rust client matches the Java client's behavior: `ARRAY` can be encoded by the compacted key encoder, while table-level key constraints are validated by the server (which may reject unsupported key types).
+
+## Maps
+
+Use `DataTypes::map(key_type, value_type)` in schema definitions. At runtime, read maps with `row.get_map(idx)?` — the row knows its schema, so no extra type arguments are needed.
+
+### Writing
+
+Build a `FlussMap` entry-by-entry, then wrap it with `Datum::Map`:
+
+```rust
+use fluss::metadata::DataTypes;
+use fluss::row::binary_map::FlussMapWriter;
+use fluss::row::{Datum, GenericRow};
+
+let mut writer = FlussMapWriter::new(2, &DataTypes::string(), &DataTypes::int());
+writer.write_entry("key1".into(), 100.into())?;
+writer.write_entry("key2".into(), Datum::Null)?;
+let map = writer.complete()?;
+
+let mut row = GenericRow::new(1);
+row.set_field(0, Datum::Map(map));
+```
+
+For bulk writes from any iterator of `(key, value)` pairs (including a `HashMap`), use `extend`:
+
+```rust
+use std::collections::HashMap;
+
+let entries: HashMap<&str, i32> = HashMap::from([("a", 1), ("b", 2)]);
+let mut writer = FlussMapWriter::new(entries.len(), &DataTypes::string(), &DataTypes::int());
+writer.extend(entries)?;
+let map = writer.complete()?;
+```
+
+### Reading
+
+The `entries()` iterator yields each entry as a `Result` wrapping a `(key, value)` pair of schema-typed `Datum`s. A null value is returned as `Datum::Null`, so no separate null check is needed:
+
+```rust
+use fluss::row::InternalRow;
+
+let m = row.get_map(0)?;
+for entry in m.entries() {
+    let (k, v) = entry?;
+    println!("{k:?} => {v:?}");          // Datum's Debug handles null
+}
+```
+
+For point lookups, `get(&key)` does a linear scan and returns `Result<Option<Datum>>` (`Ok(None)` when the key is absent):
+
+```rust
+use fluss::row::Datum;
+
+if let Some(v) = m.get(&Datum::from("attr_size"))? {
+    println!("size = {v:?}");
+}
+```
+
+Lookup is `O(n)` — the binary MAP layout has no key index. If you need repeated lookups against the same map, collect the entries once:
+
+```rust
+use std::collections::HashMap;
+
+let snapshot: HashMap<String, Datum<'_>> = m
+    .entries()
+    .map(|e| e.map(|(k, v)| (format!("{k:?}"), v)))
+    .collect::<Result<_, _>>()?;
+```
+
+For raw access to the underlying parallel-array representation (zero-copy, used by serdes / Arrow adapters), `m.key_array()` and `m.value_array()` are also available.
+
+### Constraints
+
+`MAP` keys cannot be null. `MAP` is supported for row values and nested row fields. `MAP` cannot be used as a primary key or bucket key column — the Rust client rejects it at the compacted key encoder, and the Fluss server bans `MAP` (along with `ARRAY` and `ROW`) from key columns.
+
+## Reading Row Data
+
+```rust
+use fluss::row::{Date, Decimal, InternalRow, TimestampNtz};
+
+for record in scan_records {
+    let row = record.row();
+
+    if row.is_null_at(0)? {
+        // field is null
+    }
+    let id: i32 = row.get_int(0)?;
+    let name: &str = row.get_string(1)?;
+    let score: f32 = row.get_float(2)?;
+    let date: Date = row.get_date(3)?;
+    let ts: TimestampNtz = row.get_timestamp_ntz(4, 6)?;
+    let decimal: Decimal = row.get_decimal(5, 10, 2)?;
+}
+```
diff --git a/fluss-rust/website/docs/user-guide/rust/error-handling.md b/fluss-rust/website/docs/user-guide/rust/error-handling.md
new file mode 100644
index 0000000000..4966428997
--- /dev/null
+++ b/fluss-rust/website/docs/user-guide/rust/error-handling.md
@@ -0,0 +1,241 @@
+---
+sidebar_position: 4
+---
+# Error Handling
+
+The Fluss Rust client uses a unified `Error` type and a `Result<T>` alias for all fallible operations.
+
+## Basic Usage
+
+```rust
+use fluss::error::{Error, Result};
+
+// All operations return Result<T>
+let conn = FlussConnection::new(config).await?;
+let admin = conn.get_admin()?;
+let table = conn.get_table(&table_path).await?;
+```
+
+Use the `?` operator to propagate errors, or `match` on specific variants for fine-grained handling.
+
+## Matching Error Variants
+
+```rust
+use fluss::error::Error;
+
+match result {
+    Ok(val) => {
+        // handle success
+    }
+    Err(Error::RpcError { message, .. }) => {
+        eprintln!("RPC failure: {}", message);
+    }
+    Err(Error::UnsupportedOperation { message }) => {
+        eprintln!("Unsupported: {}", message);
+    }
+    Err(Error::FlussAPIError { api_error }) => {
+        eprintln!("Server error: {}", api_error);
+    }
+    Err(e) => {
+        eprintln!("Unexpected error: {}", e);
+    }
+}
+```
+
+## Error Variants
+
+| Variant                        | Description                                                  |
+|--------------------------------|--------------------------------------------------------------|
+| `UnexpectedError`              | General unexpected errors with a message and optional source |
+| `IoUnexpectedError`            | I/O errors (network, file system)                            |
+| `RemoteStorageUnexpectedError` | Remote storage errors (OpenDAL backend failures)             |
+| `RpcError`                     | RPC communication failures (connection refused, timeout)     |
+| `RowConvertError`              | Row conversion failures (type mismatch, invalid data)        |
+| `ArrowError`                   | Arrow data handling errors (schema mismatch, encoding)       |
+| `IllegalArgument`              | Invalid arguments passed to an API method                    |
+| `UnsupportedOperation`         | Operation not supported on the table type                    |
+| `FlussAPIError`                | Server-side API errors returned by the Fluss cluster         |
+
+Server-side errors are represented as `FlussAPIError` with a specific error code. Use the `api_error()` helper to match them ergonomically:
+
+```rust
+use fluss::error::FlussError;
+
+match result {
+    Err(ref e) if e.api_error() == Some(FlussError::InvalidTableException) => {
+        eprintln!("Invalid table: {}", e);
+    }
+    Err(ref e) if e.api_error() == Some(FlussError::PartitionNotExists) => {
+        eprintln!("Partition does not exist: {}", e);
+    }
+    Err(ref e) if e.api_error() == Some(FlussError::LeaderNotAvailableException) => {
+        eprintln!("Leader not available: {}", e);
+    }
+    Err(ref e) if e.api_error() == Some(FlussError::AuthenticateException) => {
+        eprintln!("Authentication failed: {}", e);
+    }
+    _ => {}
+}
+```
+
+## Retry Logic
+
+Some errors are transient: the server may be temporarily unavailable, mid-election, or under load. Use `is_retriable()` to decide whether to retry an operation instead of treating the error as permanent.
+
+`Error::is_retriable()` is available directly on any `Error` value. `RpcError` is always retriable; `FlussAPIError` delegates to the server error code; all other variants return `false`.
+
+```rust
+use fluss::error::Error;
+
+match writer.append(&row) {
+    Ok(_) => {}
+    Err(ref e) if e.is_retriable() => {
+        // Transient failure — safe to retry
+    }
+    Err(e) => {
+        // Permanent failure — log and abort
+        eprintln!("Fatal error: {}", e);
+    }
+}
+```
+
+### Retriable Variants
+
+| Variant / Error                              | Code | Reason                                    |
+|----------------------------------------------|------|-------------------------------------------|
+| `Error::RpcError`                            | —    | Network-level failure, always retriable   |
+| `FlussError::NetworkException`               | 1    | Server disconnected                       |
+| `FlussError::CorruptMessage`                 | 3    | CRC or size error                         |
+| `FlussError::SchemaNotExist`                 | 9    | Schema may not exist                      |
+| `FlussError::LogStorageException`            | 10   | Transient log storage error               |
+| `FlussError::KvStorageException`             | 11   | Transient KV storage error                |
+| `FlussError::NotLeaderOrFollower`            | 12   | Leader election in progress               |
+| `FlussError::CorruptRecordException`         | 14   | Corrupt record                            |
+| `FlussError::UnknownTableOrBucketException`  | 21   | Metadata not yet available                |
+| `FlussError::RequestTimeOut`                 | 25   | Request timed out                         |
+| `FlussError::StorageException`               | 26   | Transient storage error                   |
+| `FlussError::NotEnoughReplicasAfterAppendException` | 28 | Wrote to server but with low ISR size |
+| `FlussError::NotEnoughReplicasException`     | 29   | Low ISR size at write time                |
+| `FlussError::LeaderNotAvailableException`    | 44   | No leader available for partition         |
+
+All other `Error` variants (e.g. `RowConvertError`, `IllegalArgument`, `UnsupportedOperation`) always return `false` from `is_retriable()`.
+
+## Common Error Scenarios
+
+### Connection Refused
+
+The Fluss cluster is not running or the address is incorrect.
+
+```rust
+let result = FlussConnection::new(config).await;
+match result {
+    Err(Error::RpcError { message, .. }) => {
+        eprintln!("Cannot connect to cluster: {}", message);
+    }
+    _ => {}
+}
+```
+
+### Table Not Found
+
+The table does not exist or has been dropped.
+
+```rust
+use fluss::error::{Error, FlussError};
+
+// Admin operations return FlussError::TableNotExist (code 7)
+let result = admin.drop_table(&table_path, false).await;
+match result {
+    Err(ref e) if e.api_error() == Some(FlussError::TableNotExist) => {
+        eprintln!("Table not found: {}", e);
+    }
+    _ => {}
+}
+
+// conn.get_table() wraps the error differently; match on FlussAPIError directly
+let result = conn.get_table(&table_path).await;
+match result {
+    Err(Error::FlussAPIError { ref api_error }) => {
+        eprintln!("Server error (code {}): {}", api_error.code, api_error.message);
+    }
+    _ => {}
+}
+```
+
+### Partition Not Found
+
+The partition does not exist on a partitioned table.
+
+```rust
+use fluss::error::FlussError;
+
+let result = admin.drop_partition(&table_path, &spec, false).await;
+match result {
+    Err(ref e) if e.api_error() == Some(FlussError::PartitionNotExists) => {
+        eprintln!("Partition does not exist: {}", e);
+    }
+    _ => {}
+}
+```
+
+### Authentication Failed
+
+SASL credentials are incorrect or the user does not exist.
+
+```rust
+use fluss::error::{Error, FlussError};
+
+let result = FlussConnection::new(config).await;
+match result {
+    Err(ref e) if e.api_error() == Some(FlussError::AuthenticateException) => {
+        eprintln!("Authentication failed: {}", e);
+    }
+    _ => {}
+}
+```
+
+### Schema Mismatch
+
+Row data does not match the expected table schema.
+
+```rust
+let result = writer.append(&row);
+match result {
+    Err(Error::RowConvertError { .. }) => {
+        eprintln!("Row does not match table schema");
+    }
+    _ => {}
+}
+```
+
+## Using `Result<T>` in Application Code
+
+The `fluss::error::Result<T>` type alias makes it easy to use Fluss errors with the `?` operator in your application functions:
+
+```rust
+use fluss::error::Result;
+
+async fn my_pipeline() -> Result<()> {
+    let conn = FlussConnection::new(config).await?;
+    let admin = conn.get_admin()?;
+    let table = conn.get_table(&table_path).await?;
+    let writer = table.new_append()?.create_writer()?;
+    writer.append(&row)?;
+    writer.flush().await?;
+    Ok(())
+}
+```
+
+For applications that use other error types alongside Fluss errors, you can convert with standard `From` / `Into` traits or use crates like `anyhow`:
+
+```rust
+use anyhow::Result;
+
+#[tokio::main]
+async fn main() -> Result<()> {
+    let conn = FlussConnection::new(config).await?;
+    // fluss::error::Error implements std::error::Error,
+    // so it converts into anyhow::Error automatically
+    Ok(())
+}
+```
diff --git a/fluss-rust/website/docs/user-guide/rust/example/_category_.json b/fluss-rust/website/docs/user-guide/rust/example/_category_.json
new file mode 100644
index 0000000000..4d81ec12ae
--- /dev/null
+++ b/fluss-rust/website/docs/user-guide/rust/example/_category_.json
@@ -0,0 +1,4 @@
+{
+  "label": "Examples",
+  "position": 5
+}
diff --git a/fluss-rust/website/docs/user-guide/rust/example/admin-operations.md b/fluss-rust/website/docs/user-guide/rust/example/admin-operations.md
new file mode 100644
index 0000000000..39752754f1
--- /dev/null
+++ b/fluss-rust/website/docs/user-guide/rust/example/admin-operations.md
@@ -0,0 +1,122 @@
+---
+sidebar_position: 3
+---
+# Admin Operations
+
+## Get Admin Interface
+
+```rust
+let admin = conn.get_admin()?;
+```
+
+## Database Operations
+
+```rust
+// Create database
+admin.create_database("my_database", None, true).await?;
+
+// List all databases
+let databases = admin.list_databases().await?;
+println!("Databases: {:?}", databases);
+
+// Check if database exists
+let exists = admin.database_exists("my_database").await?;
+
+// Get database information
+let db_info = admin.get_database_info("my_database").await?;
+
+// Drop database
+admin.drop_database("my_database", true, false).await?;
+```
+
+## Table Operations
+
+```rust
+use fluss::metadata::{DataTypes, Schema, TableDescriptor, TablePath};
+
+let table_descriptor = TableDescriptor::builder()
+    .schema(
+        Schema::builder()
+            .column("id", DataTypes::int())
+            .column("name", DataTypes::string())
+            .column("amount", DataTypes::bigint())
+            .build()?,
+    )
+    .build()?;
+
+let table_path = TablePath::new("my_database", "my_table");
+
+// Create table
+admin.create_table(&table_path, &table_descriptor, true).await?;
+
+// Get table information
+let table_info = admin.get_table_info(&table_path).await?;
+println!("Table: {}", table_info);
+
+// List tables in database
+let tables = admin.list_tables("my_database").await?;
+
+// Check if table exists
+let exists = admin.table_exists(&table_path).await?;
+
+// Drop table
+admin.drop_table(&table_path, true).await?;
+```
+
+## Partition Operations
+
+```rust
+use fluss::metadata::PartitionSpec;
+use std::collections::HashMap;
+
+// List all partitions
+let partitions = admin.list_partition_infos(&table_path).await?;
+
+// List partitions matching a spec
+let mut filter = HashMap::new();
+filter.insert("year", "2024");
+let spec = PartitionSpec::new(filter);
+let partitions = admin.list_partition_infos_with_spec(&table_path, Some(&spec)).await?;
+
+// Create partition
+admin.create_partition(&table_path, &spec, true).await?;
+
+// Drop partition
+admin.drop_partition(&table_path, &spec, true).await?;
+```
+
+## Offset Operations
+
+```rust
+use fluss::rpc::message::OffsetSpec;
+
+let bucket_ids = vec![0, 1, 2];
+
+// Get earliest offsets
+let earliest = admin.list_offsets(&table_path, &bucket_ids, OffsetSpec::Earliest).await?;
+
+// Get latest offsets
+let latest = admin.list_offsets(&table_path, &bucket_ids, OffsetSpec::Latest).await?;
+
+// Get offsets for a specific timestamp
+let timestamp_ms = 1704067200000; // 2024-01-01 00:00:00 UTC
+let offsets = admin.list_offsets(
+    &table_path, &bucket_ids, OffsetSpec::Timestamp(timestamp_ms),
+).await?;
+
+// Get offsets for a specific partition
+let partition_offsets = admin.list_partition_offsets(
+    &table_path, "partition_name", &bucket_ids, OffsetSpec::Latest,
+).await?;
+```
+
+## Lake Snapshot
+
+:::note
+Lake snapshots require [lake integration](https://fluss.apache.org/docs/maintenance/tiered-storage/overview/) (e.g. Paimon or Iceberg) to be enabled on the server. Without it, `get_latest_lake_snapshot` will return an error.
+:::
+
+```rust
+let snapshot = admin.get_latest_lake_snapshot(&table_path).await?;
+println!("Snapshot ID: {}", snapshot.snapshot_id);
+```
diff --git a/fluss-rust/website/docs/user-guide/rust/example/configuration.md b/fluss-rust/website/docs/user-guide/rust/example/configuration.md
new file mode 100644
index 0000000000..eba38d85f2
--- /dev/null
+++ b/fluss-rust/website/docs/user-guide/rust/example/configuration.md
@@ -0,0 +1,35 @@
+---
+sidebar_position: 2
+---
+# Configuration
+
+## Connection Setup
+
+```rust
+use fluss::client::FlussConnection;
+use fluss::config::Config;
+
+let mut config = Config::default();
+config.bootstrap_servers = "127.0.0.1:9123".to_string();
+
+let conn = FlussConnection::new(config).await?;
+```
+
+## Connection Configurations
+
+See the [`Config`](../api-reference.md#config) section in the API Reference for the full list of configuration options, types, and defaults.
+
+## SASL Authentication
+
+To connect to a Fluss cluster with SASL/PLAIN authentication enabled:
+
+```rust
+let mut config = Config::default();
+config.bootstrap_servers = "127.0.0.1:9123".to_string();
+config.security_protocol = "sasl".to_string();
+config.security_sasl_mechanism = "PLAIN".to_string();
+config.security_sasl_username = "admin".to_string();
+config.security_sasl_password = "admin-secret".to_string();
+
+let conn = FlussConnection::new(config).await?;
+```
diff --git a/fluss-rust/website/docs/user-guide/rust/example/index.md b/fluss-rust/website/docs/user-guide/rust/example/index.md
new file mode 100644
index 0000000000..f1d5a6882d
--- /dev/null
+++ b/fluss-rust/website/docs/user-guide/rust/example/index.md
@@ -0,0 +1,56 @@
+---
+sidebar_position: 1
+---
+# Example
+
+A minimal working example: connect to Fluss, create a table, write data, and read it back.
+
+```rust
+use fluss::client::FlussConnection;
+use fluss::config::Config;
+use fluss::error::Result;
+use fluss::metadata::{DataTypes, Schema, TableDescriptor, TablePath};
+use fluss::row::{GenericRow, InternalRow};
+use std::time::Duration;
+
+#[tokio::main]
+async fn main() -> Result<()> {
+    // Connect
+    let mut config = Config::default();
+    config.bootstrap_servers = "127.0.0.1:9123".to_string();
+    let conn = FlussConnection::new(config).await?;
+    let admin = conn.get_admin()?;
+
+    // Create a log table
+    let table_path = TablePath::new("fluss", "quickstart_rust");
+    let descriptor = TableDescriptor::builder()
+        .schema(
+            Schema::builder()
+                .column("id", DataTypes::int())
+                .column("name", DataTypes::string())
+                .build()?,
+        )
+        .build()?;
+    admin.create_table(&table_path, &descriptor, true).await?;
+
+    // Write
+    let table = conn.get_table(&table_path).await?;
+    let writer = table.new_append()?.create_writer()?;
+    let mut row = GenericRow::new(2);
+    row.set_field(0, 1);
+    row.set_field(1, "hello");
+    writer.append(&row)?;
+    writer.flush().await?;
+
+    // Read
+    let scanner = table.new_scan().create_log_scanner()?;
+    scanner.subscribe(0, 0).await?;
+    let records = scanner.poll(Duration::from_secs(5)).await?;
+    for record in records {
+        let row = record.row();
+        println!("id={}, name={}", row.get_int(0)?, row.get_string(1)?);
+    }
+
+    Ok(())
+}
+```
diff --git a/fluss-rust/website/docs/user-guide/rust/example/log-tables.md b/fluss-rust/website/docs/user-guide/rust/example/log-tables.md
new file mode 100644
index 0000000000..e77c8c6c43
--- /dev/null
+++ b/fluss-rust/website/docs/user-guide/rust/example/log-tables.md
@@ -0,0 +1,172 @@
+---
+sidebar_position: 4
+---
+# Log Tables
+
+Log tables are append-only tables without primary keys, suitable for event streaming.
+
+## Creating a Log Table
+
+```rust
+use fluss::metadata::{DataTypes, Schema, TableDescriptor, TablePath};
+
+let table_descriptor = TableDescriptor::builder()
+    .schema(
+        Schema::builder()
+            .column("event_id", DataTypes::int())
+            .column("event_type", DataTypes::string())
+            .column("timestamp", DataTypes::bigint())
+            .build()?,
+    )
+    .build()?;
+
+let table_path = TablePath::new("fluss", "events");
+admin.create_table(&table_path, &table_descriptor, true).await?;
+```
+
+## Writing to Log Tables
+
+```rust
+use fluss::row::{GenericRow, InternalRow};
+
+let table = conn.get_table(&table_path).await?;
+let append_writer = table.new_append()?.create_writer()?;
+
+let mut row = GenericRow::new(3);
+row.set_field(0, 1);                    // event_id
+row.set_field(1, "user_login");         // event_type
+row.set_field(2, 1704067200000i64);     // timestamp
+
+append_writer.append(&row)?;
+append_writer.flush().await?;
+```
+
+Write operations use a **fire-and-forget** pattern for efficient batching. Each call queues the write and returns a `WriteResultFuture` immediately. Call `flush()` to ensure all queued writes are sent to the server.
+
+For per-record acknowledgment:
+
+```rust
+append_writer.append(&row)?.await?;
+```
+
+## Reading from Log Tables
+
+```rust
+use std::time::Duration;
+
+let table = conn.get_table(&table_path).await?;
+let log_scanner = table.new_scan().create_log_scanner()?;
+
+// Subscribe to bucket 0 starting from offset 0
+log_scanner.subscribe(0, 0).await?;
+
+// Poll for records
+let records = log_scanner.poll(Duration::from_secs(10)).await?;
+
+// Per-bucket access
+for (bucket, bucket_records) in records.records_by_buckets() {
+    println!("Bucket {}: {} records", bucket.bucket_id(), bucket_records.len());
+    for record in bucket_records {
+        let row = record.row();
+        println!(
+            "  event_id={}, event_type={} @ offset={}",
+            row.get_int(0)?,
+            row.get_string(1)?,
+            record.offset()
+        );
+    }
+}
+
+// Or flat iteration (consumes ScanRecords)
+for record in records {
+    let row = record.row();
+    println!(
+        "event_id={}, event_type={}, timestamp={} @ offset={}",
+        row.get_int(0)?,
+        row.get_string(1)?,
+        row.get_long(2)?,
+        record.offset()
+    );
+}
+```
+
+**Subscribe from special offsets:**
+
+```rust
+use fluss::client::EARLIEST_OFFSET;
+
+log_scanner.subscribe(0, EARLIEST_OFFSET).await?;  // from earliest
+log_scanner.subscribe(0, 42).await?;                // from specific offset
+```
+
+**Subscribe from latest offset (only new records):**
+
+To start reading only new records, first resolve the current latest offset via `list_offsets`, then subscribe at that offset:
+
+```rust
+use fluss::rpc::message::OffsetSpec;
+
+let admin = conn.get_admin()?;
+let offsets = admin.list_offsets(&table_path, &[0], OffsetSpec::Latest).await?;
+let latest = offsets[&0];
+log_scanner.subscribe(0, latest).await?;
+```
+
+**Subscribe to all buckets:**
+
+```rust
+let num_buckets = table.get_table_info().get_num_buckets();
+for bucket_id in 0..num_buckets {
+    log_scanner.subscribe(bucket_id, 0).await?;
+}
+```
+
+**Subscribe to multiple buckets at once:**
+
+```rust
+use std::collections::HashMap;
+
+let mut bucket_offsets = HashMap::new();
+bucket_offsets.insert(0, 0i64);
+bucket_offsets.insert(1, 100i64);
+log_scanner.subscribe_buckets(&bucket_offsets).await?;
+```
+
+**Unsubscribe from a bucket:**
+
+```rust
+// Non-partitioned tables
+log_scanner.unsubscribe(bucket_id).await?;
+
+// Partitioned tables
+log_scanner.unsubscribe_partition(partition_id, bucket_id).await?;
+```
+
+## Column Projection
+
+```rust
+// Project by column index
+let scanner = table.new_scan().project(&[0, 2])?.create_log_scanner()?;
+
+// Project by column name
+let scanner = table.new_scan()
+    .project_by_name(&["event_id", "timestamp"])?
+    .create_log_scanner()?;
+```
+
+## Limit Scan
+
+For a bounded read of up to `n` rows from a single bucket, use a batch scanner
+instead of subscribing. It issues one request; poll it with `next_batch` until
+it returns `None`.
+
+```rust
+let bucket = TableBucket::new(table.get_table_info().table_id, 0);
+let mut scanner = table.new_scan().limit(10)?.create_bucket_batch_scanner(bucket)?;
+
+while let Some(batch) = scanner.next_batch().await? {
+    println!("rows: {}", batch.batch().num_rows());
+}
+```
+
+The limit applies per bucket; to cover a multi-bucket table, scan each bucket separately.
diff --git a/fluss-rust/website/docs/user-guide/rust/example/partitioned-tables.md b/fluss-rust/website/docs/user-guide/rust/example/partitioned-tables.md
new file mode 100644
index 0000000000..e583e06ead
--- /dev/null
+++ b/fluss-rust/website/docs/user-guide/rust/example/partitioned-tables.md
@@ -0,0 +1,219 @@
+---
+sidebar_position: 6
+---
+# Partitioned Tables
+
+Partitioned tables distribute data across partitions based on partition column values, enabling efficient data organization and querying. Both log tables and primary key tables support partitioning.
+
+## Partitioned Log Tables
+
+### Creating a Partitioned Log Table
+
+```rust
+use fluss::metadata::{DataTypes, LogFormat, Schema, TableDescriptor, TablePath};
+
+let table_descriptor = TableDescriptor::builder()
+    .schema(
+        Schema::builder()
+            .column("event_id", DataTypes::int())
+            .column("event_type", DataTypes::string())
+            .column("dt", DataTypes::string())
+            .column("region", DataTypes::string())
+            .build()?,
+    )
+    .partitioned_by(vec!["dt", "region"])
+    .log_format(LogFormat::ARROW)
+    .build()?;
+
+let table_path = TablePath::new("fluss", "partitioned_events");
+admin.create_table(&table_path, &table_descriptor, true).await?;
+```
+
+### Writing to Partitioned Log Tables
+
+**Partitions must exist before writing data, otherwise the client will by default retry indefinitely.** Include partition column values in each row; the client routes records to the correct partition automatically.
+
+```rust
+use fluss::metadata::PartitionSpec;
+use std::collections::HashMap;
+
+let table = conn.get_table(&table_path).await?;
+
+// Create the partition before writing
+let mut partition_values = HashMap::new();
+partition_values.insert("dt", "2024-01-15");
+partition_values.insert("region", "US");
+admin.create_partition(&table_path, &PartitionSpec::new(partition_values), true).await?;
+
+let append_writer = table.new_append()?.create_writer()?;
+
+let mut row = GenericRow::new(4);
+row.set_field(0, 1);              // event_id
+row.set_field(1, "user_login");   // event_type
+row.set_field(2, "2024-01-15");   // dt (partition column)
+row.set_field(3, "US");           // region (partition column)
+
+append_writer.append(&row)?;
+append_writer.flush().await?;
+```
+
+### Reading from Partitioned Log Tables
+
+For partitioned tables, use partition-aware subscribe methods.
+
+```rust
+use std::time::Duration;
+
+let table = conn.get_table(&table_path).await?;
+let admin = conn.get_admin()?;
+let partitions = admin.list_partition_infos(&table_path).await?;
+
+let log_scanner = table.new_scan().create_log_scanner()?;
+
+// Subscribe to each partition's buckets
+for partition_info in &partitions {
+    let partition_id = partition_info.get_partition_id();
+    let num_buckets = table.get_table_info().get_num_buckets();
+    for bucket_id in 0..num_buckets {
+        log_scanner.subscribe_partition(partition_id, bucket_id, 0).await?;
+    }
+}
+
+let records = log_scanner.poll(Duration::from_secs(10)).await?;
+for record in records {
+    println!("Record: {:?}", record.row());
+}
+```
+
+Subscribe to multiple partition-buckets at once:
+
+```rust
+use std::collections::HashMap;
+
+let mut partition_bucket_offsets = HashMap::new();
+partition_bucket_offsets.insert((partition_id, 0), 0i64);
+partition_bucket_offsets.insert((partition_id, 1), 0i64);
+log_scanner.subscribe_partition_buckets(&partition_bucket_offsets).await?;
+```
+
+### Managing Partitions
+
+```rust
+use fluss::metadata::PartitionSpec;
+use std::collections::HashMap;
+
+// Create a partition
+let mut partition_values = HashMap::new();
+partition_values.insert("dt", "2024-01-15");
+partition_values.insert("region", "EMEA");
+let spec = PartitionSpec::new(partition_values);
+admin.create_partition(&table_path, &spec, true).await?;
+
+// List all partitions
+let partitions = admin.list_partition_infos(&table_path).await?;
+for partition in &partitions {
+    println!(
+        "Partition: id={}, name={}",
+        partition.get_partition_id(),
+        partition.get_partition_name()
+    );
+}
+
+// List with filter
+let mut partial_values = HashMap::new();
+partial_values.insert("dt", "2024-01-15");
+let partial_spec = PartitionSpec::new(partial_values);
+let filtered = admin.list_partition_infos_with_spec(
+    &table_path, Some(&partial_spec),
+).await?;
+
+// Drop a partition
+admin.drop_partition(&table_path, &spec, true).await?;
+```
+
+## Partitioned Primary Key Tables
+
+Partitioned KV tables combine partitioning with primary key operations. Partition columns must be part of the primary key.
+
+### Creating a Partitioned Primary Key Table
+
+```rust
+use fluss::metadata::{DataTypes, KvFormat, Schema, TableDescriptor, TablePath};
+
+let table_descriptor = TableDescriptor::builder()
+    .schema(
+        Schema::builder()
+            .column("user_id", DataTypes::int())
+            .column("region", DataTypes::string())
+            .column("zone", DataTypes::bigint())
+            .column("score", DataTypes::bigint())
+            .primary_key(vec!["user_id", "region", "zone"])
+            .build()?,
+    )
+    .partitioned_by(vec!["region", "zone"])
+    .kv_format(KvFormat::COMPACTED)
+    .build()?;
+
+let table_path = TablePath::new("fluss", "partitioned_users");
+admin.create_table(&table_path, &table_descriptor, true).await?;
+```
+
+### Writing to Partitioned Primary Key Tables
+
+**Partitions must exist before upserting data, otherwise the client will by default retry indefinitely.**
+
+```rust
+use fluss::metadata::PartitionSpec;
+use std::collections::HashMap;
+
+let table = conn.get_table(&table_path).await?;
+
+// Create partitions first
+for (region, zone) in [("APAC", "1"), ("EMEA", "2"), ("US", "3")] {
+    let mut values = HashMap::new();
+    values.insert("region", region);
+    values.insert("zone", zone);
+    admin.create_partition(&table_path, &PartitionSpec::new(values), true).await?;
+}
+
+let table_upsert = table.new_upsert()?;
+let upsert_writer = table_upsert.create_writer()?;
+
+for (user_id, region, zone, score) in [
+    (1001, "APAC", 1i64, 1234i64),
+    (1002, "EMEA", 2, 2234),
+    (1003, "US", 3, 3234),
+] {
+    let mut row = GenericRow::new(4);
+    row.set_field(0, user_id);
+    row.set_field(1, region);
+    row.set_field(2, zone);
+    row.set_field(3, score);
+    upsert_writer.upsert(&row)?;
+}
+upsert_writer.flush().await?;
+```
+
+### Looking Up Records in Partitioned Tables
+
+Lookup requires all primary key columns, including partition columns.
+
+```rust
+let mut lookuper = table.new_lookup()?.create_lookuper()?;
+
+let mut key = GenericRow::new(3);
+key.set_field(0, 1001);    // user_id
+key.set_field(1, "APAC");  // region (partition column)
+key.set_field(2, 1i64);    // zone (partition column)
+
+let result = lookuper.lookup(&key).await?;
+if let Some(row) = result.get_single_row()? {
+    println!("Found: score={}", row.get_long(3)?);
+}
+```
+
+### Prefix Lookup on Partitioned Tables
+
+See [Prefix Lookup — Partitioned Table](./prefix-lookup.md#partitioned-table) for details and a full runnable example.
+
+> **Note:** Scanning partitioned primary key tables is not supported. Use lookup operations instead.
diff --git a/fluss-rust/website/docs/user-guide/rust/example/prefix-lookup.md b/fluss-rust/website/docs/user-guide/rust/example/prefix-lookup.md
new file mode 100644
index 0000000000..619ba8341d
--- /dev/null
+++ b/fluss-rust/website/docs/user-guide/rust/example/prefix-lookup.md
@@ -0,0 +1,110 @@
+---
+sidebar_position: 7
+---
+# Prefix Lookup
+
+Prefix lookup returns all rows whose primary key starts with a given prefix. It's enabled by choosing a **bucket key that is a strict prefix of the primary key** — rows sharing the same bucket-key prefix land in the same bucket, so one bucket lookup returns them all.
+
+## Table Requirements
+
+- The table must have a primary key.
+- The bucket key must be a strict prefix of the primary key (on partitioned tables, of the *non-partition* portion of the primary key).
+- The bucket key cannot equal the full primary key — that's a normal primary-key lookup, use [`Lookuper`](./primary-key-tables.md#looking-up-records) instead.
+- The `lookup_by` columns passed to the client must equal `partition_keys ++ bucket_key` (in that order, if partitioned).
+
+`create_lookuper()` validates these rules and returns `Err(Error::IllegalArgument { .. })` on mismatch, with a message describing the violation.
+
+## Non-Partitioned Table
+
+Pick a schema where the bucket key is a prefix of the primary key:
+
+```rust
+use fluss::metadata::{DataTypes, Schema, TableDescriptor, TablePath};
+
+let table_descriptor = TableDescriptor::builder()
+    .schema(
+        Schema::builder()
+            .column("user_id", DataTypes::int())
+            .column("session_id", DataTypes::string())
+            .column("event_seq", DataTypes::bigint())
+            .column("event_data", DataTypes::string())
+            .primary_key(vec!["user_id", "session_id", "event_seq"])
+            .build()?,
+    )
+    // Bucket key (user_id, session_id) is a prefix of the primary key.
+    .distributed_by(Some(3), vec!["user_id".to_string(), "session_id".to_string()])
+    .build()?;
+```
+
+Create the lookuper with `lookup_by(columns)` naming the prefix columns, then call `lookup(prefix_row)`:
+
+```rust
+use fluss::row::{GenericRow, InternalRow};
+
+let mut prefix_lookuper = table
+    .new_lookup()?
+    .lookup_by(vec!["user_id".to_string(), "session_id".to_string()])
+    .create_lookuper()?;
+
+let mut prefix = GenericRow::new(2);
+prefix.set_field(0, 1);                // user_id
+prefix.set_field(1, "sess-a");         // session_id
+
+let result = prefix_lookuper.lookup(&prefix).await?;
+for row in result.get_rows()? {
+    println!(
+        "seq={}, data={}",
+        row.get_long(2)?,
+        row.get_string(3)?,
+    );
+}
+```
+
+Unlike primary-key lookup (which uses `get_single_row()`), prefix lookup returns zero or more rows via `get_rows()`.
+
+## Partitioned Table
+
+On a partitioned table, the partition columns are stripped from the primary key before the bucket-prefix rule is evaluated. The lookup key, though, must still carry the partition values so the client can route the request to the right partition — so the `lookup_by` columns are `partition_keys ++ bucket_key`.
+
+```rust
+let table_descriptor = TableDescriptor::builder()
+    .schema(
+        Schema::builder()
+            .column("region", DataTypes::string())
+            .column("user_id", DataTypes::int())
+            .column("session_id", DataTypes::string())
+            .column("event_seq", DataTypes::bigint())
+            .column("event_data", DataTypes::string())
+            .primary_key(vec!["region", "user_id", "session_id", "event_seq"])
+            .build()?,
+    )
+    .partitioned_by(vec!["region"])
+    // Bucket key (user_id, session_id) is a prefix of the pk minus partition cols.
+    .distributed_by(Some(3), vec!["user_id".to_string(), "session_id".to_string()])
+    .build()?;
+```
+
+```rust
+let mut prefix_lookuper = table
+    .new_lookup()?
+    .lookup_by(vec![
+        "region".to_string(),
+        "user_id".to_string(),
+        "session_id".to_string(),
+    ])
+    .create_lookuper()?;
+
+let mut prefix = GenericRow::new(3);
+prefix.set_field(0, "US");             // region (partition column)
+prefix.set_field(1, 1);                // user_id
+prefix.set_field(2, "sess-a");         // session_id
+
+let result = prefix_lookuper.lookup(&prefix).await?;
+for row in result.get_rows()? {
+    println!(
+        "seq={}, data={}",
+        row.get_long(3)?,
+        row.get_string(4)?,
+    );
+}
+```
diff --git a/fluss-rust/website/docs/user-guide/rust/example/primary-key-tables.md b/fluss-rust/website/docs/user-guide/rust/example/primary-key-tables.md
new file mode 100644
index 0000000000..01836e29e4
--- /dev/null
+++ b/fluss-rust/website/docs/user-guide/rust/example/primary-key-tables.md
@@ -0,0 +1,141 @@
+---
+sidebar_position: 5
+---
+# Primary Key Tables
+
+Primary key tables (KV tables) support upsert, delete, and lookup operations.
+
+## Creating a Primary Key Table
+
+```rust
+use fluss::metadata::{DataTypes, Schema, TableDescriptor, TablePath};
+
+let table_descriptor = TableDescriptor::builder()
+    .schema(
+        Schema::builder()
+            .column("id", DataTypes::int())
+            .column("name", DataTypes::string())
+            .column("age", DataTypes::bigint())
+            .primary_key(vec!["id"])
+            .build()?,
+    )
+    .build()?;
+
+let table_path = TablePath::new("fluss", "users");
+admin.create_table(&table_path, &table_descriptor, true).await?;
+```
+
+## Upserting Records
+
+```rust
+use fluss::row::{GenericRow, InternalRow};
+
+let table = conn.get_table(&table_path).await?;
+let table_upsert = table.new_upsert()?;
+let upsert_writer = table_upsert.create_writer()?;
+
+for (id, name, age) in [(1, "Alice", 25i64), (2, "Bob", 30), (3, "Charlie", 35)] {
+    let mut row = GenericRow::new(3);
+    row.set_field(0, id);
+    row.set_field(1, name);
+    row.set_field(2, age);
+    upsert_writer.upsert(&row)?;
+}
+upsert_writer.flush().await?;
+```
+
+## Updating Records
+
+Upsert with the same primary key to update an existing record.
+
+```rust
+let mut row = GenericRow::new(3);
+row.set_field(0, 1);        // id (primary key)
+row.set_field(1, "Alice");
+row.set_field(2, 26i64);    // updated age
+
+upsert_writer.upsert(&row)?;
+upsert_writer.flush().await?;
+```
+
+## Deleting Records
+
+```rust
+// Only primary key field needs to be set
+let mut row = GenericRow::new(3);
+row.set_field(0, 2);  // id of record to delete
+
+upsert_writer.delete(&row)?;
+upsert_writer.flush().await?;
+```
+
+## Partial Updates
+
+Update only specific columns while preserving others.
+
+```rust
+// By column indices
+let partial_upsert = table_upsert.partial_update(Some(vec![0, 2]))?;
+let partial_writer = partial_upsert.create_writer()?;
+
+let mut row = GenericRow::new(3);
+row.set_field(0, 1);       // id (primary key, required)
+row.set_field(2, 27i64);   // age (will be updated)
+// name will remain unchanged
+
+partial_writer.upsert(&row)?;
+partial_writer.flush().await?;
+
+// By column names
+let partial_upsert = table_upsert.partial_update_with_column_names(&["id", "age"])?;
+let partial_writer = partial_upsert.create_writer()?;
+```
+
+## Looking Up Records
+
+```rust
+let mut lookuper = table.new_lookup()?.create_lookuper()?;
+
+let mut key = GenericRow::new(1);
+key.set_field(0, 1);  // id to lookup
+
+let result = lookuper.lookup(&key).await?;
+
+if let Some(row) = result.get_single_row()? {
+    println!(
+        "Found: id={}, name={}, age={}",
+        row.get_int(0)?,
+        row.get_string(1)?,
+        row.get_long(2)?
+    );
+} else {
+    println!("Record not found");
+}
+```
+## Looking Up Records as Arrow RecordBatch
+
+Use `to_record_batch()` to get lookup results in Arrow format, for example when integrating with DataFusion.
+```rust
+let result = lookuper.lookup(&key).await?;
+let batch = result.to_record_batch()?;
+println!("Rows: {}", batch.num_rows());
+```
+
+## Prefix Lookup
+
+To fetch all rows sharing a common primary-key prefix (by choosing a bucket key that's a strict prefix of the primary key), see [Prefix Lookup](./prefix-lookup.md).
+
+## Limit Scan
+
+To read up to `n` rows of a bucket's current state without supplying keys, use a batch scanner. The server returns the deduplicated current rows as Arrow batches, which is convenient for previews or DataFusion sources.
+
+```rust
+let bucket = TableBucket::new(table.get_table_info().table_id, 0);
+let mut scanner = table.new_scan().limit(10)?.create_bucket_batch_scanner(bucket)?;
+
+while let Some(batch) = scanner.next_batch().await? {
+    println!("rows: {}", batch.batch().num_rows());
+}
+```
+
+The limit applies per bucket; to cover a multi-bucket table, scan each bucket separately.
diff --git a/fluss-rust/website/docs/user-guide/rust/installation.md b/fluss-rust/website/docs/user-guide/rust/installation.md
new file mode 100644
index 0000000000..540d4a10a0
--- /dev/null
+++ b/fluss-rust/website/docs/user-guide/rust/installation.md
@@ -0,0 +1,76 @@
+---
+sidebar_position: 1
+---
+# Installation
+
+The Fluss Rust client is published to [crates.io](https://crates.io/crates/fluss-rs) as `fluss-rs`. The crate's library name is `fluss`, so you import it with `use fluss::...`.
+
+```toml
+[dependencies]
+fluss-rs = "0.1.0"
+tokio = { version = "1", features = ["full"] }
+```
+
+## Feature Flags
+
+```toml
+[dependencies]
+# Default: memory and filesystem storage
+fluss-rs = "0.1.0"
+
+# With S3 storage support
+fluss-rs = { version = "0.1", features = ["storage-s3"] }
+
+# With OSS storage support
+fluss-rs = { version = "0.1", features = ["storage-oss"] }
+
+# All storage backends
+fluss-rs = { version = "0.1", features = ["storage-all"] }
+```
+
+Available features:
+- `storage-memory` (enabled by default): in-memory storage
+- `storage-fs` (enabled by default): local filesystem storage
+- `storage-s3`: Amazon S3 storage
+- `storage-oss`: Alibaba OSS storage
+- `storage-all`: all storage backends
+
+## Git or Path Dependency
+
+For development against unreleased changes:
+
+```toml
+[dependencies]
+# From Git
+fluss = { git = "https://github.com/apache/fluss-rust.git", package = "fluss-rs" }
+
+# From local path
+fluss = { path = "/path/to/fluss-rust/crates/fluss", package = "fluss-rs" }
+```
+
+> **Note:** When using `git` or `path` dependencies, the `package = "fluss-rs"` field is required so that Cargo resolves the correct package while still allowing `use fluss::...` imports.
+
+## Building from Source
+
+**Prerequisites:** Rust 1.85+, Protobuf compiler (`protoc`)
+
+```bash
+git clone https://github.com/apache/fluss-rust.git
+cd fluss-rust
+```
+
+Install `protoc`:
+
+```bash
+# macOS
+brew install protobuf
+
+# Ubuntu/Debian
+sudo apt-get install protobuf-compiler
+```
+
+Build:
+
+```bash
+cargo build --workspace --all-targets
+```
diff --git a/fluss-rust/website/docusaurus.config.ts b/fluss-rust/website/docusaurus.config.ts
new file mode 100644
index 0000000000..8c2e173d3b
--- /dev/null
+++ b/fluss-rust/website/docusaurus.config.ts
@@ -0,0 +1,103 @@
+import {themes as prismThemes} from 'prism-react-renderer';
+import type {Config} from '@docusaurus/types';
+import type * as Preset from '@docusaurus/preset-classic';
+
+const config: Config = { // Docusaurus site configuration for the Fluss client documentation
+  title: 'Apache Fluss Clients',
+  tagline: 'Rust, Python, and C++ clients for Apache Fluss',
+  favicon: 'img/logo/fluss_favicon.svg',
+
+  url: 'https://clients.fluss.apache.org', // same host as the one listed in static/CNAME
+  baseUrl: '/',
+
+  organizationName: 'apache',
+  projectName: 'fluss-rust',
+
+  onBrokenLinks: 'throw', // fail the build when a broken link is detected
+
+  i18n: {
+    defaultLocale: 'en',
+    locales: ['en'], // English is the only locale
+  },
+
+  plugins: [
+    [
+      '@docusaurus/plugin-pwa',
+      {
+        debug: false,
+        offlineModeActivationStrategies: [
+          'appInstalled',
+          'standalone',
+          'queryString',
+        ],
+        pwaHead: [
+          { tagName: 'link', rel: 'icon', href: '/img/logo/fluss_favicon.svg' },
+          { tagName: 'link', rel: 'manifest', href: '/manifest.json' }, // NOTE(review): manifest.json is not visible in this change — confirm it exists under static/
+          { tagName: 'meta', name: 'theme-color', content: '#0071e3' }, // same value as --ifm-color-primary in custom.css
+        ],
+      },
+    ],
+  ],
+
+  presets: [
+    [
+      'classic',
+      {
+        docs: {
+          routeBasePath: '/', // docs are served from the site root
+          sidebarPath: './sidebars.ts',
+          editUrl: 'https://github.com/apache/fluss-rust/edit/main/website/',
+        },
+        blog: false, // blog plugin is disabled
+        theme: {
+          customCss: './src/css/custom.css',
+        },
+      } satisfies Preset.Options,
+    ],
+  ],
+
+  themeConfig: {
+    image: 'img/logo/png/colored_logo.png',
+    colorMode: {
+      defaultMode: 'light',
+      disableSwitch: true, // hide the light/dark toggle
+    },
+    navbar: {
+      title: '', // no text title; only the logo is shown
+      logo: {
+        alt: 'Fluss',
+        src: 'img/logo/svg/colored_logo.svg',
+      },
+      items: [
+        {
+          type: 'docSidebar',
+          sidebarId: 'docsSidebar', // key defined in sidebars.ts
+          position: 'left',
+          label: 'Client Docs',
+        },
+        {
+          href: 'https://fluss.apache.org/',
+          label: 'Fluss',
+          position: 'left',
+        },
+        {
+          href: 'https://github.com/apache/fluss-rust',
+          position: 'right',
+          className: 'header-github-link', // icon comes from .header-github-link::before in custom.css
+          'aria-label': 'GitHub repository', // the item has no label text, so this names the link
+        },
+      ],
+    },
+    footer: {
+      style: 'dark',
+      copyright: `Copyright © ${new Date().getFullYear()} The Apache Software Foundation, Licensed under the Apache License, Version 2.0.`, // year is computed when this config is evaluated
+    },
+    prism: {
+      theme: prismThemes.vsDark,
+      darkTheme: prismThemes.dracula, // NOTE(review): colorMode is pinned to light above, so this may never apply — confirm
+      additionalLanguages: ['rust', 'toml', 'bash', 'cmake'],
+    },
+  } satisfies Preset.ThemeConfig,
+};
+
+export default config;
diff --git a/fluss-rust/website/package.json b/fluss-rust/website/package.json
new file mode 100644
index 0000000000..e725e9e47c
--- /dev/null
+++ b/fluss-rust/website/package.json
@@ -0,0 +1,48 @@
+{
+  "name": "fluss-clients-website",
+  "version": "0.0.0",
+  "private": true,
+  "scripts": {
+    "docusaurus": "docusaurus",
+    "start": "docusaurus start",
+    "build": "docusaurus build",
+    "swizzle": "docusaurus swizzle",
+    "clear": "docusaurus clear",
+    "serve": "docusaurus serve"
+  },
+  "dependencies": {
+    "@docusaurus/core": "^3.9.2",
+    "@docusaurus/plugin-pwa": "^3.9.2",
+    "@docusaurus/preset-classic": "^3.9.2",
+    "@mdx-js/react": "^3.0.0",
+    "clsx": "^2.0.0",
+    "prism-react-renderer": "^2.3.0",
+    "react": "^18.0.0",
+    "react-dom": "^18.0.0"
+  },
+  "devDependencies": {
+    "@docusaurus/module-type-aliases": "^3.9.2",
+    "@docusaurus/tsconfig": "^3.9.2",
+    "@docusaurus/types": "^3.9.2",
+    "typescript": "~5.5.2"
+  },
+  "browserslist": {
+    "production": [
+      ">0.5%",
+      "not dead",
+      "not op_mini all"
+    ],
+    "development": [
+      "last 3 chrome version",
+      "last 3 firefox version",
+      "last 5 safari version"
+    ]
+  },
+  "engines": {
+    "node": ">=20.0"
+  },
+  "overrides": {
+    "webpackbar": "^7.0.0"
+  },
+  "comment:overrides": "webpackbar 6.x passes its own options (name, color) as this.options, which webpack 5.106.0+ rejects via strict ProgressPlugin schema validation. webpackbar 7.0.0 fixes this. Remove this override once Docusaurus bumps its webpackbar dependency to ^7.0.0."
+}
diff --git a/fluss-rust/website/sidebars.ts b/fluss-rust/website/sidebars.ts
new file mode 100644
index 0000000000..1aea14b4ac
--- /dev/null
+++ b/fluss-rust/website/sidebars.ts
@@ -0,0 +1,25 @@
+import type {SidebarsConfig} from '@docusaurus/plugin-content-docs';
+
+const sidebars: SidebarsConfig = { // sidebar layout for the client documentation site
+  docsSidebar: [ // referenced as sidebarId 'docsSidebar' in docusaurus.config.ts
+    'index',
+    {
+      type: 'category',
+      label: 'Clients',
+      items: [
+        {type: 'autogenerated', dirName: 'user-guide'}, // entries generated from the user-guide docs directory
+        {type: 'link', label: 'Java', href: 'https://fluss.apache.org/docs/0.9/apis/java-client/'}, // external link, pinned to the 0.9 docs of the main Fluss site
+      ],
+    },
+    'developer-guide/contributing',
+    {
+      type: 'category',
+      label: 'Release',
+      items: [
+        {type: 'autogenerated', dirName: 'release'}, // entries generated from the release docs directory
+      ],
+    },
+  ],
+};
+
+export default sidebars;
diff --git a/fluss-rust/website/src/css/custom.css b/fluss-rust/website/src/css/custom.css
new file mode 100644
index 0000000000..9143372f85
--- /dev/null
+++ b/fluss-rust/website/src/css/custom.css
@@ -0,0 +1,209 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Copied from the main fluss.apache.org website (fluss/website/src/css/custom.css)
+ * to ensure visual consistency.
+ */
+
+/* Import Inter font from Google Fonts */
+@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700;800&display=swap');
+
+/* You can override the default Infima variables here. */
+:root {
+  --ifm-color-primary: #0071e3; /* same value as the PWA theme-color in docusaurus.config.ts */
+  --ifm-color-primary-dark: #0066cc;
+  --ifm-color-primary-darker: #0060c1;
+  --ifm-color-primary-darkest: #004f9f;
+  --ifm-color-primary-light: #007cfa;
+  --ifm-color-primary-lighter: #0682ff;
+  --ifm-color-primary-lightest: #2893ff;
+  --ifm-code-font-size: 90%;
+  --ifm-font-family-base: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, 'PingFang SC', 'Hiragino Sans GB', 'Microsoft YaHei', sans-serif; /* 'Inter' is loaded by the @import at the top of this file */
+  --docusaurus-highlighted-code-line-bg: #E2E9F3;
+
+  --ifm-menu-color-background-active: #edeefa99; /* 8-digit hex: the trailing 99 is the alpha channel */
+  --ifm-menu-color-background-hover: #edeefa99; /* hover uses the same tint as the active item */
+}
+
+
+.navbar__brand {
+  font-family: monaco, monospace; /* generic fallback for systems where Monaco is not installed */
+  color: inherit;
+}
+
+.header-github-link:hover {
+  opacity: 0.6; /* dim the navbar icon on hover */
+}
+
+.header-github-link::before {
+  content: ''; /* empty content so the pseudo-element is rendered */
+  width: 24px;
+  height: 24px;
+  display: flex;
+  background: url("data:image/svg+xml,%3Csvg viewBox='0 0 24 24' xmlns='http://www.w3.org/2000/svg'%3E%3Cpath d='M12 .297c-6.63 0-12 5.373-12 12 0 5.303 3.438 9.8 8.205 11.385.6.113.82-.258.82-.577 0-.285-.01-1.04-.015-2.04-3.338.724-4.042-1.61-4.042-1.61C4.422 18.07 3.633 17.7 3.633 17.7c-1.087-.744.084-.729.084-.729 1.205.084 1.838 1.236 1.838 1.236 1.07 1.835 2.809 1.305 3.495.998.108-.776.417-1.305.76-1.605-2.665-.3-5.466-1.332-5.466-5.93 0-1.31.465-2.38 1.235-3.22-.135-.303-.54-1.523.105-3.176 0 0 1.005-.322 3.3 1.23.96-.267 1.98-.399 3-.405 1.02.006 2.04.138 3 .405 2.28-1.552 3.285-1.23 3.285-1.23.645 1.653.24 2.873.12 3.176.765.84 1.23 1.91 1.23 3.22 0 4.61-2.805 5.625-5.475 5.92.42.36.81 1.096.81 2.22 0 1.606-.015 2.896-.015 3.286 0 .315.21.69.825.57C20.565 22.092 24 17.592 24 12.297c0-6.627-5.373-12-12-12'/%3E%3C/svg%3E")
+  no-repeat; /* inline SVG icon, 24x24 viewBox matching the box size above */
+}
+
+
+.menu__list-item {
+  font-size: 0.95rem;
+  font-weight: 500;
+}
+
+.menu__link--sublist-caret:after {
+  background: var(--ifm-menu-link-sublist-icon) 50% / 1.5rem 1.5rem; /* caret icon centered (50%) and sized to 1.5rem x 1.5rem */
+}
+
+
+.markdown { /* uses CSS nesting: every rule below is scoped to descendants of .markdown */
+    padding-left: 1rem;
+    h1,
+    h2,
+    h3,
+    h4,
+    h5,
+    h6 {
+        color: #1d1d1d;
+        margin-bottom: 0.3125rem;
+        font-weight: 700;
+    }
+
+    b,
+    strong {
+        font-weight: 700;
+        color: #1d1d1d;
+    }
+
+    h1,
+    h1:first-child {
+        font-size: 2.5rem;
+        margin-bottom: 1.5rem; /* overrides the shared heading margin-bottom set above */
+        margin-top: 0;
+    }
+
+    h2 {
+        font-size: 2rem;
+        margin-bottom: 1.25rem;
+        margin-top: 2rem;
+        padding-top: 2rem;
+        border-top: 1px solid #e6e7e9; /* rule line separating each h2 section from the previous one */
+    }
+
+    h3 {
+        font-size: 1.5rem;
+        margin-bottom: 1.25rem;
+        margin-top: 1rem;
+    }
+    p {
+        line-height: 1.875;
+
+        code { /* inline code inside paragraphs; same declarations as "li > code" and "table tr code" below */
+          border-radius: 4px;
+          background-color: #edf2fa;
+          border: none;
+          padding: 3px 4px;
+          font-size: 14px;
+          color: #4c576c;
+      }
+    }
+
+    li > code {
+        border-radius: 4px;
+        background-color: #edf2fa;
+        border: none;
+        padding: 3px 4px;
+        font-size: 14px;
+        color: #4c576c;
+    }
+
+    table thead tr {
+        background-color: #f7f9fe;
+    }
+
+    table thead th {
+        background-color: #f7f9fe;
+        color: #1d1d1d;
+        font-size: 1rem;
+        font-weight: 500;
+    }
+
+    table tr {
+        border-bottom: none;
+        background-color: var(--global-colors-white); /* NOTE(review): --global-colors-white is not defined in this file — confirm it is defined elsewhere */
+        font-size: var(--global-font-size-small); /* NOTE(review): --global-font-size-small is not defined in this file — confirm it is defined elsewhere */
+
+        code {
+            border-radius: 4px;
+            background-color: #edf2fa;
+            border: none;
+            padding: 3px 4px;
+            font-size: 14px;
+            color: #4c576c;
+        }
+    }
+
+    table tr th {
+        padding: 0.53rem 0.8125rem;
+        border-color: #dfe5f0;
+    }
+
+    table tr td {
+        padding: 0.65rem 0.8125rem;
+        border-color: #dfe5f0;
+    }
+    a {
+        color: var(--ifm-color-primary);
+    }
+    ul {
+        padding-left: 20px;
+        li {
+            margin-top: 4px;
+            position: relative;
+            list-style: initial;
+        }
+    }
+    ol {
+        padding-left: 20px;
+        li {
+            list-style: decimal;
+        }
+    }
+}
+
+.theme-doc-markdown {
+    header {
+        margin-top: 1rem;
+
+        & + h1 { /* hides an h1 that immediately follows the header element */
+            display: none;
+        }
+    }
+}
+
+.breadcrumbs__item--active .breadcrumbs__link {
+  background: var(--ifm-menu-color-background-active); /* reuses the active-menu tint set in :root */
+}
+
+.footer__copyright {
+    color: #dfe5f0;
+    font-size: .75rem;
+    line-height: 1.8;
+    opacity: .6; /* mutes the copyright line */
+    text-align: center;
+    width: 98%;
+}
diff --git a/fluss-rust/website/static/CNAME b/fluss-rust/website/static/CNAME
new file mode 100644
index 0000000000..6298936bc4
--- /dev/null
+++ b/fluss-rust/website/static/CNAME
@@ -0,0 +1 @@
+clients.fluss.apache.org
diff --git a/fluss-rust/website/static/img/logo/fluss_favicon.svg b/fluss-rust/website/static/img/logo/fluss_favicon.svg
new file mode 100644
index 0000000000..7c044d55d5
--- /dev/null
+++ b/fluss-rust/website/static/img/logo/fluss_favicon.svg
@@ -0,0 +1,19 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<svg width="142" height="144" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xml:space="preserve" overflow="hidden"><defs><clipPath id="clip0"><rect x="118" y="81" width="142" height="144"/></clipPath><clipPath id="clip1"><rect x="118" y="81" width="142" height="144"/></clipPath><clipPath id="clip2"><rect x="119" y="82" width="141" height="143"/></clipPath><clipPath id="clip3"><rect x="119" y="82" width="141" height="143"/></clipPath><clipPath id="clip4"><rect x="119" y="82" width="401" height="173"/></clipPath><linearGradient x1="59.1276" y1="1.49011" x2="59.1276" y2="126.123" gradientUnits="userSpaceOnUse" spreadMethod="pad" id="fill5"><stop offset="0" stop-color="#FFFFFF" stop-opacity="1"/><stop offset="0.335714" stop-color="#FFFFFF" stop-opacity="1"/><stop offset="1" stop-color="#96A1FF" stop-opacity="1"/></linearGradient><linearGradient x1="69.5384" y1="92.8835" x2="69.5384" y2="170.369" gradientUnits="userSpaceOnUse" spreadMethod="pad" id="fill6"><stop offset="0" stop-color="#002CDC" stop-opacity="1"/><stop offset="1" stop-color="#3B5FEE" stop-opacity="1"/></linearGradient></defs><g clip-path="url(#clip0)" transform="translate(-118 -81)"><g clip-path="url(#clip1)"><g clip-path="url(#clip2)"><g clip-path="url(#clip3)"><g clip-path="url(#clip4)"><path d="M53.1018 21.9594C53.1018 24.9913 51.0461 27.4492 48.5103 27.4492 45.9745 27.4492 43.9188 24.9913 43.9188 21.9594 43.9188 18.9274 45.9745 16.4695 48.5103 16.4695 51.0461 16.4695 53.1018 18.9274 53.1018 21.9594Z" fill="#000000" fill-rule="nonzero" fill-opacity="1" transform="matrix(0.650313 -0.761909 0.753395 0.657662 121.741 121.546)"/><path d="M113.737 21.7028C113.737 23.7968 111.727 25.4944 109.247 25.4944 106.767 25.4944 104.757 23.7968 104.757 21.7028 104.757 19.6088 106.767 17.9113 109.247 17.9113 111.727 17.9113 113.737 19.6088 113.737 21.7028Z" fill="#000000" fill-rule="nonzero" fill-opacity="1" transform="matrix(0.778888 -0.627162 0.620154 0.78769 130.596 151.964)"/><path 
d="M13.0434 119.963C9.0102 119.963 7.80818 118.503 7.50022 117.281 7.04326 115.443 8.52343 113.377 11.3546 111.877 20.9808 106.781 34.7494 96.6879 41.8523 85.0253L42.2993 84.2902 43.1636 84.3101C43.9483 84.3299 44.6139 84.6875 45.051 85.3134 46.2828 87.1214 45.2696 90.5089 42.647 97.8005 41.6039 100.711 40.5211 103.721 40.3224 105.291 39.4581 111.996 33.5374 116.089 20.5238 118.97 17.5337 119.636 15.0203 119.963 13.0434 119.963Z" fill="#B0B7F3" fill-rule="nonzero" fill-opacity="1" transform="matrix(1 0 0 1.0113 119 82.3391)"/><path d="M20.8459 120.425 20.8476 120.425C33.9046 117.534 40.889 112.552 41.8007 105.478 41.918 104.551 42.6561 102.192 44.0149 98.4003L44.0492 98.3049C46.7341 90.84 47.4752 86.2251 46.2726 84.4602 45.5323 83.4001 44.5073 82.8535 43.1977 82.8203L41.4732 82.7807 40.5796 84.2502C37.4024 89.467 32.7408 94.6401 26.5948 99.7696 21.4547 104.06 16.1421 107.656 10.657 110.56 6.90856 112.546 5.37466 114.908 6.0553 117.645 6.69515 120.184 9.02453 121.454 13.0434 121.454 15.1638 121.454 17.7647 121.111 20.8459 120.425ZM38.8441 105.104C38.1113 110.789 31.8972 114.926 20.2017 117.515L20.2001 117.515C17.3313 118.154 14.9458 118.473 13.0434 118.473 10.5727 118.473 9.20664 117.955 8.94515 116.917 8.64093 115.694 9.67664 114.453 12.0523 113.194 17.7206 110.193 23.2046 106.481 28.5044 102.058 34.903 96.7173 39.7765 91.2982 43.1249 85.8004L43.1253 85.7997 43.1293 85.7998C43.4305 85.8074 43.6638 85.9297 43.8293 86.1666 44.4482 87.0751 43.5867 90.7849 41.2448 97.2962L41.2094 97.3949C39.7772 101.391 38.9888 103.961 38.8441 105.104Z" fill="#000000" fill-rule="evenodd" fill-opacity="1" transform="matrix(1 0 0 1.0113 119 82.3391)"/><path d="M104.427 71.7641C102.46 71.7641 100.433 70.9892 98.8738 69.6382 98.0989 68.9726 91.3934 62.7638 93.4895 50.1277L94.1154 46.3428 96.2115 49.5615C97.2446 51.1608 99.1122 52.8695 101.089 54.6874 103.751 57.1312 106.761 59.9028 108.818 63.3003 110.586 66.2308 109.94 68.2673 109.404 69.221 108.5 70.8104 106.652 71.7641 104.437 
71.7641L104.427 71.7641Z" fill="#B0B7F3" fill-rule="nonzero" fill-opacity="1" transform="matrix(1 0 0 1.0113 119 82.3391)"/><path d="M104.427 73.2542 104.437 73.2542C107.362 73.2542 109.45 72.1533 110.703 69.9515 111.947 67.7393 111.744 65.2651 110.093 62.5287 108.64 60.1293 105.975 57.1499 102.098 53.5906L102.02 53.5187C99.5929 51.2873 98.0731 49.6971 97.4601 48.7483L93.271 42.3156 92.0195 49.8838C91.0855 55.5146 91.6645 60.5553 93.7565 65.0058 94.9134 67.467 96.294 69.3866 97.8981 70.7645 99.8143 72.4243 101.991 73.2542 104.427 73.2542ZM108.105 68.4904C107.429 69.6795 106.206 70.274 104.437 70.274L104.427 70.274C102.731 70.274 101.206 69.6866 99.8494 68.5119 98.5616 67.4056 97.4296 65.8143 96.4536 63.738 94.6248 59.8475 94.1268 55.392 94.9596 50.3715L94.9598 50.37 94.9628 50.3746C95.7092 51.5302 97.3891 53.3095 100.002 55.7125L100.08 55.7843C103.75 59.1528 106.238 61.9153 107.543 64.0718 108.595 65.8145 108.782 67.2873 108.105 68.4904Z" fill="#000000" fill-rule="evenodd" fill-opacity="1" transform="matrix(1 0 0 1.0113 119 82.3391)"/><path d="M35.1375 123.838C33.4288 116.447 35.813 108.579 38.1078 100.97 39.5482 96.1916 40.9191 91.6716 41.088 87.7476 41.4556 79.254 43.2437 62.7337 43.8894 56.952 42.9556 57.3096 41.5748 57.9951 39.7072 59.2766 35.8527 61.9389 24.1306 70.0352 14.3256 70.0352 13.7693 70.0352 13.2229 70.0054 12.7064 69.9557 9.74601 69.6676 7.40157 67.3629 7.00421 64.3529 6.77572 62.5946 6.86513 56.5547 17.7827 52.4718 20.8523 51.3393 23.7233 49.0246 27.3591 46.0841 30.091 43.8788 33.1904 41.3754 37.1442 38.723 38.4555 37.8289 39.8661 37.223 41.2271 36.6368 44.9822 35.0176 48.2207 33.6268 49.5916 26.5637 51.8565 14.9011 54.7374 8.91086 65.8536 4.23191 70.1054 2.44378 75.2016 1.49011 80.5759 1.49011 88.1457 1.49011 95.6857 3.37758 100.752 6.52668 109.147 11.752 111.938 23.1563 111.183 29.2161 110.289 36.3885 105.898 41.7231 96.9771 46.4418 95.0201 54.1009 93.1723 64.8695 94.9307 74.0982 95.1194 74.4161 95.4472 74.9724 95.5168 75.0916 97.2552 77.6944 
99.699 79.2739 102.053 80.7938 104.487 82.3634 107.011 83.9926 108.6 86.6946 109.723 88.6616 109.951 91.7709 109.286 96.2115L109.107 97.4532 107.855 97.483C100.235 97.6718 82.9495 98.844 73.4035 104.298 63.8576 109.752 47.267 117.063 37.5117 124.683L35.664 126.123 35.1375 123.848 35.1375 123.838Z" fill="url(#fill5)" fill-rule="nonzero" transform="matrix(1 0 0 1.0113 119 82.3391)"/><path d="M33.6475 124.004C32.5122 118.832 33.5168 111.033 36.6611 100.606L36.6812 100.54 36.716 100.424C38.5319 94.4006 39.493 90.1538 39.5993 87.6836 39.8524 81.8341 40.6886 72.4405 42.1077 59.5029 41.6156 59.7963 41.0965 60.1305 40.5503 60.5052 29.914 67.852 21.1724 71.5253 14.3256 71.5253 13.7495 71.5253 13.1622 71.4965 12.5637 71.439 8.42584 71.0363 6.08023 68.7393 5.52692 64.5479 5.26244 62.5126 5.73332 60.5004 6.93955 58.5114 8.80904 55.4286 12.2494 52.9501 17.2607 51.076 19.1687 50.3722 22.2224 48.322 26.422 44.9255L26.4427 44.9089C30.3576 41.7485 33.648 39.274 36.314 37.4855 37.2569 36.8426 38.6919 36.1062 40.6189 35.2763L40.6377 35.2682 40.647 35.2643C42.8894 34.2973 44.4268 33.4021 45.2593 32.5786 46.6182 31.2344 47.5747 29.1348 48.1288 26.2798 49.3784 19.8453 51.0584 15.0669 53.1688 11.9445 55.6817 8.22673 59.7173 5.19805 65.2756 2.85851 69.8068 0.952836 74.907 0 80.5759 0 88.9082 0 95.8958 1.75371 101.539 5.26112 105.66 7.82616 108.743 11.7457 110.788 17.0196 112.512 21.4644 113.136 25.5913 112.661 29.4004 111.753 36.6828 106.953 42.6971 98.2611 47.4434 95.6866 57.7621 95.0486 66.4696 96.3471 73.5659L96.4468 73.7343 96.7784 74.2975C97.8311 75.8552 99.8588 77.6034 102.862 79.5419 106.363 81.7999 108.704 83.9323 109.885 85.9391 111.177 88.2034 111.468 91.7011 110.759 96.4324L110.402 98.9129 107.89 98.9727C92.4457 99.3553 81.1966 101.561 74.1427 105.591 72.1164 106.749 68.5306 108.648 63.3855 111.289 51.1261 117.582 42.8072 122.437 38.429 125.857L34.7387 128.734 33.6475 124.019 33.6475 124.004ZM36.5913 123.51 36.5893 123.502C35.5131 118.847 36.4881 111.502 39.5145 101.467L39.5344 
101.4 39.5694 101.284C41.4573 95.0218 42.4597 90.5311 42.5767 87.8117 42.8443 81.6283 43.7755 71.3969 45.3703 57.1174L45.642 54.6852 43.3565 55.5605C42.0479 56.0616 40.5505 56.8908 38.8641 58.0479 28.7324 65.046 20.553 68.5451 14.3256 68.5451 13.8447 68.5451 13.3525 68.5209 12.849 68.4725 10.2841 68.2229 8.82827 66.7847 8.4815 64.1579 7.93486 59.9511 11.2092 56.5209 18.3046 53.8674 20.5114 53.0533 23.8419 50.8451 28.2961 47.2428L28.3147 47.2278C32.1613 44.1225 35.3812 41.7001 37.9744 39.9605 38.7718 39.4168 40.0463 38.7678 41.7977 38.0135L41.8165 38.0055 41.8271 38.0009C44.4135 36.8856 46.2562 35.7844 47.3552 34.6973 49.1503 32.9216 50.3833 30.305 51.0544 26.8476 52.2266 20.8113 53.7545 16.3999 55.6379 13.6134 57.82 10.385 61.4179 7.71569 66.4317 5.60532 70.5929 3.85525 75.3077 2.98022 80.5759 2.98022 88.3411 2.98022 94.8043 4.58423 99.9653 7.79224 103.522 10.0063 106.203 13.4413 108.009 18.0973 109.552 22.075 110.117 25.7199 109.704 29.0319 108.904 35.4499 104.429 40.8141 96.2803 45.1246L95.6968 45.4333 95.5334 46.0729C92.6847 57.2217 91.9958 66.6564 93.4669 74.3771L93.5157 74.633 93.8821 75.2523 94.2542 75.8841 94.2777 75.9192C95.5524 77.8278 97.8749 79.8699 101.245 82.0457 104.351 84.0485 106.374 85.85 107.316 87.4501 108.254 89.0937 108.419 91.9405 107.811 95.9906L107.811 95.9935C91.8678 96.3892 80.1521 98.7261 72.6643 103.004 70.6767 104.14 67.1302 106.017 62.0248 108.638 49.5979 115.016 41.1212 119.972 36.5945 123.508L36.5913 123.51Z" fill="#000000" fill-rule="evenodd" fill-opacity="1" transform="matrix(1 0 0 1.0113 119 82.3391)"/><path d="M78.4791 81.8435C70.5319 82.8369 65.5649 78.8635 65.5649 71.4128 65.5649 63.9622 66.5583 52.0413 72.0219 43.5973 78.8168 37.4978 95.8935 40.9551 95.367 44.0942 92.3868 49.5578 90.609 62.996 89.4066 71.4128 88.9099 74.8897 86.4264 80.8501 78.4791 81.8435Z" fill="#FFFFFF" fill-rule="nonzero" fill-opacity="1" transform="matrix(1 0 0 1.0113 119 82.3391)"/><path d="M91.7908 33.4185C93.877 30.8356 96.3605 29.0475 96.4598 27.6567 
96.5592 26.2659 91.7908 24.1798 89.9033 24.4778 88.0159 24.7758 83.6449 27.6567 83.7442 29.2461 83.8436 30.8356 87.4198 32.3257 89.2079 33.6171 89.0093 35.4052 86.0787 37.1636 83.8336 37.1636 80.8335 37.1636 77.3765 35.4748 77.1679 36.2695 76.8996 37.3126 80.2474 38.8822 83.8336 38.8822 86.337 38.8822 87.8967 37.9881 89.1881 36.8258 90.3901 35.7529 91.304 36.2993 91.7213 36.6371 92.6948 37.4318 94.6717 37.9881 96.2711 37.9881 98.0989 37.9881 101.029 36.5278 100.751 35.8821 100.304 34.8191 98.1387 36.3987 96.5294 36.3987 93.1816 36.3987 91.4729 34.5112 91.7709 33.4185L91.7908 33.4185Z" fill="#000000" fill-rule="nonzero" fill-opacity="1" transform="matrix(1 0 0 1.0113 119 82.3391)"/><path d="M75.1016 22.7288C75.1016 24.4845 74.0786 25.9077 72.8167 25.9077 71.5548 25.9077 70.5319 24.4845 70.5319 22.7288 70.5319 20.9732 71.5548 19.5499 72.8167 19.5499 74.0786 19.5499 75.1016 20.9732 75.1016 22.7288Z" fill="#000000" fill-rule="nonzero" fill-opacity="1" transform="matrix(1 0 0 1.0113 119 82.3391)"/><path d="M104.794 22.729C104.794 24.375 103.86 25.7092 102.708 25.7092 101.556 25.7092 100.622 24.375 100.622 22.729 100.622 21.0831 101.556 19.7488 102.708 19.7488 103.86 19.7488 104.794 21.0831 104.794 22.729Z" fill="#000000" fill-rule="nonzero" fill-opacity="1" transform="matrix(1 0 0 1.0113 119 82.3391)"/><path d="M78.499 104.944C76.8301 105.526 73.2704 106.904 67.8199 109.076 67.4159 107.917 67.6311 106.444 68.4656 104.655 72.5386 97.4036 70.1544 92.1385 84.8568 88.6616 79.6911 91.2444 78.8964 94.5227 78.7971 100.582 78.8103 102.258 78.711 103.715 78.499 104.953L78.499 104.944Z" fill="#000000" fill-rule="nonzero" fill-opacity="1" transform="matrix(1 0 0 1.0113 119 82.3391)"/><path d="M1.12754e-05 139.014C7.84267 140.529 19.5771 141.638 26.868 139.856 37.4497 137.271 48.9674 125.97 63.3225 114.135 77.6778 102.3 102.565 95.1492 122.32 101.933 128.024 103.894 130.941 107.816 130.941 107.915 129.266 106.826 127.493 105.637 125.719 104.845 117.246 100.983 107.097 100.884 
98.1318 102.963 96.8509 103.261 95.6687 103.558 94.3878 103.954 93.2055 104.35 91.9246 104.746 90.7424 105.142 89.56 105.538 88.3778 106.034 87.1954 106.529 86.013 107.024 84.8308 107.618 83.747 108.113 82.5647 108.708 81.4809 109.302 80.3971 109.995 79.3133 110.688 78.2295 111.283 77.1458 112.075 76.062 112.768 75.0767 113.561 73.9929 114.254 73.0077 115.046 71.9239 115.839 71.0371 116.631 67.2932 119.899 63.8448 123.564 60.3964 127.03 46.6027 141.787 36.5531 149.71 15.3701 150.403 11.9217 150.502 3.3499 150.205 0.0985432 148.522 23.6463 166.052 59.8052 171.4 78.6237 170.211 98.4274 169.023 121.679 160.604 127.986 144.956 115.473 135.646 111.531 150.899 101.481 145.848 91.432 140.796 97.6391 128.615 107.197 121.187 116.753 113.759 123.463 114.422 126.99 115.878 132.498 118.146 132.626 122.474 132.32 123.564 131.936 124.95 130.843 125.842 129.759 126.733 132.813 126.238 134.783 125.544 137.148 123.167 139.02 120.592 139.414 117.621 138.823 114.551 137.837 108.906 134.587 103.558 129.956 100.388 118.724 92.4652 104.408 92.1186 90.6044 93.525 75.057 95.1096 56.1007 105.717 40.9473 117.384 31.7746 124.445 16.0597 135.953 0 139.024L1.12754e-05 139.014ZM76.5546 140.004C67.0961 150.998 51.263 158.307 35.3708 157.138 27.6858 156.573 22.661 155.454 21.5772 154.464 33.7944 155.85 48.2777 151.493 53.9922 147.828 59.7066 144.164 66.6035 135.547 70.3475 132.972 74.0915 130.397 76.259 130.992 77.5399 131.883 79.7074 133.27 79.1163 137.033 76.5546 140.004Z" fill="url(#fill6)" fill-rule="nonzero" transform="matrix(1 0 0 1.0113 119 82.3391)"/><path d="M71.5253 45.2C75.4989 47.1868 80.9627 48.677 86.4264 48.6769 90.0688 48.6769 93.3802 48.1802 96.3604 47.1868L98.844 43.7099C91.8901 47.0213 82.7839 47.518 71.5253 45.2Z" fill="#000000" fill-rule="nonzero" fill-opacity="1" transform="matrix(1 0 0 1.0113 119 82.3391)"/></g></g></g></g></g></svg>
\ No newline at end of file
diff --git a/fluss-rust/website/static/img/logo/png/colored_logo.png b/fluss-rust/website/static/img/logo/png/colored_logo.png
new file mode 100644
index 0000000000..2cd7dd3749
Binary files /dev/null and b/fluss-rust/website/static/img/logo/png/colored_logo.png differ
diff --git a/fluss-rust/website/static/img/logo/svg/colored_logo.svg b/fluss-rust/website/static/img/logo/svg/colored_logo.svg
new file mode 100644
index 0000000000..3b136ac448
--- /dev/null
+++ b/fluss-rust/website/static/img/logo/svg/colored_logo.svg
@@ -0,0 +1,19 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" fill="none" version="1.1" width="403.4100341796875" height="171.5" viewBox="0 0 403.4100341796875 171.5"><defs><clipPath id="master_svg0_14_8989"><rect x="180" y="54.5" width="223.41001892089844" height="91.2300033569336" rx="0"/></clipPath><linearGradient x1="0.5" y1="0" x2="0.5" y2="1" id="master_svg1_14_05037"><stop offset="0%" stop-color="#FFFFFF" stop-opacity="1"/><stop offset="33.571428060531616%" stop-color="#FFFFFF" stop-opacity="1"/><stop offset="100%" stop-color="#96A1FF" stop-opacity="1"/></linearGradient><linearGradient x1="0.5" y1="0" x2="0.5" y2="1" id="master_svg2_14_2646"><stop offset="0%" stop-color="#002CDC" stop-opacity="1"/><stop offset="100%" stop-color="#3B5FEE" stop-opacity="1"/></linearGradient></defs><g><g clip-path="url(#master_svg0_14_8989)"><g><path d="M209.625,54.5L237.465,54.5L236.20499999999998,65.225L211.38,65.225C205.695,65.225,200.91,69.455,200.20499999999998,75.08L198.72,86.96000000000001L230.07,86.96000000000001L228.79500000000002,97.685L197.43,97.685L192.42,138.26C191.895,142.51999999999998,188.28,145.715,183.99,145.715L180,145.715L188.955,72.785C190.23,62.345,199.095,54.5,209.625,54.5Z" fill="#000000" fill-opacity="1" style="mix-blend-mode:passthrough"/></g><g><path d="M235.79966546875,127.715L244.72466796875,54.5L255.92966796875,54.5L247.00466796875,127.715L235.78466796875,127.715L235.79966546875,127.715Z" fill="#000000" fill-opacity="1" style="mix-blend-mode:passthrough"/></g><g><path 
d="M294.10490234375,117.00455078125L299.36990234375,73.41455078125L310.57490234375,73.41455078125L303.94490234375,127.72955078125L279.46490234375,127.72955078125C266.00990234375,127.72955078125,259.27490234375,121.54955078124999,259.27490234375,109.20455078124999C259.27490234375,107.70455078124999,259.37990234375,106.08455078125,259.57490334375,104.32955078124999L263.38490234375,73.41455364227L274.58990234375,73.41455364227L270.79490234375,104.61455078125C270.65990234375,105.45455078124999,270.59990234375,106.62455078125001,270.59990234375,108.12455078125001C270.59990234375,111.49955078125001,271.39490234375,113.83955078125,272.98490234375,115.14455078124999C274.57490234375,116.44955078125,277.15490234375,117.09455078125,280.73990234375,117.09455078125L294.08990234375,117.00455078125L294.10490234375,117.00455078125Z" fill="#000000" fill-opacity="1" style="mix-blend-mode:passthrough"/></g><g><path d="M327.360234375,128.1196875C321.375244375,127.9846875,316.215234375,127.7296875,311.865234375,127.3396875L312.765232375,120.81468749999999C313.080234375,118.5796875,315.090234375,116.9746875,317.340234375,117.16968750000001C321.870234375,117.54468750000001,325.635234375,117.78468749999999,328.635234375,117.88968750000001L331.950234375,117.88968750000001C336.105234375,117.88968750000001,339.015234375,117.3646875,340.680234375,116.3296875C342.330234375,115.29468750000001,343.365234375,113.79468750000001,343.755234375,111.84468749999999C343.815234375,111.51468750000001,343.860234375,111.06468749999999,343.860234375,110.4796875C343.860234375,108.8596875,343.605234375,107.7796875,343.080234375,107.2546875C342.630234375,106.53468749999999,341.880234375,106.0546875,340.845234375,105.78468749999999C339.810234375,105.5296875,337.815234375,105.2296875,334.890234375,104.9146875L332.655234375,104.6296875C329.280234375,104.3746875,326.055234375,103.7896875,322.995234375,102.8746875C320.595244375,101.9596875,318.810244375,100.6546875,317.625244375,98.9296875C316.455244375,97.2046875,31
5.870244375,94.9246875,315.870244375,92.0596875C315.870244375,91.4746875,315.930244375,90.46968749999999,316.065234375,89.0446875C316.515244375,85.0846875,317.655234375,81.9796875,319.485234375,79.7296875C321.300234375,77.4796875,323.835234375,75.8746875,327.090234375,74.8996875C330.345234375,73.9246885,334.530234375,73.4296875,339.660234375,73.4296875L343.665234375,73.4296875L357.420234375,73.5346875L356.610234375,80.1346875C356.355234375,82.26468750000001,354.540234375,83.8696875,352.395234375,83.8696875L338.415234375,83.8696875C335.550234375,83.8696875,333.375234375,84.0196875,331.875234375,84.3046875C330.375234375,84.6046875,329.295234375,85.0846875,328.605234375,85.7746875C327.915234375,86.4646875,327.480234375,87.4546875,327.285234375,88.7446875C327.225234375,89.0746875,327.180234375,89.4946875,327.180234375,90.0196875C327.180234375,91.0546875,327.480234375,91.9396875,328.050234375,92.6446875C328.695234375,93.4246875,329.655234375,93.9646875,330.930234375,94.2496875C332.190234375,94.5346875,334.560234375,94.8796875,337.995234375,95.2696875C342.090234375,95.6596875,345.435234375,96.2746875,348.045234375,97.1296875C350.325234375,97.9096875,352.050234375,99.2446875,353.265234375,101.1346875C354.465234375,103.0246875,355.065234375,105.4546875,355.065234375,108.4546875C355.065234375,109.8796875,355.005234375,110.9896875,354.870234375,111.7696875C354.090234375,117.81468749999999,351.810234375,122.0596875,348.045234375,124.4896875C344.280234375,126.91968750000001,338.490234375,128.1496875,330.690234375,128.1496875L327.375234375,128.1496875L327.360234375,128.1196875Z" fill="#000000" fill-opacity="1" style="mix-blend-mode:passthrough"/></g><g><path 
d="M373.3651171875,128.1196875C367.3801271875,127.9846875,362.2201171875,127.7296875,357.8701171875,127.3396875L358.7701151875,120.81468749999999C359.0851171875,118.5796875,361.0951171875,116.9746875,363.3451171875,117.16968750000001C367.8751171875,117.54468750000001,371.6401171875,117.78468749999999,374.6401171875,117.88968750000001L377.9551171875,117.88968750000001C382.1101171875,117.88968750000001,385.0201171875,117.3646875,386.6851171875,116.3296875C388.3351171875,115.29468750000001,389.3701171875,113.79468750000001,389.7601171875,111.84468749999999C389.8201171875,111.51468750000001,389.8652171875,111.06468749999999,389.8652171875,110.4796875C389.8652171875,108.8596875,389.6102171875,107.7796875,389.0851171875,107.2546875C388.6351171875,106.53468749999999,387.8851171875,106.0546875,386.8501171875,105.78468749999999C385.8151171875,105.5296875,383.8201171875,105.2296875,380.8951171875,104.9146875L378.6601171875,104.6296875C375.2851171875,104.3746875,372.0601171875,103.7896875,369.0001171875,102.8746875C366.6001371875,101.9596875,364.8151371875,100.6546875,363.6301371875,98.9296875C362.4601371875,97.2046875,361.8751371875,94.9246875,361.8751371875,92.0596875C361.8751371875,91.4746875,361.9351371875,90.46968749999999,362.0701371875,89.0446875C362.5201371875,85.0846875,363.6601271875,81.9796875,365.4901271875,79.7296875C367.3051271875,77.4796875,369.8401171875,75.8746875,373.0951171875,74.8996875C376.3501171875,73.9246885,380.5351171875,73.4296875,385.6651171875,73.4296875L389.6701171875,73.4296875L403.4251171875,73.5346875L402.6151171875,80.1346875C402.3601171875,82.26468750000001,400.5451171875,83.8696875,398.4001171875,83.8696875L384.4201171875,83.8696875C381.5551171875,83.8696875,379.3801171875,84.0196875,377.8801171875,84.3046875C376.3801171875,84.6046875,375.3001171875,85.0846875,374.6102171875,85.7746875C373.9202171875,86.4646875,373.4852171875,87.4546875,373.2901171875,88.7446875C373.2302171875,89.0746875,373.1851171875,89.4946875,373.1851171875,90.0196875C37
3.1851171875,91.0546875,373.4851171875,91.9396875,374.0551171875,92.6446875C374.7001171875,93.4246875,375.6601171875,93.9646875,376.9351171875,94.2496875C378.1951171875,94.5346875,380.5651171875,94.8796875,384.0001171875,95.2696875C388.0951171875,95.6596875,391.4402171875,96.2746875,394.0501171875,97.1296875C396.3301171875,97.9096875,398.0551171875,99.2446875,399.2701171875,101.1346875C400.4701171875,103.0246875,401.0701171875,105.4546875,401.0701171875,108.4546875C401.0701171875,109.8796875,401.0101171875,110.9896875,400.8751171875,111.7696875C400.0951171875,117.81468749999999,397.8151171875,122.0596875,394.0501171875,124.4896875C390.2851171875,126.91968750000001,384.4951171875,128.1496875,376.6951171875,128.1496875L373.3801171875,128.1496875L373.3651171875,128.1196875Z" fill="#000000" fill-opacity="1" style="mix-blend-mode:passthrough"/></g></g><g><g transform="matrix(0.653420627117157,-0.7569950819015503,0.7569950819015503,0.653420627117157,2.759073555469513,39.02634325623512)"><ellipse cx="48.59999990463257" cy="22" rx="4.599999904632568" ry="5.5" fill="#000000" fill-opacity="1" style="mix-blend-mode:passthrough"/></g><g transform="matrix(0.7823152542114258,-0.6228827238082886,0.6228827238082886,0.7823152542114258,11.673297647153959,69.30429944698699)"><ellipse cx="109.490234375" cy="21.751171827316284" rx="4.5" ry="3.799999952316284" fill="#000000" fill-opacity="1" style="mix-blend-mode:passthrough"/></g><g><path 
d="M13.130000372772216,120.75960174560547C9.070000372772217,120.75960174560547,7.860000372772217,119.28960174560547,7.550000172772217,118.05960174560548C7.0900003727722165,116.20960174560547,8.580000372772217,114.12960174560547,11.430000372772216,112.61960174560546C21.120020372772217,107.48960174560547,34.980020372772216,97.32960174560547,42.130020372772215,85.58959974560547L42.58002037277222,84.84960174560547L43.450020372772215,84.86959834560547C44.240020372772214,84.88959504560547,44.910020372772216,85.24959574560548,45.350020372772214,85.87960174560547C46.590020372772216,87.69960174560546,45.57002037277222,91.10960174560547,42.93002037277222,98.44960174560546C41.880020372772215,101.37960174560547,40.79002037277222,104.40960174560547,40.590020372772216,105.98960174560547C39.72002037277222,112.73960174560547,33.76002037277222,116.85960174560546,20.660020372772216,119.75960174560547C17.650020372772218,120.42960174560547,15.120000372772218,120.75960174560547,13.130000372772216,120.75960174560547Z" fill="#B0B7F3" fill-opacity="1"/><path 
d="M20.984220372772217,121.22410174560547L20.985920372772217,121.22380174560547Q40.701520372772215,116.85880174560546,42.07812037277222,106.17800174560547Q42.255220372772214,104.77850174560547,44.307020372772215,99.05340174560547L44.341520372772216,98.95730174560546Q48.395620372772214,87.68567174560548,46.579720372772215,85.02071874560546Q45.461820372772216,83.42004174560547,43.48442037277222,83.36999174560547L41.748420372772216,83.33009174560547L40.84892037277222,84.80936734560547Q36.05152037277222,92.68648174560546,26.771320372772216,100.43170174560547Q19.009920372772214,106.90940174560546,10.727740372772217,111.29410174560547Q5.067750372772217,114.29290174560546,6.0954903727722165,118.42620174560547Q7.061633372772217,122.25960174560547,13.130000372772216,122.25960174560547Q16.331720372772217,122.25960174560547,20.984220372772217,121.22410174560547ZM39.10192037277221,105.80120174560547Q37.995420372772216,114.38570174560547,20.33582037277222,118.29510174560548L20.33412037277222,118.29540174560546Q16.00248037277222,119.25960174560547,13.130010372772217,119.25960174560547Q9.399350372772217,119.25960174560547,9.004520372772216,117.69300174560547Q8.545160372772216,115.84560174560546,12.132260372772217,113.94510174560547Q20.691120372772218,109.41390174560547,28.693620372772216,102.73500174560547Q38.355220372772216,94.67132174560547,43.41112037277222,86.36983174560547L43.411520372772216,86.36911174560547L43.415520372772214,86.36920174560547Q43.87032037277222,86.38072174560547,44.120220372772216,86.73848174560547Q45.054720372772216,88.11024174560546,41.518520372772215,97.94190174560546L41.48292037277222,98.04130174560547Q39.320320372772215,104.07580174560547,39.10192037277221,105.80120174560547Z" fill-rule="evenodd" fill="#000000" fill-opacity="1"/></g><g><path 
d="M105.119969140625,72.2403791809082C103.139999140625,72.2403791809082,101.100009140625,71.4603791809082,99.529999140625,70.1003791809082C98.749999140625,69.4303791809082,91.999999140625,63.180379180908204,94.110000140625,50.460379180908205L94.739999140625,46.6503791809082L96.849999140625,49.890379180908205C97.889999140625,51.500379180908205,99.769999140625,53.2203791809082,101.759999140625,55.0503791809082C104.439969140625,57.5103791809082,107.469969140625,60.3003791809082,109.539969140625,63.7203791809082C111.319969140625,66.6703791809082,110.669969140625,68.7203791809082,110.12996914062501,69.6803791809082C109.219969140625,71.2803791809082,107.359969140625,72.2403791809082,105.12996914062501,72.2403791809082L105.119969140625,72.2403791809082Z" fill="#B0B7F3" fill-opacity="1"/><path d="M105.119969140625,73.7403791809082L105.12996914062501,73.7403791809082Q109.54646914062499,73.7403791809082,111.43736914062501,70.4157791809082Q113.316269140625,67.0754791809082,110.823269140625,62.943679180908205Q108.63036914062499,59.3206791809082,102.775339140625,53.9462591809082L102.696679140625,53.873929180908206Q99.032449140625,50.5045291809082,98.106959140625,49.071809180908204L93.889965140625,42.5964291809082L92.630219140625,50.2149091809082Q91.219869140625,58.717179180908204,94.378749140625,65.4372791809082Q96.125649140625,69.1535791809082,98.547879140625,71.2341791809082Q101.441109140625,73.7403791809082,105.119969140625,73.7403791809082ZM108.822669140625,68.9449791809082Q107.801469140625,70.7403791809082,105.12996914062501,70.7403791809082L105.119969140625,70.7403791809082Q102.559789140625,70.7403791809082,100.512119140625,68.9665791809082Q98.567499140625,67.29617918090821,97.093749140625,64.1609791809082Q94.332310140625,58.2864791809082,95.589779140625,50.7058491809082L95.590029140625,50.7043291809082L95.593039140625,50.708949180908206Q96.720139140625,52.453799180908206,100.666059140625,56.0822291809082L100.744649140625,56.1544991809082Q106.285869140625,61.24077918090820
6,108.256769140625,64.4970791809082Q109.844469140625,67.12837918090821,108.822669140625,68.9449791809082Z" fill-rule="evenodd" fill="#000000" fill-opacity="1"/></g><g><path d="M35.3707,124.66C33.6507,117.22,36.0507,109.3,38.3607,101.64C39.8107,96.83,41.1907,92.28,41.3607,88.33C41.7307,79.78,43.5307,63.15,44.1807,57.33C43.2407,57.69,41.8507,58.38,39.9707,59.67C36.0907,62.35,24.2907,70.5,14.42069,70.5C13.86069,70.5,13.310690000000001,70.47,12.79069,70.42C9.810690000000001,70.13,7.450694,67.81,7.0506941,64.78C6.820694,63.01,6.9106941,56.93,17.9007,52.82C20.9907,51.68,23.8807,49.35,27.5407,46.39C30.2907,44.17,33.4107,41.65,37.390699999999995,38.98C38.7107,38.08,40.1307,37.47,41.5007,36.88C45.2807,35.25,48.5407,33.85,49.9207,26.74C52.2007,15,55.1007,8.969999999999999,66.2907,4.26C70.5707,2.46,75.7007,1.5,81.1107,1.5C88.7307,1.5,96.3207,3.4,101.4207,6.57C109.871,11.83,112.681,23.31,111.921,29.41C111.021,36.63,106.6007,42,97.6207,46.75C95.6507,54.46,93.7907,65.3,95.5607,74.59C95.7507,74.91,96.0807,75.47,96.1507,75.59C97.9007,78.21,100.3607,79.8,102.7307,81.33C105.1807,82.91,107.721,84.55,109.321,87.27C110.451,89.25,110.681,92.38,110.011,96.85L109.831,98.1L108.571,98.13C100.9007,98.32,83.5,99.5,73.8907,104.99C64.28139999999999,110.48,47.5807,117.84,37.7607,125.51L35.9007,126.96L35.3707,124.67L35.3707,124.66Z" fill="url(#master_svg1_14_05037)" fill-opacity="1"/><path 
d="M33.8708,124.827Q32.1566,117.018,36.904399999999995,101.2737L36.9246,101.2069L36.9597,101.0905Q39.7016,91.9954,39.8621,88.2655Q40.2443,79.433,42.3872,59.8978Q41.6441,60.3409,40.8194,60.9068Q24.7591,72,14.42069,72Q13.55073,72,12.647120000000001,71.9131Q6.399075,71.3051,5.5636,64.97630000000001Q5.16425,61.9031,6.9856073,58.8997Q9.80846,54.2448,17.3753,51.415Q20.2562,50.3522,26.5974,45.2237L26.6182,45.2069Q32.5295,40.4349,36.555,37.7343Q37.9788,36.7635,40.8885,35.5104L40.9074,35.5023L40.9168,35.4983Q44.3026,34.0382,45.5597,32.794799999999995Q47.6115,30.7651,48.4482,26.4542Q50.3351,16.7384,53.5217,12.0238Q57.316,6.41009,65.7088,2.8774800000000003Q72.5508,0,81.1107,0Q93.6921,0,102.2126,5.29604Q108.435,9.16913,111.523,17.1326Q114.126,23.844,113.409,29.5955Q112.038,40.5916,98.9132,47.7583Q95.0259,63.339,96.9865,74.0541L97.0869,74.2237L97.4207,74.7906Q99.0102,77.1427,103.5443,80.0698Q108.831,83.4793,110.614,86.5095Q112.565,89.9284,111.494,97.0724L111.135,99.5694L108.606,99.6296Q85.2858,100.2073,74.6348,106.292Q71.5751,108.04,63.8062,112.028Q45.295,121.529,38.684,126.692L34.969300000000004,129.588L33.8708,124.842L33.8708,124.827ZM36.8341,124.33L36.8321,124.322Q35.2071,117.293,39.7767,102.14L39.7968,102.073L39.832,101.956Q42.6826,92.5007,42.8593,88.3945Q43.2633,79.0578,45.6714,57.4965L45.9449,55.0481L43.6442,55.9292Q41.6684,56.6859,39.122,58.4332Q23.8237,69,14.42069,69Q13.69459,69,12.93427,68.9269Q9.061399999999999,68.55,8.53779,64.5837Q7.71239,58.2316,18.426099999999998,54.2249Q21.7583,52.9956,28.4839,47.5563L28.5026,47.5412Q34.3108,42.8524,38.2264,40.2257Q39.4305,39.4047,42.0751,38.2658L42.094,38.2577L42.1047,38.2531Q46.0101,36.569,47.6695,34.9276Q50.38,32.246300000000005,51.3932,27.0258Q53.1633,17.9112,56.0072,13.7037Q59.302,8.82909,66.8726,5.64252Q73.1559,3,81.1107,3Q92.8358,3,100.6288,7.84396Q105.9994,11.18706,108.726,18.2174Q111.055,24.2236,110.432,29.2246Q109.224,38.9154,96.9193,45.4241L96.3319,45.7348L96.1674,46.3787Q91.866,63.2128,94.0872,74.8707L94.1363,75.1283L9
4.5052,75.7517L94.8797,76.3877L94.9034,76.4231Q96.8282,79.3049,101.9171,82.5902Q106.6065,85.6144,108.028,88.0305Q109.444,90.5122,108.527,96.6277L108.527,96.6306Q84.4528,97.228,73.1466,103.688Q70.1454,105.402,62.4364,109.359Q43.6724,118.989,36.8374,124.328L36.8341,124.33Z" fill-rule="evenodd" fill="#000000" fill-opacity="1"/></g><g><path d="M79,82.38669999999999C71,83.38669999999999,66.00000622869,79.3869,66.00000311434,71.8868C66,64.3867,67,52.3867,72.49993,43.88664C79.3399,37.74664,96.5299,41.226865,95.9999,44.38687C93,49.88672,91.21039999999999,63.414100000000005,90,71.8867C89.5,75.38669999999999,87,81.38669999999999,79,82.38669999999999Z" fill="#FFFFFF" fill-opacity="1" style="mix-blend-mode:passthrough"/></g><g><path d="M92.40000255126954,33.64025049560547C94.50000255126953,31.04025049560547,97.00000255126953,29.24025049560547,97.10000255126953,27.840250495605467C97.20000255126953,26.44025049560547,92.40000255126954,24.340251495605468,90.50000255126953,24.64025119560547C88.60000255126953,24.94025049560547,84.19999255126953,27.840250495605467,84.30000255126953,29.44025049560547C84.40001255126953,31.04025049560547,88.00000255126953,32.54025049560547,89.80000255126953,33.84025049560547C89.60000255126953,35.64022049560547,86.65000255126954,37.41022049560547,84.39000255126953,37.41022049560547C81.37000255126954,37.41022049560547,77.88999955126953,35.71022049560547,77.68000035126953,36.510220495605466C77.41000355126953,37.56022049560547,80.78000255126953,39.14022049560547,84.39000255126953,39.14022049560547C86.90999255126953,39.14022049560547,88.48000255126954,38.24022049560547,89.78000255126953,37.07022049560547C90.99000255126953,35.99022049560547,91.91000255126953,36.54022049560547,92.33000255126953,36.88022049560547C93.31000255126953,37.68022049560547,95.30000255126953,38.24022049560547,96.91000255126953,38.24022049560547C98.75000255126953,38.24022049560547,101.70000255126953,36.77022049560547,101.42000255126953,36.120220495605466C100.97000255126953,35.050220495605
47,98.79000255126954,36.64022049560547,97.17000255126953,36.64022049560547C93.80000255126953,36.64022049560547,92.08000255126953,34.74022049560547,92.38000255126953,33.64025049560547L92.40000255126954,33.64025049560547Z" fill="#000000" fill-opacity="1" style="mix-blend-mode:passthrough"/></g><g><ellipse cx="73.29999995231628" cy="22.87967801094055" rx="2.299999952316284" ry="3.200000047683716" fill="#000000" fill-opacity="1" style="mix-blend-mode:passthrough"/></g><g><ellipse cx="103.39000082015991" cy="22.879873275756836" rx="2.0999999046325684" ry="3" fill="#000000" fill-opacity="1" style="mix-blend-mode:passthrough"/></g><g><path d="M79.02001440429687,105.64Q76.50000440429687,106.52,68.26999640429688,109.8Q67.65999640429688,108.05,68.91999840429688,105.35C73.01999440429688,98.05,70.61999440429688,92.75,85.42001440429688,89.25C80.22001440429688,91.85,79.42001440429688,95.15,79.32001440429687,101.25Q79.34001440429688,103.78,79.02001440429687,105.65L79.02001440429687,105.64Z" fill="#000000" fill-opacity="1" style="mix-blend-mode:passthrough"/></g><g><path 
d="M0.0000113502,139.93639237060546C7.89472,141.46179237060545,19.707,142.57839237060546,27.0463,140.78379237060545C37.6982,138.18169237060548,49.2924,126.80629237060546,63.7428,114.89259237060547C78.1933,102.97880237060546,103.246,95.78070237060547,123.132,102.60992237060547C128.874,104.58389237060547,131.81,108.53189237060548,131.81,108.63159237060547C130.124,107.53499237060547,128.339,106.33859237060547,126.553,105.54099237060547C118.024,101.65284237060547,107.808,101.55314237060547,98.7831,103.64679237060547C97.4937,103.94589237060546,96.3036,104.24499237060547,95.0142,104.64369237060546C93.8241,105.04249237060547,92.5347,105.44129237060547,91.3446,105.84009237060548C90.1544,106.23889237060547,88.9643,106.73739237060546,87.7741,107.23589237060547C86.5839,107.73429237060547,85.3938,108.33249237060546,84.3028,108.83099237060547C83.1127,109.42919237060546,82.0217,110.02739237060547,80.9307,110.72519237060547C79.8397,111.42309237060547,78.7487,112.02129237060547,77.6578,112.81889237060547C76.5668,113.51679237060547,75.575,114.31429237060547,74.484,115.01219237060548C73.4922,115.80979237060546,72.4012,116.60739237060547,71.5086,117.40489237060547C67.7398,120.69489237060547,64.2685,124.38369237060547,60.7972,127.87309237060546C46.912,142.72789237060547,36.7957,150.70359237060546,15.4721,151.40149237060547C12.0008,151.50119237060545,3.37213,151.20209237060547,0.0991972,149.50729237060546C23.8032,167.15359237060545,60.2021,172.53729237060548,79.1455,171.34089237060547C99.0806,170.14449237060546,122.487,161.67029237060547,128.835,145.91819237060548C116.239,136.54669237060546,112.271,151.89999237060547,102.155,146.81549237060545C92.0388,141.73089237060546,98.2871,129.46829237060547,107.908,121.99099237060547C117.528,114.51369237060547,124.282,115.18169237060548,127.833,116.64719237060547C133.377,118.93029237060547,133.506,123.28699237060547,133.198,124.38369237060547C132.812,125.77949237060547,131.711,126.67669237060547,130.62,127.57399237060547C133.694,127.07549237060547
,135.678,126.37759237060547,138.058,123.98489237060546C139.943,121.39279237060546,140.339,118.40189237060547,139.744,115.31129237060547C138.752,109.62859237060547,135.48,104.24499237060547,130.818,101.05466237060547C119.512,93.07891737060547,105.101,92.72998037060547,91.2057,94.14567237060547C75.5551,95.74082237060547,56.473,106.41829237060547,41.2191,118.16259237060547C31.9855,125.27099237060547,16.1663,136.85579237060546,0,139.94639237060545L0.0000113502,139.93639237060546ZM77.0627,140.93339237060547C67.5414,151.99969237060546,51.6032,159.35729237060548,35.6055,158.18089237060548C27.8695,157.61259237060546,22.8114,156.48609237060546,21.7204,155.48909237060548C34.0187,156.88479237060545,48.5981,152.49819237060547,54.3505,148.80939237060545C60.1029,145.12059237060546,67.0455,136.44699237060547,70.8144,133.85489237060546C74.5832,131.26279237060547,76.7651,131.86099237060546,78.0545,132.75819237060546C80.2364,134.15399237060547,79.6414,137.94249237060546,77.0627,140.93339237060547Z" fill="url(#master_svg2_14_2646)" fill-opacity="1" style="mix-blend-mode:passthrough"/></g><g><path d="M72,45.5C76,47.5,81.50006,49.00005,87,49Q92.4999,48.99995,96.9999,47.49995L99.5,44Q89,49,72,45.5Z" fill="#000000" fill-opacity="1"/></g></g></g></svg>
\ No newline at end of file
diff --git a/fluss-rust/website/static/img/release-guide.png b/fluss-rust/website/static/img/release-guide.png
new file mode 100644
index 0000000000..bf7602ddee
Binary files /dev/null and b/fluss-rust/website/static/img/release-guide.png differ
diff --git a/fluss-rust/website/static/manifest.json b/fluss-rust/website/static/manifest.json
new file mode 100644
index 0000000000..7cd3b569db
--- /dev/null
+++ b/fluss-rust/website/static/manifest.json
@@ -0,0 +1,17 @@
+{
+  "short_name": "Fluss Clients",
+  "name": "Apache Fluss Clients: Rust, Python, and C++",
+  "description": "Rust, Python, and C++ clients for Apache Fluss",
+  "start_url": "/",
+  "scope": "/",
+  "display": "standalone",
+  "background_color": "#000000",
+  "theme_color": "#0071e3",
+  "icons": [
+    {
+      "src": "img/logo/svg/colored_logo.svg",
+      "sizes": "any",
+      "type": "image/svg+xml"
+    }
+  ]
+}
diff --git a/fluss-rust/website/tsconfig.json b/fluss-rust/website/tsconfig.json
new file mode 100644
index 0000000000..d250afaedd
--- /dev/null
+++ b/fluss-rust/website/tsconfig.json
@@ -0,0 +1,6 @@
+{
+  "extends": "@docusaurus/tsconfig",
+  "compilerOptions": {
+    "baseUrl": "."
+  }
+}
diff --git a/pom.xml b/pom.xml
index d39a91d9b3..0c5c4dcc63 100644
--- a/pom.xml
+++ b/pom.xml
@@ -691,6 +691,8 @@
                         <exclude>website/static/**</exclude>
                         <exclude>website/build/**</exclude>
                         <exclude>website/node_modules/**</exclude>
+                        <!-- Rust client: own license enforcement (skywalking-eyes + cargo-deny) -->
+                        <exclude>fluss-rust/**</exclude>
                     </excludes>
                 </configuration>
             </plugin>
diff --git a/website/community/dev/building.md b/website/community/dev/building.md
index 963ac39917..a891c84b6b 100644
--- a/website/community/dev/building.md
+++ b/website/community/dev/building.md
@@ -52,4 +52,40 @@ mvn clean install -DskipTests -T 1C
 
 **NOTE**:
 - For local testing, it's recommend to use directory `${project}/build-target` in project.
-- For deploying distributed cluster, it's recommend to use binary file named `fluss-xxx-bin.tgz`, the file is in directory `${project}/fluss-dist/target`.
\ No newline at end of file
+- For deploying a distributed cluster, it's recommended to use the binary file named `fluss-xxx-bin.tgz`, which is located in the directory `${project}/fluss-dist/target`.
+
+## Building the Rust client (fluss-rust)
+
+The Rust client, language bindings, and examples live under `fluss-rust/` and build with Cargo. You need **Rust** (the toolchain pinned in `fluss-rust/rust-toolchain.toml`, currently 1.85+) and **protoc**, the Protobuf compiler — `build.rs` compiles the canonical `fluss-rpc/src/main/proto/FlussApi.proto`.
+
+```bash
+# protoc (pick one)
+brew install protobuf                    # macOS
+sudo apt-get install protobuf-compiler   # Debian/Ubuntu
+
+cd fluss-rust
+cargo build --workspace --all-targets    # build everything
+cargo test --workspace                    # unit tests
+```
+
+Integration tests start a Fluss cluster via Docker:
+
+```bash
+RUST_TEST_THREADS=1 cargo test --features integration_tests --workspace
+```
+
+The Python and C++ bindings build on top of the Rust crate:
+
+```bash
+(cd fluss-rust/bindings/python && uv sync --extra dev && uv run maturin develop)   # Python
+(cd fluss-rust/bindings/cpp && cmake -B build && cmake --build build)              # C++
+```
+
+Before pushing, run the same checks CI does:
+
+```bash
+cd fluss-rust
+cargo fmt --all -- --check
+cargo clippy --all-targets --workspace -- -D warnings
+cargo deny check licenses
+```
\ No newline at end of file
diff --git a/website/community/dev/ide-setup.md b/website/community/dev/ide-setup.md
index 3d157a5c50..2914dd8467 100644
--- a/website/community/dev/ide-setup.md
+++ b/website/community/dev/ide-setup.md
@@ -221,3 +221,12 @@ Go to "Settings" → "Build, Execution, Deployment" → "Compiler" → "Java Com
 This happens if Fluss dependencies are set to "provided", resulting in them not being available
 on the classpath. You can either check "Include dependencies with 'Provided' scope" in your
 run configuration, or create a test that calls the `main()` method of the example.
+
+## RustRover
+
+For the Rust client under `fluss-rust/`, we recommend [RustRover](https://www.jetbrains.com/rust/).
+
+1. Open RustRover, choose **Open**, and select the `fluss-rust/` directory as the project root so Cargo resolves the workspace.
+2. RustRover uses the toolchain pinned in `fluss-rust/rust-toolchain.toml` (install it with `rustup` if prompted). You also need **protoc** on your `PATH` — see [Building the Rust client](/community/dev/building).
+3. Enable **Rustfmt** and **Clippy** under Settings → Rust so local formatting and lints match CI (`cargo fmt --all -- --check`, `cargo clippy --all-targets --workspace -- -D warnings`).
+4. Apply the Apache license header to new files — the boilerplate is in `fluss-rust/copyright.txt`; configure it under Settings → Editor → Copyright.
diff --git a/website/community/how-to-contribute/contribute-code.md b/website/community/how-to-contribute/contribute-code.md
index affadb37d5..31ac2418f3 100644
--- a/website/community/how-to-contribute/contribute-code.md
+++ b/website/community/how-to-contribute/contribute-code.md
@@ -32,6 +32,7 @@ Implement the change according to the Code Style and Quality (refer to the [Flin
 
 1. Only start working on the implementation if there is consensus on the approach (e.g. you are assigned to the ticket)
 2. If you are newer, can refer to [ide setup](/community/dev/ide-setup) to setup a Fluss dev environment.
+3. For Rust client changes (under `fluss-rust/`), see [Building the Rust client](/community/dev/building) and the [RustRover IDE setup](/community/dev/ide-setup).
 
 ### Review
 Create the pull request and work with the reviewer. 
@@ -53,7 +54,7 @@ Considerations before opening a pull request:
 
 - Fill out the pull request template to describe the changes contributed by the pull request. Please describe it such that the reviewer understands the problem and solution from the description, not only from the code. That will give reviewers the context they need to do the review.
 
-- Make sure that the change passes the automated tests, i.e., `mvn clean verify` passes.
+- Make sure that the change passes the automated tests, i.e., `mvn clean verify` passes. For Rust client changes under `fluss-rust/`, make sure `cargo build --workspace --all-targets`, `cargo test --workspace`, `cargo fmt --all -- --check`, and `cargo clippy --all-targets --workspace -- -D warnings` pass (see [Building the Rust client](/community/dev/building)).
 
 - Each pull request should address only one issue, not mix up code from multiple issues.
 
diff --git a/website/community/how-to-release/creating-a-fluss-release.mdx b/website/community/how-to-release/creating-a-fluss-release.mdx
index e66e3a46b5..3eb15b4b26 100644
--- a/website/community/how-to-release/creating-a-fluss-release.mdx
+++ b/website/community/how-to-release/creating-a-fluss-release.mdx
@@ -503,6 +503,28 @@ This ensures that the **Quickstart guide** in the documentation references the c
 git push origin $TAG
 ```
 
+### 9. Publish client RC artifacts (crates.io / PyPI)
+
+The Rust, Python, and C++ clients live in `fluss-rust/` and release from this repository under the **same version and tag** — there is no separate client release.
+
+Before tagging, make sure the client versions match `${RELEASE_VERSION}`:
+
+```bash
+# Rust workspace version (also drives the python/cpp/elixir-NIF crates via version.workspace)
+fluss-rust/scripts/bump-version.sh <previous_version> ${RELEASE_VERSION}
+# The Elixir binding has its own version attribute — set @version "${RELEASE_VERSION}" in
+#   fluss-rust/bindings/elixir/mix.exs
+# Regenerate the Rust dependency/license audit (cargo-deny):
+(cd fluss-rust && python3 scripts/dependencies.py)
+```
+
+Pushing the **RC tag** (`v${RELEASE_VERSION}-rc${RC_NUM}`) triggers the client workflows:
+
+- `python-release.yml` builds the wheels + sdist and publishes them to **TestPyPI**.
+- `rust-release.yml` runs `cargo publish --dry-run` to validate the crate. The canonical proto is vendored into the crate automatically (`fluss-rust/scripts/vendor-proto.sh`): `build.rs` normally reads it from the in-repo `fluss-rpc` module, which lies outside the crate directory and would therefore be missing from the packaged crate.
+
+The final publish to **crates.io** and **PyPI** happens when the release tag is pushed (see the **Git tag** step under [Finalize the release](#finalize-the-release)). Publishing uses the CI secrets `CARGO_REGISTRY_TOKEN`, `PYPI_API_TOKEN`, and `TEST_PYPI_API_TOKEN`.
+
 -------------
 
 **Checklist to proceed to the next step**
@@ -512,6 +534,7 @@ git push origin $TAG
 - RC Docker images pushed to DockerHub ([apache/fluss](https://hub.docker.com/r/apache/fluss/tags), [apache/fluss-quickstart-flink](https://hub.docker.com/r/apache/fluss-quickstart-flink/tags))
 - RC tag pushed to the [official repository](https://github.com/apache/fluss/tags)
 - Updated `dockerVersion` in `fluss-versions.json` on `main` branch
+- Client RC artifacts published to TestPyPI, and `cargo publish --dry-run` for `fluss-rs` is green
 
 
 ## Vote on the release candidate
@@ -774,6 +797,8 @@ git tag -s "v${RELEASE_VERSION}" refs/tags/${TAG}^{} -m "Release Fluss ${RELEASE
 git push origin refs/tags/v${RELEASE_VERSION}
 ```
 
+Pushing the `v${RELEASE_VERSION}` tag also triggers the client release workflows: `rust-release.yml` publishes `fluss-rs` to **crates.io**, and `python-release.yml` publishes the wheels + sdist to **PyPI**.
+
 ### 5. Publish Docker Images
 
 :::note
diff --git a/website/community/how-to-release/pre-rc-checklist.md b/website/community/how-to-release/pre-rc-checklist.md
new file mode 100644
index 0000000000..658bc4a778
--- /dev/null
+++ b/website/community/how-to-release/pre-rc-checklist.md
@@ -0,0 +1,38 @@
+---
+title: Pre-RC Checklist
+sidebar_position: 1.5
+---
+
+# Pre-RC Checklist
+
+Run through this before cutting a release candidate. It catches the problems that are expensive to discover mid-vote, now that a single tag releases Java, Rust, Python, and C++ together.
+
+## Access and secrets
+
+- [ ] Maven Central (Apache Nexus) access for `org.apache.fluss` — see [Release Manager Preparation](release-manager-preparation.md)
+- [ ] `CARGO_REGISTRY_TOKEN`, `PYPI_API_TOKEN`, and `TEST_PYPI_API_TOKEN` configured as repository secrets
+- [ ] You are a crates.io owner of `fluss-rs` and a PyPI maintainer of `pyfluss`
+- [ ] GPG key published to the Apache KEYS file
+
+## Build and publish dry-runs
+
+- [ ] `cargo publish -p fluss-rs --dry-run` succeeds (run after `fluss-rust/scripts/vendor-proto.sh` so the proto is vendored)
+- [ ] Python wheels + sdist install from **TestPyPI** (the RC tag publishes there):
+
+  ```bash
+  pip install -i https://test.pypi.org/simple/ pyfluss==${RELEASE_VERSION}
+  ```
+
+- [ ] `fluss-cpp` Bazel build smoke test passes:
+
+  ```bash
+  cd fluss-rust/bindings/cpp && bazel build //...
+  ```
+
+## Audits
+
+- [ ] `cargo deny check licenses` passes; the Rust dependency list is regenerated and committed
+- [ ] Java + Rust + binding CI is green on the release branch
+- [ ] `LICENSE` / `NOTICE` cover any third-party content bundled in the source release (including under `fluss-rust/`)
+
+Once these pass, proceed to [Creating a Fluss Release](creating-a-fluss-release.mdx).
diff --git a/website/community/how-to-release/release-manager-preparation.md b/website/community/how-to-release/release-manager-preparation.md
index 700b3bf45e..85123029d4 100644
--- a/website/community/how-to-release/release-manager-preparation.md
+++ b/website/community/how-to-release/release-manager-preparation.md
@@ -210,6 +210,28 @@ which tar
 ```
 
 
+## Rust, Python, and C++ client publishing
+
+A unified release also publishes the Rust, Python, and C++ clients (under `fluss-rust/`). As release manager, make sure the following access and CI secrets are in place.
+
+### Registry access
+
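+- **crates.io** (`fluss-rs`): your account must be an owner of the crate. Log in with `cargo login <token>`, confirm ownership with `cargo owner --list fluss-rs`, then check packaging with `cargo publish -p fluss-rs --dry-run` (the dry run validates the package but does not check publish permissions).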
+- **PyPI / TestPyPI** (`pyfluss`): confirm you are a maintainer at https://pypi.org/project/pyfluss/, and generate API tokens on both PyPI and TestPyPI.
+- **Hex.pm** (Elixir `fluss`): post-1.0 only — not published yet.
+
+### GitHub Actions secrets
+
+Publishing is automated by the `rust-release.yml` and `python-release.yml` workflows when a version tag is pushed. Configure these repository secrets:
+
+- `CARGO_REGISTRY_TOKEN` — crates.io API token
+- `PYPI_API_TOKEN` — PyPI token (final release)
+- `TEST_PYPI_API_TOKEN` — TestPyPI token (release candidates)
+
+### Toolchain for the dependency audit
+
+Regenerating the Rust dependency/license list (an ASF requirement) needs the toolchain in `fluss-rust/rust-toolchain.toml`, [cargo-deny](https://embarkstudios.github.io/cargo-deny/), and Python 3.11+ (for `fluss-rust/scripts/dependencies.py`).
+
 ## Further reading
 
 It's recommended but not mandatory to read following documents before making a release to know more details about apache release:
diff --git a/website/community/how-to-release/verifying-a-fluss-release.md b/website/community/how-to-release/verifying-a-fluss-release.md
index a1649c17c7..c7b4dc7863 100644
--- a/website/community/how-to-release/verifying-a-fluss-release.md
+++ b/website/community/how-to-release/verifying-a-fluss-release.md
@@ -73,6 +73,38 @@ Unzip the source release archive, and verify that:
 5. The LICENSE and NOTICE files in the root directory refer to dependencies in the source release, i.e., files in the git repository (such as fonts, css, JavaScript, images)
 
 
+## Verifying the clients (Rust / Python / C++)
+
+The Rust, Python, and C++ clients ship in the same source release under `fluss-rust/`. Build them from the extracted source archive — you need **Rust** (see `fluss-rust/rust-toolchain.toml` for the expected version), plus **protobuf** and, for the Python binding, **Python 3.9+**:
+
+```bash
+cd fluss-rust
+cargo build --workspace --release
+```
+
+Per-language verification:
+
+- **Rust:** build from the source release (above), or depend on the RC tag in a throwaway project (`fluss-rs = { git = "https://github.com/apache/fluss", tag = "v${RELEASE_VERSION}-rc${RC_NUM}" }`), then write a few test cases (connect, create table, read/write). Installation: https://fluss.apache.org/docs/apis/rust/installation/
+- **Python:** for an RC, install from **TestPyPI** (`pip install -i https://test.pypi.org/simple/ pyfluss==${RELEASE_VERSION}`) and write test cases. Installation: https://fluss.apache.org/docs/apis/python/installation/
+- **C++:** build and link the C++ client from `fluss-rust/bindings/cpp/`, then verify. Installation: https://fluss.apache.org/docs/apis/cpp/installation/
+
+The Rust workspace's dependency licenses are checked with [cargo-deny](https://embarkstudios.github.io/cargo-deny/); the release manager regenerates the dependency audit before the release.
+
+## Release artifacts and publish targets
+
+A release publishes to several registries; confirm each one carries the release version:
+
+| Component | Target | Identifier |
+|-----------|--------|------------|
+| Java / Scala | Maven Central (via Apache Nexus staging) | `org.apache.fluss:fluss-*` |
+| Rust | [crates.io](https://crates.io/crates/fluss-rs) | `fluss-rs` |
+| Python | [PyPI](https://pypi.org/project/pyfluss/) (RC → [TestPyPI](https://test.pypi.org/project/pyfluss/)) | `pyfluss` |
+| C++ | source archive only (no registry) | — |
+| Elixir | Hex.pm (post-1.0; not yet published) | `fluss` |
+| Docker | Docker Hub | `apache/fluss`, `apache/fluss-quickstart-flink` |
+
+Source archives, signatures, and checksums are on [dist.apache.org](https://dist.apache.org/repos/dist/dev/incubator/fluss/) (dev) and, after the vote, on [downloads.apache.org](https://downloads.apache.org/incubator/fluss/).
+
 ## Testing Against Staged Maven Artifacts
 
 Update the root `pom.xml` of the maven project (like the apache/fluss project) to include the staged repository in the `<repositories>` section. You can do this by adding a new repository entry like this:
diff --git a/website/docs/apis/client-support-matrix.md b/website/docs/apis/client-support-matrix.md
index cb932f2d42..f4d31733c5 100644
--- a/website/docs/apis/client-support-matrix.md
+++ b/website/docs/apis/client-support-matrix.md
@@ -1,6 +1,6 @@
 ---
 title: "Client Support Matrix"
-sidebar_position: 5
+sidebar_position: 6
 ---
 
 # Client Feature Support Matrix
@@ -11,7 +11,7 @@ Fluss has a rich set of features and native data types available to users. The f
 
 These data operations are available under TableAppend, TableScan, TableUpsert and TableLookup interfaces.
 
-| Table Type   | Operations                 | [Java Client](/apis/java-client.md) | Rust Client | Python Client | C++ Client |
+| Table Type   | Operations                 | [Java Client](./java/index.md) | Rust Client | Python Client | C++ Client |
 |--------------|----------------------------|-------------------------------------|-------------|---------------|------------|
 | Log          | Append                     | ✔️                                  | ✔️          | ✔️            | ✔️         |
 | Log          | Typed Append               | ✔️                                  |             |               |            |
@@ -30,14 +30,14 @@ These data operations are available under TableAppend, TableScan, TableUpsert an
 | Primary Key  | Batch Scan (Snapshot)      | ✔️                                  |             |               |            |
 
 :::tip
-For more details, see [Table Overview](/table-design/overview.md).
+For more details, see [Table Overview](../table-design/overview.md).
 :::
 
 ## Data Types
 
 Client support for Fluss data types are as follows:
 
-| DataType                                                        | [Java Client](/apis/java-client.md) | Rust Client | Python Client | C++ Client |
+| DataType                                                        | [Java Client](./java/index.md) | Rust Client | Python Client | C++ Client |
 |-----------------------------------------------------------------|-------------------------------------|-------------|---------------|------------|
 | BOOLEAN                                                         | ✔️                                  | ✔️          | ✔️            | ✔️         |
 | TINYINT                                                         | ✔️                                  | ✔️          | ✔️            | ✔️         |
@@ -63,14 +63,14 @@ Client support for Fluss data types are as follows:
 | ROW\<n0 t0, n1 t1, ...\><br/>ROW\<n0 t0 'd0', n1 t1 'd1', ...\> | ✔️                                  |             |               |            |
 
 :::tip
-For more details, see [Data Types](table-design/data-types.md).
+For more details, see [Data Types](../table-design/data-types.md).
 :::
 
 ## Admin Operations
 
 Admin operations are available under FlussAdmin interface.
 
-| Entity    | Operations             | [Java Client](/apis/java-client.md) | Rust Client | Python Client | C++ Client |
+| Entity    | Operations             | [Java Client](./java/index.md) | Rust Client | Python Client | C++ Client |
 |-----------|------------------------|-------------------------------------|-------------|---------------|------------|
 | Database  | CreateDatabase         | ✔️                                  | ✔️          |               |            |
 | Database  | DropDatabase           | ✔️                                  | ✔️          |               |            |
@@ -103,12 +103,12 @@ Admin operations are available under FlussAdmin interface.
 
 ## Data Lake Formats
 
-| Format  | [Java Client](/apis/java-client.md) | Rust Client | Python Client | C++ Client |
+| Format  | [Java Client](./java/index.md) | Rust Client | Python Client | C++ Client |
 |---------|-------------------------------------|-------------|---------------|------------|
 | Iceberg | ✔️                                  |             |               |            |
 | Lance   | ✔️                                  | ✔️          |               |            |
 | Paimon  | ✔️                                  |             |               |            |
 
 :::tip
-For more details, see [Streaming Lakehouse](/streaming-lakehouse/overview.md).
+For more details, see [Streaming Lakehouse](../streaming-lakehouse/overview.md).
 :::
\ No newline at end of file
diff --git a/website/docs/apis/cpp-client.md b/website/docs/apis/cpp-client.md
deleted file mode 100644
index 420ea6f099..0000000000
--- a/website/docs/apis/cpp-client.md
+++ /dev/null
@@ -1,70 +0,0 @@
----
-title: "C++ Client"
-sidebar_position: 4
----
-
-# Fluss C++ Client
-
-The Fluss C++ Client provides a high-performance, synchronous interface for
-interacting with Fluss clusters. It manages an internal Tokio runtime and
-supports Apache Arrow for efficient data interchange.
-
-The client provides two main APIs:
-
-- **[Admin API](https://clients.fluss.apache.org/user-guide/cpp/api-reference#admin)**: For managing databases, tables, and partitions.
-- **[Table API](https://clients.fluss.apache.org/user-guide/cpp/api-reference#table)**: For reading and writing to Log and Primary Key tables.
-
-## Installation
-
-The C++ client is not yet published as a package and must be built from source.
-
-**Prerequisites:** CMake 3.22+, C++17 compiler, Rust 1.85+, Apache Arrow C++ library
-
-Install dependencies:
-```bash
-# macOS
-brew install cmake arrow
-
-# Ubuntu/Debian
-sudo apt-get install cmake libarrow-dev
-```
-```bash
-git clone https://github.com/apache/fluss-rust.git
-cd fluss-rust/bindings/cpp
-mkdir -p build && cd build
-cmake -DCMAKE_BUILD_TYPE=Release ..
-cmake --build .
-```
-
-For full build options including CMake integration into your own project, see the
-[C++ client installation guide](https://clients.fluss.apache.org/user-guide/cpp/installation).
-
-## Quick Example
-```cpp
-#include "fluss.hpp"
-
-int main() {
-    fluss::Configuration config;
-    config.bootstrap_servers = "127.0.0.1:9123";
-
-    fluss::Connection conn;
-    fluss::Result result = fluss::Connection::Create(config, conn);
-    if (!result.Ok()) {
-        std::cerr << "Connection failed: " << result.error_message << std::endl;
-        return 1;
-    }
-
-    fluss::Admin admin;
-    conn.GetAdmin(admin);
-
-    return 0;
-}
-```
-
-For more examples, see the [Fluss C++ Client documentation](https://clients.fluss.apache.org/user-guide/cpp/example/).
-
-## Full Documentation
-
-For the complete C++ client reference including all configuration options,
-API methods, data types, error handling, and worked examples — see the
-**[Fluss C++ Client documentation](https://clients.fluss.apache.org/user-guide/cpp/installation)**.
\ No newline at end of file
diff --git a/website/docs/apis/cpp/_category_.json b/website/docs/apis/cpp/_category_.json
new file mode 100644
index 0000000000..d32653e102
--- /dev/null
+++ b/website/docs/apis/cpp/_category_.json
@@ -0,0 +1,4 @@
+{
+  "label": "C++",
+  "position": 5
+}
diff --git a/website/docs/apis/cpp/api-reference.md b/website/docs/apis/cpp/api-reference.md
new file mode 100644
index 0000000000..e9b94c9d9e
--- /dev/null
+++ b/website/docs/apis/cpp/api-reference.md
@@ -0,0 +1,731 @@
+---
+sidebar_position: 2
+---
+# API Reference
+
+Complete API reference for the Fluss C++ client.
+
+## `Result`
+
+| Field / Method  | Type          | Description                                                    |
+|-----------------|---------------|----------------------------------------------------------------|
+| `error_code`    | `int32_t`     | 0 for success, non-zero for errors                             |
+| `error_message` | `std::string` | Human-readable error description                               |
+| `Ok()`          | `bool`        | Returns `true` if operation succeeded (`error_code == 0`)      |
+
+## `Configuration`
+
+| Field                                 | Type          | Default              | Description                                                                              |
+|---------------------------------------|---------------|----------------------|------------------------------------------------------------------------------------------|
+| `bootstrap_servers`                   | `std::string` | `"127.0.0.1:9123"`   | Coordinator server address                                                               |
+| `writer_request_max_size`             | `int32_t`     | `10485760` (10 MB)   | Maximum request size in bytes                                                            |
+| `writer_acks`                         | `std::string` | `"all"`              | Acknowledgment setting (`"all"`, `"0"`, `"1"`, or `"-1"`)                                |
+| `writer_retries`                      | `int32_t`     | `INT32_MAX`          | Number of retries on failure                                                             |
+| `writer_batch_size`                   | `int32_t`     | `2097152` (2 MB)     | Batch size for writes in bytes. Upper bound when dynamic sizing is on; fixed batch size when off |
+| `writer_dynamic_batch_size_enabled`   | `bool`        | `true`               | Enable per-table dynamic batch sizing: the target grows by 10% when batches fill above 80% and shrinks by 5% when they fill below 50% |
+| `writer_dynamic_batch_size_min`       | `int32_t`     | `262144` (256 KB)    | Lower bound for the dynamic batch size estimator (ignored when disabled)                 |
+| `writer_batch_timeout_ms`             | `int64_t`     | `100`                | Maximum time in ms to wait for a writer batch to fill up before sending                  |
+| `writer_bucket_no_key_assigner`       | `std::string` | `"sticky"`           | Bucket assignment strategy for tables without bucket keys: `"sticky"` or `"round_robin"` |
+| `scanner_remote_log_prefetch_num`     | `size_t`      | `4`                  | Number of remote log segments to prefetch                                                |
+| `remote_file_download_thread_num`     | `size_t`      | `3`                  | Number of threads for remote log downloads                                               |
+| `scanner_remote_log_read_concurrency` | `size_t`      | `4`                  | Streaming read concurrency within a remote log file                                      |
+| `scanner_log_max_poll_records`        | `size_t`      | `500`                | Maximum number of records returned in a single Poll()                                    |
+| `scanner_log_fetch_max_bytes`         | `int32_t`     | `16777216` (16 MB)   | Maximum bytes per fetch response for LogScanner                                          |
+| `scanner_log_fetch_min_bytes`         | `int32_t`     | `1`                  | Minimum bytes the server must accumulate before returning a fetch response               |
+| `scanner_log_fetch_wait_max_time_ms`  | `int32_t`     | `500`                | Maximum time (ms) the server may wait to satisfy min-bytes                               |
+| `scanner_log_fetch_max_bytes_for_bucket`| `int32_t`   | `1048576` (1 MB)     | Maximum bytes per fetch response per bucket for LogScanner                               |
+| `connect_timeout_ms`                  | `uint64_t`    | `120000`             | TCP connect timeout in milliseconds                                                      |
+| `security_protocol`                   | `std::string` | `"PLAINTEXT"`        | `"PLAINTEXT"` (default) or `"sasl"` for SASL auth                                        |
+| `security_sasl_mechanism`             | `std::string` | `"PLAIN"`            | SASL mechanism (only `"PLAIN"` is supported)                                             |
+| `security_sasl_username`              | `std::string` | (empty)              | SASL username (required when protocol is `"sasl"`)                                       |
+| `security_sasl_password`              | `std::string` | (empty)              | SASL password (required when protocol is `"sasl"`)                                       |
+
+## `Connection`
+
+| Method                                                                  | Description                                       |
+|-------------------------------------------------------------------------|---------------------------------------------------|
+| `static Create(const Configuration& config, Connection& out) -> Result` | Create a connection to a Fluss cluster            |
+| `GetAdmin(Admin& out) -> Result`                                        | Get the admin interface                           |
+| `GetTable(const TablePath& table_path, Table& out) -> Result`           | Get a table for read/write operations             |
+| `Available() -> bool`                                                   | Check if the connection is valid and initialized  |
+
+## `Admin`
+
+### Database Operations
+
+| Method                                                                                                                    | Description              |
+|---------------------------------------------------------------------------------------------------------------------------|--------------------------|
+| `CreateDatabase(const std::string& database_name, const DatabaseDescriptor& descriptor, bool ignore_if_exists) -> Result` | Create a database        |
+| `DropDatabase(const std::string& name, bool ignore_if_not_exists, bool cascade) -> Result`                                | Drop a database          |
+| `ListDatabases(std::vector<std::string>& out) -> Result`                                                                  | List all databases       |
+| `DatabaseExists(const std::string& name, bool& out) -> Result`                                                            | Check if a database exists |
+| `GetDatabaseInfo(const std::string& name, DatabaseInfo& out) -> Result`                                                   | Get database metadata    |
+
+### Table Operations
+
+| Method                                                                                                     | Description                 |
+|------------------------------------------------------------------------------------------------------------|-----------------------------|
+| `CreateTable(const TablePath& path, const TableDescriptor& descriptor, bool ignore_if_exists) -> Result`   | Create a table              |
+| `DropTable(const TablePath& path, bool ignore_if_not_exists) -> Result`                                    | Drop a table                |
+| `GetTableInfo(const TablePath& path, TableInfo& out) -> Result`                                            | Get table metadata          |
+| `ListTables(const std::string& database_name, std::vector<std::string>& out) -> Result`                    | List tables in a database   |
+| `TableExists(const TablePath& path, bool& out) -> Result`                                                  | Check if a table exists     |
+
+### Partition Operations
+
+| Method                                                                                                                                          | Description              |
+|-------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------|
+| `CreatePartition(const TablePath& path, const std::unordered_map<std::string, std::string>& partition_spec, bool ignore_if_exists) -> Result`   | Create a partition       |
+| `DropPartition(const TablePath& path, const std::unordered_map<std::string, std::string>& partition_spec, bool ignore_if_not_exists) -> Result` | Drop a partition         |
+| `ListPartitionInfos(const TablePath& path, std::vector<PartitionInfo>& out) -> Result`                                                          | List partition metadata  |
+
+### Offset Operations
+
+| Method                                                                                                                                                                                                  | Description                             |
+|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------|
+| `ListOffsets(const TablePath& path, const std::vector<int32_t>& bucket_ids, const OffsetSpec& query, std::unordered_map<int32_t, int64_t>& out) -> Result`                                             | Get offsets for buckets                 |
+| `ListPartitionOffsets(const TablePath& path, const std::string& partition_name, const std::vector<int32_t>& bucket_ids, const OffsetSpec& query, std::unordered_map<int32_t, int64_t>& out) -> Result` | Get offsets for a partition's buckets   |
+
+### Lake Operations
+
+| Method                                                                      | Description                  |
+|-----------------------------------------------------------------------------|------------------------------|
+| `GetLatestLakeSnapshot(const TablePath& path, LakeSnapshot& out) -> Result` | Get the latest lake snapshot |
+
+### Cluster Operations
+
+| Method                                                    | Description                                        |
+|-----------------------------------------------------------|----------------------------------------------------|
+| `GetServerNodes(std::vector<ServerNode>& out) -> Result`  | Get all alive server nodes (coordinator + tablets) |
+
+## `ServerNode`
+
+| Field         | Type          | Description                                              |
+|---------------|---------------|----------------------------------------------------------|
+| `id`          | `int32_t`     | Server node ID                                           |
+| `host`        | `std::string` | Hostname of the server                                   |
+| `port`        | `uint32_t`    | Port number                                              |
+| `server_type` | `std::string` | Server type (`"CoordinatorServer"` or `"TabletServer"`)  |
+| `uid`         | `std::string` | Unique identifier (e.g. `"cs-0"`, `"ts-1"`)             |
+
+## `Table`
+
+| Method                        | Description                              |
+|-------------------------------|------------------------------------------|
+| `NewRow() -> GenericRow`      | Create a schema-aware row for this table |
+| `NewAppend() -> TableAppend`  | Create an append builder for log tables  |
+| `NewUpsert() -> TableUpsert`  | Create an upsert builder for PK tables   |
+| `NewLookup() -> TableLookup`  | Create a lookup builder for PK tables    |
+| `NewScan() -> TableScan`      | Create a scan builder                    |
+| `GetTableInfo() -> TableInfo` | Get table metadata                       |
+| `GetTablePath() -> TablePath` | Get the table path                       |
+| `HasPrimaryKey() -> bool`     | Check if the table has a primary key     |
+
+## `TableAppend`
+
+| Method                                       | Description             |
+|----------------------------------------------|-------------------------|
+| `CreateWriter(AppendWriter& out) -> Result`  | Create an append writer |
+
+## `TableUpsert`
+
+| Method                                                                       | Description                                |
+|------------------------------------------------------------------------------|--------------------------------------------|
+| `PartialUpdateByIndex(std::vector<size_t> column_indices) -> TableUpsert&`   | Configure partial update by column indices |
+| `PartialUpdateByName(std::vector<std::string> column_names) -> TableUpsert&` | Configure partial update by column names   |
+| `CreateWriter(UpsertWriter& out) -> Result`                                  | Create an upsert writer                    |
+
+## `TableLookup`
+
+| Method                                    | Description                         |
+|-------------------------------------------|-------------------------------------|
+| `CreateLookuper(Lookuper& out) -> Result` | Create a lookuper for point lookups |
+
+## `TableScan`
+
+| Method                                                               | Description                                   |
+|----------------------------------------------------------------------|-----------------------------------------------|
+| `ProjectByIndex(std::vector<size_t> column_indices) -> TableScan&`   | Project columns by index                      |
+| `ProjectByName(std::vector<std::string> column_names) -> TableScan&` | Project columns by name                       |
+| `CreateLogScanner(LogScanner& out) -> Result`                        | Create a record-based log scanner             |
+| `CreateRecordBatchLogScanner(LogScanner& out) -> Result`             | Create an Arrow RecordBatch-based log scanner |
+
+## `AppendWriter`
+
+| Method                                                      | Description                            |
+|-------------------------------------------------------------|----------------------------------------|
+| `Append(const GenericRow& row) -> Result`                   | Append a row (fire-and-forget)         |
+| `Append(const GenericRow& row, WriteResult& out) -> Result` | Append a row with write acknowledgment |
+| `Flush() -> Result`                                         | Flush all pending writes               |
+
+## `UpsertWriter`
+
+| Method                                                      | Description                                   |
+|-------------------------------------------------------------|-----------------------------------------------|
+| `Upsert(const GenericRow& row) -> Result`                   | Upsert a row (fire-and-forget)                |
+| `Upsert(const GenericRow& row, WriteResult& out) -> Result` | Upsert a row with write acknowledgment        |
+| `Delete(const GenericRow& row) -> Result`                   | Delete a row by primary key (fire-and-forget) |
+| `Delete(const GenericRow& row, WriteResult& out) -> Result` | Delete a row with write acknowledgment        |
+| `Flush() -> Result`                                         | Flush all pending operations                  |
+
+## `WriteResult`
+
+| Method             | Description                                 |
+|--------------------|---------------------------------------------|
+| `Wait() -> Result` | Wait for server acknowledgment of the write |
+
+## `Lookuper`
+
+| Method                                                        |  Description                |
+|---------------------------------------------------------------|-----------------------------|
+| `Lookup(const GenericRow& pk_row, LookupResult& out) -> Result` | Lookup a row by primary key |
+
+## `LogScanner`
+
+| Method                                                                                               |  Description                              |
+|------------------------------------------------------------------------------------------------------|-------------------------------------------|
+| `Subscribe(int32_t bucket_id, int64_t offset) -> Result`                                             | Subscribe to a single bucket at an offset |
+| `Subscribe(const std::vector<BucketSubscription>& bucket_offsets) -> Result`                         | Subscribe to multiple buckets             |
+| `SubscribePartitionBuckets(int64_t partition_id, int32_t bucket_id, int64_t start_offset) -> Result` | Subscribe to a single partition bucket    |
+| `SubscribePartitionBuckets(const std::vector<PartitionBucketSubscription>& subscriptions) -> Result` | Subscribe to multiple partition buckets   |
+| `Unsubscribe(int32_t bucket_id) -> Result`                                                           | Unsubscribe from a non-partitioned bucket |
+| `UnsubscribePartition(int64_t partition_id, int32_t bucket_id) -> Result`                            | Unsubscribe from a partition bucket       |
+| `Poll(int64_t timeout_ms, ScanRecords& out) -> Result`                                               | Poll individual records                   |
+| `PollRecordBatch(int64_t timeout_ms, ArrowRecordBatches& out) -> Result`                             | Poll Arrow RecordBatches                  |
+
+## `GenericRow`
+
+`GenericRow` is a **write-only** row used for append, upsert, delete, and lookup key construction. For reading field values from scan or lookup results, see [`RowView`](#rowview) and [`LookupResult`](#lookupresult).
+
+### Index-Based Setters
+
+| Method                                                    |  Description                   |
+|-----------------------------------------------------------|--------------------------------|
+| `SetNull(size_t idx)`                                     | Set field to null              |
+| `SetBool(size_t idx, bool value)`                         | Set boolean value              |
+| `SetInt32(size_t idx, int32_t value)`                     | Set 32-bit integer             |
+| `SetInt64(size_t idx, int64_t value)`                     | Set 64-bit integer             |
+| `SetFloat32(size_t idx, float value)`                     | Set 32-bit float               |
+| `SetFloat64(size_t idx, double value)`                    | Set 64-bit float               |
+| `SetString(size_t idx, const std::string& value)`         | Set string value               |
+| `SetBytes(size_t idx, const std::vector<uint8_t>& value)` | Set binary data                |
+| `SetDate(size_t idx, const Date& value)`                  | Set date value                 |
+| `SetTime(size_t idx, const Time& value)`                  | Set time value                 |
+| `SetTimestampNtz(size_t idx, const Timestamp& value)`     | Set timestamp without timezone |
+| `SetTimestampLtz(size_t idx, const Timestamp& value)`     | Set timestamp with local timezone |
+| `SetDecimal(size_t idx, const std::string& value)`        | Set decimal from string        |
+| `SetArray(size_t idx, ArrayWriter&& writer)`              | Set array value (consumes the writer) |
+
+### Name-Based Setters
+
+When a row is created with `table.NewRow()`, the `Set()` method automatically routes each value to the setter for the column's type, as determined by the schema:
+
+| Method                                                   | Description                       |
+|----------------------------------------------------------|-----------------------------------|
+| `Set(const std::string& name, std::nullptr_t)`           | Set field to null by column name  |
+| `Set(const std::string& name, bool value)`               | Set boolean by column name        |
+| `Set(const std::string& name, int32_t value)`            | Set integer by column name        |
+| `Set(const std::string& name, int64_t value)`            | Set big integer by column name    |
+| `Set(const std::string& name, float value)`              | Set float by column name          |
+| `Set(const std::string& name, double value)`             | Set double by column name         |
+| `Set(const std::string& name, const std::string& value)` | Set string/decimal by column name |
+| `Set(const std::string& name, const Date& value)`        | Set date by column name           |
+| `Set(const std::string& name, const Time& value)`        | Set time by column name           |
+| `Set(const std::string& name, const Timestamp& value)`   | Set timestamp by column name      |
+
+## `RowView`
+
+Read-only row view for scan results. Provides zero-copy access to string and bytes data. `RowView` shares ownership of the underlying scan data via reference counting, so it can safely outlive the `ScanRecords` that produced it.
+
+:::note string_view Lifetime
+`GetString()` returns a `std::string_view` that borrows from the underlying data. The `string_view` remains valid as long as any `RowView` (or `ScanRecord`) referencing the same poll result is alive. Copy it to a `std::string` if you need the value after all references are gone.
+:::
+
+### Index-Based Getters
+
+| Method                                                     |  Description                   |
+|------------------------------------------------------------|--------------------------------|
+| `FieldCount() -> size_t`                                   | Get the number of fields       |
+| `GetType(size_t idx) -> TypeId`                            | Get the type at index          |
+| `IsNull(size_t idx) -> bool`                               | Check if field is null         |
+| `GetBool(size_t idx) -> bool`                              | Get boolean value at index     |
+| `GetInt32(size_t idx) -> int32_t`                          | Get 32-bit integer at index    |
+| `GetInt64(size_t idx) -> int64_t`                          | Get 64-bit integer at index    |
+| `GetFloat32(size_t idx) -> float`                          | Get 32-bit float at index      |
+| `GetFloat64(size_t idx) -> double`                         | Get 64-bit float at index      |
+| `GetString(size_t idx) -> std::string_view`                | Get string at index (zero-copy)|
+| `GetBytes(size_t idx) -> std::pair<const uint8_t*, size_t>`| Get binary data at index (zero-copy)|
+| `GetDate(size_t idx) -> Date`                              | Get date at index              |
+| `GetTime(size_t idx) -> Time`                              | Get time at index              |
+| `GetTimestamp(size_t idx) -> Timestamp`                    | Get timestamp at index         |
+| `IsDecimal(size_t idx) -> bool`                            | Check if field is a decimal type|
+| `GetDecimalString(size_t idx) -> std::string`              | Get decimal as string at index |
+
+### Array Getters (Index-Based)
+
+| Method                                                             |  Description                              |
+|--------------------------------------------------------------------|-------------------------------------------|
+| `GetArraySize(size_t idx) -> size_t`                               | Get element count of array at index       |
+| `GetArrayElementType(size_t idx) -> TypeId`                        | Get element type of array at index        |
+| `IsArrayElementNull(size_t idx, size_t element) -> bool`           | Check if array element is null            |
+| `GetArrayBool(size_t idx, size_t element) -> bool`                 | Get boolean array element                 |
+| `GetArrayInt32(size_t idx, size_t element) -> int32_t`             | Get 32-bit integer array element          |
+| `GetArrayInt64(size_t idx, size_t element) -> int64_t`             | Get 64-bit integer array element          |
+| `GetArrayFloat32(size_t idx, size_t element) -> float`             | Get 32-bit float array element            |
+| `GetArrayFloat64(size_t idx, size_t element) -> double`            | Get 64-bit float array element            |
+| `GetArrayString(size_t idx, size_t element) -> std::string`        | Get string array element                  |
+| `GetArrayBytes(size_t idx, size_t element) -> std::vector<uint8_t>`| Get binary array element                  |
+| `GetArrayDate(size_t idx, size_t element) -> Date`                 | Get date array element                    |
+| `GetArrayTime(size_t idx, size_t element) -> Time`                 | Get time array element                    |
+| `GetArrayTimestamp(size_t idx, size_t element) -> Timestamp`       | Get timestamp array element               |
+| `GetArrayDecimalString(size_t idx, size_t element) -> std::string` | Get decimal array element as string       |
+| `GetArrayView(size_t idx) -> ArrayView`                            | Get owning ArrayView for nested access    |
+
+All array getters are also available by column name (e.g., `GetArraySize("col")`, `GetArrayView("col")`).
+
+### Name-Based Getters
+
+| Method                                                  |  Description                       |
+|---------------------------------------------------------|------------------------------------|
+| `IsNull(const std::string& name) -> bool`               | Check if field is null by name     |
+| `GetBool(const std::string& name) -> bool`              | Get boolean by column name         |
+| `GetInt32(const std::string& name) -> int32_t`          | Get 32-bit integer by column name  |
+| `GetInt64(const std::string& name) -> int64_t`          | Get 64-bit integer by column name  |
+| `GetFloat32(const std::string& name) -> float`          | Get 32-bit float by column name    |
+| `GetFloat64(const std::string& name) -> double`         | Get 64-bit float by column name    |
+| `GetString(const std::string& name) -> std::string_view`| Get string by column name          |
+| `GetBytes(const std::string& name) -> std::pair<const uint8_t*, size_t>` | Get binary data by column name |
+| `GetDate(const std::string& name) -> Date`              | Get date by column name            |
+| `GetTime(const std::string& name) -> Time`              | Get time by column name            |
+| `GetTimestamp(const std::string& name) -> Timestamp`    | Get timestamp by column name       |
+| `GetDecimalString(const std::string& name) -> std::string` | Get decimal as string by column name |
+
+## `ScanRecord`
+
+`ScanRecord` is a value type that can be freely copied, stored, and accumulated across multiple `Poll()` calls. It shares ownership of the underlying scan data via reference counting.
+
+| Field         | Type         |  Description                                                        |
+|---------------|--------------|---------------------------------------------------------------------|
+| `offset`      | `int64_t`    | Record offset in the log                                            |
+| `timestamp`   | `int64_t`    | Record timestamp                                                    |
+| `change_type` | `ChangeType` | Change type (AppendOnly, Insert, UpdateBefore, UpdateAfter, Delete) |
+| `row`         | `RowView`    | Row data (value type, shares ownership via reference counting)      |
+
+## `ScanRecords`
+
+### Flat Access
+
+| Method                                  |  Description                               |
+|-----------------------------------------|--------------------------------------------|
+| `Count() -> size_t`                     | Total number of records across all buckets |
+| `IsEmpty() -> bool`                     | Check if empty                             |
+| `begin() / end()`                       | Iterator support for range-based for loops |
+
+Flat iteration over all records (regardless of bucket):
+
+```cpp
+for (const auto& rec : records) {
+    std::cout << "offset=" << rec.offset << std::endl;
+}
+```
+
+### Per-Bucket Access
+
+| Method                                                          |  Description                                                          |
+|-----------------------------------------------------------------|-----------------------------------------------------------------------|
+| `BucketCount() -> size_t`                                       | Number of distinct buckets                                            |
+| `Buckets() -> std::vector<TableBucket>`                         | List of distinct buckets                                              |
+| `Records(const TableBucket& bucket) -> BucketRecords`              | Records for a specific bucket (empty if bucket not present)           |
+| `BucketAt(size_t idx) -> BucketRecords`                            | Records by bucket index (0-based, O(1))                               |
+
+## `BucketRecords`
+
+A bundle of scan records belonging to a single bucket. Obtained from `ScanRecords::Records()` or `ScanRecords::BucketAt()`. `BucketRecords` is a value type — it shares ownership of the underlying scan data via reference counting, so it can safely outlive the `ScanRecords` that produced it.
+
+| Method                                         |  Description                               |
+|------------------------------------------------|--------------------------------------------|
+| `Size() -> size_t`                         | Number of records in this bucket           |
+| `Empty() -> bool`                          | Check if empty                             |
+| `Bucket() -> const TableBucket&`           | Get the bucket                             |
+| `operator[](size_t idx) -> ScanRecord`     | Access record by index within this bucket  |
+| `begin() / end()`                          | Iterator support for range-based for loops |
+
+## `TableBucket`
+
+| Field / Method                        |  Description                                    |
+|---------------------------------------|-------------------------------------------------|
+| `table_id -> int64_t`                    | Table ID                                        |
+| `bucket_id -> int32_t`                   | Bucket ID                                       |
+| `partition_id -> std::optional<int64_t>` | Partition ID (empty if non-partitioned)         |
+| `operator==(const TableBucket&) -> bool` | Equality comparison                             |
+
+## `LookupResult`
+
+Read-only result for lookup operations. Provides zero-copy access to field values.
+
+### Metadata
+
+| Method                      |  Description                   |
+|-----------------------------|--------------------------------|
+| `Found() -> bool`           | Whether a matching row was found |
+| `FieldCount() -> size_t`    | Get the number of fields       |
+
+### Index-Based Getters
+
+| Method                                                     |  Description                   |
+|------------------------------------------------------------|--------------------------------|
+| `GetType(size_t idx) -> TypeId`                            | Get the type at index          |
+| `IsNull(size_t idx) -> bool`                               | Check if field is null         |
+| `GetBool(size_t idx) -> bool`                              | Get boolean value at index     |
+| `GetInt32(size_t idx) -> int32_t`                          | Get 32-bit integer at index    |
+| `GetInt64(size_t idx) -> int64_t`                          | Get 64-bit integer at index    |
+| `GetFloat32(size_t idx) -> float`                          | Get 32-bit float at index      |
+| `GetFloat64(size_t idx) -> double`                         | Get 64-bit float at index      |
+| `GetString(size_t idx) -> std::string_view`                | Get string at index (zero-copy)|
+| `GetBytes(size_t idx) -> std::pair<const uint8_t*, size_t>`| Get binary data at index (zero-copy)|
+| `GetDate(size_t idx) -> Date`                              | Get date at index              |
+| `GetTime(size_t idx) -> Time`                              | Get time at index              |
+| `GetTimestamp(size_t idx) -> Timestamp`                    | Get timestamp at index         |
+| `IsDecimal(size_t idx) -> bool`                            | Check if field is a decimal type|
+| `GetDecimalString(size_t idx) -> std::string`              | Get decimal as string at index |
+
+### Array Getters (Index-Based)
+
+`LookupResult` provides the same array getters as [`RowView`](#array-getters-index-based) — `GetArraySize`, `GetArrayInt32`, `GetArrayView`, etc. They are also available by column name.
+
+### Name-Based Getters
+
+| Method                                                  |  Description                       |
+|---------------------------------------------------------|------------------------------------|
+| `IsNull(const std::string& name) -> bool`               | Check if field is null by name     |
+| `GetBool(const std::string& name) -> bool`              | Get boolean by column name         |
+| `GetInt32(const std::string& name) -> int32_t`          | Get 32-bit integer by column name  |
+| `GetInt64(const std::string& name) -> int64_t`          | Get 64-bit integer by column name  |
+| `GetFloat32(const std::string& name) -> float`          | Get 32-bit float by column name    |
+| `GetFloat64(const std::string& name) -> double`         | Get 64-bit float by column name    |
+| `GetString(const std::string& name) -> std::string_view`| Get string by column name          |
+| `GetBytes(const std::string& name) -> std::pair<const uint8_t*, size_t>` | Get binary data by column name |
+| `GetDate(const std::string& name) -> Date`              | Get date by column name            |
+| `GetTime(const std::string& name) -> Time`              | Get time by column name            |
+| `GetTimestamp(const std::string& name) -> Timestamp`    | Get timestamp by column name       |
+| `GetDecimalString(const std::string& name) -> std::string` | Get decimal as string by column name |
+
+## `ArrowRecordBatch`
+
+| Method                                                         | Description                          |
+|----------------------------------------------------------------|--------------------------------------|
+| `GetArrowRecordBatch() -> std::shared_ptr<arrow::RecordBatch>` | Get the underlying Arrow RecordBatch |
+| `Available() -> bool`                                          | Check if the batch is valid          |
+| `NumRows() -> int64_t`                                         | Number of rows in the batch          |
+| `GetTableId() -> int64_t`                                      | Table ID                             |
+| `GetPartitionId() -> int64_t`                                  | Partition ID                         |
+| `GetBucketId() -> int32_t`                                     | Bucket ID                            |
+| `GetBaseOffset() -> int64_t`                                   | First record offset                  |
+| `GetLastOffset() -> int64_t`                                   | Last record offset                   |
+
+## `ArrowRecordBatches`
+
+| Method                   |  Description                               |
+|--------------------------|--------------------------------------------|
+| `Size() -> size_t`       | Number of batches                          |
+| `Empty() -> bool`        | Check if empty                             |
+| `operator[](size_t idx)` | Access batch by index                      |
+| `begin() / end()`        | Iterator support for range-based for loops |
+
+## `Schema`
+
+| Method                            |  Description                |
+|-----------------------------------|-----------------------------|
+| `NewBuilder() -> Schema::Builder` | Create a new schema builder |
+
+## `Schema::Builder`
+
+| Method                                                                 |  Description            |
+|------------------------------------------------------------------------|-------------------------|
+| `AddColumn(const std::string& name, const DataType& type) -> Builder&` | Add a column            |
+| `SetPrimaryKeys(const std::vector<std::string>& keys) -> Builder&`     | Set primary key columns |
+| `Build() -> Schema`                                                    | Build the schema        |
+
+## `TableDescriptor`
+
+| Method                                     |  Description                          |
+|--------------------------------------------|---------------------------------------|
+| `NewBuilder() -> TableDescriptor::Builder` | Create a new table descriptor builder |
+
+## `TableDescriptor::Builder`
+
+| Method                                                                            | Description                |
+|-----------------------------------------------------------------------------------|----------------------------|
+| `SetSchema(const Schema& schema) -> Builder&`                                     | Set the table schema       |
+| `SetPartitionKeys(const std::vector<std::string>& keys) -> Builder&`              | Set partition key columns  |
+| `SetBucketCount(int32_t count) -> Builder&`                                       | Set the number of buckets  |
+| `SetBucketKeys(const std::vector<std::string>& keys) -> Builder&`                 | Set bucket key columns     |
+| `SetProperty(const std::string& key, const std::string& value) -> Builder&`       | Set a table property       |
+| `SetCustomProperty(const std::string& key, const std::string& value) -> Builder&` | Set a custom property      |
+| `SetComment(const std::string& comment) -> Builder&`                              | Set a table comment        |
+| `Build() -> TableDescriptor`                                                      | Build the table descriptor |
+
+## `DataType`
+
+### Factory Methods
+
+| Method                                        |  Description                       |
+|-----------------------------------------------|------------------------------------|
+| `DataType::Boolean()`                         | Boolean type                       |
+| `DataType::TinyInt()`                         | 8-bit signed integer               |
+| `DataType::SmallInt()`                        | 16-bit signed integer              |
+| `DataType::Int()`                             | 32-bit signed integer              |
+| `DataType::BigInt()`                          | 64-bit signed integer              |
+| `DataType::Float()`                           | 32-bit floating point              |
+| `DataType::Double()`                          | 64-bit floating point              |
+| `DataType::String()`                          | UTF-8 string                       |
+| `DataType::Bytes()`                           | Binary data                        |
+| `DataType::Date()`                            | Date (days since epoch)            |
+| `DataType::Time()`                            | Time (milliseconds since midnight) |
+| `DataType::Timestamp(int precision)`          | Timestamp without timezone         |
+| `DataType::TimestampLtz(int precision)`       | Timestamp with local timezone      |
+| `DataType::Decimal(int precision, int scale)` | Decimal with precision and scale   |
+| `DataType::Array(DataType element)`           | Array of the given element type    |
+
+### Accessors
+
+| Method                              |  Description                                |
+|-------------------------------------|---------------------------------------------|
+| `id() -> TypeId`                    | Get the type ID                             |
+| `precision() -> int`               | Get precision (for Decimal/Timestamp types) |
+| `scale() -> int`                   | Get scale (for Decimal type)                |
+| `nullable() -> bool`               | Returns `true` if this type is nullable (default), `false` if `NOT NULL` |
+| `element_type() -> const DataType*` | Get element type (for Array type, nullptr otherwise) |
+| `NotNull() -> DataType`            | Returns a copy of this type with nullable set to `false` |
+
+## `ArrayWriter`
+
+Write-only builder for array column values. Constructed with a fixed size and element type, then populated element-by-element. Move-only — consumed by `GenericRow::SetArray()` or `ArrayWriter::SetArray()` for nested arrays.
+
+| Method                                                    |  Description                              |
+|-----------------------------------------------------------|-------------------------------------------|
+| `ArrayWriter(size_t size, DataType element_type)`         | Create an array writer                    |
+| `SetNull(size_t idx)`                                     | Set element to null                       |
+| `SetBool(size_t idx, bool value)`                         | Set boolean element                       |
+| `SetInt32(size_t idx, int32_t value)`                     | Set 32-bit integer element                |
+| `SetInt64(size_t idx, int64_t value)`                     | Set 64-bit integer element                |
+| `SetFloat32(size_t idx, float value)`                     | Set 32-bit float element                  |
+| `SetFloat64(size_t idx, double value)`                    | Set 64-bit float element                  |
+| `SetString(size_t idx, const std::string& value)`         | Set string element                        |
+| `SetBytes(size_t idx, const std::vector<uint8_t>& value)` | Set binary element                        |
+| `SetDate(size_t idx, const Date& value)`                  | Set date element                          |
+| `SetTime(size_t idx, const Time& value)`                  | Set time element                          |
+| `SetTimestampNtz(size_t idx, const Timestamp& value)`     | Set timestamp without timezone element    |
+| `SetTimestampLtz(size_t idx, const Timestamp& value)`     | Set timestamp with local timezone element |
+| `SetDecimal(size_t idx, const std::string& value)`        | Set decimal element from string           |
+| `SetArray(size_t idx, ArrayWriter&& nested)`              | Set nested array element (consumes nested)|
+
+## `ArrayView`
+
+Read-only view over an array column value. Obtained from `RowView::GetArrayView()` or `LookupResult::GetArrayView()`, and recursively from `ArrayView::GetArray()` for nested `ARRAY<ARRAY<...>>` columns. Move-only.
+
+| Method                                                  |  Description                              |
+|---------------------------------------------------------|-------------------------------------------|
+| `Size() -> size_t`                                      | Get element count                         |
+| `ElementType() -> TypeId`                               | Get element type                          |
+| `IsNull(size_t element) -> bool`                        | Check if element is null                  |
+| `GetBool(size_t element) -> bool`                       | Get boolean element                       |
+| `GetInt32(size_t element) -> int32_t`                   | Get 32-bit integer element                |
+| `GetInt64(size_t element) -> int64_t`                   | Get 64-bit integer element                |
+| `GetFloat32(size_t element) -> float`                   | Get 32-bit float element                  |
+| `GetFloat64(size_t element) -> double`                  | Get 64-bit float element                  |
+| `GetString(size_t element) -> std::string`              | Get string element                        |
+| `GetBytes(size_t element) -> std::vector<uint8_t>`      | Get binary element                        |
+| `GetDate(size_t element) -> Date`                       | Get date element                          |
+| `GetTime(size_t element) -> Time`                       | Get time element                          |
+| `GetTimestamp(size_t element) -> Timestamp`              | Get timestamp element                     |
+| `GetTimestampLtz(size_t element) -> Timestamp`          | Get timestamp with local timezone element |
+| `GetDecimalString(size_t element) -> std::string`       | Get decimal element as string             |
+| `GetArray(size_t element) -> ArrayView`                 | Get nested array as child ArrayView       |
+
+## `TablePath`
+
+| Method / Field                                                     |  Description          |
+|--------------------------------------------------------------------|-----------------------|
+| `TablePath(const std::string& database, const std::string& table)` | Create a table path   |
+| `database_name -> std::string`                                     | Database name         |
+| `table_name -> std::string`                                        | Table name            |
+| `ToString() -> std::string`                                        | String representation |
+
+## `TableInfo`
+
+| Field               | Type                                           | Description                         |
+|---------------------|------------------------------------------------|-------------------------------------|
+| `table_id`          | `int64_t`                                      | Table ID                            |
+| `schema_id`         | `int32_t`                                      | Schema ID                           |
+| `table_path`        | `TablePath`                                    | Table path                          |
+| `created_time`      | `int64_t`                                      | Creation timestamp                  |
+| `modified_time`     | `int64_t`                                      | Last modification timestamp         |
+| `primary_keys`      | `std::vector<std::string>`                     | Primary key columns                 |
+| `bucket_keys`       | `std::vector<std::string>`                     | Bucket key columns                  |
+| `partition_keys`    | `std::vector<std::string>`                     | Partition key columns               |
+| `num_buckets`       | `int32_t`                                      | Number of buckets                   |
+| `has_primary_key`   | `bool`                                         | Whether the table has a primary key |
+| `is_partitioned`    | `bool`                                         | Whether the table is partitioned    |
+| `properties`        | `std::unordered_map<std::string, std::string>` | Table properties                    |
+| `custom_properties` | `std::unordered_map<std::string, std::string>` | Custom properties                   |
+| `comment`           | `std::string`                                  | Table comment                       |
+| `schema`            | `Schema`                                       | Table schema                        |
+
+## Temporal Types
+
+### `Date`
+
+| Method                                        |  Description                 |
+|-----------------------------------------------|------------------------------|
+| `Date::FromDays(int32_t days)`                | Create from days since epoch |
+| `Date::FromYMD(int year, int month, int day)` | Create from year, month, day |
+| `Year() -> int`                               | Get year                     |
+| `Month() -> int`                              | Get month                    |
+| `Day() -> int`                                | Get day                      |
+
+### `Time`
+
+| Method                                            |  Description                                 |
+|---------------------------------------------------|----------------------------------------------|
+| `Time::FromMillis(int32_t millis)`                | Create from milliseconds since midnight      |
+| `Time::FromHMS(int hour, int minute, int second)` | Create from hour, minute, second             |
+| `Hour() -> int`                                   | Get hour                                     |
+| `Minute() -> int`                                 | Get minute                                   |
+| `Second() -> int`                                 | Get second                                   |
+| `Millis() -> int64_t`                             | Get sub-second millisecond component (0-999) |
+
+### `Timestamp`
+
+| Method                                                               |  Description                             |
+|----------------------------------------------------------------------|------------------------------------------|
+| `Timestamp::FromMillis(int64_t millis)`                              | Create from milliseconds since epoch     |
+| `Timestamp::FromMillisNanos(int64_t millis, int32_t nanos)`          | Create from milliseconds and nanoseconds |
+| `Timestamp::FromTimePoint(std::chrono::system_clock::time_point tp)` | Create from a time point                 |
+
+## `PartitionInfo`
+
+| Field            | Type          |  Description   |
+|------------------|---------------|----------------|
+| `partition_id`   | `int64_t`     | Partition ID   |
+| `partition_name` | `std::string` | Partition name |
+
+## `DatabaseDescriptor`
+
+| Field        | Type                                           | Description       |
+|--------------|------------------------------------------------|-------------------|
+| `comment`    | `std::string`                                  | Database comment  |
+| `properties` | `std::unordered_map<std::string, std::string>` | Custom properties |
+
+## `DatabaseInfo`
+
+| Field           | Type                                           |  Description                |
+|-----------------|------------------------------------------------|-----------------------------|
+| `database_name` | `std::string`                                  | Database name               |
+| `comment`       | `std::string`                                  | Database comment            |
+| `properties`    | `std::unordered_map<std::string, std::string>` | Custom properties           |
+| `created_time`  | `int64_t`                                      | Creation timestamp          |
+| `modified_time` | `int64_t`                                      | Last modification timestamp |
+
+## `LakeSnapshot`
+
+| Field            | Type                        |  Description       |
+|------------------|-----------------------------|--------------------|
+| `snapshot_id`    | `int64_t`                   | Snapshot ID        |
+| `bucket_offsets` | `std::vector<BucketOffset>` | All bucket offsets |
+
+## `BucketOffset`
+
+| Field          | Type      | Description  |
+|----------------|-----------|--------------|
+| `table_id`     | `int64_t` | Table ID     |
+| `partition_id` | `int64_t` | Partition ID |
+| `bucket_id`    | `int32_t` | Bucket ID    |
+| `offset`       | `int64_t` | Offset value |
+
+## `OffsetSpec`
+
+| Method                                             | Description                             |
+|----------------------------------------------------|-----------------------------------------|
+| `OffsetSpec::Earliest()`                          | Query for the earliest available offset |
+| `OffsetSpec::Latest()`                            | Query for the latest offset             |
+| `OffsetSpec::Timestamp(int64_t timestamp_ms)`     | Query offset at a specific timestamp    |
+
+## Constants
+
+| Constant                 |  Value |  Description                                            |
+|--------------------------|--------|---------------------------------------------------------|
+| `fluss::EARLIEST_OFFSET` | `-2`   | Start reading from the earliest available offset        |
+
+To start reading from the latest offset (only new records), resolve the current offset via `ListOffsets` before subscribing:
+
+```cpp
+std::unordered_map<int32_t, int64_t> offsets;
+admin.ListOffsets(table_path, {0}, fluss::OffsetSpec::Latest(), offsets);
+scanner.Subscribe(0, offsets[0]);
+```
+
+## Enums
+
+### `ChangeType`
+
+| Value          | Short String | Description                      |
+|----------------|--------------|----------------------------------|
+| `AppendOnly`   | `+A`         | Append-only record               |
+| `Insert`       | `+I`         | Inserted row                     |
+| `UpdateBefore` | `-U`         | Previous value of an updated row |
+| `UpdateAfter`  | `+U`         | New value of an updated row      |
+| `Delete`       | `-D`         | Deleted row                      |
+
+You may refer to the following example to convert a `ChangeType` enum value to its short string representation.
+
+```cpp
+inline const char* ChangeTypeShortString(ChangeType ct) {
+    switch (ct) {
+        case ChangeType::AppendOnly: return "+A";
+        case ChangeType::Insert: return "+I";
+        case ChangeType::UpdateBefore: return "-U";
+        case ChangeType::UpdateAfter: return "+U";
+        case ChangeType::Delete: return "-D";
+    }
+    throw std::invalid_argument("Unknown ChangeType");
+}
+```
+
+### `TypeId`
+
+| Value          |  Description               |
+|----------------|----------------------------|
+| `Boolean`      | Boolean type               |
+| `TinyInt`      | 8-bit signed integer       |
+| `SmallInt`     | 16-bit signed integer      |
+| `Int`          | 32-bit signed integer      |
+| `BigInt`       | 64-bit signed integer      |
+| `Float`        | 32-bit floating point      |
+| `Double`       | 64-bit floating point      |
+| `String`       | UTF-8 string               |
+| `Bytes`        | Binary data                |
+| `Date`         | Date                       |
+| `Time`         | Time                       |
+| `Timestamp`    | Timestamp without timezone |
+| `TimestampLtz` | Timestamp with timezone    |
+| `Decimal`      | Decimal                    |
+| `Array`        | Array of elements          |
+
+### `ChangeType`
+
+| Value          |  Description                                |
+|----------------|---------------------------------------------|
+| `AppendOnly`   | Append-only record (log tables)             |
+| `Insert`       | Inserted row (PK tables)                    |
+| `UpdateBefore` | Row value before an update (PK tables)      |
+| `UpdateAfter`  | Row value after an update (PK tables)       |
+| `Delete`       | Deleted row (PK tables)                     |
+
+### `OffsetSpec`
+
+| Value       |  Description                   |
+|-------------|--------------------------------|
+| `Earliest`  | Earliest available offset      |
+| `Latest`    | Latest offset                  |
+| `Timestamp` | Offset at a specific timestamp |
diff --git a/website/docs/apis/cpp/data-types.md b/website/docs/apis/cpp/data-types.md
new file mode 100644
index 0000000000..cce40cefa1
--- /dev/null
+++ b/website/docs/apis/cpp/data-types.md
@@ -0,0 +1,250 @@
+---
+sidebar_position: 3
+---
+# Data Types
+
+## Schema DataTypes
+
+| DataType                   | Description                                                    |
+|----------------------------|----------------------------------------------------------------|
+| `DataType::Boolean()`      | Boolean value                                                  |
+| `DataType::TinyInt()`      | 8-bit signed integer                                           |
+| `DataType::SmallInt()`     | 16-bit signed integer                                          |
+| `DataType::Int()`          | 32-bit signed integer                                          |
+| `DataType::BigInt()`       | 64-bit signed integer                                          |
+| `DataType::Float()`        | 32-bit floating point                                          |
+| `DataType::Double()`       | 64-bit floating point                                          |
+| `DataType::String()`       | UTF-8 string                                                   |
+| `DataType::Bytes()`        | Binary data                                                    |
+| `DataType::Date()`         | Date (days since epoch)                                        |
+| `DataType::Time()`         | Time (milliseconds since midnight)                             |
+| `DataType::Timestamp()`    | Timestamp without timezone (default precision 6, microseconds) |
+| `DataType::TimestampLtz()` | Timestamp with timezone (default precision 6, microseconds)    |
+| `DataType::Decimal(p, s)`  | Decimal with precision and scale                               |
+| `DataType::Array(element)` | Array of the given element type (supports nesting)             |
+
+## Nullability
+
+All DataTypes are nullable by default. Use `.NotNull()` to create a `NOT NULL` type:
+
+```cpp
+auto schema = fluss::Schema::NewBuilder()
+    .AddColumn("id", fluss::DataType::Int().NotNull())
+    .AddColumn("name", fluss::DataType::String())          // nullable by default
+    .Build();
+```
+
+Primary key columns are automatically forced `NOT NULL` regardless of the `DataType` setting.
+
+For nested types, nullability is preserved at each array level and at the leaf element:
+
+```cpp
+auto schema = fluss::Schema::NewBuilder()
+    .AddColumn("tags", fluss::DataType::Array(fluss::DataType::String().NotNull()))
+    .AddColumn("ids", fluss::DataType::Array(fluss::DataType::Int()).NotNull())
+    .AddColumn("nested", fluss::DataType::Array(
+        fluss::DataType::Array(fluss::DataType::Int()).NotNull()))
+    .Build();
+// "tags":   ARRAY<STRING NOT NULL>         (outer nullable, elements NOT NULL)
+// "ids":    ARRAY<INT> NOT NULL            (outer NOT NULL, elements nullable)
+// "nested": ARRAY<ARRAY<INT> NOT NULL>     (outer nullable, inner array NOT NULL)
+```
+
+You can query nullability at runtime:
+
+```cpp
+auto info = table.GetTableInfo();
+bool is_nullable = info.schema.columns[0].data_type.nullable();
+```
+
+## GenericRow Setters
+
+`SetInt32` is used for `TinyInt`, `SmallInt`, and `Int` columns. For `TinyInt` and `SmallInt`, the value is validated at write time — an error is returned if it overflows the column's range (e.g., \[-128, 127\] for `TinyInt`, \[-32768, 32767\] for `SmallInt`).
+
+```cpp
+fluss::GenericRow row;
+row.SetNull(0);
+row.SetBool(1, true);
+row.SetInt32(2, 42);
+row.SetInt64(3, 1234567890L);
+row.SetFloat32(4, 3.14f);
+row.SetFloat64(5, 2.71828);
+row.SetString(6, "hello");
+row.SetBytes(7, {0x01, 0x02, 0x03});
+```
+
+### Array Columns
+
+Array values are built element-by-element using `ArrayWriter`, then attached to the row via `SetArray`:
+
+```cpp
+fluss::ArrayWriter aw(3, fluss::DataType::Int());
+aw.SetInt32(0, 10);
+aw.SetInt32(1, 20);
+aw.SetNull(2);
+row.SetArray(8, std::move(aw));
+```
+
+For nested arrays (e.g., `ARRAY<ARRAY<INT>>`), build inner arrays first:
+
+```cpp
+fluss::ArrayWriter inner(2, fluss::DataType::Int());
+inner.SetInt32(0, 1);
+inner.SetInt32(1, 2);
+
+fluss::ArrayWriter outer(1, fluss::DataType::Array(fluss::DataType::Int()));
+outer.SetArray(0, std::move(inner));
+row.SetArray(9, std::move(outer));
+```
+
+## Name-Based Setters
+
+When using `table.NewRow()`, you can set fields by column name. The setter automatically routes to the correct type based on the schema:
+
+```cpp
+auto row = table.NewRow();
+row.Set("user_id", 1);
+row.Set("name", "Alice");
+row.Set("score", 95.5f);
+row.Set("balance", "1234.56");   // decimal as string
+row.Set("birth_date", fluss::Date::FromYMD(1990, 3, 15));
+row.Set("login_time", fluss::Time::FromHMS(9, 30, 0));
+row.Set("created_at", fluss::Timestamp::FromMillis(1700000000000));
+row.Set("nickname", nullptr);    // set to null
+```
+
+## Reading Field Values
+
+Field values are read through `RowView` (from scan results) and `LookupResult` (from lookups), not through `GenericRow`. Both provide the same getter interface with zero-copy access to string and bytes data.
+
+`ScanRecord` is a value type — it can be freely copied, stored, and accumulated across multiple `Poll()` calls via reference counting.
+
+:::note string_view Lifetime
+`GetString()` returns a `std::string_view` that borrows from the underlying data. The `string_view` is valid as long as any `ScanRecord` referencing the same poll result is alive. Copy to `std::string` if you need the value after all records are gone.
+:::
+
+```cpp
+// ScanRecord is a value type — safe to store and accumulate:
+std::vector<fluss::ScanRecord> all_records;
+fluss::ScanRecords records;
+scanner.Poll(5000, records);
+for (const auto& rec : records) {
+    all_records.push_back(rec);                    // safe! ref-counted
+    auto name = rec.row.GetString(0);              // zero-copy string_view
+    auto owned = std::string(rec.row.GetString(0)); // explicit copy when needed
+}
+
+// DON'T — string_view dangles after all records referencing the data are destroyed:
+std::string_view dangling;
+{
+    fluss::ScanRecords records;
+    scanner.Poll(5000, records);
+    dangling = records[0].row.GetString(0);
+}
+// dangling is undefined behavior here — no ScanRecord keeps the data alive!
+```
+
+### From Scan Results (RowView)
+
+```cpp
+for (const auto& rec : records) {
+    auto name = rec.row.GetString(1);          // zero-copy string_view
+    float score = rec.row.GetFloat32(3);
+    auto balance = rec.row.GetDecimalString(4); // std::string (already owned)
+    fluss::Date date = rec.row.GetDate(5);
+    fluss::Time time = rec.row.GetTime(6);
+    fluss::Timestamp ts = rec.row.GetTimestamp(7);
+}
+```
+
+### From Lookup Results (LookupResult)
+
+```cpp
+fluss::LookupResult result;
+lookuper.Lookup(pk_row, result);
+if (result.Found()) {
+    auto name = result.GetString(1);  // zero-copy string_view
+    int64_t age = result.GetInt64(2);
+}
+```
+
+### Reading Array Columns
+
+Array columns can be read element-by-element using index-based getters, or via an `ArrayView` for recursive access:
+
+```cpp
+// Element-by-element access (flat arrays)
+size_t len = rec.row.GetArraySize(8);
+for (size_t i = 0; i < len; i++) {
+    if (!rec.row.IsArrayElementNull(8, i)) {
+        int32_t val = rec.row.GetArrayInt32(8, i);
+    }
+}
+
+// ArrayView for nested arrays or when you need a standalone handle
+fluss::ArrayView av = rec.row.GetArrayView(8);
+for (size_t i = 0; i < av.Size(); i++) {
+    if (!av.IsNull(i)) {
+        int32_t val = av.GetInt32(i);
+    }
+}
+
+// Nested arrays: ArrayView::GetArray() returns a child ArrayView
+fluss::ArrayView outer = rec.row.GetArrayView(9);
+for (size_t i = 0; i < outer.Size(); i++) {
+    fluss::ArrayView inner = outer.GetArray(i);
+    for (size_t j = 0; j < inner.Size(); j++) {
+        int32_t val = inner.GetInt32(j);
+    }
+}
+```
+
+## TypeId Enum
+
+`TinyInt` and `SmallInt` values are widened to `int32_t` on read.
+
+| TypeId          | C++ Type                                    | Getter                    |
+|-----------------|---------------------------------------------|---------------------------|
+| `Boolean`       | `bool`                                      | `GetBool(idx)`            |
+| `TinyInt`       | `int32_t`                                   | `GetInt32(idx)`           |
+| `SmallInt`      | `int32_t`                                   | `GetInt32(idx)`           |
+| `Int`           | `int32_t`                                   | `GetInt32(idx)`           |
+| `BigInt`        | `int64_t`                                   | `GetInt64(idx)`           |
+| `Float`         | `float`                                     | `GetFloat32(idx)`         |
+| `Double`        | `double`                                    | `GetFloat64(idx)`         |
+| `String`        | `std::string_view`                          | `GetString(idx)`          |
+| `Bytes`         | `std::pair<const uint8_t*, size_t>`         | `GetBytes(idx)`           |
+| `Date`          | `Date`                                      | `GetDate(idx)`            |
+| `Time`          | `Time`                                      | `GetTime(idx)`            |
+| `Timestamp`     | `Timestamp`                                 | `GetTimestamp(idx)`       |
+| `TimestampLtz`  | `Timestamp`                                 | `GetTimestamp(idx)`       |
+| `Decimal`       | `std::string`                               | `GetDecimalString(idx)`   |
+| `Array`         | `ArrayView`                                 | `GetArrayView(idx)`       |
+
+## Type Checking
+
+```cpp
+if (rec.row.GetType(0) == fluss::TypeId::Int) {
+    int32_t value = rec.row.GetInt32(0);
+}
+if (rec.row.IsNull(1)) {
+    // field is null
+}
+if (rec.row.IsDecimal(2)) {
+    std::string decimal_str = rec.row.GetDecimalString(2);
+}
+```
+
+## Constants
+
+```cpp
+constexpr int64_t fluss::EARLIEST_OFFSET = -2;  // Start from earliest
+```
+
+To start reading from the latest offset, resolve the current offset via `ListOffsets` before subscribing:
+
+```cpp
+std::unordered_map<int32_t, int64_t> offsets;
+admin.ListOffsets(table_path, {0}, fluss::OffsetSpec::Latest(), offsets);
+scanner.Subscribe(0, offsets[0]);
+```
diff --git a/website/docs/apis/cpp/error-handling.md b/website/docs/apis/cpp/error-handling.md
new file mode 100644
index 0000000000..7447a264c7
--- /dev/null
+++ b/website/docs/apis/cpp/error-handling.md
@@ -0,0 +1,238 @@
+---
+sidebar_position: 4
+---
+# Error Handling
+
+All C++ client operations return a `fluss::Result` struct instead of throwing exceptions. This gives you explicit control over error handling.
+
+## The `Result` Struct
+
+```cpp
+#include "fluss.hpp"
+
+// All operations return fluss::Result
+fluss::Result result = admin.CreateTable(path, descriptor);
+if (!result.Ok()) {
+    std::cerr << "Error code: " << result.error_code << std::endl;
+    std::cerr << "Error message: " << result.error_message << std::endl;
+}
+```
+
+| Field / Method   | Type          | Description                               |
+|------------------|---------------|-------------------------------------------|
+| `error_code`     | `int32_t`     | 0 for success, non-zero for errors        |
+| `error_message`  | `std::string` | Human-readable error description          |
+| `Ok()`           | `bool`        | Returns `true` if the operation succeeded |
+
+## Handling Errors
+
+Check the `Result` after each operation and decide how to respond, e.g. log and continue, retry, or abort:
+
+```cpp
+fluss::Connection conn;
+fluss::Result result = fluss::Connection::Create(config, conn);
+if (!result.Ok()) {
+    // Log, retry, or propagate the error as appropriate
+    std::cerr << "Connection failed (code " << result.error_code
+              << "): " << result.error_message << std::endl;
+    return 1;
+}
+```
+
+## Connection State Checking
+
+Use `Available()` to verify that a connection or object is valid before using it:
+
+```cpp
+fluss::Connection conn;
+if (!conn.Available()) {
+    // Connection not initialized or already moved
+}
+
+fluss::Configuration config;
+config.bootstrap_servers = "127.0.0.1:9123";
+fluss::Result result = fluss::Connection::Create(config, conn);
+if (result.Ok() && conn.Available()) {
+    // Connection is ready to use
+}
+```
+
+## Error Codes
+
+Server-side errors carry a specific error code (>0 or -1). Client-side errors (connection failures, type mismatches, etc.) use `ErrorCode::CLIENT_ERROR` (-2). Use `fluss::ErrorCode` to match on specific codes:
+
+```cpp
+fluss::Result result = admin.DropTable(table_path);
+if (!result.Ok()) {
+    if (result.error_code == fluss::ErrorCode::TABLE_NOT_EXIST) {
+        std::cerr << "Table does not exist" << std::endl;
+    } else if (result.error_code == fluss::ErrorCode::PARTITION_NOT_EXISTS) {
+        std::cerr << "Partition does not exist" << std::endl;
+    } else if (result.error_code == fluss::ErrorCode::CLIENT_ERROR) {
+        std::cerr << "Client-side error: " << result.error_message << std::endl;
+    } else {
+        std::cerr << "Server error (code " << result.error_code
+                  << "): " << result.error_message << std::endl;
+    }
+}
+```
+
+### Common Error Codes
+
+| Constant                                      | Code | Description                         |
+|-----------------------------------------------|------|-------------------------------------|
+| `ErrorCode::CLIENT_ERROR`                     | -2   | Client-side error (not from server) |
+| `ErrorCode::UNKNOWN_SERVER_ERROR`             | -1   | Unexpected server error             |
+| `ErrorCode::NETWORK_EXCEPTION`                | 1    | Server disconnected before response |
+| `ErrorCode::DATABASE_NOT_EXIST`               | 4    | Database does not exist             |
+| `ErrorCode::DATABASE_ALREADY_EXIST`           | 6    | Database already exists             |
+| `ErrorCode::TABLE_NOT_EXIST`                  | 7    | Table does not exist                |
+| `ErrorCode::TABLE_ALREADY_EXIST`              | 8    | Table already exists                |
+| `ErrorCode::INVALID_TABLE_EXCEPTION`          | 15   | Invalid table operation             |
+| `ErrorCode::REQUEST_TIME_OUT`                 | 25   | Request timed out                   |
+| `ErrorCode::PARTITION_NOT_EXISTS`             | 36   | Partition does not exist            |
+| `ErrorCode::PARTITION_ALREADY_EXISTS`         | 42   | Partition already exists            |
+| `ErrorCode::PARTITION_SPEC_INVALID_EXCEPTION` | 43   | Invalid partition spec              |
+| `ErrorCode::LEADER_NOT_AVAILABLE_EXCEPTION`   | 44   | No leader available for partition   |
+| `ErrorCode::AUTHENTICATE_EXCEPTION`           | 46   | Authentication failed (bad credentials) |
+
+See `fluss::ErrorCode` in `fluss.hpp` for the full list of named constants.
+
+## Retry Logic
+
+Some errors are transient: the server may be temporarily unavailable, mid-election, or under load. Use `IsRetriable()` to decide whether to retry an operation rather than treating the error as permanent.
+
+Call `IsRetriable()` directly on the `Result`; it is backed by the static helper `ErrorCode::IsRetriable(int32_t code)`:
+
+```cpp
+fluss::Result result = writer.Append(row);
+if (!result.Ok()) {
+    if (result.IsRetriable()) {
+        // Transient failure — safe to retry 
+    } else {
+        // Permanent failure — log and abort
+        std::cerr << "Fatal error (code " << result.error_code
+                  << "): " << result.error_message << std::endl;
+    }
+}
+```
+
+`Result::IsRetriable()` delegates to `ErrorCode::IsRetriable()`, so you can also call it directly on the code:
+
+```cpp
+if (fluss::ErrorCode::IsRetriable(result.error_code)) {
+    // retry
+}
+```
+
+### Retriable Error Codes
+
+| Constant                                                    | Code | Reason                                    |
+|-------------------------------------------------------------|------|-------------------------------------------|
+| `ErrorCode::NETWORK_EXCEPTION`                          | 1    | Server disconnected                       |
+| `ErrorCode::CORRUPT_MESSAGE`                            | 3    | CRC or size error                         |
+| `ErrorCode::SCHEMA_NOT_EXIST`                           | 9    | Schema may not exist                      |
+| `ErrorCode::LOG_STORAGE_EXCEPTION`                      | 10   | Transient log storage error               |
+| `ErrorCode::KV_STORAGE_EXCEPTION`                       | 11   | Transient KV storage error                |
+| `ErrorCode::NOT_LEADER_OR_FOLLOWER`                     | 12   | Leader election in progress               |
+| `ErrorCode::CORRUPT_RECORD_EXCEPTION`                   | 14   | Corrupt record                            |
+| `ErrorCode::UNKNOWN_TABLE_OR_BUCKET_EXCEPTION`          | 21   | Metadata not yet available                |
+| `ErrorCode::REQUEST_TIME_OUT`                           | 25   | Request timed out                         |
+| `ErrorCode::STORAGE_EXCEPTION`                          | 26   | Transient storage error                   |
+| `ErrorCode::NOT_ENOUGH_REPLICAS_AFTER_APPEND_EXCEPTION` | 28   | Wrote to server but with low ISR size     |
+| `ErrorCode::NOT_ENOUGH_REPLICAS_EXCEPTION`              | 29   | Low ISR size at write time                |
+| `ErrorCode::LEADER_NOT_AVAILABLE_EXCEPTION`             | 44   | No leader available for partition         |
+
+`IsRetriable()` always returns `false` for client-side errors (`ErrorCode::CLIENT_ERROR`, code -2).
+
+## Common Error Scenarios
+
+### Connection Refused
+
+The cluster is not running or the address is incorrect:
+
+```cpp
+fluss::Configuration config;
+config.bootstrap_servers = "127.0.0.1:9123";
+fluss::Connection conn;
+fluss::Result result = fluss::Connection::Create(config, conn);
+if (!result.Ok()) {
+    // "Connection refused" or timeout error
+    std::cerr << "Cannot connect to cluster: " << result.error_message << std::endl;
+}
+```
+
+### Table Not Found
+
+Attempting to access a table that does not exist:
+
+```cpp
+fluss::Table table;
+fluss::Result result = conn.GetTable(fluss::TablePath("fluss", "nonexistent"), table);
+if (!result.Ok()) {
+    if (result.error_code == fluss::ErrorCode::TABLE_NOT_EXIST) {
+        std::cerr << "Table not found" << std::endl;
+    }
+}
+```
+
+### Partition Not Found
+
+Writing to a partitioned primary key table before creating partitions:
+
+```cpp
+// This will fail if partitions are not created first
+auto row = table.NewRow();
+row.Set("user_id", 1);
+row.Set("region", "US");
+row.Set("score", static_cast<int64_t>(100));
+fluss::WriteResult wr;
+fluss::Result result = writer.Upsert(row, wr);
+if (!result.Ok()) {
+    if (result.error_code == fluss::ErrorCode::PARTITION_NOT_EXISTS) {
+        std::cerr << "Partition not found, create partitions before writing" << std::endl;
+    }
+}
+```
+
+### Authentication Failed
+
+SASL credentials are incorrect or the user does not exist:
+
+```cpp
+fluss::Configuration config;
+config.bootstrap_servers = "127.0.0.1:9123";
+config.security_protocol = "sasl";
+config.security_sasl_username = "admin";
+config.security_sasl_password = "wrong-password";
+
+fluss::Connection conn;
+fluss::Result result = fluss::Connection::Create(config, conn);
+if (!result.Ok()) {
+    if (result.error_code == fluss::ErrorCode::AUTHENTICATE_EXCEPTION) {
+        std::cerr << "Authentication failed: " << result.error_message << std::endl;
+    }
+}
+```
+
+### Schema Mismatch
+
+Using incorrect types or column indices when writing:
+
+```cpp
+fluss::GenericRow row;
+// Setting wrong type for a column will result in an error
+// when the row is sent to the server
+row.SetString(0, "not_an_integer");  // Column 0 expects Int
+fluss::Result result = writer.Append(row);
+if (!result.Ok()) {
+    std::cerr << "Schema mismatch: " << result.error_message << std::endl;
+}
+```
+
+## Best Practices
+
+1. **Always check `Result`**: Never ignore the return value of operations that return `Result`.
+2. **Handle errors gracefully**: Log errors and retry or fail gracefully rather than crashing.
+3. **Verify connection state**: Use `Available()` to check connection validity before operations.
+4. **Create partitions before writing**: For partitioned primary key tables, always create partitions before attempting upserts.
diff --git a/website/docs/apis/cpp/example/_category_.json b/website/docs/apis/cpp/example/_category_.json
new file mode 100644
index 0000000000..4d81ec12ae
--- /dev/null
+++ b/website/docs/apis/cpp/example/_category_.json
@@ -0,0 +1,4 @@
+{
+  "label": "Examples",
+  "position": 5
+}
diff --git a/website/docs/apis/cpp/example/admin-operations.md b/website/docs/apis/cpp/example/admin-operations.md
new file mode 100644
index 0000000000..0f08549a0e
--- /dev/null
+++ b/website/docs/apis/cpp/example/admin-operations.md
@@ -0,0 +1,158 @@
+---
+sidebar_position: 3
+---
+# Admin Operations
+
+## Get Admin Interface
+
+```cpp
+fluss::Admin admin;
+conn.GetAdmin(admin);
+```
+
+## Database Operations
+
+```cpp
+// Create database
+fluss::DatabaseDescriptor db_descriptor;
+db_descriptor.comment = "My database";
+admin.CreateDatabase("my_database", db_descriptor, true);
+
+// List all databases
+std::vector<std::string> databases;
+admin.ListDatabases(databases);
+for (const auto& db : databases) {
+    std::cout << "Database: " << db << std::endl;
+}
+
+// Check if database exists
+bool exists = false;
+admin.DatabaseExists("my_database", exists);
+
+// Get database information
+fluss::DatabaseInfo db_info;
+admin.GetDatabaseInfo("my_database", db_info);
+std::cout << "Database: " << db_info.database_name << std::endl;
+
+// Drop database
+admin.DropDatabase("my_database", true, false);
+```
+
+## Table Operations
+
+```cpp
+fluss::TablePath table_path("fluss", "my_table");
+
+auto schema = fluss::Schema::NewBuilder()
+    .AddColumn("id", fluss::DataType::Int())
+    .AddColumn("name", fluss::DataType::String())
+    .AddColumn("score", fluss::DataType::Float())
+    .AddColumn("age", fluss::DataType::Int())
+    .Build();
+
+auto descriptor = fluss::TableDescriptor::NewBuilder()
+    .SetSchema(schema)
+    .SetBucketCount(3)
+    .SetComment("Example table")
+    .Build();
+
+// Create table
+admin.CreateTable(table_path, descriptor, true);
+
+// Get table information
+fluss::TableInfo table_info;
+admin.GetTableInfo(table_path, table_info);
+std::cout << "Table ID: " << table_info.table_id << std::endl;
+std::cout << "Number of buckets: " << table_info.num_buckets << std::endl;
+std::cout << "Has primary key: " << table_info.has_primary_key << std::endl;
+std::cout << "Is partitioned: " << table_info.is_partitioned << std::endl;
+
+// Drop table
+admin.DropTable(table_path, true);
+```
+
+## Schema Builder Options
+
+```cpp
+// Schema with primary key
+auto pk_schema = fluss::Schema::NewBuilder()
+    .AddColumn("id", fluss::DataType::Int())
+    .AddColumn("name", fluss::DataType::String())
+    .AddColumn("value", fluss::DataType::Double())
+    .SetPrimaryKeys({"id"})
+    .Build();
+
+// Table descriptor with partitioning
+auto descriptor = fluss::TableDescriptor::NewBuilder()
+    .SetSchema(schema)
+    .SetPartitionKeys({"date"})
+    .SetBucketCount(3)
+    .SetBucketKeys({"user_id"})
+    .SetProperty("retention_days", "7")
+    .SetComment("Sample table")
+    .Build();
+```
+
+## Partition Operations
+
+```cpp
+// Create a partition
+std::unordered_map<std::string, std::string> partition_spec = {{"region", "US"}};
+admin.CreatePartition(table_path, partition_spec, true);
+
+// List all partitions
+std::vector<fluss::PartitionInfo> partitions;
+admin.ListPartitionInfos(table_path, partitions);
+for (const auto& p : partitions) {
+    std::cout << "Partition: id=" << p.partition_id
+              << ", name=" << p.partition_name << std::endl;
+}
+
+// Drop a partition
+admin.DropPartition(table_path, partition_spec, true);
+```
+
+## Offset Operations
+
+```cpp
+std::vector<int32_t> bucket_ids = {0, 1, 2};
+
+// Query earliest offsets
+std::unordered_map<int32_t, int64_t> earliest_offsets;
+admin.ListOffsets(table_path, bucket_ids,
+                  fluss::OffsetSpec::Earliest(), earliest_offsets);
+
+// Query latest offsets
+std::unordered_map<int32_t, int64_t> latest_offsets;
+admin.ListOffsets(table_path, bucket_ids,
+                  fluss::OffsetSpec::Latest(), latest_offsets);
+
+// Query offsets for a specific timestamp
+std::unordered_map<int32_t, int64_t> timestamp_offsets;
+admin.ListOffsets(table_path, bucket_ids,
+                  fluss::OffsetSpec::Timestamp(timestamp_ms),
+                  timestamp_offsets);
+
+// Query partition offsets
+std::unordered_map<int32_t, int64_t> partition_offsets;
+admin.ListPartitionOffsets(table_path, "partition_name",
+                           bucket_ids, fluss::OffsetSpec::Latest(),
+                           partition_offsets);
+```
+
+## Lake Snapshot
+
+:::note
+Lake snapshots require [lake integration](https://fluss.apache.org/docs/maintenance/tiered-storage/overview/) (e.g. Paimon or Iceberg) to be enabled on the server. Without it, `GetLatestLakeSnapshot` will return an error.
+:::
+
+```cpp
+fluss::LakeSnapshot snapshot;
+admin.GetLatestLakeSnapshot(table_path, snapshot);
+std::cout << "Snapshot ID: " << snapshot.snapshot_id << std::endl;
+for (const auto& bucket_offset : snapshot.bucket_offsets) {
+    std::cout << "  Table " << bucket_offset.table_id
+              << ", Bucket " << bucket_offset.bucket_id
+              << ": offset=" << bucket_offset.offset << std::endl;
+}
+```
diff --git a/website/docs/apis/cpp/example/configuration.md b/website/docs/apis/cpp/example/configuration.md
new file mode 100644
index 0000000000..38202618c9
--- /dev/null
+++ b/website/docs/apis/cpp/example/configuration.md
@@ -0,0 +1,42 @@
+---
+sidebar_position: 2
+---
+# Configuration
+
+## Connection Setup
+
+```cpp
+#include "fluss.hpp"
+
+fluss::Configuration config;
+config.bootstrap_servers = "127.0.0.1:9123";
+
+fluss::Connection conn;
+fluss::Result result = fluss::Connection::Create(config, conn);
+
+if (!result.Ok()) {
+    std::cerr << "Connection failed: " << result.error_message << std::endl;
+}
+```
+
+## Connection Configurations
+
+All fields have sensible defaults. Only `bootstrap_servers` typically needs to be set.
+
+See the [`Configuration`](../api-reference.md#configuration) section in the API Reference for the full list of configuration fields, types, and defaults.
+
+## SASL Authentication
+
+To connect to a Fluss cluster with SASL/PLAIN authentication enabled:
+
+```cpp
+fluss::Configuration config;
+config.bootstrap_servers = "127.0.0.1:9123";
+config.security_protocol = "sasl";
+config.security_sasl_mechanism = "PLAIN";
+config.security_sasl_username = "admin";
+config.security_sasl_password = "admin-secret";
+
+fluss::Connection conn;
+fluss::Result result = fluss::Connection::Create(config, conn);
+```
diff --git a/website/docs/apis/cpp/example/index.md b/website/docs/apis/cpp/example/index.md
new file mode 100644
index 0000000000..51f60e4175
--- /dev/null
+++ b/website/docs/apis/cpp/example/index.md
@@ -0,0 +1,63 @@
+---
+sidebar_position: 1
+---
+# Example
+
+Minimal working example: connect to Fluss, create a table, write data, and read it back.
+
+```cpp
+#include <iostream>
+#include "fluss.hpp"
+
+int main() {
+    // Connect
+    fluss::Configuration config;
+    config.bootstrap_servers = "127.0.0.1:9123";
+
+    fluss::Connection conn;
+    fluss::Connection::Create(config, conn);
+
+    fluss::Admin admin;
+    conn.GetAdmin(admin);
+
+    // Create a log table
+    fluss::TablePath table_path("fluss", "quickstart_cpp");
+    auto schema = fluss::Schema::NewBuilder()
+        .AddColumn("id", fluss::DataType::Int())
+        .AddColumn("name", fluss::DataType::String())
+        .Build();
+    auto descriptor = fluss::TableDescriptor::NewBuilder()
+        .SetSchema(schema)
+        .Build();
+    admin.CreateTable(table_path, descriptor, true);
+
+    // Write
+    fluss::Table table;
+    conn.GetTable(table_path, table);
+
+    fluss::AppendWriter writer;
+    table.NewAppend().CreateWriter(writer);
+
+    fluss::GenericRow row;
+    row.SetInt32(0, 1);
+    row.SetString(1, "hello");
+    writer.Append(row);
+    writer.Flush();
+
+    // Read
+    fluss::LogScanner scanner;
+    table.NewScan().CreateLogScanner(scanner);
+    auto info = table.GetTableInfo();
+    for (int b = 0; b < info.num_buckets; ++b) {
+        scanner.Subscribe(b, 0);
+    }
+    fluss::ScanRecords records;
+    scanner.Poll(5000, records);
+    for (const auto& rec : records) {
+        std::cout << "id=" << rec.row.GetInt32(0)
+                  << ", name=" << rec.row.GetString(1) << std::endl;
+    }
+
+    return 0;
+}
+```
diff --git a/website/docs/apis/cpp/example/log-tables.md b/website/docs/apis/cpp/example/log-tables.md
new file mode 100644
index 0000000000..0125a4ce29
--- /dev/null
+++ b/website/docs/apis/cpp/example/log-tables.md
@@ -0,0 +1,161 @@
+---
+sidebar_position: 4
+---
+# Log Tables
+
+Log tables are append-only tables without primary keys, suitable for event streaming.
+
+## Creating a Log Table
+
+```cpp
+auto schema = fluss::Schema::NewBuilder()
+    .AddColumn("event_id", fluss::DataType::Int())
+    .AddColumn("event_type", fluss::DataType::String())
+    .AddColumn("timestamp", fluss::DataType::BigInt())
+    .Build();
+
+auto descriptor = fluss::TableDescriptor::NewBuilder()
+    .SetSchema(schema)
+    .Build();
+
+fluss::TablePath table_path("fluss", "events");
+admin.CreateTable(table_path, descriptor, true);
+```
+
+## Writing to Log Tables
+
+```cpp
+fluss::Table table;
+conn.GetTable(table_path, table);
+
+fluss::AppendWriter writer;
+table.NewAppend().CreateWriter(writer);
+
+fluss::GenericRow row;
+row.SetInt32(0, 1);           // event_id
+row.SetString(1, "user_login");  // event_type
+row.SetInt64(2, 1704067200000L); // timestamp
+writer.Append(row);
+
+writer.Flush();
+```
+
+## Reading from Log Tables
+
+```cpp
+fluss::LogScanner scanner;
+table.NewScan().CreateLogScanner(scanner);
+
+auto info = table.GetTableInfo();
+for (int b = 0; b < info.num_buckets; ++b) {
+    scanner.Subscribe(b, 0);
+}
+
+fluss::ScanRecords records;
+scanner.Poll(5000, records);  // timeout in ms
+
+for (const auto& rec : records) {
+    std::cout << "event_id=" << rec.row.GetInt32(0)
+              << " event_type=" << rec.row.GetString(1)
+              << " timestamp=" << rec.row.GetInt64(2)
+              << " @ offset=" << rec.offset << std::endl;
+}
+
+// Or per-bucket access
+for (const auto& bucket : records.Buckets()) {
+    auto view = records.Records(bucket);
+    std::cout << "Bucket " << bucket.bucket_id << ": "
+              << view.Size() << " records" << std::endl;
+    for (const auto& rec : view) {
+        std::cout << "  event_id=" << rec.row.GetInt32(0)
+                  << " event_type=" << rec.row.GetString(1)
+                  << " @ offset=" << rec.offset << std::endl;
+    }
+}
+```
+
+**Continuous polling:**
+
+```cpp
+while (running) {
+    fluss::ScanRecords records;
+    scanner.Poll(1000, records);
+    for (const auto& rec : records) {
+        process(rec);
+    }
+}
+```
+
+**Accumulating records across polls:**
+
+`ScanRecord` is a value type — it can be freely copied, stored, and accumulated. The underlying data stays alive via reference counting (zero-copy).
+
+```cpp
+std::vector<fluss::ScanRecord> all_records;
+while (all_records.size() < 1000) {
+    fluss::ScanRecords records;
+    scanner.Poll(1000, records);
+    for (const auto& rec : records) {
+        all_records.push_back(rec);  // ref-counted, no data copy
+    }
+}
+// all_records is valid — each record keeps its data alive
+```
+
+**Batch subscribe:**
+
+```cpp
+std::vector<fluss::BucketSubscription> subscriptions;
+subscriptions.push_back({0, 0});    // bucket 0, offset 0
+subscriptions.push_back({1, 100});  // bucket 1, offset 100
+scanner.Subscribe(subscriptions);
+```
+
+**Unsubscribe from a bucket:**
+
+```cpp
+// Stop receiving records from bucket 1
+scanner.Unsubscribe(1);
+```
+
+**Arrow RecordBatch polling (high performance):**
+
+```cpp
+#include <arrow/record_batch.h>
+
+fluss::LogScanner arrow_scanner;
+table.NewScan().CreateRecordBatchLogScanner(arrow_scanner);
+
+for (int b = 0; b < info.num_buckets; ++b) {
+    arrow_scanner.Subscribe(b, 0);
+}
+
+fluss::ArrowRecordBatches batches;
+arrow_scanner.PollRecordBatch(5000, batches);
+
+for (size_t i = 0; i < batches.Size(); ++i) {
+    const auto& batch = batches[i];
+    if (batch->Available()) {
+        auto arrow_batch = batch->GetArrowRecordBatch();
+        std::cout << "Batch " << i << ": " << arrow_batch->num_rows() << " rows"
+                  << ", partition_id=" << batch->GetPartitionId()
+                  << ", bucket_id=" << batch->GetBucketId() << std::endl;
+    }
+}
+```
+
+## Column Projection
+
+```cpp
+// Project by column index
+fluss::LogScanner projected_scanner;
+table.NewScan().ProjectByIndex({0, 2}).CreateLogScanner(projected_scanner);
+
+// Project by column name
+fluss::LogScanner name_projected_scanner;
+table.NewScan().ProjectByName({"event_id", "timestamp"}).CreateLogScanner(name_projected_scanner);
+
+// Arrow RecordBatch with projection
+fluss::LogScanner projected_arrow_scanner;
+table.NewScan().ProjectByIndex({0, 2}).CreateRecordBatchLogScanner(projected_arrow_scanner);
+```
diff --git a/website/docs/apis/cpp/example/partitioned-tables.md b/website/docs/apis/cpp/example/partitioned-tables.md
new file mode 100644
index 0000000000..17c1c2057d
--- /dev/null
+++ b/website/docs/apis/cpp/example/partitioned-tables.md
@@ -0,0 +1,179 @@
+---
+sidebar_position: 6
+---
+# Partitioned Tables
+
+Partitioned tables distribute data across partitions based on partition column values, enabling efficient data organization and querying. Both log tables and primary key tables support partitioning.
+
+## Partitioned Log Tables
+
+### Creating a Partitioned Log Table
+
+```cpp
+auto schema = fluss::Schema::NewBuilder()
+    .AddColumn("event_id", fluss::DataType::Int())
+    .AddColumn("event_type", fluss::DataType::String())
+    .AddColumn("dt", fluss::DataType::String())
+    .AddColumn("region", fluss::DataType::String())
+    .Build();
+
+auto descriptor = fluss::TableDescriptor::NewBuilder()
+    .SetSchema(schema)
+    .SetPartitionKeys({"dt", "region"})
+    .SetBucketCount(3)
+    .Build();
+
+fluss::TablePath table_path("fluss", "partitioned_events");
+admin.CreateTable(table_path, descriptor, true);
+```
+
+### Writing to Partitioned Log Tables
+
+**Partitions must exist before writing data; otherwise, the client will by default retry indefinitely.** Create them first with `admin.CreatePartition` (see [Managing Partitions](#managing-partitions)). Include partition column values in each row; the client routes records to the correct partition automatically.
+
+```cpp
+fluss::Table table;
+conn.GetTable(table_path, table);
+
+fluss::AppendWriter writer;
+table.NewAppend().CreateWriter(writer);
+
+fluss::GenericRow row;
+row.SetInt32(0, 1);
+row.SetString(1, "user_login");
+row.SetString(2, "2024-01-15");
+row.SetString(3, "US");
+writer.Append(row);
+writer.Flush();
+```
+
+### Reading from Partitioned Log Tables
+
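+For partitioned tables, use partition-aware subscribe methods. The examples below assume `partition_infos` has already been populated with `admin.ListPartitionInfos` (see [Managing Partitions](#managing-partitions)).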
+
+```cpp
+fluss::Table table;
+conn.GetTable(table_path, table);
+
+fluss::LogScanner scanner;
+table.NewScan().CreateLogScanner(scanner);
+
+// Subscribe to individual partitions
+for (const auto& pi : partition_infos) {
+    scanner.SubscribePartitionBuckets(pi.partition_id, 0, 0);
+}
+
+fluss::ScanRecords records;
+scanner.Poll(5000, records);
+
+for (const auto& rec : records) {
+    std::cout << "bucket_id=" << rec.bucket_id
+              << " offset=" << rec.offset << std::endl;
+}
+
+// Or batch-subscribe to all partitions at once
+fluss::LogScanner batch_scanner;
+table.NewScan().CreateLogScanner(batch_scanner);
+
+std::vector<fluss::PartitionBucketSubscription> subs;
+for (const auto& pi : partition_infos) {
+    subs.push_back({pi.partition_id, 0, 0});
+}
+batch_scanner.SubscribePartitionBuckets(subs);
+```
+
+**Unsubscribe from a partition bucket:**
+
+```cpp
+// Stop receiving records from a specific partition bucket
+scanner.UnsubscribePartition(partition_infos[0].partition_id, 0);
+```
+
+### Managing Partitions
+
+```cpp
+// Create a partition
+admin.CreatePartition(table_path, {{"dt", "2024-01-15"}, {"region", "US"}}, true);
+
+// List partitions
+std::vector<fluss::PartitionInfo> partition_infos;
+admin.ListPartitionInfos(table_path, partition_infos);
+
+// Query partition offsets
+std::vector<int32_t> bucket_ids = {0, 1, 2};
+std::unordered_map<int32_t, int64_t> offsets;
+admin.ListPartitionOffsets(table_path, "2024-01-15$US",
+                           bucket_ids, fluss::OffsetSpec::Latest(), offsets);
+```
+
+## Partitioned Primary Key Tables
+
+Partitioned KV tables combine partitioning with primary key operations. Partition columns must be part of the primary key.
+
+### Creating a Partitioned Primary Key Table
+
+```cpp
+auto schema = fluss::Schema::NewBuilder()
+    .AddColumn("user_id", fluss::DataType::Int())
+    .AddColumn("region", fluss::DataType::String())
+    .AddColumn("zone", fluss::DataType::BigInt())
+    .AddColumn("score", fluss::DataType::BigInt())
+    .SetPrimaryKeys({"user_id", "region", "zone"})
+    .Build();
+
+auto descriptor = fluss::TableDescriptor::NewBuilder()
+    .SetSchema(schema)
+    .SetPartitionKeys({"region", "zone"})
+    .SetBucketCount(3)
+    .Build();
+
+fluss::TablePath table_path("fluss", "partitioned_users");
+admin.CreateTable(table_path, descriptor, true);
+```
+
+### Writing to Partitioned Primary Key Tables
+
+**Partitions must exist before upserting data; otherwise, the client will by default retry indefinitely.**
+
+```cpp
+fluss::Table table;
+conn.GetTable(table_path, table);
+
+// Create partitions first
+admin.CreatePartition(table_path, {{"region", "APAC"}, {"zone", "1"}}, true);
+admin.CreatePartition(table_path, {{"region", "EMEA"}, {"zone", "2"}}, true);
+admin.CreatePartition(table_path, {{"region", "US"}, {"zone", "3"}}, true);
+
+fluss::UpsertWriter writer;
+table.NewUpsert().CreateWriter(writer);
+
+auto row = table.NewRow();
+row.Set("user_id", 1001);
+row.Set("region", "APAC");
+row.Set("zone", static_cast<int64_t>(1));
+row.Set("score", static_cast<int64_t>(1234));
+writer.Upsert(row);
+writer.Flush();
+```
+
+### Looking Up Records in Partitioned Tables
+
+Lookup requires all primary key columns including partition columns.
+
+> **Note:** Scanning partitioned primary key tables is not supported. Use lookup operations instead.
+
+```cpp
+fluss::Lookuper lookuper;
+table.NewLookup().CreateLookuper(lookuper);
+
+auto pk = table.NewRow();
+pk.Set("user_id", 1001);
+pk.Set("region", "APAC");
+pk.Set("zone", static_cast<int64_t>(1));
+
+fluss::LookupResult result;
+lookuper.Lookup(pk, result);
+if (result.Found()) {
+    std::cout << "score=" << result.GetInt64(3) << std::endl;
+}
+```
diff --git a/website/docs/apis/cpp/example/primary-key-tables.md b/website/docs/apis/cpp/example/primary-key-tables.md
new file mode 100644
index 0000000000..f26b5477a7
--- /dev/null
+++ b/website/docs/apis/cpp/example/primary-key-tables.md
@@ -0,0 +1,132 @@
+---
+sidebar_position: 5
+---
+# Primary Key Tables
+
+Primary key tables (KV tables) support upsert, delete, and lookup operations.
+
+## Creating a Primary Key Table
+
+```cpp
+auto schema = fluss::Schema::NewBuilder()
+    .AddColumn("id", fluss::DataType::Int())
+    .AddColumn("name", fluss::DataType::String())
+    .AddColumn("age", fluss::DataType::BigInt())
+    .SetPrimaryKeys({"id"})
+    .Build();
+
+auto descriptor = fluss::TableDescriptor::NewBuilder()
+    .SetSchema(schema)
+    .SetBucketCount(3)
+    .Build();
+
+fluss::TablePath table_path("fluss", "users");
+admin.CreateTable(table_path, descriptor, true);
+```
+
+## Upserting Records
+
+```cpp
+fluss::Table table;
+conn.GetTable(table_path, table);
+
+fluss::UpsertWriter upsert_writer;
+table.NewUpsert().CreateWriter(upsert_writer);
+
+// Fire-and-forget upserts
+{
+    auto row = table.NewRow();
+    row.Set("id", 1);
+    row.Set("name", "Alice");
+    row.Set("age", static_cast<int64_t>(25));
+    upsert_writer.Upsert(row);
+}
+{
+    auto row = table.NewRow();
+    row.Set("id", 2);
+    row.Set("name", "Bob");
+    row.Set("age", static_cast<int64_t>(30));
+    upsert_writer.Upsert(row);
+}
+upsert_writer.Flush();
+
+// Per-record acknowledgment
+{
+    auto row = table.NewRow();
+    row.Set("id", 3);
+    row.Set("name", "Charlie");
+    row.Set("age", static_cast<int64_t>(35));
+    fluss::WriteResult wr;
+    upsert_writer.Upsert(row, wr);
+    wr.Wait();
+}
+```
+
+## Updating Records
+
+Upsert with the same primary key to update an existing record.
+
+```cpp
+auto row = table.NewRow();
+row.Set("id", 1);
+row.Set("name", "Alice Updated");
+row.Set("age", static_cast<int64_t>(26));
+fluss::WriteResult wr;
+upsert_writer.Upsert(row, wr);
+wr.Wait();
+```
+
+## Deleting Records
+
+```cpp
+auto pk_row = table.NewRow();
+pk_row.Set("id", 2);
+fluss::WriteResult wr;
+upsert_writer.Delete(pk_row, wr);
+wr.Wait();
+```
+
+## Partial Updates
+
+Update only specific columns while preserving others.
+
+```cpp
+// By column names
+fluss::UpsertWriter partial_writer;
+table.NewUpsert()
+    .PartialUpdateByName({"id", "age"})
+    .CreateWriter(partial_writer);
+
+auto row = table.NewRow();
+row.Set("id", 1);
+row.Set("age", static_cast<int64_t>(27));
+fluss::WriteResult wr;
+partial_writer.Upsert(row, wr);
+wr.Wait();
+
+// By column indices
+fluss::UpsertWriter partial_writer_idx;
+table.NewUpsert()
+    .PartialUpdateByIndex({0, 2})
+    .CreateWriter(partial_writer_idx);
+```
+
+## Looking Up Records
+
+```cpp
+fluss::Lookuper lookuper;
+table.NewLookup().CreateLookuper(lookuper);
+
+auto pk_row = table.NewRow();
+pk_row.Set("id", 1);
+
+fluss::LookupResult result;
+lookuper.Lookup(pk_row, result);
+
+if (result.Found()) {
+    std::cout << "Found: name=" << result.GetString(1)
+              << ", age=" << result.GetInt64(2) << std::endl;
+} else {
+    std::cout << "Not found" << std::endl;
+}
+```
diff --git a/website/docs/apis/cpp/installation.md b/website/docs/apis/cpp/installation.md
new file mode 100644
index 0000000000..6360da4369
--- /dev/null
+++ b/website/docs/apis/cpp/installation.md
@@ -0,0 +1,107 @@
+---
+sidebar_position: 1
+---
+# Installation
+
+The C++ bindings are not yet published as a package. You need to build from source.
+
+**Prerequisites:** CMake 3.22+, C++17 compiler, Rust 1.85+, Apache Arrow C++ library
+
+```bash
+git clone https://github.com/apache/fluss-rust.git
+cd fluss-rust
+```
+
+Install dependencies:
+
+```bash
+# macOS
+brew install cmake arrow
+
+# Ubuntu/Debian
+sudo apt-get install cmake libarrow-dev
+```
+
+If Arrow is not available via your package manager, build it from source:
+
+```bash
+git clone https://github.com/apache/arrow.git
+cd arrow/cpp
+cmake -B build -DARROW_BUILD_SHARED=ON
+cmake --build build
+sudo cmake --install build
+```
+
+Build the C++ bindings:
+
+```bash
+cd bindings/cpp
+mkdir -p build && cd build
+
+# Debug mode
+cmake ..
+
+# Or Release mode
+cmake -DCMAKE_BUILD_TYPE=Release ..
+
+# Build
+cmake --build .
+```
+
+This produces:
+- `libfluss_cpp.a` (Static library)
+- `fluss_cpp_example` (Example executable)
+- Header files in `include/`
+
+## Integrating into Your Project
+
+**Option 1: CMake FetchContent**
+
+```cmake
+include(FetchContent)
+FetchContent_Declare(
+    fluss-cpp
+    GIT_REPOSITORY https://github.com/apache/fluss-rust.git
+    SOURCE_SUBDIR bindings/cpp
+)
+FetchContent_MakeAvailable(fluss-cpp)
+
+target_link_libraries(your_target PRIVATE fluss_cpp)
+```
+
+**Option 2: Manual Integration**
+
+Copy the build artifacts and configure CMake:
+
+```cmake
+find_package(Arrow REQUIRED)
+
+add_library(fluss_cpp STATIC IMPORTED)
+set_target_properties(fluss_cpp PROPERTIES
+    IMPORTED_LOCATION ${CMAKE_SOURCE_DIR}/lib/libfluss_cpp.a
+    INTERFACE_INCLUDE_DIRECTORIES ${CMAKE_SOURCE_DIR}/include
+)
+
+target_link_libraries(your_target
+    PRIVATE
+    fluss_cpp
+    Arrow::arrow_shared
+    ${CMAKE_DL_LIBS}
+    Threads::Threads
+)
+
+# On macOS, also link these frameworks
+if(APPLE)
+    target_link_libraries(your_target PRIVATE
+        "-framework CoreFoundation"
+        "-framework Security"
+    )
+endif()
+```
+
+**Option 3: Subdirectory**
+
+```cmake
+add_subdirectory(vendor/fluss-rust/bindings/cpp)
+target_link_libraries(your_target PRIVATE fluss_cpp)
+```
diff --git a/website/docs/apis/index.md b/website/docs/apis/index.md
new file mode 100644
index 0000000000..342782883c
--- /dev/null
+++ b/website/docs/apis/index.md
@@ -0,0 +1,42 @@
+---
+sidebar_position: 1
+title: Introduction
+---
+
+# Introduction
+
+[Apache Fluss](https://fluss.apache.org/) (incubating) is a streaming storage system built for real-time analytics, serving as the real-time data layer for Lakehouse architectures.
+
+This documentation covers the **Fluss client libraries** for [Java](./java/index.md), Rust, Python, and C++, which are developed in the [Apache Fluss](https://github.com/apache/fluss) repository. These clients allow you to:
+
+- **Create and manage** databases, tables, and partitions
+- **Write** data to log tables (append-only) and primary key tables (upsert/delete)
+- **Read** data via log scanning and key lookups
+- **Integrate** with the broader Fluss ecosystem including lakehouse snapshots
+
+## Prerequisites
+
+You need a running Fluss cluster to use any of the client libraries. See the [Deploying a Local Cluster](https://fluss.apache.org/docs/install-deploy/deploying-local-cluster/) guide to get started.
+
+## Key Concepts
+
+- **Log table** — an append-only table (no primary key). Records are immutable once written. Use for event streams, logs, and audit trails.
+  - **Offset** — the position of a record within a log table's bucket. Used to track reading progress. Start from `EARLIEST_OFFSET` to read all data, or resolve the current latest offset via `list_offsets` to only read new records.
+- **Primary key (PK) table** — a table with a primary key. Supports upsert, delete, and point lookups.
+- **Bucket** — the unit of parallelism within a table (similar to Kafka partitions). Each table has one or more buckets. Readers subscribe to individual buckets.
+- **Partition** — a way to organize data by column values (e.g. by date or region). Each partition contains its own set of buckets. Partitions must be created explicitly before writing.
+
+## Client Overview
+
+|                        | Rust                                                       | Python                   | C++                                            |
+|------------------------|------------------------------------------------------------|--------------------------|------------------------------------------------|
+| **Package**            | [fluss-rs](https://crates.io/crates/fluss-rs) on crates.io | Build from source (PyO3) | Build from source (CMake)                      |
+| **Async runtime**      | Tokio                                                      | asyncio                  | Synchronous (Tokio runtime managed internally) |
+| **Data format**        | Arrow RecordBatch / GenericRow                             | PyArrow / Pandas / dict  | Arrow RecordBatch / GenericRow                 |
+| **Log tables**         | Read + Write                                               | Read + Write             | Read + Write                                   |
+| **Primary key tables** | Upsert + Delete + Lookup                                   | Upsert + Delete + Lookup | Upsert + Delete + Lookup                       |
+| **Partitioned tables** | Read + Write                                               | Read + Write             | Read + Write                                   |
+
+## How This Guide Is Organised
+
+These guides walk through installation, configuration, and working with each table type. Code examples for Rust, Python, and C++ are shown side by side; the Java client has its own comprehensive guide.
diff --git a/website/docs/apis/java/_category_.json b/website/docs/apis/java/_category_.json
new file mode 100644
index 0000000000..5d19ed2ea3
--- /dev/null
+++ b/website/docs/apis/java/_category_.json
@@ -0,0 +1,4 @@
+{
+  "label": "Java",
+  "position": 2
+}
diff --git a/website/docs/apis/java-client.md b/website/docs/apis/java/index.md
similarity index 100%
rename from website/docs/apis/java-client.md
rename to website/docs/apis/java/index.md
diff --git a/website/docs/apis/python-client.md b/website/docs/apis/python-client.md
deleted file mode 100644
index 4f81ab833d..0000000000
--- a/website/docs/apis/python-client.md
+++ /dev/null
@@ -1,45 +0,0 @@
----
-title: "Python Client"
-sidebar_position: 2
----
-
-# Fluss Python Client
-
-The Fluss Python Client provides a high-performance, asynchronous interface for
-interacting with Fluss clusters. Built on top of the Rust core via
-[PyO3](https://pyo3.rs/), it leverages PyArrow for efficient data interchange
-and supports idiomatic integration with Pandas.
-
-The client provides two main APIs:
-
-- **[Admin API](https://clients.fluss.apache.org/user-guide/python/api-reference#flussadmin)**: For managing databases, tables, and partitions.
-- **[Table API](https://clients.fluss.apache.org/user-guide/python/api-reference#flusstable)**: For reading and writing to Log and Primary Key tables
-
-## Installation
-```bash
-pip install pyfluss
-```
-
-## Quick Example
-```python
-import asyncio
-import fluss
-
-async def main():
-    config = fluss.Config({"bootstrap.servers": "127.0.0.1:9123"})
-    conn = await fluss.FlussConnection.create(config)
-    async with conn:
-        admin = await conn.get_admin()
-        databases = await admin.list_databases()
-        print(f"Available databases: {databases}")
-
-if __name__ == "__main__":
-    asyncio.run(main())
-```
-For more examples, see [Fluss Python Client documentation](https://clients.fluss.apache.org/user-guide/python/example/).
-
-## Full Documentation
-
-For the complete Python client reference including all configuration options,
-API methods, data types, error handling, and worked examples — see the
-**[Fluss Python Client documentation](https://clients.fluss.apache.org/user-guide/python/installation)**.
\ No newline at end of file
diff --git a/website/docs/apis/python/_category_.json b/website/docs/apis/python/_category_.json
new file mode 100644
index 0000000000..5775bfc99b
--- /dev/null
+++ b/website/docs/apis/python/_category_.json
@@ -0,0 +1,4 @@
+{
+  "label": "Python",
+  "position": 4
+}
diff --git a/website/docs/apis/python/api-reference.md b/website/docs/apis/python/api-reference.md
new file mode 100644
index 0000000000..9bf0b6902f
--- /dev/null
+++ b/website/docs/apis/python/api-reference.md
@@ -0,0 +1,389 @@
+---
+sidebar_position: 2
+---
+# API Reference
+
+Complete API reference for the Fluss Python client.
+
+## `Config`
+
+| Method / Property                     | Config Key                            | Description                                                                             |
+|---------------------------------------|---------------------------------------|-----------------------------------------------------------------------------------------|
+| `Config(properties: dict = None)`     |                                       | Create config from a dict of key-value pairs                                            |
+| `bootstrap_servers`                   | `bootstrap.servers`                   | Get/set coordinator server address                                                      |
+| `writer_request_max_size`             | `writer.request-max-size`             | Get/set max request size in bytes                                                       |
+| `writer_acks`                         | `writer.acks`                         | Get/set acknowledgment setting (`"all"` for all replicas)                               |
+| `writer_retries`                      | `writer.retries`                      | Get/set number of retries on failure                                                    |
+| `writer_batch_size`                   | `writer.batch-size`                   | Get/set write batch size in bytes. Upper bound when dynamic sizing is on; fixed batch size when off |
+| `writer_dynamic_batch_size_enabled`   | `writer.dynamic-batch-size.enabled`   | Get/set whether the per-table dynamic batch size estimator is enabled (default `true`)  |
+| `writer_dynamic_batch_size_min`       | `writer.dynamic-batch-size-min`       | Get/set the lower bound for the dynamic batch size estimator (default 256 KB; ignored when disabled) |
+| `writer_batch_timeout_ms`             | `writer.batch-timeout-ms`             | Get/set max time in ms to wait for a writer batch to fill up before sending             |
+| `writer_bucket_no_key_assigner`       | `writer.bucket.no-key-assigner`       | Get/set bucket assignment strategy (`"sticky"` or `"round_robin"`)                      |
+| `scanner_remote_log_prefetch_num`     | `scanner.remote-log.prefetch-num`     | Get/set number of remote log segments to prefetch                                       |
+| `remote_file_download_thread_num`     | `remote-file.download-thread-num`     | Get/set number of threads for remote log downloads                                      |
+| `scanner_remote_log_read_concurrency` | `scanner.remote-log.read-concurrency` | Get/set streaming read concurrency within a remote log file                             |
+| `scanner_log_max_poll_records`        | `scanner.log.max-poll-records`        | Get/set max number of records returned in a single poll()                               |
+| `scanner_log_fetch_max_bytes`         | `scanner.log.fetch.max-bytes`         | Get/set maximum bytes per fetch response for LogScanner                                 |
+| `scanner_log_fetch_min_bytes`         | `scanner.log.fetch.min-bytes`         | Get/set minimum bytes the server must accumulate before returning a fetch response      |
+| `scanner_log_fetch_wait_max_time_ms`  | `scanner.log.fetch.wait-max-time-ms`  | Get/set maximum time (ms) the server may wait to satisfy min-bytes                      |
+| `scanner_log_fetch_max_bytes_for_bucket` | `scanner.log.fetch.max-bytes-for-bucket` | Get/set maximum bytes per fetch response per bucket for LogScanner                |
+| `connect_timeout_ms`                  | `connect-timeout`                     | Get/set TCP connect timeout in milliseconds                                             |
+| `security_protocol`                   | `security.protocol`                   | Get/set security protocol (`"PLAINTEXT"` or `"sasl"`)                                   |
+| `security_sasl_mechanism`             | `security.sasl.mechanism`             | Get/set SASL mechanism (only `"PLAIN"` is supported)                                    |
+| `security_sasl_username`              | `security.sasl.username`              | Get/set SASL username (required when protocol is `"sasl"`)                              |
+| `security_sasl_password`              | `security.sasl.password`              | Get/set SASL password (required when protocol is `"sasl"`)                              |
+
+## `FlussConnection`
+
+| Method                                                    |  Description                          |
+|-----------------------------------------------------------|---------------------------------------|
+| `await FlussConnection.create(config) -> FlussConnection` | Connect to a Fluss cluster            |
+| `conn.get_admin() -> FlussAdmin`                        | Get admin interface                   |
+| `await conn.get_table(table_path) -> FlussTable`          | Get a table for read/write operations |
+| `await conn.close()`                                      | Close the connection                  |
+
+Supports the `async with` statement (async context manager).
+
+## `FlussAdmin`
+
+| Method                                                                                                                |  Description                          |
+|-----------------------------------------------------------------------------------------------------------------------|---------------------------------------|
+| `await create_database(name, database_descriptor=None, ignore_if_exists=False)`                                       | Create a database                     |
+| `await drop_database(name, ignore_if_not_exists=False, cascade=True)`                                                 | Drop a database                       |
+| `await list_databases() -> list[str]`                                                                                 | List all databases                    |
+| `await database_exists(name) -> bool`                                                                                 | Check if a database exists            |
+| `await get_database_info(name) -> DatabaseInfo`                                                                       | Get database metadata                 |
+| `await create_table(table_path, table_descriptor, ignore_if_exists=False)`                                            | Create a table                        |
+| `await drop_table(table_path, ignore_if_not_exists=False)`                                                            | Drop a table                          |
+| `await get_table_info(table_path) -> TableInfo`                                                                       | Get table metadata                    |
+| `await list_tables(database_name) -> list[str]`                                                                       | List tables in a database             |
+| `await table_exists(table_path) -> bool`                                                                              | Check if a table exists               |
+| `await list_offsets(table_path, bucket_ids, offset_spec) -> dict[int, int]`                           | Get offsets for buckets               |
+| `await list_partition_offsets(table_path, partition_name, bucket_ids, offset_spec) -> dict[int, int]` | Get offsets for a partition's buckets |
+| `await create_partition(table_path, partition_spec, ignore_if_exists=False)`                                          | Create a partition                    |
+| `await drop_partition(table_path, partition_spec, ignore_if_not_exists=False)`                                        | Drop a partition                      |
+| `await list_partition_infos(table_path) -> list[PartitionInfo]`                                                       | List partitions                       |
+| `await get_latest_lake_snapshot(table_path) -> LakeSnapshot`                                                          | Get latest lake snapshot              |
+| `await get_server_nodes() -> list[ServerNode]`                                                                        | Get all alive server nodes            |
+
+## `ServerNode`
+
+| Property                 | Description                                                |
+|--------------------------|------------------------------------------------------------|
+| `.id -> int`             | Server node ID                                             |
+| `.host -> str`           | Hostname of the server                                     |
+| `.port -> int`           | Port number                                                |
+| `.server_type -> str`    | Server type (`"CoordinatorServer"` or `"TabletServer"`)    |
+| `.uid -> str`            | Unique identifier (e.g. `"cs-0"`, `"ts-1"`)               |
+
+## `FlussTable`
+
+| Method                          |  Description                            |
+|---------------------------------|-----------------------------------------|
+| `new_scan() -> TableScan`       | Create a scan builder                   |
+| `new_append() -> TableAppend`   | Create an append builder for log tables |
+| `new_upsert() -> TableUpsert`   | Create an upsert builder for PK tables  |
+| `new_lookup() -> TableLookup`   | Create a lookup builder for PK tables   |
+| `get_table_info() -> TableInfo` | Get table metadata                      |
+| `get_table_path() -> TablePath` | Get table path                          |
+| `has_primary_key() -> bool`     | Check if table has a primary key        |
+
+## `TableScan`
+
+| Method                                                   |  Description                                                        |
+|----------------------------------------------------------|---------------------------------------------------------------------|
+| `.project(indices) -> TableScan`                         | Project columns by index                                            |
+| `.project_by_name(names) -> TableScan`                   | Project columns by name                                             |
+| `await .create_log_scanner() -> LogScanner`              | Create record-based scanner (for `poll()`)                          |
+| `await .create_record_batch_log_scanner() -> LogScanner` | Create batch-based scanner (for `poll_arrow()`, `to_arrow()`, etc.) |
+
+## `TableAppend`
+
+Builder for creating an `AppendWriter`. Obtain via `FlussTable.new_append()`.
+
+| Method                             |  Description             |
+|------------------------------------|--------------------------|
+| `.create_writer() -> AppendWriter` | Create the append writer |
+
+## `TableUpsert`
+
+Builder for creating an `UpsertWriter`. Obtain via `FlussTable.new_upsert()`.
+
+| Method                                             |  Description                               |
+|----------------------------------------------------|--------------------------------------------|
+| `.partial_update_by_name(columns) -> TableUpsert`  | Configure partial update by column names   |
+| `.partial_update_by_index(indices) -> TableUpsert` | Configure partial update by column indices |
+| `.create_writer() -> UpsertWriter`                 | Create the upsert writer                   |
+
+## `TableLookup`
+
+Builder for creating a `Lookuper` or `PrefixLookuper`. Obtain via `FlussTable.new_lookup()`.
+
+| Method                                              |  Description                              |
+|-----------------------------------------------------|-------------------------------------------|
+| `.create_lookuper() -> Lookuper`                    | Create a primary key lookuper             |
+| `.lookup_by(column_names) -> TablePrefixLookup`     | Switch to prefix-scan mode for the given columns (partition keys + bucket keys) |
+
+## `TablePrefixLookup`
+
+Builder for creating a `PrefixLookuper`. Obtain via `TableLookup.lookup_by(columns)`.
+
+| Method                                     |  Description              |
+|--------------------------------------------|---------------------------|
+| `.create_lookuper() -> PrefixLookuper`     | Create the prefix lookuper |
+
+## `AppendWriter`
+
+| Method                                           |  Description                        |
+|--------------------------------------------------|-------------------------------------|
+| `.append(row) -> WriteResultHandle`              | Append a row (dict, list, or tuple) |
+| `.write_arrow(table)`                            | Write a PyArrow Table               |
+| `.write_arrow_batch(batch) -> WriteResultHandle` | Write a PyArrow RecordBatch         |
+| `.write_pandas(df)`                              | Write a Pandas DataFrame            |
+| `await .flush()`                                 | Flush all pending writes            |
+
+## `UpsertWriter`
+
+| Method                              |  Description                          |
+|-------------------------------------|---------------------------------------|
+| `.upsert(row) -> WriteResultHandle` | Upsert a row (insert or update by PK) |
+| `.delete(pk) -> WriteResultHandle`  | Delete a row by primary key           |
+| `await .flush()`                    | Flush all pending operations          |
+
+## `WriteResultHandle`
+
+| Method          |  Description                                 |
+|-----------------|----------------------------------------------|
+| `await .wait()` | Wait for server acknowledgment of this write |
+
+## `Lookuper`
+
+| Method                              |  Description                |
+|-------------------------------------|-----------------------------|
+| `await .lookup(pk) -> dict \| None` | Lookup a row by primary key |
+
+## `PrefixLookuper`
+
+| Method                                        |  Description                                |
+|-----------------------------------------------|---------------------------------------------|
+| `await .lookup(prefix) -> list[dict]`         | Lookup all rows matching a prefix key       |
+
+## `LogScanner`
+
+| Method                                                        |  Description                                                                     |
+|---------------------------------------------------------------|----------------------------------------------------------------------------------|
+| `.subscribe(bucket_id, start_offset)`                         | Subscribe to a bucket                                                            |
+| `.subscribe_buckets(bucket_offsets)`                          | Subscribe to multiple buckets (`{bucket_id: offset}`)                            |
+| `.subscribe_partition(partition_id, bucket_id, start_offset)` | Subscribe to a partition bucket                                                  |
+| `.subscribe_partition_buckets(partition_bucket_offsets)`      | Subscribe to multiple partition+bucket combos (`{(part_id, bucket_id): offset}`) |
+| `.unsubscribe(bucket_id)`                                     | Unsubscribe from a bucket (non-partitioned tables)                               |
+| `.unsubscribe_partition(partition_id, bucket_id)`             | Unsubscribe from a partition bucket                                              |
+| `await .poll(timeout_ms) -> ScanRecords`                      | Poll individual records (record scanner only)                                    |
+| `await .poll_arrow(timeout_ms) -> pa.Table`                   | Poll as Arrow Table (batch scanner only)                                         |
+| `await .poll_record_batch(timeout_ms) -> list[RecordBatch]`   | Poll batches with metadata (batch scanner only)                                  |
+| `.to_arrow_batch_reader() -> pa.RecordBatchReader`            | Lazy Arrow RecordBatchReader reading until latest offsets (batch scanner only)    |
+| `await .to_arrow() -> pa.Table`                               | Read all subscribed data as Arrow Table (batch scanner only)                     |
+| `await .to_pandas() -> pd.DataFrame`                          | Read all subscribed data as DataFrame (batch scanner only)                       |
+
+> **Note:** Overlapping `poll_*` / `to_arrow*` / `to_arrow_batch_reader` calls on the same underlying scanner are not supported. Use only one active polling/consumption path at a time.
+
+## `ScanRecords`
+
+Returned by `LogScanner.poll()`. Records are grouped by bucket.
+
+> **Note:** Flat iteration and integer indexing traverse buckets in an arbitrary order that is consistent within a single `ScanRecords` instance but may differ between `poll()` calls. Use per-bucket access (`.items()`, `.records(bucket)`) when bucket ordering matters.
+
+```python
+scan_records = await scanner.poll(timeout_ms=5000)
+
+# Sequence access
+scan_records[0]                              # first record
+scan_records[-1]                             # last record
+scan_records[:5]                             # first 5 records
+
+# Per-bucket access
+for bucket, records in scan_records.items():
+    for record in records:
+        print(f"bucket={bucket.bucket_id}, offset={record.offset}, row={record.row}")
+
+# Flat iteration
+for record in scan_records:
+    print(record.row)
+```
+
+### Methods
+
+| Method                                 |  Description                                                     |
+|----------------------------------------|------------------------------------------------------------------|
+| `.buckets() -> list[TableBucket]`      | List of distinct buckets                                         |
+| `.records(bucket) -> list[ScanRecord]` | Records for a specific bucket (empty list if bucket not present) |
+| `.count() -> int`                      | Total record count across all buckets                            |
+| `.is_empty() -> bool`                  | Check if empty                                                   |
+
+### Indexing
+
+| Expression                   | Returns              | Description                       |
+|------------------------------|----------------------|-----------------------------------|
+| `scan_records[0]`           | `ScanRecord`         | Record by flat index              |
+| `scan_records[-1]`          | `ScanRecord`         | Negative indexing                  |
+| `scan_records[1:5]`         | `list[ScanRecord]`   | Slice                             |
+| `scan_records[bucket]`      | `list[ScanRecord]`   | Records for a bucket              |
+
+### Mapping Protocol
+
+| Method / Protocol              | Description                                     |
+|--------------------------------|-------------------------------------------------|
+| `.keys()`                      | Same as `.buckets()`                            |
+| `.values()`                    | Lazy iterator over record lists, one per bucket |
+| `.items()`                     | Lazy iterator over `(bucket, records)` pairs    |
+| `len(scan_records)`           | Same as `.count()`                              |
+| `bucket in scan_records`      | Membership test                                 |
+| `for record in scan_records`  | Flat iteration over all records                 |
+
+## `ScanRecord`
+
+| Property                     |  Description                                                        |
+|------------------------------|---------------------------------------------------------------------|
+| `.offset -> int`             | Record offset in the log                                            |
+| `.timestamp -> int`          | Record timestamp                                                    |
+| `.change_type -> ChangeType` | Change type (AppendOnly, Insert, UpdateBefore, UpdateAfter, Delete) |
+| `.row -> dict`               | Row data as `{column_name: value}`                                  |
+
+## `RecordBatch`
+
+| Property                   | Description                  |
+|----------------------------|------------------------------|
+| `.batch -> pa.RecordBatch` | Arrow RecordBatch data       |
+| `.bucket -> TableBucket`   | Bucket this batch belongs to |
+| `.base_offset -> int`      | First record offset          |
+| `.last_offset -> int`      | Last record offset           |
+
+## `Schema`
+
+| Method                                         |  Description               |
+|------------------------------------------------|----------------------------|
+| `Schema(schema: pa.Schema, primary_keys=None)` | Create from PyArrow schema. Field nullability (`pa.field(..., nullable=False)`) is preserved. |
+| `.get_column_names() -> list[str]`             | Get column names           |
+| `.get_column_types() -> list[str]`             | Get column type names. Non-nullable types include a `" NOT NULL"` suffix (e.g., `"int NOT NULL"`). |
+| `.get_columns() -> list[tuple[str, str]]`      | Get `(name, type)` pairs. Type strings follow the same nullability formatting as `.get_column_types()`. |
+| `.get_primary_keys() -> list[str]`             | Get primary key columns    |
+
+## `TableDescriptor`
+
+| Method                                                                                                                                                                         | Description             |
+|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------|
+| `TableDescriptor(schema, *, partition_keys=None, bucket_count=None, bucket_keys=None, comment=None, log_format=None, kv_format=None, properties=None, custom_properties=None)` | Create table descriptor |
+| `.get_schema() -> Schema`                                                                                                                                                      | Get the schema          |
+
+## `TablePath`
+
+| Method / Property            | Description         |
+|------------------------------|---------------------|
+| `TablePath(database, table)` | Create a table path |
+| `.database_name -> str`      | Database name       |
+| `.table_name -> str`         | Table name          |
+
+## `TableInfo`
+
+| Property / Method                    |  Description                |
+|--------------------------------------|-----------------------------|
+| `.table_id -> int`                   | Table ID                    |
+| `.table_path -> TablePath`           | Table path                  |
+| `.num_buckets -> int`                | Number of buckets           |
+| `.schema_id -> int`                  | Schema ID                   |
+| `.comment -> str \| None`            | Table comment               |
+| `.created_time -> int`               | Creation timestamp          |
+| `.modified_time -> int`              | Last modification timestamp |
+| `.get_primary_keys() -> list[str]`   | Primary key columns         |
+| `.get_partition_keys() -> list[str]` | Partition columns           |
+| `.get_bucket_keys() -> list[str]`    | Bucket key columns          |
+| `.has_primary_key() -> bool`         | Has primary key?            |
+| `.is_partitioned() -> bool`          | Is partitioned?             |
+| `.get_schema() -> Schema`            | Get table schema            |
+| `.get_column_names() -> list[str]`   | Column names                |
+| `.get_column_count() -> int`         | Number of columns           |
+| `.get_properties() -> dict`          | All table properties        |
+| `.get_custom_properties() -> dict`   | Custom properties only      |
+
+## `PartitionInfo`
+
+| Property                 |  Description   |
+|--------------------------|----------------|
+| `.partition_id -> int`   | Partition ID   |
+| `.partition_name -> str` | Partition name |
+
+## `DatabaseDescriptor`
+
+| Method / Property                                          | Description       |
+|------------------------------------------------------------|-------------------|
+| `DatabaseDescriptor(comment=None, custom_properties=None)` | Create descriptor |
+| `.comment -> str \| None`                                  | Database comment  |
+| `.get_custom_properties() -> dict`                         | Custom properties |
+
+## `DatabaseInfo`
+
+| Property / Method                                  | Description                 |
+|----------------------------------------------------|-----------------------------|
+| `.database_name -> str`                            | Database name               |
+| `.created_time -> int`                             | Creation timestamp          |
+| `.modified_time -> int`                            | Last modification timestamp |
+| `.get_database_descriptor() -> DatabaseDescriptor` | Get descriptor              |
+
+## `LakeSnapshot`
+
+| Property / Method                                 | Description             |
+|---------------------------------------------------|-------------------------|
+| `.snapshot_id -> int`                             | Snapshot ID             |
+| `.table_buckets_offset -> dict[TableBucket, int]` | All bucket offsets      |
+| `.get_bucket_offset(bucket) -> int \| None`       | Get offset for a bucket |
+| `.get_table_buckets() -> list[TableBucket]`       | Get all buckets         |
+
+## `TableBucket`
+
+| Method / Property                                            | Description                            |
+|--------------------------------------------------------------|----------------------------------------|
+| `TableBucket(table_id, bucket)`                              | Create non-partitioned bucket          |
+| `TableBucket.with_partition(table_id, partition_id, bucket)` | Create partitioned bucket              |
+| `.table_id -> int`                                           | Table ID                               |
+| `.bucket_id -> int`                                          | Bucket ID                              |
+| `.partition_id -> int \| None`                               | Partition ID (None if non-partitioned) |
+
+## `FlussError`
+
+| Property             | Description                                                                         |
+|----------------------|-------------------------------------------------------------------------------------|
+| `.message -> str`    | Error message                                                                       |
+| `.error_code -> int` | Error code (`ErrorCode.CLIENT_ERROR` for client-side errors, server code otherwise) |
+
+Raised for all Fluss-specific errors (connection failures, table not found, schema mismatches, etc.). Inherits from `Exception`. See [Error Handling](./error-handling.md) for details on matching specific error codes.
+
+## Constants
+
+| Constant                     | Value         | Description                                         |
+|------------------------------|---------------|-----------------------------------------------------|
+| `fluss.EARLIEST_OFFSET`      | `-2`          | Start reading from earliest available offset        |
+
+## `OffsetSpec`
+
+| Method                      | Description                                      |
+|-----------------------------|--------------------------------------------------|
+| `OffsetSpec.earliest()`     | Earliest available offset                        |
+| `OffsetSpec.latest()`       | Latest offset                                    |
+| `OffsetSpec.timestamp(ts)`  | Offset at or after the given timestamp (millis)  |
+
+To start reading from the latest offset (only new records), resolve the current offset via `list_offsets` before subscribing:
+
+```python
+offsets = await admin.list_offsets(table_path, [0], fluss.OffsetSpec.latest())
+scanner.subscribe(bucket_id=0, start_offset=offsets[0])
+```
+
+## `ChangeType`
+
+| Value                         | Short String | Description                   |
+|-------------------------------|--------------|-------------------------------|
+| `ChangeType.AppendOnly` (0)   | `+A`         | Append-only                   |
+| `ChangeType.Insert` (1)       | `+I`         | Insert                        |
+| `ChangeType.UpdateBefore` (2) | `-U`         | Previous value of updated row |
+| `ChangeType.UpdateAfter` (3)  | `+U`         | New value of updated row      |
+| `ChangeType.Delete` (4)       | `-D`         | Delete                        |
diff --git a/website/docs/apis/python/data-types.md b/website/docs/apis/python/data-types.md
new file mode 100644
index 0000000000..8e4371e216
--- /dev/null
+++ b/website/docs/apis/python/data-types.md
@@ -0,0 +1,95 @@
+---
+sidebar_position: 3
+---
+# Data Types
+
+The Python client uses PyArrow types for schema definitions:
+
+| PyArrow Type                                    | Fluss Type                        | Python Type         |
+|-------------------------------------------------|-----------------------------------|---------------------|
+| `pa.bool_()`                                    | Boolean                           | `bool`              |
+| `pa.int8()` / `int16()` / `int32()` / `int64()` | TinyInt / SmallInt / Int / BigInt | `int`               |
+| `pa.float32()` / `float64()`                    | Float / Double                    | `float`             |
+| `pa.string()`                                   | String                            | `str`               |
+| `pa.binary()`                                   | Bytes                             | `bytes`             |
+| `pa.binary(n)`                                  | Binary(n)                         | `bytes`             |
+| `pa.date32()`                                   | Date                              | `datetime.date`     |
+| `pa.time32("ms")`                               | Time                              | `datetime.time`     |
+| `pa.timestamp("us")`                            | Timestamp (NTZ)                   | `datetime.datetime` |
+| `pa.timestamp("us", tz="UTC")`                  | TimestampLTZ                      | `datetime.datetime` |
+| `pa.decimal128(precision, scale)`               | Decimal                           | `decimal.Decimal`   |
+| `pa.list_(type)`                                  | Array                             | `list`              |
+
+All Python native types (`date`, `time`, `datetime`, `Decimal`) work when appending rows via dicts.
+
+## Nullability
+
+PyArrow field nullability is preserved when constructing Fluss schemas. By default, fields are nullable. Use `nullable=False` on `pa.field()` to create a `NOT NULL` column:
+
+```python
+schema = pa.schema([
+    pa.field("id", pa.int32(), nullable=False),
+    pa.field("name", pa.string()),          # nullable by default
+])
+fluss_schema = fluss.Schema(schema)
+fluss_schema.get_column_types()  # ["int NOT NULL", "string"]
+```
+
+Primary key columns are automatically forced `NOT NULL` regardless of the PyArrow field setting.
+
+For nested types, element nullability is also preserved:
+
+```python
+schema = pa.schema([
+    pa.field("tags", pa.list_(pa.field("item", pa.string(), nullable=False))),
+])
+fluss_schema = fluss.Schema(schema)
+fluss_schema.get_column_types()  # ["array<string NOT NULL>"]
+```
+
+## Writing Data
+
+Rows can be dicts, lists, or tuples:
+
+```python
+from datetime import date, time, datetime
+from decimal import Decimal
+
+row = {
+    "user_id": 1,
+    "name": "Alice",
+    "active": True,
+    "score": 95.5,
+    "balance": Decimal("1234.56"),
+    "birth_date": date(1990, 3, 15),
+    "login_time": time(9, 30, 0),
+    "created_at": datetime(2024, 1, 1, 0, 0, 0),
+    "nickname": None,  # null value
+    "tags": ["active", "premium"],  # Array of strings
+    "scores": [10, None, 30],       # Array with null values
+}
+handle = writer.append(row)
+```
+
+Lists and tuples must have values in column order:
+
+```python
+row = [1, "Alice", True, 95.5, Decimal("1234.56"), date(1990, 3, 15), time(9, 30, 0), datetime(2024, 1, 1), None, ["active", "premium"], [10, None, 30]]
+handle = writer.append(row)
+```
+
+## Reading Data
+
+```python
+records = await scanner.poll(timeout_ms=1000)
+for record in records:
+    row = record.row  # dict[str, Any]
+    print(row["user_id"])     # int
+    print(row["name"])        # str
+    print(row["balance"])     # decimal.Decimal
+    print(row["birth_date"])  # datetime.date
+    print(row["created_at"])  # datetime.datetime
+
+    if row["nickname"] is None:
+        print("nickname is null")
+```
diff --git a/website/docs/apis/python/error-handling.md b/website/docs/apis/python/error-handling.md
new file mode 100644
index 0000000000..5bef366516
--- /dev/null
+++ b/website/docs/apis/python/error-handling.md
@@ -0,0 +1,168 @@
+---
+sidebar_position: 4
+---
+# Error Handling
+
+The client raises `fluss.FlussError` for all Fluss-specific errors. Each error carries a `message` and an `error_code`.
+
+## Basic Usage
+
+```python
+import fluss
+
+try:
+    await admin.create_table(table_path, table_descriptor)
+except fluss.FlussError as e:
+    print(f"Error (code {e.error_code}): {e.message}")
+```
+
+## Error Codes
+
+Server-side errors carry a specific error code (>0 or -1). Client-side errors (connection failures, type mismatches, etc.) use `ErrorCode.CLIENT_ERROR` (-2). Use `fluss.ErrorCode` to match on specific codes:
+
+```python
+import fluss
+
+try:
+    await admin.drop_table(table_path)
+except fluss.FlussError as e:
+    if e.error_code == fluss.ErrorCode.TABLE_NOT_EXIST:
+        print("Table does not exist")
+    elif e.error_code == fluss.ErrorCode.PARTITION_NOT_EXISTS:
+        print("Partition does not exist")
+    elif e.error_code == fluss.ErrorCode.CLIENT_ERROR:
+        print(f"Client-side error: {e.message}")
+    else:
+        print(f"Server error (code {e.error_code}): {e.message}")
+```
+
+### Common Error Codes
+
+| Constant                                     | Code | Description                         |
+|----------------------------------------------|------|-------------------------------------|
+| `ErrorCode.CLIENT_ERROR`                     | -2   | Client-side error (not from server) |
+| `ErrorCode.UNKNOWN_SERVER_ERROR`             | -1   | Unexpected server error             |
+| `ErrorCode.NETWORK_EXCEPTION`                | 1    | Server disconnected before response |
+| `ErrorCode.DATABASE_NOT_EXIST`               | 4    | Database does not exist             |
+| `ErrorCode.DATABASE_ALREADY_EXIST`           | 6    | Database already exists             |
+| `ErrorCode.TABLE_NOT_EXIST`                  | 7    | Table does not exist                |
+| `ErrorCode.TABLE_ALREADY_EXIST`              | 8    | Table already exists                |
+| `ErrorCode.INVALID_TABLE_EXCEPTION`          | 15   | Invalid table operation             |
+| `ErrorCode.REQUEST_TIME_OUT`                 | 25   | Request timed out                   |
+| `ErrorCode.PARTITION_NOT_EXISTS`             | 36   | Partition does not exist            |
+| `ErrorCode.PARTITION_ALREADY_EXISTS`         | 42   | Partition already exists            |
+| `ErrorCode.PARTITION_SPEC_INVALID_EXCEPTION` | 43   | Invalid partition spec              |
+| `ErrorCode.LEADER_NOT_AVAILABLE_EXCEPTION`   | 44   | No leader available for partition   |
+| `ErrorCode.AUTHENTICATE_EXCEPTION`           | 46   | Authentication failed (bad credentials) |
+
+See `fluss.ErrorCode` for the full list of named constants.
+
+## Retry Logic
+
+Some errors are transient: the server may be temporarily unavailable, in the middle of a leader election, or under load. Use `is_retriable` to decide whether to retry an operation instead of treating the error as permanent.
+
+`FlussError.is_retriable` is a property available directly on the exception:
+
+```python
+import fluss
+
+try:
+    await writer.append(row).wait()
+except fluss.FlussError as e:
+    if e.is_retriable:
+        # Transient failure — safe to retry
+        pass
+    else:
+        # Permanent failure — log and abort
+        print(f"Fatal error (code {e.error_code}): {e.message}")
+```
+
+### Retriable Error Codes
+
+| Constant                                                     | Code | Reason                                    |
+|--------------------------------------------------------------|------|-------------------------------------------|
+| `ErrorCode.NETWORK_EXCEPTION`                               | 1    | Server disconnected                       |
+| `ErrorCode.CORRUPT_MESSAGE`                                 | 3    | CRC or size error                         |
+| `ErrorCode.SCHEMA_NOT_EXIST`                                | 9    | Schema may not exist                      |
+| `ErrorCode.LOG_STORAGE_EXCEPTION`                           | 10   | Transient log storage error               |
+| `ErrorCode.KV_STORAGE_EXCEPTION`                            | 11   | Transient KV storage error                |
+| `ErrorCode.NOT_LEADER_OR_FOLLOWER`                          | 12   | Leader election in progress               |
+| `ErrorCode.CORRUPT_RECORD_EXCEPTION`                        | 14   | Corrupt record                            |
+| `ErrorCode.UNKNOWN_TABLE_OR_BUCKET_EXCEPTION`               | 21   | Metadata not yet available                |
+| `ErrorCode.REQUEST_TIME_OUT`                                | 25   | Request timed out                         |
+| `ErrorCode.STORAGE_EXCEPTION`                               | 26   | Transient storage error                   |
+| `ErrorCode.NOT_ENOUGH_REPLICAS_AFTER_APPEND_EXCEPTION`      | 28   | Wrote to server but with low ISR size     |
+| `ErrorCode.NOT_ENOUGH_REPLICAS_EXCEPTION`                   | 29   | Low ISR size at write time                |
+| `ErrorCode.LEADER_NOT_AVAILABLE_EXCEPTION`                  | 44   | No leader available for partition         |
+
+Client-side errors (`ErrorCode.CLIENT_ERROR`, code -2) always return `False` from `is_retriable`.
+
+## Common Error Scenarios
+
+### Connection Refused
+
+The Fluss cluster is not running or the address is incorrect.
+
+```python
+try:
+    config = fluss.Config({"bootstrap.servers": "127.0.0.1:9123"})
+    conn = await fluss.FlussConnection.create(config)
+except fluss.FlussError as e:
+    # error_code == ErrorCode.CLIENT_ERROR for connection failures
+    print(f"Cannot connect to cluster: {e.message}")
+```
+
+### Table Not Found
+
+The table does not exist or has been dropped.
+
+```python
+try:
+    await admin.drop_table(table_path)
+except fluss.FlussError as e:
+    if e.error_code == fluss.ErrorCode.TABLE_NOT_EXIST:
+        print("Table not found")
+```
+
+### Partition Not Found
+
+An operation references a partition that has not been created yet (for example, dropping a partition that does not exist).
+
+```python
+try:
+    await admin.drop_partition(table_path, {"region": "US"})
+except fluss.FlussError as e:
+    if e.error_code == fluss.ErrorCode.PARTITION_NOT_EXISTS:
+        print("Partition does not exist, create it first")
+```
+
+### Authentication Failed
+
+SASL credentials are incorrect or the user does not exist.
+
+```python
+try:
+    config = fluss.Config({
+        "bootstrap.servers": "127.0.0.1:9123",
+        "security.protocol": "sasl",
+        "security.sasl.username": "admin",
+        "security.sasl.password": "wrong-password",
+    })
+    conn = await fluss.FlussConnection.create(config)
+except fluss.FlussError as e:
+    if e.error_code == fluss.ErrorCode.AUTHENTICATE_EXCEPTION:
+        print(f"Authentication failed: {e.message}")
+```
+
+### Schema Mismatch
+
+Row data doesn't match the table schema.
+
+```python
+try:
+    writer.append({"wrong_column": "value"})
+    await writer.flush()
+except fluss.FlussError as e:
+    # error_code == ErrorCode.CLIENT_ERROR for type/schema mismatches
+    print(f"Schema mismatch: {e.message}")
+```
diff --git a/website/docs/apis/python/example/_category_.json b/website/docs/apis/python/example/_category_.json
new file mode 100644
index 0000000000..4d81ec12ae
--- /dev/null
+++ b/website/docs/apis/python/example/_category_.json
@@ -0,0 +1,4 @@
+{
+  "label": "Examples",
+  "position": 5
+}
diff --git a/website/docs/apis/python/example/admin-operations.md b/website/docs/apis/python/example/admin-operations.md
new file mode 100644
index 0000000000..2cda6c4abf
--- /dev/null
+++ b/website/docs/apis/python/example/admin-operations.md
@@ -0,0 +1,81 @@
+---
+sidebar_position: 3
+---
+# Admin Operations
+
+```python
+admin = conn.get_admin()
+```
+
+## Databases
+
+```python
+await admin.create_database("my_database", ignore_if_exists=True)
+databases = await admin.list_databases()
+exists = await admin.database_exists("my_database")
+await admin.drop_database("my_database", ignore_if_not_exists=True, cascade=True)
+```
+
+## Tables
+
+Schemas are defined using PyArrow and wrapped in `fluss.Schema`:
+
+```python
+import pyarrow as pa
+
+schema = fluss.Schema(pa.schema([
+    pa.field("id", pa.int32()),
+    pa.field("name", pa.string()),
+    pa.field("amount", pa.int64()),
+]))
+
+table_path = fluss.TablePath("my_database", "my_table")
+await admin.create_table(table_path, fluss.TableDescriptor(schema), ignore_if_exists=True)
+
+table_info = await admin.get_table_info(table_path)
+tables = await admin.list_tables("my_database")
+await admin.drop_table(table_path, ignore_if_not_exists=True)
+```
+
+### TableDescriptor Options
+
+`TableDescriptor` accepts these optional parameters:
+
+| Parameter           | Description                                                                         |
+|---------------------|-------------------------------------------------------------------------------------|
+| `partition_keys`    | Column names to partition by (e.g. `["region"]`)                                    |
+| `bucket_count`      | Number of buckets (parallelism units) for the table                                 |
+| `bucket_keys`       | Columns used to determine bucket assignment                                         |
+| `comment`           | Table comment / description                                                         |
+| `log_format`        | Log storage format: `"ARROW"` or `"INDEXED"`                                        |
+| `kv_format`         | KV storage format for primary key tables: `"INDEXED"` or `"COMPACTED"`              |
+| `properties`        | Table configuration properties as a dict (e.g. `{"table.replication.factor": "1"}`) |
+| `custom_properties` | User-defined properties as a dict                                                   |
+
+## Offsets
+
+```python
+# Latest offsets for buckets
+offsets = await admin.list_offsets(table_path, bucket_ids=[0, 1], offset_spec=fluss.OffsetSpec.latest())
+
+# By timestamp
+offsets = await admin.list_offsets(table_path, bucket_ids=[0], offset_spec=fluss.OffsetSpec.timestamp(1704067200000))
+
+# Per-partition offsets
+offsets = await admin.list_partition_offsets(table_path, partition_name="US", bucket_ids=[0], offset_spec=fluss.OffsetSpec.latest())
+```
+
+## Lake Snapshot
+
+:::note
+Lake snapshots require [lake integration](https://fluss.apache.org/docs/maintenance/tiered-storage/overview/) (e.g. Paimon or Iceberg) to be enabled on the server. Without it, `get_latest_lake_snapshot` will raise an error.
+:::
+
+```python
+snapshot = await admin.get_latest_lake_snapshot(table_path)
+print(f"Snapshot ID: {snapshot.snapshot_id}")
+print(f"Table buckets: {snapshot.get_table_buckets()}")
+
+bucket = fluss.TableBucket(table_id=1, bucket=0)
+offset = snapshot.get_bucket_offset(bucket)
+```
diff --git a/website/docs/apis/python/example/configuration.md b/website/docs/apis/python/example/configuration.md
new file mode 100644
index 0000000000..448ae029ac
--- /dev/null
+++ b/website/docs/apis/python/example/configuration.md
@@ -0,0 +1,49 @@
+---
+sidebar_position: 2
+---
+# Configuration
+
+## Connection Setup
+
+```python
+import fluss
+
+config = fluss.Config({"bootstrap.servers": "127.0.0.1:9123"})
+conn = await fluss.FlussConnection.create(config)
+```
+
+The connection also supports async context managers:
+
+```python
+async with await fluss.FlussConnection.create(config) as conn:
+    ...
+```
+
+## Connection Configurations
+
+Configuration options can be set either via dict keys in the `Config()` constructor, or via Python property setters.
+
+See the [`Config`](../api-reference.md#config) section in the API Reference for the full list of options, their config keys, and descriptions.
+
+## SASL Authentication
+
+To connect to a Fluss cluster with SASL/PLAIN authentication enabled:
+
+```python
+config = fluss.Config({
+    "bootstrap.servers": "127.0.0.1:9123",
+    "security.protocol": "sasl",
+    "security.sasl.mechanism": "PLAIN",
+    "security.sasl.username": "admin",
+    "security.sasl.password": "admin-secret",
+})
+conn = await fluss.FlussConnection.create(config)
+```
+
+## Connection Lifecycle
+
+Remember to close the connection when done:
+
+```python
+await conn.close()
+```
diff --git a/website/docs/apis/python/example/index.md b/website/docs/apis/python/example/index.md
new file mode 100644
index 0000000000..ecbdc84685
--- /dev/null
+++ b/website/docs/apis/python/example/index.md
@@ -0,0 +1,46 @@
+---
+sidebar_position: 1
+---
+# Example
+
+Minimal working example: connect to Fluss, create a table, write data, and read it back.
+
+```python
+import asyncio
+import pyarrow as pa
+import fluss
+
+async def main():
+    # Connect
+    config = fluss.Config({"bootstrap.servers": "127.0.0.1:9123"})
+    conn = await fluss.FlussConnection.create(config)
+    admin = conn.get_admin()
+
+    # Create a log table
+    schema = fluss.Schema(pa.schema([
+        pa.field("id", pa.int32()),
+        pa.field("name", pa.string()),
+        pa.field("score", pa.float32()),
+    ]))
+    table_path = fluss.TablePath("fluss", "quick_start")
+    await admin.create_table(table_path, fluss.TableDescriptor(schema), ignore_if_exists=True)
+
+    # Write
+    table = await conn.get_table(table_path)
+    writer = table.new_append().create_writer()
+    writer.append({"id": 1, "name": "Alice", "score": 95.5})
+    writer.append({"id": 2, "name": "Bob", "score": 87.0})
+    await writer.flush()
+
+    # Read
+    num_buckets = (await admin.get_table_info(table_path)).num_buckets
+    scanner = await table.new_scan().create_record_batch_log_scanner()
+    scanner.subscribe_buckets({i: fluss.EARLIEST_OFFSET for i in range(num_buckets)})
+    print(await scanner.to_pandas())
+
+    # Cleanup
+    await admin.drop_table(table_path, ignore_if_not_exists=True)
+    await conn.close()
+
+asyncio.run(main())
+```
diff --git a/website/docs/apis/python/example/log-tables.md b/website/docs/apis/python/example/log-tables.md
new file mode 100644
index 0000000000..4dbe256781
--- /dev/null
+++ b/website/docs/apis/python/example/log-tables.md
@@ -0,0 +1,129 @@
+---
+sidebar_position: 4
+---
+# Log Tables
+
+Log tables are append-only tables without primary keys, suitable for event streaming.
+
+## Creating a Log Table
+
+```python
+import pyarrow as pa
+
+schema = fluss.Schema(pa.schema([
+    pa.field("id", pa.int32()),
+    pa.field("name", pa.string()),
+    pa.field("score", pa.float32()),
+]))
+
+table_path = fluss.TablePath("fluss", "events")
+await admin.create_table(table_path, fluss.TableDescriptor(schema), ignore_if_exists=True)
+```
+
+## Writing
+
+Rows can be appended as dicts, lists, or tuples. For bulk writes, use `write_arrow()`, `write_arrow_batch()`, or `write_pandas()`.
+
+Write methods like `append()` and `write_arrow_batch()` return a `WriteResultHandle`. You can ignore it for fire-and-forget semantics (flush at the end), or `await handle.wait()` to block until the server acknowledges that specific write.
+
+```python
+table = await conn.get_table(table_path)
+writer = table.new_append().create_writer()
+
+# Fire-and-forget: queue writes, flush at the end
+writer.append({"id": 1, "name": "Alice", "score": 95.5})
+writer.append([2, "Bob", 87.0])
+await writer.flush()
+
+# Per-record acknowledgment
+handle = writer.append({"id": 3, "name": "Charlie", "score": 91.0})
+await handle.wait()
+
+# Bulk writes
+writer.write_arrow(pa_table)          # PyArrow Table
+writer.write_arrow_batch(record_batch) # PyArrow RecordBatch
+writer.write_pandas(df)                # Pandas DataFrame
+await writer.flush()
+```
+
+## Reading
+
+There are two scanner types:
+- **Batch scanner** (`create_record_batch_log_scanner()`): returns Arrow Tables or DataFrames, best for analytics
+- **Record scanner** (`create_log_scanner()`): returns individual records with metadata (offset, timestamp, change type), best for streaming
+
+And two reading modes:
+- **`to_arrow()` / `to_pandas()`**: reads all data from subscribed buckets up to the current latest offset, then returns. Best for one-shot batch reads.
+- **`poll_arrow()` / `poll()` / `poll_record_batch()`**: waits up to the given timeout and returns whatever data is available. Call in a loop for continuous streaming.
+
+### Batch Read (One-Shot)
+
+```python
+num_buckets = (await admin.get_table_info(table_path)).num_buckets
+
+scanner = await table.new_scan().create_record_batch_log_scanner()
+scanner.subscribe_buckets({i: fluss.EARLIEST_OFFSET for i in range(num_buckets)})
+
+# Reads everything up to current latest offset, then returns
+arrow_table = await scanner.to_arrow()
+df = await scanner.to_pandas()
+```
+
+### Continuous Polling
+
+Use `poll_arrow()` or `poll()` in a loop for streaming consumption:
+
+```python
+# Batch scanner: poll as Arrow Tables
+scanner = await table.new_scan().create_record_batch_log_scanner()
+scanner.subscribe(bucket_id=0, start_offset=fluss.EARLIEST_OFFSET)
+
+while True:
+    result = await scanner.poll_arrow(timeout_ms=5000)
+    if result.num_rows > 0:
+        print(result.to_pandas())
+
+# Record scanner: poll individual records
+scanner = await table.new_scan().create_log_scanner()
+scanner.subscribe_buckets({i: fluss.EARLIEST_OFFSET for i in range(num_buckets)})
+
+while True:
+    scan_records = await scanner.poll(timeout_ms=5000)
+
+    for record in scan_records:
+        print(f"offset={record.offset}, change={record.change_type.short_string()}, row={record.row}")
+
+    # Or per-bucket access (dict-like)
+    for bucket, records in scan_records.items():
+        for record in records:
+            print(f"bucket={bucket.bucket_id}, offset={record.offset}, row={record.row}")
+```
+
+### Unsubscribing
+
+To stop consuming from a bucket, use `unsubscribe()`:
+
+```python
+scanner.unsubscribe(bucket_id=0)
+```
+
+### Subscribe from Latest Offset
+
+To only consume new records (skip existing data), first resolve the current latest offset via `list_offsets`, then subscribe at that offset:
+
+```python
+admin = conn.get_admin()
+offsets = await admin.list_offsets(table_path, [0], fluss.OffsetSpec.latest())
+latest = offsets[0]
+
+scanner = await table.new_scan().create_record_batch_log_scanner()
+scanner.subscribe(bucket_id=0, start_offset=latest)
+```
+
+## Column Projection
+
+```python
+scanner = await table.new_scan().project([0, 2]).create_record_batch_log_scanner()
+# or by name
+scanner = await table.new_scan().project_by_name(["id", "score"]).create_record_batch_log_scanner()
+```
diff --git a/website/docs/apis/python/example/partitioned-tables.md b/website/docs/apis/python/example/partitioned-tables.md
new file mode 100644
index 0000000000..894bb519db
--- /dev/null
+++ b/website/docs/apis/python/example/partitioned-tables.md
@@ -0,0 +1,104 @@
+---
+sidebar_position: 6
+---
+# Partitioned Tables
+
+Partitioned tables distribute data across partitions based on column values. Partitions must exist before writing data, otherwise the client will by default retry indefinitely.
+
+## Creating and Managing Partitions
+
+```python
+import pyarrow as pa
+
+schema = fluss.Schema(pa.schema([
+    pa.field("id", pa.int32()),
+    pa.field("region", pa.string()),
+    pa.field("value", pa.int64()),
+]))
+
+table_path = fluss.TablePath("fluss", "partitioned_events")
+await admin.create_table(
+    table_path,
+    fluss.TableDescriptor(schema, partition_keys=["region"], bucket_count=1),
+    ignore_if_exists=True,
+)
+
+# Create partitions
+await admin.create_partition(table_path, {"region": "US"}, ignore_if_exists=True)
+await admin.create_partition(table_path, {"region": "EU"}, ignore_if_exists=True)
+
+# List partitions
+partition_infos = await admin.list_partition_infos(table_path)
+```
+
+## Writing
+
+Same as non-partitioned tables - include partition column values in each row. **Partitions must exist before writing data, otherwise the client will by default retry indefinitely.**
+
+```python
+table = await conn.get_table(table_path)
+writer = table.new_append().create_writer()
+writer.append({"id": 1, "region": "US", "value": 100})
+writer.append({"id": 2, "region": "EU", "value": 200})
+await writer.flush()
+```
+
+## Reading
+
+Use `subscribe_partition()` or `subscribe_partition_buckets()` instead of `subscribe()`:
+
+```python
+scanner = await table.new_scan().create_record_batch_log_scanner()
+
+# Subscribe to individual partitions
+for p in partition_infos:
+    scanner.subscribe_partition(partition_id=p.partition_id, bucket_id=0, start_offset=fluss.EARLIEST_OFFSET)
+
+# Or batch-subscribe
+scanner.subscribe_partition_buckets({
+    (p.partition_id, 0): fluss.EARLIEST_OFFSET for p in partition_infos
+})
+
+print(await scanner.to_pandas())
+```
+
+### Unsubscribing
+
+To stop consuming from a specific partition bucket, use `unsubscribe_partition()`:
+
+```python
+scanner.unsubscribe_partition(partition_id=partition_infos[0].partition_id, bucket_id=0)
+```
+
+## Partitioned Primary Key Tables
+
+Partition columns must be part of the primary key. Partitions must exist before upserting data, otherwise the client will by default retry indefinitely.
+
+```python
+schema = fluss.Schema(
+    pa.schema([
+        pa.field("user_id", pa.int32()),
+        pa.field("region", pa.string()),
+        pa.field("score", pa.int64()),
+    ]),
+    primary_keys=["user_id", "region"],
+)
+
+table_path = fluss.TablePath("fluss", "partitioned_users")
+await admin.create_table(
+    table_path,
+    fluss.TableDescriptor(schema, partition_keys=["region"]),
+    ignore_if_exists=True,
+)
+
+await admin.create_partition(table_path, {"region": "US"}, ignore_if_exists=True)
+
+table = await conn.get_table(table_path)
+writer = table.new_upsert().create_writer()
+writer.upsert({"user_id": 1, "region": "US", "score": 1234})
+await writer.flush()
+
+# Lookup includes partition columns
+lookuper = table.new_lookup().create_lookuper()
+result = await lookuper.lookup({"user_id": 1, "region": "US"})
+```
diff --git a/website/docs/apis/python/example/primary-key-tables.md b/website/docs/apis/python/example/primary-key-tables.md
new file mode 100644
index 0000000000..cd61e5084c
--- /dev/null
+++ b/website/docs/apis/python/example/primary-key-tables.md
@@ -0,0 +1,61 @@
+---
+sidebar_position: 5
+---
+# Primary Key Tables
+
+Primary key tables support upsert, delete, and point lookup operations.
+
+## Creating a Primary Key Table
+
+Pass `primary_keys` to `fluss.Schema`:
+
+```python
+import pyarrow as pa
+
+schema = fluss.Schema(
+    pa.schema([
+        pa.field("id", pa.int32()),
+        pa.field("name", pa.string()),
+        pa.field("age", pa.int64()),
+    ]),
+    primary_keys=["id"],
+)
+table_path = fluss.TablePath("fluss", "users")
+await admin.create_table(table_path, fluss.TableDescriptor(schema, bucket_count=3), ignore_if_exists=True)
+```
+
+## Upsert, Delete, Lookup
+
+```python
+table = await conn.get_table(table_path)
+
+# Upsert (fire-and-forget, flush at the end)
+writer = table.new_upsert().create_writer()
+writer.upsert({"id": 1, "name": "Alice", "age": 25})
+writer.upsert({"id": 2, "name": "Bob", "age": 30})
+await writer.flush()
+
+# Per-record acknowledgment (for read-after-write)
+handle = writer.upsert({"id": 3, "name": "Charlie", "age": 35})
+await handle.wait()
+
+# Delete by primary key
+handle = writer.delete({"id": 2})
+await handle.wait()
+
+# Lookup
+lookuper = table.new_lookup().create_lookuper()
+result = await lookuper.lookup({"id": 1})
+if result:
+    print(f"Found: name={result['name']}, age={result['age']}")
+```
+
+## Partial Updates
+
+Update specific columns while preserving others:
+
+```python
+partial_writer = table.new_upsert().partial_update_by_name(["id", "age"]).create_writer()
+partial_writer.upsert({"id": 1, "age": 27})  # only updates age
+await partial_writer.flush()
+```
diff --git a/website/docs/apis/python/installation.md b/website/docs/apis/python/installation.md
new file mode 100644
index 0000000000..4182dbb431
--- /dev/null
+++ b/website/docs/apis/python/installation.md
@@ -0,0 +1,41 @@
+---
+sidebar_position: 1
+---
+# Installation
+
+```bash
+pip install pyfluss
+```
+
+## Building From Source (Optional)
+
+**Prerequisites:** Python 3.9+, Rust 1.85+
+
+```bash
+git clone https://github.com/apache/fluss-rust.git
+cd fluss-rust/bindings/python
+```
+
+Install [maturin](https://github.com/PyO3/maturin):
+
+```bash
+pip install maturin
+```
+
+Build and install:
+
+```bash
+# Development mode (editable)
+maturin develop
+
+# Or build a wheel
+maturin build --release
+pip install target/wheels/pyfluss-*.whl
+```
+
+Verify:
+
+```python
+import fluss
+print("Fluss Python bindings installed successfully!")
+```
diff --git a/website/docs/apis/rust-client.md b/website/docs/apis/rust-client.md
deleted file mode 100644
index 8b71936315..0000000000
--- a/website/docs/apis/rust-client.md
+++ /dev/null
@@ -1,53 +0,0 @@
----
-title: "Rust Client"
-sidebar_position: 3
----
-
-# Fluss Rust Client
-
-The Fluss Rust Client is a high-performance, asynchronous library powered by the
-[Tokio](https://tokio.rs/) runtime. It provides a native interface for interacting
-with Fluss clusters with minimal overhead.
-
-The client provides two main APIs:
-
-- **[Admin API](https://clients.fluss.apache.org/user-guide/rust/api-reference#flussadmin)**: For managing databases, tables, and partitions.
-- **[Table API](https://clients.fluss.apache.org/user-guide/rust/api-reference/#flusstablea)**: For reading and writing to Log and Primary Key tables
-
-## Installation
-
-The Fluss Rust client is published to [crates.io](https://crates.io/crates/fluss-rs)
-as `fluss-rs`. The crate's library name is `fluss`, so you import it with `use fluss::...`.
-
-Add the following to your `Cargo.toml`:
-```toml
-[dependencies]
-fluss-rs = "0.1"
-tokio = { version = "1", features = ["full"] }
-```
-
-## Quick Example
-```rust
-use fluss::client::FlussConnection;
-use fluss::config::Config;
-use fluss::error::Result;
-
-#[tokio::main]
-async fn main() -> Result<()> {
-    let mut config = Config::default();
-    config.bootstrap_servers = "127.0.0.1:9123".to_string();
-
-    let conn = FlussConnection::new(config).await?;
-    let admin = conn.get_admin().await?;
-
-    Ok(())
-}
-```
-
-For more examples, see [Fluss Rust Client documentation](https://clients.fluss.apache.org/user-guide/rust/example/).
-
-## Full Documentation
-
-For the complete Rust client reference including all configuration options,
-API methods, data types, error handling, and worked examples — see the
-**[Fluss Rust Client documentation](https://clients.fluss.apache.org/user-guide/rust/installation)**.
\ No newline at end of file
diff --git a/website/docs/apis/rust/_category_.json b/website/docs/apis/rust/_category_.json
new file mode 100644
index 0000000000..d2279a0af3
--- /dev/null
+++ b/website/docs/apis/rust/_category_.json
@@ -0,0 +1,4 @@
+{
+  "label": "Rust",
+  "position": 3
+}
diff --git a/website/docs/apis/rust/api-reference.md b/website/docs/apis/rust/api-reference.md
new file mode 100644
index 0000000000..bb2ec3e8f1
--- /dev/null
+++ b/website/docs/apis/rust/api-reference.md
@@ -0,0 +1,597 @@
+---
+sidebar_position: 2
+---
+# API Reference
+
+Complete API reference for the Fluss Rust client.
+
+## `Config`
+
+| Field                                 | Type            | Default            | Description                                                                          |
+|---------------------------------------|-----------------|--------------------|--------------------------------------------------------------------------------------|
+| `bootstrap_servers`                   | `String`        | `"127.0.0.1:9123"` | Coordinator server address                                                           |
+| `writer_request_max_size`             | `i32`           | `10485760` (10 MB) | Maximum request size in bytes                                                        |
+| `writer_acks`                         | `String`        | `"all"`            | Acknowledgment setting (`"all"` waits for all replicas)                              |
+| `writer_retries`                      | `i32`           | `i32::MAX`         | Number of retries on failure                                                         |
+| `writer_batch_size`                   | `i32`           | `2097152` (2 MB)   | Batch size for writes in bytes. Upper bound when dynamic sizing is on; fixed batch size when off. |
+| `writer_dynamic_batch_size_enabled`   | `bool`          | `true`             | Enable per-table dynamic batch sizing: the target size grows by 10% when batch fill exceeds 80% and shrinks by 5% when fill drops below 50%, clamped to `[writer_dynamic_batch_size_min, writer_batch_size]` |
+| `writer_dynamic_batch_size_min`       | `i32`           | `262144` (256 KB)  | Lower bound for the dynamic batch size estimator (ignored when `writer_dynamic_batch_size_enabled` is `false`) |
+| `writer_batch_timeout_ms`             | `i64`           | `100`              | Maximum time in ms to wait for a writer batch to fill up before sending              |
+| `writer_bucket_no_key_assigner`       | `NoKeyAssigner` | `sticky`           | Bucket assignment strategy for tables without bucket keys: `sticky` or `round_robin` |
+| `scanner_remote_log_prefetch_num`     | `usize`         | `4`                | Number of remote log segments to prefetch                                            |
+| `remote_file_download_thread_num`     | `usize`         | `3`                | Number of threads for remote log downloads                                           |
+| `scanner_remote_log_read_concurrency` | `usize`         | `4`                | Streaming read concurrency within a remote log file                                  |
+| `scanner_log_max_poll_records`        | `usize`         | `500`              | Maximum number of records returned in a single poll()                                |
+| `scanner_log_fetch_max_bytes`         | `i32`           | `16777216` (16 MB) | Maximum bytes per fetch response for LogScanner                                      |
+| `scanner_log_fetch_min_bytes`         | `i32`           | `1`                | Minimum bytes the server must accumulate before returning a fetch response           |
+| `scanner_log_fetch_wait_max_time_ms`  | `i32`           | `500`              | Maximum time (ms) the server may wait to satisfy min-bytes                           |
+| `scanner_log_fetch_max_bytes_for_bucket`| `i32`         | `1048576` (1 MB)   | Maximum bytes per fetch response per bucket for LogScanner                           |
+| `connect_timeout_ms`                  | `u64`           | `120000`           | TCP connect timeout in milliseconds                                                  |
+| `security_protocol`                   | `String`        | `"PLAINTEXT"`      | `PLAINTEXT` (default) or `sasl` for SASL auth                                        |
+| `security_sasl_mechanism`             | `String`        | `"PLAIN"`          | SASL mechanism (only `PLAIN` is supported)                                           |
+| `security_sasl_username`              | `String`        | (empty)            | SASL username (required when protocol is `sasl`)                                     |
+| `security_sasl_password`              | `String`        | (empty)            | SASL password (required when protocol is `sasl`)                                     |
+
+## `FlussConnection`
+
+| Method                                                                        | Description                                    |
+|-------------------------------------------------------------------------------|------------------------------------------------|
+| `async fn new(config: Config) -> Result<Self>`                                | Create a new connection to a Fluss cluster     |
+| `fn get_admin(&self) -> Result<Arc<FlussAdmin>>`                              | Get the admin interface for cluster management |
+| `async fn get_table(&self, table_path: &TablePath) -> Result<FlussTable<'_>>` | Get a table for read/write operations          |
+| `fn config(&self) -> &Config`                                                 | Get a reference to the connection config       |
+
+## `FlussAdmin`
+
+### Database Operations
+
+| Method                                                                                                                       | Description                |
+|------------------------------------------------------------------------------------------------------------------------------|----------------------------|
+| `async fn create_database(&self, name: &str, descriptor: Option<&DatabaseDescriptor>, ignore_if_exists: bool) -> Result<()>` | Create a database          |
+| `async fn drop_database(&self, name: &str, ignore_if_not_exists: bool, cascade: bool) -> Result<()>`                         | Drop a database            |
+| `async fn list_databases(&self) -> Result<Vec<String>>`                                                                      | List all databases         |
+| `async fn database_exists(&self, name: &str) -> Result<bool>`                                                                | Check if a database exists |
+| `async fn get_database_info(&self, name: &str) -> Result<DatabaseInfo>`                                                      | Get database metadata      |
+
+### Table Operations
+
+| Method                                                                                                                     | Description                                                                 |
+|----------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------|
+| `async fn create_table(&self, table_path: &TablePath, descriptor: &TableDescriptor, ignore_if_exists: bool) -> Result<()>` | Create a table                                                              |
+| `async fn drop_table(&self, table_path: &TablePath, ignore_if_not_exists: bool) -> Result<()>`                             | Drop a table                                                                |
+| `async fn get_table_info(&self, table_path: &TablePath) -> Result<TableInfo>`                                              | Get table metadata                                                          |
+| `async fn get_table_schema(&self, table_path: &TablePath, schema_id: Option<i32>) -> Result<SchemaInfo>`                   | Get a table's schema by id, or the latest schema when `schema_id` is `None` |
+| `async fn list_tables(&self, database_name: &str) -> Result<Vec<String>>`                                                  | List tables in a database                                                   |
+| `async fn table_exists(&self, table_path: &TablePath) -> Result<bool>`                                                     | Check if a table exists                                                     |
+
+### Partition Operations
+
+| Method                                                                                                                               | Description                     |
+|--------------------------------------------------------------------------------------------------------------------------------------|---------------------------------|
+| `async fn list_partition_infos(&self, table_path: &TablePath) -> Result<Vec<PartitionInfo>>`                                         | List all partitions             |
+| `async fn list_partition_infos_with_spec(&self, table_path: &TablePath, spec: Option<&PartitionSpec>) -> Result<Vec<PartitionInfo>>` | List partitions matching a spec |
+| `async fn create_partition(&self, table_path: &TablePath, spec: &PartitionSpec, ignore_if_exists: bool) -> Result<()>`               | Create a partition              |
+| `async fn drop_partition(&self, table_path: &TablePath, spec: &PartitionSpec, ignore_if_not_exists: bool) -> Result<()>`             | Drop a partition                |
+
+### Offset Operations
+
+| Method                                                                                                                                                           |  Description                          |
+|------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------|
+| `async fn list_offsets(&self, table_path: &TablePath, bucket_ids: &[i32], offset_spec: OffsetSpec) -> Result<HashMap<i32, i64>>`                                 | Get offsets for buckets               |
+| `async fn list_partition_offsets(&self, table_path: &TablePath, partition_name: &str, bucket_ids: &[i32], offset_spec: OffsetSpec) -> Result<HashMap<i32, i64>>` | Get offsets for a partition's buckets |
+
+### Lake Operations
+
+| Method                                                                                     |  Description                 |
+|--------------------------------------------------------------------------------------------|------------------------------|
+| `async fn get_latest_lake_snapshot(&self, table_path: &TablePath) -> Result<LakeSnapshot>` | Get the latest lake snapshot |
+
+### Cluster Operations
+
+| Method                                                        | Description                                         |
+|---------------------------------------------------------------|-----------------------------------------------------|
+| `async fn get_server_nodes(&self) -> Result<Vec<ServerNode>>` | Get all alive server nodes (coordinator and tablet servers) |
+
+## `ServerNode`
+
+| Method                            | Description                                          |
+|-----------------------------------|------------------------------------------------------|
+| `fn id(&self) -> i32`            | Server node ID                                       |
+| `fn host(&self) -> &str`         | Hostname of the server                               |
+| `fn port(&self) -> u32`          | Port number                                          |
+| `fn server_type(&self) -> &ServerType` | Server type (`CoordinatorServer` or `TabletServer`) |
+| `fn uid(&self) -> &str`          | Unique identifier (e.g. `"cs-0"`, `"ts-1"`)         |
+
+## `FlussTable<'a>`
+
+| Method                                        | Description                             |
+|-----------------------------------------------|-----------------------------------------|
+| `fn get_table_info(&self) -> &TableInfo`      | Get table metadata                      |
+| `fn new_append(&self) -> Result<TableAppend>` | Create an append builder for log tables |
+| `fn new_scan(&self) -> TableScan<'_>`         | Create a scan builder                   |
+| `fn new_lookup(&self) -> Result<TableLookup>` | Create a lookup builder for PK tables   |
+| `fn new_upsert(&self) -> Result<TableUpsert>` | Create an upsert builder for PK tables  |
+| `fn has_primary_key(&self) -> bool`           | Check if the table has a primary key    |
+| `fn table_path(&self) -> &TablePath`          | Get the table path                      |
+
+## `TableAppend`
+
+| Method                                            | Description             |
+|---------------------------------------------------|-------------------------|
+| `fn create_writer(&self) -> Result<AppendWriter>` | Create an append writer |
+
+## `AppendWriter`
+
+| Method                                                                          | Description                                       |
+|---------------------------------------------------------------------------------|---------------------------------------------------|
+| `fn append(&self, row: &impl InternalRow) -> Result<WriteResultFuture>`         | Append a row; returns a future for acknowledgment |
+| `fn append_arrow_batch(&self, batch: RecordBatch) -> Result<WriteResultFuture>` | Append an Arrow RecordBatch                       |
+| `async fn flush(&self) -> Result<()>`                                           | Flush all pending writes to the server            |
+
+## `TableScan<'a>`
+
+| Method                                                                      | Description                             |
+|-----------------------------------------------------------------------------|-----------------------------------------|
+| `fn project(self, indices: &[usize]) -> Result<Self>`                       | Project columns by index                |
+| `fn project_by_name(self, names: &[&str]) -> Result<Self>`                  | Project columns by name                 |
+| `fn limit(self, n: i32) -> Result<Self>`                                    | Set a row limit (enables `create_bucket_batch_scanner`; rejected by log scanners) |
+| `fn create_log_scanner(self) -> Result<LogScanner>`                         | Create a record-based log scanner       |
+| `fn create_record_batch_log_scanner(self) -> Result<RecordBatchLogScanner>` | Create an Arrow batch-based log scanner |
+| `fn create_bucket_batch_scanner(self, bucket: TableBucket) -> Result<LimitBatchScanner>` | Bounded scan of one bucket (requires `limit`; runs on first `next_batch`) |
+
+## `LogScanner`
+
+Single-consumer: do not call `poll` concurrently on the same scanner (e.g. from `tokio::join!` or two tasks sharing an `Arc`). Mirrors Java's `LogScannerImpl.acquire()` guard. Debug builds surface overlapping calls via a `debug_assert!`. Release builds skip the check for performance, so violating the contract there goes undetected and results in skewed poll-timing metrics (`fluss.client.scanner.time_between_poll_ms`, `fluss.client.scanner.poll_idle_ratio`).
+
+All `fluss.client.scanner.*` metrics carry `database` and `table` labels (matching Java's per-`TablePath` `ScannerMetricGroup`), so multi-table consumers get one time series per scanned table.
+
+| Method                                                                                                    | Description                                              |
+|-----------------------------------------------------------------------------------------------------------|----------------------------------------------------------|
+| `async fn subscribe(&self, bucket_id: i32, start_offset: i64) -> Result<()>`                              | Subscribe to a bucket                                    |
+| `async fn subscribe_buckets(&self, bucket_offsets: &HashMap<i32, i64>) -> Result<()>`                     | Subscribe to multiple buckets                            |
+| `async fn subscribe_partition(&self, partition_id: i64, bucket_id: i32, start_offset: i64) -> Result<()>` | Subscribe to a partition bucket                          |
+| `async fn subscribe_partition_buckets(&self, offsets: &HashMap<(i64, i32), i64>) -> Result<()>`           | Subscribe to multiple partition-bucket pairs             |
+| `async fn unsubscribe(&self, bucket_id: i32) -> Result<()>`                                               | Unsubscribe from a bucket (non-partitioned tables)       |
+| `async fn unsubscribe_partition(&self, partition_id: i64, bucket_id: i32) -> Result<()>`                  | Unsubscribe from a partition bucket (partitioned tables) |
+| `async fn poll(&self, timeout: Duration) -> Result<ScanRecords>`                                          | Poll for records                                         |
+
+## `RecordBatchLogScanner`
+
+Single-consumer: overlapping `poll` calls on handles that share state, or `poll` concurrent with `RecordBatchLogReader::next_batch`, are not supported — use one active polling/consumption call at a time per underlying scanner state. Mirrors Java's `LogScannerImpl.acquire()` guard. Debug builds surface overlapping calls via a `debug_assert!`. Release builds skip the check for performance, so violating the contract there goes undetected and results in skewed poll-timing metrics (`fluss.client.scanner.time_between_poll_ms`, `fluss.client.scanner.poll_idle_ratio`).
+
+| Method                                                                                                    | Description                                              |
+|-----------------------------------------------------------------------------------------------------------|----------------------------------------------------------|
+| `async fn subscribe(&self, bucket_id: i32, start_offset: i64) -> Result<()>`                              | Subscribe to a bucket                                    |
+| `async fn subscribe_buckets(&self, bucket_offsets: &HashMap<i32, i64>) -> Result<()>`                     | Subscribe to multiple buckets                            |
+| `async fn subscribe_partition(&self, partition_id: i64, bucket_id: i32, start_offset: i64) -> Result<()>` | Subscribe to a partition bucket                          |
+| `async fn subscribe_partition_buckets(&self, offsets: &HashMap<(i64, i32), i64>) -> Result<()>`           | Subscribe to multiple partition-bucket pairs             |
+| `async fn unsubscribe(&self, bucket_id: i32) -> Result<()>`                                               | Unsubscribe from a bucket (non-partitioned tables)       |
+| `async fn unsubscribe_partition(&self, partition_id: i64, bucket_id: i32) -> Result<()>`                  | Unsubscribe from a partition bucket (partitioned tables) |
+| `async fn poll(&self, timeout: Duration) -> Result<Vec<ScanBatch>>`                                       | Poll for Arrow record batches                            |
+| `fn is_partitioned(&self) -> bool`                                                                        | Check if the table is partitioned                        |
+| `fn get_subscribed_buckets(&self) -> Vec<(TableBucket, i64)>`                                             | Get all current subscriptions as (bucket, offset) pairs  |
+| `fn schema(&self) -> SchemaRef`                                                                           | Get the Arrow schema for batches produced by this scanner|
+| `fn table_path(&self) -> &TablePath`                                                                      | Get the table path                                       |
+| `fn table_id(&self) -> TableId`                                                                           | Get the table ID                                         |
+
+## `RecordBatchLogReader`
+
+Bounded log reader that consumes data up to specified stopping offsets, then terminates.
+Unlike `RecordBatchLogScanner`, which polls indefinitely, this reader stops automatically.
+
+| Method                                                                                                      | Description                                              |
+|-------------------------------------------------------------------------------------------------------------|----------------------------------------------------------|
+| `async fn new_until_latest(scanner: RecordBatchLogScanner, admin: &FlussAdmin) -> Result<Self>`              | Read until the latest offsets at time of creation         |
+| `fn new_until_offsets(scanner: RecordBatchLogScanner, stopping_offsets: HashMap<TableBucket, i64>) -> Result<Self>` | Read until custom stopping offsets per bucket             |
+| `async fn next_batch(&mut self) -> Result<Option<ScanBatch>>`                                                | Get the next batch with bucket/offset metadata, or `None` when all buckets have caught up |
+| `async fn collect_all_batches(&mut self) -> Result<Vec<ScanBatch>>`                                          | Drain all batches (with metadata) until stopping offsets are satisfied |
+| `fn schema(&self) -> SchemaRef`                                                                              | Arrow schema for produced batches                        |
+| `fn to_record_batch_reader(self, handle: tokio::runtime::Handle) -> SyncRecordBatchLogReader`                | Sync adapter implementing `arrow::RecordBatchReader` (see below) |
+
+## `SyncRecordBatchLogReader`
+
+Synchronous adapter for `RecordBatchLogReader`. Created via
+`RecordBatchLogReader::to_record_batch_reader(handle)`.
+
+Implements both `Iterator` and `arrow::record_batch::RecordBatchReader`, so it
+plugs into the wider Arrow ecosystem — FFI, PyArrow's
+`pa.RecordBatchReader.from_batches`, the C++ Arrow `RecordBatchReader` interface,
+DataFusion sources, etc.
+
+Each `next()` call drives the underlying async reader via
+`tokio::runtime::Handle::block_on`. **Do not call from inside a Tokio worker
+thread that belongs to the same runtime** — nested `block_on` panics. Prefer
+`RecordBatchLogReader::next_batch` in async Rust code; use this adapter only at
+sync/FFI boundaries.
+
+Bucket and offset metadata carried by `ScanBatch` is **dropped** here, because
+the Arrow trait contract yields plain `RecordBatch`. If you need offsets or
+bucket identity per batch, use `next_batch` instead.
+
+| Method                                                          | Description                                      |
+|-----------------------------------------------------------------|--------------------------------------------------|
+| `fn next(&mut self) -> Option<Result<RecordBatch, ArrowError>>` | Iterator: next batch, or `None` when caught up   |
+| `fn schema(&self) -> SchemaRef`                                 | Arrow schema for produced batches                |
+
+## `LimitBatchScanner`
+
+One-shot bounded scanner from `TableScan::limit(n).create_bucket_batch_scanner(bucket)`.
+Poll it with `next_batch` until it returns `None` (mirrors `RecordBatchLogReader`).
+Supports both log and primary-key tables (the latter returns the current,
+server-deduplicated state); yields a single batch of at most `n` rows.
+
+| Method                                                        | Description                          |
+|---------------------------------------------------------------|--------------------------------------|
+| `async fn next_batch(&mut self) -> Result<Option<ScanBatch>>` | Rows on the first call, `None` after |
+| `async fn collect_all_batches(&mut self) -> Result<Vec<ScanBatch>>` | Drain all remaining batches into a `Vec` |
+| `fn bucket(&self) -> &TableBucket`                            | The scanned bucket                   |
+
+## `ScanRecord`
+
+| Method                                 | Description                            |
+|----------------------------------------|----------------------------------------|
+| `fn row(&self) -> &dyn InternalRow`    | Get the row data                       |
+| `fn offset(&self) -> i64`              | Record offset in the log               |
+| `fn timestamp(&self) -> i64`           | Record timestamp                       |
+| `fn change_type(&self) -> &ChangeType` | Change type (AppendOnly, Insert, etc.) |
+
+## `ScanRecords`
+
+| Method                                                                   | Description                       |
+|--------------------------------------------------------------------------|-----------------------------------|
+| `fn count(&self) -> usize`                                               | Number of records                 |
+| `fn is_empty(&self) -> bool`                                             | Whether the result set is empty   |
+| `fn records(&self, bucket: &TableBucket) -> &[ScanRecord]`               | Get records for a specific bucket |
+| `fn records_by_buckets(&self) -> &HashMap<TableBucket, Vec<ScanRecord>>` | Get all records grouped by bucket |
+
+`ScanRecords` also implements `IntoIterator`, so you can iterate over all records directly:
+
+```rust
+for record in records {
+    println!("offset={}", record.offset());
+}
+```
+
+## `ScanBatch`
+
+| Method                             | Description                    |
+|------------------------------------|--------------------------------|
+| `fn bucket(&self) -> &TableBucket` | Bucket this batch belongs to   |
+| `fn batch(&self) -> &RecordBatch`  | Arrow RecordBatch data         |
+| `fn base_offset(&self) -> i64`     | First record offset            |
+| `fn last_offset(&self) -> i64`     | Last record offset             |
+| `fn num_records(&self) -> usize`   | Number of records in the batch |
+
+## `TableUpsert`
+
+| Method                                                                                | Description                                       |
+|---------------------------------------------------------------------------------------|---------------------------------------------------|
+| `fn create_writer(&self) -> Result<UpsertWriter>`                                     | Create an upsert writer                           |
+| `fn partial_update(&self, column_indices: Option<Vec<usize>>) -> Result<TableUpsert>` | Create a partial update builder by column indices |
+| `fn partial_update_with_column_names(&self, names: &[&str]) -> Result<TableUpsert>`   | Create a partial update builder by column names   |
+
+## `UpsertWriter`
+
+| Method                                                                  | Description                           |
+|-------------------------------------------------------------------------|---------------------------------------|
+| `fn upsert(&self, row: &impl InternalRow) -> Result<WriteResultFuture>` | Upsert a row (insert or update by PK) |
+| `fn delete(&self, row: &impl InternalRow) -> Result<WriteResultFuture>` | Delete a row by primary key           |
+| `async fn flush(&self) -> Result<()>`                                   | Flush all pending operations          |
+
+## `TableLookup`
+
+| Method                                          |  Description                        |
+|-------------------------------------------------|-------------------------------------|
+| `fn create_lookuper(&self) -> Result<Lookuper>` | Create a lookuper for point lookups |
+
+## `Lookuper`
+
+| Method                                                                       |  Description                |
+|------------------------------------------------------------------------------|-----------------------------|
+| `async fn lookup(&mut self, key: &impl InternalRow) -> Result<LookupResult>` | Lookup a row by primary key |
+
+## `LookupResult`
+
+| Method                                                         |  Description                     |
+|----------------------------------------------------------------|----------------------------------|
+| `fn get_single_row(&self) -> Result<Option<impl InternalRow>>` | Get a single row from the result |
+| `fn get_rows(&self) -> Result<Vec<impl InternalRow>>`          | Get all rows from the result     |
+| `fn to_record_batch(&self) -> Result<RecordBatch>`             | Convert all rows to an Arrow `RecordBatch` for DataFusion or other Arrow-based tools    |
+
+## `WriteResultFuture`
+
+| Description                                                                                                                                   |
+|-----------------------------------------------------------------------------------------------------------------------------------------------|
+| Implements `Future<Output = Result<(), Error>>`. Await to wait for server acknowledgment. Returned by `append()`, `upsert()`, and `delete()`. |
+
+Usage:
+
+```rust
+// Fire-and-forget (batched)
+writer.append(&row)?;
+writer.flush().await?;
+
+// Per-record acknowledgment
+writer.append(&row)?.await?;
+```
+
+## `Schema`
+
+| Method                                         |  Description                             |
+|------------------------------------------------|------------------------------------------|
+| `fn builder() -> SchemaBuilder`                | Create a schema builder                  |
+| `fn columns(&self) -> &[Column]`               | Get all columns                          |
+| `fn primary_key(&self) -> Option<&PrimaryKey>` | Get primary key (None if no primary key) |
+| `fn column_names(&self) -> Vec<&str>`          | Get all column names                     |
+| `fn primary_key_indexes(&self) -> Vec<usize>`  | Get primary key column indices           |
+
+## `SchemaBuilder`
+
+| Method                                               |  Description            |
+|------------------------------------------------------|-------------------------|
+| `fn column(name: &str, data_type: DataType) -> Self` | Add a column            |
+| `fn primary_key(keys: Vec<&str>) -> Self`            | Set primary key columns |
+| `fn build() -> Result<Schema>`                       | Build the schema        |
+
+## `SchemaInfo`
+
+A schema together with its server-assigned version id. Returned by [`FlussAdmin::get_table_schema`](#flussadmin).
+
+| Method                                           | Description                              |
+|--------------------------------------------------|------------------------------------------|
+| `fn new(schema: Schema, schema_id: i32) -> Self` | Construct from a schema and id           |
+| `fn schema(&self) -> &Schema`                    | Borrow the schema                        |
+| `fn schema_id(&self) -> i32`                     | Get the server-assigned schema id        |
+| `fn into_parts(self) -> (Schema, i32)`           | Consume and return `(schema, schema_id)` |
+
+## `TableDescriptor`
+
+| Method                                                    | Description                          |
+|-----------------------------------------------------------|--------------------------------------|
+| `fn builder() -> TableDescriptorBuilder`                  | Create a table descriptor builder    |
+| `fn schema(&self) -> &Schema`                             | Get the table schema                 |
+| `fn partition_keys(&self) -> &[String]`                   | Get partition key column names       |
+| `fn has_primary_key(&self) -> bool`                       | Check if the table has a primary key |
+| `fn properties(&self) -> &HashMap<String, String>`        | Get all table properties             |
+| `fn custom_properties(&self) -> &HashMap<String, String>` | Get custom properties                |
+| `fn comment(&self) -> Option<&str>`                       | Get table comment                    |
+
+## `TableDescriptorBuilder`
+
+| Method                                                                                    | Description                                 |
+|-------------------------------------------------------------------------------------------|---------------------------------------------|
+| `fn schema(schema: Schema) -> Self`                                                       | Set the schema                              |
+| `fn log_format(format: LogFormat) -> Self`                                                | Set log format (e.g., `LogFormat::ARROW`)   |
+| `fn kv_format(format: KvFormat) -> Self`                                                  | Set KV format (e.g., `KvFormat::COMPACTED`) |
+| `fn property(key: &str, value: &str) -> Self`                                             | Set a table property                        |
+| `fn custom_property(key: impl Into<String>, value: impl Into<String>) -> Self`            | Set a single custom property                |
+| `fn custom_properties(properties: HashMap<impl Into<String>, impl Into<String>>) -> Self` | Set custom properties                       |
+| `fn partitioned_by(keys: Vec<&str>) -> Self`                                              | Set partition columns                       |
+| `fn distributed_by(bucket_count: Option<i32>, bucket_keys: Vec<String>) -> Self`          | Set bucket distribution                     |
+| `fn comment(comment: &str) -> Self`                                                       | Set table comment                           |
+| `fn build() -> Result<TableDescriptor>`                                                   | Build the table descriptor                  |
+
+## `TablePath`
+
+| Method                                                | Description         |
+|-------------------------------------------------------|---------------------|
+| `TablePath::new(database: &str, table: &str) -> Self` | Create a table path |
+| `fn database(&self) -> &str`                          | Get database name   |
+| `fn table(&self) -> &str`                             | Get table name      |
+
+## `TableInfo`
+
+| Field / Method       | Description                                         |
+|----------------------|-----------------------------------------------------|
+| `.table_path`        | `TablePath` -- Table path                           |
+| `.table_id`          | `i64` -- Table ID                                   |
+| `.schema_id`         | `i32` -- Schema ID                                  |
+| `.schema`            | `Schema` -- Table schema                            |
+| `.primary_keys`      | `Vec<String>` -- Primary key column names           |
+| `.partition_keys`    | `Vec<String>` -- Partition key column names         |
+| `.num_buckets`       | `i32` -- Number of buckets                          |
+| `.properties`        | `HashMap<String, String>` -- All table properties   |
+| `.custom_properties` | `HashMap<String, String>` -- Custom properties only |
+| `.comment`           | `Option<String>` -- Table comment                   |
+| `.created_time`      | `i64` -- Creation timestamp                         |
+| `.modified_time`     | `i64` -- Last modification timestamp                |
+
+## `TableBucket`
+
+| Method                                                                                              | Description                                |
+|-----------------------------------------------------------------------------------------------------|--------------------------------------------|
+| `TableBucket::new(table_id: i64, bucket_id: i32) -> Self`                                           | Create a non-partitioned bucket            |
+| `TableBucket::new_with_partition(table_id: i64, partition_id: Option<i64>, bucket_id: i32) -> Self` | Create a partitioned bucket                |
+| `fn table_id(&self) -> i64`                                                                         | Get table ID                               |
+| `fn partition_id(&self) -> Option<i64>`                                                             | Get partition ID (None if non-partitioned) |
+| `fn bucket_id(&self) -> i32`                                                                        | Get bucket ID                              |
+
+## `PartitionSpec`
+
+| Method                                                      | Description                                           |
+|-------------------------------------------------------------|-------------------------------------------------------|
+| `PartitionSpec::new(spec_map: HashMap<&str, &str>) -> Self` | Create from a map of partition column names to values |
+| `fn get_spec_map(&self) -> &HashMap<String, String>`        | Get the partition spec map                            |
+
+## `PartitionInfo`
+
+| Method                                   | Description        |
+|------------------------------------------|--------------------|
+| `fn get_partition_id(&self) -> i64`      | Get partition ID   |
+| `fn get_partition_name(&self) -> String` | Get partition name |
+
+## `DatabaseDescriptor`
+
+| Method                                                    | Description                          |
+|-----------------------------------------------------------|--------------------------------------|
+| `fn builder() -> DatabaseDescriptorBuilder`               | Create a database descriptor builder |
+| `fn comment(&self) -> Option<&str>`                       | Get database comment                 |
+| `fn custom_properties(&self) -> &HashMap<String, String>` | Get custom properties                |
+
+## `DatabaseDescriptorBuilder`
+
+| Method                                                                                    | Description                   |
+|-------------------------------------------------------------------------------------------|-------------------------------|
+| `fn comment(comment: impl Into<String>) -> Self`                                          | Set database comment          |
+| `fn custom_properties(properties: HashMap<impl Into<String>, impl Into<String>>) -> Self` | Set custom properties         |
+| `fn custom_property(key: impl Into<String>, value: impl Into<String>) -> Self`            | Set a single custom property  |
+| `fn build() -> DatabaseDescriptor`                                                        | Build the database descriptor |
+
+## `DatabaseInfo`
+
+| Method                                                 | Description                     |
+|--------------------------------------------------------|---------------------------------|
+| `fn database_name(&self) -> &str`                      | Get database name               |
+| `fn created_time(&self) -> i64`                        | Get creation timestamp          |
+| `fn modified_time(&self) -> i64`                       | Get last modification timestamp |
+| `fn database_descriptor(&self) -> &DatabaseDescriptor` | Get the database descriptor     |
+
+## `LakeSnapshot`
+
+| Field                   | Description                                       |
+|-------------------------|---------------------------------------------------|
+| `.snapshot_id`          | `i64` -- Snapshot ID                              |
+| `.table_buckets_offset` | `HashMap<TableBucket, i64>` -- All bucket offsets |
+
+## `GenericRow<'a>`
+
+| Method                                                             | Description                                      |
+|--------------------------------------------------------------------|--------------------------------------------------|
+| `GenericRow::new(field_count: usize) -> Self`                      | Create a new row with the given number of fields |
+| `fn set_field(&mut self, pos: usize, value: impl Into<Datum<'a>>)` | Set a field value by position                    |
+| `GenericRow::from_data(data: Vec<impl Into<Datum<'a>>>) -> Self`   | Create a row from existing field data            |
+
+Implements the `InternalRow` trait (see below).
+
+## `InternalRow` trait
+
+| Method                                                                                 | Description                             |
+|----------------------------------------------------------------------------------------|-----------------------------------------|
+| `fn is_null_at(&self, idx: usize) -> Result<bool>`                                     | Check if a field is null                |
+| `fn get_boolean(&self, idx: usize) -> Result<bool>`                                    | Get boolean value                       |
+| `fn get_byte(&self, idx: usize) -> Result<i8>`                                         | Get tinyint value                       |
+| `fn get_short(&self, idx: usize) -> Result<i16>`                                       | Get smallint value                      |
+| `fn get_int(&self, idx: usize) -> Result<i32>`                                         | Get int value                           |
+| `fn get_long(&self, idx: usize) -> Result<i64>`                                        | Get bigint value                        |
+| `fn get_float(&self, idx: usize) -> Result<f32>`                                       | Get float value                         |
+| `fn get_double(&self, idx: usize) -> Result<f64>`                                      | Get double value                        |
+| `fn get_string(&self, idx: usize) -> Result<&str>`                                     | Get string value                        |
+| `fn get_decimal(&self, idx: usize, precision: usize, scale: usize) -> Result<Decimal>` | Get decimal value                       |
+| `fn get_date(&self, idx: usize) -> Result<Date>`                                       | Get date value                          |
+| `fn get_time(&self, idx: usize) -> Result<Time>`                                       | Get time value                          |
+| `fn get_timestamp_ntz(&self, idx: usize, precision: u32) -> Result<TimestampNtz>`      | Get timestamp without timezone value    |
+| `fn get_timestamp_ltz(&self, idx: usize, precision: u32) -> Result<TimestampLtz>`      | Get timestamp with local timezone value |
+| `fn get_bytes(&self, idx: usize) -> Result<&[u8]>`                                     | Get bytes value                         |
+| `fn get_binary(&self, idx: usize, length: usize) -> Result<&[u8]>`                     | Get fixed-length binary value           |
+| `fn get_char(&self, idx: usize, length: usize) -> Result<&str>`                        | Get fixed-length char value             |
+| `fn get_array(&self, idx: usize) -> Result<FlussArray>`                                | Get array value                         |
+| `fn get_map(&self, idx: usize) -> Result<FlussMap>`                                    | Get map value                           |
+
+## `FlussArray`
+
+`FlussArray` is the Rust row representation for `ARRAY` values. You usually obtain it from `InternalRow::get_array()`.
+
+| Method | Description |
+|--------|-------------|
+| `fn size(&self) -> usize` | Number of elements in the array |
+| `fn is_null_at(&self, pos: usize) -> bool` | Check whether an element is null |
+| `fn as_bytes(&self) -> &[u8]` | Get encoded bytes of the array |
+
+Element getters mirror `InternalRow` typed getters and return `Result<T>`. For example, use `get_int()`, `get_long()`, and `get_double()` for primitive elements, and `get_string()`, `get_binary()`, `get_decimal()`, `get_timestamp_ntz()`, `get_timestamp_ltz()`, and `get_array()` for variable-length or nested elements.
+
+## `FlussMap`
+
+`FlussMap` is the Rust row representation for `MAP` values. You usually obtain it from `InternalRow::get_map()`.
+
+| Method | Description |
+|--------|-------------|
+| `fn size(&self) -> usize` | Number of entries in the map |
+| `fn as_bytes(&self) -> &[u8]` | Get encoded bytes of the map |
+| `fn key_type(&self) -> &DataType` | Schema-declared type of keys |
+| `fn value_type(&self) -> &DataType` | Schema-declared type of values |
+| `fn entries(&self) -> Entries<'_>` | Iterator yielding `Result<(Datum, Datum)>` pairs |
+| `fn get(&self, key: &Datum) -> Result<Option<Datum>>` | Linear-scan lookup by key (`O(n)`) |
+| `fn key_array(&self) -> &FlussArray` | Parallel keys array (zero-copy view) |
+| `fn value_array(&self) -> &FlussArray` | Parallel values array (zero-copy view) |
+
+Most user code should prefer `entries()` (iteration) and `get()` (lookup). The `key_array()` / `value_array()` views are for serdes and Arrow-adapter code that needs zero-copy access to the underlying parallel-array layout.
+
+## `FlussMapWriter`
+
+`FlussMapWriter` builds a `FlussMap` for write paths.
+
+| Method | Description |
+|--------|-------------|
+| `fn new(capacity: usize, key_type: &DataType, value_type: &DataType) -> Self` | Create a writer sized for `capacity` entries |
+| `fn write_entry(&mut self, key: Datum, value: Datum) -> Result<()>` | Append a single entry; rejects null keys and type mismatches |
+| `fn extend<I, K, V>(&mut self, entries: I) -> Result<()>` | Append every pair from `entries: IntoIterator<Item = (K, V)>` |
+| `fn complete(self) -> Result<FlussMap>` | Finalize the writer and produce the `FlussMap` |
+
+## `ChangeType`
+
+| Value                      | Short String  | Description                      |
+|----------------------------|---------------|----------------------------------|
+| `ChangeType::AppendOnly`   | `+A`          | Append-only record               |
+| `ChangeType::Insert`       | `+I`          | Inserted row                     |
+| `ChangeType::UpdateBefore` | `-U`          | Previous value of an updated row |
+| `ChangeType::UpdateAfter`  | `+U`          | New value of an updated row      |
+| `ChangeType::Delete`       | `-D`          | Deleted row                      |
+
+| Method                           | Description                         |
+|----------------------------------|-------------------------------------|
+| `fn short_string(&self) -> &str` | Get the short string representation |
+
+## `OffsetSpec`
+
+| Variant                      | Description                                     |
+|------------------------------|-------------------------------------------------|
+| `OffsetSpec::Earliest`       | Start from the earliest available offset        |
+| `OffsetSpec::Latest`         | Start from the latest offset (only new records) |
+| `OffsetSpec::Timestamp(i64)` | Start from a specific timestamp in milliseconds |
+
+## Constants
+
+| Constant                         | Value  | Description                                             |
+|----------------------------------|--------|---------------------------------------------------------|
+| `fluss::client::EARLIEST_OFFSET` | `-2`   | Start reading from the earliest available offset        |
+
+To start reading from the latest offset (only new records), resolve the current offset via `list_offsets` before subscribing:
+
+```rust
+use fluss::rpc::message::OffsetSpec;
+
+let offsets = admin.list_offsets(&table_path, &[0], OffsetSpec::Latest).await?;
+let latest = offsets[&0];
+log_scanner.subscribe(0, latest).await?;
+```
+
+## `DataTypes` factory
+
+| Method                                           | Returns    | Description                        |
+|--------------------------------------------------|------------|------------------------------------|
+| `DataTypes::boolean()`                           | `DataType` | Boolean type                       |
+| `DataTypes::tinyint()`                           | `DataType` | 8-bit signed integer               |
+| `DataTypes::smallint()`                          | `DataType` | 16-bit signed integer              |
+| `DataTypes::int()`                               | `DataType` | 32-bit signed integer              |
+| `DataTypes::bigint()`                            | `DataType` | 64-bit signed integer              |
+| `DataTypes::float()`                             | `DataType` | 32-bit floating point              |
+| `DataTypes::double()`                            | `DataType` | 64-bit floating point              |
+| `DataTypes::string()`                            | `DataType` | Variable-length string             |
+| `DataTypes::bytes()`                             | `DataType` | Variable-length byte array         |
+| `DataTypes::date()`                              | `DataType` | Date (days since epoch)            |
+| `DataTypes::time()`                              | `DataType` | Time (milliseconds since midnight) |
+| `DataTypes::timestamp()`                         | `DataType` | Timestamp without timezone         |
+| `DataTypes::timestamp_ltz()`                     | `DataType` | Timestamp with local timezone      |
+| `DataTypes::decimal(precision: u32, scale: u32)` | `DataType` | Fixed-point decimal                |
+| `DataTypes::char(length: u32)`                   | `DataType` | Fixed-length string                |
+| `DataTypes::binary(length: usize)`               | `DataType` | Fixed-length byte array            |
+| `DataTypes::array(element: DataType)`            | `DataType` | Array of elements                  |
+| `DataTypes::map(key: DataType, value: DataType)` | `DataType` | Map of key-value pairs             |
+| `DataTypes::row(fields: Vec<DataField>)`         | `DataType` | Nested row type                    |
+
+## `DataField`
+
+| Method                                                                                                   | Description         |
+|----------------------------------------------------------------------------------------------------------|---------------------|
+| `DataField::new(name: impl Into<String>, data_type: DataType, description: Option<String>) -> DataField` | Create a data field |
+| `fn name(&self) -> &str`                                                                                 | Get the field name  |
diff --git a/website/docs/apis/rust/data-types.md b/website/docs/apis/rust/data-types.md
new file mode 100644
index 0000000000..5418839184
--- /dev/null
+++ b/website/docs/apis/rust/data-types.md
@@ -0,0 +1,179 @@
+---
+sidebar_position: 3
+---
+# Data Types
+
+| Fluss Type      | Rust Type      | Getter                               | Setter                         |
+|-----------------|----------------|--------------------------------------|--------------------------------|
+| `BOOLEAN`       | `bool`         | `get_boolean()`                      | `set_field(idx, bool)`         |
+| `TINYINT`       | `i8`           | `get_byte()`                         | `set_field(idx, i8)`           |
+| `SMALLINT`      | `i16`          | `get_short()`                        | `set_field(idx, i16)`          |
+| `INT`           | `i32`          | `get_int()`                          | `set_field(idx, i32)`          |
+| `BIGINT`        | `i64`          | `get_long()`                         | `set_field(idx, i64)`          |
+| `FLOAT`         | `f32`          | `get_float()`                        | `set_field(idx, f32)`          |
+| `DOUBLE`        | `f64`          | `get_double()`                       | `set_field(idx, f64)`          |
+| `CHAR`          | `&str`         | `get_char(idx, length)`              | `set_field(idx, &str)`         |
+| `STRING`        | `&str`         | `get_string()`                       | `set_field(idx, &str)`         |
+| `DECIMAL`       | `Decimal`      | `get_decimal(idx, precision, scale)` | `set_field(idx, Decimal)`      |
+| `DATE`          | `Date`         | `get_date()`                         | `set_field(idx, Date)`         |
+| `TIME`          | `Time`         | `get_time()`                         | `set_field(idx, Time)`         |
+| `TIMESTAMP`     | `TimestampNtz` | `get_timestamp_ntz(idx, precision)`  | `set_field(idx, TimestampNtz)` |
+| `TIMESTAMP_LTZ` | `TimestampLtz` | `get_timestamp_ltz(idx, precision)`  | `set_field(idx, TimestampLtz)` |
+| `BYTES`         | `&[u8]`        | `get_bytes()`                        | `set_field(idx, &[u8])`        |
+| `BINARY(n)`     | `&[u8]`        | `get_binary(idx, length)`            | `set_field(idx, &[u8])`        |
+| `ARRAY<T>`      | `FlussArray`   | `get_array()`                        | `set_field(idx, FlussArray)`   |
+| `MAP<K, V>`     | `FlussMap`     | `get_map()`                          | `set_field(idx, FlussMap)`     |
+
+## Constructing Special Types
+
+Primitive types (`bool`, `i8`, `i16`, `i32`, `i64`, `f32`, `f64`, `&str`, `&[u8]`) can be passed directly to `set_field`. The following types require explicit construction:
+
+```rust
+use fluss::row::{Date, Time, TimestampNtz, TimestampLtz, Decimal};
+
+// Date: days since Unix epoch
+let date = Date::new(19738);
+
+// Time: milliseconds since midnight
+let time = Time::new(43200000);
+
+// Timestamp without timezone: milliseconds since epoch
+// DataTypes::timestamp() defaults to precision 6 (microseconds).
+// Use DataTypes::timestamp_with_precision(p) for a different precision (0–9).
+let ts = TimestampNtz::new(1704067200000);
+
+// Timestamp with local timezone: milliseconds since epoch
+// DataTypes::timestamp_ltz() also defaults to precision 6.
+let ts_ltz = TimestampLtz::new(1704067200000);
+
+// Decimal: from an unscaled long value with precision and scale
+let decimal = Decimal::from_unscaled_long(12345, 10, 2)?; // represents 123.45
+```
+
+## Creating Rows from Data
+
+`GenericRow::from_data` accepts a `Vec` of values that convert into `Datum`. When the elements are produced with `.into()`, Rust cannot infer the target type on its own (many types implement `From<&str>`, for example), so annotate the vector type explicitly:
+
+```rust
+use fluss::row::{Datum, GenericRow};
+
+let data: Vec<Datum> = vec![1i32.into(), "hello".into(), Datum::Null];
+let row = GenericRow::from_data(data);
+```
+
+## Arrays
+
+Use `DataTypes::array(element_type)` in schema definitions. At runtime, read arrays with `row.get_array(idx)?`.
+
+To construct array values for writes, build a `FlussArray` and wrap it with `Datum::Array`:
+
+```rust
+use fluss::metadata::DataTypes;
+use fluss::row::binary_array::FlussArrayWriter;
+use fluss::row::{Datum, GenericRow};
+
+let mut writer = FlussArrayWriter::new(3, &DataTypes::int());
+writer.write_int(0, 10);
+writer.write_int(1, 20);
+writer.set_null_at(2);
+let arr = writer.complete()?;
+
+let mut row = GenericRow::new(1);
+row.set_field(0, Datum::Array(arr));
+```
+
+`ARRAY` is supported for row values and nested row fields. For key encoding, Rust follows Java parity: `ARRAY` can be encoded by the compacted key encoder, while table-level key constraints are validated by the server (which may reject unsupported key types).
+
+## Maps
+
+Use `DataTypes::map(key_type, value_type)` in schema definitions. At runtime, read maps with `row.get_map(idx)?` — the row knows its schema, so no extra type arguments are needed.
+
+### Writing
+
+Build a `FlussMap` entry-by-entry, then wrap it with `Datum::Map`:
+
+```rust
+use fluss::metadata::DataTypes;
+use fluss::row::binary_map::FlussMapWriter;
+use fluss::row::{Datum, GenericRow};
+
+let mut writer = FlussMapWriter::new(2, &DataTypes::string(), &DataTypes::int());
+writer.write_entry("key1".into(), 100.into())?;
+writer.write_entry("key2".into(), Datum::Null)?;
+let map = writer.complete()?;
+
+let mut row = GenericRow::new(1);
+row.set_field(0, Datum::Map(map));
+```
+
+For bulk writes from any iterator of `(key, value)` pairs (including a `HashMap`), use `extend`:
+
+```rust
+use std::collections::HashMap;
+
+let entries: HashMap<&str, i32> = HashMap::from([("a", 1), ("b", 2)]);
+let mut writer = FlussMapWriter::new(entries.len(), &DataTypes::string(), &DataTypes::int());
+writer.extend(entries)?;
+let map = writer.complete()?;
+```
+
+### Reading
+
+The `entries()` iterator yields `(key, value)` pairs as schema-typed `Datum`s. Null values are returned as `Datum::Null`, so no separate null check is needed:
+
+```rust
+use fluss::row::InternalRow;
+
+let m = row.get_map(0)?;
+for entry in m.entries() {
+    let (k, v) = entry?;
+    println!("{k:?} => {v:?}");          // Datum's Debug handles null
+}
+```
+
+For point lookups, `get(&key)` does a linear scan and returns `Result<Option<Datum>>` (`Ok(None)` when the key is absent):
+
+```rust
+use fluss::row::Datum;
+
+if let Some(v) = m.get(&Datum::from("attr_size"))? {
+    println!("size = {v:?}");
+}
+```
+
+Lookup is `O(n)` — the binary MAP layout has no key index. If you need repeated lookups against the same map, collect the entries once:
+
+```rust
+use std::collections::HashMap;
+
+let snapshot: HashMap<String, Datum<'_>> = m
+    .entries()
+    .map(|e| e.map(|(k, v)| (format!("{k:?}"), v)))
+    .collect::<Result<_, _>>()?;
+```
+
+For raw access to the underlying parallel-array representation (zero-copy, used by serdes / Arrow adapters), `m.key_array()` and `m.value_array()` are still available.
+
+### Constraints
+
+`MAP` keys cannot be null. `MAP` is supported for row values and nested row fields. `MAP` cannot be used as a primary key or bucket key column — the Rust client rejects it at the compacted key encoder, and the Fluss server bans `MAP` (along with `ARRAY` and `ROW`) from key columns.
+
+## Reading Row Data
+
+```rust
+use fluss::row::{Date, Decimal, InternalRow, TimestampNtz};
+
+for record in scan_records {
+    let row = record.row();
+
+    if row.is_null_at(0)? {
+        // field is null
+    }
+    let id: i32 = row.get_int(0)?;
+    let name: &str = row.get_string(1)?;
+    let score: f32 = row.get_float(2)?;
+    let date: Date = row.get_date(3)?;
+    let ts: TimestampNtz = row.get_timestamp_ntz(4, 6)?;
+    let decimal: Decimal = row.get_decimal(5, 10, 2)?;
+}
+```
diff --git a/website/docs/apis/rust/error-handling.md b/website/docs/apis/rust/error-handling.md
new file mode 100644
index 0000000000..4966428997
--- /dev/null
+++ b/website/docs/apis/rust/error-handling.md
@@ -0,0 +1,241 @@
+---
+sidebar_position: 4
+---
+# Error Handling
+
+The Fluss Rust client uses a unified `Error` type and a `Result<T>` alias for all fallible operations.
+
+## Basic Usage
+
+```rust
+use fluss::error::{Error, Result};
+
+// All operations return Result<T>
+let conn = FlussConnection::new(config).await?;
+let admin = conn.get_admin()?;
+let table = conn.get_table(&table_path).await?;
+```
+
+Use the `?` operator to propagate errors, or `match` on specific variants for fine-grained handling.
+
+## Matching Error Variants
+
+```rust
+use fluss::error::Error;
+
+match result {
+    Ok(val) => {
+        // handle success
+    }
+    Err(Error::RpcError { message, .. }) => {
+        eprintln!("RPC failure: {}", message);
+    }
+    Err(Error::UnsupportedOperation { message }) => {
+        eprintln!("Unsupported: {}", message);
+    }
+    Err(Error::FlussAPIError { api_error }) => {
+        eprintln!("Server error: {}", api_error);
+    }
+    Err(e) => {
+        eprintln!("Unexpected error: {}", e);
+    }
+}
+```
+
+## Error Variants
+
+| Variant                        | Description                                                  |
+|--------------------------------|--------------------------------------------------------------|
+| `UnexpectedError`              | General unexpected errors with a message and optional source |
+| `IoUnexpectedError`            | I/O errors (network, file system)                            |
+| `RemoteStorageUnexpectedError` | Remote storage errors (OpenDAL backend failures)             |
+| `RpcError`                     | RPC communication failures (connection refused, timeout)     |
+| `RowConvertError`              | Row conversion failures (type mismatch, invalid data)        |
+| `ArrowError`                   | Arrow data handling errors (schema mismatch, encoding)       |
+| `IllegalArgument`              | Invalid arguments passed to an API method                    |
+| `UnsupportedOperation`         | Operation not supported on the table type                    |
+| `FlussAPIError`                | Server-side API errors returned by the Fluss cluster         |
+
+Server-side errors are represented as `FlussAPIError` with a specific error code. Use the `api_error()` helper to match them ergonomically:
+
+```rust
+use fluss::error::FlussError;
+
+match result {
+    Err(ref e) if e.api_error() == Some(FlussError::InvalidTableException) => {
+        eprintln!("Invalid table: {}", e);
+    }
+    Err(ref e) if e.api_error() == Some(FlussError::PartitionNotExists) => {
+        eprintln!("Partition does not exist: {}", e);
+    }
+    Err(ref e) if e.api_error() == Some(FlussError::LeaderNotAvailableException) => {
+        eprintln!("Leader not available: {}", e);
+    }
+    Err(ref e) if e.api_error() == Some(FlussError::AuthenticateException) => {
+        eprintln!("Authentication failed: {}", e);
+    }
+    _ => {}
+}
+```
+
+## Retry Logic
+
+Some errors are transient: the server may be temporarily unavailable, mid-election, or under load. Use `is_retriable()` to decide whether to retry an operation instead of treating the error as permanent.
+
+`Error::is_retriable()` is available directly on any `Error` value. `RpcError` is always retriable; `FlussAPIError` delegates to the server error code; all other variants return `false`.
+
+```rust
+use fluss::error::Error;
+
+match writer.append(&row) {
+    Ok(_) => {}
+    Err(ref e) if e.is_retriable() => {
+        // Transient failure — safe to retry
+    }
+    Err(e) => {
+        // Permanent failure — log and abort
+        eprintln!("Fatal error: {}", e);
+    }
+}
+```
+
+### Retriable Variants
+
+| Variant / Error                              | Code | Reason                                    |
+|----------------------------------------------|------|-------------------------------------------|
+| `Error::RpcError`                            | —    | Network-level failure, always retriable   |
+| `FlussError::NetworkException`               | 1    | Server disconnected                       |
+| `FlussError::CorruptMessage`                 | 3    | CRC or size error                         |
+| `FlussError::SchemaNotExist`                 | 9    | Schema may not exist                      |
+| `FlussError::LogStorageException`            | 10   | Transient log storage error               |
+| `FlussError::KvStorageException`             | 11   | Transient KV storage error                |
+| `FlussError::NotLeaderOrFollower`            | 12   | Leader election in progress               |
+| `FlussError::CorruptRecordException`         | 14   | Corrupt record                            |
+| `FlussError::UnknownTableOrBucketException`  | 21   | Metadata not yet available                |
+| `FlussError::RequestTimeOut`                 | 25   | Request timed out                         |
+| `FlussError::StorageException`               | 26   | Transient storage error                   |
+| `FlussError::NotEnoughReplicasAfterAppendException` | 28 | Wrote to server but with low ISR size |
+| `FlussError::NotEnoughReplicasException`     | 29   | Low ISR size at write time                |
+| `FlussError::LeaderNotAvailableException`    | 44   | No leader available for partition         |
+
+All other `Error` variants (e.g. `RowConvertError`, `IllegalArgument`, `UnsupportedOperation`) always return `false` from `is_retriable()`.
+
+## Common Error Scenarios
+
+### Connection Refused
+
+The Fluss cluster is not running or the address is incorrect.
+
+```rust
+let result = FlussConnection::new(config).await;
+match result {
+    Err(Error::RpcError { message, .. }) => {
+        eprintln!("Cannot connect to cluster: {}", message);
+    }
+    _ => {}
+}
+```
+
+### Table Not Found
+
+The table does not exist or has been dropped.
+
+```rust
+use fluss::error::{Error, FlussError};
+
+// Admin operations return FlussError::TableNotExist (code 7)
+let result = admin.drop_table(&table_path, false).await;
+match result {
+    Err(ref e) if e.api_error() == Some(FlussError::TableNotExist) => {
+        eprintln!("Table not found: {}", e);
+    }
+    _ => {}
+}
+
+// conn.get_table() wraps the error differently; match on FlussAPIError directly
+let result = conn.get_table(&table_path).await;
+match result {
+    Err(Error::FlussAPIError { ref api_error }) => {
+        eprintln!("Server error (code {}): {}", api_error.code, api_error.message);
+    }
+    _ => {}
+}
+```
+
+### Partition Not Found
+
+The partition does not exist on a partitioned table.
+
+```rust
+use fluss::error::FlussError;
+
+let result = admin.drop_partition(&table_path, &spec, false).await;
+match result {
+    Err(ref e) if e.api_error() == Some(FlussError::PartitionNotExists) => {
+        eprintln!("Partition does not exist: {}", e);
+    }
+    _ => {}
+}
+```
+
+### Authentication Failed
+
+SASL credentials are incorrect or the user does not exist.
+
+```rust
+use fluss::error::{Error, FlussError};
+
+let result = FlussConnection::new(config).await;
+match result {
+    Err(ref e) if e.api_error() == Some(FlussError::AuthenticateException) => {
+        eprintln!("Authentication failed: {}", e);
+    }
+    _ => {}
+}
+```
+
+### Schema Mismatch
+
+Row data does not match the expected table schema.
+
+```rust
+let result = writer.append(&row);
+match result {
+    Err(Error::RowConvertError { .. }) => {
+        eprintln!("Row does not match table schema");
+    }
+    _ => {}
+}
+```
+
+## Using `Result<T>` in Application Code
+
+The `fluss::error::Result<T>` type alias makes it easy to use Fluss errors with the `?` operator in your application functions:
+
+```rust
+use fluss::error::Result;
+
+async fn my_pipeline() -> Result<()> {
+    let conn = FlussConnection::new(config).await?;
+    let admin = conn.get_admin()?;
+    let table = conn.get_table(&table_path).await?;
+    let writer = table.new_append()?.create_writer()?;
+    writer.append(&row)?;
+    writer.flush().await?;
+    Ok(())
+}
+```
+
+For applications that use other error types alongside Fluss errors, you can convert with standard `From` / `Into` traits or use crates like `anyhow`:
+
+```rust
+use anyhow::Result;
+
+#[tokio::main]
+async fn main() -> Result<()> {
+    let conn = FlussConnection::new(config).await?;
+    // fluss::error::Error implements std::error::Error,
+    // so it converts into anyhow::Error automatically
+    Ok(())
+}
+```
diff --git a/website/docs/apis/rust/example/_category_.json b/website/docs/apis/rust/example/_category_.json
new file mode 100644
index 0000000000..4d81ec12ae
--- /dev/null
+++ b/website/docs/apis/rust/example/_category_.json
@@ -0,0 +1,4 @@
+{
+  "label": "Examples",
+  "position": 5
+}
diff --git a/website/docs/apis/rust/example/admin-operations.md b/website/docs/apis/rust/example/admin-operations.md
new file mode 100644
index 0000000000..39752754f1
--- /dev/null
+++ b/website/docs/apis/rust/example/admin-operations.md
@@ -0,0 +1,122 @@
+---
+sidebar_position: 3
+---
+# Admin Operations
+
+## Get Admin Interface
+
+```rust
+let admin = conn.get_admin()?;
+```
+
+## Database Operations
+
+```rust
+// Create database
+admin.create_database("my_database", None, true).await?;
+
+// List all databases
+let databases = admin.list_databases().await?;
+println!("Databases: {:?}", databases);
+
+// Check if database exists
+let exists = admin.database_exists("my_database").await?;
+
+// Get database information
+let db_info = admin.get_database_info("my_database").await?;
+
+// Drop database
+admin.drop_database("my_database", true, false).await?;
+```
+
+## Table Operations
+
+```rust
+use fluss::metadata::{DataTypes, Schema, TableDescriptor, TablePath};
+
+let table_descriptor = TableDescriptor::builder()
+    .schema(
+        Schema::builder()
+            .column("id", DataTypes::int())
+            .column("name", DataTypes::string())
+            .column("amount", DataTypes::bigint())
+            .build()?,
+    )
+    .build()?;
+
+let table_path = TablePath::new("my_database", "my_table");
+
+// Create table
+admin.create_table(&table_path, &table_descriptor, true).await?;
+
+// Get table information
+let table_info = admin.get_table_info(&table_path).await?;
+println!("Table: {}", table_info);
+
+// List tables in database
+let tables = admin.list_tables("my_database").await?;
+
+// Check if table exists
+let exists = admin.table_exists(&table_path).await?;
+
+// Drop table
+admin.drop_table(&table_path, true).await?;
+```
+
+## Partition Operations
+
+```rust
+use fluss::metadata::PartitionSpec;
+use std::collections::HashMap;
+
+// List all partitions
+let partitions = admin.list_partition_infos(&table_path).await?;
+
+// List partitions matching a spec
+let mut filter = HashMap::new();
+filter.insert("year", "2024");
+let spec = PartitionSpec::new(filter);
+let partitions = admin.list_partition_infos_with_spec(&table_path, Some(&spec)).await?;
+
+// Create partition
+admin.create_partition(&table_path, &spec, true).await?;
+
+// Drop partition
+admin.drop_partition(&table_path, &spec, true).await?;
+```
+
+## Offset Operations
+
+```rust
+use fluss::rpc::message::OffsetSpec;
+
+let bucket_ids = vec![0, 1, 2];
+
+// Get earliest offsets
+let earliest = admin.list_offsets(&table_path, &bucket_ids, OffsetSpec::Earliest).await?;
+
+// Get latest offsets
+let latest = admin.list_offsets(&table_path, &bucket_ids, OffsetSpec::Latest).await?;
+
+// Get offsets for a specific timestamp
+let timestamp_ms = 1704067200000; // 2024-01-01 00:00:00 UTC
+let offsets = admin.list_offsets(
+    &table_path, &bucket_ids, OffsetSpec::Timestamp(timestamp_ms),
+).await?;
+
+// Get offsets for a specific partition
+let partition_offsets = admin.list_partition_offsets(
+    &table_path, "partition_name", &bucket_ids, OffsetSpec::Latest,
+).await?;
+```
+
+## Lake Snapshot
+
+:::note
+Lake snapshots require [lake integration](https://fluss.apache.org/docs/maintenance/tiered-storage/overview/) (e.g. Paimon or Iceberg) to be enabled on the server. Without it, `get_latest_lake_snapshot` will return an error.
+:::
+
+```rust
+let snapshot = admin.get_latest_lake_snapshot(&table_path).await?;
+println!("Snapshot ID: {}", snapshot.snapshot_id);
+```
diff --git a/website/docs/apis/rust/example/configuration.md b/website/docs/apis/rust/example/configuration.md
new file mode 100644
index 0000000000..eba38d85f2
--- /dev/null
+++ b/website/docs/apis/rust/example/configuration.md
@@ -0,0 +1,35 @@
+---
+sidebar_position: 2
+---
+# Configuration
+
+## Connection Setup
+
+```rust
+use fluss::client::FlussConnection;
+use fluss::config::Config;
+
+let mut config = Config::default();
+config.bootstrap_servers = "127.0.0.1:9123".to_string();
+
+let conn = FlussConnection::new(config).await?;
+```
+
+## Connection Configurations
+
+See the [`Config`](../api-reference.md#config) section in the API Reference for the full list of configuration options, types, and defaults.
+
+## SASL Authentication
+
+To connect to a Fluss cluster with SASL/PLAIN authentication enabled:
+
+```rust
+let mut config = Config::default();
+config.bootstrap_servers = "127.0.0.1:9123".to_string();
+config.security_protocol = "sasl".to_string();
+config.security_sasl_mechanism = "PLAIN".to_string();
+config.security_sasl_username = "admin".to_string();
+config.security_sasl_password = "admin-secret".to_string();
+
+let conn = FlussConnection::new(config).await?;
+```
diff --git a/website/docs/apis/rust/example/index.md b/website/docs/apis/rust/example/index.md
new file mode 100644
index 0000000000..f1d5a6882d
--- /dev/null
+++ b/website/docs/apis/rust/example/index.md
@@ -0,0 +1,56 @@
+---
+sidebar_position: 1
+---
+# Example
+
+A minimal working example: connect to Fluss, create a table, write data, and read it back.
+
+```rust
+use fluss::client::FlussConnection;
+use fluss::config::Config;
+use fluss::error::Result;
+use fluss::metadata::{DataTypes, Schema, TableDescriptor, TablePath};
+use fluss::row::{GenericRow, InternalRow};
+use std::time::Duration;
+
+#[tokio::main]
+async fn main() -> Result<()> {
+    // Connect
+    let mut config = Config::default();
+    config.bootstrap_servers = "127.0.0.1:9123".to_string();
+    let conn = FlussConnection::new(config).await?;
+    let admin = conn.get_admin()?;
+
+    // Create a log table
+    let table_path = TablePath::new("fluss", "quickstart_rust");
+    let descriptor = TableDescriptor::builder()
+        .schema(
+            Schema::builder()
+                .column("id", DataTypes::int())
+                .column("name", DataTypes::string())
+                .build()?,
+        )
+        .build()?;
+    admin.create_table(&table_path, &descriptor, true).await?;
+
+    // Write
+    let table = conn.get_table(&table_path).await?;
+    let writer = table.new_append()?.create_writer()?;
+    let mut row = GenericRow::new(2);
+    row.set_field(0, 1);
+    row.set_field(1, "hello");
+    writer.append(&row)?;
+    writer.flush().await?;
+
+    // Read
+    let scanner = table.new_scan().create_log_scanner()?;
+    scanner.subscribe(0, 0).await?;
+    let records = scanner.poll(Duration::from_secs(5)).await?;
+    for record in records {
+        let row = record.row();
+        println!("id={}, name={}", row.get_int(0)?, row.get_string(1)?);
+    }
+
+    Ok(())
+}
+```
diff --git a/website/docs/apis/rust/example/log-tables.md b/website/docs/apis/rust/example/log-tables.md
new file mode 100644
index 0000000000..e77c8c6c43
--- /dev/null
+++ b/website/docs/apis/rust/example/log-tables.md
@@ -0,0 +1,172 @@
+---
+sidebar_position: 4
+---
+# Log Tables
+
+Log tables are append-only tables without primary keys, suitable for event streaming.
+
+## Creating a Log Table
+
+```rust
+use fluss::metadata::{DataTypes, Schema, TableDescriptor, TablePath};
+
+let table_descriptor = TableDescriptor::builder()
+    .schema(
+        Schema::builder()
+            .column("event_id", DataTypes::int())
+            .column("event_type", DataTypes::string())
+            .column("timestamp", DataTypes::bigint())
+            .build()?,
+    )
+    .build()?;
+
+let table_path = TablePath::new("fluss", "events");
+admin.create_table(&table_path, &table_descriptor, true).await?;
+```
+
+## Writing to Log Tables
+
+```rust
+use fluss::row::{GenericRow, InternalRow};
+
+let table = conn.get_table(&table_path).await?;
+let append_writer = table.new_append()?.create_writer()?;
+
+let mut row = GenericRow::new(3);
+row.set_field(0, 1);                    // event_id
+row.set_field(1, "user_login");         // event_type
+row.set_field(2, 1704067200000i64);     // timestamp
+
+append_writer.append(&row)?;
+append_writer.flush().await?;
+```
+
+Write operations use a **fire-and-forget** pattern for efficient batching. Each call queues the write and returns a `WriteResultFuture` immediately. Call `flush()` to ensure all queued writes are sent to the server.
+
+For per-record acknowledgment:
+
+```rust
+append_writer.append(&row)?.await?;
+```
+
+## Reading from Log Tables
+
+```rust
+use std::time::Duration;
+
+let table = conn.get_table(&table_path).await?;
+let log_scanner = table.new_scan().create_log_scanner()?;
+
+// Subscribe to bucket 0 starting from offset 0
+log_scanner.subscribe(0, 0).await?;
+
+// Poll for records
+let records = log_scanner.poll(Duration::from_secs(10)).await?;
+
+// Per-bucket access
+for (bucket, bucket_records) in records.records_by_buckets() {
+    println!("Bucket {}: {} records", bucket.bucket_id(), bucket_records.len());
+    for record in bucket_records {
+        let row = record.row();
+        println!(
+            "  event_id={}, event_type={} @ offset={}",
+            row.get_int(0)?,
+            row.get_string(1)?,
+            record.offset()
+        );
+    }
+}
+
+// Or flat iteration (consumes ScanRecords)
+for record in records {
+    let row = record.row();
+    println!(
+        "event_id={}, event_type={}, timestamp={} @ offset={}",
+        row.get_int(0)?,
+        row.get_string(1)?,
+        row.get_long(2)?,
+        record.offset()
+    );
+}
+```
+
+**Subscribe from special offsets:**
+
+```rust
+use fluss::client::EARLIEST_OFFSET;
+
+log_scanner.subscribe(0, EARLIEST_OFFSET).await?;  // from earliest
+log_scanner.subscribe(0, 42).await?;                // from specific offset
+```
+
+**Subscribe from latest offset (only new records):**
+
+To start reading only new records, first resolve the current latest offset via `list_offsets`, then subscribe at that offset:
+
+```rust
+use fluss::rpc::message::OffsetSpec;
+
+let admin = conn.get_admin()?;
+let offsets = admin.list_offsets(&table_path, &[0], OffsetSpec::Latest).await?;
+let latest = offsets[&0];
+log_scanner.subscribe(0, latest).await?;
+```
+
+**Subscribe to all buckets:**
+
+```rust
+let num_buckets = table.get_table_info().get_num_buckets();
+for bucket_id in 0..num_buckets {
+    log_scanner.subscribe(bucket_id, 0).await?;
+}
+```
+
+**Subscribe to multiple buckets at once:**
+
+```rust
+use std::collections::HashMap;
+
+let mut bucket_offsets = HashMap::new();
+bucket_offsets.insert(0, 0i64);
+bucket_offsets.insert(1, 100i64);
+log_scanner.subscribe_buckets(&bucket_offsets).await?;
+```
+
+**Unsubscribe from a bucket:**
+
+```rust
+// Non-partitioned tables
+log_scanner.unsubscribe(bucket_id).await?;
+
+// Partitioned tables
+log_scanner.unsubscribe_partition(partition_id, bucket_id).await?;
+```
+
+## Column Projection
+
+```rust
+// Project by column index
+let scanner = table.new_scan().project(&[0, 2])?.create_log_scanner()?;
+
+// Project by column name
+let scanner = table.new_scan()
+    .project_by_name(&["event_id", "timestamp"])?
+    .create_log_scanner()?;
+```
+
+## Limit Scan
+
+For a bounded read of up to `n` rows from a single bucket, use a batch scanner
+instead of subscribing. It issues one request; poll it with `next_batch` until
+it returns `None`.
+
+```rust
+let bucket = TableBucket::new(table.get_table_info().table_id, 0);
+let mut scanner = table.new_scan().limit(10)?.create_bucket_batch_scanner(bucket)?;
+
+while let Some(batch) = scanner.next_batch().await? {
+    println!("rows: {}", batch.batch().num_rows());
+}
+```
+
+Limit applies per bucket; scan each bucket to cover a multi-bucket table.
diff --git a/website/docs/apis/rust/example/partitioned-tables.md b/website/docs/apis/rust/example/partitioned-tables.md
new file mode 100644
index 0000000000..e583e06ead
--- /dev/null
+++ b/website/docs/apis/rust/example/partitioned-tables.md
@@ -0,0 +1,219 @@
+---
+sidebar_position: 6
+---
+# Partitioned Tables
+
+Partitioned tables distribute data across partitions based on partition column values, enabling efficient data organization and querying. Both log tables and primary key tables support partitioning.
+
+## Partitioned Log Tables
+
+### Creating a Partitioned Log Table
+
+```rust
+use fluss::metadata::{DataTypes, LogFormat, Schema, TableDescriptor, TablePath};
+
+let table_descriptor = TableDescriptor::builder()
+    .schema(
+        Schema::builder()
+            .column("event_id", DataTypes::int())
+            .column("event_type", DataTypes::string())
+            .column("dt", DataTypes::string())
+            .column("region", DataTypes::string())
+            .build()?,
+    )
+    .partitioned_by(vec!["dt", "region"])
+    .log_format(LogFormat::ARROW)
+    .build()?;
+
+let table_path = TablePath::new("fluss", "partitioned_events");
+admin.create_table(&table_path, &table_descriptor, true).await?;
+```
+
+### Writing to Partitioned Log Tables
+
+**Partitions must exist before writing data; otherwise, the client will retry indefinitely by default.** Include the partition column values in each row; the client routes records to the correct partition automatically.
+
+```rust
+use fluss::metadata::PartitionSpec;
+use std::collections::HashMap;
+
+let table = conn.get_table(&table_path).await?;
+
+// Create the partition before writing
+let mut partition_values = HashMap::new();
+partition_values.insert("dt", "2024-01-15");
+partition_values.insert("region", "US");
+admin.create_partition(&table_path, &PartitionSpec::new(partition_values), true).await?;
+
+let append_writer = table.new_append()?.create_writer()?;
+
+let mut row = GenericRow::new(4);
+row.set_field(0, 1);              // event_id
+row.set_field(1, "user_login");   // event_type
+row.set_field(2, "2024-01-15");   // dt (partition column)
+row.set_field(3, "US");           // region (partition column)
+
+append_writer.append(&row)?;
+append_writer.flush().await?;
+```
+
+### Reading from Partitioned Log Tables
+
+For partitioned tables, use partition-aware subscribe methods.
+
+```rust
+use std::time::Duration;
+
+let table = conn.get_table(&table_path).await?;
+let admin = conn.get_admin()?;
+let partitions = admin.list_partition_infos(&table_path).await?;
+
+let log_scanner = table.new_scan().create_log_scanner()?;
+
+// Subscribe to each partition's buckets
+for partition_info in &partitions {
+    let partition_id = partition_info.get_partition_id();
+    let num_buckets = table.get_table_info().get_num_buckets();
+    for bucket_id in 0..num_buckets {
+        log_scanner.subscribe_partition(partition_id, bucket_id, 0).await?;
+    }
+}
+
+let records = log_scanner.poll(Duration::from_secs(10)).await?;
+for record in records {
+    println!("Record: {:?}", record.row());
+}
+```
+
+Subscribe to multiple partition-buckets at once:
+
+```rust
+use std::collections::HashMap;
+
+let mut partition_bucket_offsets = HashMap::new();
+partition_bucket_offsets.insert((partition_id, 0), 0i64);
+partition_bucket_offsets.insert((partition_id, 1), 0i64);
+log_scanner.subscribe_partition_buckets(&partition_bucket_offsets).await?;
+```
+
+### Managing Partitions
+
+```rust
+use fluss::metadata::PartitionSpec;
+use std::collections::HashMap;
+
+// Create a partition
+let mut partition_values = HashMap::new();
+partition_values.insert("dt", "2024-01-15");
+partition_values.insert("region", "EMEA");
+let spec = PartitionSpec::new(partition_values);
+admin.create_partition(&table_path, &spec, true).await?;
+
+// List all partitions
+let partitions = admin.list_partition_infos(&table_path).await?;
+for partition in &partitions {
+    println!(
+        "Partition: id={}, name={}",
+        partition.get_partition_id(),
+        partition.get_partition_name()
+    );
+}
+
+// List with filter
+let mut partial_values = HashMap::new();
+partial_values.insert("dt", "2024-01-15");
+let partial_spec = PartitionSpec::new(partial_values);
+let filtered = admin.list_partition_infos_with_spec(
+    &table_path, Some(&partial_spec),
+).await?;
+
+// Drop a partition
+admin.drop_partition(&table_path, &spec, true).await?;
+```
+
+## Partitioned Primary Key Tables
+
+Partitioned KV tables combine partitioning with primary key operations. Partition columns must be part of the primary key.
+
+### Creating a Partitioned Primary Key Table
+
+```rust
+use fluss::metadata::{DataTypes, KvFormat, Schema, TableDescriptor, TablePath};
+
+let table_descriptor = TableDescriptor::builder()
+    .schema(
+        Schema::builder()
+            .column("user_id", DataTypes::int())
+            .column("region", DataTypes::string())
+            .column("zone", DataTypes::bigint())
+            .column("score", DataTypes::bigint())
+            .primary_key(vec!["user_id", "region", "zone"])
+            .build()?,
+    )
+    .partitioned_by(vec!["region", "zone"])
+    .kv_format(KvFormat::COMPACTED)
+    .build()?;
+
+let table_path = TablePath::new("fluss", "partitioned_users");
+admin.create_table(&table_path, &table_descriptor, true).await?;
+```
+
+### Writing to Partitioned Primary Key Tables
+
+**Partitions must exist before upserting data; otherwise, the client will retry indefinitely by default.**
+
+```rust
+use fluss::metadata::PartitionSpec;
+use std::collections::HashMap;
+
+let table = conn.get_table(&table_path).await?;
+
+// Create partitions first
+for (region, zone) in [("APAC", "1"), ("EMEA", "2"), ("US", "3")] {
+    let mut values = HashMap::new();
+    values.insert("region", region);
+    values.insert("zone", zone);
+    admin.create_partition(&table_path, &PartitionSpec::new(values), true).await?;
+}
+
+let table_upsert = table.new_upsert()?;
+let upsert_writer = table_upsert.create_writer()?;
+
+for (user_id, region, zone, score) in [
+    (1001, "APAC", 1i64, 1234i64),
+    (1002, "EMEA", 2, 2234),
+    (1003, "US", 3, 3234),
+] {
+    let mut row = GenericRow::new(4);
+    row.set_field(0, user_id);
+    row.set_field(1, region);
+    row.set_field(2, zone);
+    row.set_field(3, score);
+    upsert_writer.upsert(&row)?;
+}
+upsert_writer.flush().await?;
+```
+
+### Looking Up Records in Partitioned Tables
+
+Lookup requires all primary key columns, including the partition columns.
+
+```rust
+let mut lookuper = table.new_lookup()?.create_lookuper()?;
+
+let mut key = GenericRow::new(3);
+key.set_field(0, 1001);    // user_id
+key.set_field(1, "APAC");  // region (partition column)
+key.set_field(2, 1i64);    // zone (partition column)
+
+let result = lookuper.lookup(&key).await?;
+if let Some(row) = result.get_single_row()? {
+    println!("Found: score={}", row.get_long(3)?);
+}
+```
+
+### Prefix Lookup on Partitioned Tables
+
+See [Prefix Lookup — Partitioned Table](./prefix-lookup.md#partitioned-table) for details and a full runnable example.
+
+> **Note:** Scanning partitioned primary key tables is not supported. Use lookup operations instead.
diff --git a/website/docs/apis/rust/example/prefix-lookup.md b/website/docs/apis/rust/example/prefix-lookup.md
new file mode 100644
index 0000000000..619ba8341d
--- /dev/null
+++ b/website/docs/apis/rust/example/prefix-lookup.md
@@ -0,0 +1,110 @@
+---
+sidebar_position: 7
+---
+# Prefix Lookup
+
+Prefix lookup returns all rows whose primary key starts with a given prefix. It's enabled by choosing a **bucket key that is a strict prefix of the primary key** — rows sharing the same bucket-key prefix land in the same bucket, so one bucket lookup returns them all.
+
+## Table Requirements
+
+- The table must have a primary key.
+- The bucket key must be a strict prefix of the primary key (on partitioned tables, of the *non-partition* portion of the primary key).
+- The bucket key cannot equal the full primary key — that's a normal primary-key lookup, use [`Lookuper`](./primary-key-tables.md#looking-up-records) instead.
+- The `lookup_by` columns passed to the client must equal `partition_keys ++ bucket_key` (in that order, if partitioned).
+
+`create_lookuper()` validates these rules and returns `Err(Error::IllegalArgument { .. })` on mismatch, with a message describing the violation.
+
+## Non-Partitioned Table
+
+Pick a schema where the bucket key is a prefix of the primary key:
+
+```rust
+use fluss::metadata::{DataTypes, Schema, TableDescriptor, TablePath};
+
+let table_descriptor = TableDescriptor::builder()
+    .schema(
+        Schema::builder()
+            .column("user_id", DataTypes::int())
+            .column("session_id", DataTypes::string())
+            .column("event_seq", DataTypes::bigint())
+            .column("event_data", DataTypes::string())
+            .primary_key(vec!["user_id", "session_id", "event_seq"])
+            .build()?,
+    )
+    // Bucket key (user_id, session_id) is a prefix of the primary key.
+    .distributed_by(Some(3), vec!["user_id".to_string(), "session_id".to_string()])
+    .build()?;
+```
+
+Create the lookuper with `lookup_by(columns)` naming the prefix columns, then call `lookup(prefix_row)`:
+
+```rust
+use fluss::row::{GenericRow, InternalRow};
+
+let mut prefix_lookuper = table
+    .new_lookup()?
+    .lookup_by(vec!["user_id".to_string(), "session_id".to_string()])
+    .create_lookuper()?;
+
+let mut prefix = GenericRow::new(2);
+prefix.set_field(0, 1);                // user_id
+prefix.set_field(1, "sess-a");         // session_id
+
+let result = prefix_lookuper.lookup(&prefix).await?;
+for row in result.get_rows()? {
+    println!(
+        "seq={}, data={}",
+        row.get_long(2)?,
+        row.get_string(3)?,
+    );
+}
+```
+
+Unlike primary-key lookup (which uses `get_single_row()`), prefix lookup returns zero or more rows via `get_rows()`.
+
+## Partitioned Table
+
+On a partitioned table, the partition columns are stripped from the primary key before the bucket-prefix rule is evaluated. The lookup key, though, must still carry the partition values so the client can route the request to the right partition — so the `lookup_by` columns are `partition_keys ++ bucket_key`.
+
+```rust
+let table_descriptor = TableDescriptor::builder()
+    .schema(
+        Schema::builder()
+            .column("region", DataTypes::string())
+            .column("user_id", DataTypes::int())
+            .column("session_id", DataTypes::string())
+            .column("event_seq", DataTypes::bigint())
+            .column("event_data", DataTypes::string())
+            .primary_key(vec!["region", "user_id", "session_id", "event_seq"])
+            .build()?,
+    )
+    .partitioned_by(vec!["region"])
+    // Bucket key (user_id, session_id) is a prefix of the pk minus partition cols.
+    .distributed_by(Some(3), vec!["user_id".to_string(), "session_id".to_string()])
+    .build()?;
+```
+
+```rust
+let mut prefix_lookuper = table
+    .new_lookup()?
+    .lookup_by(vec![
+        "region".to_string(),
+        "user_id".to_string(),
+        "session_id".to_string(),
+    ])
+    .create_lookuper()?;
+
+let mut prefix = GenericRow::new(3);
+prefix.set_field(0, "US");             // region (partition column)
+prefix.set_field(1, 1);                // user_id
+prefix.set_field(2, "sess-a");         // session_id
+
+let result = prefix_lookuper.lookup(&prefix).await?;
+for row in result.get_rows()? {
+    println!(
+        "seq={}, data={}",
+        row.get_long(3)?,
+        row.get_string(4)?,
+    );
+}
+```
diff --git a/website/docs/apis/rust/example/primary-key-tables.md b/website/docs/apis/rust/example/primary-key-tables.md
new file mode 100644
index 0000000000..01836e29e4
--- /dev/null
+++ b/website/docs/apis/rust/example/primary-key-tables.md
@@ -0,0 +1,141 @@
+---
+sidebar_position: 5
+---
+# Primary Key Tables
+
+Primary key tables (KV tables) support upsert, delete, and lookup operations.
+
+## Creating a Primary Key Table
+
+```rust
+use fluss::metadata::{DataTypes, Schema, TableDescriptor, TablePath};
+
+let table_descriptor = TableDescriptor::builder()
+    .schema(
+        Schema::builder()
+            .column("id", DataTypes::int())
+            .column("name", DataTypes::string())
+            .column("age", DataTypes::bigint())
+            .primary_key(vec!["id"])
+            .build()?,
+    )
+    .build()?;
+
+let table_path = TablePath::new("fluss", "users");
+admin.create_table(&table_path, &table_descriptor, true).await?;
+```
+
+## Upserting Records
+
+```rust
+use fluss::row::{GenericRow, InternalRow};
+
+let table = conn.get_table(&table_path).await?;
+let table_upsert = table.new_upsert()?;
+let upsert_writer = table_upsert.create_writer()?;
+
+for (id, name, age) in [(1, "Alice", 25i64), (2, "Bob", 30), (3, "Charlie", 35)] {
+    let mut row = GenericRow::new(3);
+    row.set_field(0, id);
+    row.set_field(1, name);
+    row.set_field(2, age);
+    upsert_writer.upsert(&row)?;
+}
+upsert_writer.flush().await?;
+```
+
+## Updating Records
+
+Upsert with the same primary key to update an existing record.
+
+```rust
+let mut row = GenericRow::new(3);
+row.set_field(0, 1);        // id (primary key)
+row.set_field(1, "Alice");
+row.set_field(2, 26i64);    // updated age
+
+upsert_writer.upsert(&row)?;
+upsert_writer.flush().await?;
+```
+
+## Deleting Records
+
+```rust
+// Only primary key field needs to be set
+let mut row = GenericRow::new(3);
+row.set_field(0, 2);  // id of record to delete
+
+upsert_writer.delete(&row)?;
+upsert_writer.flush().await?;
+```
+
+## Partial Updates
+
+Update only specific columns while preserving others.
+
+```rust
+// By column indices
+let partial_upsert = table_upsert.partial_update(Some(vec![0, 2]))?;
+let partial_writer = partial_upsert.create_writer()?;
+
+let mut row = GenericRow::new(3);
+row.set_field(0, 1);       // id (primary key, required)
+row.set_field(2, 27i64);   // age (will be updated)
+// name will remain unchanged
+
+partial_writer.upsert(&row)?;
+partial_writer.flush().await?;
+
+// By column names
+let partial_upsert = table_upsert.partial_update_with_column_names(&["id", "age"])?;
+let partial_writer = partial_upsert.create_writer()?;
+```
+
+## Looking Up Records
+
+```rust
+let mut lookuper = table.new_lookup()?.create_lookuper()?;
+
+let mut key = GenericRow::new(1);
+key.set_field(0, 1);  // id to lookup
+
+let result = lookuper.lookup(&key).await?;
+
+if let Some(row) = result.get_single_row()? {
+    println!(
+        "Found: id={}, name={}, age={}",
+        row.get_int(0)?,
+        row.get_string(1)?,
+        row.get_long(2)?
+    );
+} else {
+    println!("Record not found");
+}
+```
+## Looking Up Records as Arrow RecordBatch
+
+Use `to_record_batch()` to get lookup results in Arrow format, for example when integrating with DataFusion.
+```rust
+let result = lookuper.lookup(&key).await?;
+let batch = result.to_record_batch()?;
+println!("Rows: {}", batch.num_rows());
+```
+
+## Prefix Lookup
+
+To fetch all rows sharing a common primary-key prefix (by choosing a bucket key that's a strict prefix of the primary key), see [Prefix Lookup](./prefix-lookup.md).
+
+## Limit Scan
+
+To read up to `n` rows of a bucket's current state without supplying keys, use a batch scanner. The server returns the deduplicated current rows as Arrow batches, which is convenient for previews or DataFusion sources.
+
+```rust
+let bucket = TableBucket::new(table.get_table_info().table_id, 0);
+let mut scanner = table.new_scan().limit(10)?.create_bucket_batch_scanner(bucket)?;
+
+while let Some(batch) = scanner.next_batch().await? {
+    println!("rows: {}", batch.batch().num_rows());
+}
+```
+
+Limit applies per bucket; scan each bucket to cover a multi-bucket table.
diff --git a/website/docs/apis/rust/installation.md b/website/docs/apis/rust/installation.md
new file mode 100644
index 0000000000..540d4a10a0
--- /dev/null
+++ b/website/docs/apis/rust/installation.md
@@ -0,0 +1,76 @@
+---
+sidebar_position: 1
+---
+# Installation
+
+The Fluss Rust client is published to [crates.io](https://crates.io/crates/fluss-rs) as `fluss-rs`. The crate's library name is `fluss`, so you import it with `use fluss::...`.
+
+```toml
+[dependencies]
+fluss-rs = "0.1.0"
+tokio = { version = "1", features = ["full"] }
+```
+
+## Feature Flags
+
+```toml
+[dependencies]
+# Default: memory and filesystem storage
+fluss-rs = "0.1.0"
+
+# With S3 storage support
+fluss-rs = { version = "0.1", features = ["storage-s3"] }
+
+# With OSS storage support
+fluss-rs = { version = "0.1", features = ["storage-oss"] }
+
+# All storage backends
+fluss-rs = { version = "0.1", features = ["storage-all"] }
+```
+
+Available features:
+- `storage-memory` (default): in-memory storage
+- `storage-fs` (default): local filesystem storage
+- `storage-s3`: Amazon S3 storage
+- `storage-oss`: Alibaba OSS storage
+- `storage-all`: all storage backends
+
+## Git or Path Dependency
+
+For development against unreleased changes:
+
+```toml
+[dependencies]
+# From Git
+fluss = { git = "https://github.com/apache/fluss-rust.git", package = "fluss-rs" }
+
+# From local path
+fluss = { path = "/path/to/fluss-rust/crates/fluss", package = "fluss-rs" }
+```
+
+> **Note:** When using `git` or `path` dependencies, the `package = "fluss-rs"` field is required so that Cargo resolves the correct package while still allowing `use fluss::...` imports.
+
+## Building from Source
+
+**Prerequisites:** Rust 1.85+, Protobuf compiler (`protoc`)
+
+```bash
+git clone https://github.com/apache/fluss-rust.git
+cd fluss-rust
+```
+
+Install `protoc`:
+
+```bash
+# macOS
+brew install protobuf
+
+# Ubuntu/Debian
+sudo apt-get install protobuf-compiler
+```
+
+Build:
+
+```bash
+cargo build --workspace --all-targets
+```
diff --git a/website/docs/table-design/merge-engines/aggregation.md b/website/docs/table-design/merge-engines/aggregation.md
index 4375404af0..71dabb654e 100644
--- a/website/docs/table-design/merge-engines/aggregation.md
+++ b/website/docs/table-design/merge-engines/aggregation.md
@@ -1166,4 +1166,4 @@ For detailed information about Exactly-Once implementation, please refer to: [FI
 - [FirstRow Merge Engine](table-design/merge-engines/first-row.md)
 - [Versioned Merge Engine](table-design/merge-engines/versioned.md)
 - [Primary Key Tables](table-design/table-types/pk-table.md)
-- [Fluss Client API](apis/java-client.md)
+- [Fluss Client API](../../apis/java/index.md)
diff --git a/website/docusaurus.config.ts b/website/docusaurus.config.ts
index 1f6225a759..1e8b4e81ad 100644
--- a/website/docusaurus.config.ts
+++ b/website/docusaurus.config.ts
@@ -325,7 +325,7 @@ const config: Config = {
     prism: {
       theme: lightTheme,
       darkTheme: darkTheme,
-      additionalLanguages: ['java', 'bash', 'scala']
+      additionalLanguages: ['java', 'bash', 'scala', 'rust', 'toml', 'cmake']
     },
     algolia: {
       appId: "X8KSGGLJW1",