diff --git a/.bazelrc b/.bazelrc index 0c3b6c31e..94c1b4a59 100644 --- a/.bazelrc +++ b/.bazelrc @@ -88,11 +88,13 @@ build --@rules_rust//:extra_rustc_flag=-Wtrivial_casts build --@rules_rust//:extra_rustc_flag=-Wtrivial_numeric_casts build --@rules_rust//:extra_rustc_flag=-Dunconditional_recursion build --@rules_rust//:extra_rustc_flag=-Dunexpected_cfgs +build --@rules_rust//:extra_rustc_flag=-Dunknown_lints build --@rules_rust//:extra_rustc_flag=-Dunnameable_test_items build --@rules_rust//:extra_rustc_flag=-Wunreachable_pub build --@rules_rust//:extra_rustc_flag=-Dunsafe_op_in_unsafe_fn build --@rules_rust//:extra_rustc_flag=-Dunstable_syntax_pre_expansion build --@rules_rust//:extra_rustc_flag=-Wunused_import_braces +build --@rules_rust//:extra_rustc_flag=-Dunused_imports build --@rules_rust//:extra_rustc_flag=-Wunused_lifetimes build --@rules_rust//:extra_rustc_flag=-Wunused_qualifications build --@rules_rust//:extra_rustc_flag=-Wvariant_size_differences @@ -102,22 +104,42 @@ build --@rules_rust//:clippy_flag=-Wclippy::nursery build --@rules_rust//:clippy_flag=-Wclippy::pedantic build --@rules_rust//:clippy_flag=-Dclippy::alloc_instead_of_core build --@rules_rust//:clippy_flag=-Dclippy::as_underscore +build --@rules_rust//:clippy_flag=-Dclippy::await_holding_lock +build --@rules_rust//:clippy_flag=-Dclippy::bind_instead_of_map +build --@rules_rust//:clippy_flag=-Dclippy::collapsible_if build --@rules_rust//:clippy_flag=-Wclippy::dbg_macro build --@rules_rust//:clippy_flag=-Wclippy::decimal_literal_representation +build --@rules_rust//:clippy_flag=-Dclippy::disallowed_methods +build --@rules_rust//:clippy_flag=-Dclippy::doc_markdown build --@rules_rust//:clippy_flag=-Dclippy::elidable_lifetime_names +build --@rules_rust//:clippy_flag=-Dclippy::explicit_into_iter_loop +build --@rules_rust//:clippy_flag=-Dclippy::future_not_send build --@rules_rust//:clippy_flag=-Aclippy::get_unwrap +build --@rules_rust//:clippy_flag=-Dclippy::implicit_clone +build 
--@rules_rust//:clippy_flag=-Dclippy::implicit_hasher +build --@rules_rust//:clippy_flag=-Dclippy::manual_is_variant_and +build --@rules_rust//:clippy_flag=-Dclippy::map_unwrap_or build --@rules_rust//:clippy_flag=-Dclippy::missing_const_for_fn build --@rules_rust//:clippy_flag=-Aclippy::missing_docs_in_private_items +build --@rules_rust//:clippy_flag=-Dclippy::or_fun_call build --@rules_rust//:clippy_flag=-Wclippy::print_stdout +build --@rules_rust//:clippy_flag=-Dclippy::ptr_arg +build --@rules_rust//:clippy_flag=-Dclippy::redundant_closure_for_method_calls build --@rules_rust//:clippy_flag=-Dclippy::semicolon_if_nothing_returned +build --@rules_rust//:clippy_flag=-Dclippy::single_char_pattern build --@rules_rust//:clippy_flag=-Dclippy::std_instead_of_core +build --@rules_rust//:clippy_flag=-Dclippy::string_lit_as_bytes build --@rules_rust//:clippy_flag=-Dclippy::todo build --@rules_rust//:clippy_flag=-Aclippy::too_long_first_doc_paragraph +build --@rules_rust//:clippy_flag=-Dclippy::unchecked_duration_subtraction build --@rules_rust//:clippy_flag=-Wclippy::unimplemented +build --@rules_rust//:clippy_flag=-Dclippy::unnecessary_semicolon build --@rules_rust//:clippy_flag=-Aclippy::unwrap_in_result build --@rules_rust//:clippy_flag=-Aclippy::unwrap_used build --@rules_rust//:clippy_flag=-Wclippy::use_debug -build --@rules_rust//:clippy_flag=-Aclippy::cast_possible_truncation +build --@rules_rust//:clippy_flag=-Dclippy::used_underscore_binding +build --@rules_rust//:clippy_flag=-Dclippy::useless_format +build --@rules_rust//:clippy_flag=-Dclippy::cast_possible_truncation build --@rules_rust//:clippy_flag=-Aclippy::cast_possible_wrap build --@rules_rust//:clippy_flag=-Aclippy::cast_precision_loss build --@rules_rust//:clippy_flag=-Aclippy::cast_sign_loss diff --git a/.github/actions/free-disk/action.yaml b/.github/actions/free-disk/action.yaml new file mode 100644 index 000000000..7e65fd0db --- /dev/null +++ b/.github/actions/free-disk/action.yaml @@ -0,0 +1,48 @@ 
+--- +name: Free up disk space +description: "Free up disk space on workers" +runs: + using: "composite" + steps: + - name: Free disk space + uses: >- # v3.1.0 + endersonmenezes/free-disk-space@e6ed9b02e683a3b55ed0252f1ee469ce3b39a885 + with: + rm_cmd: "rmz" # For speed up + remove_android: false # Takes too long. + remove_dotnet: true + remove_haskell: true + remove_tool_cache: false # TODO(palfrey): Do we really need this? + # Note: Not deleting google-cloud-cli because it takes too long. + remove_packages: > + azure-cli + microsoft-edge-stable + google-chrome-stable + firefox + postgresql* + temurin-* + *llvm* + mysql* + dotnet-sdk-* + remove_packages_one_command: true + remove_folders: > + /usr/share/swift + /usr/share/miniconda + /usr/share/az* + /usr/share/glade* + /usr/local/share/chromium + /usr/local/share/powershell + + - name: Delete platform specific items to free up disk space + shell: bash + run: | + if [ "$(uname)" = "Darwin" ]; then + echo "Deleting Applications" + sudo rm -rf ~/Applications/* + echo "Deleting all iOS simulators" + xcrun simctl delete all + echo "Deleting iOS Simulator caches" + sudo rm -rf ~/Library/Developer/CoreSimulator/Caches/* + else + echo "Nothing to do here." + fi diff --git a/.github/actions/prepare-nix/action.yaml b/.github/actions/prepare-nix/action.yaml index 64f51d110..889b91202 100644 --- a/.github/actions/prepare-nix/action.yaml +++ b/.github/actions/prepare-nix/action.yaml @@ -5,52 +5,21 @@ runs: using: "composite" steps: - name: Free disk space - uses: >- # v2.0.0 - endersonmenezes/free-disk-space@3f9ec39ebae520864ac93467ee395f5237585c21 - with: - remove_android: false # Takes too long. - remove_dotnet: true - remove_haskell: true - remove_tool_cache: false # TODO(palfrey): Do we really need this? - # Note: Not deleting google-cloud-cli because it takes too long. 
- remove_packages: > - azure-cli - microsoft-edge-stable - google-chrome-stable - firefox - postgresql* - temurin-* - *llvm* - mysql* - dotnet-sdk-* - remove_packages_one_command: true - remove_folders: > - /usr/share/swift - /usr/share/miniconda - /usr/share/az* - /usr/share/glade* - /usr/local/lib/node_modules - /usr/local/share/chromium - /usr/local/share/powershell - - - name: Delete platform specific items to free up disk space - shell: bash - run: | - if [ "$(uname)" = "Darwin" ]; then - echo "Deleting Applications" - sudo rm -rf ~/Applications/* - echo "Deleting all iOS simulators" - xcrun simctl delete all - echo "Deleting iOS Simulator caches" - sudo rm -rf ~/Library/Developer/CoreSimulator/Caches/* - else - echo "Nothing to do here." - fi + uses: ./.github/actions/free-disk - name: Install Nix - uses: >- # https://github.com/DeterminateSystems/nix-installer-action/releases/tag/v17 - DeterminateSystems/nix-installer-action@21a544727d0c62386e78b4befe52d19ad12692e3 + uses: >- # https://github.com/DeterminateSystems/nix-installer-action/releases/tag/v20 + DeterminateSystems/nix-installer-action@786fff0690178f1234e4e1fe9b536e94f5433196 + with: + source-tag: v3.13.0 + + # FIXME(palfrey): Replace with better cache. Workers are currently taking minutes to upload data + # all the time, probably because we're at ~500GB of 10GB in the cache storage and it's breaking. + # We've tried Flakehub, but it doesn't work for us because it assumes "branches on an org repo" + # not our "fork and branch on your own repo" setup for it's auth so we can't currently use that. 
- - name: Add Nix magic cache - uses: >- # https://github.com/DeterminateSystems/magic-nix-cache-action/releases/tag/v11 - DeterminateSystems/magic-nix-cache-action@def9f5a5c6a6b8751c0534e8813a5d0ad2635660 + # - name: Add Nix magic cache + # uses: >- # https://github.com/DeterminateSystems/magic-nix-cache-action/releases/tag/v13 + # DeterminateSystems/magic-nix-cache-action@565684385bcd71bad329742eefe8d12f2e765b39 + # with: + # source-tag: v0.1.6 diff --git a/.github/renovate.json5 b/.github/renovate.json5 index 4e8d8bb86..1c3eadae6 100644 --- a/.github/renovate.json5 +++ b/.github/renovate.json5 @@ -8,6 +8,7 @@ matchUpdateTypes: [ "patch", "minor", + "digest", ], enabled: false, }, diff --git a/.github/styles/config/vocabularies/TraceMachina/accept.txt b/.github/styles/config/vocabularies/TraceMachina/accept.txt index fed5748e3..61ecc2a8d 100644 --- a/.github/styles/config/vocabularies/TraceMachina/accept.txt +++ b/.github/styles/config/vocabularies/TraceMachina/accept.txt @@ -17,6 +17,8 @@ FFI FFIs GPUs Goma +gzip +[Hh]eatmap [Hh]ermeticity Istio JDK @@ -111,7 +113,16 @@ Trendshift Norwest Databricks Datadog +Downsampling Brex Citrix Menlo benchmarked +Thanos +Quickwit +[Mm]iddleware +queryable +gRPC +[Mm]itigations +[Pp]recompute +attrs diff --git a/.github/workflows/custom-image.yaml b/.github/workflows/custom-image.yaml new file mode 100644 index 000000000..2cd982ba8 --- /dev/null +++ b/.github/workflows/custom-image.yaml @@ -0,0 +1,141 @@ +name: Build Custom Docker Image + +on: + workflow_dispatch: + inputs: + image: + description: 'Image to build' + required: false + default: 'image' + type: choice + options: + - image + - nativelink-worker-init + - nativelink-worker-lre-cc + skip_signing: + description: 'Skip cosign signing' + required: false + default: true + type: boolean + + issue_comment: + types: [created] + +permissions: + contents: read + packages: write + pull-requests: write + id-token: write + +jobs: + check-trigger: + runs-on: ubuntu-latest + 
outputs: + should_build: ${{ steps.check.outputs.should_build }} + pr_sha: ${{ steps.check.outputs.pr_sha }} + image: ${{ steps.check.outputs.image }} + steps: + - name: Check trigger + id: check + uses: actions/github-script@v8 + with: + script: | + if (context.eventName === 'workflow_dispatch') { + core.setOutput('should_build', 'true'); + core.setOutput('pr_sha', context.sha); + core.setOutput('image', '${{ inputs.image }}'); + return; + } + + if (context.eventName === 'issue_comment') { + const body = context.payload.comment.body.trim(); + const isPR = !!context.payload.issue.pull_request; + + // Match /build-image or /build-image + const match = body.match(/^\/build-image(?:\s+(\S+))?/i); + + if (isPR && match) { + const { data: pr } = await github.rest.pulls.get({ + owner: context.repo.owner, + repo: context.repo.repo, + pull_number: context.payload.issue.number + }); + + const image = match[1] || 'image'; + const validImages = ['image', 'nativelink-worker-init', 'nativelink-worker-lre-cc']; + + if (!validImages.includes(image)) { + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.payload.issue.number, + body: `Unknown image: \`${image}\`\n\nValid options: ${validImages.map(i => `\`${i}\``).join(', ')}` + }); + core.setOutput('should_build', 'false'); + return; + } + + core.setOutput('should_build', 'true'); + core.setOutput('pr_sha', pr.head.sha); + core.setOutput('image', image); + + await github.rest.reactions.createForIssueComment({ + owner: context.repo.owner, + repo: context.repo.repo, + comment_id: context.payload.comment.id, + content: 'rocket' + }); + return; + } + } + + core.setOutput('should_build', 'false'); + + build-image: + name: Build and Push Image + needs: check-trigger + if: needs.check-trigger.outputs.should_build == 'true' + runs-on: ubuntu-24.04 + timeout-minutes: 45 + steps: + - name: Checkout + uses: actions/checkout@v6 + with: + ref: ${{ 
needs.check-trigger.outputs.pr_sha }} + + - name: Prepare Worker + uses: ./.github/actions/prepare-nix + + - name: Upload image + id: upload + run: | + GIT_HASH=$(git rev-parse --short HEAD) + nix run .#publish-ghcr ${{ needs.check-trigger.outputs.image }} "$GIT_HASH" + + IMAGE_NAME=$(nix eval .#${{ needs.check-trigger.outputs.image }}.imageName --raw) + echo "image_tag=ghcr.io/${{ github.repository_owner }}/${IMAGE_NAME}:${GIT_HASH}" >> $GITHUB_OUTPUT + env: + GHCR_REGISTRY: ghcr.io/${{ github.repository_owner }} + GHCR_USERNAME: ${{ github.actor }} + GHCR_PASSWORD: ${{ secrets.GITHUB_TOKEN }} + SKIP_SIGNING: "true" + SKIP_TRIVY: "true" + + - name: Output image info + run: | + echo "### Published Image" >> $GITHUB_STEP_SUMMARY + echo '```' >> $GITHUB_STEP_SUMMARY + echo "${{ steps.upload.outputs.image_tag }}" >> $GITHUB_STEP_SUMMARY + echo '```' >> $GITHUB_STEP_SUMMARY + + - name: Comment on PR + if: github.event_name == 'issue_comment' + uses: actions/github-script@v8 + with: + script: | + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.payload.issue.number, + body: `Image built and pushed!\n\n\`\`\`\n${{ steps.upload.outputs.image_tag }}\n\`\`\`` + }); diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index ba7801072..c2a0817dd 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -20,86 +20,6 @@ concurrency: cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} jobs: - nativelink-dot-com-cloud-rbe-main-legacy-dockerfile-test: - runs-on: ubuntu-24.04 - environment: production - name: NativeLink.com Cloud / RBE on Main (Legacy Dockerfile Test) - if: github.ref == 'refs/heads/main' - steps: - - name: Checkout - uses: >- # v4.2.2 - actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 - - - name: Set up AWS CLI - uses: >- # v4.1.0 - aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 - with: - aws-access-key-id: 
${{ secrets.RBE_ECR_AWS_ACCESS_KEY_ID }} - aws-secret-access-key: ${{ secrets.RBE_ECR_AWS_SECRET_ACCESS_KEY }} - aws-region: ${{ secrets.RBE_ECR_AWS_ACCOUNT_REGION }} - - - name: Calculate Dockerfile hash and Retrieve Image URI for RBE - run: | - DOCKERFILE_HASH=$(sha256sum "$GITHUB_WORKSPACE/tools/toolchain-nativelink/Dockerfile" | awk '{print $1}') - IMAGE_DETAILS=$(aws ecr describe-images --repository-name ${{ secrets.RBE_ECR_REPOSITORY_NAME }} --image-ids imageTag=$DOCKERFILE_HASH) - if [ $? -ne 0 ]; then - echo "Run tools/toolchain-nativelink/toolchain-nativelink.sh locally and upload a new version of the stock image" - exit 1; - fi - echo "RBE_IMAGE=${{ secrets.RBE_ECR_AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.RBE_ECR_AWS_ACCOUNT_REGION }}.amazonaws.com/${{ secrets.RBE_ECR_REPOSITORY_NAME }}:$DOCKERFILE_HASH" >> $GITHUB_ENV - - - name: Setup Bazel - uses: >- # v0.13.0 - bazel-contrib/setup-bazel@663f88d97adf17db2523a5b385d9407a562e5551 - with: - bazelisk-cache: true - repository-cache: true - - - name: Run Bazel tests - shell: bash - # remove digest_function when #1325 is resolved - run: | - bazel --digest_function=sha256 test \ - --remote_cache=grpcs://tm-ci-cas.build-faster.nativelink.net \ - --remote_header=x-nativelink-api-key=${{ secrets.NATIVELINK_COM_API_HEADER }} \ - --bes_backend=grpcs://tm-ci-bep.build-faster.nativelink.net \ - --bes_header=x-nativelink-api-key=${{ secrets.NATIVELINK_COM_API_HEADER }} \ - --bes_results_url=https://tm-ci-web.build-faster.nativelink.net/ \ - --remote_header=x-nativelink-project=nativelink-ci \ - --remote_executor=grpcs://tm-ci-cas.build-faster.nativelink.net \ - --remote_default_exec_properties="container-image=docker://$RBE_IMAGE" \ - --jobs=200 \ - //... 
- - nativelink-dot-com-cloud-cache-test: - strategy: - fail-fast: false - matrix: - os: [ubuntu-24.04, macos-15] - runs-on: ${{ matrix.os }} - environment: production - name: NativeLink.com Cloud / Remote Cache / ${{ matrix.os }} - env: - NL_COM_API_KEY: ${{ secrets.NATIVELINK_COM_API_HEADER || '065f02f53f26a12331d5cfd00a778fb243bfb4e857b8fcd4c99273edfb15deae' }} - steps: - - name: Checkout - uses: >- # v4.2.2 - actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 - - - name: Prepare Worker - uses: ./.github/actions/prepare-nix - - - name: Run Bazel tests - run: > - nix develop --impure --command - bash -c "bazel test \ - --remote_header=x-nativelink-api-key=$NL_COM_API_KEY \ - --bes_backend=grpcs://tm-ci-bep.build-faster.nativelink.net \ - --bes_header=x-nativelink-api-key=$NL_COM_API_KEY \ - --bes_results_url=https://tm-ci-web.build-faster.nativelink.net/ \ - ${{ github.ref == 'refs/heads/main' && '--remote_upload_local_results=true' || '--nogenerate_json_trace_profile --remote_upload_local_results=false' }} \ - //..." - # TODO(palfrey): Flaky. Fix. # docker-compose-compiles-nativelink: # # The type of runner that the job will run on. 
diff --git a/.github/workflows/native-bazel.yaml b/.github/workflows/native-bazel.yaml index b7d80606a..4d48c1ab3 100644 --- a/.github/workflows/native-bazel.yaml +++ b/.github/workflows/native-bazel.yaml @@ -33,9 +33,12 @@ jobs: uses: >- # v4.2.2 actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 + - name: Free disk space + uses: ./.github/actions/free-disk + - name: Setup Bazel - uses: >- # v0.13.0 - bazel-contrib/setup-bazel@663f88d97adf17db2523a5b385d9407a562e5551 + uses: >- # v0.18.0 + bazel-contrib/setup-bazel@083175551ceeceebc757ebee2127fde78840ca77 with: bazelisk-cache: true repository-cache: true @@ -59,3 +62,57 @@ jobs: exit 1 fi shell: bash + + redis-store-tester: + name: Redis store tester + runs-on: ubuntu-24.04 + timeout-minutes: 30 + steps: + - name: Checkout + uses: >- # v4.2.2 + actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 + + - uses: hoverkraft-tech/compose-action@3846bcd61da338e9eaaf83e7ed0234a12b099b72 # v2.4.1 + with: + compose-file: src/bin/docker-compose.store-tester.yaml + + - name: Setup Bazel + uses: >- # v0.13.0 + bazel-contrib/setup-bazel@663f88d97adf17db2523a5b385d9407a562e5551 + with: + bazelisk-cache: true + repository-cache: true + disk-cache: ${{ github.workflow }}-ubuntu-24.04 + + - name: Run Store tester with sentinel + run: | + bazel run //:redis_store_tester \ + --extra_toolchains=@rust_toolchains//:all \ + --verbose_failures -- --redis-mode sentinel --mode sequential + env: + RUST_LOG: trace + REDIS_HOST: localhost + MAX_LOOPS: 10 # running sequentially just to test all the actions work + shell: bash + + - name: Run Store tester with standard + run: | + bazel run //:redis_store_tester \ + --extra_toolchains=@rust_toolchains//:all \ + --verbose_failures -- --redis-mode standard --mode sequential + env: + RUST_LOG: trace + REDIS_HOST: localhost + MAX_LOOPS: 10 # running sequentially just to test all the actions work + shell: bash + + - name: Run Store tester with cluster + run: | + bazel run 
//:redis_store_tester \ + --extra_toolchains=@rust_toolchains//:all \ + --verbose_failures -- --redis-mode cluster --mode sequential + env: + RUST_LOG: trace + REDIS_HOST: localhost + MAX_LOOPS: 10 # running sequentially just to test all the actions work + shell: bash diff --git a/.github/workflows/native-cargo.yaml b/.github/workflows/native-cargo.yaml index 4e180ecec..eb6d2a6df 100644 --- a/.github/workflows/native-cargo.yaml +++ b/.github/workflows/native-cargo.yaml @@ -42,7 +42,8 @@ jobs: shell: bash - name: Rust cache - uses: Swatinem/rust-cache@v2 + # https://github.com/Swatinem/rust-cache/releases/tag/v2.8.1 + uses: Swatinem/rust-cache@a84bfdc502f07db5a85dd9d7a30f91a931516cc5 - name: Build on ${{ runner.os }} run: cargo build --all --profile=smol @@ -50,6 +51,5 @@ jobs: - name: Test on ${{ runner.os }} run: cargo test --all --profile=smol - # Not a default target, but need to make sure we don't actually break it - - name: Test worker_find_logging - run: cargo build --features worker_find_logging --all-targets + - name: Check schema export + run: cargo run --bin build-schema --features dev-schema --package nativelink-config diff --git a/.github/workflows/nix.yaml b/.github/workflows/nix.yaml index d695dbedb..a6fb36021 100644 --- a/.github/workflows/nix.yaml +++ b/.github/workflows/nix.yaml @@ -97,7 +97,7 @@ jobs: name: ${{ matrix.test-name }} strategy: matrix: - test-name: [buildstream, mongo] + test-name: [buildstream, mongo, buck2, rbe-toolchain] runs-on: ubuntu-24.04 timeout-minutes: 45 steps: diff --git a/.github/workflows/tagged_image.yaml b/.github/workflows/tagged_image.yaml index dfecdfb93..1a9f65258 100644 --- a/.github/workflows/tagged_image.yaml +++ b/.github/workflows/tagged_image.yaml @@ -16,7 +16,7 @@ jobs: strategy: fail-fast: false matrix: - image: [image, nativelink-worker-init, nativelink-worker-lre-cc] + image: [image, nativelink-worker-init] runs-on: ubuntu-24.04 permissions: packages: write diff --git a/.github/workflows/templates.yaml 
b/.github/workflows/templates.yaml deleted file mode 100644 index b8fa2c721..000000000 --- a/.github/workflows/templates.yaml +++ /dev/null @@ -1,63 +0,0 @@ ---- -name: Templates - -on: - push: - branches: [main] - paths-ignore: - - '.github/styles/**' - - 'web/**' - pull_request: - branches: [main] - paths-ignore: - - '.github/styles/**' - - 'web/**' - -permissions: read-all - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} - -jobs: - remote-execution: - strategy: - fail-fast: false - matrix: - os: [ubuntu-24.04, macos-15] - template: [bazel] - exclude: - - os: macos-15 - name: Local / ${{ matrix.template }} / ${{ matrix.os }} - runs-on: ${{ matrix.os }} - environment: production - timeout-minutes: 45 - steps: - - name: Checkout - uses: >- # v4.2.2 - actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 - with: - path: nativelink - - - name: Prepare Worker - uses: ./nativelink/.github/actions/prepare-nix - - - name: Build ${{ matrix.template }} examples - env: - TEMPLATE: ${{ matrix.template }} - NL_COM_API_KEY: ${{ secrets.NATIVELINK_COM_API_HEADER || '065f02f53f26a12331d5cfd00a778fb243bfb4e857b8fcd4c99273edfb15deae' }} - run: | - mkdir ${TEMPLATE} - cd ${TEMPLATE} - nix flake init -t ../nativelink#${TEMPLATE} - rm user.bazelrc - git init - git add . - nix develop -c bazel build \ - --verbose_failures \ - --remote_header=x-nativelink-api-key=$NL_COM_API_KEY \ - --bes_backend=grpcs://tm-ci-bep.build-faster.nativelink.net \ - --bes_header=x-nativelink-api-key=$NL_COM_API_KEY \ - --bes_results_url=https://tm-ci-web.build-faster.nativelink.net \ - ${{ github.ref == 'refs/heads/main' && '--remote_cache=grpcs://tm-ci-cas.build-faster.nativelink.net --remote_executor=grpcs://tm-ci-cas.build-faster.nativelink.net' || '' }} \ - //... 
diff --git a/.gitignore b/.gitignore index 3e1fe02ac..64e1dd1a1 100644 --- a/.gitignore +++ b/.gitignore @@ -22,4 +22,6 @@ nixos.bazelrc rust-project.json darwin.bazelrc nativelink.bazelrc -integration_tests/**/*.log +*.log +buck-out/ +nativelink_config.schema.json diff --git a/BUILD.bazel b/BUILD.bazel index ff9bd61fc..a60441b09 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -14,8 +14,6 @@ rust_binary( srcs = [ "src/bin/nativelink.rs", ], - # Enable this to get extra debug about workers that are not being used by the CAS - # crate_features = ["worker_find_logging"], deps = [ "//nativelink-config", "//nativelink-error", @@ -32,7 +30,7 @@ rust_binary( "@crates//:hyper-util", "@crates//:mimalloc", "@crates//:parking_lot", - "@crates//:rustls-pemfile", + "@crates//:rustls-pki-types", "@crates//:tokio", "@crates//:tokio-rustls", "@crates//:tonic", @@ -41,6 +39,45 @@ rust_binary( ], ) +rust_binary( + name = "redis_store_tester", + srcs = [ + "src/bin/redis_store_tester.rs", + ], + deps = [ + "//nativelink-config", + "//nativelink-error", + "//nativelink-store", + "//nativelink-util", + "@crates//:bytes", + "@crates//:clap", + "@crates//:futures", + "@crates//:rand", + "@crates//:redis", + "@crates//:tokio", + "@crates//:tracing", + ], +) + +rust_binary( + name = "cas_speed_check", + srcs = [ + "src/bin/cas_speed_check.rs", + ], + deps = [ + "//nativelink-error", + "//nativelink-proto", + "//nativelink-util", + "@crates//:clap", + "@crates//:hex", + "@crates//:rand", + "@crates//:sha2", + "@crates//:tokio", + "@crates//:tonic", + "@crates//:tracing", + ], +) + filegroup( name = "docs", srcs = [ diff --git a/CHANGELOG.md b/CHANGELOG.md index 171c044f3..07be0c908 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,273 @@ All notable changes to this project will be documented in this file. 
-## [0.7.3](https://github.com/TraceMachina/nativelink/compare/v0.7.2..0.7.3) - 2025-10-10 +## [1.0.0-rc4](https://github.com/TraceMachina/nativelink/compare/v0.7.10..1.0.0-rc4) - 2026-03-13 + + + +### ⛰️ Features + +- Add debug info to connection manager queues ([#2188](https://github.com/TraceMachina/nativelink/issues/2188)) - ([6b6efcf](https://github.com/TraceMachina/nativelink/commit/6b6efcfdfd0488ebb927910e6ee4ef14790f7716)) +- Add json schema ([#2193](https://github.com/TraceMachina/nativelink/issues/2193)) - ([d926c47](https://github.com/TraceMachina/nativelink/commit/d926c4756a830e38c9b162c388e6fafcba091da7)) +- Add boolean and optional data size shellexpands ([#2172](https://github.com/TraceMachina/nativelink/issues/2172)) - ([e54a0c3](https://github.com/TraceMachina/nativelink/commit/e54a0c3e55b54f4b5c51fd67db5541ba01081224)) +- Add Max Concurrent Writes ([#2156](https://github.com/TraceMachina/nativelink/issues/2156)) - ([3a90838](https://github.com/TraceMachina/nativelink/commit/3a90838081e3e6a14d13ee231075492256753d22)) +- Add logs for stall detection ([#2155](https://github.com/TraceMachina/nativelink/issues/2155)) - ([94e7e3f](https://github.com/TraceMachina/nativelink/commit/94e7e3f134f2586aa89384e6088544a83dba2694)) +- Add Max action executing timeouts to scheduler ([#2153](https://github.com/TraceMachina/nativelink/issues/2153)) - ([5549a96](https://github.com/TraceMachina/nativelink/commit/5549a969bd7be1f10b94dc725ae6dcd68dd00130)) +- Add GRPC timeouts and other improvements to detect dead connections ([#2152](https://github.com/TraceMachina/nativelink/issues/2152)) - ([b4b44ba](https://github.com/TraceMachina/nativelink/commit/b4b44ba6db8b830d05de2d6180d0c452836eeea2)) +- Allows setting environment variables from the environment ([#2143](https://github.com/TraceMachina/nativelink/issues/2143)) - ([a57c771](https://github.com/TraceMachina/nativelink/commit/a57c7714b868e5b22bdcb7736e370ea454f5c843)) +- Add Max Upload timeout to CAS 
([#2150](https://github.com/TraceMachina/nativelink/issues/2150)) - ([24cc324](https://github.com/TraceMachina/nativelink/commit/24cc324b21de72d8079fc7e54e5dc4abf678c0bd)) +- Add tracing to hyper-util ([#2132](https://github.com/TraceMachina/nativelink/issues/2132)) - ([bc773dc](https://github.com/TraceMachina/nativelink/commit/bc773dc3d43ff208e996e97547528c5b111abd14)) +- Add worker config option to limit maximum inflight tasks ([#2125](https://github.com/TraceMachina/nativelink/issues/2125)) - ([1821bec](https://github.com/TraceMachina/nativelink/commit/1821bec1cd888b4440368504678be64aa43d37e3)) +- Add additional logging around worker property matching ([#2118](https://github.com/TraceMachina/nativelink/issues/2118)) - ([24c637a](https://github.com/TraceMachina/nativelink/commit/24c637ab86b44864787bf7b789d6bf29b98df87f)) + +### 🐛 Bug Fixes + +- *(deps)* update module github.com/go-git/go-git/v5 to v5.16.5 [security] ([#2138](https://github.com/TraceMachina/nativelink/issues/2138)) - ([dc25843](https://github.com/TraceMachina/nativelink/commit/dc258438336ba6ab5e63c0a48e71987bb88b4621)) +- Handle correctly subscription messages ([#2201](https://github.com/TraceMachina/nativelink/issues/2201)) - ([2ea428b](https://github.com/TraceMachina/nativelink/commit/2ea428bfc66e9f7303108141e3a5ee9a6e84dc0d)) +- Fix Redis to reconnect in Sentinel (Chris Staite) ([#2190](https://github.com/TraceMachina/nativelink/issues/2190)) - ([8783134](https://github.com/TraceMachina/nativelink/commit/87831340af3cfcb3cffbc4f43bc3da9ecf8c8467)) +- Fix worker inflight tasks heading ([#2177](https://github.com/TraceMachina/nativelink/issues/2177)) - ([8ae17ba](https://github.com/TraceMachina/nativelink/commit/8ae17bae0603d66102d171554f331e10a3e9ac9e)) +- Fix all the current clippy lints ([#2174](https://github.com/TraceMachina/nativelink/issues/2174)) - ([23611ca](https://github.com/TraceMachina/nativelink/commit/23611caa3966a1934d6a3a7da0007083bbc75d8b)) +- Fix integer overflow in 
compression_store.rs data retrieval logic ([#2151](https://github.com/TraceMachina/nativelink/issues/2151)) - ([f996507](https://github.com/TraceMachina/nativelink/commit/f996507b152a7a5e79367475e7854680cce3eb2c)) +- Fix Max Inflight Workers job acceptance ([#2142](https://github.com/TraceMachina/nativelink/issues/2142)) - ([6ffab5f](https://github.com/TraceMachina/nativelink/commit/6ffab5f049666158b14e277653d8ce6b487c2ff6)) +- Fix Redis index creation race ([#2111](https://github.com/TraceMachina/nativelink/issues/2111)) - ([c3a497d](https://github.com/TraceMachina/nativelink/commit/c3a497d36df49d3a1caadede02c4cc6d5af87492)) + +### 📚 Documentation + +- Document max concurrent writes ([#2169](https://github.com/TraceMachina/nativelink/issues/2169)) - ([cedba0e](https://github.com/TraceMachina/nativelink/commit/cedba0e829daeb6affa601324ca7eacdcd4e7fea)) +- Document RPC timeouts in Redis config ([#2168](https://github.com/TraceMachina/nativelink/issues/2168)) - ([f0d12ff](https://github.com/TraceMachina/nativelink/commit/f0d12ffce777662eb23f898042393a2fac8f2952)) +- Document max inflight tasks ([#2167](https://github.com/TraceMachina/nativelink/issues/2167)) - ([2650680](https://github.com/TraceMachina/nativelink/commit/26506800e0bddfe9dd35008dfda279a2b19604df)) +- Add docs for configuring Worker Match Logging Interval ([#2103](https://github.com/TraceMachina/nativelink/issues/2103)) - ([ae963be](https://github.com/TraceMachina/nativelink/commit/ae963be97178284a1aa53b526a3fa3292ca12e2a)) + +### 🧪 Testing & CI + +- Fix Fast slow store Not Found error by returning failed precondition ([#2194](https://github.com/TraceMachina/nativelink/issues/2194)) - ([3354945](https://github.com/TraceMachina/nativelink/commit/3354945b1f0cb9aba7041ad6ffad0bb67def8d4f)) +- Flake update fixes ([#2192](https://github.com/TraceMachina/nativelink/issues/2192)) - ([a7d873a](https://github.com/TraceMachina/nativelink/commit/a7d873aca54ae62f0ce13fbbf3dc7817f9f82efa)) +- pre-commit rustfmt all 
files ([#2176](https://github.com/TraceMachina/nativelink/issues/2176)) - ([27fa965](https://github.com/TraceMachina/nativelink/commit/27fa9652baf9ed7cdbc248fd6591bf813a790f65)) +- Every bytestream_read had a debug log, which we don't need ([#2117](https://github.com/TraceMachina/nativelink/issues/2117)) - ([18360ad](https://github.com/TraceMachina/nativelink/commit/18360ada6e5e3ecc04a7f6f96fbae09cf919111b)) + +### ⚙️ Miscellaneous + +- *(deps)* update rust crate toml to v1 ([#2147](https://github.com/TraceMachina/nativelink/issues/2147)) - ([85e9ecf](https://github.com/TraceMachina/nativelink/commit/85e9ecf05e1e6646513f4b32a8ce1fba609ebcf7)) +- *(deps)* update rust crate bytes to v1.11.1 [security] ([#2134](https://github.com/TraceMachina/nativelink/issues/2134)) - ([5d32d18](https://github.com/TraceMachina/nativelink/commit/5d32d181fe68d29bf354a2a5f41e634d8faaec37)) +- empty find_missing_blobs can return immediately ([#2217](https://github.com/TraceMachina/nativelink/issues/2217)) - ([dad870a](https://github.com/TraceMachina/nativelink/commit/dad870a41d70208b88b395d6f4121f3d4e1b8828)) +- remove free cloud user ([#2199](https://github.com/TraceMachina/nativelink/issues/2199)) - ([c7109f6](https://github.com/TraceMachina/nativelink/commit/c7109f6d70e049a011c367dfe4018b5cea675b9e)) +- Only display Baggage enduser.id when identity is present ([#2197](https://github.com/TraceMachina/nativelink/issues/2197)) - ([86b86e1](https://github.com/TraceMachina/nativelink/commit/86b86e15e8dcc3936a07d22feb10d088dc9ad4ae)) +- Prevent retry loop large uploads ([#2195](https://github.com/TraceMachina/nativelink/issues/2195)) - ([2a2ca64](https://github.com/TraceMachina/nativelink/commit/2a2ca6496af559a91207de3e384e338111138fd1)) +- If all workers are fully allocated, shortcut find workers ([#2130](https://github.com/TraceMachina/nativelink/issues/2130)) - ([faad8bb](https://github.com/TraceMachina/nativelink/commit/faad8bb038fefc439daca73978138b821084648c)) +- Log NotFound as info, 
not error ([#2171](https://github.com/TraceMachina/nativelink/issues/2171)) - ([4ca9d7b](https://github.com/TraceMachina/nativelink/commit/4ca9d7b3d3e29e392d7b39b2ff509cb1b75cf5aa)) +- Dummy streams should be pending, not empty ([#2154](https://github.com/TraceMachina/nativelink/issues/2154)) - ([e72b5a0](https://github.com/TraceMachina/nativelink/commit/e72b5a0feaace00ee9960886d3c2715eeb76c361)) +- fix metrics ([#2097](https://github.com/TraceMachina/nativelink/issues/2097)) - ([e6c7097](https://github.com/TraceMachina/nativelink/commit/e6c70977a879d552b98ebc2cb23717ab51658a2a)) +- Advise the kernel to drop page cache ([#2149](https://github.com/TraceMachina/nativelink/issues/2149)) - ([727760d](https://github.com/TraceMachina/nativelink/commit/727760d1e208ca8be7bc134f432baf5dc5bf5928)) +- Replace Fred with redis-rs ([#2076](https://github.com/TraceMachina/nativelink/issues/2076)) - ([4956889](https://github.com/TraceMachina/nativelink/commit/4956889cd258a98f0e8720b5b7ef028ca0ed4d99)) +- No workers logging ([#2137](https://github.com/TraceMachina/nativelink/issues/2137)) - ([12c63f5](https://github.com/TraceMachina/nativelink/commit/12c63f50fef02bf36624ac0770fc8f5dac407a9c)) +- Make update_with_whole_file logging default to trace ([#2131](https://github.com/TraceMachina/nativelink/issues/2131)) - ([ecd2903](https://github.com/TraceMachina/nativelink/commit/ecd2903f8ca5086e10f74290533a9fc75c580a7c)) +- Be clearer about what property values workers are missing ([#2121](https://github.com/TraceMachina/nativelink/issues/2121)) - ([85385e6](https://github.com/TraceMachina/nativelink/commit/85385e68271d78b2b72a24098202aade157a5553)) +- Correct ignore handling for PlatformProperties ([#2126](https://github.com/TraceMachina/nativelink/issues/2126)) - ([8c3bacb](https://github.com/TraceMachina/nativelink/commit/8c3bacb0e95525c68e2ec7c2e90208fa383bd81d)) +- output_files can be very noisy, drop from debug ([#2123](https://github.com/TraceMachina/nativelink/issues/2123)) - 
([3ed406f](https://github.com/TraceMachina/nativelink/commit/3ed406faa9c116485218f1c5aa6340d5b9e312c4)) +- Support ignorable platform properties ([#2120](https://github.com/TraceMachina/nativelink/issues/2120)) - ([1b45027](https://github.com/TraceMachina/nativelink/commit/1b450275c8d826c8124be121b62e61c67a2cad38)) +- Reduce logging level for "Dropping file to update_file" ([#2116](https://github.com/TraceMachina/nativelink/issues/2116)) - ([95a8a34](https://github.com/TraceMachina/nativelink/commit/95a8a3438968ab082a38c343d708dd2a70ee74ed)) +- Pull MAX_COUNT_PER_CURSOR into redis config, not hardcoding ([#2112](https://github.com/TraceMachina/nativelink/issues/2112)) - ([5b043eb](https://github.com/TraceMachina/nativelink/commit/5b043eb08ec46518db7784c6cfd9c47ae7fcc93d)) +- Test redis improvements with client drop and higher max count per cursor ([#2110](https://github.com/TraceMachina/nativelink/issues/2110)) - ([bed6f9a](https://github.com/TraceMachina/nativelink/commit/bed6f9a8acf45da17fbd56d12202413360204218)) + +### ⬆️ Bumps & Version Updates + +- *(deps)* update rust crate lru to 0.16.0 [security] ([#2106](https://github.com/TraceMachina/nativelink/issues/2106)) - ([c127bba](https://github.com/TraceMachina/nativelink/commit/c127bba823ca4e5df56da9eaa65df58787b74e3a)) +- Upgrade curl to 8.5.0-2ubuntu10.8 ([#2204](https://github.com/TraceMachina/nativelink/issues/2204)) - ([36a8238](https://github.com/TraceMachina/nativelink/commit/36a823836a8c679bcf751ec64e830f272e4c2e28)) +- Update module github.com/cloudflare/circl to v1.6.3 [SECURITY] ([#2191](https://github.com/TraceMachina/nativelink/issues/2191)) - ([77b13f0](https://github.com/TraceMachina/nativelink/commit/77b13f053a40e3f67cb202ff086ca0a9185907fb)) +- Update curl version in Dockerfiles ([#2189](https://github.com/TraceMachina/nativelink/issues/2189)) - ([c161433](https://github.com/TraceMachina/nativelink/commit/c161433702cd6b6a29a169e7516c06a60c1341f9)) +- Update grafana/grafana Docker tag to v12 
([#2182](https://github.com/TraceMachina/nativelink/issues/2182)) - ([658dd53](https://github.com/TraceMachina/nativelink/commit/658dd532c2275c888cfc03c2149fa805de8ecbc5)) +- Update jsonwebtoken ([#2135](https://github.com/TraceMachina/nativelink/issues/2135)) - ([56a8955](https://github.com/TraceMachina/nativelink/commit/56a89557ee14130ca10b44f1688d5e9b6e4691d5)) + +## [0.7.10](https://github.com/TraceMachina/nativelink/compare/v0.7.9..v0.7.10) - 2025-12-30 + + + +### 🐛 Bug Fixes + +- *(deps)* update module golang.org/x/crypto to v0.45.0 [security] ([#2062](https://github.com/TraceMachina/nativelink/issues/2062)) - ([7a4cdb6](https://github.com/TraceMachina/nativelink/commit/7a4cdb681fe23b90f68f1bcc897b5b9ce43c1e37)) + +### 🧪 Testing & CI + +- New filesystem test for eviction breaking ([#2024](https://github.com/TraceMachina/nativelink/issues/2024)) - ([47ebd44](https://github.com/TraceMachina/nativelink/commit/47ebd44809657889f185d0cb36c4217012211c48)) + +### ⚙️ Miscellaneous + +- *(deps)* update dependency abseil-cpp to v20250512 ([#2099](https://github.com/TraceMachina/nativelink/issues/2099)) - ([2bdb869](https://github.com/TraceMachina/nativelink/commit/2bdb869b7cb42ad1c2411f282d454fe2cb81cc65)) +- *(deps)* update actions/checkout action to v6 ([#2085](https://github.com/TraceMachina/nativelink/issues/2085)) - ([fbda7bb](https://github.com/TraceMachina/nativelink/commit/fbda7bbfd1910bda6abace60feef3645f6f92ab4)) +- *(deps)* update actions/github-script action to v8 ([#2098](https://github.com/TraceMachina/nativelink/issues/2098)) - ([f9f3b60](https://github.com/TraceMachina/nativelink/commit/f9f3b6031f400cb3ef327b2c956ea6c6d0d4ff54)) +- reduce worker disconnect cascades ([#2093](https://github.com/TraceMachina/nativelink/issues/2093)) - ([44ada84](https://github.com/TraceMachina/nativelink/commit/44ada84405f17696c04f363b98773692a1c122f6)) +- Replace rustls-pemfile to fix RUSTSEC-2025-0134 ([#2094](https://github.com/TraceMachina/nativelink/issues/2094)) - 
([1b85f71](https://github.com/TraceMachina/nativelink/commit/1b85f71d977f61ff79391934e434af9c10d057e8)) + +## [0.7.9](https://github.com/TraceMachina/nativelink/compare/v0.7.8..v0.7.9) - 2025-12-10 + + + +### ⛰️ Features + +- Add LazyNotFound Store Optimization, Support for fast_slow_store (S3, GCS slow_store targets) ([#2072](https://github.com/TraceMachina/nativelink/issues/2072)) - ([8c62bb3](https://github.com/TraceMachina/nativelink/commit/8c62bb318d849c7122659bd1c583fee627fa4f74)) + +### 🐛 Bug Fixes + +- Fix the scheduler timeouts and errors ([#2083](https://github.com/TraceMachina/nativelink/issues/2083)) - ([93f4ead](https://github.com/TraceMachina/nativelink/commit/93f4eaddad157842549d1cd9cc1da676194997bd)) + +### 📚 Documentation + +- Perf spike ([#2081](https://github.com/TraceMachina/nativelink/issues/2081)) - ([422bfa1](https://github.com/TraceMachina/nativelink/commit/422bfa176891bae17eacb78f1b64e95bd68916d9)) +- Implement remote execution metrics rebased ([#2080](https://github.com/TraceMachina/nativelink/issues/2080)) - ([e38af3d](https://github.com/TraceMachina/nativelink/commit/e38af3d6ce897084832fbd66757de25d532acae6)) + +### ⚙️ Miscellaneous + +- Build Custom Docker Image for each PR ([#2084](https://github.com/TraceMachina/nativelink/issues/2084)) - ([0926bff](https://github.com/TraceMachina/nativelink/commit/0926bffdf8918c9fd15b07673cb0cddab9c382ff)) + +## [0.7.8](https://github.com/TraceMachina/nativelink/compare/v0.7.7..v0.7.8) - 2025-11-28 + + + +### 🐛 Bug Fixes + +- Fix the changelog post 0.7.7 ([#2057](https://github.com/TraceMachina/nativelink/issues/2057)) - ([437a785](https://github.com/TraceMachina/nativelink/commit/437a785e5631bff3b28378c16101a8b21b151d37)) +- Fix assertion message for fastcdc ([#2056](https://github.com/TraceMachina/nativelink/issues/2056)) - ([7ec4f11](https://github.com/TraceMachina/nativelink/commit/7ec4f11d1cac24dfcc3ad88803be0b087465610c)) + +### 🧪 Testing & CI + +- use wildcard query when Redis index value is 
empty ([#2069](https://github.com/TraceMachina/nativelink/issues/2069)) ([#2075](https://github.com/TraceMachina/nativelink/issues/2075)) - ([92869d9](https://github.com/TraceMachina/nativelink/commit/92869d9ae0249de1c676396f6af439afc8112c86)) +- use wildcard query when Redis index value is empty ([#2069](https://github.com/TraceMachina/nativelink/issues/2069)) - ([43f7f8d](https://github.com/TraceMachina/nativelink/commit/43f7f8df6562c605cebbf3bbcbfa265f6cf2f46e)) +- Recoverable connection pool ([#2067](https://github.com/TraceMachina/nativelink/issues/2067)) - ([14b2cc6](https://github.com/TraceMachina/nativelink/commit/14b2cc684e77af485518444d40499b9cc204be55)) +- Redis store tester and permits ([#1878](https://github.com/TraceMachina/nativelink/issues/1878)) - ([3df6293](https://github.com/TraceMachina/nativelink/commit/3df6293e09131d44f73bb053eba1c1b282b3d9d7)) + +### ⚙️ Miscellaneous + +- *(deps)* update dependency astro to v5.15.9 [security] ([#2061](https://github.com/TraceMachina/nativelink/issues/2061)) - ([3d41449](https://github.com/TraceMachina/nativelink/commit/3d4144985f6479e08dc1989f666bbecdbe98f98e)) +- Revert "bugfix: prefix Redis index name and sort key ([#2066](https://github.com/TraceMachina/nativelink/issues/2066))" ([#2068](https://github.com/TraceMachina/nativelink/issues/2068)) - ([2e84883](https://github.com/TraceMachina/nativelink/commit/2e848832053ec86a95be159578282fef68481d2e)) +- prefix Redis index name and sort key ([#2066](https://github.com/TraceMachina/nativelink/issues/2066)) - ([6a95ae8](https://github.com/TraceMachina/nativelink/commit/6a95ae8e258b70423da585e5cc2b78ec8d911072)) +- Disable digest updates for renovate and Nix magic cache ([#2059](https://github.com/TraceMachina/nativelink/issues/2059)) - ([f56c2bb](https://github.com/TraceMachina/nativelink/commit/f56c2bbe9c756c233c1efaf4f705aedbd3f940ee)) +- Do not need to store zero-length filesystem files ([#2033](https://github.com/TraceMachina/nativelink/issues/2033)) - 
([5adf904](https://github.com/TraceMachina/nativelink/commit/5adf904b5a54eb7488f987706dc8c22e1fe4b75b)) +- Don't complain about worker stream error if we're shutting down ([#2055](https://github.com/TraceMachina/nativelink/issues/2055)) - ([6282afc](https://github.com/TraceMachina/nativelink/commit/6282afc6846bb071d2120e49f0488c905ad07200)) + +### ⬆️ Bumps & Version Updates + +- Update the default max permits for redis ([#2063](https://github.com/TraceMachina/nativelink/issues/2063)) - ([7b9df29](https://github.com/TraceMachina/nativelink/commit/7b9df29b9a682b49add7f0c3198734509655d59a)) + +## [0.7.7](https://github.com/TraceMachina/nativelink/compare/v0.7.6..v0.7.7) - 2025-11-17 + + + +### ⛰️ Features + +- Add periodic logging regarding scheduler job states ([#2042](https://github.com/TraceMachina/nativelink/issues/2042)) - ([7d6f663](https://github.com/TraceMachina/nativelink/commit/7d6f6632628df772289b76b21321bc3d25a230f8)) + +### 🧪 Testing & CI + +- *(worker)* Resolve deadlock due to file permit exhaustion ([#2051](https://github.com/TraceMachina/nativelink/issues/2051)) ([#2052](https://github.com/TraceMachina/nativelink/issues/2052)) - ([b5dd8fb](https://github.com/TraceMachina/nativelink/commit/b5dd8fbaba59a47598189d49efce7e02fc0e9ed2)) + +### ⚙️ Miscellaneous + +- *(deps)* update dependency astro to v5.15.6 [security] ([#2045](https://github.com/TraceMachina/nativelink/issues/2045)) - ([0cd70ee](https://github.com/TraceMachina/nativelink/commit/0cd70eebf7134b0102ae5d37eae825fc340e1bd5)) + +## [0.7.6](https://github.com/TraceMachina/nativelink/compare/v0.7.5..v0.7.6) - 2025-11-13 + + + +### ⛰️ Features + +- Redo worker_find_logging as config ([#2039](https://github.com/TraceMachina/nativelink/issues/2039)) - ([958f687](https://github.com/TraceMachina/nativelink/commit/958f68763524e3f2d3d12f91e8949ecfeea98479)) +- Log on command complete ([#2032](https://github.com/TraceMachina/nativelink/issues/2032)) - 
([daea037](https://github.com/TraceMachina/nativelink/commit/daea03751c09e6553f3c9636003ad315811cec03)) +- Directory Cache ([#2021](https://github.com/TraceMachina/nativelink/issues/2021)) - ([a01bd65](https://github.com/TraceMachina/nativelink/commit/a01bd652efb59cb092f1383398c54d694b137f60)) +- Log failures to update actions ([#2022](https://github.com/TraceMachina/nativelink/issues/2022)) - ([3697512](https://github.com/TraceMachina/nativelink/commit/369751249eb19e8dc3bdbb31f041fa60c6948cbc)) + +### 🐛 Bug Fixes + +- Fix flake timestamp ([#2036](https://github.com/TraceMachina/nativelink/issues/2036)) - ([e0e4d41](https://github.com/TraceMachina/nativelink/commit/e0e4d411e5942bd65d2ff864be2e7e0019dacc24)) + +### 🧪 Testing & CI + +- Add testing for running action manager failure logging ([#2031](https://github.com/TraceMachina/nativelink/issues/2031)) - ([922d7f6](https://github.com/TraceMachina/nativelink/commit/922d7f60b38dae49cf907217d8c1e485a011ced6)) +- Fix fast store direction ([#2019](https://github.com/TraceMachina/nativelink/issues/2019)) - ([e7f29fe](https://github.com/TraceMachina/nativelink/commit/e7f29fe8aad6e2e6f7bef1ce822b983090d77fc2)) + +### ⚙️ Miscellaneous + +- *(deps)* update swatinem/rust-cache digest to a84bfdc ([#2018](https://github.com/TraceMachina/nativelink/issues/2018)) - ([d5ea603](https://github.com/TraceMachina/nativelink/commit/d5ea603356adfa60e563af406429fdb836039173)) +- Upgrade python3 to new security patch version ([#2044](https://github.com/TraceMachina/nativelink/issues/2044)) - ([222731d](https://github.com/TraceMachina/nativelink/commit/222731de0295abcdb9f6262cd5547d50168918cc)) +- Use common_s3_utils in s3_store ([#2040](https://github.com/TraceMachina/nativelink/issues/2040)) - ([b2eaf79](https://github.com/TraceMachina/nativelink/commit/b2eaf79b19d3f12afa6194968cb582d466a2a0d6)) +- Lockdown and upgrade the nix action versions ([#2038](https://github.com/TraceMachina/nativelink/issues/2038)) - 
([f679946](https://github.com/TraceMachina/nativelink/commit/f6799465fc5a77263e025ffadeb6a670a9b37ffc)) +- Log more info about redis key updates ([#2035](https://github.com/TraceMachina/nativelink/issues/2035)) - ([1d3cc10](https://github.com/TraceMachina/nativelink/commit/1d3cc10390b8c246f40dd675404a1b94a2122d58)) +- Use display, not debug formatting for operation ids ([#2028](https://github.com/TraceMachina/nativelink/issues/2028)) - ([b7238b3](https://github.com/TraceMachina/nativelink/commit/b7238b3c1bbb549a7c364339d8a4b6e4a5d5ef47)) +- Removes starter pricing ([#2027](https://github.com/TraceMachina/nativelink/issues/2027)) - ([bef18b3](https://github.com/TraceMachina/nativelink/commit/bef18b31024c1c612b1d995c524aff33b82d1390)) +- Drops the cloud references ([#2025](https://github.com/TraceMachina/nativelink/issues/2025)) - ([c3431ac](https://github.com/TraceMachina/nativelink/commit/c3431acc109129586ee5a288166a5139e6a0d27c)) + +## [0.7.5](https://github.com/TraceMachina/nativelink/compare/v0.7.4..v0.7.5) - 2025-10-30 + + + +### 🐛 Bug Fixes + +- scheduler shutdown not guarded ([#2015](https://github.com/TraceMachina/nativelink/issues/2015)) - ([552a1cd](https://github.com/TraceMachina/nativelink/commit/552a1cde0013a90a9ceba93f77f4c18b6e475652)) +- Fast slow store directions ([#1581](https://github.com/TraceMachina/nativelink/issues/1581)) - ([6d867c9](https://github.com/TraceMachina/nativelink/commit/6d867c99b08f6cb078900b5a9f4fae1e262158d9)) + +### 🧪 Testing & CI + +- Buck2 integration test ([#1828](https://github.com/TraceMachina/nativelink/issues/1828)) - ([1296a3a](https://github.com/TraceMachina/nativelink/commit/1296a3aaa6b1040d70f2d2609644698c57d029a6)) + +### ⚙️ Miscellaneous + +- Filestore update deadlock ([#2007](https://github.com/TraceMachina/nativelink/issues/2007)) - ([d55c59d](https://github.com/TraceMachina/nativelink/commit/d55c59dd101173195fde4376a6185cbaaa50d252)) +- guard shutting down in scheduler while SIGTERM 
([#2012](https://github.com/TraceMachina/nativelink/issues/2012)) - ([1708859](https://github.com/TraceMachina/nativelink/commit/17088593e5bcfc30f0e20cb9b25743ebcf90ca8b)) +- Remove unnecessary Mutex ([#2006](https://github.com/TraceMachina/nativelink/issues/2006)) - ([083232d](https://github.com/TraceMachina/nativelink/commit/083232dc47946bdbba1f82b741ebf8dde3ac948e)) + +## [0.7.4](https://github.com/TraceMachina/nativelink/compare/v0.7.3..v0.7.4) - 2025-10-23 + + + +### ⛰️ Features + +- GCS do not upload zero ([#1995](https://github.com/TraceMachina/nativelink/issues/1995)) - ([ab0d4e6](https://github.com/TraceMachina/nativelink/commit/ab0d4e6e1920f8d099ce17b8b20f93bbab6dba27)) +- GCS store connect timeout ([#1994](https://github.com/TraceMachina/nativelink/issues/1994)) - ([854d51c](https://github.com/TraceMachina/nativelink/commit/854d51caddef98888eaaff3e5866a5248a482d67)) +- Add cache to native-cargo step ([#1974](https://github.com/TraceMachina/nativelink/issues/1974)) - ([0c02306](https://github.com/TraceMachina/nativelink/commit/0c02306de8067c7f8d5c5d0e6b90c949ed3a99a6)) +- Add metadata checks to machete ([#1952](https://github.com/TraceMachina/nativelink/issues/1952)) - ([21d5fdc](https://github.com/TraceMachina/nativelink/commit/21d5fdc3b5f5ce6cd99c3199b14c30a3a7774168)) + +### 🐛 Bug Fixes + +- Fix clippy::cast_possible_truncation ([#1423](https://github.com/TraceMachina/nativelink/issues/1423)) - ([b050976](https://github.com/TraceMachina/nativelink/commit/b0509764084bd5aa1c6b61c39a63429f3c6b6859)) +- Notify execution complete ([#1975](https://github.com/TraceMachina/nativelink/issues/1975)) - ([8527f25](https://github.com/TraceMachina/nativelink/commit/8527f258f756e5c337ad133dd635416bbf9b89fb)) +- Fix removal state ([#1981](https://github.com/TraceMachina/nativelink/issues/1981)) - ([d85e491](https://github.com/TraceMachina/nativelink/commit/d85e491c4e26bd78d88d08c5d1ca357fc42b3e93)) +- Fix Redis subscribe race 
([#1970](https://github.com/TraceMachina/nativelink/issues/1970)) - ([9353508](https://github.com/TraceMachina/nativelink/commit/9353508fed8f96f5d754978047491869cbeba71a)) + +### 📚 Documentation + +- fixed cost docs ([#1986](https://github.com/TraceMachina/nativelink/issues/1986)) - ([aab10ee](https://github.com/TraceMachina/nativelink/commit/aab10ee553781fb1bc2194d0eed58d6a625ee4f6)) + +### 🧪 Testing & CI + +- Add Rust test to RBE work ([#1992](https://github.com/TraceMachina/nativelink/issues/1992)) - ([e01079b](https://github.com/TraceMachina/nativelink/commit/e01079b00f37c7211f5d2094c153e516dae09ef2)) +- Make all tests in running_actions_manager_test serial ([#1984](https://github.com/TraceMachina/nativelink/issues/1984)) - ([41cdd9c](https://github.com/TraceMachina/nativelink/commit/41cdd9cd62ad431fff7dea2fdbab9252a55ae05c)) +- comment legacy Dockerfile test ([#1983](https://github.com/TraceMachina/nativelink/issues/1983)) - ([6316b55](https://github.com/TraceMachina/nativelink/commit/6316b5529d3b228757ed454828352497caed39ea)) +- Adds testing to bytestream backwards compatibility ([#1979](https://github.com/TraceMachina/nativelink/issues/1979)) - ([21bb502](https://github.com/TraceMachina/nativelink/commit/21bb502c1eae34900b461b43ad65a443deb95406)) + +### ⚙️ Miscellaneous + +- Pin various dependencies (mostly Docker images) ([#1990](https://github.com/TraceMachina/nativelink/issues/1990)) - ([29c3dc4](https://github.com/TraceMachina/nativelink/commit/29c3dc4581e511d28f7355ca6d203ddc65394f0c)) +- Unify all the service setups with a macro ([#1996](https://github.com/TraceMachina/nativelink/issues/1996)) - ([e46b5c7](https://github.com/TraceMachina/nativelink/commit/e46b5c7b8710df60efeaf895e9d92eb8296fc931)) +- Sweep forgotten client operation IDs ([#1965](https://github.com/TraceMachina/nativelink/issues/1965)) - ([9fcf5b1](https://github.com/TraceMachina/nativelink/commit/9fcf5b1de4a8d7ac7623039f43d51d0682a65e67)) +- Require default-features=false 
([#1993](https://github.com/TraceMachina/nativelink/issues/1993)) - ([0146c34](https://github.com/TraceMachina/nativelink/commit/0146c34a6988a284c4b7d44ed4db14a2b66412e6)) +- Single worker stream ([#1977](https://github.com/TraceMachina/nativelink/issues/1977)) - ([e9250ee](https://github.com/TraceMachina/nativelink/commit/e9250ee83296aaaf950a2d930bca9fa05cc2ad4a)) +- Explicitly separate state locks and awaits ([#1991](https://github.com/TraceMachina/nativelink/issues/1991)) - ([930b352](https://github.com/TraceMachina/nativelink/commit/930b352548b1ca6a428e272d9c7ec12c2c228a2d)) +- Replace derivative with derive_more ([#1989](https://github.com/TraceMachina/nativelink/issues/1989)) - ([9f39700](https://github.com/TraceMachina/nativelink/commit/9f397002214cc8d734624499de113c08c4178176)) +- Build toolchain-examples ([#1971](https://github.com/TraceMachina/nativelink/issues/1971)) - ([2d08aba](https://github.com/TraceMachina/nativelink/commit/2d08abaeb9eaaa423eb3ebb598d0100a2212cf41)) +- Remove folders with bad permissions ([#1980](https://github.com/TraceMachina/nativelink/issues/1980)) - ([5e487f3](https://github.com/TraceMachina/nativelink/commit/5e487f374d7ef2c13a0239aa37c4bfe963951f0e)) +- Property replace ([#1976](https://github.com/TraceMachina/nativelink/issues/1976)) - ([41a2452](https://github.com/TraceMachina/nativelink/commit/41a2452ca0350eb6d153c6ac7b6af97c2152f614)) +- Harden worker disconnect ([#1972](https://github.com/TraceMachina/nativelink/issues/1972)) - ([1055cd1](https://github.com/TraceMachina/nativelink/commit/1055cd150430769d043561f16f9c0b759e707dc4)) +- Drop MacOS 14 support ([#1973](https://github.com/TraceMachina/nativelink/issues/1973)) - ([bdfa17c](https://github.com/TraceMachina/nativelink/commit/bdfa17c9c18439e7e20a0bdbddcda544e7110ebc)) +- Drop 22.04 support ([#1883](https://github.com/TraceMachina/nativelink/issues/1883)) - ([4fe024b](https://github.com/TraceMachina/nativelink/commit/4fe024b03f118fa56842e0500fa190d32694396d)) + +### 
⬆️ Bumps & Version Updates + +- Update Swatinem/rust-cache digest to 9416228 ([#2004](https://github.com/TraceMachina/nativelink/issues/2004)) - ([15c747e](https://github.com/TraceMachina/nativelink/commit/15c747e056567bae86c0bfd8a153eb480d40d88a)) +- Update dependency hermetic_cc_toolchain to v4 ([#1988](https://github.com/TraceMachina/nativelink/issues/1988)) - ([ed918d8](https://github.com/TraceMachina/nativelink/commit/ed918d8365a012c320a7cd8b4a0333975f2807ab)) +- Update Rust crate relative-path to v2 ([#1985](https://github.com/TraceMachina/nativelink/issues/1985)) - ([997feb4](https://github.com/TraceMachina/nativelink/commit/997feb4537fa19f7e2cb3bfedc45f9add772ddcf)) +- Update dependency astro to v5.14.3 [SECURITY] ([#1969](https://github.com/TraceMachina/nativelink/issues/1969)) - ([d896788](https://github.com/TraceMachina/nativelink/commit/d896788cda243950377a747c7e8c5b1cce1625d4)) +- Update dependency dotenv to v17 ([#1966](https://github.com/TraceMachina/nativelink/issues/1966)) - ([3b7f05f](https://github.com/TraceMachina/nativelink/commit/3b7f05fce82a36e1339590b827bfee8cbe150221)) + +## [0.7.3](https://github.com/TraceMachina/nativelink/compare/v0.7.2..v0.7.3) - 2025-10-10 @@ -169,7 +435,6 @@ All notable changes to this project will be documented in this file. 
- Prepare 0.7.0-rc-2 ([#1908](https://github.com/TraceMachina/nativelink/issues/1908)) - ([b23cf19](https://github.com/TraceMachina/nativelink/commit/b23cf19ce07f3415a82a4860641d7d6248a17bd6)) - Modified the todos, though many will be removed ([#1909](https://github.com/TraceMachina/nativelink/issues/1909)) - ([0e9626c](https://github.com/TraceMachina/nativelink/commit/0e9626cefa4f234db7938c2379ac3e5322171ce8)) -- Dedupe fast slow ([#1905](https://github.com/TraceMachina/nativelink/issues/1905)) - ([66c383b](https://github.com/TraceMachina/nativelink/commit/66c383b936f817c073b842059107f3d1d606ae99)) - Retry matching on failure ([#1892](https://github.com/TraceMachina/nativelink/issues/1892)) - ([e691bea](https://github.com/TraceMachina/nativelink/commit/e691bea24ba0b0b5827e9464a26cfd8988b61512)) - Temporarily disable llre.yaml ([#1902](https://github.com/TraceMachina/nativelink/issues/1902)) - ([7c02e58](https://github.com/TraceMachina/nativelink/commit/7c02e589c6d0386db5e15487fd108a882fe97083)) - Graceful worker shutdown ([#1899](https://github.com/TraceMachina/nativelink/issues/1899)) - ([98b1201](https://github.com/TraceMachina/nativelink/commit/98b1201433e3e7834dc4d1d1a2d8688061a26047)) @@ -1040,7 +1305,7 @@ All notable changes to this project will be documented in this file. - Update dependency rules_rust to v0.34.1 ([#547](https://github.com/TraceMachina/nativelink/issues/547)) - ([637f283](https://github.com/TraceMachina/nativelink/commit/637f2834138f86be45c12cf46623de539148fe24)) - Update dependency @google-cloud/compute to v4.1.0 ([#544](https://github.com/TraceMachina/nativelink/issues/544)) - ([dbac23a](https://github.com/TraceMachina/nativelink/commit/dbac23afa27f55c662f8a1d0539cc8fc82717afe)) -## [0.1.0](https://github.com/TraceMachina/nativelink/compare/v1.0.1..v0.1.0) - 2023-12-20 +## [0.1.0] - 2023-12-20 @@ -1099,6 +1364,55 @@ All notable changes to this project will be documented in this file. 
- Add ability to create low watermark to avoid thrashing against eviction cap. - ([e16b45c](https://github.com/TraceMachina/nativelink/commit/e16b45c155b697f0f4be9af5004437afa0a016fd)) - Add is_empty to LenEntry - ([e643090](https://github.com/TraceMachina/nativelink/commit/e6430900ef21ad4bc651eb0076060b513ca8c3b3)) - Add timestamps to executor jobs. - ([fa97b28](https://github.com/TraceMachina/nativelink/commit/fa97b288bb683e78e95b5805883da632396b4034)) +- Add support for environmental variable lookup in S3Store config - ([cb0de9e](https://github.com/TraceMachina/nativelink/commit/cb0de9eb40119f7098b4ac0865b4cc5eda8ed374)) +- Add ability to use env variables in config files - ([d54b38e](https://github.com/TraceMachina/nativelink/commit/d54b38e213fb243a9b27622894a1529d614a52fb)) +- Add Send trait to as_any() store calls - ([c4be423](https://github.com/TraceMachina/nativelink/commit/c4be4239aa8813e238eb76f3efc208fa72f0af0a)) +- Add fs module which limits outstanding file handles - ([f7b565f](https://github.com/TraceMachina/nativelink/commit/f7b565f0c525bccd7dc42d529eac64110f15fae5)) +- Add functionality for worker to download and create working dir - ([5e7f9ef](https://github.com/TraceMachina/nativelink/commit/5e7f9efece6a8d4ae0288e14f5bda6a04cf594b0)) +- Adds .as_any() to stores - ([e5de86d](https://github.com/TraceMachina/nativelink/commit/e5de86d78e7d640d492ef97f7c4b98a1f7e9d358)) +- Adds initial implementation for LocalWorker and supporting classes - ([90cff23](https://github.com/TraceMachina/nativelink/commit/90cff230ebb5e7982d780f767aa0b0dc85d87b20)) +- Various minor updates - ([cf6dd3d](https://github.com/TraceMachina/nativelink/commit/cf6dd3db5a9633aa9fa3060395266925c09e9a62)) +- Add shlex package in third_party - ([d935d7f](https://github.com/TraceMachina/nativelink/commit/d935d7f849a362473aed08347e20607f620589bc)) +- Add worker config definitions and rename Metadata to Priority - 
([98c4e08](https://github.com/TraceMachina/nativelink/commit/98c4e08e25f1baa0134c61147ee04f736917ef28)) +- Add WorkerApiServer to services being served - ([af0ccc3](https://github.com/TraceMachina/nativelink/commit/af0ccc3faa419e37d3e0bde7ff44e3d528617643)) +- Add support for keep alive for workers - ([be6f2ee](https://github.com/TraceMachina/nativelink/commit/be6f2ee94b7047d94aef01294b1b37716e80e822)) +- [RE] Add WorkerApiService and connection functionality - ([e8a349c](https://github.com/TraceMachina/nativelink/commit/e8a349c991e4bec40fc5435b26d869acbf6a9ac4)) +- [RE] Various changes to worker_api.proto - ([86220b7](https://github.com/TraceMachina/nativelink/commit/86220b7429e26ad2b8ba10f877c05baebe3c6d71)) +- Add uuid package and update other packages - ([5115bc6](https://github.com/TraceMachina/nativelink/commit/5115bc618be4e1718d437a6be866f57f3bea7099)) +- Add SizePartitioningStore - ([d0112be](https://github.com/TraceMachina/nativelink/commit/d0112be4c0deb0ab46bccee8dc074e977336bc74)) +- Add RefStore and restructure StoreManager - ([6795bb0](https://github.com/TraceMachina/nativelink/commit/6795bb08d84e53e03f573026b9d97e38a0ac41cc)) +- Can now pass json config through CLI & add more sample configs - ([ea4d76d](https://github.com/TraceMachina/nativelink/commit/ea4d76d33fc5130e2b6557f0b8283fe4314adc46)) +- Add nix package and upgrade third_party packages - ([a451628](https://github.com/TraceMachina/nativelink/commit/a451628777c34f21d12f95ffdd407a51a8e5a3bb)) +- Add basic scaffolding for scheduler + remote execution - ([c91f61e](https://github.com/TraceMachina/nativelink/commit/c91f61edf182f2b64451fd48a5e63fa506a43aae)) +- Adds readme to configuration - ([54e8fe7](https://github.com/TraceMachina/nativelink/commit/54e8fe75753876a5feadf800b1b4cfe5dff820d1)) +- Add filesystem store - ([d183cad](https://github.com/TraceMachina/nativelink/commit/d183cad24a14b04e2a0c870324f6f5d482db809b)) +- Adds simple query_write_status support - 
([844014a](https://github.com/TraceMachina/nativelink/commit/844014ac9a8ca246b20a6c3fa861ac970cf94caa)) +- Add buf_channel that will be used to help transport bytes around - ([7e111c1](https://github.com/TraceMachina/nativelink/commit/7e111c13bb78ce80b3007aa325839a47790a3341)) +- Add byteorder to third_party cargo - ([a76a35f](https://github.com/TraceMachina/nativelink/commit/a76a35f813afa2fe570cb0a59e495c41dcd1004b)) +- Adds more eviction templates and functions in prep for filesystem store - ([f2896a7](https://github.com/TraceMachina/nativelink/commit/f2896a798e18569a833fd0d6055bc2d3de59b3a7)) +- Adds FastSlow store that will try the fast store before slow store - ([8c71137](https://github.com/TraceMachina/nativelink/commit/8c711376590a6d657b5207d4d318012322f61f30)) +- Add dedup store - ([2dba31c](https://github.com/TraceMachina/nativelink/commit/2dba31c44a5baeeefe225b4f5e636b41e4747342)) +- Add retry support to get_part in s3_store - ([ea2fc4c](https://github.com/TraceMachina/nativelink/commit/ea2fc4cba95c849e628ecba8b96131aa3378a22e)) +- Add CompressionStore and implement LZ4 compression - ([d6cd4f9](https://github.com/TraceMachina/nativelink/commit/d6cd4f91fa1f7d538a10fc11526adfbc05418fb3)) +- Add s3 configuration - ([be87381](https://github.com/TraceMachina/nativelink/commit/be87381d05f62e6065c04979f3af7be9a2f222d4)) +- Add retry utility in prep for s3_store - ([86e63ee](https://github.com/TraceMachina/nativelink/commit/86e63ee71b0196754774adf23201482a3e272bba)) +- Add async_read_taker in prep for s3_store - ([90222f9](https://github.com/TraceMachina/nativelink/commit/90222f958a116aa6df5f366bd0e8ffde266f4f37)) +- Add trust_size to DigestInfo - ([d8f218f](https://github.com/TraceMachina/nativelink/commit/d8f218f833fa90410f7feb3c3a9f96f6d2f8eb65)) +- Add ability for VerifyStore to check the sha256 hash of the digest - ([40ba2fb](https://github.com/TraceMachina/nativelink/commit/40ba2fb7131dc2946d1adab9f1dfda60b356e282)) +- Add sha2 to Cargo.toml in prep for 
sha256 checking - ([0eb2dab](https://github.com/TraceMachina/nativelink/commit/0eb2dab83722f500c8261b0ab1308c7bf94a77f3)) +- Add mock_instant library to Cargo.toml - ([34b9312](https://github.com/TraceMachina/nativelink/commit/34b93120d94d20f0d77b50d9314b98799dd81824)) +- Add maplit to third_party dependencies - ([b09153b](https://github.com/TraceMachina/nativelink/commit/b09153b45fa316ebc6c7db2a746430986cd4e8bb)) +- Add json package dependencies and updates packages - ([69cf723](https://github.com/TraceMachina/nativelink/commit/69cf72367b78cbe5d6a91c1e9a43902cb0e9fad9)) +- Add read stream support - ([5c2db23](https://github.com/TraceMachina/nativelink/commit/5c2db2378ebbd859bdd615ba105c9e3195d8df01)) +- Add drop_guard to Cargo.toml - ([3c147cd](https://github.com/TraceMachina/nativelink/commit/3c147cda0de7ed6b2117ac60db0b9d551cd534da)) +- Add ability to read partial store - ([0b304cc](https://github.com/TraceMachina/nativelink/commit/0b304cc9fec41fbcffe0b1379f4b4660a6957a1c)) +- Add multi-threading and fix some minor performance issues - ([0ed309c](https://github.com/TraceMachina/nativelink/commit/0ed309c0994fe60b6ebfa23024779d3e1170631e)) +- Add DigestInfo utility - ([25bef4a](https://github.com/TraceMachina/nativelink/commit/25bef4aa20ac6bf6c8e2af55d5bb7b4055e87e10)) +- Add much better way to do error logging with .err_tip() - ([9ae49b6](https://github.com/TraceMachina/nativelink/commit/9ae49b64cabb6ceaf9a4de9718ec123e34d76379)) +- Add futures package to Cargo.toml - ([92912e6](https://github.com/TraceMachina/nativelink/commit/92912e6cc786a9716fd29469dab81c603e7718f9)) +- Add Capabilities and Execution api endpoints - ([24dec02](https://github.com/TraceMachina/nativelink/commit/24dec02fe054da8ba3862f8e5057e6a0f42998ed)) +- Add ./rust_fmt.sh - ([5c65005](https://github.com/TraceMachina/nativelink/commit/5c650052e6edf35246c00513e58d7c0fe19e91fc)) +- Add dependent proto files for bazel cas - 
([d845d40](https://github.com/TraceMachina/nativelink/commit/d845d404fdc07bd848ea057f7fa7260dc877fb13)) ### 🐛 Bug Fixes @@ -1137,6 +1451,30 @@ All notable changes to this project will be documented in this file. - Fix most clippy warnings in worker files - ([be228d0](https://github.com/TraceMachina/nativelink/commit/be228d0d90b41e1d32b2851d594d25a726cadafc)) - Fixes the `entrypoint_cmd` configuration - ([096d7ea](https://github.com/TraceMachina/nativelink/commit/096d7eae802dc4edf4e38251b853917050d470ad)) - Fix a couple of nits with the timestamp additions. - ([b320de5](https://github.com/TraceMachina/nativelink/commit/b320de5ee54595c530ba0078c3f449812cce33d4)) +- Fix bug if no instance_name/resource_name is given upload does not work - ([b010b4b](https://github.com/TraceMachina/nativelink/commit/b010b4bd019e3e4cce5e5115b0ff797c45e85d96)) +- Fix scheduler so platform properties are properly restored - ([059b0ef](https://github.com/TraceMachina/nativelink/commit/059b0ef90474ffbb7839fa3764db9dcb31b21cf5)) +- Fix bug on output_files' folders were not being created - ([bb010f2](https://github.com/TraceMachina/nativelink/commit/bb010f2fffca465a6af9afd21db61ae9b2212534)) +- Fix bug where worker was not creating working directory properly - ([4e51b6d](https://github.com/TraceMachina/nativelink/commit/4e51b6d80e284de5d0f7dfcf469900e1af2b610b)) +- Fix wrong `type_url` in google-proto's Any type - ([9cda96a](https://github.com/TraceMachina/nativelink/commit/9cda96a654fed9d997b9ac179f7a69b28af8b6de)) +- Fix bug during .has() call in dedup store - ([5cc9a09](https://github.com/TraceMachina/nativelink/commit/5cc9a09dcf2330d993c68a7510871e17d4321227)) +- Fixed various bugs in filesystem store - ([7ba407d](https://github.com/TraceMachina/nativelink/commit/7ba407d24533a397b49c39f7ee5eb42f3a951415)) +- Fix bug in evicting_map with unref improperly called and readability - ([ea393a5](https://github.com/TraceMachina/nativelink/commit/ea393a520f57c8d23aba565317d56ecce7aa80b8)) +- Fix 
minor issue in FastSlowStore - ([81fb378](https://github.com/TraceMachina/nativelink/commit/81fb378e0c3d894694c7a830f05b37035393edb2)) +- Fix case where s3 uploads in wrong order - ([4798fe9](https://github.com/TraceMachina/nativelink/commit/4798fe9d7130e98ebeda5a8c27512b042a1058c0)) +- Fix bug in s3_store where 5mb is calculated wrong & improve debugability - ([0451781](https://github.com/TraceMachina/nativelink/commit/0451781a8ab55ddaa93d577e8ceb49daaa1bca62)) +- Fix s3_store - ([efcb653](https://github.com/TraceMachina/nativelink/commit/efcb653ae741f97eb1e65272decc6842e33b424b)) +- Fixed AsyncFixedBuffer - ([519fa9f](https://github.com/TraceMachina/nativelink/commit/519fa9f2c49edb2054a9263940bfa350b4c62306)) +- Minor changes to AsyncFixedBuffer - ([a506363](https://github.com/TraceMachina/nativelink/commit/a506363c8a4b8c8171982b4edcb1fbc6eef1f8ac)) +- Fix lifetime of StoreTrait::update() - ([9ec43a2](https://github.com/TraceMachina/nativelink/commit/9ec43a2d5bf408b419fb7a75d976f6668888dc6f)) +- Fix --config debug config to properly add debug symbols - ([90b43c6](https://github.com/TraceMachina/nativelink/commit/90b43c6a5e056543b341004e28385b88b2fca39a)) +- Fix small bug in gen_rs_proto - ([627c0f8](https://github.com/TraceMachina/nativelink/commit/627c0f8ed7bf1098f99fd756c440005a98b2579a)) +- Fix small needless cast to i64 - ([59c609e](https://github.com/TraceMachina/nativelink/commit/59c609e71977a0d3822f85730d4b7844780a366d)) +- Fix bug with verify_store when receiving multiple chunks - ([a78caec](https://github.com/TraceMachina/nativelink/commit/a78caec3927fe6c1b4fdd8bf207013125ff72a30)) +- Fixed typo in debug message when instance_name is not properly set - ([d231ea1](https://github.com/TraceMachina/nativelink/commit/d231ea1f08802e09a1b1f3501b8368d844643a45)) +- Fixed EOF bits and few other items in order to get bazel working - ([8558ee9](https://github.com/TraceMachina/nativelink/commit/8558ee9b51644782eb726638226e338b7605f465)) +- Fix async_fixed_buffers to 
add get_closer() - ([9225b1f](https://github.com/TraceMachina/nativelink/commit/9225b1fb0c75ed9fd54fa584682eb1bbba3dbab0)) +- Fix memory leak - ([c27685c](https://github.com/TraceMachina/nativelink/commit/c27685c2f7846cb2868bc5ecae9fd697c9e7c1bb)) +- Fix Store import in cas_server.rs - ([a7e7859](https://github.com/TraceMachina/nativelink/commit/a7e7859d485712a7857b7d5a55178e03a8a403a9)) ### 📚 Documentation @@ -1156,6 +1494,10 @@ All notable changes to this project will be documented in this file. - Update README.md - ([7563df7](https://github.com/TraceMachina/nativelink/commit/7563df7a489a926c01bae1d3ec52505db0f49327)) - Document that users should use `-c opt` for release builds - ([9351f26](https://github.com/TraceMachina/nativelink/commit/9351f265f71eca308b18a9ccca2d158f778bba0f)) - Fix bazel version change that broke proto building and documentation - ([1994dde](https://github.com/TraceMachina/nativelink/commit/1994dde8777c718c159823fea93cde89529d1b3c)) +- Add terraform deployment example and documentation - ([c7dff9f](https://github.com/TraceMachina/nativelink/commit/c7dff9f48169171696fa42654823e6beb82dd6c3)) +- Filesystem store now delays before deleting temp file - ([33d88c5](https://github.com/TraceMachina/nativelink/commit/33d88c5d24943bc7bc134dfbbb6cbd91c62b400a)) +- Support deprecated symlink fields & fix bug for workers use CWD - ([00431f9](https://github.com/TraceMachina/nativelink/commit/00431f947b358a7dc95400a361307521c9d1c5ad)) +- FastSlowStore now properly documented and used in LocalWorkerConfig - ([728cb90](https://github.com/TraceMachina/nativelink/commit/728cb90c7765f94460197113feb6d9c7ae6c514b)) ### 🧪 Testing & CI @@ -1199,6 +1541,36 @@ All notable changes to this project will be documented in this file. - Add convenience config to test clippy - ([1185876](https://github.com/TraceMachina/nativelink/commit/118587684ebc11fbc1bff634a1ad79bb2af2edd4)) - Add a test for filestore loading from disk. 
- ([5f3e9f5](https://github.com/TraceMachina/nativelink/commit/5f3e9f5d09ac9468cc6d9a57706acc7c79d611b8)) - Remove the callbacks from the filesystem_store - ([e2e62d2](https://github.com/TraceMachina/nativelink/commit/e2e62d20b8badadf20970dde763394310fb24cb7)) +- Adds GrpcStore and first integration tests - ([117e173](https://github.com/TraceMachina/nativelink/commit/117e1733b81e8f71d28dec324a7d9dffd79cb1ca)) +- Fix bug in scheduler of not removing actions after execution - ([f2b825b](https://github.com/TraceMachina/nativelink/commit/f2b825bf436bddb7d24c076b1efc165e5809ff61)) +- Fixes flakey filesystem_store_test - ([717d87a](https://github.com/TraceMachina/nativelink/commit/717d87a89b0ee855c45b6ee6a07c1eafe43029a7)) +- First draft to get remote execution working - ([f207dfa](https://github.com/TraceMachina/nativelink/commit/f207dfaf41226ec568720534c1d28ca2d57ef634)) +- Restructure LocalWorker for easier testing - ([d7d71a1](https://github.com/TraceMachina/nativelink/commit/d7d71a138269ee71d31e9816d6ae2dd90ecd65bc)) +- Fix bug in memory store when receiving a zero byte object - ([52445a1](https://github.com/TraceMachina/nativelink/commit/52445a1c234cef5f065d76c0af938b5744dc732d)) +- Fix github CI badge - ([2758d22](https://github.com/TraceMachina/nativelink/commit/2758d22a086da3a9d16546b702598597cdea2bf9)) +- Adds automated CI tests on pull requests and master - ([e647de0](https://github.com/TraceMachina/nativelink/commit/e647de0ba650bac1b2c785327e34ccb53d68a5d5)) +- Add more basic scheduler support - ([2edf514](https://github.com/TraceMachina/nativelink/commit/2edf514742e27cba2bc12c74539463494800a29c)) +- Dedup store will now bypass deduplication when size is small - ([997be53](https://github.com/TraceMachina/nativelink/commit/997be53c7560bb0dca8fe2ab08831ec172ede7a6)) +- Fix buf in bytestream_server when NotFound was returned - ([a4634eb](https://github.com/TraceMachina/nativelink/commit/a4634ebf54f2ee4ad8b154c2ed2e5f4e29f8d23a)) +- Upgrade rustc, use new 
nightly, rules_python, and rustfmt - ([d0c31fb](https://github.com/TraceMachina/nativelink/commit/d0c31fb3b224921a58a9da5e9d746ceb192e9b71)) +- Fix format of util/tests/async_read_taker_test.rs - ([cd12d1d](https://github.com/TraceMachina/nativelink/commit/cd12d1da698d932775ffc32802855a2c3297675b)) +- dummy_test.sh will now print some equal signs when done - ([1227d39](https://github.com/TraceMachina/nativelink/commit/1227d39d4b995e1127743be333e4890220d8aa21)) +- Added single_item_wrong_digest_size test back to stable - ([b517db1](https://github.com/TraceMachina/nativelink/commit/b517db148d1c807bfdc84916801ae3926e805384)) +- Add //:dummy_test that is useful for testing caching - ([e5a1e9a](https://github.com/TraceMachina/nativelink/commit/e5a1e9ad82b2b910738798764e0f367d76496122)) +- Add dummy test that is used for easy caching - ([efd449a](https://github.com/TraceMachina/nativelink/commit/efd449afd665f16f21c81f5618e294658e8e7d32)) +- Add test for bytestream::write() - ([5dc8ac0](https://github.com/TraceMachina/nativelink/commit/5dc8ac0d64a7241bc4f1c54d1376a9f870dfca8c)) +- Add bytestream server scaffolding - ([7aff76f](https://github.com/TraceMachina/nativelink/commit/7aff76f755b731a99adae5f4c2a512c0cf8c5476)) +- Add test for single item update action cache - ([c3d89e1](https://github.com/TraceMachina/nativelink/commit/c3d89e1981d4184928086d5643594b77d3fad433)) +- get_action_result done with tests - ([fcc8a31](https://github.com/TraceMachina/nativelink/commit/fcc8a319f9f4c061612ee43de58e46cea730a2d9)) +- Add first test for ac_server - ([221ed5f](https://github.com/TraceMachina/nativelink/commit/221ed5fbd765c92f7277a1da074563836689c867)) +- Add test and fix bug when querying and using bad hash on .has() - ([9adbe81](https://github.com/TraceMachina/nativelink/commit/9adbe81aa401bb067f3fca0aeb35a3433b2cf97b)) +- Add test for batch_read_blobs - ([4b1ae1a](https://github.com/TraceMachina/nativelink/commit/4b1ae1ae70118b8b3b324201c46466b106fe206e)) +- Add tests for 
invalid memory store requests - ([4f8e5a7](https://github.com/TraceMachina/nativelink/commit/4f8e5a7e2cacd8bcc4370ba3c55825398292c826)) +- Add impl and tests for get store data - ([7922f84](https://github.com/TraceMachina/nativelink/commit/7922f8439c2cb59b7f888f409876971a6c0d59aa)) +- Basic HashMap for memory store and enable store_one_item_existence test - ([5206e74](https://github.com/TraceMachina/nativelink/commit/5206e742b3294633864252e3ff6341d84dd08d64)) +- Add test for store_one_item_existence - ([a6f1a70](https://github.com/TraceMachina/nativelink/commit/a6f1a70cb81de2ef0fe74cdb08401a1cd6828ffe)) +- Add store and first test - ([ed4bde4](https://github.com/TraceMachina/nativelink/commit/ed4bde4310ddedff0e5473295410f1f3d68fce71)) +- Add ability to resolve GetCapabilities and bazel connect testing - ([1aba20c](https://github.com/TraceMachina/nativelink/commit/1aba20c23f2db10277e50cb1ee8ecb51c04c2e10)) ### ⚙️ Miscellaneous @@ -1288,156 +1660,6 @@ All notable changes to this project will be documented in this file. - Simplify proto generation - ([eebd6be](https://github.com/TraceMachina/nativelink/commit/eebd6bea6ca80c89cfd185f804320e478b5a3524)) - Overhaul filesystem store to no longer use renameat2 - ([a3cddf9](https://github.com/TraceMachina/nativelink/commit/a3cddf9adb3c287de33cd9b967d8eb99a0c8561a)) - Move from fast-async-mutex to async-lock crate as it's maintained. 
- ([e172756](https://github.com/TraceMachina/nativelink/commit/e172756613b5398f1ccdaaf258f3f7b80ac4b08e)) - -### ⬆️ Bumps & Version Updates - -- Update dependency mintlify to v4.0.80 ([#536](https://github.com/TraceMachina/nativelink/issues/536)) - ([7564e5e](https://github.com/TraceMachina/nativelink/commit/7564e5e15e39cdf20f5f868a883af8a0ff7b566c)) -- Update Rust crate http to ^0.2.11 ([#530](https://github.com/TraceMachina/nativelink/issues/530)) - ([ca146ac](https://github.com/TraceMachina/nativelink/commit/ca146ac97a3a22213af4358e0c2d1ebe8fbee6f9)) -- Update native-cargo.yaml Runner Group ([#511](https://github.com/TraceMachina/nativelink/issues/511)) - ([e1843f1](https://github.com/TraceMachina/nativelink/commit/e1843f17c3f957fb8542b6ffcc6784ee2b417ad1)) -- Update protobuf dependencies ([#493](https://github.com/TraceMachina/nativelink/issues/493)) - ([3dacdad](https://github.com/TraceMachina/nativelink/commit/3dacdad203c4c2f238e74d6e5beb7401fb312c55)) -- Bump trivially bumpable deps ([#488](https://github.com/TraceMachina/nativelink/issues/488)) - ([96302cb](https://github.com/TraceMachina/nativelink/commit/96302cbeab6c59966d3dfd3b99fa0933752d1018)) -- Update protos after 1aadd42 ([#489](https://github.com/TraceMachina/nativelink/issues/489)) - ([9c6efe0](https://github.com/TraceMachina/nativelink/commit/9c6efe04acb79e6c75d2d58065d2a8914e3efcc9)) -- Make max_bytes_per_stream optional in config ([#474](https://github.com/TraceMachina/nativelink/issues/474)) - ([a01a552](https://github.com/TraceMachina/nativelink/commit/a01a55272f78ef6916e8dfa0532d4b5cb3789036)) -- Bump Rust version to 1.74 ([#459](https://github.com/TraceMachina/nativelink/issues/459)) - ([5412d7c](https://github.com/TraceMachina/nativelink/commit/5412d7cc15b48b9871d0e73686c89efc43d35b53)) -- Update nightly Rust toolchain for Bazel ([#456](https://github.com/TraceMachina/nativelink/issues/456)) - 
([5acfa25](https://github.com/TraceMachina/nativelink/commit/5acfa255703abe2134820881aabeece0efb4edda)) -- Update Bazel to 6.4.0 ([#381](https://github.com/TraceMachina/nativelink/issues/381)) - ([2fb59b6](https://github.com/TraceMachina/nativelink/commit/2fb59b61a026416c88a67849435b1d9acd8aa271)) -- Update Rust version to 1.73.0 ([#371](https://github.com/TraceMachina/nativelink/issues/371)) - ([56eda36](https://github.com/TraceMachina/nativelink/commit/56eda36661daae5458b2821effcdbcbc9d03b753)) -- Reduce flakiness of memory_store_test ([#318](https://github.com/TraceMachina/nativelink/issues/318)) - ([ee1f343](https://github.com/TraceMachina/nativelink/commit/ee1f3436be7db34b0d7adab50e0c29eba9d70968)) -- Make memory_store_test compatible with Windows ([#315](https://github.com/TraceMachina/nativelink/issues/315)) - ([2c7e22b](https://github.com/TraceMachina/nativelink/commit/2c7e22b8d5db04ffc9ce2668a7c2cc35da3cc3f6)) -- Update rules_rust to 0.29.0 - ([d925e26](https://github.com/TraceMachina/nativelink/commit/d925e264efd7300d0d7c229b015e7ab7019d99dd)) -- Update Bazel to 6.3.2 - ([c577db5](https://github.com/TraceMachina/nativelink/commit/c577db5dde9afcb26d24279fe54ae013a1d03730)) -- Introduce get_part_ref() and migrate primary use to .get_part() - ([fb6e1fd](https://github.com/TraceMachina/nativelink/commit/fb6e1fd7741852cfe894a9fa7dda1b1106e8cce0)) -- Update remote_execution.proto to v2.3 - ([4c71336](https://github.com/TraceMachina/nativelink/commit/4c713362e6876396546c6f02c3dc9d4b181e345e)) -- Update all dependencies to their latest versions - ([6a72841](https://github.com/TraceMachina/nativelink/commit/6a7284138c8835ce4abdb61bee3a7d2eb33a7290)) -- Update Bazel to 6.2.1 - ([d30571e](https://github.com/TraceMachina/nativelink/commit/d30571ed5135a0901e37dad5ea6283796357d246)) -- Update dependencies. 
- ([85bf34d](https://github.com/TraceMachina/nativelink/commit/85bf34d9adcd4e57b70b1189da56eb1a7a8d1e31)) -- Update rules_rust to 0.20.0 - ([7a543c2](https://github.com/TraceMachina/nativelink/commit/7a543c2d832fcd8e17d2227eace4811b22601a43)) - -## [1.0.1] - 2022-10-17 - - - -### ⛰️ Features - -- Add support for environmental variable lookup in S3Store config - ([cb0de9e](https://github.com/TraceMachina/nativelink/commit/cb0de9eb40119f7098b4ac0865b4cc5eda8ed374)) -- Add ability to use env variables in config files - ([d54b38e](https://github.com/TraceMachina/nativelink/commit/d54b38e213fb243a9b27622894a1529d614a52fb)) -- Add Send trait to as_any() store calls - ([c4be423](https://github.com/TraceMachina/nativelink/commit/c4be4239aa8813e238eb76f3efc208fa72f0af0a)) -- Add fs module which limits outstanding file handles - ([f7b565f](https://github.com/TraceMachina/nativelink/commit/f7b565f0c525bccd7dc42d529eac64110f15fae5)) -- Add functionality for worker to download and create working dir - ([5e7f9ef](https://github.com/TraceMachina/nativelink/commit/5e7f9efece6a8d4ae0288e14f5bda6a04cf594b0)) -- Adds .as_any() to stores - ([e5de86d](https://github.com/TraceMachina/nativelink/commit/e5de86d78e7d640d492ef97f7c4b98a1f7e9d358)) -- Adds initial implementation for LocalWorker and supporting classes - ([90cff23](https://github.com/TraceMachina/nativelink/commit/90cff230ebb5e7982d780f767aa0b0dc85d87b20)) -- Various minor updates - ([cf6dd3d](https://github.com/TraceMachina/nativelink/commit/cf6dd3db5a9633aa9fa3060395266925c09e9a62)) -- Add shlex package in third_party - ([d935d7f](https://github.com/TraceMachina/nativelink/commit/d935d7f849a362473aed08347e20607f620589bc)) -- Add worker config definitions and rename Metadata to Priority - ([98c4e08](https://github.com/TraceMachina/nativelink/commit/98c4e08e25f1baa0134c61147ee04f736917ef28)) -- Add WorkerApiServer to services being served - 
([af0ccc3](https://github.com/TraceMachina/nativelink/commit/af0ccc3faa419e37d3e0bde7ff44e3d528617643)) -- Add support for keep alive for workers - ([be6f2ee](https://github.com/TraceMachina/nativelink/commit/be6f2ee94b7047d94aef01294b1b37716e80e822)) -- [RE] Add WorkerApiService and connection functionality - ([e8a349c](https://github.com/TraceMachina/nativelink/commit/e8a349c991e4bec40fc5435b26d869acbf6a9ac4)) -- [RE] Various changes to worker_api.proto - ([86220b7](https://github.com/TraceMachina/nativelink/commit/86220b7429e26ad2b8ba10f877c05baebe3c6d71)) -- Add uuid package and update other packages - ([5115bc6](https://github.com/TraceMachina/nativelink/commit/5115bc618be4e1718d437a6be866f57f3bea7099)) -- Add SizePartitioningStore - ([d0112be](https://github.com/TraceMachina/nativelink/commit/d0112be4c0deb0ab46bccee8dc074e977336bc74)) -- Add RefStore and restructure StoreManager - ([6795bb0](https://github.com/TraceMachina/nativelink/commit/6795bb08d84e53e03f573026b9d97e38a0ac41cc)) -- Can now pass json config through CLI & add more sample configs - ([ea4d76d](https://github.com/TraceMachina/nativelink/commit/ea4d76d33fc5130e2b6557f0b8283fe4314adc46)) -- Add nix package and upgrade third_party packages - ([a451628](https://github.com/TraceMachina/nativelink/commit/a451628777c34f21d12f95ffdd407a51a8e5a3bb)) -- Add basic scaffolding for scheduler + remote execution - ([c91f61e](https://github.com/TraceMachina/nativelink/commit/c91f61edf182f2b64451fd48a5e63fa506a43aae)) -- Adds readme to configuration - ([54e8fe7](https://github.com/TraceMachina/nativelink/commit/54e8fe75753876a5feadf800b1b4cfe5dff820d1)) -- Add filesystem store - ([d183cad](https://github.com/TraceMachina/nativelink/commit/d183cad24a14b04e2a0c870324f6f5d482db809b)) -- Adds simple query_write_status support - ([844014a](https://github.com/TraceMachina/nativelink/commit/844014ac9a8ca246b20a6c3fa861ac970cf94caa)) -- Add buf_channel that will be used to help transport bytes around - 
([7e111c1](https://github.com/TraceMachina/nativelink/commit/7e111c13bb78ce80b3007aa325839a47790a3341)) -- Add byteorder to third_party cargo - ([a76a35f](https://github.com/TraceMachina/nativelink/commit/a76a35f813afa2fe570cb0a59e495c41dcd1004b)) -- Adds more eviction templates and functions in prep for filesystem store - ([f2896a7](https://github.com/TraceMachina/nativelink/commit/f2896a798e18569a833fd0d6055bc2d3de59b3a7)) -- Adds FastSlow store that will try the fast store before slow store - ([8c71137](https://github.com/TraceMachina/nativelink/commit/8c711376590a6d657b5207d4d318012322f61f30)) -- Add dedup store - ([2dba31c](https://github.com/TraceMachina/nativelink/commit/2dba31c44a5baeeefe225b4f5e636b41e4747342)) -- Add retry support to get_part in s3_store - ([ea2fc4c](https://github.com/TraceMachina/nativelink/commit/ea2fc4cba95c849e628ecba8b96131aa3378a22e)) -- Add CompressionStore and implement LZ4 compression - ([d6cd4f9](https://github.com/TraceMachina/nativelink/commit/d6cd4f91fa1f7d538a10fc11526adfbc05418fb3)) -- Add s3 configuration - ([be87381](https://github.com/TraceMachina/nativelink/commit/be87381d05f62e6065c04979f3af7be9a2f222d4)) -- Add retry utility in prep for s3_store - ([86e63ee](https://github.com/TraceMachina/nativelink/commit/86e63ee71b0196754774adf23201482a3e272bba)) -- Add async_read_taker in prep for s3_store - ([90222f9](https://github.com/TraceMachina/nativelink/commit/90222f958a116aa6df5f366bd0e8ffde266f4f37)) -- Add trust_size to DigestInfo - ([d8f218f](https://github.com/TraceMachina/nativelink/commit/d8f218f833fa90410f7feb3c3a9f96f6d2f8eb65)) -- Add ability for VerifyStore to check the sha256 hash of the digest - ([40ba2fb](https://github.com/TraceMachina/nativelink/commit/40ba2fb7131dc2946d1adab9f1dfda60b356e282)) -- Add sha2 to Cargo.toml in prep for sha256 checking - ([0eb2dab](https://github.com/TraceMachina/nativelink/commit/0eb2dab83722f500c8261b0ab1308c7bf94a77f3)) -- Add mock_instant library to Cargo.toml - 
([34b9312](https://github.com/TraceMachina/nativelink/commit/34b93120d94d20f0d77b50d9314b98799dd81824)) -- Add maplit to third_party dependencies - ([b09153b](https://github.com/TraceMachina/nativelink/commit/b09153b45fa316ebc6c7db2a746430986cd4e8bb)) -- Add json package dependencies and updates packages - ([69cf723](https://github.com/TraceMachina/nativelink/commit/69cf72367b78cbe5d6a91c1e9a43902cb0e9fad9)) -- Add read stream support - ([5c2db23](https://github.com/TraceMachina/nativelink/commit/5c2db2378ebbd859bdd615ba105c9e3195d8df01)) -- Add drop_guard to Cargo.toml - ([3c147cd](https://github.com/TraceMachina/nativelink/commit/3c147cda0de7ed6b2117ac60db0b9d551cd534da)) -- Add ability to read partial store - ([0b304cc](https://github.com/TraceMachina/nativelink/commit/0b304cc9fec41fbcffe0b1379f4b4660a6957a1c)) -- Add multi-threading and fix some minor performance issues - ([0ed309c](https://github.com/TraceMachina/nativelink/commit/0ed309c0994fe60b6ebfa23024779d3e1170631e)) -- Add DigestInfo utility - ([25bef4a](https://github.com/TraceMachina/nativelink/commit/25bef4aa20ac6bf6c8e2af55d5bb7b4055e87e10)) -- Add much better way to do error logging with .err_tip() - ([9ae49b6](https://github.com/TraceMachina/nativelink/commit/9ae49b64cabb6ceaf9a4de9718ec123e34d76379)) -- Add futures package to Cargo.toml - ([92912e6](https://github.com/TraceMachina/nativelink/commit/92912e6cc786a9716fd29469dab81c603e7718f9)) -- Add Capabilities and Execution api endpoints - ([24dec02](https://github.com/TraceMachina/nativelink/commit/24dec02fe054da8ba3862f8e5057e6a0f42998ed)) -- Add ./rust_fmt.sh - ([5c65005](https://github.com/TraceMachina/nativelink/commit/5c650052e6edf35246c00513e58d7c0fe19e91fc)) -- Add dependent proto files for bazel cas - ([d845d40](https://github.com/TraceMachina/nativelink/commit/d845d404fdc07bd848ea057f7fa7260dc877fb13)) - -### 🐛 Bug Fixes - -- Fix bug if no instance_name/resource_name is given upload does not work - 
([b010b4b](https://github.com/TraceMachina/nativelink/commit/b010b4bd019e3e4cce5e5115b0ff797c45e85d96)) -- Fix scheduler so platform properties are properly restored - ([059b0ef](https://github.com/TraceMachina/nativelink/commit/059b0ef90474ffbb7839fa3764db9dcb31b21cf5)) -- Fix bug on output_files' folders were not being created - ([bb010f2](https://github.com/TraceMachina/nativelink/commit/bb010f2fffca465a6af9afd21db61ae9b2212534)) -- Fix bug where worker was not creating working directory properly - ([4e51b6d](https://github.com/TraceMachina/nativelink/commit/4e51b6d80e284de5d0f7dfcf469900e1af2b610b)) -- Fix wrong `type_url` in google-proto's Any type - ([9cda96a](https://github.com/TraceMachina/nativelink/commit/9cda96a654fed9d997b9ac179f7a69b28af8b6de)) -- Fix bug during .has() call in dedup store - ([5cc9a09](https://github.com/TraceMachina/nativelink/commit/5cc9a09dcf2330d993c68a7510871e17d4321227)) -- Fixed various bugs in filesystem store - ([7ba407d](https://github.com/TraceMachina/nativelink/commit/7ba407d24533a397b49c39f7ee5eb42f3a951415)) -- Fix bug in evicting_map with unref improperly called and readability - ([ea393a5](https://github.com/TraceMachina/nativelink/commit/ea393a520f57c8d23aba565317d56ecce7aa80b8)) -- Fix minor issue in FastSlowStore - ([81fb378](https://github.com/TraceMachina/nativelink/commit/81fb378e0c3d894694c7a830f05b37035393edb2)) -- Fix case where s3 uploads in wrong order - ([4798fe9](https://github.com/TraceMachina/nativelink/commit/4798fe9d7130e98ebeda5a8c27512b042a1058c0)) -- Fix bug in s3_store where 5mb is calculated wrong & improve debugability - ([0451781](https://github.com/TraceMachina/nativelink/commit/0451781a8ab55ddaa93d577e8ceb49daaa1bca62)) -- Fix s3_store - ([efcb653](https://github.com/TraceMachina/nativelink/commit/efcb653ae741f97eb1e65272decc6842e33b424b)) -- Fixed AsyncFixedBuffer - ([519fa9f](https://github.com/TraceMachina/nativelink/commit/519fa9f2c49edb2054a9263940bfa350b4c62306)) -- Minor changes to 
AsyncFixedBuffer - ([a506363](https://github.com/TraceMachina/nativelink/commit/a506363c8a4b8c8171982b4edcb1fbc6eef1f8ac)) -- Fix lifetime of StoreTrait::update() - ([9ec43a2](https://github.com/TraceMachina/nativelink/commit/9ec43a2d5bf408b419fb7a75d976f6668888dc6f)) -- Fix --config debug config to properly add debug symbols - ([90b43c6](https://github.com/TraceMachina/nativelink/commit/90b43c6a5e056543b341004e28385b88b2fca39a)) -- Fix small bug in gen_rs_proto - ([627c0f8](https://github.com/TraceMachina/nativelink/commit/627c0f8ed7bf1098f99fd756c440005a98b2579a)) -- Fix small needless cast to i64 - ([59c609e](https://github.com/TraceMachina/nativelink/commit/59c609e71977a0d3822f85730d4b7844780a366d)) -- Fix bug with verify_store when receiving multiple chunks - ([a78caec](https://github.com/TraceMachina/nativelink/commit/a78caec3927fe6c1b4fdd8bf207013125ff72a30)) -- Fixed typo in debug message when instance_name is not properly set - ([d231ea1](https://github.com/TraceMachina/nativelink/commit/d231ea1f08802e09a1b1f3501b8368d844643a45)) -- Fixed EOF bits and few other items in order to get bazel working - ([8558ee9](https://github.com/TraceMachina/nativelink/commit/8558ee9b51644782eb726638226e338b7605f465)) -- Fix async_fixed_buffers to add get_closer() - ([9225b1f](https://github.com/TraceMachina/nativelink/commit/9225b1fb0c75ed9fd54fa584682eb1bbba3dbab0)) -- Fix memory leak - ([c27685c](https://github.com/TraceMachina/nativelink/commit/c27685c2f7846cb2868bc5ecae9fd697c9e7c1bb)) -- Fix Store import in cas_server.rs - ([a7e7859](https://github.com/TraceMachina/nativelink/commit/a7e7859d485712a7857b7d5a55178e03a8a403a9)) - -### 📚 Documentation - -- Add terraform deployment example and documentation - ([c7dff9f](https://github.com/TraceMachina/nativelink/commit/c7dff9f48169171696fa42654823e6beb82dd6c3)) -- Filesystem store now delays before deleting temp file - ([33d88c5](https://github.com/TraceMachina/nativelink/commit/33d88c5d24943bc7bc134dfbbb6cbd91c62b400a)) 
-- Support deprecated symlink fields & fix bug for workers use CWD - ([00431f9](https://github.com/TraceMachina/nativelink/commit/00431f947b358a7dc95400a361307521c9d1c5ad)) -- FastSlowStore now properly documented and used in LocalWorkerConfig - ([728cb90](https://github.com/TraceMachina/nativelink/commit/728cb90c7765f94460197113feb6d9c7ae6c514b)) - -### 🧪 Testing & CI - -- Adds GrpcStore and first integration tests - ([117e173](https://github.com/TraceMachina/nativelink/commit/117e1733b81e8f71d28dec324a7d9dffd79cb1ca)) -- Fix bug in scheduler of not removing actions after execution - ([f2b825b](https://github.com/TraceMachina/nativelink/commit/f2b825bf436bddb7d24c076b1efc165e5809ff61)) -- Fixes flakey filesystem_store_test - ([717d87a](https://github.com/TraceMachina/nativelink/commit/717d87a89b0ee855c45b6ee6a07c1eafe43029a7)) -- First draft to get remote execution working - ([f207dfa](https://github.com/TraceMachina/nativelink/commit/f207dfaf41226ec568720534c1d28ca2d57ef634)) -- Restructure LocalWorker for easier testing - ([d7d71a1](https://github.com/TraceMachina/nativelink/commit/d7d71a138269ee71d31e9816d6ae2dd90ecd65bc)) -- Fix bug in memory store when receiving a zero byte object - ([52445a1](https://github.com/TraceMachina/nativelink/commit/52445a1c234cef5f065d76c0af938b5744dc732d)) -- Fix github CI badge - ([2758d22](https://github.com/TraceMachina/nativelink/commit/2758d22a086da3a9d16546b702598597cdea2bf9)) -- Adds automated CI tests on pull requests and master - ([e647de0](https://github.com/TraceMachina/nativelink/commit/e647de0ba650bac1b2c785327e34ccb53d68a5d5)) -- Add more basic scheduler support - ([2edf514](https://github.com/TraceMachina/nativelink/commit/2edf514742e27cba2bc12c74539463494800a29c)) -- Dedup store will now bypass deduplication when size is small - ([997be53](https://github.com/TraceMachina/nativelink/commit/997be53c7560bb0dca8fe2ab08831ec172ede7a6)) -- Fix buf in bytestream_server when NotFound was returned - 
([a4634eb](https://github.com/TraceMachina/nativelink/commit/a4634ebf54f2ee4ad8b154c2ed2e5f4e29f8d23a)) -- Upgrade rustc, use new nightly, rules_python, and rustfmt - ([d0c31fb](https://github.com/TraceMachina/nativelink/commit/d0c31fb3b224921a58a9da5e9d746ceb192e9b71)) -- Fix format of util/tests/async_read_taker_test.rs - ([cd12d1d](https://github.com/TraceMachina/nativelink/commit/cd12d1da698d932775ffc32802855a2c3297675b)) -- dummy_test.sh will now print some equal signs when done - ([1227d39](https://github.com/TraceMachina/nativelink/commit/1227d39d4b995e1127743be333e4890220d8aa21)) -- Added single_item_wrong_digest_size test back to stable - ([b517db1](https://github.com/TraceMachina/nativelink/commit/b517db148d1c807bfdc84916801ae3926e805384)) -- Add //:dummy_test that is useful for testing caching - ([e5a1e9a](https://github.com/TraceMachina/nativelink/commit/e5a1e9ad82b2b910738798764e0f367d76496122)) -- Add dummy test that is used for easy caching - ([efd449a](https://github.com/TraceMachina/nativelink/commit/efd449afd665f16f21c81f5618e294658e8e7d32)) -- Add test for bytestream::write() - ([5dc8ac0](https://github.com/TraceMachina/nativelink/commit/5dc8ac0d64a7241bc4f1c54d1376a9f870dfca8c)) -- Add bytestream server scaffolding - ([7aff76f](https://github.com/TraceMachina/nativelink/commit/7aff76f755b731a99adae5f4c2a512c0cf8c5476)) -- Add test for single item update action cache - ([c3d89e1](https://github.com/TraceMachina/nativelink/commit/c3d89e1981d4184928086d5643594b77d3fad433)) -- get_action_result done with tests - ([fcc8a31](https://github.com/TraceMachina/nativelink/commit/fcc8a319f9f4c061612ee43de58e46cea730a2d9)) -- Add first test for ac_server - ([221ed5f](https://github.com/TraceMachina/nativelink/commit/221ed5fbd765c92f7277a1da074563836689c867)) -- Add test and fix bug when querying and using bad hash on .has() - ([9adbe81](https://github.com/TraceMachina/nativelink/commit/9adbe81aa401bb067f3fca0aeb35a3433b2cf97b)) -- Add test for 
batch_read_blobs - ([4b1ae1a](https://github.com/TraceMachina/nativelink/commit/4b1ae1ae70118b8b3b324201c46466b106fe206e)) -- Add tests for invalid memory store requests - ([4f8e5a7](https://github.com/TraceMachina/nativelink/commit/4f8e5a7e2cacd8bcc4370ba3c55825398292c826)) -- Add impl and tests for get store data - ([7922f84](https://github.com/TraceMachina/nativelink/commit/7922f8439c2cb59b7f888f409876971a6c0d59aa)) -- Basic HashMap for memory store and enable store_one_item_existence test - ([5206e74](https://github.com/TraceMachina/nativelink/commit/5206e742b3294633864252e3ff6341d84dd08d64)) -- Add test for store_one_item_existence - ([a6f1a70](https://github.com/TraceMachina/nativelink/commit/a6f1a70cb81de2ef0fe74cdb08401a1cd6828ffe)) -- Add store and first test - ([ed4bde4](https://github.com/TraceMachina/nativelink/commit/ed4bde4310ddedff0e5473295410f1f3d68fce71)) -- Add ability to resolve GetCapabilities and bazel connect testing - ([1aba20c](https://github.com/TraceMachina/nativelink/commit/1aba20c23f2db10277e50cb1ee8ecb51c04c2e10)) - -### ⚙️ Miscellaneous - - Change license to Apache 2 license - ([1147525](https://github.com/TraceMachina/nativelink/commit/11475254245224de09647d130ad078f0abc35168)) - Remove dependency on rust-nightly - ([41028a9](https://github.com/TraceMachina/nativelink/commit/41028a956dd5eeac7166a25b56a7b96a401a2045)) - Enable Gzip compression support to GRPC - ([438afbf](https://github.com/TraceMachina/nativelink/commit/438afbfc2337dc10d6003d169a6c5419e3acce56)) @@ -1503,6 +1725,27 @@ All notable changes to this project will be documented in this file. 
### ⬆️ Bumps & Version Updates +- Update dependency mintlify to v4.0.80 ([#536](https://github.com/TraceMachina/nativelink/issues/536)) - ([7564e5e](https://github.com/TraceMachina/nativelink/commit/7564e5e15e39cdf20f5f868a883af8a0ff7b566c)) +- Update Rust crate http to ^0.2.11 ([#530](https://github.com/TraceMachina/nativelink/issues/530)) - ([ca146ac](https://github.com/TraceMachina/nativelink/commit/ca146ac97a3a22213af4358e0c2d1ebe8fbee6f9)) +- Update native-cargo.yaml Runner Group ([#511](https://github.com/TraceMachina/nativelink/issues/511)) - ([e1843f1](https://github.com/TraceMachina/nativelink/commit/e1843f17c3f957fb8542b6ffcc6784ee2b417ad1)) +- Update protobuf dependencies ([#493](https://github.com/TraceMachina/nativelink/issues/493)) - ([3dacdad](https://github.com/TraceMachina/nativelink/commit/3dacdad203c4c2f238e74d6e5beb7401fb312c55)) +- Bump trivially bumpable deps ([#488](https://github.com/TraceMachina/nativelink/issues/488)) - ([96302cb](https://github.com/TraceMachina/nativelink/commit/96302cbeab6c59966d3dfd3b99fa0933752d1018)) +- Update protos after 1aadd42 ([#489](https://github.com/TraceMachina/nativelink/issues/489)) - ([9c6efe0](https://github.com/TraceMachina/nativelink/commit/9c6efe04acb79e6c75d2d58065d2a8914e3efcc9)) +- Make max_bytes_per_stream optional in config ([#474](https://github.com/TraceMachina/nativelink/issues/474)) - ([a01a552](https://github.com/TraceMachina/nativelink/commit/a01a55272f78ef6916e8dfa0532d4b5cb3789036)) +- Bump Rust version to 1.74 ([#459](https://github.com/TraceMachina/nativelink/issues/459)) - ([5412d7c](https://github.com/TraceMachina/nativelink/commit/5412d7cc15b48b9871d0e73686c89efc43d35b53)) +- Update nightly Rust toolchain for Bazel ([#456](https://github.com/TraceMachina/nativelink/issues/456)) - ([5acfa25](https://github.com/TraceMachina/nativelink/commit/5acfa255703abe2134820881aabeece0efb4edda)) +- Update Bazel to 6.4.0 ([#381](https://github.com/TraceMachina/nativelink/issues/381)) - 
([2fb59b6](https://github.com/TraceMachina/nativelink/commit/2fb59b61a026416c88a67849435b1d9acd8aa271)) +- Update Rust version to 1.73.0 ([#371](https://github.com/TraceMachina/nativelink/issues/371)) - ([56eda36](https://github.com/TraceMachina/nativelink/commit/56eda36661daae5458b2821effcdbcbc9d03b753)) +- Reduce flakiness of memory_store_test ([#318](https://github.com/TraceMachina/nativelink/issues/318)) - ([ee1f343](https://github.com/TraceMachina/nativelink/commit/ee1f3436be7db34b0d7adab50e0c29eba9d70968)) +- Make memory_store_test compatible with Windows ([#315](https://github.com/TraceMachina/nativelink/issues/315)) - ([2c7e22b](https://github.com/TraceMachina/nativelink/commit/2c7e22b8d5db04ffc9ce2668a7c2cc35da3cc3f6)) +- Update rules_rust to 0.29.0 - ([d925e26](https://github.com/TraceMachina/nativelink/commit/d925e264efd7300d0d7c229b015e7ab7019d99dd)) +- Update Bazel to 6.3.2 - ([c577db5](https://github.com/TraceMachina/nativelink/commit/c577db5dde9afcb26d24279fe54ae013a1d03730)) +- Introduce get_part_ref() and migrate primary use to .get_part() - ([fb6e1fd](https://github.com/TraceMachina/nativelink/commit/fb6e1fd7741852cfe894a9fa7dda1b1106e8cce0)) +- Update remote_execution.proto to v2.3 - ([4c71336](https://github.com/TraceMachina/nativelink/commit/4c713362e6876396546c6f02c3dc9d4b181e345e)) +- Update all dependencies to their latest versions - ([6a72841](https://github.com/TraceMachina/nativelink/commit/6a7284138c8835ce4abdb61bee3a7d2eb33a7290)) +- Update Bazel to 6.2.1 - ([d30571e](https://github.com/TraceMachina/nativelink/commit/d30571ed5135a0901e37dad5ea6283796357d246)) +- Update dependencies. 
- ([85bf34d](https://github.com/TraceMachina/nativelink/commit/85bf34d9adcd4e57b70b1189da56eb1a7a8d1e31)) +- Update rules_rust to 0.20.0 - ([7a543c2](https://github.com/TraceMachina/nativelink/commit/7a543c2d832fcd8e17d2227eace4811b22601a43)) - Add minimum bazel version to .bazelversion - ([a2be6f5](https://github.com/TraceMachina/nativelink/commit/a2be6f5a902c28c270fc8a09cb2c26a85587044a)) - Updates cargo packages - ([a610e69](https://github.com/TraceMachina/nativelink/commit/a610e69ea37e3cc281df3ee5f066e9f901ffa3a5)) - Various minor changes - ([2546a77](https://github.com/TraceMachina/nativelink/commit/2546a7797cce995173c37b084d849b2c7080bdbc)) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index cd3557847..4bdb8a2e1 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -462,7 +462,7 @@ NativeLink Code of Conduct is available in the You can generate branch-based coverage reports via: ``` -nix run .#nativelinkCoverageForHost +nix build .#nativelinkCoverageForHost ``` The `result` symlink contains a webpage with the visualized report. diff --git a/Cargo.lock b/Cargo.lock index ed049d9dc..e5cb1b6bb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,15 +2,6 @@ # It is not intended for manual editing. 
version = 4 -[[package]] -name = "addr2line" -version = "0.25.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b5d307320b3181d6d7954e663bd7c774a838b8220fe0593c86d9fb09f498b4b" -dependencies = [ - "gimli", -] - [[package]] name = "adler2" version = "2.0.1" @@ -24,7 +15,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" dependencies = [ "cfg-if", - "getrandom 0.3.3", + "getrandom 0.3.4", "once_cell", "version_check", "zerocopy", @@ -116,6 +107,12 @@ version = "1.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "69f7f8c3906b62b754cd5326047894316021dcfe5a194c8ea52bdd94934a3457" +[[package]] +name = "arcstr" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03918c3dbd7701a85c6b9887732e2921175f26c350b4563841d0958c21d57e6d" + [[package]] name = "arrayref" version = "0.3.9" @@ -157,7 +154,7 @@ checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn", ] [[package]] @@ -250,9 +247,9 @@ dependencies = [ [[package]] name = "aws-sdk-s3" -version = "1.108.0" +version = "1.109.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "200be4aed61e3c0669f7268bacb768f283f1c32a7014ce57225e1160be2f6ccb" +checksum = "3c6d81b75f8ff78882e70c5909804b44553d56136899fb4015a0a68ecc870e0e" dependencies = [ "aws-credential-types", "aws-runtime", @@ -454,7 +451,7 @@ dependencies = [ "http-body 0.4.6", "http-body 1.0.1", "hyper 0.14.32", - "indexmap 2.11.4", + "indexmap 2.12.0", "pin-project-lite", "serde", "serde_json", @@ -612,6 +609,7 @@ dependencies = [ "http 1.3.1", "http-body 1.0.1", "http-body-util", + "hyper-util", "itoa", "matchit", "memchr", @@ -620,7 +618,8 @@ dependencies = [ "pin-project-lite", "serde_core", "sync_wrapper", - "tower 0.5.2", + "tokio", + "tower", 
"tower-layer", "tower-service", ] @@ -644,20 +643,20 @@ dependencies = [ ] [[package]] -name = "backtrace" -version = "0.3.76" +name = "backon" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb531853791a215d7c62a30daf0dde835f381ab5de4589cfe7c649d2cbe92bd6" +checksum = "cffb0e931875b666fc4fcb20fee52e9bbd1ef836fd9e9e04ec21555f9f85f7ef" dependencies = [ - "addr2line", - "cfg-if", - "libc", - "miniz_oxide", - "object", - "rustc-demangle", - "windows-link", + "fastrand", ] +[[package]] +name = "base16ct" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c7f02d4ea65f2c1853089ffd8d2787bdbc63de2f0d29dedbcf8ccdfa0ccd4cf" + [[package]] name = "base64" version = "0.13.1" @@ -704,9 +703,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" -version = "2.9.4" +version = "2.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2261d10cca569e4643e526d8dc2e62e433cc8aba21ab764233731f8d369bf394" +checksum = "812e12b5285cc515a9c72a5c1d3b6d46a19dac5acfef5265968c166106e31dd3" [[package]] name = "bitvec" @@ -762,9 +761,9 @@ dependencies = [ "base64 0.22.1", "bitvec", "getrandom 0.2.16", - "getrandom 0.3.3", + "getrandom 0.3.4", "hex", - "indexmap 2.11.4", + "indexmap 2.12.0", "js-sys", "once_cell", "rand 0.9.2", @@ -805,9 +804,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.10.1" +version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a" +checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" [[package]] name = "bytes-utils" @@ -840,9 +839,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.40" +version = "1.2.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"e1d05d92f4b1fd76aad469d46cdd858ca761576082cd37df81416691e50199fb" +checksum = "ac9fe6cdbb24b6ade63616c0a0688e45bb56732262c158df3c0c4bea4ca47cb7" dependencies = [ "find-msvc-tools", "jobserver", @@ -858,9 +857,9 @@ checksum = "6d43a04d8753f35258c91f8ec639f792891f748a1edbd759cf1dcea3382ad83c" [[package]] name = "cfg-if" -version = "1.0.3" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2fd1289c04a9ea8cb22300a459a72a385d7c73d3259e2ed7dcb2af674838cfa9" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" [[package]] name = "cfg_aliases" @@ -909,9 +908,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.48" +version = "4.5.50" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2134bb3ea021b78629caa971416385309e0131b351b25e01dc16fb54e1b5fae" +checksum = "0c2cfd7bf8a6017ddaa4e32ffe7403d547790db06bd171c1c53926faab501623" dependencies = [ "clap_builder", "clap_derive", @@ -919,9 +918,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.48" +version = "4.5.50" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2ba64afa3c0a6df7fa517765e31314e983f51dda798ffba27b988194fb65dc9" +checksum = "0a4c05b9e80c5ccd3a7ef080ad7b6ba7d6fc00a985b8b157197075677c82c7a0" dependencies = [ "anstream", "anstyle", @@ -931,21 +930,21 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.5.47" +version = "4.5.49" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbfd7eae0b0f1a6e63d4b13c9c478de77c2eb546fba158ad50b4203dc24b9f9c" +checksum = "2a0b5487afeab2deb2ff4e03a807ad1a03ac532ff5a2cee5d86884440c7f7671" dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.106", + "syn", ] [[package]] name = "clap_lex" -version = "0.7.5" +version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b94f61472cee1439c0b966b47e3aca9ae07e45d070759512cd390ea2bebc6675" +checksum = 
"a1d728cc89cf3aee9ff92b05e62b19ee65a02b5702cff7d5a377e32c6ae29d8d" [[package]] name = "colorchoice" @@ -960,7 +959,11 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ba5a308b75df32fe02788e748662718f03fde005016435c444eea572398219fd" dependencies = [ "bytes", + "futures-core", "memchr", + "pin-project-lite", + "tokio", + "tokio-util", ] [[package]] @@ -1104,6 +1107,30 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "critical-section" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "790eea4361631c5e7d22598ecd5723ff611904e3344ce8720784c93e3d83d40b" + +[[package]] +name = "crossbeam-channel" +version = "0.5.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "82b8f8f868b36967f9606790d1903570de9ceaf870a7bf9fbbd3016d636a2cb2" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + [[package]] name = "crossbeam-utils" version = "0.8.21" @@ -1116,6 +1143,18 @@ version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" +[[package]] +name = "crypto-bigint" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0dc92fb57ca44df6db8059111ab3af99a63d5d0f8375d9972e319a379c6bab76" +dependencies = [ + "generic-array", + "rand_core 0.6.4", + "subtle", + "zeroize", +] + [[package]] name = "crypto-common" version = "0.1.6" @@ -1126,6 +1165,33 @@ dependencies = [ "typenum", ] +[[package]] +name = "curve25519-dalek" +version = "4.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97fb8b7c4503de7d6ae7b42ab72a5a59857b4c937ec27a3d4539dba95b5ab2be" +dependencies = [ + 
"cfg-if", + "cpufeatures", + "curve25519-dalek-derive", + "digest", + "fiat-crypto", + "rustc_version", + "subtle", + "zeroize", +] + +[[package]] +name = "curve25519-dalek-derive" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f46882e17999c6cc590af592290432be3bce0428cb0d5f8b6715e4dc7b383eb3" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "darling" version = "0.21.3" @@ -1147,7 +1213,7 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn 2.0.106", + "syn", ] [[package]] @@ -1158,7 +1224,7 @@ checksum = "d38308df82d1080de0afee5d069fa14b0326a88c14f15c5ccda35b4a6c414c81" dependencies = [ "darling_core", "quote", - "syn 2.0.106", + "syn", ] [[package]] @@ -1188,17 +1254,6 @@ dependencies = [ "serde_core", ] -[[package]] -name = "derivative" -version = "2.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fcc3dd5e9e9c0b295d6e1e4d811fb6f157d5ffd784b8d202fc62eac8035a770b" -dependencies = [ - "proc-macro2", - "quote", - "syn 1.0.109", -] - [[package]] name = "derive-syn-parse" version = "0.2.0" @@ -1207,7 +1262,7 @@ checksum = "d65d7ce8132b7c0e54497a4d9a55a1c2a0912a0d786cf894472ba818fba45762" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn", ] [[package]] @@ -1218,7 +1273,7 @@ checksum = "ef941ded77d15ca19b40374869ac6000af1c9f2a4c0f3d4c70926287e6364a8f" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn", ] [[package]] @@ -1231,7 +1286,29 @@ dependencies = [ "proc-macro2", "quote", "rustc_version", - "syn 2.0.106", + "syn", +] + +[[package]] +name = "derive_more" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "10b768e943bed7bf2cab53df09f4bc34bfd217cdb57d971e769874c9a6710618" +dependencies = [ + "derive_more-impl", +] + +[[package]] +name = "derive_more-impl" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"6d286bfdaf75e988b4a78e013ecd79c581e06399ab53fbacd2d916c2f904f30b" +dependencies = [ + "proc-macro2", + "quote", + "rustc_version", + "syn", + "unicode-xid", ] [[package]] @@ -1247,6 +1324,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" dependencies = [ "block-buffer", + "const-oid", "crypto-common", "subtle", ] @@ -1259,14 +1337,52 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn", ] [[package]] name = "dyn-clone" -version = "1.0.20" +version = "1.0.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d0881ea181b1df73ff77ffaaf9c7544ecc11e82fba9b5f27b262a3c73a332555" +checksum = "1c7a8fb8a9fbf66c1f703fe16184d10ca0ee9d23be5b4436400408ba54a95005" + +[[package]] +name = "ecdsa" +version = "0.16.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee27f32b5c5292967d2d4a9d7f1e0b0aed2c15daded5a60300e4abb9d8020bca" +dependencies = [ + "der", + "digest", + "elliptic-curve", + "rfc6979", + "signature", + "spki", +] + +[[package]] +name = "ed25519" +version = "2.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "115531babc129696a58c64a4fef0a8bf9e9698629fb97e9e40767d235cfbcd53" +dependencies = [ + "pkcs8", + "signature", +] + +[[package]] +name = "ed25519-dalek" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70e796c081cee67dc755e1a36a0a172b897fab85fc3f6bc48307991f64e4eca9" +dependencies = [ + "curve25519-dalek", + "ed25519", + "serde", + "sha2", + "subtle", + "zeroize", +] [[package]] name = "either" @@ -1274,6 +1390,27 @@ version = "1.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" +[[package]] +name = "elliptic-curve" +version = "0.13.8" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5e6043086bf7973472e0c7dff2142ea0b680d30e18d9cc40f267efbf222bd47" +dependencies = [ + "base16ct", + "crypto-bigint", + "digest", + "ff", + "generic-array", + "group", + "hkdf", + "pem-rfc7468", + "pkcs8", + "rand_core 0.6.4", + "sec1", + "subtle", + "zeroize", +] + [[package]] name = "encoding_rs" version = "0.8.35" @@ -1283,6 +1420,18 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "enum-as-inner" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1e6a265c649f3f5979b601d26f1d05ada116434c87741c9493cb56218f76cbc" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "equivalent" version = "1.0.2" @@ -1326,6 +1475,22 @@ version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" +[[package]] +name = "ff" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0b50bfb653653f9ca9095b427bed08ab8d75a137839d9ad64eb11810d5b6393" +dependencies = [ + "rand_core 0.6.4", + "subtle", +] + +[[package]] +name = "fiat-crypto" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28dea519a9695b9977216879a3ebfddf92f1c08c05d984f8996aecd6ecdc811d" + [[package]] name = "filetime" version = "0.2.26" @@ -1340,9 +1505,9 @@ dependencies = [ [[package]] name = "find-msvc-tools" -version = "0.1.3" +version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0399f9d26e5191ce32c498bebd31e7a3ceabc2745f0ac54af3f335126c3f24b3" +checksum = "52051878f80a721bb68ebfbc930e07b65ba72f2da88968ea5c06fd6ca3d3a127" [[package]] name = "fixedbitset" @@ -1360,15 +1525,6 @@ dependencies = [ "miniz_oxide", ] -[[package]] -name = "float-cmp" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "b09cf3155332e944990140d967ff5eceb70df778b34f77d8075db46e4704e6d8" -dependencies = [ - "num-traits", -] - [[package]] name = "fnv" version = "1.0.7" @@ -1396,48 +1552,6 @@ version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d8866fac38f53fc87fa3ae1b09ddd723e0482f8fa74323518b4c59df2c55a00a" -[[package]] -name = "fred" -version = "10.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a7b2fd0f08b23315c13b6156f971aeedb6f75fb16a29ac1872d2eabccc1490e" -dependencies = [ - "arc-swap", - "async-trait", - "bytes", - "bytes-utils", - "float-cmp", - "fred-macros", - "futures", - "glob-match", - "log", - "parking_lot", - "rand 0.8.5", - "redis-protocol", - "rustls", - "rustls-native-certs", - "semver", - "sha-1", - "socket2 0.5.10", - "tokio", - "tokio-rustls", - "tokio-stream", - "tokio-util", - "url", - "urlencoding", -] - -[[package]] -name = "fred-macros" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1458c6e22d36d61507034d5afecc64f105c1d39712b7ac6ec3b352c423f715cc" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.106", -] - [[package]] name = "funty" version = "2.0.0" @@ -1500,7 +1614,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn", ] [[package]] @@ -1535,9 +1649,9 @@ dependencies = [ [[package]] name = "gcloud-auth" -version = "1.1.2" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce5aa2c8f36c2be2c352fcf62b221d92fab43fbdc6e8a379eec7354d6e77e1b4" +checksum = "5bdedbc36e6b9d8d79558fbf2ebc098745bc721e9d37d3e369558e420038e360" dependencies = [ "async-trait", "base64 0.22.1", @@ -1599,12 +1713,13 @@ dependencies = [ [[package]] name = "generic-array" -version = "0.14.7" +version = "0.14.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +checksum = "4bb6743198531e02858aeaea5398fcc883e71851fcbcb5a2f773e2fb6cb1edf2" dependencies = [ "typenum", "version_check", + "zeroize", ] [[package]] @@ -1616,41 +1731,51 @@ dependencies = [ "cfg-if", "js-sys", "libc", - "wasi 0.11.1+wasi-snapshot-preview1", + "wasi", "wasm-bindgen", ] [[package]] name = "getrandom" -version = "0.3.3" +version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26145e563e54f2cadc477553f1ec5ee650b00862f0a58bcd12cbdc5f0ea2d2f4" +checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" dependencies = [ "cfg-if", "js-sys", "libc", "r-efi", - "wasi 0.14.7+wasi-0.2.4", + "wasip2", "wasm-bindgen", ] [[package]] -name = "gimli" -version = "0.32.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e629b9b98ef3dd8afe6ca2bd0f89306cec16d43d907889945bc5d6687f2f13c7" - -[[package]] -name = "glob" -version = "0.3.3" +name = "ginepro" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280" +checksum = "9197cb67b35f86badd2e5a66c3a651d037a398247a394399d80700ef07ba662b" +dependencies = [ + "anyhow", + "async-trait", + "hickory-resolver", + "http 1.3.1", + "thiserror 2.0.17", + "tokio", + "tonic", + "tower", + "tracing", +] [[package]] -name = "glob-match" -version = "0.2.1" +name = "group" +version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9985c9503b412198aa4197559e9a318524ebc4519c229bfa05a535828c950b9d" +checksum = "f0f9ef7462f7c099f518d754361858f86d8a07af53ba9af0fe635bbccb151a63" +dependencies = [ + "ff", + "rand_core 0.6.4", + "subtle", +] [[package]] name = "h2" @@ -1664,7 +1789,7 @@ dependencies = [ "futures-sink", "futures-util", "http 0.2.12", - "indexmap 2.11.4", + "indexmap 2.12.0", "slab", "tokio", "tokio-util", @@ -1683,7 +1808,7 @@ 
dependencies = [ "futures-core", "futures-sink", "http 1.3.1", - "indexmap 2.11.4", + "indexmap 2.12.0", "slab", "tokio", "tokio-util", @@ -1692,9 +1817,9 @@ dependencies = [ [[package]] name = "half" -version = "2.7.0" +version = "2.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e54c115d4f30f52c67202f079c5f9d8b49db4691f460fdb0b4c2e838261b2ba5" +checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b" dependencies = [ "cfg-if", "crunchy", @@ -1736,6 +1861,61 @@ version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" +[[package]] +name = "hickory-proto" +version = "0.25.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8a6fe56c0038198998a6f217ca4e7ef3a5e51f46163bd6dd60b5c71ca6c6502" +dependencies = [ + "async-trait", + "cfg-if", + "data-encoding", + "enum-as-inner", + "futures-channel", + "futures-io", + "futures-util", + "idna", + "ipnet", + "once_cell", + "rand 0.9.2", + "ring", + "thiserror 2.0.17", + "tinyvec", + "tokio", + "tracing", + "url", +] + +[[package]] +name = "hickory-resolver" +version = "0.25.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc62a9a99b0bfb44d2ab95a7208ac952d31060efc16241c87eaf36406fecf87a" +dependencies = [ + "cfg-if", + "futures-util", + "hickory-proto", + "ipconfig", + "moka", + "once_cell", + "parking_lot", + "rand 0.9.2", + "resolv-conf", + "smallvec", + "thiserror 2.0.17", + "tokio", + "tracing", +] + +[[package]] +name = "hkdf" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b5f8eb2ad728638ea2c7d47a21db23b7b58a72ed6a38256b8a1849f15fbbdf7" +dependencies = [ + "hmac", +] + [[package]] name = "hmac" version = "0.12.1" @@ -1925,7 +2105,7 @@ dependencies = [ "libc", "percent-encoding", "pin-project-lite", - "socket2 0.6.0", + "socket2 0.6.1", "tokio", 
"tower-service", "tracing", @@ -2081,9 +2261,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.11.4" +version = "2.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4b0f83760fb341a774ed326568e19f5a863af4a952def8c39f9ab92fd95b88e5" +checksum = "6717a8d2a5a929a1a2eb43a12812498ed141a0bcfb7e8f7844fbdbe4303bba9f" dependencies = [ "equivalent", "hashbrown 0.16.0", @@ -2092,14 +2272,15 @@ dependencies = [ ] [[package]] -name = "io-uring" -version = "0.7.10" +name = "ipconfig" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "046fa2d4d00aea763528b4950358d0ead425372445dc8ff86312b3c69ff7727b" +checksum = "b58db92f96b720de98181bbbe63c831e87005ab460c1bf306eb2622b4707997f" dependencies = [ - "bitflags 2.9.4", - "cfg-if", - "libc", + "socket2 0.5.10", + "widestring", + "windows-sys 0.48.0", + "winreg", ] [[package]] @@ -2120,9 +2301,9 @@ dependencies = [ [[package]] name = "is_terminal_polyfill" -version = "1.70.1" +version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" +checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" [[package]] name = "itertools" @@ -2167,7 +2348,7 @@ version = "0.1.34" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33" dependencies = [ - "getrandom 0.3.3", + "getrandom 0.3.4", "libc", ] @@ -2183,16 +2364,24 @@ dependencies = [ [[package]] name = "jsonwebtoken" -version = "9.3.1" +version = "10.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a87cc7a48537badeae96744432de36f4be2b4a34a05a5ef32e9dd8a1c169dde" +checksum = "0529410abe238729a60b108898784df8984c87f6054c9c4fcacc47e4803c1ce1" dependencies = [ "base64 0.22.1", + "ed25519-dalek", + "getrandom 0.2.16", + "hmac", "js-sys", + "p256", + "p384", "pem", - 
"ring", + "rand 0.8.5", + "rsa", "serde", "serde_json", + "sha2", + "signature", "simple_asn1", ] @@ -2201,6 +2390,9 @@ name = "lazy_static" version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" +dependencies = [ + "spin", +] [[package]] name = "libc" @@ -2208,6 +2400,12 @@ version = "0.2.177" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2874a2af47a2325c2001a6e6fad9b16a53b802102b528163885171cf92b15976" +[[package]] +name = "libm" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9fbbcab51052fe104eb5e5d351cf728d30a5be1fe14d9be8a3b097481fb97de" + [[package]] name = "libmimalloc-sys" version = "0.1.44" @@ -2224,7 +2422,7 @@ version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "416f7e718bdb06000964960ffa43b4335ad4012ae8b99060261aa4a8088d5ccb" dependencies = [ - "bitflags 2.9.4", + "bitflags 2.10.0", "libc", "redox_syscall", ] @@ -2267,9 +2465,9 @@ dependencies = [ [[package]] name = "lru" -version = "0.13.0" +version = "0.16.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "227748d55f2f0ab4735d87fd623798cb6b664512fe979705f829c9f81c934465" +checksum = "a1dc47f592c06f33f8e3aea9591776ec7c9f9e4124778ff8a3c3b87159f7e593" [[package]] name = "lru-slab" @@ -2292,7 +2490,7 @@ dependencies = [ "macro_magic_core", "macro_magic_macros", "quote", - "syn 2.0.106", + "syn", ] [[package]] @@ -2306,7 +2504,7 @@ dependencies = [ "macro_magic_core_macros", "proc-macro2", "quote", - "syn 2.0.106", + "syn", ] [[package]] @@ -2317,7 +2515,7 @@ checksum = "b02abfe41815b5bd98dbd4260173db2c116dda171dc0fe7838cb206333b83308" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn", ] [[package]] @@ -2328,7 +2526,7 @@ checksum = "73ea28ee64b88876bf45277ed9a5817c1817df061a74f2b988971a12570e5869" dependencies = [ "macro_magic_core", 
"quote", - "syn 2.0.106", + "syn", ] [[package]] @@ -2364,9 +2562,9 @@ checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273" [[package]] name = "memmap2" -version = "0.9.8" +version = "0.9.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "843a98750cd611cc2965a8213b53b43e715f13c37a9e096c6408e69990961db7" +checksum = "744133e4a0e0a658e1374cf3bf8e415c4052a15a111acd372764c55b4177d490" dependencies = [ "libc", ] @@ -2424,13 +2622,13 @@ dependencies = [ [[package]] name = "mio" -version = "1.0.4" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78bed444cc8a2160f01cbcf811ef18cac863ad68ae8ca62092e8db51d51c761c" +checksum = "69d83b0086dc8ecf3ce9ae2874b2d1290252e2a30720bea58a5c6639b0092873" dependencies = [ "libc", - "wasi 0.11.1+wasi-snapshot-preview1", - "windows-sys 0.59.0", + "wasi", + "windows-sys 0.61.2", ] [[package]] @@ -2439,6 +2637,24 @@ version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4e1d4c44418358edcac6e1d9ce59cea7fb38052429c7704033f1196f0c179e6a" +[[package]] +name = "moka" +version = "0.12.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8261cd88c312e0004c1d51baad2980c66528dfdb2bee62003e643a4d8f86b077" +dependencies = [ + "crossbeam-channel", + "crossbeam-epoch", + "crossbeam-utils", + "equivalent", + "parking_lot", + "portable-atomic", + "rustc_version", + "smallvec", + "tagptr", + "uuid", +] + [[package]] name = "mongocrypt" version = "0.3.1" @@ -2469,7 +2685,7 @@ dependencies = [ "bson", "chrono", "derive-where", - "derive_more", + "derive_more 0.99.20", "futures-core", "futures-executor", "futures-io", @@ -2514,7 +2730,7 @@ dependencies = [ "macro_magic", "proc-macro2", "quote", - "syn 2.0.106", + "syn", ] [[package]] @@ -2525,39 +2741,45 @@ checksum = "1d87ecb2933e8aeadb3e3a02b828fed80a7528047e68b4f424523a0981a3a084" [[package]] name = "nativelink" -version = "0.7.3" +version 
= "1.0.0-rc4" dependencies = [ "async-lock", "axum", + "bytes", "clap", "futures", + "hex", "hyper 1.7.0", "hyper-util", "mimalloc", "nativelink-config", "nativelink-error", + "nativelink-proto", "nativelink-scheduler", "nativelink-service", "nativelink-store", "nativelink-util", "nativelink-worker", - "rustls-pemfile", + "rand 0.9.2", + "rustls-pki-types", + "sha2", "tokio", "tokio-rustls", - "tonic 0.13.1", - "tower 0.5.2", + "tonic", + "tower", "tracing", ] [[package]] name = "nativelink-config" -version = "0.7.3" +version = "1.0.0-rc4" dependencies = [ "byte-unit", "humantime", "nativelink-error", "pretty_assertions", "rand 0.9.2", + "schemars 1.2.1", "serde", "serde_json", "serde_json5", @@ -2568,31 +2790,35 @@ dependencies = [ [[package]] name = "nativelink-error" -version = "0.7.3" +version = "1.0.0-rc4" dependencies = [ - "fred", "nativelink-metric", "nativelink-proto", "prost", "prost-types", + "redis", + "rustls-pki-types", "serde", "serde_json5", "tokio", - "tonic 0.13.1", + "tonic", + "url", + "uuid", + "walkdir", ] [[package]] name = "nativelink-macro" -version = "0.7.3" +version = "1.0.0-rc4" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn", ] [[package]] name = "nativelink-metric" -version = "0.7.3" +version = "1.0.0-rc4" dependencies = [ "async-lock", "nativelink-metric-macro-derive", @@ -2603,41 +2829,54 @@ dependencies = [ [[package]] name = "nativelink-metric-macro-derive" -version = "0.6.0" +version = "0.8.0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn", ] [[package]] name = "nativelink-proto" -version = "0.7.3" +version = "1.0.0-rc4" dependencies = [ - "derivative", + "derive_more 2.1.0", "prost", "prost-build", "prost-types", - "tonic 0.13.1", + "tonic", "tonic-build", ] +[[package]] +name = "nativelink-redis-tester" +version = "1.0.0-rc4" +dependencies = [ + "either", + "nativelink-util", + "redis", + "redis-protocol", + "redis-test", + "tokio", + "tracing", +] + [[package]] name = "nativelink-scheduler" 
-version = "0.7.3" +version = "1.0.0-rc4" dependencies = [ "async-lock", "async-trait", "bytes", - "fred", "futures", - "lru 0.13.0", + "lru 0.16.3", "mock_instant", "nativelink-config", "nativelink-error", "nativelink-macro", "nativelink-metric", "nativelink-proto", + "nativelink-redis-tester", "nativelink-store", "nativelink-util", "opentelemetry", @@ -2645,13 +2884,14 @@ dependencies = [ "parking_lot", "pretty_assertions", "prost", + "redis", "scopeguard", "serde", "serde_json", "static_assertions", "tokio", "tokio-stream", - "tonic 0.13.1", + "tonic", "tracing", "tracing-test", "uuid", @@ -2659,7 +2899,7 @@ dependencies = [ [[package]] name = "nativelink-service" -version = "0.7.3" +version = "1.0.0-rc4" dependencies = [ "async-lock", "async-trait", @@ -2690,8 +2930,8 @@ dependencies = [ "sha2", "tokio", "tokio-stream", - "tonic 0.13.1", - "tower 0.5.2", + "tonic", + "tower", "tracing", "tracing-test", "uuid", @@ -2699,7 +2939,7 @@ dependencies = [ [[package]] name = "nativelink-store" -version = "0.7.3" +version = "1.0.0-rc4" dependencies = [ "async-lock", "async-trait", @@ -2713,9 +2953,7 @@ dependencies = [ "blake3", "byteorder", "bytes", - "bytes-utils", "const_format", - "fred", "futures", "gcloud-auth", "gcloud-storage", @@ -2726,6 +2964,7 @@ dependencies = [ "hyper 1.7.0", "hyper-rustls", "hyper-util", + "itertools", "lz4_flex", "memory-stats", "mock_instant", @@ -2735,6 +2974,7 @@ dependencies = [ "nativelink-macro", "nativelink-metric", "nativelink-proto", + "nativelink-redis-tester", "nativelink-util", "opentelemetry", "parking_lot", @@ -2742,9 +2982,13 @@ dependencies = [ "pretty_assertions", "prost", "rand 0.9.2", + "redis", + "redis-test", "regex", + "reqwest", + "reqwest-middleware", "rustls", - "rustls-pemfile", + "rustls-pki-types", "serde", "serde_json", "sha2", @@ -2752,27 +2996,32 @@ dependencies = [ "tokio", "tokio-stream", "tokio-util", - "tonic 0.13.1", + "tonic", "tracing", "tracing-test", + "url", "uuid", ] [[package]] name = 
"nativelink-util" -version = "0.7.3" +version = "1.0.0-rc4" dependencies = [ "async-trait", + "axum", "base64 0.22.1", - "bitflags 2.9.4", + "bitflags 2.10.0", "blake3", "bytes", "futures", + "ginepro", "hex", "http-body-util", + "humantime", "hyper 1.7.0", "hyper-util", - "lru 0.13.0", + "libc", + "lru 0.16.3", "mock_instant", "nativelink-config", "nativelink-error", @@ -2800,18 +3049,20 @@ dependencies = [ "tokio", "tokio-stream", "tokio-util", - "tonic 0.13.1", - "tower 0.5.2", + "tonic", + "tower", "tracing", "tracing-opentelemetry", "tracing-subscriber", "tracing-test", + "url", "uuid", + "walkdir", ] [[package]] name = "nativelink-worker" -version = "0.7.3" +version = "1.0.0-rc4" dependencies = [ "async-lock", "bytes", @@ -2838,9 +3089,10 @@ dependencies = [ "serde_json5", "serial_test", "shlex", + "tempfile", "tokio", "tokio-stream", - "tonic 0.13.1", + "tonic", "tracing", "tracing-test", "uuid", @@ -2858,11 +3110,11 @@ dependencies = [ [[package]] name = "nu-ansi-term" -version = "0.50.1" +version = "0.50.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4a28e057d01f97e61255210fcff094d74ed0466038633e95017f5beb68e4399" +checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5" dependencies = [ - "windows-sys 0.52.0", + "windows-sys 0.61.2", ] [[package]] @@ -2875,6 +3127,22 @@ dependencies = [ "num-traits", ] +[[package]] +name = "num-bigint-dig" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e661dda6640fad38e827a6d4a310ff4763082116fe217f279885c97f511bb0b7" +dependencies = [ + "lazy_static", + "libm", + "num-integer", + "num-iter", + "num-traits", + "rand 0.8.5", + "smallvec", + "zeroize", +] + [[package]] name = "num-conv" version = "0.1.0" @@ -2890,6 +3158,17 @@ dependencies = [ "num-traits", ] +[[package]] +name = "num-iter" +version = "0.1.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + [[package]] name = "num-rational" version = "0.4.2" @@ -2908,15 +3187,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" dependencies = [ "autocfg", -] - -[[package]] -name = "object" -version = "0.37.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff76201f031d8863c38aa7f905eca4f53abbfa15f609db4277d44cd8938f33fe" -dependencies = [ - "memchr", + "libm", ] [[package]] @@ -2924,12 +3195,16 @@ name = "once_cell" version = "1.21.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" +dependencies = [ + "critical-section", + "portable-atomic", +] [[package]] name = "once_cell_polyfill" -version = "1.70.1" +version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4895175b425cb1f87721b59f0f286c2092bd4af812243672510e1ac53e2e0ad" +checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" [[package]] name = "openssl-probe" @@ -2939,9 +3214,9 @@ checksum = "d05e27ee213611ffe7d6348b942e8f942b37114c00cc03cec254295a4a17852e" [[package]] name = "opentelemetry" -version = "0.29.1" +version = "0.30.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e87237e2775f74896f9ad219d26a2081751187eb7c9f5c58dde20a23b95d16c" +checksum = "aaf416e4cb72756655126f7dd7bb0af49c674f4c1b9903e80c009e0c37e552e6" dependencies = [ "futures-core", "futures-sink", @@ -2953,9 +3228,9 @@ dependencies = [ [[package]] name = "opentelemetry-appender-tracing" -version = "0.29.1" +version = "0.30.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e716f864eb23007bdd9dc4aec381e188a1cee28eecf22066772b5fd822b9727d" +checksum = 
"e68f63eca5fad47e570e00e893094fc17be959c80c79a7d6ec1abdd5ae6ffc16" dependencies = [ "opentelemetry", "tracing", @@ -2965,9 +3240,9 @@ dependencies = [ [[package]] name = "opentelemetry-http" -version = "0.29.0" +version = "0.30.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46d7ab32b827b5b495bd90fa95a6cb65ccc293555dcc3199ae2937d2d237c8ed" +checksum = "50f6639e842a97dbea8886e3439710ae463120091e2e064518ba8e716e6ac36d" dependencies = [ "async-trait", "bytes", @@ -2977,11 +3252,10 @@ dependencies = [ [[package]] name = "opentelemetry-otlp" -version = "0.29.0" +version = "0.30.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d899720fe06916ccba71c01d04ecd77312734e2de3467fd30d9d580c8ce85656" +checksum = "dbee664a43e07615731afc539ca60c6d9f1a9425e25ca09c57bc36c87c55852b" dependencies = [ - "futures-core", "http 1.3.1", "opentelemetry", "opentelemetry-proto", @@ -2989,37 +3263,36 @@ dependencies = [ "prost", "thiserror 2.0.17", "tokio", - "tonic 0.12.3", + "tonic", ] [[package]] name = "opentelemetry-proto" -version = "0.29.0" +version = "0.30.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c40da242381435e18570d5b9d50aca2a4f4f4d8e146231adb4e7768023309b3" +checksum = "2e046fd7660710fe5a05e8748e70d9058dc15c94ba914e7c4faa7c728f0e8ddc" dependencies = [ "opentelemetry", "opentelemetry_sdk", "prost", - "tonic 0.12.3", + "tonic", ] [[package]] name = "opentelemetry-semantic-conventions" -version = "0.29.0" +version = "0.30.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "84b29a9f89f1a954936d5aa92f19b2feec3c8f3971d3e96206640db7f9706ae3" +checksum = "83d059a296a47436748557a353c5e6c5705b9470ef6c95cfc52c21a8814ddac2" [[package]] name = "opentelemetry_sdk" -version = "0.29.0" +version = "0.30.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "afdefb21d1d47394abc1ba6c57363ab141be19e27cc70d0e422b7f303e4d290b" +checksum = 
"11f644aa9e5e31d11896e024305d7e3c98a88884d9f8919dbf37a9991bc47a4b" dependencies = [ "futures-channel", "futures-executor", "futures-util", - "glob", "opentelemetry", "percent-encoding", "rand 0.9.2", @@ -3033,6 +3306,30 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1a80800c0488c3a21695ea981a54918fbb37abf04f4d0720c453632255e2ff0e" +[[package]] +name = "p256" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c9863ad85fa8f4460f9c48cb909d38a0d689dba1f6f6988a5e3e0d31071bcd4b" +dependencies = [ + "ecdsa", + "elliptic-curve", + "primeorder", + "sha2", +] + +[[package]] +name = "p384" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fe42f1670a52a47d448f14b6a5c61dd78fce51856e68edaa38f7ae3a46b8d6b6" +dependencies = [ + "ecdsa", + "elliptic-curve", + "primeorder", + "sha2", +] + [[package]] name = "parking" version = "2.2.1" @@ -3068,7 +3365,7 @@ version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "edb45b6331bbdbb54c9a29413703e892ab94f83a31e4a546c778495a91e7fbca" dependencies = [ - "bitflags 2.9.4", + "bitflags 2.10.0", ] [[package]] @@ -3082,12 +3379,12 @@ dependencies = [ [[package]] name = "pem" -version = "3.0.5" +version = "3.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38af38e8470ac9dee3ce1bae1af9c1671fffc44ddfd8bd1d0a3445bf349a8ef3" +checksum = "1d30c53c26bc5b31a98cd02d20f25a7c8567146caf63ed593a9d87b2775291be" dependencies = [ "base64 0.22.1", - "serde", + "serde_core", ] [[package]] @@ -3135,7 +3432,7 @@ dependencies = [ "pest_meta", "proc-macro2", "quote", - "syn 2.0.106", + "syn", ] [[package]] @@ -3155,7 +3452,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3672b37090dbd86368a4145bc067582552b29c27377cad4e0a306c97f9bd7772" dependencies = [ "fixedbitset", - "indexmap 2.11.4", + "indexmap 2.12.0", ] 
[[package]] @@ -3175,7 +3472,7 @@ checksum = "6e918e4ff8c4549eb882f14b3a4bc8c8bc93de829416eacf579f1207a8fbf861" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn", ] [[package]] @@ -3190,6 +3487,17 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +[[package]] +name = "pkcs1" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8ffb9f10fa047879315e6625af03c164b16962a5368d724ed16323b68ace47f" +dependencies = [ + "der", + "pkcs8", + "spki", +] + [[package]] name = "pkcs8" version = "0.10.2" @@ -3206,6 +3514,12 @@ version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" +[[package]] +name = "portable-atomic" +version = "1.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f84267b20a16ea918e43c6a88433c2d54fa145c92a811b5b047ccbe153674483" + [[package]] name = "potential_utf" version = "0.1.3" @@ -3247,7 +3561,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" dependencies = [ "proc-macro2", - "syn 2.0.106", + "syn", +] + +[[package]] +name = "primeorder" +version = "0.13.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "353e1ca18966c16d9deb1c69278edbc5f194139612772bd9537af60ac231e1e6" +dependencies = [ + "elliptic-curve", ] [[package]] @@ -3285,7 +3608,7 @@ dependencies = [ "prost", "prost-types", "regex", - "syn 2.0.106", + "syn", "tempfile", ] @@ -3299,7 +3622,7 @@ dependencies = [ "itertools", "proc-macro2", "quote", - "syn 2.0.106", + "syn", ] [[package]] @@ -3324,7 +3647,7 @@ dependencies = [ "quinn-udp", "rustc-hash", "rustls", - "socket2 0.6.0", + "socket2 0.6.1", "thiserror 2.0.17", "tokio", "tracing", @@ -3338,7 +3661,7 @@ 
source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f1906b49b0c3bc04b5fe5d86a77925ae6524a19b816ae38ce1e426255f1d8a31" dependencies = [ "bytes", - "getrandom 0.3.3", + "getrandom 0.3.4", "lru-slab", "rand 0.9.2", "ring", @@ -3361,7 +3684,7 @@ dependencies = [ "cfg_aliases", "libc", "once_cell", - "socket2 0.6.0", + "socket2 0.6.1", "tracing", "windows-sys 0.60.2", ] @@ -3441,9 +3764,39 @@ dependencies = [ name = "rand_core" version = "0.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38" +checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38" +dependencies = [ + "getrandom 0.3.4", +] + +[[package]] +name = "redis" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47ba378d39b8053bffbfc2750220f5a24a06189b5129523d5db01618774e0239" dependencies = [ - "getrandom 0.3.3", + "ahash", + "arc-swap", + "arcstr", + "backon", + "bytes", + "cfg-if", + "combine", + "crc16", + "futures-channel", + "futures-util", + "itoa", + "log", + "percent-encoding", + "pin-project-lite", + "rand 0.9.2", + "ryu", + "sha1_smol", + "socket2 0.6.1", + "tokio", + "tokio-util", + "url", + "xxhash-rust", ] [[package]] @@ -3460,13 +3813,26 @@ dependencies = [ "nom", ] +[[package]] +name = "redis-test" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7a5cadf877f090eebfef0f4e8646c56531ab416b388410fe1c974f4e6e9cb20" +dependencies = [ + "futures", + "rand 0.9.2", + "redis", + "socket2 0.6.1", + "tempfile", +] + [[package]] name = "redox_syscall" version = "0.5.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" dependencies = [ - "bitflags 2.9.4", + "bitflags 2.10.0", ] [[package]] @@ -3486,14 +3852,14 @@ checksum = 
"b7186006dcb21920990093f30e3dea63b7d6e977bf1256be20c3563a5db070da" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn", ] [[package]] name = "regex" -version = "1.11.3" +version = "1.12.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b5288124840bee7b386bc413c487869b360b2b4ec421ea56425128692f2a82c" +checksum = "843bc0191f75f3e22651ae5f1e72939ab2f72a4bc30fa80a066bd66edefc24d4" dependencies = [ "aho-corasick", "memchr", @@ -3503,9 +3869,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.11" +version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "833eb9ce86d40ef33cb1306d8accf7bc8ec2bfea4355cbdebb3df68b40925cad" +checksum = "5276caf25ac86c8d810222b3dbb938e512c55c6831a10f3e6ed1c93b84041f1c" dependencies = [ "aho-corasick", "memchr", @@ -3514,27 +3880,30 @@ dependencies = [ [[package]] name = "regex-lite" -version = "0.1.7" +version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "943f41321c63ef1c92fd763bfe054d2668f7f225a5c29f0105903dc2fc04ba30" +checksum = "8d942b98df5e658f56f20d592c7f868833fe38115e65c33003d8cd224b0155da" [[package]] name = "regex-syntax" -version = "0.8.6" +version = "0.8.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "caf4aa5b0f434c91fe5c7f1ecb6a5ece2130b02ad2a590589dda5146df959001" +checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58" [[package]] name = "relative-path" -version = "1.9.3" +version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba39f3699c378cd8970968dcbff9c43159ea4cfbd88d43c00b22f2ef10a435d2" +checksum = "bca40a312222d8ba74837cb474edef44b37f561da5f773981007a10bbaa992b0" +dependencies = [ + "serde", +] [[package]] name = "reqwest" -version = "0.12.23" +version = "0.12.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"d429f34c8092b2d42c7c93cec323bb4adeb7c67698f70839adec842ec10c7ceb" +checksum = "9d0946410b9f7b082a427e4ef5c8ff541a88b357bc6c637c40db3a68ac70a36f" dependencies = [ "base64 0.22.1", "bytes", @@ -3563,7 +3932,7 @@ dependencies = [ "tokio", "tokio-rustls", "tokio-util", - "tower 0.5.2", + "tower", "tower-http", "tower-service", "url", @@ -3589,6 +3958,22 @@ dependencies = [ "tower-service", ] +[[package]] +name = "resolv-conf" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e061d1b48cb8d38042de4ae0a7a6401009d6143dc80d2e2d6f31f0bdd6470c7" + +[[package]] +name = "rfc6979" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8dd2a808d456c4a54e300a23e9f5a67e122c3024119acbfd73e3bf664491cb2" +dependencies = [ + "hmac", + "subtle", +] + [[package]] name = "ring" version = "0.17.14" @@ -3622,20 +4007,34 @@ dependencies = [ ] [[package]] -name = "rust_decimal" -version = "1.38.0" +name = "rsa" +version = "0.9.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8975fc98059f365204d635119cf9c5a60ae67b841ed49b5422a9a7e56cdfac0" +checksum = "b8573f03f5883dcaebdfcf4725caa1ecb9c15b2ef50c43a07b816e06799bb12d" dependencies = [ - "arrayvec", + "const-oid", + "digest", + "num-bigint-dig", + "num-integer", "num-traits", + "pkcs1", + "pkcs8", + "rand_core 0.6.4", + "signature", + "spki", + "subtle", + "zeroize", ] [[package]] -name = "rustc-demangle" -version = "0.1.26" +name = "rust_decimal" +version = "1.39.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56f7d92ca342cea22a06f2121d944b4fd82af56988c270852495420f961d4ace" +checksum = "35affe401787a9bd846712274d97654355d21b2a2c092a3139aabe31e9022282" +dependencies = [ + "arrayvec", + "num-traits", +] [[package]] name = "rustc-hash" @@ -3668,7 +4067,7 @@ version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"cd15f8a2c5551a84d56efdc1cd049089e409ac19a3072d5037a17fd70719ff3e" dependencies = [ - "bitflags 2.9.4", + "bitflags 2.10.0", "errno", "libc", "linux-raw-sys", @@ -3677,9 +4076,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.23.32" +version = "0.23.34" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd3c25631629d034ce7cd9940adc9d45762d46de2b0f57193c4443b92c6d4d40" +checksum = "6a9586e9ee2b4f8fab52a0048ca7334d7024eef48e2cb9407e3497bb7cab7fa7" dependencies = [ "log", "once_cell", @@ -3692,9 +4091,9 @@ dependencies = [ [[package]] name = "rustls-native-certs" -version = "0.8.1" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fcff2dd52b58a8d98a70243663a0d234c4e2b79235637849d15913394a247d3" +checksum = "9980d917ebb0c0536119ba501e90834767bffc3d60641457fd84a1f3fd337923" dependencies = [ "openssl-probe", "rustls-pki-types", @@ -3702,20 +4101,11 @@ dependencies = [ "security-framework", ] -[[package]] -name = "rustls-pemfile" -version = "2.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dce314e5fee3f39953d46bb63bb8a46d40c2f8fb7cc5a3b6cab2bde9721d6e50" -dependencies = [ - "rustls-pki-types", -] - [[package]] name = "rustls-pki-types" -version = "1.12.0" +version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "229a4a4c221013e7e1f1a043678c5cc39fe5171437c88fb47151a21e6f5b5c79" +checksum = "708c0f9d5f54ba0272468c1d306a52c495b31fa155e91bc25371e6df7996908c" dependencies = [ "web-time", "zeroize", @@ -3723,9 +4113,9 @@ dependencies = [ [[package]] name = "rustls-platform-verifier" -version = "0.6.1" +version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be59af91596cac372a6942530653ad0c3a246cdd491aaa9dcaee47f88d67d5a0" +checksum = "1d99feebc72bae7ab76ba994bb5e121b8d83d910ca40b36e0921f53becc41784" dependencies = [ "core-foundation", "core-foundation-sys", @@ -3739,7 +4129,7 
@@ dependencies = [ "security-framework", "security-framework-sys", "webpki-root-certs", - "windows-sys 0.59.0", + "windows-sys 0.61.2", ] [[package]] @@ -3812,16 +4202,29 @@ dependencies = [ [[package]] name = "schemars" -version = "1.0.4" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "82d20c4491bc164fa2f6c5d44565947a52ad80b9505d8e36f8d54c27c739fcd0" +checksum = "a2b42f36aa1cd011945615b92222f6bf73c599a102a300334cd7f8dbeec726cc" dependencies = [ "dyn-clone", "ref-cast", + "schemars_derive", "serde", "serde_json", ] +[[package]] +name = "schemars_derive" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d115b50f4aaeea07e79c1912f645c7513d81715d0420f8bc77a18c6260b307f" +dependencies = [ + "proc-macro2", + "quote", + "serde_derive_internals", + "syn", +] + [[package]] name = "scopeguard" version = "1.2.0" @@ -3834,13 +4237,27 @@ version = "3.0.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "490dcfcbfef26be6800d11870ff2df8774fa6e86d047e3e8c8a76b25655e41ca" +[[package]] +name = "sec1" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3e97a565f76233a6003f9f5c54be1d9c5bdfa3eccfb189469f11ec4901c47dc" +dependencies = [ + "base16ct", + "der", + "generic-array", + "pkcs8", + "subtle", + "zeroize", +] + [[package]] name = "security-framework" version = "3.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b3297343eaf830f66ede390ea39da1d462b6b0c1b000f420d0a83f898bbbe6ef" dependencies = [ - "bitflags 2.9.4", + "bitflags 2.10.0", "core-foundation", "core-foundation-sys", "libc", @@ -3906,7 +4323,18 @@ checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn", +] + +[[package]] +name = "serde_derive_internals" +version = "0.29.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "18d26a20a969b9e3fdf2fc2d9f21eda6c40e2de84c9408bb5d3b05d499aae711" +dependencies = [ + "proc-macro2", + "quote", + "syn", ] [[package]] @@ -3915,7 +4343,7 @@ version = "1.0.145" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "402a6f66d8c709116cf22f558eab210f5a50187f702eb4d7e5ef38d9a7f1c79c" dependencies = [ - "indexmap 2.11.4", + "indexmap 2.12.0", "itoa", "memchr", "ryu", @@ -3948,17 +4376,17 @@ dependencies = [ [[package]] name = "serde_with" -version = "3.15.0" +version = "3.15.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6093cd8c01b25262b84927e0f7151692158fab02d961e04c979d3903eba7ecc5" +checksum = "aa66c845eee442168b2c8134fec70ac50dc20e760769c8ba0ad1319ca1959b04" dependencies = [ "base64 0.22.1", "chrono", "hex", "indexmap 1.9.3", - "indexmap 2.11.4", + "indexmap 2.12.0", "schemars 0.9.0", - "schemars 1.0.4", + "schemars 1.2.1", "serde_core", "serde_json", "serde_with_macros", @@ -3967,14 +4395,14 @@ dependencies = [ [[package]] name = "serde_with_macros" -version = "3.15.0" +version = "3.15.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7e6c180db0816026a61afa1cff5344fb7ebded7e4d3062772179f2501481c27" +checksum = "b91a903660542fced4e99881aa481bdbaec1634568ee02e0b8bd57c64cb38955" dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.106", + "syn", ] [[package]] @@ -3998,18 +4426,7 @@ checksum = "5d69265a08751de7844521fd15003ae0a888e035773ba05695c5c759a6f89eef" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", -] - -[[package]] -name = "sha-1" -version = "0.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f5058ada175748e33390e40e872bd0fe59a19f265d0158daa551c5a88a76009c" -dependencies = [ - "cfg-if", - "cpufeatures", - "digest", + "syn", ] [[package]] @@ -4023,6 +4440,12 @@ dependencies = [ "digest", ] +[[package]] +name = "sha1_smol" +version = 
"1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbfa15b3dddfee50a0fff136974b3e1bde555604ba463834a7eb7deb6417705d" + [[package]] name = "sha2" version = "0.10.9" @@ -4064,6 +4487,16 @@ dependencies = [ "libc", ] +[[package]] +name = "signature" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77549399552de45a898a580c1b41d445bf730df867cc44e6c0233bbc4b8329de" +dependencies = [ + "digest", + "rand_core 0.6.4", +] + [[package]] name = "simd-adler32" version = "0.3.7" @@ -4106,14 +4539,20 @@ dependencies = [ [[package]] name = "socket2" -version = "0.6.0" +version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "233504af464074f9d066d7b5416c5f9b894a5862a6506e306f7b816cdd6f1807" +checksum = "17129e116933cf371d018bb80ae557e889637989d8638274fb25622827b03881" dependencies = [ "libc", - "windows-sys 0.59.0", + "windows-sys 0.60.2", ] +[[package]] +name = "spin" +version = "0.9.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" + [[package]] name = "spki" version = "0.7.3" @@ -4161,20 +4600,9 @@ checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" [[package]] name = "syn" -version = "1.0.109" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" -dependencies = [ - "proc-macro2", - "quote", - "unicode-ident", -] - -[[package]] -name = "syn" -version = "2.0.106" +version = "2.0.107" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ede7c438028d4436d71104916910f5bb611972c5cfd7f89b8300a8186e6fada6" +checksum = "2a26dbd934e5451d21ef060c018dae56fc073894c5a7896f882928a76e6d081b" dependencies = [ "proc-macro2", "quote", @@ -4198,9 +4626,15 @@ checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" 
dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn", ] +[[package]] +name = "tagptr" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b2093cf4c8eb1e67749a6762251bc9cd836b6fc171623bd0a9d324d37af2417" + [[package]] name = "take_mut" version = "0.2.2" @@ -4220,7 +4654,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2d31c77bdf42a745371d260a26ca7163f1e0924b64afa0b688e61b5a9fa02f16" dependencies = [ "fastrand", - "getrandom 0.3.3", + "getrandom 0.3.4", "once_cell", "rustix", "windows-sys 0.61.2", @@ -4252,7 +4686,7 @@ checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn", ] [[package]] @@ -4263,7 +4697,7 @@ checksum = "3ff15c8ecd7de3849db632e14d18d2571fa09dfc5ed93479bc4485c7a517c913" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn", ] [[package]] @@ -4351,33 +4785,30 @@ dependencies = [ [[package]] name = "tokio" -version = "1.47.1" +version = "1.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89e49afdadebb872d3145a5638b59eb0691ea23e46ca484037cfab3b76b95038" +checksum = "ff360e02eab121e0bc37a2d3b4d4dc622e6eda3a8e5253d5435ecf5bd4c68408" dependencies = [ - "backtrace", "bytes", - "io-uring", "libc", "mio", "parking_lot", "pin-project-lite", "signal-hook-registry", - "slab", - "socket2 0.6.0", + "socket2 0.6.1", "tokio-macros", - "windows-sys 0.59.0", + "windows-sys 0.61.2", ] [[package]] name = "tokio-macros" -version = "2.5.0" +version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8" +checksum = "af407857209536a95c8e56f8231ef2c2e2aff839b22e07a1ffcbc617e9db9fa5" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn", ] [[package]] @@ -4415,33 +4846,6 @@ dependencies = [ "tokio", ] -[[package]] -name = "tonic" -version = 
"0.12.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "877c5b330756d856ffcc4553ab34a5684481ade925ecc54bcd1bf02b1d0d4d52" -dependencies = [ - "async-trait", - "base64 0.22.1", - "bytes", - "http 1.3.1", - "http-body 1.0.1", - "http-body-util", - "hyper 1.7.0", - "hyper-timeout", - "hyper-util", - "percent-encoding", - "pin-project", - "prost", - "tokio", - "tokio-stream", - "tower 0.4.13", - "tower-layer", - "tower-service", - "tracing", - "zstd", -] - [[package]] name = "tonic" version = "0.13.1" @@ -4468,10 +4872,11 @@ dependencies = [ "tokio", "tokio-rustls", "tokio-stream", - "tower 0.5.2", + "tower", "tower-layer", "tower-service", "tracing", + "zstd", ] [[package]] @@ -4485,27 +4890,7 @@ dependencies = [ "prost-build", "prost-types", "quote", - "syn 2.0.106", -] - -[[package]] -name = "tower" -version = "0.4.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c" -dependencies = [ - "futures-core", - "futures-util", - "indexmap 1.9.3", - "pin-project", - "pin-project-lite", - "rand 0.8.5", - "slab", - "tokio", - "tokio-util", - "tower-layer", - "tower-service", - "tracing", + "syn", ] [[package]] @@ -4516,7 +4901,7 @@ checksum = "d039ad9159c98b70ecfd540b2573b97f7f52c3e8d9f8ad57a24b916a536975f9" dependencies = [ "futures-core", "futures-util", - "indexmap 2.11.4", + "indexmap 2.12.0", "pin-project-lite", "slab", "sync_wrapper", @@ -4533,14 +4918,14 @@ version = "0.6.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "adc82fd73de2a9722ac5da747f12383d2bfdb93591ee6c58486e0097890f05f2" dependencies = [ - "bitflags 2.9.4", + "bitflags 2.10.0", "bytes", "futures-util", "http 1.3.1", "http-body 1.0.1", "iri-string", "pin-project-lite", - "tower 0.5.2", + "tower", "tower-layer", "tower-service", ] @@ -4576,7 +4961,7 @@ checksum = "81383ab64e72a7a8b8e13130c49e3dab29def6d0c7d76a03087b3cf71c5c6903" dependencies = [ 
"proc-macro2", "quote", - "syn 2.0.106", + "syn", ] [[package]] @@ -4602,9 +4987,9 @@ dependencies = [ [[package]] name = "tracing-opentelemetry" -version = "0.30.0" +version = "0.31.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd8e764bd6f5813fd8bebc3117875190c5b0415be8f7f8059bffb6ecd979c444" +checksum = "ddcf5959f39507d0d04d6413119c04f33b623f4f951ebcbdddddfad2d0623a9c" dependencies = [ "js-sys", "once_cell", @@ -4666,7 +5051,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "04659ddb06c87d233c566112c1c9c5b9e98256d9af50ec3bc9c8327f873a7568" dependencies = [ "quote", - "syn 2.0.106", + "syn", ] [[package]] @@ -4692,7 +5077,7 @@ checksum = "3c36781cc0e46a83726d9879608e4cf6c2505237e263a8eb8c24502989cfdb28" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn", ] [[package]] @@ -4721,9 +5106,9 @@ checksum = "5c1cb5db39152898a79168971543b1cb5020dff7fe43c8dc468b0885f5e29df5" [[package]] name = "unicode-ident" -version = "1.0.19" +version = "1.0.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f63a545481291138910575129486daeaf8ac54aee4387fe7906919f7830c7d9d" +checksum = "462eeb75aeb73aea900253ce739c8e18a67423fadf006037cd3ff27e82748a06" [[package]] name = "unicode-normalization" @@ -4801,7 +5186,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2f87b8aa10b915a06587d0dec516c282ff295b475d94abf425d62b57710070a2" dependencies = [ "atomic", - "getrandom 0.3.3", + "getrandom 0.3.4", "js-sys", "serde", "wasm-bindgen", @@ -4850,15 +5235,6 @@ version = "0.11.1+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" -[[package]] -name = "wasi" -version = "0.14.7+wasi-0.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "883478de20367e224c0090af9cf5f9fa85bed63a95c1abf3afc5c083ebc06e8c" -dependencies = 
[ - "wasip2", -] - [[package]] name = "wasip2" version = "1.0.1+wasi-0.2.4" @@ -4891,7 +5267,7 @@ dependencies = [ "log", "proc-macro2", "quote", - "syn 2.0.106", + "syn", "wasm-bindgen-shared", ] @@ -4926,7 +5302,7 @@ checksum = "9f07d2f20d4da7b26400c9f4a0511e6e0345b040694e8a75bd41d578fa4421d7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -5000,6 +5376,12 @@ dependencies = [ "rustls-pki-types", ] +[[package]] +name = "widestring" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72069c3113ab32ab29e5584db3c6ec55d416895e60715417b5b883a357c3e471" + [[package]] name = "winapi-util" version = "0.1.11" @@ -5030,7 +5412,7 @@ checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn", ] [[package]] @@ -5041,7 +5423,7 @@ checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn", ] [[package]] @@ -5077,6 +5459,15 @@ dependencies = [ "windows-targets 0.42.2", ] +[[package]] +name = "windows-sys" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets 0.48.5", +] + [[package]] name = "windows-sys" version = "0.52.0" @@ -5128,6 +5519,21 @@ dependencies = [ "windows_x86_64_msvc 0.42.2", ] +[[package]] +name = "windows-targets" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" +dependencies = [ + "windows_aarch64_gnullvm 0.48.5", + "windows_aarch64_msvc 0.48.5", + "windows_i686_gnu 0.48.5", + "windows_i686_msvc 0.48.5", + "windows_x86_64_gnu 0.48.5", + "windows_x86_64_gnullvm 0.48.5", + "windows_x86_64_msvc 0.48.5", +] + [[package]] 
name = "windows-targets" version = "0.52.6" @@ -5167,6 +5573,12 @@ version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8" +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" + [[package]] name = "windows_aarch64_gnullvm" version = "0.52.6" @@ -5185,6 +5597,12 @@ version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43" +[[package]] +name = "windows_aarch64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" + [[package]] name = "windows_aarch64_msvc" version = "0.52.6" @@ -5203,6 +5621,12 @@ version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f" +[[package]] +name = "windows_i686_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" + [[package]] name = "windows_i686_gnu" version = "0.52.6" @@ -5233,6 +5657,12 @@ version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060" +[[package]] +name = "windows_i686_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" + [[package]] name = "windows_i686_msvc" version = "0.52.6" @@ -5251,6 +5681,12 @@ version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36" +[[package]] +name = "windows_x86_64_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" + [[package]] name = "windows_x86_64_gnu" version = "0.52.6" @@ -5269,6 +5705,12 @@ version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" + [[package]] name = "windows_x86_64_gnullvm" version = "0.52.6" @@ -5287,6 +5729,12 @@ version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0" +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" + [[package]] name = "windows_x86_64_msvc" version = "0.52.6" @@ -5299,6 +5747,16 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" +[[package]] +name = "winreg" +version = "0.50.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "524e57b2c537c0f9b1e69f1965311ec12182b4122e45035b1508cd24d2adadb1" +dependencies = [ + "cfg-if", + "windows-sys 0.48.0", +] + [[package]] name = "wit-bindgen" version = "0.46.0" @@ -5326,6 +5784,12 @@ version = "0.13.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "66fee0b777b0f5ac1c69bb06d361268faafa61cd4682ae064a171c16c433e9e4" +[[package]] +name = "xxhash-rust" +version = "0.8.15" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdd20c5420375476fbd4394763288da7eb0cc0b8c11deed431a91562af7335d3" + [[package]] name = "yansi" version = "1.0.1" @@ -5352,7 +5816,7 @@ checksum = "38da3c9736e16c5d3c8c597a9aaa5d1fa565d0532ae05e27c24aa62fb32c0ab6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn", "synstructure", ] @@ -5373,7 +5837,7 @@ checksum = "88d2b8d9c68ad2b9e4340d7832716a4d21a22a1154777ad56ea55c51a9cf3831" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn", ] [[package]] @@ -5393,7 +5857,7 @@ checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn", "synstructure", ] @@ -5433,7 +5897,7 @@ checksum = "5b96237efa0c878c64bd89c436f661be4e46b2f3eff1ebb976f7ef2321d2f58f" dependencies = [ "proc-macro2", "quote", - "syn 2.0.106", + "syn", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 5bbd43461..2e7355eba 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,3 +1,4 @@ +#:schema tools/cargo-with-detailed-deps.json [workspace] exclude = [ "nativelink-config/generate-stores-config", @@ -9,7 +10,7 @@ resolver = "2" edition = "2024" name = "nativelink" rust-version = "1.87.0" -version = "0.7.3" +version = "1.0.0-rc4" [profile.release] lto = true @@ -28,17 +29,10 @@ name = "nativelink" [features] nix = ["nativelink-worker/nix"] -# Enable this to get extra debug about workers that are not being used by the CAS -# for some reason. We don't enable this by default, as it's part of a hot path in -# the scheduling system, and also that a worker not matching isn't necessarily bad. 
-worker_find_logging = [ - "nativelink-scheduler/worker_find_logging", - "nativelink-util/worker_find_logging", -] - [dependencies] nativelink-config = { path = "nativelink-config" } nativelink-error = { path = "nativelink-error" } +nativelink-proto = { path = "nativelink-proto" } nativelink-scheduler = { path = "nativelink-scheduler" } nativelink-service = { path = "nativelink-service" } nativelink-store = { path = "nativelink-store" } @@ -47,14 +41,30 @@ nativelink-worker = { path = "nativelink-worker" } async-lock = { version = "3.4.0", features = ["std"], default-features = false } axum = { version = "0.8.3", default-features = false } -clap = { version = "4.5.35", features = ["derive"] } +bytes = { version = "1.10.1", default-features = false } +clap = { version = "4.5.35", features = [ + "color", + "derive", + "error-context", + "help", + "std", + "suggestions", + "usage", +], default-features = false } futures = { version = "0.3.31", default-features = false } -hyper = "1.6.0" -hyper-util = "0.1.11" -mimalloc = "0.1.44" -rustls-pemfile = { version = "2.2.0", features = [ +hex = { version = "0.4.3", default-features = false } +hyper = { version = "1.6.0", default-features = false } +hyper-util = { version = "0.1.11", default-features = false, features = [ + "tracing", +] } +mimalloc = { version = "0.1.44", default-features = false } +rand = { version = "0.9.0", default-features = false, features = [ + "thread_rng", +] } +rustls-pki-types = { version = "1.13.1", features = [ "std", ], default-features = false } +sha2 = { version = "0.10.8", default-features = false } tokio = { version = "1.44.1", features = [ "fs", "io-util", @@ -107,9 +117,11 @@ overlapping-range-endpoints = "deny" suspicious-double-ref-op = "deny" unconditional-recursion = "deny" unexpected-cfgs = "deny" +unknown-lints = "deny" unnameable-test-items = "deny" unsafe-op-in-unsafe-fn = "deny" unstable-syntax-pre-expansion = "deny" +unused-imports = "deny" keyword-idents = "warn" let-underscore 
= "warn" @@ -139,10 +151,30 @@ pedantic = { level = "warn", priority = -1 } # Restriction Denies with default priority alloc-instead-of-core = "deny" as-underscore = "deny" +await-holding-lock = "deny" +bind-instead-of-map = "deny" +collapsible-if = "deny" +disallowed-methods = "deny" +doc-markdown = "deny" elidable-lifetime-names = "deny" +explicit-into-iter-loop = "deny" +future-not-send = "deny" +implicit-clone = "deny" +implicit-hasher = "deny" +manual-is-variant-and = "deny" +map-unwrap-or = "deny" +or-fun-call = "deny" +ptr-arg = "deny" +redundant-closure-for-method-calls = "deny" semicolon-if-nothing-returned = "deny" +single-char-pattern = "deny" std-instead-of-core = "deny" +string-lit-as-bytes = "deny" todo = "deny" +unchecked-duration-subtraction = "deny" +unnecessary-semicolon = "deny" +used-underscore-binding = "deny" +useless-format = "deny" # Restriction Warnings with default priority dbg-macro = "warn" @@ -168,7 +200,7 @@ too-long-first-doc-paragraph = { level = "allow" } # TODO(jhpratt) uninhabited-references = { level = "allow", priority = 1 } # rust-lang/rust-clippy#11984 # TODO(palfrey): Remove these to get to pedantic. 
-cast_possible_truncation = { level = "allow", priority = 1 } +cast_possible_truncation = { level = "deny", priority = 1 } cast_possible_wrap = { level = "allow", priority = 1 } cast_precision_loss = { level = "allow", priority = 1 } cast_sign_loss = { level = "allow", priority = 1 } diff --git a/MODULE.bazel b/MODULE.bazel index 78427f9b8..320e0dd56 100644 --- a/MODULE.bazel +++ b/MODULE.bazel @@ -1,11 +1,11 @@ module( name = "nativelink", - version = "0.7.3", + version = "1.0.0-rc4", compatibility_level = 0, ) -bazel_dep(name = "rules_cc", version = "0.1.1") -bazel_dep(name = "platforms", version = "0.0.11") +bazel_dep(name = "rules_cc", version = "0.1.5") +bazel_dep(name = "platforms", version = "1.0.0") bazel_dep(name = "bazel_skylib", version = "1.7.1") bazel_dep(name = "rules_python", version = "1.3.0") # TODO(palfrey): Bump. bazel_dep(name = "rules_shell", version = "0.4.1") @@ -41,6 +41,7 @@ crate.from_cargo( "//nativelink-metric/nativelink-metric-macro-derive:Cargo.toml", "//nativelink-proto:Cargo.toml", "//nativelink-scheduler:Cargo.toml", + "//nativelink-redis-tester:Cargo.toml", "//nativelink-service:Cargo.toml", "//nativelink-store:Cargo.toml", "//nativelink-util:Cargo.toml", @@ -78,6 +79,8 @@ rust.toolchain( rust_analyzer_version = "nightly/2025-05-21", rustfmt_version = "nightly/2025-05-21", sha256s = { + # Update the shas with update-module-hashes + # BEGIN SHAS "2025-05-21/cargo-nightly-x86_64-unknown-linux-gnu.tar.xz": "e866f249dfbdf10a68b7191c025257591e8a5aa2fede1663b34c88a4f4bb8a74", "2025-05-21/clippy-nightly-x86_64-unknown-linux-gnu.tar.xz": "0a312d722a94e3b9e1f7871d9a9af01d410917c2406dbf91d014c06fe79540fb", "2025-05-21/llvm-tools-nightly-x86_64-unknown-linux-gnu.tar.xz": "eee28e99ac24c27f3de969915e808c0645ee099b136e5547681110607d09d050", @@ -96,6 +99,7 @@ rust.toolchain( "rust-std-1.87.0-x86_64-unknown-linux-gnu.tar.xz": "1b57253bd32b8b292c965b3a2d992a266763158494cab8555584c09360b90f77", "rustc-1.87.0-aarch64-apple-darwin.tar.xz": 
"175800bc89cccd8f8ee2f3a4d07bdf98c163030fd5d3dc6d5b23cf4dd0a2a4c3", "rustc-1.87.0-x86_64-unknown-linux-gnu.tar.xz": "e8395c5c5756253b76107055e093ffbc4431af7b30aeebe72ce2684b9cb53973", + # END SHAS }, versions = [ "1.87.0", diff --git a/README.md b/README.md index 79c54ac38..94e7a70bc 100644 --- a/README.md +++ b/README.md @@ -57,9 +57,7 @@ NativeLink seamlessly integrates with build tools that use the Remote Execution ## 🚀 Quickstart -To start, you can deploy NativeLink as a Docker image (as shown below) or by using our cloud-hosted solution, [NativeLink Cloud](https://app.nativelink.com). It's **FREE** for individuals, open-source projects, and cloud production environments, with support for unlimited team members. - -The setups below are **production-grade** installations. See the [contribution docs](https://nativelink.com/docs/contribute/nix/) for instructions on how to build from source with [Bazel](https://nativelink.com/docs/contribute/bazel/), [Cargo](https://nativelink.com/docs/contribute/cargo/), and [Nix](https://nativelink.com/docs/contribute/nix/). +To start, you can deploy NativeLink as a Docker image (as shown below). The setups below are **production-grade** installations. See the [contribution docs](https://nativelink.com/docs/contribute/nix/) for instructions on how to build from source with [Bazel](https://nativelink.com/docs/contribute/bazel/), [Cargo](https://nativelink.com/docs/contribute/cargo/), and [Nix](https://nativelink.com/docs/contribute/nix/). You can find a few example deployments in the [Docs](https://nativelink.com/docs/deployment-examples/kubernetes). @@ -74,14 +72,14 @@ for how to build the images yourself. 
```bash curl -O \ - https://raw.githubusercontent.com/TraceMachina/nativelink/v0.7.0/nativelink-config/examples/basic_cas.json5 + https://raw.githubusercontent.com/TraceMachina/nativelink/v0.7.5/nativelink-config/examples/basic_cas.json5 # See https://github.com/TraceMachina/nativelink/pkgs/container/nativelink # to find the latest tag docker run \ -v $(pwd)/basic_cas.json5:/config \ -p 50051:50051 \ - ghcr.io/tracemachina/nativelink:v0.7.0 \ + ghcr.io/tracemachina/nativelink:v0.7.5 \ config ``` @@ -90,7 +88,7 @@ docker run \ ```powershell # Download the configuration file Invoke-WebRequest ` - -Uri "https://raw.githubusercontent.com/TraceMachina/nativelink/v0.7.0/nativelink-config/examples/basic_cas.json5" ` + -Uri "https://raw.githubusercontent.com/TraceMachina/nativelink/v0.7.5/nativelink-config/examples/basic_cas.json5" ` -OutFile "basic_cas.json5" # Run the Docker container @@ -98,7 +96,7 @@ Invoke-WebRequest ` docker run ` -v ${PWD}/basic_cas.json5:/config ` -p 50051:50051 ` - ghcr.io/tracemachina/nativelink:v0.7.0 ` + ghcr.io/tracemachina/nativelink:v0.7.5 ` config ``` diff --git a/cliff.toml b/cliff.toml index 1688c2b2d..803d85656 100644 --- a/cliff.toml +++ b/cliff.toml @@ -85,11 +85,14 @@ commit_parsers = [ { message = "GrpcStore now sends digest function from context", group = "🐛 Bug Fixes" }, { message = "Implement .* feature", group = "⛰️ Features" }, { message = "Implement `ClientStateManager` for `SimpleScheduler`", group = "⚙️ Miscellaneous" }, + { message = "Log failures to update actions", group = "⛰️ Features" }, + { message = "Log on command complete", group = "⛰️ Features" }, { message = "Make the error on a size field clearer", group = "🐛 Bug Fixes" }, { message = "Migrate to callPackage syntax", group = "⚙️ Miscellaneous" }, { message = "Move Bytestream to array config", group = "⛰️ Features" }, { message = "Move `update_action_with_internal_error` into `StateManager`", group = "⚙️ Miscellaneous" }, { message = "Prevent UUID collision", group 
= "🐛 Bug Fixes" }, + { message = "Redo worker_find_logging as config", group = "⛰️ Features" }, { message = "Remove nativelink-proto as build dependency", group = "🧪 Testing & CI" }, { message = "Retry GrpcStore get_part_ref", group = "⛰️ Features" }, { message = "Shard store weight scale distribution", group = "⛰️ Features" }, @@ -131,6 +134,8 @@ commit_parsers = [ { message = "Handle", group = "🐛 Bug Fixes" }, { message = "Resolve", group = "🐛 Bug Fixes" }, + { message = "Merge branch", skip = true }, + { message = "Prepare.+release", skip = true }, { message = "Release", skip = true }, # Catch-all in miscellaneous diff --git a/deployment-examples/docker-compose/Dockerfile b/deployment-examples/docker-compose/Dockerfile index 1bb5a4391..c8c7b6446 100644 --- a/deployment-examples/docker-compose/Dockerfile +++ b/deployment-examples/docker-compose/Dockerfile @@ -14,7 +14,7 @@ # Current supported Ubuntu version, Noble Numbat aka 24.04 LTS # Locked down to a specific revision to avoid issues with package versions -ARG OS_VERSION=noble-20250925 +ARG OS_VERSION=noble-20250925@sha256:728785b59223d755e3e5c5af178fab1be7031f3522c5ccd7a0b32b80d8248123 # `--compilation_mode` to pass into bazel (eg: opt, dbg, fastbuild). ARG OPT_LEVEL=opt # Additional bazel flags. 
@@ -32,7 +32,7 @@ RUN apt-get update \ git=1:2.43.0-1ubuntu7.3 \ gcc=4:13.2.0-7ubuntu1 \ g++=4:13.2.0-7ubuntu1 \ - python3=3.12.3-0ubuntu2 \ + python3=3.12.3-0ubuntu2.1 \ ca-certificates=20240203 \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* \ @@ -59,7 +59,7 @@ COPY --from=builder /root/nativelink-bin /usr/local/bin/nativelink ARG ADDITIONAL_SETUP_WORKER_CMD RUN apt-get update \ - && apt-get install -y --no-install-recommends curl=8.5.0-2ubuntu10.6 \ + && apt-get install -y --no-install-recommends curl=8.5.0-2ubuntu10.8 \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* \ && bash -ueo pipefail -c "${ADDITIONAL_SETUP_WORKER_CMD}" \ diff --git a/deployment-examples/docker-compose/worker.json5 b/deployment-examples/docker-compose/worker.json5 index 1198cde34..fd2aac594 100644 --- a/deployment-examples/docker-compose/worker.json5 +++ b/deployment-examples/docker-compose/worker.json5 @@ -41,6 +41,7 @@ }, }, }, + fast_direction: "get", slow: { ref_store: { name: "GRPC_LOCAL_STORE", diff --git a/deployment-examples/metrics/README.md b/deployment-examples/metrics/README.md new file mode 100644 index 000000000..cf6794ddd --- /dev/null +++ b/deployment-examples/metrics/README.md @@ -0,0 +1,428 @@ +# NativeLink Metrics with OpenTelemetry + +This directory contains configurations and examples for collecting, processing, and visualizing NativeLink metrics using OpenTelemetry (OTEL) and various server systems. + +## Overview + +NativeLink exposes comprehensive metrics about cache operations and remote execution through OpenTelemetry. These metrics provide insights into: + +- **Cache Performance**: Hit rates, operation latencies, eviction rates +- **Execution Pipeline**: Queue times, stage durations, success rates +- **System Health**: Worker utilization, throughput, error rates + +## Quick Start + +### Using Docker Compose (Recommended for Development) + +1. Start the metrics stack: +```bash +cd deployment-examples/metrics +docker-compose up -d +``` + +2. 
Configure NativeLink to send metrics to the collector: +```bash +export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317 +export OTEL_EXPORTER_OTLP_PROTOCOL=grpc +export OTEL_SERVICE_NAME=nativelink +export OTEL_RESOURCE_ATTRIBUTES="deployment.environment=dev,nativelink.instance_name=main" +``` + +3. Start NativeLink with your configuration: +```bash +nativelink /path/to/config.json +``` + +4. Access the metrics: +- Prometheus UI: http://localhost:9090 +- Grafana: http://localhost:3000 (if included) +- OTEL Collector metrics: http://localhost:8888/metrics + +### Using Kubernetes + +1. Deploy the OTEL Collector: +```bash +kubectl apply -f kubernetes/otel-collector.yaml +``` + +2. Deploy Prometheus with OTLP receiver enabled: +```bash +kubectl apply -f kubernetes/prometheus.yaml +``` + +3. Configure NativeLink deployment to send metrics: +```yaml +env: + - name: OTEL_EXPORTER_OTLP_ENDPOINT + value: "http://otel-collector:4317" + - name: OTEL_EXPORTER_OTLP_PROTOCOL + value: "grpc" + - name: OTEL_RESOURCE_ATTRIBUTES + value: "deployment.environment=prod,k8s.cluster.name=main" +``` + +## Metrics Catalog + +### Cache Metrics + +| Metric | Type | Description | Labels | +|--------|------|-------------|--------| +| `nativelink_cache_operations_total` | Counter | Total cache operations | `cache_type`, `cache_operation_name`, `cache_operation_result` | +| `nativelink_cache_operation_duration` | Histogram | Operation latency in milliseconds | `cache_type`, `cache_operation_name` | +| `nativelink_cache_io_total` | Counter | Bytes read/written | `cache_type`, `cache_operation_name` | +| `nativelink_cache_size` | Gauge | Current cache size in bytes | `cache_type` | +| `nativelink_cache_entries` | Gauge | Number of cached entries | `cache_type` | +| `nativelink_cache_item_size` | Histogram | Size distribution of cache entries | `cache_type` | + +**Cache Operation Names:** +- `read`: Data retrieval operations +- `write`: Data storage operations +- `delete`: Explicit removal 
operations +- `evict`: Automatic evictions (LRU, TTL) + +**Cache Operation Results:** +- `hit`: Data found and valid (reads) +- `miss`: Data not found (reads) +- `expired`: Data found but stale (reads) +- `success`: Operation completed (writes/deletes) +- `error`: Operation failed + +### Execution Metrics + +| Metric | Type | Description | Labels | +|--------|------|-------------|--------| +| `nativelink_execution_stage_duration` | Histogram | Time spent in each execution stage | `execution_stage` | +| `nativelink_execution_total_duration` | Histogram | Total execution time from submission to completion | `execution_instance` | +| `nativelink_execution_queue_time` | Histogram | Time spent waiting in queue | `execution_priority` | +| `nativelink_execution_active_count` | Gauge | Current actions in each stage | `execution_stage` | +| `nativelink_execution_completed_count_total` | Counter | Completed executions | `execution_result`, `execution_action_digest` | +| `nativelink_execution_stage_transitions_total` | Counter | Stage transition events | `execution_instance`, `execution_priority` | +| `nativelink_execution_output_size` | Histogram | Size of execution outputs | - | +| `nativelink_execution_retry_count_total` | Counter | Number of retries | - | + +**Execution Stages:** +- `unknown`: Initial state +- `cache_check`: Checking for cached results +- `queued`: Waiting for available worker +- `executing`: Running on worker +- `completed`: Finished execution + +**Execution Results:** +- `success`: Completed with exit code 0 +- `failure`: Completed with non-zero exit code +- `cancelled`: Execution was cancelled +- `timeout`: Execution timed out +- `cache_hit`: Result found in cache + +> **Note on Counter Names in Prometheus:** Counter metrics are exposed with a `_total` suffix +> (for example, `nativelink_execution_completed_count_total`). The Docker Compose quickstart, +> recording rules, and included dashboards assume `_total` counter names. 
+ +## Configuration + +### Environment Variables + +NativeLink uses standard OpenTelemetry environment variables: + +```bash +# OTLP Exporter Configuration +OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317 # Collector endpoint +OTEL_EXPORTER_OTLP_PROTOCOL=grpc # Protocol (grpc or http/protobuf) +OTEL_EXPORTER_OTLP_HEADERS="Authorization=Bearer token" # Optional auth headers +OTEL_EXPORTER_OTLP_COMPRESSION=gzip # Compression (none, gzip) + +# Resource Attributes +OTEL_SERVICE_NAME=nativelink # Service name (fixed) +OTEL_RESOURCE_ATTRIBUTES="key1=value1,key2=value2" # Custom attributes + +# Metric Export Configuration +OTEL_METRIC_EXPORT_INTERVAL=60000 # Export interval in ms (default: 60s) +OTEL_METRIC_EXPORT_TIMEOUT=30000 # Export timeout in ms (default: 30s) + +# Disable telemetry types +OTEL_TRACES_EXPORTER=none # Disable traces (if only metrics needed) +OTEL_LOGS_EXPORTER=none # Disable logs (if only metrics needed) +``` + +### Collector Configuration + +The OTEL Collector can be configured to: +1. Add resource attributes +2. Batch metrics for efficiency +3. Export to multiple metrics servers +4. Transform metric attributes + +See `otel-collector-config.yaml` for a complete example. + +## Server Options + +### Prometheus (Recommended) + +Prometheus offers native OTLP support and excellent query capabilities. 
+ +**Direct OTLP Ingestion:** +```bash +prometheus --web.enable-otlp-receiver \ + --storage.tsdb.out-of-order-time-window=30m +``` + +**Via Collector Scraping:** +```yaml +scrape_configs: + - job_name: 'otel-collector' + static_configs: + - targets: ['otel-collector:9090'] +``` + +### Grafana Cloud + +For managed metrics: +```yaml +exporters: + otlphttp: + endpoint: https://otlp-gateway-prod-us-central-0.grafana.net/otlp + headers: + Authorization: "Bearer ${GRAFANA_CLOUD_TOKEN}" +``` + +### ClickHouse + +For high-volume metrics storage: +```yaml +exporters: + clickhouse: + endpoint: tcp://clickhouse:9000 + database: metrics + ttl_days: 30 + logs_table: otel_logs + metrics_table: otel_metrics +``` + +### Quickwit + +For unified logs and metrics: +```yaml +exporters: + otlp: + endpoint: quickwit:7281 + headers: + "x-quickwit-index": "nativelink-metrics" +``` + +## Example Queries + +### Prometheus/PromQL + +**Cache hit rate:** +```promql +sum(rate(nativelink_cache_operations_total{cache_operation_result="hit"}[5m])) by (cache_type) / +sum(rate(nativelink_cache_operations_total{cache_operation_name="read"}[5m])) by (cache_type) +``` + +**Execution success rate:** +```promql +sum(rate(nativelink_execution_completed_count_total{execution_result="success"}[5m])) / +sum(rate(nativelink_execution_completed_count_total[5m])) +``` + +**Queue depth by priority:** +```promql +sum(nativelink_execution_active_count{execution_stage="queued"}) by (execution_priority) +``` + +**P95 cache operation latency:** +```promql +histogram_quantile(0.95, + sum(rate(nativelink_cache_operation_duration_bucket[5m])) by (le, cache_type) +) +``` + +**Worker utilization:** +```promql +count(nativelink_execution_active_count{execution_stage="executing"} > 0) / +count(count by (execution_worker_id) (nativelink_execution_active_count)) +``` + +### Joining with Resource Attributes + +Use `target_info` to join resource attributes: +```promql +rate(nativelink_execution_completed_count_total[5m]) +* on 
(job, instance) group_left (k8s_cluster_name, deployment_environment) +target_info +``` + +## Dashboards + +### Grafana Dashboard + +Import the included dashboard for a comprehensive view: +```bash +# Import via API +curl -X POST http://admin:admin@localhost:3000/api/dashboards/db \ + -H "Content-Type: application/json" \ + -d @grafana-dashboard.json + +# Or import via UI at http://localhost:3000 +``` + +Key panels include: +- Execution pipeline overview +- Cache performance metrics +- Worker utilization heatmap +- Error rate tracking +- Queue depth over time +- Stage duration percentiles + +## Alerting + +### Example Alert Rules + +```yaml +groups: + - name: nativelink_alerts + rules: + - alert: HighErrorRate + expr: | + (1 - ( + sum(rate(nativelink_execution_completed_count_total{execution_result="success"}[5m])) / + sum(rate(nativelink_execution_completed_count_total[5m])) + )) > 0.05 + for: 5m + labels: + severity: warning + annotations: + summary: "High execution error rate ({{ $value | humanizePercentage }})" + + - alert: CacheMissRateHigh + expr: | + (1 - nativelink:cache_hit_rate) > 0.5 + for: 10m + labels: + severity: info + annotations: + summary: "Cache miss rate above 50% for {{ $labels.cache_type }}" + + - alert: QueueBacklog + expr: | + sum(nativelink_execution_active_count{execution_stage="queued"}) > 100 + for: 15m + labels: + severity: warning + annotations: + summary: "Queue backlog above 100 actions" + + - alert: WorkerUtilizationLow + expr: | + nativelink:worker_utilization < 0.3 + for: 30m + labels: + severity: info + annotations: + summary: "Worker utilization below 30%" +``` + +## Troubleshooting + +### No Metrics Appearing + +1. Check NativeLink is configured with OTEL environment variables: +```bash +ps aux | grep nativelink | grep OTEL +``` + +2. Verify collector is receiving data: +```bash +curl http://localhost:13133/health +curl http://localhost:8888/metrics | grep otelcol_receiver_accepted_metric_points +``` + +3. 
Check collector logs: +```bash +docker logs otel-collector +# or +kubectl logs -l app=otel-collector +``` + +### Cache Metrics Missing + +If you see `nativelink_execution_*` metrics but no `nativelink_cache_*` metrics, your NativeLink build may not be emitting store-level cache operation metrics yet. In that case, cache recording rules like `nativelink:cache_hit_rate` won't produce any series. + +### High Memory Usage + +1. Adjust collector batch size: +```yaml +processors: + batch: + send_batch_size: 512 # Reduce from 1024 +``` + +2. Increase memory limits: +```yaml +memory_limiter: + limit_mib: 1024 # Increase from 512 +``` + +3. Reduce metric cardinality by dropping labels: +```yaml +processors: + attributes: + actions: + - key: unnecessary_label + action: delete +``` + +### Out-of-Order Samples + +Enable out-of-order ingestion in Prometheus: +```yaml +storage: + tsdb: + out_of_order_time_window: 1h # Increase from 30m +``` + +### Missing Resource Attributes + +Ensure attributes are promoted in Prometheus: +```yaml +otlp: + promote_resource_attributes: + - your.custom.attribute +``` + +## Performance Tuning + +### Collector Optimization + +1. **Batching**: Adjust batch processor settings based on volume +2. **Compression**: Enable gzip for network efficiency +3. **Sampling**: Use tail sampling for high-volume traces +4. **Filtering**: Drop unnecessary metrics at collector level + +### Prometheus Optimization + +1. **Recording Rules**: Pre-calculate expensive queries +2. **Retention**: Set appropriate retention periods +3. **Downsampling**: Use Thanos or Cortex for long-term storage +4. **Federation**: Split metrics across multiple Prometheus instances + +### NativeLink Optimization + +1. **Export Interval**: Increase `OTEL_METRIC_EXPORT_INTERVAL` to reduce overhead +2. **Resource Attributes**: Minimize cardinality of custom attributes +3. 
**Metric Selection**: Disable unused metric types if needed + +## Additional Resources + +- [OpenTelemetry Documentation](https://opentelemetry.io/docs/) +- [Prometheus Best Practices](https://prometheus.io/docs/practices/) +- [OTEL Collector Configuration](https://opentelemetry.io/docs/collector/configuration/) +- [NativeLink Documentation](https://nativelink.com/docs) +- [Grafana Dashboard Examples](https://grafana.com/grafana/dashboards/) + +## Support + +For issues or questions: +- File an issue: https://github.com/TraceMachina/nativelink/issues +- Join our Discord: https://discord.gg/nativelink +- Check documentation: https://nativelink.com/docs diff --git a/deployment-examples/metrics/alertmanager-config.yml b/deployment-examples/metrics/alertmanager-config.yml new file mode 100644 index 000000000..ebd17e97a --- /dev/null +++ b/deployment-examples/metrics/alertmanager-config.yml @@ -0,0 +1,78 @@ +# Alertmanager configuration for NativeLink metrics +global: + # The smarthost and SMTP sender used for mail notifications. + # smtp_smarthost: 'localhost:25' + # smtp_from: 'alertmanager@example.org' + # smtp_auth_username: 'alertmanager' + # smtp_auth_password: 'password' + + # The default SMTP From header field. + resolve_timeout: 5m + +# The root route on which each incoming alert enters. +route: + # The root route must not have any matchers as it is the entry point for + # all alerts. It needs to have a receiver configured. + receiver: 'default-receiver' + + # The labels by which incoming alerts are grouped together. + group_by: ['alertname', 'service', 'severity'] + + # When a new group of alerts is created by an incoming alert, wait at + # least 'group_wait' to send the initial notification. + group_wait: 30s + + # When the first notification was sent, wait 'group_interval' to send a batch + # of new alerts that started firing for that group. + group_interval: 5m + + # If an alert has successfully been sent, wait 'repeat_interval' to + # resend them. 
+ repeat_interval: 4h + + # Child routes for specific alert routing + routes: + # Critical alerts - immediate notification + - match: + severity: critical + receiver: 'critical-receiver' + group_wait: 10s + repeat_interval: 1h + + # Warning alerts + - match: + severity: warning + receiver: 'warning-receiver' + group_wait: 1m + repeat_interval: 4h + +# Inhibition rules allow to mute a set of alerts given that another alert is firing. +inhibit_rules: + # Inhibit warning alerts when critical alert for the same service is firing + - source_match: + severity: 'critical' + target_match: + severity: 'warning' + equal: ['alertname', 'service'] + +# Receivers define notification integrations +receivers: + # Default receiver - logs to stdout (useful for development) + - name: 'default-receiver' + # No configuration means alerts are silently acknowledged + # Add webhook, email, or other integrations as needed + + # Critical alerts receiver + - name: 'critical-receiver' + # Example webhook configuration (uncomment and configure as needed): + # webhook_configs: + # - url: 'http://your-webhook-endpoint/alerts' + # send_resolved: true + + # Warning alerts receiver + - name: 'warning-receiver' + # Configure as needed for your environment + +# Templates for notification formatting (optional) +# templates: +# - '/etc/alertmanager/templates/*.tmpl' diff --git a/deployment-examples/metrics/cache-metrics-wrapper-store.md b/deployment-examples/metrics/cache-metrics-wrapper-store.md new file mode 100644 index 000000000..f40186287 --- /dev/null +++ b/deployment-examples/metrics/cache-metrics-wrapper-store.md @@ -0,0 +1,145 @@ +# Store-Level Cache Metrics via a Wrapper `StoreDriver` + +## Goal + +Expose consistent, low-cardinality cache metrics (CAS/AC/store backends) without needing to implement bespoke instrumentation inside every individual store implementation. 
+ +This document focuses on a **wrapper store** (middleware) approach that can be applied to any `StoreDriver`, and compares it with **instrumenting inside each store**. + +## Problem Statement + +Users expect Prometheus/Grafana to show cache stats such as: +- Cache operation counts (`read`/`write`/`delete`/`evict`) +- Hit/miss rate for reads +- Latency distributions +- Bytes read/written throughput + +These should be queryable and composable with low cognitive overhead and consistent labels. + +## Two Approaches + +### A) Wrapper Store (middleware) + +Wrap an existing `Arc` with a new `StoreDriver` that: +1. Starts a timer +2. Calls the inner store method +3. Classifies the outcome (hit/miss/error/etc) +4. Records OpenTelemetry metrics + +This produces uniform metrics across all stores (filesystem, memory, Redis, S3, gRPC, for example) with one implementation. + +### B) Instrument Inside Each Store + +Add metrics to each store implementation directly (for example, `FilesystemStore`, `S3Store`, `GrpcStore`, `FastSlowStore`, `CompletenessCheckingStore`, …), recording the same metric family from each. + +This provides deeper store-specific insight but requires repeated work and continued maintenance as stores evolve. + +## Pros / Cons + +### Wrapper Store + +**Pros** +- **Broad coverage fast**: one implementation applies everywhere. +- **Consistent semantics**: identical label keys and values across all stores. +- **Lower ongoing maintenance**: new stores automatically get metrics. +- **Configurable**: can be enabled per “logical cache” (CAS/AC) and/or store name. + +**Cons** +- **Double-counting risk**: composite stores (`FastSlowStore`, `DedupStore`, `CompressionStore`, etc.) may call inner stores; wrapping both outer + inner can over-count. +- **Limited store insight**: a wrapper sees "a read happened," but may not know if it was served from fast vs slow tier unless you wrap at that level intentionally. 
+- **Imperfect hit classification**: for some methods, "hit" vs "miss" is best inferred from result codes (for example, `NotFound`), which may not map perfectly for all stores/operations. +- **Overhead per call**: extra timing + metric recording. Usually small, but measurable at very high QPS. + +### Instrumenting Each Store + +**Pros** +- **Max fidelity**: store can record store-specific outcomes (for example, S3 HEAD vs GET latency, Redis pipeline stats, filesystem rename failures). +- **Better attribution**: `FastSlowStore` can record whether fast or slow tier served the data. +- **Easier to avoid double counting** because each store "knows" whether it's a leaf or a wrapper. + +**Cons** +- **High implementation cost** across many stores. +- **Inconsistent semantics risk** (different developers interpret “hit/miss” differently over time). +- **Harder to keep dashboards/rules stable** when metrics differ across stores. + +## Wrapper Store Design Details + +### Metric Families (Prometheus-facing names) + +Assuming OpenTelemetry metric names like: +- `cache.operations` (counter) +- `cache.operation.duration` (histogram) +- `cache.io` (counter) + +Prometheus/OpenMetrics typically exposes: +- `nativelink_cache_operations_total` +- `nativelink_cache_operation_duration_bucket` / `_sum` / `_count` +- `nativelink_cache_io_total` + +Recording rules can derive: +- `nativelink:cache_hit_rate` +- `nativelink:cache_read_throughput_bytes` +- `nativelink:cache_operation_latency_p95`, etc. 
+ +### Labels (low-cardinality) + +Recommended label keys (Prometheus form): +- `cache_type`: `cas`, `ac`, `memory`, `filesystem`, … +- `cache_operation_name`: `read`, `write`, `delete`, `evict` +- `cache_operation_result`: `hit`, `miss`, `expired`, `success`, `error` +- `instance_name`: provided by the OTEL collector transform in `deployment-examples/metrics/otel-collector-config.yaml` + +### Where to Wrap (avoid double counting) + +You must decide whether metrics represent: + +1) **User-visible cache behavior** (recommended default) + - Wrap only the **stores exposed to services** (for example, CAS service store, AC service store). + - Do **not** wrap inner leaf stores. + - Pros: One operation == one metric event, matches client perspective. + - Cons: less insight into fast/slow tiers. + +2) **Store-level behavior** + - Wrap leaf stores and/or specific tiers (for example, wrap the "fast" and "slow" stores separately). + - Pros: visibility into where reads are served from. + - Cons: needs careful config to prevent double counting. + +Practical rule: **wrap at exactly one layer of the store graph** for any given request path. + +### Operation Mapping + +Typical mapping from `StoreDriver` methods: +- `has_with_results`: `read` + `hit/miss/error` (based on `results[i].is_some()` and call result) +- `get_part`: `read` + `hit/miss/error` (`NotFound` => `miss`) +- `update` / `update_with_whole_file`: `write` + `success/error`, bytes from `UploadSizeInfo` where available +- `delete` / remove-like operations: `delete` + `success/miss/error` (store-dependent) + +### Performance Considerations + +Primary overhead sources: +- Timer reads (`Instant::now()` + elapsed) +- Attribute allocation (avoid per-call `Vec` where possible) +- Recording calls into OpenTelemetry SDK (batch exporter settings matter) + +Mitigations: +- Precompute attribute slices per `(cache_type, op, result)` (attrs cache). +- Keep label cardinality low and stable. 
+- Avoid attaching digests/paths as labels. + +### Failure Semantics + +To keep dashboards stable: +- Treat `NotFound` on reads as `miss` (not `error`). +- Treat other errors as `error`. +- Only introduce `expired` if the store layer can definitively identify expiration. + +## Docs / Recording Rules Impact + +Ideal outcome: **no documentation changes** once wrapper metrics land. + +To reach that: +- Keep Prometheus-facing metric names/labels stable (`nativelink_cache_operations_total`, `cache_type`, `cache_operation_name`, `cache_operation_result`). +- Ensure `deployment-examples/metrics/prometheus-recording-rules.yml` references `_total` counter names. +- Keep existing dashboards querying recording rules (for example, `nativelink:cache_hit_rate`) instead of raw high-cardinality series. + +If wrapper metrics are **optional/config-gated**, docs may need a small note describing how to enable them; otherwise docs can remain unchanged. diff --git a/deployment-examples/metrics/docker-compose.yaml b/deployment-examples/metrics/docker-compose.yaml new file mode 100644 index 000000000..79eaf5cd4 --- /dev/null +++ b/deployment-examples/metrics/docker-compose.yaml @@ -0,0 +1,139 @@ +version: '3.8' + +services: + # OpenTelemetry Collector + otel-collector: + image: otel/opentelemetry-collector-contrib:0.98.0 + container_name: otel-collector + restart: unless-stopped + command: ["--config=/etc/otel-collector/config.yaml"] + volumes: + - ./otel-collector-config.yaml:/etc/otel-collector/config.yaml:ro + ports: + - "4317:4317" # OTLP gRPC receiver + - "4318:4318" # OTLP HTTP receiver + - "9090:9090" # Prometheus metrics exporter + - "8888:8888" # Collector metrics + - "13133:13133" # Health check + environment: + - OTLP_BACKEND_ENDPOINT=${OTLP_BACKEND_ENDPOINT:-otlp-backend:4317} + - OTLP_BACKEND_TOKEN=${OTLP_BACKEND_TOKEN:-} + networks: + - metrics + + # Prometheus with OTLP support + prometheus: + image: prom/prometheus:v3.7.3 + container_name: prometheus + restart: 
unless-stopped + command: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.path=/prometheus' + - '--web.console.libraries=/etc/prometheus/console_libraries' + - '--web.console.templates=/etc/prometheus/consoles' + - '--web.enable-lifecycle' + - '--web.enable-otlp-receiver' # Enable OTLP receiver + - '--storage.tsdb.retention.time=30d' + - '--query.max-concurrency=20' + volumes: + - ./prometheus-config.yaml:/etc/prometheus/prometheus.yml:ro + - ./prometheus-recording-rules.yml:/etc/prometheus/rules/nativelink.yml:ro + - prometheus_data:/prometheus + ports: + - "9091:9090" # Prometheus web UI (different port to avoid conflict with collector) + networks: + - metrics + depends_on: + - otel-collector + + # Grafana for visualization + grafana: + image: grafana/grafana:12.4.0 + container_name: grafana + restart: unless-stopped + ports: + - "3000:3000" + environment: + - GF_SECURITY_ADMIN_USER=admin + - GF_SECURITY_ADMIN_PASSWORD=admin + - GF_INSTALL_PLUGINS=grafana-piechart-panel + - GF_USERS_ALLOW_SIGN_UP=false + volumes: + - grafana_data:/var/lib/grafana + - ./grafana/provisioning:/etc/grafana/provisioning:ro + - ./grafana/dashboards:/var/lib/grafana/dashboards:ro + networks: + - metrics + depends_on: + - prometheus + + # Optional: AlertManager for alerts +# alertmanager: +# image: prom/alertmanager:v0.27.0 +# container_name: alertmanager +# restart: unless-stopped +# volumes: +# - ./alertmanager-config.yml:/etc/alertmanager/config.yml:ro +# - alertmanager_data:/alertmanager +# ports: +# - "9093:9093" +# command: +# - '--config.file=/etc/alertmanager/config.yml' +# - '--storage.path=/alertmanager' +# networks: +# - metrics + + # Optional: Node exporter for host metrics + node-exporter: + image: prom/node-exporter:v1.7.0 + container_name: node-exporter + restart: unless-stopped + volumes: + - /proc:/host/proc:ro + - /sys:/host/sys:ro + - /:/rootfs:ro + command: + - '--path.procfs=/host/proc' + - '--path.rootfs=/rootfs' + - '--path.sysfs=/host/sys' + 
- '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)' + ports: + - "9100:9100" + networks: + - metrics + + # Optional: Jaeger for trace visualization (if traces are enabled) + jaeger: + image: jaegertracing/all-in-one:1.53 + container_name: jaeger + restart: unless-stopped + environment: + - COLLECTOR_OTLP_ENABLED=true + ports: + - "16686:16686" # Jaeger UI + - "14268:14268" # Jaeger collector HTTP + networks: + - metrics + +volumes: + prometheus_data: + grafana_data: + alertmanager_data: + +networks: + metrics: + driver: bridge + +# Usage Instructions: +# 1. Start the stack: docker-compose up -d +# 2. Configure NativeLink with these environment variables: +# export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317 +# export OTEL_EXPORTER_OTLP_PROTOCOL=grpc +# export OTEL_SERVICE_NAME=nativelink +# export OTEL_RESOURCE_ATTRIBUTES="deployment.environment=dev" +# 3. Access services: +# - Prometheus: http://localhost:9091 +# - Grafana: http://localhost:3000 (admin/admin) +# - Jaeger: http://localhost:16686 +# - AlertManager: http://localhost:9093 +# - OTEL Collector metrics: http://localhost:8888/metrics diff --git a/deployment-examples/metrics/grafana/dashboards/nativelink-execution.json b/deployment-examples/metrics/grafana/dashboards/nativelink-execution.json new file mode 100644 index 000000000..7564378be --- /dev/null +++ b/deployment-examples/metrics/grafana/dashboards/nativelink-execution.json @@ -0,0 +1,2092 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "NativeLink Remote Execution and Worker Pool Metrics Dashboard", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + 
"w": 24, + "x": 0, + "y": 0 + }, + "id": 100, + "panels": [], + "title": "📊 Overview", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 0, + "y": 1 + }, + "id": 1, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(nativelink_execution_active_count{execution_instance=~\"$instance\"})", + "legendFormat": "Active Actions", + "range": true, + "refId": "A" + } + ], + "title": "Active Actions", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 4, + "y": 1 + }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(nativelink_worker_pool_count{worker_pool_instance=~\"$instance\"})", + "legendFormat": "Workers", + "range": true, + "refId": "A" + } + 
], + "title": "Total Workers", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 0.8 + }, + { + "color": "red", + "value": 0.95 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 8, + "y": 1 + }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(rate(nativelink_execution_completed_count_total{execution_result=\"success\", execution_instance=~\"$instance\"}[$__rate_interval])) / sum(rate(nativelink_execution_completed_count_total{execution_instance=~\"$instance\"}[$__rate_interval]))", + "legendFormat": "Success Rate", + "range": true, + "refId": "A" + } + ], + "title": "Success Rate", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 12, + "y": 1 + }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", 
+ "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(rate(nativelink_execution_completed_count_total{execution_instance=~\"$instance\"}[$__rate_interval]))", + "legendFormat": "Throughput", + "range": true, + "refId": "A" + } + ], + "title": "Throughput", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 5 + }, + { + "color": "red", + "value": 30 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 16, + "y": 1 + }, + "id": 5, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.50, sum(rate(nativelink_execution_queue_time_seconds_bucket{execution_instance=~\"$instance\"}[$__rate_interval])) by (le))", + "legendFormat": "p50 Queue Time", + "range": true, + "refId": "A" + } + ], + "title": "Median Queue Time", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 30 + }, + { + "color": "red", + "value": 120 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 20, + "y": 1 + }, + "id": 6, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + 
"orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.50, sum(rate(nativelink_execution_total_duration_seconds_bucket{execution_instance=~\"$instance\"}[$__rate_interval])) by (le))", + "legendFormat": "p50 Total Duration", + "range": true, + "refId": "A" + } + ], + "title": "Median Execution Time", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 5 + }, + "id": 101, + "panels": [], + "title": "⚡ Execution Pipeline", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "queued" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "executing" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "blue", + 
"mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "cache_check" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "purple", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 6 + }, + "id": 10, + "options": { + "legend": { + "calcs": ["last", "mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(nativelink_execution_actions_count{execution_instance=~\"$instance\"}) by (execution_stage)", + "legendFormat": "{{execution_stage}}", + "range": true, + "refId": "A" + } + ], + "title": "Actions by Stage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 6 + }, + "id": 11, + "options": { + "legend": { + "calcs": ["last", "mean", "max"], + "displayMode": "table", + "placement": 
"bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(rate(nativelink_execution_stage_transitions_total{execution_instance=~\"$instance\"}[$__rate_interval])) by (execution_stage)", + "legendFormat": "{{execution_stage}}", + "range": true, + "refId": "A" + } + ], + "title": "Stage Transitions Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ops" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "success" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "failure" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "cancelled" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "yellow", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { 
+ "id": "byName", + "options": "timeout" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "cache_hit" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "purple", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 14 + }, + "id": 12, + "options": { + "legend": { + "calcs": ["last", "mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(rate(nativelink_execution_completed_count_total{execution_instance=~\"$instance\"}[$__rate_interval])) by (execution_result)", + "legendFormat": "{{execution_result}}", + "range": true, + "refId": "A" + } + ], + "title": "Completed Executions by Result", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + 
"gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 14 + }, + "id": 13, + "options": { + "legend": { + "calcs": ["last", "mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(rate(nativelink_execution_retry_count_total{execution_instance=~\"$instance\"}[$__rate_interval])) by (execution_instance)", + "legendFormat": "{{execution_instance}}", + "range": true, + "refId": "A" + } + ], + "title": "Execution Retries", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 10 + }, + { + "color": "red", + "value": 50 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 22 + }, + "id": 14, + "options": { + "legend": { + "calcs": ["last", "mean", "max"], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + 
"pluginVersion": "10.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(nativelink_execution_queued_actions_count{execution_instance=~\"$instance\"}) by (execution_instance)", + "legendFormat": "{{execution_instance}}", + "range": true, + "refId": "A" + } + ], + "title": "Queued Actions", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 30 + }, + "id": 102, + "panels": [], + "title": "⏱️ Execution Performance", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "p99" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "p95" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "p50" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + 
} + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 31 + }, + "id": 20, + "options": { + "legend": { + "calcs": ["last", "mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.50, sum(rate(nativelink_execution_total_duration_seconds_bucket{execution_instance=~\"$instance\"}[$__rate_interval])) by (le))", + "legendFormat": "p50", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum(rate(nativelink_execution_total_duration_seconds_bucket{execution_instance=~\"$instance\"}[$__rate_interval])) by (le))", + "legendFormat": "p95", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum(rate(nativelink_execution_total_duration_seconds_bucket{execution_instance=~\"$instance\"}[$__rate_interval])) by (le))", + "legendFormat": "p99", + "range": true, + "refId": "C" + } + ], + "title": "Total Execution Duration (Percentiles)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": 
"linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "p99" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "p95" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "p50" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 31 + }, + "id": 21, + "options": { + "legend": { + "calcs": ["last", "mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.50, sum(rate(nativelink_execution_queue_time_seconds_bucket{execution_instance=~\"$instance\"}[$__rate_interval])) by (le))", + "legendFormat": "p50", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum(rate(nativelink_execution_queue_time_seconds_bucket{execution_instance=~\"$instance\"}[$__rate_interval])) by (le))", + "legendFormat": "p95", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.99, 
sum(rate(nativelink_execution_queue_time_seconds_bucket{execution_instance=~\"$instance\"}[$__rate_interval])) by (le))", + "legendFormat": "p99", + "range": true, + "refId": "C" + } + ], + "title": "Queue Wait Time (Percentiles)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 39 + }, + "id": 22, + "options": { + "legend": { + "calcs": ["last", "mean", "max"], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum(rate(nativelink_execution_stage_duration_seconds_bucket{execution_instance=~\"$instance\"}[$__rate_interval])) by (le, execution_stage))", + "legendFormat": "{{execution_stage}} p95", + "range": true, + "refId": "A" + } + ], + "title": "Stage Duration by Stage (p95)", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, 
+ "x": 0, + "y": 47 + }, + "id": 103, + "panels": [], + "title": "👷 Worker Pool", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "available" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "paused" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "yellow", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "draining" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 48 + }, + "id": 30, + "options": { + "legend": { + "calcs": ["last", "mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": 
"sum(nativelink_worker_pool_count{worker_pool_instance=~\"$instance\"}) by (worker_pool_state)", + "legendFormat": "{{worker_pool_state}}", + "range": true, + "refId": "A" + } + ], + "title": "Workers by State", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 48 + }, + "id": 31, + "options": { + "legend": { + "calcs": ["last", "mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(nativelink_worker_pool_actions_running{worker_pool_instance=~\"$instance\"}) by (worker_pool_instance)", + "legendFormat": "{{worker_pool_instance}}", + "range": true, + "refId": "A" + } + ], + "title": "Running Actions on Workers", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": 
"palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ops" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "added" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "removed" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "timeout" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "connection_failed" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "evicted" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "purple", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 56 + }, + "id": 32, + "options": { + "legend": { + "calcs": ["last", "mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.3.0", + 
"targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(rate(nativelink_worker_pool_events_total{worker_pool_instance=~\"$instance\"}[$__rate_interval])) by (worker_pool_event_type)", + "legendFormat": "{{worker_pool_event_type}}", + "range": true, + "refId": "A" + } + ], + "title": "Worker Pool Events Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ops" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "dispatched" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "completed" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "failures" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 56 + }, + "id": 33, + "options": { + "legend": { + "calcs": ["last", 
"mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(rate(nativelink_worker_pool_actions_dispatched_total{worker_pool_instance=~\"$instance\"}[$__rate_interval]))", + "legendFormat": "dispatched", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(rate(nativelink_worker_pool_actions_completed_total{worker_pool_instance=~\"$instance\"}[$__rate_interval]))", + "legendFormat": "completed", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(rate(nativelink_worker_pool_dispatch_failures_total{worker_pool_instance=~\"$instance\"}[$__rate_interval]))", + "legendFormat": "failures", + "range": true, + "refId": "C" + } + ], + "title": "Worker Actions Rate", + "type": "timeseries" + } + ], + "refresh": "30s", + "schemaVersion": 39, + "tags": ["nativelink", "remote-execution", "bazel"], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "Prometheus", + "value": "Prometheus" + }, + "hide": 0, + "includeAll": false, + "label": "Datasource", + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "allValue": ".*", + "current": { + "selected": true, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(nativelink_execution_active_count, execution_instance)", + "hide": 0, + "includeAll": true, + "label": "Instance", + "multi": true, + "name": "instance", + "options": [], + 
"query": { + "qryType": 1, + "query": "label_values(nativelink_execution_active_count, execution_instance)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": ["5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h"], + "time_options": ["5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d"] + }, + "timezone": "browser", + "title": "NativeLink Execution Metrics", + "uid": "nativelink-execution", + "version": 1, + "weekStart": "" +} + diff --git a/deployment-examples/metrics/grafana/dashboards/nativelink-overview.json b/deployment-examples/metrics/grafana/dashboards/nativelink-overview.json new file mode 100644 index 000000000..0ec71ab0c --- /dev/null +++ b/deployment-examples/metrics/grafana/dashboards/nativelink-overview.json @@ -0,0 +1,811 @@ +{ + "annotations": { + "list": [] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "type": "text", + "gridPos": { + "h": 2, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "code": { + "language": "plaintext", + "showLineNumbers": false, + "showMiniMap": false + }, + "content": "# NativeLink Metrics Dashboard\nMonitor remote execution performance", + "mode": "markdown" + }, + "pluginVersion": "10.3.0" + }, + { + "type": "stat", + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 2 + }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "expr": "sum(increase(nativelink_execution_completed_count_total[5m]))", + 
"legendFormat": "Completed", + "refId": "A" + } + ], + "title": "Executions (5m)" + }, + { + "type": "stat", + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "yellow", + "value": 90 + }, + { + "color": "green", + "value": 99 + } + ] + }, + "unit": "percent" + } + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 6, + "y": 2 + }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "expr": "sum(increase(nativelink_execution_completed_count_total{execution_result=\"success\"}[5m])) / sum(increase(nativelink_execution_completed_count_total[5m])) * 100", + "legendFormat": "Success Rate", + "refId": "A" + } + ], + "title": "Success Rate (%)" + }, + { + "type": "stat", + "gridPos": { + "h": 6, + "w": 6, + "x": 12, + "y": 2 + }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "expr": "sum(nativelink_execution_active_count{execution_stage=\"queued\"})", + "legendFormat": "Queued", + "refId": "A" + } + ], + "title": "Queued Actions" + }, + { + "type": "stat", + "gridPos": { + "h": 6, + "w": 6, + "x": 18, + "y": 2 + }, + "id": 5, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + 
"pluginVersion": "10.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "expr": "sum(nativelink_execution_active_count{execution_stage=\"executing\"})", + "legendFormat": "Executing", + "refId": "A" + } + ], + "title": "Executing Actions" + }, + { + "type": "timeseries", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineInterpolation": "smooth" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 6, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "10.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "expr": "sum(rate(nativelink_execution_completed_count_total[1m])) by (execution_result)", + "legendFormat": "{{execution_result}}", + "refId": "A" + } + ], + "title": "Execution Completion Rate" + }, + { + "type": "timeseries", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 20, + "lineInterpolation": "stepAfter", + "stacking": { + "mode": "normal" + } + } + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 7, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "10.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "expr": "sum(nativelink_execution_active_count) by (execution_stage)", + "legendFormat": "{{execution_stage}}", + "refId": "A" + } + ], + "title": "Actions by Stage" + }, + { + "type": "stat", + "gridPos": { + "h": 6, + "w": 8, + "x": 0, + "y": 16 + }, + "id": 8, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + 
"orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "expr": "sum(nativelink_execution_completed_count_total{execution_result=\"success\"})", + "legendFormat": "Successes", + "refId": "A" + } + ], + "title": "Total Successful Executions" + }, + { + "type": "stat", + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "red", + "value": 10 + } + ] + } + } + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 8, + "y": 16 + }, + "id": 9, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "expr": "sum(nativelink_execution_completed_count_total{execution_result=\"failure\"})", + "legendFormat": "Failures", + "refId": "A" + } + ], + "title": "Total Failed Executions" + }, + { + "type": "stat", + "fieldConfig": { + "defaults": { + "unit": "ops" + } + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 16, + "y": 16 + }, + "id": 10, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "expr": "sum(rate(nativelink_execution_stage_transitions_total[1m]))", + "legendFormat": "Transitions/sec", + "refId": "A" + } + ], + "title": "Stage 
Transitions/sec" + }, + { + "type": "timeseries", + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineInterpolation": "smooth" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 22 + }, + "id": 11, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "10.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "expr": "sum(rate(nativelink_execution_stage_transitions_total[1m])) by (from_stage, to_stage)", + "legendFormat": "{{from_stage}} -> {{to_stage}}", + "refId": "A" + } + ], + "title": "Stage Transitions Over Time" + }, + { + "type": "table", + "fieldConfig": { + "defaults": { + "custom": { + "align": "auto", + "displayMode": "auto" + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Failures" + }, + "properties": [ + { + "id": "custom.displayMode", + "value": "color-background" + }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "red", + "value": 10 + } + ] + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 30 + }, + "id": 12, + "options": { + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "Failures" + } + ] + }, + "pluginVersion": "10.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "expr": "sum by (instance_name, service_instance_id) (increase(nativelink_execution_completed_count_total{execution_result=\"failure\"}[1h]))", + "format": "table", + "instant": true, + "legendFormat": "", + "refId": "A" + } + ], + "title": "Failures by Instance (1h)", + "transformations": [ + { + "id": "organize", + "options": { + 
"excludeByName": { + "Time": true + }, + "renameByName": { + "Value": "Failures", + "instance_name": "Instance", + "service_instance_id": "Worker ID" + } + } + } + ] + }, + { + "type": "timeseries", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "barAlignment": 0, + "drawStyle": "bars", + "fillOpacity": 80, + "lineWidth": 0, + "stacking": { + "mode": "normal" + } + } + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 30 + }, + "id": 13, + "options": { + "legend": { + "calcs": [ + "sum" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "expr": "sum(increase(nativelink_execution_completed_count_total{execution_result=\"failure\"}[5m])) or vector(0)", + "legendFormat": "Failures (exit_code != 0)", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "expr": "sum(increase(nativelink_execution_completed_count_total{execution_result=\"success\"}[5m])) or vector(0)", + "legendFormat": "Successes", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "expr": "sum(increase(nativelink_execution_completed_count_total{execution_result=\"cancelled\"}[5m])) or vector(0)", + "legendFormat": "Cancelled", + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "expr": "sum(increase(nativelink_execution_completed_count_total{execution_result=\"timeout\"}[5m])) or vector(0)", + "legendFormat": "Timeout", + "refId": "D" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "expr": "sum(increase(nativelink_execution_completed_count_total{execution_result=\"cache_hit\"}[5m])) or vector(0)", + "legendFormat": "Cache Hits", + "refId": "E" + 
} + ], + "title": "Execution Results Over Time" + }, + { + "type": "table", + "description": "Shows action digests that have failed with non-zero exit code, timed out, or were cancelled. Note: Bazel build failures (client-side) may not appear here if the action never reached completion in NativeLink.", + "fieldConfig": { + "defaults": { + "custom": { + "align": "auto", + "displayMode": "auto", + "filterable": true + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Count" + }, + "properties": [ + { + "id": "custom.displayMode", + "value": "color-background" + }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "red", + "value": 5 + } + ] + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Action Digest" + }, + "properties": [ + { + "id": "custom.width", + "value": 400 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Result" + }, + "properties": [ + { + "id": "custom.width", + "value": 100 + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 38 + }, + "id": 14, + "options": { + "footer": { + "enablePagination": true + }, + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "Count" + } + ] + }, + "pluginVersion": "10.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "expr": "sum by (execution_action_digest, execution_result) (increase(nativelink_execution_completed_count_total{execution_result=~\"failure|cancelled|timeout\"}[1h])) > 0", + "format": "table", + "instant": true, + "legendFormat": "", + "refId": "A" + } + ], + "title": "Failed/Cancelled/Timed Out Actions by Digest", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true + }, + "renameByName": { + "Value": "Count", + "execution_action_digest": "Action Digest", + "execution_result": 
"Result" + } + } + }, + { + "id": "sortBy", + "options": { + "fields": {}, + "sort": [ + { + "desc": true, + "field": "Count" + } + ] + } + } + ] + } + ], + "refresh": "10s", + "schemaVersion": 39, + "tags": [ + "nativelink", + "remote-execution" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "NativeLink Overview", + "uid": "nativelink-overview", + "version": 1, + "weekStart": "" +} diff --git a/deployment-examples/metrics/grafana/dashboards/nativelink-stores.json b/deployment-examples/metrics/grafana/dashboards/nativelink-stores.json new file mode 100644 index 000000000..cab3eb1d3 --- /dev/null +++ b/deployment-examples/metrics/grafana/dashboards/nativelink-stores.json @@ -0,0 +1,1522 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "links": [], + "panels": [ + { + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "code": { + "language": "plaintext", + "showLineNumbers": false, + "showMiniMap": false + }, + "content": "# NativeLink Store Metrics\nMonitor cache hit/miss rates and read/write latency for all stores", + "mode": "markdown" + }, + "pluginVersion": "12.2.1", + "title": "", + "type": "text" + }, + { + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "yellow", + "value": 50 + }, + { + "color": "green", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 2 + }, + "id": 2, + "options": { + 
"colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.2.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(rate(nativelink_store_operations_total{cache_operation_name=\"read\", cache_operation_result=\"hit\", store_type=~\"$store_type\", service_namespace=~\"$instance\", store_name=~\"$store_name\"}[$__rate_interval])) / sum(rate(nativelink_store_operations_total{cache_operation_name=\"read\", cache_operation_result=~\"hit|miss\", store_type=~\"$store_type\", service_namespace=~\"$instance\", store_name=~\"$store_name\"}[$__rate_interval]))", + "legendFormat": "Hit Rate", + "refId": "A" + } + ], + "title": "Cache Hit Rate", + "type": "stat" + }, + { + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "yellow", + "value": 20 + }, + { + "color": "red", + "value": 50 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 6, + "y": 2 + }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.2.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(rate(nativelink_store_operations_total{cache_operation_name=\"read\", cache_operation_result=\"miss\", store_type=~\"$store_type\", service_namespace=~\"$instance\", 
store_name=~\"$store_name\"}[$__rate_interval])) / sum(rate(nativelink_store_operations_total{cache_operation_name=\"read\", cache_operation_result=~\"hit|miss\", store_type=~\"$store_type\", service_namespace=~\"$instance\", store_name=~\"$store_name\"}[$__rate_interval]))", + "legendFormat": "Miss Rate", + "refId": "A" + } + ], + "title": "Cache Miss Rate", + "type": "stat" + }, + { + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "yellow", + "value": 90 + }, + { + "color": "green", + "value": 99 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 12, + "y": 2 + }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.2.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(rate(nativelink_store_operations_total{cache_operation_name=\"read\", cache_operation_result=\"success\", store_type=~\"$store_type\", service_namespace=~\"$instance\", store_name=~\"$store_name\"}[$__rate_interval])) / sum(rate(nativelink_store_operations_total{cache_operation_name=\"read\", store_type=~\"$store_type\", service_namespace=~\"$instance\", store_name=~\"$store_name\"}[$__rate_interval]))", + "legendFormat": "Read Success Rate", + "refId": "A" + } + ], + "title": "Read Success Rate", + "type": "stat" + }, + { + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "red", + "value": 0 + }, + { + "color": "yellow", + "value": 90 + }, + { + "color": "green", + "value": 99 + } + ] + }, + "unit": 
"percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 18, + "y": 2 + }, + "id": 5, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "12.2.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(rate(nativelink_store_operations_total{cache_operation_name=\"write\", cache_operation_result=\"success\", store_type=~\"$store_type\", service_namespace=~\"$instance\", store_name=~\"$store_name\"}[$__rate_interval])) / sum(rate(nativelink_store_operations_total{cache_operation_name=\"write\", store_type=~\"$store_type\", service_namespace=~\"$instance\", store_name=~\"$store_name\"}[$__rate_interval]))", + "legendFormat": "Write Success Rate", + "refId": "A" + } + ], + "title": "Write Success Rate", + "type": "stat" + }, + { + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + 
"unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 6, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "12.2.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(rate(nativelink_store_operations_total{cache_operation_name=\"read\", cache_operation_result=\"hit\", store_type=~\"$store_type\", service_namespace=~\"$instance\", store_name=~\"$store_name\"}[$__rate_interval]))", + "legendFormat": "Cache Hits", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(rate(nativelink_store_operations_total{cache_operation_name=\"read\", cache_operation_result=\"miss\", store_type=~\"$store_type\", service_namespace=~\"$instance\", store_name=~\"$store_name\"}[$__rate_interval]))", + "legendFormat": "Cache Misses", + "refId": "B" + } + ], + "title": "Cache Hits vs Misses", + "type": "timeseries" + }, + { + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + 
"value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 7, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "12.2.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(rate(nativelink_store_operations_total{cache_operation_name=\"read\", store_type=~\"$store_type\", service_namespace=~\"$instance\", store_name=~\"$store_name\"}[$__rate_interval])) by (cache_operation_result)", + "legendFormat": "Read ({{cache_operation_result}})", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(rate(nativelink_store_operations_total{cache_operation_name=\"write\", store_type=~\"$store_type\", service_namespace=~\"$instance\", store_name=~\"$store_name\"}[$__rate_interval])) by (cache_operation_result)", + "legendFormat": "Write ({{cache_operation_result}})", + "refId": "B" + } + ], + "title": "Read/Write Operations", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 16 + }, + "id": 8, + "panels": [], + "title": "Read Latency", + "type": "row" + }, + { + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + 
"showPoints": "auto", + "showValues": false, + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 17 + }, + "id": 9, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "12.2.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "histogram_quantile(0.50, sum(rate(nativelink_store_operation_duration_milliseconds_bucket{cache_operation_name=\"read\", store_type=~\"$store_type\", service_namespace=~\"$instance\", store_name=~\"$store_name\"}[$__rate_interval])) by (le))", + "legendFormat": "p50", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "histogram_quantile(0.90, sum(rate(nativelink_store_operation_duration_milliseconds_bucket{cache_operation_name=\"read\", store_type=~\"$store_type\", service_namespace=~\"$instance\", store_name=~\"$store_name\"}[$__rate_interval])) by (le))", + "legendFormat": "p90", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "histogram_quantile(0.99, sum(rate(nativelink_store_operation_duration_milliseconds_bucket{cache_operation_name=\"read\", store_type=~\"$store_type\", service_namespace=~\"$instance\", store_name=~\"$store_name\"}[$__rate_interval])) by (le))", + "legendFormat": "p99", + "refId": "C" + } + ], + "title": "Read Latency Percentiles (All Stores)", + "type": "timeseries" + }, + { + "fieldConfig": { + "defaults": { + "color": { + "mode": 
"palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 17 + }, + "id": 10, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "12.2.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "histogram_quantile(0.99, sum(rate(nativelink_store_operation_duration_milliseconds_bucket{cache_operation_name=\"read\", store_type=~\"$store_type\", service_namespace=~\"$instance\", store_name=~\"$store_name\"}[$__rate_interval])) by (le, store_type))", + "legendFormat": "p99 - {{store_type}}", + "refId": "A" + } + ], + "title": "Read Latency p99 by Store Type", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 25 + }, + "id": 11, + "panels": [], + "title": "Write Latency", + "type": "row" + }, + { + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + 
"axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 26 + }, + "id": 12, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "12.2.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "histogram_quantile(0.50, sum(rate(nativelink_store_operation_duration_milliseconds_bucket{cache_operation_name=\"write\", store_type=~\"$store_type\", service_namespace=~\"$instance\", store_name=~\"$store_name\"}[$__rate_interval])) by (le))", + "legendFormat": "p50", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "histogram_quantile(0.90, sum(rate(nativelink_store_operation_duration_milliseconds_bucket{cache_operation_name=\"write\", store_type=~\"$store_type\", service_namespace=~\"$instance\", store_name=~\"$store_name\"}[$__rate_interval])) by (le))", + "legendFormat": "p90", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": 
"histogram_quantile(0.99, sum(rate(nativelink_store_operation_duration_milliseconds_bucket{cache_operation_name=\"write\", store_type=~\"$store_type\", service_namespace=~\"$instance\", store_name=~\"$store_name\"}[$__rate_interval])) by (le))", + "legendFormat": "p99", + "refId": "C" + } + ], + "title": "Write Latency Percentiles (All Stores)", + "type": "timeseries" + }, + { + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "showValues": false, + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 0 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 26 + }, + "id": 13, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "12.2.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "histogram_quantile(0.99, sum(rate(nativelink_store_operation_duration_milliseconds_bucket{cache_operation_name=\"write\", store_type=~\"$store_type\", service_namespace=~\"$instance\", store_name=~\"$store_name\"}[$__rate_interval])) by (le, store_type))", + 
"legendFormat": "p99 - {{store_type}}", + "refId": "A" + } + ], + "title": "Write Latency p99 by Store Type", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 34 + }, + "id": 14, + "panels": [], + "title": "Operations by Store", + "type": "row" + }, + { + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 20, + "lineInterpolation": "smooth", + "spanNulls": true, + "stacking": { + "mode": "normal" + } + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 35 + }, + "id": 15, + "options": { + "legend": { + "calcs": [ + "mean", + "sum" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(rate(nativelink_store_operations_total{store_type=~\"$store_type\", service_namespace=~\"$instance\", store_name=~\"$store_name\"}[$__rate_interval])) by (store_type)", + "legendFormat": "{{store_type}}", + "refId": "A" + } + ], + "title": "Operations by Store Type", + "type": "timeseries" + }, + { + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 20, + "lineInterpolation": "smooth", + "spanNulls": true, + "stacking": { + "mode": "normal" + } + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 35 + }, + "id": 16, + "options": { + "legend": { + "calcs": [ + "mean", + "sum" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(rate(nativelink_store_operations_total{store_type=~\"$store_type\", 
service_namespace=~\"$instance\", store_name=~\"$store_name\"}[$__rate_interval])) by (store_name)", + "legendFormat": "{{store_name}}", + "refId": "A" + } + ], + "title": "Operations by Store Name", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 43 + }, + "id": 17, + "panels": [], + "title": "Store Details", + "type": "row" + }, + { + "fieldConfig": { + "defaults": { + "custom": { + "align": "auto", + "displayMode": "auto", + "filterable": true + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Hit Rate" + }, + "properties": [ + { + "id": "unit", + "value": "percentunit" + }, + { + "id": "custom.displayMode", + "value": "color-background-solid" + }, + { + "id": "thresholds", + "value": { + "mode": "percentage", + "steps": [ + { + "color": "red" + }, + { + "color": "yellow", + "value": 50 + }, + { + "color": "green", + "value": 80 + } + ] + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "p99 Latency" + }, + "properties": [ + { + "id": "unit", + "value": "ms" + }, + { + "id": "custom.displayMode", + "value": "color-background-solid" + }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "yellow", + "value": 100 + }, + { + "color": "red", + "value": 500 + } + ] + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Operations/sec" + }, + "properties": [ + { + "id": "unit", + "value": "ops" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 44 + }, + "id": 18, + "options": { + "footer": { + "enablePagination": true + }, + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "Operations/sec" + } + ] + }, + "pluginVersion": "10.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(rate(nativelink_store_operations_total{cache_operation_name=\"read\", cache_operation_result=\"hit\", 
store_type=~\"$store_type\", service_namespace=~\"$instance\", store_name=~\"$store_name\"}[$__rate_interval])) by (store_type, store_name) / sum(rate(nativelink_store_operations_total{cache_operation_name=\"read\", cache_operation_result=~\"hit|miss\", store_type=~\"$store_type\", service_namespace=~\"$instance\", store_name=~\"$store_name\"}[$__rate_interval])) by (store_type, store_name)", + "format": "table", + "instant": true, + "legendFormat": "", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "histogram_quantile(0.99, sum(rate(nativelink_store_operation_duration_milliseconds_bucket{store_type=~\"$store_type\", service_namespace=~\"$instance\", store_name=~\"$store_name\"}[$__rate_interval])) by (le, store_type, store_name))", + "format": "table", + "instant": true, + "legendFormat": "", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(rate(nativelink_store_operations_total{store_type=~\"$store_type\", service_namespace=~\"$instance\", store_name=~\"$store_name\"}[$__rate_interval])) by (store_type, store_name)", + "format": "table", + "instant": true, + "legendFormat": "", + "refId": "C" + } + ], + "title": "Store Performance Summary", + "transformations": [ + { + "id": "seriesToColumns", + "options": { + "byField": "store_name" + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "Time 1": true, + "Time 2": true, + "Time 3": true, + "store_type 2": true, + "store_type 3": true + }, + "renameByName": { + "Value #A": "Hit Rate", + "Value #B": "p99 Latency", + "Value #C": "Operations/sec", + "store_name": "Store Name", + "store_type 1": "Store Type" + } + } + } + ], + "type": "table" + }, + { + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "log": 2, + "type": "log" + } + } + }, + "overrides": [] + }, + 
"gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 54 + }, + "id": 19, + "options": { + "calculate": false, + "cellGap": 1, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "scale": "exponential", + "scheme": "Oranges", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": false + }, + "yAxis": { + "axisPlacement": "left", + "reverse": false, + "unit": "ms" + } + }, + "pluginVersion": "10.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(increase(nativelink_store_operation_duration_milliseconds_bucket{cache_operation_name=\"read\", store_type=~\"$store_type\", service_namespace=~\"$instance\", store_name=~\"$store_name\"}[$__rate_interval])) by (le)", + "format": "heatmap", + "legendFormat": "{{le}}", + "refId": "A" + } + ], + "title": "Read Latency Distribution", + "type": "heatmap" + }, + { + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "log": 2, + "type": "log" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 54 + }, + "id": 20, + "options": { + "calculate": false, + "cellGap": 1, + "color": { + "exponent": 0.5, + "fill": "dark-blue", + "mode": "scheme", + "scale": "exponential", + "scheme": "Blues", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": false + }, + "yAxis": { + "axisPlacement": "left", + "reverse": false, + "unit": "ms" + } + }, + "pluginVersion": "10.3.0", + "targets": [ + { + "datasource": { + 
"type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(increase(nativelink_store_operation_duration_milliseconds_bucket{cache_operation_name=\"write\", store_type=~\"$store_type\", service_namespace=~\"$instance\", store_name=~\"$store_name\"}[$__rate_interval])) by (le)", + "format": "heatmap", + "legendFormat": "{{le}}", + "refId": "A" + } + ], + "title": "Write Latency Distribution", + "type": "heatmap" + } + ], + "preload": false, + "refresh": "10s", + "schemaVersion": 42, + "tags": [ + "nativelink", + "stores", + "cache" + ], + "templating": { + "list": [ + { + "current": { + "text": "vmstorage-dev", + "value": "6PUSXUw4k" + }, + "includeAll": false, + "label": "Datasource", + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "vmstorage-dev", + "type": "datasource" + }, + { + "allValue": ".*", + "current": { "selected": true, "text": "All", "value": "$__all" }, + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "definition": "label_values(nativelink_store_operations_total, service_namespace)", + "hide": 0, + "includeAll": true, + "label": "Instance", + "multi": true, + "name": "instance", + "options": [], + "query": { "qryType": 1, "query": "label_values(nativelink_store_operations_total, service_namespace)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, + "refresh": 2, + "regex": "/^nativelink.*/", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "current": { + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(nativelink_store_operations_total, store_type)", + "includeAll": true, + "label": "Store Type", + "multi": true, + "name": "store_type", + "options": [], + "query": { + "query": "label_values(nativelink_store_operations_total, store_type)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "", + "sort": 1, + "type": "query" + }, + { + "current": { + 
"text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(nativelink_store_operations_total{store_type=~\"$store_type\", service_namespace=~\"$instance\"}, store_name)", + "includeAll": true, + "label": "Store Name", + "multi": true, + "name": "store_name", + "options": [], + "query": { + "query": "label_values(nativelink_store_operations_total{store_type=~\"$store_type\", service_namespace=~\"$instance\"}, store_name)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "", + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "NativeLink Stores", + "uid": "nativelink-stores", + "version": 4 +} diff --git a/deployment-examples/metrics/grafana/dashboards/nativelink-worker.json b/deployment-examples/metrics/grafana/dashboards/nativelink-worker.json new file mode 100644 index 000000000..6bd6f1ece --- /dev/null +++ b/deployment-examples/metrics/grafana/dashboards/nativelink-worker.json @@ -0,0 +1,1298 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "NativeLink Worker Metrics Dashboard - Local Worker and Running Actions metrics", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, + "id": 100, + "panels": [], + "title": "📊 Worker Overview", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", 
"value": null }] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 0, "y": 1 }, + "id": 1, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "pluginVersion": "10.3.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum(rate(nativelink_worker_start_actions_received_total{service_namespace=~\"$instance\", deployment_name=~\"$worker_type\", pod_name=~\"$worker\"}[$__rate_interval]))", + "legendFormat": "Actions/s", + "range": true, + "refId": "A" + } + ], + "title": "Actions Received Rate", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 1 }, + { "color": "red", "value": 10 } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 8, "y": 1 }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "pluginVersion": "10.3.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum(rate(nativelink_worker_disconnects_received_total{service_namespace=~\"$instance\", deployment_name=~\"$worker_type\", pod_name=~\"$worker\"}[$__rate_interval])) * 60", + "legendFormat": "Disconnects/min", + "range": true, + "refId": "A" + } + ], + "title": "Disconnects (per min)", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + 
"defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 1 }, + { "color": "red", "value": 5 } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 16, "y": 1 }, + "id": 5, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "pluginVersion": "10.3.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum(rate(nativelink_running_actions_task_timeouts_total{service_namespace=~\"$instance\", deployment_name=~\"$worker_type\", pod_name=~\"$worker\"}[$__rate_interval])) * 60", + "legendFormat": "Timeouts/min", + "range": true, + "refId": "A" + } + ], + "title": "Task Timeouts (per min)", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 1 } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 20, "y": 1 }, + "id": 6, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "pluginVersion": "10.3.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum(rate(nativelink_running_actions_child_process_failure_exit_code_total{service_namespace=~\"$instance\", deployment_name=~\"$worker_type\", pod_name=~\"$worker\"}[$__rate_interval])) * 60", + 
"legendFormat": "Failures/min", + "range": true, + "refId": "A" + } + ], + "title": "Child Process Failures (per min)", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 }, + "id": 101, + "panels": [], + "title": "👷 Local Worker Metrics", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 }, + "id": 10, + "options": { + "legend": { "calcs": ["last", "mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "pluginVersion": "10.3.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum(rate(nativelink_worker_start_actions_received_total{service_namespace=~\"$instance\", deployment_name=~\"$worker_type\", pod_name=~\"$worker\"}[$__rate_interval])) by (pod_name)", + "legendFormat": "{{pod_name}}", + "range": true, + "refId": "A" + } + ], + "title": "Actions Received Rate", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + 
"defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "ops" + }, + "overrides": [ + { "matcher": { "id": "byName", "options": "disconnects" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "keep_alives" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] } + ] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 14 }, + "id": 13, + "options": { + "legend": { "calcs": ["last", "mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "pluginVersion": "10.3.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum(rate(nativelink_worker_disconnects_received_total{service_namespace=~\"$instance\", deployment_name=~\"$worker_type\", pod_name=~\"$worker\"}[$__rate_interval]))", + "legendFormat": "disconnects", + "range": true, + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum(rate(nativelink_worker_keep_alives_received_total{service_namespace=~\"$instance\", deployment_name=~\"$worker_type\", 
pod_name=~\"$worker\"}[$__rate_interval]))", + "legendFormat": "keep_alives", + "range": true, + "refId": "B" + } + ], + "title": "Scheduler Communication Rate", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 22 }, + "id": 102, + "panels": [], + "title": "⚡ Running Actions - Operation Rates", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 23 }, + "id": 20, + "options": { + "legend": { "calcs": ["last", "mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "pluginVersion": "10.3.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum(rate(nativelink_running_actions_create_and_add_action_calls_total{service_namespace=~\"$instance\", deployment_name=~\"$worker_type\", pod_name=~\"$worker\"}[$__rate_interval]))", + "legendFormat": "create_and_add_action", + "range": true, + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": 
"code", + "expr": "sum(rate(nativelink_running_actions_create_action_info_calls_total{service_namespace=~\"$instance\", deployment_name=~\"$worker_type\", pod_name=~\"$worker\"}[$__rate_interval]))", + "legendFormat": "create_action_info", + "range": true, + "refId": "B" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum(rate(nativelink_running_actions_make_action_directory_calls_total{service_namespace=~\"$instance\", deployment_name=~\"$worker_type\", pod_name=~\"$worker\"}[$__rate_interval]))", + "legendFormat": "make_action_directory", + "range": true, + "refId": "C" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum(rate(nativelink_running_actions_prepare_action_calls_total{service_namespace=~\"$instance\", deployment_name=~\"$worker_type\", pod_name=~\"$worker\"}[$__rate_interval]))", + "legendFormat": "prepare_action", + "range": true, + "refId": "D" + } + ], + "title": "Action Setup Operations Rate", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, 
"y": 23 }, + "id": 21, + "options": { + "legend": { "calcs": ["last", "mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "pluginVersion": "10.3.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum(rate(nativelink_running_actions_execute_calls_total{service_namespace=~\"$instance\", deployment_name=~\"$worker_type\", pod_name=~\"$worker\"}[$__rate_interval]))", + "legendFormat": "execute", + "range": true, + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum(rate(nativelink_running_actions_upload_results_calls_total{service_namespace=~\"$instance\", deployment_name=~\"$worker_type\", pod_name=~\"$worker\"}[$__rate_interval]))", + "legendFormat": "upload_results", + "range": true, + "refId": "B" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum(rate(nativelink_running_actions_cleanup_calls_total{service_namespace=~\"$instance\", deployment_name=~\"$worker_type\", pod_name=~\"$worker\"}[$__rate_interval]))", + "legendFormat": "cleanup", + "range": true, + "refId": "C" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum(rate(nativelink_running_actions_get_finished_result_calls_total{service_namespace=~\"$instance\", deployment_name=~\"$worker_type\", pod_name=~\"$worker\"}[$__rate_interval]))", + "legendFormat": "get_finished_result", + "range": true, + "refId": "D" + } + ], + "title": "Execution & Finalization Operations Rate", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + 
"axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "ops" + }, + "overrides": [ + { "matcher": { "id": "byName", "options": "success_exit_code" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "failure_exit_code" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] } + ] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 31 }, + "id": 22, + "options": { + "legend": { "calcs": ["last", "mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "pluginVersion": "10.3.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum(rate(nativelink_running_actions_child_process_success_exit_code_total{service_namespace=~\"$instance\", deployment_name=~\"$worker_type\", pod_name=~\"$worker\"}[$__rate_interval]))", + "legendFormat": "success_exit_code", + "range": true, + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum(rate(nativelink_running_actions_child_process_failure_exit_code_total{service_namespace=~\"$instance\", deployment_name=~\"$worker_type\", pod_name=~\"$worker\"}[$__rate_interval]))", + "legendFormat": "failure_exit_code", + "range": true, + "refId": "B" + } + ], + "title": 
"Child Process Exit Codes Rate", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 31 }, + "id": 23, + "options": { + "legend": { "calcs": ["last", "mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "pluginVersion": "10.3.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum(rate(nativelink_running_actions_download_to_directory_calls_total{service_namespace=~\"$instance\", deployment_name=~\"$worker_type\", pod_name=~\"$worker\"}[$__rate_interval]))", + "legendFormat": "download_to_directory", + "range": true, + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum(rate(nativelink_running_actions_upload_stdout_calls_total{service_namespace=~\"$instance\", deployment_name=~\"$worker_type\", pod_name=~\"$worker\"}[$__rate_interval]))", + "legendFormat": "upload_stdout", + "range": true, + "refId": "B" + }, + { + "datasource": { "type": 
"prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum(rate(nativelink_running_actions_upload_stderr_calls_total{service_namespace=~\"$instance\", deployment_name=~\"$worker_type\", pod_name=~\"$worker\"}[$__rate_interval]))", + "legendFormat": "upload_stderr", + "range": true, + "refId": "C" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum(rate(nativelink_running_actions_cache_action_result_calls_total{service_namespace=~\"$instance\", deployment_name=~\"$worker_type\", pod_name=~\"$worker\"}[$__rate_interval]))", + "legendFormat": "cache_action_result", + "range": true, + "refId": "D" + } + ], + "title": "I/O Operations Rate", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 39 }, + "id": 103, + "panels": [], + "title": "⏱️ Running Actions - Operation Durations", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 40 }, + "id": 30, + "options": { + "legend": { "calcs": ["last", "mean", "max"], "displayMode": "table", "placement": "bottom", 
"showLegend": true }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "pluginVersion": "10.3.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum(rate(nativelink_running_actions_prepare_action_duration_milliseconds_bucket{service_namespace=~\"$instance\", deployment_name=~\"$worker_type\", pod_name=~\"$worker\"}[$__rate_interval])) by (le))", + "legendFormat": "prepare_action p95", + "range": true, + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum(rate(nativelink_running_actions_execute_duration_milliseconds_bucket{service_namespace=~\"$instance\", deployment_name=~\"$worker_type\", pod_name=~\"$worker\"}[$__rate_interval])) by (le))", + "legendFormat": "execute p95", + "range": true, + "refId": "B" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum(rate(nativelink_running_actions_upload_results_duration_milliseconds_bucket{service_namespace=~\"$instance\", deployment_name=~\"$worker_type\", pod_name=~\"$worker\"}[$__rate_interval])) by (le))", + "legendFormat": "upload_results p95", + "range": true, + "refId": "C" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum(rate(nativelink_running_actions_cleanup_duration_milliseconds_bucket{service_namespace=~\"$instance\", deployment_name=~\"$worker_type\", pod_name=~\"$worker\"}[$__rate_interval])) by (le))", + "legendFormat": "cleanup p95", + "range": true, + "refId": "D" + } + ], + "title": "Core Operations Duration (p95)", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + 
"axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 40 }, + "id": 31, + "options": { + "legend": { "calcs": ["last", "mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "pluginVersion": "10.3.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum(rate(nativelink_running_actions_download_to_directory_duration_milliseconds_bucket{service_namespace=~\"$instance\", deployment_name=~\"$worker_type\", pod_name=~\"$worker\"}[$__rate_interval])) by (le))", + "legendFormat": "download_to_directory p95", + "range": true, + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum(rate(nativelink_running_actions_upload_stdout_duration_milliseconds_bucket{service_namespace=~\"$instance\", deployment_name=~\"$worker_type\", pod_name=~\"$worker\"}[$__rate_interval])) by (le))", + "legendFormat": "upload_stdout p95", + "range": true, + "refId": "B" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, 
sum(rate(nativelink_running_actions_upload_stderr_duration_milliseconds_bucket{service_namespace=~\"$instance\", deployment_name=~\"$worker_type\", pod_name=~\"$worker\"}[$__rate_interval])) by (le))", + "legendFormat": "upload_stderr p95", + "range": true, + "refId": "C" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum(rate(nativelink_running_actions_cache_action_result_duration_milliseconds_bucket{service_namespace=~\"$instance\", deployment_name=~\"$worker_type\", pod_name=~\"$worker\"}[$__rate_interval])) by (le))", + "legendFormat": "cache_action_result p95", + "range": true, + "refId": "D" + } + ], + "title": "I/O Operations Duration (p95)", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "ms" + }, + "overrides": [ + { "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "p50" }, "properties": 
[{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] } + ] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 48 }, + "id": 32, + "options": { + "legend": { "calcs": ["last", "mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "pluginVersion": "10.3.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "histogram_quantile(0.50, sum(rate(nativelink_running_actions_execute_duration_milliseconds_bucket{service_namespace=~\"$instance\", deployment_name=~\"$worker_type\", pod_name=~\"$worker\"}[$__rate_interval])) by (le))", + "legendFormat": "p50", + "range": true, + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum(rate(nativelink_running_actions_execute_duration_milliseconds_bucket{service_namespace=~\"$instance\", deployment_name=~\"$worker_type\", pod_name=~\"$worker\"}[$__rate_interval])) by (le))", + "legendFormat": "p95", + "range": true, + "refId": "B" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum(rate(nativelink_running_actions_execute_duration_milliseconds_bucket{service_namespace=~\"$instance\", deployment_name=~\"$worker_type\", pod_name=~\"$worker\"}[$__rate_interval])) by (le))", + "legendFormat": "p99", + "range": true, + "refId": "C" + } + ], + "title": "Execute Duration (Percentiles)", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", 
+ "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "ms" + }, + "overrides": [ + { "matcher": { "id": "byName", "options": "p99" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "p95" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "p50" }, "properties": [{ "id": "color", "value": { "fixedColor": "green", "mode": "fixed" } }] } + ] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 48 }, + "id": 33, + "options": { + "legend": { "calcs": ["last", "mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "pluginVersion": "10.3.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "histogram_quantile(0.50, sum(rate(nativelink_running_actions_child_process_duration_milliseconds_bucket{service_namespace=~\"$instance\", deployment_name=~\"$worker_type\", pod_name=~\"$worker\"}[$__rate_interval])) by (le))", + "legendFormat": "p50", + "range": true, + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum(rate(nativelink_running_actions_child_process_duration_milliseconds_bucket{service_namespace=~\"$instance\", deployment_name=~\"$worker_type\", pod_name=~\"$worker\"}[$__rate_interval])) by (le))", + "legendFormat": "p95", + "range": true, + 
"refId": "B" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum(rate(nativelink_running_actions_child_process_duration_milliseconds_bucket{service_namespace=~\"$instance\", deployment_name=~\"$worker_type\", pod_name=~\"$worker\"}[$__rate_interval])) by (le))", + "legendFormat": "p99", + "range": true, + "refId": "C" + } + ], + "title": "Child Process Duration (Percentiles)", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 56 }, + "id": 104, + "panels": [], + "title": "⚠️ Errors & Issues", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 57 }, + "id": 40, + "options": { + "legend": { "calcs": ["last", "mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "pluginVersion": "10.3.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": 
"sum(rate(nativelink_running_actions_execute_failures_total{service_namespace=~\"$instance\", deployment_name=~\"$worker_type\", pod_name=~\"$worker\"}[$__rate_interval]))", + "legendFormat": "execute_failures", + "range": true, + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum(rate(nativelink_running_actions_prepare_action_failures_total{service_namespace=~\"$instance\", deployment_name=~\"$worker_type\", pod_name=~\"$worker\"}[$__rate_interval]))", + "legendFormat": "prepare_action_failures", + "range": true, + "refId": "B" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum(rate(nativelink_running_actions_upload_results_failures_total{service_namespace=~\"$instance\", deployment_name=~\"$worker_type\", pod_name=~\"$worker\"}[$__rate_interval]))", + "legendFormat": "upload_results_failures", + "range": true, + "refId": "C" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum(rate(nativelink_running_actions_cleanup_failures_total{service_namespace=~\"$instance\", deployment_name=~\"$worker_type\", pod_name=~\"$worker\"}[$__rate_interval]))", + "legendFormat": "cleanup_failures", + "range": true, + "refId": "D" + } + ], + "title": "Operation Failures Rate", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + 
"showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "ops" + }, + "overrides": [ + { "matcher": { "id": "byName", "options": "task_timeouts" }, "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "cleanup_wait_timeouts" }, "properties": [{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }] }, + { "matcher": { "id": "byName", "options": "stale_removals" }, "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] } + ] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 57 }, + "id": 41, + "options": { + "legend": { "calcs": ["last", "mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "pluginVersion": "10.3.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum(rate(nativelink_running_actions_task_timeouts_total{service_namespace=~\"$instance\", deployment_name=~\"$worker_type\", pod_name=~\"$worker\"}[$__rate_interval]))", + "legendFormat": "task_timeouts", + "range": true, + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum(rate(nativelink_running_actions_cleanup_wait_timeouts_total{service_namespace=~\"$instance\", deployment_name=~\"$worker_type\", pod_name=~\"$worker\"}[$__rate_interval]))", + "legendFormat": "cleanup_wait_timeouts", + "range": true, + "refId": "B" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum(rate(nativelink_running_actions_cleanup_waits_total{service_namespace=~\"$instance\", deployment_name=~\"$worker_type\", 
pod_name=~\"$worker\"}[$__rate_interval]))", + "legendFormat": "cleanup_waits", + "range": true, + "refId": "C" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum(rate(nativelink_running_actions_stale_removals_total{service_namespace=~\"$instance\", deployment_name=~\"$worker_type\", pod_name=~\"$worker\"}[$__rate_interval]))", + "legendFormat": "stale_removals", + "range": true, + "refId": "D" + } + ], + "title": "Timeouts & Cleanup Issues Rate", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 65 }, + "id": 42, + "options": { + "legend": { "calcs": ["last", "mean", "max"], "displayMode": "table", "placement": "right", "showLegend": true }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "pluginVersion": "10.3.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum(rate(nativelink_running_actions_download_to_directory_failures_total{service_namespace=~\"$instance\", deployment_name=~\"$worker_type\", 
pod_name=~\"$worker\"}[$__rate_interval]))", + "legendFormat": "download_to_directory_failures", + "range": true, + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum(rate(nativelink_running_actions_upload_stdout_failures_total{service_namespace=~\"$instance\", deployment_name=~\"$worker_type\", pod_name=~\"$worker\"}[$__rate_interval]))", + "legendFormat": "upload_stdout_failures", + "range": true, + "refId": "B" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum(rate(nativelink_running_actions_upload_stderr_failures_total{service_namespace=~\"$instance\", deployment_name=~\"$worker_type\", pod_name=~\"$worker\"}[$__rate_interval]))", + "legendFormat": "upload_stderr_failures", + "range": true, + "refId": "C" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum(rate(nativelink_running_actions_cache_action_result_failures_total{service_namespace=~\"$instance\", deployment_name=~\"$worker_type\", pod_name=~\"$worker\"}[$__rate_interval]))", + "legendFormat": "cache_action_result_failures", + "range": true, + "refId": "D" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum(rate(nativelink_running_actions_get_proto_command_from_store_failures_total{service_namespace=~\"$instance\", deployment_name=~\"$worker_type\", pod_name=~\"$worker\"}[$__rate_interval]))", + "legendFormat": "get_proto_command_failures", + "range": true, + "refId": "E" + } + ], + "title": "I/O Failures Rate", + "type": "timeseries" + } + ], + "refresh": "30s", + "schemaVersion": 39, + "tags": ["nativelink", "worker", "remote-execution"], + "templating": { + "list": [ + { + "current": { "selected": false, "text": "Prometheus", "value": "Prometheus" }, + "hide": 0, + "includeAll": false, + "label": "Datasource", + "multi": false, + "name": 
"datasource", + "options": [], + "query": "prometheus", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "allValue": ".*", + "current": { "selected": true, "text": "All", "value": "$__all" }, + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "definition": "label_values(nativelink_worker_start_actions_received_total, service_namespace)", + "hide": 0, + "includeAll": true, + "label": "Instance", + "multi": true, + "name": "instance", + "options": [], + "query": { "qryType": 1, "query": "label_values(nativelink_worker_start_actions_received_total, service_namespace)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, + "refresh": 2, + "regex": "/^nativelink.*/", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "allValue": ".*", + "current": { "selected": true, "text": "All", "value": "$__all" }, + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "definition": "label_values(nativelink_worker_start_actions_received_total{service_namespace=~\"$instance\"}, deployment_name)", + "hide": 0, + "includeAll": true, + "label": "Worker Type", + "multi": true, + "name": "worker_type", + "options": [], + "query": { "qryType": 1, "query": "label_values(nativelink_worker_start_actions_received_total{service_namespace=~\"$instance\"}, deployment_name)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, + "refresh": 2, + "regex": "/.*worker.*/", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "allValue": ".*", + "current": { "selected": true, "text": "All", "value": "$__all" }, + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "definition": "label_values(nativelink_worker_start_actions_received_total{service_namespace=~\"$instance\", deployment_name=~\"$worker_type\"}, pod_name)", + "hide": 0, + "includeAll": true, + "label": "Worker", + "multi": true, + "name": "worker", + "options": [], + "query": { "qryType": 1, "query": 
"label_values(nativelink_worker_start_actions_received_total{service_namespace=~\"$instance\", deployment_name=~\"$worker_type\"}, pod_name)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + } + ] + }, + "time": { "from": "now-1h", "to": "now" }, + "timepicker": { + "refresh_intervals": ["5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h"], + "time_options": ["5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d"] + }, + "timezone": "browser", + "title": "NativeLink Worker Metrics", + "uid": "nativelink-worker", + "version": 1, + "weekStart": "" +} + diff --git a/deployment-examples/metrics/grafana/provisioning/dashboards/dashboard.yaml b/deployment-examples/metrics/grafana/provisioning/dashboards/dashboard.yaml new file mode 100644 index 000000000..20e6f666f --- /dev/null +++ b/deployment-examples/metrics/grafana/provisioning/dashboards/dashboard.yaml @@ -0,0 +1,12 @@ +apiVersion: 1 + +providers: + - name: 'NativeLink Dashboards' + orgId: 1 + folder: 'NativeLink' + type: file + disableDeletion: false + updateIntervalSeconds: 10 + allowUiUpdates: true + options: + path: /var/lib/grafana/dashboards diff --git a/deployment-examples/metrics/grafana/provisioning/datasources/prometheus.yaml b/deployment-examples/metrics/grafana/provisioning/datasources/prometheus.yaml new file mode 100644 index 000000000..9a64e7725 --- /dev/null +++ b/deployment-examples/metrics/grafana/provisioning/datasources/prometheus.yaml @@ -0,0 +1,29 @@ +apiVersion: 1 + +datasources: + - name: Prometheus + type: prometheus + access: proxy + url: http://prometheus:9090 + isDefault: true + editable: true + jsonData: + timeInterval: "15s" + queryTimeout: "60s" + httpMethod: POST + + - name: OTEL-Collector-Prometheus + type: prometheus + access: proxy + url: http://otel-collector:9090 + editable: true + jsonData: + timeInterval: "15s" + queryTimeout: "60s" + httpMethod: POST + + - name: Jaeger + type: 
jaeger + access: proxy + url: http://jaeger:16686 + editable: false diff --git a/deployment-examples/metrics/kubernetes/otel-collector.yaml b/deployment-examples/metrics/kubernetes/otel-collector.yaml new file mode 100644 index 000000000..9610865f0 --- /dev/null +++ b/deployment-examples/metrics/kubernetes/otel-collector.yaml @@ -0,0 +1,274 @@ +# OpenTelemetry Collector Deployment for NativeLink Metrics +apiVersion: v1 +kind: ConfigMap +metadata: + name: otel-collector-config + namespace: nativelink +data: + collector.yaml: | + receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + + processors: + resource: + attributes: + - key: service.namespace + value: nativelink + action: upsert + - key: k8s.cluster.name + from_attribute: K8S_CLUSTER_NAME + action: insert + - key: deployment.environment + from_attribute: DEPLOYMENT_ENV + action: insert + + transform/nativelink: + metric_statements: + - context: datapoint + statements: + - set(attributes["instance_name"], resource.attributes["nativelink.instance_name"]) + where resource.attributes["nativelink.instance_name"] != nil + + batch: + timeout: 10s + send_batch_size: 1024 + + memory_limiter: + check_interval: 1s + limit_mib: 1024 + spike_limit_mib: 256 + + exporters: + prometheus: + endpoint: 0.0.0.0:9090 + namespace: nativelink + resource_to_telemetry_conversion: + enabled: true + enable_open_metrics: true + + otlphttp/prometheus: + endpoint: http://prometheus:9090/api/v1/otlp + compression: gzip + + extensions: + health_check: + endpoint: 0.0.0.0:13133 + pprof: + endpoint: 0.0.0.0:1777 + zpages: + endpoint: 0.0.0.0:55679 + + service: + extensions: [health_check, pprof, zpages] + pipelines: + metrics: + receivers: [otlp] + processors: [memory_limiter, resource, transform/nativelink, batch] + exporters: [prometheus] + metrics/prometheus_otlp: + receivers: [otlp] + processors: [memory_limiter, resource, transform/nativelink, batch] + exporters: [otlphttp/prometheus] + + 
telemetry: + logs: + level: info + metrics: + level: detailed + address: 0.0.0.0:8888 + +--- +apiVersion: v1 +kind: Service +metadata: + name: otel-collector + namespace: nativelink + labels: + app: otel-collector +spec: + type: ClusterIP + selector: + app: otel-collector + ports: + - name: otlp-grpc + port: 4317 + targetPort: 4317 + protocol: TCP + - name: otlp-http + port: 4318 + targetPort: 4318 + protocol: TCP + - name: prometheus + port: 9090 + targetPort: 9090 + protocol: TCP + - name: metrics + port: 8888 + targetPort: 8888 + protocol: TCP + - name: health + port: 13133 + targetPort: 13133 + protocol: TCP + +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: otel-collector + namespace: nativelink + labels: + app: otel-collector +spec: + replicas: 2 + selector: + matchLabels: + app: otel-collector + template: + metadata: + labels: + app: otel-collector + spec: + serviceAccountName: otel-collector + containers: + - name: otel-collector + image: otel/opentelemetry-collector-contrib:0.98.0 + args: + - "--config=/conf/collector.yaml" + ports: + - containerPort: 4317 + name: otlp-grpc + - containerPort: 4318 + name: otlp-http + - containerPort: 9090 + name: prometheus + - containerPort: 8888 + name: metrics + - containerPort: 13133 + name: health + env: + - name: K8S_CLUSTER_NAME + value: "nativelink-cluster" + - name: DEPLOYMENT_ENV + value: "production" + - name: K8S_NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: K8S_POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: K8S_POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + volumeMounts: + - name: config + mountPath: /conf + resources: + requests: + memory: "512Mi" + cpu: "200m" + limits: + memory: "1Gi" + cpu: "1000m" + livenessProbe: + httpGet: + path: / + port: 13133 + initialDelaySeconds: 10 + periodSeconds: 10 + readinessProbe: + httpGet: + path: / + port: 13133 + initialDelaySeconds: 5 + periodSeconds: 5 + volumes: + - name: config 
+ configMap: + name: otel-collector-config + +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: otel-collector + namespace: nativelink + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: otel-collector +rules: + - apiGroups: [""] + resources: ["pods", "namespaces", "nodes"] + verbs: ["get", "watch", "list"] + - apiGroups: ["apps"] + resources: ["deployments", "daemonsets", "statefulsets", "replicasets"] + verbs: ["get", "watch", "list"] + - apiGroups: ["batch"] + resources: ["jobs", "cronjobs"] + verbs: ["get", "watch", "list"] + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: otel-collector +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: otel-collector +subjects: + - kind: ServiceAccount + name: otel-collector + namespace: nativelink + +--- +# HorizontalPodAutoscaler for OTEL Collector +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: otel-collector + namespace: nativelink +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: otel-collector + minReplicas: 2 + maxReplicas: 10 + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 80 + - type: Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: 80 + +--- +# PodDisruptionBudget for high availability +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: otel-collector + namespace: nativelink +spec: + minAvailable: 1 + selector: + matchLabels: + app: otel-collector diff --git a/deployment-examples/metrics/kubernetes/prometheus.yaml b/deployment-examples/metrics/kubernetes/prometheus.yaml new file mode 100644 index 000000000..b3de80b40 --- /dev/null +++ b/deployment-examples/metrics/kubernetes/prometheus.yaml @@ -0,0 +1,344 @@ +# Prometheus Deployment for NativeLink Metrics +# +# NOTE: This configuration uses `translation_strategy: 
NoUTF8EscapingWithSuffixes` which +# adds the `_total` suffix to counter metrics when using OTLP ingestion (Prometheus v3+). +# Recording rules and alerts using counter metrics should use the `_total` suffix +# (e.g., `nativelink_execution_completed_count_total`). +# +apiVersion: v1 +kind: Namespace +metadata: + name: nativelink +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: prometheus-config + namespace: nativelink +data: + prometheus.yml: | + global: + scrape_interval: 15s + evaluation_interval: 15s + external_labels: + cluster: 'nativelink-k8s' + environment: 'production' + + # OTLP configuration (requires --web.enable-otlp-receiver flag) + otlp: + promote_resource_attributes: + - service.instance.id + - service.name + - service.namespace + - service.version + - cloud.availability_zone + - cloud.region + - container.name + - deployment.environment + - k8s.cluster.name + - k8s.container.name + - k8s.deployment.name + - k8s.namespace.name + - k8s.pod.name + - k8s.statefulset.name + - nativelink.instance_name + - nativelink.worker_id + - nativelink.scheduler_name + + keep_identifying_resource_attributes: true + translation_strategy: NoUTF8EscapingWithSuffixes + + storage: + tsdb: + out_of_order_time_window: 30m + retention.time: 30d + + scrape_configs: + - job_name: 'otel-collector' + static_configs: + - targets: ['otel-collector:9090'] + metric_relabel_configs: + - source_labels: [__name__] + regex: '(nativelink_.*)' + target_label: __name__ + replacement: '${1}' + + - job_name: 'prometheus' + static_configs: + - targets: ['localhost:9090'] + + # Kubernetes service discovery for NativeLink pods + - job_name: 'nativelink-pods' + kubernetes_sd_configs: + - role: pod + namespaces: + names: ['nativelink'] + relabel_configs: + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] + action: keep + regex: true + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] + action: replace + target_label: __metrics_path__ + regex: 
(.+) + - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] + action: replace + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: kubernetes_namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: kubernetes_pod_name + + rule_files: + - /etc/prometheus/rules/*.yml + + alerting: + alertmanagers: + - static_configs: + - targets: ['alertmanager:9093'] + +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: prometheus-rules + namespace: nativelink +data: + nativelink-rules.yml: | + groups: + - name: nativelink_alerts + interval: 30s + rules: + - alert: NativeLinkHighErrorRate + expr: | + (1 - ( + sum(rate(nativelink_execution_completed_count_total{execution_result="success"}[5m])) / + sum(rate(nativelink_execution_completed_count_total[5m])) + )) > 0.05 + for: 5m + labels: + severity: warning + component: nativelink + annotations: + summary: "High execution error rate ({{ $value | humanizePercentage }})" + description: "NativeLink execution error rate is above 5% for the last 5 minutes" + + - alert: NativeLinkCacheMissRateHigh + expr: | + (1 - ( + sum(rate(nativelink_cache_operations_total{cache_operation_result="hit"}[5m])) by (cache_type) / + sum(rate(nativelink_cache_operations_total{cache_operation_name="read"}[5m])) by (cache_type) + )) > 0.5 + for: 10m + labels: + severity: info + component: nativelink + annotations: + summary: "Cache miss rate above 50% for {{ $labels.cache_type }}" + description: "Cache {{ $labels.cache_type }} has a miss rate above 50% for 10 minutes" + + - alert: NativeLinkQueueBacklog + expr: | + sum(nativelink_execution_active_count{execution_stage="queued"}) > 100 + for: 15m + labels: + severity: warning + component: nativelink + annotations: + summary: "Execution queue backlog above 100 actions" + 
description: "{{ $value }} actions are queued for execution" + + - alert: NativeLinkWorkerUtilizationLow + expr: | + count(nativelink_execution_active_count{execution_stage="executing"} > 0) / + count(count by (execution_worker_id) (nativelink_execution_active_count)) < 0.3 + for: 30m + labels: + severity: info + component: nativelink + annotations: + summary: "Worker utilization below 30%" + description: "Only {{ $value | humanizePercentage }} of workers are active" + + - alert: NativeLinkCacheEvictionRateHigh + expr: | + sum(rate(nativelink_cache_operations_total{cache_operation_name="evict"}[5m])) by (cache_type) > 10 + for: 10m + labels: + severity: warning + component: nativelink + annotations: + summary: "High cache eviction rate for {{ $labels.cache_type }}" + description: "Cache {{ $labels.cache_type }} is evicting {{ $value }} items per second" + +--- +apiVersion: v1 +kind: Service +metadata: + name: prometheus + namespace: nativelink + labels: + app: prometheus +spec: + type: ClusterIP + selector: + app: prometheus + ports: + - name: web + port: 9090 + targetPort: 9090 + protocol: TCP + +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: prometheus + namespace: nativelink + labels: + app: prometheus +spec: + serviceName: prometheus + replicas: 1 + selector: + matchLabels: + app: prometheus + template: + metadata: + labels: + app: prometheus + spec: + serviceAccountName: prometheus + securityContext: + fsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + containers: + - name: prometheus + image: prom/prometheus:v2.50.0 + args: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.path=/prometheus' + - '--web.console.libraries=/etc/prometheus/console_libraries' + - '--web.console.templates=/etc/prometheus/consoles' + - '--web.enable-lifecycle' + - '--web.enable-otlp-receiver' + - '--storage.tsdb.retention.time=30d' + - '--storage.tsdb.out-of-order-time-window=30m' + ports: + - containerPort: 9090 + name: web + volumeMounts: 
+ - name: config + mountPath: /etc/prometheus + - name: rules + mountPath: /etc/prometheus/rules + - name: storage + mountPath: /prometheus + resources: + requests: + memory: "1Gi" + cpu: "500m" + limits: + memory: "4Gi" + cpu: "2000m" + livenessProbe: + httpGet: + path: /-/healthy + port: 9090 + initialDelaySeconds: 30 + periodSeconds: 10 + readinessProbe: + httpGet: + path: /-/ready + port: 9090 + initialDelaySeconds: 5 + periodSeconds: 5 + volumes: + - name: config + configMap: + name: prometheus-config + - name: rules + configMap: + name: prometheus-rules + volumeClaimTemplates: + - metadata: + name: storage + spec: + accessModes: ["ReadWriteOnce"] + resources: + requests: + storage: 50Gi + +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: prometheus + namespace: nativelink + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: prometheus +rules: + - apiGroups: [""] + resources: + - nodes + - nodes/proxy + - services + - endpoints + - pods + verbs: ["get", "list", "watch"] + - apiGroups: ["extensions"] + resources: + - ingresses + verbs: ["get", "list", "watch"] + - apiGroups: ["networking.k8s.io"] + resources: + - ingresses + verbs: ["get", "list", "watch"] + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: prometheus +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: prometheus +subjects: + - kind: ServiceAccount + name: prometheus + namespace: nativelink + +--- +# Ingress for external access (optional) +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: prometheus + namespace: nativelink + annotations: + nginx.ingress.kubernetes.io/rewrite-target: / +spec: + ingressClassName: nginx + rules: + - host: prometheus.nativelink.local + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: prometheus + port: + number: 9090 diff --git a/deployment-examples/metrics/otel-collector-config.yaml 
b/deployment-examples/metrics/otel-collector-config.yaml new file mode 100644 index 000000000..6fdbad7e9 --- /dev/null +++ b/deployment-examples/metrics/otel-collector-config.yaml @@ -0,0 +1,159 @@ +# OpenTelemetry Collector Configuration for NativeLink Metrics +# This configuration receives metrics from NativeLink via OTLP and exports them to various backends + +receivers: + # Receive metrics from NativeLink via OTLP gRPC + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + +processors: + # Add resource attributes for better metric identification + resource: + attributes: + - key: service.namespace + value: nativelink + action: upsert + - key: deployment.environment + from_attribute: deployment_environment + action: insert + - key: deployment.region + from_attribute: deployment_region + action: insert + + # Transform metrics to add NativeLink-specific attributes + transform/nativelink: + metric_statements: + - context: datapoint + statements: + # Add instance name from resource attributes if available + - set(attributes["instance_name"], resource.attributes["nativelink.instance_name"]) + where resource.attributes["nativelink.instance_name"] != nil + + # Batch metrics for efficiency + batch: + timeout: 10s + send_batch_size: 1024 + send_batch_max_size: 2048 + + # Add memory limiter to prevent OOM + memory_limiter: + check_interval: 1s + limit_mib: 512 + spike_limit_mib: 128 + +exporters: + # Export metrics to Prometheus format + prometheus: + endpoint: 0.0.0.0:9090 + namespace: nativelink + const_labels: + service: nativelink + resource_to_telemetry_conversion: + enabled: true + enable_open_metrics: true + # Add metric descriptions for NativeLink metrics + metric_expiration: 10m + + # Direct OTLP export to Prometheus (when Prometheus has OTLP receiver enabled) + otlphttp/prometheus: + endpoint: http://prometheus:9090/api/v1/otlp + compression: gzip + retry_on_failure: + enabled: true + initial_interval: 5s + max_interval: 30s + 
max_elapsed_time: 300s + + # Export traces to Jaeger + otlp/jaeger: + endpoint: jaeger:4317 + tls: + insecure: true + + # Export to other OTLP backends (e.g., Grafana Cloud, ClickHouse) + otlp/backend: + endpoint: "${OTLP_BACKEND_ENDPOINT}" + compression: gzip + headers: + Authorization: "Bearer ${OTLP_BACKEND_TOKEN}" + retry_on_failure: + enabled: true + initial_interval: 5s + max_interval: 30s + max_elapsed_time: 300s + + # Debug exporter for troubleshooting + debug: + verbosity: detailed + sampling_initial: 5 + sampling_thereafter: 200 + +extensions: + health_check: + endpoint: 0.0.0.0:13133 + path: /health + check_collector_pipeline: + enabled: true + interval: 15s + exporter_failure_threshold: 5 + + pprof: + endpoint: 0.0.0.0:1777 + + zpages: + endpoint: 0.0.0.0:55679 + +service: + extensions: [health_check, pprof, zpages] + pipelines: + #traces: + # receivers: [otlp] + # exporters: [debug] + + #logs: + # receivers: [otlp] + # exporters: [debug] + + # Main metrics pipeline - exports to Prometheus scrape endpoint + metrics: + receivers: [otlp] + processors: [memory_limiter, resource, transform/nativelink, batch] + exporters: [prometheus] + + # Direct to Prometheus OTLP endpoint (if enabled) + metrics/prometheus_otlp: + receivers: [otlp] + processors: [memory_limiter, resource, transform/nativelink, batch] + exporters: [otlphttp/prometheus] + + # Traces pipeline - exports to Jaeger + traces: + receivers: [otlp] + processors: [memory_limiter, batch] + exporters: [otlp/jaeger] + + # Optional: Send to additional backend + # Uncomment and configure OTLP_BACKEND_ENDPOINT environment variable + # metrics/backend: + # receivers: [otlp] + # processors: [memory_limiter, resource, transform/nativelink, batch] + # exporters: [otlp/backend] + + # Debug pipeline for development + #metrics/debug: + # receivers: [otlp] + # processors: [memory_limiter] + # exporters: [debug] + + telemetry: + logs: + level: info + initial_fields: + service: otel-collector + metrics: + level: 
detailed + address: 0.0.0.0:8888 diff --git a/deployment-examples/metrics/prometheus-config.yaml b/deployment-examples/metrics/prometheus-config.yaml new file mode 100644 index 000000000..53b8435c6 --- /dev/null +++ b/deployment-examples/metrics/prometheus-config.yaml @@ -0,0 +1,170 @@ +# Prometheus Configuration for NativeLink Metrics +# This configuration sets up Prometheus to receive metrics via OTLP and scrape format +# +# NOTE: This configuration uses `translation_strategy: NoUTF8EscapingWithSuffixes` which +# adds the `_total` suffix to counter metrics when using OTLP ingestion (Prometheus v3+). +# The included Grafana dashboards use `_total` suffix for counter metrics to match this. +# See README.md for more information on metric naming. + +global: + scrape_interval: 15s + evaluation_interval: 15s + external_labels: + cluster: 'nativelink-cluster' + environment: 'production' + +# Enable OTLP receiver (requires --web.enable-otlp-receiver flag) +otlp: + # Promote NativeLink-specific resource attributes to labels + promote_resource_attributes: + - service.instance.id + - service.name + - service.namespace + - service.version + # Cloud/Infrastructure attributes + - cloud.availability_zone + - cloud.region + - container.name + - deployment.environment + - deployment.environment.name + # Kubernetes attributes + - k8s.cluster.name + - k8s.container.name + - k8s.cronjob.name + - k8s.daemonset.name + - k8s.deployment.name + - k8s.job.name + - k8s.namespace.name + - k8s.pod.name + - k8s.replicaset.name + - k8s.statefulset.name + # NativeLink-specific attributes + - nativelink.instance_name + - nativelink.worker_id + - nativelink.scheduler_name + + # Keep identifying resource attributes in target_info + keep_identifying_resource_attributes: true + + # Use NoTranslation to preserve metric names with UTF-8 support + # This keeps OpenTelemetry semantic convention names intact + translation_strategy: NoUTF8EscapingWithSuffixes + +# Storage configuration for handling 
out-of-order samples +storage: + tsdb: + # Allow 30 minutes of out-of-order samples (for batched OTLP data) + out_of_order_time_window: 30m + +# Scrape configurations +scrape_configs: + # Scrape the OTEL Collector's Prometheus endpoint + - job_name: 'otel-collector' + static_configs: + - targets: ['otel-collector:9090'] + metric_relabel_configs: + # Add nativelink prefix to all metrics from collector + - source_labels: [__name__] + regex: '(nativelink_.*)' + target_label: __name__ + replacement: '${1}' + + # Scrape Prometheus's own metrics + - job_name: 'prometheus' + static_configs: + - targets: ['localhost:9090'] + + # Optional: Direct scrape of NativeLink instances (if metrics endpoint is exposed) + # - job_name: 'nativelink-direct' + # static_configs: + # - targets: ['nativelink-cas:8080', 'nativelink-scheduler:8080'] + # metrics_path: '/metrics' + +# Recording rules for common NativeLink queries +rule_files: + - /etc/prometheus/rules/*.yml + +# Alerting configuration +alerting: + alertmanagers: + - static_configs: + - targets: ['alertmanager:9093'] + +# Example recording rules for NativeLink metrics +# Save this as a separate file: rules/nativelink-recording-rules.yml +# rule_files content example: +--- +# Recording Rules for NativeLink Metrics +groups: + - name: nativelink_execution + interval: 30s + rules: + # Execution success rate + - record: nativelink:execution_success_rate + expr: | + sum(rate(nativelink_execution_completed_count_total{execution_result="success"}[5m])) / + sum(rate(nativelink_execution_completed_count_total[5m])) + + # Average queue time + - record: nativelink:execution_queue_time_avg + expr: | + histogram_quantile(0.5, + sum(rate(nativelink_execution_queue_time_bucket[5m])) by (le, instance_name) + ) + + # Actions per stage + - record: nativelink:execution_active_by_stage + expr: | + sum(nativelink_execution_active_count) by (execution_stage, instance_name) + + # Stage transition rate + - record: nativelink:stage_transition_rate + 
expr: | + sum(rate(nativelink_execution_stage_transitions_total[5m])) by (instance_name) + + - name: nativelink_cache + interval: 30s + rules: + # Cache hit rate + - record: nativelink:cache_hit_rate + expr: | + sum(rate(nativelink_cache_operations_total{cache_operation_result="hit"}[5m])) by (cache_type) / + sum(rate(nativelink_cache_operations_total{cache_operation_name="read"}[5m])) by (cache_type) + + # Cache operation latency p95 + - record: nativelink:cache_operation_latency_p95 + expr: | + histogram_quantile(0.95, + sum(rate(nativelink_cache_operation_duration_bucket[5m])) by (le, cache_type, cache_operation_name) + ) + + # Cache size utilization + - record: nativelink:cache_size_bytes + expr: | + sum(nativelink_cache_size) by (cache_type, instance_name) + + # Cache eviction rate + - record: nativelink:cache_eviction_rate + expr: | + sum(rate(nativelink_cache_operations_total{cache_operation_name="evict"}[5m])) by (cache_type) + + - name: nativelink_performance + interval: 60s + rules: + # Overall system throughput + - record: nativelink:system_throughput + expr: | + sum(rate(nativelink_execution_completed_count_total[5m])) + + # Worker utilization + - record: nativelink:worker_utilization + expr: | + sum(nativelink_execution_active_count{execution_stage="executing"}) by (execution_worker_id) / + count(count by (execution_worker_id) (nativelink_execution_active_count)) + + # Action completion time (from queued to completed) + - record: nativelink:action_total_duration_p99 + expr: | + histogram_quantile(0.99, + sum(rate(nativelink_execution_total_duration_bucket[5m])) by (le, instance_name) + ) diff --git a/deployment-examples/metrics/prometheus-recording-rules.yml b/deployment-examples/metrics/prometheus-recording-rules.yml new file mode 100644 index 000000000..f1fe126b0 --- /dev/null +++ b/deployment-examples/metrics/prometheus-recording-rules.yml @@ -0,0 +1,370 @@ +# Recording Rules for NativeLink Metrics +# These rules pre-calculate common queries for 
better dashboard performance +# +# NOTE: When exporting OpenTelemetry counters to Prometheus/OpenMetrics, counters are exposed with +# a `_total` suffix (for example, `nativelink_execution_completed_count_total`). These rules are +# written for that naming scheme (used by the Docker Compose quickstart). + +groups: + - name: nativelink_execution + interval: 30s + rules: + # Execution success rate by instance + - record: nativelink:execution_success_rate + expr: | + sum by (instance_name, execution_instance) ( + rate(nativelink_execution_completed_count_total{execution_result="success"}[5m]) + ) / + sum by (instance_name, execution_instance) ( + rate(nativelink_execution_completed_count_total[5m]) + ) + + # Cache hit rate from executions + - record: nativelink:execution_cache_hit_rate + expr: | + sum by (instance_name) ( + rate(nativelink_execution_completed_count_total{execution_result="cache_hit"}[5m]) + ) / + sum by (instance_name) ( + rate(nativelink_execution_completed_count_total[5m]) + ) + + # Average queue time (median) + - record: nativelink:execution_queue_time_p50 + expr: | + histogram_quantile(0.5, + sum by (le, instance_name, execution_instance) ( + rate(nativelink_execution_queue_time_bucket[5m]) + ) + ) + + # Queue time 95th percentile + - record: nativelink:execution_queue_time_p95 + expr: | + histogram_quantile(0.95, + sum by (le, instance_name, execution_instance) ( + rate(nativelink_execution_queue_time_bucket[5m]) + ) + ) + + # Actions currently in each stage + - record: nativelink:execution_active_by_stage + expr: | + sum by (execution_stage, instance_name, execution_instance) ( + nativelink_execution_active_count + ) + + # Stage transition rate + - record: nativelink:stage_transition_rate + expr: | + sum by (instance_name, execution_instance, execution_priority) ( + rate(nativelink_execution_stage_transitions_total[5m]) + ) + + # Execution duration by stage (p50, p95, p99) + - record: nativelink:execution_stage_duration_p50 + expr: | + 
histogram_quantile(0.5, + sum by (le, execution_stage, instance_name) ( + rate(nativelink_execution_stage_duration_bucket[5m]) + ) + ) + + - record: nativelink:execution_stage_duration_p95 + expr: | + histogram_quantile(0.95, + sum by (le, execution_stage, instance_name) ( + rate(nativelink_execution_stage_duration_bucket[5m]) + ) + ) + + - record: nativelink:execution_stage_duration_p99 + expr: | + histogram_quantile(0.99, + sum by (le, execution_stage, instance_name) ( + rate(nativelink_execution_stage_duration_bucket[5m]) + ) + ) + + # Total execution time from submission to completion + - record: nativelink:execution_total_duration_p50 + expr: | + histogram_quantile(0.5, + sum by (le, instance_name, execution_instance) ( + rate(nativelink_execution_total_duration_bucket[5m]) + ) + ) + + - record: nativelink:execution_total_duration_p95 + expr: | + histogram_quantile(0.95, + sum by (le, instance_name, execution_instance) ( + rate(nativelink_execution_total_duration_bucket[5m]) + ) + ) + + # Execution output size distribution + - record: nativelink:execution_output_size_p50 + expr: | + histogram_quantile(0.5, + sum by (le, instance_name) ( + rate(nativelink_execution_output_size_bucket[5m]) + ) + ) + + - record: nativelink:execution_output_size_p95 + expr: | + histogram_quantile(0.95, + sum by (le, instance_name) ( + rate(nativelink_execution_output_size_bucket[5m]) + ) + ) + + - name: nativelink_cache + interval: 30s + rules: + # Cache hit rate by operation and cache type + - record: nativelink:cache_hit_rate + expr: | + sum by (cache_type, instance_name) ( + rate(nativelink_cache_operations_total{cache_operation_result="hit"}[5m]) + ) / + sum by (cache_type, instance_name) ( + rate(nativelink_cache_operations_total{cache_operation_name="read"}[5m]) + ) + + # Cache operation latency percentiles + - record: nativelink:cache_operation_latency_p50 + expr: | + histogram_quantile(0.5, + sum by (le, cache_type, cache_operation_name, instance_name) ( + 
rate(nativelink_cache_operation_duration_bucket[5m]) + ) + ) + + - record: nativelink:cache_operation_latency_p95 + expr: | + histogram_quantile(0.95, + sum by (le, cache_type, cache_operation_name, instance_name) ( + rate(nativelink_cache_operation_duration_bucket[5m]) + ) + ) + + - record: nativelink:cache_operation_latency_p99 + expr: | + histogram_quantile(0.99, + sum by (le, cache_type, cache_operation_name, instance_name) ( + rate(nativelink_cache_operation_duration_bucket[5m]) + ) + ) + + # Cache size and entry count + - record: nativelink:cache_size_bytes + expr: | + sum by (cache_type, instance_name) (nativelink_cache_size) + + - record: nativelink:cache_entry_count + expr: | + sum by (cache_type, instance_name) (nativelink_cache_entries) + + # Cache eviction rate + - record: nativelink:cache_eviction_rate + expr: | + sum by (cache_type, instance_name) ( + rate(nativelink_cache_operations_total{cache_operation_name="evict"}[5m]) + ) + + # Cache throughput (bytes/sec) + - record: nativelink:cache_read_throughput_bytes + expr: | + sum by (cache_type, instance_name) ( + rate(nativelink_cache_io_total{cache_operation_name="read"}[5m]) + ) + + - record: nativelink:cache_write_throughput_bytes + expr: | + sum by (cache_type, instance_name) ( + rate(nativelink_cache_io_total{cache_operation_name="write"}[5m]) + ) + + # Cache error rate + - record: nativelink:cache_error_rate + expr: | + sum by (cache_type, cache_operation_name, instance_name) ( + rate(nativelink_cache_operations_total{cache_operation_result="error"}[5m]) + ) + + - name: nativelink_performance + interval: 60s + rules: + # Overall system throughput (actions/sec) + - record: nativelink:system_throughput + expr: | + sum(rate(nativelink_execution_completed_count_total[5m])) + + # System success rate + - record: nativelink:system_success_rate + expr: | + sum(rate(nativelink_execution_completed_count_total{execution_result="success"}[5m])) / + sum(rate(nativelink_execution_completed_count_total[5m])) + 
+ # Worker utilization (percentage of workers executing) + - record: nativelink:worker_utilization + expr: | + count by (instance_name) ( + nativelink_execution_active_count{execution_stage="executing"} > 0 + ) / + count by (instance_name) ( + nativelink_execution_active_count + ) + + # Queue depth (actions waiting) + - record: nativelink:queue_depth + expr: | + sum by (instance_name, execution_priority) ( + nativelink_execution_active_count{execution_stage="queued"} + ) + + # Average actions per worker + - record: nativelink:actions_per_worker + expr: | + sum by (execution_worker_id) ( + nativelink_execution_active_count{execution_stage="executing"} + ) + + # Memory usage estimation from output sizes + - record: nativelink:estimated_memory_usage_bytes + expr: | + sum by (instance_name) ( + nativelink_execution_output_size_sum + ) + + # Retry rate + - record: nativelink:execution_retry_rate + expr: | + sum by (instance_name) ( + rate(nativelink_execution_retry_count_total[5m]) + ) + + - name: nativelink_slo + interval: 60s + rules: + # SLO: 99% of executions should complete successfully + - record: nativelink:slo_execution_success_rate + expr: | + sum(rate(nativelink_execution_completed_count_total{execution_result="success"}[1h])) / + sum(rate(nativelink_execution_completed_count_total[1h])) + + # SLO: 95% of cache reads should be under 100ms + - record: nativelink:slo_cache_read_latency + expr: | + histogram_quantile(0.95, + sum(rate(nativelink_cache_operation_duration_bucket{cache_operation_name="read"}[1h])) by (le) + ) < 0.1 + + # SLO: Queue time should be under 30s for 90% of actions + - record: nativelink:slo_queue_time + expr: | + histogram_quantile(0.9, + sum(rate(nativelink_execution_queue_time_bucket[1h])) by (le) + ) < 30 + + # Error budget remaining (based on 99% success SLO) + - record: nativelink:error_budget_remaining + expr: | + 1 - ( + (1 - 0.99) - + (1 - ( + sum(rate(nativelink_execution_completed_count_total{execution_result="success"}[30d])) / 
+ sum(rate(nativelink_execution_completed_count_total[30d])) + )) + ) / (1 - 0.99) + + - name: nativelink_stores + interval: 30s + rules: + # Store cache hit rate by store type and name + - record: nativelink:store_cache_hit_rate + expr: | + sum by (store_type, store_name) ( + rate(nativelink_store_operations{cache_operation_name="read", cache_operation_result="hit"}[5m]) + ) / + sum by (store_type, store_name) ( + rate(nativelink_store_operations{cache_operation_name="read", cache_operation_result=~"hit|miss"}[5m]) + ) + + # Store read latency percentiles + - record: nativelink:store_read_latency_p50 + expr: | + histogram_quantile(0.5, + sum by (le, store_type, store_name) ( + rate(nativelink_store_operation_duration_bucket{cache_operation_name="read"}[5m]) + ) + ) + + - record: nativelink:store_read_latency_p90 + expr: | + histogram_quantile(0.9, + sum by (le, store_type, store_name) ( + rate(nativelink_store_operation_duration_bucket{cache_operation_name="read"}[5m]) + ) + ) + + - record: nativelink:store_read_latency_p99 + expr: | + histogram_quantile(0.99, + sum by (le, store_type, store_name) ( + rate(nativelink_store_operation_duration_bucket{cache_operation_name="read"}[5m]) + ) + ) + + # Store write latency percentiles + - record: nativelink:store_write_latency_p50 + expr: | + histogram_quantile(0.5, + sum by (le, store_type, store_name) ( + rate(nativelink_store_operation_duration_bucket{cache_operation_name="write"}[5m]) + ) + ) + + - record: nativelink:store_write_latency_p90 + expr: | + histogram_quantile(0.9, + sum by (le, store_type, store_name) ( + rate(nativelink_store_operation_duration_bucket{cache_operation_name="write"}[5m]) + ) + ) + + - record: nativelink:store_write_latency_p99 + expr: | + histogram_quantile(0.99, + sum by (le, store_type, store_name) ( + rate(nativelink_store_operation_duration_bucket{cache_operation_name="write"}[5m]) + ) + ) + + # Store operation rates + - record: nativelink:store_read_rate + expr: | + sum by (store_type, 
store_name) ( + rate(nativelink_store_operations{cache_operation_name="read"}[5m]) + ) + + - record: nativelink:store_write_rate + expr: | + sum by (store_type, store_name) ( + rate(nativelink_store_operations{cache_operation_name="write"}[5m]) + ) + + # Store error rate + - record: nativelink:store_error_rate + expr: | + sum by (store_type, store_name, cache_operation_name) ( + rate(nativelink_store_operations{cache_operation_result="error"}[5m]) + ) + + # Overall store hit rate (aggregated across all stores) + - record: nativelink:store_overall_hit_rate + expr: | + sum(rate(nativelink_store_operations{cache_operation_name="read", cache_operation_result="hit"}[5m])) / + sum(rate(nativelink_store_operations{cache_operation_name="read", cache_operation_result=~"hit|miss"}[5m])) diff --git a/deployment-examples/rhel/Dockerfile.rhel8 b/deployment-examples/rhel/Dockerfile.rhel8 index 8b9cdc260..0dde3fc5e 100644 --- a/deployment-examples/rhel/Dockerfile.rhel8 +++ b/deployment-examples/rhel/Dockerfile.rhel8 @@ -22,16 +22,16 @@ ARG ADDITIONAL_SETUP_WORKER_CMD= # RHEL8-equivalent image # see https://www.redhat.com/en/blog/introducing-red-hat-universal-base-image -FROM redhat/ubi8:8.10-1756195303 AS dependencies +FROM redhat/ubi8:8.10-1756195303@sha256:534c2c0efa4150ede18e3f9d7480d3b9ec2a52e62bc91cd54e08ee7336819619 AS dependencies ARG OS_VERSION -RUN yum update && \ +RUN yum update --assumeyes && \ yum install --assumeyes \ - npm \ - git \ - gcc \ - gcc-c++ \ - python3.12 \ - ca-certificates \ + npm-1:6.14.11 \ + git-2.43.7 \ + gcc-8.5.0 \ + gcc-c++-8.5.0 \ + python3.12-3.12.11 \ + ca-certificates-2024.2.69_v8.0.303 \ && npm install -g @bazel/bazelisk@1.25.0 # Build the binary. 
diff --git a/docs/property-router-scheduler-plan.md b/docs/property-router-scheduler-plan.md new file mode 100644 index 000000000..c99dae7bc --- /dev/null +++ b/docs/property-router-scheduler-plan.md @@ -0,0 +1,408 @@ +# Plan: PropertyRouterScheduler + +Routes incoming actions to different backend schedulers based on a +platform property value (e.g. `container-image`), so the client always +talks to one endpoint and knows nothing about the internal topology. + +## Architecture + +``` +Bazel Client + │ + │ ExecuteRequest + ▼ +Front NativeLink Process + ├── ExecutionServer + │ │ + │ │ add_action(action_info) + │ ▼ + │ PropertyRouterScheduler + │ │ + │ │ reads action_info.platform_properties["container-image"] + │ │ + │ ├── "compile" / "test-env" / "test-fat-env" + │ │ └── GrpcScheduler ──► Scheduler Process 1 + │ │ │ + │ │ Workers (compile, test) + │ │ + │ └── anything else (default) + │ └── GrpcScheduler ──► Scheduler Process 2 + │ │ + │ Workers (default) + │ + └── worker_api (not exposed on front process — managed by backend processes) +``` + +## Files Changed: 8 total (3 new, 5 modified) + +### New files + +| File | Description | +|------|-------------| +| `nativelink-scheduler/src/property_router_scheduler.rs` | Core implementation | +| `nativelink-scheduler/tests/property_router_scheduler_test.rs` | Unit tests | +| `docs/property-router-scheduler-plan.md` | This file | + +### Modified files + +| File | Change | +|------|--------| +| `nativelink-config/src/schedulers.rs` | Add `PropertyRouterSpec` struct and `SchedulerSpec::PropertyRouter` variant | +| `nativelink-scheduler/src/lib.rs` | Register `property_router_scheduler` module | +| `nativelink-scheduler/src/default_scheduler_factory.rs` | Add match arm for `PropertyRouter` | + +--- + +## Step 1 — Config + +**File:** `nativelink-config/src/schedulers.rs` + +Add after `PropertyModifierSpec`: + +```rust +/// Routes actions to different schedulers based on a platform property value. 
+/// Actions whose property value matches a key in `routes` go to that scheduler. +/// All other actions (missing property or unmatched value) go to `default_scheduler`. +#[derive(Deserialize, Serialize, Debug)] +#[serde(deny_unknown_fields)] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] +pub struct PropertyRouterSpec { + /// The platform property key to match on (e.g. "container-image"). + #[serde(deserialize_with = "convert_string_with_shellexpand")] + pub property_name: String, + + /// Map of property value -> nested scheduler spec. + pub routes: HashMap, + + /// Scheduler to use when the property is absent or its value does not match any route. + pub default_scheduler: Box, +} +``` + +Add variant to `SchedulerSpec`: + +```rust +pub enum SchedulerSpec { + Simple(SimpleSpec), + Grpc(GrpcSpec), + CacheLookup(CacheLookupSpec), + PropertyModifier(PropertyModifierSpec), + PropertyRouter(PropertyRouterSpec), // <-- new +} +``` + +--- + +## Step 2 — Core Implementation + +**File:** `nativelink-scheduler/src/property_router_scheduler.rs` + +Follows the exact same pattern as `property_modifier_scheduler.rs`. 
+ +### Struct + +```rust +#[derive(MetricsComponent)] +pub struct PropertyRouterScheduler { + property_name: String, + #[metric(group = "routes")] + routes: HashMap>, + #[metric(group = "default_scheduler")] + default_scheduler: Arc, +} + +impl PropertyRouterScheduler { + pub fn new( + property_name: &str, + routes: HashMap>, + default_scheduler: Arc, + ) -> Self { + Self { + property_name: property_name.to_string(), + routes, + default_scheduler, + } + } +} +``` + +### `add_action` — the core routing logic + +Reads the property value from `action_info.platform_properties` +(`HashMap`), looks it up in `routes`, falls back to +`default_scheduler`: + +```rust +async fn inner_add_action( + &self, + client_operation_id: OperationId, + action_info: Arc, +) -> Result, Error> { + let scheduler = action_info + .platform_properties + .get(&self.property_name) + .and_then(|value| self.routes.get(value)) + .unwrap_or(&self.default_scheduler); + + scheduler.add_action(client_operation_id, action_info).await +} +``` + +### `filter_operations` — fan-out to all schedulers + +The caller (e.g. `WaitExecution`) does not know which backend scheduler +holds the operation, so the router must query all of them and merge: + +```rust +async fn inner_filter_operations( + &self, + filter: OperationFilter, +) -> Result, Error> { + let mut streams = Vec::with_capacity(self.routes.len() + 1); + for scheduler in self.routes.values() { + streams.push(scheduler.filter_operations(filter.clone()).await?); + } + streams.push(self.default_scheduler.filter_operations(filter).await?); + Ok(Box::pin(futures::stream::select_all(streams))) +} +``` + +`OperationFilter` is already `Clone` (derives it at line 67 of +`nativelink-util/src/operation_state_manager.rs`). 
+ +### `KnownPlatformPropertyProvider` — union of all nested schedulers + +```rust +async fn inner_get_known_properties( + &self, + instance_name: &str, +) -> Result, Error> { + let mut all_props = HashSet::new(); + for scheduler in self.routes.values() { + if let Some(p) = scheduler.as_known_platform_property_provider() { + for prop in p.get_known_properties(instance_name).await? { + all_props.insert(prop); + } + } + } + if let Some(p) = self.default_scheduler.as_known_platform_property_provider() { + for prop in p.get_known_properties(instance_name).await? { + all_props.insert(prop); + } + } + Ok(all_props.into_iter().collect()) +} +``` + +### Trait impls + +Implements `ClientStateManager`, `KnownPlatformPropertyProvider`, +`RootMetricsComponent`. Does **not** implement `WorkerScheduler` — the +router never manages workers directly. + +--- + +## Step 3 — Register Module + +**File:** `nativelink-scheduler/src/lib.rs` + +Add: + +```rust +pub mod property_router_scheduler; +``` + +--- + +## Step 4 — Factory + +**File:** `nativelink-scheduler/src/default_scheduler_factory.rs` + +Add import at the top: + +```rust +use crate::property_router_scheduler::PropertyRouterScheduler; +``` + +Add match arm in `inner_scheduler_factory` after `PropertyModifier`: + +```rust +SchedulerSpec::PropertyRouter(spec) => { + let mut routes = HashMap::with_capacity(spec.routes.len()); + for (value, nested_spec) in &spec.routes { + let (action_scheduler, _) = Box::pin(inner_scheduler_factory( + nested_spec, + store_manager, + maybe_origin_event_tx, + )) + .await + .err_tip(|| format!("In nested PropertyRouterScheduler route '{value}'"))?; + routes.insert( + value.clone(), + action_scheduler.err_tip(|| { + format!("Nested route '{value}' is not an action scheduler") + })?, + ); + } + let (default_action_scheduler, _) = Box::pin(inner_scheduler_factory( + &spec.default_scheduler, + store_manager, + maybe_origin_event_tx, + )) + .await + .err_tip(|| "In PropertyRouterScheduler 
default_scheduler")?; + let router = Arc::new(PropertyRouterScheduler::new( + &spec.property_name, + routes, + default_action_scheduler + .err_tip(|| "Default scheduler is not an action scheduler")?, + )); + (Some(router), None) +} +``` + +--- + +## Step 5 — Tests + +**File:** `nativelink-scheduler/tests/property_router_scheduler_test.rs` + +Uses `MockActionScheduler` — same pattern as `property_modifier_scheduler_test.rs`. + +### Test fixture + +```rust +struct TestContext { + compile_scheduler: Arc, + default_scheduler: Arc, + router: PropertyRouterScheduler, +} + +fn make_router() -> TestContext { + let compile_scheduler = Arc::new(MockActionScheduler::new()); + let default_scheduler = Arc::new(MockActionScheduler::new()); + let mut routes = HashMap::new(); + routes.insert( + "compile".to_string(), + compile_scheduler.clone() as Arc, + ); + let router = PropertyRouterScheduler::new( + "container-image", + routes, + default_scheduler.clone() as Arc, + ); + TestContext { compile_scheduler, default_scheduler, router } +} +``` + +### Tests + +| # | Name | Scenario | Expected | +|---|------|----------|----------| +| 1 | `routes_to_matching_scheduler` | `container-image=compile` | `compile_scheduler.expect_add_action` fires, `default_scheduler` idle | +| 2 | `routes_to_default_when_no_match` | `container-image=other` | `default_scheduler.expect_add_action` fires, `compile_scheduler` idle | +| 3 | `routes_to_default_when_property_missing` | No `container-image` key | `default_scheduler.expect_add_action` fires | +| 4 | `routes_multiple_values` | Two actions: `compile` then `other` | Each routed to correct scheduler | +| 5 | `filter_operations_fans_out_to_all` | `filter_operations` called | Both `compile_scheduler` and `default_scheduler` receive the same filter | +| 6 | `known_properties_unions_all_schedulers` | `get_known_properties` called | Returns union of props from both schedulers | +| 7 | `error_from_nested_scheduler_propagates` | `compile_scheduler` returns 
`Err` | Router propagates the error | + +### Example test (test #1) + +```rust +#[nativelink_test] +async fn routes_to_matching_scheduler() -> Result<(), Error> { + let ctx = make_router(); + let mut action_info = make_base_action_info(UNIX_EPOCH, DigestInfo::zero_digest()) + .as_ref() + .clone(); + action_info + .platform_properties + .insert("container-image".to_string(), "compile".to_string()); + let action_info = Arc::new(action_info); + + let (_tx, rx) = watch::channel(Arc::new(ActionState { + client_operation_id: OperationId::default(), + stage: ActionStage::Queued, + action_digest: action_info.unique_qualifier.digest(), + last_transition_timestamp: SystemTime::now(), + })); + let client_operation_id = OperationId::default(); + + let (_, (received_op_id, received_action)) = join!( + ctx.router.add_action(client_operation_id.clone(), action_info.clone()), + ctx.compile_scheduler.expect_add_action(Ok(Box::new( + TokioWatchActionStateResult::new(client_operation_id.clone(), action_info, rx) + ))), + ); + assert_eq!(client_operation_id, received_op_id); + assert_eq!( + Some(&"compile".to_string()), + received_action.platform_properties.get("container-image") + ); + Ok(()) +} +``` + +--- + +## Example Production Config + +```json5 +// scheduler.json5 (front process — one endpoint for all clients) +{ + stores: [ + { + name: "CAS_STORE", + grpc: { + instance_name: "main", + endpoints: [{ address: "grpc://cas-node:50051" }], + store_type: "cas", + }, + }, + ], + schedulers: [ + { + name: "MAIN_SCHEDULER", + property_router: { + property_name: "container-image", + routes: { + "compile": { grpc: { endpoint: { address: "grpc://sched-compile:50052" } } }, + "test-env": { grpc: { endpoint: { address: "grpc://sched-compile:50052" } } }, + "test-fat-env": { grpc: { endpoint: { address: "grpc://sched-compile:50052" } } }, + }, + default_scheduler: { grpc: { endpoint: { address: "grpc://sched-default:50052" } } }, + }, + }, + ], + servers: [ + { + listener: { http: { 
socket_address: "0.0.0.0:50052" } }, + services: { + execution: [ + { instance_name: "", cas_store: "CAS_STORE", scheduler: "MAIN_SCHEDULER" }, + { instance_name: "main", cas_store: "CAS_STORE", scheduler: "MAIN_SCHEDULER" }, + ], + capabilities: [ + { instance_name: "", remote_execution: { scheduler: "MAIN_SCHEDULER" } }, + { instance_name: "main", remote_execution: { scheduler: "MAIN_SCHEDULER" } }, + ], + health: {}, + }, + }, + ], +} +``` + +--- + +## Notes + +- `WorkerScheduler` is **not** implemented by the router — worker management + stays entirely in the backend scheduler processes. +- The router does not cache the routing decision. This is intentional: + `add_action` reads a `HashMap` lookup — O(1), zero cost. +- `filter_operations` fan-out is necessary because `WaitExecution` uses it + and does not know which backend scheduler owns the operation. + With N backend schedulers this is N parallel gRPC calls — acceptable since + it's used for status polling, not hot-path action dispatch. diff --git a/flake-module.nix b/flake-module.nix index 0859225c8..97f75cee7 100644 --- a/flake-module.nix +++ b/flake-module.nix @@ -25,27 +25,6 @@ A bash snippet that creates a nixos.bazelrc file in the repository. ''; }; - api-key = lib.mkOption { - type = lib.types.str; - description = lib.mdDoc '' - The API key to connect to the NativeLink Cloud. - - You should only use read-only keys here to prevent cache-poisoning and - malicious artifact extractions. - - Defaults to NativeLink's shared read-only api key. - ''; - default = "065f02f53f26a12331d5cfd00a778fb243bfb4e857b8fcd4c99273edfb15deae"; - }; - endpoint = lib.mkOption { - type = lib.types.str; - description = lib.mdDoc '' - The NativeLink Cloud endpoint. - - Defaults to NativeLink's shared cache. 
- ''; - default = "grpcs://cas-tracemachina-shared.build-faster.nativelink.net"; - }; prefix = lib.mkOption { type = lib.types.str; description = lib.mdDoc '' @@ -75,9 +54,6 @@ # }; # ``` defaultConfig = [ - "--remote_cache=${cfg.endpoint}" - "--remote_header=x-nativelink-api-key=${cfg.api-key}" - "--remote_header=x-nativelink-project=nativelink-ci" "--nogenerate_json_trace_profile" "--remote_upload_local_results=false" "--remote_cache_async" diff --git a/flake.lock b/flake.lock index be53deb68..af5354dd1 100644 --- a/flake.lock +++ b/flake.lock @@ -134,7 +134,7 @@ }, "nixpkgs": { "locked": { - "lastModified": 1747744144, + "lastModified": 1747852984, "narHash": "sha256-q2PmaOxyR3zqOF54a3E1Cj1gh0sDu8APX9b+OkX4J5s=", "owner": "NixOS", "repo": "nixpkgs", diff --git a/flake.nix b/flake.nix index 0f6745bea..6bd851790 100644 --- a/flake.nix +++ b/flake.nix @@ -64,8 +64,7 @@ # Getting started Enter the Nix environment with `nix develop`. - Get your credentials for the NativeLink cloud on - https://app.nativelink.com/ and paste them into `user.bazelrc`. + Get your credentials for NativeLink and paste them into `user.bazelrc`. Run `bazel build hello-world` to build the example with local remote execution. @@ -137,7 +136,8 @@ p.libiconv ]; nativeBuildInputs = - ( + [p.bashNonInteractive] # needed for some command tests + ++ ( if isLinuxBuild then [pkgs.mold] else [pkgs.llvmPackages_20.lld] @@ -150,19 +150,18 @@ } // (pkgs.lib.optionalAttrs isLinuxTarget { CARGO_BUILD_RUSTFLAGS = "-C target-feature=+crt-static"; + TARGET_CC = "${pkgs.lre.clang}/bin/customClang"; ${linkerEnvVar} = linkerPath; }); # Additional target for external dependencies to simplify caching. 
cargoArtifactsFor = p: (craneLibFor p).buildDepsOnly (commonArgsFor p); - nightlyCargoArtifactsFor = p: (craneLibFor p).buildDepsOnly (commonArgsFor p); + nightlyCargoArtifactsFor = p: (nightlyCraneLibFor p).buildDepsOnly (commonArgsFor p); nativelinkFor = p: (craneLibFor p).buildPackage ((commonArgsFor p) // { cargoArtifacts = cargoArtifactsFor p; - # Enable this for debugging worker scheduler issues - # cargoExtraArgs = "--features worker_find_logging"; }); nativeTargetPkgs = @@ -293,16 +292,7 @@ nativelinkCoverageFor = p: let coverageArgs = - (commonArgsFor p) - // { - # TODO(palfrey): For some reason we're triggering an edgecase where - # mimalloc builds against glibc headers in coverage - # builds. This leads to nonexistend __memcpy_chk and - # __memset_chk symbols if fortification is enabled. - # Our regular builds also have this issue, but we - # should investigate further. - hardeningDisable = ["fortify"]; - }; + commonArgsFor p; in (nightlyCraneLibFor p).cargoLlvmCov (coverageArgs // { @@ -369,16 +359,28 @@ nativelink-worker-toolchain-buck2 = createWorker toolchain-buck2; nativelink-worker-buck2-toolchain = buck2-toolchain; image = nativelink-image; - generate-bazel-rc = pkgs.callPackage tools/generate-bazel-rc/build.nix {craneLib = craneLibFor pkgs;}; - generate-stores-config = pkgs.callPackage nativelink-config/generate-stores-config/build.nix {craneLib = craneLibFor pkgs;}; - inherit (pkgs) buildstream buildbox mongodb wait4x bazelisk; + inherit (pkgs) buildstream buildbox buck2 mongodb wait4x bazelisk; buildstream-with-nativelink-test = pkgs.callPackage integration_tests/buildstream/buildstream-with-nativelink-test.nix { inherit nativelink buildstream buildbox; }; mongo-with-nativelink-test = pkgs.callPackage integration_tests/mongo/mongo-with-nativelink-test.nix { inherit nativelink mongodb wait4x bazelisk; }; + rbe-toolchain-with-nativelink-test = pkgs.callPackage toolchain-examples/rbe-toolchain-test.nix { + inherit nativelink bazelisk; + }; + 
buck2-with-nativelink-test = pkgs.callPackage integration_tests/buck2/buck2-with-nativelink-test.nix { + inherit nativelink buck2; + }; + update-module-hashes = pkgs.callPackage tools/updaters/rewrite-module.nix { + python-with-requests = pkgs.python3.withPackages (ps: + with ps; [ + ps.requests + ]); + }; + generate-bazel-rc = pkgs.callPackage tools/generate-bazel-rc/build.nix {craneLib = craneLibFor pkgs;}; + generate-stores-config = pkgs.callPackage nativelink-config/generate-stores-config/build.nix {craneLib = craneLibFor pkgs;}; } // ( # It's not possible to crosscompile to darwin, not even between @@ -460,6 +462,8 @@ pkgs.git pkgs.pre-commit pkgs.git-cliff + pkgs.buck2 + packages.update-module-hashes # Rust bazel diff --git a/integration_tests/buck2/.buckconfig b/integration_tests/buck2/.buckconfig new file mode 100644 index 000000000..36aa16d20 --- /dev/null +++ b/integration_tests/buck2/.buckconfig @@ -0,0 +1,15 @@ +[cells] +root = . + +[buck2_re_client] +engine_address = localhost:50051 +action_cache_address = localhost:50051 +cas_address = localhost:50051 +tls = false +instance_name = main + +[build] +execution_platforms = root//platforms:platforms + +[project] + ignore = buck2.log diff --git a/integration_tests/buck2/.buckroot b/integration_tests/buck2/.buckroot new file mode 100644 index 000000000..e69de29bb diff --git a/integration_tests/buck2/README.md b/integration_tests/buck2/README.md new file mode 100644 index 000000000..7c2c5da68 --- /dev/null +++ b/integration_tests/buck2/README.md @@ -0,0 +1 @@ +Based off of https://github.com/facebook/buck2/tree/main/examples/remote_execution/nativelink diff --git a/integration_tests/buck2/buck2-with-nativelink-test.nix b/integration_tests/buck2/buck2-with-nativelink-test.nix new file mode 100644 index 000000000..f293603be --- /dev/null +++ b/integration_tests/buck2/buck2-with-nativelink-test.nix @@ -0,0 +1,55 @@ +{ + nativelink, + buck2, + writeShellScriptBin, + coreutils, + diffutils, +}: 
+writeShellScriptBin "buck2-with-nativelink-test" '' + set -uo pipefail + + cleanup() { + local pids=$(jobs -pr) + [ -n "$pids" ] && kill $pids + } + trap "cleanup" INT QUIT TERM EXIT + + ${nativelink}/bin/nativelink -- integration_tests/buck2/buck2_cas.json5 | tee -i integration_tests/buck2/nativelink.log & + + cp integration_tests/buck2/tests/defs.bzl integration_tests/buck2/tests/defs.bzl.original + sed -i -e 's#cat #${coreutils}/bin/cat #' integration_tests/buck2/tests/defs.bzl + sed -i -e 's#diff #${diffutils}/bin/diff #' integration_tests/buck2/tests/defs.bzl + + buck2_output=$(cd integration_tests/buck2 && BUCK_NO_INTERACTIVE_CONSOLE=false BUCK_CONSOLE=simplenotty ${buck2}/bin/buck2 build //... 2>&1 | tee -i buck2.log) + + ${buck2}/bin/buck2 killall + + mv integration_tests/buck2/tests/defs.bzl.original integration_tests/buck2/tests/defs.bzl + + echo "Buck2 log" + echo "---" + cat integration_tests/buck2/buck2.log + echo "---" + + case $buck2_output in + *"BUILD SUCCEEDED"* ) + echo "Saw a successful buck2 build" + ;; + *) + echo 'Failed buck2 build' + exit 1 + ;; + esac + + nativelink_output=$(cat integration_tests/buck2/nativelink.log) + + case $nativelink_output in + *"ERROR"* ) + echo "Error in nativelink build" + exit 1 + ;; + *) + echo 'Successful nativelink build' + ;; + esac +'' diff --git a/integration_tests/buck2/buck2_cas.json5 b/integration_tests/buck2/buck2_cas.json5 new file mode 100644 index 000000000..963c6107e --- /dev/null +++ b/integration_tests/buck2/buck2_cas.json5 @@ -0,0 +1,188 @@ +{ + stores: [ + { + name: "AC_MAIN_STORE", + filesystem: { + content_path: "/tmp/nativelink/data-worker-test/content_path-ac", + temp_path: "/tmp/nativelink/data-worker-test/tmp_path-ac", + eviction_policy: { + max_bytes: "1gb", + }, + }, + }, + { + name: "WORKER_FAST_SLOW_STORE", + fast_slow: { + // "fast" must be a "filesystem" store because the worker uses it to make + // hardlinks on disk to a directory where the jobs are running. 
+ fast: { + filesystem: { + content_path: "/tmp/nativelink/data-worker-test/content_path-cas", + temp_path: "/tmp/nativelink/data-worker-test/tmp_path-cas", + eviction_policy: { + max_bytes: "10gb", + }, + }, + }, + slow: { + /// Discard data. + /// This example usage has the CAS and the Worker live in the same place, + /// so they share the same underlying CAS. Since workers require a fast_slow + /// store, we use the fast store as our primary data store, and the slow store + /// is just a noop, since there's no shared storage in this config. + noop: {}, + }, + }, + }, + ], + schedulers: [ + { + name: "MAIN_SCHEDULER", + simple: { + supported_platform_properties: { + cpu_count: "minimum", + memory_kb: "minimum", + network_kbps: "minimum", + disk_read_iops: "minimum", + disk_read_bps: "minimum", + disk_write_iops: "minimum", + disk_write_bps: "minimum", + shm_size: "minimum", + gpu_count: "minimum", + gpu_model: "exact", + cpu_vendor: "exact", + cpu_arch: "exact", + cpu_model: "exact", + kernel_version: "exact", + OSFamily: "priority", + "container-image": "priority", + "lre-rs": "priority", + ISA: "exact", + }, + }, + }, + ], + workers: [ + { + local: { + worker_api_endpoint: { + uri: "grpc://127.0.0.1:50061", + }, + cas_fast_slow_store: "WORKER_FAST_SLOW_STORE", + upload_action_result: { + ac_store: "AC_MAIN_STORE", + }, + work_directory: "/tmp/nativelink/work", + platform_properties: { + cpu_count: { + values: [ + "16", + ], + }, + memory_kb: { + values: [ + "500000", + ], + }, + network_kbps: { + values: [ + "100000", + ], + }, + cpu_arch: { + values: [ + "x86_64", + ], + }, + OSFamily: { + values: [ + "", + ], + }, + "container-image": { + values: [ + "", + ], + }, + "lre-rs": { + values: [ + "", + ], + }, + ISA: { + values: [ + "x86-64", + ], + }, + }, + }, + }, + ], + servers: [ + { + name: "public", + listener: { + http: { + socket_address: "0.0.0.0:50051", + }, + }, + services: { + cas: [ + { + instance_name: "main", + cas_store: "WORKER_FAST_SLOW_STORE", 
+ }, + ], + ac: [ + { + instance_name: "main", + ac_store: "AC_MAIN_STORE", + }, + ], + execution: [ + { + instance_name: "main", + cas_store: "WORKER_FAST_SLOW_STORE", + scheduler: "MAIN_SCHEDULER", + }, + ], + capabilities: [ + { + instance_name: "main", + remote_execution: { + scheduler: "MAIN_SCHEDULER", + }, + }, + ], + bytestream: [ + { + instance_name: "main", + cas_store: "WORKER_FAST_SLOW_STORE", + }, + ], + }, + }, + { + name: "private_workers_servers", + listener: { + http: { + socket_address: "0.0.0.0:50061", + }, + }, + services: { + // Note: This should be served on a different port, because it has + // a different permission set than the other services. + // In other words, this service is a backend api. The ones above + // are a frontend api. + worker_api: { + scheduler: "MAIN_SCHEDULER", + }, + admin: {}, + health: {}, + }, + }, + ], + global: { + max_open_files: 24576, + }, +} diff --git a/integration_tests/buck2/platforms/BUCK b/integration_tests/buck2/platforms/BUCK new file mode 100644 index 000000000..63f852afe --- /dev/null +++ b/integration_tests/buck2/platforms/BUCK @@ -0,0 +1,3 @@ +load(":defs.bzl", "platforms") + +platforms(name = "platforms") diff --git a/integration_tests/buck2/platforms/defs.bzl b/integration_tests/buck2/platforms/defs.bzl new file mode 100644 index 000000000..c67c7647c --- /dev/null +++ b/integration_tests/buck2/platforms/defs.bzl @@ -0,0 +1,34 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under both the MIT license found in the +# LICENSE-MIT file in the root directory of this source tree and the Apache +# License, Version 2.0 found in the LICENSE-APACHE file in the root directory +# of this source tree. 
+ +""" +Buck2 platform config +""" + +def _platforms(ctx): + configuration = ConfigurationInfo( + constraints = {}, + values = {}, + ) + + platform = ExecutionPlatformInfo( + label = ctx.label.raw_target(), + configuration = configuration, + executor_config = CommandExecutorConfig( + local_enabled = True, + remote_enabled = True, + use_limited_hybrid = True, + remote_execution_properties = { + }, + remote_execution_use_case = "buck2-default", + remote_output_paths = "output_paths", + ), + ) + + return [DefaultInfo(), ExecutionPlatformRegistrationInfo(platforms = [platform])] + +platforms = rule(attrs = {}, impl = _platforms) diff --git a/integration_tests/buck2/tests/BUCK b/integration_tests/buck2/tests/BUCK new file mode 100644 index 000000000..8200f35cc --- /dev/null +++ b/integration_tests/buck2/tests/BUCK @@ -0,0 +1,3 @@ +load(":defs.bzl", "tests") + +tests(name = "tests") diff --git a/integration_tests/buck2/tests/defs.bzl b/integration_tests/buck2/tests/defs.bzl new file mode 100644 index 000000000..e7c43e209 --- /dev/null +++ b/integration_tests/buck2/tests/defs.bzl @@ -0,0 +1,54 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under both the MIT license found in the +# LICENSE-MIT file in the root directory of this source tree and the Apache +# License, Version 2.0 found in the LICENSE-APACHE file in the root directory +# of this source tree. 
+ +""" +Buck2 remote execution tests +""" + +def _tests(ctx): + # Create locally + stage0 = ctx.actions.declare_output("stage0") + ctx.actions.run( + ["sh", "-c", 'head -c 10 /dev/urandom > "$1"', "--", stage0.as_output()], + category = "stage0", + local_only = True, + ) + + # Use on RE + stage1 = ctx.actions.declare_output("stage1") + ctx.actions.run(["sh", "-c", 'cat "$1" "$1" > "$2"', "--", stage0, stage1.as_output()], category = "stage1") + + # Reuse on RE + stage2 = ctx.actions.declare_output("stage2") + ctx.actions.run(["sh", "-c", 'cat "$1" "$1" > "$2"', "--", stage1, stage2.as_output()], category = "stage2") + + # Reuse locally + stage3 = ctx.actions.declare_output("stage3") + ctx.actions.run( + ["sh", "-c", 'cat "$1" "$1" > "$2"', "--", stage2, stage3.as_output()], + category = "stage3", + local_only = True, + ) + + # Verify + stage4 = ctx.actions.declare_output("stage4") + ctx.actions.run( + [ + "sh", + "-c", + 'cat "$1" "$1" "$1" "$1" "$1" "$1" "$1" "$1" > "$3" && diff "$2" "$3"', + "--", + stage0, + stage3, + stage4.as_output(), + ], + category = "stage4", + ) + + return [DefaultInfo(stage4)] + +tests = rule(attrs = {}, impl = _tests) diff --git a/kubernetes/components/worker/worker.json5 b/kubernetes/components/worker/worker.json5 index bd8a3fafc..d68c57d55 100644 --- a/kubernetes/components/worker/worker.json5 +++ b/kubernetes/components/worker/worker.json5 @@ -40,6 +40,7 @@ }, }, }, + fast_direction: "get", slow: { ref_store: { name: "GRPC_LOCAL_STORE", diff --git a/local-remote-execution/MODULE.bazel b/local-remote-execution/MODULE.bazel index 6f7a176ac..849d7593e 100644 --- a/local-remote-execution/MODULE.bazel +++ b/local-remote-execution/MODULE.bazel @@ -7,10 +7,10 @@ module( compatibility_level = 0, ) -bazel_dep(name = "platforms", version = "0.0.11") +bazel_dep(name = "platforms", version = "1.0.0") # Use the starlark implementation of C++ rules instead of the builtin ones. 
-bazel_dep(name = "rules_cc", version = "0.1.1") +bazel_dep(name = "rules_cc", version = "0.1.5") # Use the starlark implementation of Java rules instead of the builtin ones. bazel_dep(name = "rules_java", version = "8.11.0") diff --git a/native-cli/default.nix b/native-cli/default.nix index 0c1e51dc8..bff1151fe 100644 --- a/native-cli/default.nix +++ b/native-cli/default.nix @@ -9,7 +9,7 @@ buildGoModule { pname = "native-cli"; version = "0.6.0"; src = ./.; - vendorHash = "sha256-4e7fPoBjbOd3pSMmkdTMIo1DC+XMLjgh2xZ98iHeH58="; + vendorHash = "sha256-dlJrpblQAx0/+DCLJ4xT6whRQo3SmSgRq/dLd0yH440="; buildInputs = [makeWrapper]; ldflags = ["-s -w"]; installPhase = '' diff --git a/native-cli/go.mod b/native-cli/go.mod index 76159fa7e..3c73d4747 100644 --- a/native-cli/go.mod +++ b/native-cli/go.mod @@ -1,10 +1,12 @@ module github.com/TraceMachina/nativelink/native-cli -go 1.23.5 +go 1.24.0 + +toolchain go1.24.3 require ( github.com/docker/docker v28.0.4+incompatible - github.com/go-git/go-git/v5 v5.14.0 + github.com/go-git/go-git/v5 v5.16.5 github.com/pulumi/pulumi-docker/sdk/v4 v4.6.2 github.com/pulumi/pulumi-kubernetes/sdk/v4 v4.22.1 github.com/pulumi/pulumi/sdk/v3 v3.160.0 @@ -34,7 +36,7 @@ require ( github.com/charmbracelet/x/cellbuf v0.0.13 // indirect github.com/charmbracelet/x/term v0.2.1 // indirect github.com/cheggaaa/pb v1.0.29 // indirect - github.com/cloudflare/circl v1.6.1 // indirect + github.com/cloudflare/circl v1.6.3 // indirect github.com/containerd/log v0.1.0 // indirect github.com/cyphar/filepath-securejoin v0.4.1 // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect @@ -125,17 +127,17 @@ require ( go.opentelemetry.io/otel/metric v1.35.0 // indirect go.opentelemetry.io/otel/trace v1.35.0 // indirect go.uber.org/atomic v1.11.0 // indirect - golang.org/x/crypto v0.37.0 // indirect + golang.org/x/crypto v0.45.0 // indirect golang.org/x/exp v0.0.0-20250408133849-7e4ce0ab07d0 // indirect - golang.org/x/mod v0.24.0 // 
indirect - golang.org/x/net v0.39.0 // indirect + golang.org/x/mod v0.29.0 // indirect + golang.org/x/net v0.47.0 // indirect golang.org/x/oauth2 v0.29.0 // indirect - golang.org/x/sync v0.13.0 // indirect - golang.org/x/sys v0.32.0 // indirect - golang.org/x/term v0.31.0 // indirect - golang.org/x/text v0.24.0 // indirect + golang.org/x/sync v0.18.0 // indirect + golang.org/x/sys v0.38.0 // indirect + golang.org/x/term v0.37.0 // indirect + golang.org/x/text v0.31.0 // indirect golang.org/x/time v0.11.0 // indirect - golang.org/x/tools v0.32.0 // indirect + golang.org/x/tools v0.38.0 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20250407143221-ac9807e6c755 // indirect google.golang.org/grpc v1.71.1 // indirect google.golang.org/protobuf v1.36.6 // indirect diff --git a/native-cli/go.sum b/native-cli/go.sum index 7812eb148..8577eab32 100644 --- a/native-cli/go.sum +++ b/native-cli/go.sum @@ -45,8 +45,8 @@ github.com/charmbracelet/x/term v0.2.1 h1:AQeHeLZ1OqSXhrAWpYUtZyX1T3zVxfpZuEQMIQ github.com/charmbracelet/x/term v0.2.1/go.mod h1:oQ4enTYFV7QN4m0i9mzHrViD7TQKvNEEkHUMCmsxdUg= github.com/cheggaaa/pb v1.0.29 h1:FckUN5ngEk2LpvuG0fw1GEFx6LtyY2pWI/Z2QgCnEYo= github.com/cheggaaa/pb v1.0.29/go.mod h1:W40334L7FMC5JKWldsTWbdGjLo0RxUKK73K+TuPxX30= -github.com/cloudflare/circl v1.6.1 h1:zqIqSPIndyBh1bjLVVDHMPpVKqp8Su/V+6MeDzzQBQ0= -github.com/cloudflare/circl v1.6.1/go.mod h1:uddAzsPgqdMAYatqJ0lsjX1oECcQLIlRpzZh3pJrofs= +github.com/cloudflare/circl v1.6.3 h1:9GPOhQGF9MCYUeXyMYlqTR6a5gTrgR/fBLXvUgtVcg8= +github.com/cloudflare/circl v1.6.3/go.mod h1:2eXP6Qfat4O/Yhh8BznvKnJ+uzEoTQ6jVKJRn81BiS4= github.com/containerd/log v0.1.0 h1:TCJt7ioM2cr/tfR8GPbGf9/VRAX8D2B4PjzCpfX540I= github.com/containerd/log v0.1.0/go.mod h1:VRRf09a7mHDIRezVKTRCrOq78v577GXq3bSa3EhrzVo= github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g= @@ -94,8 +94,8 @@ github.com/go-git/go-billy/v5 v5.6.2 h1:6Q86EsPXMa7c3YZ3aLAQsMA0VlWmy43r6FHqa/UN 
github.com/go-git/go-billy/v5 v5.6.2/go.mod h1:rcFC2rAsp/erv7CMz9GczHcuD0D32fWzH+MJAU+jaUU= github.com/go-git/go-git-fixtures/v4 v4.3.2-0.20231010084843-55a94097c399 h1:eMje31YglSBqCdIqdhKBW8lokaMrL3uTkpGYlE2OOT4= github.com/go-git/go-git-fixtures/v4 v4.3.2-0.20231010084843-55a94097c399/go.mod h1:1OCfN199q1Jm3HZlxleg+Dw/mwps2Wbk9frAWm+4FII= -github.com/go-git/go-git/v5 v5.14.0 h1:/MD3lCrGjCen5WfEAzKg00MJJffKhC8gzS80ycmCi60= -github.com/go-git/go-git/v5 v5.14.0/go.mod h1:Z5Xhoia5PcWA3NF8vRLURn9E5FRhSl7dGj9ItW3Wk5k= +github.com/go-git/go-git/v5 v5.16.5 h1:mdkuqblwr57kVfXri5TTH+nMFLNUxIj9Z7F5ykFbw5s= +github.com/go-git/go-git/v5 v5.16.5/go.mod h1:QOMLpNf1qxuSY4StA/ArOdfFR2TrKEjJiye2kel2m+M= github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= @@ -322,31 +322,31 @@ golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACk golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= -golang.org/x/crypto v0.37.0 h1:kJNSjF/Xp7kU0iB2Z+9viTPMW4EqqsrywMXLJOOsXSE= -golang.org/x/crypto v0.37.0/go.mod h1:vg+k43peMZ0pUMhYmVAWysMK35e6ioLh3wB8ZCAfbVc= +golang.org/x/crypto v0.45.0 h1:jMBrvKuj23MTlT0bQEOBcAE0mjg8mK9RXFhRH6nyF3Q= +golang.org/x/crypto v0.45.0/go.mod h1:XTGrrkGJve7CYK7J8PEww4aY7gM3qMCElcJQ8n8JdX4= golang.org/x/exp v0.0.0-20250408133849-7e4ce0ab07d0 h1:R84qjqJb5nVJMxqWYb3np9L5ZsaDtB+a39EqjV0JSUM= golang.org/x/exp v0.0.0-20250408133849-7e4ce0ab07d0/go.mod h1:S9Xr4PYopiDyqSyp5NjCrhFrqg6A5zA2E/iPHPhqnS8= golang.org/x/lint v0.0.0-20200302205851-738671d3881b/go.mod 
h1:3xt1FjdF8hUf6vQPIChWIBhFzV8gjjsPE/fR3IyQdNY= golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= -golang.org/x/mod v0.24.0 h1:ZfthKaKaT4NrhGVZHO1/WDTwGES4De8KtWO0SIbNJMU= -golang.org/x/mod v0.24.0/go.mod h1:IXM97Txy2VM4PJ3gI61r1YEk/gAj6zAHN3AdZt6S9Ww= +golang.org/x/mod v0.29.0 h1:HV8lRxZC4l2cr3Zq1LvtOsi/ThTgWnUk/y64QSs8GwA= +golang.org/x/mod v0.29.0/go.mod h1:NyhrlYXJ2H4eJiRy/WDBO6HMqZQ6q9nk4JzS3NuCK+w= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200421231249-e086a090c8fd/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= -golang.org/x/net v0.39.0 h1:ZCu7HMWDxpXpaiKdhzIfaltL9Lp31x/3fCP11bc6/fY= -golang.org/x/net v0.39.0/go.mod h1:X7NRbYVEA+ewNkCNyJ513WmMdQ3BineSwVtN2zD/d+E= +golang.org/x/net v0.47.0 h1:Mx+4dIFzqraBXUugkia1OOvlD6LemFo1ALMHjrXDOhY= +golang.org/x/net v0.47.0/go.mod h1:/jNxtkgq5yWUGYkaZGqo27cfGZ1c5Nen03aYrrKpVRU= golang.org/x/oauth2 v0.29.0 h1:WdYw2tdTK1S8olAzWHdgeqfy+Mtm9XNhv/xJsY65d98= golang.org/x/oauth2 v0.29.0/go.mod h1:onh5ek6nERTohokkhCD/y2cV4Do3fxFHFuAejCkRWT8= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync 
v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.13.0 h1:AauUjRAJ9OSnvULf/ARrrVywoJDy0YS2AwQ98I37610= -golang.org/x/sync v0.13.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= +golang.org/x/sync v0.18.0 h1:kr88TuHDroi+UVf+0hZnirlk8o8T+4MrK6mr60WkH/I= +golang.org/x/sync v0.18.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190222072716-a9d3bda3a223/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -363,16 +363,16 @@ golang.org/x/sys v0.0.0-20220615213510-4f61da869c0c/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220908164124-27713097b956/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.32.0 h1:s77OFDvIQeibCmezSnk/q6iAfkdiQaJi4VzroCFrN20= -golang.org/x/sys v0.32.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= +golang.org/x/sys v0.38.0 h1:3yZWxaJjBmCWXqhN1qh02AkOnCQ1poK6oF+a7xWL6Gc= +golang.org/x/sys v0.38.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= -golang.org/x/term v0.31.0 h1:erwDkOK1Msy6offm1mOgvspSkslFnIGsFnxOKoufg3o= -golang.org/x/term v0.31.0/go.mod h1:R4BeIy7D95HzImkxGkTW1UQTtP54tio2RyHz7PwK0aw= +golang.org/x/term v0.37.0 h1:8EGAD0qCmHYZg6J17DvsMy9/wJ7/D/4pV/wfnld5lTU= +golang.org/x/term v0.37.0/go.mod h1:5pB4lxRNYYVZuTLmy8oR2BH8dflOR+IbTYFD8fi3254= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod 
h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.24.0 h1:dd5Bzh4yt5KYA8f9CJHCP4FB4D51c2c6JvN37xJJkJ0= -golang.org/x/text v0.24.0/go.mod h1:L8rBsPeo2pSS+xqN0d5u2ikmjtmoJbDBT1b7nHvFCdU= +golang.org/x/text v0.31.0 h1:aC8ghyu4JhP8VojJ2lEHBnochRno1sgL6nEi9WGFGMM= +golang.org/x/text v0.31.0/go.mod h1:tKRAlv61yKIjGGHX/4tP1LTbc13YSec1pxVEWXzfoeM= golang.org/x/time v0.11.0 h1:/bpjEDfN9tkoN/ryeYHnv5hcMlc8ncjMcM4XBk5NWV0= golang.org/x/time v0.11.0/go.mod h1:CDIdPxbZBQxdj6cxyCIdrNogrJKMJ7pr37NYpMcMDSg= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= @@ -381,8 +381,8 @@ golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtn golang.org/x/tools v0.0.0-20200130002326-2f3ba24bd6e7/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= -golang.org/x/tools v0.32.0 h1:Q7N1vhpkQv7ybVzLFtTjvQya2ewbwNDZzUgfXGqtMWU= -golang.org/x/tools v0.32.0/go.mod h1:ZxrU41P/wAbZD8EDa6dDCa6XfpkhJ7HFMjHJXfBDu8s= +golang.org/x/tools v0.38.0 h1:Hx2Xv8hISq8Lm16jvBZ2VQf+RLmbd7wVUsALibYI/IQ= +golang.org/x/tools v0.38.0/go.mod h1:yEsQ/d/YK8cjh0L6rZlY8tgtlKiBNTL14pGDJPJpYQs= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= diff --git a/nativelink-config/Cargo.toml b/nativelink-config/Cargo.toml index 0cf425a3f..f74177bb2 100644 --- a/nativelink-config/Cargo.toml +++ b/nativelink-config/Cargo.toml @@ -1,30 +1,45 @@ +#:schema 
../tools/cargo-with-detailed-deps.json lints.workspace = true [package] edition = "2024" name = "nativelink-config" -version = "0.7.3" +version = "1.0.0-rc4" [dependencies] nativelink-error = { path = "../nativelink-error" } byte-unit = { version = "5.1.6", default-features = false, features = ["byte"] } -humantime = "2.2.0" +humantime = { version = "2.3.0", default-features = false } rand = { version = "0.9.0", default-features = false, features = [ "thread_rng", ] } +schemars = { version = "1.2.1", default-features = false, features = [ + "derive", + "std", +], optional = true } serde = { version = "1.0.219", default-features = false, features = ["derive"] } serde_json = { version = "1.0.140", default-features = false, features = [ "std", ] } -serde_json5 = "0.2.1" +serde_json5 = { version = "0.2.1", default-features = false } shellexpand = { version = "3.1.0", default-features = false, features = [ "base-0", ] } tracing = { version = "0.1.41", default-features = false } [dev-dependencies] -pretty_assertions = { version = "1.4.1", features = ["std"] } +pretty_assertions = { version = "1.4.1", features = [ + "std", +], default-features = false } tracing-test = { version = "0.2.5", default-features = false, features = [ "no-env-filter", ] } + +[features] +dev-schema = ["schemars"] + +[[bin]] +name = "build-schema" +path = "src/bin/build_schema.rs" +required-features = ["dev-schema"] diff --git a/nativelink-config/examples/basic_cas.json5 b/nativelink-config/examples/basic_cas.json5 index 1ed02ec0b..4d7278204 100644 --- a/nativelink-config/examples/basic_cas.json5 +++ b/nativelink-config/examples/basic_cas.json5 @@ -60,6 +60,7 @@ "container-image": "priority", "lre-rs": "priority", ISA: "exact", + InputRootAbsolutePath: "ignore", // used by chromium builds, but we can drop it }, }, }, @@ -75,6 +76,15 @@ ac_store: "AC_MAIN_STORE", }, work_directory: "/tmp/nativelink/work", + additional_environment: { + foo: "from_environment", + bar: { + value: "something", + }, + 
baz: "timeout_millis", + channel: "side_channel_file", + action: "action_directory", + }, platform_properties: { cpu_count: { values: [ diff --git a/nativelink-config/examples/scheduler_match_logging_disable.json5 b/nativelink-config/examples/scheduler_match_logging_disable.json5 new file mode 100644 index 000000000..4f333abb0 --- /dev/null +++ b/nativelink-config/examples/scheduler_match_logging_disable.json5 @@ -0,0 +1,12 @@ +{ + stores: [], + schedulers: [ + { + name: "MAIN_SCHEDULER", + simple: { + worker_match_logging_interval_s: -1, + }, + }, + ], + servers: [], +} diff --git a/nativelink-config/examples/stores-config.json5 b/nativelink-config/examples/stores-config.json5 index dba79289a..4fe27c981 100644 --- a/nativelink-config/examples/stores-config.json5 +++ b/nativelink-config/examples/stores-config.json5 @@ -253,6 +253,8 @@ "endpoints": [ {"address": "grpc://${CAS_ENDPOINT:-127.0.0.1}:50051"} ], + "connections_per_endpoint": "5", + "rpc_timeout_s": "5m", "store_type": "ac" } }, @@ -261,7 +263,8 @@ "redis_store": { "addresses": [ "redis://127.0.0.1:6379/", - ] + ], + "max_client_permits": 1000, } }, { diff --git a/nativelink-config/examples/worker_with_redis_scheduler.json5 b/nativelink-config/examples/worker_with_redis_scheduler.json5 new file mode 100644 index 000000000..85d845850 --- /dev/null +++ b/nativelink-config/examples/worker_with_redis_scheduler.json5 @@ -0,0 +1,198 @@ +{ + stores: [ + { + name: "AC_MAIN_STORE", + filesystem: { + content_path: "/tmp/nativelink/data-worker-test/content_path-ac", + temp_path: "/tmp/nativelink/data-worker-test/tmp_path-ac", + eviction_policy: { + max_bytes: 1000000000, + }, + }, + }, + { + name: "WORKER_FAST_SLOW_STORE", + fast_slow: { + fast: { + filesystem: { + content_path: "/tmp/nativelink/data-worker-test/content_path-cas", + temp_path: "/tmp/nativelink/data-worker-test/tmp_path-cas", + eviction_policy: { + max_bytes: 10000000000, + }, + }, + }, + slow: { + noop: {}, + }, + }, + }, + { + name: 
"SCHEDULER_REDIS_STORE", + redis_store: { + addresses: [ + "redis://127.0.0.1:6379", + ], + connection_pool_size: 10, + experimental_pub_sub_channel: "scheduler_key_change", + }, + }, + ], + schedulers: [ + { + name: "MAIN_SCHEDULER", + simple: { + worker_timeout_s: 30, + worker_match_logging_interval_s: -1, + supported_platform_properties: { + cpu_count: "minimum", + memory_kb: "minimum", + network_kbps: "minimum", + disk_read_iops: "minimum", + disk_read_bps: "minimum", + disk_write_iops: "minimum", + disk_write_bps: "minimum", + shm_size: "minimum", + gpu_count: "minimum", + gpu_model: "exact", + cpu_vendor: "exact", + cpu_arch: "exact", + cpu_model: "exact", + kernel_version: "exact", + OSFamily: "priority", + "container-image": "priority", + "lre-rs": "priority", + ISA: "exact", + }, + experimental_backend: { + redis: { + redis_store: "SCHEDULER_REDIS_STORE", + }, + }, + }, + }, + ], + workers: [ + { + local: { + worker_api_endpoint: { + uri: "grpc://127.0.0.1:50061", + }, + max_inflight_tasks: 5, + cas_fast_slow_store: "WORKER_FAST_SLOW_STORE", + upload_action_result: { + ac_store: "AC_MAIN_STORE", + }, + work_directory: "/tmp/nativelink/work", + platform_properties: { + cpu_count: { + values: [ + "14", + ], + }, + memory_kb: { + values: [ + "32000000", + ], + }, + network_kbps: { + values: [ + "100000", + ], + }, + cpu_arch: { + values: [ + "aarch64", + ], + }, + OSFamily: { + values: [ + "Darwin", + "", + ], + }, + "container-image": { + values: [ + "", + ], + }, + "lre-rs": { + values: [ + "", + ], + }, + ISA: { + values: [ + "aarch64", + ], + }, + }, + }, + }, + ], + servers: [ + { + name: "public", + listener: { + http: { + socket_address: "0.0.0.0:50051", + }, + }, + services: { + cas: [ + { + instance_name: "main", + cas_store: "WORKER_FAST_SLOW_STORE", + }, + ], + ac: [ + { + instance_name: "main", + ac_store: "AC_MAIN_STORE", + }, + ], + execution: [ + { + instance_name: "main", + cas_store: "WORKER_FAST_SLOW_STORE", + scheduler: "MAIN_SCHEDULER", + 
}, + ], + capabilities: [ + { + instance_name: "main", + remote_execution: { + scheduler: "MAIN_SCHEDULER", + }, + }, + ], + bytestream: [ + { + instance_name: "main", + cas_store: "WORKER_FAST_SLOW_STORE", + }, + ], + health: {}, + admin: {}, + }, + }, + { + name: "private_workers_servers", + listener: { + http: { + socket_address: "0.0.0.0:50061", + }, + }, + services: { + worker_api: { + scheduler: "MAIN_SCHEDULER", + }, + admin: {}, + health: {}, + }, + }, + ], + global: { + max_open_files: 24576, + }, +} diff --git a/nativelink-config/generate-stores-config/Cargo.lock b/nativelink-config/generate-stores-config/Cargo.lock index e81f1bef9..ce035e647 100644 --- a/nativelink-config/generate-stores-config/Cargo.lock +++ b/nativelink-config/generate-stores-config/Cargo.lock @@ -2,15 +2,6 @@ # It is not intended for manual editing. version = 4 -[[package]] -name = "aho-corasick" -version = "1.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" -dependencies = [ - "memchr", -] - [[package]] name = "generate-stores-config" version = "0.1.0" @@ -18,20 +9,12 @@ dependencies = [ "regex", ] -[[package]] -name = "memchr" -version = "2.7.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273" - [[package]] name = "regex" version = "1.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b5288124840bee7b386bc413c487869b360b2b4ec421ea56425128692f2a82c" dependencies = [ - "aho-corasick", - "memchr", "regex-automata", "regex-syntax", ] @@ -42,8 +25,6 @@ version = "0.4.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "833eb9ce86d40ef33cb1306d8accf7bc8ec2bfea4355cbdebb3df68b40925cad" dependencies = [ - "aho-corasick", - "memchr", "regex-syntax", ] diff --git a/nativelink-config/generate-stores-config/Cargo.toml 
b/nativelink-config/generate-stores-config/Cargo.toml index c102aba7b..cb74afd7a 100644 --- a/nativelink-config/generate-stores-config/Cargo.toml +++ b/nativelink-config/generate-stores-config/Cargo.toml @@ -1,9 +1,12 @@ +#:schema ../../tools/cargo-with-detailed-deps.json [package] edition = "2024" name = "generate-stores-config" version = "0.1.0" [dependencies] -regex = "1.11.3" +regex = { version = "1.11.3", default-features = false, features = [ + "unicode-perl", +] } [workspace] diff --git a/nativelink-config/generate-stores-config/src/main.rs b/nativelink-config/generate-stores-config/src/main.rs index cda44d4f7..d0284624d 100644 --- a/nativelink-config/generate-stores-config/src/main.rs +++ b/nativelink-config/generate-stores-config/src/main.rs @@ -22,23 +22,31 @@ fn main() { for block in json_start.captures_iter(&stores_rs) { let start_marker = block.get(0).unwrap().end(); let end_match = block_end.find(&stores_rs[start_marker..]).unwrap(); - let end_marker =end_match.start(); - let contents = &stores_rs[start_marker..(start_marker+end_marker)].split("\n").map(|line| line.replacen("///", "", 1)).collect::>().join("\n"); + let end_marker = end_match.start(); + let contents = &stores_rs[start_marker..(start_marker + end_marker)] + .split("\n") + .map(|line| line.replacen("///", "", 1)) + .collect::>() + .join("\n"); blocks.push(contents.trim().to_string()); } - let mut output = String::from("// Generated by generate-stores-config from stores.rs comments for testing + let mut output = String::from( + "// Generated by generate-stores-config from stores.rs comments for testing { servers: [], stores: [ -"); +", + ); for (index, contents) in blocks.iter().enumerate() { - let more_output = format!(r#" {{ + let more_output = format!( + r#" {{ name: "{index}", {contents} }}, -"#); +"# + ); output.push_str(&more_output); } output.push_str("]}\n"); diff --git a/nativelink-config/src/bin/build_schema.rs b/nativelink-config/src/bin/build_schema.rs new file mode 100644 
index 000000000..3d4936264 --- /dev/null +++ b/nativelink-config/src/bin/build_schema.rs @@ -0,0 +1,24 @@ +//! ```sh +//! cargo run --bin build-schema --features dev-schema --package nativelink-config +//! ``` + +#[cfg(feature = "dev-schema")] +fn main() { + use std::fs::File; + + use nativelink_config::cas_server::CasConfig; + use schemars::schema_for; + use serde_json::to_writer_pretty; + const FILE: &str = "nativelink_config.schema.json"; + + let schema = schema_for!(CasConfig); + to_writer_pretty(File::create(FILE).expect("to create file"), &schema) + .expect("to export schema"); + + println!("Wrote schema to {FILE}"); +} + +#[cfg(not(feature = "dev-schema"))] +fn main() { + eprintln!("Enable with --features dev-schema"); +} diff --git a/nativelink-config/src/cas_server.rs b/nativelink-config/src/cas_server.rs index 207194714..c7f9f4882 100644 --- a/nativelink-config/src/cas_server.rs +++ b/nativelink-config/src/cas_server.rs @@ -15,6 +15,8 @@ use std::collections::HashMap; use nativelink_error::{Error, ResultExt}; +#[cfg(feature = "dev-schema")] +use schemars::JsonSchema; use serde::{Deserialize, Serialize}; use crate::schedulers::SchedulerSpec; @@ -22,7 +24,7 @@ use crate::serde_utils::{ convert_data_size_with_shellexpand, convert_duration_with_shellexpand, convert_numeric_with_shellexpand, convert_optional_numeric_with_shellexpand, convert_optional_string_with_shellexpand, convert_string_with_shellexpand, - convert_vec_string_with_shellexpand, + convert_vec_string_with_shellexpand, convert_enum_with_shellexpand, }; use crate::stores::{ClientTlsConfig, ConfigDigestHashFunction, StoreRefName, StoreSpec}; @@ -34,6 +36,7 @@ pub type SchedulerRefName = String; pub type InstanceName = String; #[derive(Debug, Default, Clone, PartialEq, Eq, Deserialize, Serialize)] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub struct WithInstanceName { #[serde(default)] pub instance_name: InstanceName, @@ -50,6 +53,7 @@ impl core::ops::Deref for WithInstanceName { } 
#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub struct NamedConfig { pub name: String, #[serde(flatten)] @@ -58,6 +62,7 @@ pub struct NamedConfig { #[derive(Deserialize, Serialize, Debug, Default, Clone, Copy)] #[serde(rename_all = "snake_case")] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub enum HttpCompressionAlgorithm { /// No compression. #[default] @@ -78,6 +83,7 @@ pub enum HttpCompressionAlgorithm { /// and cloud-clients to use another. #[derive(Deserialize, Serialize, Debug, Default)] #[serde(deny_unknown_fields)] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub struct HttpCompressionConfig { /// The compression algorithm that the server will use when sending /// responses to clients. Enabling this will likely save a lot of @@ -101,6 +107,7 @@ pub struct HttpCompressionConfig { #[derive(Deserialize, Serialize, Debug)] #[serde(deny_unknown_fields)] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub struct AcStoreConfig { /// The store name referenced in the `stores` map in the main config. /// This store name referenced here may be reused multiple times. @@ -115,6 +122,7 @@ pub struct AcStoreConfig { #[derive(Deserialize, Serialize, Debug)] #[serde(deny_unknown_fields)] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub struct CasStoreConfig { /// The store name referenced in the `stores` map in the main config. /// This store name referenced here may be reused multiple times. @@ -124,6 +132,7 @@ pub struct CasStoreConfig { #[derive(Deserialize, Serialize, Debug, Default)] #[serde(deny_unknown_fields)] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub struct CapabilitiesRemoteExecutionConfig { /// Scheduler used to configure the capabilities of remote execution. 
#[serde(deserialize_with = "convert_string_with_shellexpand")] @@ -132,6 +141,7 @@ pub struct CapabilitiesRemoteExecutionConfig { #[derive(Deserialize, Serialize, Debug, Default)] #[serde(deny_unknown_fields)] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub struct CapabilitiesConfig { /// Configuration for remote execution capabilities. /// If not set the capabilities service will inform the client that remote @@ -141,6 +151,7 @@ pub struct CapabilitiesConfig { #[derive(Deserialize, Serialize, Debug)] #[serde(deny_unknown_fields)] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub struct ExecutionConfig { /// The store name referenced in the `stores` map in the main config. /// This store name referenced here may be reused multiple times. @@ -155,6 +166,7 @@ pub struct ExecutionConfig { #[derive(Deserialize, Serialize, Debug, Clone)] #[serde(deny_unknown_fields)] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub struct FetchConfig { /// The store name referenced in the `stores` map in the main config. /// This store name referenced here may be reused multiple times. @@ -164,6 +176,7 @@ pub struct FetchConfig { #[derive(Deserialize, Serialize, Debug, Clone)] #[serde(deny_unknown_fields)] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub struct PushConfig { /// The store name referenced in the `stores` map in the main config. /// This store name referenced here may be reused multiple times. @@ -177,12 +190,13 @@ pub struct PushConfig { } // From https://github.com/serde-rs/serde/issues/818#issuecomment-287438544 -fn default(t: &T) -> bool { +fn is_default(t: &T) -> bool { *t == Default::default() } #[derive(Deserialize, Serialize, Debug, Default, PartialEq, Eq)] #[serde(deny_unknown_fields)] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub struct ByteStreamConfig { /// Name of the store in the "stores" configuration. 
pub cas_store: StoreRefName, @@ -196,7 +210,7 @@ pub struct ByteStreamConfig { #[serde( default, deserialize_with = "convert_data_size_with_shellexpand", - skip_serializing_if = "default" + skip_serializing_if = "is_default" )] pub max_bytes_per_stream: usize, @@ -209,7 +223,7 @@ pub struct ByteStreamConfig { #[serde( default, deserialize_with = "convert_duration_with_shellexpand", - skip_serializing_if = "default" + skip_serializing_if = "is_default" )] pub persist_stream_on_disconnect_timeout: usize, } @@ -224,25 +238,26 @@ pub struct OldByteStreamConfig { #[serde( default, deserialize_with = "convert_data_size_with_shellexpand", - skip_serializing_if = "default" + skip_serializing_if = "is_default" )] pub max_bytes_per_stream: usize, #[serde( default, deserialize_with = "convert_data_size_with_shellexpand", - skip_serializing_if = "default" + skip_serializing_if = "is_default" )] pub max_decoding_message_size: usize, #[serde( default, deserialize_with = "convert_duration_with_shellexpand", - skip_serializing_if = "default" + skip_serializing_if = "is_default" )] pub persist_stream_on_disconnect_timeout: usize, } #[derive(Deserialize, Serialize, Debug)] #[serde(deny_unknown_fields)] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub struct WorkerApiConfig { /// The scheduler name referenced in the `schedulers` map in the main config. #[serde(deserialize_with = "convert_string_with_shellexpand")] @@ -251,6 +266,7 @@ pub struct WorkerApiConfig { #[derive(Deserialize, Serialize, Debug, Default)] #[serde(deny_unknown_fields)] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub struct AdminConfig { /// Path to register the admin API. 
If path is "/admin", and your /// domain is "example.com", you can reach the endpoint with: @@ -263,6 +279,7 @@ pub struct AdminConfig { #[derive(Deserialize, Serialize, Debug, Default)] #[serde(deny_unknown_fields)] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub struct HealthConfig { /// Path to register the health status check. If path is "/status", and your /// domain is "example.com", you can reach the endpoint with: @@ -278,6 +295,7 @@ pub struct HealthConfig { } #[derive(Deserialize, Serialize, Debug)] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub struct BepConfig { /// The store to publish build events to. /// The store name referenced in the `stores` map in the main config. @@ -286,6 +304,7 @@ pub struct BepConfig { } #[derive(Deserialize, Serialize, Clone, Debug, Default)] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub struct IdentityHeaderSpec { /// The name of the header to look for the identity in. /// Default: "x-identity" @@ -298,6 +317,7 @@ pub struct IdentityHeaderSpec { } #[derive(Deserialize, Serialize, Clone, Debug)] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub struct OriginEventsPublisherSpec { /// The store to publish nativelink events to. /// The store name referenced in the `stores` map in the main config. @@ -306,6 +326,7 @@ pub struct OriginEventsPublisherSpec { } #[derive(Deserialize, Serialize, Clone, Debug)] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub struct OriginEventsSpec { /// The publisher configuration for origin events. pub publisher: OriginEventsPublisherSpec, @@ -321,6 +342,7 @@ pub struct OriginEventsSpec { #[derive(Deserialize, Serialize, Debug)] #[serde(deny_unknown_fields)] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub struct ServicesConfig { /// The Content Addressable Storage (CAS) backend config. 
/// The key is the `instance_name` used in the protocol and the @@ -402,6 +424,7 @@ pub struct ServicesConfig { #[derive(Deserialize, Serialize, Debug)] #[serde(deny_unknown_fields)] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub struct TlsConfig { /// Path to the certificate file. #[serde(deserialize_with = "convert_string_with_shellexpand")] @@ -430,6 +453,7 @@ pub struct TlsConfig { /// specified. #[derive(Deserialize, Serialize, Debug, Default, Clone, Copy)] #[serde(deny_unknown_fields)] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub struct HttpServerConfig { /// Interval to send keep-alive pings via HTTP2. /// Note: This is in seconds. @@ -497,6 +521,7 @@ pub struct HttpServerConfig { #[derive(Deserialize, Serialize, Debug)] #[serde(rename_all = "snake_case")] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub enum ListenerConfig { /// Listener for HTTP/HTTPS/HTTP2 sockets. Http(HttpListener), @@ -504,6 +529,7 @@ pub enum ListenerConfig { #[derive(Deserialize, Serialize, Debug, Default)] #[serde(deny_unknown_fields)] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub struct HttpListener { /// Address to listen on. Example: `127.0.0.1:8080` or `:8080` to listen /// to all IPs. @@ -533,6 +559,7 @@ pub struct HttpListener { #[derive(Deserialize, Serialize, Debug)] #[serde(deny_unknown_fields)] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub struct ServerConfig { /// Name of the server. This is used to help identify the service /// for telemetry and logs. @@ -555,6 +582,7 @@ pub struct ServerConfig { #[derive(Deserialize, Serialize, Debug)] #[serde(rename_all = "snake_case")] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub enum WorkerProperty { /// List of static values. /// Note: Generally there should only ever be 1 value, but if the platform @@ -570,6 +598,7 @@ pub enum WorkerProperty { /// Generic config for an endpoint and associated configs. 
#[derive(Deserialize, Serialize, Debug, Default)] #[serde(deny_unknown_fields)] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub struct EndpointConfig { /// URI of the endpoint. #[serde(deserialize_with = "convert_string_with_shellexpand")] @@ -585,6 +614,7 @@ pub struct EndpointConfig { #[derive(Copy, Clone, Deserialize, Serialize, Debug, Default)] #[serde(rename_all = "snake_case")] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub enum UploadCacheResultsStrategy { /// Only upload action results with an exit code of 0. #[default] @@ -602,6 +632,7 @@ pub enum UploadCacheResultsStrategy { #[derive(Clone, Deserialize, Serialize, Debug)] #[serde(rename_all = "snake_case")] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub enum EnvironmentSource { /// The name of the platform property in the action to get the value from. Property(String), @@ -609,6 +640,9 @@ pub enum EnvironmentSource { /// The raw value to set. Value(#[serde(deserialize_with = "convert_string_with_shellexpand")] String), + /// Take the value from the local environment corresponding to the name key + FromEnvironment, + /// The max amount of time in milliseconds the command is allowed to run /// (requested by the client). TimeoutMillis, @@ -649,6 +683,7 @@ pub enum EnvironmentSource { #[derive(Deserialize, Serialize, Debug, Default)] #[serde(deny_unknown_fields)] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub struct UploadActionResultConfig { /// Underlying AC store that the worker will use to publish execution results /// into. Objects placed in this store should be reachable from the @@ -709,6 +744,7 @@ pub struct UploadActionResultConfig { #[derive(Deserialize, Serialize, Debug, Default)] #[serde(deny_unknown_fields)] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub struct LocalWorkerConfig { /// Name of the worker. This is give a more friendly name to a worker for logging /// and metric publishing. 
This is also the prefix of the worker id @@ -727,6 +763,20 @@ pub struct LocalWorkerConfig { #[serde(default, deserialize_with = "convert_duration_with_shellexpand")] pub max_action_timeout: usize, + /// Maximum time allowed for uploading action results to CAS after execution + /// completes. If upload takes longer than this, the action fails with + /// `DeadlineExceeded` and may be retried by the scheduler. Value in seconds. + /// + /// Default: 600 (seconds / 10 mins) + #[serde(default, deserialize_with = "convert_duration_with_shellexpand")] + pub max_upload_timeout: usize, + + /// Maximum number of inflight tasks this worker can cope with. + /// + /// Default: 0 (infinite tasks) + #[serde(default, deserialize_with = "convert_numeric_with_shellexpand")] + pub max_inflight_tasks: u64, + /// If timeout is handled in `entrypoint` or another wrapper script. /// If set to true `NativeLink` will not honor the timeout the action requested /// and instead will always force kill the action after `max_action_timeout` @@ -797,10 +847,60 @@ pub struct LocalWorkerConfig { /// of the environment variable being the value of the property of the /// action being executed of that name or the fixed value. pub additional_environment: Option>, + + /// Optional directory cache configuration for improving performance by caching + /// reconstructed input directories and using hardlinks instead of rebuilding + /// them from CAS for every action. 
+ /// Default: None (directory cache disabled) + pub directory_cache: Option, + + #[serde(deserialize_with = "convert_enum_with_shellexpand")] + pub execution_completion_behaviour: ExecutionCompletionBehaviour, +} + +#[derive(Deserialize, Serialize, Debug, Default, Copy, Clone)] +#[serde(rename_all = "snake_case")] +pub enum ExecutionCompletionBehaviour { + #[default] + Default, + OneShotAlways, +} + +#[derive(Deserialize, Serialize, Debug, Clone)] +#[serde(deny_unknown_fields)] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] +pub struct DirectoryCacheConfig { + /// Maximum number of cached directories. + /// Default: 1000 + #[serde(default = "default_directory_cache_max_entries")] + pub max_entries: usize, + + /// Maximum total size in bytes for all cached directories (0 = unlimited). + /// Default: 10737418240 (10 GB) + #[serde( + default = "default_directory_cache_max_size_bytes", + deserialize_with = "convert_data_size_with_shellexpand" + )] + pub max_size_bytes: u64, + + /// Base directory for cache storage. This directory will be managed by + /// the worker and should be on the same filesystem as `work_directory`. + /// Default: `{work_directory}/../directory_cache` + #[serde(default, deserialize_with = "convert_string_with_shellexpand")] + pub cache_root: String, +} + +const fn default_directory_cache_max_entries() -> usize { + 1000 +} + +const fn default_directory_cache_max_size_bytes() -> u64 { + 10 * 1024 * 1024 * 1024 // 10 GB } #[derive(Deserialize, Serialize, Debug)] #[serde(rename_all = "snake_case")] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub enum WorkerConfig { /// A worker type that executes jobs locally on this machine. Local(LocalWorkerConfig), @@ -808,6 +908,7 @@ pub enum WorkerConfig { #[derive(Deserialize, Serialize, Debug, Clone, Copy)] #[serde(deny_unknown_fields)] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub struct GlobalConfig { /// Maximum number of open files that can be opened at one time. 
/// This value is not strictly enforced, it is a best effort. Some internal libraries @@ -843,6 +944,7 @@ pub type SchedulerConfig = NamedConfig; #[derive(Deserialize, Serialize, Debug)] #[serde(deny_unknown_fields)] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub struct CasConfig { /// List of stores available to use in this config. /// The keys can be used in other configs when needing to reference a store. diff --git a/nativelink-config/src/schedulers.rs b/nativelink-config/src/schedulers.rs index 4067ecb8a..c2001817e 100644 --- a/nativelink-config/src/schedulers.rs +++ b/nativelink-config/src/schedulers.rs @@ -14,24 +14,32 @@ use std::collections::HashMap; +#[cfg(feature = "dev-schema")] +use schemars::JsonSchema; use serde::{Deserialize, Serialize}; -use crate::serde_utils::{convert_duration_with_shellexpand, convert_numeric_with_shellexpand}; +use crate::serde_utils::{ + convert_duration_with_shellexpand, convert_duration_with_shellexpand_and_negative, + convert_numeric_with_shellexpand, +}; use crate::stores::{GrpcEndpoint, Retry, StoreRefName}; #[derive(Deserialize, Serialize, Debug)] #[serde(rename_all = "snake_case")] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub enum SchedulerSpec { Simple(SimpleSpec), Grpc(GrpcSpec), CacheLookup(CacheLookupSpec), PropertyModifier(PropertyModifierSpec), + PropertyRouter(PropertyRouterSpec), } /// When the scheduler matches tasks to workers that are capable of running /// the task, this value will be used to determine how the property is treated. 
#[derive(Deserialize, Serialize, Debug, Clone, Copy, Hash, Eq, PartialEq)] #[serde(rename_all = "snake_case")] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub enum PropertyType { /// Requires the platform property to be a u64 and when the scheduler looks /// for appropriate worker nodes that are capable of executing the task, @@ -50,6 +58,10 @@ pub enum PropertyType { /// to cause the scheduler to prefer certain workers over others, but not /// restrict them based on these values. Priority, + + //// Allows jobs to be requested with said key, but without requiring workers + //// to have that key + Ignore, } /// When a worker is being searched for to run a job, this will be used @@ -57,6 +69,7 @@ pub enum PropertyType { /// workers are able to run the task. #[derive(Copy, Clone, Deserialize, Serialize, Debug, Default)] #[serde(rename_all = "snake_case")] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub enum WorkerAllocationStrategy { /// Prefer workers that have been least recently used to run a job. #[default] @@ -65,8 +78,26 @@ pub enum WorkerAllocationStrategy { MostRecentlyUsed, } +// defaults to every 10s +const fn default_worker_match_logging_interval_s() -> i64 { + 10 +} + +/// Default batch interval in milliseconds (100ms). +/// This is the maximum time between batch matching cycles. +const fn default_batch_interval_ms() -> u64 { + 100 +} + +/// Default debounce window in milliseconds (20ms). +/// After a trigger, wait this long to collect more changes before running. 
+const fn default_batch_debounce_ms() -> u64 { + 20 +} + #[derive(Deserialize, Serialize, Debug, Default)] #[serde(deny_unknown_fields)] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub struct SimpleSpec { /// A list of supported platform properties mapped to how these properties /// are used when the scheduler looks for worker nodes capable of running @@ -112,6 +143,16 @@ pub struct SimpleSpec { #[serde(default, deserialize_with = "convert_duration_with_shellexpand")] pub worker_timeout_s: u64, + /// Maximum time (seconds) an action can stay in Executing state without + /// any worker update before being timed out and re-queued. + /// This applies regardless of worker keepalive status, catching cases + /// where a worker is alive (sending keepalives) but stuck on a specific + /// action. Set to 0 to disable (relies only on `worker_timeout_s`). + /// + /// Default: 0 (disabled) + #[serde(default, deserialize_with = "convert_duration_with_shellexpand")] + pub max_action_executing_timeout_s: u64, + /// If a job returns an internal error or times out this many times when /// attempting to run on a worker the scheduler will return the last error /// to the client. Jobs will be retried and this configuration is to help @@ -129,10 +170,52 @@ pub struct SimpleSpec { /// The storage backend to use for the scheduler. /// Default: memory pub experimental_backend: Option, + + /// Every N seconds, do logging of worker matching + /// e.g. "worker busy", "can't find any worker" + /// Defaults to 10s. Can be set to -1 to disable + #[serde( + default = "default_worker_match_logging_interval_s", + deserialize_with = "convert_duration_with_shellexpand_and_negative" + )] + pub worker_match_logging_interval_s: i64, + + /// Enable batch worker matching optimization. + /// When enabled, the scheduler will collect queued actions and match them + /// to workers in a single batch operation, reducing lock contention. 
+ /// This can significantly improve throughput when there are many queued + /// actions and workers. + /// Default: false + #[serde(default)] + pub enable_batch_worker_matching: bool, + + /// Maximum interval between batch matching cycles (milliseconds). + /// Even without triggers, matching runs at least this often. + /// Only used when `enable_batch_worker_matching` is true. + /// Default: 100ms + #[serde( + default = "default_batch_interval_ms", + deserialize_with = "convert_numeric_with_shellexpand" + )] + pub batch_interval_ms: u64, + + /// Debounce window after first trigger (milliseconds). + /// When a task or worker change notification is received, wait this long + /// to collect additional changes before running batch match. + /// This improves batching efficiency under bursty load. + /// 0 = immediate (no debounce). + /// Only used when `enable_batch_worker_matching` is true. + /// Default: 20ms + #[serde( + default = "default_batch_debounce_ms", + deserialize_with = "convert_numeric_with_shellexpand" + )] + pub batch_debounce_ms: u64, } #[derive(Deserialize, Serialize, Debug)] #[serde(rename_all = "snake_case")] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub enum ExperimentalSimpleSchedulerBackend { /// Use an in-memory store for the scheduler. Memory, @@ -142,6 +225,7 @@ pub enum ExperimentalSimpleSchedulerBackend { #[derive(Deserialize, Serialize, Debug, Default)] #[serde(deny_unknown_fields)] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub struct ExperimentalRedisSchedulerBackend { /// A reference to the redis store to use for the scheduler. /// Note: This MUST resolve to a `RedisSpec`. @@ -154,6 +238,7 @@ pub struct ExperimentalRedisSchedulerBackend { /// build at the main scheduler directly though. #[derive(Deserialize, Serialize, Debug)] #[serde(deny_unknown_fields)] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub struct GrpcSpec { /// The upstream scheduler to forward requests to. 
pub endpoint: GrpcEndpoint, @@ -165,17 +250,18 @@ pub struct GrpcSpec { /// Limit the number of simultaneous upstream requests to this many. A /// value of zero is treated as unlimited. If the limit is reached the /// request is queued. - #[serde(default)] + #[serde(default, deserialize_with = "convert_numeric_with_shellexpand")] pub max_concurrent_requests: usize, /// The number of connections to make to each specified endpoint to balance /// the load over multiple TCP connections. Default 1. - #[serde(default)] + #[serde(default, deserialize_with = "convert_numeric_with_shellexpand")] pub connections_per_endpoint: usize, } #[derive(Deserialize, Serialize, Debug)] #[serde(deny_unknown_fields)] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub struct CacheLookupSpec { /// The reference to the action cache store used to return cached /// actions from rather than running them again. @@ -188,6 +274,7 @@ pub struct CacheLookupSpec { #[derive(Deserialize, Serialize, Debug, Clone)] #[serde(deny_unknown_fields)] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub struct PlatformPropertyAddition { /// The name of the property to add. pub name: String, @@ -197,6 +284,7 @@ pub struct PlatformPropertyAddition { #[derive(Deserialize, Serialize, Debug, Clone)] #[serde(deny_unknown_fields)] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub struct PlatformPropertyReplacement { /// The name of the property to replace. pub name: String, @@ -212,6 +300,7 @@ pub struct PlatformPropertyReplacement { #[derive(Deserialize, Serialize, Debug, Clone)] #[serde(rename_all = "snake_case")] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub enum PropertyModification { /// Add a property to the action properties. 
Add(PlatformPropertyAddition), @@ -223,6 +312,7 @@ #[derive(Deserialize, Serialize, Debug)] #[serde(deny_unknown_fields)] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub struct PropertyModifierSpec { /// A list of modifications to perform to incoming actions for the nested /// scheduler. These are performed in order and blindly, so removing a @@ -234,3 +324,20 @@ /// The nested scheduler to use after modifying the properties. pub scheduler: Box<SchedulerSpec>, } + +/// Routes actions to different schedulers based on a platform property value. +/// Actions whose property value matches a key in `routes` go to that scheduler. +/// All other actions (missing property or unmatched value) go to `default_scheduler`. +#[derive(Deserialize, Serialize, Debug)] +#[serde(deny_unknown_fields)] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] +pub struct PropertyRouterSpec { + /// The platform property key to match on (e.g. "container-image"). + pub property_name: String, + + /// Map of property value -> nested scheduler spec. + pub routes: HashMap<String, SchedulerSpec>, + + /// Scheduler to use when the property is absent or its value does not match any route.
+ pub default_scheduler: Box<SchedulerSpec>, +} diff --git a/nativelink-config/src/serde_utils.rs index d66b4b9d1..5ebdf0e11 100644 --- a/nativelink-config/src/serde_utils.rs +++ b/nativelink-config/src/serde_utils.rs @@ -18,7 +18,7 @@ use std::fmt; use byte_unit::Byte; use humantime::parse_duration; -use serde::de::Visitor; +use serde::de::{DeserializeOwned, Visitor}; use serde::{Deserialize, Deserializer, de}; /// Helper for serde macro so you can use shellexpand variables in the json configuration @@ -152,6 +152,43 @@ pub fn convert_string_with_shellexpand<'de, D: Deserializer<'de>>( Ok((*(shellexpand::env(&value).map_err(de::Error::custom)?)).to_string()) } +pub fn convert_boolean_with_shellexpand<'de, D, T>(deserializer: D) -> Result<T, D::Error> +where + D: Deserializer<'de>, + T: TryFrom<bool>, + <T as TryFrom<bool>>::Error: fmt::Display, +{ + struct BooleanExpandVisitor<T: TryFrom<bool>>(PhantomData<T>); + + impl<T> Visitor<'_> for BooleanExpandVisitor<T> + where + T: TryFrom<bool>, + <T as TryFrom<bool>>::Error: fmt::Display, + { + type Value = T; + + fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { + formatter.write_str("a boolean or a shell-expandable string that is a boolean") + } + + fn visit_bool<E: de::Error>(self, v: bool) -> Result<Self::Value, E> { + T::try_from(v).map_err(de::Error::custom) + } + + fn visit_str<E: de::Error>(self, v: &str) -> Result<Self::Value, E> { + if v.is_empty() { + return Err(de::Error::custom("empty string is not a valid number")); + } + let expanded = shellexpand::env(v).map_err(de::Error::custom)?; + let s = expanded.as_ref().trim().to_lowercase(); + let parsed = s.parse::<bool>().map_err(de::Error::custom)?; + T::try_from(parsed).map_err(de::Error::custom) + } + } + + deserializer.deserialize_any(BooleanExpandVisitor::<T>(PhantomData)) +} + /// Same as `convert_string_with_shellexpand`, but supports `Vec<String>`. /// /// # Errors @@ -249,6 +286,86 @@ where deserializer.deserialize_any(DataSizeVisitor::<T>(PhantomData)) }
+pub fn convert_optional_data_size_with_shellexpand<'de, D, T>( + deserializer: D, +) -> Result<Option<T>, D::Error> +where + D: Deserializer<'de>, + T: TryFrom<u128>, + <T as TryFrom<u128>>::Error: fmt::Display, +{ + struct DataSizeVisitor<T: TryFrom<u128>>(PhantomData<T>); + + impl<'de, T> Visitor<'de> for DataSizeVisitor<T> + where + T: TryFrom<u128>, + <T as TryFrom<u128>>::Error: fmt::Display, + { + type Value = Option<T>; + + fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { + formatter.write_str("an optional number of bytes as an integer, or a string with a data size format (e.g., \"1GB\", \"500MB\", \"1.5TB\")") + } + + fn visit_none<E: de::Error>(self) -> Result<Self::Value, E> { + Ok(None) + } + + fn visit_unit<E: de::Error>(self) -> Result<Self::Value, E> { + Ok(None) + } + + fn visit_some<D2: Deserializer<'de>>( + self, + deserializer: D2, + ) -> Result<Self::Value, D2::Error> { + deserializer.deserialize_any(self) + } + + fn visit_u64<E: de::Error>(self, v: u64) -> Result<Self::Value, E> { + T::try_from(u128::from(v)) + .map(Some) + .map_err(de::Error::custom) + } + + fn visit_i64<E: de::Error>(self, v: i64) -> Result<Self::Value, E> { + if v < 0 { + return Err(de::Error::custom("Negative data size is not allowed")); + } + let v_u128 = u128::try_from(v).map_err(de::Error::custom)?; + T::try_from(v_u128).map(Some).map_err(de::Error::custom) + } + + fn visit_u128<E: de::Error>(self, v: u128) -> Result<Self::Value, E> { + T::try_from(v).map(Some).map_err(de::Error::custom) + } + + fn visit_i128<E: de::Error>(self, v: i128) -> Result<Self::Value, E> { + if v < 0 { + return Err(de::Error::custom("Negative data size is not allowed")); + } + let v_u128 = u128::try_from(v).map_err(de::Error::custom)?; + T::try_from(v_u128).map(Some).map_err(de::Error::custom) + } + + fn visit_str<E: de::Error>(self, v: &str) -> Result<Self::Value, E> { + let expanded = shellexpand::env(v).map_err(de::Error::custom)?; + let s = expanded.as_ref().trim(); + if v.is_empty() { + return Err(de::Error::custom("Missing value in a size field")); + } + let byte_size = Byte::parse_str(s, true).map_err(de::Error::custom)?; + let bytes = byte_size.as_u128(); + T::try_from(bytes).map(Some).map_err(de::Error::custom) + } + } + + deserializer.deserialize_option(DataSizeVisitor::<T>(PhantomData)) +} + /// # 
Errors /// /// Will return `Err` if deserialization fails. @@ -280,12 +397,12 @@ where return Err(de::Error::custom("Negative duration is not allowed")); } let v_u64 = u64::try_from(v).map_err(de::Error::custom)?; - T::try_from(v_u64).map_err(de::Error::custom) + self.visit_u64(v_u64) } fn visit_u128(self, v: u128) -> Result { let v_u64 = u64::try_from(v).map_err(de::Error::custom)?; - T::try_from(v_u64).map_err(de::Error::custom) + self.visit_u64(v_u64) } fn visit_i128(self, v: i128) -> Result { @@ -293,7 +410,7 @@ where return Err(de::Error::custom("Negative duration is not allowed")); } let v_u64 = u64::try_from(v).map_err(de::Error::custom)?; - T::try_from(v_u64).map_err(de::Error::custom) + self.visit_u64(v_u64) } fn visit_str(self, v: &str) -> Result { @@ -301,9 +418,78 @@ where let expanded = expanded.as_ref().trim(); let duration = parse_duration(expanded).map_err(de::Error::custom)?; let secs = duration.as_secs(); - T::try_from(secs).map_err(de::Error::custom) + self.visit_u64(secs) } } deserializer.deserialize_any(DurationVisitor::(PhantomData)) } + +/// # Errors +/// +/// Will return `Err` if deserialization fails. 
+pub fn convert_duration_with_shellexpand_and_negative<'de, D, T>( + deserializer: D, +) -> Result<T, D::Error> +where + D: Deserializer<'de>, + T: TryFrom<i64>, + <T as TryFrom<i64>>::Error: fmt::Display, +{ + struct DurationVisitor<T: TryFrom<i64>>(PhantomData<T>); + + impl<T> Visitor<'_> for DurationVisitor<T> + where + T: TryFrom<i64>, + <T as TryFrom<i64>>::Error: fmt::Display, + { + type Value = T; + + fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { + formatter.write_str("either a number of seconds as an integer, or a string with a duration format (e.g., \"1h2m3s\", \"30m\", \"1d\")") + } + + fn visit_u64<E: de::Error>(self, v: u64) -> Result<Self::Value, E> { + let v_i64 = i64::try_from(v).map_err(de::Error::custom)?; + self.visit_i64(v_i64) + } + + fn visit_i64<E: de::Error>(self, v: i64) -> Result<Self::Value, E> { + T::try_from(v).map_err(de::Error::custom) + } + + fn visit_u128<E: de::Error>(self, v: u128) -> Result<Self::Value, E> { + let v_i64 = i64::try_from(v).map_err(de::Error::custom)?; + self.visit_i64(v_i64) + } + + fn visit_i128<E: de::Error>(self, v: i128) -> Result<Self::Value, E> { + let v_i64 = i64::try_from(v).map_err(de::Error::custom)?; + self.visit_i64(v_i64) + } + + fn visit_str<E: de::Error>(self, v: &str) -> Result<Self::Value, E> { + let expanded = shellexpand::env(v).map_err(de::Error::custom)?; + let expanded = expanded.as_ref().trim(); + let duration = parse_duration(expanded).map_err(de::Error::custom)?; + let secs = duration.as_secs(); + self.visit_u64(secs) + } + } + + deserializer.deserialize_any(DurationVisitor::<T>(PhantomData)) +} + +pub fn convert_enum_with_shellexpand<'de, D, T>(deserializer: D) -> Result<T, D::Error> +where + D: Deserializer<'de>, + T: DeserializeOwned, +{ + let s = String::deserialize(deserializer)?; + let expanded = shellexpand::env(&s) + .map_err(de::Error::custom)?; + + let quoted = format!("\"{}\"", expanded); + serde_json5::from_str(&quoted) + .map_err(de::Error::custom) +} diff --git a/nativelink-config/src/stores.rs index 8a3f8e209..3a48e403e 100644 --- a/nativelink-config/src/stores.rs +++ b/nativelink-config/src/stores.rs @@ -16,11 +16,14 @@ use core::time::Duration; use std::sync::Arc;
use rand::Rng; +#[cfg(feature = "dev-schema")] +use schemars::JsonSchema; use serde::{Deserialize, Serialize}; use crate::serde_utils::{ - convert_data_size_with_shellexpand, convert_duration_with_shellexpand, - convert_numeric_with_shellexpand, convert_optional_numeric_with_shellexpand, + convert_boolean_with_shellexpand, convert_data_size_with_shellexpand, + convert_duration_with_shellexpand, convert_numeric_with_shellexpand, + convert_optional_data_size_with_shellexpand, convert_optional_numeric_with_shellexpand, convert_optional_string_with_shellexpand, convert_string_with_shellexpand, convert_vec_string_with_shellexpand, }; @@ -31,6 +34,7 @@ pub type StoreRefName = String; #[derive(Serialize, Deserialize, Debug, Clone, Copy)] #[serde(rename_all = "snake_case")] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub enum ConfigDigestHashFunction { /// Use the sha256 hash function. /// @@ -43,6 +47,7 @@ pub enum ConfigDigestHashFunction { #[derive(Serialize, Deserialize, Debug, Clone)] #[serde(rename_all = "snake_case")] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub enum StoreSpec { /// Memory store will store all data in a hashmap in memory. /// @@ -472,6 +477,8 @@ pub enum StoreSpec { /// "endpoints": [ /// {"address": "grpc://${CAS_ENDPOINT:-127.0.0.1}:50051"} /// ], + /// "connections_per_endpoint": "5", + /// "rpc_timeout_s": "5m", /// "store_type": "ac" /// } /// ``` @@ -489,7 +496,8 @@ pub enum StoreSpec { /// "redis_store": { /// "addresses": [ /// "redis://127.0.0.1:6379/", - /// ] + /// ], + /// "max_client_permits": 1000, /// } /// ``` /// @@ -532,6 +540,7 @@ pub enum StoreSpec { /// Configuration for an individual shard of the store. #[derive(Serialize, Deserialize, Debug, Clone)] #[serde(deny_unknown_fields)] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub struct ShardConfig { /// Store to shard the data to. 
pub store: StoreSpec, @@ -541,11 +550,13 @@ pub struct ShardConfig { /// all the store's weights divided by the individual store's weight. /// /// Default: 1 + #[serde(deserialize_with = "convert_optional_numeric_with_shellexpand")] pub weight: Option, } #[derive(Serialize, Deserialize, Debug, Clone)] #[serde(deny_unknown_fields)] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub struct ShardSpec { /// Stores to shard the data to. pub stores: Vec, @@ -553,6 +564,7 @@ pub struct ShardSpec { #[derive(Serialize, Deserialize, Debug, Clone)] #[serde(deny_unknown_fields)] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub struct SizePartitioningSpec { /// Size to partition the data on. #[serde(deserialize_with = "convert_data_size_with_shellexpand")] @@ -567,6 +579,7 @@ pub struct SizePartitioningSpec { #[derive(Serialize, Deserialize, Debug, Default, Clone)] #[serde(deny_unknown_fields)] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub struct RefSpec { /// Name of the store under the root "stores" config object. #[serde(deserialize_with = "convert_string_with_shellexpand")] @@ -575,6 +588,7 @@ pub struct RefSpec { #[derive(Serialize, Deserialize, Debug, Default, Clone)] #[serde(deny_unknown_fields)] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub struct FilesystemSpec { /// Path on the system where to store the actual content. This is where /// the bulk of the data will be placed. @@ -609,11 +623,22 @@ pub struct FilesystemSpec { /// Default: 4096 #[serde(default, deserialize_with = "convert_data_size_with_shellexpand")] pub block_size: u64, + + /// Maximum number of concurrent write operations allowed. + /// Each write involves streaming data to a temp file and calling `sync_all()`, + /// which can saturate disk I/O when many writes happen simultaneously. + /// Limiting concurrency prevents disk saturation from blocking the async + /// runtime. + /// A value of 0 means unlimited (no concurrency limit). 
+ /// Default: 0 + #[serde(default, deserialize_with = "convert_numeric_with_shellexpand")] + pub max_concurrent_writes: usize, } // NetApp ONTAP S3 Spec #[derive(Serialize, Deserialize, Debug, Default, Clone)] #[serde(deny_unknown_fields)] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub struct ExperimentalOntapS3Spec { #[serde(deserialize_with = "convert_string_with_shellexpand")] pub endpoint: String, @@ -621,7 +646,7 @@ pub struct ExperimentalOntapS3Spec { pub vserver_name: String, #[serde(deserialize_with = "convert_string_with_shellexpand")] pub bucket: String, - #[serde(default)] + #[serde(default, deserialize_with = "convert_optional_string_with_shellexpand")] pub root_certificates: Option, /// Common retry and upload configuration @@ -631,6 +656,7 @@ pub struct ExperimentalOntapS3Spec { #[derive(Serialize, Deserialize, Debug, Clone)] #[serde(deny_unknown_fields)] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub struct OntapS3ExistenceCacheSpec { #[serde(deserialize_with = "convert_string_with_shellexpand")] pub index_path: String, @@ -639,20 +665,53 @@ pub struct OntapS3ExistenceCacheSpec { pub backend: Box, } +#[derive(Serialize, Deserialize, Default, Debug, Clone, Copy, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] +pub enum StoreDirection { + /// The store operates normally and all get and put operations are + /// handled by it. + #[default] + Both, + /// Update operations will cause persistence to this store, but Get + /// operations will be ignored. + /// This only makes sense on the fast store as the slow store will + /// never get written to on Get anyway. + Update, + /// Get operations will cause persistence to this store, but Update + /// operations will be ignored. + Get, + /// Operate as a read only store, only really makes sense if there's + /// another way to write to it. 
+ ReadOnly, +} + #[derive(Serialize, Deserialize, Debug, Clone)] #[serde(deny_unknown_fields)] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub struct FastSlowSpec { /// Fast store that will be attempted to be contacted before reaching /// out to the `slow` store. pub fast: StoreSpec, + /// How to handle the fast store. This can be useful to set to Get for + /// worker nodes such that results are persisted to the slow store only. + #[serde(default)] + pub fast_direction: StoreDirection, + /// If the object does not exist in the `fast` store it will try to /// get it from this store. pub slow: StoreSpec, + + /// How to handle the slow store. This can be useful if creating a diode + /// and you wish to have an upstream read only store. + #[serde(default)] + pub slow_direction: StoreDirection, } #[derive(Serialize, Deserialize, Debug, Default, Clone, Copy)] #[serde(deny_unknown_fields)] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub struct MemorySpec { /// Policy used to evict items out of the store. Failure to set this /// value will cause items to never be removed from the store causing @@ -662,6 +721,7 @@ pub struct MemorySpec { #[derive(Serialize, Deserialize, Debug, Clone)] #[serde(deny_unknown_fields)] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub struct DedupSpec { /// Store used to store the index of each dedup slice. This store /// should generally be fast and small. @@ -717,6 +777,7 @@ pub struct DedupSpec { #[derive(Serialize, Deserialize, Debug, Clone)] #[serde(deny_unknown_fields)] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub struct ExistenceCacheSpec { /// The underlying store wrap around. All content will first flow /// through self before forwarding to backend. 
In the event there @@ -733,6 +794,7 @@ pub struct ExistenceCacheSpec { #[derive(Serialize, Deserialize, Debug, Clone)] #[serde(deny_unknown_fields)] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub struct VerifySpec { /// The underlying store wrap around. All content will first flow /// through self before forwarding to backend. In the event there @@ -745,7 +807,7 @@ pub struct VerifySpec { /// an upload of data. /// /// This should be set to false for AC, but true for CAS stores. - #[serde(default)] + #[serde(default, deserialize_with = "convert_boolean_with_shellexpand")] pub verify_size: bool, /// If the data should be hashed and verify that the key matches the @@ -753,12 +815,13 @@ pub struct VerifySpec { /// request and if not set will use the global default. /// /// This should be set to false for AC, but true for CAS stores. - #[serde(default)] + #[serde(default, deserialize_with = "convert_boolean_with_shellexpand")] pub verify_hash: bool, } #[derive(Serialize, Deserialize, Debug, Clone)] #[serde(deny_unknown_fields)] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub struct CompletenessCheckingSpec { /// The underlying store that will have it's results validated before sending to client. pub backend: StoreSpec, @@ -770,6 +833,7 @@ pub struct CompletenessCheckingSpec { #[derive(Serialize, Deserialize, Debug, Default, PartialEq, Eq, Clone, Copy)] #[serde(deny_unknown_fields)] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub struct Lz4Config { /// Size of the blocks to compress. 
/// Higher values require more ram, but might yield slightly better @@ -793,6 +857,7 @@ pub struct Lz4Config { #[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Clone, Copy)] #[serde(rename_all = "snake_case")] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub enum CompressionAlgorithm { /// LZ4 compression algorithm is extremely fast for compression and /// decompression, however does not perform very well in compression @@ -806,6 +871,7 @@ pub enum CompressionAlgorithm { #[derive(Serialize, Deserialize, Debug, Clone)] #[serde(deny_unknown_fields)] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub struct CompressionSpec { /// The underlying store wrap around. All content will first flow /// through self before forwarding to backend. In the event there @@ -824,6 +890,7 @@ pub struct CompressionSpec { /// until the store size becomes smaller than `max_bytes`. #[derive(Serialize, Deserialize, Debug, Default, Clone, Copy)] #[serde(deny_unknown_fields)] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub struct EvictionPolicy { /// Maximum number of bytes before eviction takes place. /// Default: 0. Zero means never evict based on size. @@ -851,6 +918,7 @@ pub struct EvictionPolicy { #[derive(Serialize, Deserialize, Debug, Clone)] #[serde(tag = "provider", rename_all = "snake_case")] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub enum ExperimentalCloudObjectSpec { Aws(ExperimentalAwsSpec), Gcs(ExperimentalGcsSpec), @@ -865,6 +933,7 @@ impl Default for ExperimentalCloudObjectSpec { #[derive(Serialize, Deserialize, Debug, Default, Clone)] #[serde(deny_unknown_fields)] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub struct ExperimentalAwsSpec { /// S3 region. Usually us-east-1, us-west-2, af-south-1, exc... 
#[serde(default, deserialize_with = "convert_string_with_shellexpand")] @@ -881,6 +950,7 @@ pub struct ExperimentalAwsSpec { #[derive(Serialize, Deserialize, Debug, Default, Clone)] #[serde(deny_unknown_fields)] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub struct ExperimentalGcsSpec { /// Bucket name to use as the backend. #[serde(default, deserialize_with = "convert_string_with_shellexpand")] @@ -889,6 +959,10 @@ pub struct ExperimentalGcsSpec { /// Chunk size for resumable uploads. /// /// Default: 2MB + #[serde( + default, + deserialize_with = "convert_optional_data_size_with_shellexpand" + )] pub resumable_chunk_size: Option, /// Common retry and upload configuration @@ -896,11 +970,22 @@ pub struct ExperimentalGcsSpec { pub common: CommonObjectSpec, /// Error if authentication was not found. - #[serde(default)] + #[serde(default, deserialize_with = "convert_boolean_with_shellexpand")] pub authentication_required: bool, + + /// Connection timeout in milliseconds. + /// Default: 3000 + #[serde(default, deserialize_with = "convert_duration_with_shellexpand")] + pub connection_timeout_s: u64, + + /// Read timeout in milliseconds. + /// Default: 3000 + #[serde(default, deserialize_with = "convert_duration_with_shellexpand")] + pub read_timeout_s: u64, } #[derive(Serialize, Deserialize, Debug, Default, Clone)] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub struct CommonObjectSpec { /// If you wish to prefix the location in the bucket. If None, no prefix will be used. #[serde(default)] @@ -930,17 +1015,26 @@ pub struct CommonObjectSpec { /// upload will be aborted and the client will likely receive an error. /// /// Default: 5MB. + #[serde( + default, + deserialize_with = "convert_optional_data_size_with_shellexpand" + )] pub max_retry_buffer_per_request: Option, /// Maximum number of concurrent `UploadPart` requests per `MultipartUpload`. /// /// Default: 10. 
+ /// + #[serde( + default, + deserialize_with = "convert_optional_numeric_with_shellexpand" + )] pub multipart_max_concurrent_uploads: Option, /// Allow unencrypted HTTP connections. Only use this for local testing. /// /// Default: false - #[serde(default)] + #[serde(default, deserialize_with = "convert_boolean_with_shellexpand")] pub insecure_allow_http: bool, /// Disable http/2 connections and only use http/1.1. Default client @@ -950,12 +1044,13 @@ pub struct CommonObjectSpec { /// underlying network environment, S3, or GCS API servers specify otherwise. /// /// Default: false - #[serde(default)] + #[serde(default, deserialize_with = "convert_boolean_with_shellexpand")] pub disable_http2: bool, } #[derive(Serialize, Deserialize, Debug, Clone, Copy)] #[serde(rename_all = "snake_case")] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub enum StoreType { /// The store is content addressable storage. Cas, @@ -964,6 +1059,7 @@ pub enum StoreType { } #[derive(Serialize, Deserialize, Debug, Clone)] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub struct ClientTlsConfig { /// Path to the certificate authority to use to validate the remote. /// @@ -992,6 +1088,7 @@ pub struct ClientTlsConfig { #[derive(Serialize, Deserialize, Debug, Clone)] #[serde(deny_unknown_fields)] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub struct GrpcEndpoint { /// The endpoint address (i.e. grpc(s)://example.com:443). #[serde(deserialize_with = "convert_string_with_shellexpand")] @@ -999,11 +1096,39 @@ pub struct GrpcEndpoint { /// The TLS configuration to use to connect to the endpoint (if grpcs). pub tls_config: Option, /// The maximum concurrency to allow on this endpoint. + #[serde( + default, + deserialize_with = "convert_optional_numeric_with_shellexpand" + )] pub concurrency_limit: Option, + + /// Timeout for establishing a TCP connection to the endpoint (seconds). + /// If not set or 0, defaults to 30 seconds. 
+ #[serde(default, deserialize_with = "convert_duration_with_shellexpand")] + pub connect_timeout_s: u64, + + /// TCP keepalive interval (seconds). Sends TCP keepalive probes at this + /// interval to detect dead connections at the OS level. + /// If not set or 0, defaults to 30 seconds. + #[serde(default, deserialize_with = "convert_duration_with_shellexpand")] + pub tcp_keepalive_s: u64, + + /// HTTP/2 keepalive interval (seconds). Sends HTTP/2 PING frames at this + /// interval to detect dead connections at the application level. + /// If not set or 0, defaults to 30 seconds. + #[serde(default, deserialize_with = "convert_duration_with_shellexpand")] + pub http2_keepalive_interval_s: u64, + + /// HTTP/2 keepalive timeout (seconds). If a PING response is not received + /// within this duration, the connection is considered dead. + /// If not set or 0, defaults to 20 seconds. + #[serde(default, deserialize_with = "convert_duration_with_shellexpand")] + pub http2_keepalive_timeout_s: u64, } #[derive(Serialize, Deserialize, Debug, Clone)] #[serde(deny_unknown_fields)] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub struct GrpcSpec { /// Instance name for GRPC calls. Proxy calls will have the `instance_name` changed to this. #[serde(default, deserialize_with = "convert_string_with_shellexpand")] @@ -1022,17 +1147,32 @@ pub struct GrpcSpec { /// Limit the number of simultaneous upstream requests to this many. A /// value of zero is treated as unlimited. If the limit is reached the /// request is queued. - #[serde(default)] + #[serde(default, deserialize_with = "convert_numeric_with_shellexpand")] pub max_concurrent_requests: usize, /// The number of connections to make to each specified endpoint to balance /// the load over multiple TCP connections. Default 1. 
- #[serde(default)] + #[serde(default, deserialize_with = "convert_numeric_with_shellexpand")] pub connections_per_endpoint: usize, + + /// Maximum time (seconds) allowed for a single RPC request (e.g. a + /// ByteStream.Write call) before it is cancelled. + /// + /// A value of 0 (the default) disables the per-RPC timeout. Dead + /// connections are still detected by the HTTP/2 and TCP keepalive + /// mechanisms configured on each endpoint. + /// + /// For large uploads (multi-GB), either leave this at 0 or set it + /// large enough to accommodate the full transfer time. + /// + /// Default: 0 (disabled) + #[serde(default, deserialize_with = "convert_duration_with_shellexpand")] + pub rpc_timeout_s: u64, } /// The possible error codes that might occur on an upstream request. #[derive(Serialize, Deserialize, Clone, Copy, Debug, PartialEq, Eq)] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub enum ErrorCode { Cancelled = 1, Unknown = 2, @@ -1054,6 +1194,7 @@ pub enum ErrorCode { } #[derive(Serialize, Deserialize, Debug, Clone, Default)] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub struct RedisSpec { /// The hostname or IP address of the Redis server. /// Ex: `["redis://username:password@redis-server-url:6380/99"]` @@ -1061,12 +1202,15 @@ pub struct RedisSpec { #[serde(deserialize_with = "convert_vec_string_with_shellexpand")] pub addresses: Vec, + /// DEPRECATED: use `command_timeout_ms` /// The response timeout for the Redis connection in seconds. /// /// Default: 10 #[serde(default)] pub response_timeout_s: u64, + /// DEPRECATED: use `connection_timeout_ms` + /// /// The connection timeout for the Redis connection in seconds. /// /// Default: 10 @@ -1090,7 +1234,7 @@ pub struct RedisSpec { /// organize your data according to the shared prefix. 
/// /// Default: (Empty String / No Prefix) - #[serde(default)] + #[serde(default, deserialize_with = "convert_string_with_shellexpand")] pub key_prefix: String, /// Set the mode Redis is operating in. @@ -1104,10 +1248,7 @@ pub struct RedisSpec { #[serde(default)] pub mode: RedisMode, - /// When using pubsub interface, this is the maximum number of items to keep - /// queued up before dropping old items. - /// - /// Default: 4096 + /// DEPRECATED: unused; the redis-rs client does not use this value. #[serde(default, deserialize_with = "convert_numeric_with_shellexpand")] pub broadcast_channel_capacity: usize, @@ -1162,7 +1303,7 @@ pub struct RedisSpec { /// /// Default: 10000 #[serde(default, deserialize_with = "convert_numeric_with_shellexpand")] - pub scan_count: u32, + pub scan_count: usize, /// Retry configuration to use when a network request fails. /// See the `Retry` struct for more information. @@ -1177,10 +1318,23 @@ pub struct RedisSpec { /// ``` #[serde(default)] pub retry: Retry, + + /// Maximum number of actions permitted against the Redis store at any one time. + /// This prevents timeouts caused by an excessive number of inflight actions. + /// Default: 500 + #[serde(default, deserialize_with = "convert_numeric_with_shellexpand")] + pub max_client_permits: usize, + + /// Maximum number of items returned per cursor for the search indexes. + /// This may reduce thundering-herd issues with the worker provisioner at higher node counts. + /// Default: 1500 + #[serde(default, deserialize_with = "convert_numeric_with_shellexpand")] + pub max_count_per_cursor: u64, } #[derive(Debug, Default, Deserialize, Serialize, Clone, Copy, PartialEq, Eq)] #[serde(rename_all = "snake_case")] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub enum RedisMode { Cluster, Sentinel, @@ -1189,6 +1343,7 @@ pub enum RedisMode { } #[derive(Clone, Copy, Debug, Default, Deserialize, Serialize)] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub struct NoopSpec {} /// Retry configuration.
This configuration is exponential and each iteration @@ -1214,6 +1369,7 @@ pub struct NoopSpec {} /// would mean a single request would have a total delay of 9.525s - 15.875s. #[derive(Serialize, Deserialize, Clone, Debug, Default)] #[serde(deny_unknown_fields)] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub struct Retry { /// Maximum number of retries until retrying stops. /// Setting this to zero will always attempt 1 time, but not retry. @@ -1253,6 +1409,7 @@ pub struct Retry { /// Configuration for `ExperimentalMongoDB` store. #[derive(Serialize, Deserialize, Debug, Clone)] #[serde(deny_unknown_fields)] +#[cfg_attr(feature = "dev-schema", derive(JsonSchema))] pub struct ExperimentalMongoSpec { /// `ExperimentalMongoDB` connection string. /// Example: or @@ -1302,7 +1459,7 @@ pub struct ExperimentalMongoSpec { /// Enable `MongoDB` change streams for real-time updates. /// Required for scheduler subscriptions. /// Default: false - #[serde(default)] + #[serde(default, deserialize_with = "convert_boolean_with_shellexpand")] pub enable_change_streams: bool, /// Write concern 'w' parameter. diff --git a/nativelink-config/tests/deserialization_test.rs b/nativelink-config/tests/deserialization_test.rs index e19e8f1b6..6ee384d33 100644 --- a/nativelink-config/tests/deserialization_test.rs +++ b/nativelink-config/tests/deserialization_test.rs @@ -13,7 +13,8 @@ // limitations under the License. 
use nativelink_config::serde_utils::{ - convert_data_size_with_shellexpand, convert_duration_with_shellexpand, + convert_boolean_with_shellexpand, convert_data_size_with_shellexpand, + convert_duration_with_shellexpand, convert_optional_data_size_with_shellexpand, convert_optional_numeric_with_shellexpand, convert_optional_string_with_shellexpand, }; use serde::Deserialize; @@ -30,6 +31,15 @@ struct DataSizeEntity { data_size: usize, } +#[derive(Deserialize, Debug)] +struct OptionalDataSizeEntity { + #[serde( + default, + deserialize_with = "convert_optional_data_size_with_shellexpand" + )] + data_size: Option, +} + #[derive(Deserialize, Debug)] struct OptionalNumericEntity { #[serde( @@ -45,6 +55,12 @@ struct OptionalStringEntity { value: Option, } +#[derive(Deserialize, Debug)] +struct BoolEntity { + #[serde(default, deserialize_with = "convert_boolean_with_shellexpand")] + value: bool, +} + mod duration_tests { use pretty_assertions::assert_eq; @@ -289,6 +305,21 @@ mod optional_values_tests { } } + #[test] + fn test_optional_datasize_values() { + let examples = [ + (r#"{"data_size": null}"#, None), + (r#"{"data_size": 42}"#, Some(42)), + (r"{}", None), // Missing field + (r#"{"data_size": "20K"}"#, Some(20000)), + ]; + + for (input, expected) in examples { + let deserialized: OptionalDataSizeEntity = serde_json5::from_str(input).unwrap(); + assert_eq!(deserialized.data_size, expected); + } + } + #[test] fn test_mixed_optional_values() { #[derive(Deserialize)] @@ -331,8 +362,34 @@ mod optional_values_tests { } } +mod boolean_tests { + use crate::BoolEntity; + + #[test] + fn test_bool_parsing() { + let examples = [ + // Standard value + (r#"{"value": true}"#, true), + (r#"{"value": false}"#, false), + // Stringy values + (r#"{"value": "true"}"#, true), + (r#"{"value": "false"}"#, false), + // Stringy values with odd cases + (r#"{"value": "TRue"}"#, true), + (r#"{"value": "faLSE"}"#, false), + ]; + + for (input, expected) in examples { + let deserialized: 
BoolEntity = + serde_json5::from_str(input).unwrap_or_else(|_| panic!("Failed on '{input}'")); + assert_eq!(deserialized.value, expected, "{input}"); + } + } +} + mod shellexpand_tests { use pretty_assertions::assert_eq; + use serde_json5::Location; use super::*; @@ -347,6 +404,8 @@ mod shellexpand_tests { std::env::set_var("TEST_NUMBER", "42"); std::env::set_var("TEST_VAR", "test_value"); std::env::set_var("EMPTY_VAR", ""); + std::env::set_var("TEST_GOOD_BOOL", "true"); + std::env::set_var("TEST_BAD_BOOL", "wibble"); }; // Test duration with environment variable @@ -359,6 +418,11 @@ mod shellexpand_tests { serde_json5::from_str::(r#"{"data_size": "${TEST_SIZE}"}"#).unwrap(); assert_eq!(size_result.data_size, 1_000_000_000); + let size_result = + serde_json5::from_str::(r#"{"data_size": "${TEST_SIZE}"}"#) + .unwrap(); + assert_eq!(size_result.data_size, Some(1_000_000_000)); + // Test optional numeric with environment variable let numeric_result = serde_json5::from_str::(r#"{"value": "${TEST_NUMBER}"}"#) @@ -384,5 +448,22 @@ mod shellexpand_tests { .to_string() .contains("environment variable not found") ); + + let good_bool_results = + serde_json5::from_str::(r#"{"value": "${TEST_GOOD_BOOL}"}"#).unwrap(); + assert!(good_bool_results.value); + + let bad_bool_results = + serde_json5::from_str::(r#"{"value": "${TEST_BAD_BOOL}"}"#).unwrap_err(); + assert_eq!( + bad_bool_results, + serde_json5::Error::Message { + msg: "provided string was not `true` or `false`".into(), + location: Some(Location { + line: 1, + column: 11 + }) + } + ); } } diff --git a/nativelink-error/BUILD.bazel b/nativelink-error/BUILD.bazel index 5b3cb0c4d..1a0af2534 100644 --- a/nativelink-error/BUILD.bazel +++ b/nativelink-error/BUILD.bazel @@ -15,13 +15,17 @@ rust_library( deps = [ "//nativelink-metric", "//nativelink-proto", - "@crates//:fred", "@crates//:prost", "@crates//:prost-types", + "@crates//:redis", + "@crates//:rustls-pki-types", "@crates//:serde", "@crates//:serde_json5", 
"@crates//:tokio", "@crates//:tonic", + "@crates//:url", + "@crates//:uuid", + "@crates//:walkdir", ], ) @@ -33,10 +37,10 @@ rust_test( "//nativelink-metric", "//nativelink-proto", "@crates//:async-lock", - "@crates//:fred", "@crates//:hex", "@crates//:prost", "@crates//:prost-types", + "@crates//:redis", "@crates//:serde", "@crates//:tokio", "@crates//:tonic", diff --git a/nativelink-error/Cargo.toml b/nativelink-error/Cargo.toml index ee3869c94..935374990 100644 --- a/nativelink-error/Cargo.toml +++ b/nativelink-error/Cargo.toml @@ -1,3 +1,4 @@ +#:schema ../tools/cargo-with-detailed-deps.json lints.workspace = true [package] @@ -7,17 +8,16 @@ autoexamples = false autotests = false edition = "2024" name = "nativelink-error" -version = "0.7.3" +version = "1.0.0-rc4" [dependencies] nativelink-metric = { path = "../nativelink-metric" } nativelink-proto = { path = "../nativelink-proto" } -fred = { version = "10.1.0", default-features = false, features = [ - "enable-rustls-ring", -] } prost = { version = "0.13.5", default-features = false } prost-types = { version = "0.13.5", default-features = false } +redis = { version = "1.0.0", default-features = false } +rustls-pki-types = { version = "1.13.1", default-features = false } serde = { version = "1.0.219", default-features = false } serde_json5 = { version = "0.2.1", default-features = false } tokio = { version = "1.44.1", features = [ @@ -30,3 +30,6 @@ tonic = { version = "0.13.0", features = [ "tls-ring", "transport", ], default-features = false } +url = { version = "2.5.7", default-features = false } +uuid = { version = "1.16.0", default-features = false } +walkdir = { version = "2.5.0", default-features = false } diff --git a/nativelink-error/src/lib.rs b/nativelink-error/src/lib.rs index d5bacf268..04df9e64a 100644 --- a/nativelink-error/src/lib.rs +++ b/nativelink-error/src/lib.rs @@ -13,6 +13,7 @@ // limitations under the License. 
use core::convert::Into; +use core::str::Utf8Error; use std::sync::{MutexGuard, PoisonError}; use nativelink_metric::{ @@ -20,6 +21,7 @@ use nativelink_metric::{ }; use prost_types::TimestampError; use serde::{Deserialize, Serialize}; +use tokio::sync::AcquireError; // Reexport of tonic's error codes which we use as "nativelink_error::Code". pub use tonic::Code; @@ -233,6 +235,18 @@ impl From for Error { } } +impl From for Error { + fn from(err: AcquireError) -> Self { + make_err!(Code::Internal, "{}", err) + } +} + +impl From for Error { + fn from(err: Utf8Error) -> Self { + make_err!(Code::Internal, "{}", err) + } +} + impl From for Error { fn from(err: std::io::Error) -> Self { Self { @@ -242,26 +256,29 @@ impl From for Error { } } -impl From for Error { - fn from(error: fred::error::Error) -> Self { - use fred::error::ErrorKind::{ - Auth, Backpressure, Canceled, Cluster, Config, IO, InvalidArgument, InvalidCommand, - NotFound, Parse, Protocol, Routing, Sentinel, Timeout, Tls, Unknown, Url, +impl From for Error { + fn from(error: redis::RedisError) -> Self { + use redis::ErrorKind::{ + AuthenticationFailed, InvalidClientConfig, Io as IoError, Parse as ParseError, + UnexpectedReturnType, }; // Conversions here are based on https://grpc.github.io/grpc/core/md_doc_statuscodes.html. 
let code = match error.kind() { - Config | InvalidCommand | InvalidArgument | Url => Code::InvalidArgument, - IO | Protocol | Tls | Cluster | Parse | Sentinel | Routing => Code::Internal, - Auth => Code::PermissionDenied, - Canceled => Code::Aborted, - Unknown => Code::Unknown, - Timeout => Code::DeadlineExceeded, - NotFound => Code::NotFound, - Backpressure => Code::Unavailable, + AuthenticationFailed => Code::PermissionDenied, + ParseError | UnexpectedReturnType | InvalidClientConfig => Code::InvalidArgument, + IoError => { + if error.is_timeout() { + Code::DeadlineExceeded + } else { + Code::Internal + } + } + _ => Code::Unknown, }; - make_err!(code, "{error}") + let kind = error.kind(); + make_err!(code, "{kind:?}: {error}") } } @@ -277,6 +294,36 @@ impl From for tonic::Status { } } +impl From for Error { + fn from(value: walkdir::Error) -> Self { + Self::new(Code::Internal, value.to_string()) + } +} + +impl From for Error { + fn from(value: uuid::Error) -> Self { + Self::new(Code::Internal, value.to_string()) + } +} + +impl From for Error { + fn from(value: rustls_pki_types::pem::Error) -> Self { + Self::new(Code::Internal, value.to_string()) + } +} + +impl From for Error { + fn from(value: tokio::time::error::Elapsed) -> Self { + Self::new(Code::DeadlineExceeded, value.to_string()) + } +} + +impl From for Error { + fn from(value: url::ParseError) -> Self { + Self::new(Code::Internal, value.to_string()) + } +} + pub trait ResultExt { /// # Errors /// diff --git a/nativelink-macro/Cargo.toml b/nativelink-macro/Cargo.toml index ff810dbb8..e7daad698 100644 --- a/nativelink-macro/Cargo.toml +++ b/nativelink-macro/Cargo.toml @@ -1,9 +1,10 @@ +#:schema ../tools/cargo-with-detailed-deps.json lints.workspace = true [package] edition = "2024" name = "nativelink-macro" -version = "0.7.3" +version = "1.0.0-rc4" [lib] proc-macro = true diff --git a/nativelink-macro/src/lib.rs b/nativelink-macro/src/lib.rs index 3e90c77b5..a42f4893f 100644 --- a/nativelink-macro/src/lib.rs 
+++ b/nativelink-macro/src/lib.rs @@ -13,19 +13,58 @@ // limitations under the License. use proc_macro::TokenStream; -use quote::quote; +use proc_macro2::TokenTree; +use quote::{format_ident, quote}; use syn::{ItemFn, parse_macro_input}; +// Helper function for debugging. Add prettyplease as dependency +// +// fn unparse(input: proc_macro2::TokenStream) -> String { +// let item = syn::parse2(input).unwrap(); +// let file = syn::File { +// attrs: vec![], +// items: vec![item], +// shebang: None, +// }; + +// prettyplease::unparse(&file) +// } + +// Either use this as-is or as `#[nativelink_test("foo")]` where foo is the path for nativelink-util +// Mostly used inside nativelink-util as `#[nativelink_test("crate")]` +// If you start it with an ident instead, e.g. `#[nativelink_test(flavor = "multi_thread")]` we feed it into tokio::test #[proc_macro_attribute] pub fn nativelink_test(attr: TokenStream, item: TokenStream) -> TokenStream { let attr = proc_macro2::TokenStream::from(attr); let input_fn = parse_macro_input!(item as ItemFn); + let mut maybe_crate_ident: Option = None; + let mut maybe_tokio_attrs: Option = None; + + for a in attr.clone() { + assert!(maybe_crate_ident.is_none()); + + match a { + TokenTree::Literal(l) => { + let s = format_ident!("{}", l.to_string().replace('"', "")); + maybe_crate_ident = Some(quote! {#s}); + } + TokenTree::Ident(_) => { + maybe_tokio_attrs = Some(attr); + break; + } + _ => { + panic!("unsupported tokentree: {a:?}"); + } + } + } let fn_name = &input_fn.sig.ident; let fn_block = &input_fn.block; let fn_inputs = &input_fn.sig.inputs; let fn_output = &input_fn.sig.output; let fn_attr = &input_fn.attrs; + let crate_ident = maybe_crate_ident.unwrap_or_else(|| quote!(::nativelink_util)); + let tokio_attrs = maybe_tokio_attrs.unwrap_or_else(|| quote!()); let expanded = quote! 
{ #(#fn_attr)* @@ -33,12 +72,12 @@ pub fn nativelink_test(attr: TokenStream, item: TokenStream) -> TokenStream { clippy::disallowed_methods, reason = "`tokio::test` uses `tokio::runtime::Runtime::block_on`" )] - #[tokio::test(#attr)] + #[tokio::test(#tokio_attrs)] #[::tracing_test::traced_test] async fn #fn_name(#fn_inputs) #fn_output { - ::nativelink_util::__tracing::error_span!(stringify!(#fn_name)) + #crate_ident::__tracing::error_span!(stringify!(#fn_name)) .in_scope(|| async move { - ::nativelink_util::common::reseed_rng_for_test().unwrap(); + #crate_ident::common::reseed_rng_for_test().unwrap(); let res = #fn_block; logs_assert(|lines: &[&str]| { for line in lines { diff --git a/nativelink-metric/Cargo.toml b/nativelink-metric/Cargo.toml index bfdffc861..a807af3ef 100644 --- a/nativelink-metric/Cargo.toml +++ b/nativelink-metric/Cargo.toml @@ -1,15 +1,16 @@ +#:schema ../tools/cargo-with-detailed-deps.json lints.workspace = true [package] edition = "2024" name = "nativelink-metric" -version = "0.7.3" +version = "1.0.0-rc4" [dependencies] nativelink-metric-macro-derive = { path = "nativelink-metric-macro-derive" } async-lock = { version = "3.4.0", features = ["std"], default-features = false } -parking_lot = "0.12.3" +parking_lot = { version = "0.12.3", default-features = false } tokio = { version = "1.44.1", features = [ "fs", "io-util", diff --git a/nativelink-metric/nativelink-metric-macro-derive/Cargo.toml b/nativelink-metric/nativelink-metric-macro-derive/Cargo.toml index ff8e0583e..271d4167a 100644 --- a/nativelink-metric/nativelink-metric-macro-derive/Cargo.toml +++ b/nativelink-metric/nativelink-metric-macro-derive/Cargo.toml @@ -1,7 +1,7 @@ [package] edition = "2024" name = "nativelink-metric-macro-derive" -version = "0.6.0" +version = "0.8.0" [lib] proc-macro = true diff --git a/nativelink-metric/src/lib.rs b/nativelink-metric/src/lib.rs index 5661f14b0..b885262dd 100644 --- a/nativelink-metric/src/lib.rs +++ b/nativelink-metric/src/lib.rs @@ -458,6 
+458,18 @@ impl MetricsComponent for async_lock::Mutex { } } +impl MetricsComponent for async_lock::RwLock { + fn publish( + &self, + kind: MetricKind, + field_metadata: MetricFieldData, + ) -> Result { + // It is safe to block in the publishing thread. + let lock = self.read_blocking(); + lock.publish(kind, field_metadata) + } +} + impl MetricsComponent for parking_lot::Mutex { fn publish( &self, diff --git a/nativelink-proto/BUILD.bazel b/nativelink-proto/BUILD.bazel index 5221cb83e..e6395afe3 100644 --- a/nativelink-proto/BUILD.bazel +++ b/nativelink-proto/BUILD.bazel @@ -148,12 +148,10 @@ genrule( rust_library( name = "nativelink-proto", srcs = glob(["genproto/*.rs"]), - proc_macro_deps = [ - "@crates//:derivative", - ], tags = ["no-rustfmt"], visibility = ["//visibility:public"], deps = [ + "@crates//:derive_more", "@crates//:prost", "@crates//:prost-types", "@crates//:tonic", diff --git a/nativelink-proto/Cargo.toml b/nativelink-proto/Cargo.toml index 39e7c6046..9779d6034 100644 --- a/nativelink-proto/Cargo.toml +++ b/nativelink-proto/Cargo.toml @@ -1,21 +1,31 @@ +#:schema ../tools/cargo-with-detailed-deps.json [package] -name = "nativelink-proto" -version = "0.7.3" edition = "2024" +name = "nativelink-proto" +version = "1.0.0-rc4" [lib] name = "nativelink_proto" path = "genproto/lib.rs" [dependencies] -derivative = { version="2.2.0", default-features = false } +derive_more = { version = "2.0.1", default-features = false, features = [ + "debug", +] } prost = { version = "0.13.5", default-features = false } prost-types = { version = "0.13.5", default-features = false } -tonic = { version = "0.13.0", features = ["codegen", "prost", "transport", "tls-ring"], default-features = false } +tonic = { version = "0.13.0", features = [ + "codegen", + "prost", + "tls-ring", + "transport", +], default-features = false } [dev-dependencies] prost-build = { version = "0.13.5", default-features = false } -tonic-build = { version = "0.13.0", features = ["prost"], 
default-features = false } +tonic-build = { version = "0.13.0", features = [ + "prost", +], default-features = false } [package.metadata.cargo-machete] # Used by gen_protos_tool.rs diff --git a/nativelink-proto/com/github/trace_machina/nativelink/remote_execution/worker_api.proto b/nativelink-proto/com/github/trace_machina/nativelink/remote_execution/worker_api.proto index 15d82b668..d736d1624 100644 --- a/nativelink-proto/com/github/trace_machina/nativelink/remote_execution/worker_api.proto +++ b/nativelink-proto/com/github/trace_machina/nativelink/remote_execution/worker_api.proto @@ -25,52 +25,30 @@ import "google/rpc/status.proto"; /// /// When a worker node comes online it must be pre-configured with the /// endpoint of the scheduler it will register with. Once the worker -/// connects to the scheduler it must send a `RegisterSupportedProperties` +/// connects to the scheduler it must send a `ConnectWorkerRequest` /// command to the scheduler. The scheduler will then use this information /// to determine which jobs the worker can process. service WorkerApi { /// Registers this worker and informs the scheduler what properties /// this worker supports. The response must be listened on the client - /// side for updates from the server. The first item sent will always be - /// a ConnectionResult, after that it is undefined. - rpc ConnectWorker(ConnectWorkerRequest) returns (stream UpdateForWorker); - - /// Message used to let the scheduler know that it is still alive as - /// well as check to see if the scheduler is still alive. The scheduler - /// may close the connection if the worker has not sent any messages - /// after some amount of time (configured in the scheduler's - /// configuration). - rpc KeepAlive(KeepAliveRequest) returns (google.protobuf.Empty); - - /// Informs the scheduler that the service is going offline and - /// should stop issuing any new actions on this worker. 
- /// - /// The worker may stay connected even after sending this command - /// and may even send an `ExecuteResult` after sending this command. - /// It is up to the scheduler implementation to decide how to handle - /// this case. - /// - /// Any job that was running on this instance likely needs to be - /// executed again, but up to the scheduler on how or when to handle - /// this case. - rpc GoingAway(GoingAwayRequest) returns (google.protobuf.Empty); - - /// Informs the scheduler about the result of an execution request. - rpc ExecutionResponse(ExecuteResult) returns (google.protobuf.Empty); + /// side for updates from the server. This is performed as a single + /// bi-directional call to ensure that the worker is always talking to the + /// same scheduler instance even if there's a load balancer in front. + /// The first message on the UpdateForScheduler stream will be a + /// ConnectWorkerRequest which will notify the scheduler of the available + /// properties and the first response will be a ConnectionResult to tell + /// the worker what worker ID to place in action results. + rpc ConnectWorker(stream UpdateForScheduler) returns (stream UpdateForWorker); } /// Request object for keep alive requests. message KeepAliveRequest { - /// ID of the worker making the request. - string worker_id = 1; - reserved 2; // NextId. + reserved 1; // NextId. } /// Request object for going away requests. message GoingAwayRequest { - /// ID of the worker making the request. - string worker_id = 1; - reserved 2; // NextId. + reserved 1; // NextId. } /// Represents the initial request sent to the scheduler informing the @@ -93,40 +71,48 @@ message ConnectWorkerRequest { /// append this prefix to the assigned worker_id followed by a UUIDv6. string worker_id_prefix = 2; - reserved 3; // NextId. + /// Maximum number of inflight tasks this worker can cope with at one time + /// The default (0) means unlimited. + uint64 max_inflight_tasks = 3; + + reserved 4; // NextId. 
} /// The result of an ExecutionRequest. message ExecuteResult { - /// ID of the worker making the request. - string worker_id = 1; - /// The `instance_name` this task was initially assigned to. This is set by the client /// that initially sent the job as part of the BRE protocol. - string instance_name = 6; + string instance_name = 1; /// The operation ID that was executed. - string operation_id = 8; + string operation_id = 2; /// The actual response data. oneof result { /// Result of the execution. See `build.bazel.remote.execution.v2.ExecuteResponse` /// for details. - build.bazel.remote.execution.v2.ExecuteResponse execute_response = 4; + build.bazel.remote.execution.v2.ExecuteResponse execute_response = 3; /// An internal error. This is only present when an internal error happened that /// was not recoverable. If the execution job failed but at no fault of the worker /// it should not use this field and should send the error via execute_response. - google.rpc.Status internal_error = 5; + google.rpc.Status internal_error = 4; } - reserved 9; // NextId. + reserved 5; // NextId. +} + +/// Notification that an execution has completed; its result may still be uploading. +message ExecuteComplete { + /// The operation ID that was executed. + string operation_id = 1; } /// Result sent back from the server when a node connects. message ConnectionResult { - /// The internal ID given to the newly connected node. + /// The worker ID to place in the action results generated by this worker. string worker_id = 1; + reserved 2; // NextId. } @@ -164,6 +150,42 @@ message UpdateForWorker { reserved 6; // NextId. } +/// Communication from the worker to the scheduler. +message UpdateForScheduler { + oneof update { + /// The initial request sent to the scheduler informing it of the + /// supported properties of this worker. + ConnectWorkerRequest connect_worker_request = 1; + + /// Message used to let the scheduler know that it is still alive as + /// well as check to see if the scheduler is still alive.
The scheduler + /// may close the connection if the worker has not sent any messages + /// after some amount of time (configured in the scheduler's + /// configuration). + KeepAliveRequest keep_alive_request = 2; + + /// Informs the scheduler that the service is going offline and + /// should stop issuing any new actions on this worker. + /// + /// The worker may stay connected even after sending this command + /// and may even send an `ExecuteResult` after sending this command. + /// It is up to the scheduler implementation to decide how to handle + /// this case. + /// + /// Any job that was running on this instance likely needs to be + /// executed again, but up to the scheduler on how or when to handle + /// this case. + GoingAwayRequest going_away_request = 3; + + /// Informs the scheduler about the result of an execution request. + ExecuteResult execute_result = 4; + + /// Notify that the execution has completed, but result is uploading. + ExecuteComplete execute_complete = 5; + } + reserved 6; // NextId. +} + message StartExecute { /// The action information used to execute job. 
build.bazel.remote.execution.v2.ExecuteRequest execute_request = 1; diff --git a/nativelink-proto/gen_lib_rs_tool.py b/nativelink-proto/gen_lib_rs_tool.py index 64a488f08..73558ed9d 100644 --- a/nativelink-proto/gen_lib_rs_tool.py +++ b/nativelink-proto/gen_lib_rs_tool.py @@ -50,6 +50,8 @@ clippy::missing_const_for_fn, clippy::similar_names, clippy::std_instead_of_core, + clippy::use_self, + rustdoc::broken_intra_doc_links, rustdoc::invalid_html_tags )] """ diff --git a/nativelink-proto/gen_protos_tool.rs b/nativelink-proto/gen_protos_tool.rs index 6691b6629..87fa502db 100644 --- a/nativelink-proto/gen_protos_tool.rs +++ b/nativelink-proto/gen_protos_tool.rs @@ -1,3 +1,4 @@ +use std::collections::HashMap; use std::path::PathBuf; use clap::{Arg, ArgAction, Command}; @@ -29,23 +30,21 @@ fn main() -> std::io::Result<()> { let mut config = Config::new(); config.bytes(["."]); - let structs_with_data_to_ignore = [ - "BatchReadBlobsResponse.Response", - "BatchUpdateBlobsRequest.Request", - "ReadResponse", - "WriteRequest", - ]; + let mut structs_with_data_to_ignore = HashMap::new(); + structs_with_data_to_ignore.insert("BatchReadBlobsResponse.Response", vec!["data"]); + structs_with_data_to_ignore.insert("BatchUpdateBlobsRequest.Request", vec!["data"]); + structs_with_data_to_ignore.insert("ReadResponse", vec!["data"]); + structs_with_data_to_ignore.insert("WriteRequest", vec!["data"]); + structs_with_data_to_ignore.insert("ActionResult", vec!["output_files"]); - for struct_name in structs_with_data_to_ignore { - config.type_attribute(struct_name, "#[derive(::derivative::Derivative)]"); - config.type_attribute(struct_name, "#[derivative(Debug)]"); - config.field_attribute( - format!("{struct_name}.data"), - "#[derivative(Debug=\"ignore\")]", - ); + for (struct_name, fields) in &structs_with_data_to_ignore { + config.type_attribute(struct_name, "#[derive(::derive_more::Debug)]"); + for field in fields { + config.field_attribute(format!("{struct_name}.{field}"), 
"#[debug(ignore)]"); + } } - config.skip_debug(structs_with_data_to_ignore); + config.skip_debug(structs_with_data_to_ignore.keys()); tonic_build::configure() .out_dir(output_dir) diff --git a/nativelink-proto/genproto/build.bazel.remote.execution.v2.pb.rs b/nativelink-proto/genproto/build.bazel.remote.execution.v2.pb.rs index 19c014dd9..f6e831311 100644 --- a/nativelink-proto/genproto/build.bazel.remote.execution.v2.pb.rs +++ b/nativelink-proto/genproto/build.bazel.remote.execution.v2.pb.rs @@ -600,7 +600,9 @@ pub struct ExecutedActionMetadata { /// `ActionResult.execution_metadata.Worker`) have a non-default value, to /// ensure that the serialized value is non-empty, which can then be used /// as a basic data sanity check. +#[derive(::derive_more::Debug)] #[derive(Clone, PartialEq, ::prost::Message)] +#[prost(skip_debug)] pub struct ActionResult { /// The output files of the action. For each output file requested in the /// `output_files` or `output_paths` field of the Action, if the corresponding @@ -614,6 +616,7 @@ pub struct ActionResult { /// will be omitted from the list. The server is free to arrange the output /// list as desired; clients MUST NOT assume that the output list is sorted. #[prost(message, repeated, tag = "2")] + #[debug(ignore)] pub output_files: ::prost::alloc::vec::Vec, /// The output files of the action that are symbolic links to other files. Those /// may be links to other output files, or input files, or even absolute paths @@ -1269,8 +1272,7 @@ pub struct BatchUpdateBlobsRequest { /// Nested message and enum types in `BatchUpdateBlobsRequest`. pub mod batch_update_blobs_request { /// A request corresponding to a single blob that the client wants to upload. 
- #[derive(::derivative::Derivative)] - #[derivative(Debug)] + #[derive(::derive_more::Debug)] #[derive(Clone, PartialEq, ::prost::Message)] #[prost(skip_debug)] pub struct Request { @@ -1280,7 +1282,7 @@ pub mod batch_update_blobs_request { pub digest: ::core::option::Option, /// The raw binary data. #[prost(bytes = "bytes", tag = "2")] - #[derivative(Debug = "ignore")] + #[debug(ignore)] pub data: ::prost::bytes::Bytes, /// The format of `data`. Must be `IDENTITY`/unspecified, or one of the /// compressors advertised by the @@ -1353,8 +1355,7 @@ pub struct BatchReadBlobsResponse { /// Nested message and enum types in `BatchReadBlobsResponse`. pub mod batch_read_blobs_response { /// A response corresponding to a single blob that the client tried to download. - #[derive(::derivative::Derivative)] - #[derivative(Debug)] + #[derive(::derive_more::Debug)] #[derive(Clone, PartialEq, ::prost::Message)] #[prost(skip_debug)] pub struct Response { @@ -1363,7 +1364,7 @@ pub mod batch_read_blobs_response { pub digest: ::core::option::Option, /// The raw binary data. #[prost(bytes = "bytes", tag = "2")] - #[derivative(Debug = "ignore")] + #[debug(ignore)] pub data: ::prost::bytes::Bytes, /// The format the data is encoded in. MUST be `IDENTITY`/unspecified, /// or one of the acceptable compressors specified in the `BatchReadBlobsRequest`. diff --git a/nativelink-proto/genproto/com.github.trace_machina.nativelink.remote_execution.pb.rs b/nativelink-proto/genproto/com.github.trace_machina.nativelink.remote_execution.pb.rs index 559e66109..c4a53f73f 100644 --- a/nativelink-proto/genproto/com.github.trace_machina.nativelink.remote_execution.pb.rs +++ b/nativelink-proto/genproto/com.github.trace_machina.nativelink.remote_execution.pb.rs @@ -14,19 +14,11 @@ // This file is @generated by prost-build. /// / Request object for keep alive requests. -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct KeepAliveRequest { - /// / ID of the worker making the request. 
- #[prost(string, tag = "1")] - pub worker_id: ::prost::alloc::string::String, -} +#[derive(Clone, Copy, PartialEq, ::prost::Message)] +pub struct KeepAliveRequest {} /// / Request object for going away requests. -#[derive(Clone, PartialEq, ::prost::Message)] -pub struct GoingAwayRequest { - /// / ID of the worker making the request. - #[prost(string, tag = "1")] - pub worker_id: ::prost::alloc::string::String, -} +#[derive(Clone, Copy, PartialEq, ::prost::Message)] +pub struct GoingAwayRequest {} /// / Represents the initial request sent to the scheduler informing the /// / scheduler about this worker's capabilities and metadata. #[derive(Clone, PartialEq, ::prost::Message)] @@ -50,22 +42,23 @@ pub struct ConnectWorkerRequest { /// / append this prefix to the assigned worker_id followed by a UUIDv6. #[prost(string, tag = "2")] pub worker_id_prefix: ::prost::alloc::string::String, + /// / Maximum number of inflight tasks this worker can cope with at one time + /// / The default (0) means unlimited. + #[prost(uint64, tag = "3")] + pub max_inflight_tasks: u64, } /// / The result of an ExecutionRequest. #[derive(Clone, PartialEq, ::prost::Message)] pub struct ExecuteResult { - /// / ID of the worker making the request. - #[prost(string, tag = "1")] - pub worker_id: ::prost::alloc::string::String, /// / The `instance_name` this task was initially assigned to. This is set by the client /// / that initially sent the job as part of the BRE protocol. - #[prost(string, tag = "6")] + #[prost(string, tag = "1")] pub instance_name: ::prost::alloc::string::String, /// / The operation ID that was executed. - #[prost(string, tag = "8")] + #[prost(string, tag = "2")] pub operation_id: ::prost::alloc::string::String, /// / The actual response data. - #[prost(oneof = "execute_result::Result", tags = "4, 5")] + #[prost(oneof = "execute_result::Result", tags = "3, 4")] pub result: ::core::option::Option, } /// Nested message and enum types in `ExecuteResult`. 
@@ -75,21 +68,28 @@ pub mod execute_result { pub enum Result { /// / Result of the execution. See `build.bazel.remote.execution.v2.ExecuteResponse` /// / for details. - #[prost(message, tag = "4")] + #[prost(message, tag = "3")] ExecuteResponse( super::super::super::super::super::super::build::bazel::remote::execution::v2::ExecuteResponse, ), /// / An internal error. This is only present when an internal error happened that /// / was not recoverable. If the execution job failed but at no fault of the worker /// / it should not use this field and should send the error via execute_response. - #[prost(message, tag = "5")] + #[prost(message, tag = "4")] InternalError(super::super::super::super::super::super::google::rpc::Status), } } +/// / The result of an ExecutionComplete. +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct ExecuteComplete { + /// / The operation ID that was executed. + #[prost(string, tag = "1")] + pub operation_id: ::prost::alloc::string::String, +} /// / Result sent back from the server when a node connects. #[derive(Clone, PartialEq, ::prost::Message)] pub struct ConnectionResult { - /// / The internal ID given to the newly connected node. + /// / The worker ID to place in the action results generated by this worker. #[prost(string, tag = "1")] pub worker_id: ::prost::alloc::string::String, } @@ -134,6 +134,48 @@ pub mod update_for_worker { KillOperationRequest(super::KillOperationRequest), } } +/// / Communication from the worker to the scheduler. +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct UpdateForScheduler { + #[prost(oneof = "update_for_scheduler::Update", tags = "1, 2, 3, 4, 5")] + pub update: ::core::option::Option, +} +/// Nested message and enum types in `UpdateForScheduler`. +pub mod update_for_scheduler { + #[derive(Clone, PartialEq, ::prost::Oneof)] + pub enum Update { + /// / The initial request sent to the scheduler informing it of the + /// / supported properties of this worker. 
+ #[prost(message, tag = "1")] + ConnectWorkerRequest(super::ConnectWorkerRequest), + /// / Message used to let the scheduler know that it is still alive as + /// / well as check to see if the scheduler is still alive. The scheduler + /// / may close the connection if the worker has not sent any messages + /// / after some amount of time (configured in the scheduler's + /// / configuration). + #[prost(message, tag = "2")] + KeepAliveRequest(super::KeepAliveRequest), + /// / Informs the scheduler that the service is going offline and + /// / should stop issuing any new actions on this worker. + /// / + /// / The worker may stay connected even after sending this command + /// / and may even send an `ExecuteResult` after sending this command. + /// / It is up to the scheduler implementation to decide how to handle + /// / this case. + /// / + /// / Any job that was running on this instance likely needs to be + /// / executed again, but up to the scheduler on how or when to handle + /// / this case. + #[prost(message, tag = "3")] + GoingAwayRequest(super::GoingAwayRequest), + /// / Informs the scheduler about the result of an execution request. + #[prost(message, tag = "4")] + ExecuteResult(super::ExecuteResult), + /// / Notify that the execution has completed, but result is uploading. + #[prost(message, tag = "5")] + ExecuteComplete(super::ExecuteComplete), + } +} #[derive(Clone, PartialEq, ::prost::Message)] pub struct StartExecute { /// / The action information used to execute job. @@ -188,7 +230,7 @@ pub mod worker_api_client { /// / /// / When a worker node comes online it must be pre-configured with the /// / endpoint of the scheduler it will register with. Once the worker - /// / connects to the scheduler it must send a `RegisterSupportedProperties` + /// / connects to the scheduler it must send a `ConnectWorkerRequest` /// / command to the scheduler. The scheduler will then use this information /// / to determine which jobs the worker can process. 
#[derive(Debug, Clone)] @@ -262,11 +304,18 @@ pub mod worker_api_client { } /// / Registers this worker and informs the scheduler what properties /// / this worker supports. The response must be listened on the client - /// / side for updates from the server. The first item sent will always be - /// / a ConnectionResult, after that it is undefined. + /// / side for updates from the server. This is performed as a single + /// / bi-directional call to ensure that the worker is always talking to the + /// / same scheduler instance even if there's a load balancer in front. + /// / The first message on the UpdateForScheduler stream will be a + /// / ConnectWorkerRequest which will notify the scheduler of the available + /// / properties and the first response will be a ConnectionResult to tell + /// / the worker what worker ID to place in action results. pub async fn connect_worker( &mut self, - request: impl tonic::IntoRequest, + request: impl tonic::IntoStreamingRequest< + Message = super::UpdateForScheduler, + >, ) -> std::result::Result< tonic::Response>, tonic::Status, @@ -283,7 +332,7 @@ pub mod worker_api_client { let path = http::uri::PathAndQuery::from_static( "/com.github.trace_machina.nativelink.remote_execution.WorkerApi/ConnectWorker", ); - let mut req = request.into_request(); + let mut req = request.into_streaming_request(); req.extensions_mut() .insert( GrpcMethod::new( @@ -291,102 +340,7 @@ pub mod worker_api_client { "ConnectWorker", ), ); - self.inner.server_streaming(req, path, codec).await - } - /// / Message used to let the scheduler know that it is still alive as - /// / well as check to see if the scheduler is still alive. The scheduler - /// / may close the connection if the worker has not sent any messages - /// / after some amount of time (configured in the scheduler's - /// / configuration). 
- pub async fn keep_alive( - &mut self, - request: impl tonic::IntoRequest, - ) -> std::result::Result, tonic::Status> { - self.inner - .ready() - .await - .map_err(|e| { - tonic::Status::unknown( - format!("Service was not ready: {}", e.into()), - ) - })?; - let codec = tonic::codec::ProstCodec::default(); - let path = http::uri::PathAndQuery::from_static( - "/com.github.trace_machina.nativelink.remote_execution.WorkerApi/KeepAlive", - ); - let mut req = request.into_request(); - req.extensions_mut() - .insert( - GrpcMethod::new( - "com.github.trace_machina.nativelink.remote_execution.WorkerApi", - "KeepAlive", - ), - ); - self.inner.unary(req, path, codec).await - } - /// / Informs the scheduler that the service is going offline and - /// / should stop issuing any new actions on this worker. - /// / - /// / The worker may stay connected even after sending this command - /// / and may even send an `ExecuteResult` after sending this command. - /// / It is up to the scheduler implementation to decide how to handle - /// / this case. - /// / - /// / Any job that was running on this instance likely needs to be - /// / executed again, but up to the scheduler on how or when to handle - /// / this case. - pub async fn going_away( - &mut self, - request: impl tonic::IntoRequest, - ) -> std::result::Result, tonic::Status> { - self.inner - .ready() - .await - .map_err(|e| { - tonic::Status::unknown( - format!("Service was not ready: {}", e.into()), - ) - })?; - let codec = tonic::codec::ProstCodec::default(); - let path = http::uri::PathAndQuery::from_static( - "/com.github.trace_machina.nativelink.remote_execution.WorkerApi/GoingAway", - ); - let mut req = request.into_request(); - req.extensions_mut() - .insert( - GrpcMethod::new( - "com.github.trace_machina.nativelink.remote_execution.WorkerApi", - "GoingAway", - ), - ); - self.inner.unary(req, path, codec).await - } - /// / Informs the scheduler about the result of an execution request. 
- pub async fn execution_response( - &mut self, - request: impl tonic::IntoRequest, - ) -> std::result::Result, tonic::Status> { - self.inner - .ready() - .await - .map_err(|e| { - tonic::Status::unknown( - format!("Service was not ready: {}", e.into()), - ) - })?; - let codec = tonic::codec::ProstCodec::default(); - let path = http::uri::PathAndQuery::from_static( - "/com.github.trace_machina.nativelink.remote_execution.WorkerApi/ExecutionResponse", - ); - let mut req = request.into_request(); - req.extensions_mut() - .insert( - GrpcMethod::new( - "com.github.trace_machina.nativelink.remote_execution.WorkerApi", - "ExecutionResponse", - ), - ); - self.inner.unary(req, path, codec).await + self.inner.streaming(req, path, codec).await } } } @@ -411,50 +365,26 @@ pub mod worker_api_server { + 'static; /// / Registers this worker and informs the scheduler what properties /// / this worker supports. The response must be listened on the client - /// / side for updates from the server. The first item sent will always be - /// / a ConnectionResult, after that it is undefined. + /// / side for updates from the server. This is performed as a single + /// / bi-directional call to ensure that the worker is always talking to the + /// / same scheduler instance even if there's a load balancer in front. + /// / The first message on the UpdateForScheduler stream will be a + /// / ConnectWorkerRequest which will notify the scheduler of the available + /// / properties and the first response will be a ConnectionResult to tell + /// / the worker what worker ID to place in action results. async fn connect_worker( &self, - request: tonic::Request, + request: tonic::Request>, ) -> std::result::Result< tonic::Response, tonic::Status, >; - /// / Message used to let the scheduler know that it is still alive as - /// / well as check to see if the scheduler is still alive. 
The scheduler - /// / may close the connection if the worker has not sent any messages - /// / after some amount of time (configured in the scheduler's - /// / configuration). - async fn keep_alive( - &self, - request: tonic::Request, - ) -> std::result::Result, tonic::Status>; - /// / Informs the scheduler that the service is going offline and - /// / should stop issuing any new actions on this worker. - /// / - /// / The worker may stay connected even after sending this command - /// / and may even send an `ExecuteResult` after sending this command. - /// / It is up to the scheduler implementation to decide how to handle - /// / this case. - /// / - /// / Any job that was running on this instance likely needs to be - /// / executed again, but up to the scheduler on how or when to handle - /// / this case. - async fn going_away( - &self, - request: tonic::Request, - ) -> std::result::Result, tonic::Status>; - /// / Informs the scheduler about the result of an execution request. - async fn execution_response( - &self, - request: tonic::Request, - ) -> std::result::Result, tonic::Status>; } /// / This API describes how schedulers communicate with Worker nodes. /// / /// / When a worker node comes online it must be pre-configured with the /// / endpoint of the scheduler it will register with. Once the worker - /// / connects to the scheduler it must send a `RegisterSupportedProperties` + /// / connects to the scheduler it must send a `ConnectWorkerRequest` /// / command to the scheduler. The scheduler will then use this information /// / to determine which jobs the worker can process. 
#[derive(Debug)] @@ -538,7 +468,7 @@ pub mod worker_api_server { struct ConnectWorkerSvc(pub Arc); impl< T: WorkerApi, - > tonic::server::ServerStreamingService + > tonic::server::StreamingService for ConnectWorkerSvc { type Response = super::UpdateForWorker; type ResponseStream = T::ConnectWorkerStream; @@ -548,7 +478,9 @@ pub mod worker_api_server { >; fn call( &mut self, - request: tonic::Request, + request: tonic::Request< + tonic::Streaming, + >, ) -> Self::Future { let inner = Arc::clone(&self.0); let fut = async move { @@ -574,140 +506,7 @@ pub mod worker_api_server { max_decoding_message_size, max_encoding_message_size, ); - let res = grpc.server_streaming(method, req).await; - Ok(res) - }; - Box::pin(fut) - } - "/com.github.trace_machina.nativelink.remote_execution.WorkerApi/KeepAlive" => { - #[allow(non_camel_case_types)] - struct KeepAliveSvc(pub Arc); - impl< - T: WorkerApi, - > tonic::server::UnaryService - for KeepAliveSvc { - type Response = (); - type Future = BoxFuture< - tonic::Response, - tonic::Status, - >; - fn call( - &mut self, - request: tonic::Request, - ) -> Self::Future { - let inner = Arc::clone(&self.0); - let fut = async move { - ::keep_alive(&inner, request).await - }; - Box::pin(fut) - } - } - let accept_compression_encodings = self.accept_compression_encodings; - let send_compression_encodings = self.send_compression_encodings; - let max_decoding_message_size = self.max_decoding_message_size; - let max_encoding_message_size = self.max_encoding_message_size; - let inner = self.inner.clone(); - let fut = async move { - let method = KeepAliveSvc(inner); - let codec = tonic::codec::ProstCodec::default(); - let mut grpc = tonic::server::Grpc::new(codec) - .apply_compression_config( - accept_compression_encodings, - send_compression_encodings, - ) - .apply_max_message_size_config( - max_decoding_message_size, - max_encoding_message_size, - ); - let res = grpc.unary(method, req).await; - Ok(res) - }; - Box::pin(fut) - } - 
"/com.github.trace_machina.nativelink.remote_execution.WorkerApi/GoingAway" => { - #[allow(non_camel_case_types)] - struct GoingAwaySvc(pub Arc); - impl< - T: WorkerApi, - > tonic::server::UnaryService - for GoingAwaySvc { - type Response = (); - type Future = BoxFuture< - tonic::Response, - tonic::Status, - >; - fn call( - &mut self, - request: tonic::Request, - ) -> Self::Future { - let inner = Arc::clone(&self.0); - let fut = async move { - ::going_away(&inner, request).await - }; - Box::pin(fut) - } - } - let accept_compression_encodings = self.accept_compression_encodings; - let send_compression_encodings = self.send_compression_encodings; - let max_decoding_message_size = self.max_decoding_message_size; - let max_encoding_message_size = self.max_encoding_message_size; - let inner = self.inner.clone(); - let fut = async move { - let method = GoingAwaySvc(inner); - let codec = tonic::codec::ProstCodec::default(); - let mut grpc = tonic::server::Grpc::new(codec) - .apply_compression_config( - accept_compression_encodings, - send_compression_encodings, - ) - .apply_max_message_size_config( - max_decoding_message_size, - max_encoding_message_size, - ); - let res = grpc.unary(method, req).await; - Ok(res) - }; - Box::pin(fut) - } - "/com.github.trace_machina.nativelink.remote_execution.WorkerApi/ExecutionResponse" => { - #[allow(non_camel_case_types)] - struct ExecutionResponseSvc(pub Arc); - impl tonic::server::UnaryService - for ExecutionResponseSvc { - type Response = (); - type Future = BoxFuture< - tonic::Response, - tonic::Status, - >; - fn call( - &mut self, - request: tonic::Request, - ) -> Self::Future { - let inner = Arc::clone(&self.0); - let fut = async move { - ::execution_response(&inner, request).await - }; - Box::pin(fut) - } - } - let accept_compression_encodings = self.accept_compression_encodings; - let send_compression_encodings = self.send_compression_encodings; - let max_decoding_message_size = self.max_decoding_message_size; - let 
max_encoding_message_size = self.max_encoding_message_size; - let inner = self.inner.clone(); - let fut = async move { - let method = ExecutionResponseSvc(inner); - let codec = tonic::codec::ProstCodec::default(); - let mut grpc = tonic::server::Grpc::new(codec) - .apply_compression_config( - accept_compression_encodings, - send_compression_encodings, - ) - .apply_max_message_size_config( - max_decoding_message_size, - max_encoding_message_size, - ); - let res = grpc.unary(method, req).await; + let res = grpc.streaming(method, req).await; Ok(res) }; Box::pin(fut) diff --git a/nativelink-proto/genproto/google.bytestream.pb.rs b/nativelink-proto/genproto/google.bytestream.pb.rs index c24aad0d6..d0229a041 100644 --- a/nativelink-proto/genproto/google.bytestream.pb.rs +++ b/nativelink-proto/genproto/google.bytestream.pb.rs @@ -37,8 +37,7 @@ pub struct ReadRequest { pub read_limit: i64, } /// Response object for ByteStream.Read. -#[derive(::derivative::Derivative)] -#[derivative(Debug)] +#[derive(::derive_more::Debug)] #[derive(Clone, PartialEq, ::prost::Message)] #[prost(skip_debug)] pub struct ReadResponse { @@ -47,12 +46,11 @@ pub struct ReadResponse { /// client that the request is still live while it is running an operation to /// generate more data. #[prost(bytes = "bytes", tag = "10")] - #[derivative(Debug = "ignore")] + #[debug(ignore)] pub data: ::prost::bytes::Bytes, } /// Request object for ByteStream.Write. -#[derive(::derivative::Derivative)] -#[derivative(Debug)] +#[derive(::derive_more::Debug)] #[derive(Clone, PartialEq, ::prost::Message)] #[prost(skip_debug)] pub struct WriteRequest { @@ -85,7 +83,7 @@ pub struct WriteRequest { /// service that the request is still live while it is running an operation to /// generate more data. #[prost(bytes = "bytes", tag = "10")] - #[derivative(Debug = "ignore")] + #[debug(ignore)] pub data: ::prost::bytes::Bytes, } /// Response object for ByteStream.Write. 
diff --git a/nativelink-proto/genproto/lib.rs b/nativelink-proto/genproto/lib.rs index 68e45db2a..bc2568a85 100644 --- a/nativelink-proto/genproto/lib.rs +++ b/nativelink-proto/genproto/lib.rs @@ -30,6 +30,8 @@ clippy::missing_const_for_fn, clippy::similar_names, clippy::std_instead_of_core, + clippy::use_self, + rustdoc::broken_intra_doc_links, rustdoc::invalid_html_tags )] diff --git a/nativelink-redis-tester/BUILD.bazel b/nativelink-redis-tester/BUILD.bazel new file mode 100644 index 000000000..8f4caff32 --- /dev/null +++ b/nativelink-redis-tester/BUILD.bazel @@ -0,0 +1,59 @@ +load( + "@rules_rust//rust:defs.bzl", + "rust_doc", + "rust_doc_test", + "rust_library", + "rust_test", + "rust_test_suite", +) + +rust_library( + name = "nativelink-redis-tester", + srcs = [ + "src/dynamic_fake_redis.rs", + "src/fake_redis.rs", + "src/lib.rs", + "src/pubsub.rs", + "src/read_only_redis.rs", + ], + visibility = ["//visibility:public"], + deps = [ + "//nativelink-util", + "@crates//:either", + "@crates//:redis", + "@crates//:redis-protocol", + "@crates//:redis-test", + "@crates//:tokio", + "@crates//:tracing", + ], +) + +rust_test_suite( + name = "integration", + timeout = "short", + srcs = [ + ], + deps = [ + ":nativelink-redis-tester", + ], +) + +rust_test( + name = "unit_test", + timeout = "short", + crate = ":nativelink-redis-tester", + deps = [ + ], +) + +rust_doc( + name = "docs", + crate = ":nativelink-redis-tester", + visibility = ["//visibility:public"], +) + +rust_doc_test( + name = "doc_test", + timeout = "short", + crate = ":nativelink-redis-tester", +) diff --git a/nativelink-redis-tester/Cargo.toml b/nativelink-redis-tester/Cargo.toml new file mode 100644 index 000000000..778b5b62d --- /dev/null +++ b/nativelink-redis-tester/Cargo.toml @@ -0,0 +1,22 @@ +#:schema ../tools/cargo-with-detailed-deps.json +lints.workspace = true + +[package] +edition = "2024" +name = "nativelink-redis-tester" +version = "1.0.0-rc4" + +[dependencies] +nativelink-util = { path = 
"../nativelink-util" } + +either = { version = "1.15.0", default-features = false } +redis = { version = "1.0.0", default-features = false } +redis-protocol = { version = "6.0.0", default-features = false, features = [ + "bytes", + "resp2", + "resp3", + "std", +] } +redis-test = { version = "1.0.0", default-features = false, features = ["aio"] } +tokio = { version = "1.44.1", features = [], default-features = false } +tracing = { version = "0.1.41", default-features = false } diff --git a/nativelink-redis-tester/src/dynamic_fake_redis.rs b/nativelink-redis-tester/src/dynamic_fake_redis.rs new file mode 100644 index 000000000..ee9baf176 --- /dev/null +++ b/nativelink-redis-tester/src/dynamic_fake_redis.rs @@ -0,0 +1,371 @@ +// Copyright 2026 The NativeLink Authors. All rights reserved. +// +// Licensed under the Functional Source License, Version 1.1, Apache 2.0 Future License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// See LICENSE file for details +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use core::fmt; +use std::collections::HashMap; +use std::collections::hash_map::Entry; +use std::sync::{Arc, Mutex}; + +use nativelink_util::background_spawn; +use redis::Value; +use redis_protocol::resp2::decode::decode; +use redis_protocol::resp2::types::{OwnedFrame, Resp2Frame}; +use tokio::net::TcpListener; +use tracing::{debug, info, trace}; + +use crate::fake_redis::{arg_as_string, fake_redis_internal}; + +pub trait SubscriptionManagerNotify { + fn notify_for_test(&self, value: String); +} + +#[derive(Clone)] +pub struct FakeRedisBackend { + /// Contains a list of all of the Redis keys -> fields. 
+ pub table: Arc>>>, + subscription_manager: Arc>>>, +} + +impl Default for FakeRedisBackend { + fn default() -> Self { + Self::new() + } +} + +impl fmt::Debug for FakeRedisBackend { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("FakeRedisBackend").finish() + } +} + +const FAKE_SCRIPT_SHA: &str = "b22b9926cbce9dd9ba97fa7ba3626f89feea1ed5"; + +impl FakeRedisBackend { + pub fn new() -> Self { + Self { + table: Arc::new(Mutex::new(HashMap::new())), + subscription_manager: Arc::new(Mutex::new(None)), + } + } + + pub fn set_subscription_manager(&self, subscription_manager: Arc) { + self.subscription_manager + .lock() + .unwrap() + .replace(subscription_manager); + } + + async fn dynamic_fake_redis(self, listener: TcpListener) { + let inner = move |buf: &[u8]| -> String { + let mut output = String::new(); + let mut buf_index = 0; + loop { + let frame = match decode(&buf[buf_index..]).unwrap() { + Some((frame, amt)) => { + buf_index += amt; + frame + } + None => { + panic!("No frame!"); + } + }; + let (cmd, args) = { + if let OwnedFrame::Array(a) = frame { + if let OwnedFrame::BulkString(s) = a.first().unwrap() { + let args: Vec<_> = a[1..].to_vec(); + (str::from_utf8(s).unwrap().to_string(), args) + } else { + panic!("Array not starting with cmd: {a:?}"); + } + } else { + panic!("Non array cmd: {frame:?}"); + } + }; + + let ret: Value = match cmd.as_str() { + "HELLO" => Value::Map(vec![( + Value::SimpleString("server".into()), + Value::SimpleString("redis".into()), + )]), + "CLIENT" => { + // We can safely ignore these, as it's just setting the library name/version + Value::Int(0) + } + "SCRIPT" => { + assert_eq!(args[0], OwnedFrame::BulkString(b"LOAD".to_vec())); + + let OwnedFrame::BulkString(ref _script) = args[1] else { + panic!("Script should be a bulkstring: {args:?}"); + }; + Value::SimpleString(FAKE_SCRIPT_SHA.to_string()) + } + + "PSUBSCRIBE" => { + // This does nothing at the moment, maybe we need to implement it later. 
+ Value::Int(0) + } + + "PUBLISH" => { + if let Some(subscription_manager) = + self.subscription_manager.lock().unwrap().as_ref() + { + subscription_manager.notify_for_test( + str::from_utf8(args[1].as_bytes().expect("Notification not bytes")) + .expect("Notification not UTF-8") + .into(), + ); + Value::Int(1) + } else { + Value::Int(0) + } + } + + "FT.AGGREGATE" => { + // The query is either "*" (match all) or @field:{ value }. + let OwnedFrame::BulkString(ref raw_query) = args[1] else { + panic!("Aggregate query should be a string: {args:?}"); + }; + let query = str::from_utf8(raw_query).unwrap(); + // Lazy implementation making assumptions. + assert_eq!( + args[2..6], + vec![ + OwnedFrame::BulkString(b"LOAD".to_vec()), + OwnedFrame::BulkString(b"2".to_vec()), + OwnedFrame::BulkString(b"data".to_vec()), + OwnedFrame::BulkString(b"version".to_vec()) + ] + ); + let mut results = vec![Value::Int(0)]; + + if query == "*" { + // Wildcard query - return all records that have both data and version fields. + // Some entries (e.g., from HSET) may not have version field. 
+ for fields in self.table.lock().unwrap().values() { + if let (Some(data), Some(version)) = + (fields.get("data"), fields.get("version")) + { + results.push(Value::Array(vec![ + Value::BulkString(b"data".to_vec()), + data.clone(), + Value::BulkString(b"version".to_vec()), + version.clone(), + ])); + } + } + } else { + // Field-specific query: @field:{ value } + assert_eq!(&query[..1], "@"); + let mut parts = query[1..].split(':'); + let field = parts.next().expect("No field name"); + let value = parts.next().expect("No value"); + let value = value + .strip_prefix("{ ") + .and_then(|s| s.strip_suffix(" }")) + .unwrap_or(value); + for fields in self.table.lock().unwrap().values() { + if let Some(key_value) = fields.get(field) { + if *key_value == Value::BulkString(value.as_bytes().to_vec()) { + results.push(Value::Array(vec![ + Value::BulkString(b"data".to_vec()), + fields.get("data").expect("No data field").clone(), + Value::BulkString(b"version".to_vec()), + fields + .get("version") + .expect("No version field") + .clone(), + ])); + } + } + } + } + + results[0] = + Value::Int(i64::try_from(results.len() - 1).unwrap_or(i64::MAX)); + Value::Array(vec![ + Value::Array(results), + Value::Int(0), // Means no more items in cursor. 
+ ]) + } + + "EVALSHA" => { + assert_eq!( + args[0], + OwnedFrame::BulkString(FAKE_SCRIPT_SHA.as_bytes().to_vec()) + ); + assert_eq!(args[1], OwnedFrame::BulkString(b"1".to_vec())); + let mut value: HashMap<_, Value> = HashMap::new(); + value.insert( + "data".into(), + Value::BulkString(args[4].as_bytes().unwrap().to_vec()), + ); + for pair in args[5..].chunks(2) { + value.insert( + str::from_utf8(pair[0].as_bytes().expect("Field name not bytes")) + .expect("Unable to parse field name as string") + .into(), + Value::BulkString(pair[1].as_bytes().unwrap().to_vec()), + ); + } + let mut ret: Option = None; + let key: String = + str::from_utf8(args[2].as_bytes().expect("Key not bytes")) + .expect("Key cannot be parsed as string") + .into(); + let expected_existing_version: i64 = + str::from_utf8(args[3].as_bytes().unwrap()) + .unwrap() + .parse() + .expect("Unable to parse existing version field"); + trace!(%key, %expected_existing_version, ?value, "Want to insert with EVALSHA"); + let version = match self.table.lock().unwrap().entry(key.clone()) { + Entry::Occupied(mut occupied_entry) => { + let version = occupied_entry + .get() + .get("version") + .expect("No version field"); + let Value::BulkString(version_bytes) = version else { + panic!("Non-bulkstring version: {version:?}"); + }; + let version_int: i64 = str::from_utf8(version_bytes) + .expect("Version field not valid string") + .parse() + .expect("Unable to parse version field"); + if version_int == expected_existing_version { + let new_version = version_int + 1; + debug!(%key, %new_version, "Version update"); + value.insert( + "version".into(), + Value::BulkString( + format!("{new_version}").as_bytes().to_vec(), + ), + ); + occupied_entry.insert(value); + new_version + } else { + // Version mismatch. 
+ debug!(%key, %version_int, %expected_existing_version, "Version mismatch"); + ret = Some(Value::Array(vec![ + Value::Int(0), + Value::Int(version_int), + ])); + -1 + } + } + Entry::Vacant(vacant_entry) => { + if expected_existing_version != 0 { + // Version mismatch. + debug!(%key, %expected_existing_version, "Version mismatch, expected zero"); + ret = Some(Value::Array(vec![Value::Int(0), Value::Int(0)])); + -1 + } else { + debug!(%key, "Version insert"); + value + .insert("version".into(), Value::BulkString(b"1".to_vec())); + vacant_entry.insert_entry(value); + 1 + } + } + }; + if let Some(r) = ret { + r + } else { + Value::Array(vec![Value::Int(1), Value::Int(version)]) + } + } + + "HMSET" => { + let mut values = HashMap::new(); + assert_eq!( + (args.len() - 1).rem_euclid(2), + 0, + "Non-even args for hmset: {args:?}" + ); + let chunks = args[1..].chunks_exact(2); + for chunk in chunks { + let [key, value] = chunk else { + panic!("Uneven hmset args"); + }; + let key_name: String = + str::from_utf8(key.as_bytes().expect("Key argument is not bytes")) + .expect("Unable to parse key as string") + .into(); + values.insert( + key_name, + Value::BulkString(value.as_bytes().unwrap().to_vec()), + ); + } + let key = + str::from_utf8(args[0].as_bytes().expect("Key argument is not bytes")) + .expect("Unable to parse key as string") + .into(); + debug!(%key, ?values, "Inserting with HMSET"); + self.table.lock().unwrap().insert(key, values); + Value::Okay + } + + "HMGET" => { + let key_name = + str::from_utf8(args[0].as_bytes().expect("Key argument is not bytes")) + .expect("Unable to parse key name"); + + if let Some(fields) = self.table.lock().unwrap().get(key_name) { + trace!(%key_name, keys = ?fields.keys(), "Getting keys with HMGET, some keys"); + let mut result = vec![]; + for key in &args[1..] 
{ + let field_name = str::from_utf8( + key.as_bytes().expect("Field argument is not bytes"), + ) + .expect("Unable to parse requested field"); + if let Some(value) = fields.get(field_name) { + result.push(value.clone()); + } else { + debug!(%key_name, %field_name, "Missing field"); + result.push(Value::Nil); + } + } + Value::Array(result) + } else { + trace!(%key_name, "Getting keys with HMGET, empty"); + let null_count = i64::try_from(args.len() - 1).unwrap(); + Value::Array(vec![Value::Nil, Value::Int(null_count)]) + } + } + actual => { + panic!("Mock command not implemented! {actual:?}"); + } + }; + + arg_as_string(&mut output, ret); + if buf_index == buf.len() { + break; + } + } + output + }; + fake_redis_internal(listener, vec![inner]).await; + } + + pub async fn run(self) -> u16 { + let listener = TcpListener::bind("127.0.0.1:0").await.unwrap(); + let port = listener.local_addr().unwrap().port(); + info!("Using port {port}"); + + background_spawn!("listener", async move { + self.dynamic_fake_redis(listener).await; + }); + + port + } +} diff --git a/nativelink-redis-tester/src/fake_redis.rs b/nativelink-redis-tester/src/fake_redis.rs new file mode 100644 index 000000000..179c10949 --- /dev/null +++ b/nativelink-redis-tester/src/fake_redis.rs @@ -0,0 +1,272 @@ +// Copyright 2026 The NativeLink Authors. All rights reserved. +// +// Licensed under the Functional Source License, Version 1.1, Apache 2.0 Future License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// See LICENSE file for details +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use core::fmt::Write; +use core::hash::BuildHasher; +use std::collections::HashMap; + +use nativelink_util::background_spawn; +use redis::Value; +use redis_test::IntoRedisValue; +use tokio::io::{AsyncReadExt, AsyncWriteExt}; +use tokio::net::TcpListener; +use tracing::{error, info, warn}; + +fn cmd_as_string(cmd: &redis::Cmd) -> String { + let raw = cmd.get_packed_command(); + String::from_utf8(raw).unwrap() +} + +pub(crate) fn arg_as_string(output: &mut String, arg: Value) { + match arg { + Value::SimpleString(s) => { + write!(output, "+{s}\r\n").unwrap(); + } + Value::Okay => { + write!(output, "+OK\r\n").unwrap(); + } + Value::BulkString(s) => { + write!( + output, + "${}\r\n{}\r\n", + s.len(), + str::from_utf8(&s).unwrap() + ) + .unwrap(); + } + Value::Int(v) => { + write!(output, ":{v}\r\n").unwrap(); + } + Value::Array(values) => { + write!(output, "*{}\r\n", values.len()).unwrap(); + for value in values { + arg_as_string(output, value); + } + } + Value::Map(values) => { + write!(output, "%{}\r\n", values.len()).unwrap(); + for (key, value) in values { + arg_as_string(output, key); + arg_as_string(output, value); + } + } + Value::Nil => { + write!(output, "_\r\n").unwrap(); + } + _ => { + panic!("No support for {arg:?}") + } + } +} + +fn args_as_string(args: Vec) -> String { + let mut output = String::new(); + for arg in args { + arg_as_string(&mut output, arg); + } + output +} + +pub fn add_to_response( + response: &mut HashMap, + cmd: &redis::Cmd, + args: Vec, +) { + add_to_response_raw(response, cmd, args_as_string(args)); +} + +pub fn add_to_response_raw( + response: &mut HashMap, + cmd: &redis::Cmd, + args: String, +) { + response.insert(cmd_as_string(cmd), args); +} + +fn setinfo(responses: &mut HashMap) { + // We do raw inserts of command here, because the library sends 3/4 commands in one go + // They always start with HELLO, then optionally SELECT, so we use this to differentiate + let hello = cmd_as_string(redis::cmd("HELLO").arg("3")); + let 
setinfo = cmd_as_string( + redis::cmd("CLIENT") + .arg("SETINFO") + .arg("LIB-NAME") + .arg("redis-rs"), + ); + responses.insert( + [hello.clone(), setinfo.clone()].join(""), + args_as_string(vec![ + Value::Map(vec![( + Value::SimpleString("server".into()), + Value::SimpleString("redis".into()), + )]), + Value::Okay, + Value::Okay, + ]), + ); + responses.insert( + [hello, cmd_as_string(redis::cmd("SELECT").arg(3)), setinfo].join(""), + args_as_string(vec![ + Value::Map(vec![( + Value::SimpleString("server".into()), + Value::SimpleString("redis".into()), + )]), + Value::Okay, + Value::Okay, + Value::Okay, + ]), + ); +} + +pub fn add_lua_script( + responses: &mut HashMap, + lua_script: &str, + hash: &str, +) { + add_to_response( + responses, + redis::cmd("SCRIPT").arg("LOAD").arg(lua_script), + vec![hash.into_redis_value()], + ); +} + +pub fn fake_redis_stream() -> HashMap { + let mut responses = HashMap::new(); + setinfo(&mut responses); + // Does setinfo as well, so need to respond to all 3 + add_to_response( + &mut responses, + redis::cmd("SELECT").arg("3"), + vec![Value::Okay, Value::Okay, Value::Okay], + ); + responses +} + +pub fn fake_redis_sentinel_master_stream() -> HashMap { + let mut response = fake_redis_stream(); + add_to_response( + &mut response, + &redis::cmd("ROLE"), + vec![Value::Array(vec![ + "master".into_redis_value(), + 0.into_redis_value(), + Value::Array(vec![]), + ])], + ); + response +} + +pub fn fake_redis_sentinel_stream(master_name: &str, redis_port: u16) -> HashMap { + let mut response = HashMap::new(); + setinfo(&mut response); + + // Not a full "sentinel masters" response, but enough for redis-rs + let resp: Vec<(Value, Value)> = vec![ + ("name".into_redis_value(), master_name.into_redis_value()), + ("ip".into_redis_value(), "127.0.0.1".into_redis_value()), + ( + "port".into_redis_value(), + i64::from(redis_port).into_redis_value(), + ), + ("flags".into_redis_value(), "master".into_redis_value()), + ]; + + add_to_response( + &mut 
response, + redis::cmd("SENTINEL").arg("MASTERS"), + vec![Value::Array(vec![Value::Map(resp)])], + ); + response +} + +pub(crate) async fn fake_redis_internal(listener: TcpListener, handlers: Vec) +where + H: Fn(&[u8]) -> String + Send + Clone + 'static + Sync, +{ + let mut handler_iter = handlers.iter().cloned().cycle(); + loop { + info!( + "Waiting for connection on {}", + listener.local_addr().unwrap() + ); + let Ok((mut stream, _)) = listener.accept().await else { + error!("accept error"); + panic!("error"); + }; + info!("Accepted new connection"); + let local_handler = handler_iter.next().unwrap(); + background_spawn!("thread", async move { + loop { + let mut buf = vec![0; 8192]; + let res = stream.read(&mut buf).await.unwrap(); + if res != 0 { + let output = local_handler(&buf[..res]); + if !output.is_empty() { + stream.write_all(output.as_bytes()).await.unwrap(); + } + } + } + }); + } +} + +async fn fake_redis(listener: TcpListener, all_responses: Vec>) +where + B: BuildHasher + Clone + Send + 'static + Sync, +{ + let funcs = all_responses + .iter() + .map(|responses| { + info!("Responses are: {:?}", responses); + let values = responses.clone(); + move |buf: &[u8]| -> String { + let str_buf = String::from_utf8_lossy(buf).into_owned(); + for (key, value) in &values { + if str_buf.starts_with(key) { + info!("Responding to {}", str_buf.replace("\r\n", "\\r\\n")); + return value.clone(); + } + } + warn!( + "Unknown command: {}", + str_buf.chars().take(1000).collect::() + ); + String::new() + } + }) + .collect(); + fake_redis_internal(listener, funcs).await; +} + +pub async fn make_fake_redis_with_multiple_responses< + B: BuildHasher + Clone + Send + 'static + Sync, +>( + responses: Vec>, +) -> u16 { + let listener = TcpListener::bind("127.0.0.1:0").await.unwrap(); + let port = listener.local_addr().unwrap().port(); + info!("Using port {port}"); + + background_spawn!("listener", async move { + fake_redis(listener, responses).await; + }); + + port +} + +pub async 
fn make_fake_redis_with_responses( + responses: HashMap, +) -> u16 { + make_fake_redis_with_multiple_responses(vec![responses]).await +} diff --git a/nativelink-redis-tester/src/lib.rs b/nativelink-redis-tester/src/lib.rs new file mode 100644 index 000000000..976441b25 --- /dev/null +++ b/nativelink-redis-tester/src/lib.rs @@ -0,0 +1,27 @@ +// Copyright 2026 The NativeLink Authors. All rights reserved. +// +// Licensed under the Functional Source License, Version 1.1, Apache 2.0 Future License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// See LICENSE file for details +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +mod dynamic_fake_redis; +mod fake_redis; +mod pubsub; +mod read_only_redis; + +pub use dynamic_fake_redis::{FakeRedisBackend, SubscriptionManagerNotify}; +pub use fake_redis::{ + add_lua_script, add_to_response, add_to_response_raw, fake_redis_sentinel_master_stream, + fake_redis_sentinel_stream, fake_redis_stream, make_fake_redis_with_multiple_responses, + make_fake_redis_with_responses, +}; +pub use pubsub::MockPubSub; +pub use read_only_redis::ReadOnlyRedis; diff --git a/nativelink-redis-tester/src/pubsub.rs b/nativelink-redis-tester/src/pubsub.rs new file mode 100644 index 000000000..6de74a9d6 --- /dev/null +++ b/nativelink-redis-tester/src/pubsub.rs @@ -0,0 +1,28 @@ +// Copyright 2026 The NativeLink Authors. All rights reserved. +// +// Licensed under the Functional Source License, Version 1.1, Apache 2.0 Future License (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// See LICENSE file for details +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#[derive(Debug, Clone, Copy)] +pub struct MockPubSub {} + +impl MockPubSub { + pub const fn new() -> Self { + Self {} + } +} + +impl Default for MockPubSub { + fn default() -> Self { + Self::new() + } +} diff --git a/nativelink-redis-tester/src/read_only_redis.rs b/nativelink-redis-tester/src/read_only_redis.rs new file mode 100644 index 000000000..757075c9f --- /dev/null +++ b/nativelink-redis-tester/src/read_only_redis.rs @@ -0,0 +1,156 @@ +// Copyright 2026 The NativeLink Authors. All rights reserved. +// +// Licensed under the Functional Source License, Version 1.1, Apache 2.0 Future License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// See LICENSE file for details +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use core::fmt::Write; +use core::sync::atomic::{AtomicBool, Ordering}; +use std::sync::Arc; + +use either::Either; +use nativelink_util::background_spawn; +use redis::Value; +use redis_protocol::resp2::decode::decode; +use redis_protocol::resp2::types::OwnedFrame; +use tokio::net::TcpListener; +use tracing::info; + +use crate::fake_redis::{arg_as_string, fake_redis_internal}; + +const FAKE_SCRIPT_SHA: &str = "b22b9926cbce9dd9ba97fa7ba3626f89feea1ed5"; + +#[derive(Clone, Debug)] +pub struct ReadOnlyRedis { + // The first time we hit SETRANGE/HMSET, we output a ReadOnly. Next time, we assume we're reconnected and do correct values + readonly_triggered: Arc, +} + +impl Default for ReadOnlyRedis { + fn default() -> Self { + Self::new() + } +} + +impl ReadOnlyRedis { + pub fn new() -> Self { + Self { + readonly_triggered: Arc::new(AtomicBool::new(false)), + } + } + + async fn dynamic_fake_redis(self, listener: TcpListener) { + let readonly_err_str = "READONLY You can't write against a read only replica."; + let readonly_err = format!("!{}\r\n{readonly_err_str}\r\n", readonly_err_str.len()); + + let inner = move |buf: &[u8]| -> String { + let mut output = String::new(); + let mut buf_index = 0; + loop { + let frame = match decode(&buf[buf_index..]).unwrap() { + Some((frame, amt)) => { + buf_index += amt; + frame + } + None => { + panic!("No frame!"); + } + }; + let (cmd, args) = { + if let OwnedFrame::Array(a) = frame { + if let OwnedFrame::BulkString(s) = a.first().unwrap() { + let args: Vec<_> = a[1..].to_vec(); + (str::from_utf8(s).unwrap().to_string(), args) + } else { + panic!("Array not starting with cmd: {a:?}"); + } + } else { + panic!("Non array cmd: {frame:?}"); + } + }; + + let ret: Either = match cmd.as_str() { + "HELLO" => Either::Left(Value::Map(vec![( + Value::SimpleString("server".into()), + Value::SimpleString("redis".into()), + )])), + "CLIENT" => { + // We can safely ignore these, as it's just setting the library name/version + 
Either::Left(Value::Int(0)) + } + "SCRIPT" => { + assert_eq!(args[0], OwnedFrame::BulkString(b"LOAD".to_vec())); + + let OwnedFrame::BulkString(ref _script) = args[1] else { + panic!("Script should be a bulkstring: {args:?}"); + }; + Either::Left(Value::SimpleString(FAKE_SCRIPT_SHA.to_string())) + } + "ROLE" => Either::Left(Value::Array(vec![ + Value::BulkString(b"master".to_vec()), + Value::Int(0), + Value::Array(vec![]), + ])), + "SETRANGE" => { + let value = self.readonly_triggered.load(Ordering::Relaxed); + if value { + Either::Left(Value::Int(5)) + } else { + self.readonly_triggered.store(true, Ordering::Relaxed); + Either::Right(readonly_err.clone()) + } + } + "STRLEN" => Either::Left(Value::Int(5)), + "RENAME" | "HMSET" => { + let value = self.readonly_triggered.load(Ordering::Relaxed); + if value { + Either::Left(Value::Okay) + } else { + self.readonly_triggered.store(true, Ordering::Relaxed); + Either::Right(readonly_err.clone()) + } + } + "EVALSHA" => Either::Left(Value::Array(vec![Value::Int(1), Value::Int(0)])), + actual => { + panic!("Mock command not implemented! 
{actual:?}"); + } + }; + + match ret { + Either::Left(v) => { + arg_as_string(&mut output, v); + } + Either::Right(s) => { + write!(&mut output, "{s}").unwrap(); + } + } + + if buf_index == buf.len() { + break; + } + } + output + }; + fake_redis_internal(listener, vec![inner]).await; + } + + pub async fn run(self) -> u16 { + let listener = TcpListener::bind("127.0.0.1:0").await.unwrap(); + let port = listener.local_addr().unwrap().port(); + info!("Using port {port}"); + + background_spawn!("listener", async move { + self.dynamic_fake_redis(listener).await; + }); + + port + } +} diff --git a/nativelink-scheduler/BUILD.bazel b/nativelink-scheduler/BUILD.bazel index b244c2e9e..a0653adb3 100644 --- a/nativelink-scheduler/BUILD.bazel +++ b/nativelink-scheduler/BUILD.bazel @@ -21,10 +21,13 @@ rust_library( "src/mock_scheduler.rs", "src/platform_property_manager.rs", "src/property_modifier_scheduler.rs", + "src/property_router_scheduler.rs", "src/simple_scheduler.rs", "src/simple_scheduler_state_manager.rs", "src/store_awaited_action_db.rs", "src/worker.rs", + "src/worker_capability_index.rs", + "src/worker_registry.rs", "src/worker_scheduler.rs", ], proc_macro_deps = [ @@ -45,6 +48,7 @@ rust_library( "@crates//:opentelemetry", "@crates//:opentelemetry-semantic-conventions", "@crates//:parking_lot", + "@crates//:redis", "@crates//:scopeguard", "@crates//:serde", "@crates//:serde_json", @@ -63,8 +67,11 @@ rust_test_suite( "tests/action_messages_test.rs", "tests/cache_lookup_scheduler_test.rs", "tests/property_modifier_scheduler_test.rs", + "tests/property_router_scheduler_test.rs", "tests/redis_store_awaited_action_db_test.rs", + "tests/simple_scheduler_state_manager_test.rs", "tests/simple_scheduler_test.rs", + "tests/worker_capability_index_test.rs", ], compile_data = [ "tests/utils/scheduler_utils.rs", @@ -79,16 +86,17 @@ rust_test_suite( "//nativelink-error", "//nativelink-metric", "//nativelink-proto", + "//nativelink-redis-tester", "//nativelink-store", 
"//nativelink-util", "@crates//:async-lock", "@crates//:bytes", - "@crates//:fred", "@crates//:futures", "@crates//:mock_instant", "@crates//:parking_lot", "@crates//:pretty_assertions", "@crates//:prost", + "@crates//:redis", "@crates//:serde_json", "@crates//:tokio", "@crates//:tokio-stream", @@ -107,8 +115,9 @@ rust_test( "//nativelink-macro", ], deps = [ - "@crates//:fred", "@crates//:pretty_assertions", + "@crates//:redis", + "@crates//:tracing-test", ], ) diff --git a/nativelink-scheduler/Cargo.toml b/nativelink-scheduler/Cargo.toml index 2a617bfb0..a459c17a3 100644 --- a/nativelink-scheduler/Cargo.toml +++ b/nativelink-scheduler/Cargo.toml @@ -1,12 +1,10 @@ +#:schema ../tools/cargo-with-detailed-deps.json lints.workspace = true [package] edition = "2024" name = "nativelink-scheduler" -version = "0.7.3" - -[features] -worker_find_logging = ["nativelink-util/worker_find_logging"] +version = "1.0.0-rc4" [dependencies] nativelink-config = { path = "../nativelink-config" } @@ -16,25 +14,24 @@ nativelink-proto = { path = "../nativelink-proto" } nativelink-store = { path = "../nativelink-store" } nativelink-util = { path = "../nativelink-util" } -# TODO(palfrey): This should not be a dependency. Move the corresponding -# files somewhere else. 
async-lock = { version = "3.4.0", features = ["std"], default-features = false } -async-trait = "0.1.88" +async-trait = { version = "0.1.88", default-features = false } bytes = { version = "1.10.1", default-features = false } futures = { version = "0.3.31", default-features = false } -lru = { version = "0.13.0", default-features = false } -mock_instant = "0.5.3" -opentelemetry = { version = "0.29.1", default-features = false } -opentelemetry-semantic-conventions = { version = "0.29.0", default-features = false, features = [ +lru = { version = "0.16.0", default-features = false } +mock_instant = { version = "0.5.3", default-features = false } +opentelemetry = { version = "0.30.0", default-features = false } +opentelemetry-semantic-conventions = { version = "0.30.0", default-features = false, features = [ "default", "semconv_experimental", ] } -parking_lot = "0.12.3" +parking_lot = { version = "0.12.3", default-features = false } prost = { version = "0.13.5", default-features = false } +redis = { version = "1.0.0", default-features = false } scopeguard = { version = "1.2.0", default-features = false } -serde = { version = "1.0.219", features = ["rc"] } -serde_json = "1.0.140" -static_assertions = "1.1.0" +serde = { version = "1.0.219", features = ["rc"], default-features = false } +serde_json = { version = "1.0.140", default-features = false } +static_assertions = { version = "1.1.0", default-features = false } tokio = { version = "1.44.1", features = [ "fs", "io-util", @@ -56,13 +53,11 @@ uuid = { version = "1.16.0", default-features = false, features = [ [dev-dependencies] nativelink-macro = { path = "../nativelink-macro" } +nativelink-redis-tester = { path = "../nativelink-redis-tester" } -fred = { version = "10.1.0", default-features = false, features = ["mocks"] } -pretty_assertions = { version = "1.4.1", features = ["std"] } +pretty_assertions = { version = "1.4.1", features = [ + "std", +], default-features = false } tracing-test = { version = "0.2.5", 
default-features = false, features = [ "no-env-filter", ] } - -[package.metadata.cargo-machete] -# Used by nativelink_test macro -ignored = ["tracing-test"] diff --git a/nativelink-scheduler/src/api_worker_scheduler.rs b/nativelink-scheduler/src/api_worker_scheduler.rs index 4de734db4..665467f4e 100644 --- a/nativelink-scheduler/src/api_worker_scheduler.rs +++ b/nativelink-scheduler/src/api_worker_scheduler.rs @@ -13,33 +13,132 @@ // limitations under the License. use core::ops::{Deref, DerefMut}; +use core::sync::atomic::{AtomicU64, Ordering}; +use core::time::Duration; +use std::collections::HashMap; use std::sync::Arc; +use std::time::{Instant, UNIX_EPOCH}; -use async_lock::Mutex; +use async_lock::RwLock; use lru::LruCache; use nativelink_config::schedulers::WorkerAllocationStrategy; -use nativelink_error::{Code, Error, ResultExt, error_if, make_err, make_input_err}; +use nativelink_error::{error_if, make_err, make_input_err, Code, Error, ResultExt}; use nativelink_metric::{ - MetricFieldData, MetricKind, MetricPublishKnownKindData, MetricsComponent, - RootMetricsComponent, group, + group, MetricFieldData, MetricKind, MetricPublishKnownKindData, + MetricsComponent, RootMetricsComponent, }; use nativelink_util::action_messages::{OperationId, WorkerId}; +use nativelink_util::metrics::{WORKER_POOL_METRICS, WORKER_POOL_INSTANCE, WorkerPoolMetricAttrs}; use nativelink_util::operation_state_manager::{UpdateOperationType, WorkerStateManager}; use nativelink_util::platform_properties::PlatformProperties; use nativelink_util::shutdown_guard::ShutdownGuard; -use nativelink_util::spawn; -use nativelink_util::task::JoinHandleDropGuard; +use opentelemetry::KeyValue; use tokio::sync::Notify; -use tokio::sync::mpsc::{self, UnboundedSender}; use tonic::async_trait; -#[cfg(feature = "worker_find_logging")] -use tracing::info; -use tracing::{error, warn}; +use tracing::{error, info, trace, warn}; + +/// Metrics for tracking scheduler performance. 
+#[derive(Debug, Default)] +pub struct SchedulerMetrics { + /// Total number of worker additions. + pub workers_added: AtomicU64, + /// Total number of worker removals. + pub workers_removed: AtomicU64, + /// Total number of `find_worker_for_action` calls. + pub find_worker_calls: AtomicU64, + /// Total number of successful worker matches. + pub find_worker_hits: AtomicU64, + /// Total number of failed worker matches (no worker found). + pub find_worker_misses: AtomicU64, + /// Total time spent in `find_worker_for_action` (nanoseconds). + pub find_worker_time_ns: AtomicU64, + /// Total number of workers iterated during find operations. + pub workers_iterated: AtomicU64, + /// Total number of action dispatches. + pub actions_dispatched: AtomicU64, + /// Total number of keep-alive updates. + pub keep_alive_updates: AtomicU64, + /// Total number of worker timeouts. + pub worker_timeouts: AtomicU64, +} use crate::platform_property_manager::PlatformPropertyManager; -use crate::worker::{ActionInfoWithProps, Worker, WorkerTimestamp, WorkerUpdate}; +use crate::worker::{reduce_platform_properties, Worker, ActionInfoWithProps, WorkerState, WorkerTimestamp, WorkerUpdate}; +use crate::worker_capability_index::WorkerCapabilityIndex; +use crate::worker_registry::SharedWorkerRegistry; use crate::worker_scheduler::WorkerScheduler; +#[derive(Debug)] +pub struct WorkerSchedulerMetrics { + attrs: WorkerPoolMetricAttrs, + instance_name: String, +} + +impl WorkerSchedulerMetrics { + #[must_use] + pub fn new(instance_name: impl Into) -> Self { + let instance_name = instance_name.into(); + let base_attrs = vec![KeyValue::new(WORKER_POOL_INSTANCE, instance_name.clone())]; + Self { + attrs: WorkerPoolMetricAttrs::new(&base_attrs), + instance_name, + } + } + + pub fn record_worker_count(&self, count: usize) { + WORKER_POOL_METRICS + .worker_count + .record(count as u64, self.attrs.added()); + } + + pub fn record_worker_added(&self) { + WORKER_POOL_METRICS.worker_events.add(1, 
self.attrs.added()); + } + + pub fn record_worker_removed(&self) { + WORKER_POOL_METRICS.worker_events.add(1, self.attrs.removed()); + } + + pub fn record_worker_timeout(&self) { + WORKER_POOL_METRICS.worker_events.add(1, self.attrs.timeout()); + } + + pub fn record_worker_connection_failed(&self) { + WORKER_POOL_METRICS + .worker_events + .add(1, self.attrs.connection_failed()); + } + + pub fn record_action_dispatched(&self) { + WORKER_POOL_METRICS + .worker_actions_dispatched + .add(1, self.attrs.added()); + } + + pub fn record_action_completed(&self) { + WORKER_POOL_METRICS + .worker_actions_completed + .add(1, self.attrs.removed()); + } + + pub fn record_running_actions_count(&self, count: usize) { + WORKER_POOL_METRICS + .worker_actions_running + .record(count as u64, self.attrs.added()); + } + + pub fn record_dispatch_failure(&self) { + WORKER_POOL_METRICS + .worker_dispatch_failures + .add(1, self.attrs.evicted()); + } + + #[must_use] + pub fn instance_name(&self) -> &str { + &self.instance_name + } +} + #[derive(Debug)] struct Workers(LruCache); @@ -88,8 +187,16 @@ struct ApiWorkerSchedulerImpl { allocation_strategy: WorkerAllocationStrategy, /// A channel to notify the matching engine that the worker pool has changed. worker_change_notify: Arc, - /// A channel to notify that an operation is still alive. - operation_keep_alive_tx: UnboundedSender<(OperationId, WorkerId)>, + /// Worker registry for tracking worker liveness. + worker_registry: SharedWorkerRegistry, + + /// Whether the worker scheduler is shutting down. + shutting_down: bool, + + /// Index for fast worker capability lookup. + /// Used to accelerate `find_worker_for_action` by filtering candidates + /// based on properties before doing linear scan. 
+ capability_index: WorkerCapabilityIndex, } impl core::fmt::Debug for ApiWorkerSchedulerImpl { @@ -98,13 +205,24 @@ impl core::fmt::Debug for ApiWorkerSchedulerImpl { .field("workers", &self.workers) .field("allocation_strategy", &self.allocation_strategy) .field("worker_change_notify", &self.worker_change_notify) - .field("operation_keep_alive_tx", &self.operation_keep_alive_tx) + .field( + "capability_index_size", + &self.capability_index.worker_count(), + ) + .field("worker_registry", &self.worker_registry) .finish_non_exhaustive() } } impl ApiWorkerSchedulerImpl { /// Refreshes the lifetime of the worker with the given timestamp. + /// + /// Instead of sending N keepalive messages (one per operation), + /// we now send a single worker heartbeat. The worker registry tracks worker liveness, + /// and timeout detection checks the worker's `last_seen` instead of per-operation timestamps. + /// + /// Note: This only updates the local worker state. The worker registry is updated + /// separately after releasing the inner lock to reduce contention. fn refresh_lifetime( &mut self, worker_id: &WorkerId, @@ -123,19 +241,13 @@ impl ApiWorkerSchedulerImpl { timestamp ); worker.last_update_timestamp = timestamp; - for operation_id in worker.running_action_infos.keys() { - if self - .operation_keep_alive_tx - .send((operation_id.clone(), worker_id.clone())) - .is_err() - { - error!( - ?operation_id, - ?worker_id, - "OperationKeepAliveTx stream closed" - ); - } - } + + trace!( + ?worker_id, + running_operations = worker.running_action_infos.len(), + "Worker keepalive received" + ); + Ok(()) } @@ -143,8 +255,13 @@ impl ApiWorkerSchedulerImpl { /// Note: This function will not do any task matching. 
fn add_worker(&mut self, worker: Worker) -> Result<(), Error> { let worker_id = worker.id.clone(); + let platform_properties = worker.platform_properties.clone(); self.workers.put(worker_id.clone(), worker); + // Add to capability index for fast matching + self.capability_index + .add_worker(&worker_id, &platform_properties); + // Worker is not cloneable, and we do not want to send the initial connection results until // we have added it to the map, or we might get some strange race conditions due to the way // the multi-threaded runtime works. @@ -167,6 +284,9 @@ impl ApiWorkerSchedulerImpl { /// Note: The caller is responsible for any rescheduling of any tasks that might be /// running. fn remove_worker(&mut self, worker_id: &WorkerId) -> Option { + // Remove from capability index + self.capability_index.remove_worker(worker_id); + let result = self.workers.pop(worker_id); self.worker_change_notify.notify_one(); result @@ -187,47 +307,124 @@ impl ApiWorkerSchedulerImpl { Ok(()) } - #[cfg_attr(not(feature = "worker_find_logging"), allow(unused_variables))] - fn inner_worker_checker( - (worker_id, w): &(&WorkerId, &Worker), + fn inner_find_worker_for_action( + &self, platform_properties: &PlatformProperties, - ) -> bool { - #[cfg(feature = "worker_find_logging")] - { + full_worker_logging: bool, + ) -> Option { + // Do a fast check to see if any workers are available at all for work allocation + if !self.workers.iter().any(|(_, w)| w.can_accept_work()) { + if full_worker_logging { + info!("All workers are fully allocated"); + } + return None; + } + + // Use capability index to get candidate workers that match STATIC properties + // (Exact, Unknown) and have the required property keys (Priority, Minimum). + // This reduces complexity from O(W × P) to O(P × log(W)) for exact properties. 
+ let candidates = self + .capability_index + .find_matching_workers(platform_properties, full_worker_logging); + + if candidates.is_empty() { + if full_worker_logging { + info!("No workers in capability index match required properties"); + } + return None; + } + + // Check function for availability AND dynamic Minimum property verification. + // The index only does presence checks for Minimum properties since their + // values change dynamically as jobs are assigned to workers. + let worker_matches = |(worker_id, w): &(&WorkerId, &Worker)| -> bool { if !w.can_accept_work() { - info!( - "Worker {worker_id} cannot accept work because is_paused: {}, is_draining: {}", - w.is_paused, w.is_draining - ); + if full_worker_logging { + info!( + "Worker {worker_id} cannot accept work: is_paused={}, is_draining={}, inflight={}/{}", + w.is_paused, + w.is_draining, + w.running_action_infos.len(), + w.max_inflight_tasks + ); + } return false; } - if !platform_properties.is_satisfied_by(&w.platform_properties) { - info!("Worker {worker_id} properties are insufficient"); + + // Verify Minimum properties at runtime (their values are dynamic) + if !platform_properties.is_satisfied_by(&w.platform_properties, full_worker_logging) { return false; } - return true; - } - #[cfg(not(feature = "worker_find_logging"))] - { - w.can_accept_work() && platform_properties.is_satisfied_by(&w.platform_properties) + + true + }; + + // Now check constraints on filtered candidates. + // Iterate in LRU order based on allocation strategy. + let workers_iter = self.workers.iter(); + + let worker_id = match self.allocation_strategy { + // Use rfind to get the least recently used that satisfies the properties. + WorkerAllocationStrategy::LeastRecentlyUsed => workers_iter + .rev() + .filter(|(worker_id, _)| candidates.contains(worker_id)) + .find(&worker_matches) + .map(|(_, w)| w.id.clone()), + + // Use find to get the most recently used that satisfies the properties. 
+ WorkerAllocationStrategy::MostRecentlyUsed => workers_iter + .filter(|(worker_id, _)| candidates.contains(worker_id)) + .find(&worker_matches) + .map(|(_, w)| w.id.clone()), + }; + if full_worker_logging && worker_id.is_none() { + warn!("No workers matched!"); } + worker_id } - fn inner_find_worker_for_action( + /// Batch finds workers for multiple actions in a single pass. + /// This reduces lock contention by acquiring the lock once for all actions. + /// Returns a map of (action_index, worker_id) pairs for successful matches. + fn inner_batch_find_workers_for_actions( &self, - platform_properties: &PlatformProperties, - ) -> Option { - let mut workers_iter = self.workers.iter(); - let workers_iter = - match self.allocation_strategy { - // Use rfind to get the least recently used that satisfies the properties. - WorkerAllocationStrategy::LeastRecentlyUsed => workers_iter - .rfind(|worker| Self::inner_worker_checker(worker, platform_properties)), - // Use find to get the most recently used that satisfies the properties. 
- WorkerAllocationStrategy::MostRecentlyUsed => workers_iter - .find(|worker| Self::inner_worker_checker(worker, platform_properties)), - }; - workers_iter.map(|(_, w)| w.id.clone()) + actions: &[&PlatformProperties], + full_worker_logging: bool, + ) -> HashMap { + let mut results = HashMap::with_capacity(actions.len()); + let mut workers_platform_properties = HashMap::new(); + + for (idx, platform_properties) in actions.iter().enumerate() { + let candidates = self + .capability_index + .find_matching_workers(platform_properties, full_worker_logging); + if candidates.is_empty() { + continue; + } + + for worker_id in candidates { + if let Some(worker) = self.workers.peek(&worker_id) { + if !worker.can_accept_work() { + continue; + } + + if !workers_platform_properties.contains_key(&worker_id) { + workers_platform_properties.insert(worker_id.clone(), worker.platform_properties.clone()); + } + + if !platform_properties.is_satisfied_by(&workers_platform_properties[&worker_id], full_worker_logging) { + continue; + } + + reduce_platform_properties(workers_platform_properties.get_mut(&worker_id).unwrap(), platform_properties); + + results.insert(idx, worker_id.clone()); + break; + } + } + } + + results } async fn update_action( @@ -259,6 +456,12 @@ impl ApiWorkerSchedulerImpl { (true, err.code == Code::ResourceExhausted) } UpdateOperationType::UpdateWithDisconnect => (true, false), + UpdateOperationType::ExecutionComplete => { + // No update here, just restoring platform properties. + worker.execution_complete(operation_id); + self.worker_change_notify.notify_one(); + return Ok(()); + } }; // Update the operation in the worker state manager. 
@@ -270,7 +473,7 @@ impl ApiWorkerSchedulerImpl { .err_tip(|| "in update_operation on SimpleScheduler::update_action"); if let Err(err) = update_operation_res { error!( - ?operation_id, + %operation_id, ?worker_id, ?err, "Failed to update_operation on update_action" @@ -285,13 +488,10 @@ impl ApiWorkerSchedulerImpl { // Clear this action from the current worker if finished. let complete_action_res = { - let was_paused = !worker.can_accept_work(); - // Note: We need to run this before dealing with backpressure logic. let complete_action_res = worker.complete_action(operation_id).await; - // Only pause if there's an action still waiting that will unpause. - if (was_paused || due_to_backpressure) && worker.has_actions() { + if (due_to_backpressure || !worker.can_accept_work()) && worker.has_actions() { worker.is_paused = true; } complete_action_res @@ -344,7 +544,7 @@ impl ApiWorkerSchedulerImpl { } else { warn!( ?worker_id, - ?operation_id, + %operation_id, ?action_info, "Worker not found in worker map in worker_notify_run_action" ); @@ -359,6 +559,76 @@ impl ApiWorkerSchedulerImpl { } } + /// Batch notifies multiple workers to run actions in a single lock hold. + /// Returns a vector of results for each notification attempt. 
+ async fn inner_batch_worker_notify_run_action( + &mut self, + assignments: Vec<(WorkerId, OperationId, ActionInfoWithProps)>, + ) -> Vec> { + let mut results = Vec::with_capacity(assignments.len()); + let mut workers_to_evict: Vec<(WorkerId, Error, bool)> = Vec::new(); + + for (worker_id, operation_id, action_info) in assignments { + if let Some(worker) = self.workers.get_mut(&worker_id) { + let notify_worker_result = worker + .notify_update(WorkerUpdate::RunAction(( + operation_id.clone(), + action_info.clone(), + ))) + .await; + + if let Err(notify_err) = notify_worker_result { + warn!( + ?worker_id, + ?action_info, + ?notify_err, + "Worker command failed in batch notify, will remove worker", + ); + + let is_disconnect = notify_err.code == Code::Internal + && notify_err.messages.len() == 1 + && notify_err.messages[0] == "Worker Disconnected"; + + let err = make_err!( + Code::Internal, + "Worker command failed, removing worker {worker_id} -- {notify_err:?}", + ); + + workers_to_evict.push((worker_id.clone(), err.clone(), is_disconnect)); + results.push(Err(err)); + } else { + results.push(Ok(())); + } + } else { + warn!( + ?worker_id, + %operation_id, + ?action_info, + "Worker not found in worker map in batch_worker_notify_run_action" + ); + // Queue the operation to be put back to queued state + let update_result = self + .worker_state_manager + .update_operation( + &operation_id, + &worker_id, + UpdateOperationType::UpdateWithDisconnect, + ) + .await; + results.push(update_result); + } + } + + // Evict failed workers after processing all notifications + for (worker_id, err, is_disconnect) in workers_to_evict { + let _ = self + .immediate_evict_worker(&worker_id, err, is_disconnect) + .await; + } + + results + } + /// Evicts the worker from the pool and puts items back into the queue if anything was being executed on it. 
async fn immediate_evict_worker( &mut self, @@ -388,12 +658,16 @@ impl ApiWorkerSchedulerImpl { self.worker_change_notify.notify_one(); result } + + fn count_running_actions(&self) -> usize { + self.workers.iter().map(|(_, w)| w.running_action_infos.len()).sum() + } } #[derive(Debug, MetricsComponent)] pub struct ApiWorkerScheduler { #[metric] - inner: Mutex<ApiWorkerSchedulerImpl>, + inner: RwLock<ApiWorkerSchedulerImpl>, #[metric(group = "platform_property_manager")] platform_property_manager: Arc<PlatformPropertyManager>, @@ -401,7 +675,14 @@ pub struct ApiWorkerScheduler { help = "Timeout of how long to evict workers if no response in this given amount of time in seconds." )] worker_timeout_s: u64, - _operation_keep_alive_spawn: JoinHandleDropGuard<()>, + /// Shared worker registry for checking worker liveness. + worker_registry: SharedWorkerRegistry, + + /// Performance metrics for observability. + metrics: Arc<SchedulerMetrics>, + + /// OTEL metrics for tracking worker pool state. + worker_scheduler_metrics: WorkerSchedulerMetrics, } impl ApiWorkerScheduler { @@ -411,62 +692,100 @@ impl ApiWorkerScheduler { allocation_strategy: WorkerAllocationStrategy, worker_change_notify: Arc<Notify>, worker_timeout_s: u64, + worker_registry: SharedWorkerRegistry, + instance_name: impl Into<String>, ) -> Arc<Self> { - let (operation_keep_alive_tx, mut operation_keep_alive_rx) = mpsc::unbounded_channel(); Arc::new(Self { - inner: Mutex::new(ApiWorkerSchedulerImpl { + inner: RwLock::new(ApiWorkerSchedulerImpl { workers: Workers(LruCache::unbounded()), - worker_state_manager: worker_state_manager.clone(), + worker_state_manager, allocation_strategy, worker_change_notify, - operation_keep_alive_tx, + worker_registry: worker_registry.clone(), + shutting_down: false, + capability_index: WorkerCapabilityIndex::new(), }), platform_property_manager, worker_timeout_s, - _operation_keep_alive_spawn: spawn!( - "simple_scheduler_operation_keep_alive", - async move { - const RECV_MANY_LIMIT: usize = 256; - let mut messages = Vec::with_capacity(RECV_MANY_LIMIT); - loop { - messages.clear();
operation_keep_alive_rx - .recv_many(&mut messages, RECV_MANY_LIMIT) - .await; - if messages.is_empty() { - return; // Looks like our sender has been dropped. - } - for (operation_id, worker_id) in messages.drain(..) { - let update_operation_res = worker_state_manager - .update_operation( - &operation_id, - &worker_id, - UpdateOperationType::KeepAlive, - ) - .await; - if let Err(err) = update_operation_res { - warn!( - ?err, - "Error while running worker_keep_alive_received, maybe job is done?" - ); - } - } - } - } - ), + worker_registry, + metrics: Arc::new(SchedulerMetrics::default()), + worker_scheduler_metrics: WorkerSchedulerMetrics::new(instance_name), }) } + /// Returns a reference to the worker registry. + pub const fn worker_registry(&self) -> &SharedWorkerRegistry { + &self.worker_registry + } + + /// Returns a reference to the worker scheduler metrics for recording OTEL metrics. + #[must_use] + pub fn workerMetrics(&self) -> &WorkerSchedulerMetrics { + &self.worker_scheduler_metrics + } + pub async fn worker_notify_run_action( &self, worker_id: WorkerId, operation_id: OperationId, action_info: ActionInfoWithProps, ) -> Result<(), Error> { - let mut inner = self.inner.lock().await; - inner + self.metrics + .actions_dispatched + .fetch_add(1, Ordering::Relaxed); + let mut inner = self.inner.write().await; + let result = inner .worker_notify_run_action(worker_id, operation_id, action_info) - .await + .await; + + // Record metrics + if result.is_ok() { + self.worker_scheduler_metrics.record_action_dispatched(); + } else { + self.worker_scheduler_metrics.record_dispatch_failure(); + } + self.worker_scheduler_metrics.record_running_actions_count(inner.count_running_actions()); + + result + } + + /// Batch notifies multiple workers to run actions in a single lock acquisition. + /// This reduces lock contention compared to calling `worker_notify_run_action` + /// for each action individually. 
+ /// + /// Returns a vector of results corresponding to each assignment in the input. + pub async fn batch_worker_notify_run_action( + &self, + assignments: Vec<(WorkerId, OperationId, ActionInfoWithProps)>, + ) -> Vec<Result<(), Error>> { + let count = assignments.len(); + self.metrics + .actions_dispatched + .fetch_add(count as u64, Ordering::Relaxed); + + let mut inner = self.inner.write().await; + let results = inner.inner_batch_worker_notify_run_action(assignments).await; + + // Record metrics + let successes = results.iter().filter(|r| r.is_ok()).count(); + let failures = count - successes; + + for _ in 0..successes { + self.worker_scheduler_metrics.record_action_dispatched(); + } + for _ in 0..failures { + self.worker_scheduler_metrics.record_dispatch_failure(); + } + self.worker_scheduler_metrics + .record_running_actions_count(inner.count_running_actions()); + + results + } + + /// Returns the scheduler metrics for observability. + #[must_use] + pub const fn get_metrics(&self) -> &Arc<SchedulerMetrics> { + &self.metrics } /// Attempts to find a worker that is capable of running this action.
@@ -476,15 +795,86 @@ impl ApiWorkerScheduler { pub async fn find_worker_for_action( &self, platform_properties: &PlatformProperties, + full_worker_logging: bool, ) -> Option<WorkerId> { - let inner = self.inner.lock().await; - inner.inner_find_worker_for_action(platform_properties) + let start = Instant::now(); + self.metrics + .find_worker_calls + .fetch_add(1, Ordering::Relaxed); + + let inner = self.inner.read().await; + let worker_count = inner.workers.len() as u64; + let result = inner.inner_find_worker_for_action(platform_properties, full_worker_logging); + + // Track workers iterated (worst case is all workers) + self.metrics + .workers_iterated + .fetch_add(worker_count, Ordering::Relaxed); + + if result.is_some() { + self.metrics + .find_worker_hits + .fetch_add(1, Ordering::Relaxed); + } else { + self.metrics + .find_worker_misses + .fetch_add(1, Ordering::Relaxed); + } + + #[allow(clippy::cast_possible_truncation)] + self.metrics + .find_worker_time_ns + .fetch_add(start.elapsed().as_nanos() as u64, Ordering::Relaxed); + result + } + + /// Batch finds workers for multiple actions in a single lock acquisition. + /// This reduces lock contention compared to calling `find_worker_for_action` + /// for each action individually. + /// + /// Returns a vector of (action_index, worker_id) pairs for successful matches. + /// Actions that couldn't be matched to a worker are not included in the result.
+ pub async fn batch_find_workers_for_actions( + &self, + actions: &[&PlatformProperties], + full_worker_logging: bool, + ) -> HashMap<usize, WorkerId> { + let start = Instant::now(); + self.metrics + .find_worker_calls + .fetch_add(actions.len() as u64, Ordering::Relaxed); + + let inner = self.inner.read().await; + let worker_count = inner.workers.len() as u64; + let results = + inner.inner_batch_find_workers_for_actions(actions, full_worker_logging); + + // Track metrics + self.metrics + .workers_iterated + .fetch_add(worker_count * actions.len() as u64, Ordering::Relaxed); + + let hits = results.len() as u64; + let misses = actions.len() as u64 - hits; + self.metrics + .find_worker_hits + .fetch_add(hits, Ordering::Relaxed); + self.metrics + .find_worker_misses + .fetch_add(misses, Ordering::Relaxed); + + #[allow(clippy::cast_possible_truncation)] + self.metrics + .find_worker_time_ns + .fetch_add(start.elapsed().as_nanos() as u64, Ordering::Relaxed); + + results } /// Checks to see if the worker exists in the worker pool. Should only be used in unit tests.
#[must_use] pub async fn contains_worker_for_test(&self, worker_id: &WorkerId) -> bool { - let inner = self.inner.lock().await; + let inner = self.inner.read().await; inner.workers.contains(worker_id) } @@ -493,12 +883,17 @@ impl ApiWorkerScheduler { &self, worker_id: &WorkerId, ) -> Result<(), Error> { - let mut inner = self.inner.lock().await; + let mut inner = self.inner.write().await; let worker = inner.workers.get_mut(worker_id).ok_or_else(|| { make_input_err!("WorkerId '{}' does not exist in workers map", worker_id) })?; worker.keep_alive() } + + pub async fn get_workers_state(&self) -> Vec { + let inner = self.inner.read().await; + inner.workers.iter().map(|(_, w)| w.to_state()).collect() + } } #[async_trait] @@ -508,15 +903,34 @@ impl WorkerScheduler for ApiWorkerScheduler { } async fn add_worker(&self, worker: Worker) -> Result<(), Error> { - let mut inner = self.inner.lock().await; let worker_id = worker.id.clone(); + let worker_timestamp = worker.last_update_timestamp; + let mut inner = self.inner.write().await; + if inner.shutting_down { + warn!("Rejected worker add during shutdown: {}", worker_id); + return Err(make_err!( + Code::Unavailable, + "Received request to add worker while shutting down" + )); + } let result = inner .add_worker(worker) .err_tip(|| "Error while adding worker, removing from pool"); - if let Err(err) = result { - return Result::<(), _>::Err(err.clone()) - .merge(inner.immediate_evict_worker(&worker_id, err, false).await); + if let Err(err) = &result { + self.worker_scheduler_metrics.record_worker_connection_failed(); + return Result::<(), _>::Err(err.clone()).merge( + inner + .immediate_evict_worker(&worker_id, err.clone(), false) + .await, + ); } + + let now = UNIX_EPOCH + Duration::from_secs(worker_timestamp); + self.worker_registry.register_worker(&worker_id, now).await; + + self.metrics.workers_added.fetch_add(1, Ordering::Relaxed); + self.worker_scheduler_metrics.record_worker_added(); + 
self.worker_scheduler_metrics.record_worker_count(inner.workers.len()); Ok(()) } @@ -526,8 +940,24 @@ impl WorkerScheduler for ApiWorkerScheduler { operation_id: &OperationId, update: UpdateOperationType, ) -> Result<(), Error> { - let mut inner = self.inner.lock().await; - inner.update_action(worker_id, operation_id, update).await + let is_completion = matches!( + update, + UpdateOperationType::UpdateWithActionStage(ref stage) if stage.is_finished() + ) || matches!( + update, + UpdateOperationType::UpdateWithError(_) | UpdateOperationType::UpdateWithDisconnect + ); + + let mut inner = self.inner.write().await; + let result = inner.update_action(worker_id, operation_id, update).await; + + // Record action completion metric + if result.is_ok() && is_completion { + self.worker_scheduler_metrics.record_action_completed(); + } + self.worker_scheduler_metrics.record_running_actions_count(inner.count_running_actions()); + + result } async fn worker_keep_alive_received( @@ -535,25 +965,40 @@ impl WorkerScheduler for ApiWorkerScheduler { worker_id: &WorkerId, timestamp: WorkerTimestamp, ) -> Result<(), Error> { - let mut inner = self.inner.lock().await; - inner - .refresh_lifetime(worker_id, timestamp) - .err_tip(|| "Error refreshing lifetime in worker_keep_alive_received()") + { + let mut inner = self.inner.write().await; + inner + .refresh_lifetime(worker_id, timestamp) + .err_tip(|| "Error refreshing lifetime in worker_keep_alive_received()")?; + } + let now = UNIX_EPOCH + Duration::from_secs(timestamp); + self.worker_registry + .update_worker_heartbeat(worker_id, now) + .await; + Ok(()) } async fn remove_worker(&self, worker_id: &WorkerId) -> Result<(), Error> { - let mut inner = self.inner.lock().await; - inner + self.worker_registry.remove_worker(worker_id).await; + + let mut inner = self.inner.write().await; + let result = inner .immediate_evict_worker( worker_id, make_err!(Code::Internal, "Received request to remove worker"), false, ) - .await + .await; + + // 
Record worker removal + self.worker_scheduler_metrics.record_worker_removed(); + self.worker_scheduler_metrics.record_worker_count(inner.workers.len()); + result } async fn shutdown(&self, shutdown_guard: ShutdownGuard) { - let mut inner = self.inner.lock().await; + let mut inner = self.inner.write().await; + inner.shutting_down = true; // should reject further worker registration while let Some(worker_id) = inner .workers .peek_lru() @@ -574,23 +1019,56 @@ impl WorkerScheduler for ApiWorkerScheduler { } async fn remove_timedout_workers(&self, now_timestamp: WorkerTimestamp) -> Result<(), Error> { - let mut inner = self.inner.lock().await; + // Check worker liveness using both the local timestamp (from LRU) + // and the worker registry. A worker is alive if either source says it's alive. + let timeout = Duration::from_secs(self.worker_timeout_s); + let now = UNIX_EPOCH + Duration::from_secs(now_timestamp); + let timeout_threshold = now_timestamp.saturating_sub(self.worker_timeout_s); + + // Phase 1: Read-only collection of workers to check + let workers_to_check: Vec<(WorkerId, bool)> = { + let inner = self.inner.read().await; + inner + .workers + .iter() + .map(|(worker_id, worker)| { + let local_alive = worker.last_update_timestamp > timeout_threshold; + (worker_id.clone(), local_alive) + }) + .collect() + }; + + let mut worker_ids_to_remove = Vec::new(); + for (worker_id, local_alive) in workers_to_check { + if local_alive { + continue; + } + + let registry_alive = self + .worker_registry + .is_worker_alive(&worker_id, timeout, now) + .await; + + if !registry_alive { + trace!( + ?worker_id, + local_alive, + registry_alive, + timeout_threshold, + "Worker timed out - neither local nor registry shows alive" + ); + worker_ids_to_remove.push(worker_id); + } + } + + if worker_ids_to_remove.is_empty() { + return Ok(()); + } + // Phase 2: Write lock to remove timed out workers + let mut inner = self.inner.write().await; let mut result = Ok(()); - // Items should be 
sorted based on last_update_timestamp, so we don't need to iterate the entire - // map most of the time. - let worker_ids_to_remove: Vec = inner - .workers - .iter() - .rev() - .map_while(|(worker_id, worker)| { - if worker.last_update_timestamp <= now_timestamp - self.worker_timeout_s { - Some(worker_id.clone()) - } else { - None - } - }) - .collect(); + for worker_id in &worker_ids_to_remove { warn!(?worker_id, "Worker timed out, removing from pool"); result = result.merge( @@ -605,14 +1083,20 @@ impl WorkerScheduler for ApiWorkerScheduler { ) .await, ); + self.worker_scheduler_metrics.record_worker_timeout(); } + self.worker_scheduler_metrics.record_running_actions_count(inner.count_running_actions()); + self.worker_scheduler_metrics.record_worker_count(inner.workers.len()); + result } async fn set_drain_worker(&self, worker_id: &WorkerId, is_draining: bool) -> Result<(), Error> { - let mut inner = self.inner.lock().await; - inner.set_drain_worker(worker_id, is_draining).await + let mut inner = self.inner.write().await; + inner.set_drain_worker(worker_id, is_draining).await?; + self.worker_scheduler_metrics.record_worker_count(inner.workers.len()); + Ok(()) } } diff --git a/nativelink-scheduler/src/awaited_action_db/awaited_action.rs b/nativelink-scheduler/src/awaited_action_db/awaited_action.rs index 232986d46..337c354e0 100644 --- a/nativelink-scheduler/src/awaited_action_db/awaited_action.rs +++ b/nativelink-scheduler/src/awaited_action_db/awaited_action.rs @@ -107,6 +107,7 @@ impl AwaitedAction { // client_operation_id to all clients. client_operation_id: operation_id.clone(), action_digest: action_info.unique_qualifier.digest(), + last_transition_timestamp: now, }); let ctx = Context::current(); @@ -239,29 +240,36 @@ impl MetricsComponent for AwaitedActionSortKey { } impl AwaitedActionSortKey { - #[rustfmt::skip] const fn new(priority: i32, insert_timestamp: u32) -> Self { - // Shift `new_priority` so [`i32::MIN`] is represented by zero. 
- // This makes it so any negative values are positive, but - // maintains ordering. - const MIN_I32: i64 = (i32::MIN as i64).abs(); - let priority = ((priority as i64 + MIN_I32) as u32).to_be_bytes(); + // Shift the signed i32 range [i32::MIN, i32::MAX] to the unsigned u32 range + // [0, u32::MAX] to preserve ordering when we convert to bytes for sorting. + let priority_u32 = i32::MIN.unsigned_abs().wrapping_add_signed(priority); + let priority = priority_u32.to_be_bytes(); // Invert our timestamp so the larger the timestamp the lower the number. // This makes timestamp descending order instead of ascending. let timestamp = (insert_timestamp ^ u32::MAX).to_be_bytes(); Self(u64::from_be_bytes([ - priority[0], priority[1], priority[2], priority[3], - timestamp[0], timestamp[1], timestamp[2], timestamp[3], + priority[0], + priority[1], + priority[2], + priority[3], + timestamp[0], + timestamp[1], + timestamp[2], + timestamp[3], ])) } fn new_with_unique_key(priority: i32, insert_timestamp: &SystemTime) -> Self { - let timestamp = insert_timestamp - .duration_since(UNIX_EPOCH) - .unwrap() - .as_secs() as u32; + let timestamp = u32::try_from( + insert_timestamp + .duration_since(UNIX_EPOCH) + .unwrap() + .as_secs(), + ) + .unwrap_or(u32::MAX); Self::new(priority, timestamp) } diff --git a/nativelink-scheduler/src/awaited_action_db/mod.rs b/nativelink-scheduler/src/awaited_action_db/mod.rs index 315f13e67..4433c07a1 100644 --- a/nativelink-scheduler/src/awaited_action_db/mod.rs +++ b/nativelink-scheduler/src/awaited_action_db/mod.rs @@ -15,6 +15,7 @@ use core::cmp; use core::ops::Bound; use core::time::Duration; +use std::collections::HashMap; use std::sync::Arc; pub use awaited_action::{AwaitedAction, AwaitedActionSortKey}; @@ -51,6 +52,17 @@ impl TryFrom<&ActionStage> for SortedAwaitedActionState { } } +impl TryFrom<&CountableActionStage> for SortedAwaitedActionState { + type Error = Error; + fn try_from(value: &CountableActionStage) -> Result { + match value { + 
CountableActionStage::Queued => Ok(Self::Queued), + CountableActionStage::Executing => Ok(Self::Executing), + CountableActionStage::Completed => Ok(Self::Completed), + } + } +} + impl TryFrom for SortedAwaitedActionState { type Error = Error; fn try_from(value: ActionStage) -> Result { @@ -140,6 +152,13 @@ pub trait AwaitedActionSubscriber: Send + Sync + Sized + 'static { fn borrow(&self) -> impl Future> + Send; } +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum CountableActionStage { + Queued, + Executing, + Completed, +} + /// A trait that defines the interface for an `AwaitedActionDb`. pub trait AwaitedActionDb: Send + Sync + MetricsComponent + Unpin + 'static { type Subscriber: AwaitedActionSubscriber; @@ -174,6 +193,15 @@ pub trait AwaitedActionDb: Send + Sync + MetricsComponent + Unpin + 'static { Output = Result> + Send, Error>, > + Send; + fn get_queued_actions( + &self, + ) -> impl Future>, Error>> + Send; + + fn count_actions( + &self, + states: Vec, + ) -> impl Future, Error>> + Send; + /// Process a change changed `AwaitedAction` and notify any listeners. fn update_awaited_action( &self, diff --git a/nativelink-scheduler/src/cache_lookup_scheduler.rs b/nativelink-scheduler/src/cache_lookup_scheduler.rs index cb76b4658..c11321771 100644 --- a/nativelink-scheduler/src/cache_lookup_scheduler.rs +++ b/nativelink-scheduler/src/cache_lookup_scheduler.rs @@ -14,6 +14,7 @@ use std::collections::HashMap; use std::sync::Arc; +use std::time::SystemTime; use async_trait::async_trait; use nativelink_error::{Code, Error, ResultExt, make_err}; @@ -64,7 +65,7 @@ pub struct CacheLookupScheduler { /// The "real" scheduler to use to perform actions if they were not found /// in the action cache. #[metric(group = "action_scheduler")] - action_scheduler: Arc, + pub action_scheduler: Arc, /// Actions that are currently performing a `CacheCheck`. 
inflight_cache_checks: Arc>, } @@ -267,6 +268,7 @@ impl CacheLookupScheduler { client_operation_id: OperationId::default(), stage: ActionStage::CompletedFromCache(action_result), action_digest: action_info.unique_qualifier.digest(), + last_transition_timestamp: SystemTime::now(), }; let ctx = Context::current(); @@ -379,6 +381,10 @@ impl ClientStateManager for CacheLookupScheduler { fn as_known_platform_property_provider(&self) -> Option<&dyn KnownPlatformPropertyProvider> { self.action_scheduler.as_known_platform_property_provider() } + + fn as_any(&self) -> &dyn std::any::Any { + self + } } impl RootMetricsComponent for CacheLookupScheduler {} diff --git a/nativelink-scheduler/src/default_scheduler_factory.rs b/nativelink-scheduler/src/default_scheduler_factory.rs index a9a9072fd..2228bf9ee 100644 --- a/nativelink-scheduler/src/default_scheduler_factory.rs +++ b/nativelink-scheduler/src/default_scheduler_factory.rs @@ -21,16 +21,18 @@ use nativelink_config::schedulers::{ use nativelink_config::stores::EvictionPolicy; use nativelink_error::{Error, ResultExt, make_input_err}; use nativelink_proto::com::github::trace_machina::nativelink::events::OriginEvent; -use nativelink_store::redis_store::RedisStore; +use nativelink_store::redis_store::{RedisStore, StandardRedisManager}; use nativelink_store::store_manager::StoreManager; use nativelink_util::instant_wrapper::InstantWrapper; use nativelink_util::operation_state_manager::ClientStateManager; +use redis::aio::ConnectionManager; use tokio::sync::{Notify, mpsc}; use crate::cache_lookup_scheduler::CacheLookupScheduler; use crate::grpc_scheduler::GrpcScheduler; use crate::memory_awaited_action_db::MemoryAwaitedActionDb; use crate::property_modifier_scheduler::PropertyModifierScheduler; +use crate::property_router_scheduler::PropertyRouterScheduler; use crate::simple_scheduler::SimpleScheduler; use crate::store_awaited_action_db::StoreAwaitedActionDb; use crate::worker_scheduler::WorkerScheduler; @@ -44,31 +46,36 @@ pub 
type SchedulerFactoryResults = ( Option>, ); -pub fn scheduler_factory( +pub async fn scheduler_factory( spec: &SchedulerSpec, store_manager: &StoreManager, maybe_origin_event_tx: Option<&mpsc::Sender>, ) -> Result { - inner_scheduler_factory(spec, store_manager, maybe_origin_event_tx) + inner_scheduler_factory(spec, store_manager, maybe_origin_event_tx).await } -fn inner_scheduler_factory( +async fn inner_scheduler_factory( spec: &SchedulerSpec, store_manager: &StoreManager, maybe_origin_event_tx: Option<&mpsc::Sender>, ) -> Result { let scheduler: SchedulerFactoryResults = match spec { SchedulerSpec::Simple(spec) => { - simple_scheduler_factory(spec, store_manager, SystemTime::now, maybe_origin_event_tx)? + simple_scheduler_factory(spec, store_manager, SystemTime::now, maybe_origin_event_tx) + .await? } SchedulerSpec::Grpc(spec) => (Some(Arc::new(GrpcScheduler::new(spec)?)), None), SchedulerSpec::CacheLookup(spec) => { let ac_store = store_manager .get_store(&spec.ac_store) .err_tip(|| format!("'ac_store': '{}' does not exist", spec.ac_store))?; - let (action_scheduler, worker_scheduler) = - inner_scheduler_factory(&spec.scheduler, store_manager, maybe_origin_event_tx) - .err_tip(|| "In nested CacheLookupScheduler construction")?; + let (action_scheduler, worker_scheduler) = Box::pin(inner_scheduler_factory( + &spec.scheduler, + store_manager, + maybe_origin_event_tx, + )) + .await + .err_tip(|| "In nested CacheLookupScheduler construction")?; let cache_lookup_scheduler = Arc::new(CacheLookupScheduler::new( ac_store, action_scheduler.err_tip(|| "Nested scheduler is not an action scheduler")?, @@ -76,21 +83,57 @@ fn inner_scheduler_factory( (Some(cache_lookup_scheduler), worker_scheduler) } SchedulerSpec::PropertyModifier(spec) => { - let (action_scheduler, worker_scheduler) = - inner_scheduler_factory(&spec.scheduler, store_manager, maybe_origin_event_tx) - .err_tip(|| "In nested PropertyModifierScheduler construction")?; + let (action_scheduler, 
worker_scheduler) = Box::pin(inner_scheduler_factory( + &spec.scheduler, + store_manager, + maybe_origin_event_tx, + )) + .await + .err_tip(|| "In nested PropertyModifierScheduler construction")?; let property_modifier_scheduler = Arc::new(PropertyModifierScheduler::new( spec, action_scheduler.err_tip(|| "Nested scheduler is not an action scheduler")?, )); (Some(property_modifier_scheduler), worker_scheduler) } + SchedulerSpec::PropertyRouter(spec) => { + use std::collections::HashMap; + let mut routes = HashMap::with_capacity(spec.routes.len()); + for (value, nested_spec) in &spec.routes { + let (action_scheduler, _) = Box::pin(inner_scheduler_factory( + nested_spec, + store_manager, + maybe_origin_event_tx, + )) + .await + .err_tip(|| format!("In nested PropertyRouterScheduler route '{value}'"))?; + routes.insert( + value.clone(), + action_scheduler + .err_tip(|| format!("Nested route '{value}' is not an action scheduler"))?, + ); + } + let (default_action_scheduler, _) = Box::pin(inner_scheduler_factory( + &spec.default_scheduler, + store_manager, + maybe_origin_event_tx, + )) + .await + .err_tip(|| "In PropertyRouterScheduler default_scheduler")?; + let router = Arc::new(PropertyRouterScheduler::new( + &spec.property_name, + routes, + default_action_scheduler + .err_tip(|| "Default scheduler is not an action scheduler")?, + )); + (Some(router), None) + } }; Ok(scheduler) } -fn simple_scheduler_factory( +async fn simple_scheduler_factory( spec: &SimpleSpec, store_manager: &StoreManager, now_fn: fn() -> SystemTime, @@ -129,7 +172,8 @@ fn simple_scheduler_factory( let store = store .into_inner() .as_any_arc() - .downcast::() + .downcast::>>( + ) .map_err(|_| { make_input_err!( "Could not downcast to redis store in RedisAwaitedActionDb::new" @@ -141,6 +185,7 @@ fn simple_scheduler_factory( now_fn, Default::default, ) + .await .err_tip(|| "In state_manager_factory::redis_state_manager")?; let (action_scheduler, worker_scheduler) = SimpleScheduler::new( spec, diff 
--git a/nativelink-scheduler/src/grpc_scheduler.rs b/nativelink-scheduler/src/grpc_scheduler.rs index 13b0d6b79..f4de0b0d3 100644 --- a/nativelink-scheduler/src/grpc_scheduler.rs +++ b/nativelink-scheduler/src/grpc_scheduler.rs @@ -216,7 +216,7 @@ impl GrpcScheduler { // Not in the cache, lookup the capabilities with the upstream. let channel = self .connection_manager - .connection() + .connection("get_known_properties".into()) .await .err_tip(|| "in get_platform_property_manager()")?; let capabilities_result = CapabilitiesClient::new(channel) @@ -274,7 +274,7 @@ impl GrpcScheduler { .perform_request(request, |request| async move { let channel = self .connection_manager - .connection() + .connection(format!("add_action: {:?}", request.action_digest)) .await .err_tip(|| "in add_action()")?; ExecutionClient::new(channel) @@ -309,7 +309,7 @@ impl GrpcScheduler { .perform_request(request, |request| async move { let channel = self .connection_manager - .connection() + .connection(format!("filter_operations: {}", request.name)) .await .err_tip(|| "in find_by_client_operation_id()")?; ExecutionClient::new(channel) @@ -354,6 +354,10 @@ impl ClientStateManager for GrpcScheduler { fn as_known_platform_property_provider(&self) -> Option<&dyn KnownPlatformPropertyProvider> { Some(self) } + + fn as_any(&self) -> &dyn std::any::Any { + self + } } #[async_trait] diff --git a/nativelink-scheduler/src/lib.rs b/nativelink-scheduler/src/lib.rs index db7e7cdab..cc11ffe27 100644 --- a/nativelink-scheduler/src/lib.rs +++ b/nativelink-scheduler/src/lib.rs @@ -21,8 +21,11 @@ pub mod memory_awaited_action_db; pub mod mock_scheduler; pub mod platform_property_manager; pub mod property_modifier_scheduler; +pub mod property_router_scheduler; pub mod simple_scheduler; -mod simple_scheduler_state_manager; +pub mod simple_scheduler_state_manager; pub mod store_awaited_action_db; pub mod worker; +pub mod worker_capability_index; +pub mod worker_registry; pub mod worker_scheduler; diff --git 
a/nativelink-scheduler/src/memory_awaited_action_db.rs b/nativelink-scheduler/src/memory_awaited_action_db.rs index 7fbcef567..faae5f8e5 100644 --- a/nativelink-scheduler/src/memory_awaited_action_db.rs +++ b/nativelink-scheduler/src/memory_awaited_action_db.rs @@ -18,7 +18,7 @@ use std::collections::hash_map::Entry; use std::collections::{BTreeMap, BTreeSet, HashMap}; use std::sync::Arc; -use async_lock::Mutex; +use async_lock::RwLock; use futures::{FutureExt, Stream}; use nativelink_config::stores::EvictionPolicy; use nativelink_error::{Code, Error, ResultExt, error_if, make_err}; @@ -29,6 +29,9 @@ use nativelink_util::action_messages::{ use nativelink_util::chunked_stream::ChunkedStream; use nativelink_util::evicting_map::{EvictingMap, LenEntry}; use nativelink_util::instant_wrapper::InstantWrapper; +use nativelink_util::metrics::{ + EXECUTION_METRICS, ExecutionResult, ExecutionStage, make_execution_attributes, +}; use nativelink_util::spawn; use nativelink_util::task::JoinHandleDropGuard; use tokio::sync::{Notify, mpsc, watch}; @@ -36,7 +39,7 @@ use tracing::{debug, error}; use crate::awaited_action_db::{ AwaitedAction, AwaitedActionDb, AwaitedActionSubscriber, CLIENT_KEEPALIVE_DURATION, - SortedAwaitedAction, SortedAwaitedActionState, + CountableActionStage, SortedAwaitedAction, SortedAwaitedActionState, }; /// Number of events to process per cycle. @@ -246,6 +249,17 @@ impl SortedAwaitedActions { } } + const fn btree_for_countable_stage( + &mut self, + stage: &CountableActionStage, + ) -> &mut BTreeSet { + match stage { + CountableActionStage::Queued => &mut self.queued, + CountableActionStage::Executing => &mut self.executing, + CountableActionStage::Completed => &mut self.completed, + } + } + fn insert_sort_map_for_stage( &mut self, stage: &ActionStage, @@ -375,7 +389,7 @@ impl I + Clone + Send + Sync> AwaitedActionDbI // Cleanup operation_id_to_awaited_action. 
let Some(tx) = self.operation_id_to_awaited_action.remove(&operation_id) else { error!( - ?operation_id, + %operation_id, "operation_id_to_awaited_action does not have operation_id" ); continue; @@ -392,7 +406,7 @@ impl I + Clone + Send + Sync> AwaitedActionDbI } Entry::Vacant(_) => { error!( - ?operation_id, + %operation_id, "connected_clients_for_operation_id does not have operation_id" ); 0 @@ -411,7 +425,7 @@ impl I + Clone + Send + Sync> AwaitedActionDbI .insert(operation_id, connected_clients); continue; } - debug!(?operation_id, "Clearing operation from state manager"); + debug!(%operation_id, "Clearing operation from state manager"); let awaited_action = tx.borrow().clone(); // Cleanup action_info_hash_key_to_awaited_action if it was marked cached. match &awaited_action.action_info().unique_qualifier { @@ -423,7 +437,7 @@ impl I + Clone + Send + Sync> AwaitedActionDbI && maybe_awaited_action.is_none() { error!( - ?operation_id, + %operation_id, ?awaited_action, ?action_key, "action_info_hash_key_to_awaited_action and operation_id_to_awaited_action are out of sync", @@ -448,7 +462,7 @@ impl I + Clone + Send + Sync> AwaitedActionDbI }); if maybe_sorted_awaited_action.is_none() { error!( - ?operation_id, + %operation_id, ?sort_key, "Expected maybe_sorted_awaited_action to have {sort_key:?}", ); @@ -579,6 +593,12 @@ impl I + Clone + Send + Sync> AwaitedActionDbI } } + fn count_actions(&mut self, stage: CountableActionStage) -> usize { + self.sorted_action_info_hash_keys + .btree_for_countable_stage(&stage) + .len() + } + fn update_awaited_action( &mut self, mut new_awaited_action: AwaitedAction, @@ -631,6 +651,56 @@ impl I + Clone + Send + Sync> AwaitedActionDbI .is_same_stage(&new_awaited_action.state().stage); if !is_same_stage { + // Record metrics for stage transitions + let metrics = &*EXECUTION_METRICS; + let old_stage = &old_awaited_action.state().stage; + let new_stage = &new_awaited_action.state().stage; + + // Track stage transitions + let base_attrs 
= make_execution_attributes( + "unknown", + Some(old_awaited_action.action_info().priority), + ); + metrics.execution_stage_transitions.add(1, &base_attrs); + + // Update active count for old stage + let old_stage_attrs = vec![opentelemetry::KeyValue::new( + nativelink_util::metrics::EXECUTION_STAGE, + ExecutionStage::from(old_stage), + )]; + metrics.execution_active_count.add(-1, &old_stage_attrs); + + // Update active count for new stage + let new_stage_attrs = vec![opentelemetry::KeyValue::new( + nativelink_util::metrics::EXECUTION_STAGE, + ExecutionStage::from(new_stage), + )]; + metrics.execution_active_count.add(1, &new_stage_attrs); + + // Record completion metrics with action digest for failure tracking + let action_digest = old_awaited_action.action_info().digest().to_string(); + if let ActionStage::Completed(action_result) = new_stage { + let result_attrs = vec![ + opentelemetry::KeyValue::new( + nativelink_util::metrics::EXECUTION_RESULT, + if action_result.exit_code == 0 { + ExecutionResult::Success + } else { + ExecutionResult::Failure + }, + ), + ]; + metrics.execution_completed_count.add(1, &result_attrs); + } else if let ActionStage::CompletedFromCache(_) = new_stage { + let result_attrs = vec![ + opentelemetry::KeyValue::new( + nativelink_util::metrics::EXECUTION_RESULT, + ExecutionResult::CacheHit, + ), + ]; + metrics.execution_completed_count.add(1, &result_attrs); + } + self.sorted_action_info_hash_keys .process_state_changes(&old_awaited_action, &new_awaited_action)?; Self::process_state_changes_for_hash_key_map( @@ -696,8 +766,11 @@ impl I + Clone + Send + Sync> AwaitedActionDbI ActionUniqueQualifier::Uncacheable(_unique_key) => None, }; let operation_id = OperationId::default(); - let awaited_action = - AwaitedAction::new(operation_id.clone(), action_info, (self.now_fn)().now()); + let awaited_action = AwaitedAction::new( + operation_id.clone(), + action_info.clone(), + (self.now_fn)().now(), + ); debug_assert!( ActionStage::Queued == 
awaited_action.state().stage, "Expected action to be queued" @@ -708,8 +781,8 @@ impl I + Clone + Send + Sync> AwaitedActionDbI self.make_client_awaited_action(&operation_id.clone(), awaited_action); debug!( - ?client_operation_id, - ?operation_id, + %client_operation_id, + %operation_id, ?client_awaited_action, "Adding action" ); @@ -725,13 +798,22 @@ impl I + Clone + Send + Sync> AwaitedActionDbI .insert(unique_key, operation_id.clone()); if let Some(old_value) = old_value { error!( - ?operation_id, + %operation_id, ?old_value, "action_info_hash_key_to_awaited_action already has unique_key" ); } } + // Record metric for new action entering the queue + let metrics = &*EXECUTION_METRICS; + let _base_attrs = make_execution_attributes("unknown", Some(action_info.priority)); + let queued_attrs = vec![opentelemetry::KeyValue::new( + nativelink_util::metrics::EXECUTION_STAGE, + ExecutionStage::Queued, + )]; + metrics.execution_active_count.add(1, &queued_attrs); + self.sorted_action_info_hash_keys .insert_sort_map_for_stage( &ActionStage::Queued, @@ -825,7 +907,7 @@ impl I + Clone + Send + Sync> AwaitedActionDbI #[derive(Debug, MetricsComponent)] pub struct MemoryAwaitedActionDb I> { #[metric] - inner: Arc>>, + inner: Arc>>, tasks_change_notify: Arc, _handle_awaited_action_events: JoinHandleDropGuard<()>, } @@ -839,7 +921,7 @@ impl I + Clone + Send + Sync + 'static> now_fn: NowFn, ) -> Self { let (action_event_tx, mut action_event_rx) = mpsc::unbounded_channel(); - let inner = Arc::new(Mutex::new(AwaitedActionDbImpl { + let inner = Arc::new(RwLock::new(AwaitedActionDbImpl { client_operation_to_awaited_action: EvictingMap::new(eviction_config, (now_fn)()), operation_id_to_awaited_action: BTreeMap::new(), action_info_hash_key_to_awaited_action: HashMap::new(), @@ -862,7 +944,7 @@ impl I + Clone + Send + Sync + 'static> let Some(inner) = weak_inner.upgrade() else { return; // Nothing to cleanup, our struct is dropped. 
}; - let mut inner = inner.lock().await; + let mut inner = inner.write().await; inner .handle_action_events(dropped_operation_ids.drain(..)) .await; @@ -882,7 +964,7 @@ impl I + Clone + Send + Sync + 'static> Awaite client_operation_id: &OperationId, ) -> Result, Error> { self.inner - .lock() + .read() .await .get_awaited_action_by_id(client_operation_id) .await @@ -895,7 +977,7 @@ impl I + Clone + Send + Sync + 'static> Awaite Bound::Unbounded, Bound::Unbounded, move |start, end, mut output| async move { - let inner = self.inner.lock().await; + let inner = self.inner.read().await; let mut maybe_new_start = None; for (operation_id, item) in @@ -915,7 +997,23 @@ impl I + Clone + Send + Sync + 'static> Awaite &self, operation_id: &OperationId, ) -> Result, Error> { - Ok(self.inner.lock().await.get_by_operation_id(operation_id)) + Ok(self.inner.read().await.get_by_operation_id(operation_id)) + } + + async fn get_queued_actions(&self) -> Result>, Error> { + let inner = self.inner.read().await; + + Ok(inner + .sorted_action_info_hash_keys + .queued + .iter() + .filter_map(|(awaited_action)| { + inner + .operation_id_to_awaited_action + .get(&awaited_action.operation_id) + }) + .map(|awaited_action| Arc::new(awaited_action.borrow().clone())) + .collect()) } async fn get_range_of_actions( @@ -929,7 +1027,7 @@ impl I + Clone + Send + Sync + 'static> Awaite start, end, move |start, end, mut output| async move { - let inner = self.inner.lock().await; + let inner = self.inner.read().await; let mut done = true; let mut new_start = start.as_ref(); let mut new_end = end.as_ref(); @@ -965,9 +1063,21 @@ impl I + Clone + Send + Sync + 'static> Awaite )) } + async fn count_actions( + &self, + stages: Vec, + ) -> Result, Error> { + let mut results: HashMap = + HashMap::with_capacity(stages.len()); + for stage in stages { + results.insert(stage, self.inner.write().await.count_actions(stage)); + } + Ok(results) + } + async fn update_awaited_action(&self, new_awaited_action: 
AwaitedAction) -> Result<(), Error> { self.inner - .lock() + .write() .await .update_awaited_action(new_awaited_action)?; self.tasks_change_notify.notify_one(); @@ -982,7 +1092,7 @@ impl I + Clone + Send + Sync + 'static> Awaite ) -> Result { let subscriber = self .inner - .lock() + .write() .await .add_action(client_operation_id, action_info) .await?; diff --git a/nativelink-scheduler/src/mock_scheduler.rs b/nativelink-scheduler/src/mock_scheduler.rs index df17e844f..ff9ab9f6d 100644 --- a/nativelink-scheduler/src/mock_scheduler.rs +++ b/nativelink-scheduler/src/mock_scheduler.rs @@ -192,6 +192,10 @@ impl ClientStateManager for MockActionScheduler { fn as_known_platform_property_provider(&self) -> Option<&dyn KnownPlatformPropertyProvider> { Some(self) } + + fn as_any(&self) -> &dyn std::any::Any { + self + } } impl RootMetricsComponent for MockActionScheduler {} diff --git a/nativelink-scheduler/src/platform_property_manager.rs b/nativelink-scheduler/src/platform_property_manager.rs index a090aa285..81201c0ff 100644 --- a/nativelink-scheduler/src/platform_property_manager.rs +++ b/nativelink-scheduler/src/platform_property_manager.rs @@ -88,6 +88,7 @@ impl PlatformPropertyManager { )), PropertyType::Exact => Ok(PlatformPropertyValue::Exact(value.to_string())), PropertyType::Priority => Ok(PlatformPropertyValue::Priority(value.to_string())), + PropertyType::Ignore => Ok(PlatformPropertyValue::Ignore(value.to_string())), }; } Err(make_input_err!("Unknown platform property '{}'", key)) diff --git a/nativelink-scheduler/src/property_modifier_scheduler.rs b/nativelink-scheduler/src/property_modifier_scheduler.rs index 38ebea695..5343ecb0a 100644 --- a/nativelink-scheduler/src/property_modifier_scheduler.rs +++ b/nativelink-scheduler/src/property_modifier_scheduler.rs @@ -32,7 +32,7 @@ use parking_lot::Mutex; pub struct PropertyModifierScheduler { modifications: Vec, #[metric(group = "scheduler")] - scheduler: Arc, + pub scheduler: Arc, #[metric(group = 
"property_manager")] known_properties: Mutex>>, } @@ -168,6 +168,10 @@ impl ClientStateManager for PropertyModifierScheduler { fn as_known_platform_property_provider(&self) -> Option<&dyn KnownPlatformPropertyProvider> { Some(self) } + + fn as_any(&self) -> &dyn std::any::Any { + self + } } impl RootMetricsComponent for PropertyModifierScheduler {} diff --git a/nativelink-scheduler/src/property_router_scheduler.rs b/nativelink-scheduler/src/property_router_scheduler.rs new file mode 100644 index 000000000..a259d9c69 --- /dev/null +++ b/nativelink-scheduler/src/property_router_scheduler.rs @@ -0,0 +1,129 @@ +use std::collections::{HashMap, HashSet}; +use std::sync::Arc; + +use async_trait::async_trait; +use nativelink_error::{Error, ResultExt}; +use nativelink_metric::{MetricsComponent, RootMetricsComponent}; +use nativelink_util::action_messages::{ActionInfo, OperationId}; +use nativelink_util::known_platform_property_provider::KnownPlatformPropertyProvider; +use nativelink_util::operation_state_manager::{ + ActionStateResult, ActionStateResultStream, ClientStateManager, OperationFilter, +}; + +#[derive(MetricsComponent)] +pub struct PropertyRouterScheduler { + property_name: String, + #[metric(group = "routes")] + routes: HashMap>, + #[metric(group = "default_scheduler")] + default_scheduler: Arc, +} + +impl core::fmt::Debug for PropertyRouterScheduler { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + f.debug_struct("PropertyRouterScheduler") + .field("property_name", &self.property_name) + .finish_non_exhaustive() + } +} + +impl PropertyRouterScheduler { + pub fn new( + property_name: &str, + routes: HashMap>, + default_scheduler: Arc, + ) -> Self { + Self { + property_name: property_name.to_string(), + routes, + default_scheduler, + } + } + + async fn inner_add_action( + &self, + client_operation_id: OperationId, + action_info: Arc, + ) -> Result, Error> { + let scheduler = action_info + .platform_properties + .get(&self.property_name) 
+ .and_then(|value| self.routes.get(value)) + .unwrap_or(&self.default_scheduler); + + scheduler.add_action(client_operation_id, action_info).await + } + + async fn inner_filter_operations( + &self, + filter: OperationFilter, + ) -> Result, Error> { + let mut streams = Vec::with_capacity(self.routes.len() + 1); + for scheduler in self.routes.values() { + streams.push(scheduler.filter_operations(filter.clone()).await?); + } + streams.push(self.default_scheduler.filter_operations(filter).await?); + Ok(Box::pin(futures::stream::select_all(streams))) + } + + async fn inner_get_known_properties(&self, instance_name: &str) -> Result, Error> { + let mut all_props = HashSet::new(); + for scheduler in self.routes.values() { + if let Some(p) = scheduler.as_known_platform_property_provider() { + for prop in p + .get_known_properties(instance_name) + .await + .err_tip(|| "In PropertyRouterScheduler::get_known_properties for route")? + { + all_props.insert(prop); + } + } + } + if let Some(p) = self.default_scheduler.as_known_platform_property_provider() { + for prop in p + .get_known_properties(instance_name) + .await + .err_tip(|| "In PropertyRouterScheduler::get_known_properties for default")? 
+ { + all_props.insert(prop); + } + } + Ok(all_props.into_iter().collect()) + } +} + +#[async_trait] +impl KnownPlatformPropertyProvider for PropertyRouterScheduler { + async fn get_known_properties(&self, instance_name: &str) -> Result, Error> { + self.inner_get_known_properties(instance_name).await + } +} + +#[async_trait] +impl ClientStateManager for PropertyRouterScheduler { + async fn add_action( + &self, + client_operation_id: OperationId, + action_info: Arc, + ) -> Result, Error> { + self.inner_add_action(client_operation_id, action_info) + .await + } + + async fn filter_operations<'a>( + &'a self, + filter: OperationFilter, + ) -> Result, Error> { + self.inner_filter_operations(filter).await + } + + fn as_known_platform_property_provider(&self) -> Option<&dyn KnownPlatformPropertyProvider> { + Some(self) + } + + fn as_any(&self) -> &dyn std::any::Any { + self + } +} + +impl RootMetricsComponent for PropertyRouterScheduler {} diff --git a/nativelink-scheduler/src/simple_scheduler.rs b/nativelink-scheduler/src/simple_scheduler.rs index eee05bbcd..4d0e91b36 100644 --- a/nativelink-scheduler/src/simple_scheduler.rs +++ b/nativelink-scheduler/src/simple_scheduler.rs @@ -12,11 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+use std::collections::{BTreeSet, HashMap}; use std::sync::Arc; -use std::time::SystemTime; +use std::time::{Instant, SystemTime}; use async_trait::async_trait; -use futures::Future; +use futures::{future, Future, StreamExt}; use nativelink_config::schedulers::SimpleSpec; use nativelink_error::{Code, Error, ResultExt}; use nativelink_metric::{MetricsComponent, RootMetricsComponent}; @@ -29,24 +30,27 @@ use nativelink_util::operation_state_manager::{ OperationFilter, OperationStageFlags, OrderDirection, UpdateOperationType, }; use nativelink_util::origin_event::OriginMetadata; +use nativelink_util::platform_properties::PlatformProperties; use nativelink_util::shutdown_guard::ShutdownGuard; use nativelink_util::spawn; use nativelink_util::task::JoinHandleDropGuard; -use opentelemetry::KeyValue; use opentelemetry::baggage::BaggageExt; use opentelemetry::context::{Context, FutureExt as OtelFutureExt}; +use opentelemetry::KeyValue; use opentelemetry_semantic_conventions::attribute::ENDUSER_ID; -use tokio::sync::{Notify, mpsc}; +use tokio::sync::{mpsc, Notify}; use tokio::time::Duration; -use tokio_stream::StreamExt; -use tracing::{error, info_span}; +use tracing::{debug, error, info, info_span, warn}; use crate::api_worker_scheduler::ApiWorkerScheduler; use crate::awaited_action_db::{AwaitedActionDb, CLIENT_KEEPALIVE_DURATION}; use crate::platform_property_manager::PlatformPropertyManager; -use crate::simple_scheduler_state_manager::SimpleSchedulerStateManager; -use crate::worker::{ActionInfoWithProps, Worker, WorkerTimestamp}; +use crate::simple_scheduler_state_manager::{SchedulerStateManager, SimpleSchedulerStateManager}; +use crate::worker::{ActionInfoWithProps, ActionsState, Worker, WorkerState, WorkerTimestamp}; +use crate::worker_registry::WorkerRegistry; use crate::worker_scheduler::WorkerScheduler; +use serde::Serialize; +use nativelink_util::metrics::EXECUTION_METRICS; /// Default timeout for workers in seconds. 
/// If this changes, remember to change the documentation in the config. @@ -61,6 +65,12 @@ const DEFAULT_CLIENT_ACTION_TIMEOUT_S: u64 = 60; /// If this changes, remember to change the documentation in the config. const DEFAULT_MAX_JOB_RETRIES: usize = 3; +#[derive(Serialize)] +pub struct SchedulerState { + pub actions: ActionsState, + pub workers: Vec, +} + struct SimpleSchedulerActionStateResult { client_operation_id: OperationId, action_state_result: Box, @@ -125,6 +135,10 @@ pub struct SimpleScheduler { #[metric(group = "client_state_manager")] client_state_manager: Arc, + /// Manager for scheduler state of this scheduler. + #[metric(group = "scheduler_state_manager")] + scheduler_state_manager: Arc, + /// Manager for platform of this scheduler. #[metric(group = "platform_properties")] platform_property_manager: Arc, @@ -140,6 +154,24 @@ pub struct SimpleScheduler { /// Background task that tries to match actions to workers. If this struct /// is dropped the spawn will be cancelled as well. task_worker_matching_spawn: JoinHandleDropGuard<()>, + + /// Every duration, do logging of worker matching + /// e.g. "worker busy", "can't find any worker" + /// Set to None to disable. This is quite noisy, so we limit it + worker_match_logging_interval: Option, + + /// Whether to use batch worker matching optimization. + /// When enabled, actions are collected and matched to workers in a single + /// batch operation, reducing lock contention. + enable_batch_worker_matching: bool, + + /// Maximum interval between batch matching cycles. + /// Even without triggers, matching runs at least this often. + batch_interval: Duration, + + /// Debounce window after first trigger. + /// After a notification, wait this long to collect more changes before running. 
+ batch_debounce: Duration, } impl core::fmt::Debug for SimpleScheduler { @@ -203,18 +235,19 @@ impl SimpleScheduler { } pub async fn do_try_match_for_test(&self) -> Result<(), Error> { - self.do_try_match().await + self.do_try_match(true).await } // TODO(palfrey) This is an O(n*m) (aka n^2) algorithm. In theory we // can create a map of capabilities of each worker and then try and match // the actions to the worker using the map lookup (ie. map reduce). - async fn do_try_match(&self) -> Result<(), Error> { + async fn do_try_match(&self, full_worker_logging: bool) -> Result<(), Error> { async fn match_action_to_worker( action_state_result: &dyn ActionStateResult, workers: &ApiWorkerScheduler, matching_engine_state_manager: &dyn MatchingEngineStateManager, platform_property_manager: &PlatformPropertyManager, + full_worker_logging: bool, ) -> Result<(), Error> { let (action_info, maybe_origin_metadata) = action_state_result @@ -238,7 +271,7 @@ impl SimpleScheduler { // Try to find a worker for the action. 
let worker_id = { match workers - .find_worker_for_action(&action_info.platform_properties) + .find_worker_for_action(&action_info.platform_properties, full_worker_logging) .await { Some(worker_id) => worker_id, @@ -273,6 +306,7 @@ impl SimpleScheduler { return Err(err); } + debug!(%worker_id, %operation_id, ?action_info, "Notifying worker of operation"); workers .worker_notify_run_action(worker_id, operation_id, action_info) .await @@ -297,11 +331,21 @@ impl SimpleScheduler { let mut result = Ok(()); + let start = Instant::now(); + let mut stream = self .get_queued_operations() .await .err_tip(|| "Failed to get queued operations in do_try_match")?; + let query_elapsed = start.elapsed(); + if query_elapsed > Duration::from_secs(1) { + warn!( + elapsed_ms = query_elapsed.as_millis(), + "Slow get_queued_operations query" + ); + } + while let Some(action_state_result) = stream.next().await { result = result.merge( match_action_to_worker( @@ -309,12 +353,202 @@ impl SimpleScheduler { self.worker_scheduler.as_ref(), self.matching_engine_state_manager.as_ref(), self.platform_property_manager.as_ref(), + full_worker_logging, ) .await, ); } + + let total_elapsed = start.elapsed(); + EXECUTION_METRICS.do_try_match_duration.record(total_elapsed.as_secs_f64(), &[]); + if total_elapsed > Duration::from_secs(5) { + warn!( + total_ms = total_elapsed.as_millis(), + query_ms = query_elapsed.as_millis(), + "Slow do_try_match cycle" + ); + } + result } + + /// Batch version of `do_try_match` that collects all queued actions and matches + /// them to workers in a single batch operation. This reduces lock contention + /// compared to the sequential version. 
+ async fn do_try_match_batch(&self, full_worker_logging: bool) -> Result<(), Error> { + let start = Instant::now(); + + // Collect all queued actions + let stream = self + .get_queued_operations() + .await + .err_tip(|| "Failed to get queued operations in do_try_match_batch")?; + + let query_elapsed = start.elapsed(); + if query_elapsed > Duration::from_secs(1) { + warn!( + elapsed_ms = query_elapsed.as_millis(), + "Slow get_queued_operations query in batch mode" + ); + } + + // Collect all action state results and compute their platform properties + let action_state_results: Vec<_> = stream.collect().await; + + if action_state_results.is_empty() { + return Ok(()); + } + + // Prepare actions with their platform properties for batch matching + struct PreparedAction { + action_state_result: Box, + action_info: ActionInfoWithProps, + origin_metadata: OriginMetadata, + } + + let mut prepared_actions: Vec = Vec::with_capacity(action_state_results.len()); + let mut platform_properties_refs: Vec<&PlatformProperties> = Vec::with_capacity(action_state_results.len()); + + for action_state_result in action_state_results { + let (action_info, maybe_origin_metadata) = match action_state_result + .as_action_info() + .await + { + Ok(result) => result, + Err(err) => { + warn!(?err, "Failed to get action_info in batch mode, skipping"); + continue; + } + }; + + // TODO(palfrey) We should not compute this every time and instead store + // it with the ActionInfo when we receive it. 
+ let platform_properties = match self + .platform_property_manager + .make_platform_properties(action_info.platform_properties.clone()) + { + Ok(props) => props, + Err(err) => { + warn!(?err, "Failed to make platform properties in batch mode, skipping"); + continue; + } + }; + + let action_info_with_props = ActionInfoWithProps { + inner: action_info, + platform_properties, + }; + + prepared_actions.push(PreparedAction { + action_state_result, + action_info: action_info_with_props, + origin_metadata: maybe_origin_metadata.unwrap_or_default(), + }); + } + + // Collect platform properties references for batch matching + for prepared in &prepared_actions { + platform_properties_refs.push(&prepared.action_info.platform_properties); + } + + // Batch find workers for all actions (single lock acquisition) + let matches = self + .worker_scheduler + .batch_find_workers_for_actions(&platform_properties_refs, full_worker_logging) + .await; + + let matches_count = matches.len(); + let actions_count = prepared_actions.len(); + + if matches.is_empty() { + return Ok(()); + } + + // Phase 1: Extract operation_ids and assign operations to workers + // Collect successful assignments for batch worker notification + let mut successful_assignments: Vec<(WorkerId, OperationId, ActionInfoWithProps)> = + Vec::with_capacity(matches_count); + let mut result = Ok(()); + + for (action_idx, worker_id) in matches { + let prepared = &prepared_actions[action_idx]; + + // Extract the operation_id from the action_state + let operation_id = match prepared.action_state_result.as_state().await { + Ok((action_state, _origin_metadata)) => action_state.client_operation_id.clone(), + Err(err) => { + warn!(?err, "Failed to get action_state in batch mode, skipping"); + continue; + } + }; + + // Tell the matching engine that the operation is being assigned to a worker + let assign_result = self + .matching_engine_state_manager + .assign_operation(&operation_id, Ok(&worker_id)) + .await + .err_tip(|| "Failed 
to assign operation in do_try_match_batch"); + + match assign_result { + Ok(()) => { + // Assignment successful, queue for batch worker notification + successful_assignments.push(( + worker_id, + operation_id, + prepared.action_info.clone(), + )); + } + Err(err) => { + if err.code == Code::Aborted { + // Operation was cancelled, skip it + continue; + } + result = result.merge(Err(err)); + } + } + } + + // Phase 2: Batch notify workers (single lock acquisition) + if !successful_assignments.is_empty() { + let notify_results = self + .worker_scheduler + .batch_worker_notify_run_action(successful_assignments) + .await; + + // Merge notification results + for notify_result in notify_results { + result = result.merge( + notify_result + .err_tip(|| "Failed to run batch_worker_notify_run_action in do_try_match_batch"), + ); + } + } + + let total_elapsed = start.elapsed(); + EXECUTION_METRICS + .do_try_match_duration + .record(total_elapsed.as_secs_f64(), &[]); + if total_elapsed > Duration::from_secs(5) { + warn!( + total_ms = total_elapsed.as_millis(), + query_ms = query_elapsed.as_millis(), + actions_processed = actions_count, + matches_found = matches_count, + "Slow do_try_match_batch cycle" + ); + } + + result + } + + /// Internal method that dispatches to either batch or sequential matching. + async fn do_try_match_internal(&self, full_worker_logging: bool) -> Result<(), Error> { + if self.enable_batch_worker_matching { + self.do_try_match_batch(full_worker_logging).await + } else { + self.do_try_match(full_worker_logging).await + } + } } impl SimpleScheduler { @@ -390,12 +624,19 @@ impl SimpleScheduler { } let worker_change_notify = Arc::new(Notify::new()); + + // Create shared worker registry for single heartbeat per worker. 
+ let worker_registry = Arc::new(WorkerRegistry::new()); + let state_manager = SimpleSchedulerStateManager::new( max_job_retries, Duration::from_secs(worker_timeout_s), Duration::from_secs(client_action_timeout_s), + Duration::from_secs(spec.max_action_executing_timeout_s), awaited_action_db, now_fn, + Some(worker_registry.clone()), + "simple_scheduler", ); let worker_scheduler = ApiWorkerScheduler::new( @@ -404,37 +645,146 @@ impl SimpleScheduler { spec.allocation_strategy, worker_change_notify.clone(), worker_timeout_s, + worker_registry, + "simple_scheduler", ); let worker_scheduler_clone = worker_scheduler.clone(); + // Capture batch timing parameters for the matching loop + let batch_interval = Duration::from_millis(spec.batch_interval_ms); + let batch_debounce = Duration::from_millis(spec.batch_debounce_ms); + let enable_batch_worker_matching = spec.enable_batch_worker_matching; + let action_scheduler = Arc::new_cyclic(move |weak_self| -> Self { let weak_inner = weak_self.clone(); let task_worker_matching_spawn = spawn!("simple_scheduler_task_worker_matching", async move { let mut last_match_successful = true; + let mut worker_match_logging_last: Option = None; // Break out of the loop only when the inner is dropped. loop { - let task_change_fut = task_change_notify.notified(); - let worker_change_fut = worker_change_notify.notified(); - tokio::pin!(task_change_fut); - tokio::pin!(worker_change_fut); - // Wait for either of these futures to be ready. - let state_changed = - futures::future::select(task_change_fut, worker_change_fut); - if last_match_successful { - let _ = state_changed.await; + // Use hybrid timer + debounce approach for batch mode, + // or the original notification-based approach for sequential mode. + if enable_batch_worker_matching { + // Phase 1: Wait for trigger OR batch_interval timeout + let deadline = tokio::time::Instant::now() + batch_interval; + + let triggered = tokio::select! 
{ + _ = task_change_notify.notified() => true, + _ = worker_change_notify.notified() => true, + _ = tokio::time::sleep_until(deadline) => false, + }; + + // Phase 2: If triggered, apply debounce window to collect more changes + // But don't exceed the original batch_interval deadline + if triggered && batch_debounce > Duration::ZERO { + let debounce_until = tokio::time::Instant::now() + batch_debounce; + let effective_deadline = debounce_until.min(deadline); + tokio::time::sleep_until(effective_deadline).await; + } + + // If last match failed, add extra delay to avoid hard loop + if !last_match_successful { + tokio::time::sleep(Duration::from_millis(100)).await; + } } else { - // If the last match failed, then run again after a short sleep. - // This resolves issues where we tried to re-schedule a job to - // a disconnected worker. The sleep ensures we don't enter a - // hard loop if there's something wrong inside do_try_match. - let sleep_fut = tokio::time::sleep(Duration::from_millis(100)); - tokio::pin!(sleep_fut); - let _ = futures::future::select(state_changed, sleep_fut).await; + // Original notification-based approach for sequential mode + let task_change_fut = task_change_notify.notified(); + let worker_change_fut = worker_change_notify.notified(); + tokio::pin!(task_change_fut); + tokio::pin!(worker_change_fut); + let state_changed = future::select(task_change_fut, worker_change_fut); + if last_match_successful { + let _ = state_changed.await; + } else { + // If the last match failed, then run again after a short sleep. 
+ let sleep_fut = tokio::time::sleep(Duration::from_millis(100)); + tokio::pin!(sleep_fut); + let _ = future::select(state_changed, sleep_fut).await; + } } + + // Phase 3: Run the matching let result = match weak_inner.upgrade() { - Some(scheduler) => scheduler.do_try_match().await, + Some(scheduler) => { + let now = Instant::now(); + let full_worker_logging = { + match scheduler.worker_match_logging_interval { + None => false, + Some(duration) => match worker_match_logging_last { + None => true, + Some(when) => now.duration_since(when) >= duration, + }, + } + }; + + let res = scheduler.do_try_match_internal(full_worker_logging).await; + if full_worker_logging { + let operations_stream = scheduler + .matching_engine_state_manager + .filter_operations(OperationFilter::default()) + .await + .err_tip(|| "In action_scheduler getting filter result"); + + let mut oldest_actions_in_state: HashMap< + String, + BTreeSet>, + > = HashMap::new(); + let max_items = 5; + + match operations_stream { + Ok(stream) => { + let actions = stream + .filter_map(|item| async move { + match item.as_ref().as_state().await { + Ok((action_state, _origin_metadata)) => { + Some(action_state) + } + Err(e) => { + error!( + ?e, + "Failed to get action state!" 
+ ); + None + } + } + }) + .collect::>() + .await; + for action_state in &actions { + let name = action_state.stage.name(); + if let Some(values) = + oldest_actions_in_state.get_mut(&name) + { + values.insert(action_state.clone()); + if values.len() > max_items { + values.pop_first(); + } + } else { + let mut values = BTreeSet::new(); + values.insert(action_state.clone()); + oldest_actions_in_state.insert(name, values); + } + } + } + Err(e) => { + error!(?e, "Failed to get operations list!"); + } + } + + for value in oldest_actions_in_state.values() { + let mut items = vec![]; + for item in value { + items.push(item.to_string()); + } + info!(?items, "Oldest actions in state"); + } + + worker_match_logging_last.replace(now); + } + res + } // If the inner went away it means the scheduler is shutting // down, so we need to resolve our future. None => return, @@ -448,13 +798,34 @@ impl SimpleScheduler { } // Unreachable. }); + + let worker_match_logging_interval = match spec.worker_match_logging_interval_s { + // -1 or 0 means disabled (0 used to cause expensive logging on every call) + -1 | 0 => None, + signed_secs => { + if let Ok(secs) = TryInto::::try_into(signed_secs) { + Some(Duration::from_secs(secs)) + } else { + error!( + worker_match_logging_interval_s = spec.worker_match_logging_interval_s, + "Valid values for worker_match_logging_interval_s are -1, 0, or a positive integer, setting to disabled", + ); + None + } + } + }; Self { matching_engine_state_manager: state_manager.clone(), client_state_manager: state_manager.clone(), + scheduler_state_manager: state_manager, worker_scheduler, platform_property_manager, maybe_origin_event_tx, task_worker_matching_spawn, + worker_match_logging_interval, + enable_batch_worker_matching: spec.enable_batch_worker_matching, + batch_interval: Duration::from_millis(spec.batch_interval_ms), + batch_debounce: Duration::from_millis(spec.batch_debounce_ms), } }); (action_scheduler, worker_scheduler_clone) @@ -482,6 +853,10 @@ 
impl ClientStateManager for SimpleScheduler { fn as_known_platform_property_provider(&self) -> Option<&dyn KnownPlatformPropertyProvider> { Some(self) } + + fn as_any(&self) -> &dyn std::any::Any { + self + } } #[async_trait] diff --git a/nativelink-scheduler/src/simple_scheduler_state_manager.rs b/nativelink-scheduler/src/simple_scheduler_state_manager.rs index 961fa810b..2502ed9f4 100644 --- a/nativelink-scheduler/src/simple_scheduler_state_manager.rs +++ b/nativelink-scheduler/src/simple_scheduler_state_manager.rs @@ -12,15 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -use core::ops::Bound; -use core::time::Duration; -use std::string::ToString; -use std::sync::{Arc, Weak}; - +use super::awaited_action_db::{ + AwaitedAction, AwaitedActionDb, AwaitedActionSubscriber, CountableActionStage, + SortedAwaitedActionState, +}; use async_lock::Mutex; use async_trait::async_trait; -use futures::{StreamExt, TryStreamExt, stream}; -use nativelink_error::{Code, Error, ResultExt, make_err}; +use core::ops::Bound; +use core::time::Duration; +use futures::{stream, StreamExt, TryStreamExt}; +use nativelink_error::{make_err, Code, Error, ResultExt}; use nativelink_metric::MetricsComponent; use nativelink_util::action_messages::{ ActionInfo, ActionResult, ActionStage, ActionState, ActionUniqueQualifier, ExecutionMetadata, @@ -28,21 +29,156 @@ use nativelink_util::action_messages::{ }; use nativelink_util::instant_wrapper::InstantWrapper; use nativelink_util::known_platform_property_provider::KnownPlatformPropertyProvider; +use nativelink_util::metrics::{ + register_queued_actions_callback, ExecutionMetricAttrs, ExecutionResult, EXECUTION_INSTANCE, + EXECUTION_METRICS, EXECUTION_RESULT, EXECUTION_STAGE, ExecutionStage, +}; use nativelink_util::operation_state_manager::{ ActionStateResult, ActionStateResultStream, ClientStateManager, MatchingEngineStateManager, OperationFilter, OperationStageFlags, OrderDirection, 
UpdateOperationType, WorkerStateManager, }; use nativelink_util::origin_event::OriginMetadata; -use tracing::{info, warn}; +use opentelemetry::KeyValue; +use tracing::{debug, info, trace, warn}; -use super::awaited_action_db::{ - AwaitedAction, AwaitedActionDb, AwaitedActionSubscriber, SortedAwaitedActionState, -}; +use crate::worker_registry::SharedWorkerRegistry; +use std::collections::{BTreeMap, HashMap}; +use std::string::ToString; +use std::sync::{Arc, Weak}; +use std::{env, vec}; /// Maximum number of times an update to the database /// can fail before giving up. const MAX_UPDATE_RETRIES: usize = 5; +/// Base delay for exponential backoff on version conflicts (in ms). +const BASE_RETRY_DELAY_MS: u64 = 10; + +/// Maximum jitter to add to retry delay (in ms). +const MAX_RETRY_JITTER_MS: u64 = 20; + +#[derive(Debug)] +pub struct SchedulerMetrics { + attrs: ExecutionMetricAttrs, + instance_name: String, +} + +impl SchedulerMetrics { + #[must_use] + pub fn new(instance_name: impl Into) -> Self { + let instance_name = instance_name.into(); + let base_attrs = vec![KeyValue::new(EXECUTION_INSTANCE, instance_name.clone())]; + Self { + attrs: ExecutionMetricAttrs::new(&base_attrs), + instance_name, + } + } + + pub fn record_stage_transition(&self, from_stage: Option, to_stage: ActionStage) { + if let Some(from) = from_stage { + let from_attrs = self.attrs_for_stage(from); + EXECUTION_METRICS.execution_active_count.add(-1, from_attrs); + } + + let to_attrs = self.attrs_for_stage(to_stage); + EXECUTION_METRICS.execution_active_count.add(1, to_attrs); + EXECUTION_METRICS + .execution_stage_transitions + .add(1, to_attrs); + } + + pub fn record_queue_time(&self, duration_secs: f64) { + EXECUTION_METRICS + .execution_queue_time + .record(duration_secs, self.attrs.queued()); + } + + pub fn record_completion(&self, result: ExecutionResult) { + let attrs = self.attrs_for_completion_result(result); + EXECUTION_METRICS.execution_completed_count.add(1, attrs); + + 
EXECUTION_METRICS.execution_active_count.add(-1, attrs); + } + + pub fn record_retry(&self) { + EXECUTION_METRICS + .execution_retry_count + .add(1, self.attrs.queued()); + } + + pub fn record_timeout(&self) { + let attrs = self.attrs.completed_timeout(); + EXECUTION_METRICS.execution_completed_count.add(1, attrs); + } + + fn attrs_for_stage(&self, stage: ActionStage) -> &[KeyValue] { + match stage { + ActionStage::Unknown => self.attrs.unknown(), + ActionStage::CacheCheck => self.attrs.cache_check(), + ActionStage::Queued => self.attrs.queued(), + ActionStage::Executing => self.attrs.executing(), + ActionStage::Completed(_) | ActionStage::CompletedFromCache(_) => { + self.attrs.completed_success() + } + } + } + + fn attrs_for_completion_result(&self, result: ExecutionResult) -> &[KeyValue] { + match result { + ExecutionResult::Success => self.attrs.completed_success(), + ExecutionResult::Failure => self.attrs.completed_failure(), + ExecutionResult::Cancelled => self.attrs.completed_cancelled(), + ExecutionResult::Timeout => self.attrs.completed_timeout(), + ExecutionResult::CacheHit => self.attrs.completed_cache_hit(), + } + } + + fn record_actions_count(&self, countByStage: HashMap) { + for (stage, count) in countByStage { + let attrs = self.attrs_for_stage(match stage { + CountableActionStage::Queued => ActionStage::Queued, + CountableActionStage::Executing => ActionStage::Executing, + CountableActionStage::Completed => { + let action_result = ActionResult::default(); + (ActionStage::Completed(action_result)) + } + }); + + EXECUTION_METRICS + .execution_actions_count + .record(count, attrs); + } + } + + #[must_use] + pub fn instance_name(&self) -> &str { + &self.instance_name + } + + #[must_use] + pub fn make_worker_attrs(&self) -> Vec { + vec![KeyValue::new( + EXECUTION_INSTANCE, + self.instance_name.clone(), + )] + } + + #[must_use] + pub fn result_from_stage(stage: &ActionStage) -> Option { + match stage { + ActionStage::Completed(result) => { + if 
result.error.is_some() { + Some(ExecutionResult::Failure) + } else { + Some(ExecutionResult::Success) + } + } + ActionStage::CompletedFromCache(_) => Some(ExecutionResult::CacheHit), + _ => None, + } + } +} + /// Simple struct that implements the `ActionStateResult` trait and always returns an error. struct ErrorActionStateResult(Error); @@ -201,6 +337,20 @@ where .upgrade() .err_tip(|| format!("Failed to upgrade weak reference to SimpleSchedulerStateManager in MatchingEngineActionStateResult::changed at attempt: {timeout_attempts}"))?; + // Check if worker is alive via registry before timing out. + let should_timeout = simple_scheduler_state_manager + .should_timeout_operation(&awaited_action) + .await; + + if !should_timeout { + // Worker is alive, continue waiting for updates + trace!( + operation_id = %awaited_action.operation_id(), + "Operation timeout check passed, worker is alive" + ); + continue; + } + warn!( ?awaited_action, "OperationId {} / {} timed out after {} seconds issuing a retry", @@ -244,8 +394,8 @@ where /// Scheduler state includes the actions that are queued, active, and recently completed. /// It also includes the workers that are available to execute actions based on allocation /// strategy. -#[derive(MetricsComponent)] -pub(crate) struct SimpleSchedulerStateManager +#[derive(MetricsComponent, Debug)] +pub struct SimpleSchedulerStateManager where T: AwaitedActionDb, I: InstantWrapper, @@ -273,6 +423,10 @@ where /// if it is not being processed by any worker. client_action_timeout: Duration, + /// Maximum time an action can stay in Executing state without any worker + /// update, regardless of worker keepalive status. `Duration::ZERO` disables. + max_executing_timeout: Duration, + // A lock to ensure only one timeout operation is running at a time // on this service. timeout_operation_mux: Mutex<()>, @@ -285,6 +439,94 @@ where /// Function to get the current time. now_fn: NowFn, + + /// Worker registry for checking worker liveness. 
+ worker_registry: Option, + + /// OTEL metrics for tracking scheduler and action execution state. + /// Provides pre-computed attributes and methods for recording metrics + /// related to action execution lifecycle. + scheduler_metrics: SchedulerMetrics, + + queued_actions_tracker: Arc>, +} + +#[derive(Debug)] +struct QueuedActionsTracker +where + T: AwaitedActionDb, + I: InstantWrapper, + NowFn: Fn() -> I + Clone + Send + Unpin + Sync + 'static, +{ + simple_scheduler_state_manager: Weak>, + queued_actions: Arc)>>>, +} + +impl QueuedActionsTracker +where + T: AwaitedActionDb, + I: InstantWrapper, + NowFn: Fn() -> I + Clone + Send + Unpin + Sync + 'static, +{ + fn new(simple_scheduler_state_manager: Weak>) -> Self { + let queued_actions = Arc::new(tokio::sync::Mutex::new(Vec::new())); + + Self { + simple_scheduler_state_manager, + queued_actions, + } + } + + fn dump_queued_actions(&self, observer: impl Fn(u64, &[KeyValue])) { + if let Ok(queued_actions) = self.queued_actions.try_lock() { + for (count, attrs) in queued_actions.iter() { + observer(*count, attrs); + } + } + } + + async fn count_queued_actions(&self) { + if let Some(manager) = self.simple_scheduler_state_manager.upgrade() { + let action_infos = manager + .action_db + .get_queued_actions() + .await + .err_tip(|| "In SimpleSchedulerStateManager::record_actions_count") + .unwrap_or_default(); + + let count_by_properties = action_infos + .iter() + .map(|awaitedAction| { + awaitedAction + .action_info() + .platform_properties + .clone() + .into_iter() + .collect::>() + }) + .fold(HashMap::new(), |mut acc, platform_properties| { + *acc.entry(platform_properties).or_insert(0) += 1; + acc + }); + + let mut queued_actions = self.queued_actions.lock().await; + queued_actions.clear(); + + for (platform_properties, count) in count_by_properties { + let mut attrs = platform_properties + .iter() + .map(|(key, value)| KeyValue::new(key.clone(), value.clone())) + .collect::>(); + + attrs.push(KeyValue::new( + 
EXECUTION_INSTANCE, + manager.scheduler_metrics.instance_name.clone(), + )); + + queued_actions.push((count, attrs)); + } + } + } } impl SimpleSchedulerStateManager @@ -293,22 +535,173 @@ where I: InstantWrapper, NowFn: Fn() -> I + Clone + Send + Unpin + Sync + 'static, { - pub(crate) fn new( + pub fn new( max_job_retries: usize, no_event_action_timeout: Duration, client_action_timeout: Duration, + max_executing_timeout: Duration, action_db: T, now_fn: NowFn, + worker_registry: Option, + instance_name: impl Into, ) -> Arc { - Arc::new_cyclic(|weak_self| Self { + let res = Arc::new_cyclic(|weak_self| Self { action_db, max_job_retries, no_event_action_timeout, client_action_timeout, + max_executing_timeout, timeout_operation_mux: Mutex::new(()), weak_self: weak_self.clone(), now_fn, - }) + worker_registry, + scheduler_metrics: SchedulerMetrics::new(instance_name), + queued_actions_tracker: Arc::new(QueuedActionsTracker::new(weak_self.clone())), + }); + + let queued_actions_tracker_clone = res.queued_actions_tracker.clone(); + + if env::var("NATIVELINK_COUNT_QUEUED_ACTIONS").unwrap_or_default() == "1" { + register_queued_actions_callback(Box::new(move |observe| { + queued_actions_tracker_clone.dump_queued_actions(observe); + })); + } + + res + } + + /// Returns a reference to the scheduler metrics for recording OTEL metrics. + #[must_use] + pub fn metrics(&self) -> &SchedulerMetrics { + &self.scheduler_metrics + } + + /// Records metrics for an action state update. + /// + /// This handles stage transitions, retries, completions, and timing metrics. 
+ async fn record_action_update_metrics( + &self, + previous_stage: &ActionStage, + new_stage: &ActionStage, + is_retry: bool, + action_insert_timestamp: std::time::SystemTime, + ) { + // Only record if the stage actually changed + if std::mem::discriminant(previous_stage) != std::mem::discriminant(new_stage) { + self.record_actions_count().await; + // Record the stage transition + self.scheduler_metrics + .record_stage_transition(Some(previous_stage.clone()), new_stage.clone()); + + // Record queue time when transitioning from Queued to Executing + if matches!(previous_stage, ActionStage::Queued) + && matches!(new_stage, ActionStage::Executing) + { + if let Ok(queue_duration) = action_insert_timestamp.elapsed() { + self.scheduler_metrics + .record_queue_time(queue_duration.as_secs_f64()); + } + } + + // Record completion metrics + if new_stage.is_finished() { + if let Some(result) = SchedulerMetrics::result_from_stage(new_stage) { + self.scheduler_metrics.record_completion(result); + } + + if new_stage.has_action_result() && matches!(new_stage, ActionStage::Completed(_)) { + let result = match new_stage { + ActionStage::Completed(action_result) => Some(action_result), + _ => None, + }; + if let Some(action_result) = result { + let execution_metadata = &action_result.execution_metadata; + let total_execution_duration = execution_metadata + .worker_completed_timestamp + .duration_since(execution_metadata.worker_start_timestamp) + .unwrap_or(Duration::ZERO); + + let queue_duration = execution_metadata + .worker_start_timestamp // which is the start of execution + .duration_since(execution_metadata.queued_timestamp) + .unwrap_or(Duration::ZERO); + + let fetch_duration = execution_metadata + .input_fetch_completed_timestamp + .duration_since(execution_metadata.input_fetch_start_timestamp) + .unwrap_or(Duration::ZERO); + + let execution_duration = execution_metadata + .execution_completed_timestamp + .duration_since(execution_metadata.execution_start_timestamp) + 
.unwrap_or(Duration::ZERO); + + EXECUTION_METRICS.execution_stage_duration.record( + fetch_duration.as_secs_f64(), + self.scheduler_metrics + .attrs_for_stage(ActionStage::CacheCheck), + ); + + EXECUTION_METRICS.execution_stage_duration.record( + queue_duration.as_secs_f64(), + self.scheduler_metrics.attrs_for_stage(ActionStage::Queued), + ); + + EXECUTION_METRICS.execution_stage_duration.record( + execution_duration.as_secs_f64(), + self.scheduler_metrics + .attrs_for_stage(ActionStage::Executing), + ); + + EXECUTION_METRICS + .execution_total_duration + .record(total_execution_duration.as_secs_f64(), &[]) + } + } + } + } + + // Record retry metric + if is_retry { + self.scheduler_metrics.record_retry(); + } + } + + pub async fn should_timeout_operation(&self, awaited_action: &AwaitedAction) -> bool { + if !matches!(awaited_action.state().stage, ActionStage::Executing) { + return false; + } + + let now = (self.now_fn)().now(); + + let registry_alive = if let Some(ref worker_registry) = self.worker_registry { + if let Some(worker_id) = awaited_action.worker_id() { + worker_registry + .is_worker_alive(worker_id, self.no_event_action_timeout, now) + .await + } else { + false + } + } else { + false + }; + + if registry_alive { + if self.max_executing_timeout > Duration::ZERO { + let last_update = awaited_action.last_worker_updated_timestamp(); + if let Ok(elapsed) = now.duration_since(last_update) { + return elapsed > self.max_executing_timeout; + } + } + return false; + } + + let worker_should_update_before = awaited_action + .last_worker_updated_timestamp() + .checked_add(self.no_event_action_timeout) + .unwrap_or(now); + + worker_should_update_before < now } async fn apply_filter_predicate( @@ -320,9 +713,10 @@ where // Note: The caller must filter `client_operation_id`. 
let mut maybe_reloaded_awaited_action: Option = None; - if awaited_action.last_client_keepalive_timestamp() + self.client_action_timeout - < (self.now_fn)().now() - { + let now = (self.now_fn)().now(); + + // Check if client has timed out + if awaited_action.last_client_keepalive_timestamp() + self.client_action_timeout < now { // This may change if the version is out of date. let mut timed_out = true; if !awaited_action.state().stage.is_finished() { @@ -335,6 +729,7 @@ where )), ..ActionResult::default() }); + state.last_transition_timestamp = now; let state = Arc::new(state); // We may be competing with an client timestamp update, so try // this a few times. @@ -343,13 +738,22 @@ where None => awaited_action.clone(), Some(reloaded_awaited_action) => reloaded_awaited_action.clone(), }; + let previous_stage = new_awaited_action.state().stage.clone(); new_awaited_action.worker_set_state(state.clone(), (self.now_fn)().now()); let err = match self .action_db .update_awaited_action(new_awaited_action) .await { - Ok(()) => break, + Ok(()) => { + // Record client timeout metrics + self.scheduler_metrics.record_timeout(); + self.scheduler_metrics.record_stage_transition( + Some(previous_stage.clone()), + state.stage.clone(), + ); + break; + } Err(err) => err, }; // Reload from the database if the action was outdated. @@ -488,22 +892,51 @@ where return Ok(()); } - let worker_should_update_before = awaited_action - .last_worker_updated_timestamp() - .checked_add(self.no_event_action_timeout) - .ok_or_else(|| { - make_err!( - Code::Internal, - "Timestamp overflow for operation {operation_id} in SimpleSchedulerStateManager::timeout_operation_id" - ) - })?; - if worker_should_update_before >= (self.now_fn)().now() { - // The action was updated recently, we should not timeout the action. - // This is to prevent timing out actions that have recently been updated - // (like multiple clients timeout the same action at the same time). 
+ let now = (self.now_fn)().now(); + + // Check worker liveness via registry if available. + let registry_alive = if let Some(ref worker_registry) = self.worker_registry { + if let Some(worker_id) = awaited_action.worker_id() { + worker_registry + .is_worker_alive(worker_id, self.no_event_action_timeout, now) + .await + } else { + false + } + } else { + false + }; + + let timestamp_alive = { + let worker_should_update_before = awaited_action + .last_worker_updated_timestamp() + .checked_add(self.no_event_action_timeout) + .unwrap_or(now); + worker_should_update_before >= now + }; + + if registry_alive || timestamp_alive { + trace!( + %operation_id, + worker_id = ?awaited_action.worker_id(), + registry_alive, + timestamp_alive, + "Worker is alive, operation not timed out" + ); return Ok(()); } + debug!( + %operation_id, + worker_id = ?awaited_action.worker_id(), + registry_alive, + timestamp_alive, + "Worker not alive via registry or timestamp, timing out operation" + ); + + // Record timeout metric + self.scheduler_metrics.record_timeout(); + self.assign_operation( operation_id, Err(make_err!( @@ -521,8 +954,51 @@ where maybe_worker_id: Option<&WorkerId>, update: UpdateOperationType, ) -> Result<(), Error> { + let update_type_str = match &update { + UpdateOperationType::KeepAlive => "KeepAlive", + UpdateOperationType::UpdateWithActionStage(stage) => match stage { + ActionStage::Queued => "Stage:Queued", + ActionStage::Executing => "Stage:Executing", + ActionStage::Completed(_) => "Stage:Completed", + ActionStage::CompletedFromCache(_) => "Stage:CompletedFromCache", + ActionStage::CacheCheck => "Stage:CacheCheck", + ActionStage::Unknown => "Stage:Unknown", + }, + UpdateOperationType::UpdateWithError(_) => "Error", + UpdateOperationType::UpdateWithDisconnect => "Disconnect", + UpdateOperationType::ExecutionComplete => "ExecutionComplete", + }; + + debug!( + %operation_id, + ?maybe_worker_id, + update_type = %update_type_str, + "inner_update_operation START" + ); + 
let mut last_err = None; + let mut retry_count = 0; for _ in 0..MAX_UPDATE_RETRIES { + retry_count += 1; + if retry_count > 1 { + let base_delay = BASE_RETRY_DELAY_MS * (1 << (retry_count - 2).min(4)); + let jitter = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .map(|d| u64::try_from(d.as_nanos()).expect("u64 error") % MAX_RETRY_JITTER_MS) + .unwrap_or(0); + let delay = Duration::from_millis(base_delay + jitter); + + warn!( + %operation_id, + ?maybe_worker_id, + retry_count, + delay_ms = delay.as_millis(), + update_type = %update_type_str, + "Retrying operation update due to version conflict (with backoff)" + ); + + tokio::time::sleep(delay).await; + } let maybe_awaited_action_subscriber = self .action_db .get_by_operation_id(operation_id) @@ -532,6 +1008,10 @@ where // No action found. It is ok if the action was not found. It // probably means that the action was dropped, but worker was // still processing it. + warn!( + %operation_id, + "Unable to update action due to it being missing, probably dropped" + ); return Ok(()); }; @@ -577,7 +1057,7 @@ where _ => { return Err(make_err!( Code::Internal, - "Action {operation_id:?} is already completed with state {:?} - maybe_worker_id: {:?}", + "Action {operation_id} is already completed with state {:?} - maybe_worker_id: {:?}", awaited_action.state().stage, maybe_worker_id, )); @@ -585,7 +1065,11 @@ where } } - let stage = match &update { + // Capture the previous stage for metrics tracking + let previous_stage = awaited_action.state().stage.clone(); + let action_insert_timestamp = awaited_action.action_info().insert_timestamp; + + let (stage, is_retry) = match &update { UpdateOperationType::KeepAlive => { awaited_action.worker_keep_alive((self.now_fn)().now()); match self @@ -601,7 +1085,15 @@ where result => return result, } } - UpdateOperationType::UpdateWithActionStage(stage) => stage.clone(), + UpdateOperationType::UpdateWithActionStage(stage) => { + if stage == &ActionStage::Executing + 
&& awaited_action.state().stage == ActionStage::Executing + { + warn!(state = ?awaited_action.state(), "Action already assigned"); + return Err(make_err!(Code::Aborted, "Action already assigned")); + } + (stage.clone(), false) + } UpdateOperationType::UpdateWithError(err) => { // Don't count a backpressure failure as an attempt for an action. let due_to_backpressure = err.code == Code::ResourceExhausted; @@ -610,25 +1102,35 @@ where } if awaited_action.attempts > self.max_job_retries { - ActionStage::Completed(ActionResult { - execution_metadata: ExecutionMetadata { - worker: maybe_worker_id.map_or_else(String::default, ToString::to_string), - ..ExecutionMetadata::default() - }, - error: Some(err.clone().merge(make_err!( - Code::Internal, - "Job cancelled because it attempted to execute too many times {} > {} times {}", - awaited_action.attempts, - self.max_job_retries, - format!("for operation_id: {operation_id}, maybe_worker_id: {maybe_worker_id:?}"), - ))), - ..ActionResult::default() - }) + ( + ActionStage::Completed(ActionResult { + execution_metadata: ExecutionMetadata { + worker: maybe_worker_id + .map_or_else(String::default, ToString::to_string), + ..ExecutionMetadata::default() + }, + error: Some(err.clone().merge(make_err!( + Code::Internal, + "Job cancelled because it attempted to execute too many times {} > {} times {}", + awaited_action.attempts, + self.max_job_retries, + format!("for operation_id: {operation_id}, maybe_worker_id: {maybe_worker_id:?}"), + ))), + ..ActionResult::default() + }), + false, + ) } else { - ActionStage::Queued + // This is a retry - action goes back to queued + (ActionStage::Queued, true) } } - UpdateOperationType::UpdateWithDisconnect => ActionStage::Queued, + UpdateOperationType::UpdateWithDisconnect => (ActionStage::Queued, true), + // We shouldn't get here, but we just ignore it if we do. 
+ UpdateOperationType::ExecutionComplete => { + warn!("inner_update_operation got an ExecutionComplete, that's unexpected."); + return Ok(()); + } }; let now = (self.now_fn)().now(); if matches!(stage, ActionStage::Queued) { @@ -640,19 +1142,20 @@ where } awaited_action.worker_set_state( Arc::new(ActionState { - stage, + stage: stage.clone(), // Client id is not known here, it is the responsibility of // the the subscriber impl to replace this with the // correct client id. client_operation_id: operation_id.clone(), action_digest: awaited_action.action_info().digest(), + last_transition_timestamp: now, }), now, ); let update_action_result = self .action_db - .update_awaited_action(awaited_action) + .update_awaited_action(awaited_action.clone()) .await .err_tip(|| "In SimpleSchedulerStateManager::update_operation"); if let Err(err) = update_action_result { @@ -660,13 +1163,89 @@ where // updated due to the data being set was not the latest // but can be retried. if err.code == Code::Aborted { + debug!( + %operation_id, + retry_count, + update_type = %update_type_str, + "Version conflict (Aborted), will retry" + ); last_err = Some(err); continue; } + warn!( + %operation_id, + update_type = %update_type_str, + ?err, + "inner_update_operation FAILED (non-retryable)" + ); return Err(err); } + + // Record execution metrics after successful state update + let action_state = awaited_action.state(); + let instance_name = awaited_action + .action_info() + .unique_qualifier + .instance_name() + .as_str(); + let priority = Some(awaited_action.action_info().priority); + + // Build base attributes for metrics + let mut attrs = nativelink_util::metrics::make_execution_attributes( + instance_name, + priority, + ); + + // Add stage attribute + let execution_stage: ExecutionStage = (&action_state.stage).into(); + attrs.push(KeyValue::new(EXECUTION_STAGE, execution_stage)); + + // Record stage transition + EXECUTION_METRICS.execution_stage_transitions.add(1, &attrs); + + // For 
completed actions, record the completion count with result + match &action_state.stage { + ActionStage::Completed(action_result) => { + let result = if action_result.exit_code == 0 { + ExecutionResult::Success + } else { + ExecutionResult::Failure + }; + attrs.push(KeyValue::new(EXECUTION_RESULT, result)); + EXECUTION_METRICS.execution_completed_count.add(1, &attrs); + } + ActionStage::CompletedFromCache(_) => { + attrs.push(KeyValue::new(EXECUTION_RESULT, ExecutionResult::CacheHit)); + EXECUTION_METRICS.execution_completed_count.add(1, &attrs); + } + _ => {} + } + + debug!( + %operation_id, + retry_count, + update_type = %update_type_str, + "inner_update_operation SUCCESS" + ); + + // Record metrics for the stage transition + self.record_action_update_metrics( + &previous_stage, + &stage, + is_retry, + action_insert_timestamp, + ) + .await; + return Ok(()); } + + warn!( + %operation_id, + update_type = %update_type_str, + retry_count = MAX_UPDATE_RETRIES, + "inner_update_operation EXHAUSTED all retries" + ); Err(last_err.unwrap_or_else(|| { make_err!( Code::Internal, @@ -681,14 +1260,24 @@ where new_client_operation_id: OperationId, action_info: Arc, ) -> Result { - self.action_db + let result = self + .action_db .add_action( new_client_operation_id, action_info, self.no_event_action_timeout, ) .await - .err_tip(|| "In SimpleSchedulerStateManager::add_operation") + .err_tip(|| "In SimpleSchedulerStateManager::add_operation"); + + // Record metrics for new action entering the queue + if result.is_ok() { + self.scheduler_metrics + .record_stage_transition(None, ActionStage::Queued); + self.record_actions_count().await + } + + result } async fn inner_filter_operations<'a, F>( @@ -835,6 +1424,45 @@ where }); Ok(Box::pin(stream)) } + + const STAGES: [CountableActionStage; 3] = [ + CountableActionStage::Queued, + CountableActionStage::Executing, + CountableActionStage::Completed, + ]; + + async fn record_actions_count(&self) { + if 
env::var("NATIVELINK_COUNT_ACTIONS_DB").unwrap_or_default() == "1" { + let count = self + .action_db + .count_actions(Self::STAGES.to_vec()) + .await + .err_tip(|| "In SimpleSchedulerStateManager::record_actions_count") + .unwrap(); + self.scheduler_metrics.record_actions_count( + count + .iter() + .map(|(stage, count)| (stage.clone(), *count as u64)) + .collect(), + ); + } + + if env::var("NATIVELINK_COUNT_QUEUED_ACTIONS").unwrap_or_default() == "1" { + self.queued_actions_tracker.count_queued_actions().await; + } + } +} + +#[async_trait] +pub trait SchedulerStateManager: MatchingEngineStateManager + ClientStateManager {} + +#[async_trait] +impl SchedulerStateManager for SimpleSchedulerStateManager +where + T: AwaitedActionDb, + I: InstantWrapper, + NowFn: Fn() -> I + Clone + Send + Unpin + Sync + 'static, +{ } #[async_trait] @@ -879,6 +1507,10 @@ where fn as_known_platform_property_provider(&self) -> Option<&dyn KnownPlatformPropertyProvider> { None } + + fn as_any(&self) -> &dyn std::any::Any { + self + } } #[async_trait] diff --git a/nativelink-scheduler/src/store_awaited_action_db.rs b/nativelink-scheduler/src/store_awaited_action_db.rs index 7a9cd22b0..49af3b9e8 100644 --- a/nativelink-scheduler/src/store_awaited_action_db.rs +++ b/nativelink-scheduler/src/store_awaited_action_db.rs @@ -17,7 +17,9 @@ use core::ops::Bound; use core::sync::atomic::{AtomicU64, Ordering}; use core::time::Duration; use std::borrow::Cow; +use std::collections::HashMap; use std::sync::{Arc, Weak}; +use std::time::UNIX_EPOCH; use bytes::Bytes; use futures::{Stream, TryStreamExt}; @@ -35,11 +37,11 @@ use nativelink_util::store_trait::{ }; use nativelink_util::task::JoinHandleDropGuard; use tokio::sync::Notify; -use tracing::error; +use tracing::{error, warn}; use crate::awaited_action_db::{ AwaitedAction, AwaitedActionDb, AwaitedActionSubscriber, CLIENT_KEEPALIVE_DURATION, - SortedAwaitedAction, SortedAwaitedActionState, + CountableActionStage, SortedAwaitedAction, 
SortedAwaitedActionState, }; type ClientOperationId = OperationId; @@ -47,6 +49,9 @@ type ClientOperationId = OperationId; /// Maximum number of retries to update client keep alive. const MAX_RETRIES_FOR_CLIENT_KEEPALIVE: u32 = 8; +/// Use separate non-versioned Redis key for client keepalives. +const USE_SEPARATE_CLIENT_KEEPALIVE_KEY: bool = true; + enum OperationSubscriberState { Unsubscribed, Subscribed(Sub), @@ -127,12 +132,35 @@ where if let Some(client_operation_id) = maybe_client_operation_id { awaited_action.set_client_operation_id(client_operation_id); } - last_known_keepalive_ts.store( - awaited_action - .last_client_keepalive_timestamp() - .unix_timestamp(), - Ordering::Release, - ); + + // Helper to convert SystemTime to unix timestamp + let to_unix_ts = |t: std::time::SystemTime| -> u64 { + t.duration_since(UNIX_EPOCH) + .map(|d| d.as_secs()) + .unwrap_or(0) + }; + + // Check the separate keepalive key for the most recent timestamp. + let keepalive_ts = if USE_SEPARATE_CLIENT_KEEPALIVE_KEY { + let operation_id = key.0.as_ref(); + match store.get_and_decode(ClientKeepaliveKey(operation_id)).await { + Ok(Some(ts)) => { + let awaited_ts = to_unix_ts(awaited_action.last_client_keepalive_timestamp()); + if ts > awaited_ts { + let timestamp = UNIX_EPOCH + Duration::from_secs(ts); + awaited_action.update_client_keep_alive(timestamp); + ts + } else { + awaited_ts + } + } + Ok(None) | Err(_) => to_unix_ts(awaited_action.last_client_keepalive_timestamp()), + } + } else { + to_unix_ts(awaited_action.last_client_keepalive_timestamp()) + }; + + last_known_keepalive_ts.store(keepalive_ts, Ordering::Release); Ok(awaited_action) } @@ -168,6 +196,7 @@ where OperationSubscriberState::Unsubscribed => { let subscription = store .subscription_manager() + .await .err_tip(|| "In OperationSubscriber::changed::subscription_manager")? 
.subscribe(self.subscription_key.borrow()) .err_tip(|| "In OperationSubscriber::changed::subscribe")?; @@ -200,42 +229,93 @@ where loop { // This is set if the maybe_last_state doesn't match the state in the store. let mut maybe_changed_action = None; - for attempt in 1..=MAX_RETRIES_FOR_CLIENT_KEEPALIVE { - let last_known_keepalive_ts = self.last_known_keepalive_ts.load(Ordering::Acquire); - if I::from_secs(last_known_keepalive_ts).elapsed() <= CLIENT_KEEPALIVE_DURATION { - break; // We are still within the keep alive duration. - } - if attempt > 1 { - // Wait a tick before retrying. - (self.now_fn)().sleep(Duration::from_millis(100)).await; - } - let mut awaited_action = Self::inner_get_awaited_action( - store.as_ref(), - self.subscription_key.borrow(), - self.maybe_client_operation_id.clone(), - &self.last_known_keepalive_ts, - ) - .await - .err_tip(|| "In OperationSubscriber::changed")?; - awaited_action.update_client_keep_alive((self.now_fn)().now()); - // If this is set to Some then the action changed without being published. 
- maybe_changed_action = self - .maybe_last_stage - .as_ref() - .is_some_and(|last_stage| { - *last_stage != core::mem::discriminant(&awaited_action.state().stage) - }) - .then(|| awaited_action.clone()); - match inner_update_awaited_action(store.as_ref(), awaited_action).await { - Ok(()) => break, - err if attempt == MAX_RETRIES_FOR_CLIENT_KEEPALIVE => { - err.err_tip_with_code(|_| { - (Code::Aborted, "Could not update client keep alive") - })?; + + let last_known_keepalive_ts = self.last_known_keepalive_ts.load(Ordering::Acquire); + if I::from_secs(last_known_keepalive_ts).elapsed() > CLIENT_KEEPALIVE_DURATION { + let now = (self.now_fn)().now(); + let now_ts = now + .duration_since(UNIX_EPOCH) + .map(|d| d.as_secs()) + .unwrap_or(0); + + if USE_SEPARATE_CLIENT_KEEPALIVE_KEY { + let operation_id = self.subscription_key.0.as_ref(); + let update_result = store + .update_data(UpdateClientKeepalive { + operation_id, + timestamp: now_ts, + }) + .await; + + if let Err(e) = update_result { + warn!( + ?self.subscription_key, + ?e, + "Failed to update client keepalive (non-versioned)" + ); + } + + // Update local timestamp + self.last_known_keepalive_ts + .store(now_ts, Ordering::Release); + + // Check if state changed (for unreliable subscription managers) + if self.maybe_last_stage.is_some() { + let awaited_action = Self::inner_get_awaited_action( + store.as_ref(), + self.subscription_key.borrow(), + self.maybe_client_operation_id.clone(), + &self.last_known_keepalive_ts, + ) + .await + .err_tip(|| "In OperationSubscriber::changed")?; + + if self.maybe_last_stage.as_ref().is_some_and(|last_stage| { + *last_stage != core::mem::discriminant(&awaited_action.state().stage) + }) { + maybe_changed_action = Some(awaited_action); + } + } + } else { + for attempt in 1..=MAX_RETRIES_FOR_CLIENT_KEEPALIVE { + if attempt > 1 { + (self.now_fn)().sleep(Duration::from_millis(100)).await; + warn!( + ?self.subscription_key, + attempt, + "Client keepalive retry due to version conflict" + 
); + } + let mut awaited_action = Self::inner_get_awaited_action( + store.as_ref(), + self.subscription_key.borrow(), + self.maybe_client_operation_id.clone(), + &self.last_known_keepalive_ts, + ) + .await + .err_tip(|| "In OperationSubscriber::changed")?; + awaited_action.update_client_keep_alive(now); + maybe_changed_action = self + .maybe_last_stage + .as_ref() + .is_some_and(|last_stage| { + *last_stage + != core::mem::discriminant(&awaited_action.state().stage) + }) + .then(|| awaited_action.clone()); + match inner_update_awaited_action(store.as_ref(), awaited_action).await { + Ok(()) => break, + err if attempt == MAX_RETRIES_FOR_CLIENT_KEEPALIVE => { + err.err_tip_with_code(|_| { + (Code::Aborted, "Could not update client keep alive") + })?; + } + _ => (), + } } - _ => (), } } + // If the polling shows that it's changed state then publish now. if let Some(changed_action) = maybe_changed_action { self.maybe_last_stage = @@ -292,6 +372,8 @@ fn awaited_action_decode(version: i64, data: &Bytes) -> Result(Cow<'a, OperationId>); @@ -338,6 +420,42 @@ impl SchedulerStoreDecodeTo for ClientIdToOperationId<'_> { } } +struct ClientKeepaliveKey<'a>(&'a OperationId); +impl SchedulerStoreKeyProvider for ClientKeepaliveKey<'_> { + type Versioned = FalseValue; + fn get_key(&self) -> StoreKey<'static> { + StoreKey::Str(Cow::Owned(format!( + "{CLIENT_KEEPALIVE_KEY_PREFIX}{}", + self.0 + ))) + } +} +impl SchedulerStoreDecodeTo for ClientKeepaliveKey<'_> { + type DecodeOutput = u64; + fn decode(_version: i64, data: Bytes) -> Result { + let s = core::str::from_utf8(&data) + .map_err(|e| make_input_err!("In ClientKeepaliveKey::decode utf8 - {e:?}"))?; + s.parse::() + .map_err(|e| make_input_err!("In ClientKeepaliveKey::decode parse - {e:?}")) + } +} + +struct UpdateClientKeepalive<'a> { + operation_id: &'a OperationId, + timestamp: u64, +} +impl SchedulerStoreKeyProvider for UpdateClientKeepalive<'_> { + type Versioned = FalseValue; + fn get_key(&self) -> StoreKey<'static> { + 
ClientKeepaliveKey(self.operation_id).get_key() + } +} +impl SchedulerStoreDataProvider for UpdateClientKeepalive<'_> { + fn try_into_bytes(self) -> Result { + Ok(Bytes::from(self.timestamp.to_string())) + } +} + // TODO(palfrey) We only need operation_id here, it would be nice if we had a way // to tell the decoder we only care about specific fields. struct SearchUniqueQualifierToAwaitedAction<'a>(&'a ActionUniqueQualifier); @@ -452,23 +570,27 @@ async fn inner_update_awaited_action( ) -> Result<(), Error> { let operation_id = new_awaited_action.operation_id().clone(); if new_awaited_action.state().client_operation_id != operation_id { - // Just in case the client_operation_id was set to something else - // we put it back to the underlying operation_id. new_awaited_action.set_client_operation_id(operation_id.clone()); } + + let _is_finished = new_awaited_action.state().stage.is_finished(); + let maybe_version = store .update_data(UpdateOperationIdToAwaitedAction(new_awaited_action)) .await .err_tip(|| "In RedisAwaitedActionDb::update_awaited_action")?; + if maybe_version.is_none() { - tracing::warn!( - "Could not update AwaitedAction because the version did not match for {operation_id}" + warn!( + %operation_id, + "Could not update AwaitedAction because the version did not match" ); return Err(make_err!( Code::Aborted, "Could not update AwaitedAction because the version did not match for {operation_id}", )); } + Ok(()) } @@ -493,7 +615,7 @@ where I: InstantWrapper, NowFn: Fn() -> I + Send + Sync + Clone + 'static, { - pub fn new( + pub async fn new( store: Arc, task_change_publisher: Arc, now_fn: NowFn, @@ -501,6 +623,7 @@ where ) -> Result { let mut subscription = store .subscription_manager() + .await .err_tip(|| "In RedisAwaitedActionDb::new")? 
.subscribe(OperationIdToAwaitedAction(Cow::Owned(OperationId::String( String::new(), @@ -614,6 +737,46 @@ where let Some(operation_id) = maybe_operation_id else { return Ok(None); }; + + // Validate that the internal operation actually exists. + // If it doesn't, this is an orphaned client operation mapping that should be cleaned up. + // This can happen when an operation is deleted (completed/timed out) but the + // client_id -> operation_id mapping persists in the store. + let maybe_awaited_action = match self + .store + .get_and_decode(OperationIdToAwaitedAction(Cow::Borrowed(&operation_id))) + .await + { + Ok(maybe_action) => maybe_action, + Err(err) if err.code == Code::NotFound => { + tracing::warn!( + "Orphaned client operation mapping detected: client_id={} maps to operation_id={}, \ + but the operation does not exist in the store (NotFound). This typically happens when \ + an operation completes or times out but the client mapping persists.", + client_operation_id, + operation_id + ); + None + } + Err(err) => { + // Some other error occurred + return Err(err).err_tip( + || "In RedisAwaitedActionDb::get_awaited_action_by_id::validate_operation", + ); + } + }; + + if maybe_awaited_action.is_none() { + tracing::warn!( + "Found orphaned client operation mapping: client_id={} -> operation_id={}, \ + but operation no longer exists. Returning None to prevent client from polling \ + a non-existent operation.", + client_operation_id, + operation_id + ); + return Ok(None); + } + Ok(Some(OperationSubscriber::new( Some(client_operation_id.clone()), OperationIdToAwaitedAction(Cow::Owned(operation_id)), @@ -791,4 +954,45 @@ where ) })) } + + async fn get_queued_actions(&self) -> Result>, Error> { + let prefix = SearchStateToAwaitedAction(get_state_prefix(SortedAwaitedActionState::Queued)); + let awaited_actions: Vec> = self + .store + .search_by_index_prefix(prefix) + .await + .err_tip(|| "In RedisAwaitedActionDb::get_queued_actions")? 
+ .map_ok(|awaited_action| Arc::new(AwaitedAction::from(awaited_action))) + .try_collect() + .await + .err_tip(|| "In RedisAwaitedActionDb::get_queued_actions")?; + + Ok(awaited_actions) + } + + async fn count_actions( + &self, + states: Vec, + ) -> Result, Error> { + let prefixes: Vec = states + .iter() + .map(|s| { + SearchStateToAwaitedAction(get_state_prefix( + SortedAwaitedActionState::try_from(s).unwrap(), + )) + }) + .collect(); + + let counts = self + .store + .count_by_index(prefixes) + .await + .err_tip(|| "In RedisAwaitedActionDb::count_actions")?; + + Ok(states + .iter() + .zip(counts) + .map(|(&s, count)| (s, count)) + .collect()) + } } diff --git a/nativelink-scheduler/src/worker.rs b/nativelink-scheduler/src/worker.rs index ed0ae6ed6..55f25a538 100644 --- a/nativelink-scheduler/src/worker.rs +++ b/nativelink-scheduler/src/worker.rs @@ -13,7 +13,7 @@ // limitations under the License. use core::hash::{Hash, Hasher}; -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::sync::Arc; use std::time::{SystemTime, UNIX_EPOCH}; @@ -25,6 +25,7 @@ use nativelink_proto::com::github::trace_machina::nativelink::remote_execution:: use nativelink_util::action_messages::{ActionInfo, OperationId, WorkerId}; use nativelink_util::metrics_utils::{AsyncCounterWrapper, CounterWithTime, FuncCounterWrapper}; use nativelink_util::platform_properties::{PlatformProperties, PlatformPropertyValue}; +use serde::Serialize; use tokio::sync::mpsc::UnboundedSender; pub type WorkerTimestamp = u64; @@ -33,7 +34,7 @@ pub type WorkerTimestamp = u64; /// These platform properties have the type of the properties as well as /// the value of the properties, unlike `ActionInfo`, which only has the /// string value of the properties. -#[derive(Clone, Debug, MetricsComponent)] +#[derive(Clone, Debug, MetricsComponent, Serialize)] pub struct ActionInfoWithProps { /// The action info of the action. 
#[metric(group = "action_info")] @@ -53,12 +54,29 @@ pub enum WorkerUpdate { Disconnect, } -#[derive(Debug, MetricsComponent)] +#[derive(Debug, MetricsComponent, Serialize, Clone)] pub struct PendingActionInfoData { #[metric] pub action_info: ActionInfoWithProps, } +#[derive(Serialize, Debug)] +pub struct WorkerState { + pub id: WorkerId, + pub platform_properties: PlatformProperties, + pub running_action_infos: HashMap, + pub last_update_timestamp: WorkerTimestamp, + pub is_paused: bool, + pub is_draining: bool, +} + +#[derive(Serialize, Debug)] +pub struct ActionsState { + pub executing: usize, + pub queued: usize, + pub completed: usize, +} + /// Represents a connection to a worker and used as the medium to /// interact with the worker from the client/scheduler. #[derive(Debug, MetricsComponent)] @@ -78,6 +96,9 @@ pub struct Worker { #[metric(group = "running_action_infos")] pub running_action_infos: HashMap, + /// If the properties were restored already then it's added to this set. + pub restored_platform_properties: HashSet, + /// Timestamp of last time this worker had been communicated with. // Warning: Do not update this timestamp without updating the placement of the worker in // the LRUCache in the Workers struct. @@ -92,6 +113,10 @@ pub struct Worker { #[metric(help = "If the worker is draining.")] pub is_draining: bool, + /// Maximum inflight tasks for this worker (or 0 for unlimited) + #[metric(help = "Maximum inflight tasks for this worker (or 0 for unlimited)")] + pub max_inflight_tasks: u64, + /// Stats about the worker. #[metric] metrics: Arc, @@ -108,11 +133,11 @@ fn send_msg_to_worker( /// Reduces the platform properties available on the worker based on the platform properties provided. /// This is used because we allow more than 1 job to run on a worker at a time, and this is how the /// scheduler knows if more jobs can run on a given worker. 
-fn reduce_platform_properties( +pub fn reduce_platform_properties( parent_props: &mut PlatformProperties, reduction_props: &PlatformProperties, ) { - debug_assert!(reduction_props.is_satisfied_by(parent_props)); + debug_assert!(reduction_props.is_satisfied_by(parent_props, false)); for (property, prop_value) in &reduction_props.properties { if let PlatformPropertyValue::Minimum(value) = prop_value { let worker_props = &mut parent_props.properties; @@ -131,15 +156,18 @@ impl Worker { platform_properties: PlatformProperties, tx: UnboundedSender, timestamp: WorkerTimestamp, + max_inflight_tasks: u64, ) -> Self { Self { id, platform_properties, tx, running_action_infos: HashMap::new(), + restored_platform_properties: HashSet::new(), last_update_timestamp: timestamp, is_paused: false, is_draining: false, + max_inflight_tasks, metrics: Arc::new(Metrics { connected_timestamp: SystemTime::now() .duration_since(UNIX_EPOCH) @@ -219,6 +247,18 @@ impl Worker { .await } + pub(crate) fn execution_complete(&mut self, operation_id: &OperationId) { + if let Some((operation_id, pending_action_info)) = + self.running_action_infos.remove_entry(operation_id) + { + self.restored_platform_properties + .insert(operation_id.clone()); + self.restore_platform_properties(&pending_action_info.action_info.platform_properties); + self.running_action_infos + .insert(operation_id, pending_action_info); + } + } + pub(crate) async fn complete_action( &mut self, operation_id: &OperationId, @@ -229,7 +269,9 @@ impl Worker { self.id, operation_id ) })?; - self.restore_platform_properties(&pending_action_info.action_info.platform_properties); + if !self.restored_platform_properties.remove(operation_id) { + self.restore_platform_properties(&pending_action_info.action_info.platform_properties); + } self.is_paused = false; self.metrics.actions_completed.inc(); Ok(()) @@ -252,8 +294,23 @@ impl Worker { } } - pub const fn can_accept_work(&self) -> bool { - !self.is_paused && !self.is_draining + pub fn 
can_accept_work(&self) -> bool { + !self.is_paused + && !self.is_draining + && (self.max_inflight_tasks == 0 + || u64::try_from(self.running_action_infos.len()).unwrap_or(u64::MAX) + < self.max_inflight_tasks) + } + + pub fn to_state(&self) -> WorkerState { + WorkerState { + id: self.id.clone(), + platform_properties: self.platform_properties.clone(), + running_action_infos: self.running_action_infos.iter().map(|(k, v)| (k.to_string(), v.clone())).collect(), + last_update_timestamp: self.last_update_timestamp, + is_paused: self.is_paused, + is_draining: self.is_draining, + } } } diff --git a/nativelink-scheduler/src/worker_capability_index.rs b/nativelink-scheduler/src/worker_capability_index.rs new file mode 100644 index 000000000..b0e45b76b --- /dev/null +++ b/nativelink-scheduler/src/worker_capability_index.rs @@ -0,0 +1,229 @@ +// Copyright 2024 The NativeLink Authors. All rights reserved. +// +// Licensed under the Functional Source License, Version 1.1, Apache 2.0 Future License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// See LICENSE file for details +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Worker capability index for fast worker matching. +//! +//! This module provides an index that accelerates worker matching by property. +//! Instead of iterating all workers for each action, we maintain an inverted index +//! that maps property values to sets of workers that have those values. +//! +//! ## Complexity Analysis +//! +//! Without index: O(W × P) where W = workers, P = properties per action +//! 
With index: O(P × log(W)) for exact properties + O(W' × P') for minimum properties +//! where W' = filtered workers, P' = minimum property count (typically small) +//! +//! For typical workloads (few minimum properties), this reduces matching from +//! O(n × m) to approximately O(log n). + +use std::collections::{HashMap, HashSet}; + +use nativelink_util::action_messages::WorkerId; +use nativelink_util::platform_properties::{PlatformProperties, PlatformPropertyValue}; +use tracing::info; + +/// A property key-value pair used for indexing. +#[derive(Clone, Hash, Eq, PartialEq, Debug)] +struct PropertyKey { + name: String, + value: PlatformPropertyValue, +} + +/// Index structure for fast worker capability lookup. +/// +/// Maintains an inverted index from property values to worker IDs. +/// Only indexes `Exact` and `Priority` properties since `Minimum` properties +/// are dynamic and require runtime comparison. +#[derive(Debug, Default)] +pub struct WorkerCapabilityIndex { + /// Maps `(property_name, property_value)` -> Set of worker IDs with that property. + /// Only contains `Exact` and `Priority` properties. + exact_index: HashMap>, + + /// Maps `property_name` -> Set of worker IDs that have this property (any value). + /// Used for fast "has property" checks for `Priority` and `Minimum` properties. + property_presence: HashMap>, + + /// Set of all indexed worker IDs. + all_workers: HashSet, +} + +impl WorkerCapabilityIndex { + /// Creates a new empty capability index. + pub fn new() -> Self { + Self::default() + } + + /// Adds a worker to the index with their platform properties. 
+ pub fn add_worker(&mut self, worker_id: &WorkerId, properties: &PlatformProperties) { + self.all_workers.insert(worker_id.clone()); + + for (name, value) in &properties.properties { + // Track property presence + self.property_presence + .entry(name.clone()) + .or_default() + .insert(worker_id.clone()); + + match value { + PlatformPropertyValue::Exact(_) + | PlatformPropertyValue::Priority(_) + | PlatformPropertyValue::Unknown(_) => { + // Index exact-match properties + let key = PropertyKey { + name: name.clone(), + value: value.clone(), + }; + self.exact_index + .entry(key) + .or_default() + .insert(worker_id.clone()); + } + PlatformPropertyValue::Minimum(_) | PlatformPropertyValue::Ignore(_) => { + // Minimum properties are tracked via property_presence only. + // Their actual values are checked at runtime since they're dynamic. + + // Ignore properties we just drop + } + } + } + } + + /// Removes a worker from the index. + pub fn remove_worker(&mut self, worker_id: &WorkerId) { + self.all_workers.remove(worker_id); + + // Remove from exact index + self.exact_index.retain(|_, workers| { + workers.remove(worker_id); + !workers.is_empty() + }); + + // Remove from presence index + self.property_presence.retain(|_, workers| { + workers.remove(worker_id); + !workers.is_empty() + }); + } + + /// Finds workers that can satisfy the given action properties. + /// + /// Returns a set of worker IDs that match all required properties. + /// The caller should apply additional filtering (e.g., worker availability). + /// + /// IMPORTANT: This method returns candidates based on STATIC properties only. + /// - Exact and Unknown properties are fully matched + /// - Priority properties just require the key to exist + /// - Minimum properties return workers that HAVE the property (presence check only) + /// + /// The caller MUST still verify Minimum property values at runtime because + /// worker resources change dynamically as jobs are assigned/completed. 
+ pub fn find_matching_workers( + &self, + action_properties: &PlatformProperties, + full_worker_logging: bool, + ) -> HashSet { + if self.all_workers.is_empty() { + if full_worker_logging { + info!("No workers available to match!"); + } + return HashSet::new(); + } + + if action_properties.properties.is_empty() { + // No properties required, all workers match + return self.all_workers.clone(); + } + + let mut candidates: Option> = None; + + for (name, value) in &action_properties.properties { + match value { + PlatformPropertyValue::Exact(_) | PlatformPropertyValue::Unknown(_) => { + // Look up workers with exact match + let key = PropertyKey { + name: name.clone(), + value: value.clone(), + }; + + let matching = self.exact_index.get(&key).cloned().unwrap_or_default(); + + let internal_candidates = match candidates { + Some(existing) => existing.intersection(&matching).cloned().collect(), + None => matching, + }; + + // Early exit if no candidates + if internal_candidates.is_empty() { + if full_worker_logging { + let values: Vec<_> = self + .exact_index + .iter() + .filter(|pk| &pk.0.name == name) + .map(|pk| pk.0.value.clone()) + .collect(); + info!( + "No candidate workers due to a lack of matching '{name}' = {value:?}. Workers have: {values:?}" + ); + } + return HashSet::new(); + } + candidates = Some(internal_candidates); + } + PlatformPropertyValue::Priority(_) | PlatformPropertyValue::Minimum(_) => { + // Priority: just requires the key to exist + // Minimum: worker must have the property (value checked at runtime by caller) + // We only check presence here because Minimum values are DYNAMIC - + // they change as jobs are assigned to workers. 
+ let workers_with_property = self + .property_presence + .get(name) + .cloned() + .unwrap_or_default(); + + let internal_candidates = match candidates { + Some(existing) => existing + .intersection(&workers_with_property) + .cloned() + .collect(), + None => workers_with_property, + }; + + if internal_candidates.is_empty() { + if full_worker_logging { + info!( + "No candidate workers due to a lack of key '{name}'. Job asked for {value:?}" + ); + } + return HashSet::new(); + } + candidates = Some(internal_candidates); + } + PlatformPropertyValue::Ignore(_) => {} + } + } + + candidates.unwrap_or_else(|| self.all_workers.clone()) + } + + /// Returns the number of indexed workers. + pub fn worker_count(&self) -> usize { + self.all_workers.len() + } + + /// Returns true if the index is empty. + pub fn is_empty(&self) -> bool { + self.all_workers.is_empty() + } +} diff --git a/nativelink-scheduler/src/worker_registry.rs b/nativelink-scheduler/src/worker_registry.rs new file mode 100644 index 000000000..0f0b5c3af --- /dev/null +++ b/nativelink-scheduler/src/worker_registry.rs @@ -0,0 +1,161 @@ +// Copyright 2024 The NativeLink Authors. All rights reserved. +// +// Licensed under the Functional Source License, Version 1.1, Apache 2.0 Future License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// See LICENSE file for details +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use core::time::Duration; +use std::collections::HashMap; +use std::sync::Arc; +use std::time::SystemTime; + +use async_lock::RwLock; +use nativelink_util::action_messages::WorkerId; +use tracing::{debug, trace}; + +/// In-memory worker registry that tracks worker liveness. +#[derive(Debug)] +pub struct WorkerRegistry { + workers: RwLock>, +} + +impl Default for WorkerRegistry { + fn default() -> Self { + Self::new() + } +} + +impl WorkerRegistry { + /// Creates a new worker registry. + pub fn new() -> Self { + Self { + workers: RwLock::new(HashMap::new()), + } + } + + /// Updates the heartbeat timestamp for a worker. + pub async fn update_worker_heartbeat(&self, worker_id: &WorkerId, now: SystemTime) { + let mut workers = self.workers.write().await; + workers.insert(worker_id.clone(), now); + trace!(?worker_id, "FLOW: Worker heartbeat updated in registry"); + } + + pub async fn register_worker(&self, worker_id: &WorkerId, now: SystemTime) { + let mut workers = self.workers.write().await; + workers.insert(worker_id.clone(), now); + debug!(?worker_id, "FLOW: Worker registered in registry"); + } + + pub async fn remove_worker(&self, worker_id: &WorkerId) { + let mut workers = self.workers.write().await; + workers.remove(worker_id); + debug!(?worker_id, "FLOW: Worker removed from registry"); + } + + pub async fn is_worker_alive( + &self, + worker_id: &WorkerId, + timeout: Duration, + now: SystemTime, + ) -> bool { + let workers = self.workers.read().await; + + if let Some(last_seen) = workers.get(worker_id) { + if let Some(deadline) = last_seen.checked_add(timeout) { + let is_alive = deadline > now; + trace!( + ?worker_id, + ?last_seen, + ?timeout, + is_alive, + "FLOW: Worker liveness check" + ); + return is_alive; + } + } + + trace!(?worker_id, "FLOW: Worker not found or timed out"); + false + } + + pub async fn get_worker_last_seen(&self, worker_id: &WorkerId) -> Option { + let workers = self.workers.read().await; + workers.get(worker_id).copied() + } +} + +pub 
type SharedWorkerRegistry = Arc; + +#[cfg(test)] +mod tests { + use nativelink_macro::nativelink_test; + + use super::*; + + #[nativelink_test] + async fn test_worker_heartbeat() { + let registry = WorkerRegistry::new(); + let worker_id = WorkerId::from(String::from("test")); + let now = SystemTime::now(); + + // Worker not registered yet + assert!( + !registry + .is_worker_alive(&worker_id, Duration::from_secs(5), now) + .await + ); + + // Register worker + registry.register_worker(&worker_id, now).await; + assert!( + registry + .is_worker_alive(&worker_id, Duration::from_secs(5), now) + .await + ); + + // Check with expired timeout + let future = now.checked_add(Duration::from_secs(10)).unwrap(); + assert!( + !registry + .is_worker_alive(&worker_id, Duration::from_secs(5), future) + .await + ); + + // Update heartbeat + registry.update_worker_heartbeat(&worker_id, future).await; + assert!( + registry + .is_worker_alive(&worker_id, Duration::from_secs(5), future) + .await + ); + } + + #[nativelink_test] + async fn test_remove_worker() { + let registry = WorkerRegistry::new(); + let worker_id = WorkerId::from(String::from("test-worker")); + let now = SystemTime::now(); + + registry.register_worker(&worker_id, now).await; + assert!( + registry + .is_worker_alive(&worker_id, Duration::from_secs(5), now) + .await + ); + + registry.remove_worker(&worker_id).await; + assert!( + !registry + .is_worker_alive(&worker_id, Duration::from_secs(5), now) + .await + ); + } +} diff --git a/nativelink-scheduler/src/worker_scheduler.rs b/nativelink-scheduler/src/worker_scheduler.rs index 47ea80687..fe9bcb0f4 100644 --- a/nativelink-scheduler/src/worker_scheduler.rs +++ b/nativelink-scheduler/src/worker_scheduler.rs @@ -22,7 +22,7 @@ use nativelink_util::shutdown_guard::ShutdownGuard; use crate::platform_property_manager::PlatformPropertyManager; use crate::worker::{Worker, WorkerTimestamp}; -/// WorkerScheduler interface is responsible for interactions between the scheduler +/// 
`WorkerScheduler` interface is responsible for interactions between the scheduler /// and worker related operations. #[async_trait] pub trait WorkerScheduler: Sync + Send + Unpin + RootMetricsComponent + 'static { diff --git a/nativelink-scheduler/tests/action_messages_test.rs b/nativelink-scheduler/tests/action_messages_test.rs index 7b58c4704..46e94e631 100644 --- a/nativelink-scheduler/tests/action_messages_test.rs +++ b/nativelink-scheduler/tests/action_messages_test.rs @@ -43,6 +43,7 @@ async fn action_state_any_url_test() -> Result<(), Error> { // Result is only populated if has_action_result. stage: ActionStage::Completed(ActionResult::default()), action_digest, + last_transition_timestamp: SystemTime::now(), }; let operation: Operation = action_state.as_operation(client_id); diff --git a/nativelink-scheduler/tests/cache_lookup_scheduler_test.rs b/nativelink-scheduler/tests/cache_lookup_scheduler_test.rs index cc24a78c1..27900be39 100644 --- a/nativelink-scheduler/tests/cache_lookup_scheduler_test.rs +++ b/nativelink-scheduler/tests/cache_lookup_scheduler_test.rs @@ -13,7 +13,7 @@ // limitations under the License. 
use std::sync::Arc; -use std::time::UNIX_EPOCH; +use std::time::{SystemTime, UNIX_EPOCH}; mod utils { pub(crate) mod scheduler_utils; @@ -71,6 +71,7 @@ async fn add_action_handles_skip_cache() -> Result<(), Error> { client_operation_id: OperationId::default(), stage: ActionStage::Queued, action_digest: action_info.unique_qualifier.digest(), + last_transition_timestamp: SystemTime::now(), })); let ActionUniqueQualifier::Cacheable(action_key) = action_info.unique_qualifier.clone() else { panic!("This test should be testing when item was cached first"); diff --git a/nativelink-scheduler/tests/property_modifier_scheduler_test.rs b/nativelink-scheduler/tests/property_modifier_scheduler_test.rs index e546327f9..16cf3ea98 100644 --- a/nativelink-scheduler/tests/property_modifier_scheduler_test.rs +++ b/nativelink-scheduler/tests/property_modifier_scheduler_test.rs @@ -14,7 +14,7 @@ use std::collections::HashMap; use std::sync::Arc; -use std::time::UNIX_EPOCH; +use std::time::{SystemTime, UNIX_EPOCH}; mod utils { pub(crate) mod scheduler_utils; @@ -70,6 +70,7 @@ async fn add_action_adds_property() -> Result<(), Error> { client_operation_id: OperationId::default(), stage: ActionStage::Queued, action_digest: action_info.unique_qualifier.digest(), + last_transition_timestamp: SystemTime::now(), })); let client_operation_id = OperationId::default(); let (_, (passed_client_operation_id, action_info)) = join!( @@ -114,6 +115,7 @@ async fn add_action_overwrites_property() -> Result<(), Error> { client_operation_id: OperationId::default(), stage: ActionStage::Queued, action_digest: action_info.unique_qualifier.digest(), + last_transition_timestamp: SystemTime::now(), })); let client_operation_id = OperationId::default(); let (_, (passed_client_operation_id, action_info)) = join!( @@ -153,6 +155,7 @@ async fn add_action_property_added_after_remove() -> Result<(), Error> { client_operation_id: OperationId::default(), stage: ActionStage::Queued, action_digest: 
action_info.unique_qualifier.digest(), + last_transition_timestamp: SystemTime::now(), })); let client_operation_id = OperationId::default(); let (_, (passed_client_operation_id, action_info)) = join!( @@ -192,6 +195,7 @@ async fn add_action_property_remove_after_add() -> Result<(), Error> { client_operation_id: OperationId::default(), stage: ActionStage::Queued, action_digest: action_info.unique_qualifier.digest(), + last_transition_timestamp: SystemTime::now(), })); let client_operation_id = OperationId::default(); let (_, (passed_client_operation_id, action_info)) = join!( @@ -233,6 +237,7 @@ async fn add_action_property_replace() -> Result<(), Error> { client_operation_id: OperationId::default(), stage: ActionStage::Queued, action_digest: action_info.unique_qualifier.digest(), + last_transition_timestamp: SystemTime::now(), })); let client_operation_id = OperationId::default(); let (_, (passed_client_operation_id, action_info)) = join!( @@ -277,6 +282,7 @@ async fn add_action_property_replace_match_value() -> Result<(), Error> { client_operation_id: OperationId::default(), stage: ActionStage::Queued, action_digest: action_info.unique_qualifier.digest(), + last_transition_timestamp: SystemTime::now(), })); let client_operation_id = OperationId::default(); let (_, (passed_client_operation_id, action_info)) = join!( @@ -322,6 +328,7 @@ async fn add_action_property_replace_value() -> Result<(), Error> { client_operation_id: OperationId::default(), stage: ActionStage::Queued, action_digest: action_info.unique_qualifier.digest(), + last_transition_timestamp: SystemTime::now(), })); let client_operation_id = OperationId::default(); let (_, (passed_client_operation_id, action_info)) = join!( @@ -359,6 +366,7 @@ async fn add_action_property_remove() -> Result<(), Error> { client_operation_id: OperationId::default(), stage: ActionStage::Queued, action_digest: action_info.unique_qualifier.digest(), + last_transition_timestamp: SystemTime::now(), })); // let 
platform_property_manager = Arc::new(PlatformPropertyManager::new(HashMap::new())); let client_operation_id = OperationId::default(); diff --git a/nativelink-scheduler/tests/property_router_scheduler_test.rs b/nativelink-scheduler/tests/property_router_scheduler_test.rs new file mode 100644 index 000000000..1bb1eb504 --- /dev/null +++ b/nativelink-scheduler/tests/property_router_scheduler_test.rs @@ -0,0 +1,288 @@ +use std::collections::HashMap; +use std::sync::Arc; +use std::time::{SystemTime, UNIX_EPOCH}; + +mod utils { + pub(crate) mod scheduler_utils; +} + +use futures::{StreamExt, join}; +use nativelink_error::{Error, make_input_err}; +use nativelink_macro::nativelink_test; +use nativelink_scheduler::mock_scheduler::MockActionScheduler; +use nativelink_scheduler::property_router_scheduler::PropertyRouterScheduler; +use nativelink_util::action_messages::{ActionStage, ActionState, OperationId}; +use nativelink_util::common::DigestInfo; +use nativelink_util::known_platform_property_provider::KnownPlatformPropertyProvider; +use nativelink_util::operation_state_manager::{ClientStateManager, OperationFilter}; +use pretty_assertions::assert_eq; +use tokio::sync::watch; +use utils::scheduler_utils::{TokioWatchActionStateResult, make_base_action_info}; + +struct TestContext { + compile_scheduler: Arc, + default_scheduler: Arc, + router: PropertyRouterScheduler, +} + +fn make_router() -> TestContext { + let compile_scheduler = Arc::new(MockActionScheduler::new()); + let default_scheduler = Arc::new(MockActionScheduler::new()); + let mut routes = HashMap::new(); + routes.insert( + "compile".to_string(), + compile_scheduler.clone() + as Arc, + ); + let router = PropertyRouterScheduler::new( + "container-image", + routes, + default_scheduler.clone() as Arc, + ); + TestContext { + compile_scheduler, + default_scheduler, + router, + } +} + +#[nativelink_test] +async fn routes_to_matching_scheduler() -> Result<(), Error> { + let ctx = make_router(); + let mut action_info = 
make_base_action_info(UNIX_EPOCH, DigestInfo::zero_digest()) + .as_ref() + .clone(); + action_info + .platform_properties + .insert("container-image".to_string(), "compile".to_string()); + let action_info = Arc::new(action_info); + + let (_tx, rx) = watch::channel(Arc::new(ActionState { + client_operation_id: OperationId::default(), + stage: ActionStage::Queued, + action_digest: action_info.unique_qualifier.digest(), + last_transition_timestamp: SystemTime::now(), + })); + let client_operation_id = OperationId::default(); + + let (_, (received_op_id, received_action)) = + join!( + ctx.router + .add_action(client_operation_id.clone(), action_info.clone()), + ctx.compile_scheduler.expect_add_action(Ok(Box::new( + TokioWatchActionStateResult::new(client_operation_id.clone(), action_info, rx) + ))), + ); + assert_eq!(client_operation_id, received_op_id); + assert_eq!( + Some(&"compile".to_string()), + received_action.platform_properties.get("container-image") + ); + Ok(()) +} + +#[nativelink_test] +async fn routes_to_default_when_no_match() -> Result<(), Error> { + let ctx = make_router(); + let mut action_info = make_base_action_info(UNIX_EPOCH, DigestInfo::zero_digest()) + .as_ref() + .clone(); + action_info.platform_properties.insert( + "container-image".to_string(), + "some-other-image".to_string(), + ); + let action_info = Arc::new(action_info); + + let (_tx, rx) = watch::channel(Arc::new(ActionState { + client_operation_id: OperationId::default(), + stage: ActionStage::Queued, + action_digest: action_info.unique_qualifier.digest(), + last_transition_timestamp: SystemTime::now(), + })); + let client_operation_id = OperationId::default(); + + let (_, (received_op_id, received_action)) = + join!( + ctx.router + .add_action(client_operation_id.clone(), action_info.clone()), + ctx.default_scheduler.expect_add_action(Ok(Box::new( + TokioWatchActionStateResult::new(client_operation_id.clone(), action_info, rx) + ))), + ); + assert_eq!(client_operation_id, 
received_op_id); + assert_eq!( + Some(&"some-other-image".to_string()), + received_action.platform_properties.get("container-image") + ); + Ok(()) +} + +#[nativelink_test] +async fn routes_to_default_when_property_missing() -> Result<(), Error> { + let ctx = make_router(); + let action_info = make_base_action_info(UNIX_EPOCH, DigestInfo::zero_digest()); + + let (_tx, rx) = watch::channel(Arc::new(ActionState { + client_operation_id: OperationId::default(), + stage: ActionStage::Queued, + action_digest: action_info.unique_qualifier.digest(), + last_transition_timestamp: SystemTime::now(), + })); + let client_operation_id = OperationId::default(); + + let (_, (received_op_id, received_action)) = + join!( + ctx.router + .add_action(client_operation_id.clone(), action_info.clone()), + ctx.default_scheduler.expect_add_action(Ok(Box::new( + TokioWatchActionStateResult::new(client_operation_id.clone(), action_info, rx) + ))), + ); + assert_eq!(client_operation_id, received_op_id); + assert!( + !received_action + .platform_properties + .contains_key("container-image"), + "Expected no container-image property" + ); + Ok(()) +} + +#[nativelink_test] +async fn routes_multiple_values() -> Result<(), Error> { + let ctx = make_router(); + + // First action: routes to compile_scheduler + { + let mut action_info = make_base_action_info(UNIX_EPOCH, DigestInfo::zero_digest()) + .as_ref() + .clone(); + action_info + .platform_properties + .insert("container-image".to_string(), "compile".to_string()); + let action_info = Arc::new(action_info); + + let (_tx, rx) = watch::channel(Arc::new(ActionState { + client_operation_id: OperationId::default(), + stage: ActionStage::Queued, + action_digest: action_info.unique_qualifier.digest(), + last_transition_timestamp: SystemTime::now(), + })); + let client_operation_id = OperationId::default(); + + let (_, (received_op_id, received_action)) = join!( + ctx.router + .add_action(client_operation_id.clone(), action_info.clone()), + 
ctx.compile_scheduler.expect_add_action(Ok(Box::new( + TokioWatchActionStateResult::new(client_operation_id.clone(), action_info, rx) + ))), + ); + assert_eq!(client_operation_id, received_op_id); + assert_eq!( + Some(&"compile".to_string()), + received_action.platform_properties.get("container-image") + ); + } + + // Second action: routes to default_scheduler + { + let mut action_info = make_base_action_info(UNIX_EPOCH, DigestInfo::zero_digest()) + .as_ref() + .clone(); + action_info + .platform_properties + .insert("container-image".to_string(), "default-image".to_string()); + let action_info = Arc::new(action_info); + + let (_tx, rx) = watch::channel(Arc::new(ActionState { + client_operation_id: OperationId::default(), + stage: ActionStage::Queued, + action_digest: action_info.unique_qualifier.digest(), + last_transition_timestamp: SystemTime::now(), + })); + let client_operation_id = OperationId::default(); + + let (_, (received_op_id, received_action)) = join!( + ctx.router + .add_action(client_operation_id.clone(), action_info.clone()), + ctx.default_scheduler.expect_add_action(Ok(Box::new( + TokioWatchActionStateResult::new(client_operation_id.clone(), action_info, rx) + ))), + ); + assert_eq!(client_operation_id, received_op_id); + assert_eq!( + Some(&"default-image".to_string()), + received_action.platform_properties.get("container-image") + ); + } + + Ok(()) +} + +#[nativelink_test] +async fn filter_operations_fans_out_to_all() -> Result<(), Error> { + let ctx = make_router(); + let filter = OperationFilter { + client_operation_id: Some(OperationId::default()), + ..Default::default() + }; + + // The router calls filter_operations sequentially on routes then default. + // Since HashMap order is arbitrary, we join both expects concurrently. 
+ let (router_result, compile_filter, default_filter) = join!( + ctx.router.filter_operations(filter.clone()), + ctx.compile_scheduler + .expect_filter_operations(Ok(Box::pin(futures::stream::empty()))), + ctx.default_scheduler + .expect_filter_operations(Ok(Box::pin(futures::stream::empty()))), + ); + + assert!(router_result.unwrap().next().await.is_none()); + assert_eq!(filter, compile_filter); + assert_eq!(filter, default_filter); + Ok(()) +} + +#[nativelink_test] +async fn known_properties_unions_all_schedulers() -> Result<(), Error> { + let ctx = make_router(); + + let (known_props, _compile_instance, _default_instance) = join!( + ctx.router.get_known_properties("my-instance"), + ctx.compile_scheduler + .expect_get_known_properties(Ok(vec!["cpu_arch".to_string()])), + ctx.default_scheduler + .expect_get_known_properties(Ok(vec!["os".to_string(), "cpu_arch".to_string()])), + ); + + let mut props = known_props.unwrap(); + props.sort(); + assert_eq!(vec!["cpu_arch".to_string(), "os".to_string()], props); + Ok(()) +} + +#[nativelink_test] +async fn error_from_nested_scheduler_propagates() -> Result<(), Error> { + let ctx = make_router(); + let mut action_info = make_base_action_info(UNIX_EPOCH, DigestInfo::zero_digest()) + .as_ref() + .clone(); + action_info + .platform_properties + .insert("container-image".to_string(), "compile".to_string()); + let action_info = Arc::new(action_info); + + let client_operation_id = OperationId::default(); + let (result, _) = join!( + ctx.router + .add_action(client_operation_id.clone(), action_info.clone()), + ctx.compile_scheduler + .expect_add_action(Err(make_input_err!("Simulated scheduler error"))), + ); + + assert!( + result.is_err(), + "Expected error to propagate from nested scheduler" + ); + Ok(()) +} diff --git a/nativelink-scheduler/tests/redis_store_awaited_action_db_test.rs b/nativelink-scheduler/tests/redis_store_awaited_action_db_test.rs index e5084b698..2f786d42e 100644 --- 
a/nativelink-scheduler/tests/redis_store_awaited_action_db_test.rs +++ b/nativelink-scheduler/tests/redis_store_awaited_action_db_test.rs @@ -1,4 +1,4 @@ -// Copyright 2024 The NativeLink Authorsr All rights reserved. +// Copyright 2024 The NativeLink Authors. All rights reserved. // // Licensed under the Functional Source License, Version 1.1, Apache 2.0 Future License (the "License"); // you may not use this file except in compliance with the License. @@ -13,24 +13,14 @@ // limitations under the License. use core::time::Duration; -use std::collections::hash_map::Entry; -use std::collections::{HashMap, VecDeque}; -use std::fmt; +use std::collections::HashMap; use std::sync::Arc; -use std::thread::panicking; use std::time::SystemTime; -use bytes::Bytes; -use fred::bytes_utils::string::Str; -use fred::clients::SubscriberClient; -use fred::error::{Error as RedisError, ErrorKind as RedisErrorKind}; -use fred::mocks::{MockCommand, Mocks}; -use fred::prelude::{Builder, Pool as RedisPool}; -use fred::types::Value as RedisValue; -use fred::types::config::{Config as RedisConfig, PerformanceConfig}; use futures::StreamExt; use mock_instant::global::SystemTime as MockSystemTime; use nativelink_config::schedulers::SimpleSpec; +use nativelink_config::stores::RedisSpec; use nativelink_error::{Error, ResultExt}; use nativelink_macro::nativelink_test; use nativelink_proto::build::bazel::remote::execution::v2::{ @@ -39,6 +29,7 @@ use nativelink_proto::build::bazel::remote::execution::v2::{ use nativelink_proto::com::github::trace_machina::nativelink::remote_execution::{ ConnectionResult, StartExecute, UpdateForWorker, update_for_worker, }; +use nativelink_redis_tester::FakeRedisBackend; use nativelink_scheduler::awaited_action_db::{ AwaitedAction, AwaitedActionDb, AwaitedActionSubscriber, }; @@ -55,9 +46,11 @@ use nativelink_util::digest_hasher::DigestHasherFunc; use nativelink_util::instant_wrapper::MockInstantWrapped; use 
nativelink_util::operation_state_manager::{ClientStateManager, OperationFilter}; use nativelink_util::platform_properties::PlatformProperties; -use nativelink_util::store_trait::{SchedulerStore, SchedulerSubscriptionManager}; +use nativelink_util::store_trait::SchedulerStore; use parking_lot::Mutex; use pretty_assertions::assert_eq; +use redis::Value; +use tokio::sync::mpsc::unbounded_channel; use tokio::sync::{Notify, mpsc}; use tonic::Code; use utils::scheduler_utils::update_eq; @@ -67,360 +60,6 @@ mod utils { } const INSTANCE_NAME: &str = "instance_name"; -const TEMP_UUID: &str = "550e8400-e29b-41d4-a716-446655440000"; -const SCRIPT_VERSION: &str = "3e762c15"; -const VERSION_SCRIPT_HASH: &str = "b22b9926cbce9dd9ba97fa7ba3626f89feea1ed5"; -const MAX_CHUNK_UPLOADS_PER_UPDATE: usize = 10; -const SCAN_COUNT: u32 = 10_000; - -fn mock_uuid_generator() -> String { - uuid::Uuid::parse_str(TEMP_UUID).unwrap().to_string() -} - -type CommandandCallbackTuple = (MockCommand, Option>); -#[derive(Default)] -struct MockRedisBackend { - /// Commands we expect to encounter, and results we to return to the client. - // Commands are pushed from the back and popped from the front. 
- expected: Mutex)>>, -} - -impl fmt::Debug for MockRedisBackend { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("MockRedisBackend").finish() - } -} - -impl MockRedisBackend { - fn new() -> Self { - Self::default() - } - - fn expect( - &self, - command: MockCommand, - result: Result, - cb: Option>, - ) -> &Self { - self.expected.lock().push_back(((command, cb), result)); - self - } -} - -impl Mocks for MockRedisBackend { - fn process_command(&self, actual: MockCommand) -> Result { - let Some(((expected, maybe_cb), result)) = self.expected.lock().pop_front() else { - // panic here -- this isn't a redis error, it's a test failure - panic!("Didn't expect any more commands, but received {actual:?}"); - }; - - assert_eq!(expected, actual); - if let Some(cb) = maybe_cb { - (cb)(); - } - - result - } - - fn process_transaction(&self, commands: Vec) -> Result { - static MULTI: MockCommand = MockCommand { - cmd: Str::from_static("MULTI"), - subcommand: None, - args: Vec::new(), - }; - static EXEC: MockCommand = MockCommand { - cmd: Str::from_static("EXEC"), - subcommand: None, - args: Vec::new(), - }; - - let results = core::iter::once(MULTI.clone()) - .chain(commands) - .chain([EXEC.clone()]) - .map(|command| self.process_command(command)) - .collect::, RedisError>>()?; - - Ok(RedisValue::Array(results)) - } -} - -impl Drop for MockRedisBackend { - fn drop(&mut self) { - if panicking() { - // We're already panicking, let's make debugging easier and let future devs solve problems one at a time. - return; - } - - let expected = self.expected.get_mut(); - - if expected.is_empty() { - return; - } - - assert_eq!( - expected - .drain(..) - .map(|((cmd, _), res)| (cmd, res)) - .collect::>(), - VecDeque::new(), - "Didn't receive all expected commands." - ); - - // Panicking isn't enough inside a tokio task, we need to `exit(1)` - std::process::exit(1) - } -} - -struct FakeRedisBackend { - /// Contains a list of all of the Redis keys -> fields. 
- table: Mutex>>, - /// The subscription manager (maybe). - subscription_manager: Mutex>>, -} - -impl fmt::Debug for FakeRedisBackend { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("FakeRedisBackend").finish() - } -} - -impl FakeRedisBackend { - fn new() -> Self { - Self { - table: Mutex::new(HashMap::new()), - subscription_manager: Mutex::new(None), - } - } - - fn set_subscription_manager(&self, subscription_manager: Arc) { - *self.subscription_manager.lock() = Some(subscription_manager); - } -} - -impl Mocks for FakeRedisBackend { - fn process_command(&self, actual: MockCommand) -> Result { - if actual.cmd == Str::from_static("SUBSCRIBE") { - // This does nothing at the moment, maybe we need to implement it later. - return Ok(RedisValue::Integer(0)); - } - - if actual.cmd == Str::from_static("PUBLISH") { - if let Some(subscription_manager) = self.subscription_manager.lock().as_ref() { - subscription_manager.notify_for_test( - str::from_utf8(actual.args[1].as_bytes().expect("Notification not bytes")) - .expect("Notification not UTF-8") - .into(), - ); - } - return Ok(RedisValue::Integer(0)); - } - - if actual.cmd == Str::from_static("FT.AGGREGATE") { - // The query is @field:value where value might be wrapped in braces. - let query = actual.args[1] - .clone() - .into_string() - .expect("Aggregate query should be a string"); - assert_eq!(&query[..1], "@"); - let mut parts = query[1..].split(':'); - let field = parts.next().expect("No field name"); - let value = parts.next().expect("No value"); - let value = value - .strip_prefix("{ ") - .and_then(|s| s.strip_suffix(" }")) - .unwrap_or(value); - // Lazy implementation making assumptions. 
- assert_eq!( - actual.args[2..6], - vec!["LOAD".into(), 2.into(), "data".into(), "version".into()] - ); - let mut results = vec![RedisValue::Integer(0)]; - for fields in self.table.lock().values() { - if let Some(key_value) = fields.get(field) { - if *key_value == RedisValue::Bytes(Bytes::from(value.to_owned())) { - results.push(RedisValue::Array(vec![ - RedisValue::Bytes(Bytes::from("data")), - fields.get("data").expect("No data field").clone(), - RedisValue::Bytes(Bytes::from("version")), - fields.get("version").expect("No version field").clone(), - ])); - } - } - } - results[0] = ((results.len() - 1) as u32).into(); - return Ok(RedisValue::Array(vec![ - RedisValue::Array(results), - RedisValue::Integer(0), // Means no more items in cursor. - ])); - } - - if actual.cmd == Str::from_static("EVALSHA") { - assert_eq!(actual.args[0], VERSION_SCRIPT_HASH.into()); - let mut value = HashMap::new(); - value.insert("data".into(), actual.args[4].clone()); - for pair in actual.args[5..].chunks(2) { - value.insert( - str::from_utf8(pair[0].as_bytes().expect("Field name not bytes")) - .expect("Unable to parse field name as string") - .into(), - pair[1].clone(), - ); - } - let version = match self.table.lock().entry( - str::from_utf8(actual.args[2].as_bytes().expect("Key not bytes")) - .expect("Key cannot be parsed as string") - .into(), - ) { - Entry::Occupied(mut occupied_entry) => { - let version = occupied_entry - .get() - .get("version") - .expect("No version field"); - let version_int: i64 = - str::from_utf8(version.as_bytes().expect("Version field not bytes")) - .expect("Version field not valid string") - .parse() - .expect("Unable to parse version field"); - if *version != actual.args[3] { - // Version mismatch. 
- return Ok(RedisValue::Array(vec![ - RedisValue::Integer(0), - RedisValue::Integer(version_int), - ])); - } - value.insert( - "version".into(), - RedisValue::Bytes( - format!("{}", version_int + 1).as_bytes().to_owned().into(), - ), - ); - occupied_entry.insert(value); - version_int + 1 - } - Entry::Vacant(vacant_entry) => { - if actual.args[3] != RedisValue::Bytes(Bytes::from_static(b"0")) { - // Version mismatch. - return Ok(RedisValue::Array(vec![ - RedisValue::Integer(0), - RedisValue::Integer(0), - ])); - } - value.insert("version".into(), RedisValue::Bytes("1".into())); - vacant_entry.insert_entry(value); - 1 - } - }; - return Ok(RedisValue::Array(vec![ - RedisValue::Integer(1), - RedisValue::Integer(version), - ])); - } - - if actual.cmd == Str::from_static("HSET") { - assert_eq!( - RedisValue::Bytes(Bytes::from_static(b"data")), - actual.args[1] - ); - let mut values = HashMap::new(); - values.insert("data".into(), actual.args[2].clone()); - self.table.lock().insert( - str::from_utf8( - actual.args[0] - .as_bytes() - .expect("Key argument is not bytes"), - ) - .expect("Unable to parse key as string") - .into(), - values, - ); - return Ok(RedisValue::new_ok()); - } - - if actual.cmd == Str::from_static("HMGET") { - if let Some(fields) = self.table.lock().get( - str::from_utf8( - actual.args[0] - .as_bytes() - .expect("Key argument is not bytes"), - ) - .expect("Unable to parse key name"), - ) { - let mut result = vec![]; - for key in &actual.args[1..] { - if let Some(value) = fields.get( - str::from_utf8(key.as_bytes().expect("Field argument is not bytes")) - .expect("Unable to parse requested field"), - ) { - result.push(value.clone()); - } else { - result.push(RedisValue::Null); - } - } - return Ok(RedisValue::Array(result)); - } - return Err(RedisError::new(RedisErrorKind::NotFound, String::new())); - } - - panic!("Mock command not implemented! 
{actual:?}"); - } - - fn process_transaction(&self, commands: Vec) -> Result { - static MULTI: MockCommand = MockCommand { - cmd: Str::from_static("MULTI"), - subcommand: None, - args: Vec::new(), - }; - static EXEC: MockCommand = MockCommand { - cmd: Str::from_static("EXEC"), - subcommand: None, - args: Vec::new(), - }; - - let results = core::iter::once(MULTI.clone()) - .chain(commands) - .chain([EXEC.clone()]) - .map(|command| self.process_command(command)) - .collect::, RedisError>>()?; - - Ok(RedisValue::Array(results)) - } -} - -fn make_redis_store(sub_channel: &str, mocks: Arc) -> Arc { - let mut builder = Builder::default_centralized(); - builder.set_config(RedisConfig { - mocks: Some(mocks), - ..Default::default() - }); - let (client_pool, subscriber_client) = make_clients(builder); - Arc::new( - RedisStore::new_from_builder_and_parts( - client_pool, - subscriber_client, - Some(sub_channel.into()), - mock_uuid_generator, - String::new(), - 4064, - MAX_CHUNK_UPLOADS_PER_UPDATE, - SCAN_COUNT, - ) - .unwrap(), - ) -} - -fn make_clients(mut builder: Builder) -> (RedisPool, SubscriberClient) { - const CONNECTION_POOL_SIZE: usize = 1; - let client_pool = builder - .set_performance_config(PerformanceConfig { - broadcast_channel_capacity: 4096, - ..Default::default() - }) - .build_pool(CONNECTION_POOL_SIZE) - .unwrap(); - - let subscriber_client = builder.build_subscriber_client().unwrap(); - (client_pool, subscriber_client) -} async fn verify_initial_connection_message( worker_id: WorkerId, @@ -445,8 +84,8 @@ async fn setup_new_worker( worker_id: WorkerId, props: PlatformProperties, ) -> Result, Error> { - let (tx, mut rx) = mpsc::unbounded_channel(); - let worker = Worker::new(worker_id.clone(), props, tx, NOW_TIME); + let (tx, mut rx) = unbounded_channel(); + let worker = Worker::new(worker_id.clone(), props, tx, NOW_TIME, 0); scheduler .add_worker(worker) .await @@ -477,11 +116,12 @@ fn make_awaited_action(operation_id: &str) -> AwaitedAction { ) } +// TODO: 
This test needs to be rewritten to use workers (like test_multiple_clients_subscribe_to_same_action). #[nativelink_test] +#[ignore = "needs rewrite to use workers (like test_multiple_clients_subscribe_to_same_action)"] async fn add_action_smoke_test() -> Result<(), Error> { const CLIENT_OPERATION_ID: &str = "my_client_operation_id"; const WORKER_OPERATION_ID: &str = "my_worker_operation_id"; - static SUBSCRIPTION_MANAGER: Mutex>> = Mutex::new(None); const SUB_CHANNEL: &str = "sub_channel"; let worker_awaited_action = make_awaited_action(WORKER_OPERATION_ID); @@ -489,281 +129,22 @@ async fn add_action_smoke_test() -> Result<(), Error> { let mut new_awaited_action = worker_awaited_action.clone(); let mut new_state = new_awaited_action.state().as_ref().clone(); new_state.stage = ActionStage::Executing; + new_state.last_transition_timestamp = SystemTime::now(); new_awaited_action.worker_set_state(Arc::new(new_state), MockSystemTime::now().into()); new_awaited_action }; - let worker_operation_id = OperationId::from(WORKER_OPERATION_ID); - - let ft_aggregate_args = vec![ - format!("aa__unique_qualifier__{SCRIPT_VERSION}").into(), - format!("@unique_qualifier:{{ {INSTANCE_NAME}_SHA256_0000000000000000000000000000000000000000000000000000000000000000_0_c }}").into(), - "LOAD".into(), - 2.into(), - "data".into(), - "version".into(), - "SORTBY".into(), - 0.into(), - "WITHCURSOR".into(), - "COUNT".into(), - 256.into(), - "MAXIDLE".into(), - 2000.into(), - ]; - let mocks = Arc::new(MockRedisBackend::new()); - #[expect( - clippy::string_lit_as_bytes, - reason = r#"avoids `b"foo".as_slice()`, which is hardly better"# - )] - mocks - .expect( - MockCommand { - cmd: Str::from_static("FT.AGGREGATE"), - subcommand: None, - args: ft_aggregate_args.clone(), - }, - Err(RedisError::new( - RedisErrorKind::NotFound, - String::new(), - )), - None, - ) - .expect( - MockCommand { - cmd: Str::from_static("SUBSCRIBE"), - subcommand: None, - args: vec![SUB_CHANNEL.as_bytes().into()], - }, - 
Ok(RedisValue::Integer(0)), - None, - ) - .expect( - MockCommand { - cmd: Str::from_static("FT.CREATE"), - subcommand: None, - args: vec![ - format!("aa__unique_qualifier__{SCRIPT_VERSION}").into(), - "ON".into(), - "HASH".into(), - "PREFIX".into(), - 1.into(), - "aa_".into(), - "TEMPORARY".into(), - 86400.into(), - "NOOFFSETS".into(), - "NOHL".into(), - "NOFIELDS".into(), - "NOFREQS".into(), - "SCHEMA".into(), - "unique_qualifier".into(), - "TAG".into(), - ], - }, - Ok(RedisValue::Bytes(Bytes::from("data"))), - None, - ) - .expect( - MockCommand { - cmd: Str::from_static("FT.AGGREGATE"), - subcommand: None, - args: ft_aggregate_args.clone(), - }, - Ok(RedisValue::Array(vec![ - RedisValue::Array(vec![ - RedisValue::Integer(0), - ]), - RedisValue::Integer(0), // Means no more items in cursor. - ])), - None, - ) - .expect( - MockCommand { - cmd: Str::from_static("EVALSHA"), - subcommand: None, - args: vec![ - VERSION_SCRIPT_HASH.into(), - 1.into(), - format!("aa_{WORKER_OPERATION_ID}").as_bytes().into(), - "0".as_bytes().into(), - RedisValue::Bytes(Bytes::from(serde_json::to_string(&worker_awaited_action).unwrap())), - "unique_qualifier".as_bytes().into(), - format!("{INSTANCE_NAME}_SHA256_0000000000000000000000000000000000000000000000000000000000000000_0_c").as_bytes().into(), - "state".as_bytes().into(), - "queued".as_bytes().into(), - "sort_key".as_bytes().into(), - "80000000ffffffff".as_bytes().into(), - ], - }, - Ok(RedisValue::Array(vec![RedisValue::Integer(1), RedisValue::Integer(1)])), - None, - ) - .expect( - MockCommand { - cmd: Str::from_static("PUBLISH"), - subcommand: None, - args: vec![ - SUB_CHANNEL.into(), - format!("aa_{WORKER_OPERATION_ID}").into(), - ], - }, - Ok(0.into() /* unused */), - Some(Box::new(|| SUBSCRIPTION_MANAGER.lock().as_ref().unwrap().notify_for_test(format!("aa_{WORKER_OPERATION_ID}")))), - ) - .expect( - MockCommand { - cmd: Str::from_static("HSET"), - subcommand: None, - args: vec![ - 
format!("cid_{CLIENT_OPERATION_ID}").as_bytes().into(), - "data".as_bytes().into(), - format!("{{\"String\":\"{WORKER_OPERATION_ID}\"}}").as_bytes().into(), - ], - }, - Ok(RedisValue::new_ok()), - None, - ) - .expect( - MockCommand { - cmd: Str::from_static("PUBLISH"), - subcommand: None, - args: vec![ - SUB_CHANNEL.into(), - format!("cid_{CLIENT_OPERATION_ID}").into(), - ], - }, - Ok(0.into() /* unused */), - Some(Box::new(|| SUBSCRIPTION_MANAGER.lock().as_ref().unwrap().notify_for_test(format!("aa_{CLIENT_OPERATION_ID}")))), - ) - .expect( - MockCommand { - cmd: Str::from_static("HMGET"), - subcommand: None, - args: vec![ - format!("aa_{WORKER_OPERATION_ID}").as_bytes().into(), - "version".as_bytes().into(), - "data".as_bytes().into(), - ], - }, - Ok(RedisValue::Array(vec![ - // Version. - "1".into(), - // Data. - RedisValue::Bytes(Bytes::from(serde_json::to_string(&worker_awaited_action).unwrap())), - ])), - None, - ) - .expect( - MockCommand { - cmd: Str::from_static("HMGET"), - subcommand: None, - args: vec![ - format!("aa_{WORKER_OPERATION_ID}").as_bytes().into(), - "version".as_bytes().into(), - "data".as_bytes().into(), - ], - }, - Ok(RedisValue::Array(vec![ - // Version. - "1".into(), - // Data. - RedisValue::Bytes(Bytes::from(serde_json::to_string(&worker_awaited_action).unwrap())), - ])), - None, - ) - .expect( - MockCommand { - cmd: Str::from_static("HMGET"), - subcommand: None, - args: vec![ - format!("cid_{CLIENT_OPERATION_ID}").as_bytes().into(), - "version".as_bytes().into(), - "data".as_bytes().into(), - ], - }, - Ok(RedisValue::Array(vec![ - // Version. - RedisValue::Null, - // Data. - RedisValue::Bytes(Bytes::from(serde_json::to_string(&worker_operation_id).unwrap())), - ])), - None, - ) - .expect( - MockCommand { - cmd: Str::from_static("HMGET"), - subcommand: None, - args: vec![ - format!("aa_{WORKER_OPERATION_ID}").as_bytes().into(), - "version".as_bytes().into(), - "data".as_bytes().into(), - ], - }, - Ok(RedisValue::Array(vec![ - // Version. 
- "2".into(), - // Data. - RedisValue::Bytes(Bytes::from(serde_json::to_string(&new_awaited_action).unwrap())), - ])), - None, - ) - - .expect( - MockCommand { - cmd: Str::from_static("EVALSHA"), - subcommand: None, - args: vec![ - VERSION_SCRIPT_HASH.into(), - 1.into(), - format!("aa_{WORKER_OPERATION_ID}").as_bytes().into(), - "0".as_bytes().into(), - RedisValue::Bytes(Bytes::from(serde_json::to_string(&new_awaited_action).unwrap())), - "unique_qualifier".as_bytes().into(), - format!("{INSTANCE_NAME}_SHA256_0000000000000000000000000000000000000000000000000000000000000000_0_c").as_bytes().into(), - "state".as_bytes().into(), - "executing".as_bytes().into(), - "sort_key".as_bytes().into(), - "80000000ffffffff".as_bytes().into(), - ], - }, - Ok(RedisValue::Array(vec![RedisValue::Integer(1), RedisValue::Integer(2)])), - None, - ) - .expect( - MockCommand { - cmd: Str::from_static("PUBLISH"), - subcommand: None, - args: vec![ - SUB_CHANNEL.into(), - format!("aa_{WORKER_OPERATION_ID}").into(), - ], - }, - Ok(0.into() /* unused */), - Some(Box::new(|| SUBSCRIPTION_MANAGER.lock().as_ref().unwrap().notify_for_test(format!("aa_{WORKER_OPERATION_ID}")))), - ) - .expect( - MockCommand { - cmd: Str::from_static("HMGET"), - subcommand: None, - args: vec![ - format!("aa_{WORKER_OPERATION_ID}").as_bytes().into(), - "version".as_bytes().into(), - "data".as_bytes().into(), - ], - }, - Ok(RedisValue::Array(vec![ - // Version. - "2".into(), - // Data. 
- RedisValue::Bytes(Bytes::from(serde_json::to_string(&new_awaited_action).unwrap())), - ])), - None, - ) - ; - - let store = make_redis_store(SUB_CHANNEL, mocks); - SUBSCRIPTION_MANAGER - .lock() - .replace(store.subscription_manager().unwrap()); + // Use FakeRedisBackend which handles all Redis commands dynamically + // This is more maintainable than the standard fake redis which requires exact command sequences + let fake_redis_backend: FakeRedisBackend = FakeRedisBackend::new(); + let fake_redis_port = fake_redis_backend.clone().run().await; + let spec = RedisSpec { + addresses: vec![format!("redis://127.0.0.1:{fake_redis_port}")], + experimental_pub_sub_channel: Some(SUB_CHANNEL.to_string()), + ..Default::default() + }; + let store = RedisStore::new_standard(spec).await.expect("Working spec"); + fake_redis_backend.set_subscription_manager(store.subscription_manager().await.unwrap()); let notifier = Arc::new(Notify::new()); let awaited_action_db = StoreAwaitedActionDb::new( @@ -772,6 +153,7 @@ async fn add_action_smoke_test() -> Result<(), Error> { MockInstantWrapped::default, move || WORKER_OPERATION_ID.into(), ) + .await .unwrap(); let mut subscription = awaited_action_db @@ -802,7 +184,7 @@ async fn add_action_smoke_test() -> Result<(), Error> { let get_res = get_subscription.borrow().await; - assert_eq!(get_res.unwrap().state().stage, ActionStage::Executing); + assert_eq!(get_res.unwrap().state().stage, ActionStage::Queued); } { @@ -819,6 +201,18 @@ async fn add_action_smoke_test() -> Result<(), Error> { ); } + { + let get_subscription = awaited_action_db + .get_awaited_action_by_id(&OperationId::from(CLIENT_OPERATION_ID)) + .await + .unwrap() + .unwrap(); + + let get_res = get_subscription.borrow().await; + + assert_eq!(get_res.unwrap().state().stage, ActionStage::Executing); + } + Ok(()) } @@ -846,9 +240,17 @@ async fn test_multiple_clients_subscribe_to_same_action() -> Result<(), Error> { }), }); - let mocks = Arc::new(FakeRedisBackend::new()); - let 
store = make_redis_store(SUB_CHANNEL, mocks.clone()); - mocks.set_subscription_manager(store.subscription_manager().unwrap()); + // Use FakeRedisBackend which handles all Redis commands dynamically + // This is more maintainable than the standard fake redis which requires exact command sequences + let fake_redis_backend: FakeRedisBackend = FakeRedisBackend::new(); + let fake_redis_port = fake_redis_backend.clone().run().await; + let spec = RedisSpec { + addresses: vec![format!("redis://127.0.0.1:{fake_redis_port}")], + experimental_pub_sub_channel: Some(SUB_CHANNEL.to_string()), + ..Default::default() + }; + let store = RedisStore::new_standard(spec).await.expect("Working spec"); + fake_redis_backend.set_subscription_manager(store.subscription_manager().await.unwrap()); let notifier = Arc::new(Notify::new()); let worker_operation_id = Arc::new(Mutex::new(WORKER_OPERATION_ID_1)); @@ -859,6 +261,7 @@ async fn test_multiple_clients_subscribe_to_same_action() -> Result<(), Error> { MockInstantWrapped::default, move || worker_operation_id_clone.lock().clone().into(), ) + .await .unwrap(); let task_change_notify = Arc::new(Notify::new()); @@ -948,8 +351,8 @@ async fn test_multiple_clients_subscribe_to_same_action() -> Result<(), Error> { // The worker shouldn't be allocated the job again. tokio::select! 
{ () = tokio::time::sleep(Duration::from_secs(1)) => {} - _ = rx_from_worker.recv() => { - panic!("Worker was allocated another job"); + v = rx_from_worker.recv() => { + panic!("Worker was allocated another job: {v:?}"); } } @@ -998,9 +401,14 @@ async fn test_outdated_version() -> Result<(), Error> { let worker_operation_id = Arc::new(Mutex::new(CLIENT_OPERATION_ID)); let worker_operation_id_clone = worker_operation_id.clone(); - let mocks = Arc::new(FakeRedisBackend::new()); - - let store = make_redis_store("sub_channel", mocks); + let fake_redis_backend: FakeRedisBackend = FakeRedisBackend::new(); + let fake_redis_port = fake_redis_backend.clone().run().await; + let spec = RedisSpec { + addresses: vec![format!("redis://127.0.0.1:{fake_redis_port}")], + experimental_pub_sub_channel: Some("sub_channel".into()), + ..Default::default() + }; + let store = RedisStore::new_standard(spec).await.expect("Working spec"); let notifier = Arc::new(Notify::new()); let awaited_action_db = StoreAwaitedActionDb::new( @@ -1009,6 +417,7 @@ async fn test_outdated_version() -> Result<(), Error> { MockInstantWrapped::default, move || worker_operation_id_clone.lock().clone().into(), ) + .await .unwrap(); let worker_awaited_action = make_awaited_action("WORKER_OPERATION_ID"); @@ -1029,3 +438,73 @@ async fn test_outdated_version() -> Result<(), Error> { Ok(()) } + +/// Test that orphaned client operation ID mappings return None. +/// +/// This tests the scenario where: +/// 1. A client operation ID mapping exists (cid_* → `operation_id`) +/// 2. The actual operation (aa_*) has been deleted (completed/timed out) +/// 3. 
`get_awaited_action_by_id` should return None instead of a subscriber to a non-existent operation +#[nativelink_test] +async fn test_orphaned_client_operation_id_returns_none() -> Result<(), Error> { + const CLIENT_OPERATION_ID: &str = "orphaned_client_id"; + const INTERNAL_OPERATION_ID: &str = "deleted_internal_operation_id"; + const SUB_CHANNEL: &str = "sub_channel"; + + let worker_operation_id = Arc::new(Mutex::new(INTERNAL_OPERATION_ID)); + let worker_operation_id_clone = worker_operation_id.clone(); + + let internal_operation_id = OperationId::from(INTERNAL_OPERATION_ID); + + // Use FakeRedisBackend which handles SUBSCRIBE automatically + let fake_redis_backend: FakeRedisBackend = FakeRedisBackend::new(); + let fake_redis_port = fake_redis_backend.clone().run().await; + let spec = RedisSpec { + addresses: vec![format!("redis://127.0.0.1:{fake_redis_port}")], + experimental_pub_sub_channel: Some(SUB_CHANNEL.into()), + ..Default::default() + }; + let store = RedisStore::new_standard(spec).await.expect("Working spec"); + fake_redis_backend.set_subscription_manager(store.subscription_manager().await.unwrap()); + + // Manually set up the orphaned state in the fake backend: + // 1. Add client_id → operation_id mapping (cid_* key) + { + let mut table = fake_redis_backend.table.lock().unwrap(); + let mut client_fields = HashMap::new(); + client_fields.insert( + "data".into(), + Value::BulkString( + serde_json::to_string(&internal_operation_id) + .unwrap() + .into_bytes(), + ), + ); + table.insert(format!("cid_{CLIENT_OPERATION_ID}"), client_fields); + } + // 2. 
Don't add the actual operation (aa_* key) - this simulates it being deleted/orphaned + + let notifier = Arc::new(Notify::new()); + let awaited_action_db = StoreAwaitedActionDb::new( + store.clone(), + notifier.clone(), + MockInstantWrapped::default, + move || worker_operation_id_clone.lock().clone().into(), + ) + .await + .unwrap(); + + // Try to get the awaited action by the client operation ID + // This should return None because the internal operation doesn't exist (orphaned mapping) + let result = awaited_action_db + .get_awaited_action_by_id(&OperationId::from(CLIENT_OPERATION_ID)) + .await + .expect("Should not error when checking orphaned client operation"); + + assert!( + result.is_none(), + "Expected None for orphaned client operation ID, but got a subscription" + ); + + Ok(()) +} diff --git a/nativelink-scheduler/tests/simple_scheduler_state_manager_test.rs b/nativelink-scheduler/tests/simple_scheduler_state_manager_test.rs new file mode 100644 index 000000000..28159838a --- /dev/null +++ b/nativelink-scheduler/tests/simple_scheduler_state_manager_test.rs @@ -0,0 +1,47 @@ +use core::time::Duration; +use std::sync::Arc; +use std::time::SystemTime; + +use nativelink_error::Error; +use nativelink_macro::nativelink_test; +use nativelink_scheduler::default_scheduler_factory::memory_awaited_action_db_factory; +use nativelink_scheduler::simple_scheduler_state_manager::SimpleSchedulerStateManager; +use nativelink_util::action_messages::{OperationId, WorkerId}; +use nativelink_util::instant_wrapper::MockInstantWrapped; +use nativelink_util::operation_state_manager::{UpdateOperationType, WorkerStateManager}; +use tokio::sync::Notify; + +#[nativelink_test] +async fn drops_missing_actions() -> Result<(), Error> { + let task_change_notify = Arc::new(Notify::new()); + let awaited_action_db = memory_awaited_action_db_factory( + 0, + &task_change_notify.clone(), + MockInstantWrapped::default, + ); + let state_manager = SimpleSchedulerStateManager::new( + 0, + 
Duration::from_secs(10), + Duration::from_secs(10), + Duration::ZERO, + awaited_action_db, + SystemTime::now, + None, + "test_scheduler", + ); + state_manager + .update_operation( + &OperationId::Uuid(uuid::Uuid::parse_str( + "c458c1f4-136e-486d-b9cd-cea07460cde4", + )?), + &WorkerId::default(), + UpdateOperationType::ExecutionComplete, + ) + .await + .unwrap(); + + assert!(logs_contain( + "Unable to update action due to it being missing, probably dropped operation_id=c458c1f4-136e-486d-b9cd-cea07460cde4" + )); + Ok(()) +} diff --git a/nativelink-scheduler/tests/simple_scheduler_test.rs b/nativelink-scheduler/tests/simple_scheduler_test.rs index 059e061e1..980ad7be6 100644 --- a/nativelink-scheduler/tests/simple_scheduler_test.rs +++ b/nativelink-scheduler/tests/simple_scheduler_test.rs @@ -36,8 +36,8 @@ use nativelink_proto::com::github::trace_machina::nativelink::remote_execution:: ConnectionResult, StartExecute, UpdateForWorker, update_for_worker, }; use nativelink_scheduler::awaited_action_db::{ - AwaitedAction, AwaitedActionDb, AwaitedActionSubscriber, SortedAwaitedAction, - SortedAwaitedActionState, + AwaitedAction, AwaitedActionDb, AwaitedActionSubscriber, CountableActionStage, + SortedAwaitedAction, SortedAwaitedActionState, }; use nativelink_scheduler::default_scheduler_factory::memory_awaited_action_db_factory; use nativelink_scheduler::simple_scheduler::SimpleScheduler; @@ -92,7 +92,7 @@ async fn setup_new_worker( props: PlatformProperties, ) -> Result, Error> { let (tx, mut rx) = mpsc::unbounded_channel(); - let worker = Worker::new(worker_id.clone(), props, tx, NOW_TIME); + let worker = Worker::new(worker_id.clone(), props, tx, NOW_TIME, 0); scheduler .add_worker(worker) .await @@ -173,6 +173,7 @@ async fn basic_add_action_with_one_worker_test() -> Result<(), Error> { client_operation_id: action_state.client_operation_id.clone(), stage: ActionStage::Executing, action_digest: action_state.action_digest, + last_transition_timestamp: SystemTime::now(), }; 
assert_eq!(action_state.as_ref(), &expected_action_state); } @@ -180,6 +181,28 @@ async fn basic_add_action_with_one_worker_test() -> Result<(), Error> { Ok(()) } +#[nativelink_test] +async fn bad_worker_match_logging_interval() -> Result<(), Error> { + let task_change_notify = Arc::new(Notify::new()); + let (_scheduler, _worker_scheduler) = SimpleScheduler::new( + &SimpleSpec { + worker_match_logging_interval_s: -2, + ..Default::default() + }, + memory_awaited_action_db_factory( + 0, + &task_change_notify.clone(), + MockInstantWrapped::default, + ), + task_change_notify, + None, + ); + assert!(logs_contain( + "nativelink_scheduler::simple_scheduler: Valid values for worker_match_logging_interval_s are -1, 0, or a positive integer, setting to disabled worker_match_logging_interval_s=-2" + )); + Ok(()) +} + #[nativelink_test] async fn client_does_not_receive_update_timeout() -> Result<(), Error> { async fn advance_time(duration: Duration, poll_fut: &mut Pin<&mut impl Future>) { @@ -191,12 +214,15 @@ async fn client_does_not_receive_update_timeout() -> Result<(), Error> { } } + MockClock::set_time(Duration::from_secs(NOW_TIME)); + let worker_id = WorkerId("worker_id".to_string()); let task_change_notify = Arc::new(Notify::new()); let (scheduler, _worker_scheduler) = SimpleScheduler::new_with_callback( &SimpleSpec { worker_timeout_s: WORKER_TIMEOUT_S, + worker_match_logging_interval_s: 1, ..Default::default() }, memory_awaited_action_db_factory( @@ -223,7 +249,7 @@ async fn client_does_not_receive_update_timeout() -> Result<(), Error> { .unwrap(); // Trigger a do_try_match to ensure we get a state change. - scheduler.do_try_match_for_test().await.unwrap(); + scheduler.do_try_match_for_test().await?; assert_eq!( action_listener.changed().await.unwrap().0.stage, ActionStage::Executing @@ -239,7 +265,7 @@ async fn client_does_not_receive_update_timeout() -> Result<(), Error> { // Advance our time by just under the timeout. 
advance_time(Duration::from_secs(WORKER_TIMEOUT_S - 1), &mut changed_fut).await; { - // Sill no update should have been received yet. + // Still no update should have been received yet. assert_eq!(poll!(&mut changed_fut).is_ready(), false); } // Advance it by just over the timeout. @@ -327,6 +353,7 @@ async fn find_executing_action() -> Result<(), Error> { client_operation_id: action_state.client_operation_id.clone(), stage: ActionStage::Executing, action_digest: action_state.action_digest, + last_transition_timestamp: SystemTime::now(), }; assert_eq!(action_state.as_ref(), &expected_action_state); } @@ -589,6 +616,7 @@ async fn set_drain_worker_pauses_and_resumes_worker_test() -> Result<(), Error> client_operation_id: action_state.client_operation_id.clone(), stage: ActionStage::Queued, action_digest: action_state.action_digest, + last_transition_timestamp: SystemTime::now(), }; assert_eq!(action_state.as_ref(), &expected_action_state); } @@ -605,6 +633,7 @@ async fn set_drain_worker_pauses_and_resumes_worker_test() -> Result<(), Error> client_operation_id: action_state.client_operation_id.clone(), stage: ActionStage::Executing, action_digest: action_state.action_digest, + last_transition_timestamp: SystemTime::now(), }; assert_eq!(action_state.as_ref(), &expected_action_state); } @@ -664,6 +693,7 @@ async fn worker_should_not_queue_if_properties_dont_match_test() -> Result<(), E client_operation_id: action_state.client_operation_id.clone(), stage: ActionStage::Queued, action_digest: action_state.action_digest, + last_transition_timestamp: SystemTime::now(), }; assert_eq!(action_state.as_ref(), &expected_action_state); } @@ -701,6 +731,7 @@ async fn worker_should_not_queue_if_properties_dont_match_test() -> Result<(), E client_operation_id: action_state.client_operation_id.clone(), stage: ActionStage::Executing, action_digest: action_state.action_digest, + last_transition_timestamp: SystemTime::now(), }; assert_eq!(action_state.as_ref(), &expected_action_state); } 
@@ -738,6 +769,7 @@ async fn cacheable_items_join_same_action_queued_test() -> Result<(), Error> { client_operation_id, stage: ActionStage::Queued, action_digest, + last_transition_timestamp: SystemTime::now(), }; let insert_timestamp1 = make_system_time(1); @@ -794,6 +826,7 @@ async fn cacheable_items_join_same_action_queued_test() -> Result<(), Error> { // Action should now be executing. expected_action_state.stage = ActionStage::Executing; + expected_action_state.last_transition_timestamp = SystemTime::now(); { // Both client1 and client2 should be receiving the same updates. // Most importantly the `name` (which is random) will be the same. @@ -858,6 +891,7 @@ async fn worker_disconnects_does_not_schedule_for_execution_test() -> Result<(), client_operation_id: action_state.client_operation_id.clone(), stage: ActionStage::Queued, action_digest: action_state.action_digest, + last_transition_timestamp: SystemTime::now(), }; assert_eq!(action_state.as_ref(), &expected_action_state); } @@ -950,6 +984,10 @@ impl AwaitedActionDb for RxMockAwaitedAction { .expect("Could not receive msg in mpsc") } + async fn get_queued_actions(&self) -> Result>, Error> { + Ok(vec![]) + } + async fn get_range_of_actions( &self, _state: SortedAwaitedActionState, @@ -979,6 +1017,13 @@ impl AwaitedActionDb for RxMockAwaitedAction { ) -> Result { unreachable!(); } + + async fn count_actions( + &self, + _states: Vec, + ) -> Result, Error> { + Ok(HashMap::default()) + } } #[nativelink_test] @@ -1082,6 +1127,8 @@ async fn matching_engine_fails_sends_abort() -> Result<(), Error> { #[nativelink_test] async fn worker_timesout_reschedules_running_job_test() -> Result<(), Error> { + MockClock::set_time(Duration::from_secs(NOW_TIME)); + let worker_id1 = WorkerId("worker1".to_string()); let worker_id2 = WorkerId("worker2".to_string()); let task_change_notify = Arc::new(Notify::new()); @@ -1164,6 +1211,7 @@ async fn worker_timesout_reschedules_running_job_test() -> Result<(), Error> { 
client_operation_id: action_state.client_operation_id.clone(), stage: ActionStage::Executing, action_digest: action_state.action_digest, + last_transition_timestamp: SystemTime::now(), } ); } @@ -1197,6 +1245,7 @@ async fn worker_timesout_reschedules_running_job_test() -> Result<(), Error> { client_operation_id: action_state.client_operation_id.clone(), stage: ActionStage::Executing, action_digest: action_state.action_digest, + last_transition_timestamp: SystemTime::now(), } ); } @@ -1310,6 +1359,7 @@ async fn update_action_sends_completed_result_to_client_test() -> Result<(), Err client_operation_id: action_state.client_operation_id.clone(), stage: ActionStage::Completed(action_result), action_digest: action_state.action_digest, + last_transition_timestamp: SystemTime::now(), }; assert_eq!(action_state.as_ref(), &expected_action_state); } @@ -1429,6 +1479,7 @@ async fn update_action_sends_completed_result_after_disconnect() -> Result<(), E client_operation_id: action_state.client_operation_id.clone(), stage: ActionStage::Completed(action_result), action_digest: action_state.action_digest, + last_transition_timestamp: SystemTime::now(), }; assert_eq!(action_state.as_ref(), &expected_action_state); } @@ -1571,6 +1622,7 @@ async fn does_not_crash_if_operation_joined_then_relaunched() -> Result<(), Erro client_operation_id, stage: ActionStage::Executing, action_digest, + last_transition_timestamp: SystemTime::now(), }; let insert_timestamp = make_system_time(1); @@ -1661,6 +1713,7 @@ async fn does_not_crash_if_operation_joined_then_relaunched() -> Result<(), Erro { // Action should now be executing. 
expected_action_state.stage = ActionStage::Completed(action_result.clone()); + expected_action_state.last_transition_timestamp = SystemTime::now(); assert_eq!( action_listener.changed().await.unwrap().0.as_ref(), &expected_action_state @@ -1678,6 +1731,7 @@ async fn does_not_crash_if_operation_joined_then_relaunched() -> Result<(), Erro .unwrap(); // We didn't disconnect our worker, so it will have scheduled it to the worker. expected_action_state.stage = ActionStage::Executing; + expected_action_state.last_transition_timestamp = SystemTime::now(); let (action_state, _maybe_origin_metadata) = action_listener.changed().await.unwrap(); // The name of the action changed (since it's a new action), so update it. expected_action_state.client_operation_id = action_state.client_operation_id.clone(); @@ -1803,6 +1857,7 @@ async fn run_two_jobs_on_same_worker_with_platform_properties_restrictions() -> client_operation_id: action_state.client_operation_id.clone(), stage: ActionStage::Completed(action_result.clone()), action_digest: action_state.action_digest, + last_transition_timestamp: SystemTime::now(), }; assert_eq!(action_state.as_ref(), &expected_action_state); } @@ -1847,6 +1902,7 @@ async fn run_two_jobs_on_same_worker_with_platform_properties_restrictions() -> client_operation_id: action_state.client_operation_id.clone(), stage: ActionStage::Completed(action_result.clone()), action_digest: action_state.action_digest, + last_transition_timestamp: SystemTime::now(), }; assert_eq!(action_state.as_ref(), &expected_action_state); } @@ -1985,6 +2041,7 @@ async fn worker_retries_on_internal_error_and_fails_test() -> Result<(), Error> client_operation_id: action_state.client_operation_id.clone(), stage: ActionStage::Queued, action_digest: action_state.action_digest, + last_transition_timestamp: SystemTime::now(), }; assert_eq!(action_state.as_ref(), &expected_action_state); } @@ -2048,6 +2105,7 @@ async fn worker_retries_on_internal_error_and_fails_test() -> Result<(), 
Error> message: String::new(), }), action_digest: action_state.action_digest, + last_transition_timestamp: SystemTime::now(), }; let mut received_state = action_state.as_ref().clone(); if let ActionStage::Completed(stage) = &mut received_state.stage { @@ -2116,7 +2174,7 @@ async fn ensure_scheduler_drops_inner_spawn() -> Result<(), Error> { Ok(()) } -/// Regression test for: https://github.com/TraceMachina/nativelink/issues/257. +/// Regression test for: . #[nativelink_test] async fn ensure_task_or_worker_change_notification_received_test() -> Result<(), Error> { let worker_id1 = WorkerId("worker1".to_string()); @@ -2348,3 +2406,58 @@ async fn client_timesout_job_then_same_action_requested() -> Result<(), Error> { Ok(()) } + +#[nativelink_test] +async fn logs_when_no_workers_match() -> Result<(), Error> { + let worker_id = WorkerId("worker_id".to_string()); + + let mut prop_defs = HashMap::new(); + prop_defs.insert("prop".to_string(), PropertyType::Minimum); + + let task_change_notify = Arc::new(Notify::new()); + let (scheduler, _worker_scheduler) = SimpleScheduler::new_with_callback( + &SimpleSpec { + worker_match_logging_interval_s: 1, + supported_platform_properties: Some(prop_defs), + ..Default::default() + }, + memory_awaited_action_db_factory( + 0, + &task_change_notify.clone(), + MockInstantWrapped::default, + ), + || async move {}, + task_change_notify, + MockInstantWrapped::default, + None, + ); + let action_digest = DigestInfo::new([99u8; 32], 512); + + let mut required_platform_properties = HashMap::new(); + required_platform_properties.insert("prop".to_string(), "1".to_string()); + + let mut worker_properties = PlatformProperties::default(); + worker_properties + .properties + .insert("prop".to_string(), PlatformPropertyValue::Minimum(0)); + + setup_new_worker(&scheduler, worker_id.clone(), worker_properties).await?; + + setup_action( + &scheduler, + action_digest, + required_platform_properties, + make_system_time(1), + ) + .await + .unwrap(); + + 
scheduler.do_try_match_for_test().await?; + + assert!(logs_contain( + "Property mismatch on worker property prop. Minimum(0) < Minimum(1)" + )); + assert!(logs_contain("No workers matched")); + + Ok(()) +} diff --git a/nativelink-scheduler/tests/worker_capability_index_test.rs b/nativelink-scheduler/tests/worker_capability_index_test.rs new file mode 100644 index 000000000..dea773c5a --- /dev/null +++ b/nativelink-scheduler/tests/worker_capability_index_test.rs @@ -0,0 +1,276 @@ +// Copyright 2024 The NativeLink Authors. All rights reserved. +// +// Licensed under the Functional Source License, Version 1.1, Apache 2.0 Future License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// See LICENSE file for details +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Tests for the worker capability index. 
+ +use std::collections::HashMap; + +use nativelink_scheduler::worker_capability_index::WorkerCapabilityIndex; +use nativelink_util::action_messages::WorkerId; +use nativelink_util::platform_properties::{PlatformProperties, PlatformPropertyValue}; + +fn make_worker_id(name: &str) -> WorkerId { + WorkerId(name.to_string()) +} + +fn make_properties(props: &[(&str, PlatformPropertyValue)]) -> PlatformProperties { + let mut map = HashMap::new(); + for (name, value) in props { + map.insert((*name).to_string(), value.clone()); + } + PlatformProperties::new(map) +} + +#[test] +#[tracing_test::traced_test] +fn test_empty_index() { + let index = WorkerCapabilityIndex::new(); + let props = make_properties(&[]); + let result = index.find_matching_workers(&props, true); + assert!(result.is_empty()); + + assert!(logs_contain("No workers available to match!")); +} + +#[test] +fn test_exact_property_match() { + let mut index = WorkerCapabilityIndex::new(); + + let worker1 = make_worker_id("worker1"); + let worker2 = make_worker_id("worker2"); + + index.add_worker( + &worker1, + &make_properties(&[("os", PlatformPropertyValue::Exact("linux".to_string()))]), + ); + index.add_worker( + &worker2, + &make_properties(&[("os", PlatformPropertyValue::Exact("windows".to_string()))]), + ); + + // Match linux + let linux_props = make_properties(&[("os", PlatformPropertyValue::Exact("linux".to_string()))]); + let result = index.find_matching_workers(&linux_props, true); + assert_eq!(result.len(), 1); + assert!(result.contains(&worker1)); + + // Match windows + let windows_props = + make_properties(&[("os", PlatformPropertyValue::Exact("windows".to_string()))]); + let result = index.find_matching_workers(&windows_props, true); + assert_eq!(result.len(), 1); + assert!(result.contains(&worker2)); +} + +#[test] +fn test_minimum_property_presence_only() { + // The index only checks PRESENCE of Minimum properties, not their values. 
+ // Actual value checking is done at runtime by the caller since Minimum + // values are dynamic (change as jobs are assigned to workers). + let mut index = WorkerCapabilityIndex::new(); + + let worker1 = make_worker_id("worker1"); + let worker2 = make_worker_id("worker2"); + let worker3 = make_worker_id("worker3"); + + index.add_worker( + &worker1, + &make_properties(&[("cpu_count", PlatformPropertyValue::Minimum(4))]), + ); + index.add_worker( + &worker2, + &make_properties(&[("cpu_count", PlatformPropertyValue::Minimum(8))]), + ); + // Worker3 has no cpu_count property + index.add_worker( + &worker3, + &make_properties(&[("os", PlatformPropertyValue::Exact("linux".to_string()))]), + ); + + // Any request for cpu_count returns workers that HAVE the property (regardless of value) + let props = make_properties(&[("cpu_count", PlatformPropertyValue::Minimum(2))]); + let result = index.find_matching_workers(&props, true); + assert_eq!(result.len(), 2); + assert!(result.contains(&worker1)); + assert!(result.contains(&worker2)); + assert!(!result.contains(&worker3)); // Doesn't have cpu_count + + // Even a high value returns the same workers - actual value check is done at runtime + let props = make_properties(&[("cpu_count", PlatformPropertyValue::Minimum(100))]); + let result = index.find_matching_workers(&props, true); + assert_eq!(result.len(), 2); +} + +#[test] +fn test_mixed_properties() { + let mut index = WorkerCapabilityIndex::new(); + + let worker1 = make_worker_id("worker1"); + let worker2 = make_worker_id("worker2"); + let worker3 = make_worker_id("worker3"); + + index.add_worker( + &worker1, + &make_properties(&[ + ("os", PlatformPropertyValue::Exact("linux".to_string())), + ("cpu_count", PlatformPropertyValue::Minimum(4)), + ]), + ); + index.add_worker( + &worker2, + &make_properties(&[ + ("os", PlatformPropertyValue::Exact("linux".to_string())), + ("cpu_count", PlatformPropertyValue::Minimum(8)), + ]), + ); + // Worker3 has different OS + 
index.add_worker( + &worker3, + &make_properties(&[ + ("os", PlatformPropertyValue::Exact("windows".to_string())), + ("cpu_count", PlatformPropertyValue::Minimum(16)), + ]), + ); + + // Match linux with cpu_count - both linux workers match (Minimum is presence-only) + let props = make_properties(&[ + ("os", PlatformPropertyValue::Exact("linux".to_string())), + ("cpu_count", PlatformPropertyValue::Minimum(6)), + ]); + let result = index.find_matching_workers(&props, true); + // Both worker1 and worker2 have linux OS and cpu_count property + assert_eq!(result.len(), 2); + assert!(result.contains(&worker1)); + assert!(result.contains(&worker2)); + assert!(!result.contains(&worker3)); // Different OS +} + +#[test] +fn test_remove_worker() { + let mut index = WorkerCapabilityIndex::new(); + + let worker1 = make_worker_id("worker1"); + index.add_worker( + &worker1, + &make_properties(&[("os", PlatformPropertyValue::Exact("linux".to_string()))]), + ); + + assert_eq!(index.worker_count(), 1); + + index.remove_worker(&worker1); + + assert_eq!(index.worker_count(), 0); + + let props = make_properties(&[("os", PlatformPropertyValue::Exact("linux".to_string()))]); + let result = index.find_matching_workers(&props, true); + assert!(result.is_empty()); +} + +#[test] +fn test_no_properties_matches_all() { + let mut index = WorkerCapabilityIndex::new(); + + let worker1 = make_worker_id("worker1"); + let worker2 = make_worker_id("worker2"); + + index.add_worker( + &worker1, + &make_properties(&[("os", PlatformPropertyValue::Exact("linux".to_string()))]), + ); + index.add_worker(&worker2, &make_properties(&[])); + + // No properties required - all workers match + let props = make_properties(&[]); + let result = index.find_matching_workers(&props, true); + assert_eq!(result.len(), 2); +} + +#[test] +fn test_priority_property() { + let mut index = WorkerCapabilityIndex::new(); + + let worker1 = make_worker_id("worker1"); + let worker2 = make_worker_id("worker2"); + + index.add_worker( 
+ &worker1, + &make_properties(&[("pool", PlatformPropertyValue::Priority("high".to_string()))]), + ); + index.add_worker( + &worker2, + &make_properties(&[("pool", PlatformPropertyValue::Priority("low".to_string()))]), + ); + + // Priority just checks presence, so any pool value matches workers with pool + let props = make_properties(&[("pool", PlatformPropertyValue::Priority("any".to_string()))]); + let result = index.find_matching_workers(&props, true); + assert_eq!(result.len(), 2); +} + +#[test] +fn test_ignore_property() { + let mut index = WorkerCapabilityIndex::new(); + + let worker1 = make_worker_id("worker1"); + let worker2 = make_worker_id("worker2"); + + index.add_worker( + &worker1, + &make_properties(&[("foo", PlatformPropertyValue::Priority("high".to_string()))]), + ); + index.add_worker( + &worker2, + &make_properties(&[("bar", PlatformPropertyValue::Priority("low".to_string()))]), + ); + + // Ignore doesn't care if the worker has the property, so both workers with and without it should match + let props = make_properties(&[("foo", PlatformPropertyValue::Ignore("any".to_string()))]); + let result = index.find_matching_workers(&props, true); + assert_eq!(result.len(), 2); +} + +#[test] +#[tracing_test::traced_test] +fn test_no_exact_property_match() { + let mut index = WorkerCapabilityIndex::new(); + let worker1 = make_worker_id("worker1"); + index.add_worker( + &worker1, + &make_properties(&[("os", PlatformPropertyValue::Exact("windows".to_string()))]), + ); + + let props = make_properties(&[("os", PlatformPropertyValue::Exact("linux".to_string()))]); + let result = index.find_matching_workers(&props, true); + assert_eq!(result.len(), 0); + + assert!(logs_contain( + "No candidate workers due to a lack of matching 'os' = Exact(\"linux\"). 
Workers have: [Exact(\"windows\")]" + )); +} + +#[test] +#[tracing_test::traced_test] +fn test_no_priority_property_match() { + let mut index = WorkerCapabilityIndex::new(); + let worker1 = make_worker_id("worker1"); + index.add_worker(&worker1, &make_properties(&[])); + + let props = make_properties(&[("os", PlatformPropertyValue::Priority("linux".to_string()))]); + let result = index.find_matching_workers(&props, true); + assert_eq!(result.len(), 0); + + assert!(logs_contain( + "No candidate workers due to a lack of key 'os'. Job asked for Priority(\"linux\")" + )); +} diff --git a/nativelink-service/BUILD.bazel b/nativelink-service/BUILD.bazel index 1ed429dd3..5015732e0 100644 --- a/nativelink-service/BUILD.bazel +++ b/nativelink-service/BUILD.bazel @@ -27,6 +27,7 @@ rust_library( deps = [ "//nativelink-config", "//nativelink-error", + "//nativelink-metric", "//nativelink-proto", "//nativelink-scheduler", "//nativelink-store", diff --git a/nativelink-service/Cargo.toml b/nativelink-service/Cargo.toml index b37f74416..cc038993a 100644 --- a/nativelink-service/Cargo.toml +++ b/nativelink-service/Cargo.toml @@ -1,29 +1,31 @@ +#:schema ../tools/cargo-with-detailed-deps.json lints.workspace = true [package] edition = "2024" name = "nativelink-service" -version = "0.7.3" +version = "1.0.0-rc4" [dependencies] nativelink-config = { path = "../nativelink-config" } nativelink-error = { path = "../nativelink-error" } +nativelink-metric = { path = "../nativelink-metric" } nativelink-proto = { path = "../nativelink-proto" } nativelink-scheduler = { path = "../nativelink-scheduler" } nativelink-store = { path = "../nativelink-store" } nativelink-util = { path = "../nativelink-util" } -axum = { version = "0.8.3", default-features = false } +axum = { version = "0.8.3", default-features = false, features = ["tokio"]} bytes = { version = "1.10.1", default-features = false } futures = { version = "0.3.31", default-features = false } -http-body-util = "0.1.3" -hyper = { version = 
"1.6.0" } -opentelemetry = { version = "0.29.1", default-features = false } -opentelemetry-semantic-conventions = { version = "0.29.0", default-features = false, features = [ +http-body-util = { version = "0.1.3", default-features = false } +hyper = { version = "1.6.0", default-features = false } +opentelemetry = { version = "0.30.0", default-features = false } +opentelemetry-semantic-conventions = { version = "0.30.0", default-features = false, features = [ "default", "semconv_experimental", ] } -parking_lot = "0.12.3" +parking_lot = { version = "0.12.3", default-features = false } prost = { version = "0.13.5", default-features = false } prost-types = { version = "0.13.5", default-features = false, features = [ "std", @@ -31,7 +33,8 @@ prost-types = { version = "0.13.5", default-features = false, features = [ rand = { version = "0.9.0", default-features = false, features = [ "thread_rng", ] } -serde_json5 = "0.2.1" +serde_json5 = { version = "0.2.1", default-features = false } +serde_json = { version = "1.0.145", default-features = false } tokio = { version = "1.44.1", features = [ "fs", "io-util", @@ -56,14 +59,15 @@ uuid = { version = "1.16.0", default-features = false, features = [ [dev-dependencies] nativelink-macro = { path = "../nativelink-macro" } -nativelink-metric = { path = "../nativelink-metric" } async-lock = { version = "3.4.0", features = ["std"], default-features = false } -async-trait = "0.1.88" +async-trait = { version = "0.1.88", default-features = false } hex = { version = "0.4.3", default-features = false } -hyper = "1.6.0" -hyper-util = "0.1.11" -pretty_assertions = { version = "1.4.1", features = ["std"] } +hyper = { version = "1.6.0", default-features = false } +hyper-util = { version = "0.1.11", default-features = false } +pretty_assertions = { version = "1.4.1", features = [ + "std", +], default-features = false } prost-types = { version = "0.13.5", default-features = false } serde_json = { version = "1.0.140", default-features = false, 
features = [ "std", diff --git a/nativelink-service/src/ac_server.rs b/nativelink-service/src/ac_server.rs index 29db64d14..c1aa689cb 100644 --- a/nativelink-service/src/ac_server.rs +++ b/nativelink-service/src/ac_server.rs @@ -201,7 +201,7 @@ impl ActionCache for AcServer { #[instrument( err, - ret(level = Level::INFO), + ret(level = Level::TRACE), level = Level::ERROR, skip_all, fields(request = ?grpc_request.get_ref()) diff --git a/nativelink-service/src/bytestream_server.rs b/nativelink-service/src/bytestream_server.rs index c180c8751..d47b3cd9e 100644 --- a/nativelink-service/src/bytestream_server.rs +++ b/nativelink-service/src/bytestream_server.rs @@ -20,13 +20,17 @@ use core::time::Duration; use std::collections::HashMap; use std::collections::hash_map::Entry; use std::sync::Arc; -use std::time::{SystemTime, UNIX_EPOCH}; +use std::time::{Instant, SystemTime, UNIX_EPOCH}; -use futures::future::{BoxFuture, pending}; +use bytes::BytesMut; +use futures::future::pending; use futures::stream::unfold; use futures::{Future, Stream, TryFutureExt, try_join}; use nativelink_config::cas_server::{ByteStreamConfig, InstanceName, WithInstanceName}; use nativelink_error::{Code, Error, ResultExt, make_err, make_input_err}; +use nativelink_metric::{ + MetricFieldData, MetricKind, MetricPublishKnownKindData, MetricsComponent, group, publish, +}; use nativelink_proto::google::bytestream::byte_stream_server::{ ByteStream, ByteStreamServer as Server, }; @@ -46,7 +50,7 @@ use nativelink_util::digest_hasher::{ use nativelink_util::proto_stream_utils::WriteRequestStreamWrapper; use nativelink_util::resource_info::ResourceInfo; use nativelink_util::spawn; -use nativelink_util::store_trait::{Store, StoreLike, UploadSizeInfo}; +use nativelink_util::store_trait::{Store, StoreLike, StoreOptimizations, UploadSizeInfo}; use nativelink_util::task::JoinHandleDropGuard; use opentelemetry::context::FutureExt; use parking_lot::Mutex; @@ -60,15 +64,196 @@ const 
DEFAULT_PERSIST_STREAM_ON_DISCONNECT_TIMEOUT: Duration = Duration::from_se /// If this value changes update the documentation in the config definition. const DEFAULT_MAX_BYTES_PER_STREAM: usize = 64 * 1024; +/// Metrics for `ByteStream` server operations. +/// Tracks upload/download activity, throughput, and latency. +#[derive(Debug, Default)] +pub struct ByteStreamMetrics { + /// Number of currently active uploads (includes idle streams waiting for resume) + pub active_uploads: AtomicU64, + /// Total number of write requests received + pub write_requests_total: AtomicU64, + /// Total number of successful write requests + pub write_requests_success: AtomicU64, + /// Total number of failed write requests + pub write_requests_failure: AtomicU64, + /// Total number of read requests received + pub read_requests_total: AtomicU64, + /// Total number of successful read requests + pub read_requests_success: AtomicU64, + /// Total number of failed read requests + pub read_requests_failure: AtomicU64, + /// Total number of `query_write_status` requests + pub query_write_status_total: AtomicU64, + /// Total bytes written via `ByteStream` + pub bytes_written_total: AtomicU64, + /// Total bytes read via `ByteStream` + pub bytes_read_total: AtomicU64, + /// Sum of write durations in nanoseconds (for average latency calculation) + pub write_duration_ns: AtomicU64, + /// Sum of read durations in nanoseconds (for average latency calculation) + pub read_duration_ns: AtomicU64, + /// Number of UUID collisions detected + pub uuid_collisions: AtomicU64, + /// Number of resumed uploads (client reconnected to existing stream) + pub resumed_uploads: AtomicU64, + /// Number of idle streams that timed out + pub idle_stream_timeouts: AtomicU64, +} + +impl MetricsComponent for ByteStreamMetrics { + fn publish( + &self, + _kind: MetricKind, + field_metadata: MetricFieldData, + ) -> Result { + let _enter = group!(field_metadata.name).entered(); + + publish!( + "active_uploads", + 
&self.active_uploads, + MetricKind::Counter, + "Number of currently active uploads" + ); + publish!( + "write_requests_total", + &self.write_requests_total, + MetricKind::Counter, + "Total write requests received" + ); + publish!( + "write_requests_success", + &self.write_requests_success, + MetricKind::Counter, + "Total successful write requests" + ); + publish!( + "write_requests_failure", + &self.write_requests_failure, + MetricKind::Counter, + "Total failed write requests" + ); + publish!( + "read_requests_total", + &self.read_requests_total, + MetricKind::Counter, + "Total read requests received" + ); + publish!( + "read_requests_success", + &self.read_requests_success, + MetricKind::Counter, + "Total successful read requests" + ); + publish!( + "read_requests_failure", + &self.read_requests_failure, + MetricKind::Counter, + "Total failed read requests" + ); + publish!( + "query_write_status_total", + &self.query_write_status_total, + MetricKind::Counter, + "Total query_write_status requests" + ); + publish!( + "bytes_written_total", + &self.bytes_written_total, + MetricKind::Counter, + "Total bytes written via ByteStream" + ); + publish!( + "bytes_read_total", + &self.bytes_read_total, + MetricKind::Counter, + "Total bytes read via ByteStream" + ); + publish!( + "write_duration_ns", + &self.write_duration_ns, + MetricKind::Counter, + "Sum of write durations in nanoseconds" + ); + publish!( + "read_duration_ns", + &self.read_duration_ns, + MetricKind::Counter, + "Sum of read durations in nanoseconds" + ); + publish!( + "uuid_collisions", + &self.uuid_collisions, + MetricKind::Counter, + "Number of UUID collisions detected" + ); + publish!( + "resumed_uploads", + &self.resumed_uploads, + MetricKind::Counter, + "Number of resumed uploads" + ); + publish!( + "idle_stream_timeouts", + &self.idle_stream_timeouts, + MetricKind::Counter, + "Number of idle streams that timed out" + ); + + Ok(MetricPublishKnownKindData::Component) + } +} + type 
BytesWrittenAndIdleStream = (Arc, Option); -type SleepFn = Arc BoxFuture<'static, ()> + Send + Sync>; + +/// Type alias for the UUID key used in `active_uploads` `HashMap`. +/// Using u128 instead of String reduces memory allocations and improves +/// cache locality for `HashMap` operations. +type UuidKey = u128; + +/// Parse a UUID string to a u128 for use as a `HashMap` key. +/// This avoids heap allocation for String keys and improves `HashMap` performance. +/// Falls back to hashing the string if it's not a valid hex UUID. +#[inline] +fn parse_uuid_to_key(uuid_str: &str) -> UuidKey { + // UUIDs are typically 32 hex chars (128 bits) or 36 chars with dashes. + // We'll try to parse as hex first, then fall back to hashing. + let clean: String = uuid_str.chars().filter(char::is_ascii_hexdigit).collect(); + if clean.len() >= 16 { + // Take up to 32 hex chars (128 bits) + let hex_str = if clean.len() > 32 { + &clean[..32] + } else { + &clean + }; + u128::from_str_radix(hex_str, 16).unwrap_or_else(|_| { + // Hash fallback for non-hex strings + use core::hash::{Hash, Hasher}; + let mut hasher = std::collections::hash_map::DefaultHasher::new(); + uuid_str.hash(&mut hasher); + u128::from(hasher.finish()) + }) + } else { + // Short strings: use hash + use core::hash::{Hash, Hasher}; + let mut hasher = std::collections::hash_map::DefaultHasher::new(); + uuid_str.hash(&mut hasher); + u128::from(hasher.finish()) + } +} pub struct InstanceInfo { store: Store, // Max number of bytes to send on each grpc stream chunk. max_bytes_per_stream: usize, - active_uploads: Arc>>, - sleep_fn: SleepFn, + /// Active uploads keyed by UUID as u128 for better performance. + /// Using u128 keys instead of String reduces heap allocations + /// and improves `HashMap` lookup performance. + active_uploads: Arc>>, + /// How long to keep idle streams before timing them out. + idle_stream_timeout: Duration, + metrics: Arc, + /// Handle to the global sweeper task. 
Kept alive for the lifetime of the instance. + _sweeper_handle: Arc>, } impl Debug for InstanceInfo { @@ -77,6 +262,8 @@ impl Debug for InstanceInfo { .field("store", &self.store) .field("max_bytes_per_stream", &self.max_bytes_per_stream) .field("active_uploads", &self.active_uploads) + .field("idle_stream_timeout", &self.idle_stream_timeout) + .field("metrics", &self.metrics) .finish() } } @@ -85,7 +272,7 @@ type ReadStream = Pin> + Send type StoreUpdateFuture = Pin> + Send + 'static>>; struct StreamState { - uuid: String, + uuid: UuidKey, tx: DropCloserWriteHalf, store_update_fut: StoreUpdateFuture, } @@ -93,7 +280,7 @@ struct StreamState { impl Debug for StreamState { fn fmt(&self, f: &mut Formatter<'_>) -> core::fmt::Result { f.debug_struct("StreamState") - .field("uuid", &self.uuid) + .field("uuid", &format!("{:032x}", self.uuid)) .finish() } } @@ -104,8 +291,8 @@ impl Debug for StreamState { struct ActiveStreamGuard { stream_state: Option, bytes_received: Arc, - active_uploads: Arc>>, - sleep_fn: SleepFn, + active_uploads: Arc>>, + metrics: Arc, } impl ActiveStreamGuard { @@ -114,6 +301,8 @@ impl ActiveStreamGuard { fn graceful_finish(mut self) { let stream_state = self.stream_state.take().unwrap(); self.active_uploads.lock().remove(&stream_state.uuid); + // Decrement active uploads counter on successful completion + self.metrics.active_uploads.fetch_sub(1, Ordering::Relaxed); } } @@ -122,38 +311,33 @@ impl Drop for ActiveStreamGuard { let Some(stream_state) = self.stream_state.take() else { return; // If None it means we don't want it put back into an IdleStream. }; - let weak_active_uploads = Arc::downgrade(&self.active_uploads); let mut active_uploads = self.active_uploads.lock(); - let uuid = stream_state.uuid.clone(); + let uuid = stream_state.uuid; // u128 is Copy, no clone needed let Some(active_uploads_slot) = active_uploads.get_mut(&uuid) else { error!( err = "Failed to find active upload. 
This should never happen.", - uuid = ?uuid, + uuid = format!("{:032x}", uuid), ); return; }; - let sleep_fn = self.sleep_fn.clone(); + // Mark stream as idle with current timestamp. + // The global sweeper will clean it up after idle_stream_timeout. + // This avoids spawning a task per stream, reducing overhead from O(n) to O(1). active_uploads_slot.1 = Some(IdleStream { stream_state, - _timeout_streaam_drop_guard: spawn!("bytestream_idle_stream_timeout", async move { - (*sleep_fn)().await; - if let Some(active_uploads) = weak_active_uploads.upgrade() { - let mut active_uploads = active_uploads.lock(); - info!(msg = "Removing idle stream", uuid = ?uuid); - active_uploads.remove(&uuid); - } - }), + idle_since: Instant::now(), }); } } /// Represents a stream that is in the "idle" state. this means it is not currently being used /// by a client. If it is not used within a certain amount of time it will be removed from the -/// `active_uploads` map automatically. +/// `active_uploads` map automatically by the global sweeper task. #[derive(Debug)] struct IdleStream { stream_state: StreamState, - _timeout_streaam_drop_guard: JoinHandleDropGuard<()>, + /// When this stream became idle. Used by the global sweeper to determine expiration. + idle_since: Instant, } impl IdleStream { @@ -166,7 +350,7 @@ impl IdleStream { stream_state: Some(self.stream_state), bytes_received, active_uploads: instance_info.active_uploads.clone(), - sleep_fn: instance_info.sleep_fn.clone(), + metrics: instance_info.metrics.clone(), } } } @@ -177,13 +361,15 @@ pub struct ByteStreamServer { } impl ByteStreamServer { - /// Generate a unique UUID by appending a nanosecond timestamp to avoid collisions. - fn generate_unique_uuid(base_uuid: &str) -> String { + /// Generate a unique UUID key by `XOR`ing the base key with a nanosecond timestamp. + /// This ensures virtually zero collision probability while being O(1). 
+ fn generate_unique_uuid_key(base_key: UuidKey) -> UuidKey { let timestamp = SystemTime::now() .duration_since(UNIX_EPOCH) .unwrap_or_default() .as_nanos(); - format!("{base_uuid}-{timestamp:x}") + // XOR with timestamp to create unique key + base_key ^ timestamp } pub fn new( @@ -192,28 +378,23 @@ impl ByteStreamServer { ) -> Result { let mut instance_infos: HashMap = HashMap::new(); for config in configs { - let persist_stream_on_disconnect_timeout = - if config.persist_stream_on_disconnect_timeout == 0 { - DEFAULT_PERSIST_STREAM_ON_DISCONNECT_TIMEOUT - } else { - Duration::from_secs(config.persist_stream_on_disconnect_timeout as u64) - }; + let idle_stream_timeout = if config.persist_stream_on_disconnect_timeout == 0 { + DEFAULT_PERSIST_STREAM_ON_DISCONNECT_TIMEOUT + } else { + Duration::from_secs(config.persist_stream_on_disconnect_timeout as u64) + }; let _old_value = instance_infos.insert( config.instance_name.clone(), - Self::new_with_sleep_fn( - config, - store_manager, - Arc::new(move || Box::pin(sleep(persist_stream_on_disconnect_timeout))), - )?, + Self::new_with_timeout(config, store_manager, idle_stream_timeout)?, ); } Ok(Self { instance_infos }) } - pub fn new_with_sleep_fn( + pub fn new_with_timeout( config: &WithInstanceName, store_manager: &StoreManager, - sleep_fn: SleepFn, + idle_stream_timeout: Duration, ) -> Result { let store = store_manager .get_store(&config.cas_store) @@ -223,11 +404,69 @@ impl ByteStreamServer { } else { config.max_bytes_per_stream }; + + let active_uploads: Arc>> = + Arc::new(Mutex::new(HashMap::new())); + let metrics = Arc::new(ByteStreamMetrics::default()); + + // Spawn a single global sweeper task that periodically cleans up expired idle streams. + // This replaces per-stream timeout tasks, reducing task spawn overhead from O(n) to O(1). 
+ let sweeper_active_uploads = Arc::downgrade(&active_uploads); + let sweeper_metrics = Arc::downgrade(&metrics); + let sweep_interval = idle_stream_timeout / 2; // Check every half-timeout period + let sweeper_handle = spawn!("bytestream_idle_stream_sweeper", async move { + loop { + sleep(sweep_interval).await; + + let Some(active_uploads) = sweeper_active_uploads.upgrade() else { + // InstanceInfo has been dropped, exit the sweeper + break; + }; + let metrics = sweeper_metrics.upgrade(); + + let now = Instant::now(); + let mut expired_count = 0u64; + + // Lock and sweep expired entries + { + let mut uploads = active_uploads.lock(); + uploads.retain(|uuid, (_, maybe_idle)| { + if let Some(idle_stream) = maybe_idle { + if now.duration_since(idle_stream.idle_since) >= idle_stream_timeout { + info!( + msg = "Sweeping expired idle stream", + uuid = format!("{:032x}", uuid) + ); + expired_count += 1; + return false; // Remove this entry + } + } + true // Keep this entry + }); + } + + // Update metrics outside the lock + if expired_count > 0 { + if let Some(m) = &metrics { + m.idle_stream_timeouts + .fetch_add(expired_count, Ordering::Relaxed); + m.active_uploads.fetch_sub(expired_count, Ordering::Relaxed); + } + trace!( + msg = "Sweeper cleaned up expired streams", + count = expired_count + ); + } + } + }); + Ok(InstanceInfo { store, max_bytes_per_stream, - active_uploads: Arc::new(Mutex::new(HashMap::new())), - sleep_fn, + active_uploads, + idle_stream_timeout, + metrics, + _sweeper_handle: Arc::new(sweeper_handle), }) } @@ -248,45 +487,69 @@ impl ByteStreamServer { /// generate the unique UUID in the exact same nanosecond. 
fn create_or_join_upload_stream( &self, - uuid: &str, + uuid_str: &str, instance: &InstanceInfo, digest: DigestInfo, ) -> ActiveStreamGuard { - let (uuid, bytes_received) = match instance.active_uploads.lock().entry(uuid.to_string()) { - Entry::Occupied(mut entry) => { - let maybe_idle_stream = entry.get_mut(); - if let Some(idle_stream) = maybe_idle_stream.1.take() { - // Case 2: Stream exists but is idle, we can resume it - let bytes_received = maybe_idle_stream.0.clone(); - info!(msg = "Joining existing stream", entry = ?entry.key()); - return idle_stream.into_active_stream(bytes_received, instance); + // Parse UUID string to u128 key for efficient HashMap operations + let uuid_key = parse_uuid_to_key(uuid_str); + + let (uuid, bytes_received, is_collision) = + match instance.active_uploads.lock().entry(uuid_key) { + Entry::Occupied(mut entry) => { + let maybe_idle_stream = entry.get_mut(); + if let Some(idle_stream) = maybe_idle_stream.1.take() { + // Case 2: Stream exists but is idle, we can resume it + let bytes_received = maybe_idle_stream.0.clone(); + info!( + msg = "Joining existing stream", + uuid = format!("{:032x}", entry.key()) + ); + // Track resumed upload + instance + .metrics + .resumed_uploads + .fetch_add(1, Ordering::Relaxed); + return idle_stream.into_active_stream(bytes_received, instance); + } + // Case 3: Stream is active - generate a unique UUID to avoid collision + // Using nanosecond timestamp makes collision probability essentially zero + let original_key = *entry.key(); + let unique_key = Self::generate_unique_uuid_key(original_key); + warn!( + msg = "UUID collision detected, generating unique UUID to prevent conflict", + original_uuid = format!("{:032x}", original_key), + unique_uuid = format!("{:032x}", unique_key) + ); + // Entry goes out of scope here, releasing the lock + + let bytes_received = Arc::new(AtomicU64::new(0)); + let mut active_uploads = instance.active_uploads.lock(); + // Insert with the unique UUID - this should never 
collide due to nanosecond precision + active_uploads.insert(unique_key, (bytes_received.clone(), None)); + (unique_key, bytes_received, true) } - // Case 3: Stream is active - generate a unique UUID to avoid collision - // Using nanosecond timestamp makes collision probability essentially zero - let original_uuid = entry.key().clone(); - let unique_uuid = Self::generate_unique_uuid(&original_uuid); - warn!( - msg = "UUID collision detected, generating unique UUID to prevent conflict", - original_uuid = ?original_uuid, - unique_uuid = ?unique_uuid - ); - // Entry goes out of scope here, releasing the lock - - let bytes_received = Arc::new(AtomicU64::new(0)); - let mut active_uploads = instance.active_uploads.lock(); - // Insert with the unique UUID - this should never collide due to nanosecond precision - active_uploads.insert(unique_uuid.clone(), (bytes_received.clone(), None)); - (unique_uuid, bytes_received) - } - Entry::Vacant(entry) => { - // Case 1: UUID doesn't exist, create new stream - let bytes_received = Arc::new(AtomicU64::new(0)); - let uuid = entry.key().clone(); - // Our stream is "in use" if the key is in the map, but the value is None. - entry.insert((bytes_received.clone(), None)); - (uuid, bytes_received) - } - }; + Entry::Vacant(entry) => { + // Case 1: UUID doesn't exist, create new stream + let bytes_received = Arc::new(AtomicU64::new(0)); + let uuid = *entry.key(); + // Our stream is "in use" if the key is in the map, but the value is None. 
+ entry.insert((bytes_received.clone(), None)); + (uuid, bytes_received, false) + } + }; + + // Track metrics for new upload + instance + .metrics + .active_uploads + .fetch_add(1, Ordering::Relaxed); + if is_collision { + instance + .metrics + .uuid_collisions + .fetch_add(1, Ordering::Relaxed); + } // Important: Do not return an error from this point onwards without // removing the entry from the map, otherwise that UUID becomes @@ -310,7 +573,7 @@ impl ByteStreamServer { }), bytes_received, active_uploads: instance.active_uploads.clone(), - sleep_fn: instance.sleep_fn.clone(), + metrics: instance.metrics.clone(), } } @@ -380,8 +643,7 @@ impl ByteStreamServer { return Some((Err(err.into()), None)); } response.data = bytes; - trace!(response = ?response); - debug!(response.data = format!("", response.data.len())); + trace!(response.data = format!("", response.data.len())); break; } Err(mut e) => { @@ -494,9 +756,10 @@ impl ByteStreamServer { } continue; } - write_request - .data - .slice((tx.get_bytes_written() - write_offset) as usize..) + write_request.data.slice( + usize::try_from(tx.get_bytes_written() - write_offset) + .unwrap_or(usize::MAX).., + ) } else { if write_offset != tx.get_bytes_written() { return Err(make_input_err!( @@ -561,6 +824,98 @@ impl ByteStreamServer { })) } + /// Fast-path write that bypasses channel overhead for stores that support direct Bytes updates. + /// This buffers all data in memory and calls `update_oneshot` directly. 
+ async fn inner_write_oneshot( + &self, + instance_info: &InstanceInfo, + digest: DigestInfo, + mut stream: WriteRequestStreamWrapper< + impl Stream> + Unpin, + >, + ) -> Result, Error> { + let expected_size = stream.resource_info.expected_size as u64; + + // Pre-allocate buffer for expected size (capped at reasonable limit to prevent DoS) + let capacity = + usize::try_from(expected_size.min(64 * 1024 * 1024)).unwrap_or(64 * 1024 * 1024); + let mut buffer = BytesMut::with_capacity(capacity); + let mut bytes_received: u64 = 0; + + // Collect all data from client stream + loop { + let write_request = match stream.next().await { + None => { + return Err(make_input_err!( + "Client closed stream before sending all data" + )); + } + Some(Err(err)) => return Err(err), + Some(Ok(write_request)) => write_request, + }; + + if write_request.write_offset < 0 { + return Err(make_input_err!( + "Invalid negative write offset in write request: {}", + write_request.write_offset + )); + } + let write_offset = write_request.write_offset as u64; + + // Handle duplicate/resumed data + let data = if write_offset < bytes_received { + if (write_offset + write_request.data.len() as u64) < bytes_received { + if write_request.finish_write { + return Err(make_input_err!( + "Resumed stream finished at {} bytes when we already received {} bytes.", + write_offset + write_request.data.len() as u64, + bytes_received + )); + } + continue; + } + write_request + .data + .slice(usize::try_from(bytes_received - write_offset).unwrap_or(usize::MAX)..) + } else { + if write_offset != bytes_received { + return Err(make_input_err!( + "Received out of order data. 
Got {}, expected {}", + write_offset, + bytes_received + )); + } + write_request.data + }; + + if !data.is_empty() { + buffer.extend_from_slice(&data); + bytes_received += data.len() as u64; + } + + if expected_size < bytes_received { + return Err(make_input_err!("Received more bytes than expected")); + } + + if write_request.finish_write { + break; + } + } + + // Direct update without channel overhead + let store = instance_info.store.clone(); + store + .update_oneshot(digest, buffer.freeze()) + .await + .err_tip(|| "Error in update_oneshot")?; + + // Note: bytes_written_total is updated in the caller (bytestream_write) based on result + + Ok(Response::new(WriteResponse { + committed_size: expected_size as i64, + })) + } + async fn inner_query_write_status( &self, query_request: &QueryWriteStatusRequest, @@ -587,14 +942,15 @@ impl ByteStreamServer { .await; } - let uuid = resource_info + let uuid_str = resource_info .uuid .take() .ok_or_else(|| make_input_err!("UUID must be set if querying write status"))?; + let uuid_key = parse_uuid_to_key(&uuid_str); { let active_uploads = instance.active_uploads.lock(); - if let Some((received_bytes, _maybe_idle_stream)) = active_uploads.get(uuid.as_ref()) { + if let Some((received_bytes, _maybe_idle_stream)) = active_uploads.get(&uuid_key) { return Ok(Response::new(QueryWriteStatusResponse { committed_size: received_bytes.load(Ordering::Acquire) as i64, // If we are in the active_uploads map, but the value is None, @@ -636,13 +992,23 @@ impl ByteStream for ByteStreamServer { &self, grpc_request: Request, ) -> Result, Status> { + let start_time = Instant::now(); + let read_request = grpc_request.into_inner(); let resource_info = ResourceInfo::new(&read_request.resource_name, false)?; let instance_name = resource_info.instance_name.as_ref(); + let expected_size = resource_info.expected_size as u64; let instance = self .instance_infos .get(instance_name) .err_tip(|| format!("'instance_name' not configured for 
'{instance_name}'"))?; + + // Track read request + instance + .metrics + .read_requests_total + .fetch_add(1, Ordering::Relaxed); + let store = instance.store.clone(); let digest = DigestInfo::try_new(resource_info.hash.as_ref(), resource_info.expected_size)?; @@ -666,14 +1032,37 @@ impl ByteStream for ByteStreamServer { ) .await .err_tip(|| "In ByteStreamServer::read") - .map(|stream| -> Response { Response::new(Box::pin(stream)) }) - .map_err(Into::into); + .map(|stream| -> Response { Response::new(Box::pin(stream)) }); - if resp.is_ok() { - debug!(return = "Ok()"); + // Track metrics based on result + #[allow(clippy::cast_possible_truncation)] + let elapsed_ns = start_time.elapsed().as_nanos() as u64; + instance + .metrics + .read_duration_ns + .fetch_add(elapsed_ns, Ordering::Relaxed); + + match &resp { + Ok(_) => { + instance + .metrics + .read_requests_success + .fetch_add(1, Ordering::Relaxed); + instance + .metrics + .bytes_read_total + .fetch_add(expected_size, Ordering::Relaxed); + debug!(return = "Ok()"); + } + Err(_) => { + instance + .metrics + .read_requests_failure + .fetch_add(1, Ordering::Relaxed); + } } - resp + resp.map_err(Into::into) } #[instrument( @@ -686,6 +1075,8 @@ impl ByteStream for ByteStreamServer { &self, grpc_request: Request>, ) -> Result, Status> { + let start_time = Instant::now(); + let request = grpc_request.into_inner(); let stream = WriteRequestStreamWrapper::from(request) .await @@ -693,10 +1084,18 @@ impl ByteStream for ByteStreamServer { .map_err(Into::::into)?; let instance_name = stream.resource_info.instance_name.as_ref(); + let expected_size = stream.resource_info.expected_size as u64; let instance = self .instance_infos .get(instance_name) .err_tip(|| format!("'instance_name' not configured for '{instance_name}'"))?; + + // Track write request + instance + .metrics + .write_requests_total + .fetch_add(1, Ordering::Relaxed); + let store = instance.store.clone(); let digest = DigestInfo::try_new( @@ -720,14 +1119,84 @@ 
impl ByteStream for ByteStreamServer { DigestHasherFunc::try_from, )?; - self.inner_write(instance, digest, stream) - .instrument(error_span!("bytestream_write")) - .with_context( - make_ctx_for_hash_func(digest_function).err_tip(|| "In BytestreamServer::write")?, - ) - .await - .err_tip(|| "In ByteStreamServer::write") - .map_err(Into::into) + // Check if store supports direct oneshot updates (bypasses channel overhead). + // Use fast-path only when: + // 1. Store supports oneshot optimization + // 2. UUID is provided + // 3. Size is under 64MB (memory safety) + // 4. This is a NEW upload (UUID not already in active_uploads) + // 5. The first message has finish_write=true (single-shot upload) + // + // The oneshot path cannot be used for multi-message streams because: + // - QueryWriteStatus won't work (no progress tracking) + // - Resumed streams won't work (no partial progress) + let use_oneshot = if store.optimized_for(StoreOptimizations::SubscribesToUpdateOneshot) + && expected_size <= 64 * 1024 * 1024 + && stream.resource_info.uuid.is_some() + { + // Check if first message completes the upload (single-shot) + let is_single_shot = stream.is_first_msg_complete(); + + if is_single_shot { + let uuid_str = stream.resource_info.uuid.as_ref().unwrap(); + let uuid_key = parse_uuid_to_key(uuid_str); + // Only use oneshot if this UUID is not already being tracked + !instance.active_uploads.lock().contains_key(&uuid_key) + } else { + false + } + } else { + false + }; + + let result = if use_oneshot { + self.inner_write_oneshot(instance, digest, stream) + .instrument(error_span!("bytestream_write_oneshot")) + .with_context( + make_ctx_for_hash_func(digest_function) + .err_tip(|| "In BytestreamServer::write")?, + ) + .await + .err_tip(|| "In ByteStreamServer::write (oneshot)") + } else { + self.inner_write(instance, digest, stream) + .instrument(error_span!("bytestream_write")) + .with_context( + make_ctx_for_hash_func(digest_function) + .err_tip(|| "In 
BytestreamServer::write")?, + ) + .await + .err_tip(|| "In ByteStreamServer::write") + }; + + // Track metrics based on result + #[allow(clippy::cast_possible_truncation)] + let elapsed_ns = start_time.elapsed().as_nanos() as u64; + instance + .metrics + .write_duration_ns + .fetch_add(elapsed_ns, Ordering::Relaxed); + + match &result { + Ok(_) => { + instance + .metrics + .write_requests_success + .fetch_add(1, Ordering::Relaxed); + instance + .metrics + .bytes_written_total + .fetch_add(expected_size, Ordering::Relaxed); + } + Err(_) => { + instance + .metrics + .write_requests_failure + .fetch_add(1, Ordering::Relaxed); + } + } + + result.map_err(Into::into) } #[instrument( @@ -742,6 +1211,20 @@ impl ByteStream for ByteStreamServer { grpc_request: Request, ) -> Result, Status> { let request = grpc_request.into_inner(); + + // Track query_write_status request - we need to parse the resource name to get the instance + if let Ok(resource_info) = ResourceInfo::new(&request.resource_name, true) { + if let Some(instance) = self + .instance_infos + .get(resource_info.instance_name.as_ref()) + { + instance + .metrics + .query_write_status_total + .fetch_add(1, Ordering::Relaxed); + } + } + self.inner_query_write_status(&request) .await .err_tip(|| "Failed on query_write_status() command") diff --git a/nativelink-service/src/cas_server.rs b/nativelink-service/src/cas_server.rs index 7462d721e..7e0f5f437 100644 --- a/nativelink-service/src/cas_server.rs +++ b/nativelink-service/src/cas_server.rs @@ -278,9 +278,12 @@ impl CasServer { .err_tip(|| "In Directory::file::digest")?; deque.push_back(digest); } + + let page_size_usize = usize::try_from(page_size).unwrap_or(usize::MAX); + if page_token_matched { directories.push(directory); - if directories.len() as i32 == page_size { + if directories.len() == page_size_usize { break; } } diff --git a/nativelink-service/src/execution_server.rs b/nativelink-service/src/execution_server.rs index 047fab444..706206c74 100644 --- 
a/nativelink-service/src/execution_server.rs +++ b/nativelink-service/src/execution_server.rs @@ -63,7 +63,7 @@ impl NativelinkOperationId { fn from_name(name: &str) -> Result { let (instance_name, name) = name - .rsplit_once('/') + .split_once('/') .err_tip(|| "Expected instance_name and name to be separated by '/'")?; Ok(Self::new( instance_name.to_string(), diff --git a/nativelink-service/src/worker_api_server.rs b/nativelink-service/src/worker_api_server.rs index ff922b1eb..9b6918155 100644 --- a/nativelink-service/src/worker_api_server.rs +++ b/nativelink-service/src/worker_api_server.rs @@ -20,14 +20,15 @@ use std::sync::Arc; use std::time::{SystemTime, UNIX_EPOCH}; use futures::stream::unfold; -use futures::Stream; +use futures::{Stream, StreamExt}; use nativelink_config::cas_server::WorkerApiConfig; use nativelink_error::{make_err, Code, Error, ResultExt}; +use nativelink_proto::com::github::trace_machina::nativelink::remote_execution::update_for_scheduler::Update; use nativelink_proto::com::github::trace_machina::nativelink::remote_execution::worker_api_server::{ WorkerApi, WorkerApiServer as Server, }; use nativelink_proto::com::github::trace_machina::nativelink::remote_execution::{ - execute_result, ConnectWorkerRequest, ExecuteResult, GoingAwayRequest, KeepAliveRequest, UpdateForWorker + execute_result, ExecuteComplete, ExecuteResult, GoingAwayRequest, KeepAliveRequest, UpdateForScheduler, UpdateForWorker }; use nativelink_scheduler::worker::Worker; use nativelink_scheduler::worker_scheduler::WorkerScheduler; @@ -38,7 +39,7 @@ use nativelink_util::platform_properties::PlatformProperties; use rand::RngCore; use tokio::sync::mpsc; use tokio::time::interval; -use tonic::{Request, Response, Status}; +use tonic::{Response, Status}; use tracing::{debug, error, warn, instrument, Level}; use uuid::Uuid; @@ -49,7 +50,7 @@ pub type NowFn = Box Result + Send + Sync>; pub struct WorkerApiServer { scheduler: Arc, - now_fn: NowFn, + now_fn: Arc, node_id: [u8; 6], } 
@@ -129,7 +130,7 @@ impl WorkerApiServer { .clone(); Ok(Self { scheduler, - now_fn, + now_fn: Arc::new(now_fn), node_id, }) } @@ -140,8 +141,24 @@ impl WorkerApiServer { async fn inner_connect_worker( &self, - connect_worker_request: ConnectWorkerRequest, + mut update_stream: impl Stream> + + Unpin + + Send + + 'static, ) -> Result, Error> { + let first_message = update_stream + .next() + .await + .err_tip(|| "Missing first message for connect_worker")? + .err_tip(|| "Error reading first message for connect_worker")?; + let Some(Update::ConnectWorkerRequest(connect_worker_request)) = first_message.update + else { + return Err(make_err!( + Code::Internal, + "First message was not a ConnectWorkerRequest" + )); + }; + let (tx, rx) = mpsc::unbounded_channel(); // First convert our proto platform properties into one our scheduler understands. @@ -172,6 +189,7 @@ impl WorkerApiServer { platform_properties, tx, (self.now_fn)()?.as_secs(), + connect_worker_request.max_inflight_tasks, ); self.scheduler .add_worker(worker) @@ -180,6 +198,13 @@ impl WorkerApiServer { worker_id }; + WorkerConnection::start( + self.scheduler.clone(), + self.now_fn.clone(), + worker_id.clone(), + update_stream, + ); + Ok(Response::new(Box::pin(unfold( (rx, worker_id), move |state| async move { @@ -197,35 +222,120 @@ impl WorkerApiServer { )))) } - async fn inner_keep_alive( + pub async fn inner_connect_worker_for_testing( &self, - keep_alive_request: KeepAliveRequest, - ) -> Result, Error> { - let worker_id: WorkerId = keep_alive_request.worker_id.into(); + update_stream: impl Stream> + Unpin + Send + 'static, + ) -> Result, Error> { + self.inner_connect_worker(update_stream).await + } +} + +#[tonic::async_trait] +impl WorkerApi for WorkerApiServer { + type ConnectWorkerStream = ConnectWorkerStream; + + #[instrument( + err, + level = Level::ERROR, + skip_all, + fields(request = ?grpc_request.get_ref()) + )] + async fn connect_worker( + &self, + grpc_request: tonic::Request>, + ) -> Result, 
Status> { + let resp = self + .inner_connect_worker(grpc_request.into_inner()) + .await + .map_err(Into::into); + if resp.is_ok() { + debug!(return = "Ok()"); + } + resp + } +} + +struct WorkerConnection { + scheduler: Arc, + now_fn: Arc, + worker_id: WorkerId, +} + +impl WorkerConnection { + fn start( + scheduler: Arc, + now_fn: Arc, + worker_id: WorkerId, + mut connection: impl Stream> + Unpin + Send + 'static, + ) { + let instance = Self { + scheduler, + now_fn, + worker_id, + }; + + background_spawn!("worker_api", async move { + let mut had_going_away = false; + while let Some(maybe_update) = connection.next().await { + let update = match maybe_update.map(|u| u.update) { + Ok(Some(update)) => update, + Ok(None) => { + tracing::warn!(worker_id=?instance.worker_id, "Empty update"); + continue; + } + Err(err) => { + tracing::warn!(worker_id=?instance.worker_id, ?err, "Error from worker"); + break; + } + }; + let result = match update { + Update::ConnectWorkerRequest(_connect_worker_request) => Err(make_err!( + Code::Internal, + "Got ConnectWorkerRequest after initial message for {}", + instance.worker_id + )), + Update::KeepAliveRequest(keep_alive_request) => { + instance.inner_keep_alive(keep_alive_request).await + } + Update::GoingAwayRequest(going_away_request) => { + had_going_away = true; + instance.inner_going_away(going_away_request).await + } + Update::ExecuteResult(execute_result) => { + instance.inner_execution_response(execute_result).await + } + Update::ExecuteComplete(execute_complete) => { + instance.execution_complete(execute_complete).await + } + }; + if let Err(err) = result { + tracing::warn!(worker_id=?instance.worker_id, ?err, "Error processing worker message"); + } + } + tracing::debug!(worker_id=?instance.worker_id, "Update for scheduler dropped"); + if !had_going_away { + drop(instance.scheduler.remove_worker(&instance.worker_id).await); + } + }); + } + + async fn inner_keep_alive(&self, _keep_alive_request: KeepAliveRequest) -> Result<(), 
Error> { self.scheduler - .worker_keep_alive_received(&worker_id, (self.now_fn)()?.as_secs()) + .worker_keep_alive_received(&self.worker_id, (self.now_fn)()?.as_secs()) .await .err_tip(|| "Could not process keep_alive from worker in inner_keep_alive()")?; - Ok(Response::new(())) + Ok(()) } - async fn inner_going_away( - &self, - going_away_request: GoingAwayRequest, - ) -> Result, Error> { - let worker_id: WorkerId = going_away_request.worker_id.into(); + async fn inner_going_away(&self, _going_away_request: GoingAwayRequest) -> Result<(), Error> { self.scheduler - .remove_worker(&worker_id) + .remove_worker(&self.worker_id) .await .err_tip(|| "While calling WorkerApiServer::inner_going_away")?; - Ok(Response::new(())) + Ok(()) } - async fn inner_execution_response( - &self, - execute_result: ExecuteResult, - ) -> Result, Error> { - let worker_id: WorkerId = execute_result.worker_id.into(); + async fn inner_execution_response(&self, execute_result: ExecuteResult) -> Result<(), Error> { let operation_id = OperationId::from(execute_result.operation_id); match execute_result @@ -238,97 +348,37 @@ impl WorkerApiServer { .err_tip(|| "Failed to convert ExecuteResponse into an ActionStage")?; self.scheduler .update_action( - &worker_id, + &self.worker_id, &operation_id, UpdateOperationType::UpdateWithActionStage(action_stage), ) .await - .err_tip(|| format!("Failed to operation {operation_id:?}"))?; + .err_tip(|| format!("Failed to operation {operation_id}"))?; } execute_result::Result::InternalError(e) => { self.scheduler .update_action( - &worker_id, + &self.worker_id, &operation_id, UpdateOperationType::UpdateWithError(e.into()), ) .await - .err_tip(|| format!("Failed to operation {operation_id:?}"))?; + .err_tip(|| format!("Failed to operation {operation_id}"))?; } } - Ok(Response::new(())) - } -} - -#[tonic::async_trait] -impl WorkerApi for WorkerApiServer { - type ConnectWorkerStream = ConnectWorkerStream; - - #[instrument( - err, - level = Level::ERROR, - skip_all, 
- fields(request = ?grpc_request.get_ref()) - )] - async fn connect_worker( - &self, - grpc_request: Request, - ) -> Result, Status> { - let resp = self - .inner_connect_worker(grpc_request.into_inner()) - .await - .map_err(Into::into); - if resp.is_ok() { - debug!(return = "Ok()"); - } - resp - } - - #[instrument( - err, - ret(level = Level::DEBUG), - level = Level::DEBUG, - skip_all, - fields(request = ?grpc_request.get_ref()) - )] - async fn keep_alive( - &self, - grpc_request: Request, - ) -> Result, Status> { - self.inner_keep_alive(grpc_request.into_inner()) - .await - .map_err(Into::into) - } - - #[instrument( - err, - ret(level = Level::INFO), - level = Level::ERROR, - skip_all, - fields(request = ?grpc_request.get_ref()) - )] - async fn going_away( - &self, - grpc_request: Request, - ) -> Result, Status> { - self.inner_going_away(grpc_request.into_inner()) - .await - .map_err(Into::into) + Ok(()) } - #[instrument( - err, - ret(level = Level::DEBUG), - level = Level::ERROR, - skip_all, - fields(request = ?grpc_request.get_ref()) - )] - async fn execution_response( - &self, - grpc_request: Request, - ) -> Result, Status> { - self.inner_execution_response(grpc_request.into_inner()) + async fn execution_complete(&self, execute_complete: ExecuteComplete) -> Result<(), Error> { + let operation_id = OperationId::from(execute_complete.operation_id); + self.scheduler + .update_action( + &self.worker_id, + &operation_id, + UpdateOperationType::ExecutionComplete, + ) .await - .map_err(Into::into) + .err_tip(|| format!("Failed to operation {operation_id}"))?; + Ok(()) } } diff --git a/nativelink-service/tests/ac_server_test.rs b/nativelink-service/tests/ac_server_test.rs index a538ad7ad..4f3ca7feb 100644 --- a/nativelink-service/tests/ac_server_test.rs +++ b/nativelink-service/tests/ac_server_test.rs @@ -56,6 +56,7 @@ async fn make_store_manager() -> Result, Error> { store_manager.add_store( "main_cas", store_factory( + "main_cas", 
&StoreSpec::Memory(MemorySpec::default()), &store_manager, None, @@ -65,6 +66,7 @@ async fn make_store_manager() -> Result, Error> { store_manager.add_store( "main_ac", store_factory( + "main_ac", &StoreSpec::Memory(MemorySpec::default()), &store_manager, None, @@ -117,6 +119,7 @@ async fn empty_store() -> Result<(), Box> { let err = raw_response.unwrap_err(); assert_eq!(err.code(), Code::NotFound); assert!(err.message().is_empty()); + Ok(()) } @@ -134,6 +137,8 @@ async fn has_single_item() -> Result<(), Box> { insert_into_store(ac_store.as_pin(), HASH1, HASH1_SIZE, &action_result).await?; let raw_response = get_action_result(&ac_server, HASH1, HASH1_SIZE).await; + assert!(!logs_contain(" output_files: [")); + assert!( raw_response.is_ok(), "Expected value, got error {raw_response:?}" diff --git a/nativelink-service/tests/bep_server_test.rs b/nativelink-service/tests/bep_server_test.rs index ff70e85e7..1da676a53 100644 --- a/nativelink-service/tests/bep_server_test.rs +++ b/nativelink-service/tests/bep_server_test.rs @@ -55,6 +55,7 @@ async fn make_store_manager() -> Result, Error> { store_manager.add_store( BEP_STORE_NAME, store_factory( + BEP_STORE_NAME, &StoreSpec::Memory(MemorySpec::default()), &store_manager, None, @@ -331,7 +332,10 @@ async fn publish_build_tool_event_stream_test() -> Result<(), Box Result, Error> { store_manager.add_store( "main_cas", store_factory( + "main_cas", &StoreSpec::Memory(MemorySpec::default()), &store_manager, None, diff --git a/nativelink-service/tests/cas_server_test.rs b/nativelink-service/tests/cas_server_test.rs index 7ab7654f5..f9bf6bbab 100644 --- a/nativelink-service/tests/cas_server_test.rs +++ b/nativelink-service/tests/cas_server_test.rs @@ -50,6 +50,7 @@ async fn make_store_manager() -> Result, Error> { store_manager.add_store( "main_cas", store_factory( + "main_cas", &StoreSpec::Memory(MemorySpec::default()), &store_manager, None, diff --git a/nativelink-service/tests/execution_server_test.rs 
b/nativelink-service/tests/execution_server_test.rs index 35177f6e8..63fab5def 100644 --- a/nativelink-service/tests/execution_server_test.rs +++ b/nativelink-service/tests/execution_server_test.rs @@ -35,6 +35,7 @@ async fn make_store_manager() -> Result, Error> { store_manager.add_store( "main_cas", store_factory( + "main_cas", &StoreSpec::Memory(MemorySpec::default()), &store_manager, None, diff --git a/nativelink-service/tests/fetch_server_test.rs b/nativelink-service/tests/fetch_server_test.rs index f663f7fce..b3b5ca014 100644 --- a/nativelink-service/tests/fetch_server_test.rs +++ b/nativelink-service/tests/fetch_server_test.rs @@ -36,6 +36,7 @@ async fn make_store_manager() -> Result, Error> { store_manager.add_store( "test_fetch_store", store_factory( + "test_fetch_store", &StoreSpec::Memory(MemorySpec::default()), &store_manager, None, diff --git a/nativelink-service/tests/push_server_test.rs b/nativelink-service/tests/push_server_test.rs index 066b8b6ea..937e02d00 100644 --- a/nativelink-service/tests/push_server_test.rs +++ b/nativelink-service/tests/push_server_test.rs @@ -38,6 +38,7 @@ async fn make_store_manager() -> Result, Error> { store_manager.add_store( "test_push_store", store_factory( + "test_push_store", &StoreSpec::Memory(MemorySpec::default()), &store_manager, None, diff --git a/nativelink-service/tests/worker_api_server_test.rs b/nativelink-service/tests/worker_api_server_test.rs index 620ccaaaa..cc2fd5136 100644 --- a/nativelink-service/tests/worker_api_server_test.rs +++ b/nativelink-service/tests/worker_api_server_test.rs @@ -22,16 +22,16 @@ use async_trait::async_trait; use bytes::Bytes; use nativelink_config::cas_server::WorkerApiConfig; use nativelink_config::schedulers::WorkerAllocationStrategy; -use nativelink_error::{Error, ResultExt}; +use nativelink_error::{Error, ResultExt, make_err}; use nativelink_macro::nativelink_test; use nativelink_metric::MetricsComponent; use nativelink_proto::build::bazel::remote::execution::v2::{ 
ActionResult as ProtoActionResult, ExecuteResponse, ExecutedActionMetadata, LogFile, OutputDirectory, OutputFile, OutputSymlink, }; -use nativelink_proto::com::github::trace_machina::nativelink::remote_execution::worker_api_server::WorkerApi; +use nativelink_proto::com::github::trace_machina::nativelink::remote_execution::update_for_scheduler::Update; use nativelink_proto::com::github::trace_machina::nativelink::remote_execution::{ - ConnectWorkerRequest, ExecuteResult, KeepAliveRequest, execute_result, update_for_worker, + execute_result, update_for_worker, ConnectWorkerRequest, ExecuteResult, KeepAliveRequest, UpdateForScheduler }; use nativelink_proto::google::rpc::Status as ProtoStatus; use nativelink_scheduler::api_worker_scheduler::ApiWorkerScheduler; @@ -45,11 +45,12 @@ use nativelink_util::action_messages::{ use nativelink_util::common::DigestInfo; use nativelink_util::digest_hasher::DigestHasherFunc; use nativelink_util::operation_state_manager::{UpdateOperationType, WorkerStateManager}; +use nativelink_util::platform_properties::PlatformProperties; use pretty_assertions::assert_eq; use tokio::join; use tokio::sync::{Notify, mpsc}; use tokio_stream::StreamExt; -use tonic::Request; +use nativelink_scheduler::worker_registry::WorkerRegistry; const BASE_NOW_S: u64 = 10; const BASE_WORKER_TIMEOUT_S: u64 = 100; @@ -128,9 +129,10 @@ impl WorkerStateManager for MockWorkerStateManager { struct TestContext { scheduler: Arc, state_manager: Arc, - worker_api_server: WorkerApiServer, + _worker_api_server: WorkerApiServer, connection_worker_stream: ConnectWorkerStream, worker_id: WorkerId, + worker_stream: mpsc::Sender, } #[expect( @@ -142,6 +144,14 @@ const fn static_now_fn() -> Result { } async fn setup_api_server(worker_timeout: u64, now_fn: NowFn) -> Result { + setup_api_server_with_task_limit(worker_timeout, now_fn, 0).await +} + +async fn setup_api_server_with_task_limit( + worker_timeout: u64, + now_fn: NowFn, + max_worker_tasks: u64, +) -> Result { const 
SCHEDULER_NAME: &str = "DUMMY_SCHEDULE_NAME"; const UUID_SIZE: usize = 36; @@ -149,12 +159,15 @@ async fn setup_api_server(worker_timeout: u64, now_fn: NowFn) -> Result> = HashMap::new(); @@ -169,9 +182,24 @@ async fn setup_api_server(worker_timeout: u64, now_fn: NowFn) -> Result Result Result<(), Box Result<(), Box Result<(), Box Result<(), Box Result<(), Box> { + let test_context = + setup_api_server_with_task_limit(BASE_WORKER_TIMEOUT_S, Box::new(static_now_fn), 1).await?; + + let selected_worker = test_context + .scheduler + .find_worker_for_action(&PlatformProperties::new(HashMap::new()), true) + .await; + assert_eq!( + selected_worker, + Some(test_context.worker_id.clone()), + "Expected worker to permit tasks to begin with" + ); + + let action_digest = DigestInfo::new([7u8; 32], 123); + let instance_name = "instance_name".to_string(); + + let unique_qualifier = ActionUniqueQualifier::Uncacheable(ActionUniqueKey { + instance_name: instance_name.clone(), + digest_function: DigestHasherFunc::Sha256, + digest: action_digest, + }); + + let action_info = Arc::new(ActionInfo { + command_digest: DigestInfo::new([0u8; 32], 0), + input_root_digest: DigestInfo::new([0u8; 32], 0), + timeout: Duration::MAX, + platform_properties: HashMap::new(), + priority: 0, + load_timestamp: make_system_time(0), + insert_timestamp: make_system_time(0), + unique_qualifier, + }); + + let platform_properties = test_context + .scheduler + .get_platform_property_manager() + .make_platform_properties(action_info.platform_properties.clone()) + .err_tip(|| "Failed to make platform properties in SimpleScheduler::do_try_match")?; + + let expected_operation_id = OperationId::default(); + + test_context + .scheduler + .worker_notify_run_action( + test_context.worker_id.clone(), + expected_operation_id, + ActionInfoWithProps { + inner: action_info, + platform_properties, + }, + ) + .await + .unwrap(); + + let selected_worker = test_context + .scheduler + 
.find_worker_for_action(&PlatformProperties::new(HashMap::new()), true) + .await; + assert_eq!( + selected_worker, None, + "Expected not to be able to give worker a second task" + ); + + assert!(logs_contain("All workers are fully allocated")); + + Ok(()) +} diff --git a/nativelink-store/BUILD.bazel b/nativelink-store/BUILD.bazel index 46441d513..600c7bbbd 100644 --- a/nativelink-store/BUILD.bazel +++ b/nativelink-store/BUILD.bazel @@ -29,12 +29,16 @@ rust_library( "src/grpc_store.rs", "src/lib.rs", "src/memory_store.rs", + "src/metrics_store.rs", "src/mongo_store.rs", "src/noop_store.rs", "src/ontap_s3_existence_cache_store.rs", "src/ontap_s3_store.rs", "src/redis_store.rs", + "src/redis_utils/aggregate_types.rs", "src/redis_utils/ft_aggregate.rs", + "src/redis_utils/ft_create.rs", + "src/redis_utils/ft_cursor_read.rs", "src/redis_utils/mod.rs", "src/ref_store.rs", "src/s3_store.rs", @@ -52,6 +56,7 @@ rust_library( "//nativelink-error", "//nativelink-metric", "//nativelink-proto", + "//nativelink-redis-tester", "//nativelink-util", "@crates//:async-lock", "@crates//:aws-config", @@ -63,9 +68,7 @@ rust_library( "@crates//:blake3", "@crates//:byteorder", "@crates//:bytes", - "@crates//:bytes-utils", "@crates//:const_format", - "@crates//:fred", "@crates//:futures", "@crates//:gcloud-auth", "@crates//:gcloud-storage", @@ -76,6 +79,7 @@ rust_library( "@crates//:hyper", "@crates//:hyper-rustls", "@crates//:hyper-util", + "@crates//:itertools", "@crates//:lz4_flex", "@crates//:mongodb", "@crates//:opentelemetry", @@ -83,9 +87,12 @@ rust_library( "@crates//:patricia_tree", "@crates//:prost", "@crates//:rand", + "@crates//:redis", "@crates//:regex", + "@crates//:reqwest", + "@crates//:reqwest-middleware", "@crates//:rustls", - "@crates//:rustls-pemfile", + "@crates//:rustls-pki-types", "@crates//:serde", "@crates//:serde_json", "@crates//:sha2", @@ -94,6 +101,7 @@ rust_library( "@crates//:tokio-util", "@crates//:tonic", "@crates//:tracing", + "@crates//:url", 
"@crates//:uuid", ], ) @@ -111,6 +119,7 @@ rust_test_suite( "tests/filesystem_store_test.rs", "tests/gcs_client_test.rs", "tests/gcs_store_test.rs", + "tests/grpc_store_test.rs", "tests/memory_store_test.rs", "tests/mongo_store_test.rs", "tests/ontap_s3_existence_cache_store_test.rs", @@ -132,6 +141,7 @@ rust_test_suite( "//nativelink-error", "//nativelink-metric", "//nativelink-proto", + "//nativelink-redis-tester", "//nativelink-util", "@crates//:async-lock", "@crates//:aws-sdk-s3", @@ -140,7 +150,6 @@ rust_test_suite( "@crates//:aws-smithy-types", "@crates//:bincode", "@crates//:bytes", - "@crates//:fred", "@crates//:futures", "@crates//:hex", "@crates//:http", @@ -152,12 +161,15 @@ rust_test_suite( "@crates//:parking_lot", "@crates//:pretty_assertions", "@crates//:rand", + "@crates//:redis", + "@crates//:redis-test", "@crates//:serde_json", "@crates//:serial_test", "@crates//:sha2", "@crates//:tempfile", "@crates//:tokio", "@crates//:tokio-stream", + "@crates//:tonic", "@crates//:tracing", "@crates//:tracing-test", "@crates//:uuid", @@ -176,12 +188,12 @@ rust_test( "@crates//:aws-smithy-runtime", "@crates//:aws-smithy-runtime-api", "@crates//:aws-smithy-types", - "@crates//:fred", "@crates//:http", "@crates//:memory-stats", "@crates//:mock_instant", "@crates//:pretty_assertions", "@crates//:rand", + "@crates//:redis", "@crates//:serde_json", "@crates//:sha2", ], diff --git a/nativelink-store/Cargo.toml b/nativelink-store/Cargo.toml index 6c51c69e3..6927004a2 100644 --- a/nativelink-store/Cargo.toml +++ b/nativelink-store/Cargo.toml @@ -1,19 +1,21 @@ +#:schema ../tools/cargo-with-detailed-deps.json lints.workspace = true [package] edition = "2024" name = "nativelink-store" -version = "0.7.3" +version = "1.0.0-rc4" [dependencies] nativelink-config = { path = "../nativelink-config" } nativelink-error = { path = "../nativelink-error" } nativelink-metric = { path = "../nativelink-metric" } nativelink-proto = { path = "../nativelink-proto" } +nativelink-redis-tester 
= { path = "../nativelink-redis-tester" } nativelink-util = { path = "../nativelink-util" } async-lock = { version = "3.4.0", features = ["std"], default-features = false } -async-trait = "0.1.88" +async-trait = { version = "0.1.88", default-features = false } aws-config = { version = "1.6.1", default-features = false, features = ["sso"] } aws-sdk-s3 = { version = "1.82.0", features = [ "http-1x", @@ -34,32 +36,20 @@ bincode = { version = "2.0.1", default-features = false, features = [ blake3 = { version = "1.8.0", default-features = false } byteorder = { version = "1.5.0", default-features = false } bytes = { version = "1.10.1", default-features = false } -bytes-utils = { version = "0.1.4", default-features = false } const_format = { version = "0.2.34", default-features = false } -fred = { version = "10.1.0", default-features = false, features = [ - "blocking-encoding", - "custom-reconnect-errors", - "enable-rustls-ring", - "i-redisearch", - "i-scripts", - "i-std", - "mocks", - "sentinel-auth", - "sentinel-client", - "sha-1", - "subscriber-client", +futures = { version = "0.3.31", default-features = false, features = ["std"] } +gcloud-auth = { version = "1.2", default-features = false, features = [ + "jwt-rust-crypto", ] } -futures = { version = "0.3.31", default-features = false } -gcloud-auth = { version = "1.1.2", default-features = false } -gcloud-storage = { version = "1.1.1", default-features = false, features = [ +gcloud-storage = { version = "1", default-features = false, features = [ "auth", "rustls-tls", ] } hex = { version = "0.4.3", default-features = false } http = { version = "1.3.1", default-features = false } -http-body = "1.0.1" -http-body-util = "0.1.3" -hyper = { version = "1.6.0" } +http-body = { version = "1.0.1", default-features = false } +http-body-util = { version = "0.1.3", default-features = false } +hyper = { version = "1.6.0", default-features = false } hyper-rustls = { version = "0.27.5", default-features = false, features = [ 
"http1", "http2", @@ -68,25 +58,37 @@ hyper-rustls = { version = "0.27.5", default-features = false, features = [ "rustls-platform-verifier", ] } hyper-util = { version = "0.1.11", default-features = false } +itertools = { version = "0.14.0", default-features = false } lz4_flex = { version = "0.11.3", default-features = false } mongodb = { version = "3", features = [ "compat-3-0-0", "rustls-tls", ], default-features = false } -opentelemetry = { version = "0.29.1", default-features = false } -parking_lot = { version = "0.12.3", features = ["arc_lock", "send_guard"] } +opentelemetry = { version = "0.30.0", default-features = false } +parking_lot = { version = "0.12.3", features = [ + "arc_lock", + "send_guard", +], default-features = false } patricia_tree = { version = "0.9.0", default-features = false } prost = { version = "0.13.5", default-features = false } rand = { version = "0.9.0", default-features = false, features = [ "thread_rng", ] } +redis = { version = "1.0.0", default-features = false, features = [ + "ahash", + "cluster-async", + "connection-manager", + "script", + "sentinel", + "tokio-comp", +] } regex = { version = "1.11.1", default-features = false } +reqwest = { version = "0.12", default-features = false } +reqwest-middleware = { version = "0.4.2", default-features = false } rustls = { version = "0.23.27", default-features = false, features = [] } -rustls-pemfile = { version = "2.2.0", features = [ - "std", -], default-features = false } +rustls-pki-types = { version = "1.13.1", default-features = false } serde = { version = "1.0.219", default-features = false } -serde_json = "1.0.140" +serde_json = { version = "1.0.140", default-features = false } sha2 = { version = "0.10.8", default-features = false } tokio = { version = "1.44.1", features = [ "fs", @@ -97,12 +99,13 @@ tokio = { version = "1.44.1", features = [ tokio-stream = { version = "0.1.17", features = [ "fs", ], default-features = false } -tokio-util = { version = "0.7.14" } +tokio-util = { 
version = "0.7.14", default-features = false } tonic = { version = "0.13.0", features = [ "tls-ring", "transport", ], default-features = false } tracing = { version = "0.1.41", default-features = false } +url = { version = "2.5.7", default-features = false } uuid = { version = "1.16.0", default-features = false, features = [ "serde", "v4", @@ -123,15 +126,21 @@ aws-smithy-runtime-api = { version = "1.7.4", default-features = false } aws-smithy-types = { version = "1.3.0", default-features = false, features = [ "http-body-1-x", ] } +futures = { version = "0.3.31", default-features = false, features = [ + "executor", +] } http = { version = "1.3.1", default-features = false } -memory-stats = "1.2.0" -mock_instant = "0.5.3" -pretty_assertions = { version = "1.4.1", features = ["std"] } +memory-stats = { version = "1.2.0", default-features = false } +mock_instant = { version = "0.5.3", default-features = false } +pretty_assertions = { version = "1.4.1", features = [ + "std", +], default-features = false } rand = { version = "0.9.0", default-features = false, features = [ "small_rng", "thread_rng", ] } -serde_json = "1.0.140" +redis-test = { version = "1.0.0", default-features = false, features = ["aio"] } +serde_json = { version = "1.0.140", default-features = false } tempfile = { version = "3.8.1", default-features = false } tracing-test = { version = "0.2.5", default-features = false, features = [ "no-env-filter", diff --git a/nativelink-store/src/callback_utils.rs b/nativelink-store/src/callback_utils.rs index 32cc9f68a..a18f20c52 100644 --- a/nativelink-store/src/callback_utils.rs +++ b/nativelink-store/src/callback_utils.rs @@ -12,30 +12,34 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+use core::borrow::Borrow; +use core::pin::Pin; use std::sync::Arc; -use nativelink_util::evicting_map::RemoveStateCallback; +use nativelink_util::evicting_map; use nativelink_util::store_trait::{RemoveItemCallback, StoreKey}; -use tonic::async_trait; // Generic struct to hold a RemoveItemCallback ref for the purposes // of a RemoveStateCallback call #[derive(Debug)] pub struct RemoveItemCallbackHolder { - callback_fn: Arc>, + callback: Arc, } impl RemoveItemCallbackHolder { - pub fn new(callback: &Arc>) -> Self { - Self { - callback_fn: callback.clone(), - } + pub fn new(callback: Arc) -> Self { + Self { callback } } } -#[async_trait] -impl RemoveStateCallback> for RemoveItemCallbackHolder { - async fn callback(&self, key: &StoreKey<'static>) { - self.callback_fn.callback(key).await; +impl<'a, Q> evicting_map::RemoveItemCallback for RemoveItemCallbackHolder +where + Q: Borrow>, +{ + fn callback(&self, store_key: &Q) -> Pin + Send>> { + let callback = self.callback.clone(); + let store_key: &StoreKey<'_> = Borrow::>::borrow(store_key); + let store_key = store_key.borrow().into_owned(); + Box::pin(async move { callback.callback(store_key).await }) } } diff --git a/nativelink-store/src/completeness_checking_store.rs b/nativelink-store/src/completeness_checking_store.rs index b6f526229..bbdbde8d9 100644 --- a/nativelink-store/src/completeness_checking_store.rs +++ b/nativelink-store/src/completeness_checking_store.rs @@ -392,9 +392,9 @@ impl StoreDriver for CompletenessCheckingStore { fn register_remove_callback( self: Arc, - callback: &Arc>, + callback: Arc, ) -> Result<(), Error> { - self.ac_store.register_remove_callback(callback)?; + self.ac_store.register_remove_callback(callback.clone())?; self.cas_store.register_remove_callback(callback)?; Ok(()) } diff --git a/nativelink-store/src/compression_store.rs b/nativelink-store/src/compression_store.rs index b76ca2377..345e06703 100644 --- a/nativelink-store/src/compression_store.rs +++ 
b/nativelink-store/src/compression_store.rs @@ -195,7 +195,8 @@ impl UploadState { usize::try_from(max_index_count) .err_tip(|| "Could not convert max_index_count to usize")? ], - index_count: max_index_count as u32, + index_count: u32::try_from(max_index_count) + .err_tip(|| "Could not convert max_index_count to u32")?, uncompressed_data_size: 0, // Updated later. config: header.config, version: CURRENT_STREAM_FORMAT_VERSION, @@ -361,14 +362,18 @@ impl StoreDriver for CompressionStore { } // Now fill the size in our slice. - LittleEndian::write_u32(&mut compressed_data_buf[1..5], compressed_data_sz as u32); + LittleEndian::write_u32( + &mut compressed_data_buf[1..5], + u32::try_from(compressed_data_sz).unwrap_or(u32::MAX), + ); // Now send our chunk. tx.send(compressed_data_buf.freeze()) .await .err_tip(|| "Failed to write chunk to inner store in compression store")?; - index.position_from_prev_index = compressed_data_sz as u32; + index.position_from_prev_index = + u32::try_from(compressed_data_sz).unwrap_or(u32::MAX); index_count += 1; } @@ -384,7 +389,8 @@ impl StoreDriver for CompressionStore { .footer .indexes .resize(index_count as usize, SliceIndex::default()); - output_state.footer.index_count = output_state.footer.indexes.len() as u32; + output_state.footer.index_count = + u32::try_from(output_state.footer.indexes.len()).unwrap_or(u32::MAX); output_state.footer.uncompressed_data_size = received_amt; { // Write Footer. 
@@ -395,7 +401,7 @@ impl StoreDriver for CompressionStore { let mut footer = BytesMut::with_capacity(1 + 4 + serialized_footer.len()); footer.put_u8(FOOTER_FRAME_TYPE); - footer.put_u32_le(serialized_footer.len() as u32); + footer.put_u32_le(u32::try_from(serialized_footer.len()).unwrap_or(u32::MAX)); footer.extend_from_slice(&serialized_footer); tx.send(footer.freeze()) @@ -453,7 +459,7 @@ impl StoreDriver for CompressionStore { }; let header_size = serialized_size(&EMPTY_HEADER, self.bincode_config)?; let chunk = rx - .consume(Some(header_size as usize)) + .consume(Some(usize::try_from(header_size).unwrap_or(usize::MAX))) .await .err_tip(|| "Failed to read header in get_part compression store")?; error_if!( @@ -536,9 +542,13 @@ impl StoreDriver for CompressionStore { let new_uncompressed_data_sz = uncompressed_data_sz + uncompressed_chunk_sz as u64; if new_uncompressed_data_sz >= offset && remaining_bytes_to_send > 0 { - let start_pos = offset.saturating_sub(uncompressed_data_sz) as usize; + let start_pos = + usize::try_from(offset.saturating_sub(uncompressed_data_sz)) + .unwrap_or(usize::MAX); let end_pos = cmp::min( - start_pos + remaining_bytes_to_send as usize, + start_pos.saturating_add( + usize::try_from(remaining_bytes_to_send).unwrap_or(usize::MAX), + ), uncompressed_chunk_sz, ); if end_pos != start_pos { @@ -644,7 +654,7 @@ impl StoreDriver for CompressionStore { fn register_remove_callback( self: Arc, - callback: &Arc>, + callback: Arc, ) -> Result<(), Error> { self.inner_store.register_remove_callback(callback) } diff --git a/nativelink-store/src/dedup_store.rs b/nativelink-store/src/dedup_store.rs index 10e17d71e..252411a45 100644 --- a/nativelink-store/src/dedup_store.rs +++ b/nativelink-store/src/dedup_store.rs @@ -381,9 +381,10 @@ impl StoreDriver for DedupStore { fn register_remove_callback( self: Arc, - callback: &Arc>, + callback: Arc, ) -> Result<(), Error> { - self.index_store.register_remove_callback(callback)?; + self.index_store + 
.register_remove_callback(callback.clone())?; self.content_store.register_remove_callback(callback)?; Ok(()) } diff --git a/nativelink-store/src/default_store_factory.rs b/nativelink-store/src/default_store_factory.rs index 969fb8c57..63e161891 100644 --- a/nativelink-store/src/default_store_factory.rs +++ b/nativelink-store/src/default_store_factory.rs @@ -13,16 +13,17 @@ // limitations under the License. use core::pin::Pin; +use std::env; use std::sync::Arc; use std::time::SystemTime; use futures::stream::FuturesOrdered; use futures::{Future, TryStreamExt}; -use nativelink_config::stores::{ExperimentalCloudObjectSpec, StoreSpec}; +use nativelink_config::stores::{ExperimentalCloudObjectSpec, RedisMode, StoreSpec}; use nativelink_error::Error; use nativelink_util::health_utils::HealthRegistryBuilder; +use nativelink_util::metrics::StoreType; use nativelink_util::store_trait::{Store, StoreDriver}; - use crate::completeness_checking_store::CompletenessCheckingStore; use crate::compression_store::CompressionStore; use crate::dedup_store::DedupStore; @@ -32,6 +33,7 @@ use crate::filesystem_store::FilesystemStore; use crate::gcs_store::GcsStore; use crate::grpc_store::GrpcStore; use crate::memory_store::MemoryStore; +use crate::metrics_store::MetricsStore; use crate::mongo_store::ExperimentalMongoStore; use crate::noop_store::NoopStore; use crate::ontap_s3_existence_cache_store::OntapS3ExistenceCache; @@ -47,6 +49,7 @@ use crate::verify_store::VerifyStore; type FutureMaybeStore<'a> = Box> + Send + 'a>; pub fn store_factory<'a>( + name: &'a str, backend: &'a StoreSpec, store_manager: &'a Arc, maybe_health_registry_builder: Option<&'a mut HealthRegistryBuilder>, @@ -65,42 +68,48 @@ pub fn store_factory<'a>( GcsStore::new(gcs_config, SystemTime::now).await? } }, - StoreSpec::RedisStore(spec) => RedisStore::new(spec.clone())?, + StoreSpec::RedisStore(spec) => { + if spec.mode == RedisMode::Cluster { + RedisStore::new_cluster(spec.clone()).await? 
+ } else { + RedisStore::new_standard(spec.clone()).await? + } + } StoreSpec::Verify(spec) => VerifyStore::new( spec, - store_factory(&spec.backend, store_manager, None).await?, + store_factory(name, &spec.backend, store_manager, None).await?, ), StoreSpec::Compression(spec) => CompressionStore::new( &spec.clone(), - store_factory(&spec.backend, store_manager, None).await?, + store_factory(name, &spec.backend, store_manager, None).await?, )?, StoreSpec::Dedup(spec) => DedupStore::new( spec, - store_factory(&spec.index_store, store_manager, None).await?, - store_factory(&spec.content_store, store_manager, None).await?, + store_factory(name, &spec.index_store, store_manager, None).await?, + store_factory(name, &spec.content_store, store_manager, None).await?, )?, StoreSpec::ExistenceCache(spec) => ExistenceCacheStore::new( spec, - store_factory(&spec.backend, store_manager, None).await?, + store_factory(name, &spec.backend, store_manager, None).await?, ), StoreSpec::OntapS3ExistenceCache(spec) => { OntapS3ExistenceCache::new(spec, SystemTime::now).await? 
} StoreSpec::CompletenessChecking(spec) => CompletenessCheckingStore::new( - store_factory(&spec.backend, store_manager, None).await?, - store_factory(&spec.cas_store, store_manager, None).await?, + store_factory(name, &spec.backend, store_manager, None).await?, + store_factory(name, &spec.cas_store, store_manager, None).await?, ), StoreSpec::FastSlow(spec) => FastSlowStore::new( spec, - store_factory(&spec.fast, store_manager, None).await?, - store_factory(&spec.slow, store_manager, None).await?, + store_factory(name, &spec.fast, store_manager, None).await?, + store_factory(name, &spec.slow, store_manager, None).await?, ), StoreSpec::Filesystem(spec) => ::new(spec).await?, StoreSpec::RefStore(spec) => RefStore::new(spec, Arc::downgrade(store_manager)), StoreSpec::SizePartitioning(spec) => SizePartitioningStore::new( spec, - store_factory(&spec.lower_store, store_manager, None).await?, - store_factory(&spec.upper_store, store_manager, None).await?, + store_factory(name, &spec.lower_store, store_manager, None).await?, + store_factory(name, &spec.upper_store, store_manager, None).await?, ), StoreSpec::Grpc(spec) => GrpcStore::new(spec).await?, StoreSpec::Noop(_) => NoopStore::new(), @@ -109,7 +118,7 @@ pub fn store_factory<'a>( let stores = spec .stores .iter() - .map(|store_spec| store_factory(&store_spec.store, store_manager, None)) + .map(|store_spec| store_factory(name, &store_spec.store, store_manager, None)) .collect::>() .try_collect::>() .await?; @@ -121,6 +130,61 @@ pub fn store_factory<'a>( store.clone().register_health(health_registry_builder); } - Ok(Store::new(store)) + let store = Store::new(store); + + return if should_wrap_in_metrics_store(backend) { + Ok(Store::new(MetricsStore::new( + Arc::new(store), + name, + compute_store_type(backend), + ))) + } else { + Ok(store) + } }) } + +fn should_wrap_in_metrics_store(spec: &StoreSpec) -> bool { + if env::var("NL_STORE_METRICS").is_err() { + return false + } + + matches!( + spec, + StoreSpec::Memory(_) + | 
StoreSpec::Grpc(_) + | StoreSpec::ExperimentalCloudObjectStore(_) + | StoreSpec::ExperimentalMongo(_) + | StoreSpec::Filesystem(_) + | StoreSpec::RedisStore(_) + ) +} + +fn compute_store_type(spec: &StoreSpec) -> StoreType { + match spec { + StoreSpec::Memory(_) => StoreType::Memory, + StoreSpec::ExperimentalCloudObjectStore(s) => match s { + ExperimentalCloudObjectSpec::Aws(_) => StoreType::S3, + ExperimentalCloudObjectSpec::Gcs(_) => StoreType::Gcs, + ExperimentalCloudObjectSpec::Ontap(_) => StoreType::OntapS3, + }, + StoreSpec::RedisStore(_) => StoreType::Redis, + StoreSpec::Verify(_) => StoreType::Verify, + StoreSpec::Compression(_) => StoreType::Compression, + StoreSpec::Dedup(_) => StoreType::Dedup, + StoreSpec::ExistenceCache(_) => StoreType::ExistenceCache, + StoreSpec::OntapS3ExistenceCache(_) => StoreType::OntapS3ExistenceCache, + StoreSpec::CompletenessChecking(_) => StoreType::CompletenessChecking, + StoreSpec::FastSlow(_) => StoreType::FastSlow, + StoreSpec::SizePartitioning(_) => StoreType::SizePartitioning, + StoreSpec::Filesystem(_) => StoreType::Filesystem, + StoreSpec::Grpc(_) => StoreType::Grpc, + StoreSpec::Noop(_) => StoreType::Noop, + StoreSpec::ExperimentalMongo(_) => StoreType::Mongo, + StoreSpec::RefStore(_) => StoreType::Ref, + StoreSpec::Shard(_) => StoreType::Shard, + _ => { + panic!("Invalid store spec: {:?}", spec); + } + } +} diff --git a/nativelink-store/src/existence_cache_store.rs b/nativelink-store/src/existence_cache_store.rs index 3c50cecb6..e36454bdd 100644 --- a/nativelink-store/src/existence_cache_store.rs +++ b/nativelink-store/src/existence_cache_store.rs @@ -18,6 +18,8 @@ use std::sync::{Arc, Weak}; use std::time::SystemTime; use async_trait::async_trait; +use futures::StreamExt; +use futures::stream::FuturesUnordered; use nativelink_config::stores::{EvictionPolicy, ExistenceCacheSpec}; use nativelink_error::{Error, ResultExt, error_if}; use nativelink_metric::MetricsComponent; @@ -57,7 +59,7 @@ pub struct 
ExistenceCacheStore { // as if it immediately expires them, we should only apply the remove callbacks // afterwards. If this is None, we're not pausing; if it's Some it's the location to // store them in temporarily - pause_remove_callbacks: Arc>>>>, + pause_remove_callbacks: Mutex>>>, } impl ExistenceCacheStore { @@ -66,15 +68,19 @@ impl ExistenceCacheStore { } } -#[async_trait] impl RemoveItemCallback for ExistenceCacheStore { - async fn callback(&self, store_key: &StoreKey<'_>) { + fn callback<'a>( + &'a self, + store_key: StoreKey<'a>, + ) -> Pin + Send + 'a>> { debug!(?store_key, "Removing item from cache due to callback"); - let new_key = store_key.borrow(); - let deleted_key = self.existence_cache.remove(&new_key.into_digest()).await; - if !deleted_key { - info!(?store_key, "Failed to delete key from cache on callback"); - } + let digest = store_key.borrow().into_digest(); + Box::pin(async move { + let deleted_key = self.existence_cache.remove(&digest).await; + if !deleted_key { + info!(?store_key, "Failed to delete key from cache on callback"); + } + }) } } @@ -83,19 +89,25 @@ struct ExistenceCacheCallback { cache: Weak>, } -#[async_trait] impl RemoveItemCallback for ExistenceCacheCallback { - async fn callback(&self, store_key: &StoreKey<'_>) { + fn callback<'a>( + &'a self, + store_key: StoreKey<'a>, + ) -> Pin + Send + 'a>> { let cache = self.cache.upgrade(); if let Some(local_cache) = cache { - if let Some(callbacks) = &mut *local_cache.pause_remove_callbacks.lock_arc() { - callbacks.push(store_key.borrow().into_owned()); + if let Some(callbacks) = local_cache.pause_remove_callbacks.lock().as_mut() { + callbacks.push(store_key.into_owned()); } else { - local_cache.callback(store_key).await; + let store_key = store_key.into_owned(); + return Box::pin(async move { + local_cache.callback(store_key).await; + }); } } else { debug!("Cache dropped, so not doing callback"); } + Box::pin(async {}) } } @@ -110,14 +122,12 @@ impl ExistenceCacheStore { let 
existence_cache_store = Arc::new(Self { inner_store, existence_cache: EvictingMap::new(eviction_policy, anchor_time), - pause_remove_callbacks: Arc::new(Mutex::new(None)), + pause_remove_callbacks: Mutex::new(None), }); let other_ref = Arc::downgrade(&existence_cache_store); existence_cache_store .inner_store - .register_remove_callback(&Arc::new(Box::new(ExistenceCacheCallback { - cache: other_ref, - }))) + .register_remove_callback(Arc::new(ExistenceCacheCallback { cache: other_ref })) .expect("Register remove callback should work"); existence_cache_store } @@ -237,7 +247,7 @@ impl StoreDriver for ExistenceCacheStore { return Ok(()); } { - let mut locked_callbacks = self.pause_remove_callbacks.lock_arc(); + let mut locked_callbacks = self.pause_remove_callbacks.lock(); if locked_callbacks.is_none() { locked_callbacks.replace(vec![]); } @@ -254,11 +264,13 @@ impl StoreDriver for ExistenceCacheStore { } } { - let mut locked_callbacks = self.pause_remove_callbacks.lock_arc(); - if let Some(callbacks) = locked_callbacks.take() { - for store_key in callbacks { - self.callback(&store_key).await; - } + let maybe_keys = self.pause_remove_callbacks.lock().take(); + if let Some(keys) = maybe_keys { + let mut callbacks: FuturesUnordered<_> = keys + .into_iter() + .map(|store_key| self.callback(store_key)) + .collect(); + while callbacks.next().await.is_some() {} } } result @@ -281,6 +293,8 @@ impl StoreDriver for ExistenceCacheStore { .existence_cache .insert(digest, ExistenceItem(digest.size_bytes())) .await; + } else { + let _ = self.existence_cache.remove(&digest).await; } result } @@ -299,7 +313,7 @@ impl StoreDriver for ExistenceCacheStore { fn register_remove_callback( self: Arc, - callback: &Arc>, + callback: Arc, ) -> Result<(), Error> { self.inner_store.register_remove_callback(callback) } diff --git a/nativelink-store/src/fast_slow_store.rs b/nativelink-store/src/fast_slow_store.rs index b76e13fd3..b29346ac9 100644 --- a/nativelink-store/src/fast_slow_store.rs +++ 
b/nativelink-store/src/fast_slow_store.rs @@ -16,14 +16,13 @@ use core::borrow::BorrowMut; use core::cmp::{max, min}; use core::ops::Range; use core::pin::Pin; -use core::sync::atomic::{AtomicU64, Ordering}; use std::collections::HashMap; use std::ffi::OsString; use std::sync::{Arc, Weak}; use async_trait::async_trait; use futures::{FutureExt, join}; -use nativelink_config::stores::FastSlowSpec; +use nativelink_config::stores::{FastSlowSpec, StoreDirection}; use nativelink_error::{Code, Error, ResultExt, make_err}; use nativelink_metric::MetricsComponent; use nativelink_util::buf_channel::{ @@ -37,6 +36,8 @@ use nativelink_util::store_trait::{ }; use parking_lot::Mutex; use tokio::sync::OnceCell; +use tracing::{debug, trace, warn}; +use nativelink_util::metrics::FAST_SLOW_STORE_METRICS; // TODO(palfrey) This store needs to be evaluated for more efficient memory usage, // there are many copies happening internally. @@ -51,11 +52,11 @@ type Loader = Arc>; pub struct FastSlowStore { #[metric(group = "fast_store")] fast_store: Store, + fast_direction: StoreDirection, #[metric(group = "slow_store")] slow_store: Store, + slow_direction: StoreDirection, weak_self: Weak, - #[metric] - metrics: FastSlowStoreMetrics, // De-duplicate requests for the fast store, only the first streams, others // are blocked. 
This may feel like it's causing a slow down of tasks, but // actually it's faster because we're not downloading the file multiple @@ -113,12 +114,13 @@ impl Drop for LoaderGuard<'_> { } impl FastSlowStore { - pub fn new(_spec: &FastSlowSpec, fast_store: Store, slow_store: Store) -> Arc { + pub fn new(spec: &FastSlowSpec, fast_store: Store, slow_store: Store) -> Arc { Arc::new_cyclic(|weak_self| Self { fast_store, + fast_direction: spec.fast_direction, slow_store, + slow_direction: spec.slow_direction, weak_self: weak_self.clone(), - metrics: FastSlowStoreMetrics::default(), populating_digests: Mutex::new(HashMap::new()), }) } @@ -164,26 +166,37 @@ impl FastSlowStore { offset: u64, length: Option, ) -> Result<(), Error> { - let sz = self + let reader_stream_size = if self .slow_store - .has(key.borrow()) - .await - .err_tip(|| "Failed to run has() on slow store")? - .ok_or_else(|| { - make_err!( - Code::NotFound, - "Object {} not found in either fast or slow store. \ - If using multiple workers, ensure all workers share the same CAS storage path.", - key.as_str() - ) - })?; - - self.metrics - .slow_store_hit_count - .fetch_add(1, Ordering::Acquire); + .inner_store(Some(key.borrow())) + .optimized_for(StoreOptimizations::LazyExistenceOnSync) + { + trace!( + %key, + store_name = %self.slow_store.inner_store(Some(key.borrow())).get_name(), + "Skipping .has() check due to LazyExistenceOnSync optimization" + ); + UploadSizeInfo::MaxSize(u64::MAX) + } else { + UploadSizeInfo::ExactSize(self + .slow_store + .has(key.borrow()) + .await + .err_tip(|| "Failed to run has() on slow store")? + .ok_or_else(|| { + make_err!( + Code::NotFound, + "Object {} not found in either fast or slow store. \ + If using multiple workers, ensure all workers share the same CAS storage path.", + key.as_str() + ) + })? 
+ ) + }; let send_range = offset..length.map_or(u64::MAX, |length| length + offset); let mut bytes_received: u64 = 0; + let mut counted_hit = false; let (mut fast_tx, fast_rx) = make_buf_channel_pair(); let (slow_tx, mut slow_rx) = make_buf_channel_pair(); @@ -201,11 +214,17 @@ impl FastSlowStore { let fast_res = fast_tx.send_eof(); return Ok::<_, Error>((fast_res, maybe_writer_pin)); } + + if !counted_hit { + FAST_SLOW_STORE_METRICS.slow_store_hit_count.add(1, &[]); + counted_hit = true; + } + let output_buf_len = u64::try_from(output_buf.len()) .err_tip(|| "Could not output_buf.len() to u64")?; - self.metrics + FAST_SLOW_STORE_METRICS .slow_store_downloaded_bytes - .fetch_add(output_buf_len, Ordering::Acquire); + .add(output_buf_len, &[]); let writer_fut = Self::calculate_range( &(bytes_received..bytes_received + output_buf_len), @@ -226,9 +245,9 @@ impl FastSlowStore { }; let slow_store_fut = self.slow_store.get(key.borrow(), slow_tx); - let fast_store_fut = - self.fast_store - .update(key.borrow(), fast_rx, UploadSizeInfo::ExactSize(sz)); + let fast_store_fut = self + .fast_store + .update(key.borrow(), fast_rx, reader_stream_size); let (data_stream_res, slow_res, fast_res) = join!(data_stream_fut, slow_store_fut, fast_store_fut); @@ -245,7 +264,10 @@ impl FastSlowStore { }, ) } - Err(err) => fast_res.merge(slow_res).merge(Err(err)), + Err(err) => match slow_res { + Err(slow_err) if slow_err.code == Code::NotFound => Err(slow_err), + _ => fast_res.merge(slow_res).merge(Err(err)), + }, } } @@ -262,8 +284,22 @@ impl FastSlowStore { if maybe_size_info.is_some() { return Ok(()); } - let loader = self.get_loader(key.borrow()); - loader + + // If the fast store is noop or read only or update only then this is an error. 
+ if self + .fast_store + .inner_store(Some(key.borrow())) + .optimized_for(StoreOptimizations::NoopUpdates) + || self.fast_direction == StoreDirection::ReadOnly + || self.fast_direction == StoreDirection::Update + { + return Err(make_err!( + Code::Internal, + "Attempt to populate fast store that is read only or noop" + )); + } + + self.get_loader(key.borrow()) .get_or_try_init(|| { Pin::new(self).populate_and_maybe_stream(key.borrow(), None, 0, None) }) @@ -327,18 +363,45 @@ impl StoreDriver for FastSlowStore { ) -> Result<(), Error> { // If either one of our stores is a noop store, bypass the multiplexing // and just use the store that is not a noop store. - let slow_store = self.slow_store.inner_store(Some(key.borrow())); - if slow_store.optimized_for(StoreOptimizations::NoopUpdates) { + let ignore_slow = self + .slow_store + .inner_store(Some(key.borrow())) + .optimized_for(StoreOptimizations::NoopUpdates) + || self.slow_direction == StoreDirection::ReadOnly + || self.slow_direction == StoreDirection::Get; + let ignore_fast = self + .fast_store + .inner_store(Some(key.borrow())) + .optimized_for(StoreOptimizations::NoopUpdates) + || self.fast_direction == StoreDirection::ReadOnly + || self.fast_direction == StoreDirection::Get; + if ignore_slow && ignore_fast { + // We need to drain the reader to avoid the writer complaining that we dropped + // the connection prematurely. 
+ reader + .drain() + .await + .err_tip(|| "In FastFlowStore::update")?; + return Ok(()); + } + if ignore_slow { return self.fast_store.update(key, reader, size_info).await; } - let fast_store = self.fast_store.inner_store(Some(key.borrow())); - if fast_store.optimized_for(StoreOptimizations::NoopUpdates) { + if ignore_fast { return self.slow_store.update(key, reader, size_info).await; } let (mut fast_tx, fast_rx) = make_buf_channel_pair(); let (mut slow_tx, slow_rx) = make_buf_channel_pair(); + let key_debug = format!("{key:?}"); + trace!( + key = %key_debug, + "FastSlowStore::update: starting dual-store upload", + ); + let update_start = std::time::Instant::now(); + let mut bytes_sent: u64 = 0; + let data_stream_fut = async move { loop { let buffer = reader @@ -353,11 +416,27 @@ impl StoreDriver for FastSlowStore { slow_tx .send_eof() .err_tip(|| "Failed to write eof to writer in fast_slow store update")?; + debug!( + total_bytes = bytes_sent, + "FastSlowStore::update: data_stream sent EOF to both stores", + ); return Result::<(), Error>::Ok(()); } + let chunk_len = buffer.len(); + let send_start = std::time::Instant::now(); let (fast_result, slow_result) = join!(fast_tx.send(buffer.clone()), slow_tx.send(buffer)); + let send_elapsed = send_start.elapsed(); + if send_elapsed.as_secs() >= 5 { + warn!( + chunk_len, + send_elapsed_ms = send_elapsed.as_millis(), + total_bytes = bytes_sent, + "FastSlowStore::update: channel send stalled (>5s). 
A downstream store may be hanging", + ); + } + bytes_sent += u64::try_from(chunk_len).unwrap_or(u64::MAX); fast_result .map_err(|e| { make_err!( @@ -381,11 +460,29 @@ impl StoreDriver for FastSlowStore { let (data_stream_res, fast_res, slow_res) = join!(data_stream_fut, fast_store_fut, slow_store_fut); + + let total_elapsed = update_start.elapsed(); + if data_stream_res.is_err() || fast_res.is_err() || slow_res.is_err() { + warn!( + key = %key_debug, + elapsed_ms = total_elapsed.as_millis(), + data_stream_ok = data_stream_res.is_ok(), + fast_store_ok = fast_res.is_ok(), + slow_store_ok = slow_res.is_ok(), + "FastSlowStore::update: completed with error(s)", + ); + } else { + trace!( + key = %key_debug, + elapsed_ms = total_elapsed.as_millis(), + "FastSlowStore::update: completed successfully", + ); + } data_stream_res.merge(fast_res).merge(slow_res)?; Ok(()) } - /// FastSlowStore has optimizations for dealing with files. + /// `FastSlowStore` has optimizations for dealing with files. fn optimized_for(&self, optimization: StoreOptimizations) -> bool { optimization == StoreOptimizations::FileUpdates } @@ -400,14 +497,24 @@ impl StoreDriver for FastSlowStore { mut file: fs::FileSlot, upload_size: UploadSizeInfo, ) -> Result, Error> { + trace!( + key = ?key, + ?upload_size, + "FastSlowStore::update_with_whole_file: starting", + ); if self .fast_store .optimized_for(StoreOptimizations::FileUpdates) { if !self .slow_store + .inner_store(Some(key.borrow())) .optimized_for(StoreOptimizations::NoopUpdates) + && self.slow_direction != StoreDirection::ReadOnly + && self.slow_direction != StoreDirection::Get { + trace!("FastSlowStore::update_with_whole_file: uploading to slow_store"); + let slow_start = std::time::Instant::now(); slow_update_store_with_file( self.slow_store.as_store_driver_pin(), key.borrow(), @@ -416,6 +523,15 @@ impl StoreDriver for FastSlowStore { ) .await .err_tip(|| "In FastSlowStore::update_with_whole_file slow_store")?; + trace!( + elapsed_ms = 
slow_start.elapsed().as_millis(), + "FastSlowStore::update_with_whole_file: slow_store upload completed", + ); + } + if self.fast_direction == StoreDirection::ReadOnly + || self.fast_direction == StoreDirection::Get + { + return Ok(Some(file)); } return self .fast_store @@ -427,10 +543,13 @@ impl StoreDriver for FastSlowStore { .slow_store .optimized_for(StoreOptimizations::FileUpdates) { - if !self + let ignore_fast = self .fast_store + .inner_store(Some(key.borrow())) .optimized_for(StoreOptimizations::NoopUpdates) - { + || self.fast_direction == StoreDirection::ReadOnly + || self.fast_direction == StoreDirection::Get; + if !ignore_fast { slow_update_store_with_file( self.fast_store.as_store_driver_pin(), key.borrow(), @@ -440,6 +559,11 @@ impl StoreDriver for FastSlowStore { .await .err_tip(|| "In FastSlowStore::update_with_whole_file fast_store")?; } + let ignore_slow = self.slow_direction == StoreDirection::ReadOnly + || self.slow_direction == StoreDirection::Get; + if ignore_slow { + return Ok(Some(file)); + } return self .slow_store .update_with_whole_file(key, path, file, upload_size) @@ -460,33 +584,42 @@ impl StoreDriver for FastSlowStore { length: Option, ) -> Result<(), Error> { // TODO(palfrey) Investigate if we should maybe ignore errors here instead of - // forwarding the up. + // forwarding them up. if self.fast_store.has(key.borrow()).await?.is_some() { - self.metrics - .fast_store_hit_count - .fetch_add(1, Ordering::Acquire); + FAST_SLOW_STORE_METRICS.fast_store_hit_count.add(1, &[]); self.fast_store .get_part(key, writer.borrow_mut(), offset, length) .await?; - self.metrics + FAST_SLOW_STORE_METRICS .fast_store_downloaded_bytes - .fetch_add(writer.get_bytes_written(), Ordering::Acquire); + .add(writer.get_bytes_written(), &[]); + return Ok(()); + } + + // If the fast store is noop or read only or update only then bypass it. 
+ if self + .fast_store + .inner_store(Some(key.borrow())) + .optimized_for(StoreOptimizations::NoopUpdates) + || self.fast_direction == StoreDirection::ReadOnly + || self.fast_direction == StoreDirection::Update + { + FAST_SLOW_STORE_METRICS.slow_store_hit_count.add(1, &[]); + self.slow_store + .get_part(key, writer.borrow_mut(), offset, length) + .await?; + FAST_SLOW_STORE_METRICS + .slow_store_downloaded_bytes + .add(writer.get_bytes_written(), &[]); return Ok(()); } - let loader = self.get_loader(key.borrow()); let mut writer = Some(writer); - loader + self.get_loader(key.borrow()) .get_or_try_init(|| { - writer - .take() - .map(|writer| { - self.populate_and_maybe_stream(key.borrow(), Some(writer), offset, length) - }) - .expect("writer somehow became None") + self.populate_and_maybe_stream(key.borrow(), writer.take(), offset, length) }) .await?; - drop(loader); // If we didn't stream then re-enter which will stream from the fast // store, or retry the download. We should not get in a loop here @@ -515,24 +648,12 @@ impl StoreDriver for FastSlowStore { fn register_remove_callback( self: Arc, - callback: &Arc>, + callback: Arc, ) -> Result<(), Error> { - self.fast_store.register_remove_callback(callback)?; + self.fast_store.register_remove_callback(callback.clone())?; self.slow_store.register_remove_callback(callback)?; Ok(()) } } -#[derive(Debug, Default, MetricsComponent)] -struct FastSlowStoreMetrics { - #[metric(help = "Hit count for the fast store")] - fast_store_hit_count: AtomicU64, - #[metric(help = "Downloaded bytes from the fast store")] - fast_store_downloaded_bytes: AtomicU64, - #[metric(help = "Hit count for the slow store")] - slow_store_hit_count: AtomicU64, - #[metric(help = "Downloaded bytes from the slow store")] - slow_store_downloaded_bytes: AtomicU64, -} - default_health_status_indicator!(FastSlowStore); diff --git a/nativelink-store/src/filesystem_store.rs b/nativelink-store/src/filesystem_store.rs index 33bed51a0..735fff63f 100644 --- 
a/nativelink-store/src/filesystem_store.rs +++ b/nativelink-store/src/filesystem_store.rs @@ -22,7 +22,7 @@ use std::time::SystemTime; use async_lock::RwLock; use async_trait::async_trait; -use bytes::BytesMut; +use bytes::{Bytes, BytesMut}; use futures::stream::{StreamExt, TryStreamExt}; use futures::{Future, TryFutureExt}; use nativelink_config::stores::FilesystemSpec; @@ -39,8 +39,9 @@ use nativelink_util::store_trait::{ RemoveItemCallback, StoreDriver, StoreKey, StoreKeyBorrow, StoreOptimizations, UploadSizeInfo, }; use tokio::io::{AsyncReadExt, AsyncWriteExt, Take}; +use tokio::sync::Semaphore; use tokio_stream::wrappers::ReadDirStream; -use tracing::{debug, error, warn}; +use tracing::{debug, error, info, trace, warn}; use crate::callback_utils::RemoveItemCallbackHolder; use crate::cas_utils::is_zero_digest; @@ -129,7 +130,8 @@ impl Drop for EncodedFilePath { .fetch_add(1, Ordering::Relaxed) + 1; debug!( - ?current_active_drop_spawns, + %current_active_drop_spawns, + ?file_path, "Spawned a filesystem_delete_file" ); background_spawn!("filesystem_delete_file", async move { @@ -148,6 +150,7 @@ impl Drop for EncodedFilePath { - 1; debug!( ?current_active_drop_spawns, + ?file_path, "Dropped a filesystem_delete_file" ); }); @@ -220,6 +223,7 @@ pub trait FileEntry: LenEntry + Send + Sync + Debug + 'static { pub struct FileEntryImpl { data_size: u64, block_size: u64, + // We lock around this as it gets rewritten when we move between temp and content types encoded_file_path: RwLock, } @@ -362,37 +366,38 @@ impl LenEntry for FileEntryImpl { // target file location to the new temp file. `unref()` should only ever be called once. #[inline] async fn unref(&self) { - { - let mut encoded_file_path = self.encoded_file_path.write().await; - if encoded_file_path.path_type == PathType::Temp { - // We are already a temp file that is now marked for deletion on drop. - // This is very rare, but most likely the rename into the content path failed. 
- return; - } - let from_path = encoded_file_path.get_file_path(); - let new_key = make_temp_key(&encoded_file_path.key); - - let to_path = - to_full_path_from_key(&encoded_file_path.shared_context.temp_path, &new_key); - - if let Err(err) = fs::rename(&from_path, &to_path).await { - warn!( - key = ?encoded_file_path.key, - ?from_path, - ?to_path, - ?err, - "Failed to rename file", - ); - } else { - debug!( - key = ?encoded_file_path.key, - ?from_path, - ?to_path, - "Renamed file", - ); - encoded_file_path.path_type = PathType::Temp; - encoded_file_path.key = new_key; - } + let mut encoded_file_path = self.encoded_file_path.write().await; + if encoded_file_path.path_type == PathType::Temp { + // We are already a temp file that is now marked for deletion on drop. + // This is very rare, but most likely the rename into the content path failed. + warn!( + key = ?encoded_file_path.key, + "File is already a temp file", + ); + return; + } + let from_path = encoded_file_path.get_file_path(); + let new_key = make_temp_key(&encoded_file_path.key); + + let to_path = to_full_path_from_key(&encoded_file_path.shared_context.temp_path, &new_key); + + if let Err(err) = fs::rename(&from_path, &to_path).await { + warn!( + key = ?encoded_file_path.key, + ?from_path, + ?to_path, + ?err, + "Failed to rename file", + ); + } else { + debug!( + key = ?encoded_file_path.key, + ?from_path, + ?to_path, + "Renamed file (unref)", + ); + encoded_file_path.path_type = PathType::Temp; + encoded_file_path.key = new_key; } } } @@ -415,7 +420,8 @@ pub fn key_from_file(file_name: &str, file_type: FileType) -> Result = EvictingMap, Arc, SystemTime>; +type FsEvictingMap<'a, Fe> = + EvictingMap, Arc, SystemTime, RemoveItemCallbackHolder>; async fn add_files_to_cache( evicting_map: &FsEvictingMap<'_, Fe>, @@ -453,7 +459,7 @@ async fn add_files_to_cache( .insert_with_time( key.into_owned().into(), Arc::new(file_entry), - time_since_anchor.as_secs() as i32, + 
i32::try_from(time_since_anchor.as_secs()).unwrap_or(i32::MAX), ) .await; Ok(()) @@ -530,7 +536,7 @@ async fn add_files_to_cache( if let Err(err) = rename_fn(&from_file, &to_file) { warn!(?from_file, ?to_file, ?err, "Failed to rename file",); } else { - debug!(?from_file, ?to_file, "Renamed file",); + debug!(?from_file, ?to_file, "Renamed file (old cache)",); } } Ok(()) @@ -631,6 +637,8 @@ pub struct FilesystemStore { read_buffer_size: usize, weak_self: Weak, rename_fn: fn(&OsStr, &OsStr) -> Result<(), std::io::Error>, + /// Limits concurrent write operations to prevent disk I/O saturation. + write_semaphore: Option, } impl FilesystemStore { @@ -688,6 +696,11 @@ impl FilesystemStore { } else { spec.read_buffer_size as usize }; + let write_semaphore = if spec.max_concurrent_writes > 0 { + Some(Semaphore::new(spec.max_concurrent_writes)) + } else { + None + }; Ok(Arc::new_cyclic(|weak_self| Self { shared_context, evicting_map, @@ -695,6 +708,7 @@ impl FilesystemStore { read_buffer_size, weak_self: weak_self.clone(), rename_fn, + write_semaphore, })) } @@ -703,12 +717,27 @@ impl FilesystemStore { } pub async fn get_file_entry_for_digest(&self, digest: &DigestInfo) -> Result, Error> { + if is_zero_digest(digest) { + return Ok(Arc::new(Fe::create( + 0, + 0, + RwLock::new(EncodedFilePath { + shared_context: self.shared_context.clone(), + path_type: PathType::Content, + key: digest.into(), + }), + ))); + } self.evicting_map .get(&digest.into()) .await .ok_or_else(|| make_err!(Code::NotFound, "{digest} not found in filesystem store. This may indicate the file was evicted due to cache pressure. 
Consider increasing 'max_bytes' in your filesystem store's eviction_policy configuration.")) } + pub fn get_len(&self) -> u64 { + self.evicting_map.len() + } + async fn update_file( self: Pin<&Self>, mut entry: Fe, @@ -733,12 +762,26 @@ impl FilesystemStore { data_size += data_len as u64; } + let permit = if let Some(sem) = &self.write_semaphore { + Some( + sem.acquire() + .await + .map_err(|_| make_err!(Code::Internal, "Write semaphore closed"))?, + ) + } else { + None + }; + temp_file .as_ref() .sync_all() .await .err_tip(|| "Failed to sync_data in filesystem store")?; + drop(permit); + + temp_file.advise_dontneed(); + trace!(?temp_file, "Dropping file to update_file"); drop(temp_file); *entry.data_size_mut() = data_size; @@ -769,17 +812,25 @@ impl FilesystemStore { // We need to guarantee that this will get to the end even if the parent future is dropped. // See: https://github.com/TraceMachina/nativelink/issues/495 background_spawn!("filesystem_store_emplace_file", async move { + evicting_map + .insert(key.borrow().into_owned().into(), entry.clone()) + .await; + + // The insert might have resulted in an eviction/unref so we need to check + // it still exists in there. But first, get the lock... let mut encoded_file_path = entry.get_encoded_file_path().write().await; + // Then check it's still in there... + if evicting_map.get(&key).await.is_none() { + info!(%key, "Got eviction while emplacing, dropping"); + return Ok(()); + } + let final_path = get_file_path_raw( &PathType::Content, encoded_file_path.shared_context.as_ref(), &key, ); - evicting_map - .insert(key.borrow().into_owned().into(), entry.clone()) - .await; - let from_path = encoded_file_path.get_file_path(); // Internally tokio spawns fs commands onto a blocking thread anyways. 
// Since we are already on a blocking thread, we just need the `fs` wrapper to manage @@ -856,10 +907,24 @@ impl StoreDriver for FilesystemStore { async fn update( self: Pin<&Self>, key: StoreKey<'_>, - reader: DropCloserReadHalf, + mut reader: DropCloserReadHalf, _upload_size: UploadSizeInfo, ) -> Result<(), Error> { + if is_zero_digest(key.borrow()) { + // don't need to add, because zero length files are just assumed to exist + return Ok(()); + } + let temp_key = make_temp_key(&key); + + // There's a possibility of deadlock here where we take all of the + // file semaphores with make_and_open_file and the semaphores for + // whatever is populating reader is exhasted on the threads that + // have the FileSlots and not on those which can't. To work around + // this we don't take the FileSlot until there's something on the + // reader available to know that the populator is active. + reader.peek().await?; + let (entry, temp_file, temp_full_path) = Fe::make_and_open_file( self.block_size, EncodedFilePath { @@ -881,7 +946,60 @@ impl StoreDriver for FilesystemStore { } fn optimized_for(&self, optimization: StoreOptimizations) -> bool { - optimization == StoreOptimizations::FileUpdates + matches!( + optimization, + StoreOptimizations::FileUpdates | StoreOptimizations::SubscribesToUpdateOneshot + ) + } + + async fn update_oneshot(self: Pin<&Self>, key: StoreKey<'_>, data: Bytes) -> Result<(), Error> { + if is_zero_digest(key.borrow()) { + return Ok(()); + } + + let temp_key = make_temp_key(&key); + let (mut entry, mut temp_file, temp_full_path) = Fe::make_and_open_file( + self.block_size, + EncodedFilePath { + shared_context: self.shared_context.clone(), + path_type: PathType::Temp, + key: temp_key, + }, + ) + .await + .err_tip(|| "Failed to create temp file in filesystem store update_oneshot")?; + + // Write directly without channel overhead + if !data.is_empty() { + temp_file + .write_all(&data) + .await + .err_tip(|| format!("Failed to write data to {}", 
temp_full_path.display()))?; + } + + let _permit = if let Some(sem) = &self.write_semaphore { + Some( + sem.acquire() + .await + .map_err(|_| make_err!(Code::Internal, "Write semaphore closed"))?, + ) + } else { + None + }; + + temp_file + .as_ref() + .sync_all() + .await + .err_tip(|| "Failed to sync_data in filesystem store update_oneshot")?; + + drop(_permit); + + temp_file.advise_dontneed(); + drop(temp_file); + + *entry.data_size_mut() = data.len() as u64; + self.emplace_file(key.into_owned(), Arc::new(entry)).await } async fn update_with_whole_file( @@ -900,6 +1018,10 @@ impl StoreDriver for FilesystemStore { .err_tip(|| format!("While reading metadata for {}", path.display()))? .len(), }; + if file_size == 0 { + // don't need to add, because zero length files are just assumed to exist + return Ok(None); + } let entry = Fe::create( file_size, self.block_size, @@ -911,6 +1033,8 @@ impl StoreDriver for FilesystemStore { ); // We are done with the file, if we hold a reference to the file here, it could // result in a deadlock if `emplace_file()` also needs file descriptors. 
+ trace!(?file, "Dropping file to to update_with_whole_file"); + file.advise_dontneed(); drop(file); self.emplace_file(key.into_owned(), Arc::new(entry)) .await @@ -970,6 +1094,7 @@ impl StoreDriver for FilesystemStore { .await .err_tip(|| "Failed to send chunk in filesystem store get_part")?; } + temp_file.get_ref().advise_dontneed(); writer .send_eof() .err_tip(|| "Filed to send EOF in filesystem store get_part")?; @@ -995,10 +1120,10 @@ impl StoreDriver for FilesystemStore { fn register_remove_callback( self: Arc, - callback: &Arc>, + callback: Arc, ) -> Result<(), Error> { self.evicting_map - .add_remove_callback(Box::new(RemoveItemCallbackHolder::new(callback))); + .add_remove_callback(RemoveItemCallbackHolder::new(callback)); Ok(()) } } diff --git a/nativelink-store/src/gcs_client/client.rs b/nativelink-store/src/gcs_client/client.rs index dd27df601..664ec2114 100644 --- a/nativelink-store/src/gcs_client/client.rs +++ b/nativelink-store/src/gcs_client/client.rs @@ -108,6 +108,28 @@ pub struct GcsClient { } impl GcsClient { + fn create_client_config(spec: &ExperimentalGcsSpec) -> Result { + let mut client_config = ClientConfig::default(); + let connect_timeout = if spec.connection_timeout_s > 0 { + Duration::from_secs(spec.connection_timeout_s) + } else { + Duration::from_secs(3) + }; + let read_timeout = if spec.read_timeout_s > 0 { + Duration::from_secs(spec.read_timeout_s) + } else { + Duration::from_secs(3) + }; + let client = reqwest::ClientBuilder::new() + .connect_timeout(connect_timeout) + .read_timeout(read_timeout) + .build() + .map_err(|e| make_err!(Code::Internal, "Unable to create GCS client: {e:?}"))?; + let mid_client = reqwest_middleware::ClientBuilder::new(client).build(); + client_config.http = Some(mid_client); + Ok(client_config) + } + /// Create a new GCS client from the provided spec pub async fn new(spec: &ExperimentalGcsSpec) -> Result { // Attempt to get the authentication from a file with the environment @@ -115,8 +137,12 @@ impl 
GcsClient { // environment in variable GOOGLE_APPLICATION_CREDENTIALS_JSON. If that // fails, attempt to get authentication from the environment. let maybe_client_config = match CredentialsFile::new().await { - Ok(credentials) => ClientConfig::default().with_credentials(credentials).await, - Err(_) => ClientConfig::default().with_auth().await, + Ok(credentials) => { + Self::create_client_config(spec)? + .with_credentials(credentials) + .await + } + Err(_) => Self::create_client_config(spec)?.with_auth().await, } .map_err(|e| { make_err!( @@ -129,7 +155,8 @@ impl GcsClient { let client_config = if spec.authentication_required { maybe_client_config.err_tip(|| "Authentication required and none found.")? } else { - maybe_client_config.unwrap_or_else(|_| ClientConfig::default().anonymous()) + maybe_client_config + .or_else(|_| Self::create_client_config(spec).map(ClientConfig::anonymous))? }; // Creating client with the configured authentication @@ -206,9 +233,12 @@ impl GcsClient { reader: &mut DropCloserReadHalf, max_size: u64, ) -> Result<(), Error> { - let initial_capacity = core::cmp::min(max_size as usize, 10 * 1024 * 1024); + let initial_capacity = core::cmp::min( + usize::try_from(max_size).unwrap_or(usize::MAX), + 10 * 1024 * 1024, + ); let mut data = Vec::with_capacity(initial_capacity); - let max_size = max_size as usize; + let max_size = usize::try_from(max_size).unwrap_or(usize::MAX); let mut total_size = 0usize; while total_size < max_size { @@ -259,7 +289,7 @@ impl GcsClient { // Upload data in chunks let mut offset: u64 = 0; - let max_size = max_size as usize; + let max_size = usize::try_from(max_size).unwrap_or(usize::MAX); let mut total_uploaded = 0usize; while total_uploaded < max_size { @@ -532,7 +562,11 @@ impl GcsOperations for GcsClient { let mut rng = rand::rng(); let jitter_factor = rng.random::().mul_add(0.4, 0.8); - retry_delay = (retry_delay as f64 * jitter_factor) as u64; + retry_delay = Duration::from_millis(retry_delay) + 
.mul_f64(jitter_factor) + .as_millis() + .try_into() + .unwrap_or(u64::MAX); retry_count += 1; } diff --git a/nativelink-store/src/gcs_client/mocks.rs b/nativelink-store/src/gcs_client/mocks.rs index 5d593283e..d35dac758 100644 --- a/nativelink-store/src/gcs_client/mocks.rs +++ b/nativelink-store/src/gcs_client/mocks.rs @@ -334,7 +334,7 @@ impl GcsOperations for MockGcsOperations { if let Some(obj) = objects.get(&object_key) { let content = &obj.content; - let start_idx = start as usize; + let start_idx = usize::try_from(start).unwrap_or(usize::MAX); if start_idx > content.len() { return Err(make_err!( Code::OutOfRange, @@ -354,7 +354,7 @@ impl GcsOperations for MockGcsOperations { start )); } - core::cmp::min(e as usize, content.len()) + core::cmp::min(usize::try_from(e).unwrap_or(usize::MAX), content.len()) } else { content.len() }; @@ -439,7 +439,7 @@ impl GcsOperations for MockGcsOperations { }); // Handle the chunk data - let offset_usize = offset as usize; + let offset_usize = usize::try_from(offset).unwrap_or(usize::MAX); if mock_object.content.len() < offset_usize + data.len() { mock_object.content.resize(offset_usize + data.len(), 0); } @@ -483,7 +483,7 @@ impl GcsOperations for MockGcsOperations { // Read all data from the reader let mut buffer = Vec::new(); - let max_size = max_size as usize; + let max_size = usize::try_from(max_size).unwrap_or(usize::MAX); let mut total_read = 0usize; while total_read < max_size { diff --git a/nativelink-store/src/gcs_store.rs b/nativelink-store/src/gcs_store.rs index b8bcacc20..4334bbdd2 100644 --- a/nativelink-store/src/gcs_store.rs +++ b/nativelink-store/src/gcs_store.rs @@ -28,7 +28,9 @@ use nativelink_util::buf_channel::{DropCloserReadHalf, DropCloserWriteHalf}; use nativelink_util::health_utils::{HealthRegistryBuilder, HealthStatus, HealthStatusIndicator}; use nativelink_util::instant_wrapper::InstantWrapper; use nativelink_util::retry::{Retrier, RetryResult}; -use 
nativelink_util::store_trait::{RemoveItemCallback, StoreDriver, StoreKey, UploadSizeInfo}; +use nativelink_util::store_trait::{ + RemoveItemCallback, StoreDriver, StoreKey, StoreOptimizations, UploadSizeInfo, +}; use rand::Rng; use tokio::time::sleep; @@ -100,10 +102,10 @@ where let max_chunk_size = core::cmp::min(spec.resumable_chunk_size.unwrap_or(CHUNK_SIZE), CHUNK_SIZE); - let max_chunk_size = if max_chunk_size % CHUNK_MULTIPLE != 0 { - ((max_chunk_size + CHUNK_MULTIPLE / 2) / CHUNK_MULTIPLE) * CHUNK_MULTIPLE - } else { + let max_chunk_size = if max_chunk_size.is_multiple_of(CHUNK_MULTIPLE) { max_chunk_size + } else { + ((max_chunk_size + CHUNK_MULTIPLE / 2) / CHUNK_MULTIPLE) * CHUNK_MULTIPLE }; let max_retry_buffer_size = spec @@ -222,12 +224,25 @@ where .await } + fn optimized_for(&self, optimization: StoreOptimizations) -> bool { + matches!(optimization, StoreOptimizations::LazyExistenceOnSync) + } + async fn update( self: Pin<&Self>, digest: StoreKey<'_>, mut reader: DropCloserReadHalf, upload_size: UploadSizeInfo, ) -> Result<(), Error> { + if is_zero_digest(digest.borrow()) { + return reader.recv().await.and_then(|should_be_empty| { + should_be_empty + .is_empty() + .then_some(()) + .ok_or_else(|| make_err!(Code::Internal, "Zero byte hash not empty")) + }); + } + let object_path = self.make_object_path(&digest); reader.set_max_recent_data_size( @@ -238,7 +253,7 @@ where // For small files with exact size, we'll use simple upload if let UploadSizeInfo::ExactSize(size) = upload_size { if size < MIN_MULTIPART_SIZE { - let content = reader.consume(Some(size as usize)).await?; + let content = reader.consume(Some(usize::try_from(size)?)).await?; let client = &self.client; return self @@ -452,7 +467,7 @@ where fn register_remove_callback( self: Arc, - _callback: &Arc>, + _callback: Arc, ) -> Result<(), Error> { // As we're backed by GCS, this store doesn't actually drop stuff // so we can actually just ignore this diff --git a/nativelink-store/src/grpc_store.rs 
b/nativelink-store/src/grpc_store.rs index 8c895fa6d..8711f9ca3 100644 --- a/nativelink-store/src/grpc_store.rs +++ b/nativelink-store/src/grpc_store.rs @@ -53,7 +53,7 @@ use parking_lot::Mutex; use prost::Message; use tokio::time::sleep; use tonic::{Code, IntoRequest, Request, Response, Status, Streaming}; -use tracing::error; +use tracing::{error, trace, warn}; use uuid::Uuid; // This store is usually a pass-through store, but can also be used as a CAS store. Using it as an @@ -66,6 +66,8 @@ pub struct GrpcStore { store_type: nativelink_config::stores::StoreType, retrier: Retrier, connection_manager: ConnectionManager, + /// Per-RPC timeout. `Duration::ZERO` means disabled. + rpc_timeout: Duration, } impl GrpcStore { @@ -88,6 +90,8 @@ impl GrpcStore { endpoints.push(endpoint); } + let rpc_timeout = Duration::from_secs(spec.rpc_timeout_s); + Ok(Arc::new(Self { instance_name: spec.instance_name.clone(), store_type: spec.store_type, @@ -103,6 +107,7 @@ impl GrpcStore { spec.retry.clone(), jitter_fn, ), + rpc_timeout, })) } @@ -136,11 +141,23 @@ impl GrpcStore { ); let mut request = grpc_request.into_inner(); + + // Some builds (Chromium for example) do lots of empty requests for some reason, so shortcut them + if request.blob_digests.is_empty() { + return Ok(Response::new(FindMissingBlobsResponse { + missing_blob_digests: vec![], + })); + } + request.instance_name.clone_from(&self.instance_name); self.perform_request(request, |request| async move { let channel = self .connection_manager - .connection() + .connection(format!( + "find_missing_blobs: ({}) {:?}", + request.blob_digests.len(), + request.blob_digests + )) .await .err_tip(|| "in find_missing_blobs")?; ContentAddressableStorageClient::new(channel) @@ -165,7 +182,7 @@ impl GrpcStore { self.perform_request(request, |request| async move { let channel = self .connection_manager - .connection() + .connection("batch_update_blobs".into()) .await .err_tip(|| "in batch_update_blobs")?; 
ContentAddressableStorageClient::new(channel) @@ -190,7 +207,7 @@ impl GrpcStore { self.perform_request(request, |request| async move { let channel = self .connection_manager - .connection() + .connection("batch_read_blobs".into()) .await .err_tip(|| "in batch_read_blobs")?; ContentAddressableStorageClient::new(channel) @@ -215,7 +232,7 @@ impl GrpcStore { self.perform_request(request, |request| async move { let channel = self .connection_manager - .connection() + .connection(format!("get_tree: {:?}", request.root_digest)) .await .err_tip(|| "in get_tree")?; ContentAddressableStorageClient::new(channel) @@ -242,7 +259,7 @@ impl GrpcStore { ) -> Result> + use<>, Error> { let channel = self .connection_manager - .connection() + .connection(format!("read_internal: {}", request.resource_name)) .await .err_tip(|| "in read_internal")?; let mut response = ByteStreamClient::new(channel) @@ -294,51 +311,128 @@ impl GrpcStore { stream, ))); + let write_start = std::time::Instant::now(); + let instance_name = self.instance_name.clone(); + let rpc_timeout = self.rpc_timeout; + trace!( + instance_name = %instance_name, + rpc_timeout_s = rpc_timeout.as_secs(), + "GrpcStore::write: starting ByteStream write", + ); + let mut attempt: u32 = 0; let result = self .retrier - .retry(unfold(local_state, move |local_state| async move { - // The client write may occur on a separate thread and - // therefore in order to share the state with it we have to - // wrap it in a Mutex and retrieve it after the write - // has completed. There is no way to get the value back - // from the client. - let result = self - .connection_manager - .connection() - .and_then(|channel| async { - ByteStreamClient::new(channel) - .write(WriteStateWrapper::new(local_state.clone())) - .await - .err_tip(|| "in GrpcStore::write") - }) - .await; - - // Get the state back from StateWrapper, this should be - // uncontended since write has returned. 
- let mut local_state_locked = local_state.lock(); - - let result = local_state_locked - .take_read_stream_error() - .map(|err| RetryResult::Err(err.append("Where read_stream_error was set"))) - .unwrap_or_else(|| { - // No stream error, handle the original result - match result { - Ok(response) => RetryResult::Ok(response), - Err(err) => { - if local_state_locked.can_resume() { - local_state_locked.resume(); - RetryResult::Retry(err) - } else { - RetryResult::Err(err.append("Retry is not possible")) - } + .retry(unfold(local_state, move |local_state| { + attempt += 1; + let instance_name = instance_name.clone(); + async move { + // The client write may occur on a separate thread and + // therefore in order to share the state with it we have to + // wrap it in a Mutex and retrieve it after the write + // has completed. There is no way to get the value back + // from the client. + trace!( + instance_name = %instance_name, + attempt, + "GrpcStore::write: requesting connection from pool", + ); + let conn_start = std::time::Instant::now(); + let rpc_fut = self.connection_manager.connection("write".into()).and_then( + |channel| { + let conn_elapsed = conn_start.elapsed(); + let instance_for_rpc = instance_name.clone(); + let conn_elapsed_ms = + u64::try_from(conn_elapsed.as_millis()).unwrap_or(u64::MAX); + trace!( + instance_name = %instance_for_rpc, + conn_elapsed_ms, + "GrpcStore::write: got connection, starting ByteStream.Write RPC", + ); + let rpc_start = std::time::Instant::now(); + let local_state_for_rpc = local_state.clone(); + async move { + let res = ByteStreamClient::new(channel) + .write(WriteStateWrapper::new(local_state_for_rpc)) + .await + .err_tip(|| "in GrpcStore::write"); + let rpc_elapsed_ms = u64::try_from(rpc_start.elapsed().as_millis()) + .unwrap_or(u64::MAX); + trace!( + instance_name = %instance_for_rpc, + rpc_elapsed_ms, + success = res.is_ok(), + "GrpcStore::write: ByteStream.Write RPC returned", + ); + res + } + }, + ); + + let result = if 
rpc_timeout > Duration::ZERO { + match tokio::time::timeout(rpc_timeout, rpc_fut).await { + Ok(res) => res, + Err(_elapsed) => { + warn!( + instance_name = %instance_name, + attempt, + rpc_timeout_s = rpc_timeout.as_secs(), + "GrpcStore::write: per-RPC timeout exceeded, cancelling", + ); + #[allow(unused_qualifications)] + Err(nativelink_error::make_err!( + nativelink_error::Code::DeadlineExceeded, + "GrpcStore::write RPC timed out after {}s", + rpc_timeout.as_secs() + )) } } - }); + } else { + rpc_fut.await + }; + + // Get the state back from StateWrapper, this should be + // uncontended since write has returned. + let mut local_state_locked = local_state.lock(); + + let result = local_state_locked + .take_read_stream_error() + .map(|err| RetryResult::Err(err.append("Where read_stream_error was set"))) + .unwrap_or_else(|| { + // No stream error, handle the original result + match result { + Ok(response) => RetryResult::Ok(response), + Err(ref err) => { + warn!( + instance_name = %instance_name, + attempt, + ?err, + can_resume = local_state_locked.can_resume(), + "GrpcStore::write: RPC failed", + ); + if local_state_locked.can_resume() { + local_state_locked.resume(); + RetryResult::Retry(err.clone()) + } else { + RetryResult::Err( + err.clone().append("Retry is not possible"), + ) + } + } + } + }); - drop(local_state_locked); - Some((result, local_state)) + drop(local_state_locked); + Some((result, local_state)) + } })) .await?; + + let total_elapsed_ms = u64::try_from(write_start.elapsed().as_millis()).unwrap_or(u64::MAX); + trace!( + instance_name = %self.instance_name, + total_elapsed_ms, + "GrpcStore::write: completed successfully", + ); Ok(result) } @@ -364,7 +458,7 @@ impl GrpcStore { self.perform_request(request, |request| async move { let channel = self .connection_manager - .connection() + .connection(format!("query_write_status: {}", request.resource_name)) .await .err_tip(|| "in query_write_status")?; ByteStreamClient::new(channel) @@ -384,7 +478,7 @@ 
impl GrpcStore { self.perform_request(request, |request| async move { let channel = self .connection_manager - .connection() + .connection(format!("get_action_result: {:?}", request.action_digest)) .await .err_tip(|| "in get_action_result")?; ActionCacheClient::new(channel) @@ -404,7 +498,7 @@ impl GrpcStore { self.perform_request(request, |request| async move { let channel = self .connection_manager - .connection() + .connection(format!("update_action_result: {:?}", request.action_digest)) .await .err_tip(|| "in update_action_result")?; ActionCacheClient::new(channel) @@ -596,6 +690,12 @@ impl StoreDriver for GrpcStore { digest.packed_hash(), digest.size_bytes(), ); + trace!( + resource_name = %resource_name, + digest_hash = %digest.packed_hash(), + digest_size = digest.size_bytes(), + "GrpcStore::update: starting upload for digest", + ); let local_state = LocalState { resource_name, reader, @@ -770,7 +870,7 @@ impl StoreDriver for GrpcStore { fn register_remove_callback( self: Arc, - _callback: &Arc>, + _callback: Arc, ) -> Result<(), Error> { Err(Error::new( Code::Internal, diff --git a/nativelink-store/src/lib.rs b/nativelink-store/src/lib.rs index 72b7f46d6..4a367ee33 100644 --- a/nativelink-store/src/lib.rs +++ b/nativelink-store/src/lib.rs @@ -39,3 +39,4 @@ pub mod shard_store; pub mod size_partitioning_store; pub mod store_manager; pub mod verify_store; +pub mod metrics_store; diff --git a/nativelink-store/src/memory_store.rs b/nativelink-store/src/memory_store.rs index f8bdde52f..22391596f 100644 --- a/nativelink-store/src/memory_store.rs +++ b/nativelink-store/src/memory_store.rs @@ -31,7 +31,7 @@ use nativelink_util::health_utils::{ HealthRegistryBuilder, HealthStatusIndicator, default_health_status_indicator, }; use nativelink_util::store_trait::{ - RemoveItemCallback, StoreDriver, StoreKey, StoreKeyBorrow, UploadSizeInfo, + RemoveItemCallback, StoreDriver, StoreKey, StoreKeyBorrow, StoreOptimizations, UploadSizeInfo, }; use 
crate::callback_utils::RemoveItemCallbackHolder; @@ -61,7 +61,13 @@ impl LenEntry for BytesWrapper { #[derive(Debug, MetricsComponent)] pub struct MemoryStore { #[metric(group = "evicting_map")] - evicting_map: EvictingMap, BytesWrapper, SystemTime>, + evicting_map: EvictingMap< + StoreKeyBorrow, + StoreKey<'static>, + BytesWrapper, + SystemTime, + RemoveItemCallbackHolder, + >, } impl MemoryStore { @@ -75,8 +81,8 @@ impl MemoryStore { /// Returns the number of key-value pairs that are currently in the the cache. /// Function is not for production code paths. - pub async fn len_for_test(&self) -> usize { - self.evicting_map.len_for_test().await + pub fn len_for_test(&self) -> usize { + self.evicting_map.len_for_test() } pub async fn remove_entry(&self, key: StoreKey<'_>) -> bool { @@ -120,8 +126,7 @@ impl StoreDriver for MemoryStore { ); let iterations = self .evicting_map - .range(range, move |key, _value| handler(key.borrow())) - .await; + .range(range, move |key, _value| handler(key.borrow())); Ok(iterations) } @@ -149,6 +154,27 @@ impl StoreDriver for MemoryStore { Ok(()) } + fn optimized_for(&self, optimization: StoreOptimizations) -> bool { + optimization == StoreOptimizations::SubscribesToUpdateOneshot + } + + async fn update_oneshot(self: Pin<&Self>, key: StoreKey<'_>, data: Bytes) -> Result<(), Error> { + // Fast path: Direct insertion without channel overhead. + // We still need to copy the data to prevent holding references to larger buffers. 
+ let final_buffer = if data.is_empty() { + data + } else { + let mut new_buffer = BytesMut::with_capacity(data.len()); + new_buffer.extend_from_slice(&data[..]); + new_buffer.freeze() + }; + + self.evicting_map + .insert(key.into_owned().into(), BytesWrapper(final_buffer)) + .await; + Ok(()) + } + async fn get_part( self: Pin<&Self>, key: StoreKey<'_>, @@ -208,10 +234,10 @@ impl StoreDriver for MemoryStore { fn register_remove_callback( self: Arc, - callback: &Arc>, + callback: Arc, ) -> Result<(), Error> { self.evicting_map - .add_remove_callback(Box::new(RemoveItemCallbackHolder::new(callback))); + .add_remove_callback(RemoveItemCallbackHolder::new(callback)); Ok(()) } } diff --git a/nativelink-store/src/metrics_store.rs b/nativelink-store/src/metrics_store.rs new file mode 100644 index 000000000..fe3a363f1 --- /dev/null +++ b/nativelink-store/src/metrics_store.rs @@ -0,0 +1,180 @@ +use crate::filesystem_store::FilesystemStore; +use async_trait::async_trait; +use nativelink_error::Error; +use nativelink_metric::MetricsComponent; +use nativelink_util::buf_channel::{DropCloserReadHalf, DropCloserWriteHalf}; +use nativelink_util::health_utils::{HealthStatus, HealthStatusIndicator}; +use nativelink_util::metrics::{StoreMetricAttrs, StoreType, STORE_METRICS}; +use nativelink_util::store_trait::{ + RemoveItemCallback, Store, StoreDriver, StoreKey, StoreLike, UploadSizeInfo, +}; +use std::borrow::Cow; +use std::pin::Pin; +use std::sync::Arc; +use std::time::Instant; + +#[derive(MetricsComponent, Debug)] +pub struct MetricsStore { + inner: Arc, + attrs: Arc, +} + +impl MetricsStore { + #[must_use] + pub fn new(inner: Arc, name: &str, store_type: StoreType) -> Arc { + let attrs = Arc::new(StoreMetricAttrs::new_with_name(store_type, name)); + if let Some(fs_store) = inner.downcast_ref::(None) { + #[derive(Debug)] + struct EvictionCallback { + attrs: Arc, + } + impl RemoveItemCallback for EvictionCallback { + fn callback<'a>( + &'a self, + _store_key: StoreKey<'a>, + ) -> 
Pin + Send + 'a>> { + Box::pin(async { STORE_METRICS.eviction_count.add(1, self.attrs.eviction()) }) + } + } + if let Err(e) = inner.register_remove_callback(Arc::new(EvictionCallback { + attrs: attrs.clone(), + })) { + tracing::error!("Failed to register remove callback: {:?}", e); + } + + STORE_METRICS.store_size.record(fs_store.get_len(), &attrs.store_size()); + } + + Arc::new(Self { + inner: inner.clone(), + attrs: attrs.clone(), + }) + } +} + +#[async_trait] +impl StoreDriver for MetricsStore { + async fn has_with_results( + self: Pin<&Self>, + digests: &[StoreKey<'_>], + results: &mut [Option], + ) -> Result<(), Error> { + let start = Instant::now(); + let result = self.inner.has_with_results(digests, results).await; + let duration_ms = start.elapsed().as_millis(); + for res in results { + if res.is_some() { + STORE_METRICS + .store_operations + .add(1, &self.attrs.cache_hit()); + STORE_METRICS + .store_operation_duration + .record(duration_ms as f64, &self.attrs.cache_hit()); + } else { + STORE_METRICS + .store_operations + .add(1, &self.attrs.cache_miss()); + STORE_METRICS + .store_operation_duration + .record(duration_ms as f64, &self.attrs.cache_miss()); + } + } + + result + } + + async fn update( + self: Pin<&Self>, + key: StoreKey<'_>, + reader: DropCloserReadHalf, + upload_size: UploadSizeInfo, + ) -> Result<(), Error> { + let start = Instant::now(); + let result = self.inner.update(key, reader, upload_size).await; + let duration_ms = start.elapsed().as_millis(); + if result.is_ok() { + STORE_METRICS + .store_operations + .add(1, &self.attrs.write_success()); + STORE_METRICS + .store_operation_duration + .record(duration_ms as f64, &self.attrs.write_success()); + } else { + STORE_METRICS + .store_operations + .add(1, &self.attrs.write_error()); + STORE_METRICS + .store_operation_duration + .record(duration_ms as f64, &self.attrs.write_error()); + } + + if let Some(fs_store) = self.inner.downcast_ref::(None) { + 
STORE_METRICS.store_size.record(fs_store.get_len(), &self.attrs.store_size()); + } + + result + } + + async fn get_part( + self: Pin<&Self>, + key: StoreKey<'_>, + writer: &mut DropCloserWriteHalf, + offset: u64, + length: Option, + ) -> Result<(), Error> { + let start = Instant::now(); + let result = self.inner.get_part(key, writer, offset, length).await; + let duration_ms = start.elapsed().as_millis(); + if result.is_ok() { + STORE_METRICS + .store_operations + .add(1, &self.attrs.read_success()); + STORE_METRICS + .store_operation_duration + .record(duration_ms as f64, &self.attrs.read_success()); + } else { + STORE_METRICS + .store_operations + .add(1, &self.attrs.read_error()); + STORE_METRICS + .store_operation_duration + .record(duration_ms as f64, &self.attrs.read_error()); + } + + result + } + + fn inner_store(&self, digest: Option) -> &'_ dyn StoreDriver { + self.inner.inner_store(digest) + } + + fn as_any<'a>(&'a self) -> &'a (dyn core::any::Any + Sync + Send + 'static) { + self + } + + fn as_any_arc(self: Arc) -> Arc { + self + } + + fn register_remove_callback( + self: Arc, + callback: Arc, + ) -> Result<(), Error> { + self.inner.clone().register_remove_callback(callback) + } +} + +#[async_trait] +impl HealthStatusIndicator for MetricsStore { + fn get_name(&self) -> &'static str { + "MetricsStore" + } + + async fn check_health(&self, _namespace: Cow<'static, str>) -> HealthStatus { + self.inner.check_health(_namespace).await + } +} + +fn should_add_remove_callback(store: Arc) -> bool { + store.downcast_ref::(None).is_some() +} diff --git a/nativelink-store/src/mongo_store.rs b/nativelink-store/src/mongo_store.rs index 9742e002d..2110a20b7 100644 --- a/nativelink-store/src/mongo_store.rs +++ b/nativelink-store/src/mongo_store.rs @@ -518,7 +518,7 @@ impl StoreDriver for ExperimentalMongoStore { } }; - let offset = offset as usize; + let offset = usize::try_from(offset).unwrap_or(usize::MAX); let data_len = data.len(); if offset > data_len { @@ -531,7 
+531,10 @@ impl StoreDriver for ExperimentalMongoStore { } let end = if let Some(len) = length { - cmp::min(offset + len as usize, data_len) + cmp::min( + offset.saturating_add(usize::try_from(len).unwrap_or(usize::MAX)), + data_len, + ) } else { data_len }; @@ -576,7 +579,7 @@ impl StoreDriver for ExperimentalMongoStore { fn register_remove_callback( self: Arc, - _callback: &Arc>, + _callback: Arc, ) -> Result<(), Error> { // drop because we don't remove anything from Mongo Ok(()) @@ -797,13 +800,6 @@ impl ExperimentalMongoSubscriptionManager { impl SchedulerSubscriptionManager for ExperimentalMongoSubscriptionManager { type Subscription = ExperimentalMongoSubscription; - fn notify_for_test(&self, value: String) { - let subscribed_keys_mux = self.subscribed_keys.read(); - subscribed_keys_mux - .common_prefix_values(&value) - .for_each(ExperimentalMongoSubscriptionPublisher::notify); - } - fn subscribe(&self, key: K) -> Result where K: SchedulerStoreKeyProvider, @@ -846,7 +842,9 @@ impl SchedulerSubscriptionManager for ExperimentalMongoSubscriptionManager { impl SchedulerStore for ExperimentalMongoStore { type SubscriptionManager = ExperimentalMongoSubscriptionManager; - fn subscription_manager(&self) -> Result, Error> { + async fn subscription_manager( + &self, + ) -> Result, Error> { let mut subscription_manager = self.subscription_manager.lock(); if let Some(subscription_manager) = &*subscription_manager { Ok(subscription_manager.clone()) @@ -1100,4 +1098,11 @@ impl SchedulerStore for ExperimentalMongoStore { make_err!(Code::Internal, "Failed to decode in get_and_decode: {e}") })?)) } + + async fn count_by_index(&self, index: Vec) -> Result, Error> + where + K: SchedulerIndexProvider + Send + { + Ok(vec![0; index.len()]) + } } diff --git a/nativelink-store/src/noop_store.rs b/nativelink-store/src/noop_store.rs index 9f838ff9c..9c749750b 100644 --- a/nativelink-store/src/noop_store.rs +++ b/nativelink-store/src/noop_store.rs @@ -52,7 +52,9 @@ impl StoreDriver for 
NoopStore { _keys: &[StoreKey<'_>], results: &mut [Option], ) -> Result<(), Error> { - results.iter_mut().for_each(|r| *r = None); + for result in results.iter_mut() { + *result = None; + } Ok(()) } @@ -97,7 +99,7 @@ impl StoreDriver for NoopStore { fn register_remove_callback( self: Arc, - _callback: &Arc>, + _callback: Arc, ) -> Result<(), Error> { // does nothing, so drop Ok(()) diff --git a/nativelink-store/src/ontap_s3_existence_cache_store.rs b/nativelink-store/src/ontap_s3_existence_cache_store.rs index d0139c752..a78d2d35a 100644 --- a/nativelink-store/src/ontap_s3_existence_cache_store.rs +++ b/nativelink-store/src/ontap_s3_existence_cache_store.rs @@ -97,18 +97,23 @@ where } } -#[async_trait] impl RemoveItemCallback for OntapS3CacheCallback where I: InstantWrapper, NowFn: Fn() -> I + Send + Sync + Unpin + Clone + 'static, { - async fn callback(&self, store_key: &StoreKey<'_>) { + fn callback<'a>( + &'a self, + store_key: StoreKey<'a>, + ) -> Pin + Send + 'a>> { let cache = self.cache.upgrade(); if let Some(local_cache) = cache { - local_cache.callback(store_key).await; + Box::pin(async move { + local_cache.callback(store_key).await; + }) } else { debug!("Cache dropped, so not doing callback"); + Box::pin(async {}) } } } @@ -363,9 +368,7 @@ where let other_ref = Arc::downgrade(&cache); cache .inner_store - .register_remove_callback(&Arc::new(Box::new(OntapS3CacheCallback { - cache: other_ref, - })))?; + .register_remove_callback(Arc::new(OntapS3CacheCallback { cache: other_ref }))?; // Try to load existing cache file if let Ok(contents) = fs::read_to_string(&spec.index_path).await { @@ -532,21 +535,25 @@ where fn register_remove_callback( self: Arc, - callback: &Arc>, + callback: Arc, ) -> Result<(), Error> { self.inner_store.register_remove_callback(callback) } } -#[async_trait] impl RemoveItemCallback for OntapS3ExistenceCache where I: InstantWrapper, NowFn: Fn() -> I + Send + Sync + Unpin + Clone + 'static, { - async fn callback(&self, store_key: 
&StoreKey<'_>) { - let new_key = store_key.borrow(); - self.digests.write().await.remove(&new_key.into_digest()); + fn callback<'a>( + &'a self, + store_key: StoreKey<'a>, + ) -> Pin + Send + 'a>> { + let digest = store_key.borrow().into_digest(); + Box::pin(async move { + self.digests.write().await.remove(&digest); + }) } } diff --git a/nativelink-store/src/ontap_s3_store.rs b/nativelink-store/src/ontap_s3_store.rs index ea08ba7b9..ecec6bd55 100644 --- a/nativelink-store/src/ontap_s3_store.rs +++ b/nativelink-store/src/ontap_s3_store.rs @@ -50,7 +50,8 @@ use nativelink_util::retry::{Retrier, RetryResult}; use nativelink_util::store_trait::{RemoveItemCallback, StoreDriver, StoreKey, UploadSizeInfo}; use parking_lot::Mutex; use rustls::{ClientConfig, RootCertStore}; -use rustls_pemfile::certs as extract_certs; +use rustls_pki_types::CertificateDer; +use rustls_pki_types::pem::PemObject; use sha2::{Digest, Sha256}; use tokio::time::sleep; use tracing::{Level, event, warn}; @@ -73,6 +74,8 @@ const DEFAULT_MAX_RETRY_BUFFER_PER_REQUEST: usize = 20 * 1024 * 1024; // 20MB // Default limit for concurrent part uploads per multipart upload const DEFAULT_MULTIPART_MAX_CONCURRENT_UPLOADS: usize = 10; +type RemoveCallback = Arc; + #[derive(Debug, MetricsComponent)] pub struct OntapS3Store { s3_client: Arc, @@ -89,7 +92,7 @@ pub struct OntapS3Store { #[metric(help = "The number of concurrent uploads allowed for multipart uploads")] multipart_max_concurrent_uploads: usize, - remove_callbacks: Arc>>>>, + remove_callbacks: Mutex>, } pub fn load_custom_certs(cert_path: &str) -> Result, Error> { @@ -98,13 +101,11 @@ pub fn load_custom_certs(cert_path: &str) -> Result, Error> { // Create a BufReader from the cert file let mut cert_reader = BufReader::new( File::open(cert_path) - .map_err(|e| make_err!(Code::Internal, "Failed to open CA certificate file: {e:?}"))?, + .err_tip(|| format!("Failed to open CA certificate file {cert_path}"))?, ); // Parse certificates - let certs = 
extract_certs(&mut cert_reader) - .collect::, _>>() - .map_err(|e| make_err!(Code::Internal, "Failed to parse certificates: {e:?}"))?; + let certs = CertificateDer::pem_reader_iter(&mut cert_reader).collect::, _>>()?; // Add each certificate to the root store for cert in certs { @@ -215,7 +216,7 @@ where .common .multipart_max_concurrent_uploads .unwrap_or(DEFAULT_MULTIPART_MAX_CONCURRENT_UPLOADS), - remove_callbacks: Arc::new(Mutex::new(vec![])), + remove_callbacks: Mutex::new(vec![]), })) } @@ -244,15 +245,15 @@ where let now_s = (self.now_fn)().unix_timestamp() as i64; if last_modified.secs() + self.consider_expired_after_s <= now_s { - let store_key = local_digest.borrow(); - let remove_callbacks = self.remove_callbacks.lock_arc(); - let callbacks = remove_callbacks - .iter() - .map(|callback| callback.callback(&store_key)) - .collect::>(); - for callback in callbacks { - callback.await; - } + let remove_callbacks = self.remove_callbacks.lock().clone(); + let mut callbacks: FuturesUnordered<_> = remove_callbacks + .into_iter() + .map(|callback| { + let store_key = local_digest.borrow(); + async move { callback.callback(store_key).await } + }) + .collect(); + while callbacks.next().await.is_some() {} return Some((RetryResult::Ok(None), state)); } } @@ -768,9 +769,9 @@ where fn register_remove_callback( self: Arc, - callback: &Arc>, + callback: Arc, ) -> Result<(), Error> { - self.remove_callbacks.lock_arc().push(callback.clone()); + self.remove_callbacks.lock().push(callback); Ok(()) } } diff --git a/nativelink-store/src/redis_store.rs b/nativelink-store/src/redis_store.rs index 34e082b61..07c013846 100644 --- a/nativelink-store/src/redis_store.rs +++ b/nativelink-store/src/redis_store.rs @@ -1,4 +1,4 @@ -// Copyright 2024 The NativeLink Authors. All rights reserved. +// Copyright 2024-2026 The NativeLink Authors. All rights reserved. 
// // Licensed under the Functional Source License, Version 1.1, Apache 2.0 Future License (the "License"); // you may not use this file except in compliance with the License. @@ -13,33 +13,27 @@ // limitations under the License. use core::cmp; +use core::fmt::Debug; +use core::marker::PhantomData; use core::ops::{Bound, RangeBounds}; use core::pin::Pin; +use core::str::FromStr; use core::time::Duration; use std::borrow::Cow; +use std::collections::HashSet; use std::sync::{Arc, Weak}; +use std::time::Instant; use async_trait::async_trait; use bytes::Bytes; use const_format::formatcp; -use fred::clients::{Pool as RedisPool, SubscriberClient}; -use fred::interfaces::{ClientLike, KeysInterface, PubsubInterface}; -use fred::prelude::{Client, EventInterface, HashesInterface, RediSearchInterface}; -use fred::types::config::{ - Config as RedisConfig, ConnectionConfig, PerformanceConfig, ReconnectPolicy, UnresponsiveConfig, -}; -use fred::types::redisearch::{ - AggregateOperation, FtAggregateOptions, FtCreateOptions, IndexKind, Load, SearchField, - SearchSchema, SearchSchemaKind, WithCursor, -}; -use fred::types::scan::Scanner; -use fred::types::scripts::Script; -use fred::types::{Builder, Key as RedisKey, Map as RedisMap, SortOrder, Value as RedisValue}; use futures::stream::FuturesUnordered; -use futures::{FutureExt, Stream, StreamExt, TryFutureExt, TryStreamExt, future}; +use futures::{Stream, StreamExt, TryFutureExt, TryStreamExt, future}; +use itertools::izip; use nativelink_config::stores::{RedisMode, RedisSpec}; use nativelink_error::{Code, Error, ResultExt, make_err, make_input_err}; use nativelink_metric::MetricsComponent; +use nativelink_redis_tester::SubscriptionManagerNotify; use nativelink_util::buf_channel::{DropCloserReadHalf, DropCloserWriteHalf}; use nativelink_util::health_utils::{HealthRegistryBuilder, HealthStatus, HealthStatusIndicator}; use nativelink_util::spawn; @@ -51,13 +45,27 @@ use nativelink_util::store_trait::{ use 
nativelink_util::task::JoinHandleDropGuard; use parking_lot::{Mutex, RwLock}; use patricia_tree::StringPatriciaMap; +use redis::aio::{ConnectionLike, ConnectionManager, ConnectionManagerConfig}; +use redis::cluster::ClusterClient; +use redis::cluster_async::ClusterConnection; +use redis::sentinel::{SentinelClient, SentinelNodeConnectionInfo, SentinelServerType}; +use redis::{ + AsyncCommands, AsyncIter, Client, IntoConnectionInfo, PushInfo, ScanOptions, Script, Value, + pipe, +}; use tokio::select; -use tokio::time::sleep; -use tracing::{error, info, warn}; +use tokio::sync::mpsc::{UnboundedReceiver, UnboundedSender, unbounded_channel}; +use tokio::sync::{OwnedSemaphorePermit, Semaphore}; +use tokio::time::{sleep, timeout}; +use tokio_stream::wrappers::UnboundedReceiverStream; +use tracing::{debug, error, info, trace, warn}; +use url::Url; use uuid::Uuid; use crate::cas_utils::is_zero_digest; -use crate::redis_utils::ft_aggregate; +use crate::redis_utils::{ + FtAggregateCursor, FtAggregateOptions, FtCreateOptions, SearchSchema, ft_aggregate, ft_create, +}; /// The default size of the read chunk when reading data from Redis. /// Note: If this changes it should be updated in the config documentation. @@ -70,13 +78,6 @@ const DEFAULT_CONNECTION_POOL_SIZE: usize = 3; /// The default delay between retries if not specified. /// Note: If this changes it should be updated in the config documentation. const DEFAULT_RETRY_DELAY: f32 = 0.1; -/// The amount of jitter to add to the retry delay if not specified. -/// Note: If this changes it should be updated in the config documentation. -const DEFAULT_RETRY_JITTER: f32 = 0.5; - -/// The default maximum capacity of the broadcast channel if not specified. -/// Note: If this changes it should be updated in the config documentation. -const DEFAULT_BROADCAST_CHANNEL_CAPACITY: usize = 4096; /// The default connection timeout in milliseconds if not specified. /// Note: If this changes it should be updated in the config documentation. 
@@ -88,17 +89,216 @@ const DEFAULT_COMMAND_TIMEOUT_MS: u64 = 10_000; /// The default maximum number of chunk uploads per update. /// Note: If this changes it should be updated in the config documentation. -const DEFAULT_MAX_CHUNK_UPLOADS_PER_UPDATE: usize = 10; +pub const DEFAULT_MAX_CHUNK_UPLOADS_PER_UPDATE: usize = 10; /// The default COUNT value passed when scanning keys in Redis. /// Note: If this changes it should be updated in the config documentation. -const DEFAULT_SCAN_COUNT: u32 = 10_000; +const DEFAULT_SCAN_COUNT: usize = 10_000; + +/// The default COUNT value passed when scanning search indexes +/// Note: If this changes it should be updated in the config documentation. +pub const DEFAULT_MAX_COUNT_PER_CURSOR: u64 = 1_500; + +const DEFAULT_CLIENT_PERMITS: usize = 500; + +/// A wrapper around Redis to allow it to be reconnected. +pub trait RedisManager +where + C: ConnectionLike + Clone, +{ + /// Get a connection manager and a unique identifier for this connection + /// which may be used to issue a reconnect later. + fn get_connection(&self) -> impl Future> + Send; + + /// Reconnect if the uuid matches the uuid returned from `get_connection()`. + fn reconnect(&self, uuid: Uuid) -> impl Future> + Send; + + /// Get an invocation of the update version script for a given `key`. + fn update_script(&self, key: &str) -> redis::ScriptInvocation<'_>; + + /// Configure the connection to have a psubscribe on it and perform the + /// subscription on reconnect. + fn psubscribe(&self, pattern: &str) -> impl Future> + Send; +} + +#[derive(Debug)] +pub struct ClusterRedisManager +where + C: ConnectionLike + Clone, +{ + /// A constant Uuid, we never reconnect. + uuid: Uuid, + + /// Redis script used to update a value in redis if the version matches. + /// This is done by incrementing the version number and then setting the new + /// data only if the version number matches the existing version number. 
+ update_if_version_matches_script: Script, + + /// The client pool connecting to the backing Redis instance(s). + connection_manager: C, +} + +impl ClusterRedisManager +where + C: ConnectionLike + Clone, +{ + pub async fn new(mut connection_manager: C) -> Result { + let update_if_version_matches_script = Script::new(LUA_VERSION_SET_SCRIPT); + update_if_version_matches_script + .load_async(&mut connection_manager) + .await?; + Ok(Self { + uuid: Uuid::new_v4(), + update_if_version_matches_script, + connection_manager, + }) + } +} + +impl RedisManager for ClusterRedisManager +where + C: ConnectionLike + Clone + Send + Sync, +{ + fn get_connection(&self) -> impl Future> + Send { + future::ready(Ok((self.connection_manager.clone(), self.uuid))) + } + + fn reconnect(&self, _uuid: Uuid) -> impl Future> + Send { + self.get_connection() + } + + fn update_script(&self, key: &str) -> redis::ScriptInvocation<'_> { + self.update_if_version_matches_script.key(key) + } + + fn psubscribe(&self, _pattern: &str) -> impl Future> + Send { + // This is a no-op for cluster connections. + future::ready(Ok(())) + } +} + +type RedisConnectFuture = dyn Future> + Send; +type RedisConnectFn = dyn Fn() -> Pin>> + Send + Sync; + +pub struct StandardRedisManager +where + C: ConnectionLike + Clone, +{ + /// Function used to re-connect to Redis. + connect_func: Box>, + + /// Redis script used to update a value in redis if the version matches. + /// This is done by incrementing the version number and then setting the new + /// data only if the version number matches the existing version number. + update_if_version_matches_script: Script, + + /// The client pool connecting to the backing Redis instance(s) and a Uuid + /// for this connection in order to avoid multiple reconnection attempts. + connection_manager: tokio::sync::RwLock<(C, Uuid)>, + + /// A list of subscription that should be performed on reconnect. 
+ subscriptions: Mutex>, +} + +impl Debug for StandardRedisManager +where + C: ConnectionLike + Clone, +{ + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + f.debug_struct("StandardRedisManager") + .field( + "update_if_version_matches_script", + &self.update_if_version_matches_script, + ) + .field("subscriptions", &self.subscriptions) + .finish() + } +} + +impl StandardRedisManager +where + C: ConnectionLike + Clone + Send + Sync, +{ + async fn configure(&self, connection_manager: &mut C) -> Result<(), Error> { + self.update_if_version_matches_script + .load_async(connection_manager) + .await?; + Ok(()) + } + + async fn new(connect_func: Box>) -> Result { + let connection_manager = connect_func().await?; + let update_if_version_matches_script = Script::new(LUA_VERSION_SET_SCRIPT); + let connection = Self { + connect_func, + update_if_version_matches_script, + connection_manager: tokio::sync::RwLock::new((connection_manager, Uuid::new_v4())), + subscriptions: Mutex::new(HashSet::new()), + }; + { + let mut connection_manager = connection.connection_manager.write().await; + connection.configure(&mut connection_manager.0).await?; + } + Ok(connection) + } +} + +impl RedisManager for StandardRedisManager { + async fn get_connection(&self) -> Result<(ConnectionManager, Uuid), Error> { + Ok(self.connection_manager.read().await.clone()) + } + + async fn reconnect(&self, uuid: Uuid) -> Result<(ConnectionManager, Uuid), Error> { + let mut guard = self.connection_manager.write().await; + if guard.1 != uuid { + let connection = guard.clone(); + drop(guard); + return Ok(connection); + } + let mut connection_manager = (self.connect_func)().await?; + let uuid = Uuid::new_v4(); + self.configure(&mut connection_manager).await?; + let subscriptions = { + let guard = self.subscriptions.lock(); + guard.iter().map(Clone::clone).collect::>() + }; + for subscription in subscriptions { + connection_manager.psubscribe(&subscription).await?; + } + *guard = 
(connection_manager.clone(), uuid); + Ok((connection_manager, uuid)) + } + + fn update_script(&self, key: &str) -> redis::ScriptInvocation<'_> { + self.update_if_version_matches_script.key(key) + } + + async fn psubscribe(&self, pattern: &str) -> Result<(), Error> { + let mut connection = self.get_connection().await?.0; + let new_subscription = self.subscriptions.lock().insert(String::from(pattern)); + if new_subscription { + let result = connection.psubscribe(pattern).await; + if result.is_err() { + self.subscriptions.lock().remove(pattern); + } + result?; + } + Ok(()) + } +} /// A [`StoreDriver`] implementation that uses Redis as a backing store. -#[derive(Debug, MetricsComponent)] -pub struct RedisStore { +#[derive(MetricsComponent)] +pub struct RedisStore +where + C: ConnectionLike + Clone, + M: RedisManager, +{ /// The client pool connecting to the backing Redis instance(s). - client_pool: RedisPool, + connection_manager: M, + + /// The underlying connection type in the connection manager. + _connection_type: PhantomData, /// A channel to publish updates to when a key is added, removed, or modified. #[metric( @@ -106,10 +306,6 @@ pub struct RedisStore { )] pub_sub_channel: Option, - /// A redis client for managing subscriptions. - /// TODO: This should be moved into the store in followups once a standard use pattern has been determined. - subscriber_client: SubscriberClient, - /// A function used to generate names for temporary keys. temp_name_generator_fn: fn() -> String, @@ -125,188 +321,126 @@ pub struct RedisStore { /// The maximum number of chunk uploads per update. /// This is used to limit the number of chunk uploads per update to prevent + /// overloading when uploading large blocks of data #[metric(help = "The maximum number of chunk uploads per update")] max_chunk_uploads_per_update: usize, /// The COUNT value passed when scanning keys in Redis. /// This is used to hint the amount of work that should be done per response. 
#[metric(help = "The COUNT value passed when scanning keys in Redis")] - scan_count: u32, + scan_count: usize, - /// Redis script used to update a value in redis if the version matches. - /// This is done by incrementing the version number and then setting the new data - /// only if the version number matches the existing version number. - update_if_version_matches_script: Script, + /// The COUNT value used with search indexes + #[metric(help = "The maximum number of results to return per cursor")] + max_count_per_cursor: u64, /// A manager for subscriptions to keys in Redis. - subscription_manager: Mutex>>, -} - -impl RedisStore { - /// Create a new `RedisStore` from the given configuration. - pub fn new(mut spec: RedisSpec) -> Result, Error> { - if spec.addresses.is_empty() { - return Err(make_err!( - Code::InvalidArgument, - "No addresses were specified in redis store configuration." - )); - } - let [addr] = spec.addresses.as_slice() else { - return Err(make_err!( - Code::Unimplemented, - "Connecting directly to multiple redis nodes in a cluster is currently unsupported. Please specify a single URL to a single node, and nativelink will use cluster discover to find the other nodes." 
- )); - }; - let redis_config = match spec.mode { - RedisMode::Cluster => RedisConfig::from_url_clustered(addr), - RedisMode::Sentinel => RedisConfig::from_url_sentinel(addr), - RedisMode::Standard => RedisConfig::from_url_centralized(addr), - } - .err_tip_with_code(|e| { - ( - Code::InvalidArgument, - format!("while parsing redis node address: {e}"), - ) - })?; + subscription_manager: tokio::sync::OnceCell>, - let reconnect_policy = { - if spec.retry.delay == 0.0 { - spec.retry.delay = DEFAULT_RETRY_DELAY; - } - if spec.retry.jitter == 0.0 { - spec.retry.jitter = DEFAULT_RETRY_JITTER; - } + /// Channel for getting subscription messages + subscriber_channel: Mutex>>, - let max_retries = u32::try_from(spec.retry.max_retries) - .err_tip(|| "max_retries could not be converted to u32 in RedisStore::new")?; - let min_delay_ms = (spec.retry.delay * 1000.0) as u32; - let max_delay_ms = 8000; - let jitter = (spec.retry.jitter * spec.retry.delay * 1000.0) as u32; - - let mut reconnect_policy = ReconnectPolicy::new_exponential( - max_retries, /* max_retries, 0 is unlimited */ - min_delay_ms, /* min_delay */ - max_delay_ms, /* max_delay */ - 2, /* mult */ - ); - reconnect_policy.set_jitter(jitter); - reconnect_policy - }; + /// Permits to limit inflight Redis requests. Technically only + /// limits the calls to `get_client()`, but the requests per client + /// are small enough that it works well enough. 
+ client_permits: Arc, +} - { - if spec.broadcast_channel_capacity == 0 { - spec.broadcast_channel_capacity = DEFAULT_BROADCAST_CHANNEL_CAPACITY; - } - if spec.connection_timeout_ms == 0 { - spec.connection_timeout_ms = DEFAULT_CONNECTION_TIMEOUT_MS; - } - if spec.command_timeout_ms == 0 { - spec.command_timeout_ms = DEFAULT_COMMAND_TIMEOUT_MS; - } - if spec.connection_pool_size == 0 { - spec.connection_pool_size = DEFAULT_CONNECTION_POOL_SIZE; - } - if spec.read_chunk_size == 0 { - spec.read_chunk_size = DEFAULT_READ_CHUNK_SIZE; - } - if spec.max_chunk_uploads_per_update == 0 { - spec.max_chunk_uploads_per_update = DEFAULT_MAX_CHUNK_UPLOADS_PER_UPDATE; - } - if spec.scan_count == 0 { - spec.scan_count = DEFAULT_SCAN_COUNT; - } - } - let connection_timeout = Duration::from_millis(spec.connection_timeout_ms); - let command_timeout = Duration::from_millis(spec.command_timeout_ms); +impl Debug for RedisStore +where + C: ConnectionLike + Clone, + M: RedisManager, +{ + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + f.debug_struct("RedisStore") + .field("temp_name_generator_fn", &self.temp_name_generator_fn) + .field("key_prefix", &self.key_prefix) + .field("read_chunk_size", &self.read_chunk_size) + .field( + "max_chunk_uploads_per_update", + &self.max_chunk_uploads_per_update, + ) + .field("scan_count", &self.scan_count) + .field("subscription_manager", &self.subscription_manager) + .field("subscriber_channel", &self.subscriber_channel) + .field("client_permits", &self.client_permits) + .finish() + } +} - let mut builder = Builder::from_config(redis_config); - builder - .set_performance_config(PerformanceConfig { - default_command_timeout: command_timeout, - broadcast_channel_capacity: spec.broadcast_channel_capacity, - ..Default::default() - }) - .set_connection_config(ConnectionConfig { - connection_timeout, - internal_command_timeout: command_timeout, - unresponsive: UnresponsiveConfig { - max_timeout: Some(connection_timeout), - // This 
number needs to be less than the connection timeout. - // We use 4 as it is a good balance between not spamming the server - // and not waiting too long. - interval: connection_timeout / 4, - }, - ..Default::default() - }) - .set_policy(reconnect_policy); +struct ClientWithPermit { + connection_manager: C, + uuid: Uuid, - let client_pool = builder - .build_pool(spec.connection_pool_size) - .err_tip(|| "while creating redis connection pool")?; + // here so it sticks around with the client and doesn't get dropped until that does + #[allow(dead_code)] + semaphore_permit: OwnedSemaphorePermit, +} - let subscriber_client = builder - .build_subscriber_client() - .err_tip(|| "while creating redis subscriber client")?; +impl ClientWithPermit { + async fn reconnect + Sync>(&mut self, manager: &M) -> Result<(), Error> { + (self.connection_manager, self.uuid) = manager.reconnect(self.uuid).await?; + Ok(()) + } +} - Self::new_from_builder_and_parts( - client_pool, - subscriber_client, - spec.experimental_pub_sub_channel.clone(), - || Uuid::new_v4().to_string(), - spec.key_prefix.clone(), - spec.read_chunk_size, - spec.max_chunk_uploads_per_update, - spec.scan_count, - ) - .map(Arc::new) +impl Drop for ClientWithPermit { + fn drop(&mut self) { + trace!( + remaining = self.semaphore_permit.semaphore().available_permits(), + "Dropping a client permit" + ); } +} +impl RedisStore +where + C: ConnectionLike + Clone + Sync, + M: RedisManager + Sync, +{ /// Used for testing when determinism is required. 
#[expect(clippy::too_many_arguments)] - pub fn new_from_builder_and_parts( - client_pool: RedisPool, - subscriber_client: SubscriberClient, + pub async fn new_from_builder_and_parts( pub_sub_channel: Option, temp_name_generator_fn: fn() -> String, key_prefix: String, read_chunk_size: usize, max_chunk_uploads_per_update: usize, - scan_count: u32, + scan_count: usize, + max_client_permits: usize, + max_count_per_cursor: u64, + subscriber_channel: UnboundedReceiver, + connection_manager: M, ) -> Result { - // Start connection pool (this will retry forever by default). - client_pool.connect(); - subscriber_client.connect(); - info!("Redis index fingerprint: {FINGERPRINT_CREATE_INDEX_HEX}"); Ok(Self { - client_pool, + connection_manager, + _connection_type: PhantomData, pub_sub_channel, - subscriber_client, temp_name_generator_fn, key_prefix, read_chunk_size, max_chunk_uploads_per_update, scan_count, - update_if_version_matches_script: Script::from_lua(LUA_VERSION_SET_SCRIPT), - subscription_manager: Mutex::new(None), + subscription_manager: tokio::sync::OnceCell::new(), + subscriber_channel: Mutex::new(Some(subscriber_channel)), + client_permits: Arc::new(Semaphore::new(max_client_permits)), + max_count_per_cursor, }) } - async fn get_client(&'_ self) -> Result<&'_ Client, Error> { - let client = self.client_pool.next(); - let config = client.client_config(); - if config.mocks.is_none() { - client.wait_for_connect().await.err_tip(|| - format!( - "Connection issue connecting to redis server with hosts: {:?}, username: {}, database: {}", - config.server.hosts().iter().map(|s| format!("{}:{}", s.host, s.port)).collect::>(), - config.username.unwrap_or_else(|| "None".to_string()), - config.database.unwrap_or_default() - ) - )?; - } - Ok(client) + async fn get_client(&self) -> Result, Error> { + let local_client_permits = self.client_permits.clone(); + let remaining = local_client_permits.available_permits(); + let semaphore_permit = 
local_client_permits.acquire_owned().await?; + trace!(remaining, "Got a client permit"); + let (connection_manager, uuid) = self.connection_manager.get_connection().await?; + Ok(ClientWithPermit { + connection_manager, + uuid, + semaphore_permit, + }) } /// Encode a [`StoreKey`] so it can be sent to Redis. @@ -331,10 +465,263 @@ impl RedisStore { } } } + + fn set_spec_defaults(spec: &mut RedisSpec) -> Result<(), Error> { + if spec.addresses.is_empty() { + return Err(make_err!( + Code::InvalidArgument, + "No addresses were specified in redis store configuration." + )); + } + + if spec.broadcast_channel_capacity != 0 { + warn!("broadcast_channel_capacity in Redis spec is deprecated and ignored"); + } + if spec.response_timeout_s != 0 { + warn!( + "response_timeout_s in Redis spec is deprecated and ignored, use command_timeout_ms" + ); + } + if spec.connection_timeout_s != 0 { + if spec.connection_timeout_ms != 0 { + return Err(make_err!( + Code::InvalidArgument, + "Both connection_timeout_s and connection_timeout_ms were set, can only have one!" 
+ )); + } + warn!("connection_timeout_s in Redis spec is deprecated, use connection_timeout_ms"); + spec.connection_timeout_ms = spec.connection_timeout_s * 1000; + } + if spec.connection_timeout_ms == 0 { + spec.connection_timeout_ms = DEFAULT_CONNECTION_TIMEOUT_MS; + } + if spec.command_timeout_ms == 0 { + spec.command_timeout_ms = DEFAULT_COMMAND_TIMEOUT_MS; + } + if spec.connection_pool_size == 0 { + spec.connection_pool_size = DEFAULT_CONNECTION_POOL_SIZE; + } + if spec.read_chunk_size == 0 { + spec.read_chunk_size = DEFAULT_READ_CHUNK_SIZE; + } + if spec.max_count_per_cursor == 0 { + spec.max_count_per_cursor = DEFAULT_MAX_COUNT_PER_CURSOR; + } + if spec.max_chunk_uploads_per_update == 0 { + spec.max_chunk_uploads_per_update = DEFAULT_MAX_CHUNK_UPLOADS_PER_UPDATE; + } + if spec.scan_count == 0 { + spec.scan_count = DEFAULT_SCAN_COUNT; + } + if spec.max_client_permits == 0 { + spec.max_client_permits = DEFAULT_CLIENT_PERMITS; + } + if spec.retry.delay == 0.0 { + spec.retry.delay = DEFAULT_RETRY_DELAY; + } + if spec.retry.max_retries == 0 { + spec.retry.max_retries = 1; + } + trace!(?spec, "redis spec is after setting defaults"); + Ok(()) + } + + // Only used by tests, because we need to make a real redis connection, then fix this to get fixed values + pub fn replace_temp_name_generator(&mut self, replacement: fn() -> String) { + self.temp_name_generator_fn = replacement; + } +} + +impl RedisStore> { + pub async fn new_cluster(mut spec: RedisSpec) -> Result, Error> { + if spec.mode != RedisMode::Cluster { + return Err(Error::new( + Code::InvalidArgument, + "new_cluster only works for Cluster mode".to_string(), + )); + } + Self::set_spec_defaults(&mut spec)?; + + let parsed_addrs: Vec<_> = spec + .addresses + .iter_mut() + .map(|addr| { + addr.clone().into_connection_info().map(|connection_info| { + let redis_settings = connection_info + .redis_settings() + .clone() + // We need RESP3 here because the cluster mode doesn't support RESP2 pubsub + // See also 
https://docs.rs/redis/latest/redis/cluster_async/index.html#pubsub + .set_protocol(redis::ProtocolVersion::RESP3); + connection_info.set_redis_settings(redis_settings) + }) + }) + .collect::, _>>()?; + + let connection_timeout = Duration::from_millis(spec.connection_timeout_ms); + let command_timeout = Duration::from_millis(spec.command_timeout_ms); + let (tx, subscriber_channel) = unbounded_channel(); + + let builder = ClusterClient::builder(parsed_addrs) + .connection_timeout(connection_timeout) + .response_timeout(command_timeout) + .push_sender(tx) + .retries(u32::try_from(spec.retry.max_retries)?); + + let client = builder.build()?; + + Self::new_from_builder_and_parts( + spec.experimental_pub_sub_channel, + || Uuid::new_v4().to_string(), + spec.key_prefix.clone(), + spec.read_chunk_size, + spec.max_chunk_uploads_per_update, + spec.scan_count, + spec.max_client_permits, + spec.max_count_per_cursor, + subscriber_channel, + ClusterRedisManager::new(client.get_async_connection().await?).await?, + ) + .await + .map(Arc::new) + } +} + +impl RedisStore> { + async fn connect( + spec: RedisSpec, + tx: UnboundedSender, + ) -> Result { + let connection_timeout = Duration::from_millis(spec.connection_timeout_ms); + let command_timeout = Duration::from_millis(spec.command_timeout_ms); + + let addr = &spec.addresses[0]; + let local_addr = addr.clone(); + let mut parsed_addr = local_addr + .replace("redis+sentinel://", "redis://") + .into_connection_info()?; + + let redis_settings = parsed_addr + .redis_settings() + .clone() + // We need RESP3 here because we want to do set_push_sender + .set_protocol(redis::ProtocolVersion::RESP3); + parsed_addr = parsed_addr.set_redis_settings(redis_settings); + debug!(?parsed_addr, "Parsed redis addr"); + + let client = timeout( + connection_timeout, + spawn!("connect", async move { + match spec.mode { + RedisMode::Standard => Client::open(parsed_addr).map_err(Into::::into), + RedisMode::Cluster => { + return Err(Error::new( + 
Code::Internal, + "Use RedisStore::new_cluster for cluster connections".to_owned(), + )); + } + RedisMode::Sentinel => async { + let url_parsing = Url::parse(&local_addr)?; + let master_name = url_parsing + .query_pairs() + .find(|(key, _)| key == "sentinelServiceName") + .map_or_else(|| "master".into(), |(_, value)| value.to_string()); + + let redis_connection_info = parsed_addr.redis_settings().clone(); + let sentinel_connection_info = SentinelNodeConnectionInfo::default() + .set_redis_connection_info(redis_connection_info); + + // We fish this out because sentinels don't support db, we need to set it + // on the client only. See also https://github.com/redis-rs/redis-rs/issues/1950 + let original_db = parsed_addr.redis_settings().db(); + if original_db != 0 { + // sentinel_connection_info has the actual DB set + let revised_settings = parsed_addr.redis_settings().clone().set_db(0); + parsed_addr = parsed_addr.set_redis_settings(revised_settings); + } + + SentinelClient::build( + vec![parsed_addr], + master_name, + Some(sentinel_connection_info), + SentinelServerType::Master, + ) + .map_err(Into::::into) + } + .and_then(|mut s| async move { Ok(s.async_get_client().await) }) + .await? 
+ .map_err(Into::::into), + } + .err_tip_with_code(|_e| { + ( + Code::InvalidArgument, + format!("While connecting to redis with url: {local_addr}"), + ) + }) + }), + ) + .await + .err_tip(|| format!("Timeout while connecting to redis with url: {addr}"))???; + + let connection_manager_config = { + ConnectionManagerConfig::new() + .set_number_of_retries(spec.retry.max_retries) + .set_connection_timeout(Some(connection_timeout)) + .set_response_timeout(Some(command_timeout)) + .set_push_sender(tx) + }; + + let mut connection_manager = + ConnectionManager::new_with_config(client, connection_manager_config) + .await + .err_tip(|| format!("While connecting to redis with url: {addr}"))?; + + if let Some(pub_sub_channel) = spec.experimental_pub_sub_channel { + connection_manager.psubscribe(pub_sub_channel).await?; + } + + Ok(connection_manager) + } + + /// Create a new `RedisStore` from the given configuration. + pub async fn new_standard(mut spec: RedisSpec) -> Result, Error> { + Self::set_spec_defaults(&mut spec)?; + + if spec.addresses.len() != 1 { + return Err(make_err!( + Code::Unimplemented, + "Connecting directly to multiple redis nodes in a cluster is currently unsupported. Please specify a single URL to a single node, and nativelink will use cluster discover to find the other nodes." 
+ )); + } + + let (tx, subscriber_channel) = unbounded_channel(); + + Self::new_from_builder_and_parts( + spec.experimental_pub_sub_channel.clone(), + || Uuid::new_v4().to_string(), + spec.key_prefix.clone(), + spec.read_chunk_size, + spec.max_chunk_uploads_per_update, + spec.scan_count, + spec.max_client_permits, + spec.max_count_per_cursor, + subscriber_channel, + StandardRedisManager::new(Box::new(move || { + Box::pin(Self::connect(spec.clone(), tx.clone())) + })) + .await?, + ) + .await + .map(Arc::new) + } } #[async_trait] -impl StoreDriver for RedisStore { +impl StoreDriver for RedisStore +where + C: ConnectionLike + Clone + Send + Sync + Unpin + 'static, + M: RedisManager + Unpin + Send + Sync + 'static, +{ async fn has_with_results( self: Pin<&Self>, keys: &[StoreKey<'_>], @@ -344,9 +731,8 @@ impl StoreDriver for RedisStore { // difficult and it doesn't work very well in cluster mode. // If we wanted to optimize this with pipeline be careful to // implement retry and to support cluster mode. - let client = self.get_client().await?; - keys.iter() - .zip(results.iter_mut()) + + izip!(keys.iter(), results.iter_mut(),) .map(|(key, result)| async move { // We need to do a special pass to ensure our zero key exist. if is_zero_digest(key.borrow()) { @@ -354,27 +740,19 @@ impl StoreDriver for RedisStore { return Ok::<_, Error>(()); } let encoded_key = self.encode_key(key); - let pipeline = client.pipeline(); - pipeline - .strlen::<(), _>(encoded_key.as_ref()) - .await - .err_tip(|| { - format!("In RedisStore::has_with_results::strlen for {encoded_key}") - })?; + + let mut client = self.get_client().await?; + // Redis returns 0 when the key doesn't exist // AND when the key exists with value of length 0. // Therefore, we need to check both length and existence - // and do it in a pipeline for efficiency. 
- pipeline - .exists::<(), _>(encoded_key.as_ref()) - .await - .err_tip(|| { - format!("In RedisStore::has_with_results::exists for {encoded_key}") - })?; - let (blob_len, exists) = pipeline - .all::<(u64, bool)>() + // and do it in a pipeline for efficiency + let (blob_len, exists) = pipe() + .strlen(encoded_key.as_ref()) + .exists(encoded_key.as_ref()) + .query_async::<(u64, bool)>(&mut client.connection_manager) .await - .err_tip(|| "In RedisStore::has_with_results::query")?; + .err_tip(|| "In RedisStore::has_with_results::all")?; *result = if exists { Some(blob_len) } else { None }; @@ -411,30 +789,50 @@ impl StoreDriver for RedisStore { }, Bound::Unbounded => format!("{}*", self.key_prefix), }; - let client = self.get_client().await?; - let mut scan_stream = client.scan(pattern, Some(self.scan_count), None); + let mut client = self.get_client().await?; + trace!(%pattern, count=self.scan_count, "Running SCAN"); + let opts = ScanOptions::default() + .with_pattern(pattern) + .with_count(self.scan_count); + let mut scan_stream: AsyncIter = client + .connection_manager + .scan_options(opts) + .await + .err_tip(|| "During scan_options")?; let mut iterations = 0; - 'outer: while let Some(mut page) = scan_stream.try_next().await? { - if let Some(keys) = page.take_results() { - for key in keys { - // TODO: Notification of conversion errors - // Any results that do not conform to expectations are ignored. 
- if let Some(key) = key.as_str() { - if let Some(key) = key.strip_prefix(&self.key_prefix) { - let key = StoreKey::new_str(key); - if range.contains(&key) { - iterations += 1; - if !handler(&key) { - break 'outer; - } - } + let mut errors = vec![]; + while let Some(key) = scan_stream.next_item().await { + if let Ok(Value::BulkString(raw_key)) = key { + let Ok(str_key) = str::from_utf8(&raw_key) else { + error!(?raw_key, "Non-utf8 key"); + errors.push(format!("Non-utf8 key {raw_key:?}")); + continue; + }; + if let Some(key) = str_key.strip_prefix(&self.key_prefix) { + let key = StoreKey::new_str(key); + if range.contains(&key) { + iterations += 1; + if !handler(&key) { + error!("Issue in handler"); + errors.push("Issue in handler".to_string()); } + } else { + trace!(%key, ?range, "Key not in range"); } + } else { + errors.push("Key doesn't match prefix".to_string()); } + } else { + error!(?key, "Non-string in key"); + errors.push("Non-string in key".to_string()); } - page.next(); } - Ok(iterations) + if errors.is_empty() { + Ok(iterations) + } else { + error!(?errors, "Errors in scan stream"); + Err(Error::new(Code::Internal, format!("Errors: {errors:?}"))) + } } async fn update( @@ -474,7 +872,7 @@ impl StoreDriver for RedisStore { } } - let client = self.get_client().await?; + let mut client = self.get_client().await?; let mut read_stream = reader .scan(0u32, |bytes_read, chunk_res| { @@ -482,7 +880,7 @@ impl StoreDriver for RedisStore { chunk_res .err_tip(|| "Failed to read chunk in update in redis store") .and_then(|chunk| { - let offset = *bytes_read; + let offset = isize::try_from(*bytes_read).err_tip(|| "Could not convert offset to isize in RedisStore::update")?; let chunk_len = u32::try_from(chunk.len()).err_tip( || "Could not convert chunk length to u32 in RedisStore::update", )?; @@ -498,12 +896,30 @@ impl StoreDriver for RedisStore { let (offset, end_pos, chunk) = res?; let temp_key_ref = &temp_key; Ok(async move { - client - .setrange::<(), _, 
_>(temp_key_ref, offset, chunk) - .await - .err_tip( - || format!("While appending to temp key ({temp_key_ref}) in RedisStore::update. offset = {offset}. end_pos = {end_pos}"), - )?; + let (mut connection_manager, connect_id) = self.connection_manager.get_connection().await?; + match connection_manager + .setrange::<_, _, usize>(temp_key_ref, offset, chunk.to_vec()) + .await { + Ok(_) => {}, + Err(err) + if err.kind() == redis::ErrorKind::Server(redis::ServerErrorKind::ReadOnly) => + { + let (mut connection_manager, _connect_id) = self.connection_manager.reconnect(connect_id).await?; + connection_manager + .setrange::<_, _, usize>(temp_key_ref, offset, chunk.to_vec()) + .await + .err_tip( + || format!("(after reconnect) while appending to temp key ({temp_key_ref}) in RedisStore::update. offset = {offset}. end_pos = {end_pos}"), + )?; + } + Err(err) => { + let mut error: Error = err.into(); + error + .messages + .push(format!("While appending to temp key ({temp_key_ref}) in RedisStore::update. offset = {offset}. end_pos = {end_pos}")); + return Err(error); + } + } Ok::(end_pos) }) }) @@ -516,13 +932,14 @@ impl StoreDriver for RedisStore { } } - let blob_len = client - .strlen::(&temp_key) + let blob_len: usize = client + .connection_manager + .strlen(&temp_key) .await .err_tip(|| format!("In RedisStore::update strlen check for {temp_key}"))?; // This is a safety check to ensure that in the event some kind of retry was to happen // and the data was appended to the key twice, we reject the data. - if blob_len != u64::from(total_len) { + if blob_len != usize::try_from(total_len).unwrap_or(usize::MAX) { return Err(make_input_err!( "Data length mismatch in RedisStore::update for {}({}) - expected {} bytes, got {} bytes", key.borrow().as_str(), @@ -534,13 +951,17 @@ impl StoreDriver for RedisStore { // Rename the temp key so that the data appears under the real key. Any data already present in the real key is lost. 
client - .rename::<(), _, _>(&temp_key, final_key.as_ref()) + .connection_manager + .rename::<_, _, ()>(&temp_key, final_key.as_ref()) .await .err_tip(|| "While queueing key rename in RedisStore::update()")?; // If we have a publish channel configured, send a notice that the key has been set. if let Some(pub_sub_channel) = &self.pub_sub_channel { - return Ok(client.publish(pub_sub_channel, final_key.as_ref()).await?); + return Ok(client + .connection_manager + .publish(pub_sub_channel, final_key.as_ref()) + .await?); } Ok(()) @@ -553,7 +974,7 @@ impl StoreDriver for RedisStore { offset: u64, length: Option, ) -> Result<(), Error> { - let offset = usize::try_from(offset).err_tip(|| "Could not convert offset to usize")?; + let offset = isize::try_from(offset).err_tip(|| "Could not convert offset to isize")?; let length = length .map(|v| usize::try_from(v).err_tip(|| "Could not convert length to usize")) .transpose()?; @@ -566,7 +987,6 @@ impl StoreDriver for RedisStore { .err_tip(|| "Failed to send zero EOF in redis store get_part"); } - let client = self.get_client().await?; let encoded_key = self.encode_key(&key); let encoded_key = encoded_key.as_ref(); @@ -576,18 +996,20 @@ impl StoreDriver for RedisStore { // We want to read the data at the key from `offset` to `offset + length`. let data_start = offset; let data_end = data_start - .saturating_add(length.unwrap_or(isize::MAX as usize)) + .saturating_add(length.unwrap_or(isize::MAX as usize) as isize) .saturating_sub(1); // And we don't ever want to read more than `read_chunk_size` bytes at a time, so we'll need to iterate. 
let mut chunk_start = data_start; let mut chunk_end = cmp::min( - data_start.saturating_add(self.read_chunk_size) - 1, + data_start.saturating_add(self.read_chunk_size as isize) - 1, data_end, ); + let mut client = self.get_client().await?; loop { let chunk: Bytes = client + .connection_manager .getrange(encoded_key, chunk_start, chunk_end) .await .err_tip(|| "In RedisStore::get_part::getrange")?; @@ -615,7 +1037,7 @@ impl StoreDriver for RedisStore { // ...and go grab the next chunk. chunk_start = chunk_end + 1; chunk_end = cmp::min( - chunk_start.saturating_add(self.read_chunk_size) - 1, + chunk_start.saturating_add(self.read_chunk_size as isize) - 1, data_end, ); } @@ -624,8 +1046,9 @@ impl StoreDriver for RedisStore { // This is required by spec. if writer.get_bytes_written() == 0 { // We're supposed to read 0 bytes, so just check if the key exists. - let exists = client - .exists::(encoded_key) + let exists: bool = client + .connection_manager + .exists(encoded_key) .await .err_tip(|| "In RedisStore::get_part::zero_exists")?; @@ -660,7 +1083,7 @@ impl StoreDriver for RedisStore { fn register_remove_callback( self: Arc, - _callback: &Arc>, + _callback: Arc, ) -> Result<(), Error> { // As redis doesn't drop stuff, we can just ignore this Ok(()) @@ -668,7 +1091,11 @@ impl StoreDriver for RedisStore { } #[async_trait] -impl HealthStatusIndicator for RedisStore { +impl HealthStatusIndicator for RedisStore +where + C: ConnectionLike + Clone + Send + Sync + Unpin + 'static, + M: RedisManager + Send + Sync + Unpin + 'static, +{ fn get_name(&self) -> &'static str { "RedisStore" } @@ -682,10 +1109,8 @@ impl HealthStatusIndicator for RedisStore { // Below this line are specific to the redis scheduler implementation. // ------------------------------------------------------------------- -/// The maximum number of results to return per cursor. -const MAX_COUNT_PER_CURSOR: u64 = 256; /// The time in milliseconds that a redis cursor can be idle before it is closed. 
-const CURSOR_IDLE_MS: u64 = 2_000; +const CURSOR_IDLE_MS: u64 = 30_000; /// The name of the field in the Redis hash that stores the data. const DATA_FIELD_NAME: &str = "data"; /// The name of the field in the Redis hash that stores the version. @@ -693,6 +1118,7 @@ const VERSION_FIELD_NAME: &str = "version"; /// The time to live of indexes in seconds. After this time redis may delete the index. const INDEX_TTL_S: u64 = 60 * 60 * 24; // 24 hours. +#[allow(rustdoc::broken_intra_doc_links)] /// Lua script to set a key if the version matches. /// Args: /// KEYS[1]: The key where the version is stored. @@ -702,7 +1128,7 @@ const INDEX_TTL_S: u64 = 60 * 60 * 24; // 24 hours. /// Returns: /// The new version if the version matches. nil is returned if the /// value was not set. -const LUA_VERSION_SET_SCRIPT: &str = formatcp!( +pub const LUA_VERSION_SET_SCRIPT: &str = formatcp!( r" local key = KEYS[1] local expected_version = tonumber(ARGV[1]) @@ -908,65 +1334,85 @@ impl RedisSubscriptionPublisher { } } -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct RedisSubscriptionManager { subscribed_keys: Arc>>, - tx_for_test: tokio::sync::mpsc::UnboundedSender, - _subscription_spawn: JoinHandleDropGuard<()>, + tx_for_test: UnboundedSender, + _subscription_spawn: Arc>>, } impl RedisSubscriptionManager { - pub fn new(subscribe_client: SubscriberClient, pub_sub_channel: String) -> Self { + pub fn new(subscriber_channel: UnboundedReceiver) -> Self { let subscribed_keys = Arc::new(RwLock::new(StringPatriciaMap::new())); let subscribed_keys_weak = Arc::downgrade(&subscribed_keys); - let (tx_for_test, mut rx_for_test) = tokio::sync::mpsc::unbounded_channel(); + let (tx_for_test, mut rx_for_test) = unbounded_channel(); + let mut local_subscriber_channel = UnboundedReceiverStream::new(subscriber_channel); Self { subscribed_keys, tx_for_test, - _subscription_spawn: spawn!("redis_subscribe_spawn", async move { - let mut rx = subscribe_client.message_rx(); - loop { - if let Err(e) = 
subscribe_client.subscribe(&pub_sub_channel).await { - error!("Error subscribing to pattern - {e}"); - return; - } - let mut reconnect_rx = subscribe_client.reconnect_rx(); - let reconnect_fut = reconnect_rx.recv().fuse(); - tokio::pin!(reconnect_fut); + _subscription_spawn: Arc::new(Mutex::new(spawn!( + "redis_subscribe_spawn", + async move { loop { - let key = select! { - value = rx_for_test.recv() => { - let Some(value) = value else { - unreachable!("Channel should never close"); - }; - value.into() - }, - msg = rx.recv() => { - match msg { - Ok(msg) => { - if let RedisValue::String(s) = msg.value { - s - } else { - error!("Received non-string message in RedisSubscriptionManager"); + loop { + let key = select! { + value = rx_for_test.recv() => { + let Some(value) = value else { + unreachable!("Channel should never close"); + }; + value + }, + maybe_push_info = local_subscriber_channel.next() => { + if let Some(push_info) = maybe_push_info { + match push_info.kind { + redis::PushKind::PMessage => {}, + redis::PushKind::PSubscribe => { + trace!(?push_info, "PSubscribe, ignore"); + continue; + } + _ => { + warn!(?push_info, "Other push_info message, discarded"); + continue; + }, + } + if push_info.data.len() != 3 { + error!(?push_info, "Expected exactly 3 values on subscriber channel (pattern, channel, value)"); continue; } - }, - Err(e) => { - // Check to see if our parent has been dropped and if so kill spawn. 
- if subscribed_keys_weak.upgrade().is_none() { - warn!("It appears our parent has been dropped, exiting RedisSubscriptionManager spawn"); - return; + match push_info.data.last().unwrap() { + Value::SimpleString(s) => { + s.clone() + } + Value::BulkString(v) => { + String::from_utf8(v.clone()).expect("String message") + } + other => { + error!(?other, "Received non-string message in RedisSubscriptionManager"); + continue; + } } - error!("Error receiving message in RedisSubscriptionManager reconnecting and flagging everything changed - {e}"); + } else { + error!("Error receiving message in RedisSubscriptionManager from subscriber_channel"); break; } } - }, - _ = &mut reconnect_fut => { - warn!("Redis reconnected flagging all subscriptions as changed and resuming"); - break; - } - }; + }; + trace!(key, "New subscription manager key"); + let Some(subscribed_keys) = subscribed_keys_weak.upgrade() else { + warn!( + "It appears our parent has been dropped, exiting RedisSubscriptionManager spawn" + ); + return; + }; + let subscribed_keys_mux = subscribed_keys.read(); + subscribed_keys_mux + .common_prefix_values(&*key) + .for_each(RedisSubscriptionPublisher::notify); + } + // Sleep for a small amount of time to ensure we don't reconnect too quickly. + sleep(Duration::from_secs(1)).await; + // If we reconnect or lag behind we might have had dirty keys, so we need to + // flag all of them as changed. let Some(subscribed_keys) = subscribed_keys_weak.upgrade() else { warn!( "It appears our parent has been dropped, exiting RedisSubscriptionManager spawn" @@ -974,40 +1420,25 @@ impl RedisSubscriptionManager { return; }; let subscribed_keys_mux = subscribed_keys.read(); - subscribed_keys_mux - .common_prefix_values(&*key) - .for_each(RedisSubscriptionPublisher::notify); - } - // Sleep for a small amount of time to ensure we don't reconnect too quickly. 
- sleep(Duration::from_secs(1)).await; - // If we reconnect or lag behind we might have had dirty keys, so we need to - // flag all of them as changed. - let Some(subscribed_keys) = subscribed_keys_weak.upgrade() else { - warn!( - "It appears our parent has been dropped, exiting RedisSubscriptionManager spawn" - ); - return; - }; - let subscribed_keys_mux = subscribed_keys.read(); - // Just in case also get a new receiver. - rx = subscribe_client.message_rx(); - // Drop all buffered messages, then flag everything as changed. - rx.resubscribe(); - for publisher in subscribed_keys_mux.values() { - publisher.notify(); + // Just in case also get a new receiver. + for publisher in subscribed_keys_mux.values() { + publisher.notify(); + } } } - }), + ))), } } } -impl SchedulerSubscriptionManager for RedisSubscriptionManager { - type Subscription = RedisSubscription; - +impl SubscriptionManagerNotify for RedisSubscriptionManager { fn notify_for_test(&self, value: String) { self.tx_for_test.send(value).unwrap(); } +} + +impl SchedulerSubscriptionManager for RedisSubscriptionManager { + type Subscription = RedisSubscription; fn subscribe(&self, key: K) -> Result where @@ -1044,27 +1475,31 @@ impl SchedulerSubscriptionManager for RedisSubscriptionManager { } } -impl SchedulerStore for RedisStore { +impl SchedulerStore for RedisStore +where + C: Clone + ConnectionLike + Sync + Send + 'static, + M: RedisManager + Sync + Send + 'static, +{ type SubscriptionManager = RedisSubscriptionManager; - fn subscription_manager(&self) -> Result, Error> { - let mut subscription_manager = self.subscription_manager.lock(); - - if let Some(subscription_manager) = &*subscription_manager { - Ok(subscription_manager.clone()) - } else { - let Some(pub_sub_channel) = &self.pub_sub_channel else { - return Err(make_input_err!( - "RedisStore must have a pubsub channel for a Redis Scheduler if using subscriptions" - )); - }; - let sub = Arc::new(RedisSubscriptionManager::new( - 
self.subscriber_client.clone(), - pub_sub_channel.clone(), - )); - *subscription_manager = Some(sub.clone()); - Ok(sub) - } + async fn subscription_manager(&self) -> Result, Error> { + self.subscription_manager + .get_or_try_init(|| async move { + let Some(subscriber_channel) = self.subscriber_channel.lock().take() else { + return Err(make_input_err!( + "Multiple attempts to obtain the subscription manager in RedisStore" + )); + }; + let Some(pub_sub_channel) = &self.pub_sub_channel else { + return Err(make_input_err!( + "RedisStore must have a pubsub for Redis Scheduler if using subscriptions" + )); + }; + self.connection_manager.psubscribe(pub_sub_channel).await?; + Ok(Arc::new(RedisSubscriptionManager::new(subscriber_channel))) + }) + .await + .map(Clone::clone) } async fn update_data(&self, data: T) -> Result, Error> @@ -1075,61 +1510,131 @@ impl SchedulerStore for RedisStore { + Send, { let key = data.get_key(); - let key = self.encode_key(&key); - let client = self.get_client().await?; + let redis_key = self.encode_key(&key); + let mut client = self.get_client().await?; let maybe_index = data.get_indexes().err_tip(|| { - format!("Err getting index in RedisStore::update_data::versioned for {key:?}") + format!("Err getting index in RedisStore::update_data::versioned for {redis_key}") })?; if ::Versioned::VALUE { let current_version = data.current_version(); let data = data.try_into_bytes().err_tip(|| { - format!("Could not convert value to bytes in RedisStore::update_data::versioned for {key:?}") + format!("Could not convert value to bytes in RedisStore::update_data::versioned for {redis_key}") })?; - let mut argv = Vec::with_capacity(3 + maybe_index.len() * 2); - argv.push(Bytes::from(format!("{current_version}"))); - argv.push(data); + let mut script = self.connection_manager.update_script(redis_key.as_ref()); + let mut script_invocation = script.arg(format!("{current_version}")).arg(data.to_vec()); for (name, value) in maybe_index { - 
argv.push(Bytes::from_static(name.as_bytes())); - argv.push(value); + script_invocation = script_invocation.arg(name).arg(value.to_vec()); } - let (success, new_version): (bool, i64) = self - .update_if_version_matches_script - .evalsha_with_reload(client, vec![key.as_ref()], argv) + let start = Instant::now(); + let (success, new_version): (bool, i64) = match script_invocation + .invoke_async(&mut client.connection_manager) .await - .err_tip(|| format!("In RedisStore::update_data::versioned for {key:?}"))?; + { + Ok(v) => v, + Err(err) + if err.kind() == redis::ErrorKind::Server(redis::ServerErrorKind::ReadOnly) => + { + client.reconnect(&self.connection_manager).await?; + script_invocation + .invoke_async(&mut client.connection_manager) + .await + .err_tip(|| format!("(after reconnect) In RedisStore::update_data::versioned for {key:?}"))? + } + Err(err) => { + let mut error: Error = err.into(); + error + .messages + .push(format!("In RedisStore::update_data::versioned for {key:?}")); + return Err(error); + } + }; + + let elapsed = start.elapsed(); + + if elapsed > Duration::from_millis(100) { + warn!( + %redis_key, + ?elapsed, + "Slow Redis version-set operation" + ); + } if !success { - tracing::info!( - "Error updating Redis key {key} expected version {current_version} but found {new_version}" + warn!( + %redis_key, + %key, + %current_version, + %new_version, + caller = core::any::type_name::(), + "Redis version conflict - optimistic lock failed" ); return Ok(None); } + trace!( + %redis_key, + %key, + old_version = %current_version, + %new_version, + "Updated redis key to new version" + ); // If we have a publish channel configured, send a notice that the key has been set. 
if let Some(pub_sub_channel) = &self.pub_sub_channel { - return Ok(client.publish(pub_sub_channel, key.as_ref()).await?); + return Ok(client + .connection_manager + .publish(pub_sub_channel, redis_key.as_ref()) + .await?); } Ok(Some(new_version)) } else { let data = data.try_into_bytes().err_tip(|| { - format!("Could not convert value to bytes in RedisStore::update_data::noversion for {key:?}") + format!("Could not convert value to bytes in RedisStore::update_data::noversion for {redis_key}") })?; - let mut fields = RedisMap::new(); - fields.reserve(1 + maybe_index.len()); - fields.insert(DATA_FIELD_NAME.into(), data.into()); + let mut fields: Vec<(String, _)> = vec![]; + fields.push((DATA_FIELD_NAME.into(), data.to_vec())); for (name, value) in maybe_index { - fields.insert(name.into(), value.into()); + fields.push((name.into(), value.to_vec())); } - client - .hset::<(), _, _>(key.as_ref(), fields) + match client + .connection_manager + .hset_multiple::<_, _, _, ()>(redis_key.as_ref(), &fields) .await - .err_tip(|| format!("In RedisStore::update_data::noversion for {key:?}"))?; + { + Ok(v) => v, + Err(err) + if err.kind() == redis::ErrorKind::Server(redis::ServerErrorKind::ReadOnly) => + { + client.reconnect(&self.connection_manager).await?; + client + .connection_manager + .hset_multiple::<_, _, _, ()>(redis_key.as_ref(), &fields) + .await + .err_tip(|| format!("(after reconnect) In RedisStore::update_data::noversion for {redis_key}"))?; + } + Err(err) => { + let mut error: Error = err.into(); + error.messages.push(format!( + "In RedisStore::update_data::noversion for {redis_key}" + )); + return Err(error); + } + } // If we have a publish channel configured, send a notice that the key has been set. 
if let Some(pub_sub_channel) = &self.pub_sub_channel { - return Ok(client.publish(pub_sub_channel, key.as_ref()).await?); + return Ok(client + .connection_manager + .publish(pub_sub_channel, redis_key.as_ref()) + .await?); } Ok(Some(0)) // Always use "0" version since this is not a versioned request. } } + async fn count_by_index(&self, index: Vec) -> Result, Error> + where + K: SchedulerIndexProvider + Send, + { + Err(make_err!(Code::Unimplemented, "Not implemented")) + } + async fn search_by_index_prefix( &self, index: K, @@ -1142,100 +1647,70 @@ impl SchedulerStore for RedisStore { { let index_value = index.index_value(); let run_ft_aggregate = || { - let client = self.client_pool.next().clone(); let sanitized_field = try_sanitize(index_value.as_ref()).err_tip(|| { format!("In RedisStore::search_by_index_prefix::try_sanitize - {index_value:?}") })?; Ok::<_, Error>(async move { ft_aggregate( - client, + self.connection_manager.get_connection().await?.0, format!( "{}", get_index_name!(K::KEY_PREFIX, K::INDEX_NAME, K::MAYBE_SORT_KEY) ), - format!("@{}:{{ {} }}", K::INDEX_NAME, sanitized_field), + if sanitized_field.is_empty() { + "*".to_string() + } else { + format!("@{}:{{ {} }}", K::INDEX_NAME, sanitized_field) + }, FtAggregateOptions { - load: Some(Load::Some(vec![ - SearchField { - identifier: DATA_FIELD_NAME.into(), - property: None, - }, - SearchField { - identifier: VERSION_FIELD_NAME.into(), - property: None, - }, - ])), - cursor: Some(WithCursor { - count: Some(MAX_COUNT_PER_CURSOR), - max_idle: Some(CURSOR_IDLE_MS), - }), - pipeline: vec![AggregateOperation::SortBy { - properties: K::MAYBE_SORT_KEY.map_or_else(Vec::new, |v| { - vec![(format!("@{v}").into(), SortOrder::Asc)] - }), - max: None, - }], - ..Default::default() + load: vec![DATA_FIELD_NAME.into(), VERSION_FIELD_NAME.into()], + cursor: FtAggregateCursor { + count: self.max_count_per_cursor, + max_idle: CURSOR_IDLE_MS, + }, + sort_by: K::MAYBE_SORT_KEY.map_or_else(Vec::new, |v| 
vec![format!("@{v}")]), }, ) .await }) }; + let stream = run_ft_aggregate()? .or_else(|_| async move { let mut schema = vec![SearchSchema { field_name: K::INDEX_NAME.into(), - alias: None, - kind: SearchSchemaKind::Tag { - sortable: false, - unf: false, - separator: None, - casesensitive: false, - withsuffixtrie: false, - noindex: false, - }, + sortable: false, }]; if let Some(sort_key) = K::MAYBE_SORT_KEY { schema.push(SearchSchema { field_name: sort_key.into(), - alias: None, - kind: SearchSchemaKind::Tag { - sortable: true, - unf: false, - separator: None, - casesensitive: false, - withsuffixtrie: false, - noindex: false, - }, + sortable: true, }); } - let create_result = self - .client_pool - .next() - .ft_create::<(), _>( - format!( - "{}", - get_index_name!(K::KEY_PREFIX, K::INDEX_NAME, K::MAYBE_SORT_KEY) - ), - FtCreateOptions { - on: Some(IndexKind::Hash), - prefixes: vec![K::KEY_PREFIX.into()], - nohl: true, - nofields: true, - nofreqs: true, - nooffsets: true, - temporary: Some(INDEX_TTL_S), - ..Default::default() - }, - schema, + + let create_result = ft_create( + self.connection_manager.get_connection().await?.0, + format!( + "{}", + get_index_name!(K::KEY_PREFIX, K::INDEX_NAME, K::MAYBE_SORT_KEY) + ), + FtCreateOptions { + prefixes: vec![K::KEY_PREFIX.into()], + nohl: true, + nofields: true, + nofreqs: true, + nooffsets: true, + temporary: Some(INDEX_TTL_S), + }, + schema, + ) + .await + .err_tip(|| { + format!( + "Error with ft_create in RedisStore::search_by_index_prefix({})", + get_index_name!(K::KEY_PREFIX, K::INDEX_NAME, K::MAYBE_SORT_KEY), ) - .await - .err_tip(|| { - format!( - "Error with ft_create in RedisStore::search_by_index_prefix({})", - get_index_name!(K::KEY_PREFIX, K::INDEX_NAME, K::MAYBE_SORT_KEY), - ) - }); + }); let run_result = run_ft_aggregate()?.await.err_tip(|| { format!( "Error with second ft_aggregate in RedisStore::search_by_index_prefix({})", @@ -1247,29 +1722,90 @@ impl SchedulerStore for RedisStore { 
run_result.or_else(move |e| create_result.merge(Err(e))) }) .await?; - Ok(stream.map(|result| { - let mut redis_map = - result.err_tip(|| "Error in stream of in RedisStore::search_by_index_prefix")?; - let bytes_data = redis_map - .remove(&RedisKey::from_static_str(DATA_FIELD_NAME)) - .err_tip(|| "Missing data field in RedisStore::search_by_index_prefix")? - .into_bytes() - .err_tip(|| { - formatcp!("'{DATA_FIELD_NAME}' is not Bytes in RedisStore::search_by_index_prefix::into_bytes") - })?; - let version = if ::Versioned::VALUE { - redis_map - .remove(&RedisKey::from_static_str(VERSION_FIELD_NAME)) - .err_tip(|| "Missing version field in RedisStore::search_by_index_prefix")? - .as_i64() - .err_tip(|| { - formatcp!("'{VERSION_FIELD_NAME}' is not u64 in RedisStore::search_by_index_prefix::as_u64") - })? - } else { - 0 + Ok(stream.filter_map(|result| async move { + let raw_redis_map = match result { + Ok(v) => v, + Err(e) => { + return Some( + Err(Error::from(e)) + .err_tip(|| "Error in stream of in RedisStore::search_by_index_prefix"), + ); + } + }; + + let Some(redis_map) = raw_redis_map.as_sequence() else { + return Some(Err(Error::new( + Code::Internal, + format!("Non-array from ft_aggregate: {raw_redis_map:?}"), + ))); }; - K::decode(version, bytes_data) - .err_tip(|| "In RedisStore::search_by_index_prefix::decode") + let mut redis_map_iter = redis_map.iter(); + let mut bytes_data: Option = None; + let mut version: Option = None; + loop { + let Some(key) = redis_map_iter.next() else { + break; + }; + let value = redis_map_iter.next().unwrap(); + let Value::BulkString(k) = key else { + return Some(Err(Error::new( + Code::Internal, + format!("Non-BulkString key from ft_aggregate: {key:?}"), + ))); + }; + let Ok(str_key) = str::from_utf8(k) else { + return Some(Err(Error::new( + Code::Internal, + format!("Non-utf8 key from ft_aggregate: {key:?}"), + ))); + }; + let Value::BulkString(v) = value else { + return Some(Err(Error::new( + Code::Internal, + 
format!("Non-BulkString value from ft_aggregate: {key:?}"), + ))); + }; + match str_key { + DATA_FIELD_NAME => { + bytes_data = Some(v.clone().into()); + } + VERSION_FIELD_NAME => { + let Ok(str_v) = str::from_utf8(v) else { + return Some(Err(Error::new( + Code::Internal, + format!("Non-utf8 version value from ft_aggregate: {v:?}"), + ))); + }; + let Ok(raw_version) = str_v.parse::() else { + return Some(Err(Error::new( + Code::Internal, + format!("Non-integer version value from ft_aggregate: {str_v:?}"), + ))); + }; + version = Some(raw_version); + } + other => { + if K::MAYBE_SORT_KEY == Some(other) { + // ignore sort keys + } else { + return Some(Err(Error::new( + Code::Internal, + format!("Extra keys from ft_aggregate: {other}"), + ))); + } + } + } + } + let Some(found_bytes_data) = bytes_data else { + return Some(Err(Error::new( + Code::Internal, + format!("Missing '{DATA_FIELD_NAME}' in ft_aggregate, got: {raw_redis_map:?}"), + ))); + }; + Some( + K::decode(version.unwrap_or(0), found_bytes_data) + .err_tip(|| "In RedisStore::search_by_index_prefix::decode"), + ) })) } @@ -1282,22 +1818,37 @@ impl SchedulerStore for RedisStore { { let key = key.get_key(); let key = self.encode_key(&key); - let client = self.get_client().await?; - let (maybe_version, maybe_data) = client - .hmget::<(Option, Option), _, _>( + let mut client = self.get_client().await?; + let results: Vec = client + .connection_manager + .hmget::<_, Vec, Vec>( key.as_ref(), - vec![ - RedisKey::from(VERSION_FIELD_NAME), - RedisKey::from(DATA_FIELD_NAME), - ], + vec![VERSION_FIELD_NAME.into(), DATA_FIELD_NAME.into()], ) .await .err_tip(|| format!("In RedisStore::get_without_version::notversioned {key}"))?; - let Some(data) = maybe_data else { + let Some(Value::BulkString(data)) = results.get(1) else { return Ok(None); }; - Ok(Some(K::decode(maybe_version.unwrap_or(0), data).err_tip( - || format!("In RedisStore::get_with_version::notversioned::decode {key}"), - )?)) + #[allow(clippy::get_first)] + 
let version = if let Some(raw_v) = results.get(0) { + match raw_v { + Value::Int(v) => *v, + Value::BulkString(v) => i64::from_str(str::from_utf8(v).expect("utf-8 bulkstring")) + .expect("integer bulkstring"), + Value::Nil => 0, + _ => { + warn!(?raw_v, "Non-integer version!"); + 0 + } + } + } else { + 0 + }; + Ok(Some( + K::decode(version, Bytes::from(data.clone())).err_tip(|| { + format!("In RedisStore::get_with_version::notversioned::decode {key}") + })?, + )) } } diff --git a/nativelink-store/src/redis_utils/aggregate_types.rs b/nativelink-store/src/redis_utils/aggregate_types.rs new file mode 100644 index 000000000..f05c6212d --- /dev/null +++ b/nativelink-store/src/redis_utils/aggregate_types.rs @@ -0,0 +1,24 @@ +// Copyright 2025 The NativeLink Authors. All rights reserved. +// +// Licensed under the Functional Source License, Version 1.1, Apache 2.0 Future License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// See LICENSE file for details +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::VecDeque; + +use redis::Value; + +#[derive(Debug, Default)] +pub(crate) struct RedisCursorData { + pub total: i64, + pub cursor: u64, + pub data: VecDeque, +} diff --git a/nativelink-store/src/redis_utils/ft_aggregate.rs b/nativelink-store/src/redis_utils/ft_aggregate.rs index 72b3ed8ad..a38fd15be 100644 --- a/nativelink-store/src/redis_utils/ft_aggregate.rs +++ b/nativelink-store/src/redis_utils/ft_aggregate.rs @@ -1,4 +1,4 @@ -// Copyright 2024 The NativeLink Authors. All rights reserved. +// Copyright 2024-2025 The NativeLink Authors. All rights reserved. 
// // Licensed under the Functional Source License, Version 1.1, Apache 2.0 Future License (the "License"); // you may not use this file except in compliance with the License. @@ -12,42 +12,100 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::collections::VecDeque; +use core::fmt::Debug; -use fred::error::{Error as RedisError, ErrorKind as RedisErrorKind}; -use fred::interfaces::RediSearchInterface; -use fred::types::redisearch::FtAggregateOptions; -use fred::types::{FromValue, Map as RedisMap, Value as RedisValue}; use futures::Stream; +use nativelink_error::Error; +use redis::aio::ConnectionLike; +use redis::{Arg, ErrorKind, RedisError, Value}; +use tracing::error; -/// Calls `FT_AGGREGATE` in redis. Fred does not properly support this command +use crate::redis_utils::aggregate_types::RedisCursorData; +use crate::redis_utils::ft_cursor_read::ft_cursor_read; + +#[derive(Debug)] +pub(crate) struct FtAggregateCursor { + pub count: u64, + pub max_idle: u64, +} + +#[derive(Debug)] +pub(crate) struct FtAggregateOptions { + pub load: Vec, + pub cursor: FtAggregateCursor, + pub sort_by: Vec, +} + +/// Calls `FT.AGGREGATE` in redis. redis-rs does not properly support this command /// so we have to manually handle it. 
-pub(crate) async fn ft_aggregate( - client: C, - index: I, - query: Q, +pub(crate) async fn ft_aggregate( + mut connection_manager: C, + index: String, + query: String, options: FtAggregateOptions, -) -> Result> + Send, RedisError> +) -> Result> + Send, Error> where - C: RediSearchInterface, - I: Into, - Q: Into, + C: ConnectionLike + Send, { - struct State { - client: C, - index: bytes_utils::string::Str, + struct State { + connection_manager: C, + index: String, data: RedisCursorData, } - let index = index.into(); - let query = query.into(); - let data: RedisCursorData = client.ft_aggregate(index.clone(), query, options).await?; + let mut cmd = redis::cmd("FT.AGGREGATE"); + let mut ft_aggregate_cmd = cmd + .arg(&index) + .arg(&query) + .arg("LOAD") + .arg(options.load.len()) + .arg(&options.load) + .arg("WITHCURSOR") + .arg("COUNT") + .arg(options.cursor.count) + .arg("MAXIDLE") + .arg(options.cursor.max_idle) + .arg("SORTBY") + .arg(options.sort_by.len() * 2); + for key in &options.sort_by { + ft_aggregate_cmd = ft_aggregate_cmd.arg(key).arg("ASC"); + } + let res = ft_aggregate_cmd + .query_async::(&mut connection_manager) + .await; + let data = match res { + Ok(d) => d, + Err(e) => { + let all_args: Vec<_> = ft_aggregate_cmd + .args_iter() + .map(|a| match a { + Arg::Simple(bytes) => match str::from_utf8(bytes) { + Ok(s) => s.to_string(), + Err(_) => format!("{bytes:?}"), + }, + other => { + format!("{other:?}") + } + }) + .collect(); + error!( + ?e, + index, + ?query, + ?options, + ?all_args, + "Error calling ft.aggregate" + ); + return Err(e.into()); + } + }; let state = State { - client, + connection_manager, index, - data, + data: data.try_into()?, }; + Ok(futures::stream::unfold( Some(state), move |maybe_state| async move { @@ -59,10 +117,12 @@ where if state.data.cursor == 0 { return None; } - let data_res = state - .client - .ft_cursor_read(state.index.clone(), state.data.cursor, None) - .await; + let data_res = ft_cursor_read( + &mut 
state.connection_manager, + state.index.clone(), + state.data.cursor, + ) + .await; state.data = match data_res { Ok(data) => data, Err(err) => return Some((Err(err), None)), @@ -72,52 +132,243 @@ where )) } -#[derive(Debug, Default)] -struct RedisCursorData { - total: u64, - cursor: u64, - data: VecDeque, +fn resp2_data_parse( + output: &mut RedisCursorData, + results_array: &[Value], +) -> Result<(), RedisError> { + let mut results_iter = results_array.iter(); + match results_iter.next() { + Some(Value::Int(t)) => { + output.total = *t; + } + Some(other) => { + error!(?other, "Non-int for first value in ft.aggregate"); + return Err(RedisError::from(( + ErrorKind::Parse, + "Non int for aggregate total", + format!("{other:?}"), + ))); + } + None => { + error!("No items in results array for ft.aggregate!"); + return Err(RedisError::from(( + ErrorKind::Parse, + "No items in results array for ft.aggregate", + ))); + } + } + + for item in results_iter { + match item { + Value::Array(items) if items.len() % 2 == 0 => {} + other => { + error!( + ?other, + "Expected an array with an even number of items, didn't get it for aggregate value" + ); + return Err(RedisError::from(( + ErrorKind::Parse, + "Expected an array with an even number of items, didn't get it for aggregate value", + format!("{other:?}"), + ))); + } + } + + output.data.push_back(item.clone()); + } + Ok(()) } -impl FromValue for RedisCursorData { - fn from_value(value: RedisValue) -> Result { - if !value.is_array() { - return Err(RedisError::new(RedisErrorKind::Protocol, "Expected array")); +fn resp3_data_parse( + output: &mut RedisCursorData, + results_map: &Vec<(Value, Value)>, +) -> Result<(), RedisError> { + for (raw_key, value) in results_map { + let Value::SimpleString(key) = raw_key else { + return Err(RedisError::from(( + ErrorKind::Parse, + "Expected SimpleString keys", + format!("{raw_key:?}"), + ))); + }; + match key.as_str() { + "attributes" => { + let Value::Array(attributes) = value else { + 
return Err(RedisError::from(( + ErrorKind::Parse, + "Expected array for attributes", + format!("{value:?}"), + ))); + }; + if !attributes.is_empty() { + return Err(RedisError::from(( + ErrorKind::Parse, + "Expected empty attributes", + format!("{attributes:?}"), + ))); + } + } + "format" => { + let Value::SimpleString(format) = value else { + return Err(RedisError::from(( + ErrorKind::Parse, + "Expected SimpleString for format", + format!("{value:?}"), + ))); + }; + if format.as_str() != "STRING" { + return Err(RedisError::from(( + ErrorKind::Parse, + "Expected STRING format", + format.to_string(), + ))); + } + } + "results" => { + let Value::Array(values) = value else { + return Err(RedisError::from(( + ErrorKind::Parse, + "Expected Array for results", + format!("{value:?}"), + ))); + }; + for raw_value in values { + let Value::Map(value) = raw_value else { + return Err(RedisError::from(( + ErrorKind::Parse, + "Expected list of maps in result", + format!("{raw_value:?}"), + ))); + }; + for (raw_map_key, raw_map_value) in value { + let Value::SimpleString(map_key) = raw_map_key else { + return Err(RedisError::from(( + ErrorKind::Parse, + "Expected SimpleString keys for result maps", + format!("{raw_key:?}"), + ))); + }; + match map_key.as_str() { + "extra_attributes" => { + let Value::Map(extra_attributes_values) = raw_map_value else { + return Err(RedisError::from(( + ErrorKind::Parse, + "Expected Map for extra_attributes", + format!("{raw_map_value:?}"), + ))); + }; + let mut output_array = vec![]; + for (e_key, e_value) in extra_attributes_values { + output_array.push(e_key.clone()); + output_array.push(e_value.clone()); + } + output.data.push_back(Value::Array(output_array)); + } + "values" => { + let Value::Array(values_values) = raw_map_value else { + return Err(RedisError::from(( + ErrorKind::Parse, + "Expected Array for values", + format!("{raw_map_value:?}"), + ))); + }; + if !values_values.is_empty() { + return Err(RedisError::from(( + ErrorKind::Parse, + 
"Expected empty values (all in extra_attributes)", + format!("{values_values:?}"), + ))); + } + } + _ => { + return Err(RedisError::from(( + ErrorKind::Parse, + "Unknown result map key", + format!("{map_key:?}"), + ))); + } + } + } + } + } + "total_results" => { + let Value::Int(total) = value else { + return Err(RedisError::from(( + ErrorKind::Parse, + "Expected int for total_results", + format!("{value:?}"), + ))); + }; + output.total = *total; + } + "warning" => { + let Value::Array(warnings) = value else { + return Err(RedisError::from(( + ErrorKind::Parse, + "Expected Array for warning", + format!("{value:?}"), + ))); + }; + if !warnings.is_empty() { + return Err(RedisError::from(( + ErrorKind::Parse, + "Expected empty warnings", + format!("{warnings:?}"), + ))); + } + } + _ => { + return Err(RedisError::from(( + ErrorKind::Parse, + "Unexpected key in ft.aggregate", + format!("{key} => {value:?}"), + ))); + } } - let mut output = Self::default(); - let value = value.into_array(); + } + Ok(()) +} + +impl TryFrom for RedisCursorData { + type Error = RedisError; + fn try_from(raw_value: Value) -> Result { + let Value::Array(value) = raw_value else { + error!( + ?raw_value, + "Bad data in ft.aggregate, expected array at top-level" + ); + return Err(RedisError::from((ErrorKind::Parse, "Expected array"))); + }; if value.len() < 2 { - return Err(RedisError::new( - RedisErrorKind::Protocol, + return Err(RedisError::from(( + ErrorKind::Parse, "Expected at least 2 elements", - )); + ))); } + let mut output = Self::default(); let mut value = value.into_iter(); - let data_ary = value.next().unwrap().into_array(); - if data_ary.is_empty() { - return Err(RedisError::new( - RedisErrorKind::Protocol, - "Expected at least 1 element in data array", - )); - } - let Some(total) = data_ary[0].as_u64() else { - return Err(RedisError::new( - RedisErrorKind::Protocol, - "Expected integer as first element", - )); - }; - output.total = total; - output.data.reserve(data_ary.len() - 1); 
- for map_data in data_ary.into_iter().skip(1) { - output.data.push_back(map_data.into_map()?); + match value.next().unwrap() { + Value::Array(d) => resp2_data_parse(&mut output, &d)?, + Value::Map(d) => resp3_data_parse(&mut output, &d)?, + other => { + error!( + ?other, + "Bad data in ft.aggregate, expected array for results" + ); + return Err(RedisError::from(( + ErrorKind::Parse, + "Non map item", + format!("{other:?}"), + ))); + } } - let Some(cursor) = value.next().unwrap().as_u64() else { - return Err(RedisError::new( - RedisErrorKind::Protocol, + let Value::Int(cursor) = value.next().unwrap() else { + return Err(RedisError::from(( + ErrorKind::Parse, "Expected integer as last element", - )); + ))); }; - output.cursor = cursor; + output.cursor = cursor as u64; Ok(output) } } diff --git a/nativelink-store/src/redis_utils/ft_create.rs b/nativelink-store/src/redis_utils/ft_create.rs new file mode 100644 index 000000000..79a8b6015 --- /dev/null +++ b/nativelink-store/src/redis_utils/ft_create.rs @@ -0,0 +1,78 @@ +// Copyright 2025 The NativeLink Authors. All rights reserved. +// +// Licensed under the Functional Source License, Version 1.1, Apache 2.0 Future License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// See LICENSE file for details +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use redis::RedisError; +use redis::aio::ConnectionLike; + +pub(crate) struct SearchSchema { + pub field_name: String, + pub sortable: bool, +} + +#[allow(clippy::struct_excessive_bools)] +pub(crate) struct FtCreateOptions { + pub prefixes: Vec, + pub nohl: bool, + pub nofields: bool, + pub nofreqs: bool, + pub nooffsets: bool, + pub temporary: Option, +} + +pub(crate) async fn ft_create( + mut connection_manager: C, + index: String, + options: FtCreateOptions, + schemas: Vec, +) -> Result<(), RedisError> +where + C: ConnectionLike + Send, +{ + let mut cmd = redis::cmd("FT.CREATE"); + let mut ft_create_cmd = cmd.arg(index).arg("ON").arg("HASH"); + if options.nohl { + ft_create_cmd = ft_create_cmd.arg("NOHL"); + } + if options.nofields { + ft_create_cmd = ft_create_cmd.arg("NOFIELDS"); + } + if options.nofreqs { + ft_create_cmd = ft_create_cmd.arg("NOFREQS"); + } + if options.nooffsets { + ft_create_cmd = ft_create_cmd.arg("NOOFFSETS"); + } + if let Some(seconds) = options.temporary { + ft_create_cmd = ft_create_cmd.arg("TEMPORARY").arg(seconds); + } + if !options.prefixes.is_empty() { + ft_create_cmd = ft_create_cmd.arg("PREFIX").arg(options.prefixes.len()); + for prefix in options.prefixes { + ft_create_cmd = ft_create_cmd.arg(prefix); + } + } + ft_create_cmd = ft_create_cmd.arg("SCHEMA"); + for schema in schemas { + ft_create_cmd = ft_create_cmd.arg(schema.field_name).arg("TAG"); + if schema.sortable { + ft_create_cmd = ft_create_cmd.arg("SORTABLE"); + } + } + + ft_create_cmd + .to_owned() + .exec_async(&mut connection_manager) + .await?; + Ok(()) +} diff --git a/nativelink-store/src/redis_utils/ft_cursor_read.rs b/nativelink-store/src/redis_utils/ft_cursor_read.rs new file mode 100644 index 000000000..eb1323cf8 --- /dev/null +++ b/nativelink-store/src/redis_utils/ft_cursor_read.rs @@ -0,0 +1,66 @@ +// Copyright 2025 The NativeLink Authors. All rights reserved. 
+// +// Licensed under the Functional Source License, Version 1.1, Apache 2.0 Future License (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// See LICENSE file for details +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use redis::aio::ConnectionLike; +use redis::{ErrorKind, RedisError, Value}; + +use crate::redis_utils::aggregate_types::RedisCursorData; + +pub(crate) async fn ft_cursor_read( + connection_manager: &mut C, + index: String, + cursor_id: u64, +) -> Result +where + C: ConnectionLike + Send, +{ + let mut cmd = redis::cmd("ft.cursor"); + let ft_cursor_cmd = cmd.arg("read").arg(index).cursor_arg(cursor_id); + let data = ft_cursor_cmd + .to_owned() + .query_async::(connection_manager) + .await?; + let Value::Array(value) = data else { + return Err(RedisError::from((ErrorKind::Parse, "Expected array"))); + }; + if value.len() < 2 { + return Err(RedisError::from(( + ErrorKind::Parse, + "Expected at least 2 elements", + ))); + } + let mut value = value.into_iter(); + let Value::Array(data_ary) = value.next().unwrap() else { + return Err(RedisError::from((ErrorKind::Parse, "Non map item"))); + }; + if data_ary.is_empty() { + return Err(RedisError::from(( + ErrorKind::Parse, + "Expected at least 1 element in data array", + ))); + } + let Value::Int(new_cursor_id) = value.next().unwrap() else { + return Err(RedisError::from(( + ErrorKind::Parse, + "Expected cursor id as second element", + ))); + }; + + Ok(RedisCursorData { + // this should generally be impossible, but -1 provides a decent "obviously bad" value just in case + total: i64::try_from(data_ary.len()).unwrap_or(-1), + cursor: 
new_cursor_id as u64, + data: data_ary.into(), + }) +} diff --git a/nativelink-store/src/redis_utils/mod.rs b/nativelink-store/src/redis_utils/mod.rs index 0f76773bc..230ee2f4f 100644 --- a/nativelink-store/src/redis_utils/mod.rs +++ b/nativelink-store/src/redis_utils/mod.rs @@ -12,5 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. +mod aggregate_types; mod ft_aggregate; -pub(crate) use ft_aggregate::ft_aggregate; +mod ft_create; +mod ft_cursor_read; +pub(crate) use ft_aggregate::{FtAggregateCursor, FtAggregateOptions, ft_aggregate}; +pub(crate) use ft_create::{FtCreateOptions, SearchSchema, ft_create}; diff --git a/nativelink-store/src/ref_store.rs b/nativelink-store/src/ref_store.rs index 41dfdfa5a..d432553f0 100644 --- a/nativelink-store/src/ref_store.rs +++ b/nativelink-store/src/ref_store.rs @@ -14,17 +14,18 @@ use core::cell::UnsafeCell; use core::pin::Pin; -use std::sync::{Arc, Mutex, Weak}; +use std::sync::{Arc, Weak}; use async_trait::async_trait; use nativelink_config::stores::RefSpec; -use nativelink_error::{Code, Error, ResultExt, make_err, make_input_err}; +use nativelink_error::{Error, ResultExt, make_input_err}; use nativelink_metric::MetricsComponent; use nativelink_util::buf_channel::{DropCloserReadHalf, DropCloserWriteHalf}; use nativelink_util::health_utils::{HealthStatusIndicator, default_health_status_indicator}; use nativelink_util::store_trait::{ RemoveItemCallback, Store, StoreDriver, StoreKey, StoreLike, UploadSizeInfo, }; +use parking_lot::Mutex; use tracing::error; use crate::store_manager::StoreManager; @@ -47,7 +48,7 @@ pub struct RefStore { name: String, store_manager: Weak, inner: StoreReference, - remove_callbacks: Mutex>>>, + remove_callbacks: Mutex>>, } impl RefStore { @@ -80,19 +81,14 @@ impl RefStore { } // This should protect us against multiple writers writing the same location at the same // time. 
- let _lock = self.inner.mux.lock().map_err(|e| { - make_err!( - Code::Internal, - "Failed to lock mutex in ref_store : {:?}", - e - ) - })?; + let _lock = self.inner.mux.lock(); let store_manager = self .store_manager .upgrade() .err_tip(|| "Store manager is gone")?; if let Some(store) = store_manager.get_store(&self.name) { - for callback in self.remove_callbacks.lock().unwrap().iter() { + let remove_callbacks = self.remove_callbacks.lock().clone(); + for callback in remove_callbacks { store.register_remove_callback(callback)?; } unsafe { @@ -158,15 +154,15 @@ impl StoreDriver for RefStore { fn register_remove_callback( self: Arc, - callback: &Arc>, + callback: Arc, ) -> Result<(), Error> { - self.remove_callbacks.lock()?.push(callback.clone()); + self.remove_callbacks.lock().push(callback.clone()); let ref_store = self.inner.cell.0.get(); unsafe { if let Some(ref store) = *ref_store { store.register_remove_callback(callback)?; } - }; + } Ok(()) } } diff --git a/nativelink-store/src/s3_store.rs b/nativelink-store/src/s3_store.rs index 65e2ad53c..a175a0b54 100644 --- a/nativelink-store/src/s3_store.rs +++ b/nativelink-store/src/s3_store.rs @@ -13,9 +13,7 @@ // limitations under the License. 
use core::cmp; -use core::future::Future; use core::pin::Pin; -use core::task::{Context, Poll}; use core::time::Duration; use std::borrow::Cow; use std::sync::Arc; @@ -23,7 +21,6 @@ use std::sync::Arc; use async_trait::async_trait; use aws_config::default_provider::credentials; use aws_config::provider_config::ProviderConfig; -use aws_config::retry::ErrorKind::TransientError; use aws_config::{AppName, BehaviorVersion}; use aws_sdk_s3::Client; use aws_sdk_s3::config::Region; @@ -32,26 +29,10 @@ use aws_sdk_s3::operation::get_object::GetObjectError; use aws_sdk_s3::operation::head_object::HeadObjectError; use aws_sdk_s3::primitives::ByteStream; // SdkBody use aws_sdk_s3::types::builders::{CompletedMultipartUploadBuilder, CompletedPartBuilder}; -use aws_smithy_runtime_api::client::http::{ - HttpClient as SmithyHttpClient, HttpConnector as SmithyHttpConnector, HttpConnectorFuture, - HttpConnectorSettings, SharedHttpConnector, -}; -use aws_smithy_runtime_api::client::orchestrator::HttpRequest; -use aws_smithy_runtime_api::client::result::ConnectorError; -use aws_smithy_runtime_api::client::runtime_components::RuntimeComponents; -use aws_smithy_runtime_api::http::Response; use aws_smithy_types::body::SdkBody; -use bytes::{Bytes, BytesMut}; use futures::future::FusedFuture; use futures::stream::{FuturesUnordered, unfold}; -use futures::{FutureExt, Stream, StreamExt, TryFutureExt, TryStreamExt}; -use http_body::{Frame, SizeHint}; -use http_body_util::BodyExt; -use hyper::{Method, Request}; -use hyper_rustls::{HttpsConnector, HttpsConnectorBuilder}; -use hyper_util::client::legacy::Client as LegacyClient; -use hyper_util::client::legacy::connect::HttpConnector as LegacyHttpConnector; -use hyper_util::rt::TokioExecutor; +use futures::{FutureExt, StreamExt, TryFutureExt, TryStreamExt}; use nativelink_config::stores::ExperimentalAwsSpec; // Note: S3 store should be very careful about the error codes it returns // when in a retryable wrapper. 
Always prefer Code::Aborted or another @@ -62,18 +43,19 @@ use nativelink_metric::MetricsComponent; use nativelink_util::buf_channel::{ DropCloserReadHalf, DropCloserWriteHalf, make_buf_channel_pair, }; -use nativelink_util::fs; use nativelink_util::health_utils::{HealthRegistryBuilder, HealthStatus, HealthStatusIndicator}; use nativelink_util::instant_wrapper::InstantWrapper; use nativelink_util::retry::{Retrier, RetryResult}; -use nativelink_util::store_trait::{RemoveItemCallback, StoreDriver, StoreKey, UploadSizeInfo}; +use nativelink_util::store_trait::{ + RemoveItemCallback, StoreDriver, StoreKey, StoreOptimizations, UploadSizeInfo, +}; use parking_lot::Mutex; -use rand::Rng; use tokio::sync::mpsc; use tokio::time::sleep; use tracing::{error, info}; use crate::cas_utils::is_zero_digest; +use crate::common_s3_utils::{BodyWrapper, TlsClient}; // S3 parts cannot be smaller than this number. See: // https://docs.aws.amazon.com/AmazonS3/latest/userguide/qfacts.html @@ -95,328 +77,6 @@ const DEFAULT_MAX_RETRY_BUFFER_PER_REQUEST: usize = 5 * 1024 * 1024; // 5MB. // Note: If you change this, adjust the docs in the config. 
const DEFAULT_MULTIPART_MAX_CONCURRENT_UPLOADS: usize = 10; -#[derive(Clone)] -pub struct TlsClient { - client: LegacyClient, SdkBody>, - retrier: Retrier, -} - -impl TlsClient { - #[must_use] - pub fn new( - spec: &ExperimentalAwsSpec, - jitter_fn: Arc Duration + Send + Sync>, - ) -> Self { - let connector_with_roots = HttpsConnectorBuilder::new().with_platform_verifier(); - - let connector_with_schemes = if spec.common.insecure_allow_http { - connector_with_roots.https_or_http() - } else { - connector_with_roots.https_only() - }; - - let connector = if spec.common.disable_http2 { - connector_with_schemes.enable_http1().build() - } else { - connector_with_schemes.enable_http1().enable_http2().build() - }; - - let client = LegacyClient::builder(TokioExecutor::new()).build(connector); - - Self { - client, - retrier: Retrier::new( - Arc::new(|duration| Box::pin(sleep(duration))), - jitter_fn, - spec.common.retry.clone(), - ), - } - } -} - -impl core::fmt::Debug for TlsClient { - fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> Result<(), core::fmt::Error> { - f.debug_struct("TlsClient").finish_non_exhaustive() - } -} - -impl SmithyHttpClient for TlsClient { - fn http_connector( - &self, - _settings: &HttpConnectorSettings, - _components: &RuntimeComponents, - ) -> SharedHttpConnector { - SharedHttpConnector::new(self.clone()) - } -} - -enum BufferedBodyState { - Cloneable(SdkBody), - Buffered(Bytes), - Empty, -} - -mod body_processing { - use super::{BodyExt, BufferedBodyState, BytesMut, ConnectorError, SdkBody, TransientError}; - - /// Buffer a request body fully into memory. - /// - /// TODO(palfrey): This could lead to OOMs in extremely constrained - /// environments. Probably better to implement something - /// like a rewindable stream logic. 
- #[inline] - pub(crate) async fn buffer_body(body: SdkBody) -> Result { - let mut bytes = BytesMut::new(); - let mut body_stream = body; - while let Some(frame) = body_stream.frame().await { - match frame { - Ok(frame) => { - if let Some(data) = frame.data_ref() { - bytes.extend_from_slice(data); - } - } - Err(e) => { - return Err(ConnectorError::other( - format!("Failed to read request body: {e}").into(), - Some(TransientError), - )); - } - } - } - - Ok(BufferedBodyState::Buffered(bytes.freeze())) - } -} - -struct RequestComponents { - method: Method, - uri: hyper::Uri, - version: hyper::Version, - headers: hyper::HeaderMap, - body_data: BufferedBodyState, -} - -mod conversions { - use super::{ - BufferedBodyState, ConnectorError, Future, HttpRequest, Method, RequestComponents, - Response, SdkBody, TransientError, body_processing, - }; - - pub(crate) trait RequestExt { - fn into_components(self) - -> impl Future>; - } - - impl RequestExt for HttpRequest { - async fn into_components(self) -> Result { - // Note: This does *not* refer the the HTTP protocol, but to the - // version of the http crate. - let hyper_req = self.try_into_http1x().map_err(|e| { - ConnectorError::other( - format!("Failed to convert to HTTP request: {e}").into(), - Some(TransientError), - ) - })?; - - let method = hyper_req.method().clone(); - let uri = hyper_req.uri().clone(); - let version = hyper_req.version(); - let headers = hyper_req.headers().clone(); - - let body = hyper_req.into_body(); - - // Only buffer bodies for methods likely to have payloads. - let needs_buffering = matches!(method, Method::POST | Method::PUT); - - // Preserve the body in case we need to retry. - let body_data = if needs_buffering { - if let Some(cloneable_body) = body.try_clone() { - BufferedBodyState::Cloneable(cloneable_body) - } else { - body_processing::buffer_body(body).await? 
- } - } else { - BufferedBodyState::Empty - }; - - Ok(RequestComponents { - method, - uri, - version, - headers, - body_data, - }) - } - } - - pub(crate) trait ResponseExt { - fn into_smithy_response(self) -> Response; - } - - impl ResponseExt for hyper::Response { - fn into_smithy_response(self) -> Response { - let (parts, body) = self.into_parts(); - let sdk_body = SdkBody::from_body_1_x(body); - let mut smithy_resp = Response::new(parts.status.into(), sdk_body); - let header_pairs: Vec<(String, String)> = parts - .headers - .iter() - .filter_map(|(name, value)| { - value - .to_str() - .ok() - .map(|value_str| (name.as_str().to_owned(), value_str.to_owned())) - }) - .collect(); - - for (name, value) in header_pairs { - smithy_resp.headers_mut().insert(name, value); - } - - smithy_resp - } - } -} - -struct RequestBuilder<'a> { - components: &'a RequestComponents, -} - -impl<'a> RequestBuilder<'a> { - #[inline] - const fn new(components: &'a RequestComponents) -> Self { - Self { components } - } - - #[inline] - #[allow(unused_qualifications, reason = "false positive on hyper::http::Error")] - fn build(&self) -> Result, hyper::http::Error> { - let mut req_builder = Request::builder() - .method(self.components.method.clone()) - .uri(self.components.uri.clone()) - .version(self.components.version); - - let headers_map = req_builder.headers_mut().unwrap(); - for (name, value) in &self.components.headers { - headers_map.insert(name, value.clone()); - } - - match &self.components.body_data { - BufferedBodyState::Cloneable(body) => { - let cloned_body = body.try_clone().expect("Body should be cloneable"); - req_builder.body(cloned_body) - } - BufferedBodyState::Buffered(bytes) => req_builder.body(SdkBody::from(bytes.clone())), - BufferedBodyState::Empty => req_builder.body(SdkBody::empty()), - } - } -} - -mod execution { - use super::conversions::ResponseExt; - use super::{ - Code, HttpsConnector, LegacyClient, LegacyHttpConnector, RequestBuilder, RequestComponents, - 
Response, RetryResult, SdkBody, fs, make_err, - }; - - #[inline] - pub(crate) async fn execute_request( - client: LegacyClient, SdkBody>, - components: &RequestComponents, - ) -> RetryResult> { - let _permit = match fs::get_permit().await { - Ok(permit) => permit, - Err(e) => { - return RetryResult::Retry(make_err!( - Code::Unavailable, - "Failed to acquire permit: {e}" - )); - } - }; - - let request = match RequestBuilder::new(components).build() { - Ok(req) => req, - Err(e) => { - return RetryResult::Err(make_err!( - Code::Internal, - "Failed to create request: {e}", - )); - } - }; - - match client.request(request).await { - Ok(resp) => RetryResult::Ok(resp.into_smithy_response()), - Err(e) => RetryResult::Retry(make_err!( - Code::Unavailable, - "Failed request in S3Store: {e}" - )), - } - } - - #[inline] - pub(crate) fn create_retry_stream( - client: LegacyClient, SdkBody>, - components: RequestComponents, - ) -> impl futures::Stream>> { - futures::stream::unfold(components, move |components| { - let client_clone = client.clone(); - async move { - let result = execute_request(client_clone, &components).await; - - Some((result, components)) - } - }) - } -} - -impl SmithyHttpConnector for TlsClient { - fn call(&self, req: HttpRequest) -> HttpConnectorFuture { - use conversions::RequestExt; - - let client = self.client.clone(); - let retrier = self.retrier.clone(); - - HttpConnectorFuture::new(Box::pin(async move { - let components = req.into_components().await?; - - let retry_stream = execution::create_retry_stream(client, components); - - match retrier.retry(retry_stream).await { - Ok(response) => Ok(response), - Err(e) => Err(ConnectorError::other( - format!("Connection failed after retries: {e}").into(), - Some(TransientError), - )), - } - })) - } -} - -#[derive(Debug)] -pub struct BodyWrapper { - reader: DropCloserReadHalf, - size: u64, -} - -impl http_body::Body for BodyWrapper { - type Data = Bytes; - type Error = std::io::Error; - - fn poll_frame( - self: 
Pin<&mut Self>, - cx: &mut Context<'_>, - ) -> Poll, Self::Error>>> { - let reader = Pin::new(&mut Pin::get_mut(self).reader); - reader - .poll_next(cx) - .map(|maybe_bytes_res| maybe_bytes_res.map(|res| res.map(Frame::data))) - } - - fn size_hint(&self) -> SizeHint { - SizeHint::with_exact(self.size) - } -} - #[derive(Debug, MetricsComponent)] pub struct S3Store { s3_client: Arc, @@ -433,7 +93,7 @@ pub struct S3Store { #[metric(help = "The number of concurrent uploads allowed for multipart uploads")] multipart_max_concurrent_uploads: usize, - remove_callbacks: Arc>>>>, + remove_callbacks: Mutex>>, } impl S3Store @@ -442,15 +102,9 @@ where NowFn: Fn() -> I + Send + Sync + Unpin + 'static, { pub async fn new(spec: &ExperimentalAwsSpec, now_fn: NowFn) -> Result, Error> { - let jitter_amt = spec.common.retry.jitter; - let jitter_fn = Arc::new(move |delay: Duration| { - if jitter_amt == 0. { - return delay; - } - delay.mul_f32(jitter_amt.mul_add(rand::rng().random::() - 0.5, 1.)) - }); + let jitter_fn = spec.common.retry.make_jitter_fn(); let s3_client = { - let http_client = TlsClient::new(&spec.clone(), jitter_fn.clone()); + let http_client = TlsClient::new(&spec.common.clone()); let credential_provider = credentials::DefaultCredentialsChain::builder() .configure( @@ -509,7 +163,7 @@ where .common .multipart_max_concurrent_uploads .map_or(DEFAULT_MULTIPART_MAX_CONCURRENT_UPLOADS, |v| v), - remove_callbacks: Arc::new(Mutex::new(vec![])), + remove_callbacks: Mutex::new(Vec::new()), })) } @@ -538,15 +192,14 @@ where let now_s = (self.now_fn)().unix_timestamp() as i64; if last_modified.secs() + self.consider_expired_after_s <= now_s { - let remove_callbacks = self.remove_callbacks.lock_arc(); - let borrow_key = local_digest.borrow(); - let callbacks = remove_callbacks + let remove_callbacks = self.remove_callbacks.lock().clone(); + let mut callbacks: FuturesUnordered<_> = remove_callbacks .iter() - .map(|callback| callback.callback(&borrow_key)) - .collect::>(); - for 
callback in callbacks { - callback.await; - } + .map(|callback| { + callback.callback(local_digest.borrow()) + }) + .collect(); + while callbacks.next().await.is_some() {} return Some((RetryResult::Ok(None), state)); } } @@ -609,6 +262,10 @@ where .await } + fn optimized_for(&self, optimization: StoreOptimizations) -> bool { + matches!(optimization, StoreOptimizations::LazyExistenceOnSync) + } + async fn update( self: Pin<&Self>, digest: StoreKey<'_>, @@ -998,9 +655,9 @@ where fn register_remove_callback( self: Arc, - callback: &Arc>, + callback: Arc, ) -> Result<(), Error> { - self.remove_callbacks.lock_arc().push(callback.clone()); + self.remove_callbacks.lock().push(callback); Ok(()) } } diff --git a/nativelink-store/src/shard_store.rs b/nativelink-store/src/shard_store.rs index 0ebbfe878..e59a05845 100644 --- a/nativelink-store/src/shard_store.rs +++ b/nativelink-store/src/shard_store.rs @@ -67,8 +67,11 @@ impl ShardStore { .stores .iter() .map(|shard_config| { - (u64::from(u32::MAX) * u64::from(shard_config.weight.unwrap_or(1)) / total_weight) - as u32 + u32::try_from( + u64::from(u32::MAX) * u64::from(shard_config.weight.unwrap_or(1)) + / total_weight, + ) + .unwrap_or(u32::MAX) }) .scan(0, |state, weight| { *state += weight; @@ -243,10 +246,10 @@ impl StoreDriver for ShardStore { fn register_remove_callback( self: Arc, - callback: &Arc>, + callback: Arc, ) -> Result<(), Error> { for store in &self.weights_and_stores { - store.store.register_remove_callback(callback)?; + store.store.register_remove_callback(callback.clone())?; } Ok(()) } diff --git a/nativelink-store/src/size_partitioning_store.rs b/nativelink-store/src/size_partitioning_store.rs index 23aed4c40..a959244b5 100644 --- a/nativelink-store/src/size_partitioning_store.rs +++ b/nativelink-store/src/size_partitioning_store.rs @@ -164,9 +164,10 @@ impl StoreDriver for SizePartitioningStore { fn register_remove_callback( self: Arc, - callback: &Arc>, + callback: Arc, ) -> Result<(), Error> { - 
self.lower_store.register_remove_callback(callback)?; + self.lower_store + .register_remove_callback(callback.clone())?; self.upper_store.register_remove_callback(callback)?; Ok(()) } diff --git a/nativelink-store/src/verify_store.rs b/nativelink-store/src/verify_store.rs index baebed857..04ba3a02f 100644 --- a/nativelink-store/src/verify_store.rs +++ b/nativelink-store/src/verify_store.rs @@ -233,7 +233,7 @@ impl StoreDriver for VerifyStore { fn register_remove_callback( self: Arc, - callback: &Arc>, + callback: Arc, ) -> Result<(), Error> { self.inner_store.register_remove_callback(callback) } diff --git a/nativelink-store/tests/compression_store_test.rs b/nativelink-store/tests/compression_store_test.rs index 230b47211..622d3b35f 100644 --- a/nativelink-store/tests/compression_store_test.rs +++ b/nativelink-store/tests/compression_store_test.rs @@ -297,7 +297,8 @@ async fn check_header_test() -> Result<(), Error> { ); let upload_size = reader.read_u32_le().await?; assert_eq!( - upload_size, MAX_SIZE_INPUT as u32, + u64::from(upload_size), + MAX_SIZE_INPUT, "Expected upload size to match" ); } @@ -452,7 +453,7 @@ async fn check_footer_test() -> Result<(), Error> { position_from_prev_index: v }) .to_vec(), - index_count: EXPECTED_INDEXES.len() as u32, + index_count: u32::try_from(EXPECTED_INDEXES.len()).unwrap_or(u32::MAX), uncompressed_data_size: data_len as u64, config: Lz4Config { block_size: BLOCK_SIZE @@ -509,3 +510,134 @@ async fn get_part_is_zero_digest() -> Result<(), Error> { Ok(()) } + +// Regression test for the bug where start_pos > end_pos in the slice operation +#[nativelink_test] +async fn regression_test_range_start_not_greater_than_end() -> Result<(), Error> { + // Create a store with a small block size to trigger multiple blocks + const BLOCK_SIZE: u32 = 64 * 1024; // 64KB, same as DEFAULT_BLOCK_SIZE + + let inner_store = MemoryStore::new(&MemorySpec::default()); + let store_owned = CompressionStore::new( + &CompressionSpec { + backend: 
StoreSpec::Memory(MemorySpec::default()), + compression_algorithm: nativelink_config::stores::CompressionAlgorithm::Lz4( + nativelink_config::stores::Lz4Config { + block_size: BLOCK_SIZE, + ..Default::default() + }, + ), + }, + Store::new(inner_store.clone()), + ) + .err_tip(|| "Failed to create compression store")?; + let store = Pin::new(&store_owned); + + // Create a large buffer that spans multiple blocks + let data_size = BLOCK_SIZE as usize * 3; // 3 blocks + let mut data = vec![0u8; data_size]; + let mut rng = SmallRng::seed_from_u64(42); + rng.fill(&mut data[..]); + + let digest = DigestInfo::try_new(VALID_HASH, data_size).unwrap(); + store.update_oneshot(digest, data.clone().into()).await?; + + // Try to read exactly at block boundaries with various offsets + let boundary = u64::from(BLOCK_SIZE); + + // These specific offsets test the case in the bug report where + // start_pos was 65536 and end_pos was 65535 + for (offset, length) in &[ + (boundary - 1, Some(2u64)), // Read across block boundary + (boundary, Some(1u64)), // Read exactly at block boundary + (boundary + 1, Some(10u64)), // Read just after block boundary + // Specifically test the case where offset >= block size + (u64::from(BLOCK_SIZE), Some(20u64)), + // Specifically test the case that caused the bug (65536 and 65535) + (u64::from(BLOCK_SIZE), Some(u64::from(BLOCK_SIZE) - 1)), + // More edge cases around the block boundary to thoroughly test the issue + (u64::from(BLOCK_SIZE) - 1, Some(1u64)), // Just before boundary + (u64::from(BLOCK_SIZE), Some(0u64)), // Zero length at boundary + (u64::from(BLOCK_SIZE), Some(u64::MAX)), // Unlimited length at boundary + (u64::from(BLOCK_SIZE) * 2, Some(u64::from(BLOCK_SIZE) - 1)), // Same issue at next block + ] { + // First test with get_part_unchunked + let result = store.get_part_unchunked(digest, *offset, *length).await; + + // The bug was causing a panic, so just checking that it doesn't panic + // means the fix is working + assert!( + 
result.is_ok(), + "Reading with get_part_unchunked at offset {offset} with length {length:?} should not fail" + ); + + let store_data = result.unwrap(); + + // Verify the data matches what we expect + let expected_len = cmp::min( + usize::try_from(length.unwrap_or(u64::MAX))?, + data.len().saturating_sub(usize::try_from(*offset)?), + ); + assert_eq!( + store_data.len(), + expected_len, + "Expected data length to match when reading at offset {} with length {:?}", + offset, + length + ); + + if expected_len > 0 { + let start = usize::try_from(*offset)?; + let end = start + expected_len; + assert_eq!( + &store_data[..], + &data[start..end], + "Expected data content to match when reading at offset {} with length {:?}", + offset, + length + ); + } + + // Now also test with the lower-level get_part method to ensure it doesn't panic + // This is closer to what the bytestream server would call + let (mut tx, mut rx) = make_buf_channel_pair(); + + // The error was happening in this method call + let get_part_result = store.get_part(digest, &mut tx, *offset, *length).await; + assert!( + get_part_result.is_ok(), + "Reading with get_part at offset {offset} with length {length:?} should not fail" + ); + + // Just to consume the stream and ensure it behaves as expected + let mut received_data = Vec::new(); + while let Ok(chunk) = rx.consume(Some(1024)).await { + if chunk.is_empty() { + break; + } + received_data.extend_from_slice(&chunk); + } + + assert_eq!( + received_data.len(), + expected_len, + "Expected get_part received data length to match when reading at offset {} with length {:?}", + offset, + length + ); + + if expected_len > 0 { + let start = usize::try_from(*offset)?; + let end = start + expected_len; + assert_eq!( + &received_data[..], + &data[start..end], + "Expected get_part data content to match when reading at offset {} with length {:?}", + offset, + length + ); + } + } + + Ok(()) +} diff --git a/nativelink-store/tests/fast_slow_store_test.rs 
b/nativelink-store/tests/fast_slow_store_test.rs index 0ea4be4f1..53dd12387 100644 --- a/nativelink-store/tests/fast_slow_store_test.rs +++ b/nativelink-store/tests/fast_slow_store_test.rs @@ -18,7 +18,7 @@ use std::sync::{Arc, Mutex}; use async_trait::async_trait; use bytes::Bytes; -use nativelink_config::stores::{FastSlowSpec, MemorySpec, NoopSpec, StoreSpec}; +use nativelink_config::stores::{FastSlowSpec, MemorySpec, NoopSpec, StoreDirection, StoreSpec}; use nativelink_error::{Code, Error, ResultExt, make_err}; use nativelink_macro::nativelink_test; use nativelink_metric::MetricsComponent; @@ -35,13 +35,18 @@ use rand::{Rng, SeedableRng}; const MEGABYTE_SZ: usize = 1024 * 1024; -fn make_stores() -> (Store, Store, Store) { +fn make_stores_direction( + fast_direction: StoreDirection, + slow_direction: StoreDirection, +) -> (Store, Store, Store) { let fast_store = Store::new(MemoryStore::new(&MemorySpec::default())); let slow_store = Store::new(MemoryStore::new(&MemorySpec::default())); let fast_slow_store = Store::new(FastSlowStore::new( &FastSlowSpec { fast: StoreSpec::Memory(MemorySpec::default()), slow: StoreSpec::Memory(MemorySpec::default()), + fast_direction, + slow_direction, }, fast_store.clone(), slow_store.clone(), @@ -49,6 +54,10 @@ fn make_stores() -> (Store, Store, Store) { (fast_slow_store, fast_store, slow_store) } +fn make_stores() -> (Store, Store, Store) { + make_stores_direction(StoreDirection::default(), StoreDirection::default()) +} + fn make_random_data(sz: usize) -> Vec { let mut value = vec![0u8; sz]; let mut rng = SmallRng::seed_from_u64(1); @@ -284,7 +293,7 @@ async fn drop_on_eof_completes_store_futures() -> Result<(), Error> { // Gets called in the slow store and we provide the data that's // sent to the upstream and the fast store. 
let bytes = length.unwrap_or_else(|| key.into_digest().size_bytes()) - offset; - let data = vec![0_u8; bytes as usize]; + let data = vec![0_u8; usize::try_from(bytes).unwrap_or(usize::MAX)]; writer.send(Bytes::copy_from_slice(&data)).await?; writer.send_eof() } @@ -303,7 +312,7 @@ async fn drop_on_eof_completes_store_futures() -> Result<(), Error> { fn register_remove_callback( self: Arc, - _callback: &Arc>, + _callback: Arc, ) -> Result<(), Error> { Ok(()) } @@ -339,6 +348,8 @@ async fn drop_on_eof_completes_store_futures() -> Result<(), Error> { &FastSlowSpec { fast: StoreSpec::Memory(MemorySpec::default()), slow: StoreSpec::Memory(MemorySpec::default()), + fast_direction: StoreDirection::default(), + slow_direction: StoreDirection::default(), }, fast_store, slow_store, @@ -380,6 +391,8 @@ async fn ignore_value_in_fast_store() -> Result<(), Error> { &FastSlowSpec { fast: StoreSpec::Memory(MemorySpec::default()), slow: StoreSpec::Memory(MemorySpec::default()), + fast_direction: StoreDirection::default(), + slow_direction: StoreDirection::default(), }, fast_store.clone(), slow_store, @@ -403,6 +416,8 @@ async fn has_checks_fast_store_when_noop() -> Result<(), Error> { let fast_slow_store_config = FastSlowSpec { fast: StoreSpec::Memory(MemorySpec::default()), slow: StoreSpec::Noop(NoopSpec::default()), + fast_direction: StoreDirection::default(), + slow_direction: StoreDirection::default(), }; let fast_slow_store = Arc::new(FastSlowStore::new( &fast_slow_store_config, @@ -437,3 +452,256 @@ async fn has_checks_fast_store_when_noop() -> Result<(), Error> { ); Ok(()) } + +#[nativelink_test] +async fn fast_get_only_not_updated() -> Result<(), Error> { + let (fast_slow_store, fast_store, slow_store) = + make_stores_direction(StoreDirection::Get, StoreDirection::Both); + let digest = DigestInfo::try_new(VALID_HASH, 100).unwrap(); + fast_slow_store + .update_oneshot(digest, make_random_data(100).into()) + .await?; + assert!( + fast_store.has(digest).await?.is_none(), + 
"Expected data to not be in the fast store" + ); + assert!( + slow_store.has(digest).await?.is_some(), + "Expected data in the slow store" + ); + Ok(()) +} + +#[nativelink_test] +async fn fast_readonly_only_not_updated() -> Result<(), Error> { + let (fast_slow_store, fast_store, slow_store) = + make_stores_direction(StoreDirection::ReadOnly, StoreDirection::Both); + let digest = DigestInfo::try_new(VALID_HASH, 100).unwrap(); + fast_slow_store + .update_oneshot(digest, make_random_data(100).into()) + .await?; + assert!( + fast_store.has(digest).await?.is_none(), + "Expected data to not be in the fast store" + ); + assert!( + slow_store.has(digest).await?.is_some(), + "Expected data in the slow store" + ); + Ok(()) +} + +#[nativelink_test] +async fn slow_readonly_only_not_updated() -> Result<(), Error> { + let (fast_slow_store, fast_store, slow_store) = + make_stores_direction(StoreDirection::Both, StoreDirection::ReadOnly); + let digest = DigestInfo::try_new(VALID_HASH, 100).unwrap(); + fast_slow_store + .update_oneshot(digest, make_random_data(100).into()) + .await?; + assert!( + fast_store.has(digest).await?.is_some(), + "Expected data to be in the fast store" + ); + assert!( + slow_store.has(digest).await?.is_none(), + "Expected data to not be in the slow store" + ); + Ok(()) +} + +#[nativelink_test] +async fn slow_get_only_not_updated() -> Result<(), Error> { + let (fast_slow_store, fast_store, slow_store) = + make_stores_direction(StoreDirection::Both, StoreDirection::Get); + let digest = DigestInfo::try_new(VALID_HASH, 100).unwrap(); + fast_slow_store + .update_oneshot(digest, make_random_data(100).into()) + .await?; + assert!( + fast_store.has(digest).await?.is_some(), + "Expected data to be in the fast store" + ); + assert!( + slow_store.has(digest).await?.is_none(), + "Expected data to not be in the slow store" + ); + Ok(()) +} + +#[nativelink_test] +async fn fast_put_only_not_updated() -> Result<(), Error> { + let (fast_slow_store, fast_store, slow_store) 
= + make_stores_direction(StoreDirection::Update, StoreDirection::Both); + let digest = DigestInfo::try_new(VALID_HASH, 100).unwrap(); + slow_store + .update_oneshot(digest, make_random_data(100).into()) + .await?; + fast_slow_store.get_part_unchunked(digest, 0, None).await?; + assert!( + fast_store.has(digest).await?.is_none(), + "Expected data to not be in the fast store" + ); + Ok(()) +} + +#[nativelink_test] +async fn fast_readonly_only_not_updated_on_get() -> Result<(), Error> { + let (fast_slow_store, fast_store, slow_store) = + make_stores_direction(StoreDirection::ReadOnly, StoreDirection::Both); + let digest = DigestInfo::try_new(VALID_HASH, 100).unwrap(); + slow_store + .update_oneshot(digest, make_random_data(100).into()) + .await?; + assert!( + !fast_slow_store + .get_part_unchunked(digest, 0, None) + .await? + .is_empty(), + "Data not found in slow store" + ); + assert!( + fast_store.has(digest).await?.is_none(), + "Expected data to not be in the fast store" + ); + assert!( + slow_store.has(digest).await?.is_some(), + "Expected data in the slow store" + ); + Ok(()) +} + +fn make_stores_with_lazy_slow() -> (Store, Store, Store) { + #[derive(MetricsComponent)] + struct LazyStore { + inner: Arc, + } + + #[async_trait] + impl StoreDriver for LazyStore { + async fn has_with_results( + self: Pin<&Self>, + digests: &[StoreKey<'_>], + results: &mut [Option], + ) -> Result<(), Error> { + Pin::new(self.inner.as_ref()) + .has_with_results(digests, results) + .await + } + + async fn update( + self: Pin<&Self>, + digest: StoreKey<'_>, + reader: nativelink_util::buf_channel::DropCloserReadHalf, + size_info: nativelink_util::store_trait::UploadSizeInfo, + ) -> Result<(), Error> { + Pin::new(self.inner.as_ref()) + .update(digest, reader, size_info) + .await + } + + async fn get_part( + self: Pin<&Self>, + key: StoreKey<'_>, + writer: &mut nativelink_util::buf_channel::DropCloserWriteHalf, + offset: u64, + length: Option, + ) -> Result<(), Error> { + 
Pin::new(self.inner.as_ref()) + .get_part(key, writer, offset, length) + .await + } + + fn optimized_for( + &self, + optimization: nativelink_util::store_trait::StoreOptimizations, + ) -> bool { + matches!( + optimization, + nativelink_util::store_trait::StoreOptimizations::LazyExistenceOnSync + ) + } + + fn inner_store(&self, _digest: Option) -> &'_ dyn StoreDriver { + self + } + + fn as_any(&self) -> &(dyn core::any::Any + Sync + Send + 'static) { + self + } + + fn as_any_arc(self: Arc) -> Arc { + self + } + + fn register_remove_callback( + self: Arc, + _callback: Arc, + ) -> Result<(), Error> { + Ok(()) + } + } + + default_health_status_indicator!(LazyStore); + + let fast_store = Store::new(MemoryStore::new(&MemorySpec::default())); + let slow_store = Store::new(Arc::new(LazyStore { + inner: MemoryStore::new(&MemorySpec::default()), + })); + let fast_slow_store = Store::new(FastSlowStore::new( + &FastSlowSpec { + fast: StoreSpec::Memory(MemorySpec::default()), + slow: StoreSpec::Memory(MemorySpec::default()), + fast_direction: StoreDirection::default(), + slow_direction: StoreDirection::default(), + }, + fast_store.clone(), + slow_store.clone(), + )); + (fast_slow_store, fast_store, slow_store) +} + +#[nativelink_test] +async fn lazy_not_found_returns_error_when_missing() -> Result<(), Error> { + let (fast_slow_store, _fast_store, _slow_store) = make_stores_with_lazy_slow(); + let digest = DigestInfo::try_new(VALID_HASH, 100).unwrap(); + + let result = fast_slow_store.get_part_unchunked(digest, 0, None).await; + + assert!(result.is_err(), "Expected error when key doesn't exist"); + assert_eq!( + result.unwrap_err().code, + Code::NotFound, + "Expected NotFound error code" + ); + Ok(()) +} + +#[nativelink_test] +async fn lazy_not_found_syncs_to_fast_store() -> Result<(), Error> { + let (fast_slow_store, fast_store, slow_store) = make_stores_with_lazy_slow(); + let original_data = make_random_data(100); + let digest = DigestInfo::try_new(VALID_HASH, 
original_data.len()).unwrap(); + + slow_store + .update_oneshot(digest, original_data.clone().into()) + .await?; + + assert!( + fast_store.has(digest).await?.is_none(), + "Expected data to not be in fast store initially" + ); + + let retrieved_data = fast_slow_store.get_part_unchunked(digest, 0, None).await?; + + assert_eq!( + retrieved_data.as_ref(), + original_data.as_slice(), + "Retrieved data should match" + ); + assert!( + fast_store.has(digest).await?.is_some(), + "Expected data to be synced to fast store" + ); + Ok(()) +} diff --git a/nativelink-store/tests/filesystem_store_test.rs b/nativelink-store/tests/filesystem_store_test.rs index e7377ff1f..7655de0c1 100644 --- a/nativelink-store/tests/filesystem_store_test.rs +++ b/nativelink-store/tests/filesystem_store_test.rs @@ -26,7 +26,7 @@ use bytes::Bytes; use futures::executor::block_on; use futures::task::Poll; use futures::{Future, FutureExt, poll}; -use nativelink_config::stores::FilesystemSpec; +use nativelink_config::stores::{EvictionPolicy, FilesystemSpec}; use nativelink_error::{Code, Error, ResultExt, make_err}; use nativelink_macro::nativelink_test; use nativelink_store::filesystem_store::{ @@ -41,16 +41,31 @@ use nativelink_util::{background_spawn, spawn}; use opentelemetry::context::{Context, FutureExt as OtelFutureExt}; use parking_lot::Mutex; use pretty_assertions::assert_eq; -use rand::Rng; +use rand::rngs::SmallRng; +use rand::{Rng, SeedableRng}; use sha2::{Digest, Sha256}; use tokio::io::{AsyncReadExt, AsyncSeekExt, AsyncWriteExt, Take}; -use tokio::sync::Barrier; +use tokio::sync::{Barrier, Semaphore}; use tokio::time::sleep; use tokio_stream::StreamExt; use tokio_stream::wrappers::ReadDirStream; use tracing::Instrument; +const VALID_HASH: &str = "0123456789abcdef000000000000000000010000000000000123456789abcdef"; + +fn make_random_data(sz: usize) -> Vec { + let mut value = vec![0u8; sz]; + let mut rng = SmallRng::seed_from_u64(1); + rng.fill(&mut value[..]); + value +} + trait FileEntryHooks 
{ + fn on_make_and_open( + _encoded_file_path: &EncodedFilePath, + ) -> impl Future> + Send { + core::future::ready(Ok(())) + } fn on_unref(_entry: &Fe) {} fn on_drop(_entry: &Fe) {} } @@ -84,6 +99,7 @@ impl FileEntry for TestFileEntry< block_size: u64, encoded_file_path: EncodedFilePath, ) -> Result<(Self, fs::FileSlot, OsString), Error> { + Hooks::on_make_and_open(&encoded_file_path).await?; let (inner, file_slot, path) = FileEntryImpl::make_and_open_file(block_size, encoded_file_path).await?; Ok(( @@ -220,9 +236,9 @@ async fn wait_for_no_open_files() -> Result<(), Error> { Ok(()) } -/// Helper function to ensure there are no temporary files left. -async fn check_temp_empty(temp_path: &str) -> Result<(), Error> { - let (_permit, temp_dir_handle) = fs::read_dir(format!("{temp_path}/{DIGEST_FOLDER}")) +/// Helper function to ensure there are no temporary or content files left. +async fn check_storage_dir_empty(storage_path: &str) -> Result<(), Error> { + let (_permit, temp_dir_handle) = fs::read_dir(format!("{storage_path}/{DIGEST_FOLDER}")) .await .err_tip(|| "Failed opening temp directory")? .into_inner(); @@ -237,7 +253,7 @@ async fn check_temp_empty(temp_path: &str) -> Result<(), Error> { ); } - let (_permit, temp_dir_handle) = fs::read_dir(format!("{temp_path}/{STR_FOLDER}")) + let (_permit, temp_dir_handle) = fs::read_dir(format!("{storage_path}/{STR_FOLDER}")) .await .err_tip(|| "Failed opening temp directory")? 
.into_inner(); @@ -325,7 +341,7 @@ async fn temp_files_get_deleted_on_replace_test() -> Result<(), Error> { FilesystemStore::>::new(&FilesystemSpec { content_path: content_path.clone(), temp_path: temp_path.clone(), - eviction_policy: Some(nativelink_config::stores::EvictionPolicy { + eviction_policy: Some(EvictionPolicy { max_count: 3, ..Default::default() }), @@ -374,7 +390,7 @@ async fn temp_files_get_deleted_on_replace_test() -> Result<(), Error> { "Dropped a filesystem_delete_file current_active_drop_spawns=0" )); - check_temp_empty(&temp_path).await + check_storage_dir_empty(&temp_path).await } // This test ensures that if a file is overridden and an open stream to the file already @@ -398,12 +414,13 @@ async fn file_continues_to_stream_on_content_replace_test() -> Result<(), Error> FilesystemStore::>::new(&FilesystemSpec { content_path: content_path.clone(), temp_path: temp_path.clone(), - eviction_policy: Some(nativelink_config::stores::EvictionPolicy { + eviction_policy: Some(EvictionPolicy { max_count: 3, ..Default::default() }), block_size: 1, read_buffer_size: 1, + ..Default::default() }) .await?, ); @@ -481,7 +498,7 @@ async fn file_continues_to_stream_on_content_replace_test() -> Result<(), Error> } // Now ensure our temp file was cleaned up. 
- check_temp_empty(&temp_path).await + check_storage_dir_empty(&temp_path).await } // Eviction has a different code path than a file replacement, so we check that if a @@ -506,12 +523,13 @@ async fn file_gets_cleans_up_on_cache_eviction() -> Result<(), Error> { FilesystemStore::>::new(&FilesystemSpec { content_path: content_path.clone(), temp_path: temp_path.clone(), - eviction_policy: Some(nativelink_config::stores::EvictionPolicy { + eviction_policy: Some(EvictionPolicy { max_count: 1, ..Default::default() }), block_size: 1, read_buffer_size: 1, + ..Default::default() }) .await?, ); @@ -577,7 +595,7 @@ async fn file_gets_cleans_up_on_cache_eviction() -> Result<(), Error> { } // Now ensure our temp file was cleaned up. - check_temp_empty(&temp_path).await + check_storage_dir_empty(&temp_path).await } // Test to ensure that if we are holding a reference to `FileEntry` and the contents are @@ -652,7 +670,7 @@ async fn eviction_on_insert_calls_unref_once() -> Result<(), Error> { FilesystemStore::>::new(&FilesystemSpec { content_path: make_temp_path("content_path"), temp_path: make_temp_path("temp_path"), - eviction_policy: Some(nativelink_config::stores::EvictionPolicy { + eviction_policy: Some(EvictionPolicy { max_bytes: 5, ..Default::default() }), @@ -799,7 +817,7 @@ async fn rename_on_insert_fails_due_to_filesystem_error_proper_cleanup_happens() // Now it should have cleaned up its temp files. { - check_temp_empty(&temp_path).await?; + check_storage_dir_empty(&temp_path).await?; } // Finally ensure that our entry is not in the store. 
@@ -901,32 +919,6 @@ async fn get_part_is_zero_digest() -> Result<(), Error> { #[nativelink_test] async fn has_with_results_on_zero_digests() -> Result<(), Error> { - async fn wait_for_empty_content_file< - Fut: Future>, - F: Fn() -> Fut, - >( - content_path: &str, - digest: DigestInfo, - yield_fn: F, - ) -> Result<(), Error> { - loop { - yield_fn().await?; - - let empty_digest_file_name = - OsString::from(format!("{content_path}/{DIGEST_FOLDER}/{digest}")); - - let file_metadata = fs::metadata(empty_digest_file_name) - .await - .err_tip(|| "Failed to open content file")?; - - // Test that the empty digest file is created and contains an empty length. - if file_metadata.is_file() && file_metadata.len() == 0 { - return Ok(()); - } - } - // Unreachable. - } - let digest = DigestInfo::new(Sha256::new().finalize().into(), 0); let content_path = make_temp_path("content_path"); let temp_path = make_temp_path("temp_path"); @@ -954,12 +946,93 @@ async fn has_with_results_on_zero_digests() -> Result<(), Error> { ); assert_eq!(results, vec![Some(0)]); - wait_for_empty_content_file(&content_path, digest, || async move { - tokio::task::yield_now().await; + check_storage_dir_empty(&content_path).await?; + + Ok(()) +} + +async fn wrap_update_zero_digest(updater: F) -> Result<(), Error> +where + F: AsyncFnOnce(DigestInfo, Arc) -> Result<(), Error>, +{ + let digest = DigestInfo::new(Sha256::new().finalize().into(), 0); + let content_path = make_temp_path("content_path"); + let temp_path = make_temp_path("temp_path"); + + let store = FilesystemStore::::new_with_timeout_and_rename_fn( + &FilesystemSpec { + content_path: content_path.clone(), + temp_path: temp_path.clone(), + read_buffer_size: 1, + ..Default::default() + }, + |from, to| std::fs::rename(from, to), + ) + .await?; + updater(digest, store).await?; + check_storage_dir_empty(&content_path).await?; + check_storage_dir_empty(&temp_path).await?; + Ok(()) +} + +#[nativelink_test] +async fn update_whole_file_with_zero_digest() 
-> Result<(), Error> { + wrap_update_zero_digest(async |digest, store| { + let temp_file_dir = make_temp_path("update_with_zero_digest"); + std::fs::create_dir_all(&temp_file_dir)?; + let temp_file_path = Path::new(&temp_file_dir).join("zero-length-file"); + std::fs::write(&temp_file_path, b"") + .err_tip(|| format!("Writing to {temp_file_path:?}"))?; + let file_slot = fs::open_file(&temp_file_path, 0, 0).await?.into_inner(); + store + .update_with_whole_file( + digest, + temp_file_path.into(), + file_slot, + UploadSizeInfo::ExactSize(0), + ) + .await?; Ok(()) }) + .await +} + +#[nativelink_test] +async fn update_oneshot_with_zero_digest() -> Result<(), Error> { + wrap_update_zero_digest(async |digest, store| store.update_oneshot(digest, Bytes::new()).await) + .await +} + +#[nativelink_test] +async fn update_with_zero_digest() -> Result<(), Error> { + wrap_update_zero_digest(async |digest, store| { + let (_writer, reader) = make_buf_channel_pair(); + store + .update(digest, reader, UploadSizeInfo::ExactSize(0)) + .await + }) + .await +} + +#[nativelink_test] +async fn get_file_entry_for_zero_digest() -> Result<(), Error> { + let digest = DigestInfo::new(Sha256::new().finalize().into(), 0); + let content_path = make_temp_path("content_path"); + let temp_path = make_temp_path("temp_path"); + + let store = FilesystemStore::::new_with_timeout_and_rename_fn( + &FilesystemSpec { + content_path: content_path.clone(), + temp_path: temp_path.clone(), + read_buffer_size: 1, + ..Default::default() + }, + |from, to| std::fs::rename(from, to), + ) .await?; + let file_entry = store.get_file_entry_for_digest(&digest).await?; + assert!(file_entry.is_empty()); Ok(()) } @@ -1247,3 +1320,148 @@ async fn update_with_whole_file_uses_same_inode() -> Result<(), Error> { Ok(()) } + +#[nativelink_test] +async fn file_slot_taken_when_ready() -> Result<(), Error> { + static FILE_SEMAPHORE: Semaphore = Semaphore::const_new(1); + static WRITER_SEMAPHORE: Semaphore = Semaphore::const_new(1); + 
static FILE_PERMIT: Mutex>> = Mutex::new(None); + static WRITER_PERMIT: Mutex>> = Mutex::new(None); + + struct SingleSemaphoreHooks; + impl FileEntryHooks for SingleSemaphoreHooks { + async fn on_make_and_open(_encoded_file_path: &EncodedFilePath) -> Result<(), Error> { + *FILE_PERMIT.lock() = + Some(FILE_SEMAPHORE.acquire().await.map_err(|e| { + make_err!(Code::Internal, "Unable to acquire semaphore: {e:?}") + })?); + // Drop the writer permit now that we have one. + WRITER_PERMIT.lock().take(); + Ok(()) + } + } + + *WRITER_PERMIT.lock() = Some(WRITER_SEMAPHORE.acquire().await.unwrap()); + *FILE_PERMIT.lock() = Some(FILE_SEMAPHORE.acquire().await.unwrap()); + + let content_path = make_temp_path("content_path"); + let temp_path = make_temp_path("temp_path"); + + let value_1: String = "x".repeat(1024); + let value_2: String = "y".repeat(1024); + + let digest_1 = DigestInfo::try_new(HASH1, value_1.len())?; + let digest_2 = DigestInfo::try_new(HASH2, value_2.len())?; + + let store = Box::pin( + FilesystemStore::>::new_with_timeout_and_rename_fn( + &FilesystemSpec { + content_path: content_path.clone(), + temp_path: temp_path.clone(), + read_buffer_size: 1, + ..Default::default() + }, + |from, to| std::fs::rename(from, to), + ) + .await?, + ); + + let value_1 = Bytes::from(value_1); + let value_2 = Bytes::from(value_2); + + let (mut writer_1, reader_1) = make_buf_channel_pair(); + let (mut writer_2, reader_2) = make_buf_channel_pair(); + let size_1 = UploadSizeInfo::ExactSize(value_1.len().try_into()?); + let size_2 = UploadSizeInfo::ExactSize(value_2.len().try_into()?); + let store_ref = &store; + let update_1_fut = async move { + let result = store_ref.update(digest_1, reader_1, size_1).await; + FILE_PERMIT.lock().take(); + result + }; + let update_2_fut = async move { + let result = store_ref.update(digest_2, reader_2, size_2).await; + FILE_PERMIT.lock().take(); + result + }; + + let writer_1_fut = async move { + let _permit = 
WRITER_SEMAPHORE.acquire().await.unwrap(); + writer_1.send(value_1.slice(0..1)).await?; + writer_1.send(value_1.slice(1..2)).await?; + writer_1.send(value_1.slice(2..3)).await?; + writer_1.send(value_1.slice(3..)).await?; + writer_1.send_eof()?; + Ok::<_, Error>(()) + }; + let writer_2_fut = async move { + writer_2.send(value_2.slice(0..1)).await?; + writer_2.send(value_2.slice(1..2)).await?; + writer_2.send(value_2.slice(2..3)).await?; + // Allow the update to get a file permit. + FILE_PERMIT.lock().take(); + writer_2.send(value_2.slice(3..)).await?; + writer_2.send_eof()?; + Ok::<_, Error>(()) + }; + + let (res_1, res_2, res_3, res_4) = tokio::time::timeout(Duration::from_secs(10), async move { + tokio::join!(update_1_fut, update_2_fut, writer_1_fut, writer_2_fut) + }) + .await + .map_err(|_| make_err!(Code::Internal, "Deadlock detected"))?; + res_1.merge(res_2).merge(res_3).merge(res_4) +} + +// If we insert a file larger than the max_bytes eviction policy, it should be safely +// evicted, without deadlocking. 
+#[nativelink_test] +async fn safe_small_safe_eviction() -> Result<(), Error> { + let store_spec = FilesystemSpec { + content_path: "/tmp/nativelink/safe_fs".into(), + temp_path: "/tmp/nativelink/safe_fs_temp".into(), + eviction_policy: Some(EvictionPolicy { + max_bytes: 1, + ..Default::default() + }), + ..Default::default() + }; + let store = Store::new(::new(&store_spec).await?); + + // > than the max_bytes + let bytes = 2; + + let data = make_random_data(bytes); + let digest = DigestInfo::try_new(VALID_HASH, data.len()).unwrap(); + + assert_eq!( + store.has(digest).await, + Ok(None), + "Expected data to not exist in store" + ); + + store.update_oneshot(digest, data.clone().into()).await?; + + assert_eq!( + store.has(digest).await, + Ok(None), + "Expected data to not exist in store, because eviction" + ); + + let (tx, mut rx) = make_buf_channel_pair(); + + assert_eq!( + store.get(digest, tx).await, + Err(Error { + code: Code::NotFound, + messages: vec![format!( + "{VALID_HASH}-{bytes} not found in filesystem store here" + )], + }), + "Expected data to not exist in store, because eviction" + ); + + assert!(rx.recv().await.is_err()); + + Ok(()) +} diff --git a/nativelink-store/tests/gcs_client_test.rs b/nativelink-store/tests/gcs_client_test.rs index 95cff00f8..22b4dd30e 100644 --- a/nativelink-store/tests/gcs_client_test.rs +++ b/nativelink-store/tests/gcs_client_test.rs @@ -237,7 +237,7 @@ async fn test_upload_from_reader() -> Result<(), Error> { let data_size = 100; let mut send_data = BytesMut::new(); for i in 0..data_size { - send_data.put_u8(((i % 93) + 33) as u8); + send_data.put_u8(u8::try_from((i % 93) + 33).expect("printable ASCII range")); } let send_data = send_data.freeze(); let (mut tx, rx) = make_buf_channel_pair(); diff --git a/nativelink-store/tests/gcs_store_test.rs b/nativelink-store/tests/gcs_store_test.rs index 287a1d3e5..ba42bf9b9 100644 --- a/nativelink-store/tests/gcs_store_test.rs +++ b/nativelink-store/tests/gcs_store_test.rs @@ -21,6 
+21,7 @@ use mock_instant::thread_local::MockClock; use nativelink_config::stores::{CommonObjectSpec, ExperimentalGcsSpec}; use nativelink_error::{Code, Error, make_err}; use nativelink_macro::nativelink_test; +use nativelink_store::cas_utils::ZERO_BYTE_DIGESTS; use nativelink_store::gcs_client::client::GcsOperations; use nativelink_store::gcs_client::mocks::{MockGcsOperations, MockRequest}; use nativelink_store::gcs_client::types::{DEFAULT_CONTENT_TYPE, ObjectPath}; @@ -182,7 +183,7 @@ async fn simple_update() -> Result<(), Error> { // Create test data let mut send_data = BytesMut::new(); for i in 0..DATA_SIZE { - send_data.put_u8(((i % 93) + 33) as u8); + send_data.put_u8(u8::try_from((i % 93) + 33).expect("printable ASCII range")); } let send_data = send_data.freeze(); @@ -230,6 +231,83 @@ async fn simple_update() -> Result<(), Error> { Ok(()) } +#[nativelink_test] +async fn update_zero_length() -> Result<(), Error> { + // Create mock GCS operations + let mock_ops = Arc::new(MockGcsOperations::new()); + let store = create_test_store(mock_ops.clone()).await?; + + let digest = ZERO_BYTE_DIGESTS[0]; + let store_key: StoreKey = to_store_key(digest); + let (mut tx, rx) = make_buf_channel_pair(); + + // Start update operation + let store_clone = store.clone(); + let update_fut = nativelink_util::spawn!("update_task", async move { + store_clone + .update(store_key, rx, UploadSizeInfo::ExactSize(0)) + .await + }); + + tx.send_eof()?; + update_fut.await??; + + // Verify the mock operations were called correctly + let requests = mock_ops.get_requests().await; + let write_requests = requests.iter().filter_map(|req| { + if let MockRequest::Write { + object_path, + content_len, + } = req + { + Some((object_path, content_len)) + } else { + None + } + }); + + assert_eq!(write_requests.count(), 0, "Expected no write request"); + + Ok(()) +} + +#[nativelink_test] +async fn update_zero_digest_with_data() -> Result<(), Error> { + const DATA_SIZE: usize = 50; + + // Create mock GCS 
operations + let mock_ops = Arc::new(MockGcsOperations::new()); + let store = create_test_store(mock_ops.clone()).await?; + + // Create test data + let mut send_data = BytesMut::new(); + for i in 0..DATA_SIZE { + send_data.put_u8(u8::try_from((i % 93) + 33).unwrap()); + } + let send_data = send_data.freeze(); + + let digest = ZERO_BYTE_DIGESTS[0]; + let store_key: StoreKey = to_store_key(digest); + let (mut tx, rx) = make_buf_channel_pair(); + + // Start update operation + let store_clone = store.clone(); + let update_fut = nativelink_util::spawn!("update_task", async move { + store_clone + .update(store_key, rx, UploadSizeInfo::ExactSize(DATA_SIZE as u64)) + .await + }); + + tx.send(send_data).await?; + tx.send_eof()?; + assert!( + update_fut.await?.is_err(), + "No error for zero byte digest with data" + ); + + Ok(()) +} + #[nativelink_test] async fn get_part_test() -> Result<(), Error> { // Create mock GCS operations @@ -426,7 +504,9 @@ async fn large_file_update_test() -> Result<(), Error> { let store = create_test_store(mock_ops.clone()).await?; // Create test data - let pattern: Vec = (0..100).map(|i| (i % 256) as u8).collect(); + let pattern: Vec = (0..100) + .map(|i| u8::try_from(i % 256).expect("modulo 256 fits in u8")) + .collect(); // Create a digest and channel pair let digest = DigestInfo::try_new(VALID_HASH1, DATA_SIZE as u64)?; @@ -578,7 +658,8 @@ async fn create_test_store_with_expiration( bucket: BUCKET_NAME.to_string(), common: CommonObjectSpec { key_prefix: Some(KEY_PREFIX.to_string()), - consider_expired_after_s: expiration_seconds as u32, + consider_expired_after_s: u32::try_from(expiration_seconds) + .expect("expiration_seconds exceeds u32::MAX"), ..Default::default() }, ..Default::default() diff --git a/nativelink-store/tests/grpc_store_test.rs b/nativelink-store/tests/grpc_store_test.rs new file mode 100644 index 000000000..85ab3be4e --- /dev/null +++ b/nativelink-store/tests/grpc_store_test.rs @@ -0,0 +1,45 @@ +use core::time::Duration; + 
+use nativelink_config::stores::{GrpcEndpoint, GrpcSpec, Retry, StoreType}; +use nativelink_error::Error; +use nativelink_macro::nativelink_test; +use nativelink_proto::build::bazel::remote::execution::v2::{ + FindMissingBlobsRequest, digest_function, +}; +use nativelink_store::grpc_store::GrpcStore; +use tokio::time::timeout; +use tonic::Request; + +#[nativelink_test] +async fn fast_find_missing_blobs() -> Result<(), Error> { + let spec = GrpcSpec { + instance_name: String::new(), + endpoints: vec![GrpcEndpoint { + address: "http://foobar".into(), + tls_config: None, + concurrency_limit: None, + connect_timeout_s: 0, + tcp_keepalive_s: 0, + http2_keepalive_interval_s: 0, + http2_keepalive_timeout_s: 0, + }], + store_type: StoreType::Cas, + retry: Retry::default(), + max_concurrent_requests: 0, + connections_per_endpoint: 0, + rpc_timeout_s: 1, + }; + let store = GrpcStore::new(&spec).await?; + let request = Request::new(FindMissingBlobsRequest { + instance_name: String::new(), + blob_digests: vec![], + digest_function: digest_function::Value::Sha256.into(), + }); + let res = timeout(Duration::from_secs(1), async move { + store.find_missing_blobs(request).await + }) + .await??; + let inner_res = res.into_inner(); + assert_eq!(inner_res.missing_blob_digests.len(), 0); + Ok(()) +} diff --git a/nativelink-store/tests/ontap_s3_existence_cache_store_test.rs b/nativelink-store/tests/ontap_s3_existence_cache_store_test.rs index 3b04c7a10..a94740d09 100644 --- a/nativelink-store/tests/ontap_s3_existence_cache_store_test.rs +++ b/nativelink-store/tests/ontap_s3_existence_cache_store_test.rs @@ -82,6 +82,7 @@ async fn create_test_store(mock_client: StaticReplayClient) -> Result Result<(), Error> { let mut send_data = BytesMut::with_capacity(CONTENT_LENGTH); for i in 0..CONTENT_LENGTH { - send_data.put_u8(((i % 93) + 33) as u8); // Printable characters only. 
+ let value = (i % 93) + 33; + send_data.put_u8(u8::try_from(value).expect("value always in u8 range")); } let send_data = send_data.freeze(); @@ -584,7 +585,8 @@ async fn multipart_update_large_cas() -> Result<(), Error> { let mut send_data = Vec::with_capacity(AC_ENTRY_SIZE); for i in 0..send_data.capacity() { - send_data.push(((i * 3) % 256) as u8); + let value = (i * 3) % 256; + send_data.push(u8::try_from(value).expect("value always in u8 range")); } let digest = DigestInfo::try_new(VALID_HASH1, send_data.len())?; diff --git a/nativelink-store/tests/redis_store_test.rs b/nativelink-store/tests/redis_store_test.rs index 62792bf77..1dca90517 100644 --- a/nativelink-store/tests/redis_store_test.rs +++ b/nativelink-store/tests/redis_store_test.rs @@ -1,4 +1,4 @@ -// Copyright 2024 The NativeLink Authors. All rights reserved. +// Copyright 2024-2025 The NativeLink Authors. All rights reserved. // // Licensed under the Functional Source License, Version 1.1, Apache 2.0 Future License (the "License"); // you may not use this file except in compliance with the License. @@ -13,37 +13,44 @@ // limitations under the License. 
use core::ops::RangeBounds; -use core::sync::atomic::{AtomicBool, Ordering}; -use std::collections::VecDeque; -use std::sync::{Arc, Mutex}; -use std::thread::panicking; +use core::time::Duration; +use std::collections::HashMap; +use std::sync::Arc; use bytes::{Bytes, BytesMut}; -use fred::bytes_utils::string::Str; -use fred::clients::SubscriberClient; -use fred::error::Error as RedisError; -use fred::mocks::{MockCommand, Mocks}; -use fred::prelude::{Builder, Pool as RedisPool}; -use fred::types::Value as RedisValue; -use fred::types::config::{Config as RedisConfig, PerformanceConfig}; -use nativelink_config::stores::RedisSpec; -use nativelink_error::{Code, Error}; +use futures::TryStreamExt; +use nativelink_config::stores::{RedisMode, RedisSpec}; +use nativelink_error::{Code, Error, ResultExt, make_err}; use nativelink_macro::nativelink_test; +use nativelink_redis_tester::{ + ReadOnlyRedis, add_lua_script, fake_redis_sentinel_master_stream, fake_redis_sentinel_stream, + fake_redis_stream, make_fake_redis_with_responses, +}; use nativelink_store::cas_utils::ZERO_BYTE_DIGESTS; -use nativelink_store::redis_store::RedisStore; +use nativelink_store::redis_store::{ + ClusterRedisManager, DEFAULT_MAX_CHUNK_UPLOADS_PER_UPDATE, DEFAULT_MAX_COUNT_PER_CURSOR, + LUA_VERSION_SET_SCRIPT, RedisStore, RedisSubscriptionManager, +}; use nativelink_util::buf_channel::make_buf_channel_pair; use nativelink_util::common::DigestInfo; use nativelink_util::health_utils::HealthStatus; -use nativelink_util::store_trait::{StoreKey, StoreLike, UploadSizeInfo}; +use nativelink_util::store_trait::{ + FalseValue, SchedulerCurrentVersionProvider, SchedulerIndexProvider, SchedulerStore, + SchedulerStoreDataProvider, SchedulerStoreDecodeTo, SchedulerStoreKeyProvider, StoreKey, + StoreLike, TrueValue, UploadSizeInfo, +}; use pretty_assertions::assert_eq; -use tokio::sync::watch; +use redis::{PushInfo, RedisError, Value}; +use redis_test::{MockCmd, MockRedisConnection}; +use tokio::time::{sleep, 
timeout}; +use tracing::{Instrument, info, info_span}; const VALID_HASH1: &str = "3031323334353637383961626364656630303030303030303030303030303030"; const TEMP_UUID: &str = "550e8400-e29b-41d4-a716-446655440000"; const DEFAULT_READ_CHUNK_SIZE: usize = 1024; -const DEFAULT_MAX_CHUNK_UPLOADS_PER_UPDATE: usize = 10; -const DEFAULT_SCAN_COUNT: u32 = 10_000; +const DEFAULT_SCAN_COUNT: usize = 10_000; +const DEFAULT_MAX_PERMITS: usize = 100; fn mock_uuid_generator() -> String { uuid::Uuid::parse_str(TEMP_UUID).unwrap().to_string() @@ -53,158 +60,57 @@ fn make_temp_key(final_name: &str) -> String { format!("temp-{TEMP_UUID}-{{{final_name}}}") } -#[derive(Debug)] -struct MockRedisBackend { - /// Commands we expect to encounter, and results we to return to the client. - // Commands are pushed from the back and popped from the front. - expected: Mutex)>>, - - tx: watch::Sender, - rx: watch::Receiver, - - failing: AtomicBool, -} - -impl Default for MockRedisBackend { - fn default() -> Self { - Self::new() - } -} - -impl MockRedisBackend { - fn new() -> Self { - let (tx, rx) = watch::channel(MockCommand { - cmd: "".into(), - subcommand: None, - args: vec![], - }); - Self { - expected: Mutex::default(), - tx, - rx, - failing: AtomicBool::new(false), - } - } - - fn expect(&self, command: MockCommand, result: Result) -> &Self { - self.expected.lock().unwrap().push_back((command, result)); - self - } - - async fn wait_for(&self, command: MockCommand) { - self.rx - .clone() - .wait_for(|cmd| *cmd == command) - .await - .expect("the channel isn't closed while the struct exists"); - } -} - -impl Mocks for MockRedisBackend { - fn process_command(&self, actual: MockCommand) -> Result { - self.tx - .send(actual.clone()) - .expect("the channel isn't closed while the struct exists"); - - let Some((expected, result)) = self.expected.lock().unwrap().pop_front() else { - // panic here -- this isn't a redis error, it's a test failure - self.failing.store(true, Ordering::Relaxed); - 
panic!("Didn't expect any more commands, but received {actual:?}"); - }; - - if actual != expected { - self.failing.store(true, Ordering::Relaxed); - assert_eq!( - actual, expected, - "mismatched command, received (left) but expected (right)" - ); - } - - result - } - - fn process_transaction(&self, commands: Vec) -> Result { - static MULTI: MockCommand = MockCommand { - cmd: Str::from_static("MULTI"), - subcommand: None, - args: Vec::new(), - }; - static EXEC: MockCommand = MockCommand { - cmd: Str::from_static("EXEC"), - subcommand: None, - args: Vec::new(), - }; - - let results = core::iter::once(MULTI.clone()) - .chain(commands) - .chain([EXEC.clone()]) - .map(|command| self.process_command(command)) - .collect::, RedisError>>()?; - - Ok(RedisValue::Array(results)) - } +async fn make_mock_store( + commands: Vec, +) -> RedisStore> { + make_mock_store_with_prefix(commands, String::new()).await } -impl Drop for MockRedisBackend { - fn drop(&mut self) { - if panicking() || self.failing.load(Ordering::Relaxed) { - // We're already failing, let's make debugging easier and let future devs solve problems one at a time. 
- return; - } - - let expected = self.expected.get_mut().unwrap(); - - if expected.is_empty() { - return; - } - - assert_eq!( - *expected, - VecDeque::new(), - "Didn't receive all expected commands, expected (left)" - ); - - // Panicking isn't enough inside a tokio task, we need to `exit(1)` - std::process::exit(1) - } +fn add_lua_version_script(mut responses: HashMap) -> HashMap { + add_lua_script( + &mut responses, + LUA_VERSION_SET_SCRIPT, + "b22b9926cbce9dd9ba97fa7ba3626f89feea1ed5", + ); + responses } -fn make_clients(mut builder: Builder) -> (RedisPool, SubscriberClient) { - const CONNECTION_POOL_SIZE: usize = 1; - let client_pool = builder - .set_performance_config(PerformanceConfig { - broadcast_channel_capacity: 4096, - ..Default::default() - }) - .build_pool(CONNECTION_POOL_SIZE) - .unwrap(); - - let subscriber_client = builder.build_subscriber_client().unwrap(); - (client_pool, subscriber_client) +async fn make_fake_redis() -> u16 { + make_fake_redis_with_responses(add_lua_version_script(fake_redis_stream())).await } -fn make_mock_store(mocks: &Arc) -> RedisStore { - make_mock_store_with_prefix(mocks, String::new()) +async fn fake_redis_sentinel_master_stream_with_script() -> u16 { + make_fake_redis_with_responses(add_lua_version_script(fake_redis_sentinel_master_stream())) + .await } -fn make_mock_store_with_prefix(mocks: &Arc, key_prefix: String) -> RedisStore { - let mut builder = Builder::default_centralized(); - let mocks = Arc::clone(mocks); - builder.set_config(RedisConfig { - mocks: Some(mocks), - ..Default::default() - }); - let (client_pool, subscriber_client) = make_clients(builder); +async fn make_mock_store_with_prefix( + mut commands: Vec, + key_prefix: String, +) -> RedisStore> { + commands.insert( + 0, + MockCmd::new( + redis::cmd("SCRIPT").arg("LOAD").arg(LUA_VERSION_SET_SCRIPT), + Ok("b22b9926cbce9dd9ba97fa7ba3626f89feea1ed5"), + ), + ); + let mock_connection = MockRedisConnection::new(commands); + let manager = 
ClusterRedisManager::new(mock_connection).await.unwrap(); + let (_tx, rx) = tokio::sync::mpsc::unbounded_channel(); RedisStore::new_from_builder_and_parts( - client_pool, - subscriber_client, None, mock_uuid_generator, key_prefix, DEFAULT_READ_CHUNK_SIZE, DEFAULT_MAX_CHUNK_UPLOADS_PER_UPDATE, DEFAULT_SCAN_COUNT, + DEFAULT_MAX_PERMITS, + DEFAULT_MAX_COUNT_PER_CURSOR, + rx, + manager, ) + .await .unwrap() } @@ -212,79 +118,54 @@ fn make_mock_store_with_prefix(mocks: &Arc, key_prefix: String async fn upload_and_get_data() -> Result<(), Error> { // Construct the data we want to send. Since it's small, we expect it to be sent in a single chunk. let data = Bytes::from_static(b"14"); - let chunk_data = RedisValue::Bytes(data.clone()); // Construct a digest for our data and create a key based on that digest. let digest = DigestInfo::try_new(VALID_HASH1, 2)?; let packed_hash_hex = format!("{digest}"); // Construct our Redis store with a mocked out backend. - let temp_key = RedisValue::Bytes(make_temp_key(&packed_hash_hex).into()); - let real_key = RedisValue::Bytes(packed_hash_hex.into()); + let temp_key = make_temp_key(&packed_hash_hex); + let real_key = packed_hash_hex; - let mocks = Arc::new(MockRedisBackend::new()); - - // The first set of commands are for setting the data. - mocks + let commands = vec![ + // The first set of commands are for setting the data. // Append the real value to the temp key. 
- .expect( - MockCommand { - cmd: Str::from_static("SETRANGE"), - subcommand: None, - args: vec![temp_key.clone(), 0.into(), chunk_data], - }, - Ok(RedisValue::Array(vec![RedisValue::Null])), - ) - .expect( - MockCommand { - cmd: Str::from_static("STRLEN"), - subcommand: None, - args: vec![temp_key.clone()], - }, - Ok(RedisValue::Array(vec![RedisValue::Integer( - data.len() as i64 - )])), - ) + MockCmd::new( + redis::cmd("SETRANGE") + .arg(temp_key.clone()) + .arg(0) + .arg(data.to_vec()), + Ok(Value::Int(0)), + ), + MockCmd::new( + redis::cmd("STRLEN").arg(temp_key.clone()), + Ok(Value::Int(data.len() as i64)), + ), // Move the data from the fake key to the real key. - .expect( - MockCommand { - cmd: Str::from_static("RENAME"), - subcommand: None, - args: vec![temp_key, real_key.clone()], - }, - Ok(RedisValue::Array(vec![RedisValue::Null])), - ); - - // The second set of commands are for retrieving the data from the key. - mocks + MockCmd::new( + redis::cmd("RENAME") + .arg(temp_key.clone()) + .arg(real_key.clone()), + Ok(Value::Nil), + ), + // The second set of commands are for retrieving the data from the key. // Check that the key exists. - .expect( - MockCommand { - cmd: Str::from_static("STRLEN"), - subcommand: None, - args: vec![real_key.clone()], - }, - Ok(RedisValue::Integer(2)), - ) - .expect( - MockCommand { - cmd: Str::from_static("EXISTS"), - subcommand: None, - args: vec![real_key.clone()], - }, - Ok(RedisValue::Integer(1)), - ) + MockCmd::with_values( + redis::pipe() + .cmd("STRLEN") + .arg(real_key.clone()) + .cmd("EXISTS") + .arg(real_key.clone()), + Ok(vec![Value::Int(2), Value::Boolean(true)]), + ), // Retrieve the data from the real key. 
- .expect( - MockCommand { - cmd: Str::from_static("GETRANGE"), - subcommand: None, - args: vec![real_key, RedisValue::Integer(0), RedisValue::Integer(1)], - }, - Ok(RedisValue::String(Str::from_static("14"))), - ); + MockCmd::new( + redis::cmd("GETRANGE").arg(real_key).arg(0).arg(1), + Ok(Value::BulkString(b"14".to_vec())), + ), + ]; - let store = make_mock_store(&mocks); + let store = make_mock_store(commands).await; store.update_oneshot(digest, data.clone()).await.unwrap(); @@ -307,70 +188,46 @@ async fn upload_and_get_data() -> Result<(), Error> { #[nativelink_test] async fn upload_and_get_data_with_prefix() -> Result<(), Error> { let data = Bytes::from_static(b"14"); - let chunk_data = RedisValue::Bytes(data.clone()); let prefix = "TEST_PREFIX-"; let digest = DigestInfo::try_new(VALID_HASH1, 2)?; let packed_hash_hex = format!("{prefix}{digest}"); - let temp_key = RedisValue::Bytes(make_temp_key(&packed_hash_hex).into()); - let real_key = RedisValue::Bytes(packed_hash_hex.into()); - - let mocks = Arc::new(MockRedisBackend::new()); - mocks - .expect( - MockCommand { - cmd: Str::from_static("SETRANGE"), - subcommand: None, - args: vec![temp_key.clone(), 0.into(), chunk_data], - }, - Ok(RedisValue::Array(vec![RedisValue::Null])), - ) - .expect( - MockCommand { - cmd: Str::from_static("STRLEN"), - subcommand: None, - args: vec![temp_key.clone()], - }, - Ok(RedisValue::Array(vec![RedisValue::Integer( - data.len() as i64 - )])), - ) - .expect( - MockCommand { - cmd: Str::from_static("RENAME"), - subcommand: None, - args: vec![temp_key, real_key.clone()], - }, - Ok(RedisValue::Array(vec![RedisValue::Null])), - ) - .expect( - MockCommand { - cmd: Str::from_static("STRLEN"), - subcommand: None, - args: vec![real_key.clone()], - }, - Ok(RedisValue::Integer(2)), - ) - .expect( - MockCommand { - cmd: Str::from_static("EXISTS"), - subcommand: None, - args: vec![real_key.clone()], - }, - Ok(RedisValue::Integer(1)), - ) - .expect( - MockCommand { - cmd: 
Str::from_static("GETRANGE"), - subcommand: None, - args: vec![real_key, RedisValue::Integer(0), RedisValue::Integer(1)], - }, - Ok(RedisValue::String(Str::from_static("14"))), - ); - - let store = make_mock_store_with_prefix(&mocks, prefix.to_string()); + let temp_key = make_temp_key(&packed_hash_hex); + let real_key = packed_hash_hex; + + let commands = vec![ + MockCmd::new( + redis::cmd("SETRANGE") + .arg(temp_key.clone()) + .arg(0) + .arg(data.clone().to_vec()), + Ok(Value::Int(0)), + ), + MockCmd::new( + redis::cmd("STRLEN").arg(temp_key.clone()), + Ok(Value::Int(data.len() as i64)), + ), + MockCmd::new( + redis::cmd("RENAME").arg(temp_key).arg(real_key.clone()), + Ok(Value::Nil), + ), + MockCmd::with_values( + redis::pipe() + .cmd("STRLEN") + .arg(real_key.clone()) + .cmd("EXISTS") + .arg(real_key.clone()), + Ok(vec![Value::Int(2), Value::Boolean(true)]), + ), + MockCmd::new( + redis::cmd("GETRANGE").arg(real_key).arg(0).arg(1), + Ok(Value::BulkString(b"14".to_vec())), + ), + ]; + + let store = make_mock_store_with_prefix(commands, prefix.to_string()).await; store.update_oneshot(digest, data.clone()).await.unwrap(); @@ -395,8 +252,8 @@ async fn upload_empty_data() -> Result<(), Error> { let data = Bytes::from_static(b""); let digest = ZERO_BYTE_DIGESTS[0]; - let mocks = Arc::new(MockRedisBackend::new()); - let store = make_mock_store(&mocks); + let commands = vec![]; + let store = make_mock_store(commands).await; store.update_oneshot(digest, data).await.unwrap(); let result = store.has(digest).await.unwrap(); @@ -414,8 +271,8 @@ async fn upload_empty_data_with_prefix() -> Result<(), Error> { let digest = ZERO_BYTE_DIGESTS[0]; let prefix = "TEST_PREFIX-"; - let mocks = Arc::new(MockRedisBackend::new()); - let store = make_mock_store_with_prefix(&mocks, prefix.to_string()); + let commands = vec![]; + let store = make_mock_store_with_prefix(commands, prefix.to_string()).await; store.update_oneshot(digest, data).await.unwrap(); let result = 
store.has(digest).await.unwrap(); @@ -435,83 +292,56 @@ async fn test_large_downloads_are_chunked() -> Result<(), Error> { let digest = DigestInfo::try_new(VALID_HASH1, 1)?; let packed_hash_hex = format!("{digest}"); - let temp_key = RedisValue::Bytes(make_temp_key(&packed_hash_hex).into()); - let real_key = RedisValue::Bytes(packed_hash_hex.into()); - - let mocks = Arc::new(MockRedisBackend::new()); - - mocks - .expect( - MockCommand { - cmd: Str::from_static("SETRANGE"), - subcommand: None, - args: vec![temp_key.clone(), 0.into(), data.clone().into()], - }, - Ok(RedisValue::Array(vec![RedisValue::Null])), - ) - .expect( - MockCommand { - cmd: Str::from_static("STRLEN"), - subcommand: None, - args: vec![temp_key.clone()], - }, - Ok(RedisValue::Array(vec![RedisValue::Integer( - data.len() as i64 - )])), - ) - .expect( - MockCommand { - cmd: Str::from_static("RENAME"), - subcommand: None, - args: vec![temp_key, real_key.clone()], - }, - Ok(RedisValue::Array(vec![RedisValue::Null])), - ) - .expect( - MockCommand { - cmd: Str::from_static("STRLEN"), - subcommand: None, - args: vec![real_key.clone()], - }, - Ok(RedisValue::Integer(data.len().try_into().unwrap())), - ) - .expect( - MockCommand { - cmd: Str::from_static("EXISTS"), - subcommand: None, - args: vec![real_key.clone()], - }, - Ok(RedisValue::Integer(1)), - ) - .expect( - MockCommand { - cmd: Str::from_static("GETRANGE"), - subcommand: None, - args: vec![ - real_key.clone(), - RedisValue::Integer(0), - // We expect to be asked for data from `0..READ_CHUNK_SIZE`, but since GETRANGE is inclusive - // the actual call should be from `0..=(READ_CHUNK_SIZE - 1)`. - RedisValue::Integer(READ_CHUNK_SIZE as i64 - 1), - ], - }, - Ok(RedisValue::Bytes(data.slice(..READ_CHUNK_SIZE))), - ) - .expect( - MockCommand { - cmd: Str::from_static("GETRANGE"), - subcommand: None, - args: vec![ - real_key, - RedisValue::Integer(READ_CHUNK_SIZE as i64), - // Similar GETRANCE index shenanigans here. 
- RedisValue::Integer(data.len() as i64 - 1), - ], - }, - Ok(RedisValue::Bytes(data.slice(READ_CHUNK_SIZE..))), - ); - - let store = make_mock_store(&mocks); + let temp_key = make_temp_key(&packed_hash_hex); + let real_key = packed_hash_hex; + + let commands = vec![ + MockCmd::new( + redis::cmd("SETRANGE") + .arg(temp_key.clone()) + .arg(0) + .arg(data.clone().to_vec()), + Ok(Value::Int(0)), + ), + MockCmd::new( + redis::cmd("STRLEN").arg(temp_key.clone()), + Ok(Value::Int(data.len() as i64)), + ), + MockCmd::new( + redis::cmd("RENAME").arg(temp_key).arg(real_key.clone()), + Ok(Value::Nil), + ), + MockCmd::with_values( + redis::pipe() + .cmd("STRLEN") + .arg(real_key.clone()) + .cmd("EXISTS") + .arg(real_key.clone()), + Ok(vec![ + Value::Int(data.len().try_into().unwrap()), + Value::Int(1), + ]), + ), + MockCmd::new( + // We expect to be asked for data from `0..READ_CHUNK_SIZE`, but since GETRANGE is inclusive + // the actual call should be from `0..=(READ_CHUNK_SIZE - 1)`. + redis::cmd("GETRANGE") + .arg(real_key.clone()) + .arg(0) + .arg(READ_CHUNK_SIZE as i64 - 1), + Ok(Value::BulkString(data.slice(..READ_CHUNK_SIZE).into())), + ), + MockCmd::new( + // Similar GETRANGE index shenanigans here. 
+ redis::cmd("GETRANGE") + .arg(real_key) + .arg(READ_CHUNK_SIZE as i64) + .arg(data.len() as i64 - 1), + Ok(Value::BulkString(data.slice(READ_CHUNK_SIZE..).into())), + ), + ]; + + let store = make_mock_store(commands).await; store.update_oneshot(digest, data.clone()).await.unwrap(); @@ -547,106 +377,65 @@ async fn yield_between_sending_packets_in_update() -> Result<(), Error> { let digest = DigestInfo::try_new(VALID_HASH1, 2)?; let packed_hash_hex = format!("{digest}"); - let temp_key = RedisValue::Bytes(make_temp_key(&packed_hash_hex).into()); - let real_key = RedisValue::Bytes(packed_hash_hex.into()); + let temp_key = make_temp_key(&packed_hash_hex); + let real_key = packed_hash_hex; - let mocks = Arc::new(MockRedisBackend::new()); - let first_append = MockCommand { - cmd: Str::from_static("SETRANGE"), - subcommand: None, - args: vec![temp_key.clone(), 0.into(), data_p1.clone().into()], - }; - - mocks + let commands = vec![ // We expect multiple `"SETRANGE"`s as we send data in multiple chunks - .expect( - first_append.clone(), - Ok(RedisValue::Array(vec![RedisValue::Null])), - ) - .expect( - MockCommand { - cmd: Str::from_static("SETRANGE"), - subcommand: None, - args: vec![ - temp_key.clone(), - data_p1.len().try_into().unwrap(), - data_p2.clone().into(), - ], - }, - Ok(RedisValue::Array(vec![RedisValue::Null])), - ) - .expect( - MockCommand { - cmd: Str::from_static("STRLEN"), - subcommand: None, - args: vec![temp_key.clone()], - }, - Ok(RedisValue::Array(vec![RedisValue::Integer( - data.len() as i64 - )])), - ) - .expect( - MockCommand { - cmd: Str::from_static("RENAME"), - subcommand: None, - args: vec![temp_key, real_key.clone()], - }, - Ok(RedisValue::Array(vec![RedisValue::Null])), - ) - .expect( - MockCommand { - cmd: Str::from_static("STRLEN"), - subcommand: None, - args: vec![real_key.clone()], - }, - Ok(RedisValue::Integer(2)), - ) - .expect( - MockCommand { - cmd: Str::from_static("EXISTS"), - subcommand: None, - args: vec![real_key.clone()], - }, - 
Ok(RedisValue::Integer(1)), - ) - .expect( - MockCommand { - cmd: Str::from_static("GETRANGE"), - subcommand: None, - args: vec![ - real_key.clone(), - RedisValue::Integer(0), - RedisValue::Integer((DEFAULT_READ_CHUNK_SIZE - 1) as i64), - ], - }, - Ok(RedisValue::Bytes(data.clone())), - ) - .expect( - MockCommand { - cmd: Str::from_static("GETRANGE"), - subcommand: None, - args: vec![ - real_key.clone(), - RedisValue::Integer(DEFAULT_READ_CHUNK_SIZE as i64), - RedisValue::Integer((DEFAULT_READ_CHUNK_SIZE * 2 - 1) as i64), - ], - }, - Ok(RedisValue::Bytes(data.clone())), - ) - .expect( - MockCommand { - cmd: Str::from_static("GETRANGE"), - subcommand: None, - args: vec![ - real_key, - RedisValue::Integer((DEFAULT_READ_CHUNK_SIZE * 2) as i64), - RedisValue::Integer((data_p1.len() + data_p2.len() - 1) as i64), - ], - }, - Ok(RedisValue::Bytes(data.clone())), - ); - - let store = make_mock_store(&mocks); + MockCmd::new( + redis::cmd("SETRANGE") + .arg(temp_key.clone()) + .arg(0) + .arg(data_p1.clone().to_vec()), + Ok(Value::Int(0)), + ), + MockCmd::new( + redis::cmd("SETRANGE") + .arg(temp_key.clone()) + .arg(data_p1.len()) + .arg(data_p2.clone().to_vec()), + Ok(Value::Int(0)), + ), + MockCmd::new( + redis::cmd("STRLEN").arg(temp_key.clone()), + Ok(Value::Int(data.len() as i64)), + ), + MockCmd::new( + redis::cmd("RENAME").arg(temp_key).arg(real_key.clone()), + Ok(Value::Nil), + ), + MockCmd::with_values( + redis::pipe() + .cmd("STRLEN") + .arg(real_key.clone()) + .cmd("EXISTS") + .arg(real_key.clone()), + Ok(vec![Value::Int(2), Value::Int(1)]), + ), + MockCmd::new( + redis::cmd("GETRANGE") + .arg(real_key.clone()) + .arg(0) + .arg((DEFAULT_READ_CHUNK_SIZE - 1) as i64), + Ok(Value::BulkString(data.clone().to_vec())), + ), + MockCmd::new( + redis::cmd("GETRANGE") + .arg(real_key.clone()) + .arg(DEFAULT_READ_CHUNK_SIZE as i64) + .arg((DEFAULT_READ_CHUNK_SIZE * 2 - 1) as i64), + Ok(Value::BulkString(data.clone().to_vec())), + ), + MockCmd::new( + redis::cmd("GETRANGE") + 
.arg(real_key) + .arg((DEFAULT_READ_CHUNK_SIZE * 2) as i64) + .arg((data_p1.len() + data_p2.len() - 1) as i64), + Ok(Value::BulkString(data.clone().to_vec())), + ), + ]; + + let store = make_mock_store(commands).await; let (mut tx, rx) = make_buf_channel_pair(); @@ -661,7 +450,6 @@ async fn yield_between_sending_packets_in_update() -> Result<(), Error> { }, async { tx.send(data_p1).await.unwrap(); - mocks.wait_for(first_append).await; tx.send(data_p2).await.unwrap(); tx.send_eof().unwrap(); Ok::<_, Error>(()) @@ -688,40 +476,29 @@ async fn yield_between_sending_packets_in_update() -> Result<(), Error> { // Regression test for: https://github.com/TraceMachina/nativelink/issues/1286 #[nativelink_test] async fn zero_len_items_exist_check() -> Result<(), Error> { - let mocks = Arc::new(MockRedisBackend::new()); - let digest = DigestInfo::try_new(VALID_HASH1, 0)?; let packed_hash_hex = format!("{digest}"); - let real_key = RedisValue::Bytes(packed_hash_hex.into()); - - mocks - .expect( - MockCommand { - cmd: Str::from_static("GETRANGE"), - subcommand: None, - args: vec![ - real_key.clone(), - RedisValue::Integer(0), - // We expect to be asked for data from `0..READ_CHUNK_SIZE`, but since GETRANGE is inclusive - // the actual call should be from `0..=(READ_CHUNK_SIZE - 1)`. 
- RedisValue::Integer(DEFAULT_READ_CHUNK_SIZE as i64 - 1), - ], - }, - Ok(RedisValue::String(Str::from_static(""))), - ) - .expect( - MockCommand { - cmd: Str::from_static("EXISTS"), - subcommand: None, - args: vec![real_key], - }, - Ok(RedisValue::Integer(0)), - ); + let real_key = packed_hash_hex; - let store = make_mock_store(&mocks); + let commands = vec![ + MockCmd::new( + redis::cmd("GETRANGE") + .arg(real_key.clone()) + .arg(0) + .arg(DEFAULT_READ_CHUNK_SIZE as i64 - 1), + Ok(Value::BulkString(vec![])), + ), + MockCmd::new(redis::cmd("EXISTS").arg(real_key), Ok(Value::Int(0))), + ]; + + let store = make_mock_store(commands).await; let result = store.get_part_unchunked(digest, 0, None).await; - assert_eq!(result.unwrap_err().code, Code::NotFound); + assert_eq!( + result.as_ref().unwrap_err().code, + Code::NotFound, + "{result:?}" + ); Ok(()) } @@ -729,7 +506,7 @@ async fn zero_len_items_exist_check() -> Result<(), Error> { #[nativelink_test] async fn list_test() -> Result<(), Error> { async fn get_list( - store: &RedisStore, + store: &RedisStore>, range: impl RangeBounds> + Send + Sync + 'static, ) -> Vec> { let mut found_keys = vec![]; @@ -747,79 +524,80 @@ async fn list_test() -> Result<(), Error> { const KEY2: StoreKey = StoreKey::new_str("key2"); const KEY3: StoreKey = StoreKey::new_str("key3"); - let command = MockCommand { - cmd: Str::from_static("SCAN"), - subcommand: None, - args: vec![ - RedisValue::String(Str::from_static("0")), - RedisValue::String(Str::from_static("MATCH")), - RedisValue::String(Str::from_static("key*")), - RedisValue::String(Str::from_static("COUNT")), - RedisValue::Integer(10000), - ], - }; - let command_open = MockCommand { - cmd: Str::from_static("SCAN"), - subcommand: None, - args: vec![ - RedisValue::String(Str::from_static("0")), - RedisValue::String(Str::from_static("MATCH")), - RedisValue::String(Str::from_static("*")), - RedisValue::String(Str::from_static("COUNT")), - RedisValue::Integer(10000), - ], - }; - let result = 
Ok(RedisValue::Array(vec![ - RedisValue::String(Str::from_static("0")), - RedisValue::Array(vec![ - RedisValue::String(Str::from_static("key1")), - RedisValue::String(Str::from_static("key2")), - RedisValue::String(Str::from_static("key3")), - ]), - ])); - - let mocks = Arc::new(MockRedisBackend::new()); - mocks - .expect(command_open.clone(), result.clone()) - .expect(command_open.clone(), result.clone()) - .expect(command.clone(), result.clone()) - .expect(command.clone(), result.clone()) - .expect(command.clone(), result.clone()) - .expect(command_open.clone(), result.clone()) - .expect(command.clone(), result.clone()) - .expect(command_open, result); - - let store = make_mock_store(&mocks); - - // Test listing all keys. + #[allow(clippy::unnecessary_wraps)] // because that's what MockCmd wants + fn result() -> Result { + Ok(Value::Array(vec![ + Value::BulkString(b"key1".to_vec()), + Value::BulkString(b"key2".to_vec()), + Value::BulkString(b"key3".to_vec()), + ])) + } + + fn command() -> MockCmd { + MockCmd::new( + redis::cmd("SCAN") + .arg("0") + .arg("MATCH") + .arg("key*") + .arg("COUNT") + .arg(10000), + result(), + ) + } + fn command_open() -> MockCmd { + MockCmd::new( + redis::cmd("SCAN") + .arg("0") + .arg("MATCH") + .arg("*") + .arg("COUNT") + .arg(10000), + result(), + ) + } + + let commands = vec![ + command_open(), + command_open(), + command(), + command(), + command(), + command_open(), + command(), + command(), + ]; + + let store = make_mock_store(commands).await; + + info!("Test listing all keys"); let keys = get_list(&store, ..).await; assert_eq!(keys, vec![KEY1, KEY2, KEY3]); - // Test listing from key1 to all. + info!("Test listing from key1 to all"); let keys = get_list(&store, KEY1..).await; assert_eq!(keys, vec![KEY1, KEY2, KEY3]); - // Test listing from key1 to key2. + info!("Test listing from key1 to key2"); let keys = get_list(&store, KEY1..KEY2).await; assert_eq!(keys, vec![KEY1]); - // Test listing from key1 including key2. 
+ info!("Test listing from key1 including key2"); let keys = get_list(&store, KEY1..=KEY2).await; assert_eq!(keys, vec![KEY1, KEY2]); - // Test listing from key1 to key3. + info!("Test listing from key1 to key3"); let keys = get_list(&store, KEY1..KEY3).await; assert_eq!(keys, vec![KEY1, KEY2]); - // Test listing from all to key2. + info!("Test listing from all to key2"); let keys = get_list(&store, ..KEY2).await; assert_eq!(keys, vec![KEY1]); - // Test listing from key2 to key3. + info!("Test listing from key2 to key3"); let keys = get_list(&store, KEY2..KEY3).await; assert_eq!(keys, vec![KEY2]); - // Test listing with reversed bounds. + info!("Test listing with reversed bounds"); let keys = get_list(&store, KEY3..=KEY1).await; assert_eq!(keys, vec![]); @@ -829,8 +607,8 @@ async fn list_test() -> Result<(), Error> { // Prevent regressions to https://reviewable.io/reviews/TraceMachina/nativelink/1188#-O2pu9LV5ux4ILuT6MND #[nativelink_test] async fn dont_loop_forever_on_empty() -> Result<(), Error> { - let mocks = Arc::new(MockRedisBackend::new()); - let store = make_mock_store(&mocks); + let commands = vec![]; + let store = make_mock_store(commands).await; let digest = DigestInfo::try_new(VALID_HASH1, 2).unwrap(); let (tx, rx) = make_buf_channel_pair(); @@ -851,30 +629,36 @@ async fn dont_loop_forever_on_empty() -> Result<(), Error> { #[nativelink_test] fn test_connection_errors() { + // name is resolvable, but not connectable let spec = RedisSpec { - addresses: vec!["redis://non-existent-server:6379/".to_string()], + addresses: vec!["redis://nativelink.com:6379/".to_string()], + connection_timeout_ms: 1000, ..Default::default() }; - let store = RedisStore::new(spec).expect("Working spec"); - let err = store - .has("1234") + let err = RedisStore::new_standard(spec) .await - .expect_err("Wanted connection error"); - assert_eq!(err.messages.len(), 2); - // err.messages[0] varies a bit, always something about lookup failures + .expect_err("Shouldn't have connected"); 
assert_eq!( - err.messages[1], - "Connection issue connecting to redis server with hosts: [\"non-existent-server:6379\"], username: None, database: 0" + Error { + code: Code::DeadlineExceeded, + messages: vec![ + "Io: timed out".into(), + format!("While connecting to redis with url: redis://nativelink.com:6379/") + ] + }, + err ); } #[nativelink_test] -fn test_health() { +async fn test_health() { + let port = make_fake_redis().await; let spec = RedisSpec { - addresses: vec!["redis://nativelink.com:6379/".to_string()], + addresses: vec![format!("redis://127.0.0.1:{port}/")], + command_timeout_ms: 1000, ..Default::default() }; - let store = RedisStore::new(spec).expect("Working spec"); + let store = RedisStore::new_standard(spec).await.expect("Working spec"); match store.check_health(std::borrow::Cow::Borrowed("foo")).await { HealthStatus::Ok { struct_name: _, @@ -886,17 +670,717 @@ fn test_health() { struct_name, message, } => { - assert_eq!(struct_name, "nativelink_store::redis_store::RedisStore"); assert_eq!( - message, - "Store.update_oneshot() failed: Error { code: DeadlineExceeded, messages: [\"Timeout Error: Request timed out.\", \"Connection issue connecting to redis server with hosts: [\\\"nativelink.com:6379\\\"], username: None, database: 0\"] }" + struct_name, + "nativelink_store::redis_store::RedisStore>" + ); + assert!( + message.starts_with("Store.update_oneshot() failed: Error { code: DeadlineExceeded, messages: [\"Io: timed out\", \"While appending to temp key ("), + "message: '{message}'" ); - assert!(logs_contain( - "check_health Store.update_oneshot() failed e=Error { code: DeadlineExceeded, messages: [\"Timeout Error: Request timed out.\", \"Connection issue connecting to redis server with hosts: [\\\"nativelink.com:6379\\\"], username: None, database: 0\"] }" - )); + logs_assert(|logs| { + for log in logs { + if log.contains("check_health Store.update_oneshot() failed e=Error { code: DeadlineExceeded, messages: [\"Io: timed out\", \"While 
appending to temp key (") { + return Ok(()) + } + } + Err(format!("No check_health log! {logs:?}")) + }); } health_result => { panic!("Other result: {health_result:?}"); } } } + +#[nativelink_test] +async fn test_deprecated_broadcast_channel_capacity() { + let port = make_fake_redis().await; + let spec = RedisSpec { + addresses: vec![format!("redis://127.0.0.1:{port}/")], + broadcast_channel_capacity: 1, + ..Default::default() + }; + RedisStore::new_standard(spec).await.expect("Working spec"); + + assert!(logs_contain( + "broadcast_channel_capacity in Redis spec is deprecated and ignored" + )); +} + +#[nativelink_test] +async fn test_sentinel_connect() { + let redis_span = info_span!("redis"); + let redis_port = fake_redis_sentinel_master_stream_with_script() + .instrument(redis_span) + .await; + let sentinel_span = info_span!("sentinel"); + let sentinel_port = + make_fake_redis_with_responses(fake_redis_sentinel_stream("master", redis_port)) + .instrument(sentinel_span) + .await; + let spec = RedisSpec { + addresses: vec![format!("redis+sentinel://127.0.0.1:{sentinel_port}/")], + mode: RedisMode::Sentinel, + ..Default::default() + }; + RedisStore::new_standard(spec).await.expect("Working spec"); +} + +#[nativelink_test] +async fn test_sentinel_connect_with_bad_master() { + // Note this is a fake redis port, which is fine because the sentinel code never connects to it + let port = make_fake_redis_with_responses(fake_redis_sentinel_stream("other_name", 1234)).await; + let spec = RedisSpec { + addresses: vec![format!("redis+sentinel://127.0.0.1:{port}/")], + mode: RedisMode::Sentinel, + connection_timeout_ms: 100, + ..Default::default() + }; + assert_eq!( + Error { + code: Code::InvalidArgument, + messages: vec![ + "MasterNameNotFoundBySentinel: Master with given name not found in sentinel - MasterNameNotFoundBySentinel".into(), + format!("While connecting to redis with url: redis+sentinel://127.0.0.1:{port}/") + ] + }, + 
RedisStore::new_standard(spec).await.unwrap_err() + ); +} + +#[nativelink_test] +async fn test_sentinel_connect_and_update_oneshot_readonly() { + let redis_span = info_span!("redis"); + + let redis_port = ReadOnlyRedis::new().run().instrument(redis_span).await; + let sentinel_span = info_span!("sentinel"); + let sentinel_port = + make_fake_redis_with_responses(fake_redis_sentinel_stream("master", redis_port)) + .instrument(sentinel_span) + .await; + let spec = RedisSpec { + addresses: vec![format!("redis+sentinel://127.0.0.1:{sentinel_port}/")], + mode: RedisMode::Sentinel, + ..Default::default() + }; + let mut raw_store = + Arc::into_inner(RedisStore::new_standard(spec).await.expect("Working spec")).unwrap(); + raw_store.replace_temp_name_generator(mock_uuid_generator); + let store = Arc::new(raw_store); + store + .update_oneshot("abcd", Bytes::from_static(b"hello")) + .await + .expect("working update"); +} + +#[nativelink_test] +async fn test_sentinel_connect_and_update_data_unversioned_readonly() { + let redis_span = info_span!("redis"); + + let redis_port = ReadOnlyRedis::new().run().instrument(redis_span).await; + let sentinel_span = info_span!("sentinel"); + let sentinel_port = + make_fake_redis_with_responses(fake_redis_sentinel_stream("master", redis_port)) + .instrument(sentinel_span) + .await; + let spec = RedisSpec { + addresses: vec![format!("redis+sentinel://127.0.0.1:{sentinel_port}/")], + mode: RedisMode::Sentinel, + ..Default::default() + }; + let mut raw_store = + Arc::into_inner(RedisStore::new_standard(spec).await.expect("Working spec")).unwrap(); + raw_store.replace_temp_name_generator(mock_uuid_generator); + let store = Arc::new(raw_store); + let data = TestSchedulerDataUnversioned { + key: "test:scheduler_key_1".to_string(), + content: "Test scheduler data #1".to_string(), + version: 0, + }; + store.update_data(data).await.expect("working update"); +} + +#[nativelink_test] +async fn test_sentinel_connect_and_update_data_versioned_readonly() { 
+ let redis_span = info_span!("redis"); + + let redis_port = ReadOnlyRedis::new().run().instrument(redis_span).await; + let sentinel_span = info_span!("sentinel"); + let sentinel_port = + make_fake_redis_with_responses(fake_redis_sentinel_stream("master", redis_port)) + .instrument(sentinel_span) + .await; + let spec = RedisSpec { + addresses: vec![format!("redis+sentinel://127.0.0.1:{sentinel_port}/")], + mode: RedisMode::Sentinel, + ..Default::default() + }; + let mut raw_store = + Arc::into_inner(RedisStore::new_standard(spec).await.expect("Working spec")).unwrap(); + raw_store.replace_temp_name_generator(mock_uuid_generator); + let store = Arc::new(raw_store); + let data = TestSchedulerDataVersioned { + key: "test:scheduler_key_1".to_string(), + content: "Test scheduler data #1".to_string(), + version: 0, + }; + store.update_data(data).await.expect("working update"); +} + +#[nativelink_test] +async fn test_sentinel_connect_with_url_specified_master() { + let redis_port = fake_redis_sentinel_master_stream_with_script() + .instrument(info_span!("redis")) + .await; + let port = + make_fake_redis_with_responses(fake_redis_sentinel_stream("specific_master", redis_port)) + .instrument(info_span!("sentinel")) + .await; + let spec = RedisSpec { + addresses: vec![format!( + "redis+sentinel://127.0.0.1:{port}/?sentinelServiceName=specific_master" + )], + mode: RedisMode::Sentinel, + connection_timeout_ms: 100, + ..Default::default() + }; + RedisStore::new_standard(spec).await.expect("Working spec"); +} + +#[nativelink_test] +async fn test_redis_connect_timeout() { + let port = make_fake_redis_with_responses(HashMap::new()).await; + let spec = RedisSpec { + addresses: vec![format!("redis://127.0.0.1:{port}/")], + connection_timeout_ms: 1, + ..Default::default() + }; + assert_eq!( + Error { + code: Code::DeadlineExceeded, + messages: vec![ + "Io: timed out".into(), + format!("While connecting to redis with url: redis://127.0.0.1:{port}/") + ] + }, + 
RedisStore::new_standard(spec).await.unwrap_err() + ); +} + +#[nativelink_test] +async fn test_connect_other_db() { + let redis_port = make_fake_redis().await; + let spec = RedisSpec { + addresses: vec![format!("redis://127.0.0.1:{redis_port}/3")], + ..Default::default() + }; + RedisStore::new_standard(spec).await.expect("Working spec"); +} + +#[nativelink_test] +async fn test_sentinel_connect_other_db() { + let redis_span = info_span!("redis"); + let redis_port = fake_redis_sentinel_master_stream_with_script() + .instrument(redis_span) + .await; + let sentinel_span = info_span!("sentinel"); + let sentinel_port = + make_fake_redis_with_responses(fake_redis_sentinel_stream("master", redis_port)) + .instrument(sentinel_span) + .await; + let spec = RedisSpec { + addresses: vec![format!("redis+sentinel://127.0.0.1:{sentinel_port}/3")], + mode: RedisMode::Sentinel, + connection_timeout_ms: 5_000, + command_timeout_ms: 5_000, + ..Default::default() + }; + RedisStore::new_standard(spec).await.expect("Working spec"); +} + +struct SearchByContentPrefix { + prefix: String, +} + +// Define test structures that implement the scheduler traits +#[derive(Debug, Clone, PartialEq)] +struct TestSchedulerDataUnversioned { + key: String, + content: String, + version: i64, +} + +impl SchedulerStoreDecodeTo for TestSchedulerDataUnversioned { + type DecodeOutput = Self; + + fn decode(version: i64, data: Bytes) -> Result { + let content = String::from_utf8(data.to_vec()) + .map_err(|e| make_err!(Code::InvalidArgument, "Invalid UTF-8 data: {e}"))?; + // We don't have the key in the data, so we'll use a placeholder + Ok(Self { + key: "decoded".to_string(), + content, + version, + }) + } +} + +impl SchedulerStoreKeyProvider for TestSchedulerDataUnversioned { + type Versioned = FalseValue; // Using unversioned storage + + fn get_key(&self) -> StoreKey<'static> { + StoreKey::Str(std::borrow::Cow::Owned(self.key.clone())) + } +} + +impl SchedulerStoreDataProvider for 
TestSchedulerDataUnversioned { + fn try_into_bytes(self) -> Result { + Ok(Bytes::from(self.content.into_bytes())) + } + + fn get_indexes(&self) -> Result, Error> { + // Add some test indexes - need to use 'static strings + Ok(vec![ + ("test_index", Bytes::from("test_value")), + ( + "content_prefix", + Bytes::from(self.content.chars().take(10).collect::()), + ), + ]) + } +} + +// Define test structures that implement the scheduler traits +#[derive(Debug, Clone, PartialEq)] +struct TestSchedulerDataVersioned { + key: String, + content: String, + version: i64, +} + +impl SchedulerStoreKeyProvider for TestSchedulerDataVersioned { + type Versioned = TrueValue; // Using versioned storage + + fn get_key(&self) -> StoreKey<'static> { + StoreKey::Str(std::borrow::Cow::Owned(self.key.clone())) + } +} + +impl SchedulerStoreDataProvider for TestSchedulerDataVersioned { + fn try_into_bytes(self) -> Result { + Ok(Bytes::from(self.content.into_bytes())) + } + + fn get_indexes(&self) -> Result, Error> { + // Add some test indexes - need to use 'static strings + Ok(vec![ + ("test_index", Bytes::from("test_value")), + ( + "content_prefix", + Bytes::from(self.content.chars().take(10).collect::()), + ), + ]) + } +} + +impl SchedulerCurrentVersionProvider for TestSchedulerDataVersioned { + fn current_version(&self) -> i64 { + 0 + } +} + +struct TestSchedulerKey; + +impl SchedulerStoreDecodeTo for TestSchedulerKey { + type DecodeOutput = TestSchedulerDataUnversioned; + + fn decode(version: i64, data: Bytes) -> Result { + TestSchedulerDataUnversioned::decode(version, data) + } +} + +impl SchedulerIndexProvider for SearchByContentPrefix { + const KEY_PREFIX: &'static str = "test:"; + const INDEX_NAME: &'static str = "content_prefix"; + type Versioned = TrueValue; + + const MAYBE_SORT_KEY: Option<&'static str> = Some("sort_key"); + + fn index_value(&self) -> std::borrow::Cow<'_, str> { + std::borrow::Cow::Borrowed(&self.prefix) + } +} + +impl SchedulerStoreKeyProvider for 
SearchByContentPrefix { + type Versioned = TrueValue; + + fn get_key(&self) -> StoreKey<'static> { + StoreKey::Str(std::borrow::Cow::Owned("dummy_key".to_string())) + } +} + +impl SchedulerStoreDecodeTo for SearchByContentPrefix { + type DecodeOutput = TestSchedulerDataUnversioned; + + fn decode(version: i64, data: Bytes) -> Result { + TestSchedulerKey::decode(version, data) + } +} + +#[nativelink_test] +fn test_search_by_index() -> Result<(), Error> { + fn make_ft_aggregate() -> MockCmd { + MockCmd::new( + redis::cmd("FT.AGGREGATE") + .arg("test:_content_prefix_sort_key_3e762c15") + .arg("@content_prefix:{ Searchable }") + .arg("LOAD") + .arg(2) + .arg("data") + .arg("version") + .arg("WITHCURSOR") + .arg("COUNT") + .arg(1500) + .arg("MAXIDLE") + .arg(30000) + .arg("SORTBY") + .arg(2usize) + .arg("@sort_key") + .arg("ASC"), + Ok(Value::Array(vec![ + Value::Array(vec![ + Value::Int(1), + Value::Array(vec![ + Value::BulkString(b"data".to_vec()), + Value::BulkString(b"1234".to_vec()), + Value::BulkString(b"version".to_vec()), + Value::BulkString(b"1".to_vec()), + ]), + ]), + Value::Int(0), + ])), + ) + } + + let commands = vec![ + make_ft_aggregate(), + MockCmd::new( + redis::cmd("FT.CREATE") + .arg("test:_content_prefix__3e762c15") + .arg("ON") + .arg("HASH") + .arg("NOHL") + .arg("NOFIELDS") + .arg("NOFREQS") + .arg("NOOFFSETS") + .arg("TEMPORARY") + .arg(86400) + .arg("PREFIX") + .arg(1) + .arg("test:") + .arg("SCHEMA") + .arg("content_prefix") + .arg("TAG"), + Ok(Value::Nil), + ), + make_ft_aggregate(), + ]; + let store = make_mock_store(commands).await; + let search_provider = SearchByContentPrefix { + prefix: "Searchable".to_string(), + }; + + let search_results: Vec = store + .search_by_index_prefix(search_provider) + .await + .err_tip(|| "Failed to search by index")? 
+ .try_collect() + .await?; + + assert!(search_results.len() == 1, "Should find 1 matching entry"); + + assert_eq!( + search_results[0].content, "1234", + "Content should match search pattern: '{}'", + search_results[0].content + ); + + Ok(()) +} + +#[nativelink_test] +fn test_search_by_index_failure() -> Result<(), Error> { + let store = make_mock_store(vec![]).await; + let search_provider = SearchByContentPrefix { + prefix: String::new(), + }; + + // Can't use unwrap_err as that needs Debug which this error doesn't provide + let Err(error) = store.search_by_index_prefix(search_provider).await else { + panic!("Expected an error"); + }; + + assert_eq!(error, Error::new_with_messages(Code::Unknown, [ + "Client: TEST - Client: unexpected command", "Error with ft_create in RedisStore::search_by_index_prefix(test:_content_prefix_sort_key_3e762c15)", "---", "Client: TEST - Client: unexpected command", "Error with second ft_aggregate in RedisStore::search_by_index_prefix(test:_content_prefix_sort_key_3e762c15)"].iter().map(ToString::to_string).collect())); + + assert!(logs_contain( + "Error calling ft.aggregate e=TEST - Client: unexpected command index=\"test:_content_prefix_sort_key_3e762c15\" query=\"*\" options=FtAggregateOptions { load: [\"data\", \"version\"], cursor: FtAggregateCursor { count: 1500, max_idle: 30000 }, sort_by: [\"@sort_key\"] } all_args=[\"FT.AGGREGATE\", \"test:_content_prefix_sort_key_3e762c15\", \"*\", \"LOAD\", \"2\", \"data\", \"version\", \"WITHCURSOR\", \"COUNT\", \"1500\", \"MAXIDLE\", \"30000\", \"SORTBY\", \"2\", \"@sort_key\", \"ASC\"]" + )); + + Ok(()) +} + +#[nativelink_test] +fn test_search_by_index_with_sort_key() -> Result<(), Error> { + fn make_ft_aggregate() -> MockCmd { + MockCmd::new( + redis::cmd("FT.AGGREGATE") + .arg("test:_content_prefix_sort_key_3e762c15") + .arg("@content_prefix:{ Searchable }") + .arg("LOAD") + .arg(2) + .arg("data") + .arg("version") + .arg("WITHCURSOR") + .arg("COUNT") + .arg(1500) + .arg("MAXIDLE") + 
.arg(30000) + .arg("SORTBY") + .arg(2usize) + .arg("@sort_key") + .arg("ASC"), + Ok(Value::Array(vec![ + Value::Array(vec![ + Value::Int(1), + Value::Array(vec![ + Value::BulkString(b"data".to_vec()), + Value::BulkString(b"1234".to_vec()), + Value::BulkString(b"version".to_vec()), + Value::BulkString(b"1".to_vec()), + Value::BulkString(b"sort_key".to_vec()), + Value::BulkString(b"1234".to_vec()), + ]), + ]), + Value::Int(0), + ])), + ) + } + + let commands = vec![ + make_ft_aggregate(), + MockCmd::new( + redis::cmd("FT.CREATE") + .arg("test:_content_prefix__3e762c15") + .arg("ON") + .arg("HASH") + .arg("NOHL") + .arg("NOFIELDS") + .arg("NOFREQS") + .arg("NOOFFSETS") + .arg("TEMPORARY") + .arg(86400) + .arg("PREFIX") + .arg(1) + .arg("test:") + .arg("SCHEMA") + .arg("content_prefix") + .arg("TAG"), + Ok(Value::Nil), + ), + make_ft_aggregate(), + ]; + let store = make_mock_store(commands).await; + let search_provider = SearchByContentPrefix { + prefix: "Searchable".to_string(), + }; + + let search_results: Vec = store + .search_by_index_prefix(search_provider) + .await + .err_tip(|| "Failed to search by index")? 
+ .try_collect() + .await?; + + assert!(search_results.len() == 1, "Should find 1 matching entry"); + + assert_eq!( + search_results[0].content, "1234", + "Content should match search pattern: '{}'", + search_results[0].content + ); + + Ok(()) +} + +#[nativelink_test] +fn test_search_by_index_resp3() -> Result<(), Error> { + fn make_ft_aggregate() -> MockCmd { + MockCmd::new( + redis::cmd("FT.AGGREGATE") + .arg("test:_content_prefix_sort_key_3e762c15") + .arg("@content_prefix:{ Searchable }") + .arg("LOAD") + .arg(2) + .arg("data") + .arg("version") + .arg("WITHCURSOR") + .arg("COUNT") + .arg(1500) + .arg("MAXIDLE") + .arg(30000) + .arg("SORTBY") + .arg(2usize) + .arg("@sort_key") + .arg("ASC"), + Ok(Value::Array(vec![ + Value::Map(vec![ + ( + Value::SimpleString("attributes".into()), + Value::Array(vec![]), + ), + ( + Value::SimpleString("format".into()), + Value::SimpleString("STRING".into()), + ), + ( + Value::SimpleString("results".into()), + Value::Array(vec![Value::Map(vec![ + ( + Value::SimpleString("extra_attributes".into()), + Value::Map(vec![ + ( + Value::BulkString(b"data".to_vec()), + Value::BulkString(b"1234".to_vec()), + ), + ( + Value::BulkString(b"version".to_vec()), + Value::BulkString(b"1".to_vec()), + ), + ]), + ), + (Value::SimpleString("values".into()), Value::Array(vec![])), + ])]), + ), + (Value::SimpleString("total_results".into()), Value::Int(1)), + (Value::SimpleString("warning".into()), Value::Array(vec![])), + ]), + Value::Int(0), + ])), + ) + } + + let commands = vec![ + make_ft_aggregate(), + MockCmd::new( + redis::cmd("FT.CREATE") + .arg("test:_content_prefix_sort_key_3e762c15") + .arg("ON") + .arg("HASH") + .arg("NOHL") + .arg("NOFIELDS") + .arg("NOFREQS") + .arg("NOOFFSETS") + .arg("TEMPORARY") + .arg(86400) + .arg("PREFIX") + .arg(1) + .arg("test:") + .arg("SCHEMA") + .arg("content_prefix") + .arg("TAG") + .arg("sort_key") + .arg("TAG") + .arg("SORTABLE"), + Ok(Value::Nil), + ), + make_ft_aggregate(), + ]; + let store = 
make_mock_store(commands).await; + let search_provider = SearchByContentPrefix { + prefix: "Searchable".to_string(), + }; + + let search_results: Vec = store + .search_by_index_prefix(search_provider) + .await + .err_tip(|| "Failed to search by index")? + .try_collect() + .await?; + + assert!(search_results.len() == 1, "Should find 1 matching entry"); + + assert_eq!( + search_results[0].content, "1234", + "Content should match search pattern: '{}'", + search_results[0].content + ); + + Ok(()) +} + +#[nativelink_test] +async fn no_items_from_none_subscription_channel() -> Result<(), Error> { + let (_tx, rx) = tokio::sync::mpsc::unbounded_channel(); + let subscription_manager = RedisSubscriptionManager::new(rx); + + // To give the stream enough time to get polled + sleep(Duration::from_secs(1)).await; + + assert!(!logs_contain( + "Error receiving message in RedisSubscriptionManager from subscriber_channel" + )); + assert!(!logs_contain("ERROR")); + + // Because otherwise it gets dropped immediately, and we need it to live to do things + drop(subscription_manager); + + Ok(()) +} + +#[nativelink_test] +async fn send_messages_to_subscription_channel() -> Result<(), Error> { + let (tx, rx) = tokio::sync::mpsc::unbounded_channel(); + let subscription_manager = RedisSubscriptionManager::new(rx); + + tx.send(PushInfo { + kind: redis::PushKind::PSubscribe, + data: vec![ + // Pattern + Value::BulkString("scheduler_key_change".into()), + // Subscribe count + Value::Int(1), + ], + }) + .unwrap(); + tx.send(PushInfo { + kind: redis::PushKind::PMessage, + data: vec![ + // First is the pattern + Value::BulkString("scheduler_key_change".into()), + // Second is the matching channel. Which in this case is the same as the pattern. 
+ Value::BulkString("scheduler_key_change".into()), + // And then the actual message + Value::BulkString("demo-key".into()), + ], + }) + .unwrap(); + + timeout(Duration::from_secs(5), async { + loop { + assert!(!logs_contain("ERROR")); + if logs_contain("New subscription manager key key=\"demo-key\"") { + break; + } + sleep(Duration::from_millis(100)).await; + } + }) + .await + .unwrap(); + + // Because otherwise it gets dropped immediately, and we need it to live to do things + drop(subscription_manager); + + Ok(()) +} diff --git a/nativelink-store/tests/s3_store_test.rs b/nativelink-store/tests/s3_store_test.rs index ebda0d563..d19425499 100644 --- a/nativelink-store/tests/s3_store_test.rs +++ b/nativelink-store/tests/s3_store_test.rs @@ -204,7 +204,7 @@ async fn simple_update_ac() -> Result<(), Error> { const CONTENT_LENGTH: usize = 50; let mut send_data = BytesMut::new(); for i in 0..CONTENT_LENGTH { - send_data.put_u8(((i % 93) + 33) as u8); // Printable characters only. + send_data.put_u8(u8::try_from((i % 93) + 33).expect("printable ASCII range")); } let send_data = send_data.freeze(); @@ -456,9 +456,9 @@ async fn multipart_update_large_cas() -> Result<(), Error> { const MIN_MULTIPART_SIZE: usize = 5 * 1024 * 1024; // 5mb. 
const AC_ENTRY_SIZE: usize = MIN_MULTIPART_SIZE * 2 + 50; - let mut send_data = Vec::with_capacity(AC_ENTRY_SIZE); + let mut send_data: Vec = Vec::with_capacity(AC_ENTRY_SIZE); for i in 0..send_data.capacity() { - send_data.push(((i * 3) % 256) as u8); + send_data.push(u8::try_from((i * 3) % 256).expect("modulo 256 always fits in u8")); } let digest = DigestInfo::try_new(VALID_HASH1, send_data.len())?; diff --git a/nativelink-store/tests/shard_store_test.rs b/nativelink-store/tests/shard_store_test.rs index ac6b22988..f8753849a 100644 --- a/nativelink-store/tests/shard_store_test.rs +++ b/nativelink-store/tests/shard_store_test.rs @@ -81,7 +81,7 @@ async fn verify_weights( } for (index, (store, expected_hit)) in stores.iter().zip(expected_hits.iter()).enumerate() { - let total_hits = store.len_for_test().await; + let total_hits = store.len_for_test(); #[expect(clippy::print_stdout, reason = "improves debugging")] if print_results { println!("expected_hit: {expected_hit} - total_hits: {total_hits}"); diff --git a/nativelink-util/BUILD.bazel b/nativelink-util/BUILD.bazel index 4e9a12f93..3672953f2 100644 --- a/nativelink-util/BUILD.bazel +++ b/nativelink-util/BUILD.bazel @@ -20,10 +20,12 @@ rust_library( "src/evicting_map.rs", "src/fastcdc.rs", "src/fs.rs", + "src/fs_util.rs", "src/health_utils.rs", "src/instant_wrapper.rs", "src/known_platform_property_provider.rs", "src/lib.rs", + "src/metrics.rs", "src/metrics_utils.rs", "src/operation_state_manager.rs", "src/origin_event.rs", @@ -48,14 +50,18 @@ rust_library( "//nativelink-error", "//nativelink-metric", "//nativelink-proto", + "@crates//:async-lock", "@crates//:base64", "@crates//:bitflags", "@crates//:blake3", "@crates//:bytes", "@crates//:futures", + "@crates//:ginepro", "@crates//:hex", "@crates//:hyper-1.7.0", "@crates//:hyper-util", + "@crates//:humantime", + "@crates//:libc", "@crates//:lru", "@crates//:mock_instant", "@crates//:opentelemetry", @@ -80,7 +86,9 @@ rust_library( "@crates//:tracing", 
"@crates//:tracing-opentelemetry", "@crates//:tracing-subscriber", + "@crates//:url", "@crates//:uuid", + "@crates//:walkdir", ], ) @@ -88,24 +96,25 @@ rust_test_suite( name = "integration", timeout = "short", srcs = [ - "tests/action_messages_test.rs", "tests/buf_channel_test.rs", "tests/channel_body_for_tests_test.rs", "tests/common_test.rs", "tests/evicting_map_test.rs", "tests/fastcdc_test.rs", "tests/health_utils_test.rs", + "tests/metrics_test.rs", "tests/operation_id_tests.rs", "tests/origin_event_test.rs", + "tests/platform_properties_tests.rs", "tests/proto_stream_utils_test.rs", "tests/resource_info_test.rs", "tests/retry_test.rs", + "tests/store_trait_test.rs", + "tests/telemetry_test.rs", "tests/tls_utils_test.rs", ], compile_data = [ "tests/data/SekienAkashita.jpg", - "tests/data/action_message_cachable_060.json", - "tests/data/action_message_uncachable_060.json", ], proc_macro_deps = [ "//nativelink-macro", @@ -115,13 +124,17 @@ rust_test_suite( ":nativelink-util", "//nativelink-config", "//nativelink-error", + "//nativelink-metric", "//nativelink-proto", + "//nativelink-worker", + "@crates//:axum", "@crates//:bytes", "@crates//:futures", "@crates//:hex", "@crates//:http-body-util", "@crates//:hyper-1.7.0", "@crates//:mock_instant", + "@crates//:opentelemetry", "@crates//:parking_lot", "@crates//:pretty_assertions", "@crates//:rand", @@ -132,6 +145,7 @@ rust_test_suite( "@crates//:tokio-stream", "@crates//:tokio-util", "@crates//:tonic", + "@crates//:tower", "@crates//:tracing", "@crates//:tracing-test", "@crates//:uuid", @@ -150,6 +164,8 @@ rust_test( "@crates//:pretty_assertions", "@crates//:rand", "@crates//:serde_json", + "@crates//:tempfile", + "@crates//:tracing-test", ], ) diff --git a/nativelink-util/Cargo.toml b/nativelink-util/Cargo.toml index 48a3244a4..d4ed627c5 100644 --- a/nativelink-util/Cargo.toml +++ b/nativelink-util/Cargo.toml @@ -1,12 +1,10 @@ +#:schema ../tools/cargo-with-detailed-deps.json lints.workspace = true [package] edition 
= "2024" name = "nativelink-util" -version = "0.7.3" - -[features] -worker_find_logging = [] +version = "1.0.0-rc4" [dependencies] nativelink-config = { path = "../nativelink-config" } @@ -14,35 +12,42 @@ nativelink-error = { path = "../nativelink-error" } nativelink-metric = { path = "../nativelink-metric" } nativelink-proto = { path = "../nativelink-proto" } -async-trait = "0.1.88" +async-trait = { version = "0.1.88", default-features = false } base64 = { version = "0.22.1", default-features = false, features = ["std"] } -bitflags = "2.9.0" -blake3 = { version = "1.8.0", features = ["mmap"] } +bitflags = { version = "2.9.0", default-features = false } +blake3 = { version = "1.8.0", features = ["mmap"], default-features = false } bytes = { version = "1.10.1", default-features = false } -futures = { version = "0.3.31", default-features = false } +futures = { version = "0.3.31", features = [ + "async-await", +], default-features = false } hex = { version = "0.4.3", default-features = false, features = ["std"] } -hyper = "1.6.0" -hyper-util = "0.1.11" -lru = { version = "0.13.0", default-features = false } -mock_instant = "0.5.3" -opentelemetry = { version = "0.29.0", default-features = false } -opentelemetry-appender-tracing = { version = "0.29.1", default-features = false } -opentelemetry-http = { version = "0.29.0", default-features = false } -opentelemetry-otlp = { version = "0.29.0", default-features = false, features = [ +humantime = { version = "2.3.0", default-features = false } +hyper = { version = "1.7.0", default-features = false } +hyper-util = { version = "0.1.11", default-features = false } +libc = { version = "0.2.177", default-features = false } +lru = { version = "0.16.0", default-features = false } +mock_instant = { version = "0.5.3", default-features = false } +opentelemetry = { version = "0.30.0", default-features = false } +opentelemetry-appender-tracing = { version = "0.30.0", default-features = false } +opentelemetry-http = { version = 
"0.30.0", default-features = false } +opentelemetry-otlp = { version = "0.30.0", default-features = false, features = [ "grpc-tonic", "logs", "metrics", "trace", "zstd-tonic", ] } -opentelemetry-semantic-conventions = { version = "0.29.0", default-features = false, features = [ +opentelemetry-semantic-conventions = { version = "0.30.0", default-features = false, features = [ "default", "semconv_experimental", ] } -opentelemetry_sdk = { version = "0.29.0", default-features = false } -parking_lot = { version = "0.12.3", features = ["arc_lock", "send_guard"] } -pin-project = "1.1.10" -pin-project-lite = "0.2.16" +opentelemetry_sdk = { version = "0.30.0", default-features = false } +parking_lot = { version = "0.12.3", features = [ + "arc_lock", + "send_guard", +], default-features = false } +pin-project = { version = "1.1.10", default-features = false } +pin-project-lite = { version = "0.2.16", default-features = false } prost = { version = "0.13.5", default-features = false } prost-types = { version = "0.13.5", default-features = false, features = [ "std", @@ -63,15 +68,16 @@ tokio = { version = "1.44.1", features = [ tokio-stream = { version = "0.1.17", features = [ "fs", ], default-features = false } -tokio-util = { version = "0.7.14" } +tokio-util = { version = "0.7.14", default-features = false } tonic = { version = "0.13.0", features = [ + "router", "tls-native-roots", "tls-ring", "transport", ], default-features = false } tower = { version = "0.5.2", default-features = false } tracing = { version = "0.1.41", default-features = false } -tracing-opentelemetry = { version = "0.30.0", default-features = false, features = [ +tracing-opentelemetry = { version = "0.31.0", default-features = false, features = [ "metrics", ] } tracing-subscriber = { version = "0.3.19", features = [ @@ -79,17 +85,25 @@ tracing-subscriber = { version = "0.3.19", features = [ "env-filter", "json", ], default-features = false } +tracing-test = { version = "0.2.5", default-features = false, 
features = [] } + uuid = { version = "1.16.0", default-features = false, features = [ "serde", "v4", "v6", ] } +walkdir = { version = "2.5.0", default-features = false } +ginepro = "0.9.0" +url = "2.5.7" [dev-dependencies] nativelink-macro = { path = "../nativelink-macro" } -http-body-util = "0.1.3" -pretty_assertions = { version = "1.4.1", features = ["std"] } +axum = { version = "0.8.3", default-features = false } +http-body-util = { version = "0.1.3", default-features = false } +pretty_assertions = { version = "1.4.1", features = [ + "std", +], default-features = false } rand = { version = "0.9.0", default-features = false, features = [ "thread_rng", ] } diff --git a/nativelink-util/src/action_messages.rs b/nativelink-util/src/action_messages.rs index acbc28669..21a181c0e 100644 --- a/nativelink-util/src/action_messages.rs +++ b/nativelink-util/src/action_messages.rs @@ -14,11 +14,13 @@ use core::cmp::Ordering; use core::convert::Into; +use core::fmt::Display; use core::hash::Hash; use core::time::Duration; use std::collections::HashMap; use std::time::SystemTime; +use humantime::format_duration; use nativelink_error::{Error, ResultExt, error_if, make_input_err}; use nativelink_metric::{ MetricFieldData, MetricKind, MetricPublishKnownKindData, MetricsComponent, publish, @@ -69,7 +71,7 @@ impl Default for OperationId { } } -impl core::fmt::Display for OperationId { +impl Display for OperationId { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { match self { Self::Uuid(uuid) => uuid.fmt(f), @@ -144,7 +146,7 @@ impl MetricsComponent for WorkerId { } } -impl core::fmt::Display for WorkerId { +impl Display for WorkerId { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { f.write_fmt(format_args!("{}", self.0)) } @@ -152,7 +154,7 @@ impl core::fmt::Display for WorkerId { impl core::fmt::Debug for WorkerId { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { - core::fmt::Display::fmt(&self, f) + 
Display::fmt(&self, f) } } @@ -225,7 +227,7 @@ impl ActionUniqueQualifier { } } -impl core::fmt::Display for ActionUniqueQualifier { +impl Display for ActionUniqueQualifier { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { let (cacheable, unique_key) = match self { Self::Cacheable(action) => (true, action), @@ -259,7 +261,7 @@ pub struct ActionUniqueKey { pub digest: DigestInfo, } -impl core::fmt::Display for ActionUniqueKey { +impl Display for ActionUniqueKey { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { f.write_fmt(format_args!( "{}/{}/{}", @@ -802,6 +804,17 @@ impl ActionStage { | (Self::CompletedFromCache(_), Self::CompletedFromCache(_)) ) } + + pub fn name(&self) -> String { + match self { + Self::Unknown => "Unknown".to_string(), + Self::CacheCheck => "CacheCheck".to_string(), + Self::Queued => "Queued".to_string(), + Self::Executing => "Executing".to_string(), + Self::Completed(_) => "Completed".to_string(), + Self::CompletedFromCache(_) => "CompletedFromCache".to_string(), + } + } } impl MetricsComponent for ActionStage { @@ -810,15 +823,7 @@ impl MetricsComponent for ActionStage { _kind: MetricKind, _field_metadata: MetricFieldData, ) -> Result { - let value = match self { - Self::Unknown => "Unknown".to_string(), - Self::CacheCheck => "CacheCheck".to_string(), - Self::Queued => "Queued".to_string(), - Self::Executing => "Executing".to_string(), - Self::Completed(_) => "Completed".to_string(), - Self::CompletedFromCache(_) => "CompletedFromCache".to_string(), - }; - Ok(MetricPublishKnownKindData::String(value)) + Ok(MetricPublishKnownKindData::String(self.name())) } } @@ -1093,16 +1098,58 @@ where /// Current state of the action. /// This must be 100% compatible with `Operation` in `google/longrunning/operations.proto`. 
-#[derive(PartialEq, Debug, Clone, Serialize, Deserialize, MetricsComponent)] +#[derive(Debug, Clone, Serialize, Deserialize, MetricsComponent)] pub struct ActionState { #[metric(help = "The current stage of the action.")] pub stage: ActionStage, + #[metric(help = "Last time this action changed stage")] + pub last_transition_timestamp: SystemTime, #[metric(help = "The unique identifier of the action.")] pub client_operation_id: OperationId, #[metric(help = "The digest of the action.")] pub action_digest: DigestInfo, } +impl Display for ActionState { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + write!( + f, + "stage={} last_transition={} client_operation_id={} action_digest={}", + self.stage.name(), + self.last_transition_timestamp.elapsed().map_or_else( + |_| "".to_string(), + |d| { format_duration(d).to_string() } + ), + self.client_operation_id, + self.action_digest + ) + } +} + +impl PartialOrd for ActionState { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for ActionState { + fn cmp(&self, other: &Self) -> Ordering { + self.last_transition_timestamp + .cmp(&other.last_transition_timestamp) + } +} + +impl PartialEq for ActionState { + fn eq(&self, other: &Self) -> bool { + // Ignore last_transition_timestamp as the actions can still be the same even if they happened at different times + self.stage == other.stage + && self.client_operation_id == other.client_operation_id + && self.action_digest == other.action_digest + } +} + +impl Eq for ActionState {} + impl ActionState { pub fn try_from_operation( operation: Operation, @@ -1156,6 +1203,7 @@ impl ActionState { stage, client_operation_id, action_digest, + last_transition_timestamp: SystemTime::now(), }) } diff --git a/nativelink-util/src/common.rs b/nativelink-util/src/common.rs index f98dcbb35..86f9415cc 100644 --- a/nativelink-util/src/common.rs +++ b/nativelink-util/src/common.rs @@ -160,7 +160,10 @@ impl<'a> 
DigestStackStringifier<'a> { cursor .write_fmt(format_args!("{}", self.digest.size_bytes())) .err_tip(|| format!("Could not write size_bytes to buffer - {hex:?}",))?; - cursor.position() as usize + cursor + .position() + .try_into() + .map_err(|e| make_input_err!("Cursor position exceeds usize bounds: {e}"))? }; // Convert the buffer into utf8 string. core::str::from_utf8(&self.buf[..len]).map_err(|e| { @@ -454,7 +457,7 @@ pub fn encode_stream_proto(proto: &T) -> Result 0 / 1 # encoded as 1 byte unsigned integer. buf.put_u8(0); // Message-Length -> {length of Message} # encoded as 4 byte unsigned integer (big endian). - buf.put_u32(len as u32); + buf.put_u32(u32::try_from(len)?); // Message -> *{binary octet}. } diff --git a/nativelink-util/src/connection_manager.rs b/nativelink-util/src/connection_manager.rs index 5803413e5..eaa5d0d99 100644 --- a/nativelink-util/src/connection_manager.rs +++ b/nativelink-util/src/connection_manager.rs @@ -34,7 +34,7 @@ use crate::retry::{self, Retrier, RetryResult}; #[derive(Debug)] pub struct ConnectionManager { // The channel to request connections from the worker. - worker_tx: mpsc::Sender>, + worker_tx: mpsc::Sender<(String, oneshot::Sender)>, } /// The index into `ConnectionManagerWorker::endpoints`. @@ -101,8 +101,8 @@ struct ConnectionManagerWorker { connecting_channels: FuturesUnordered + Send>>>, /// Connected channels that are available for use. available_channels: VecDeque, - /// Requests for a Channel when available. - waiting_connections: VecDeque>, + /// Requests for a Channel when available - (reason, request) + waiting_connections: VecDeque<(String, oneshot::Sender)>, /// The retry configuration for connecting to an Endpoint, on failure will /// restart the retrier after a 1 second delay. 
retrier: Retrier, @@ -165,10 +165,10 @@ impl ConnectionManager { /// Get a Connection that can be used as a `tonic::Channel`, except it /// performs some additional counting to reconnect on error and restrict /// the number of concurrent connections. - pub async fn connection(&self) -> Result { + pub async fn connection(&self, reason: String) -> Result { let (tx, rx) = oneshot::channel(); self.worker_tx - .send(tx) + .send((reason, tx)) .await .map_err(|err| make_err!(Code::Unavailable, "Requesting a new connection: {err:?}"))?; rx.await @@ -180,7 +180,7 @@ impl ConnectionManagerWorker { async fn service_requests( mut self, connections_per_endpoint: usize, - mut worker_rx: mpsc::Receiver>, + mut worker_rx: mpsc::Receiver<(String, oneshot::Sender)>, mut connection_rx: mpsc::UnboundedReceiver, ) { // Make the initial set of connections, connection failures will be @@ -199,12 +199,12 @@ impl ConnectionManagerWorker { loop { tokio::select! { request = worker_rx.recv() => { - let Some(request) = request else { + let Some((reason, request)) = request else { // The ConnectionManager was dropped, shut down the // worker. break; }; - self.handle_worker(request); + self.handle_worker(reason, request); } maybe_request = connection_rx.recv() => { if let Some(request) = maybe_request { @@ -308,14 +308,22 @@ impl ConnectionManagerWorker { } // This must never be made async otherwise the select may cancel it. 
- fn handle_worker(&mut self, tx: oneshot::Sender) { + fn handle_worker(&mut self, reason: String, tx: oneshot::Sender) { if let Some(channel) = (self.available_connections > 0) .then_some(()) .and_then(|()| self.available_channels.pop_front()) { + debug!(reason, "ConnectionManager: request running"); self.provide_channel(channel, tx); } else { - self.waiting_connections.push_back(tx); + debug!( + available_connections = self.available_connections, + available_channels = self.available_channels.len(), + waiting_connections = self.waiting_connections.len(), + reason, + "ConnectionManager: no connection available, request queued", + ); + self.waiting_connections.push_back((reason, tx)); } } @@ -336,7 +344,8 @@ impl ConnectionManagerWorker { && !self.available_channels.is_empty() { if let Some(channel) = self.available_channels.pop_front() { - if let Some(tx) = self.waiting_connections.pop_front() { + if let Some((reason, tx)) = self.waiting_connections.pop_front() { + debug!(reason, "ConnectionManager: channel available, running"); self.provide_channel(channel, tx); } else { // This should never happen, but better than an unwrap. 
diff --git a/nativelink-util/src/evicting_map.rs b/nativelink-util/src/evicting_map.rs index 90d3ca8b4..007d60568 100644 --- a/nativelink-util/src/evicting_map.rs +++ b/nativelink-util/src/evicting_map.rs @@ -17,16 +17,19 @@ use core::cmp::Eq; use core::fmt::Debug; use core::future::Future; use core::hash::Hash; +use core::marker::PhantomData; use core::ops::RangeBounds; +use core::pin::Pin; use std::collections::BTreeSet; use std::sync::Arc; +use futures::StreamExt; +use futures::stream::FuturesUnordered; use lru::LruCache; use nativelink_config::stores::EvictionPolicy; use nativelink_metric::MetricsComponent; use parking_lot::Mutex; use serde::{Deserialize, Serialize}; -use tonic::async_trait; use tracing::{debug, info}; use crate::instant_wrapper::InstantWrapper; @@ -89,9 +92,8 @@ impl LenEntry for Arc { // Callback to be called when the EvictingMap removes an item // either via eviction or direct deletion. This will be called with // whatever key type the EvictingMap uses. -#[async_trait] -pub trait RemoveStateCallback: Debug + Send + Sync { - async fn callback(&self, key: &Q); +pub trait RemoveItemCallback: Debug + Send + Sync { + fn callback(&self, store_key: &Q) -> Pin + Send>>; } #[derive(Debug, MetricsComponent)] @@ -99,12 +101,15 @@ struct State< K: Ord + Hash + Eq + Clone + Debug + Send + Borrow, Q: Ord + Hash + Eq + Debug, T: LenEntry + Debug + Send, + C: RemoveItemCallback, > { lru: LruCache>, btree: Option>, #[metric(help = "Total size of all items in the store")] sum_store_size: u64, + store_len: u64, + #[metric(help = "Number of bytes evicted from the store")] evicted_bytes: Counter, #[metric(help = "Number of items evicted from the store")] @@ -116,26 +121,36 @@ struct State< #[metric(help = "Number of bytes inserted into the store since it was created")] lifetime_inserted_bytes: Counter, - remove_callbacks: Arc>>>>, + _key_type: PhantomData, + remove_callbacks: Vec, } +type RemoveFuture = Pin + Send>>; + impl< K: Ord + Hash + Eq + Clone + Debug + 
Send + Sync + Borrow, Q: Ord + Hash + Eq + Debug + Sync, T: LenEntry + Debug + Sync + Send, -> State + C: RemoveItemCallback, +> State { /// Removes an item from the cache and returns the data for deferred cleanup. /// The caller is responsible for calling `unref()` on the returned data outside of the lock. #[must_use] - async fn remove(&mut self, key: &Q, eviction_item: &EvictionItem, replaced: bool) -> T + fn remove( + &mut self, + key: &Q, + eviction_item: &EvictionItem, + replaced: bool, + ) -> (T, Vec) where T: Clone, { if let Some(btree) = &mut self.btree { - btree.remove(key.borrow()); + btree.remove(key); } self.sum_store_size -= eviction_item.data.len(); + self.store_len -= 1; if replaced { self.replaced_items.inc(); self.replaced_bytes.add(eviction_item.data.len()); @@ -144,19 +159,20 @@ impl< self.evicted_bytes.add(eviction_item.data.len()); } - let locked_callbacks = self.remove_callbacks.lock_arc(); - for callback in locked_callbacks.iter() { - callback.callback(key).await; - } + let callbacks = self + .remove_callbacks + .iter() + .map(|callback| callback.callback(key)) + .collect(); // Return the data for deferred unref outside of lock - eviction_item.data.clone() + (eviction_item.data.clone(), callbacks) } /// Inserts a new item into the cache. If the key already exists, the old item is returned /// for deferred cleanup. 
#[must_use] - async fn put(&mut self, key: &K, eviction_item: EvictionItem) -> Option + fn put(&mut self, key: &K, eviction_item: EvictionItem) -> Option<(T, Vec)> where K: Clone, T: Clone, @@ -165,15 +181,22 @@ impl< if let Some(btree) = &mut self.btree { btree.insert(key.clone()); } - if let Some(old_item) = self.lru.put(key.clone(), eviction_item) { - let old_data = self.remove(key.borrow(), &old_item, true).await; - return Some(old_data); - } - None + self.lru + .put(key.clone(), eviction_item) + .map(|old_item| self.remove(key.borrow(), &old_item, true)) } - fn add_remove_callback(&self, callback: Box>) { - self.remove_callbacks.lock_arc().push(callback); + fn add_remove_callback(&mut self, callback: C) { + self.remove_callbacks.push(callback); + } +} + +#[derive(Debug, Clone, Copy)] +pub struct NoopRemove; + +impl RemoveItemCallback for NoopRemove { + fn callback(&self, _store_key: &Q) -> Pin + Send>> { + Box::pin(async {}) } } @@ -183,9 +206,10 @@ pub struct EvictingMap< Q: Ord + Hash + Eq + Debug, T: LenEntry + Debug + Send, I: InstantWrapper, + C: RemoveItemCallback = NoopRemove, > { #[metric] - state: Arc>>, + state: Mutex>, anchor_time: I, #[metric(help = "Maximum size of the store in bytes")] max_bytes: u64, @@ -197,28 +221,31 @@ pub struct EvictingMap< max_count: u64, } -impl EvictingMap +impl EvictingMap where K: Ord + Hash + Eq + Clone + Debug + Send + Sync + Borrow, Q: Ord + Hash + Eq + Debug + Sync, T: LenEntry + Debug + Clone + Send + Sync, I: InstantWrapper, + C: RemoveItemCallback, { pub fn new(config: &EvictionPolicy, anchor_time: I) -> Self { Self { // We use unbounded because if we use the bounded version we can't call the delete // function on the LenEntry properly. 
- state: Arc::new(Mutex::new(State { + state: Mutex::new(State { lru: LruCache::unbounded(), btree: None, sum_store_size: 0, + store_len: 0, evicted_bytes: Counter::default(), evicted_items: CounterWithTime::default(), replaced_bytes: Counter::default(), replaced_items: CounterWithTime::default(), lifetime_inserted_bytes: Counter::default(), - remove_callbacks: Arc::new(Mutex::new(vec![])), - })), + _key_type: PhantomData, + remove_callbacks: Vec::new(), + }), anchor_time, max_bytes: config.max_bytes as u64, evict_bytes: config.evict_bytes as u64, @@ -228,13 +255,13 @@ where } pub async fn enable_filtering(&self) { - let mut state = self.state.lock_arc(); + let mut state = self.state.lock(); if state.btree.is_none() { Self::rebuild_btree_index(&mut state); } } - fn rebuild_btree_index(state: &mut State) { + fn rebuild_btree_index(state: &mut State) { state.btree = Some(state.lru.iter().map(|(k, _)| k).cloned().collect()); } @@ -242,12 +269,12 @@ where /// and return the number of items that were processed. /// The `handler` function should return `true` to continue processing the next item /// or `false` to stop processing. - pub async fn range(&self, prefix_range: impl RangeBounds + Send, mut handler: F) -> u64 + pub fn range(&self, prefix_range: impl RangeBounds + Send, mut handler: F) -> u64 where F: FnMut(&K, &T) -> bool + Send, K: Ord, { - let mut state = self.state.lock_arc(); + let mut state = self.state.lock(); let btree = if let Some(ref btree) = state.btree { btree } else { @@ -268,8 +295,8 @@ where /// Returns the number of key-value pairs that are currently in the the cache. /// Function is not for production code paths. 
- pub async fn len_for_test(&self) -> usize { - self.state.lock_arc().lru.len() + pub fn len_for_test(&self) -> usize { + self.state.lock().lru.len() } fn should_evict( @@ -281,20 +308,22 @@ where ) -> bool { let is_over_size = max_bytes != 0 && sum_store_size >= max_bytes; - let evict_older_than_seconds = - (self.anchor_time.elapsed().as_secs() as i32) - self.max_seconds; + let elapsed_seconds = + i32::try_from(self.anchor_time.elapsed().as_secs()).unwrap_or(i32::MAX); + let evict_older_than_seconds = elapsed_seconds.saturating_sub(self.max_seconds); let old_item_exists = self.max_seconds != 0 && peek_entry.seconds_since_anchor < evict_older_than_seconds; - let is_over_count = self.max_count != 0 && (lru_len as u64) > self.max_count; + let is_over_count = + self.max_count != 0 && u64::try_from(lru_len).unwrap_or(u64::MAX) > self.max_count; is_over_size || old_item_exists || is_over_count } #[must_use] - async fn evict_items(&self, state: &mut State) -> Vec { + fn evict_items(&self, state: &mut State) -> (Vec, Vec) { let Some((_, mut peek_entry)) = state.lru.peek_lru() else { - return Vec::new(); + return (Vec::new(), Vec::new()); }; let max_bytes = if self.max_bytes != 0 @@ -311,6 +340,7 @@ where }; let mut items_to_unref = Vec::new(); + let mut removal_futures = Vec::new(); while self.should_evict(state.lru.len(), peek_entry, state.sum_store_size, max_bytes) { let (key, eviction_item) = state @@ -318,8 +348,9 @@ where .pop_lru() .expect("Tried to peek() then pop() but failed"); debug!(?key, "Evicting",); - let data = state.remove(key.borrow(), &eviction_item, false).await; + let (data, futures) = state.remove(key.borrow(), &eviction_item, false); items_to_unref.push(data); + removal_futures.extend(futures.into_iter()); peek_entry = if let Some((_, entry)) = state.lru.peek_lru() { entry @@ -328,14 +359,11 @@ where }; } - items_to_unref + (items_to_unref, removal_futures) } /// Return the size of a `key`, if not found `None` is returned. 
- pub async fn size_for_key(&self, key: &Q) -> Option - where - Q: Sync, - { + pub async fn size_for_key(&self, key: &Q) -> Option { let mut results = [None]; self.sizes_for_keys([key], &mut results[..], false).await; results[0] @@ -360,47 +388,60 @@ where // to be able to borrow a `Q`. R: Borrow + Send, { - let mut state = self.state.lock_arc(); - - let lru_len = state.lru.len(); - for (key, result) in keys.into_iter().zip(results.iter_mut()) { - let maybe_entry = if peek { - state.lru.peek_mut(key.borrow()) - } else { - state.lru.get_mut(key.borrow()) - }; - match maybe_entry { - Some(entry) => { - // Note: We need to check eviction because the item might be expired - // based on the current time. In such case, we remove the item while - // we are here. - if self.should_evict(lru_len, entry, 0, u64::MAX) { - *result = None; - if let Some((key, eviction_item)) = state.lru.pop_entry(key.borrow()) { - info!(?key, "Item expired, evicting"); - let data = state.remove(key.borrow(), &eviction_item, false).await; - // Store data for later unref - we can't drop state here as we're still iterating - // The unref will happen after the method completes - // For now, we just do inline unref - data.unref().await; + let (removal_futures, data_to_unref) = { + let mut state = self.state.lock(); + + let lru_len = state.lru.len(); + let mut data_to_unref = Vec::new(); + let mut removal_futures = Vec::new(); + for (key, result) in keys.into_iter().zip(results.iter_mut()) { + let maybe_entry = if peek { + state.lru.peek_mut(key.borrow()) + } else { + state.lru.get_mut(key.borrow()) + }; + match maybe_entry { + Some(entry) => { + // Note: We need to check eviction because the item might be expired + // based on the current time. In such case, we remove the item while + // we are here. 
+ if self.should_evict(lru_len, entry, 0, u64::MAX) { + *result = None; + if let Some((key, eviction_item)) = state.lru.pop_entry(key.borrow()) { + info!(?key, "Item expired, evicting"); + let (data, futures) = + state.remove(key.borrow(), &eviction_item, false); + // Store data for later unref - we can't drop state here as we're still iterating + data_to_unref.push(data); + removal_futures.extend(futures.into_iter()); + } + } else { + if !peek { + entry.seconds_since_anchor = + i32::try_from(self.anchor_time.elapsed().as_secs()) + .unwrap_or(i32::MAX); + } + *result = Some(entry.data.len()); } - } else { - if !peek { - entry.seconds_since_anchor = - self.anchor_time.elapsed().as_secs() as i32; - } - *result = Some(entry.data.len()); } + None => *result = None, } - None => *result = None, } - } + (removal_futures, data_to_unref) + }; + + // Perform the async callbacks outside of the lock + let mut callbacks: FuturesUnordered<_> = removal_futures.into_iter().collect(); + while callbacks.next().await.is_some() {} + let mut callbacks: FuturesUnordered<_> = + data_to_unref.iter().map(LenEntry::unref).collect(); + while callbacks.next().await.is_some() {} } pub async fn get(&self, key: &Q) -> Option { // Fast path: Check if we need eviction before acquiring lock for eviction let needs_eviction = { - let state = self.state.lock_arc(); + let state = self.state.lock(); if let Some((_, peek_entry)) = state.lru.peek_lru() { self.should_evict( state.lru.len(), @@ -415,20 +456,23 @@ where // Perform eviction if needed if needs_eviction { - let items_to_unref = { - let mut state = self.state.lock_arc(); - self.evict_items(&mut *state).await + let (items_to_unref, removal_futures) = { + let mut state = self.state.lock(); + self.evict_items(&mut *state) }; // Unref items outside of lock - for item in items_to_unref { - item.unref().await; - } + let mut callbacks: FuturesUnordered<_> = removal_futures.into_iter().collect(); + while callbacks.next().await.is_some() {} + let mut 
callbacks: FuturesUnordered<_> = + items_to_unref.iter().map(LenEntry::unref).collect(); + while callbacks.next().await.is_some() {} } // Now get the item - let mut state = self.state.lock_arc(); + let mut state = self.state.lock(); let entry = state.lru.get_mut(key.borrow())?; - entry.seconds_since_anchor = self.anchor_time.elapsed().as_secs() as i32; + entry.seconds_since_anchor = + i32::try_from(self.anchor_time.elapsed().as_secs()).unwrap_or(i32::MAX); Some(entry.data.clone()) } @@ -437,26 +481,33 @@ where where K: 'static, { - self.insert_with_time(key, data, self.anchor_time.elapsed().as_secs() as i32) - .await + self.insert_with_time( + key, + data, + i32::try_from(self.anchor_time.elapsed().as_secs()).unwrap_or(i32::MAX), + ) + .await } /// Returns the replaced item if any. pub async fn insert_with_time(&self, key: K, data: T, seconds_since_anchor: i32) -> Option { - let items_to_unref = { - let mut state = self.state.lock_arc(); + let (items_to_unref, removal_futures) = { + let mut state = self.state.lock(); self.inner_insert_many(&mut state, [(key, data)], seconds_since_anchor) - .await }; - // Unref items outside of lock - let mut results = Vec::new(); - for item in items_to_unref { - item.unref().await; - results.push(item); - } + let mut futures: FuturesUnordered<_> = removal_futures.into_iter().collect(); + while futures.next().await.is_some() {} - results.into_iter().next() + // Unref items outside of lock + let futures: FuturesUnordered<_> = items_to_unref + .into_iter() + .map(|item| async move { + item.unref().await; + item + }) + .collect(); + futures.collect::>().await.into_iter().next() } /// Same as `insert()`, but optimized for multiple inserts. 
@@ -475,28 +526,40 @@ where return Vec::new(); } - let items_to_unref = { - let state = &mut self.state.lock_arc(); - self.inner_insert_many(state, inserts, self.anchor_time.elapsed().as_secs() as i32) - .await + let (items_to_unref, removal_futures) = { + let mut state = self.state.lock(); + self.inner_insert_many( + &mut state, + inserts, + i32::try_from(self.anchor_time.elapsed().as_secs()).unwrap_or(i32::MAX), + ) }; + let mut futures: FuturesUnordered<_> = removal_futures.into_iter().collect(); + while futures.next().await.is_some() {} + // Unref items outside of lock - let mut results = Vec::new(); - for item in items_to_unref { - item.unref().await; - results.push(item); - } + items_to_unref + .into_iter() + .map(|item| async move { + item.unref().await; + item + }) + .collect::>() + .collect::>() + .await + } - results + pub fn len(&self) -> u64 { + self.state.lock().store_len } - async fn inner_insert_many( + fn inner_insert_many( &self, - state: &mut State, + state: &mut State, inserts: It, seconds_since_anchor: i32, - ) -> Vec + ) -> (Vec, Vec) where It: IntoIterator + Send, // Note: It's not enough to have the inserts themselves be Send. 
The @@ -504,6 +567,7 @@ where ::IntoIter: Send, { let mut replaced_items = Vec::new(); + let mut removal_futures = Vec::new(); for (key, data) in inserts { let new_item_size = data.len(); let eviction_item = EvictionItem { @@ -511,46 +575,52 @@ where data, }; - if let Some(old_item) = state.put(&key, eviction_item).await { + if let Some((old_item, futures)) = state.put(&key, eviction_item) { + removal_futures.extend(futures.into_iter()); replaced_items.push(old_item); } state.sum_store_size += new_item_size; + state.store_len += 1; state.lifetime_inserted_bytes.add(new_item_size); } // Perform eviction after all insertions - let items_to_unref = self.evict_items(state).await; + let (items_to_unref, futures) = self.evict_items(state); + removal_futures.extend(futures); // Note: We cannot drop the state lock here since we're borrowing it, // but the caller will handle unreffing these items after releasing the lock - for item in items_to_unref { - replaced_items.push(item); - } + replaced_items.extend(items_to_unref); - replaced_items + (replaced_items, removal_futures) } pub async fn remove(&self, key: &Q) -> bool { - let (items_to_unref, removed_item) = { - let mut state = self.state.lock_arc(); + let (items_to_unref, removed_item, removal_futures) = { + let mut state = self.state.lock(); // First perform eviction - let evicted_items = self.evict_items(&mut *state).await; + let (evicted_items, mut removal_futures) = self.evict_items(&mut *state); // Then try to remove the requested item let removed = if let Some(entry) = state.lru.pop(key.borrow()) { - Some(state.remove(key, &entry, false).await) + let (removed_item, more_removal_futures) = state.remove(key, &entry, false); + removal_futures.extend(more_removal_futures.into_iter()); + Some(removed_item) } else { None }; - (evicted_items, removed) + (evicted_items, removed, removal_futures) }; + let mut callbacks: FuturesUnordered<_> = removal_futures.into_iter().collect(); + while callbacks.next().await.is_some() {} 
+ // Unref evicted items outside of lock - for item in items_to_unref { - item.unref().await; - } + let mut callbacks: FuturesUnordered<_> = + items_to_unref.iter().map(LenEntry::unref).collect(); + while callbacks.next().await.is_some() {} // Unref removed item if any if let Some(item) = removed_item { @@ -567,41 +637,49 @@ where where F: FnOnce(&T) -> bool + Send, { - let mut state = self.state.lock_arc(); - if let Some(entry) = state.lru.get(key.borrow()) { - if !cond(&entry.data) { - return false; - } - // First perform eviction - let evicted_items = self.evict_items(&mut state).await; - - // Then try to remove the requested item - let removed_item = if let Some(entry) = state.lru.pop(key.borrow()) { - Some(state.remove(key, &entry, false).await) + let (evicted_items, removal_futures, removed_item) = { + let mut state = self.state.lock(); + if let Some(entry) = state.lru.get(key.borrow()) { + if !cond(&entry.data) { + return false; + } + // First perform eviction + let (evicted_items, mut removal_futures) = self.evict_items(&mut state); + + // Then try to remove the requested item + let removed_item = if let Some(entry) = state.lru.pop(key.borrow()) { + let (item, more_removal_futures) = state.remove(key, &entry, false); + removal_futures.extend(more_removal_futures.into_iter()); + Some(item) + } else { + None + }; + + (evicted_items, removal_futures, removed_item) } else { - None - }; - - // Drop the lock before unref operations - drop(state); - - // Unref evicted items - for item in evicted_items { - item.unref().await; + (vec![], vec![].into_iter().collect(), None) } + }; - // Unref removed item if any - if let Some(item) = removed_item { - item.unref().await; - return true; - } + // Perform the async callbacks outside of the lock + let mut removal_futures: FuturesUnordered<_> = removal_futures.into_iter().collect(); + while removal_futures.next().await.is_some() {} + + // Unref evicted items + let mut callbacks: FuturesUnordered<_> = + 
evicted_items.iter().map(LenEntry::unref).collect(); + while callbacks.next().await.is_some() {} - return false; + // Unref removed item if any + if let Some(item) = removed_item { + item.unref().await; + true + } else { + false } - false } - pub fn add_remove_callback(&self, callback: Box>) { - self.state.lock_arc().add_remove_callback(callback); + pub fn add_remove_callback(&self, callback: C) { + self.state.lock().add_remove_callback(callback); } } diff --git a/nativelink-util/src/fastcdc.rs b/nativelink-util/src/fastcdc.rs index a8c3b2748..abb487c9f 100644 --- a/nativelink-util/src/fastcdc.rs +++ b/nativelink-util/src/fastcdc.rs @@ -63,8 +63,7 @@ impl FastCDC { } avg_size - offset }; - // Calculate the number of bits closest approximating our average. - let bits = (avg_size as f64).log2().round() as u32; + Self { min_size, avg_size, @@ -73,8 +72,8 @@ impl FastCDC { norm_size, // Turn our bits into a bitmask we can use later on for more // efficient bitwise operations. - mask_hard: 2u32.pow(bits + 1) - 1, - mask_easy: 2u32.pow(bits - 1) - 1, + mask_hard: 2u32.pow(avg_size.ilog2() + 1) - 1, + mask_easy: 2u32.pow(avg_size.ilog2() - 1) - 1, state: State { hash: 0, @@ -121,7 +120,7 @@ impl Decoder for FastCDC { self.state.reset(); debug_assert!( split_point <= self.max_size, - "Expected {} < {}", + "Expected {} <= {}", split_point, self.max_size ); diff --git a/nativelink-util/src/fs.rs b/nativelink-util/src/fs.rs index 1da084198..284d2ca58 100644 --- a/nativelink-util/src/fs.rs +++ b/nativelink-util/src/fs.rs @@ -15,7 +15,7 @@ use core::pin::Pin; use core::sync::atomic::{AtomicUsize, Ordering}; use core::task::{Context, Poll}; -use std::fs::Metadata; +use std::fs::{Metadata, Permissions}; use std::io::{IoSlice, Seek}; use std::path::{Path, PathBuf}; @@ -27,7 +27,7 @@ use rlimit::increase_nofile_limit; pub use tokio::fs::DirEntry; use tokio::io::{AsyncRead, AsyncReadExt, AsyncSeek, AsyncWrite, ReadBuf, SeekFrom, Take}; use tokio::sync::{Semaphore, SemaphorePermit}; 
-use tracing::{error, info, warn}; +use tracing::{error, info, trace, warn}; use crate::spawn_blocking; @@ -41,6 +41,29 @@ pub struct FileSlot { inner: tokio::fs::File, } +impl FileSlot { + /// Advise the kernel to drop page cache for this file's contents. + /// Only available on Linux; + #[cfg(target_os = "linux")] + pub fn advise_dontneed(&self) { + use std::os::unix::io::AsRawFd; + let fd = self.inner.as_raw_fd(); + let ret = unsafe { libc::posix_fadvise(fd, 0, 0, libc::POSIX_FADV_DONTNEED) }; + if ret != 0 { + tracing::debug!( + fd, + ret, + "posix_fadvise(DONTNEED) returned non-zero (best-effort, ignoring)", + ); + } + } + + #[cfg(not(target_os = "linux"))] + pub const fn advise_dontneed(&self) { + // No-op: posix_fadvise is not available on Mac or Windows. + } +} + impl AsRef for FileSlot { fn as_ref(&self) -> &tokio::fs::File { &self.inner @@ -121,6 +144,10 @@ pub static OPEN_FILE_SEMAPHORE: Semaphore = Semaphore::const_new(DEFAULT_OPEN_FI /// Try to acquire a permit from the open file semaphore. 
#[inline] pub async fn get_permit() -> Result, Error> { + trace!( + available_permits = OPEN_FILE_SEMAPHORE.available_permits(), + "getting FS permit" + ); OPEN_FILE_SEMAPHORE .acquire() .await @@ -255,10 +282,7 @@ pub async fn hard_link(src: impl AsRef, dst: impl AsRef) -> Result<( call_with_permit(move |_| std::fs::hard_link(src, dst).map_err(Into::::into)).await } -pub async fn set_permissions( - src: impl AsRef, - perm: std::fs::Permissions, -) -> Result<(), Error> { +pub async fn set_permissions(src: impl AsRef, perm: Permissions) -> Result<(), Error> { let src = src.as_ref().to_owned(); call_with_permit(move |_| std::fs::set_permissions(src, perm).map_err(Into::::into)) .await @@ -276,14 +300,9 @@ pub async fn create_dir_all(path: impl AsRef) -> Result<(), Error> { #[cfg(target_family = "unix")] pub async fn symlink(src: impl AsRef, dst: impl AsRef) -> Result<(), Error> { - let src = src.as_ref().to_owned(); - let dst = dst.as_ref().to_owned(); - call_with_permit(move |_| { - tokio::runtime::Handle::current() - .block_on(tokio::fs::symlink(src, dst)) - .map_err(Into::::into) - }) - .await + // TODO: add a test for #2051: deadlock with large number of files + let _permit = get_permit().await?; + tokio::fs::symlink(src, dst).await.map_err(Into::into) } pub async fn read_link(path: impl AsRef) -> Result { @@ -361,7 +380,63 @@ pub async fn symlink_metadata(path: impl AsRef) -> Result call_with_permit(move |_| std::fs::symlink_metadata(path).map_err(Into::::into)).await } +// We can't just use the stock remove_dir_all as it falls over if someone's set readonly +// permissions. This version walks the directories and fixes the permissions where needed +// before deleting everything. 
+#[cfg(not(target_family = "windows"))] +fn internal_remove_dir_all(path: impl AsRef) -> Result<(), Error> { + // Because otherwise Windows builds complain about these things not being used + use std::io::ErrorKind; + use std::os::unix::fs::PermissionsExt; + + use tracing::debug; + use walkdir::WalkDir; + + for entry in WalkDir::new(&path) { + let Ok(entry) = &entry else { + debug!("Can't get into {entry:?}, assuming already deleted"); + continue; + }; + let metadata = entry.metadata()?; + if metadata.is_dir() { + match std::fs::remove_dir_all(entry.path()) { + Ok(()) => {} + Err(e) if e.kind() == ErrorKind::PermissionDenied => { + std::fs::set_permissions(entry.path(), Permissions::from_mode(0o700)).err_tip( + || format!("Setting permissions for {}", entry.path().display()), + )?; + } + e @ Err(_) => e.err_tip(|| format!("Removing {}", entry.path().display()))?, + } + } else if metadata.is_file() { + std::fs::set_permissions(entry.path(), Permissions::from_mode(0o600)) + .err_tip(|| format!("Setting permissions for {}", entry.path().display()))?; + } + } + + // should now be safe to delete after we fixed all the permissions in the walk loop + match std::fs::remove_dir_all(&path) { + Ok(()) => {} + Err(e) if e.kind() == ErrorKind::NotFound => {} + e @ Err(_) => e.err_tip(|| { + format!( + "Removing {} after permissions fixes", + path.as_ref().display() + ) + })?, + } + Ok(()) +} + +// We can't set the permissions easily in Windows, so just fallback to +// the stock Rust remove_dir_all +#[cfg(target_family = "windows")] +fn internal_remove_dir_all(path: impl AsRef) -> Result<(), Error> { + std::fs::remove_dir_all(&path)?; + Ok(()) +} + pub async fn remove_dir_all(path: impl AsRef) -> Result<(), Error> { let path = path.as_ref().to_owned(); - call_with_permit(move |_| std::fs::remove_dir_all(path).map_err(Into::::into)).await + call_with_permit(move |_| internal_remove_dir_all(path)).await } diff --git a/nativelink-util/src/fs_util.rs b/nativelink-util/src/fs_util.rs 
new file mode 100644 index 000000000..c010370bc --- /dev/null +++ b/nativelink-util/src/fs_util.rs @@ -0,0 +1,385 @@ +// Copyright 2024 The NativeLink Authors. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use core::future::Future; +use core::pin::Pin; +use std::path::Path; + +use nativelink_error::{Code, Error, ResultExt, error_if, make_err}; +use tokio::fs; + +/// Hardlinks an entire directory tree from source to destination. +/// This is much faster than copying for large directory structures. 
+/// +/// # Arguments +/// * `src_dir` - Source directory path (must exist) +/// * `dst_dir` - Destination directory path (will be created) +/// +/// # Returns +/// * `Ok(())` on success +/// * `Err` if hardlinking fails (e.g., cross-filesystem, unsupported filesystem) +/// +/// # Platform Support +/// - Linux: Full support via `fs::hard_link` +/// - macOS: Full support via `fs::hard_link` +/// - Windows: Requires NTFS filesystem and appropriate permissions +/// +/// # Errors +/// - Source directory doesn't exist +/// - Destination already exists +/// - Cross-filesystem hardlinking attempted +/// - Filesystem doesn't support hardlinks +/// - Permission denied +pub async fn hardlink_directory_tree(src_dir: &Path, dst_dir: &Path) -> Result<(), Error> { + error_if!( + !src_dir.exists(), + "Source directory does not exist: {}", + src_dir.display() + ); + + error_if!( + dst_dir.exists(), + "Destination directory already exists: {}", + dst_dir.display() + ); + + // Create the root destination directory + fs::create_dir_all(dst_dir).await.err_tip(|| { + format!( + "Failed to create destination directory: {}", + dst_dir.display() + ) + })?; + + // Recursively hardlink the directory tree + hardlink_directory_tree_recursive(src_dir, dst_dir).await +} + +/// Internal recursive function to hardlink directory contents +fn hardlink_directory_tree_recursive<'a>( + src: &'a Path, + dst: &'a Path, +) -> Pin> + Send + 'a>> { + Box::pin(async move { + let mut entries = fs::read_dir(src) + .await + .err_tip(|| format!("Failed to read directory: {}", src.display()))?; + + while let Some(entry) = entries + .next_entry() + .await + .err_tip(|| format!("Failed to get next entry in: {}", src.display()))? 
+ { + let entry_path = entry.path(); + let file_name = entry.file_name().into_string().map_err(|os_str| { + make_err!( + Code::InvalidArgument, + "Invalid UTF-8 in filename: {:?}", + os_str + ) + })?; + + let dst_path = dst.join(&file_name); + let metadata = entry + .metadata() + .await + .err_tip(|| format!("Failed to get metadata for: {}", entry_path.display()))?; + + if metadata.is_dir() { + // Create subdirectory and recurse + fs::create_dir(&dst_path) + .await + .err_tip(|| format!("Failed to create directory: {}", dst_path.display()))?; + + hardlink_directory_tree_recursive(&entry_path, &dst_path).await?; + } else if metadata.is_file() { + // Hardlink the file + fs::hard_link(&entry_path, &dst_path) + .await + .err_tip(|| { + format!( + "Failed to hardlink {} to {}. This may occur if the source and destination are on different filesystems", + entry_path.display(), + dst_path.display() + ) + })?; + } else if metadata.is_symlink() { + // Read the symlink target and create a new symlink + let target = fs::read_link(&entry_path) + .await + .err_tip(|| format!("Failed to read symlink: {}", entry_path.display()))?; + + #[cfg(unix)] + fs::symlink(&target, &dst_path) + .await + .err_tip(|| format!("Failed to create symlink: {}", dst_path.display()))?; + + #[cfg(windows)] + { + if target.is_dir() { + fs::symlink_dir(&target, &dst_path).await.err_tip(|| { + format!("Failed to create directory symlink: {}", dst_path.display()) + })?; + } else { + fs::symlink_file(&target, &dst_path).await.err_tip(|| { + format!("Failed to create file symlink: {}", dst_path.display()) + })?; + } + } + } + } + + Ok(()) + }) +} + +/// Sets a directory tree to read-only recursively. +/// This prevents actions from modifying cached directories. 
+/// +/// # Arguments +/// * `dir` - Directory to make read-only +/// +/// # Platform Notes +/// - Unix: Sets permissions to 0o555 (r-xr-xr-x) +/// - Windows: Sets `FILE_ATTRIBUTE_READONLY` +pub async fn set_readonly_recursive(dir: &Path) -> Result<(), Error> { + error_if!(!dir.exists(), "Directory does not exist: {}", dir.display()); + + set_readonly_recursive_impl(dir).await +} + +fn set_readonly_recursive_impl<'a>( + path: &'a Path, +) -> Pin> + Send + 'a>> { + Box::pin(async move { + let metadata = fs::metadata(path) + .await + .err_tip(|| format!("Failed to get metadata for: {}", path.display()))?; + + if metadata.is_dir() { + let mut entries = fs::read_dir(path) + .await + .err_tip(|| format!("Failed to read directory: {}", path.display()))?; + + while let Some(entry) = entries + .next_entry() + .await + .err_tip(|| format!("Failed to get next entry in: {}", path.display()))? + { + set_readonly_recursive_impl(&entry.path()).await?; + } + } + + // Set the file/directory to read-only + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt; + let mut perms = metadata.permissions(); + + // If it's a directory, set to r-xr-xr-x (555) + // If it's a file, set to r--r--r-- (444) + let mode = if metadata.is_dir() { 0o555 } else { 0o444 }; + perms.set_mode(mode); + + fs::set_permissions(path, perms) + .await + .err_tip(|| format!("Failed to set permissions for: {}", path.display()))?; + } + + #[cfg(windows)] + { + let mut perms = metadata.permissions(); + perms.set_readonly(true); + + fs::set_permissions(path, perms) + .await + .err_tip(|| format!("Failed to set permissions for: {}", path.display()))?; + } + + Ok(()) + }) +} + +/// Calculates the total size of a directory tree in bytes. +/// Used for cache size tracking and LRU eviction. 
+/// +/// # Arguments +/// * `dir` - Directory to calculate size for +/// +/// # Returns +/// Total size in bytes, or Error if directory cannot be read +pub async fn calculate_directory_size(dir: &Path) -> Result { + error_if!(!dir.exists(), "Directory does not exist: {}", dir.display()); + + calculate_directory_size_impl(dir).await +} + +fn calculate_directory_size_impl<'a>( + path: &'a Path, +) -> Pin> + Send + 'a>> { + Box::pin(async move { + let metadata = fs::metadata(path) + .await + .err_tip(|| format!("Failed to get metadata for: {}", path.display()))?; + + if metadata.is_file() { + return Ok(metadata.len()); + } + + if !metadata.is_dir() { + return Ok(0); + } + + let mut total_size = 0u64; + let mut entries = fs::read_dir(path) + .await + .err_tip(|| format!("Failed to read directory: {}", path.display()))?; + + while let Some(entry) = entries + .next_entry() + .await + .err_tip(|| format!("Failed to get next entry in: {}", path.display()))? + { + total_size += calculate_directory_size_impl(&entry.path()).await?; + } + + Ok(total_size) + }) +} + +#[cfg(test)] +mod tests { + use std::path::PathBuf; + + use nativelink_macro::nativelink_test; + use tempfile::TempDir; + use tokio::io::AsyncWriteExt; + + use super::*; + + async fn create_test_directory() -> Result<(TempDir, PathBuf), Error> { + let temp_dir = TempDir::new().err_tip(|| "Failed to create temp directory")?; + let test_dir = temp_dir.path().join("test_src"); + + fs::create_dir(&test_dir).await?; + + // Create a file + let file1 = test_dir.join("file1.txt"); + let mut f = fs::File::create(&file1).await?; + f.write_all(b"Hello, World!").await?; + f.sync_all().await?; + drop(f); + + // Create a subdirectory with a file + let subdir = test_dir.join("subdir"); + fs::create_dir(&subdir).await?; + + let file2 = subdir.join("file2.txt"); + let mut f = fs::File::create(&file2).await?; + f.write_all(b"Nested file").await?; + f.sync_all().await?; + drop(f); + + Ok((temp_dir, test_dir)) + } + + 
#[nativelink_test("crate")] + async fn test_hardlink_directory_tree() -> Result<(), Error> { + let (temp_dir, src_dir) = create_test_directory().await?; + let dst_dir = temp_dir.path().join("test_dst"); + + // Hardlink the directory + hardlink_directory_tree(&src_dir, &dst_dir).await?; + + // Verify structure + assert!(dst_dir.join("file1.txt").exists()); + assert!(dst_dir.join("subdir").is_dir()); + assert!(dst_dir.join("subdir/file2.txt").exists()); + + // Verify contents + let content1 = fs::read_to_string(dst_dir.join("file1.txt")).await?; + assert_eq!(content1, "Hello, World!"); + + let content2 = fs::read_to_string(dst_dir.join("subdir/file2.txt")).await?; + assert_eq!(content2, "Nested file"); + + // Verify files are hardlinked (same inode on Unix) + #[cfg(unix)] + { + use std::os::unix::fs::MetadataExt; + let src_meta = fs::metadata(src_dir.join("file1.txt")).await?; + let dst_meta = fs::metadata(dst_dir.join("file1.txt")).await?; + assert_eq!( + src_meta.ino(), + dst_meta.ino(), + "Files should have same inode (hardlinked)" + ); + } + + Ok(()) + } + + #[nativelink_test("crate")] + async fn test_set_readonly_recursive() -> Result<(), Error> { + let (_temp_dir, test_dir) = create_test_directory().await?; + + set_readonly_recursive(&test_dir).await?; + + // Verify files are read-only + let metadata = fs::metadata(test_dir.join("file1.txt")).await?; + assert!(metadata.permissions().readonly()); + + let metadata = fs::metadata(test_dir.join("subdir/file2.txt")).await?; + assert!(metadata.permissions().readonly()); + + Ok(()) + } + + #[nativelink_test("crate")] + async fn test_calculate_directory_size() -> Result<(), Error> { + let (_temp_dir, test_dir) = create_test_directory().await?; + + let size = calculate_directory_size(&test_dir).await?; + + // "Hello, World!" 
= 13 bytes + // "Nested file" = 11 bytes + // Total = 24 bytes + assert_eq!(size, 24); + + Ok(()) + } + + #[nativelink_test("crate")] + async fn test_hardlink_nonexistent_source() { + let temp_dir = TempDir::new().unwrap(); + let src = temp_dir.path().join("nonexistent"); + let dst = temp_dir.path().join("dest"); + + let result = hardlink_directory_tree(&src, &dst).await; + assert!(result.is_err()); + } + + #[nativelink_test("crate")] + async fn test_hardlink_existing_destination() -> Result<(), Error> { + let (temp_dir, src_dir) = create_test_directory().await?; + let dst_dir = temp_dir.path().join("existing"); + + fs::create_dir(&dst_dir).await?; + + let result = hardlink_directory_tree(&src_dir, &dst_dir).await; + assert!(result.is_err()); + + Ok(()) + } +} diff --git a/nativelink-util/src/known_platform_property_provider.rs b/nativelink-util/src/known_platform_property_provider.rs index 927ea41f0..93645baab 100644 --- a/nativelink-util/src/known_platform_property_provider.rs +++ b/nativelink-util/src/known_platform_property_provider.rs @@ -18,7 +18,7 @@ use nativelink_metric::RootMetricsComponent; use crate::operation_state_manager::ClientStateManager; -/// KnownPlatformPropertyProvider interface is responsible for retrieving +/// `KnownPlatformPropertyProvider` interface is responsible for retrieving /// a list of known platform properties. // TODO(https://github.com/rust-lang/rust/issues/65991) When this lands we can // move this to the nativelink-scheduler crate. diff --git a/nativelink-util/src/lib.rs b/nativelink-util/src/lib.rs index ea7fd9919..8d2937649 100644 --- a/nativelink-util/src/lib.rs +++ b/nativelink-util/src/lib.rs @@ -1,10 +1,10 @@ // Copyright 2024 The NativeLink Authors. All rights reserved. // -// Licensed under the Functional Source License, Version 1.1, Apache 2.0 Future License (the "License"); +// Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // -// See LICENSE file for details +// http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, @@ -22,9 +22,11 @@ pub mod digest_hasher; pub mod evicting_map; pub mod fastcdc; pub mod fs; +pub mod fs_util; pub mod health_utils; pub mod instant_wrapper; pub mod known_platform_property_provider; +pub mod metrics; pub mod metrics_utils; pub mod operation_state_manager; pub mod origin_event; diff --git a/nativelink-util/src/metrics.rs b/nativelink-util/src/metrics.rs new file mode 100644 index 000000000..82537194e --- /dev/null +++ b/nativelink-util/src/metrics.rs @@ -0,0 +1,1767 @@ +// Copyright 2025 The NativeLink Authors. All rights reserved. +// +// Licensed under the Business Source License, Version 1.1 (the "License"); +// you may not use this file except in compliance with the License. +// You may requested a copy of the License by emailing contact@nativelink.com. +// +// Use of this module requires an enterprise license agreement, which can be +// attained by emailing contact@nativelink.com or signing up for Nativelink +// Cloud at app.nativelink.com. +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::fmt::{Display, Formatter}; +use std::sync::{LazyLock, OnceLock}; + +use crate::action_messages::ActionStage; +use opentelemetry::{global, metrics, InstrumentationScope, KeyValue, Value}; + +/// Callback type for observable gauges that report queued action counts. +/// The callback receives an `Observer` that should be used to record values with attributes. 
+pub type QueuedActionsCallback = Box; + +/// Storage for the external callback for queued actions count. +static QUEUED_ACTIONS_CALLBACK: OnceLock = OnceLock::new(); + +/// Registers an external callback for the `execution_queued_actions_count` observable gauge. +/// +/// This function can only be called once. Subsequent calls will panic. +/// +/// The callback will be invoked during metrics collection and should report +/// the current count of queued actions by calling the provided observer function +/// with the count and any relevant attributes (e.g., platform properties). +/// +/// # Panics +/// +/// Panics if the callback has already been registered. +/// +/// # Example +/// ```ignore +/// register_queued_actions_callback(Box::new(|observe| { +/// // Report counts for different platform configurations +/// observe(10, &[KeyValue::new("platform", "linux")]); +/// observe(5, &[KeyValue::new("platform", "windows")]); +/// })); +/// ``` +pub fn register_queued_actions_callback(callback: QueuedActionsCallback) { + if QUEUED_ACTIONS_CALLBACK.set(callback).is_err() { + panic!("Queued actions callback can only be registered once"); + } +} + +// Metric attribute keys for cache operations. +pub const CACHE_TYPE: &str = "cache.type"; +pub const CACHE_OPERATION: &str = "cache.operation.name"; +pub const CACHE_RESULT: &str = "cache.operation.result"; +pub const STORE_TYPE: &str = "store.type"; +pub const STORE_NAME: &str = "store.name"; + +// Metric attribute keys for remote execution operations. +pub const EXECUTION_STAGE: &str = "execution_stage"; +pub const EXECUTION_RESULT: &str = "execution_result"; +pub const EXECUTION_INSTANCE: &str = "execution_instance"; +pub const EXECUTION_PRIORITY: &str = "execution_priority"; +pub const EXECUTION_EXIT_CODE: &str = "execution_exit_code"; + +/// Cache operation types for metrics classification. +#[derive(Debug, Clone, Copy)] +pub enum CacheOperationName { + /// Data retrieval operations (get, peek, contains, etc.) 
+ Read, + /// Data storage operations (insert, update, replace, etc.) + Write, + /// Explicit data removal operations + Delete, + /// Automatic cache maintenance (evictions, TTL cleanup, etc.) + Evict, +} + +impl From for Value { + fn from(op: CacheOperationName) -> Self { + match op { + CacheOperationName::Read => Self::from("read"), + CacheOperationName::Write => Self::from("write"), + CacheOperationName::Delete => Self::from("delete"), + CacheOperationName::Evict => Self::from("evict"), + } + } +} + +/// Results of cache operations. +/// +/// Result semantics vary by operation type: +/// - Read: Hit/Miss/Expired indicate data availability +/// - Write/Delete/Evict: Success/Error indicate completion status +#[derive(Debug, Clone, Copy)] +pub enum CacheOperationResult { + /// Data found and valid (Read operations) + Hit, + /// Data not found (Read operations) + Miss, + /// Data found but invalid/expired (Read operations) + Expired, + /// Operation completed successfully (Write/Delete/Evict operations) + Success, + /// Operation failed (any operation type) + Error, +} + +impl From for Value { + fn from(result: CacheOperationResult) -> Self { + match result { + CacheOperationResult::Hit => Self::from("hit"), + CacheOperationResult::Miss => Self::from("miss"), + CacheOperationResult::Expired => Self::from("expired"), + CacheOperationResult::Success => Self::from("success"), + CacheOperationResult::Error => Self::from("error"), + } + } +} + +/// Remote execution stages for metrics classification. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ExecutionStage { + /// Unknown stage + Unknown, + /// Checking cache for existing results + CacheCheck, + /// Action is queued waiting for execution + Queued, + /// Action is being executed by a worker + Executing, + /// Action execution completed + Completed, +} + +impl From for Value { + fn from(stage: ExecutionStage) -> Self { + match stage { + ExecutionStage::Unknown => Self::from("unknown"), + ExecutionStage::CacheCheck => Self::from("cache_check"), + ExecutionStage::Queued => Self::from("queued"), + ExecutionStage::Executing => Self::from("executing"), + ExecutionStage::Completed => Self::from("completed"), + } + } +} + +impl From for ExecutionStage { + fn from(stage: ActionStage) -> Self { + match stage { + ActionStage::Unknown => Self::Unknown, + ActionStage::CacheCheck => Self::CacheCheck, + ActionStage::Queued => Self::Queued, + ActionStage::Executing => Self::Executing, + ActionStage::Completed(_) | ActionStage::CompletedFromCache(_) => Self::Completed, + } + } +} + +impl From<&ActionStage> for ExecutionStage { + fn from(stage: &ActionStage) -> Self { + match stage { + ActionStage::Unknown => Self::Unknown, + ActionStage::CacheCheck => Self::CacheCheck, + ActionStage::Queued => Self::Queued, + ActionStage::Executing => Self::Executing, + ActionStage::Completed(_) | ActionStage::CompletedFromCache(_) => Self::Completed, + } + } +} + +/// Results of remote execution operations. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ExecutionResult { + /// Execution completed successfully + Success, + /// Execution failed + Failure, + /// Execution was cancelled + Cancelled, + /// Execution timed out + Timeout, + /// Result was found in cache + CacheHit, +} + +impl From for Value { + fn from(result: ExecutionResult) -> Self { + match result { + ExecutionResult::Success => Self::from("success"), + ExecutionResult::Failure => Self::from("failure"), + ExecutionResult::Cancelled => Self::from("cancelled"), + ExecutionResult::Timeout => Self::from("timeout"), + ExecutionResult::CacheHit => Self::from("cache_hit"), + } + } +} + +/// Pre-allocated attribute combinations for efficient cache metrics collection. +/// +/// Avoids runtime allocation by pre-computing common attribute combinations +/// for cache operations and results. +#[derive(Debug)] +pub struct CacheMetricAttrs { + // Read operation attributes + read_hit: Vec, + read_miss: Vec, + read_expired: Vec, + + // Write operation attributes + write_success: Vec, + write_error: Vec, + + // Delete operation attributes + delete_success: Vec, + delete_miss: Vec, + delete_error: Vec, + + // Evict operation attributes + evict_success: Vec, + evict_expired: Vec, +} + +impl CacheMetricAttrs { + /// Creates a new set of pre-computed attributes. + /// + /// The `base_attrs` are included in all attribute combinations (e.g., cache + /// type, instance ID). 
+ #[must_use] + pub fn new(base_attrs: &[KeyValue]) -> Self { + let make_attrs = |op: CacheOperationName, result: CacheOperationResult| { + let mut attrs = base_attrs.to_vec(); + attrs.push(KeyValue::new(CACHE_OPERATION, op)); + attrs.push(KeyValue::new(CACHE_RESULT, result)); + attrs + }; + + Self { + read_hit: make_attrs(CacheOperationName::Read, CacheOperationResult::Hit), + read_miss: make_attrs(CacheOperationName::Read, CacheOperationResult::Miss), + read_expired: make_attrs(CacheOperationName::Read, CacheOperationResult::Expired), + + write_success: make_attrs(CacheOperationName::Write, CacheOperationResult::Success), + write_error: make_attrs(CacheOperationName::Write, CacheOperationResult::Error), + + delete_success: make_attrs(CacheOperationName::Delete, CacheOperationResult::Success), + delete_miss: make_attrs(CacheOperationName::Delete, CacheOperationResult::Miss), + delete_error: make_attrs(CacheOperationName::Delete, CacheOperationResult::Error), + + evict_success: make_attrs(CacheOperationName::Evict, CacheOperationResult::Success), + evict_expired: make_attrs(CacheOperationName::Evict, CacheOperationResult::Expired), + } + } + + // Attribute accessors + #[must_use] + pub fn read_hit(&self) -> &[KeyValue] { + &self.read_hit + } + #[must_use] + pub fn read_miss(&self) -> &[KeyValue] { + &self.read_miss + } + #[must_use] + pub fn read_expired(&self) -> &[KeyValue] { + &self.read_expired + } + #[must_use] + pub fn write_success(&self) -> &[KeyValue] { + &self.write_success + } + #[must_use] + pub fn write_error(&self) -> &[KeyValue] { + &self.write_error + } + #[must_use] + pub fn delete_success(&self) -> &[KeyValue] { + &self.delete_success + } + #[must_use] + pub fn delete_miss(&self) -> &[KeyValue] { + &self.delete_miss + } + #[must_use] + pub fn delete_error(&self) -> &[KeyValue] { + &self.delete_error + } + #[must_use] + pub fn evict_success(&self) -> &[KeyValue] { + &self.evict_success + } + #[must_use] + pub fn evict_expired(&self) -> &[KeyValue] 
{ + &self.evict_expired + } +} + +/// Pre-allocated attribute combinations for efficient remote execution metrics collection. +#[derive(Debug)] +pub struct ExecutionMetricAttrs { + // Stage transition attributes + unknown: Vec, + cache_check: Vec, + queued: Vec, + executing: Vec, + completed_success: Vec, + completed_failure: Vec, + completed_cancelled: Vec, + completed_timeout: Vec, + completed_cache_hit: Vec, +} + +impl ExecutionMetricAttrs { + /// Creates a new set of pre-computed attributes. + /// + /// The `base_attrs` are included in all attribute combinations (e.g., instance + /// name, worker ID). + #[must_use] + pub fn new(base_attrs: &[KeyValue]) -> Self { + let make_attrs = |stage: ExecutionStage, result: Option| { + let mut attrs = base_attrs.to_vec(); + attrs.push(KeyValue::new(EXECUTION_STAGE, stage)); + if let Some(result) = result { + attrs.push(KeyValue::new(EXECUTION_RESULT, result)); + } + attrs + }; + + Self { + unknown: make_attrs(ExecutionStage::Unknown, None), + cache_check: make_attrs(ExecutionStage::CacheCheck, None), + queued: make_attrs(ExecutionStage::Queued, None), + executing: make_attrs(ExecutionStage::Executing, None), + completed_success: make_attrs( + ExecutionStage::Completed, + Some(ExecutionResult::Success), + ), + completed_failure: make_attrs( + ExecutionStage::Completed, + Some(ExecutionResult::Failure), + ), + completed_cancelled: make_attrs( + ExecutionStage::Completed, + Some(ExecutionResult::Cancelled), + ), + completed_timeout: make_attrs( + ExecutionStage::Completed, + Some(ExecutionResult::Timeout), + ), + completed_cache_hit: make_attrs( + ExecutionStage::Completed, + Some(ExecutionResult::CacheHit), + ), + } + } + + // Attribute accessors + #[must_use] + pub fn unknown(&self) -> &[KeyValue] { + &self.unknown + } + #[must_use] + pub fn cache_check(&self) -> &[KeyValue] { + &self.cache_check + } + #[must_use] + pub fn queued(&self) -> &[KeyValue] { + &self.queued + } + #[must_use] + pub fn executing(&self) -> 
&[KeyValue] { + &self.executing + } + #[must_use] + pub fn completed_success(&self) -> &[KeyValue] { + &self.completed_success + } + #[must_use] + pub fn completed_failure(&self) -> &[KeyValue] { + &self.completed_failure + } + #[must_use] + pub fn completed_cancelled(&self) -> &[KeyValue] { + &self.completed_cancelled + } + #[must_use] + pub fn completed_timeout(&self) -> &[KeyValue] { + &self.completed_timeout + } + #[must_use] + pub fn completed_cache_hit(&self) -> &[KeyValue] { + &self.completed_cache_hit + } +} + +/// Global cache metrics instruments. +pub static CACHE_METRICS: LazyLock = LazyLock::new(|| { + let meter = global::meter_with_scope(InstrumentationScope::builder("nativelink").build()); + + CacheMetrics { + cache_operation_duration: meter + .f64_histogram("cache.operation.duration") + .with_description("Duration of cache operations in milliseconds") + .with_unit("ms") + // The range of these is quite large as a cache might be backed by + // memory, a filesystem, or network storage. The current values were + // determined empirically and might need adjustment. 
+ .with_boundaries(vec![ + // Microsecond range + 0.001, // 1μs + 0.005, // 5μs + 0.01, // 10μs + 0.05, // 50μs + 0.1, // 100μs + // Sub-millisecond range + 0.2, // 200μs + 0.5, // 500μs + 1.0, // 1ms + // Low millisecond range + 2.0, // 2ms + 5.0, // 5ms + 10.0, // 10ms + 20.0, // 20ms + 50.0, // 50ms + 100.0, // 100ms + // Higher latency range + 200.0, // 200ms + 500.0, // 500ms + 1000.0, // 1 second + 2000.0, // 2 seconds + 5000.0, // 5 seconds + ]) + .build(), + + cache_operations: meter + .u64_counter("cache.operations") + .with_description("Total cache operations by type and result") + .build(), + + cache_io: meter + .u64_counter("cache.io") + .with_description("Total bytes processed by cache operations") + .with_unit("By") + .build(), + + cache_size: meter + .i64_up_down_counter("cache.size") + .with_description("Current total size of cached data") + .with_unit("By") + .build(), + + cache_entries: meter + .i64_up_down_counter("cache.entries") + .with_description("Current number of cached entries") + .with_unit("{entry}") + .build(), + + cache_entry_size: meter + .u64_histogram("cache.item.size") + .with_description("Size distribution of cached entries") + .with_unit("By") + .build(), + } +}); + +/// OpenTelemetry metrics instruments for cache monitoring. +#[derive(Debug)] +pub struct CacheMetrics { + /// Histogram of cache operation durations in milliseconds + pub cache_operation_duration: metrics::Histogram, + /// Counter of cache operations by type and result + pub cache_operations: metrics::Counter, + /// Counter of bytes read/written during cache operations + pub cache_io: metrics::Counter, + /// Current total size of all cached data in bytes + pub cache_size: metrics::UpDownCounter, + /// Current number of entries in cache + pub cache_entries: metrics::UpDownCounter, + /// Histogram of individual cache entry sizes in bytes + pub cache_entry_size: metrics::Histogram, +} + +/// Global remote execution metrics instruments. 
+pub static EXECUTION_METRICS: LazyLock = LazyLock::new(|| { + let meter = global::meter_with_scope(InstrumentationScope::builder("nativelink").build()); + + ExecutionMetrics { + execution_stage_duration: meter + .f64_histogram("execution_stage_duration") + .with_description("Duration of each execution stage in seconds") + .with_unit("s") + .with_boundaries(vec![ + // Sub-second range + 0.001, // 1ms + 0.01, // 10ms + 0.1, // 100ms + 0.5, // 500ms + 1.0, // 1s + // Multi-second range + 2.0, // 2s + 5.0, // 5s + 10.0, // 10s + 30.0, // 30s + 60.0, // 1 minute + 120.0, // 2 minutes + 300.0, // 5 minutes + 600.0, // 10 minutes + 1800.0, // 30 minutes + 3600.0, // 1 hour + ]) + .build(), + + execution_total_duration: meter + .f64_histogram("execution_total_duration") + .with_description( + "Total duration of action execution from submission to completion in seconds", + ) + .with_unit("s") + .with_boundaries(vec![ + // Sub-second range + 0.01, // 10ms + 0.1, // 100ms + 0.5, // 500ms + 1.0, // 1s + // Multi-second range + 5.0, // 5s + 10.0, // 10s + 30.0, // 30s + 60.0, // 1 minute + 300.0, // 5 minutes + 600.0, // 10 minutes + 1800.0, // 30 minutes + 3600.0, // 1 hour + 7200.0, // 2 hours + ]) + .build(), + + execution_queue_time: meter + .f64_histogram("execution_queue_time") + .with_description("Time spent waiting in queue before execution in seconds") + .with_unit("s") + .with_boundaries(vec![ + 0.001, // 1ms + 0.01, // 10ms + 0.1, // 100ms + 0.5, // 500ms + 1.0, // 1s + 2.0, // 2s + 5.0, // 5s + 10.0, // 10s + 30.0, // 30s + 60.0, // 1 minute + 300.0, // 5 minutes + 600.0, // 10 minutes + ]) + .build(), + + execution_active_count: meter + .i64_up_down_counter("execution_active_count") + .with_description("Number of actions currently in each stage") + .with_unit("{action}") + .build(), + + execution_completed_count: meter + .u64_counter("execution_completed_count") + .with_description("Total number of completed executions by result") + .with_unit("{action}") + 
.build(), + + execution_stage_transitions: meter + .u64_counter("execution_stage_transitions") + .with_description("Number of stage transitions") + .with_unit("{transition}") + .build(), + + execution_output_size: meter + .u64_histogram("execution_output_size") + .with_description("Size of execution outputs in bytes") + .with_unit("By") + .with_boundaries(vec![ + 1_024.0, // 1KB + 10_240.0, // 10KB + 102_400.0, // 100KB + 1_048_576.0, // 1MB + 10_485_760.0, // 10MB + 104_857_600.0, // 100MB + 1_073_741_824.0, // 1GB + 10_737_418_240.0, // 10GB + ]) + .build(), + + execution_cpu_time: meter + .f64_histogram("execution_cpu_time") + .with_description("CPU time consumed by action execution in seconds") + .with_unit("s") + .with_boundaries(vec![ + 0.01, // 10ms + 0.1, // 100ms + 1.0, // 1s + 10.0, // 10s + 60.0, // 1 minute + 300.0, // 5 minutes + 600.0, // 10 minutes + 1800.0, // 30 minutes + 3600.0, // 1 hour + ]) + .build(), + + execution_memory_usage: meter + .u64_histogram("execution_memory_usage") + .with_description("Peak memory usage during execution in bytes") + .with_unit("By") + .with_boundaries(vec![ + 1_048_576.0, // 1MB + 10_485_760.0, // 10MB + 104_857_600.0, // 100MB + 524_288_000.0, // 500MB + 1_073_741_824.0, // 1GB + 5_368_709_120.0, // 5GB + 10_737_418_240.0, // 10GB + 53_687_091_200.0, // 50GB + ]) + .build(), + + execution_retry_count: meter + .u64_counter("execution_retry_count") + .with_description("Number of execution retries") + .with_unit("{retry}") + .build(), + + execution_actions_count: meter + .u64_gauge("execution_actions_count") + .with_description("Current number of actions in each stage") + .with_unit("{action}") + .build(), + + execution_queued_actions_count: meter + .u64_observable_gauge("execution_queued_actions_count_observable") + .with_description("Current number of queued actions by platform properties") + .with_unit("{action}") + .with_callback(|observer| { + if let Some(callback) = QUEUED_ACTIONS_CALLBACK.get() { + 
callback(&|value, attrs| { + observer.observe(value, attrs); + }); + } + }) + .build(), + + do_try_match_duration: meter + .f64_histogram("do_try_match_duration") + .with_description("Duration of do_try_match in seconds") + .with_unit("s") + .with_boundaries(vec![ + 0.01, // 10ms + 0.1, // 100ms + 1.0, // 1s + 10.0, // 10s + 60.0, // 1 minute + 300.0, // 5 minutes + 600.0, // 10 minutes + 1200.0, // 20 minutes + 1800.0, // 30 minutes + 2400.0, // 40 minutes + 3000.0, // 50 minutes + 3600.0, // 1 hour + ]) + .build(), + } +}); + +/// OpenTelemetry metrics instruments for remote execution monitoring. +#[derive(Debug)] +pub struct ExecutionMetrics { + /// Histogram of stage durations in seconds + pub execution_stage_duration: metrics::Histogram, + /// Histogram of total execution durations in seconds + pub execution_total_duration: metrics::Histogram, + /// Histogram of queue wait times in seconds + pub execution_queue_time: metrics::Histogram, + /// Current number of actions in each stage + pub execution_active_count: metrics::UpDownCounter, + /// Total number of completed executions + pub execution_completed_count: metrics::Counter, + /// Number of stage transitions + pub execution_stage_transitions: metrics::Counter, + /// Histogram of output sizes in bytes + pub execution_output_size: metrics::Histogram, + /// Histogram of CPU time in seconds + pub execution_cpu_time: metrics::Histogram, + /// Histogram of peak memory usage in bytes + pub execution_memory_usage: metrics::Histogram, + /// Counter for execution retries + pub execution_retry_count: metrics::Counter, + /// Gauge of actions by stage + pub execution_actions_count: metrics::Gauge, + // Gauge of queued actions by platform properties + pub execution_queued_actions_count: metrics::ObservableGauge, + /// Duration of do_try_match in ms + pub do_try_match_duration: metrics::Histogram, +} + +/// Helper function to create attributes for execution metrics +#[must_use] +pub fn make_execution_attributes( + 
instance_name: &str, + priority: Option, +) -> Vec { + let mut attrs = vec![KeyValue::new(EXECUTION_INSTANCE, instance_name.to_string())]; + + if let Some(priority) = priority { + attrs.push(KeyValue::new(EXECUTION_PRIORITY, i64::from(priority))); + } + + attrs +} + +// Metric attribute keys for worker pool operations. +pub const WORKER_POOL_INSTANCE: &str = "worker_pool_instance"; +pub const WORKER_EVENT_TYPE: &str = "worker_pool_event_type"; +pub const WORKER_STATE: &str = "worker_pool_state"; + +/// Worker event types for metrics classification. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum WorkerEventType { + /// Worker was added to the pool + Added, + /// Worker was removed from the pool + Removed, + /// Worker timed out + Timeout, + /// Worker connection failed + ConnectionFailed, + /// Worker was evicted due to error + Evicted, +} + +impl From for Value { + fn from(event: WorkerEventType) -> Self { + match event { + WorkerEventType::Added => Self::from("added"), + WorkerEventType::Removed => Self::from("removed"), + WorkerEventType::Timeout => Self::from("timeout"), + WorkerEventType::ConnectionFailed => Self::from("connection_failed"), + WorkerEventType::Evicted => Self::from("evicted"), + } + } +} + +/// Worker state types for metrics classification. +#[derive(Debug, Clone, Copy)] +pub enum WorkerState { + /// Worker is available and can accept work + Available, + /// Worker is paused (backpressure) + Paused, + /// Worker is draining (not accepting new work) + Draining, +} + +impl From for Value { + fn from(state: WorkerState) -> Self { + match state { + WorkerState::Available => Self::from("available"), + WorkerState::Paused => Self::from("paused"), + WorkerState::Draining => Self::from("draining"), + } + } +} + +/// Pre-allocated attribute combinations for efficient worker metrics collection. 
+#[derive(Debug)] +pub struct WorkerPoolMetricAttrs { + added: Vec, + removed: Vec, + timeout: Vec, + connection_failed: Vec, + evicted: Vec, + state_available: Vec, + state_paused: Vec, + state_draining: Vec, +} + +impl WorkerPoolMetricAttrs { + #[must_use] + pub fn new(base_attrs: &[KeyValue]) -> Self { + let make_event_attrs = |event: WorkerEventType| { + let mut attrs = base_attrs.to_vec(); + attrs.push(KeyValue::new(WORKER_EVENT_TYPE, event)); + attrs + }; + + let make_state_attrs = |state: WorkerState| { + let mut attrs = base_attrs.to_vec(); + attrs.push(KeyValue::new(WORKER_STATE, state)); + attrs + }; + + Self { + added: make_event_attrs(WorkerEventType::Added), + removed: make_event_attrs(WorkerEventType::Removed), + timeout: make_event_attrs(WorkerEventType::Timeout), + connection_failed: make_event_attrs(WorkerEventType::ConnectionFailed), + evicted: make_event_attrs(WorkerEventType::Evicted), + state_available: make_state_attrs(WorkerState::Available), + state_paused: make_state_attrs(WorkerState::Paused), + state_draining: make_state_attrs(WorkerState::Draining), + } + } + + #[must_use] + pub fn added(&self) -> &[KeyValue] { + &self.added + } + #[must_use] + pub fn removed(&self) -> &[KeyValue] { + &self.removed + } + #[must_use] + pub fn timeout(&self) -> &[KeyValue] { + &self.timeout + } + #[must_use] + pub fn connection_failed(&self) -> &[KeyValue] { + &self.connection_failed + } + #[must_use] + pub fn evicted(&self) -> &[KeyValue] { + &self.evicted + } + #[must_use] + pub fn state_available(&self) -> &[KeyValue] { + &self.state_available + } + #[must_use] + pub fn state_paused(&self) -> &[KeyValue] { + &self.state_paused + } + #[must_use] + pub fn state_draining(&self) -> &[KeyValue] { + &self.state_draining + } +} + +/// Global worker pool metrics instruments. 
+pub static WORKER_POOL_METRICS: LazyLock = LazyLock::new(|| { + let meter = global::meter_with_scope(InstrumentationScope::builder("nativelink").build()); + + WorkerPoolMetrics { + worker_count: meter + .u64_gauge("worker_pool_count") + .with_description("Current number of workers in the pool") + .with_unit("{worker}") + .build(), + + worker_events: meter + .u64_counter("worker_pool_events") + .with_description("Total worker pool events by type") + .with_unit("{event}") + .build(), + + worker_actions_running: meter + .u64_gauge("worker_pool_actions_running") + .with_description("Current number of actions running on workers") + .with_unit("{action}") + .build(), + + worker_actions_dispatched: meter + .u64_counter("worker_pool_actions_dispatched") + .with_description("Total number of actions dispatched to workers") + .with_unit("{action}") + .build(), + + worker_actions_completed: meter + .u64_counter("worker_pool_actions_completed") + .with_description("Total number of actions completed on workers") + .with_unit("{action}") + .build(), + + worker_dispatch_failures: meter + .u64_counter("worker_pool_dispatch_failures") + .with_description("Total number of action dispatch failures") + .with_unit("{failure}") + .build(), + } +}); + +/// OpenTelemetry metrics instruments for worker pool monitoring. +#[derive(Debug)] +pub struct WorkerPoolMetrics { + /// Current number of workers in the pool + pub worker_count: metrics::Gauge, + /// Counter of worker events by type + pub worker_events: metrics::Counter, + /// Current number of actions running on workers + pub worker_actions_running: metrics::Gauge, + /// Counter of actions dispatched to workers + pub worker_actions_dispatched: metrics::Counter, + /// Counter of actions completed on workers + pub worker_actions_completed: metrics::Counter, + /// Counter of action dispatch failures + pub worker_dispatch_failures: metrics::Counter, +} + +// Metric attribute keys for local worker operations. 
+pub const WORKER_NAME: &str = "worker.name"; +pub const WORKER_OPERATION: &str = "worker.operation"; +pub const WORKER_RESULT: &str = "worker.result"; + +/// Global local worker metrics instruments. +pub static LOCAL_WORKER_METRICS: LazyLock = LazyLock::new(|| { + let meter = global::meter_with_scope(InstrumentationScope::builder("nativelink").build()); + + LocalWorkerMetrics { + start_actions_received: meter + .u64_counter("worker_start_actions_received") + .with_description("Total number of actions sent to this worker to process") + .with_unit("{action}") + .build(), + + disconnects_received: meter + .u64_counter("worker_disconnects_received") + .with_description("Total number of disconnects received from the scheduler") + .with_unit("{disconnect}") + .build(), + + keep_alives_received: meter + .u64_counter("worker_keep_alives_received") + .with_description("Total number of keep-alives received from the scheduler") + .with_unit("{keepalive}") + .build(), + + preconditions_calls: meter + .u64_counter("worker_preconditions_calls") + .with_description("Total number of precondition check calls") + .with_unit("{call}") + .build(), + + preconditions_successes: meter + .u64_counter("worker_preconditions_successes") + .with_description("Total number of successful precondition checks") + .with_unit("{success}") + .build(), + + preconditions_failures: meter + .u64_counter("worker_preconditions_failures") + .with_description("Total number of failed precondition checks") + .with_unit("{failure}") + .build(), + + preconditions_duration: meter + .f64_histogram("worker_preconditions_duration") + .with_description("Duration of precondition checks in milliseconds") + .with_unit("ms") + .with_boundaries(vec![ + 0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 20.0, 50.0, 100.0, 200.0, 500.0, 1000.0, 2000.0, + 5000.0, + ]) + .build(), + } +}); + +/// OpenTelemetry metrics instruments for local worker monitoring. 
+#[derive(Debug)] +pub struct LocalWorkerMetrics { + /// Counter for actions received by the worker + pub start_actions_received: metrics::Counter, + /// Counter for disconnects received from scheduler + pub disconnects_received: metrics::Counter, + /// Counter for keep-alives received from scheduler + pub keep_alives_received: metrics::Counter, + /// Counter for precondition check calls + pub preconditions_calls: metrics::Counter, + /// Counter for successful precondition checks + pub preconditions_successes: metrics::Counter, + /// Counter for failed precondition checks + pub preconditions_failures: metrics::Counter, + /// Histogram for precondition check durations + pub preconditions_duration: metrics::Histogram, +} + +/// Pre-allocated attribute combinations for efficient worker metrics collection. +#[derive(Debug)] +pub struct WorkerMetricAttrs { + base: Vec, +} + +impl WorkerMetricAttrs { + /// Creates a new set of pre-computed attributes with the worker name. + #[must_use] + pub fn new(worker_name: &str) -> Self { + Self { + base: vec![KeyValue::new(WORKER_NAME, worker_name.to_string())], + } + } + + #[must_use] + pub fn base(&self) -> &[KeyValue] { + &self.base + } +} + +// Metric attribute keys for running actions operations. +pub const RUNNING_ACTION_OPERATION: &str = "running_action.operation"; +pub const RUNNING_ACTION_RESULT: &str = "running_action.result"; + +/// Global running actions metrics instruments. 
+pub static RUNNING_ACTIONS_METRICS: LazyLock = LazyLock::new(|| { + let meter = global::meter_with_scope(InstrumentationScope::builder("nativelink").build()); + + // Helper to create standard histogram boundaries for operation durations + let duration_boundaries = vec![ + 0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 20.0, 50.0, 100.0, 200.0, 500.0, 1000.0, 2000.0, 5000.0, + 10000.0, 30000.0, 60000.0, + ]; + + RunningActionsMetrics { + // Async operation counters + create_and_add_action_calls: meter + .u64_counter("running_actions_create_and_add_action_calls") + .with_description("Total calls to create_and_add_action") + .with_unit("{call}") + .build(), + create_and_add_action_successes: meter + .u64_counter("running_actions_create_and_add_action_successes") + .with_description("Successful create_and_add_action operations") + .with_unit("{success}") + .build(), + create_and_add_action_failures: meter + .u64_counter("running_actions_create_and_add_action_failures") + .with_description("Failed create_and_add_action operations") + .with_unit("{failure}") + .build(), + create_and_add_action_duration: meter + .f64_histogram("running_actions_create_and_add_action_duration") + .with_description("Duration of create_and_add_action operations") + .with_unit("ms") + .with_boundaries(duration_boundaries.clone()) + .build(), + + cache_action_result_calls: meter + .u64_counter("running_actions_cache_action_result_calls") + .with_description("Total calls to cache_action_result") + .with_unit("{call}") + .build(), + cache_action_result_successes: meter + .u64_counter("running_actions_cache_action_result_successes") + .with_description("Successful cache_action_result operations") + .with_unit("{success}") + .build(), + cache_action_result_failures: meter + .u64_counter("running_actions_cache_action_result_failures") + .with_description("Failed cache_action_result operations") + .with_unit("{failure}") + .build(), + cache_action_result_duration: meter + 
.f64_histogram("running_actions_cache_action_result_duration") + .with_description("Duration of cache_action_result operations") + .with_unit("ms") + .with_boundaries(duration_boundaries.clone()) + .build(), + + kill_all_calls: meter + .u64_counter("running_actions_kill_all_calls") + .with_description("Total calls to kill_all") + .with_unit("{call}") + .build(), + kill_all_duration: meter + .f64_histogram("running_actions_kill_all_duration") + .with_description("Duration of kill_all operations") + .with_unit("ms") + .with_boundaries(duration_boundaries.clone()) + .build(), + + create_action_info_calls: meter + .u64_counter("running_actions_create_action_info_calls") + .with_description("Total calls to create_action_info") + .with_unit("{call}") + .build(), + create_action_info_successes: meter + .u64_counter("running_actions_create_action_info_successes") + .with_description("Successful create_action_info operations") + .with_unit("{success}") + .build(), + create_action_info_failures: meter + .u64_counter("running_actions_create_action_info_failures") + .with_description("Failed create_action_info operations") + .with_unit("{failure}") + .build(), + create_action_info_duration: meter + .f64_histogram("running_actions_create_action_info_duration") + .with_description("Duration of create_action_info operations") + .with_unit("ms") + .with_boundaries(duration_boundaries.clone()) + .build(), + + make_action_directory_calls: meter + .u64_counter("running_actions_make_action_directory_calls") + .with_description("Total calls to make_action_directory") + .with_unit("{call}") + .build(), + make_action_directory_successes: meter + .u64_counter("running_actions_make_action_directory_successes") + .with_description("Successful make_action_directory operations") + .with_unit("{success}") + .build(), + make_action_directory_failures: meter + .u64_counter("running_actions_make_action_directory_failures") + .with_description("Failed make_action_directory operations") + 
.with_unit("{failure}") + .build(), + make_action_directory_duration: meter + .f64_histogram("running_actions_make_action_directory_duration") + .with_description("Duration of make_action_directory operations") + .with_unit("ms") + .with_boundaries(duration_boundaries.clone()) + .build(), + + prepare_action_calls: meter + .u64_counter("running_actions_prepare_action_calls") + .with_description("Total calls to prepare_action") + .with_unit("{call}") + .build(), + prepare_action_successes: meter + .u64_counter("running_actions_prepare_action_successes") + .with_description("Successful prepare_action operations") + .with_unit("{success}") + .build(), + prepare_action_failures: meter + .u64_counter("running_actions_prepare_action_failures") + .with_description("Failed prepare_action operations") + .with_unit("{failure}") + .build(), + prepare_action_duration: meter + .f64_histogram("running_actions_prepare_action_duration") + .with_description("Duration of prepare_action operations") + .with_unit("ms") + .with_boundaries(duration_boundaries.clone()) + .build(), + + execute_calls: meter + .u64_counter("running_actions_execute_calls") + .with_description("Total calls to execute") + .with_unit("{call}") + .build(), + execute_successes: meter + .u64_counter("running_actions_execute_successes") + .with_description("Successful execute operations") + .with_unit("{success}") + .build(), + execute_failures: meter + .u64_counter("running_actions_execute_failures") + .with_description("Failed execute operations") + .with_unit("{failure}") + .build(), + execute_duration: meter + .f64_histogram("running_actions_execute_duration") + .with_description("Duration of execute operations") + .with_unit("ms") + .with_boundaries(duration_boundaries.clone()) + .build(), + + upload_results_calls: meter + .u64_counter("running_actions_upload_results_calls") + .with_description("Total calls to upload_results") + .with_unit("{call}") + .build(), + upload_results_successes: meter + 
.u64_counter("running_actions_upload_results_successes") + .with_description("Successful upload_results operations") + .with_unit("{success}") + .build(), + upload_results_failures: meter + .u64_counter("running_actions_upload_results_failures") + .with_description("Failed upload_results operations") + .with_unit("{failure}") + .build(), + upload_results_duration: meter + .f64_histogram("running_actions_upload_results_duration") + .with_description("Duration of upload_results operations") + .with_unit("ms") + .with_boundaries(duration_boundaries.clone()) + .build(), + + cleanup_calls: meter + .u64_counter("running_actions_cleanup_calls") + .with_description("Total calls to cleanup") + .with_unit("{call}") + .build(), + cleanup_successes: meter + .u64_counter("running_actions_cleanup_successes") + .with_description("Successful cleanup operations") + .with_unit("{success}") + .build(), + cleanup_failures: meter + .u64_counter("running_actions_cleanup_failures") + .with_description("Failed cleanup operations") + .with_unit("{failure}") + .build(), + cleanup_duration: meter + .f64_histogram("running_actions_cleanup_duration") + .with_description("Duration of cleanup operations") + .with_unit("ms") + .with_boundaries(duration_boundaries.clone()) + .build(), + + get_finished_result_calls: meter + .u64_counter("running_actions_get_finished_result_calls") + .with_description("Total calls to get_finished_result") + .with_unit("{call}") + .build(), + get_finished_result_successes: meter + .u64_counter("running_actions_get_finished_result_successes") + .with_description("Successful get_finished_result operations") + .with_unit("{success}") + .build(), + get_finished_result_failures: meter + .u64_counter("running_actions_get_finished_result_failures") + .with_description("Failed get_finished_result operations") + .with_unit("{failure}") + .build(), + get_finished_result_duration: meter + .f64_histogram("running_actions_get_finished_result_duration") + 
.with_description("Duration of get_finished_result operations") + .with_unit("ms") + .with_boundaries(duration_boundaries.clone()) + .build(), + + // Simple counters + cleanup_waits: meter + .u64_counter("running_actions_cleanup_waits") + .with_description("Number of times an action waited for cleanup to complete") + .with_unit("{wait}") + .build(), + + stale_removals: meter + .u64_counter("running_actions_stale_removals") + .with_description("Number of stale directories removed during action retries") + .with_unit("{removal}") + .build(), + + cleanup_wait_timeouts: meter + .u64_counter("running_actions_cleanup_wait_timeouts") + .with_description("Number of timeouts while waiting for cleanup to complete") + .with_unit("{timeout}") + .build(), + + // Additional async operation metrics + get_proto_command_from_store_calls: meter + .u64_counter("running_actions_get_proto_command_from_store_calls") + .with_description("Total calls to get_proto_command_from_store") + .with_unit("{call}") + .build(), + get_proto_command_from_store_successes: meter + .u64_counter("running_actions_get_proto_command_from_store_successes") + .with_description("Successful get_proto_command_from_store operations") + .with_unit("{success}") + .build(), + get_proto_command_from_store_failures: meter + .u64_counter("running_actions_get_proto_command_from_store_failures") + .with_description("Failed get_proto_command_from_store operations") + .with_unit("{failure}") + .build(), + get_proto_command_from_store_duration: meter + .f64_histogram("running_actions_get_proto_command_from_store_duration") + .with_description("Duration of get_proto_command_from_store operations") + .with_unit("ms") + .with_boundaries(duration_boundaries.clone()) + .build(), + + download_to_directory_calls: meter + .u64_counter("running_actions_download_to_directory_calls") + .with_description("Total calls to download_to_directory") + .with_unit("{call}") + .build(), + download_to_directory_successes: meter + 
.u64_counter("running_actions_download_to_directory_successes") + .with_description("Successful download_to_directory operations") + .with_unit("{success}") + .build(), + download_to_directory_failures: meter + .u64_counter("running_actions_download_to_directory_failures") + .with_description("Failed download_to_directory operations") + .with_unit("{failure}") + .build(), + download_to_directory_duration: meter + .f64_histogram("running_actions_download_to_directory_duration") + .with_description("Duration of download_to_directory operations") + .with_unit("ms") + .with_boundaries(duration_boundaries.clone()) + .build(), + + prepare_output_files_calls: meter + .u64_counter("running_actions_prepare_output_files_calls") + .with_description("Total calls to prepare_output_files") + .with_unit("{call}") + .build(), + prepare_output_files_successes: meter + .u64_counter("running_actions_prepare_output_files_successes") + .with_description("Successful prepare_output_files operations") + .with_unit("{success}") + .build(), + prepare_output_files_failures: meter + .u64_counter("running_actions_prepare_output_files_failures") + .with_description("Failed prepare_output_files operations") + .with_unit("{failure}") + .build(), + prepare_output_files_duration: meter + .f64_histogram("running_actions_prepare_output_files_duration") + .with_description("Duration of prepare_output_files operations") + .with_unit("ms") + .with_boundaries(duration_boundaries.clone()) + .build(), + + prepare_output_paths_calls: meter + .u64_counter("running_actions_prepare_output_paths_calls") + .with_description("Total calls to prepare_output_paths") + .with_unit("{call}") + .build(), + prepare_output_paths_successes: meter + .u64_counter("running_actions_prepare_output_paths_successes") + .with_description("Successful prepare_output_paths operations") + .with_unit("{success}") + .build(), + prepare_output_paths_failures: meter + .u64_counter("running_actions_prepare_output_paths_failures") + 
.with_description("Failed prepare_output_paths operations") + .with_unit("{failure}") + .build(), + prepare_output_paths_duration: meter + .f64_histogram("running_actions_prepare_output_paths_duration") + .with_description("Duration of prepare_output_paths operations") + .with_unit("ms") + .with_boundaries(duration_boundaries.clone()) + .build(), + + child_process_calls: meter + .u64_counter("running_actions_child_process_calls") + .with_description("Total calls to child_process") + .with_unit("{call}") + .build(), + child_process_successes: meter + .u64_counter("running_actions_child_process_successes") + .with_description("Successful child_process operations") + .with_unit("{success}") + .build(), + child_process_duration: meter + .f64_histogram("running_actions_child_process_duration") + .with_description("Duration of child_process operations") + .with_unit("ms") + .with_boundaries(duration_boundaries.clone()) + .build(), + + child_process_success_error_code: meter + .u64_counter("running_actions_child_process_success_exit_code") + .with_description("Number of child processes with success exit code (0)") + .with_unit("{process}") + .build(), + + child_process_failure_error_code: meter + .u64_counter("running_actions_child_process_failure_exit_code") + .with_description("Number of child processes with non-zero exit code") + .with_unit("{process}") + .build(), + + upload_stdout_calls: meter + .u64_counter("running_actions_upload_stdout_calls") + .with_description("Total calls to upload_stdout") + .with_unit("{call}") + .build(), + upload_stdout_successes: meter + .u64_counter("running_actions_upload_stdout_successes") + .with_description("Successful upload_stdout operations") + .with_unit("{success}") + .build(), + upload_stdout_failures: meter + .u64_counter("running_actions_upload_stdout_failures") + .with_description("Failed upload_stdout operations") + .with_unit("{failure}") + .build(), + upload_stdout_duration: meter + 
.f64_histogram("running_actions_upload_stdout_duration") + .with_description("Duration of upload_stdout operations") + .with_unit("ms") + .with_boundaries(duration_boundaries.clone()) + .build(), + + upload_stderr_calls: meter + .u64_counter("running_actions_upload_stderr_calls") + .with_description("Total calls to upload_stderr") + .with_unit("{call}") + .build(), + upload_stderr_successes: meter + .u64_counter("running_actions_upload_stderr_successes") + .with_description("Successful upload_stderr operations") + .with_unit("{success}") + .build(), + upload_stderr_failures: meter + .u64_counter("running_actions_upload_stderr_failures") + .with_description("Failed upload_stderr operations") + .with_unit("{failure}") + .build(), + upload_stderr_duration: meter + .f64_histogram("running_actions_upload_stderr_duration") + .with_description("Duration of upload_stderr operations") + .with_unit("ms") + .with_boundaries(duration_boundaries.clone()) + .build(), + + task_timeouts: meter + .u64_counter("running_actions_task_timeouts") + .with_description("Total number of task timeouts") + .with_unit("{timeout}") + .build(), + } +}); + +/// OpenTelemetry metrics instruments for running actions monitoring. 
+#[derive(Debug)] +pub struct RunningActionsMetrics { + // create_and_add_action metrics + pub create_and_add_action_calls: metrics::Counter, + pub create_and_add_action_successes: metrics::Counter, + pub create_and_add_action_failures: metrics::Counter, + pub create_and_add_action_duration: metrics::Histogram, + + // cache_action_result metrics + pub cache_action_result_calls: metrics::Counter, + pub cache_action_result_successes: metrics::Counter, + pub cache_action_result_failures: metrics::Counter, + pub cache_action_result_duration: metrics::Histogram, + + // kill_all metrics + pub kill_all_calls: metrics::Counter, + pub kill_all_duration: metrics::Histogram, + + // create_action_info metrics + pub create_action_info_calls: metrics::Counter, + pub create_action_info_successes: metrics::Counter, + pub create_action_info_failures: metrics::Counter, + pub create_action_info_duration: metrics::Histogram, + + // make_action_directory metrics + pub make_action_directory_calls: metrics::Counter, + pub make_action_directory_successes: metrics::Counter, + pub make_action_directory_failures: metrics::Counter, + pub make_action_directory_duration: metrics::Histogram, + + // prepare_action metrics + pub prepare_action_calls: metrics::Counter, + pub prepare_action_successes: metrics::Counter, + pub prepare_action_failures: metrics::Counter, + pub prepare_action_duration: metrics::Histogram, + + // execute metrics + pub execute_calls: metrics::Counter, + pub execute_successes: metrics::Counter, + pub execute_failures: metrics::Counter, + pub execute_duration: metrics::Histogram, + + // upload_results metrics + pub upload_results_calls: metrics::Counter, + pub upload_results_successes: metrics::Counter, + pub upload_results_failures: metrics::Counter, + pub upload_results_duration: metrics::Histogram, + + // cleanup metrics + pub cleanup_calls: metrics::Counter, + pub cleanup_successes: metrics::Counter, + pub cleanup_failures: metrics::Counter, + pub cleanup_duration: 
metrics::Histogram, + + // get_finished_result metrics + pub get_finished_result_calls: metrics::Counter, + pub get_finished_result_successes: metrics::Counter, + pub get_finished_result_failures: metrics::Counter, + pub get_finished_result_duration: metrics::Histogram, + + // Simple counters + pub cleanup_waits: metrics::Counter, + pub stale_removals: metrics::Counter, + pub cleanup_wait_timeouts: metrics::Counter, + + // get_proto_command_from_store metrics + pub get_proto_command_from_store_calls: metrics::Counter, + pub get_proto_command_from_store_successes: metrics::Counter, + pub get_proto_command_from_store_failures: metrics::Counter, + pub get_proto_command_from_store_duration: metrics::Histogram, + + // download_to_directory metrics + pub download_to_directory_calls: metrics::Counter, + pub download_to_directory_successes: metrics::Counter, + pub download_to_directory_failures: metrics::Counter, + pub download_to_directory_duration: metrics::Histogram, + + // prepare_output_files metrics + pub prepare_output_files_calls: metrics::Counter, + pub prepare_output_files_successes: metrics::Counter, + pub prepare_output_files_failures: metrics::Counter, + pub prepare_output_files_duration: metrics::Histogram, + + // prepare_output_paths metrics + pub prepare_output_paths_calls: metrics::Counter, + pub prepare_output_paths_successes: metrics::Counter, + pub prepare_output_paths_failures: metrics::Counter, + pub prepare_output_paths_duration: metrics::Histogram, + + // child_process metrics + pub child_process_calls: metrics::Counter, + pub child_process_successes: metrics::Counter, + pub child_process_duration: metrics::Histogram, + pub child_process_success_error_code: metrics::Counter, + pub child_process_failure_error_code: metrics::Counter, + + // upload_stdout metrics + pub upload_stdout_calls: metrics::Counter, + pub upload_stdout_successes: metrics::Counter, + pub upload_stdout_failures: metrics::Counter, + pub upload_stdout_duration: metrics::Histogram, 
+ + // upload_stderr metrics + pub upload_stderr_calls: metrics::Counter, + pub upload_stderr_successes: metrics::Counter, + pub upload_stderr_failures: metrics::Counter, + pub upload_stderr_duration: metrics::Histogram, + + // Other counters + pub task_timeouts: metrics::Counter, +} + +/// Global fast/slow store metrics instruments. +pub static FAST_SLOW_STORE_METRICS: LazyLock = LazyLock::new(|| { + let meter = global::meter_with_scope(InstrumentationScope::builder("nativelink").build()); + + FastSlowStoreMetrics { + fast_store_hit_count: meter + .u64_counter("fast_slow_store.fast_store.hit_count") + .with_description("Hit count for the fast store") + .with_unit("{hit}") + .build(), + + fast_store_downloaded_bytes: meter + .u64_counter("fast_slow_store.fast_store.downloaded_bytes") + .with_description("Downloaded bytes from the fast store") + .with_unit("By") + .build(), + + slow_store_hit_count: meter + .u64_counter("fast_slow_store.slow_store.hit_count") + .with_description("Hit count for the slow store") + .with_unit("{hit}") + .build(), + + slow_store_downloaded_bytes: meter + .u64_counter("fast_slow_store.slow_store.downloaded_bytes") + .with_description("Downloaded bytes from the slow store") + .with_unit("By") + .build(), + } +}); + +/// OpenTelemetry metrics instruments for fast/slow store monitoring. 
+#[derive(Debug)] +pub struct FastSlowStoreMetrics { + /// Counter of cache hits on the fast store + pub fast_store_hit_count: metrics::Counter, + /// Counter of bytes downloaded from the fast store + pub fast_store_downloaded_bytes: metrics::Counter, + /// Counter of cache hits on the slow store + pub slow_store_hit_count: metrics::Counter, + /// Counter of bytes downloaded from the slow store + pub slow_store_downloaded_bytes: metrics::Counter, +} + +#[derive(Debug, Copy, Clone)] +pub enum StoreType { + Filesystem, + S3, + Gcs, + Grpc, + Mongo, + Redis, + OntapS3, + OntapS3ExistenceCache, + Memory, + Noop, + Compression, + Dedup, + ExistenceCache, + FastSlow, + SizePartitioning, + CompletenessChecking, + Verify, + Ref, + Shard, + Metrics, +} + +impl Display for StoreType { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + StoreType::Filesystem => write!(f, "filesystem"), + StoreType::S3 => write!(f, "s3"), + StoreType::Grpc => write!(f, "grpc"), + StoreType::Mongo => write!(f, "mongo"), + StoreType::Redis => write!(f, "redis"), + StoreType::Gcs => write!(f, "gcs"), + StoreType::OntapS3 => write!(f, "ontap_s3"), + StoreType::OntapS3ExistenceCache => write!(f, "ontap_s3_existence_cache"), + StoreType::Memory => write!(f, "memory"), + StoreType::Noop => write!(f, "noop"), + StoreType::Compression => write!(f, "compression"), + StoreType::Dedup => write!(f, "dedup"), + StoreType::ExistenceCache => write!(f, "existence_cache"), + StoreType::FastSlow => write!(f, "fast_slow"), + StoreType::SizePartitioning => write!(f, "size_partitioning"), + StoreType::CompletenessChecking => write!(f, "completeness_checking"), + StoreType::Verify => write!(f, "verify"), + StoreType::Ref => write!(f, "ref"), + StoreType::Shard => write!(f, "shard"), + StoreType::Metrics => write!(f, "metrics"), + } + } +} + +pub static STORE_METRICS: LazyLock = LazyLock::new(|| { + let meter = global::meter_with_scope(InstrumentationScope::builder("nativelink").build()); + 
+ StoreMetrics { + store_operations: meter + .u64_counter("store_operations") + .with_description("Total cache operations by type and result") + .build(), + + store_operation_duration: meter + .f64_histogram("store_operation_duration") + .with_description("Duration of store operations in milliseconds") + .with_unit("ms") + // The range of these is quite large as a store might be backed by + // memory, a filesystem, or network storage. The current values were + // determined empirically and might need adjustment. + .with_boundaries(vec![ + 0.1, // 100μs + // Sub-millisecond range + 0.5, // 500μs + 1.0, // 1ms + // Low millisecond range + 5.0, // 5ms + 10.0, // 10ms + 50.0, // 50ms + 100.0, // 100ms + // Higher latency range + 500.0, // 500ms + 1000.0, // 1 second + 5000.0, // 5 seconds + 10000.0, // 10 seconds + ]) + .build(), + + eviction_count: meter + .u64_counter("eviction_count") + .with_description("Number of evictions") + .build(), + + store_size: meter + .u64_gauge("store_size") + .with_description("Number of items in the store") + .build(), + } +}); + +#[derive(Debug)] +pub struct StoreMetrics { + /// Histogram of store operation durations in milliseconds + pub store_operation_duration: metrics::Histogram, + /// Counter of store operations by type and result + pub store_operations: metrics::Counter, + /// Counter of evictions + pub eviction_count: metrics::Counter, + /// Counter of items in the store + pub store_size: metrics::Gauge, +} + +#[derive(Debug, Clone)] +pub struct StoreMetricAttrs { + cache_hit: Vec, + cache_miss: Vec, + + read_success: Vec, + read_error: Vec, + write_success: Vec, + write_error: Vec, + eviction: Vec, + store_size: Vec, + +} + +impl StoreMetricAttrs { + /// Creates a new set of pre-computed attributes. + /// + /// The `base_attrs` are included in all attribute combinations (e.g., store + /// type, instance ID). 
+ #[must_use] + pub fn new_with_name(store_type: StoreType, name: &str) -> Self { + let base_attrs = vec![ + KeyValue::new(STORE_TYPE, store_type.to_string()), + KeyValue::new(STORE_NAME, name.to_string()), + ]; + let make_attrs = |op: CacheOperationName, result: CacheOperationResult| { + let mut attrs = base_attrs.clone(); + attrs.push(KeyValue::new(CACHE_OPERATION, op)); + attrs.push(KeyValue::new(CACHE_RESULT, result)); + attrs + }; + + Self { + cache_hit: make_attrs(CacheOperationName::Read, CacheOperationResult::Hit), + cache_miss: make_attrs(CacheOperationName::Read, CacheOperationResult::Miss), + + read_success: make_attrs(CacheOperationName::Read, CacheOperationResult::Success), + read_error: make_attrs(CacheOperationName::Read, CacheOperationResult::Error), + write_success: make_attrs(CacheOperationName::Write, CacheOperationResult::Success), + write_error: make_attrs(CacheOperationName::Write, CacheOperationResult::Error), + eviction: make_attrs(CacheOperationName::Evict, CacheOperationResult::Success), + store_size: base_attrs.clone(), + + } + } + + // Attribute accessors + #[must_use] + pub fn cache_hit(&self) -> &[KeyValue] { + &self.cache_hit + } + #[must_use] + pub fn cache_miss(&self) -> &[KeyValue] { + &self.cache_miss + } + #[must_use] + pub fn read_success(&self) -> &[KeyValue] { + &self.read_success + } + #[must_use] + pub fn read_error(&self) -> &[KeyValue] { + &self.read_error + } + #[must_use] + pub fn write_success(&self) -> &[KeyValue] { + &self.write_success + } + #[must_use] + pub fn write_error(&self) -> &[KeyValue] { + &self.write_error + } + #[must_use] + pub fn eviction(&self) -> &[KeyValue] { + &self.eviction + } + #[must_use] + pub fn store_size(&self) -> &[KeyValue] { + &self.store_size + } +} diff --git a/nativelink-util/src/metrics_utils.rs b/nativelink-util/src/metrics_utils.rs index 7c78eb8da..d59f44a49 100644 --- a/nativelink-util/src/metrics_utils.rs +++ b/nativelink-util/src/metrics_utils.rs @@ -104,9 +104,10 @@ pub struct 
AsyncTimer<'a> { impl AsyncTimer<'_> { #[inline] pub fn measure(self) { - self.counter - .sum_func_duration_ns - .fetch_add(self.start.elapsed().as_nanos() as u64, Ordering::Acquire); + self.counter.sum_func_duration_ns.fetch_add( + u64::try_from(self.start.elapsed().as_nanos()).unwrap_or(u64::MAX), + Ordering::Acquire, + ); self.counter.calls.fetch_add(1, Ordering::Acquire); self.counter.successes.fetch_add(1, Ordering::Acquire); // This causes DropCounter's drop to never be called. @@ -227,8 +228,10 @@ impl AsyncCounterWrapper { // By default `drop_counter` will increment the drop counter when it goes out of scope. // This will ensure we don't increment the counter if we make it here with a zero cost. forget(drop_counter); - self.sum_func_duration_ns - .fetch_add(instant.elapsed().as_nanos() as u64, Ordering::Acquire); + self.sum_func_duration_ns.fetch_add( + u64::try_from(instant.elapsed().as_nanos()).unwrap_or(u64::MAX), + Ordering::Acquire, + ); result } diff --git a/nativelink-util/src/operation_state_manager.rs b/nativelink-util/src/operation_state_manager.rs index d820cb066..3a4b8806e 100644 --- a/nativelink-util/src/operation_state_manager.rs +++ b/nativelink-util/src/operation_state_manager.rs @@ -120,6 +120,9 @@ pub trait ClientStateManager: Sync + Send + Unpin + MetricsComponent + 'static { // into a KnownPlatformPropertyProvider instead. Rust currently does not support // casting traits to other traits. fn as_known_platform_property_provider(&self) -> Option<&dyn KnownPlatformPropertyProvider>; + + /// Returns the implementation as `Any` so that it can be downcast to a concrete type. + fn as_any(&self) -> &dyn std::any::Any; } /// The type of update to perform on an operation. @@ -140,6 +143,9 @@ pub enum UpdateOperationType { /// Notification that the worker disconnected. UpdateWithDisconnect, + + /// Notification that the execution stage has completed and it's just IO happening now. 
+ ExecutionComplete, } #[async_trait] diff --git a/nativelink-util/src/platform_properties.rs b/nativelink-util/src/platform_properties.rs index d47383046..37d19b2e3 100644 --- a/nativelink-util/src/platform_properties.rs +++ b/nativelink-util/src/platform_properties.rs @@ -21,7 +21,6 @@ use nativelink_metric::{ use nativelink_proto::build::bazel::remote::execution::v2::Platform as ProtoPlatform; use nativelink_proto::build::bazel::remote::execution::v2::platform::Property as ProtoProperty; use serde::{Deserialize, Serialize}; -#[cfg(feature = "worker_find_logging")] use tracing::info; /// `PlatformProperties` helps manage the configuration of platform properties to @@ -45,21 +44,31 @@ impl PlatformProperties { /// Determines if the worker's `PlatformProperties` is satisfied by this struct. #[must_use] - pub fn is_satisfied_by(&self, worker_properties: &Self) -> bool { + pub fn is_satisfied_by(&self, worker_properties: &Self, full_worker_logging: bool) -> bool { for (property, check_value) in &self.properties { + if let PlatformPropertyValue::Ignore(_) = check_value { + continue; // always matches + } if let Some(worker_value) = worker_properties.properties.get(property) { if !check_value.is_satisfied_by(worker_value) { - #[cfg(feature = "worker_find_logging")] - { - info!( - "Property mismatch on worker property {property}. {worker_value:?} != {check_value:?}" - ); + if full_worker_logging { + match check_value { + PlatformPropertyValue::Minimum(_) => { + info!( + "Property mismatch on worker property {property}. {worker_value:?} < {check_value:?}" + ); + } + _ => { + info!( + "Property mismatch on worker property {property}. 
{worker_value:?} != {check_value:?}" + ); + } + } } return false; } } else { - #[cfg(feature = "worker_find_logging")] - { + if full_worker_logging { info!("Property missing on worker property {property}"); } return false; @@ -109,11 +118,15 @@ impl From<&PlatformProperties> for ProtoPlatform { /// TODO(palfrey) In the future this will be used by the scheduler and /// worker to cause the scheduler to prefer certain workers over others, /// but not restrict them based on these values. +/// Ignore - Jobs can request this key, but workers do not have to have it. This allows +/// for example the `InputRootAbsolutePath` case for chromium builds, where we can safely +/// ignore it without having to change the worker configs. #[derive(Eq, PartialEq, Hash, Clone, Ord, PartialOrd, Debug, Serialize, Deserialize)] pub enum PlatformPropertyValue { Exact(String), Minimum(u64), Priority(String), + Ignore(String), Unknown(String), } @@ -134,7 +147,7 @@ impl PlatformPropertyValue { // Priority is used to pass info to the worker and not restrict which // workers can be selected, but might be used to prefer certain workers // over others. - Self::Priority(_) => true, + Self::Priority(_) | Self::Ignore(_) => true, // Success exact case is handled above. 
Self::Exact(_) | Self::Unknown(_) => false, } @@ -142,9 +155,10 @@ impl PlatformPropertyValue { pub fn as_str(&self) -> Cow<'_, str> { match self { - Self::Exact(value) | Self::Priority(value) | Self::Unknown(value) => { - Cow::Borrowed(value) - } + Self::Exact(value) + | Self::Priority(value) + | Self::Unknown(value) + | Self::Ignore(value) => Cow::Borrowed(value), Self::Minimum(value) => Cow::Owned(value.to_string()), } } @@ -162,6 +176,7 @@ impl MetricsComponent for PlatformPropertyValue { Self::Exact(v) => publish!(name, v, kind, help, "exact"), Self::Minimum(v) => publish!(name, v, kind, help, "minimum"), Self::Priority(v) => publish!(name, v, kind, help, "priority"), + Self::Ignore(v) => publish!(name, v, kind, help, "ignore"), Self::Unknown(v) => publish!(name, v, kind, help, "unknown"), } diff --git a/nativelink-util/src/proto_stream_utils.rs b/nativelink-util/src/proto_stream_utils.rs index 875dedd05..b658168fc 100644 --- a/nativelink-util/src/proto_stream_utils.rs +++ b/nativelink-util/src/proto_stream_utils.rs @@ -56,7 +56,7 @@ where .next() .await .err_tip(|| "Error receiving first message in stream")? - .err_tip(|| "Expected WriteRequest struct in stream")?; + .err_tip(|| "Expected WriteRequest struct in stream (from)")?; let resource_info = ResourceInfo::new(&first_msg.resource_name, true) .err_tip(|| { @@ -83,6 +83,12 @@ where pub const fn is_first_msg(&self) -> bool { self.first_msg.is_some() } + + /// Returns whether the first message has `finish_write` set to true. + /// This indicates a single-shot upload where all data is in one message. 
+ pub fn is_first_msg_complete(&self) -> bool { + self.first_msg.as_ref().is_some_and(|msg| msg.finish_write) + } } impl Stream for WriteRequestStreamWrapper @@ -114,7 +120,9 @@ where Poll::Pending => return Poll::Pending, Poll::Ready(Some(maybe_message)) => maybe_message .err_tip(|| format!("Stream error at byte {}", self.bytes_received)), - Poll::Ready(None) => Err(make_input_err!("Expected WriteRequest struct in stream")), + Poll::Ready(None) => Err(make_input_err!( + "Expected WriteRequest struct in stream (got None)" + )), } }; diff --git a/nativelink-util/src/retry.rs b/nativelink-util/src/retry.rs index 95b3865e7..e87bc4196 100644 --- a/nativelink-util/src/retry.rs +++ b/nativelink-util/src/retry.rs @@ -20,7 +20,7 @@ use futures::future::Future; use futures::stream::StreamExt; use nativelink_config::stores::{ErrorCode, Retry}; use nativelink_error::{Code, Error, make_err}; -use tracing::error; +use tracing::{error, info}; struct ExponentialBackoff { current: Duration, @@ -130,7 +130,7 @@ impl Retrier { } fn get_retry_config(&self) -> impl Iterator + '_ { - ExponentialBackoff::new(Duration::from_millis(self.config.delay as u64)) + ExponentialBackoff::new(Duration::from_secs_f32(self.config.delay)) .map(|d| (self.jitter_fn)(d)) .take(self.config.max_retries) // Remember this is number of retries, so will run max_retries + 1. 
} @@ -163,7 +163,11 @@ impl Retrier { } Some(RetryResult::Retry(err)) => { if !self.should_retry(err.code) { - error!(?attempt, ?err, "Not retrying permanent error"); + if err.code == Code::NotFound { + info!(?err, "Not found, not retrying"); + } else { + error!(?attempt, ?err, "Not retrying permanent error"); + } return Err(err); } (self.sleep_fn)( diff --git a/nativelink-util/src/store_trait.rs b/nativelink-util/src/store_trait.rs index c65d54485..3fb505229 100644 --- a/nativelink-util/src/store_trait.rs +++ b/nativelink-util/src/store_trait.rs @@ -15,6 +15,7 @@ use core::borrow::{Borrow, BorrowMut}; use core::convert::Into; use core::fmt::{self, Debug, Display}; +use core::future; use core::hash::{Hash, Hasher}; use core::ops::{Bound, RangeBounds}; use core::pin::Pin; @@ -127,6 +128,15 @@ pub enum StoreOptimizations { /// If the store will never serve downloads. NoopDownloads, + + /// If the store will determine whether a key has associated data once a read has been + /// attempted instead of calling `.has()` first. + LazyExistenceOnSync, + + /// The store provides an optimized `update_oneshot` implementation that bypasses + /// channel overhead for direct Bytes writes. Stores with this optimization can + /// accept complete data directly without going through the MPSC channel. 
+ SubscribesToUpdateOneshot, } /// A wrapper struct for [`StoreKey`] to work around @@ -382,7 +392,7 @@ impl Store { #[inline] pub fn register_remove_callback( &self, - callback: &Arc>, + callback: Arc, ) -> Result<(), Error> { self.inner.clone().register_remove_callback(callback) } @@ -446,6 +456,9 @@ pub trait StoreLike: Send + Sync + Sized + Unpin + 'static { &'a self, digests: &'a [StoreKey<'a>], ) -> impl Future>, Error>> + Send + 'a { + if digests.is_empty() { + return future::ready(Ok(vec![])).boxed(); + } self.as_store_driver_pin().has_many(digests) } @@ -457,6 +470,9 @@ pub trait StoreLike: Send + Sync + Sized + Unpin + 'static { digests: &'a [StoreKey<'a>], results: &'a mut [Option], ) -> impl Future> + Send + 'a { + if digests.is_empty() { + return future::ready(Ok(())).boxed(); + } self.as_store_driver_pin() .has_with_results(digests, results) } @@ -836,16 +852,18 @@ pub trait StoreDriver: fn register_remove_callback( self: Arc, - callback: &Arc>, + callback: Arc, ) -> Result<(), Error>; } // Callback to be called when a store deletes an item. This is used so // compound stores can remove items from their internal state when their // underlying stores remove items e.g. caches -#[async_trait] pub trait RemoveItemCallback: Debug + Send + Sync { - async fn callback(&self, store_key: &StoreKey<'_>); + fn callback<'a>( + &'a self, + store_key: StoreKey<'a>, + ) -> Pin + Send + 'a>>; } /// The instructions on how to decode a value from a Bytes & version into @@ -862,8 +880,6 @@ pub trait SchedulerSubscription: Send + Sync { pub trait SchedulerSubscriptionManager: Send + Sync { type Subscription: SchedulerSubscription; - fn notify_for_test(&self, value: String); - fn subscribe(&self, key: K) -> Result where K: SchedulerStoreKeyProvider; @@ -876,7 +892,9 @@ pub trait SchedulerStore: Send + Sync + 'static { type SubscriptionManager: SchedulerSubscriptionManager; /// Returns the subscription manager for the scheduler store. 
- fn subscription_manager(&self) -> Result, Error>; + fn subscription_manager( + &self, + ) -> impl Future, Error>> + Send; /// Updates or inserts an entry into the underlying store. /// Metadata about the key is attached to the compile-time type. @@ -913,6 +931,13 @@ pub trait SchedulerStore: Send + Sync + 'static { ) -> impl Future::DecodeOutput>, Error>> + Send where K: SchedulerStoreKeyProvider + SchedulerStoreDecodeTo + Send; + + fn count_by_index( + &self, + index: Vec, + ) -> impl Future, Error>> + Send + where + K: SchedulerIndexProvider + Send; } /// A type that is used to let the scheduler store know what diff --git a/nativelink-util/src/telemetry.rs b/nativelink-util/src/telemetry.rs index 77da09df8..18606ce1e 100644 --- a/nativelink-util/src/telemetry.rs +++ b/nativelink-util/src/telemetry.rs @@ -12,12 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -use core::default::Default; -use std::env; -use std::sync::OnceLock; - use base64::Engine; use base64::prelude::BASE64_STANDARD_NO_PAD; +use core::default::Default; +use ginepro::LoadBalancedChannel; use hyper::http::Response; use nativelink_error::{Code, ResultExt, make_err}; use nativelink_proto::build::bazel::remote::execution::v2::RequestMetadata; @@ -26,7 +24,9 @@ use opentelemetry::trace::{TraceContextExt, Tracer, TracerProvider}; use opentelemetry::{KeyValue, global}; use opentelemetry_appender_tracing::layer::OpenTelemetryTracingBridge; use opentelemetry_http::HeaderExtractor; -use opentelemetry_otlp::{LogExporter, MetricExporter, Protocol, SpanExporter, WithExportConfig}; +use opentelemetry_otlp::{ + LogExporter, MetricExporter, Protocol, SpanExporter, WithExportConfig, WithTonicConfig, +}; use opentelemetry_sdk::Resource; use opentelemetry_sdk::logs::SdkLoggerProvider; use opentelemetry_sdk::metrics::SdkMeterProvider; @@ -34,6 +34,8 @@ use opentelemetry_sdk::propagation::{BaggagePropagator, TraceContextPropagator}; use 
opentelemetry_sdk::trace::SdkTracerProvider; use opentelemetry_semantic_conventions::attribute::ENDUSER_ID; use prost::Message; +use std::env; +use std::sync::OnceLock; use tracing::debug; use tracing::metadata::LevelFilter; use tracing_opentelemetry::{MetricsLayer, layer}; @@ -71,7 +73,7 @@ fn otlp_filter() -> EnvFilter { // Create a tracing layer intended for stdout printing. // -// The output of this layer is configurable via the `NL_LOG_FMT` environment +// The output of this layer is configurable via the `NL_LOG` environment // variable. fn tracing_stdout_layer() -> impl Layer { let nl_log_fmt = env::var("NL_LOG").unwrap_or_else(|_| "pretty".to_string()); @@ -103,7 +105,7 @@ fn tracing_stdout_layer() -> impl Layer { /// /// Returns `Err` if logging was already initialized or if the exporters can't /// be initialized. -pub fn init_tracing() -> Result<(), nativelink_error::Error> { +pub async fn init_tracing() -> Result<(), nativelink_error::Error> { static INITIALIZED: OnceLock<()> = OnceLock::new(); if INITIALIZED.get().is_some() { @@ -128,13 +130,18 @@ pub fn init_tracing() -> Result<(), nativelink_error::Error> { ]); global::set_text_map_propagator(propagator); + let maybe_channel = maybe_load_balanced_channel().await; + // Logs + let mut log_exporter_builder = LogExporter::builder().with_tonic(); + if let Some(channel) = maybe_channel.clone() { + log_exporter_builder = log_exporter_builder.with_channel(channel.into()); + } let otlp_log_layer = OpenTelemetryTracingBridge::new( &SdkLoggerProvider::builder() .with_resource(resource.clone()) .with_batch_exporter( - LogExporter::builder() - .with_tonic() + log_exporter_builder .with_protocol(Protocol::Grpc) .build() .map_err(|e| make_err!(Code::Internal, "{e}")) @@ -145,13 +152,16 @@ pub fn init_tracing() -> Result<(), nativelink_error::Error> { .with_filter(otlp_filter()); // Traces + let mut span_exporter_builder = SpanExporter::builder().with_tonic(); + if let Some(channel) = maybe_channel.clone() { + 
span_exporter_builder = span_exporter_builder.with_channel(channel.into()); + } let otlp_trace_layer = layer() .with_tracer( SdkTracerProvider::builder() .with_resource(resource.clone()) .with_batch_exporter( - SpanExporter::builder() - .with_tonic() + span_exporter_builder .with_protocol(Protocol::Grpc) .build() .map_err(|e| make_err!(Code::Internal, "{e}")) @@ -163,11 +173,14 @@ pub fn init_tracing() -> Result<(), nativelink_error::Error> { .with_filter(otlp_filter()); // Metrics + let mut metric_exporter_builder = MetricExporter::builder().with_tonic(); + if let Some(channel) = maybe_channel { + metric_exporter_builder = metric_exporter_builder.with_channel(channel.into()); + } let meter_provider = SdkMeterProvider::builder() .with_resource(resource) .with_periodic_exporter( - MetricExporter::builder() - .with_tonic() + metric_exporter_builder .with_protocol(Protocol::Grpc) .build() .map_err(|e| make_err!(Code::Internal, "{e}")) @@ -191,6 +204,36 @@ pub fn init_tracing() -> Result<(), nativelink_error::Error> { Ok(()) } +const NL_OTEL_ENDPOINT: &str = "NL_OTEL_ENDPOINT"; + +async fn maybe_load_balanced_channel() -> Option { + match env::var(NL_OTEL_ENDPOINT) { + Ok(endpoint) => { + let url = Url::parse(endpoint.as_str()).map_err(|e| { + make_err!(Code::Internal, "Unable to parse endpoint {endpoint}: {e:?}") + }).unwrap(); + + let host = url + .host() + .err_tip(|| format!("Unable to get host from endpoint {endpoint}")) + .unwrap(); + let port = url + .port() + .err_tip(|| format!("Unable to get port from endpoint {endpoint}")) + .unwrap(); + + Some( + LoadBalancedChannel::builder((host.to_string(), port)) + .channel() + .await + .map_err(|e| make_err!(Code::Internal, "Invalid hostname '{endpoint}': {e}")) + .unwrap(), + ) + } + Err(_) => None, + } +} + /// Custom metadata key field for Bazel metadata. 
const BAZEL_METADATA_KEY: &str = "bazel.metadata"; @@ -201,6 +244,7 @@ const BAZEL_REQUESTMETADATA_HEADER: &str = "build.bazel.remote.execution.v2.requ use opentelemetry::baggage::BaggageExt; use opentelemetry::context::FutureExt; +use url::Url; #[derive(Debug, Clone)] pub struct OtlpMiddleware { @@ -254,23 +298,25 @@ where .map(|value| value.as_str().to_string()) .unwrap_or_default(); - if identity.is_empty() && self.identity_required { - return Box::pin(async move { - Ok(tonic::Status::failed_precondition( - r" + if identity.is_empty() { + if self.identity_required { + return Box::pin(async move { + Ok(tonic::Status::failed_precondition( + r" NativeLink instance configured to require this OpenTelemetry Baggage header: `Baggage: enduser.id=YOUR_IDENTITY` ", - ) - .into_http()) - }); + ) + .into_http()) + }); + } + } else { + debug!("Baggage enduser.id: {identity}"); } - debug!("Baggage enduser.id: {identity}"); - let tracer = global::tracer("origin_middleware"); let span = tracer .span_builder("origin_request") diff --git a/nativelink-util/src/tls_utils.rs b/nativelink-util/src/tls_utils.rs index 1916cba7d..15f685861 100644 --- a/nativelink-util/src/tls_utils.rs +++ b/nativelink-util/src/tls_utils.rs @@ -12,10 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+use core::time::Duration; + use nativelink_config::stores::{ClientTlsConfig, GrpcEndpoint}; use nativelink_error::{Code, Error, make_err, make_input_err}; use tonic::transport::Uri; -use tracing::warn; +use tracing::{info, warn}; pub fn load_client_config( config: &Option, @@ -126,9 +128,48 @@ pub fn endpoint(endpoint_config: &GrpcEndpoint) -> Result 0 { + Duration::from_secs(endpoint_config.connect_timeout_s) + } else { + Duration::from_secs(30) + }; + let tcp_keepalive = if endpoint_config.tcp_keepalive_s > 0 { + Duration::from_secs(endpoint_config.tcp_keepalive_s) + } else { + Duration::from_secs(30) + }; + let http2_keepalive_interval = if endpoint_config.http2_keepalive_interval_s > 0 { + Duration::from_secs(endpoint_config.http2_keepalive_interval_s) + } else { + Duration::from_secs(30) + }; + let http2_keepalive_timeout = if endpoint_config.http2_keepalive_timeout_s > 0 { + Duration::from_secs(endpoint_config.http2_keepalive_timeout_s) } else { - Ok(endpoint) + Duration::from_secs(20) + }; + + info!( + address = %endpoint_config.address, + concurrency_limit = ?endpoint_config.concurrency_limit, + connect_timeout_s = connect_timeout.as_secs(), + tcp_keepalive_s = tcp_keepalive.as_secs(), + http2_keepalive_interval_s = http2_keepalive_interval.as_secs(), + http2_keepalive_timeout_s = http2_keepalive_timeout.as_secs(), + "tls_utils::endpoint: creating gRPC endpoint with keepalive", + ); + + let mut endpoint = endpoint + .connect_timeout(connect_timeout) + .tcp_keepalive(Some(tcp_keepalive)) + .http2_keep_alive_interval(http2_keepalive_interval) + .keep_alive_timeout(http2_keepalive_timeout) + .keep_alive_while_idle(true); + + if let Some(concurrency_limit) = endpoint_config.concurrency_limit { + endpoint = endpoint.concurrency_limit(concurrency_limit); } + + Ok(endpoint) } diff --git a/nativelink-util/tests/evicting_map_test.rs b/nativelink-util/tests/evicting_map_test.rs index 2bf971ebf..e3f552f64 100644 --- a/nativelink-util/tests/evicting_map_test.rs +++ 
b/nativelink-util/tests/evicting_map_test.rs @@ -589,12 +589,10 @@ async fn range_multiple_items_test() -> Result<(), Error> { range: impl core::ops::RangeBounds + Send, ) -> Vec<(String, Bytes)> { let mut found_values = Vec::new(); - evicting_map - .range(range, |k, v: &BytesWrapper| { - found_values.push((k.clone(), v.0.clone())); - true - }) - .await; + evicting_map.range(range, |k, v: &BytesWrapper| { + found_values.push((k.clone(), v.0.clone())); + true + }); found_values } diff --git a/nativelink-util/tests/fs_test.rs b/nativelink-util/tests/fs_test.rs new file mode 100644 index 000000000..b0b21e733 --- /dev/null +++ b/nativelink-util/tests/fs_test.rs @@ -0,0 +1,39 @@ +#![cfg(not(target_family = "windows"))] +// Because windows does permissions differently + +use std::env; +use std::fs::{self, Permissions}; +use std::os::unix::fs::PermissionsExt; + +use nativelink_error::ResultExt; +use nativelink_macro::nativelink_test; +use nativelink_util::fs::remove_dir_all; + +#[nativelink_test] +async fn remove_files_with_bad_permissions() -> Result<(), Box> { + let temp_dir = env::temp_dir(); + let bad_perms_directory = temp_dir.join("bad_perms_directory"); + if fs::exists(&bad_perms_directory)? { + remove_dir_all(&bad_perms_directory) + .await + .err_tip(|| format!("first remove_dir_all for {bad_perms_directory:?}"))?; + } + fs::create_dir(&bad_perms_directory)?; + let bad_perms_file = bad_perms_directory.join("bad_perms_file"); + if !fs::exists(&bad_perms_file)? 
{ + fs::write(&bad_perms_file, "").err_tip(|| "Can't create file")?; + } + + fs::set_permissions(&bad_perms_directory, Permissions::from_mode(0o100)) // execute owner only + .err_tip(|| "Can't set perms on directory")?; + + fs::set_permissions(&bad_perms_file, Permissions::from_mode(0o400)) // read owner only + .err_tip(|| "Can't set perms on file")?; + + remove_dir_all(&bad_perms_directory) + .await + .err_tip(|| format!("second remove_dir_all for {bad_perms_directory:?}"))?; + + assert!(!fs::exists(&bad_perms_directory)?); + Ok(()) +} diff --git a/nativelink-util/tests/metrics_test.rs b/nativelink-util/tests/metrics_test.rs new file mode 100644 index 000000000..05c08e2df --- /dev/null +++ b/nativelink-util/tests/metrics_test.rs @@ -0,0 +1,192 @@ +// Copyright 2025 The NativeLink Authors. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use nativelink_util::action_messages::{ActionResult, ActionStage}; +use nativelink_util::metrics::{ + CACHE_METRICS, CacheMetricAttrs, EXECUTION_METRICS, ExecutionMetricAttrs, ExecutionStage, + LOCAL_WORKER_METRICS, make_execution_attributes, +}; +use opentelemetry::KeyValue; + +#[test] +fn test_cache_metric_attrs() { + let base_attrs = vec![ + KeyValue::new("cache.type", "test_cache"), + KeyValue::new("instance", "test_instance"), + ]; + + let attrs = CacheMetricAttrs::new(&base_attrs); + + // Verify that the pre-computed attributes contain the expected values + let read_hit_attrs = attrs.read_hit(); + assert_eq!(read_hit_attrs.len(), 4); + assert!( + read_hit_attrs + .iter() + .any(|kv| kv.key.as_str() == "cache.type" && kv.value.to_string() == "test_cache") + ); + assert!( + read_hit_attrs + .iter() + .any(|kv| kv.key.as_str() == "cache.operation.name" && kv.value.to_string() == "read") + ); + assert!( + read_hit_attrs + .iter() + .any(|kv| kv.key.as_str() == "cache.operation.result" && kv.value.to_string() == "hit") + ); +} + +#[test] +fn test_execution_metric_attrs() { + let base_attrs = vec![ + KeyValue::new("execution_instance", "test_instance"), + KeyValue::new("execution_worker_id", "worker_123"), + ]; + + let attrs = ExecutionMetricAttrs::new(&base_attrs); + + // Verify that the pre-computed attributes contain the expected values + let queued_attrs = attrs.queued(); + assert_eq!(queued_attrs.len(), 3); + assert!(queued_attrs.iter().any( + |kv| kv.key.as_str() == "execution_instance" && kv.value.to_string() == "test_instance" + )); + assert!( + queued_attrs + .iter() + .any(|kv| kv.key.as_str() == "execution_stage" && kv.value.to_string() == "queued") + ); + + let completed_success_attrs = attrs.completed_success(); + assert_eq!(completed_success_attrs.len(), 4); + assert!( + completed_success_attrs + .iter() + .any(|kv| kv.key.as_str() == "execution_stage" && kv.value.to_string() == "completed") + ); + assert!( + completed_success_attrs + .iter() + 
.any(|kv| kv.key.as_str() == "execution_result" && kv.value.to_string() == "success") + ); +} + +#[test] +fn test_make_execution_attributes() { + let attrs = make_execution_attributes("test_instance", Some(100)); + + assert_eq!(attrs.len(), 2); + assert!(attrs.iter().any( + |kv| kv.key.as_str() == "execution_instance" && kv.value.to_string() == "test_instance" + )); + assert!( + attrs + .iter() + .any(|kv| kv.key.as_str() == "execution_priority" + && kv.value == opentelemetry::Value::I64(100)) + ); +} + +#[test] +fn test_metrics_lazy_initialization() { + // Verify that the lazy static initialization works + let _cache_metrics = &*CACHE_METRICS; + let _execution_metrics = &*EXECUTION_METRICS; + let _worker_metrics = &*LOCAL_WORKER_METRICS; + + // If we got here without panicking, the metrics were initialized successfully +} + +#[test] +fn test_action_stage_to_execution_stage_conversion() { + // Test conversion from owned ActionStage values + assert_eq!( + ExecutionStage::from(ActionStage::Unknown), + ExecutionStage::Unknown + ); + assert_eq!( + ExecutionStage::from(ActionStage::CacheCheck), + ExecutionStage::CacheCheck + ); + assert_eq!( + ExecutionStage::from(ActionStage::Queued), + ExecutionStage::Queued + ); + assert_eq!( + ExecutionStage::from(ActionStage::Executing), + ExecutionStage::Executing + ); + + // Test that Completed variants map to ExecutionStage::Completed + let action_result = ActionResult::default(); + assert_eq!( + ExecutionStage::from(ActionStage::Completed(action_result)), + ExecutionStage::Completed + ); + + // Note: We can't easily test CompletedFromCache without creating a ProtoActionResult, + // but the implementation handles it the same as Completed +} + +#[test] +fn test_action_stage_ref_to_execution_stage_conversion() { + // Test conversion from ActionStage references + let unknown = ActionStage::Unknown; + let cache_check = ActionStage::CacheCheck; + let queued = ActionStage::Queued; + let executing = ActionStage::Executing; + let 
completed = ActionStage::Completed(ActionResult::default()); + + assert_eq!(ExecutionStage::from(&unknown), ExecutionStage::Unknown); + assert_eq!( + ExecutionStage::from(&cache_check), + ExecutionStage::CacheCheck + ); + assert_eq!(ExecutionStage::from(&queued), ExecutionStage::Queued); + assert_eq!(ExecutionStage::from(&executing), ExecutionStage::Executing); + assert_eq!(ExecutionStage::from(&completed), ExecutionStage::Completed); +} + +#[test] +fn test_action_stage_conversion_avoids_clone() { + use nativelink_util::action_messages::{FileInfo, NameOrPath}; + use nativelink_util::common::DigestInfo; + + // This test verifies that using a reference doesn't clone the large ActionResult + let large_file_info = FileInfo { + name_or_path: NameOrPath::Path("test.txt".to_string()), + digest: DigestInfo::new([0u8; 32], 100), + is_executable: false, + }; + let large_action_result = ActionResult { + output_files: vec![large_file_info; 1000], // Large vector to make clone expensive + ..Default::default() + }; + let completed = ActionStage::Completed(large_action_result); + + // Using a reference should be fast even with large data + let start = std::time::Instant::now(); + for _ in 0..10000 { + let _stage = ExecutionStage::from(&completed); + } + let elapsed = start.elapsed(); + + // This should complete very quickly since we're not cloning + // In practice, 10000 conversions should take less than 1ms + assert!( + elapsed.as_millis() < 100, + "Reference conversion took too long: {elapsed:?}" + ); +} diff --git a/nativelink-util/tests/platform_properties_tests.rs b/nativelink-util/tests/platform_properties_tests.rs new file mode 100644 index 000000000..134e9c58a --- /dev/null +++ b/nativelink-util/tests/platform_properties_tests.rs @@ -0,0 +1,41 @@ +use std::collections::HashMap; + +use nativelink_macro::nativelink_test; +use nativelink_util::platform_properties::{PlatformProperties, PlatformPropertyValue}; + +#[test] +fn ignore_property_value_match_all() { + let 
ignore_property = PlatformPropertyValue::Ignore("foo".to_string()); + let other_property = PlatformPropertyValue::Exact("bar".to_string()); + assert!(ignore_property.is_satisfied_by(&ignore_property)); + assert!(ignore_property.is_satisfied_by(&other_property)); +} + +#[test] +fn ignore_property_match_all() { + let ignore_property = PlatformPropertyValue::Ignore("foo".to_string()); + let mut ignore_property_map = HashMap::new(); + ignore_property_map.insert("foo".into(), ignore_property); + let ignore_properties = PlatformProperties::new(ignore_property_map); + + assert!(ignore_properties.is_satisfied_by(&PlatformProperties::new(HashMap::new()), true)); +} + +#[nativelink_test] +fn minimum_property_logs_error() { + let minimum_property = PlatformPropertyValue::Minimum(1); + let mut minimum_property_map = HashMap::new(); + minimum_property_map.insert("foo".into(), minimum_property); + let minimum_properties = PlatformProperties::new(minimum_property_map); + + let worker_minimum_property = PlatformPropertyValue::Minimum(0); + let mut worker_minimum_property_map = HashMap::new(); + worker_minimum_property_map.insert("foo".into(), worker_minimum_property); + let worker_minimum_properties = PlatformProperties::new(worker_minimum_property_map); + + assert!(!minimum_properties.is_satisfied_by(&worker_minimum_properties, true)); + + assert!(logs_contain( + "Property mismatch on worker property foo. 
Minimum(0) < Minimum(1)" + )); +} diff --git a/nativelink-util/tests/retry_test.rs b/nativelink-util/tests/retry_test.rs index 48f648734..cfb8b07b2 100644 --- a/nativelink-util/tests/retry_test.rs +++ b/nativelink-util/tests/retry_test.rs @@ -84,6 +84,39 @@ async fn retry_fails_after_3_runs() -> Result<(), Error> { Ok(()) } +#[nativelink_test] +async fn dont_retry_for_not_found() -> Result<(), Error> { + let retrier = Retrier::new( + Arc::new(|_duration| Box::pin(ready(()))), + Arc::new(move |_delay| Duration::from_millis(1)), + Retry { + max_retries: 2, + ..Default::default() + }, + ); + let run_count = Arc::new(AtomicI32::new(0)); + let result = Pin::new(&retrier) + .retry(repeat_with(|| { + run_count.fetch_add(1, Ordering::Relaxed); + RetryResult::::Retry(make_err!(Code::NotFound, "Dummy failure",)) + })) + .await; + assert_eq!( + run_count.load(Ordering::Relaxed), + 1, + "Expected function to be called once" + ); + assert_eq!(result.is_err(), true, "Expected result to error"); + assert_eq!( + result.unwrap_err().to_string(), + "Error { code: NotFound, messages: [\"Dummy failure\"] }" + ); + assert!(logs_contain("Not found, not retrying")); + assert!(!logs_contain("ERROR")); + + Ok(()) +} + #[nativelink_test] async fn retry_success_after_2_runs() -> Result<(), Error> { let retrier = Retrier::new( diff --git a/nativelink-util/tests/store_trait_test.rs b/nativelink-util/tests/store_trait_test.rs new file mode 100644 index 000000000..efd4e4d68 --- /dev/null +++ b/nativelink-util/tests/store_trait_test.rs @@ -0,0 +1,86 @@ +use core::pin::Pin; +use std::sync::Arc; + +use nativelink_error::Error; +use nativelink_macro::nativelink_test; +use nativelink_metric::MetricsComponent; +use nativelink_util::buf_channel::{DropCloserReadHalf, DropCloserWriteHalf}; +use nativelink_util::default_health_status_indicator; +use nativelink_util::health_utils::HealthStatusIndicator; +use nativelink_util::store_trait::{ + RemoveItemCallback, Store, StoreDriver, StoreKey, StoreLike, 
UploadSizeInfo, +}; +use tonic::async_trait; + +#[derive(Debug, MetricsComponent)] +struct FakeStore {} + +#[async_trait] +#[allow(clippy::todo)] +impl StoreDriver for FakeStore { + async fn has_with_results( + self: Pin<&Self>, + _keys: &[StoreKey<'_>], + _results: &mut [Option], + ) -> Result<(), Error> { + todo!(); + } + + async fn update( + self: Pin<&Self>, + _key: StoreKey<'_>, + _reader: DropCloserReadHalf, + _size_info: UploadSizeInfo, + ) -> Result<(), Error> { + todo!(); + } + + async fn get_part( + self: Pin<&Self>, + _key: StoreKey<'_>, + _writer: &mut DropCloserWriteHalf, + _offset: u64, + _length: Option, + ) -> Result<(), Error> { + todo!(); + } + + fn inner_store(&self, _digest: Option) -> &dyn StoreDriver { + self + } + + fn as_any(&self) -> &(dyn core::any::Any + Sync + Send + 'static) { + self + } + + fn as_any_arc(self: Arc) -> Arc { + self + } + + fn register_remove_callback( + self: Arc, + _callback: Arc, + ) -> Result<(), Error> { + todo!(); + } +} + +default_health_status_indicator!(FakeStore); + +#[nativelink_test] +async fn fast_has_with_results() -> Result<(), Error> { + let store = Store::new(Arc::new(FakeStore {})); + let mut results: [Option; 0] = []; + store.has_with_results(&[], &mut results).await?; + + Ok(()) +} + +#[nativelink_test] +async fn fast_has_many() -> Result<(), Error> { + let store = Store::new(Arc::new(FakeStore {})); + let res = store.has_many(&[]).await?; + assert!(res.is_empty()); + + Ok(()) +} diff --git a/nativelink-util/tests/telemetry_test.rs b/nativelink-util/tests/telemetry_test.rs new file mode 100644 index 000000000..af97bbf2f --- /dev/null +++ b/nativelink-util/tests/telemetry_test.rs @@ -0,0 +1,77 @@ +use axum::Router; +use hyper::{Request, StatusCode, Uri}; +use nativelink_macro::nativelink_test; +use opentelemetry::baggage::BaggageExt; +use opentelemetry::{Context, KeyValue}; +use tonic::body::Body; +use tonic::service::Routes; +use tower::{Service, ServiceExt}; +use tracing::warn; + +fn demo_service() 
-> Router { + let tonic_services = Routes::builder().routes(); + tonic_services + .into_axum_router() + .fallback(|uri: Uri| async move { + warn!("No route for {uri}"); + (StatusCode::NOT_FOUND, format!("No route for {uri}")) + }) + .layer(nativelink_util::telemetry::OtlpLayer::new(false)) +} + +async fn run_request( + svc: &mut Router, + request: Request, +) -> Result<(), Box> { + let response: hyper::Response = + svc.as_service().ready().await?.call(request).await?; + assert_eq!(response.status(), 404); + + let response = String::from_utf8( + axum::body::to_bytes(response.into_body(), usize::MAX) + .await? + .to_vec(), + )?; + assert_eq!(response, String::from("No route for /demo")); + Ok(()) +} + +#[nativelink_test] +async fn oltp_logs_no_baggage() -> Result<(), Box> { + let mut svc = demo_service(); + + let request: Request = Request::builder() + .method("GET") + .uri("/demo") + .body(Body::empty())?; + run_request(&mut svc, request).await?; + + assert!(!logs_contain("Baggage enduser.id:")); + + Ok(()) +} + +#[nativelink_test] +async fn oltp_logs_with_baggage() -> Result<(), Box> { + let mut svc = demo_service(); + + let mut request: Request = Request::builder() + .method("GET") + .uri("/demo") + .body(Body::empty())?; + + let cx_guard = + Context::map_current(|cx| cx.with_baggage([KeyValue::new("enduser.id", "foobar")])) + .attach(); + + request + .headers_mut() + .insert("baggage", "enduser.id=foobar".parse().unwrap()); + + run_request(&mut svc, request).await?; + + assert!(logs_contain("Baggage enduser.id: foobar")); + drop(cx_guard); + + Ok(()) +} diff --git a/nativelink-worker/BUILD.bazel b/nativelink-worker/BUILD.bazel index 04217169c..14311f87f 100644 --- a/nativelink-worker/BUILD.bazel +++ b/nativelink-worker/BUILD.bazel @@ -10,6 +10,7 @@ load( rust_library( name = "nativelink-worker", srcs = [ + "src/directory_cache.rs", "src/lib.rs", "src/local_worker.rs", "src/running_actions_manager.rs", @@ -53,6 +54,7 @@ rust_test_suite( srcs = [ 
"tests/local_worker_test.rs", "tests/running_actions_manager_test.rs", + "tests/worker_utils_test.rs", ], compile_data = [ "tests/utils/local_worker_test_utils.rs", @@ -99,6 +101,8 @@ rust_test( "@crates//:prost-types", "@crates//:rand", "@crates//:serial_test", + "@crates//:tempfile", + "@crates//:tracing-test", ], ) diff --git a/nativelink-worker/Cargo.toml b/nativelink-worker/Cargo.toml index 3aa8ce356..34a546311 100644 --- a/nativelink-worker/Cargo.toml +++ b/nativelink-worker/Cargo.toml @@ -1,9 +1,10 @@ +#:schema ../tools/cargo-with-detailed-deps.json lints.workspace = true [package] edition = "2024" name = "nativelink-worker" -version = "0.7.3" +version = "1.0.0-rc4" [features] nix = [] @@ -18,16 +19,19 @@ nativelink-util = { path = "../nativelink-util" } async-lock = { version = "3.4.0", features = ["std"], default-features = false } bytes = { version = "1.10.1", default-features = false } -filetime = "0.2.25" -formatx = "0.2.3" +filetime = { version = "0.2.25", default-features = false } +formatx = { version = "0.2.3", default-features = false } futures = { version = "0.3.31", default-features = false } -opentelemetry = { version = "0.29.1", default-features = false } -parking_lot = "0.12.3" +opentelemetry = { version = "0.30.0", default-features = false } +parking_lot = { version = "0.12.3", default-features = false } prost = { version = "0.13.5", default-features = false } -relative-path = "1.9.3" +relative-path = { version = "2.0.0", default-features = false, features = [ + "alloc", + "std", +] } scopeguard = { version = "1.2.0", default-features = false } serde = { version = "1.0.219", default-features = false } -serde_json5 = "0.2.1" +serde_json5 = { version = "0.2.1", default-features = false } shlex = { version = "1.3.0", default-features = false } tokio = { version = "1.44.1", features = [ "fs", @@ -53,8 +57,10 @@ uuid = { version = "1.16.0", default-features = false, features = [ [dev-dependencies] nativelink-macro = { path = "../nativelink-macro" 
} -hyper = "1.6.0" -pretty_assertions = { version = "1.4.1", features = ["std"] } +hyper = { version = "1.6.0", default-features = false } +pretty_assertions = { version = "1.4.1", features = [ + "std", +], default-features = false } prost-types = { version = "0.13.5", default-features = false } rand = { version = "0.9.0", default-features = false, features = [ "thread_rng", @@ -62,6 +68,7 @@ rand = { version = "0.9.0", default-features = false, features = [ serial_test = { version = "3.2.0", features = [ "async", ], default-features = false } +tempfile = { version = "3.15.0", default-features = false } tracing-test = { version = "0.2.5", default-features = false, features = [ "no-env-filter", ] } diff --git a/nativelink-worker/src/directory_cache.rs b/nativelink-worker/src/directory_cache.rs new file mode 100644 index 000000000..f4a1f0f90 --- /dev/null +++ b/nativelink-worker/src/directory_cache.rs @@ -0,0 +1,530 @@ +// Copyright 2024 The NativeLink Authors. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use core::future::Future; +use core::pin::Pin; +use std::collections::HashMap; +use std::path::{Path, PathBuf}; +use std::sync::Arc; +use std::time::SystemTime; + +use nativelink_error::{Code, Error, ResultExt, make_err}; +use nativelink_proto::build::bazel::remote::execution::v2::{ + Directory as ProtoDirectory, DirectoryNode, FileNode, SymlinkNode, +}; +use nativelink_store::ac_utils::get_and_decode_digest; +use nativelink_util::common::DigestInfo; +use nativelink_util::fs_util::{hardlink_directory_tree, set_readonly_recursive}; +use nativelink_util::store_trait::{Store, StoreKey, StoreLike}; +use tokio::fs; +use tokio::sync::{Mutex, RwLock}; +use tracing::{debug, trace, warn}; + +/// Configuration for the directory cache +#[derive(Debug, Clone)] +pub struct DirectoryCacheConfig { + /// Maximum number of cached directories + pub max_entries: usize, + /// Maximum total size in bytes (0 = unlimited) + pub max_size_bytes: u64, + /// Base directory for cache storage + pub cache_root: PathBuf, +} + +impl Default for DirectoryCacheConfig { + fn default() -> Self { + Self { + max_entries: 1000, + max_size_bytes: 10 * 1024 * 1024 * 1024, // 10 GB + cache_root: std::env::temp_dir().join("nativelink_directory_cache"), + } + } +} + +/// Metadata for a cached directory +#[derive(Debug, Clone)] +struct CachedDirectoryMetadata { + /// Path to the cached directory + path: PathBuf, + /// Size in bytes + size: u64, + /// Last access time for LRU eviction + last_access: SystemTime, + /// Reference count (number of active users) + ref_count: usize, +} + +/// High-performance directory cache that uses hardlinks to avoid repeated +/// directory reconstruction from the CAS. +/// +/// When actions need input directories, instead of fetching and reconstructing +/// files from the CAS each time, we: +/// 1. Check if we've already constructed this exact directory (by digest) +/// 2. If yes, hardlink the entire tree to the action's workspace +/// 3. 
If no, construct it once and cache for future use +/// +/// This dramatically reduces I/O and improves action startup time. +#[derive(Debug)] +pub struct DirectoryCache { + /// Configuration + config: DirectoryCacheConfig, + /// Cache mapping digest -> metadata + cache: Arc>>, + /// Lock for cache construction to prevent stampedes + construction_locks: Arc>>>>, + /// CAS store for fetching directories + cas_store: Store, +} + +impl DirectoryCache { + /// Creates a new `DirectoryCache` + pub async fn new(config: DirectoryCacheConfig, cas_store: Store) -> Result { + // Ensure cache root exists + fs::create_dir_all(&config.cache_root).await.err_tip(|| { + format!( + "Failed to create cache root: {}", + config.cache_root.display() + ) + })?; + + Ok(Self { + config, + cache: Arc::new(RwLock::new(HashMap::new())), + construction_locks: Arc::new(Mutex::new(HashMap::new())), + cas_store, + }) + } + + /// Gets or creates a directory in the cache, then hardlinks it to the destination + /// + /// # Arguments + /// * `digest` - Digest of the root Directory proto + /// * `dest_path` - Where to hardlink/create the directory + /// + /// # Returns + /// * `Ok(true)` - Cache hit (directory was hardlinked) + /// * `Ok(false)` - Cache miss (directory was constructed) + /// * `Err` - Error during construction or hardlinking + pub async fn get_or_create(&self, digest: DigestInfo, dest_path: &Path) -> Result { + // Fast path: check if already in cache + { + let mut cache = self.cache.write().await; + if let Some(metadata) = cache.get_mut(&digest) { + // Update access time and ref count + metadata.last_access = SystemTime::now(); + metadata.ref_count += 1; + + debug!( + ?digest, + path = ?metadata.path, + "Directory cache HIT" + ); + + // Try to hardlink from cache + match hardlink_directory_tree(&metadata.path, dest_path).await { + Ok(()) => { + metadata.ref_count -= 1; + return Ok(true); + } + Err(e) => { + warn!( + ?digest, + error = ?e, + "Failed to hardlink from cache, will 
reconstruct" + ); + metadata.ref_count -= 1; + // Fall through to reconstruction + } + } + } + } + + debug!(?digest, "Directory cache MISS"); + + // Get or create construction lock to prevent stampede + let construction_lock = { + let mut locks = self.construction_locks.lock().await; + locks + .entry(digest) + .or_insert_with(|| Arc::new(Mutex::new(()))) + .clone() + }; + + // Only one task constructs at a time for this digest + let _guard = construction_lock.lock().await; + + // Check again in case another task just constructed it + { + let cache = self.cache.read().await; + if let Some(metadata) = cache.get(&digest) { + return match hardlink_directory_tree(&metadata.path, dest_path).await { + Ok(()) => Ok(true), + Err(e) => { + warn!( + ?digest, + error = ?e, + "Failed to hardlink after construction" + ); + // Construct directly at dest_path + self.construct_directory(digest, dest_path).await?; + Ok(false) + } + }; + } + } + + // Construct the directory in cache + let cache_path = self.get_cache_path(&digest); + self.construct_directory(digest, &cache_path).await?; + + // Make it read-only to prevent modifications + set_readonly_recursive(&cache_path) + .await + .err_tip(|| "Failed to set cache directory to readonly")?; + + // Calculate size + let size = nativelink_util::fs_util::calculate_directory_size(&cache_path) + .await + .err_tip(|| "Failed to calculate directory size")?; + + // Add to cache + { + let mut cache = self.cache.write().await; + + // Evict if necessary + self.evict_if_needed(size, &mut cache).await?; + + cache.insert( + digest, + CachedDirectoryMetadata { + path: cache_path.clone(), + size, + last_access: SystemTime::now(), + ref_count: 0, + }, + ); + } + + // Hardlink to destination + hardlink_directory_tree(&cache_path, dest_path) + .await + .err_tip(|| "Failed to hardlink newly cached directory")?; + + Ok(false) + } + + /// Constructs a directory from the CAS at the given path + fn construct_directory<'a>( + &'a self, + digest: DigestInfo, + 
dest_path: &'a Path, + ) -> Pin> + Send + 'a>> { + Box::pin(async move { + debug!(?digest, ?dest_path, "Constructing directory"); + + // Fetch the Directory proto + let directory: ProtoDirectory = get_and_decode_digest(&self.cas_store, digest.into()) + .await + .err_tip(|| format!("Failed to fetch directory digest: {digest:?}"))?; + + // Create the destination directory + fs::create_dir_all(dest_path) + .await + .err_tip(|| format!("Failed to create directory: {}", dest_path.display()))?; + + // Process files + for file in &directory.files { + self.create_file(dest_path, file).await?; + } + + // Process subdirectories recursively + for dir_node in &directory.directories { + self.create_subdirectory(dest_path, dir_node).await?; + } + + // Process symlinks + for symlink in &directory.symlinks { + self.create_symlink(dest_path, symlink).await?; + } + + Ok(()) + }) + } + + /// Creates a file from a `FileNode` + async fn create_file(&self, parent: &Path, file_node: &FileNode) -> Result<(), Error> { + let file_path = parent.join(&file_node.name); + let digest = DigestInfo::try_from( + file_node + .digest + .clone() + .ok_or_else(|| make_err!(Code::InvalidArgument, "File node missing digest"))?, + ) + .err_tip(|| "Invalid file digest")?; + + trace!(?file_path, ?digest, "Creating file"); + + // Fetch file content from CAS + let data = self + .cas_store + .get_part_unchunked(StoreKey::Digest(digest), 0, None) + .await + .err_tip(|| format!("Failed to fetch file: {}", file_path.display()))?; + + // Write to disk + fs::write(&file_path, data.as_ref()) + .await + .err_tip(|| format!("Failed to write file: {}", file_path.display()))?; + + // Set permissions + #[cfg(unix)] + if file_node.is_executable { + use std::os::unix::fs::PermissionsExt; + let mut perms = fs::metadata(&file_path) + .await + .err_tip(|| "Failed to get file metadata")? 
+ .permissions(); + perms.set_mode(0o755); + fs::set_permissions(&file_path, perms) + .await + .err_tip(|| "Failed to set file permissions")?; + } + + Ok(()) + } + + /// Creates a subdirectory from a `DirectoryNode` + async fn create_subdirectory( + &self, + parent: &Path, + dir_node: &DirectoryNode, + ) -> Result<(), Error> { + let dir_path = parent.join(&dir_node.name); + let digest = + DigestInfo::try_from(dir_node.digest.clone().ok_or_else(|| { + make_err!(Code::InvalidArgument, "Directory node missing digest") + })?) + .err_tip(|| "Invalid directory digest")?; + + trace!(?dir_path, ?digest, "Creating subdirectory"); + + // Recursively construct subdirectory + self.construct_directory(digest, &dir_path).await + } + + /// Creates a symlink from a `SymlinkNode` + async fn create_symlink(&self, parent: &Path, symlink: &SymlinkNode) -> Result<(), Error> { + let link_path = parent.join(&symlink.name); + let target = Path::new(&symlink.target); + + trace!(?link_path, ?target, "Creating symlink"); + + #[cfg(unix)] + fs::symlink(&target, &link_path) + .await + .err_tip(|| format!("Failed to create symlink: {}", link_path.display()))?; + + #[cfg(windows)] + { + // On Windows, we need to know if target is a directory + // For now, assume files (can be improved later) + fs::symlink_file(&target, &link_path) + .await + .err_tip(|| format!("Failed to create symlink: {}", link_path.display()))?; + } + + Ok(()) + } + + /// Evicts entries if cache is too full + async fn evict_if_needed( + &self, + incoming_size: u64, + cache: &mut HashMap, + ) -> Result<(), Error> { + // Check entry count + while cache.len() >= self.config.max_entries { + self.evict_lru(cache).await?; + } + + // Check total size + if self.config.max_size_bytes > 0 { + let current_size: u64 = cache.values().map(|m| m.size).sum(); + let mut size_after = current_size + incoming_size; + + while size_after > self.config.max_size_bytes { + let evicted_size = self.evict_lru(cache).await?; + size_after -= 
evicted_size; + } + } + + Ok(()) + } + + /// Evicts the least recently used entry + async fn evict_lru( + &self, + cache: &mut HashMap, + ) -> Result { + // Find LRU entry that isn't currently in use + let to_evict = cache + .iter() + .filter(|(_, m)| m.ref_count == 0) + .min_by_key(|(_, m)| m.last_access) + .map(|(digest, _)| *digest); + + if let Some(digest) = to_evict { + if let Some(metadata) = cache.remove(&digest) { + debug!(?digest, size = metadata.size, "Evicting cached directory"); + + // Remove from disk + if let Err(e) = fs::remove_dir_all(&metadata.path).await { + warn!( + ?digest, + path = ?metadata.path, + error = ?e, + "Failed to remove evicted directory from disk" + ); + } + + return Ok(metadata.size); + } + } + + Ok(0) + } + + /// Gets the cache path for a digest + fn get_cache_path(&self, digest: &DigestInfo) -> PathBuf { + self.config.cache_root.join(format!("{digest}")) + } + + /// Returns cache statistics + pub async fn stats(&self) -> CacheStats { + let cache = self.cache.read().await; + let total_size: u64 = cache.values().map(|m| m.size).sum(); + let in_use = cache.values().filter(|m| m.ref_count > 0).count(); + + CacheStats { + entries: cache.len(), + total_size_bytes: total_size, + in_use_entries: in_use, + } + } +} + +/// Statistics about the directory cache +#[derive(Debug, Clone, Copy)] +pub struct CacheStats { + pub entries: usize, + pub total_size_bytes: u64, + pub in_use_entries: usize, +} + +#[cfg(test)] +mod tests { + use nativelink_config::stores::MemorySpec; + use nativelink_macro::nativelink_test; + use nativelink_store::memory_store::MemoryStore; + use nativelink_util::common::DigestInfo; + use nativelink_util::store_trait::StoreLike; + use prost::Message; + use tempfile::TempDir; + + use super::*; + + async fn setup_test_store() -> (Store, DigestInfo) { + let store = Store::new(MemoryStore::new(&MemorySpec::default())); + + // Create a simple directory structure + let file_content = b"Hello, World!"; + // SHA256 hash of 
"Hello, World!" + let file_digest = DigestInfo::try_new( + "dffd6021bb2bd5b0af676290809ec3a53191dd81c7f70a4b28688a362182986f", + 13, + ) + .unwrap(); + + // Upload file + store + .as_store_driver_pin() + .update_oneshot(file_digest.into(), file_content.to_vec().into()) + .await + .unwrap(); + + // Create Directory proto + let directory = ProtoDirectory { + files: vec![FileNode { + name: "test.txt".to_string(), + digest: Some(file_digest.into()), + is_executable: false, + ..Default::default() + }], + directories: vec![], + symlinks: vec![], + ..Default::default() + }; + + // Encode and upload directory + let mut dir_data = Vec::new(); + directory.encode(&mut dir_data).unwrap(); + // Use a fixed hash for the directory + let dir_digest = DigestInfo::try_new( + "1234567890abcdef1234567890abcdef1234567890abcdef1234567890abcdef", + dir_data.len() as i64, + ) + .unwrap(); + + store + .as_store_driver_pin() + .update_oneshot(dir_digest.into(), dir_data.into()) + .await + .unwrap(); + + (store, dir_digest) + } + + #[nativelink_test] + async fn test_directory_cache_basic() -> Result<(), Error> { + let temp_dir = TempDir::new().unwrap(); + let cache_root = temp_dir.path().join("cache"); + let (store, dir_digest) = setup_test_store().await; + + let config = DirectoryCacheConfig { + max_entries: 10, + max_size_bytes: 1024 * 1024, + cache_root, + }; + + let cache = DirectoryCache::new(config, store).await?; + + // First access - cache miss + let dest1 = temp_dir.path().join("dest1"); + let hit = cache.get_or_create(dir_digest, &dest1).await?; + assert!(!hit, "First access should be cache miss"); + assert!(dest1.join("test.txt").exists()); + + // Second access - cache hit + let dest2 = temp_dir.path().join("dest2"); + let hit = cache.get_or_create(dir_digest, &dest2).await?; + assert!(hit, "Second access should be cache hit"); + assert!(dest2.join("test.txt").exists()); + + // Verify stats + let stats = cache.stats().await; + assert_eq!(stats.entries, 1); + + Ok(()) + } +} diff 
--git a/nativelink-worker/src/lib.rs b/nativelink-worker/src/lib.rs index f80eaaa32..95a6a48d4 100644 --- a/nativelink-worker/src/lib.rs +++ b/nativelink-worker/src/lib.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +pub mod directory_cache; pub mod local_worker; pub mod running_actions_manager; pub mod worker_api_client_wrapper; diff --git a/nativelink-worker/src/local_worker.rs b/nativelink-worker/src/local_worker.rs index 0d7786bdd..1f7768830 100644 --- a/nativelink-worker/src/local_worker.rs +++ b/nativelink-worker/src/local_worker.rs @@ -12,40 +12,44 @@ // See the License for the specific language governing permissions and // limitations under the License. +use core::hash::BuildHasher; use core::pin::Pin; use core::str; use core::sync::atomic::{AtomicU64, Ordering}; use core::time::Duration; +use std::borrow::Cow; +use std::collections::HashMap; +use std::env; use std::process::Stdio; use std::sync::{Arc, Weak}; - +use std::time::Instant; use futures::future::BoxFuture; use futures::stream::FuturesUnordered; use futures::{Future, FutureExt, StreamExt, TryFutureExt, select}; -use nativelink_config::cas_server::LocalWorkerConfig; +use nativelink_config::cas_server::{EnvironmentSource, ExecutionCompletionBehaviour, LocalWorkerConfig}; use nativelink_error::{Code, Error, ResultExt, make_err, make_input_err}; -use nativelink_metric::{MetricsComponent, RootMetricsComponent}; use nativelink_proto::com::github::trace_machina::nativelink::remote_execution::update_for_worker::Update; use nativelink_proto::com::github::trace_machina::nativelink::remote_execution::worker_api_client::WorkerApiClient; use nativelink_proto::com::github::trace_machina::nativelink::remote_execution::{ - ExecuteResult, GoingAwayRequest, KeepAliveRequest, UpdateForWorker, execute_result, + ExecuteComplete, ExecuteResult, GoingAwayRequest, KeepAliveRequest, UpdateForWorker, + execute_result, }; use 
nativelink_store::fast_slow_store::FastSlowStore; use nativelink_util::action_messages::{ActionResult, ActionStage, OperationId}; use nativelink_util::common::fs; use nativelink_util::digest_hasher::DigestHasherFunc; -use nativelink_util::metrics_utils::{AsyncCounterWrapper, CounterWithTime}; use nativelink_util::shutdown_guard::ShutdownGuard; use nativelink_util::store_trait::Store; use nativelink_util::{spawn, tls_utils}; use opentelemetry::context::Context; use tokio::process; -use tokio::sync::{broadcast, mpsc}; +use tokio::sync::broadcast::{Receiver, Sender}; +use tokio::sync::mpsc; use tokio::time::sleep; use tokio_stream::wrappers::UnboundedReceiverStream; use tonic::Streaming; -use tracing::{Level, debug, error, event, info, info_span, instrument, warn}; - +use tracing::{Level, debug, error, event, info, info_span, instrument, trace, warn}; +use nativelink_util::metrics::{WorkerMetricAttrs, LOCAL_WORKER_METRICS}; use crate::running_actions_manager::{ ExecutionConfiguration, Metrics as RunningActionManagerMetrics, RunningAction, RunningActionsManager, RunningActionsManagerArgs, RunningActionsManagerImpl, @@ -68,8 +72,9 @@ const DEFAULT_ENDPOINT_TIMEOUT_S: f32 = 5.; /// Default maximum amount of time a task is allowed to run for. /// If this value gets modified the documentation in `cas_server.rs` must also be updated. const DEFAULT_MAX_ACTION_TIMEOUT: Duration = Duration::from_secs(1200); // 20 mins. +const DEFAULT_MAX_UPLOAD_TIMEOUT: Duration = Duration::from_secs(600); // 10 mins. -struct LocalWorkerImpl<'a, T: WorkerApiClientTrait, U: RunningActionsManager> { +struct LocalWorkerImpl<'a, T: WorkerApiClientTrait + 'static, U: RunningActionsManager> { config: &'a LocalWorkerConfig, // According to the tonic documentation it is a cheap operation to clone this. grpc_client: T, @@ -81,9 +86,13 @@ struct LocalWorkerImpl<'a, T: WorkerApiClientTrait, U: RunningActionsManager> { // on by the scheduler. 
actions_in_transit: Arc, metrics: Arc, + shutdown_tx: Sender, } -async fn preconditions_met(precondition_script: Option) -> Result<(), Error> { +pub async fn preconditions_met( + precondition_script: Option, + extra_envs: &HashMap, +) -> Result<(), Error> { let Some(precondition_script) = &precondition_script else { // No script means we are always ok to proceed. return Ok(()); @@ -94,15 +103,31 @@ async fn preconditions_met(precondition_script: Option) -> Result<(), Er // future to pass useful information through? Or perhaps we'll // have a pre-condition and a pre-execute script instead, although // arguably entrypoint already gives us that. - let precondition_process = process::Command::new(precondition_script) + + let maybe_split_cmd = shlex::split(precondition_script); + let (command, args) = match &maybe_split_cmd { + Some(split_cmd) => (&split_cmd[0], &split_cmd[1..]), + None => { + return Err(make_input_err!( + "Could not parse the value of precondition_script: '{}'", + precondition_script, + )); + } + }; + + let precondition_process = process::Command::new(command) + .args(args) .kill_on_drop(true) .stdin(Stdio::null()) .stdout(Stdio::piped()) .stderr(Stdio::null()) .env_clear() + .envs(extra_envs) .spawn() .err_tip(|| format!("Could not execute precondition command {precondition_script:?}"))?; let output = precondition_process.wait_with_output().await?; + let stdout = str::from_utf8(&output.stdout).unwrap_or(""); + trace!(status = %output.status, %stdout, "Preconditions script returned"); if output.status.code() == Some(0) { Ok(()) } else { @@ -110,18 +135,19 @@ async fn preconditions_met(precondition_script: Option) -> Result<(), Er Code::ResourceExhausted, "Preconditions script returned status {} - {}", output.status, - str::from_utf8(&output.stdout).unwrap_or("") + stdout )) } } -impl<'a, T: WorkerApiClientTrait, U: RunningActionsManager> LocalWorkerImpl<'a, T, U> { +impl<'a, T: WorkerApiClientTrait + 'static, U: RunningActionsManager> 
LocalWorkerImpl<'a, T, U> { fn new( config: &'a LocalWorkerConfig, grpc_client: T, worker_id: String, running_actions_manager: Arc, metrics: Arc, + shutdown_tx: Sender, ) -> Self { Self { config, @@ -134,6 +160,7 @@ impl<'a, T: WorkerApiClientTrait, U: RunningActionsManager> LocalWorkerImpl<'a, // on by the scheduler. actions_in_transit: Arc::new(AtomicU64::new(0)), metrics, + shutdown_tx, } } @@ -151,12 +178,7 @@ impl<'a, T: WorkerApiClientTrait, U: RunningActionsManager> LocalWorkerImpl<'a, // We always send 2 keep alive requests per timeout. Http2 should manage most of our // timeout issues, this is a secondary check to ensure we can still send data. sleep(Duration::from_secs_f32(timeout / 2.)).await; - if let Err(e) = grpc_client - .keep_alive(KeepAliveRequest { - worker_id: self.worker_id.clone(), - }) - .await - { + if let Err(e) = grpc_client.keep_alive(KeepAliveRequest {}).await { return Err(make_err!( Code::Internal, "Failed to send KeepAlive in LocalWorker : {:?}", @@ -169,7 +191,7 @@ impl<'a, T: WorkerApiClientTrait, U: RunningActionsManager> LocalWorkerImpl<'a, async fn run( &self, update_for_worker_stream: Streaming, - shutdown_rx: &mut broadcast::Receiver, + shutdown_rx: &mut Receiver, ) -> Result<(), Error> { // This big block of logic is designed to help simplify upstream components. Upstream // components can write standard futures that return a `Result<(), Error>` and this block @@ -188,6 +210,8 @@ impl<'a, T: WorkerApiClientTrait, U: RunningActionsManager> LocalWorkerImpl<'a, let (add_future_channel, add_future_rx) = mpsc::unbounded_channel(); let mut add_future_rx = UnboundedReceiverStream::new(add_future_rx).fuse(); + let (inner_shutdown_channel, inner_shutdown_rx) = mpsc::unbounded_channel(); + let mut inner_shutdown_rx = UnboundedReceiverStream::new(inner_shutdown_rx).fuse(); let mut update_for_worker_stream = update_for_worker_stream.fuse(); // A notify which is triggered every time actions_in_flight is subtracted. 
@@ -197,10 +221,13 @@ impl<'a, T: WorkerApiClientTrait, U: RunningActionsManager> LocalWorkerImpl<'a, let actions_in_flight = Arc::new(AtomicU64::new(0)); // Set to true when shutting down, this stops any new StartAction. let mut shutting_down = false; + // Channel to signal when shutdown is complete (GoingAway sent, ready to exit). + let (shutdown_complete_tx, shutdown_complete_rx) = mpsc::unbounded_channel::<()>(); + let mut shutdown_complete_rx = UnboundedReceiverStream::new(shutdown_complete_rx).fuse(); loop { select! { - maybe_update = update_for_worker_stream.next() => { + maybe_update = update_for_worker_stream.next() => if !shutting_down || maybe_update.is_some() { match maybe_update .err_tip(|| "UpdateForWorker stream closed early")? .err_tip(|| "Got error in UpdateForWorker stream")? @@ -214,16 +241,16 @@ impl<'a, T: WorkerApiClientTrait, U: RunningActionsManager> LocalWorkerImpl<'a, } // TODO(palfrey) We should possibly do something with this notification. Update::Disconnect(()) => { - self.metrics.disconnects_received.inc(); + self.metrics.inc_disconnects_received(); } Update::KeepAlive(()) => { - self.metrics.keep_alives_received.inc(); + self.metrics.inc_keep_alives_received(); } Update::KillOperationRequest(kill_operation_request) => { let operation_id = OperationId::from(kill_operation_request.operation_id); if let Err(err) = self.running_actions_manager.kill_operation(&operation_id).await { error!( - ?operation_id, + %operation_id, ?err, "Failed to send kill request for operation" ); @@ -235,7 +262,6 @@ impl<'a, T: WorkerApiClientTrait, U: RunningActionsManager> LocalWorkerImpl<'a, if let Some(instance_name) = start_execute.execute_request.map(|request| request.instance_name) { self.grpc_client.clone().execution_response( ExecuteResult{ - worker_id: self.worker_id.clone(), instance_name, operation_id: start_execute.operation_id, result: Some(execute_result::Result::InternalError(make_err!(Code::ResourceExhausted, "Worker shutting down").into())), 
@@ -245,7 +271,7 @@ impl<'a, T: WorkerApiClientTrait, U: RunningActionsManager> LocalWorkerImpl<'a, continue; } - self.metrics.start_actions_received.inc(); + self.metrics.inc_start_actions_received(); let execute_request = start_execute.execute_request.as_ref(); let operation_id = start_execute.operation_id.clone(); @@ -259,11 +285,32 @@ impl<'a, T: WorkerApiClientTrait, U: RunningActionsManager> LocalWorkerImpl<'a, let start_action_fut = { let precondition_script_cfg = self.config.experimental_precondition_script.clone(); + let mut extra_envs: HashMap = HashMap::new(); + if let Some(ref additional_environment) = self.config.additional_environment { + for (name, source) in additional_environment { + let value = match source { + EnvironmentSource::Property(property) => start_execute + .platform.as_ref().and_then(|p|p.properties.iter().find(|pr| &pr.name == property)) + .map_or_else(|| Cow::Borrowed(""), |v| Cow::Borrowed(v.value.as_str())), + EnvironmentSource::Value(value) => Cow::Borrowed(value.as_str()), + EnvironmentSource::FromEnvironment => Cow::Owned(env::var(name).unwrap_or_default()), + other => { + debug!(?other, "Worker doesn't support this type of additional environment"); + continue; + } + }; + extra_envs.insert(name.clone(), value.into_owned()); + } + } let actions_in_transit = self.actions_in_transit.clone(); let worker_id = self.worker_id.clone(); let running_actions_manager = self.running_actions_manager.clone(); + let mut grpc_client = self.grpc_client.clone(); + let complete = ExecuteComplete { + operation_id: operation_id.clone(), + }; self.metrics.clone().wrap(move |metrics| async move { - metrics.preconditions.wrap(preconditions_met(precondition_script_cfg)) + metrics.wrap_preconditions(preconditions_met(precondition_script_cfg, &extra_envs)) .and_then(|()| running_actions_manager.create_and_add_action(worker_id, start_execute)) .map(move |r| { // Now that we either failed or registered our action, we can @@ -273,13 +320,18 @@ impl<'a, T: 
WorkerApiClientTrait, U: RunningActionsManager> LocalWorkerImpl<'a, }) .and_then(|action| { debug!( - operation_id = ?action.get_operation_id(), + operation_id = %action.get_operation_id(), "Received request to run action" ); action .clone() .prepare_action() .and_then(RunningAction::execute) + .and_then(|result| async move { + // Notify that execution has completed so it can schedule a new action. + drop(grpc_client.execution_complete(complete).await); + Ok(result) + }) .and_then(RunningAction::upload_results) .and_then(RunningAction::get_finished_result) // Note: We need ensure we run cleanup even if one of the other steps fail. @@ -296,7 +348,6 @@ impl<'a, T: WorkerApiClientTrait, U: RunningActionsManager> LocalWorkerImpl<'a, let make_publish_future = { let mut grpc_client = self.grpc_client.clone(); - let worker_id = self.worker_id.clone(); let running_actions_manager = self.running_actions_manager.clone(); move |res: Result| async move { let instance_name = maybe_instance_name @@ -316,7 +367,6 @@ impl<'a, T: WorkerApiClientTrait, U: RunningActionsManager> LocalWorkerImpl<'a, let action_stage = ActionStage::Completed(action_result); grpc_client.execution_response( ExecuteResult{ - worker_id, instance_name, operation_id, result: Some(execute_result::Result::ExecuteResponse(action_stage.into())), @@ -326,12 +376,34 @@ impl<'a, T: WorkerApiClientTrait, U: RunningActionsManager> LocalWorkerImpl<'a, .err_tip(|| "Error while calling execution_response")?; }, Err(e) => { - grpc_client.execution_response(ExecuteResult{ - worker_id, - instance_name, - operation_id, - result: Some(execute_result::Result::InternalError(e.into())), - }).await.err_tip(|| "Error calling execution_response with error")?; + let is_cas_blob_missing = e.code == Code::NotFound + && e.message_string().contains("not found in either fast or slow store"); + if is_cas_blob_missing { + warn!( + ?e, + "Missing CAS inputs during prepare_action, returning FAILED_PRECONDITION" + ); + let action_result = 
ActionResult { + error: Some(make_err!( + Code::FailedPrecondition, + "{}", + e.message_string() + )), + ..ActionResult::default() + }; + let action_stage = ActionStage::Completed(action_result); + grpc_client.execution_response(ExecuteResult{ + instance_name, + operation_id, + result: Some(execute_result::Result::ExecuteResponse(action_stage.into())), + }).await.err_tip(|| "Error calling execution_response with missing inputs")?; + } else { + grpc_client.execution_response(ExecuteResult{ + instance_name, + operation_id, + result: Some(execute_result::Result::InternalError(e.into())), + }).await.err_tip(|| "Error calling execution_response with error")?; + } }, } Ok(()) @@ -341,6 +413,7 @@ impl<'a, T: WorkerApiClientTrait, U: RunningActionsManager> LocalWorkerImpl<'a, self.actions_in_transit.fetch_add(1, Ordering::Release); let add_future_channel = add_future_channel.clone(); + let inner_shutdown_channel = inner_shutdown_channel.clone(); info_span!( "worker_start_action_ctx", @@ -363,7 +436,16 @@ impl<'a, T: WorkerApiClientTrait, U: RunningActionsManager> LocalWorkerImpl<'a, error!(?err, "Error executing action"); } add_future_channel - .send(make_publish_future(res).then(move |res| { + .send(make_publish_future(res) + .then(move |res| { + match self.config.execution_completion_behaviour { + ExecutionCompletionBehaviour::OneShotAlways => { + inner_shutdown_channel.send(()).ok(); + } + ExecutionCompletionBehaviour::Default => { + // Do nothing + } + } actions_in_flight.fetch_sub(1, Ordering::Release); actions_notify.notify_one(); core::future::ready(res) @@ -387,14 +469,23 @@ impl<'a, T: WorkerApiClientTrait, U: RunningActionsManager> LocalWorkerImpl<'a, let fut = res.err_tip(|| "New future stream receives should never be closed")?; futures.push(fut); }, + _ = inner_shutdown_rx.next() => { + warn!("Shutting down worker because of inner shutdown signal",); + let guard = ShutdownGuard::default(); + drop(self.shutdown_tx.send(guard.clone())); + } res = futures.next() 
=> res.err_tip(|| "Keep-alive should always pending. Likely unable to send data to scheduler")??, + _ = shutdown_complete_rx.next() => { + info!("Shutdown complete, exiting worker loop"); + return Ok(()); + }, complete_msg = shutdown_rx.recv().fuse() => { warn!("Worker loop received shutdown signal. Shutting down worker...",); let mut grpc_client = self.grpc_client.clone(); - let worker_id = self.worker_id.clone(); let shutdown_guard = complete_msg.map_err(|e| make_err!(Code::Internal, "Failed to receive shutdown message: {e:?}"))?; let actions_in_flight = actions_in_flight.clone(); let actions_notify = actions_notify.clone(); + let shutdown_complete_tx = shutdown_complete_tx.clone(); let shutdown_future = async move { // Wait for in-flight operations to be fully completed. while actions_in_flight.load(Ordering::Acquire) > 0 { @@ -402,12 +493,14 @@ impl<'a, T: WorkerApiClientTrait, U: RunningActionsManager> LocalWorkerImpl<'a, } // Sending this message immediately evicts all jobs from // this worker, of which there should be none. - if let Err(e) = grpc_client.going_away(GoingAwayRequest { worker_id }).await { + if let Err(e) = grpc_client.going_away(GoingAwayRequest {}).await { error!("Failed to send GoingAwayRequest: {e}",); - return Err(e.into()); + return Err(e); } // Allow shutdown to occur now. drop(shutdown_guard); + // Signal that shutdown is complete. 
+ let _ = shutdown_complete_tx.send(()); Ok::<(), Error>(()) }; futures.push(shutdown_future.boxed()); @@ -421,7 +514,7 @@ impl<'a, T: WorkerApiClientTrait, U: RunningActionsManager> LocalWorkerImpl<'a, type ConnectionFactory = Box BoxFuture<'static, Result> + Send + Sync>; -pub struct LocalWorker { +pub struct LocalWorker { config: Arc, running_actions_manager: Arc, connection_factory: ConnectionFactory, @@ -429,8 +522,10 @@ pub struct LocalWorker { metrics: Arc, } -impl - core::fmt::Debug for LocalWorker +impl< + T: WorkerApiClientTrait + core::fmt::Debug + 'static, + U: RunningActionsManager + core::fmt::Debug, +> core::fmt::Debug for LocalWorker { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { f.debug_struct("LocalWorker") @@ -465,9 +560,12 @@ pub async fn new_local_worker( ); if let Ok(path) = fs::canonicalize(&config.work_directory).await { - fs::remove_dir_all(path) - .await - .err_tip(|| "Could not remove work_directory in LocalWorker")?; + fs::remove_dir_all(&path).await.err_tip(|| { + format!( + "Could not remove work_directory '{}' in LocalWorker", + &path.as_path().to_str().unwrap_or("bad path") + ) + })?; } fs::create_dir_all(&config.work_directory) @@ -483,6 +581,49 @@ pub async fn new_local_worker( } else { Duration::from_secs(config.max_action_timeout as u64) }; + let max_upload_timeout = if config.max_upload_timeout == 0 { + DEFAULT_MAX_UPLOAD_TIMEOUT + } else { + Duration::from_secs(config.max_upload_timeout as u64) + }; + + // Initialize directory cache if configured + let directory_cache = if let Some(cache_config) = &config.directory_cache { + use std::path::PathBuf; + + use crate::directory_cache::{ + DirectoryCache, DirectoryCacheConfig as WorkerDirCacheConfig, + }; + + let cache_root = if cache_config.cache_root.is_empty() { + PathBuf::from(&config.work_directory).parent().map_or_else( + || PathBuf::from("/tmp/nativelink_directory_cache"), + |p| p.join("directory_cache"), + ) + } else { + 
PathBuf::from(&cache_config.cache_root) + }; + + let worker_cache_config = WorkerDirCacheConfig { + max_entries: cache_config.max_entries, + max_size_bytes: cache_config.max_size_bytes, + cache_root, + }; + + match DirectoryCache::new(worker_cache_config, Store::new(fast_slow_store.clone())).await { + Ok(cache) => { + tracing::info!("Directory cache initialized successfully"); + Some(Arc::new(cache)) + } + Err(e) => { + tracing::warn!("Failed to initialize directory cache: {:?}", e); + None + } + } + } else { + None + }; + let running_actions_manager = Arc::new(RunningActionsManagerImpl::new(RunningActionsManagerArgs { root_action_directory: config.work_directory.clone(), @@ -495,7 +636,9 @@ pub async fn new_local_worker( historical_store, upload_action_result_config: &config.upload_action_result, max_action_timeout, + max_upload_timeout, timeout_handled_externally: config.timeout_handled_externally, + directory_cache, })?); let local_worker = LocalWorker::new_with_connection_factory_and_actions_manager( config.clone(), @@ -532,16 +675,17 @@ pub async fn new_local_worker( Ok(local_worker) } -impl LocalWorker { +impl LocalWorker { pub fn new_with_connection_factory_and_actions_manager( config: Arc, running_actions_manager: Arc, connection_factory: ConnectionFactory, sleep_fn: Box BoxFuture<'static, ()> + Send + Sync>, ) -> Self { - let metrics = Arc::new(Metrics::new(Arc::downgrade( - running_actions_manager.metrics(), - ))); + let metrics = Arc::new(Metrics::new( + &config.name, + Arc::downgrade(running_actions_manager.metrics()), + )); Self { config, running_actions_manager, @@ -563,9 +707,33 @@ impl LocalWorker { &self, client: &mut T, ) -> Result<(String, Streaming), Error> { - let connect_worker_request = - make_connect_worker_request(self.config.name.clone(), &self.config.platform_properties) - .await?; + let mut extra_envs: HashMap = HashMap::new(); + if let Some(ref additional_environment) = self.config.additional_environment { + for (name, source) in 
additional_environment { + let value = match source { + EnvironmentSource::Value(value) => Cow::Borrowed(value.as_str()), + EnvironmentSource::FromEnvironment => { + Cow::Owned(env::var(name).unwrap_or_default()) + } + other => { + debug!( + ?other, + "Worker registration doesn't support this type of additional environment" + ); + continue; + } + }; + extra_envs.insert(name.clone(), value.into_owned()); + } + } + + let connect_worker_request = make_connect_worker_request( + self.config.name.clone(), + &self.config.platform_properties, + &extra_envs, + self.config.max_inflight_tasks, + ) + .await?; let mut update_for_worker_stream = client .connect_worker(connect_worker_request) .await @@ -594,7 +762,8 @@ impl LocalWorker { #[instrument(skip(self), level = Level::INFO)] pub async fn run( mut self, - mut shutdown_rx: broadcast::Receiver, + shutdown_tx: Sender, + mut shutdown_rx: Receiver, ) -> Result<(), Error> { let sleep_fn = self .sleep_fn @@ -629,6 +798,7 @@ impl LocalWorker { worker_id, self.running_actions_manager.clone(), self.metrics.clone(), + shutdown_tx.clone(), ), update_for_worker_stream, ), @@ -639,74 +809,113 @@ impl LocalWorker { ); // Now listen for connections and run all other services. - if let Err(err) = inner.run(update_for_worker_stream, &mut shutdown_rx).await { - 'no_more_actions: { - // Ensure there are no actions in transit before we try to kill - // all our actions. - const ITERATIONS: usize = 1_000; - - const ERROR_MSG: &str = "Actions in transit did not reach zero before we disconnected from the scheduler"; - - let sleep_duration = ACTIONS_IN_TRANSIT_TIMEOUT_S / ITERATIONS as f32; - for _ in 0..ITERATIONS { - if inner.actions_in_transit.load(Ordering::Acquire) == 0 { - break 'no_more_actions; + match inner.run(update_for_worker_stream, &mut shutdown_rx).await { + Ok(()) => { + // Graceful shutdown completed, return without retrying. 
+ info!("Worker completed graceful shutdown"); + return Ok(()); + } + Err(err) => { + 'no_more_actions: { + // Ensure there are no actions in transit before we try to kill + // all our actions. + const ITERATIONS: usize = 1_000; + + const ERROR_MSG: &str = "Actions in transit did not reach zero before we disconnected from the scheduler"; + + let sleep_duration = ACTIONS_IN_TRANSIT_TIMEOUT_S / ITERATIONS as f32; + for _ in 0..ITERATIONS { + if inner.actions_in_transit.load(Ordering::Acquire) == 0 { + break 'no_more_actions; + } + (sleep_fn_pin)(Duration::from_secs_f32(sleep_duration)).await; } - (sleep_fn_pin)(Duration::from_secs_f32(sleep_duration)).await; + error!(ERROR_MSG); + return Err(err.append(ERROR_MSG)); } - error!(ERROR_MSG); - return Err(err.append(ERROR_MSG)); - } - error!(?err, "Worker disconnected from scheduler"); - // Kill off any existing actions because if we re-connect, we'll - // get some more and it might resource lock us. - self.running_actions_manager.kill_all().await; + error!(?err, "Worker disconnected from scheduler"); + // Kill off any existing actions because if we re-connect, we'll + // get some more and it might resource lock us. + self.running_actions_manager.kill_all().await; - (error_handler)(err).await; // Try to connect again. + (error_handler)(err).await; // Try to connect again. + } } } // Unreachable. } } -#[derive(Debug, MetricsComponent)] +/// Instance-based metrics wrapper that provides the `.wrap()` interface +/// and reports to global OpenTelemetry metrics. +#[derive(Debug)] pub struct Metrics { - #[metric( - help = "Total number of actions sent to this worker to process. This does not mean it started them, it just means it received a request to execute it." 
- )] - start_actions_received: CounterWithTime, - #[metric(help = "Total number of disconnects received from the scheduler.")] - disconnects_received: CounterWithTime, - #[metric(help = "Total number of keep-alives received from the scheduler.")] - keep_alives_received: CounterWithTime, - #[metric( - help = "Stats about the calls to check if an action satisfies the config supplied script." - )] - preconditions: AsyncCounterWrapper, - #[metric] - #[allow( - clippy::struct_field_names, - reason = "TODO Fix this. Triggers on nightly" - )] + attrs: WorkerMetricAttrs, + #[allow(dead_code)] running_actions_manager_metrics: Weak, } -impl RootMetricsComponent for Metrics {} - impl Metrics { - fn new(running_actions_manager_metrics: Weak) -> Self { + fn new( + worker_name: &str, + running_actions_manager_metrics: Weak, + ) -> Self { Self { - start_actions_received: CounterWithTime::default(), - disconnects_received: CounterWithTime::default(), - keep_alives_received: CounterWithTime::default(), - preconditions: AsyncCounterWrapper::default(), + attrs: WorkerMetricAttrs::new(worker_name), running_actions_manager_metrics, } } -} -impl Metrics { - async fn wrap, F: FnOnce(Arc) -> T>( + /// Increment the start_actions_received counter + pub fn inc_start_actions_received(&self) { + LOCAL_WORKER_METRICS + .start_actions_received + .add(1, self.attrs.base()); + } + + /// Increment the disconnects_received counter + pub fn inc_disconnects_received(&self) { + LOCAL_WORKER_METRICS + .disconnects_received + .add(1, self.attrs.base()); + } + + /// Increment the keep_alives_received counter + pub fn inc_keep_alives_received(&self) { + LOCAL_WORKER_METRICS + .keep_alives_received + .add(1, self.attrs.base()); + } + + /// Wrap an async operation and track precondition metrics + pub async fn wrap_preconditions>>( + &self, + future: F, + ) -> Result { + LOCAL_WORKER_METRICS + .preconditions_calls + .add(1, self.attrs.base()); + let start = Instant::now(); + let result = future.await; + let 
duration_ms = start.elapsed().as_secs_f64() * 1000.0; + LOCAL_WORKER_METRICS + .preconditions_duration + .record(duration_ms, self.attrs.base()); + + if result.is_ok() { + LOCAL_WORKER_METRICS + .preconditions_successes + .add(1, self.attrs.base()); + } else { + LOCAL_WORKER_METRICS + .preconditions_failures + .add(1, self.attrs.base()); + } + result + } + + /// Wrap for the action execution flow - passes self to the closure + pub async fn wrap, F: FnOnce(Arc) -> T>( self: Arc, fut: F, ) -> U { diff --git a/nativelink-worker/src/running_actions_manager.rs b/nativelink-worker/src/running_actions_manager.rs index bcd670b93..033e9e113 100644 --- a/nativelink-worker/src/running_actions_manager.rs +++ b/nativelink-worker/src/running_actions_manager.rs @@ -21,6 +21,7 @@ use core::time::Duration; use std::borrow::Cow; use std::collections::vec_deque::VecDeque; use std::collections::{HashMap, HashSet}; +use std::env; use std::ffi::{OsStr, OsString}; #[cfg(target_family = "unix")] use std::fs::Permissions; @@ -42,10 +43,9 @@ use nativelink_config::cas_server::{ EnvironmentSource, UploadActionResultConfig, UploadCacheResultsStrategy, }; use nativelink_error::{Code, Error, ResultExt, make_err, make_input_err}; -use nativelink_metric::MetricsComponent; use nativelink_proto::build::bazel::remote::execution::v2::{ - Action, ActionResult as ProtoActionResult, Command as ProtoCommand, - Directory as ProtoDirectory, Directory, DirectoryNode, ExecuteResponse, FileNode, SymlinkNode, + Action, ActionResult as ProtoActionResult, Command as ProtoCommand, Directory, + Directory as ProtoDirectory, DirectoryNode, ExecuteResponse, FileNode, SymlinkNode, Tree as ProtoTree, UpdateActionResultRequest, }; use nativelink_proto::com::github::trace_machina::nativelink::remote_execution::{ @@ -54,6 +54,7 @@ use nativelink_proto::com::github::trace_machina::nativelink::remote_execution:: use nativelink_store::ac_utils::{ ESTIMATED_DIGEST_SIZE, compute_buf_digest, get_and_decode_digest, 
serialize_and_upload_message, }; +use nativelink_store::cas_utils::is_zero_digest; use nativelink_store::fast_slow_store::FastSlowStore; use nativelink_store::filesystem_store::{FileEntry, FilesystemStore}; use nativelink_store::grpc_store::GrpcStore; @@ -63,15 +64,16 @@ use nativelink_util::action_messages::{ }; use nativelink_util::common::{DigestInfo, fs}; use nativelink_util::digest_hasher::{DigestHasher, DigestHasherFunc}; -use nativelink_util::metrics_utils::{AsyncCounterWrapper, CounterWithTime}; +use nativelink_util::metrics::RUNNING_ACTIONS_METRICS; use nativelink_util::store_trait::{Store, StoreLike, UploadSizeInfo}; use nativelink_util::{background_spawn, spawn, spawn_blocking}; +use opentelemetry::{KeyValue, metrics}; use parking_lot::Mutex; use prost::Message; use relative_path::RelativePath; use scopeguard::{ScopeGuard, guard}; use serde::Deserialize; -use tokio::io::{AsyncReadExt, AsyncSeekExt}; +use tokio::io::{AsyncReadExt, AsyncSeekExt, AsyncWriteExt}; use tokio::process; use tokio::sync::{Notify, oneshot, watch}; use tokio::time::Instant; @@ -150,33 +152,40 @@ pub fn download_to_directory<'a>( cas_store .populate_fast_store(digest.into()) .and_then(move |()| async move { - let file_entry = filesystem_store - .get_file_entry_for_digest(&digest) - .await - .err_tip(|| "During hard link")?; - file_entry - .get_file_path_locked(|src| fs::hard_link(src, &dest)) - .await - .map_err(|e| { - if e.code == Code::NotFound { - make_err!( - Code::Internal, - "Could not make hardlink, file was likely evicted from cache. {e:?} : {dest}\n\ - This error often occurs when the filesystem store's max_bytes is too small for your workload.\n\ - To fix this issue:\n\ - 1. Increase the 'max_bytes' value in your filesystem store configuration\n\ - 2. Example: Change 'max_bytes: 10000000000' to 'max_bytes: 50000000000' (or higher)\n\ - 3. 
The setting is typically found in your nativelink.json config under:\n\ - stores -> [your_filesystem_store] -> filesystem -> eviction_policy -> max_bytes\n\ - 4. Restart NativeLink after making the change\n\n\ - If this error persists after increasing max_bytes several times, please report at:\n\ - https://github.com/TraceMachina/nativelink/issues\n\ - Include your config file and both server and client logs to help us assist you." - ) - } else { - make_err!(Code::Internal, "Could not make hardlink, {e:?} : {dest}") - } - })?; + if is_zero_digest(digest) { + let mut file_slot = fs::create_file(&dest).await?; + file_slot.write_all(&[]).await?; + } + else { + let file_entry = filesystem_store + .get_file_entry_for_digest(&digest) + .await + .err_tip(|| "During hard link")?; + // TODO: add a test for #2051: deadlock with large number of files + let src_path = file_entry.get_file_path_locked(|src| async move { Ok(PathBuf::from(src)) }).await?; + fs::hard_link(&src_path, &dest) + .await + .map_err(|e| { + if e.code == Code::NotFound { + make_err!( + Code::Internal, + "Could not make hardlink, file was likely evicted from cache. {e:?} : {dest}\n\ + This error often occurs when the filesystem store's max_bytes is too small for your workload.\n\ + To fix this issue:\n\ + 1. Increase the 'max_bytes' value in your filesystem store configuration\n\ + 2. Example: Change 'max_bytes: 10000000000' to 'max_bytes: 50000000000' (or higher)\n\ + 3. The setting is typically found in your nativelink.json config under:\n\ + stores -> [your_filesystem_store] -> filesystem -> eviction_policy -> max_bytes\n\ + 4. Restart NativeLink after making the change\n\n\ + If this error persists after increasing max_bytes several times, please report at:\n\ + https://github.com/TraceMachina/nativelink/issues\n\ + Include your config file and both server and client logs to help us assist you." 
+ ) + } else { + make_err!(Code::Internal, "Could not make hardlink, {e:?} : {dest}") + } + })?; + } #[cfg(target_family = "unix")] if let Some(unix_mode) = unix_mode { fs::set_permissions(&dest, Permissions::from_mode(unix_mode)) @@ -258,6 +267,46 @@ pub fn download_to_directory<'a>( .boxed() } +/// Prepares action inputs by first trying the directory cache (if available), +/// then falling back to traditional `download_to_directory`. +/// +/// This provides a significant performance improvement for repeated builds +/// with the same input directories. +pub async fn prepare_action_inputs( + directory_cache: &Option>, + cas_store: &FastSlowStore, + filesystem_store: Pin<&FilesystemStore>, + digest: &DigestInfo, + work_directory: &str, +) -> Result<(), Error> { + // Try cache first if available + if let Some(cache) = directory_cache { + match cache + .get_or_create(*digest, Path::new(work_directory)) + .await + { + Ok(cache_hit) => { + trace!( + ?digest, + work_directory, cache_hit, "Successfully prepared inputs via directory cache" + ); + return Ok(()); + } + Err(e) => { + warn!( + ?digest, + ?e, + "Directory cache failed, falling back to traditional download" + ); + // Fall through to traditional path + } + } + } + + // Traditional path (cache disabled or failed) + download_to_directory(cas_store, filesystem_store, digest, work_directory).await +} + #[cfg(target_family = "windows")] fn is_executable(_metadata: &std::fs::Metadata, full_path: &impl AsRef) -> bool { static EXECUTABLE_EXTENSIONS: &[&str] = &["exe", "bat", "com"]; @@ -307,29 +356,70 @@ async fn upload_file( // a much cheaper operation than an upload. 
let cas_store = cas_store.as_store_driver_pin(); let store_key: nativelink_util::store_trait::StoreKey<'_> = digest.into(); + let has_start = std::time::Instant::now(); if cas_store .has(store_key.borrow()) .await .is_ok_and(|result| result.is_some()) { + trace!( + ?digest, + has_elapsed_ms = has_start.elapsed().as_millis(), + "upload_file: digest already exists in CAS, skipping upload", + ); return Ok(()); } + trace!( + ?digest, + has_elapsed_ms = has_start.elapsed().as_millis(), + file_size = digest.size_bytes(), + "upload_file: digest not in CAS, starting upload", + ); file.rewind().await.err_tip(|| "Could not rewind file")?; // Note: For unknown reasons we appear to be hitting: // https://github.com/rust-lang/rust/issues/92096 - // or a smiliar issue if we try to use the non-store driver function, so we + // or a similar issue if we try to use the non-store driver function, so we // are using the store driver function here. - cas_store + let store_key_for_upload = store_key.clone(); + let file_upload_start = std::time::Instant::now(); + let upload_result = cas_store .update_with_whole_file( - store_key, + store_key_for_upload, full_path.as_ref().into(), file, UploadSizeInfo::ExactSize(digest.size_bytes()), ) .await - .map(|_slot| ()) + .map(|_slot| ()); + trace!( + ?digest, + upload_elapsed_ms = file_upload_start.elapsed().as_millis(), + success = upload_result.is_ok(), + "upload_file: update_with_whole_file completed", + ); + + match upload_result { + Ok(()) => Ok(()), + Err(err) => { + // Output uploads run concurrently and may overlap (e.g. a file is listed + // both as an output file and inside an output directory). When another + // upload has already moved the file into CAS, this update can fail with + // NotFound even though the digest is now present. Per the RE spec, missing + // outputs should be ignored, so treat this as success if the digest exists. 
+ if err.code == Code::NotFound + && cas_store + .has(store_key.borrow()) + .await + .is_ok_and(|result| result.is_some()) + { + Ok(()) + } else { + Err(err) + } + } + } }) .await .err_tip(|| format!("for {full_path:?}"))?; @@ -590,10 +680,10 @@ async fn do_cleanup( .err_tip(|| format!("Could not remove working directory {action_directory}")); if let Err(err) = running_actions_manager.cleanup_action(operation_id) { - error!(?operation_id, ?err, "Error cleaning up action"); + error!(%operation_id, ?err, "Error cleaning up action"); Result::<(), Error>::Err(err).merge(remove_dir_result) } else if let Err(err) = remove_dir_result { - error!(?operation_id, ?err, "Error removing working directory"); + error!(%operation_id, ?err, "Error removing working directory"); Err(err) } else { Ok(()) @@ -666,7 +756,7 @@ pub struct RunningActionImpl { } impl RunningActionImpl { - fn new( + pub fn new( execution_metadata: ExecutionMetadata, operation_id: OperationId, action_directory: String, @@ -707,7 +797,7 @@ impl RunningActionImpl { &self.running_actions_manager.metrics } - /// Prepares any actions needed to execution this action. This action will do the following: + /// Prepares any actions needed to execute this action. This action will do the following: /// /// * Download any files needed to execute the action /// * Build a folder with all files needed to execute the action. @@ -722,7 +812,7 @@ impl RunningActionImpl { } let command = { // Download and build out our input files/folders. Also fetch and decode our Command. - let command_fut = self.metrics().get_proto_command_from_store.wrap(async { + let command_fut = self.metrics().wrap_get_proto_command_from_store(async { get_and_decode_digest::( self.running_actions_manager.cas_store.as_ref(), self.action_info.command_digest.into(), @@ -739,9 +829,10 @@ impl RunningActionImpl { // Now the work directory has been created, we have to clean up. 
self.did_cleanup.store(false, Ordering::Release); // Download the input files/folder and place them into the temp directory. + // Use directory cache if available for better performance. self.metrics() - .download_to_directory - .wrap(download_to_directory( + .wrap_download_to_directory(prepare_action_inputs( + &self.running_actions_manager.directory_cache, &self.running_actions_manager.cas_store, filesystem_store_pin, &self.action_info.input_root_digest, @@ -777,14 +868,12 @@ impl RunningActionImpl { } }; self.metrics() - .prepare_output_files - .wrap(try_join_all( + .wrap_prepare_output_files(try_join_all( command.output_files.iter().map(prepare_output_directories), )) .await?; self.metrics() - .prepare_output_paths - .wrap(try_join_all( + .wrap_prepare_output_paths(try_join_all( command.output_paths.iter().map(prepare_output_directories), )) .await?; @@ -870,6 +959,9 @@ impl RunningActionImpl { .get(property) .map_or_else(|| Cow::Borrowed(""), |v| Cow::Borrowed(v.as_str())), EnvironmentSource::Value(value) => Cow::Borrowed(value.as_str()), + EnvironmentSource::FromEnvironment => { + Cow::Owned(env::var(name).unwrap_or_default()) + } EnvironmentSource::TimeoutMillis => { Cow::Owned(requested_timeout.as_millis().to_string()) } @@ -929,16 +1021,22 @@ impl RunningActionImpl { .err_tip(|| "Expected stderr to exist on command this should never happen")?; let mut child_process_guard = guard(child_process, |mut child_process| { - if child_process.try_wait().is_ok_and(|res| res.is_some()) { - // The child already exited, probably a timeout or kill operation. - return; + let result: Result, std::io::Error> = + child_process.try_wait(); + match result { + Ok(res) if res.is_some() => { + // The child already exited, probably a timeout or kill operation + } + result => { + error!( + ?result, + "Child process was not cleaned up before dropping the call to execute(), killing in background spawn." 
+ ); + background_spawn!("running_actions_manager_kill_child_process", async move { + child_process.kill().await + }); + } } - error!( - "Child process was not cleaned up before dropping the call to execute(), killing in background spawn." - ); - background_spawn!("running_actions_manager_kill_child_process", async move { - child_process.kill().await - }); }); let all_stdout_fut = spawn!("stdout_reader", async move { @@ -969,12 +1067,12 @@ impl RunningActionImpl { }); let mut killed_action = false; - let timer = self.metrics().child_process.begin_timer(); + let timer = self.metrics().begin_child_process_timer(); let mut sleep_fut = (self.running_actions_manager.callbacks.sleep_fn)(self.timeout).fuse(); loop { tokio::select! { () = &mut sleep_fut => { - self.running_actions_manager.metrics.task_timeouts.inc(); + self.running_actions_manager.metrics.inc_task_timeouts(); killed_action = true; if let Err(err) = child_process_guard.kill().await { error!( @@ -983,19 +1081,26 @@ impl RunningActionImpl { ); } { + let joined_command = args.join(OsStr::new(" ")); + let command = joined_command.to_string_lossy(); + info!( + seconds = self.action_info.timeout.as_secs_f32(), + %command, + "Command timed out" + ); let mut state = self.state.lock(); state.error = Error::merge_option(state.error.take(), Some(Error::new( Code::DeadlineExceeded, format!( "Command '{}' timed out after {} seconds", - args.join(OsStr::new(" ")).to_string_lossy(), + command, self.action_info.timeout.as_secs_f32() ) ))); } }, maybe_exit_status = child_process_guard.wait() => { - // Defuse our guard so it does not try to cleanup and make nessless logs. + // Defuse our guard so it does not try to cleanup and make senseless logs. drop(ScopeGuard::<_, _>::into_inner(child_process_guard)); let exit_status = maybe_exit_status.err_tip(|| "Failed to collect exit code of process")?; // TODO(palfrey) We should implement stderr/stdout streaming to client here. 
@@ -1016,13 +1121,15 @@ impl RunningActionImpl { let exit_code = exit_status.code().map_or(EXIT_CODE_FOR_SIGNAL, |exit_code| { if exit_code == 0 { - self.metrics().child_process_success_error_code.inc(); + self.metrics().inc_child_process_success_error_code(); } else { - self.metrics().child_process_failure_error_code.inc(); + self.metrics().inc_child_process_failure_error_code(); } exit_code }); + info!(?args, "Command complete"); + let maybe_error_override = if let Some(side_channel_file) = maybe_side_channel_file { process_side_channel_file(side_channel_file.clone(), &args, requested_timeout).await .err_tip(|| format!("Error processing side channel file: {}", side_channel_file.display()))? @@ -1077,7 +1184,11 @@ impl RunningActionImpl { DirectorySymlink(SymlinkInfo), } - debug!("Worker uploading results",); + let upload_start = std::time::Instant::now(); + debug!( + operation_id = ?self.operation_id, + "Worker uploading results - starting", + ); let (mut command_proto, execution_result, mut execution_metadata) = { let mut state = self.state.lock(); state.execution_metadata.output_upload_start_timestamp = @@ -1192,10 +1303,11 @@ impl RunningActionImpl { match fs::metadata(&full_path).await { Ok(metadata) => { if metadata.is_dir() { - return Ok(OutputType::DirectorySymlink(output_symlink)); + Ok(OutputType::DirectorySymlink(output_symlink)) + } else { + // Note: If it's anything but directory we put it as a file symlink. + Ok(OutputType::FileSymlink(output_symlink)) } - // Note: If it's anything but directory we put it as a file symlink. - return Ok(OutputType::FileSymlink(output_symlink)); } Err(e) => { if e.code != Code::NotFound { @@ -1208,7 +1320,7 @@ impl RunningActionImpl { } // If the file doesn't exist, we consider it a file. Even though the // file doesn't exist we still need to populate an entry. 
- return Ok(OutputType::FileSymlink(output_symlink)); + Ok(OutputType::FileSymlink(output_symlink)) } } } else { @@ -1235,25 +1347,47 @@ impl RunningActionImpl { ); } - let stdout_digest_fut = self.metrics().upload_stdout.wrap(async { + let stdout_digest_fut = self.metrics().wrap_upload_stdout(async { + let start = std::time::Instant::now(); let data = execution_result.stdout; + let data_len = data.len(); let digest = compute_buf_digest(&data, &mut hasher.hasher()); cas_store .update_oneshot(digest, data) .await .err_tip(|| "Uploading stdout")?; + debug!( + ?digest, + data_len, + elapsed_ms = start.elapsed().as_millis(), + "upload_results: stdout upload completed", + ); Result::::Ok(digest) }); - let stderr_digest_fut = self.metrics().upload_stderr.wrap(async { + let stderr_digest_fut = self.metrics().wrap_upload_stderr(async { + let start = std::time::Instant::now(); let data = execution_result.stderr; + let data_len = data.len(); let digest = compute_buf_digest(&data, &mut hasher.hasher()); cas_store .update_oneshot(digest, data) .await - .err_tip(|| "Uploading stdout")?; + .err_tip(|| "Uploading stderr")?; + debug!( + ?digest, + data_len, + elapsed_ms = start.elapsed().as_millis(), + "upload_results: stderr upload completed", + ); Result::::Ok(digest) }); + debug!( + operation_id = ?self.operation_id, + num_output_paths = output_path_futures.len(), + "upload_results: starting stdout/stderr/output_paths uploads", + ); + let join_start = std::time::Instant::now(); let upload_result = futures::try_join!(stdout_digest_fut, stderr_digest_fut, async { while let Some(output_type) = output_path_futures.try_next().await? 
{ match output_type { @@ -1271,6 +1405,12 @@ impl RunningActionImpl { Ok(()) }); drop(output_path_futures); + debug!( + operation_id = ?self.operation_id, + elapsed_ms = join_start.elapsed().as_millis(), + success = upload_result.is_ok(), + "upload_results: all uploads completed", + ); let (stdout_digest, stderr_digest) = match upload_result { Ok((stdout_digest, stderr_digest, ())) => (stdout_digest, stderr_digest), Err(e) => return Err(e).err_tip(|| "Error while uploading results"), @@ -1282,6 +1422,8 @@ impl RunningActionImpl { output_folders.sort_unstable_by(|a, b| a.path.cmp(&b.path)); output_file_symlinks.sort_unstable_by(|a, b| a.name_or_path.cmp(&b.name_or_path)); output_directory_symlinks.sort_unstable_by(|a, b| a.name_or_path.cmp(&b.name_or_path)); + let num_output_files = output_files.len(); + let num_output_folders = output_folders.len(); { let mut state = self.state.lock(); execution_metadata.worker_completed_timestamp = @@ -1300,6 +1442,13 @@ impl RunningActionImpl { message: String::new(), // Will be filled in on cache_action_result if needed. }); } + debug!( + operation_id = ?self.operation_id, + total_elapsed_ms = upload_start.elapsed().as_millis(), + num_output_files, + num_output_folders, + "upload_results: inner_upload_results completed successfully", + ); Ok(self) } @@ -1325,7 +1474,7 @@ impl Drop for RunningActionImpl { } let operation_id = self.operation_id.clone(); error!( - ?operation_id, + %operation_id, "RunningActionImpl did not cleanup. This is a violation of the requirements, will attempt to do it in the background." 
); let running_actions_manager = self.running_actions_manager.clone(); @@ -1337,7 +1486,7 @@ impl Drop for RunningActionImpl { return; }; error!( - ?operation_id, + %operation_id, ?action_directory, ?err, "Error cleaning up action" @@ -1352,34 +1501,78 @@ impl RunningAction for RunningActionImpl { } async fn prepare_action(self: Arc) -> Result, Error> { - self.metrics() - .clone() - .prepare_action - .wrap(Self::inner_prepare_action(self)) - .await + let metrics = self.metrics().clone(); + let res = metrics + .wrap_prepare_action(Self::inner_prepare_action(self)) + .await; + if let Err(ref e) = res { + warn!(?e, "Error during prepare_action"); + } + res } async fn execute(self: Arc) -> Result, Error> { - self.metrics() - .clone() - .execute - .wrap(Self::inner_execute(self)) - .await + let metrics = self.metrics().clone(); + let res = metrics.wrap_execute(Self::inner_execute(self)).await; + if let Err(ref e) = res { + warn!(?e, "Error during prepare_action"); + } + res } async fn upload_results(self: Arc) -> Result, Error> { - self.metrics() - .clone() - .upload_results - .wrap(Self::inner_upload_results(self)) - .await + let upload_timeout = self.running_actions_manager.max_upload_timeout; + let operation_id = self.operation_id.clone(); + info!( + ?operation_id, + upload_timeout_s = upload_timeout.as_secs(), + "upload_results: starting with timeout", + ); + let metrics = self.metrics().clone(); + let upload_fut = metrics + .wrap_upload_results(Self::inner_upload_results(self)); + + let stall_warn_fut = async { + let mut elapsed_secs = 0u64; + loop { + tokio::time::sleep(Duration::from_secs(60)).await; + elapsed_secs += 60; + warn!( + ?operation_id, + elapsed_s = elapsed_secs, + timeout_s = upload_timeout.as_secs(), + "upload_results: still in progress — possible stall", + ); + } + }; + + let res = tokio::time::timeout(upload_timeout, async { + tokio::pin!(upload_fut); + tokio::pin!(stall_warn_fut); + tokio::select! 
{ + result = &mut upload_fut => result, + () = &mut stall_warn_fut => unreachable!(), + } + }) + .await + .map_err(|_| { + make_err!( + Code::DeadlineExceeded, + "Upload results timed out after {}s for operation {:?}", + upload_timeout.as_secs(), + operation_id, + ) + })?; + if let Err(ref e) = res { + warn!(?operation_id, ?e, "Error during upload_results"); + } + res } async fn cleanup(self: Arc) -> Result, Error> { - self.metrics() - .clone() - .cleanup - .wrap(async move { + let metrics = self.metrics().clone(); + let res = metrics + .wrap_cleanup(async move { let result = do_cleanup( &self.running_actions_manager, &self.operation_id, @@ -1390,14 +1583,17 @@ impl RunningAction for RunningActionImpl { self.did_cleanup.store(true, Ordering::Release); result.map(move |()| self) }) - .await + .await; + if let Err(ref e) = res { + warn!(?e, "Error during cleanup"); + } + res } async fn get_finished_result(self: Arc) -> Result { - self.metrics() - .clone() - .get_finished_result - .wrap(Self::inner_get_finished_result(self)) + let metrics = self.metrics().clone(); + metrics + .wrap_get_finished_result(Self::inner_get_finished_result(self)) .await } @@ -1721,7 +1917,9 @@ pub struct RunningActionsManagerArgs<'a> { pub historical_store: Store, pub upload_action_result_config: &'a UploadActionResultConfig, pub max_action_timeout: Duration, + pub max_upload_timeout: Duration, pub timeout_handled_externally: bool, + pub directory_cache: Option>, } struct CleanupGuard { @@ -1750,6 +1948,7 @@ pub struct RunningActionsManagerImpl { filesystem_store: Arc, upload_action_results: UploadActionResults, max_action_timeout: Duration, + max_upload_timeout: Duration, timeout_handled_externally: bool, running_actions: Mutex>>, // Note: We don't use Notify because we need to support a .wait_for()-like function, which @@ -1765,6 +1964,9 @@ pub struct RunningActionsManagerImpl { /// Notify waiters when a cleanup operation completes. 
This is used in conjunction with /// `cleaning_up_operations` to coordinate directory cleanup and creation. cleanup_complete_notify: Arc, + /// Optional directory cache for improving performance by caching reconstructed + /// input directories and using hardlinks. + directory_cache: Option>, } impl RunningActionsManagerImpl { @@ -1800,6 +2002,7 @@ impl RunningActionsManagerImpl { ) .err_tip(|| "During RunningActionsManagerImpl construction")?, max_action_timeout: args.max_action_timeout, + max_upload_timeout: args.max_upload_timeout, timeout_handled_externally: args.timeout_handled_externally, running_actions: Mutex::new(HashMap::new()), action_done_tx, @@ -1807,6 +2010,7 @@ impl RunningActionsManagerImpl { metrics: Arc::new(Metrics::default()), cleaning_up_operations: Mutex::new(HashSet::new()), cleanup_complete_notify: Arc::new(Notify::new()), + directory_cache: args.directory_cache, }) } @@ -1868,7 +2072,7 @@ impl RunningActionsManagerImpl { operation_id, dir_path.display() ); - self.metrics.stale_removals.inc(); + self.metrics.inc_stale_removals(); // Try to remove the directory, with one retry on failure let remove_result = fs::remove_dir_all(&dir_path).await; @@ -1888,7 +2092,7 @@ impl RunningActionsManagerImpl { } if start.elapsed() > Self::MAX_WAIT { - self.metrics.cleanup_wait_timeouts.inc(); + self.metrics.inc_cleanup_wait_timeouts(); return Err(make_err!( Code::DeadlineExceeded, "Timeout waiting for previous operation cleanup: {} (waited {:?})", @@ -1898,7 +2102,7 @@ impl RunningActionsManagerImpl { } if !has_waited { - self.metrics.cleanup_waits.inc(); + self.metrics.inc_cleanup_waits(); has_waited = true; } @@ -1923,7 +2127,7 @@ impl RunningActionsManagerImpl { &'a self, operation_id: &'a OperationId, ) -> impl Future> + 'a { - self.metrics.make_action_directory.wrap(async move { + self.metrics.wrap_make_action_directory(async move { let action_directory = format!("{}/{}", self.root_action_directory, operation_id); fs::create_dir(&action_directory) 
.await @@ -1937,7 +2141,7 @@ impl RunningActionsManagerImpl { start_execute: StartExecute, queued_timestamp: SystemTime, ) -> impl Future> + '_ { - self.metrics.create_action_info.wrap(async move { + self.metrics.wrap_create_action_info(async move { let execute_request = start_execute .execute_request .err_tip(|| "Expected execute_request to exist in StartExecute")?; @@ -1965,7 +2169,7 @@ impl RunningActionsManagerImpl { fn cleanup_action(&self, operation_id: &OperationId) -> Result<(), Error> { let mut running_actions = self.running_actions.lock(); let result = running_actions.remove(operation_id).err_tip(|| { - format!("Expected action id '{operation_id:?}' to exist in RunningActionsManagerImpl") + format!("Expected operation id '{operation_id}' to exist in RunningActionsManagerImpl") }); // No need to copy anything, we just are telling the receivers an event happened. self.action_done_tx.send_modify(|()| {}); @@ -2013,14 +2217,12 @@ impl RunningActionsManager for RunningActionsManagerImpl { start_execute: StartExecute, ) -> Result, Error> { self.metrics - .create_and_add_action - .wrap(async move { + .wrap_create_and_add_action(async move { let queued_timestamp = start_execute .queued_timestamp .and_then(|time| time.try_into().ok()) .unwrap_or(SystemTime::UNIX_EPOCH); - let operation_id = start_execute - .operation_id.as_str().into(); + let operation_id = start_execute.operation_id.as_str().into(); let action_info = self.create_action_info(start_execute, queued_timestamp).await?; debug!( ?action_info, @@ -2088,8 +2290,7 @@ impl RunningActionsManager for RunningActionsManagerImpl { hasher: DigestHasherFunc, ) -> Result<(), Error> { self.metrics - .cache_action_result - .wrap(self.upload_action_results.cache_action_result( + .wrap_cache_action_result(self.upload_action_results.cache_action_result( action_info, action_result, hasher, @@ -2112,8 +2313,7 @@ impl RunningActionsManager for RunningActionsManagerImpl { // Note: When the future returns the process should 
be fully killed and cleaned up. async fn kill_all(&self) { self.metrics - .kill_all - .wrap_no_capture_result(async move { + .wrap_kill_all(async move { let kill_operations: Vec> = { let running_actions = self.running_actions.lock(); running_actions @@ -2145,52 +2345,353 @@ impl RunningActionsManager for RunningActionsManagerImpl { } } -#[derive(Debug, Default, MetricsComponent)] +/// Instance-based metrics wrapper that provides helper methods +/// and reports to global OpenTelemetry metrics. +#[derive(Debug, Default, Clone)] pub struct Metrics { - #[metric(help = "Stats about the create_and_add_action command.")] - create_and_add_action: AsyncCounterWrapper, - #[metric(help = "Stats about the cache_action_result command.")] - cache_action_result: AsyncCounterWrapper, - #[metric(help = "Stats about the kill_all command.")] - kill_all: AsyncCounterWrapper, - #[metric(help = "Stats about the create_action_info command.")] - create_action_info: AsyncCounterWrapper, - #[metric(help = "Stats about the make_work_directory command.")] - make_action_directory: AsyncCounterWrapper, - #[metric(help = "Stats about the prepare_action command.")] - prepare_action: AsyncCounterWrapper, - #[metric(help = "Stats about the execute command.")] - execute: AsyncCounterWrapper, - #[metric(help = "Stats about the upload_results command.")] - upload_results: AsyncCounterWrapper, - #[metric(help = "Stats about the cleanup command.")] - cleanup: AsyncCounterWrapper, - #[metric(help = "Stats about the get_finished_result command.")] - get_finished_result: AsyncCounterWrapper, - #[metric(help = "Number of times an action waited for cleanup to complete.")] - cleanup_waits: CounterWithTime, - #[metric(help = "Number of stale directories removed during action retries.")] - stale_removals: CounterWithTime, - #[metric(help = "Number of timeouts while waiting for cleanup to complete.")] - cleanup_wait_timeouts: CounterWithTime, - #[metric(help = "Stats about the get_proto_command_from_store 
command.")] - get_proto_command_from_store: AsyncCounterWrapper, - #[metric(help = "Stats about the download_to_directory command.")] - download_to_directory: AsyncCounterWrapper, - #[metric(help = "Stats about the prepare_output_files command.")] - prepare_output_files: AsyncCounterWrapper, - #[metric(help = "Stats about the prepare_output_paths command.")] - prepare_output_paths: AsyncCounterWrapper, - #[metric(help = "Stats about the child_process command.")] - child_process: AsyncCounterWrapper, - #[metric(help = "Stats about the child_process_success_error_code command.")] - child_process_success_error_code: CounterWithTime, - #[metric(help = "Stats about the child_process_failure_error_code command.")] - child_process_failure_error_code: CounterWithTime, - #[metric(help = "Total time spent uploading stdout.")] - upload_stdout: AsyncCounterWrapper, - #[metric(help = "Total time spent uploading stderr.")] - upload_stderr: AsyncCounterWrapper, - #[metric(help = "Total number of task timeouts.")] - task_timeouts: CounterWithTime, + attrs: Vec, +} + +/// Timer for measuring async operation duration. +#[derive(Debug)] +pub struct MetricsTimer { + start: Instant, + duration_histogram: metrics::Histogram, + success_counter: Option>, + attrs: Vec, +} + +impl MetricsTimer { + /// Create a new timer that tracks both duration and success. + fn new_with_success( + duration_histogram: metrics::Histogram, + success_counter: metrics::Counter, + attrs: Vec, + ) -> Self { + Self { + start: Instant::now(), + duration_histogram, + success_counter: Some(success_counter), + attrs, + } + } + + /// Measure the elapsed time and record metrics. + pub fn measure(self) { + let duration_ms = self.start.elapsed().as_secs_f64() * 1000.0; + self.duration_histogram.record(duration_ms, &self.attrs); + if let Some(success_counter) = self.success_counter { + success_counter.add(1, &self.attrs); + } + } +} + +impl Metrics { + /// Create a new Metrics instance with optional attributes. 
+ pub fn new() -> Self { + Self { attrs: Vec::new() } + } + + /// Helper to wrap an async operation and track metrics. + async fn wrap_async>>( + &self, + calls_counter: &metrics::Counter, + successes_counter: &metrics::Counter, + failures_counter: &metrics::Counter, + duration_histogram: &metrics::Histogram, + future: F, + ) -> Result { + calls_counter.add(1, &self.attrs); + let start = Instant::now(); + let result = future.await; + let duration_ms = start.elapsed().as_secs_f64() * 1000.0; + duration_histogram.record(duration_ms, &self.attrs); + + if result.is_ok() { + successes_counter.add(1, &self.attrs); + } else { + failures_counter.add(1, &self.attrs); + } + result + } + + /// Helper to wrap an async operation that doesn't return a Result. + async fn wrap_async_no_result>( + &self, + calls_counter: &metrics::Counter, + duration_histogram: &metrics::Histogram, + future: F, + ) -> T { + calls_counter.add(1, &self.attrs); + let start = Instant::now(); + let result = future.await; + let duration_ms = start.elapsed().as_secs_f64() * 1000.0; + duration_histogram.record(duration_ms, &self.attrs); + result + } + + // Wrapper methods for each operation + + pub async fn wrap_create_and_add_action>>( + &self, + future: F, + ) -> Result { + self.wrap_async( + &RUNNING_ACTIONS_METRICS.create_and_add_action_calls, + &RUNNING_ACTIONS_METRICS.create_and_add_action_successes, + &RUNNING_ACTIONS_METRICS.create_and_add_action_failures, + &RUNNING_ACTIONS_METRICS.create_and_add_action_duration, + future, + ) + .await + } + + pub async fn wrap_cache_action_result>>( + &self, + future: F, + ) -> Result { + self.wrap_async( + &RUNNING_ACTIONS_METRICS.cache_action_result_calls, + &RUNNING_ACTIONS_METRICS.cache_action_result_successes, + &RUNNING_ACTIONS_METRICS.cache_action_result_failures, + &RUNNING_ACTIONS_METRICS.cache_action_result_duration, + future, + ) + .await + } + + pub async fn wrap_kill_all>(&self, future: F) -> T { + self.wrap_async_no_result( + 
&RUNNING_ACTIONS_METRICS.kill_all_calls, + &RUNNING_ACTIONS_METRICS.kill_all_duration, + future, + ) + .await + } + + pub async fn wrap_create_action_info>>( + &self, + future: F, + ) -> Result { + self.wrap_async( + &RUNNING_ACTIONS_METRICS.create_action_info_calls, + &RUNNING_ACTIONS_METRICS.create_action_info_successes, + &RUNNING_ACTIONS_METRICS.create_action_info_failures, + &RUNNING_ACTIONS_METRICS.create_action_info_duration, + future, + ) + .await + } + + pub async fn wrap_make_action_directory>>( + &self, + future: F, + ) -> Result { + self.wrap_async( + &RUNNING_ACTIONS_METRICS.make_action_directory_calls, + &RUNNING_ACTIONS_METRICS.make_action_directory_successes, + &RUNNING_ACTIONS_METRICS.make_action_directory_failures, + &RUNNING_ACTIONS_METRICS.make_action_directory_duration, + future, + ) + .await + } + + pub async fn wrap_prepare_action>>( + &self, + future: F, + ) -> Result { + self.wrap_async( + &RUNNING_ACTIONS_METRICS.prepare_action_calls, + &RUNNING_ACTIONS_METRICS.prepare_action_successes, + &RUNNING_ACTIONS_METRICS.prepare_action_failures, + &RUNNING_ACTIONS_METRICS.prepare_action_duration, + future, + ) + .await + } + + pub async fn wrap_execute>>( + &self, + future: F, + ) -> Result { + self.wrap_async( + &RUNNING_ACTIONS_METRICS.execute_calls, + &RUNNING_ACTIONS_METRICS.execute_successes, + &RUNNING_ACTIONS_METRICS.execute_failures, + &RUNNING_ACTIONS_METRICS.execute_duration, + future, + ) + .await + } + + pub async fn wrap_upload_results>>( + &self, + future: F, + ) -> Result { + self.wrap_async( + &RUNNING_ACTIONS_METRICS.upload_results_calls, + &RUNNING_ACTIONS_METRICS.upload_results_successes, + &RUNNING_ACTIONS_METRICS.upload_results_failures, + &RUNNING_ACTIONS_METRICS.upload_results_duration, + future, + ) + .await + } + + pub async fn wrap_cleanup>>( + &self, + future: F, + ) -> Result { + self.wrap_async( + &RUNNING_ACTIONS_METRICS.cleanup_calls, + &RUNNING_ACTIONS_METRICS.cleanup_successes, + 
&RUNNING_ACTIONS_METRICS.cleanup_failures, + &RUNNING_ACTIONS_METRICS.cleanup_duration, + future, + ) + .await + } + + pub async fn wrap_get_finished_result>>( + &self, + future: F, + ) -> Result { + self.wrap_async( + &RUNNING_ACTIONS_METRICS.get_finished_result_calls, + &RUNNING_ACTIONS_METRICS.get_finished_result_successes, + &RUNNING_ACTIONS_METRICS.get_finished_result_failures, + &RUNNING_ACTIONS_METRICS.get_finished_result_duration, + future, + ) + .await + } + + pub async fn wrap_get_proto_command_from_store>>( + &self, + future: F, + ) -> Result { + self.wrap_async( + &RUNNING_ACTIONS_METRICS.get_proto_command_from_store_calls, + &RUNNING_ACTIONS_METRICS.get_proto_command_from_store_successes, + &RUNNING_ACTIONS_METRICS.get_proto_command_from_store_failures, + &RUNNING_ACTIONS_METRICS.get_proto_command_from_store_duration, + future, + ) + .await + } + + pub async fn wrap_download_to_directory>>( + &self, + future: F, + ) -> Result { + self.wrap_async( + &RUNNING_ACTIONS_METRICS.download_to_directory_calls, + &RUNNING_ACTIONS_METRICS.download_to_directory_successes, + &RUNNING_ACTIONS_METRICS.download_to_directory_failures, + &RUNNING_ACTIONS_METRICS.download_to_directory_duration, + future, + ) + .await + } + + pub async fn wrap_prepare_output_files>>( + &self, + future: F, + ) -> Result { + self.wrap_async( + &RUNNING_ACTIONS_METRICS.prepare_output_files_calls, + &RUNNING_ACTIONS_METRICS.prepare_output_files_successes, + &RUNNING_ACTIONS_METRICS.prepare_output_files_failures, + &RUNNING_ACTIONS_METRICS.prepare_output_files_duration, + future, + ) + .await + } + + pub async fn wrap_prepare_output_paths>>( + &self, + future: F, + ) -> Result { + self.wrap_async( + &RUNNING_ACTIONS_METRICS.prepare_output_paths_calls, + &RUNNING_ACTIONS_METRICS.prepare_output_paths_successes, + &RUNNING_ACTIONS_METRICS.prepare_output_paths_failures, + &RUNNING_ACTIONS_METRICS.prepare_output_paths_duration, + future, + ) + .await + } + + pub async fn wrap_upload_stdout>>( + 
&self, + future: F, + ) -> Result { + self.wrap_async( + &RUNNING_ACTIONS_METRICS.upload_stdout_calls, + &RUNNING_ACTIONS_METRICS.upload_stdout_successes, + &RUNNING_ACTIONS_METRICS.upload_stdout_failures, + &RUNNING_ACTIONS_METRICS.upload_stdout_duration, + future, + ) + .await + } + + pub async fn wrap_upload_stderr>>( + &self, + future: F, + ) -> Result { + self.wrap_async( + &RUNNING_ACTIONS_METRICS.upload_stderr_calls, + &RUNNING_ACTIONS_METRICS.upload_stderr_successes, + &RUNNING_ACTIONS_METRICS.upload_stderr_failures, + &RUNNING_ACTIONS_METRICS.upload_stderr_duration, + future, + ) + .await + } + + /// Begin timing a child process execution. + pub fn begin_child_process_timer(&self) -> MetricsTimer { + RUNNING_ACTIONS_METRICS + .child_process_calls + .add(1, &self.attrs); + MetricsTimer::new_with_success( + RUNNING_ACTIONS_METRICS.child_process_duration.clone(), + RUNNING_ACTIONS_METRICS.child_process_successes.clone(), + self.attrs.clone(), + ) + } + + // Simple counter increments + + pub fn inc_cleanup_waits(&self) { + RUNNING_ACTIONS_METRICS.cleanup_waits.add(1, &self.attrs); + } + + pub fn inc_stale_removals(&self) { + RUNNING_ACTIONS_METRICS.stale_removals.add(1, &self.attrs); + } + + pub fn inc_cleanup_wait_timeouts(&self) { + RUNNING_ACTIONS_METRICS + .cleanup_wait_timeouts + .add(1, &self.attrs); + } + + pub fn inc_child_process_success_error_code(&self) { + RUNNING_ACTIONS_METRICS + .child_process_success_error_code + .add(1, &self.attrs); + } + + pub fn inc_child_process_failure_error_code(&self) { + RUNNING_ACTIONS_METRICS + .child_process_failure_error_code + .add(1, &self.attrs); + } + + pub fn inc_task_timeouts(&self) { + RUNNING_ACTIONS_METRICS.task_timeouts.add(1, &self.attrs); + } } diff --git a/nativelink-worker/src/worker_api_client_wrapper.rs b/nativelink-worker/src/worker_api_client_wrapper.rs index 8911d7fee..1e2791fc0 100644 --- a/nativelink-worker/src/worker_api_client_wrapper.rs +++ 
b/nativelink-worker/src/worker_api_client_wrapper.rs @@ -14,13 +14,17 @@ use core::future::Future; +use futures::stream::unfold; +use nativelink_error::{make_err, Error, ResultExt}; +use nativelink_proto::com::github::trace_machina::nativelink::remote_execution::update_for_scheduler::Update; use nativelink_proto::com::github::trace_machina::nativelink::remote_execution::worker_api_client::WorkerApiClient; use nativelink_proto::com::github::trace_machina::nativelink::remote_execution::{ - ConnectWorkerRequest, ExecuteResult, GoingAwayRequest, KeepAliveRequest, UpdateForWorker, + ConnectWorkerRequest, ExecuteComplete, ExecuteResult, GoingAwayRequest, KeepAliveRequest, UpdateForScheduler, UpdateForWorker }; +use tokio::sync::mpsc::Sender; use tonic::codec::Streaming; use tonic::transport::Channel; -use tonic::{Response, Status}; +use tonic::{Code, Response, Status}; /// This is used in order to allow unit tests to intercept these calls. This should always match /// the API of `WorkerApiClient` defined in the `worker_api.proto` file. 
@@ -33,27 +37,56 @@ pub trait WorkerApiClientTrait: Clone + Sync + Send + Sized + Unpin { fn keep_alive( &mut self, request: KeepAliveRequest, - ) -> impl Future, Status>> + Send; + ) -> impl Future> + Send; fn going_away( &mut self, request: GoingAwayRequest, - ) -> impl Future, Status>> + Send; + ) -> impl Future> + Send; fn execution_response( &mut self, request: ExecuteResult, - ) -> impl Future, Status>> + Send; + ) -> impl Future> + Send; + + fn execution_complete( + &mut self, + request: ExecuteComplete, + ) -> impl Future> + Send; } #[derive(Debug, Clone)] pub struct WorkerApiClientWrapper { inner: WorkerApiClient, + channel: Option>, +} + +impl WorkerApiClientWrapper { + async fn send_update(&mut self, update: Update) -> Result<(), Error> { + let tx = self + .channel + .as_ref() + .err_tip(|| "worker update without connect_worker")?; + match tx.send(update).await { + Ok(()) => Ok(()), + Err(_err) => { + // Remove the sender if it's not going anywhere. + self.channel.take(); + Err(make_err!( + Code::Unavailable, + "worker update with disconnected channel" + )) + } + } + } } impl From> for WorkerApiClientWrapper { fn from(other: WorkerApiClient) -> Self { - Self { inner: other } + Self { + inner: other, + channel: None, + } } } @@ -62,18 +95,42 @@ impl WorkerApiClientTrait for WorkerApiClientWrapper { &mut self, request: ConnectWorkerRequest, ) -> Result>, Status> { - self.inner.connect_worker(request).await + drop(self.channel.take()); + let (tx, rx) = tokio::sync::mpsc::channel(1); + if tx + .send(Update::ConnectWorkerRequest(request)) + .await + .is_err() + { + return Err(Status::data_loss("Unable to push to newly created channel")); + } + self.channel = Some(tx); + self.inner + .connect_worker(unfold(rx, |mut rx| async move { + let update = rx.recv().await?; + Some(( + UpdateForScheduler { + update: Some(update), + }, + rx, + )) + })) + .await + } + + async fn keep_alive(&mut self, request: KeepAliveRequest) -> Result<(), Error> { + 
self.send_update(Update::KeepAliveRequest(request)).await } - async fn keep_alive(&mut self, request: KeepAliveRequest) -> Result, Status> { - self.inner.keep_alive(request).await + async fn going_away(&mut self, request: GoingAwayRequest) -> Result<(), Error> { + self.send_update(Update::GoingAwayRequest(request)).await } - async fn going_away(&mut self, request: GoingAwayRequest) -> Result, Status> { - self.inner.going_away(request).await + async fn execution_response(&mut self, request: ExecuteResult) -> Result<(), Error> { + self.send_update(Update::ExecuteResult(request)).await } - async fn execution_response(&mut self, request: ExecuteResult) -> Result, Status> { - self.inner.execution_response(request).await + async fn execution_complete(&mut self, request: ExecuteComplete) -> Result<(), Error> { + self.send_update(Update::ExecuteComplete(request)).await } } diff --git a/nativelink-worker/src/worker_utils.rs b/nativelink-worker/src/worker_utils.rs index 8f9a95680..69659d344 100644 --- a/nativelink-worker/src/worker_utils.rs +++ b/nativelink-worker/src/worker_utils.rs @@ -30,6 +30,8 @@ use tracing::info; pub async fn make_connect_worker_request( worker_id_prefix: String, worker_properties: &HashMap, + extra_envs: &HashMap, + max_inflight_tasks: u64, ) -> Result { let mut futures = vec![]; for (property_name, worker_property) in worker_properties { @@ -59,6 +61,7 @@ pub async fn make_connect_worker_request( }; let mut process = process::Command::new(command); process.env_clear(); + process.envs(extra_envs); process.args(args); process.stdin(Stdio::null()); let err_fn = @@ -102,5 +105,6 @@ pub async fn make_connect_worker_request( Ok(ConnectWorkerRequest { worker_id_prefix, properties: try_join_all(futures).await?.into_iter().flatten().collect(), + max_inflight_tasks, }) } diff --git a/nativelink-worker/tests/local_worker_test.rs b/nativelink-worker/tests/local_worker_test.rs index b0240385f..5e5c93e4b 100644 --- a/nativelink-worker/tests/local_worker_test.rs 
+++ b/nativelink-worker/tests/local_worker_test.rs @@ -29,8 +29,12 @@ mod utils { } use hyper::body::Frame; -use nativelink_config::cas_server::{LocalWorkerConfig, WorkerProperty}; -use nativelink_config::stores::{FastSlowSpec, FilesystemSpec, MemorySpec, StoreSpec}; +use nativelink_config::cas_server::{ + ExecutionCompletionBehaviour, LocalWorkerConfig, WorkerProperty, +}; +use nativelink_config::stores::{ + FastSlowSpec, FilesystemSpec, MemorySpec, StoreDirection, StoreSpec, +}; use nativelink_error::{Code, Error, make_err, make_input_err}; use nativelink_macro::nativelink_test; use nativelink_proto::build::bazel::remote::execution::v2::Platform; @@ -51,11 +55,12 @@ use nativelink_util::common::{DigestInfo, encode_stream_proto, fs}; use nativelink_util::digest_hasher::DigestHasherFunc; use nativelink_util::store_trait::Store; use nativelink_worker::local_worker::new_local_worker; +#[cfg(target_family = "unix")] +use nativelink_worker::local_worker::preconditions_met; use pretty_assertions::assert_eq; use prost::Message; use rand::Rng; use tokio::io::AsyncWriteExt; -use tonic::Response; use utils::local_worker_test_utils::{ setup_grpc_stream, setup_local_worker, setup_local_worker_with_config, }; @@ -123,7 +128,8 @@ async fn platform_properties_smoke_test() -> Result<(), Error> { name: "foo".to_string(), value: "bar2".to_string(), } - ] + ], + max_inflight_tasks: 0, } ); @@ -403,16 +409,12 @@ async fn simple_worker_start_action_test() -> Result<(), Error> { assert_eq!(digest_hasher, DigestHasherFunc::Sha256); // Now our client should be notified that our runner finished. - let execution_response = test_context - .client - .expect_execution_response(Ok(Response::new(()))) - .await; + let execution_response = test_context.client.expect_execution_response(Ok(())).await; // Now ensure the final results match our expectations. 
assert_eq!( execution_response, ExecuteResult { - worker_id: expected_worker_id, instance_name: INSTANCE_NAME.to_string(), operation_id: String::new(), result: Some(execute_result::Result::ExecuteResponse( @@ -424,6 +426,122 @@ async fn simple_worker_start_action_test() -> Result<(), Error> { Ok(()) } +#[nativelink_test] +async fn one_shot_shutdowns_worker_test() -> Result<(), Error> { + let config = LocalWorkerConfig { + execution_completion_behaviour: ExecutionCompletionBehaviour::OneShotAlways, + ..Default::default() + }; + let mut test_context = setup_local_worker_with_config(config).await; + let streaming_response = test_context.maybe_streaming_response.take().unwrap(); + + { + let props = test_context + .client + .expect_connect_worker(Ok(streaming_response)) + .await; + assert_eq!(props, ConnectWorkerRequest::default()); + } + + let expected_worker_id = "foobar".to_string(); + + let tx_stream = test_context.maybe_tx_stream.take().unwrap(); + { + // First initialize our worker by sending the response to the connection request. + tx_stream + .send(Frame::data( + encode_stream_proto(&UpdateForWorker { + update: Some(Update::ConnectionResult(ConnectionResult { + worker_id: expected_worker_id.clone(), + })), + }) + .unwrap(), + )) + .await + .map_err(|e| make_input_err!("Could not send : {:?}", e))?; + } + + let action_digest = DigestInfo::new([3u8; 32], 10); + let action_info = ActionInfo { + command_digest: DigestInfo::new([1u8; 32], 10), + input_root_digest: DigestInfo::new([2u8; 32], 10), + timeout: Duration::from_secs(1), + platform_properties: HashMap::new(), + priority: 0, + load_timestamp: SystemTime::UNIX_EPOCH, + insert_timestamp: SystemTime::UNIX_EPOCH, + unique_qualifier: ActionUniqueQualifier::Uncacheable(ActionUniqueKey { + instance_name: INSTANCE_NAME.to_string(), + digest_function: DigestHasherFunc::Sha256, + digest: action_digest, + }), + }; + + { + // Send execution request. 
+ tx_stream + .send(Frame::data( + encode_stream_proto(&UpdateForWorker { + update: Some(Update::StartAction(StartExecute { + execute_request: Some((&action_info).into()), + operation_id: String::new(), + queued_timestamp: None, + platform: Some(Platform::default()), + worker_id: expected_worker_id.clone(), + })), + }) + .unwrap(), + )) + .await + .map_err(|e| make_input_err!("Could not send : {:?}", e))?; + } + + let running_action = Arc::new(MockRunningAction::new()); + + let action_result = ActionResult { + output_files: vec![], + output_folders: vec![], + output_file_symlinks: vec![], + output_directory_symlinks: vec![], + exit_code: 5, + stdout_digest: DigestInfo::new([21u8; 32], 10), + stderr_digest: DigestInfo::new([22u8; 32], 10), + execution_metadata: ExecutionMetadata { + worker: expected_worker_id.clone(), + queued_timestamp: SystemTime::UNIX_EPOCH, + worker_start_timestamp: SystemTime::UNIX_EPOCH, + worker_completed_timestamp: SystemTime::UNIX_EPOCH, + input_fetch_start_timestamp: SystemTime::UNIX_EPOCH, + input_fetch_completed_timestamp: SystemTime::UNIX_EPOCH, + execution_start_timestamp: SystemTime::UNIX_EPOCH, + execution_completed_timestamp: SystemTime::UNIX_EPOCH, + output_upload_start_timestamp: SystemTime::UNIX_EPOCH, + output_upload_completed_timestamp: SystemTime::UNIX_EPOCH, + }, + server_logs: HashMap::new(), + error: None, + message: String::new(), + }; + + // Send and wait for response from create_and_add_action to RunningActionsManager. + test_context + .actions_manager + .expect_create_and_add_action(Ok(running_action.clone())) + .await; + + // Now the RunningAction needs to send a series of state updates. This shortcuts them + // into a single call (shortcut for prepare, execute, upload, collect_results, cleanup). 
+ running_action + .simple_expect_get_finished_result(Ok(action_result.clone())) + .await?; + + test_context.client.expect_execution_response(Ok(())).await; + + test_context.client.expect_going_away(Ok(())).await; + + Ok(()) +} + #[nativelink_test] async fn new_local_worker_creates_work_directory_test() -> Result<(), Error> { let cas_store = Store::new(FastSlowStore::new( @@ -431,6 +549,8 @@ async fn new_local_worker_creates_work_directory_test() -> Result<(), Error> { // Note: These are not needed for this test, so we put dummy memory stores here. fast: StoreSpec::Memory(MemorySpec::default()), slow: StoreSpec::Memory(MemorySpec::default()), + fast_direction: StoreDirection::default(), + slow_direction: StoreDirection::default(), }, Store::new( ::new(&FilesystemSpec { @@ -470,6 +590,8 @@ async fn new_local_worker_removes_work_directory_before_start_test() -> Result<( // Note: These are not needed for this test, so we put dummy memory stores here. fast: StoreSpec::Memory(MemorySpec::default()), slow: StoreSpec::Memory(MemorySpec::default()), + fast_direction: StoreDirection::default(), + slow_direction: StoreDirection::default(), }, Store::new( ::new(&FilesystemSpec { @@ -632,16 +754,12 @@ async fn experimental_precondition_script_fails() -> Result<(), Error> { } // Now our client should be notified that our runner finished. - let execution_response = test_context - .client - .expect_execution_response(Ok(Response::new(()))) - .await; + let execution_response = test_context.client.expect_execution_response(Ok(())).await; // Now ensure the final results match our expectations. 
assert_eq!( execution_response, ExecuteResult { - worker_id: expected_worker_id, instance_name: INSTANCE_NAME.to_string(), operation_id: String::new(), result: Some(execute_result::Result::InternalError( @@ -751,3 +869,226 @@ async fn kill_action_request_kills_action() -> Result<(), Error> { Ok(()) } + +#[nativelink_test] +async fn cas_not_found_returns_failed_precondition_test() -> Result<(), Error> { + let mut test_context = setup_local_worker(HashMap::new()).await; + let streaming_response = test_context.maybe_streaming_response.take().unwrap(); + + { + let props = test_context + .client + .expect_connect_worker(Ok(streaming_response)) + .await; + assert_eq!(props, ConnectWorkerRequest::default()); + } + + let expected_worker_id = "foobar".to_string(); + + let tx_stream = test_context.maybe_tx_stream.take().unwrap(); + { + tx_stream + .send(Frame::data( + encode_stream_proto(&UpdateForWorker { + update: Some(Update::ConnectionResult(ConnectionResult { + worker_id: expected_worker_id.clone(), + })), + }) + .unwrap(), + )) + .await + .map_err(|e| make_input_err!("Could not send : {:?}", e))?; + } + + let action_digest = DigestInfo::new([3u8; 32], 10); + let action_info = ActionInfo { + command_digest: DigestInfo::new([1u8; 32], 10), + input_root_digest: DigestInfo::new([2u8; 32], 10), + timeout: Duration::from_secs(1), + platform_properties: HashMap::new(), + priority: 0, + load_timestamp: SystemTime::UNIX_EPOCH, + insert_timestamp: SystemTime::UNIX_EPOCH, + unique_qualifier: ActionUniqueQualifier::Uncacheable(ActionUniqueKey { + instance_name: INSTANCE_NAME.to_string(), + digest_function: DigestHasherFunc::Sha256, + digest: action_digest, + }), + }; + + { + tx_stream + .send(Frame::data( + encode_stream_proto(&UpdateForWorker { + update: Some(Update::StartAction(StartExecute { + execute_request: Some((&action_info).into()), + operation_id: String::new(), + queued_timestamp: None, + platform: Some(Platform::default()), + worker_id: expected_worker_id.clone(), + 
})), + }) + .unwrap(), + )) + .await + .map_err(|e| make_input_err!("Could not send : {:?}", e))?; + } + + let running_action = Arc::new(MockRunningAction::new()); + + // Send and wait for response from create_and_add_action. + test_context + .actions_manager + .expect_create_and_add_action(Ok(running_action.clone())) + .await; + + // Simulate prepare_action failing with a CAS NotFound error containing the + // specific "not found in either fast or slow store" message. This is the exact + // condition that the code checks to decide whether to return FailedPrecondition. + running_action + .expect_prepare_action(Err(make_err!( + Code::NotFound, + "Hash 0123456789abcdef not found in either fast or slow store" + ))) + .await?; + + // Cleanup is still called even when prepare_action fails. + running_action.cleanup(Ok(())).await?; + + // The worker should respond with FailedPrecondition wrapped in an ExecuteResponse, + // NOT an InternalError. This allows Bazel to re-upload the missing artifacts. 
+ let execution_response = test_context.client.expect_execution_response(Ok(())).await; + + let expected_action_result = ActionResult { + error: Some(make_err!( + Code::FailedPrecondition, + "Hash 0123456789abcdef not found in either fast or slow store" + )), + ..ActionResult::default() + }; + assert_eq!( + execution_response, + ExecuteResult { + instance_name: INSTANCE_NAME.to_string(), + operation_id: String::new(), + result: Some(execute_result::Result::ExecuteResponse( + ActionStage::Completed(expected_action_result).into() + )), + } + ); + + Ok(()) +} + +#[nativelink_test] +async fn non_cas_not_found_returns_internal_error_test() -> Result<(), Error> { + let mut test_context = setup_local_worker(HashMap::new()).await; + let streaming_response = test_context.maybe_streaming_response.take().unwrap(); + + { + let props = test_context + .client + .expect_connect_worker(Ok(streaming_response)) + .await; + assert_eq!(props, ConnectWorkerRequest::default()); + } + + let expected_worker_id = "foobar".to_string(); + + let tx_stream = test_context.maybe_tx_stream.take().unwrap(); + { + tx_stream + .send(Frame::data( + encode_stream_proto(&UpdateForWorker { + update: Some(Update::ConnectionResult(ConnectionResult { + worker_id: expected_worker_id.clone(), + })), + }) + .unwrap(), + )) + .await + .map_err(|e| make_input_err!("Could not send : {:?}", e))?; + } + + let action_digest = DigestInfo::new([3u8; 32], 10); + let action_info = ActionInfo { + command_digest: DigestInfo::new([1u8; 32], 10), + input_root_digest: DigestInfo::new([2u8; 32], 10), + timeout: Duration::from_secs(1), + platform_properties: HashMap::new(), + priority: 0, + load_timestamp: SystemTime::UNIX_EPOCH, + insert_timestamp: SystemTime::UNIX_EPOCH, + unique_qualifier: ActionUniqueQualifier::Uncacheable(ActionUniqueKey { + instance_name: INSTANCE_NAME.to_string(), + digest_function: DigestHasherFunc::Sha256, + digest: action_digest, + }), + }; + + { + tx_stream + .send(Frame::data( + 
encode_stream_proto(&UpdateForWorker { + update: Some(Update::StartAction(StartExecute { + execute_request: Some((&action_info).into()), + operation_id: String::new(), + queued_timestamp: None, + platform: Some(Platform::default()), + worker_id: expected_worker_id.clone(), + })), + }) + .unwrap(), + )) + .await + .map_err(|e| make_input_err!("Could not send : {:?}", e))?; + } + + let running_action = Arc::new(MockRunningAction::new()); + + test_context + .actions_manager + .expect_create_and_add_action(Ok(running_action.clone())) + .await; + + // Simulate prepare_action failing with a NotFound error that does NOT contain + // the CAS-specific message. This should result in an InternalError, not + // FailedPrecondition. + let other_not_found_error = make_err!(Code::NotFound, "Some other resource was not found"); + running_action + .expect_prepare_action(Err(other_not_found_error.clone())) + .await?; + + // Cleanup is still called even when prepare_action fails. + running_action.cleanup(Ok(())).await?; + + // The worker should respond with InternalError since this is not a CAS blob miss. 
+ let execution_response = test_context.client.expect_execution_response(Ok(())).await; + + assert_eq!( + execution_response, + ExecuteResult { + instance_name: INSTANCE_NAME.to_string(), + operation_id: String::new(), + result: Some(execute_result::Result::InternalError( + other_not_found_error.into() + )), + } + ); + + Ok(()) +} + +#[cfg(target_family = "unix")] +#[nativelink_test] +async fn preconditions_met_extra_envs() -> Result<(), Error> { + let mut extra_envs = HashMap::new(); + extra_envs.insert("DEMO_ENV".into(), "test_value_for_demo_env".into()); + + // So we have bash for nix cases, because the PATH gets reset + extra_envs.insert("PATH".into(), env::var("PATH").unwrap()); + + preconditions_met(Some("bash -c \"echo $DEMO_ENV\"".to_string()), &extra_envs).await?; + assert!(logs_contain("test_value_for_demo_env")); + Ok(()) +} diff --git a/nativelink-worker/tests/running_actions_manager_test.rs b/nativelink-worker/tests/running_actions_manager_test.rs index 93907b02e..0c630bc41 100644 --- a/nativelink-worker/tests/running_actions_manager_test.rs +++ b/nativelink-worker/tests/running_actions_manager_test.rs @@ -12,702 +12,901 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use core::str::from_utf8; -use core::sync::atomic::{AtomicBool, AtomicI64, AtomicU64, Ordering}; -#[cfg(target_family = "unix")] -use core::task::Poll; -use core::time::Duration; -use std::collections::HashMap; -use std::env; -use std::ffi::OsString; -use std::io::{Cursor, Write}; -#[cfg(target_family = "unix")] -use std::os::unix::fs::{MetadataExt, OpenOptionsExt}; -use std::sync::{Arc, LazyLock, Mutex}; -use std::time::{SystemTime, UNIX_EPOCH}; - -use futures::prelude::*; -use nativelink_config::cas_server::EnvironmentSource; -use nativelink_config::stores::{FastSlowSpec, FilesystemSpec, MemorySpec, StoreSpec}; -use nativelink_error::{Code, Error, ResultExt, make_input_err}; -use nativelink_macro::nativelink_test; -use nativelink_proto::build::bazel::remote::execution::v2::command::EnvironmentVariable; -#[cfg_attr(target_family = "windows", allow(unused_imports))] -use nativelink_proto::build::bazel::remote::execution::v2::{ - Action, ActionResult as ProtoActionResult, Command, Directory, DirectoryNode, ExecuteRequest, - ExecuteResponse, FileNode, NodeProperties, Platform, SymlinkNode, Tree, - digest_function::Value as ProtoDigestFunction, platform::Property, -}; -use nativelink_proto::com::github::trace_machina::nativelink::remote_execution::{ - HistoricalExecuteResponse, StartExecute, -}; -use nativelink_proto::google::rpc::Status; -use nativelink_store::ac_utils::{get_and_decode_digest, serialize_and_upload_message}; -use nativelink_store::fast_slow_store::FastSlowStore; -use nativelink_store::filesystem_store::FilesystemStore; -use nativelink_store::memory_store::MemoryStore; -#[cfg(target_family = "unix")] -use nativelink_util::action_messages::DirectoryInfo; -#[cfg_attr(target_family = "windows", allow(unused_imports))] -use nativelink_util::action_messages::SymlinkInfo; -use nativelink_util::action_messages::{ - ActionResult, ExecutionMetadata, FileInfo, NameOrPath, OperationId, -}; -use nativelink_util::common::{DigestInfo, fs}; -use 
nativelink_util::digest_hasher::{DigestHasher, DigestHasherFunc}; -use nativelink_util::store_trait::{Store, StoreLike}; -use nativelink_worker::running_actions_manager::{ - Callbacks, ExecutionConfiguration, RunningAction, RunningActionImpl, RunningActionsManager, - RunningActionsManagerArgs, RunningActionsManagerImpl, download_to_directory, -}; -use pretty_assertions::assert_eq; -use prost::Message; -use rand::Rng; use serial_test::serial; -use tokio::sync::oneshot; -/// Get temporary path from either `TEST_TMPDIR` or best effort temp directory if -/// not set. -fn make_temp_path(data: &str) -> String { +#[serial] +mod tests { + use core::str::from_utf8; + use core::sync::atomic::{AtomicBool, AtomicI64, AtomicU64, Ordering}; #[cfg(target_family = "unix")] - return format!( - "{}/{}/{}", - env::var("TEST_TMPDIR").unwrap_or_else(|_| env::temp_dir().to_str().unwrap().to_string()), - rand::rng().random::(), - data - ); - #[cfg(target_family = "windows")] - return format!( - "{}\\{}\\{}", - env::var("TEST_TMPDIR").unwrap_or_else(|_| env::temp_dir().to_str().unwrap().to_string()), - rand::rng().random::(), - data - ); -} - -async fn setup_stores() -> Result< - ( - Arc, - Arc, - Arc, - Arc, - ), - Error, -> { - let fast_config = FilesystemSpec { - content_path: make_temp_path("content_path"), - temp_path: make_temp_path("temp_path"), - eviction_policy: None, - ..Default::default() + use core::task::Poll; + use core::time::Duration; + use std::collections::HashMap; + use std::env; + use std::ffi::OsString; + use std::io::{Cursor, Write}; + #[cfg(target_family = "unix")] + use std::os::unix::fs::{MetadataExt, OpenOptionsExt}; + use std::sync::{Arc, LazyLock, Mutex}; + use std::time::{SystemTime, UNIX_EPOCH}; + + use bytes::Bytes; + use futures::prelude::*; + use nativelink_config::cas_server::EnvironmentSource; + use nativelink_config::stores::{ + FastSlowSpec, FilesystemSpec, MemorySpec, StoreDirection, StoreSpec, }; - let slow_config = MemorySpec::default(); - let 
fast_store = FilesystemStore::new(&fast_config).await?; - let slow_store = MemoryStore::new(&slow_config); - let ac_store = MemoryStore::new(&slow_config); - let cas_store = FastSlowStore::new( - &FastSlowSpec { - fast: StoreSpec::Filesystem(fast_config), - slow: StoreSpec::Memory(slow_config), - }, - Store::new(fast_store.clone()), - Store::new(slow_store.clone()), - ); - Ok((fast_store, slow_store, cas_store, ac_store)) -} - -async fn run_action(action: Arc) -> Result { - action - .clone() - .prepare_action() - .and_then(RunningAction::execute) - .and_then(RunningAction::upload_results) - .and_then(RunningAction::get_finished_result) - .then(|result| async move { - action.cleanup().await?; - result - }) - .await -} - -const NOW_TIME: u64 = 10000; - -fn make_system_time(add_time: u64) -> SystemTime { - UNIX_EPOCH - .checked_add(Duration::from_secs(NOW_TIME + add_time)) - .unwrap() -} - -fn monotonic_clock(counter: &AtomicU64) -> SystemTime { - let count = counter.fetch_add(1, Ordering::Relaxed); - make_system_time(count) -} + use nativelink_error::{Code, Error, ResultExt, make_input_err}; + use nativelink_macro::nativelink_test; + use nativelink_proto::build::bazel::remote::execution::v2::command::EnvironmentVariable; + #[cfg_attr(target_family = "windows", allow(unused_imports))] + use nativelink_proto::build::bazel::remote::execution::v2::{ + Action, ActionResult as ProtoActionResult, Command, Directory, DirectoryNode, + ExecuteRequest, ExecuteResponse, FileNode, NodeProperties, Platform, SymlinkNode, Tree, + digest_function::Value as ProtoDigestFunction, platform::Property, + }; + use nativelink_proto::com::github::trace_machina::nativelink::remote_execution::{ + HistoricalExecuteResponse, StartExecute, + }; + use nativelink_proto::google::rpc::Status; + use nativelink_store::ac_utils::{get_and_decode_digest, serialize_and_upload_message}; + use nativelink_store::fast_slow_store::FastSlowStore; + use nativelink_store::filesystem_store::FilesystemStore; + use 
nativelink_store::memory_store::MemoryStore; + #[cfg(target_family = "unix")] + use nativelink_util::action_messages::DirectoryInfo; + #[cfg_attr(target_family = "windows", allow(unused_imports))] + use nativelink_util::action_messages::SymlinkInfo; + use nativelink_util::action_messages::{ + ActionResult, ExecutionMetadata, FileInfo, NameOrPath, OperationId, + }; + use nativelink_util::common::{DigestInfo, fs}; + use nativelink_util::digest_hasher::{DigestHasher, DigestHasherFunc}; + use nativelink_util::store_trait::{Store, StoreLike}; + use nativelink_worker::running_actions_manager::{ + Callbacks, ExecutionConfiguration, RunningAction, RunningActionImpl, RunningActionsManager, + RunningActionsManagerArgs, RunningActionsManagerImpl, download_to_directory, + }; + use pretty_assertions::assert_eq; + use prost::Message; + use rand::Rng; + use tokio::sync::oneshot; -fn increment_clock(time: &mut SystemTime) -> SystemTime { - let previous_time = *time; - *time = previous_time.checked_add(Duration::from_secs(1)).unwrap(); - previous_time -} + const DEFAULT_MAX_UPLOAD_TIMEOUT: u64 = 600; -#[serial] -#[nativelink_test] -async fn download_to_directory_file_download_test() -> Result<(), Box> { - const FILE1_NAME: &str = "file1.txt"; - const FILE1_CONTENT: &str = "HELLOFILE1"; - const FILE2_NAME: &str = "file2.exec"; - const FILE2_CONTENT: &str = "HELLOFILE2"; - const FILE2_MODE: u32 = 0o710; - const FILE2_MTIME: u64 = 5; - - let (fast_store, slow_store, cas_store, _ac_store) = setup_stores().await?; - - let root_directory_digest = { - // Make and insert (into store) our digest info needed to create our directory & files. 
- let file1_content_digest = DigestInfo::new([2u8; 32], 32); - slow_store - .as_ref() - .update_oneshot(file1_content_digest, FILE1_CONTENT.into()) - .await?; - let file2_content_digest = DigestInfo::new([3u8; 32], 32); - slow_store - .as_ref() - .update_oneshot(file2_content_digest, FILE2_CONTENT.into()) - .await?; + /// Get temporary path from either `TEST_TMPDIR` or best effort temp directory if + /// not set. + fn make_temp_path(data: &str) -> String { + #[cfg(target_family = "unix")] + return format!( + "{}/{}/{}", + env::var("TEST_TMPDIR") + .unwrap_or_else(|_| env::temp_dir().to_str().unwrap().to_string()), + rand::rng().random::(), + data + ); + #[cfg(target_family = "windows")] + return format!( + "{}\\{}\\{}", + env::var("TEST_TMPDIR") + .unwrap_or_else(|_| env::temp_dir().to_str().unwrap().to_string()), + rand::rng().random::(), + data + ); + } - let root_directory_digest = DigestInfo::new([1u8; 32], 32); - let root_directory = Directory { - files: vec![ - FileNode { - name: FILE1_NAME.to_string(), - digest: Some(file1_content_digest.into()), - is_executable: false, - node_properties: None, - }, - FileNode { - name: FILE2_NAME.to_string(), - digest: Some(file2_content_digest.into()), - is_executable: true, - node_properties: Some(NodeProperties { - properties: vec![], - mtime: Some( - SystemTime::UNIX_EPOCH - .checked_add(Duration::from_secs(FILE2_MTIME)) - .unwrap() - .into(), - ), - unix_mode: Some(FILE2_MODE), - }), - }, - ], + async fn setup_stores() -> Result< + ( + Arc, + Arc, + Arc, + Arc, + ), + Error, + > { + let fast_config = FilesystemSpec { + content_path: make_temp_path("content_path"), + temp_path: make_temp_path("temp_path"), + eviction_policy: None, ..Default::default() }; + let slow_config = MemorySpec::default(); + let fast_store = FilesystemStore::new(&fast_config).await?; + let slow_store = MemoryStore::new(&slow_config); + let ac_store = MemoryStore::new(&slow_config); + let cas_store = FastSlowStore::new( + &FastSlowSpec { + fast: 
StoreSpec::Filesystem(fast_config), + slow: StoreSpec::Memory(slow_config), + fast_direction: StoreDirection::default(), + slow_direction: StoreDirection::default(), + }, + Store::new(fast_store.clone()), + Store::new(slow_store.clone()), + ); + Ok((fast_store, slow_store, cas_store, ac_store)) + } - slow_store - .as_ref() - .update_oneshot(root_directory_digest, root_directory.encode_to_vec().into()) - .await?; - root_directory_digest - }; - - let download_dir = { - // Tell it to download the digest info to a directory. - let download_dir = make_temp_path("download_dir"); - fs::create_dir_all(&download_dir) + async fn run_action(action: Arc) -> Result { + action + .clone() + .prepare_action() + .and_then(RunningAction::execute) + .and_then(RunningAction::upload_results) + .and_then(RunningAction::get_finished_result) + .then(|result| async move { + action.cleanup().await?; + result + }) .await - .err_tip(|| format!("Could not make download_dir : {download_dir}"))?; - download_to_directory( - cas_store.as_ref(), - fast_store.as_pin(), - &root_directory_digest, - &download_dir, - ) - .await?; - download_dir - }; - { - // Now ensure that our download_dir has the files. - let file1_content = fs::read(format!("{download_dir}/{FILE1_NAME}")).await?; - assert_eq!(from_utf8(&file1_content)?, FILE1_CONTENT); + } - let file2_path = format!("{download_dir}/{FILE2_NAME}"); - let file2_content = fs::read(&file2_path).await?; - assert_eq!(from_utf8(&file2_content)?, FILE2_CONTENT); + const NOW_TIME: u64 = 10000; - let file2_metadata = fs::metadata(&file2_path).await?; - // Note: We sent 0o710, but because is_executable was set it turns into 0o711. - #[cfg(target_family = "unix")] - assert_eq!(file2_metadata.mode() & 0o777, FILE2_MODE | 0o111); - assert_eq!( - file2_metadata - .modified()? - .duration_since(SystemTime::UNIX_EPOCH)? 
- .as_secs(), - FILE2_MTIME - ); + fn make_system_time(add_time: u64) -> SystemTime { + UNIX_EPOCH + .checked_add(Duration::from_secs(NOW_TIME + add_time)) + .unwrap() } - Ok(()) -} -#[serial] -#[nativelink_test] -async fn download_to_directory_folder_download_test() -> Result<(), Box> { - const DIRECTORY1_NAME: &str = "folder1"; - const FILE1_NAME: &str = "file1.txt"; - const FILE1_CONTENT: &str = "HELLOFILE1"; - const DIRECTORY2_NAME: &str = "folder2"; - - let (fast_store, slow_store, cas_store, _ac_store) = setup_stores().await?; - - let root_directory_digest = { - // Make and insert (into store) our digest info needed to create our directory & files. - let directory1_digest = DigestInfo::new([1u8; 32], 32); - { + fn monotonic_clock(counter: &AtomicU64) -> SystemTime { + let count = counter.fetch_add(1, Ordering::Relaxed); + make_system_time(count) + } + + fn increment_clock(time: &mut SystemTime) -> SystemTime { + let previous_time = *time; + *time = previous_time.checked_add(Duration::from_secs(1)).unwrap(); + previous_time + } + + #[nativelink_test] + async fn download_to_directory_file_download_test() -> Result<(), Box> { + const FILE1_NAME: &str = "file1.txt"; + const FILE1_CONTENT: &str = "HELLOFILE1"; + const FILE2_NAME: &str = "file2.exec"; + const FILE2_CONTENT: &str = "HELLOFILE2"; + const FILE2_MODE: u32 = 0o710; + const FILE2_MTIME: u64 = 5; + + let (fast_store, slow_store, cas_store, _ac_store) = setup_stores().await?; + + let root_directory_digest = { + // Make and insert (into store) our digest info needed to create our directory & files. 
let file1_content_digest = DigestInfo::new([2u8; 32], 32); slow_store .as_ref() .update_oneshot(file1_content_digest, FILE1_CONTENT.into()) .await?; - let directory1 = Directory { - files: vec![FileNode { - name: FILE1_NAME.to_string(), - digest: Some(file1_content_digest.into()), - ..Default::default() - }], - ..Default::default() - }; - slow_store - .as_ref() - .update_oneshot(directory1_digest, directory1.encode_to_vec().into()) - .await?; - } - let directory2_digest = DigestInfo::new([3u8; 32], 32); - { - // Now upload an empty directory. + let file2_content_digest = DigestInfo::new([3u8; 32], 32); slow_store .as_ref() - .update_oneshot( - directory2_digest, - Directory::default().encode_to_vec().into(), - ) + .update_oneshot(file2_content_digest, FILE2_CONTENT.into()) .await?; - } - let root_directory_digest = DigestInfo::new([5u8; 32], 32); - { + + let root_directory_digest = DigestInfo::new([1u8; 32], 32); let root_directory = Directory { - directories: vec![ - DirectoryNode { - name: DIRECTORY1_NAME.to_string(), - digest: Some(directory1_digest.into()), + files: vec![ + FileNode { + name: FILE1_NAME.to_string(), + digest: Some(file1_content_digest.into()), + is_executable: false, + node_properties: None, }, - DirectoryNode { - name: DIRECTORY2_NAME.to_string(), - digest: Some(directory2_digest.into()), + FileNode { + name: FILE2_NAME.to_string(), + digest: Some(file2_content_digest.into()), + is_executable: true, + node_properties: Some(NodeProperties { + properties: vec![], + mtime: Some( + SystemTime::UNIX_EPOCH + .checked_add(Duration::from_secs(FILE2_MTIME)) + .unwrap() + .into(), + ), + unix_mode: Some(FILE2_MODE), + }), }, ], ..Default::default() }; + slow_store .as_ref() .update_oneshot(root_directory_digest, root_directory.encode_to_vec().into()) .await?; - } - root_directory_digest - }; + root_directory_digest + }; - let download_dir = { - // Tell it to download the digest info to a directory. 
- let download_dir = make_temp_path("download_dir"); - fs::create_dir_all(&download_dir) - .await - .err_tip(|| format!("Could not make download_dir : {download_dir}"))?; - download_to_directory( - cas_store.as_ref(), - fast_store.as_pin(), - &root_directory_digest, - &download_dir, - ) - .await?; - download_dir - }; - { - // Now ensure that our download_dir has the files. - let file1_content = fs::read(format!("{download_dir}/{DIRECTORY1_NAME}/{FILE1_NAME}")) - .await - .err_tip(|| "On file_1 read")?; - assert_eq!(from_utf8(&file1_content)?, FILE1_CONTENT); + let download_dir = { + // Tell it to download the digest info to a directory. + let download_dir = make_temp_path("download_dir"); + fs::create_dir_all(&download_dir) + .await + .err_tip(|| format!("Could not make download_dir : {download_dir}"))?; + download_to_directory( + cas_store.as_ref(), + fast_store.as_pin(), + &root_directory_digest, + &download_dir, + ) + .await?; + download_dir + }; + { + // Now ensure that our download_dir has the files. + let file1_content = fs::read(format!("{download_dir}/{FILE1_NAME}")).await?; + assert_eq!(from_utf8(&file1_content)?, FILE1_CONTENT); - let folder2_path = format!("{download_dir}/{DIRECTORY2_NAME}"); - let folder2_metadata = fs::metadata(&folder2_path) - .await - .err_tip(|| "On folder2_metadata metadata")?; - assert_eq!(folder2_metadata.is_dir(), true); - } - Ok(()) -} + let file2_path = format!("{download_dir}/{FILE2_NAME}"); + let file2_content = fs::read(&file2_path).await?; + assert_eq!(from_utf8(&file2_content)?, FILE2_CONTENT); -// Windows does not support symlinks. 
-#[cfg(not(target_family = "windows"))] -#[serial] -#[nativelink_test] -async fn download_to_directory_symlink_download_test() -> Result<(), Box> { - const FILE_NAME: &str = "file.txt"; - const FILE_CONTENT: &str = "HELLOFILE"; - const SYMLINK_NAME: &str = "symlink_file.txt"; - const SYMLINK_TARGET: &str = "file.txt"; - - let (fast_store, slow_store, cas_store, _ac_store) = setup_stores().await?; - - let root_directory_digest = { - // Make and insert (into store) our digest info needed to create our directory & files. - let file_content_digest = DigestInfo::new([1u8; 32], 32); - slow_store - .as_ref() - .update_oneshot(file_content_digest, FILE_CONTENT.into()) - .await?; + let file2_metadata = fs::metadata(&file2_path).await?; + // Note: We sent 0o710, but because is_executable was set it turns into 0o711. + #[cfg(target_family = "unix")] + assert_eq!(file2_metadata.mode() & 0o777, FILE2_MODE | 0o111); + assert_eq!( + file2_metadata + .modified()? + .duration_since(SystemTime::UNIX_EPOCH)? + .as_secs(), + FILE2_MTIME + ); + } + Ok(()) + } - let root_directory_digest = DigestInfo::new([2u8; 32], 32); - let root_directory = Directory { - files: vec![FileNode { - name: FILE_NAME.to_string(), - digest: Some(file_content_digest.into()), - is_executable: false, - node_properties: None, - }], - symlinks: vec![SymlinkNode { - name: SYMLINK_NAME.to_string(), - target: SYMLINK_TARGET.to_string(), - node_properties: None, - }], - ..Default::default() + #[nativelink_test] + async fn download_to_directory_folder_download_test() -> Result<(), Box> + { + const DIRECTORY1_NAME: &str = "folder1"; + const FILE1_NAME: &str = "file1.txt"; + const FILE1_CONTENT: &str = "HELLOFILE1"; + const DIRECTORY2_NAME: &str = "folder2"; + + let (fast_store, slow_store, cas_store, _ac_store) = setup_stores().await?; + + let root_directory_digest = { + // Make and insert (into store) our digest info needed to create our directory & files. 
+ let directory1_digest = DigestInfo::new([1u8; 32], 32); + { + let file1_content_digest = DigestInfo::new([2u8; 32], 32); + slow_store + .as_ref() + .update_oneshot(file1_content_digest, FILE1_CONTENT.into()) + .await?; + let directory1 = Directory { + files: vec![FileNode { + name: FILE1_NAME.to_string(), + digest: Some(file1_content_digest.into()), + ..Default::default() + }], + ..Default::default() + }; + slow_store + .as_ref() + .update_oneshot(directory1_digest, directory1.encode_to_vec().into()) + .await?; + } + let directory2_digest = DigestInfo::new([3u8; 32], 32); + { + // Now upload an empty directory. + slow_store + .as_ref() + .update_oneshot( + directory2_digest, + Directory::default().encode_to_vec().into(), + ) + .await?; + } + let root_directory_digest = DigestInfo::new([5u8; 32], 32); + { + let root_directory = Directory { + directories: vec![ + DirectoryNode { + name: DIRECTORY1_NAME.to_string(), + digest: Some(directory1_digest.into()), + }, + DirectoryNode { + name: DIRECTORY2_NAME.to_string(), + digest: Some(directory2_digest.into()), + }, + ], + ..Default::default() + }; + slow_store + .as_ref() + .update_oneshot(root_directory_digest, root_directory.encode_to_vec().into()) + .await?; + } + root_directory_digest }; - slow_store - .as_ref() - .update_oneshot(root_directory_digest, root_directory.encode_to_vec().into()) + let download_dir = { + // Tell it to download the digest info to a directory. + let download_dir = make_temp_path("download_dir"); + fs::create_dir_all(&download_dir) + .await + .err_tip(|| format!("Could not make download_dir : {download_dir}"))?; + download_to_directory( + cas_store.as_ref(), + fast_store.as_pin(), + &root_directory_digest, + &download_dir, + ) .await?; - root_directory_digest - }; + download_dir + }; + { + // Now ensure that our download_dir has the files. 
+ let file1_content = fs::read(format!("{download_dir}/{DIRECTORY1_NAME}/{FILE1_NAME}")) + .await + .err_tip(|| "On file_1 read")?; + assert_eq!(from_utf8(&file1_content)?, FILE1_CONTENT); + + let folder2_path = format!("{download_dir}/{DIRECTORY2_NAME}"); + let folder2_metadata = fs::metadata(&folder2_path) + .await + .err_tip(|| "On folder2_metadata metadata")?; + assert_eq!(folder2_metadata.is_dir(), true); + } + Ok(()) + } - let download_dir = { - // Tell it to download the digest info to a directory. - let download_dir = make_temp_path("download_dir"); - fs::create_dir_all(&download_dir) - .await - .err_tip(|| format!("Could not make download_dir : {download_dir}"))?; - download_to_directory( - cas_store.as_ref(), - fast_store.as_pin(), - &root_directory_digest, - &download_dir, - ) - .await?; - download_dir - }; + // Windows does not support symlinks. + #[cfg(not(target_family = "windows"))] + #[nativelink_test] + async fn download_to_directory_symlink_download_test() -> Result<(), Box> { - // Now ensure that our download_dir has the files. 
- let symlink_path = format!("{download_dir}/{SYMLINK_NAME}"); - let symlink_content = fs::read(&symlink_path) - .await - .err_tip(|| "On symlink read")?; - assert_eq!(from_utf8(&symlink_content)?, FILE_CONTENT); + const FILE_NAME: &str = "file.txt"; + const FILE_CONTENT: &str = "HELLOFILE"; + const SYMLINK_NAME: &str = "symlink_file.txt"; + const SYMLINK_TARGET: &str = "file.txt"; - let symlink_metadata = fs::symlink_metadata(&symlink_path) - .await - .err_tip(|| "On symlink symlink_metadata")?; - assert_eq!(symlink_metadata.is_symlink(), true); - } - Ok(()) -} + let (fast_store, slow_store, cas_store, _ac_store) = setup_stores().await?; -#[serial] -#[nativelink_test] -async fn ensure_output_files_full_directories_are_created_no_working_directory_test() --> Result<(), Box> { - const WORKER_ID: &str = "foo_worker_id"; - - fn test_monotonic_clock() -> SystemTime { - static CLOCK: AtomicU64 = AtomicU64::new(0); - monotonic_clock(&CLOCK) - } + let root_directory_digest = { + // Make and insert (into store) our digest info needed to create our directory & files. 
+ let file_content_digest = DigestInfo::new([1u8; 32], 32); + slow_store + .as_ref() + .update_oneshot(file_content_digest, FILE_CONTENT.into()) + .await?; - let (_, _, cas_store, ac_store) = setup_stores().await?; - let root_action_directory = make_temp_path("root_action_directory"); - fs::create_dir_all(&root_action_directory).await?; - - let running_actions_manager = Arc::new(RunningActionsManagerImpl::new_with_callbacks( - RunningActionsManagerArgs { - root_action_directory, - execution_configuration: ExecutionConfiguration::default(), - cas_store: cas_store.clone(), - ac_store: Some(Store::new(ac_store.clone())), - historical_store: Store::new(cas_store.clone()), - upload_action_result_config: &nativelink_config::cas_server::UploadActionResultConfig { - upload_ac_results_strategy: - nativelink_config::cas_server::UploadCacheResultsStrategy::Never, - ..Default::default() - }, - max_action_timeout: Duration::MAX, - timeout_handled_externally: false, - }, - Callbacks { - now_fn: test_monotonic_clock, - sleep_fn: |_duration| Box::pin(future::pending()), - }, - )?); - { - let command = Command { - arguments: vec!["touch".to_string(), "./some/path/test.txt".to_string()], - output_files: vec!["some/path/test.txt".to_string()], - environment_variables: vec![EnvironmentVariable { - name: "PATH".to_string(), - value: env::var("PATH").unwrap(), - }], - ..Default::default() - }; - let command_digest = serialize_and_upload_message( - &command, - cas_store.as_pin(), - &mut DigestHasherFunc::Sha256.hasher(), - ) - .await?; - let input_root_digest = serialize_and_upload_message( - &Directory { - directories: vec![DirectoryNode { - name: "some_cwd".to_string(), - digest: Some( - serialize_and_upload_message( - &Directory::default(), - cas_store.as_pin(), - &mut DigestHasherFunc::Sha256.hasher(), - ) - .await? 
- .into(), - ), + let root_directory_digest = DigestInfo::new([2u8; 32], 32); + let root_directory = Directory { + files: vec![FileNode { + name: FILE_NAME.to_string(), + digest: Some(file_content_digest.into()), + is_executable: false, + node_properties: None, + }], + symlinks: vec![SymlinkNode { + name: SYMLINK_NAME.to_string(), + target: SYMLINK_TARGET.to_string(), + node_properties: None, }], ..Default::default() - }, - cas_store.as_pin(), - &mut DigestHasherFunc::Sha256.hasher(), - ) - .await?; - let action = Action { - command_digest: Some(command_digest.into()), - input_root_digest: Some(input_root_digest.into()), - ..Default::default() - }; - let action_digest = serialize_and_upload_message( - &action, - cas_store.as_pin(), - &mut DigestHasherFunc::Sha256.hasher(), - ) - .await?; + }; - let execute_request = ExecuteRequest { - action_digest: Some(action_digest.into()), - ..Default::default() + slow_store + .as_ref() + .update_oneshot(root_directory_digest, root_directory.encode_to_vec().into()) + .await?; + root_directory_digest }; - let operation_id = OperationId::default().to_string(); - let running_action = running_actions_manager - .create_and_add_action( - WORKER_ID.to_string(), - StartExecute { - execute_request: Some(execute_request), - operation_id, - queued_timestamp: None, - platform: action.platform.clone(), - worker_id: WORKER_ID.to_string(), - }, + let download_dir = { + // Tell it to download the digest info to a directory. + let download_dir = make_temp_path("download_dir"); + fs::create_dir_all(&download_dir) + .await + .err_tip(|| format!("Could not make download_dir : {download_dir}"))?; + download_to_directory( + cas_store.as_ref(), + fast_store.as_pin(), + &root_directory_digest, + &download_dir, ) .await?; + download_dir + }; + { + // Now ensure that our download_dir has the files. 
+ let symlink_path = format!("{download_dir}/{SYMLINK_NAME}"); + let symlink_content = fs::read(&symlink_path) + .await + .err_tip(|| "On symlink read")?; + assert_eq!(from_utf8(&symlink_content)?, FILE_CONTENT); + + let symlink_metadata = fs::symlink_metadata(&symlink_path) + .await + .err_tip(|| "On symlink symlink_metadata")?; + assert_eq!(symlink_metadata.is_symlink(), true); + } + Ok(()) + } - let running_action = running_action.clone().prepare_action().await?; - - // The folder should have been created for our output file. - assert_eq!( - fs::metadata(format!( - "{}/{}", - running_action.get_work_directory(), - "some/path" - )) - .await - .is_ok(), - true, - "Expected path to exist" - ); + #[nativelink_test] + async fn ensure_output_files_full_directories_are_created_no_working_directory_test() + -> Result<(), Box> { + const WORKER_ID: &str = "foo_worker_id"; - running_action.cleanup().await?; - }; - Ok(()) -} + fn test_monotonic_clock() -> SystemTime { + static CLOCK: AtomicU64 = AtomicU64::new(0); + monotonic_clock(&CLOCK) + } -#[serial] -#[nativelink_test] -async fn ensure_output_files_full_directories_are_created_test() --> Result<(), Box> { - const WORKER_ID: &str = "foo_worker_id"; - - fn test_monotonic_clock() -> SystemTime { - static CLOCK: AtomicU64 = AtomicU64::new(0); - monotonic_clock(&CLOCK) - } + let (_, _, cas_store, ac_store) = setup_stores().await?; + let root_action_directory = make_temp_path("root_action_directory"); + fs::create_dir_all(&root_action_directory).await?; - let (_, _, cas_store, ac_store) = setup_stores().await?; - let root_action_directory = make_temp_path("root_action_directory"); - fs::create_dir_all(&root_action_directory).await?; - - let running_actions_manager = Arc::new(RunningActionsManagerImpl::new_with_callbacks( - RunningActionsManagerArgs { - root_action_directory, - execution_configuration: ExecutionConfiguration::default(), - cas_store: cas_store.clone(), - ac_store: Some(Store::new(ac_store.clone())), - 
historical_store: Store::new(cas_store.clone()), - upload_action_result_config: &nativelink_config::cas_server::UploadActionResultConfig { - upload_ac_results_strategy: - nativelink_config::cas_server::UploadCacheResultsStrategy::Never, - ..Default::default() + let running_actions_manager = Arc::new(RunningActionsManagerImpl::new_with_callbacks( + RunningActionsManagerArgs { + root_action_directory, + execution_configuration: ExecutionConfiguration::default(), + cas_store: cas_store.clone(), + ac_store: Some(Store::new(ac_store.clone())), + historical_store: Store::new(cas_store.clone()), + upload_action_result_config: + &nativelink_config::cas_server::UploadActionResultConfig { + upload_ac_results_strategy: + nativelink_config::cas_server::UploadCacheResultsStrategy::Never, + ..Default::default() + }, + max_action_timeout: Duration::MAX, + max_upload_timeout: Duration::from_secs(DEFAULT_MAX_UPLOAD_TIMEOUT), + timeout_handled_externally: false, + directory_cache: None, }, - max_action_timeout: Duration::MAX, - timeout_handled_externally: false, - }, - Callbacks { - now_fn: test_monotonic_clock, - sleep_fn: |_duration| Box::pin(future::pending()), - }, - )?); - { - let working_directory = "some_cwd"; - let command = Command { - arguments: vec!["touch".to_string(), "./some/path/test.txt".to_string()], - output_files: vec!["some/path/test.txt".to_string()], - working_directory: working_directory.to_string(), - environment_variables: vec![EnvironmentVariable { - name: "PATH".to_string(), - value: env::var("PATH").unwrap(), - }], - ..Default::default() - }; - let command_digest = serialize_and_upload_message( - &command, - cas_store.as_pin(), - &mut DigestHasherFunc::Sha256.hasher(), - ) - .await?; - let input_root_digest = serialize_and_upload_message( - &Directory { - directories: vec![DirectoryNode { - name: "some_cwd".to_string(), - digest: Some( - serialize_and_upload_message( - &Directory::default(), - cas_store.as_pin(), - &mut DigestHasherFunc::Sha256.hasher(), 
- ) - .await? - .into(), - ), + Callbacks { + now_fn: test_monotonic_clock, + sleep_fn: |_duration| Box::pin(future::pending()), + }, + )?); + { + let command = Command { + arguments: vec!["touch".to_string(), "./some/path/test.txt".to_string()], + output_files: vec!["some/path/test.txt".to_string()], + environment_variables: vec![EnvironmentVariable { + name: "PATH".to_string(), + value: env::var("PATH").unwrap(), }], ..Default::default() - }, - cas_store.as_pin(), - &mut DigestHasherFunc::Sha256.hasher(), - ) - .await?; - let action = Action { - command_digest: Some(command_digest.into()), - input_root_digest: Some(input_root_digest.into()), - ..Default::default() + }; + let command_digest = serialize_and_upload_message( + &command, + cas_store.as_pin(), + &mut DigestHasherFunc::Sha256.hasher(), + ) + .await?; + let input_root_digest = serialize_and_upload_message( + &Directory { + directories: vec![DirectoryNode { + name: "some_cwd".to_string(), + digest: Some( + serialize_and_upload_message( + &Directory::default(), + cas_store.as_pin(), + &mut DigestHasherFunc::Sha256.hasher(), + ) + .await? 
+ .into(), + ), + }], + ..Default::default() + }, + cas_store.as_pin(), + &mut DigestHasherFunc::Sha256.hasher(), + ) + .await?; + let action = Action { + command_digest: Some(command_digest.into()), + input_root_digest: Some(input_root_digest.into()), + ..Default::default() + }; + let action_digest = serialize_and_upload_message( + &action, + cas_store.as_pin(), + &mut DigestHasherFunc::Sha256.hasher(), + ) + .await?; + + let execute_request = ExecuteRequest { + action_digest: Some(action_digest.into()), + ..Default::default() + }; + let operation_id = OperationId::default().to_string(); + + let running_action = running_actions_manager + .create_and_add_action( + WORKER_ID.to_string(), + StartExecute { + execute_request: Some(execute_request), + operation_id, + queued_timestamp: None, + platform: action.platform.clone(), + worker_id: WORKER_ID.to_string(), + }, + ) + .await?; + + let running_action = running_action.clone().prepare_action().await?; + + // The folder should have been created for our output file. 
+ assert_eq!( + fs::metadata(format!( + "{}/{}", + running_action.get_work_directory(), + "some/path" + )) + .await + .is_ok(), + true, + "Expected path to exist" + ); + + running_action.cleanup().await?; }; - let action_digest = serialize_and_upload_message( - &action, - cas_store.as_pin(), - &mut DigestHasherFunc::Sha256.hasher(), - ) - .await?; + Ok(()) + } - let execute_request = ExecuteRequest { - action_digest: Some(action_digest.into()), - ..Default::default() + #[nativelink_test] + async fn ensure_output_files_full_directories_are_created_test() + -> Result<(), Box> { + const WORKER_ID: &str = "foo_worker_id"; + + fn test_monotonic_clock() -> SystemTime { + static CLOCK: AtomicU64 = AtomicU64::new(0); + monotonic_clock(&CLOCK) + } + + let (_, _, cas_store, ac_store) = setup_stores().await?; + let root_action_directory = make_temp_path("root_action_directory"); + fs::create_dir_all(&root_action_directory).await?; + + let running_actions_manager = Arc::new(RunningActionsManagerImpl::new_with_callbacks( + RunningActionsManagerArgs { + root_action_directory, + execution_configuration: ExecutionConfiguration::default(), + cas_store: cas_store.clone(), + ac_store: Some(Store::new(ac_store.clone())), + historical_store: Store::new(cas_store.clone()), + upload_action_result_config: + &nativelink_config::cas_server::UploadActionResultConfig { + upload_ac_results_strategy: + nativelink_config::cas_server::UploadCacheResultsStrategy::Never, + ..Default::default() + }, + max_action_timeout: Duration::MAX, + max_upload_timeout: Duration::from_secs(DEFAULT_MAX_UPLOAD_TIMEOUT), + timeout_handled_externally: false, + directory_cache: None, + }, + Callbacks { + now_fn: test_monotonic_clock, + sleep_fn: |_duration| Box::pin(future::pending()), + }, + )?); + { + let working_directory = "some_cwd"; + let command = Command { + arguments: vec!["touch".to_string(), "./some/path/test.txt".to_string()], + output_files: vec!["some/path/test.txt".to_string()], + working_directory: 
working_directory.to_string(), + environment_variables: vec![EnvironmentVariable { + name: "PATH".to_string(), + value: env::var("PATH").unwrap(), + }], + ..Default::default() + }; + let command_digest = serialize_and_upload_message( + &command, + cas_store.as_pin(), + &mut DigestHasherFunc::Sha256.hasher(), + ) + .await?; + let input_root_digest = serialize_and_upload_message( + &Directory { + directories: vec![DirectoryNode { + name: "some_cwd".to_string(), + digest: Some( + serialize_and_upload_message( + &Directory::default(), + cas_store.as_pin(), + &mut DigestHasherFunc::Sha256.hasher(), + ) + .await? + .into(), + ), + }], + ..Default::default() + }, + cas_store.as_pin(), + &mut DigestHasherFunc::Sha256.hasher(), + ) + .await?; + let action = Action { + command_digest: Some(command_digest.into()), + input_root_digest: Some(input_root_digest.into()), + ..Default::default() + }; + let action_digest = serialize_and_upload_message( + &action, + cas_store.as_pin(), + &mut DigestHasherFunc::Sha256.hasher(), + ) + .await?; + + let execute_request = ExecuteRequest { + action_digest: Some(action_digest.into()), + ..Default::default() + }; + let operation_id = OperationId::default().to_string(); + + let running_action = running_actions_manager + .create_and_add_action( + WORKER_ID.to_string(), + StartExecute { + execute_request: Some(execute_request), + operation_id, + queued_timestamp: None, + platform: action.platform.clone(), + worker_id: WORKER_ID.to_string(), + }, + ) + .await?; + + let running_action = running_action.clone().prepare_action().await?; + + // The folder should have been created for our output file. 
+ assert_eq!( + fs::metadata(format!( + "{}/{}/{}", + running_action.get_work_directory(), + working_directory, + "some/path" + )) + .await + .is_ok(), + true, + "Expected path to exist" + ); + + running_action.cleanup().await?; }; - let operation_id = OperationId::default().to_string(); + Ok(()) + } - let running_action = running_actions_manager - .create_and_add_action( - WORKER_ID.to_string(), - StartExecute { - execute_request: Some(execute_request), - operation_id, - queued_timestamp: None, - platform: action.platform.clone(), - worker_id: WORKER_ID.to_string(), + #[nativelink_test] + async fn blake3_upload_files() -> Result<(), Box> { + const WORKER_ID: &str = "foo_worker_id"; + + fn test_monotonic_clock() -> SystemTime { + static CLOCK: AtomicU64 = AtomicU64::new(0); + monotonic_clock(&CLOCK) + } + + let (_, slow_store, cas_store, ac_store) = setup_stores().await?; + let root_action_directory = make_temp_path("root_action_directory"); + fs::create_dir_all(&root_action_directory).await?; + + let running_actions_manager = Arc::new(RunningActionsManagerImpl::new_with_callbacks( + RunningActionsManagerArgs { + root_action_directory, + execution_configuration: ExecutionConfiguration::default(), + cas_store: cas_store.clone(), + ac_store: Some(Store::new(ac_store.clone())), + historical_store: Store::new(cas_store.clone()), + upload_action_result_config: + &nativelink_config::cas_server::UploadActionResultConfig { + upload_ac_results_strategy: + nativelink_config::cas_server::UploadCacheResultsStrategy::Never, + ..Default::default() + }, + max_action_timeout: Duration::MAX, + max_upload_timeout: Duration::from_secs(DEFAULT_MAX_UPLOAD_TIMEOUT), + timeout_handled_externally: false, + directory_cache: None, + }, + Callbacks { + now_fn: test_monotonic_clock, + sleep_fn: |_duration| Box::pin(future::pending()), + }, + )?); + let action_result = { + #[cfg(target_family = "unix")] + let arguments = vec![ + "sh".to_string(), + "-c".to_string(), + "printf '123 ' > 
./test.txt; printf 'foo-stdout '; >&2 printf 'bar-stderr '" + .to_string(), + ]; + #[cfg(target_family = "windows")] + let arguments = vec![ + "cmd".to_string(), + "/C".to_string(), + // Note: Windows adds two spaces after 'set /p=XXX'. + "echo | set /p=123> ./test.txt & echo | set /p=foo-stdout & echo | set /p=bar-stderr 1>&2 & exit 0" + .to_string(), + ]; + let working_directory = "some_cwd"; + let command = Command { + arguments, + output_paths: vec!["test.txt".to_string()], + working_directory: working_directory.to_string(), + environment_variables: vec![EnvironmentVariable { + name: "PATH".to_string(), + value: env::var("PATH").unwrap(), + }], + ..Default::default() + }; + let command_digest = serialize_and_upload_message( + &command, + cas_store.as_pin(), + &mut DigestHasherFunc::Blake3.hasher(), + ) + .await?; + let input_root_digest = serialize_and_upload_message( + &Directory { + directories: vec![DirectoryNode { + name: working_directory.to_string(), + digest: Some( + serialize_and_upload_message( + &Directory::default(), + cas_store.as_pin(), + &mut DigestHasherFunc::Blake3.hasher(), + ) + .await? 
+ .into(), + ), + }], + ..Default::default() }, + cas_store.as_pin(), + &mut DigestHasherFunc::Blake3.hasher(), + ) + .await?; + let action = Action { + command_digest: Some(command_digest.into()), + input_root_digest: Some(input_root_digest.into()), + ..Default::default() + }; + let action_digest = serialize_and_upload_message( + &action, + cas_store.as_pin(), + &mut DigestHasherFunc::Blake3.hasher(), ) .await?; - let running_action = running_action.clone().prepare_action().await?; + let execute_request = ExecuteRequest { + action_digest: Some(action_digest.into()), + digest_function: ProtoDigestFunction::Blake3.into(), + ..Default::default() + }; + let operation_id = OperationId::default().to_string(); + + let running_action_impl = running_actions_manager + .create_and_add_action( + WORKER_ID.to_string(), + StartExecute { + execute_request: Some(execute_request), + operation_id, + queued_timestamp: None, + platform: action.platform.clone(), + worker_id: WORKER_ID.to_string(), + }, + ) + .await?; - // The folder should have been created for our output file. + run_action(running_action_impl.clone()).await? 
+ }; + let file_content = slow_store + .as_ref() + .get_part_unchunked(action_result.output_files[0].digest, 0, None) + .await?; + assert_eq!(from_utf8(&file_content)?, "123 "); + let stdout_content = slow_store + .as_ref() + .get_part_unchunked(action_result.stdout_digest, 0, None) + .await?; + assert_eq!(from_utf8(&stdout_content)?, "foo-stdout "); + let stderr_content = slow_store + .as_ref() + .get_part_unchunked(action_result.stderr_digest, 0, None) + .await?; + assert_eq!(from_utf8(&stderr_content)?, "bar-stderr "); + let mut clock_time = make_system_time(0); assert_eq!( - fs::metadata(format!( - "{}/{}/{}", - running_action.get_work_directory(), - working_directory, - "some/path" - )) - .await - .is_ok(), - true, - "Expected path to exist" + action_result, + ActionResult { + output_files: vec![FileInfo { + name_or_path: NameOrPath::Path("test.txt".to_string()), + digest: DigestInfo::try_new( + "3f488ba478fc6716c756922c9f34ebd7e84b85c3e03e33e22e7a3736cafdc6d8", + 4 + )?, + is_executable: false, + }], + stdout_digest: DigestInfo::try_new( + "af1720193ae81515067a3ef39f0dfda3ad54a1a9d216e55d32fe5c1e178c6a7d", + 11 + )?, + stderr_digest: DigestInfo::try_new( + "65e0abbae32a3aedaf040b654c6f02ace03c7690c17a8415a90fc2ec9c809a16", + 12 + )?, + exit_code: 0, + output_folders: vec![], + output_file_symlinks: vec![], + output_directory_symlinks: vec![], + server_logs: HashMap::new(), + execution_metadata: ExecutionMetadata { + worker: WORKER_ID.to_string(), + queued_timestamp: SystemTime::UNIX_EPOCH, + worker_start_timestamp: increment_clock(&mut clock_time), + input_fetch_start_timestamp: increment_clock(&mut clock_time), + input_fetch_completed_timestamp: increment_clock(&mut clock_time), + execution_start_timestamp: increment_clock(&mut clock_time), + execution_completed_timestamp: increment_clock(&mut clock_time), + output_upload_start_timestamp: increment_clock(&mut clock_time), + output_upload_completed_timestamp: increment_clock(&mut clock_time), + 
worker_completed_timestamp: increment_clock(&mut clock_time), + }, + error: None, + message: String::new(), + } ); + Ok(()) + } - running_action.cleanup().await?; - }; - Ok(()) -} + #[nativelink_test] + async fn upload_files_from_above_cwd_test() -> Result<(), Box> { + const WORKER_ID: &str = "foo_worker_id"; -#[serial] -#[nativelink_test] -async fn blake3_upload_files() -> Result<(), Box> { - const WORKER_ID: &str = "foo_worker_id"; + fn test_monotonic_clock() -> SystemTime { + static CLOCK: AtomicU64 = AtomicU64::new(0); + monotonic_clock(&CLOCK) + } - fn test_monotonic_clock() -> SystemTime { - static CLOCK: AtomicU64 = AtomicU64::new(0); - monotonic_clock(&CLOCK) - } + let (_, slow_store, cas_store, ac_store) = setup_stores().await?; + let root_action_directory = make_temp_path("root_action_directory"); + fs::create_dir_all(&root_action_directory).await?; - let (_, slow_store, cas_store, ac_store) = setup_stores().await?; - let root_action_directory = make_temp_path("root_action_directory"); - fs::create_dir_all(&root_action_directory).await?; - - let running_actions_manager = Arc::new(RunningActionsManagerImpl::new_with_callbacks( - RunningActionsManagerArgs { - root_action_directory, - execution_configuration: ExecutionConfiguration::default(), - cas_store: cas_store.clone(), - ac_store: Some(Store::new(ac_store.clone())), - historical_store: Store::new(cas_store.clone()), - upload_action_result_config: &nativelink_config::cas_server::UploadActionResultConfig { - upload_ac_results_strategy: - nativelink_config::cas_server::UploadCacheResultsStrategy::Never, - ..Default::default() + let running_actions_manager = Arc::new(RunningActionsManagerImpl::new_with_callbacks( + RunningActionsManagerArgs { + root_action_directory, + execution_configuration: ExecutionConfiguration::default(), + cas_store: cas_store.clone(), + ac_store: Some(Store::new(ac_store.clone())), + historical_store: Store::new(cas_store.clone()), + upload_action_result_config: + 
&nativelink_config::cas_server::UploadActionResultConfig { + upload_ac_results_strategy: + nativelink_config::cas_server::UploadCacheResultsStrategy::Never, + ..Default::default() + }, + max_action_timeout: Duration::MAX, + max_upload_timeout: Duration::from_secs(DEFAULT_MAX_UPLOAD_TIMEOUT), + timeout_handled_externally: false, + directory_cache: None, }, - max_action_timeout: Duration::MAX, - timeout_handled_externally: false, - }, - Callbacks { - now_fn: test_monotonic_clock, - sleep_fn: |_duration| Box::pin(future::pending()), - }, - )?); - let action_result = { - #[cfg(target_family = "unix")] - let arguments = vec![ - "sh".to_string(), - "-c".to_string(), - "printf '123 ' > ./test.txt; printf 'foo-stdout '; >&2 printf 'bar-stderr '" - .to_string(), - ]; - #[cfg(target_family = "windows")] + Callbacks { + now_fn: test_monotonic_clock, + sleep_fn: |_duration| Box::pin(future::pending()), + }, + )?); + let action_result = { + #[cfg(target_family = "unix")] + let arguments = vec![ + "sh".to_string(), + "-c".to_string(), + "printf '123 ' > ./test.txt; printf 'foo-stdout '; >&2 printf 'bar-stderr '" + .to_string(), + ]; + #[cfg(target_family = "windows")] let arguments = vec![ "cmd".to_string(), "/C".to_string(), @@ -715,11 +914,543 @@ async fn blake3_upload_files() -> Result<(), Box> { "echo | set /p=123> ./test.txt & echo | set /p=foo-stdout & echo | set /p=bar-stderr 1>&2 & exit 0" .to_string(), ]; - let working_directory = "some_cwd"; + let working_directory = "some_cwd"; + let command = Command { + arguments, + output_paths: vec!["test.txt".to_string()], + working_directory: working_directory.to_string(), + environment_variables: vec![EnvironmentVariable { + name: "PATH".to_string(), + value: env::var("PATH").unwrap(), + }], + ..Default::default() + }; + let command_digest = serialize_and_upload_message( + &command, + cas_store.as_pin(), + &mut DigestHasherFunc::Sha256.hasher(), + ) + .await?; + let input_root_digest = serialize_and_upload_message( + &Directory 
{ + directories: vec![DirectoryNode { + name: working_directory.to_string(), + digest: Some( + serialize_and_upload_message( + &Directory::default(), + cas_store.as_pin(), + &mut DigestHasherFunc::Sha256.hasher(), + ) + .await? + .into(), + ), + }], + ..Default::default() + }, + cas_store.as_pin(), + &mut DigestHasherFunc::Sha256.hasher(), + ) + .await?; + let action = Action { + command_digest: Some(command_digest.into()), + input_root_digest: Some(input_root_digest.into()), + ..Default::default() + }; + let action_digest = serialize_and_upload_message( + &action, + cas_store.as_pin(), + &mut DigestHasherFunc::Sha256.hasher(), + ) + .await?; + + let execute_request = ExecuteRequest { + action_digest: Some(action_digest.into()), + ..Default::default() + }; + let operation_id = OperationId::default().to_string(); + + let running_action_impl = running_actions_manager + .create_and_add_action( + WORKER_ID.to_string(), + StartExecute { + execute_request: Some(execute_request), + operation_id, + queued_timestamp: None, + platform: action.platform.clone(), + worker_id: WORKER_ID.to_string(), + }, + ) + .await?; + + run_action(running_action_impl.clone()).await? 
+ }; + let file_content = slow_store + .as_ref() + .get_part_unchunked(action_result.output_files[0].digest, 0, None) + .await?; + assert_eq!(from_utf8(&file_content)?, "123 "); + let stdout_content = slow_store + .as_ref() + .get_part_unchunked(action_result.stdout_digest, 0, None) + .await?; + assert_eq!(from_utf8(&stdout_content)?, "foo-stdout "); + let stderr_content = slow_store + .as_ref() + .get_part_unchunked(action_result.stderr_digest, 0, None) + .await?; + assert_eq!(from_utf8(&stderr_content)?, "bar-stderr "); + let mut clock_time = make_system_time(0); + assert_eq!( + action_result, + ActionResult { + output_files: vec![FileInfo { + name_or_path: NameOrPath::Path("test.txt".to_string()), + digest: DigestInfo::try_new( + "c69e10a5f54f4e28e33897fbd4f8701595443fa8c3004aeaa20dd4d9a463483b", + 4 + )?, + is_executable: false, + }], + stdout_digest: DigestInfo::try_new( + "15019a676f057d97d1ad3af86f3cc1e623cb33b18ff28422bbe3248d2471cc94", + 11 + )?, + stderr_digest: DigestInfo::try_new( + "2375ab8a01ca11e1ea7606dfb58756c153d49733cde1dbfb5a1e00f39afacf06", + 12 + )?, + exit_code: 0, + output_folders: vec![], + output_file_symlinks: vec![], + output_directory_symlinks: vec![], + server_logs: HashMap::new(), + execution_metadata: ExecutionMetadata { + worker: WORKER_ID.to_string(), + queued_timestamp: SystemTime::UNIX_EPOCH, + worker_start_timestamp: increment_clock(&mut clock_time), + input_fetch_start_timestamp: increment_clock(&mut clock_time), + input_fetch_completed_timestamp: increment_clock(&mut clock_time), + execution_start_timestamp: increment_clock(&mut clock_time), + execution_completed_timestamp: increment_clock(&mut clock_time), + output_upload_start_timestamp: increment_clock(&mut clock_time), + output_upload_completed_timestamp: increment_clock(&mut clock_time), + worker_completed_timestamp: increment_clock(&mut clock_time), + }, + error: None, + message: String::new(), + } + ); + Ok(()) + } + + // Windows does not support symlinks. 
+ #[cfg(not(target_family = "windows"))] + #[nativelink_test] + async fn upload_dir_and_symlink_test() -> Result<(), Box> { + const WORKER_ID: &str = "foo_worker_id"; + + fn test_monotonic_clock() -> SystemTime { + static CLOCK: AtomicU64 = AtomicU64::new(0); + monotonic_clock(&CLOCK) + } + + let (_, slow_store, cas_store, ac_store) = setup_stores().await?; + let root_action_directory = make_temp_path("root_action_directory"); + fs::create_dir_all(&root_action_directory).await?; + + let running_actions_manager = Arc::new(RunningActionsManagerImpl::new_with_callbacks( + RunningActionsManagerArgs { + root_action_directory, + execution_configuration: ExecutionConfiguration::default(), + cas_store: cas_store.clone(), + ac_store: Some(Store::new(ac_store.clone())), + historical_store: Store::new(cas_store.clone()), + upload_action_result_config: + &nativelink_config::cas_server::UploadActionResultConfig { + upload_ac_results_strategy: + nativelink_config::cas_server::UploadCacheResultsStrategy::Never, + ..Default::default() + }, + max_action_timeout: Duration::MAX, + max_upload_timeout: Duration::from_secs(DEFAULT_MAX_UPLOAD_TIMEOUT), + timeout_handled_externally: false, + directory_cache: None, + }, + Callbacks { + now_fn: test_monotonic_clock, + sleep_fn: |_duration| Box::pin(future::pending()), + }, + )?); + let queued_timestamp = make_system_time(1000); + let action_result = { + let command = Command { + arguments: vec![ + "sh".to_string(), + "-c".to_string(), + concat!( + "mkdir -p dir1/dir2 && ", + "echo foo > dir1/file && ", + "touch dir1/file2 && ", + "ln -s ../file dir1/dir2/sym &&", + "ln -s /dev/null empty_sym", + ) + .to_string(), + ], + output_paths: vec!["dir1".to_string(), "empty_sym".to_string()], + working_directory: ".".to_string(), + environment_variables: vec![EnvironmentVariable { + name: "PATH".to_string(), + value: env::var("PATH").unwrap(), + }], + ..Default::default() + }; + let command_digest = serialize_and_upload_message( + &command, + 
cas_store.as_pin(), + &mut DigestHasherFunc::Sha256.hasher(), + ) + .await?; + let input_root_digest = serialize_and_upload_message( + &Directory::default(), + cas_store.as_pin(), + &mut DigestHasherFunc::Sha256.hasher(), + ) + .await?; + let action = Action { + command_digest: Some(command_digest.into()), + input_root_digest: Some(input_root_digest.into()), + ..Default::default() + }; + let action_digest = serialize_and_upload_message( + &action, + cas_store.as_pin(), + &mut DigestHasherFunc::Sha256.hasher(), + ) + .await?; + + let execute_request = ExecuteRequest { + action_digest: Some(action_digest.into()), + ..Default::default() + }; + let operation_id = OperationId::default().to_string(); + + let running_action_impl = running_actions_manager + .create_and_add_action( + WORKER_ID.to_string(), + StartExecute { + execute_request: Some(execute_request), + operation_id, + queued_timestamp: Some(queued_timestamp.into()), + platform: action.platform.clone(), + worker_id: WORKER_ID.to_string(), + }, + ) + .await?; + + run_action(running_action_impl.clone()).await? + }; + let tree = get_and_decode_digest::( + slow_store.as_ref(), + action_result.output_folders[0].tree_digest.into(), + ) + .await?; + let root_directory = Directory { + files: vec![ + FileNode { + name: "file".to_string(), + digest: Some( + DigestInfo::try_new( + "b5bb9d8014a0f9b1d61e21e796d78dccdf1352f23cd32812f4850b878ae4944c", + 4, + )? + .into(), + ), + ..Default::default() + }, + FileNode { + name: "file2".to_string(), + digest: Some( + DigestInfo::try_new( + "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855", + 0, + )? + .into(), + ), + ..Default::default() + }, + ], + directories: vec![DirectoryNode { + name: "dir2".to_string(), + digest: Some( + DigestInfo::try_new( + "cce0098e0b0f1d785edb0da50beedb13e27dcd459b091b2f8f82543cb7cd0527", + 16, + )? 
+ .into(), + ), + }], + ..Default::default() + }; + assert_eq!( + tree, + Tree { + root: Some(root_directory.clone()), + children: vec![ + Directory { + symlinks: vec![SymlinkNode { + name: "sym".to_string(), + target: "../file".to_string(), + ..Default::default() + }], + ..Default::default() + }, + root_directory + ], + } + ); + let mut clock_time = make_system_time(0); + assert_eq!( + action_result, + ActionResult { + output_files: vec![], + stdout_digest: DigestInfo::try_new( + "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855", + 0 + )?, + stderr_digest: DigestInfo::try_new( + "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855", + 0 + )?, + exit_code: 0, + output_folders: vec![DirectoryInfo { + path: "dir1".to_string(), + tree_digest: DigestInfo::try_new( + "adbb04fa6e166e663c1310bbf8ba494e468b1b6c33e1e5346e2216b6904c9917", + 490 + )?, + }], + output_file_symlinks: vec![SymlinkInfo { + name_or_path: NameOrPath::Path("empty_sym".to_string()), + target: "/dev/null".to_string(), + }], + output_directory_symlinks: vec![], + server_logs: HashMap::new(), + execution_metadata: ExecutionMetadata { + worker: WORKER_ID.to_string(), + queued_timestamp, + worker_start_timestamp: increment_clock(&mut clock_time), + input_fetch_start_timestamp: increment_clock(&mut clock_time), + input_fetch_completed_timestamp: increment_clock(&mut clock_time), + execution_start_timestamp: increment_clock(&mut clock_time), + execution_completed_timestamp: increment_clock(&mut clock_time), + output_upload_start_timestamp: increment_clock(&mut clock_time), + output_upload_completed_timestamp: increment_clock(&mut clock_time), + worker_completed_timestamp: increment_clock(&mut clock_time), + }, + error: None, + message: String::new(), + } + ); + Ok(()) + } + + #[nativelink_test] + async fn cleanup_happens_on_job_failure() -> Result<(), Box> { + const WORKER_ID: &str = "foo_worker_id"; + + fn test_monotonic_clock() -> SystemTime { + static CLOCK: AtomicU64 = 
AtomicU64::new(0); + monotonic_clock(&CLOCK) + } + + let (_, _, cas_store, ac_store) = setup_stores().await?; + let root_action_directory = make_temp_path("root_action_directory"); + fs::create_dir_all(&root_action_directory).await?; + + let running_actions_manager = Arc::new(RunningActionsManagerImpl::new_with_callbacks( + RunningActionsManagerArgs { + root_action_directory: root_action_directory.clone(), + execution_configuration: ExecutionConfiguration::default(), + cas_store: cas_store.clone(), + ac_store: Some(Store::new(ac_store.clone())), + historical_store: Store::new(cas_store.clone()), + upload_action_result_config: + &nativelink_config::cas_server::UploadActionResultConfig { + upload_ac_results_strategy: + nativelink_config::cas_server::UploadCacheResultsStrategy::Never, + ..Default::default() + }, + max_action_timeout: Duration::MAX, + max_upload_timeout: Duration::from_secs(DEFAULT_MAX_UPLOAD_TIMEOUT), + timeout_handled_externally: false, + directory_cache: None, + }, + Callbacks { + now_fn: test_monotonic_clock, + sleep_fn: |_duration| Box::pin(future::pending()), + }, + )?); + let queued_timestamp = make_system_time(1000); + + #[cfg(target_family = "unix")] + let arguments = vec!["sh".to_string(), "-c".to_string(), "exit 33".to_string()]; + #[cfg(target_family = "windows")] + let arguments = vec!["cmd".to_string(), "/C".to_string(), "exit 33".to_string()]; + + let action_result = { + let command = Command { + arguments, + output_paths: vec![], + working_directory: ".".to_string(), + environment_variables: vec![EnvironmentVariable { + name: "PATH".to_string(), + value: env::var("PATH").unwrap(), + }], + ..Default::default() + }; + let command_digest = serialize_and_upload_message( + &command, + cas_store.as_pin(), + &mut DigestHasherFunc::Sha256.hasher(), + ) + .await?; + let input_root_digest = serialize_and_upload_message( + &Directory::default(), + cas_store.as_pin(), + &mut DigestHasherFunc::Sha256.hasher(), + ) + .await?; + let action = Action { 
+ command_digest: Some(command_digest.into()), + input_root_digest: Some(input_root_digest.into()), + ..Default::default() + }; + let action_digest = serialize_and_upload_message( + &action, + cas_store.as_pin(), + &mut DigestHasherFunc::Sha256.hasher(), + ) + .await?; + + let execute_request = ExecuteRequest { + action_digest: Some(action_digest.into()), + ..Default::default() + }; + let operation_id = OperationId::default().to_string(); + + let running_action_impl = running_actions_manager + .create_and_add_action( + WORKER_ID.to_string(), + StartExecute { + execute_request: Some(execute_request), + operation_id, + queued_timestamp: Some(queued_timestamp.into()), + platform: action.platform.clone(), + worker_id: WORKER_ID.to_string(), + }, + ) + .await?; + + run_action(running_action_impl.clone()).await? + }; + let mut clock_time = make_system_time(0); + assert_eq!( + action_result, + ActionResult { + output_files: vec![], + stdout_digest: DigestInfo::try_new( + "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855", + 0 + )?, + stderr_digest: DigestInfo::try_new( + "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855", + 0 + )?, + exit_code: 33, + output_folders: vec![], + output_file_symlinks: vec![], + output_directory_symlinks: vec![], + server_logs: HashMap::new(), + execution_metadata: ExecutionMetadata { + worker: WORKER_ID.to_string(), + queued_timestamp, + worker_start_timestamp: increment_clock(&mut clock_time), + input_fetch_start_timestamp: increment_clock(&mut clock_time), + input_fetch_completed_timestamp: increment_clock(&mut clock_time), + execution_start_timestamp: increment_clock(&mut clock_time), + execution_completed_timestamp: increment_clock(&mut clock_time), + output_upload_start_timestamp: increment_clock(&mut clock_time), + output_upload_completed_timestamp: increment_clock(&mut clock_time), + worker_completed_timestamp: increment_clock(&mut clock_time), + }, + error: None, + message: String::new(), + } + ); + let 
mut dir_stream = fs::read_dir(&root_action_directory).await?; + assert!( + dir_stream.as_mut().next_entry().await?.is_none(), + "Expected empty directory at {root_action_directory}" + ); + Ok(()) + } + + #[nativelink_test] + async fn kill_ends_action() -> Result<(), Box> { + const WORKER_ID: &str = "foo_worker_id"; + + let (_, _, cas_store, ac_store) = setup_stores().await?; + let root_action_directory = make_temp_path("root_action_directory"); + fs::create_dir_all(&root_action_directory).await?; + + let running_actions_manager = + Arc::new(RunningActionsManagerImpl::new(RunningActionsManagerArgs { + root_action_directory: root_action_directory.clone(), + execution_configuration: ExecutionConfiguration::default(), + cas_store: cas_store.clone(), + ac_store: Some(Store::new(ac_store.clone())), + historical_store: Store::new(cas_store.clone()), + upload_action_result_config: + &nativelink_config::cas_server::UploadActionResultConfig { + upload_ac_results_strategy: + nativelink_config::cas_server::UploadCacheResultsStrategy::Never, + ..Default::default() + }, + max_action_timeout: Duration::MAX, + max_upload_timeout: Duration::from_secs(DEFAULT_MAX_UPLOAD_TIMEOUT), + timeout_handled_externally: false, + directory_cache: None, + })?); + + #[cfg(target_family = "unix")] + let (arguments, process_started_file) = { + let process_started_file = { + let tmp_dir = make_temp_path("root_action_directory"); + fs::create_dir_all(&tmp_dir).await.unwrap(); + format!("{tmp_dir}/process_started") + }; + ( + vec![ + "sh".to_string(), + "-c".to_string(), + format!("touch {process_started_file} && sleep infinity"), + ], + process_started_file, + ) + }; + #[cfg(target_family = "windows")] + // Windows is weird with timeout, so we use ping. 
See: + // https://www.ibm.com/support/pages/timeout-command-run-batch-job-exits-immediately-and-returns-error-input-redirection-not-supported-exiting-process-immediately + let arguments = vec![ + "cmd".to_string(), + "/C".to_string(), + "ping -n 99999 127.0.0.1".to_string(), + ]; + let command = Command { arguments, - output_paths: vec!["test.txt".to_string()], - working_directory: working_directory.to_string(), + output_paths: vec![], + working_directory: ".".to_string(), environment_variables: vec![EnvironmentVariable { name: "PATH".to_string(), value: env::var("PATH").unwrap(), @@ -729,27 +1460,13 @@ async fn blake3_upload_files() -> Result<(), Box> { let command_digest = serialize_and_upload_message( &command, cas_store.as_pin(), - &mut DigestHasherFunc::Blake3.hasher(), + &mut DigestHasherFunc::Sha256.hasher(), ) .await?; let input_root_digest = serialize_and_upload_message( - &Directory { - directories: vec![DirectoryNode { - name: working_directory.to_string(), - digest: Some( - serialize_and_upload_message( - &Directory::default(), - cas_store.as_pin(), - &mut DigestHasherFunc::Blake3.hasher(), - ) - .await? 
- .into(), - ), - }], - ..Default::default() - }, + &Directory::default(), cas_store.as_pin(), - &mut DigestHasherFunc::Blake3.hasher(), + &mut DigestHasherFunc::Sha256.hasher(), ) .await?; let action = Action { @@ -760,146 +1477,161 @@ async fn blake3_upload_files() -> Result<(), Box> { let action_digest = serialize_and_upload_message( &action, cas_store.as_pin(), - &mut DigestHasherFunc::Blake3.hasher(), + &mut DigestHasherFunc::Sha256.hasher(), ) .await?; let execute_request = ExecuteRequest { action_digest: Some(action_digest.into()), - digest_function: ProtoDigestFunction::Blake3.into(), ..Default::default() }; let operation_id = OperationId::default().to_string(); let running_action_impl = running_actions_manager + .clone() .create_and_add_action( WORKER_ID.to_string(), StartExecute { execute_request: Some(execute_request), operation_id, - queued_timestamp: None, + queued_timestamp: Some(make_system_time(1000).into()), platform: action.platform.clone(), worker_id: WORKER_ID.to_string(), }, ) .await?; - run_action(running_action_impl.clone()).await? 
- }; - let file_content = slow_store - .as_ref() - .get_part_unchunked(action_result.output_files[0].digest, 0, None) - .await?; - assert_eq!(from_utf8(&file_content)?, "123 "); - let stdout_content = slow_store - .as_ref() - .get_part_unchunked(action_result.stdout_digest, 0, None) - .await?; - assert_eq!(from_utf8(&stdout_content)?, "foo-stdout "); - let stderr_content = slow_store - .as_ref() - .get_part_unchunked(action_result.stderr_digest, 0, None) - .await?; - assert_eq!(from_utf8(&stderr_content)?, "bar-stderr "); - let mut clock_time = make_system_time(0); - assert_eq!( - action_result, - ActionResult { - output_files: vec![FileInfo { - name_or_path: NameOrPath::Path("test.txt".to_string()), - digest: DigestInfo::try_new( - "3f488ba478fc6716c756922c9f34ebd7e84b85c3e03e33e22e7a3736cafdc6d8", - 4 - )?, - is_executable: false, - }], - stdout_digest: DigestInfo::try_new( - "af1720193ae81515067a3ef39f0dfda3ad54a1a9d216e55d32fe5c1e178c6a7d", - 11 - )?, - stderr_digest: DigestInfo::try_new( - "65e0abbae32a3aedaf040b654c6f02ace03c7690c17a8415a90fc2ec9c809a16", - 12 - )?, - exit_code: 0, - output_folders: vec![], - output_file_symlinks: vec![], - output_directory_symlinks: vec![], - server_logs: HashMap::new(), - execution_metadata: ExecutionMetadata { - worker: WORKER_ID.to_string(), - queued_timestamp: SystemTime::UNIX_EPOCH, - worker_start_timestamp: increment_clock(&mut clock_time), - input_fetch_start_timestamp: increment_clock(&mut clock_time), - input_fetch_completed_timestamp: increment_clock(&mut clock_time), - execution_start_timestamp: increment_clock(&mut clock_time), - execution_completed_timestamp: increment_clock(&mut clock_time), - output_upload_start_timestamp: increment_clock(&mut clock_time), - output_upload_completed_timestamp: increment_clock(&mut clock_time), - worker_completed_timestamp: increment_clock(&mut clock_time), - }, - error: None, - message: String::new(), + let run_action_fut = run_action(running_action_impl); + 
tokio::pin!(run_action_fut); + + #[cfg(target_family = "unix")] + loop { + assert_eq!(futures::poll!(&mut run_action_fut), Poll::Pending); + tokio::task::yield_now().await; + match fs::metadata(&process_started_file).await { + Ok(_) => break, + Err(err) => { + assert_eq!(err.code, Code::NotFound, "Unknown error {err:?}"); + tokio::time::sleep(Duration::from_millis(1)).await; + } + } } - ); - Ok(()) -} -#[serial] -#[nativelink_test] -async fn upload_files_from_above_cwd_test() -> Result<(), Box> { - const WORKER_ID: &str = "foo_worker_id"; + let result = futures::join!(run_action_fut, running_actions_manager.kill_all()) + .0 + .unwrap(); + + // Check that the action was killed. + #[cfg(all(target_family = "unix", not(target_os = "macos")))] + assert_eq!(9, result.exit_code, "Wrong exit_code - {result:?}"); + // Mac for some reason sometimes returns 1 and 9. + #[cfg(all(target_family = "unix", target_os = "macos"))] + assert!( + 9 == result.exit_code || 1 == result.exit_code, + "Wrong exit_code - {result:?}" + ); + // Note: Windows kill command returns exit code 1. + #[cfg(target_family = "windows")] + assert_eq!(1, result.exit_code); + + Ok(()) + } + + // This script runs a command under a wrapper script set in a config. + // The wrapper script will print a constant string to stderr, and the test itself will + // print to stdout. We then check the results of both to make sure the shell script was + // invoked and the actual command was invoked under the shell script. + #[cfg_attr(feature = "nix", ignore)] + #[nativelink_test] + async fn entrypoint_does_invoke_if_set() -> Result<(), Box> { + #[cfg(target_family = "unix")] + const TEST_WRAPPER_SCRIPT_CONTENT: &str = "\ +#!/usr/bin/env bash +# Print some static text to stderr. This is what the test uses to +# make sure the script did run. +>&2 printf \"Wrapper script did run\" + +# Now run the real command. 
+exec \"$@\" +"; + #[cfg(target_family = "windows")] + const TEST_WRAPPER_SCRIPT_CONTENT: &str = "\ +@echo off +:: Print some static text to stderr. This is what the test uses to +:: make sure the script did run. +echo | set /p=\"Wrapper script did run\" 1>&2 + +:: Run command, but morph the echo to ensure it doesn't +:: add a new line to the end of the output. +%1 | set /p=%2 +exit 0 +"; + const WORKER_ID: &str = "foo_worker_id"; + const EXPECTED_STDOUT: &str = "Action did run"; + + let (_, _, cas_store, ac_store) = setup_stores().await?; + let root_action_directory = make_temp_path("root_action_directory"); + fs::create_dir_all(&root_action_directory).await?; - fn test_monotonic_clock() -> SystemTime { - static CLOCK: AtomicU64 = AtomicU64::new(0); - monotonic_clock(&CLOCK) - } + let test_wrapper_script = { + let test_wrapper_dir = make_temp_path("wrapper_dir"); + fs::create_dir_all(&test_wrapper_dir).await?; + #[cfg(target_family = "unix")] + let test_wrapper_script = OsString::from(test_wrapper_dir + "/test_wrapper_script.sh"); + #[cfg(target_family = "windows")] + let test_wrapper_script = + OsString::from(test_wrapper_dir + "\\test_wrapper_script.bat"); + { + let mut file_options = std::fs::OpenOptions::new(); + file_options.create(true); + file_options.truncate(true); + file_options.write(true); + #[cfg(target_family = "unix")] + file_options.mode(0o777); + let mut test_wrapper_script_handle = file_options + .open(OsString::from(&test_wrapper_script)) + .unwrap(); + test_wrapper_script_handle + .write_all(TEST_WRAPPER_SCRIPT_CONTENT.as_bytes()) + .unwrap(); + test_wrapper_script_handle.sync_all().unwrap(); + // Note: Github runners appear to use some kind of filesystem driver + // that does not sync data as expected. This is the easiest solution. 
+ // See: https://github.com/pantsbuild/pants/issues/10507 + // See: https://github.com/moby/moby/issues/9547 + std::process::Command::new("sync").output().unwrap(); + } + test_wrapper_script + }; - let (_, slow_store, cas_store, ac_store) = setup_stores().await?; - let root_action_directory = make_temp_path("root_action_directory"); - fs::create_dir_all(&root_action_directory).await?; - - let running_actions_manager = Arc::new(RunningActionsManagerImpl::new_with_callbacks( - RunningActionsManagerArgs { - root_action_directory, - execution_configuration: ExecutionConfiguration::default(), - cas_store: cas_store.clone(), - ac_store: Some(Store::new(ac_store.clone())), - historical_store: Store::new(cas_store.clone()), - upload_action_result_config: &nativelink_config::cas_server::UploadActionResultConfig { - upload_ac_results_strategy: - nativelink_config::cas_server::UploadCacheResultsStrategy::Never, - ..Default::default() - }, - max_action_timeout: Duration::MAX, - timeout_handled_externally: false, - }, - Callbacks { - now_fn: test_monotonic_clock, - sleep_fn: |_duration| Box::pin(future::pending()), - }, - )?); - let action_result = { + let running_actions_manager = + Arc::new(RunningActionsManagerImpl::new(RunningActionsManagerArgs { + root_action_directory: root_action_directory.clone(), + execution_configuration: ExecutionConfiguration { + entrypoint: Some(test_wrapper_script.into_string().unwrap()), + additional_environment: None, + }, + cas_store: cas_store.clone(), + ac_store: Some(Store::new(ac_store.clone())), + historical_store: Store::new(cas_store.clone()), + upload_action_result_config: + &nativelink_config::cas_server::UploadActionResultConfig { + upload_ac_results_strategy: + nativelink_config::cas_server::UploadCacheResultsStrategy::Never, + ..Default::default() + }, + max_action_timeout: Duration::MAX, + max_upload_timeout: Duration::from_secs(DEFAULT_MAX_UPLOAD_TIMEOUT), + timeout_handled_externally: false, + directory_cache: None, + })?); 
#[cfg(target_family = "unix")] - let arguments = vec![ - "sh".to_string(), - "-c".to_string(), - "printf '123 ' > ./test.txt; printf 'foo-stdout '; >&2 printf 'bar-stderr '" - .to_string(), - ]; + let arguments = vec!["printf".to_string(), EXPECTED_STDOUT.to_string()]; #[cfg(target_family = "windows")] - let arguments = vec![ - "cmd".to_string(), - "/C".to_string(), - // Note: Windows adds two spaces after 'set /p=XXX'. - "echo | set /p=123> ./test.txt & echo | set /p=foo-stdout & echo | set /p=bar-stderr 1>&2 & exit 0" - .to_string(), - ]; - let working_directory = "some_cwd"; + let arguments = vec!["echo".to_string(), EXPECTED_STDOUT.to_string()]; let command = Command { arguments, - output_paths: vec!["test.txt".to_string()], - working_directory: working_directory.to_string(), + working_directory: ".".to_string(), environment_variables: vec![EnvironmentVariable { name: "PATH".to_string(), value: env::var("PATH").unwrap(), @@ -913,21 +1645,7 @@ async fn upload_files_from_above_cwd_test() -> Result<(), Box Result<(), Box Result<(), Box> { - const WORKER_ID: &str = "foo_worker_id"; + let expected_stdout = DigestHasherFunc::Sha256 + .hasher() + .compute_from_reader(Cursor::new(EXPECTED_STDOUT)) + .await?; + // Note: This string should match what is in worker_for_test.sh + let expected_stderr = DigestHasherFunc::Sha256 + .hasher() + .compute_from_reader(Cursor::new("Wrapper script did run")) + .await?; + assert_eq!(expected_stdout, result.stdout_digest); + assert_eq!(expected_stderr, result.stderr_digest); - fn test_monotonic_clock() -> SystemTime { - static CLOCK: AtomicU64 = AtomicU64::new(0); - monotonic_clock(&CLOCK) + Ok(()) } - let (_, slow_store, cas_store, ac_store) = setup_stores().await?; - let root_action_directory = make_temp_path("root_action_directory"); - fs::create_dir_all(&root_action_directory).await?; - - let running_actions_manager = Arc::new(RunningActionsManagerImpl::new_with_callbacks( - RunningActionsManagerArgs { - root_action_directory, - 
execution_configuration: ExecutionConfiguration::default(), - cas_store: cas_store.clone(), - ac_store: Some(Store::new(ac_store.clone())), - historical_store: Store::new(cas_store.clone()), - upload_action_result_config: &nativelink_config::cas_server::UploadActionResultConfig { - upload_ac_results_strategy: - nativelink_config::cas_server::UploadCacheResultsStrategy::Never, - ..Default::default() - }, - max_action_timeout: Duration::MAX, - timeout_handled_externally: false, - }, - Callbacks { - now_fn: test_monotonic_clock, - sleep_fn: |_duration| Box::pin(future::pending()), - }, - )?); - let queued_timestamp = make_system_time(1000); - let action_result = { + #[cfg_attr(feature = "nix", ignore)] + #[nativelink_test] + async fn entrypoint_injects_properties() -> Result<(), Box> { + #[cfg(target_family = "unix")] + const TEST_WRAPPER_SCRIPT_CONTENT: &str = "\ +#!/usr/bin/env bash +# Print some static text to stderr. This is what the test uses to +# make sure the script did run. +>&2 printf \"Wrapper script did run with property $PROPERTY $VALUE $INNER_TIMEOUT\" + +# Now run the real command. +exec \"$@\" +"; + #[cfg(target_family = "windows")] + const TEST_WRAPPER_SCRIPT_CONTENT: &str = "\ +@echo off +:: Print some static text to stderr. This is what the test uses to +:: make sure the script did run. +echo | set /p=\"Wrapper script did run with property %PROPERTY% %VALUE% %INNER_TIMEOUT%\" 1>&2 + +:: Run command, but morph the echo to ensure it doesn't +:: add a new line to the end of the output. 
+%1 | set /p=%2 +exit 0 +"; + const WORKER_ID: &str = "foo_worker_id"; + const EXPECTED_STDOUT: &str = "Action did run"; + const TASK_TIMEOUT: Duration = Duration::from_secs(122); + + let (_, _, cas_store, ac_store) = setup_stores().await?; + let root_action_directory = make_temp_path("root_action_directory"); + fs::create_dir_all(&root_action_directory).await?; + + let test_wrapper_script = { + let test_wrapper_dir = make_temp_path("wrapper_dir"); + fs::create_dir_all(&test_wrapper_dir).await?; + #[cfg(target_family = "unix")] + let test_wrapper_script = OsString::from(test_wrapper_dir + "/test_wrapper_script.sh"); + #[cfg(target_family = "windows")] + let test_wrapper_script = + OsString::from(test_wrapper_dir + "\\test_wrapper_script.bat"); + { + let mut file_options = std::fs::OpenOptions::new(); + file_options.create(true); + file_options.truncate(true); + file_options.write(true); + #[cfg(target_family = "unix")] + file_options.mode(0o777); + let mut test_wrapper_script_handle = file_options + .open(OsString::from(&test_wrapper_script)) + .unwrap(); + test_wrapper_script_handle + .write_all(TEST_WRAPPER_SCRIPT_CONTENT.as_bytes()) + .unwrap(); + test_wrapper_script_handle.sync_all().unwrap(); + // Note: Github runners appear to use some kind of filesystem driver + // that does not sync data as expected. This is the easiest solution. 
+ // See: https://github.com/pantsbuild/pants/issues/10507 + // See: https://github.com/moby/moby/issues/9547 + std::process::Command::new("sync").output().unwrap(); + } + test_wrapper_script + }; + + let running_actions_manager = + Arc::new(RunningActionsManagerImpl::new(RunningActionsManagerArgs { + root_action_directory: root_action_directory.clone(), + execution_configuration: ExecutionConfiguration { + entrypoint: Some(test_wrapper_script.into_string().unwrap()), + additional_environment: Some(HashMap::from([ + ( + "PROPERTY".to_string(), + EnvironmentSource::Property("property_name".to_string()), + ), + ( + "VALUE".to_string(), + EnvironmentSource::Value("raw_value".to_string()), + ), + ( + "INNER_TIMEOUT".to_string(), + EnvironmentSource::TimeoutMillis, + ), + ( + "PATH".to_string(), + EnvironmentSource::Value(env::var("PATH").unwrap()), + ), + ])), + }, + cas_store: cas_store.clone(), + ac_store: Some(Store::new(ac_store.clone())), + historical_store: Store::new(cas_store.clone()), + upload_action_result_config: + &nativelink_config::cas_server::UploadActionResultConfig { + upload_ac_results_strategy: + nativelink_config::cas_server::UploadCacheResultsStrategy::Never, + ..Default::default() + }, + max_action_timeout: Duration::MAX, + max_upload_timeout: Duration::from_secs(DEFAULT_MAX_UPLOAD_TIMEOUT), + timeout_handled_externally: false, + directory_cache: None, + })?); + #[cfg(target_family = "unix")] + let arguments = vec!["printf".to_string(), EXPECTED_STDOUT.to_string()]; + #[cfg(target_family = "windows")] + let arguments = vec!["echo".to_string(), EXPECTED_STDOUT.to_string()]; let command = Command { - arguments: vec![ - "sh".to_string(), - "-c".to_string(), - concat!( - "mkdir -p dir1/dir2 && ", - "echo foo > dir1/file && ", - "touch dir1/file2 && ", - "ln -s ../file dir1/dir2/sym &&", - "ln -s /dev/null empty_sym", - ) - .to_string(), - ], - output_paths: vec!["dir1".to_string(), "empty_sym".to_string()], + arguments, working_directory: 
".".to_string(), environment_variables: vec![EnvironmentVariable { name: "PATH".to_string(), @@ -1098,6 +1830,16 @@ async fn upload_dir_and_symlink_test() -> Result<(), Box let action = Action { command_digest: Some(command_digest.into()), input_root_digest: Some(input_root_digest.into()), + platform: Some(Platform { + properties: vec![Property { + name: "property_name".into(), + value: "property_value".into(), + }], + }), + timeout: Some(prost_types::Duration { + seconds: TASK_TIMEOUT.as_secs() as i64, + nanos: 0, + }), ..Default::default() }; let action_digest = serialize_and_upload_message( @@ -1114,170 +1856,126 @@ async fn upload_dir_and_symlink_test() -> Result<(), Box let operation_id = OperationId::default().to_string(); let running_action_impl = running_actions_manager + .clone() .create_and_add_action( WORKER_ID.to_string(), StartExecute { execute_request: Some(execute_request), operation_id, - queued_timestamp: Some(queued_timestamp.into()), + queued_timestamp: Some(make_system_time(1000).into()), platform: action.platform.clone(), worker_id: WORKER_ID.to_string(), }, ) .await?; - run_action(running_action_impl.clone()).await? - }; - let tree = get_and_decode_digest::( - slow_store.as_ref(), - action_result.output_folders[0].tree_digest.into(), - ) - .await?; - let root_directory = Directory { - files: vec![ - FileNode { - name: "file".to_string(), - digest: Some( - DigestInfo::try_new( - "b5bb9d8014a0f9b1d61e21e796d78dccdf1352f23cd32812f4850b878ae4944c", - 4, - )? - .into(), - ), - ..Default::default() - }, - FileNode { - name: "file2".to_string(), - digest: Some( - DigestInfo::try_new( - "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855", - 0, - )? - .into(), - ), - ..Default::default() - }, - ], - directories: vec![DirectoryNode { - name: "dir2".to_string(), - digest: Some( - DigestInfo::try_new( - "cce0098e0b0f1d785edb0da50beedb13e27dcd459b091b2f8f82543cb7cd0527", - 16, - )? 
- .into(), - ), - }], - ..Default::default() - }; - assert_eq!( - tree, - Tree { - root: Some(root_directory.clone()), - children: vec![ - Directory { - symlinks: vec![SymlinkNode { - name: "sym".to_string(), - target: "../file".to_string(), - ..Default::default() - }], - ..Default::default() - }, - root_directory - ], - } - ); - let mut clock_time = make_system_time(0); - assert_eq!( - action_result, - ActionResult { - output_files: vec![], - stdout_digest: DigestInfo::try_new( - "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855", - 0 - )?, - stderr_digest: DigestInfo::try_new( - "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855", - 0 - )?, - exit_code: 0, - output_folders: vec![DirectoryInfo { - path: "dir1".to_string(), - tree_digest: DigestInfo::try_new( - "adbb04fa6e166e663c1310bbf8ba494e468b1b6c33e1e5346e2216b6904c9917", - 490 - )?, - }], - output_file_symlinks: vec![SymlinkInfo { - name_or_path: NameOrPath::Path("empty_sym".to_string()), - target: "/dev/null".to_string(), - }], - output_directory_symlinks: vec![], - server_logs: HashMap::new(), - execution_metadata: ExecutionMetadata { - worker: WORKER_ID.to_string(), - queued_timestamp, - worker_start_timestamp: increment_clock(&mut clock_time), - input_fetch_start_timestamp: increment_clock(&mut clock_time), - input_fetch_completed_timestamp: increment_clock(&mut clock_time), - execution_start_timestamp: increment_clock(&mut clock_time), - execution_completed_timestamp: increment_clock(&mut clock_time), - output_upload_start_timestamp: increment_clock(&mut clock_time), - output_upload_completed_timestamp: increment_clock(&mut clock_time), - worker_completed_timestamp: increment_clock(&mut clock_time), - }, - error: None, - message: String::new(), - } - ); - Ok(()) -} + let result = run_action(running_action_impl).await?; + assert_eq!(result.exit_code, 0, "Exit code should be 0"); -#[serial] -#[nativelink_test] -async fn cleanup_happens_on_job_failure() -> Result<(), Box> { - 
const WORKER_ID: &str = "foo_worker_id"; + let expected_stdout = DigestHasherFunc::Sha256 + .hasher() + .compute_from_reader(Cursor::new(EXPECTED_STDOUT)) + .await?; + // Note: This string should match what is in worker_for_test.sh + let expected_stderr = + "Wrapper script did run with property property_value raw_value 122000"; + let expected_stderr_digest = DigestHasherFunc::Sha256 + .hasher() + .compute_from_reader(Cursor::new(expected_stderr)) + .await?; + + let actual_stderr: Bytes = cas_store + .as_ref() + .get_part_unchunked(result.stderr_digest, 0, None) + .await?; + let actual_stderr_decoded = from_utf8(&actual_stderr)?; + assert_eq!(expected_stderr, actual_stderr_decoded); + assert_eq!(expected_stdout, result.stdout_digest); + assert_eq!(expected_stderr_digest, result.stderr_digest); - fn test_monotonic_clock() -> SystemTime { - static CLOCK: AtomicU64 = AtomicU64::new(0); - monotonic_clock(&CLOCK) + Ok(()) } - let (_, _, cas_store, ac_store) = setup_stores().await?; - let root_action_directory = make_temp_path("root_action_directory"); - fs::create_dir_all(&root_action_directory).await?; - - let running_actions_manager = Arc::new(RunningActionsManagerImpl::new_with_callbacks( - RunningActionsManagerArgs { - root_action_directory: root_action_directory.clone(), - execution_configuration: ExecutionConfiguration::default(), - cas_store: cas_store.clone(), - ac_store: Some(Store::new(ac_store.clone())), - historical_store: Store::new(cas_store.clone()), - upload_action_result_config: &nativelink_config::cas_server::UploadActionResultConfig { - upload_ac_results_strategy: - nativelink_config::cas_server::UploadCacheResultsStrategy::Never, - ..Default::default() - }, - max_action_timeout: Duration::MAX, - timeout_handled_externally: false, - }, - Callbacks { - now_fn: test_monotonic_clock, - sleep_fn: |_duration| Box::pin(future::pending()), - }, - )?); - let queued_timestamp = make_system_time(1000); + #[cfg_attr(feature = "nix", ignore)] + #[nativelink_test] 
+ async fn entrypoint_sends_timeout_via_side_channel() -> Result<(), Box> + { + #[cfg(target_family = "unix")] + const TEST_WRAPPER_SCRIPT_CONTENT: &str = "\ +#!/bin/bash +echo '{\"failure\":\"timeout\"}' > \"$SIDE_CHANNEL_FILE\" +exit 1 +"; + #[cfg(target_family = "windows")] + const TEST_WRAPPER_SCRIPT_CONTENT: &str = "\ +@echo off +echo | set /p={\"failure\":\"timeout\"} 1>&2 > %SIDE_CHANNEL_FILE% +exit 1 +"; + const WORKER_ID: &str = "foo_worker_id"; - #[cfg(target_family = "unix")] - let arguments = vec!["sh".to_string(), "-c".to_string(), "exit 33".to_string()]; - #[cfg(target_family = "windows")] - let arguments = vec!["cmd".to_string(), "/C".to_string(), "exit 33".to_string()]; + let (_, _, cas_store, ac_store) = setup_stores().await?; + let root_action_directory = make_temp_path("root_action_directory"); + fs::create_dir_all(&root_action_directory).await?; + + let test_wrapper_script = { + let test_wrapper_dir = make_temp_path("wrapper_dir"); + fs::create_dir_all(&test_wrapper_dir).await?; + #[cfg(target_family = "unix")] + let test_wrapper_script = OsString::from(test_wrapper_dir + "/test_wrapper_script.sh"); + #[cfg(target_family = "windows")] + let test_wrapper_script = + OsString::from(test_wrapper_dir + "\\test_wrapper_script.bat"); + { + let mut file_options = std::fs::OpenOptions::new(); + file_options.create(true); + file_options.truncate(true); + file_options.write(true); + #[cfg(target_family = "unix")] + file_options.mode(0o777); + let mut test_wrapper_script_handle = file_options + .open(OsString::from(&test_wrapper_script)) + .unwrap(); + test_wrapper_script_handle + .write_all(TEST_WRAPPER_SCRIPT_CONTENT.as_bytes()) + .unwrap(); + test_wrapper_script_handle.sync_all().unwrap(); + // Note: Github runners appear to use some kind of filesystem driver + // that does not sync data as expected. This is the easiest solution. 
+ // See: https://github.com/pantsbuild/pants/issues/10507 + // See: https://github.com/moby/moby/issues/9547 + std::process::Command::new("sync").output().unwrap(); + } + test_wrapper_script + }; - let action_result = { + let running_actions_manager = + Arc::new(RunningActionsManagerImpl::new(RunningActionsManagerArgs { + root_action_directory: root_action_directory.clone(), + execution_configuration: ExecutionConfiguration { + entrypoint: Some(test_wrapper_script.into_string().unwrap()), + additional_environment: Some(HashMap::from([( + "SIDE_CHANNEL_FILE".to_string(), + EnvironmentSource::SideChannelFile, + )])), + }, + cas_store: cas_store.clone(), + ac_store: Some(Store::new(ac_store.clone())), + historical_store: Store::new(cas_store.clone()), + upload_action_result_config: + &nativelink_config::cas_server::UploadActionResultConfig { + upload_ac_results_strategy: + nativelink_config::cas_server::UploadCacheResultsStrategy::Never, + ..Default::default() + }, + max_action_timeout: Duration::MAX, + max_upload_timeout: Duration::from_secs(DEFAULT_MAX_UPLOAD_TIMEOUT), + timeout_handled_externally: false, + directory_cache: None, + })?); + let arguments = vec!["true".to_string()]; let command = Command { arguments, - output_paths: vec![], working_directory: ".".to_string(), environment_variables: vec![EnvironmentVariable { name: "PATH".to_string(), @@ -1316,1172 +2014,790 @@ async fn cleanup_happens_on_job_failure() -> Result<(), Box Result<(), Box> { + let (_, _, cas_store, ac_store) = setup_stores().await?; + + let running_actions_manager = + Arc::new(RunningActionsManagerImpl::new(RunningActionsManagerArgs { + root_action_directory: String::new(), + execution_configuration: ExecutionConfiguration::default(), + cas_store: cas_store.clone(), + ac_store: Some(Store::new(ac_store.clone())), + historical_store: Store::new(cas_store.clone()), + upload_action_result_config: + &nativelink_config::cas_server::UploadActionResultConfig { + upload_ac_results_strategy: + 
nativelink_config::cas_server::UploadCacheResultsStrategy::SuccessOnly, + ..Default::default() + }, + max_action_timeout: Duration::MAX, + max_upload_timeout: Duration::from_secs(DEFAULT_MAX_UPLOAD_TIMEOUT), + timeout_handled_externally: false, + directory_cache: None, + })?); + + let action_digest = DigestInfo::new([2u8; 32], 32); + let mut action_result = ActionResult { + output_files: vec![FileInfo { + name_or_path: NameOrPath::Path("test.txt".to_string()), + digest: DigestInfo::try_new( + "a665a45920422f9d417e4867efdc4fb8a04a1f3fff1fa07e998e86f7f7a27ae3", + 3, + )?, + is_executable: false, + }], + stdout_digest: DigestInfo::try_new( + "426afaf613d8cfdd9fa8addcc030ae6c95a7950ae0301164af1d5851012081d5", + 10, + )?, + stderr_digest: DigestInfo::try_new( + "7b2e400d08b8e334e3172d105be308b506c6036c62a9bde5c509d7808b28b213", + 10, + )?, + exit_code: 0, + output_folders: vec![], + output_file_symlinks: vec![], + output_directory_symlinks: vec![], + server_logs: HashMap::new(), + execution_metadata: ExecutionMetadata { + worker: "WORKER_ID".to_string(), + queued_timestamp: SystemTime::UNIX_EPOCH, + worker_start_timestamp: make_system_time(0), + input_fetch_start_timestamp: make_system_time(1), + input_fetch_completed_timestamp: make_system_time(2), + execution_start_timestamp: make_system_time(3), + execution_completed_timestamp: make_system_time(4), + output_upload_start_timestamp: make_system_time(5), + output_upload_completed_timestamp: make_system_time(6), + worker_completed_timestamp: make_system_time(7), + }, + error: None, + message: String::new(), + }; + running_actions_manager + .cache_action_result(action_digest, &mut action_result, DigestHasherFunc::Sha256) + .await?; + + let retrieved_result = + get_and_decode_digest::(ac_store.as_ref(), action_digest.into()) + .await?; + + let proto_result: ProtoActionResult = action_result.try_into()?; + assert_eq!(proto_result, retrieved_result); + + Ok(()) + } + + #[nativelink_test] + async fn 
failed_action_does_not_cache_in_action_cache() + -> Result<(), Box> { + let (_, _, cas_store, ac_store) = setup_stores().await?; + + let running_actions_manager = + Arc::new(RunningActionsManagerImpl::new(RunningActionsManagerArgs { + root_action_directory: String::new(), + execution_configuration: ExecutionConfiguration::default(), + cas_store: cas_store.clone(), + ac_store: Some(Store::new(ac_store.clone())), + historical_store: Store::new(cas_store.clone()), + upload_action_result_config: + &nativelink_config::cas_server::UploadActionResultConfig { + upload_ac_results_strategy: + nativelink_config::cas_server::UploadCacheResultsStrategy::Everything, + ..Default::default() + }, + max_action_timeout: Duration::MAX, + max_upload_timeout: Duration::from_secs(DEFAULT_MAX_UPLOAD_TIMEOUT), + timeout_handled_externally: false, + directory_cache: None, + })?); + + let action_digest = DigestInfo::new([2u8; 32], 32); + let mut action_result = ActionResult { + output_files: vec![FileInfo { + name_or_path: NameOrPath::Path("test.txt".to_string()), + digest: DigestInfo::try_new( + "a665a45920422f9d417e4867efdc4fb8a04a1f3fff1fa07e998e86f7f7a27ae3", + 3, + )?, + is_executable: false, + }], stdout_digest: DigestInfo::try_new( - "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855", - 0 + "426afaf613d8cfdd9fa8addcc030ae6c95a7950ae0301164af1d5851012081d5", + 10, )?, stderr_digest: DigestInfo::try_new( - "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855", - 0 + "7b2e400d08b8e334e3172d105be308b506c6036c62a9bde5c509d7808b28b213", + 10, )?, - exit_code: 33, + exit_code: 1, output_folders: vec![], output_file_symlinks: vec![], output_directory_symlinks: vec![], server_logs: HashMap::new(), execution_metadata: ExecutionMetadata { - worker: WORKER_ID.to_string(), - queued_timestamp, - worker_start_timestamp: increment_clock(&mut clock_time), - input_fetch_start_timestamp: increment_clock(&mut clock_time), - input_fetch_completed_timestamp: 
increment_clock(&mut clock_time), - execution_start_timestamp: increment_clock(&mut clock_time), - execution_completed_timestamp: increment_clock(&mut clock_time), - output_upload_start_timestamp: increment_clock(&mut clock_time), - output_upload_completed_timestamp: increment_clock(&mut clock_time), - worker_completed_timestamp: increment_clock(&mut clock_time), + worker: "WORKER_ID".to_string(), + queued_timestamp: SystemTime::UNIX_EPOCH, + worker_start_timestamp: make_system_time(0), + input_fetch_start_timestamp: make_system_time(1), + input_fetch_completed_timestamp: make_system_time(2), + execution_start_timestamp: make_system_time(3), + execution_completed_timestamp: make_system_time(4), + output_upload_start_timestamp: make_system_time(5), + output_upload_completed_timestamp: make_system_time(6), + worker_completed_timestamp: make_system_time(7), }, error: None, message: String::new(), - } - ); - let mut dir_stream = fs::read_dir(&root_action_directory).await?; - assert!( - dir_stream.as_mut().next_entry().await?.is_none(), - "Expected empty directory at {root_action_directory}" - ); - Ok(()) -} - -#[serial] -#[nativelink_test] -async fn kill_ends_action() -> Result<(), Box> { - const WORKER_ID: &str = "foo_worker_id"; - - let (_, _, cas_store, ac_store) = setup_stores().await?; - let root_action_directory = make_temp_path("root_action_directory"); - fs::create_dir_all(&root_action_directory).await?; - - let running_actions_manager = - Arc::new(RunningActionsManagerImpl::new(RunningActionsManagerArgs { - root_action_directory: root_action_directory.clone(), - execution_configuration: ExecutionConfiguration::default(), - cas_store: cas_store.clone(), - ac_store: Some(Store::new(ac_store.clone())), - historical_store: Store::new(cas_store.clone()), - upload_action_result_config: &nativelink_config::cas_server::UploadActionResultConfig { - upload_ac_results_strategy: - nativelink_config::cas_server::UploadCacheResultsStrategy::Never, - ..Default::default() - 
}, - max_action_timeout: Duration::MAX, - timeout_handled_externally: false, - })?); - - #[cfg(target_family = "unix")] - let (arguments, process_started_file) = { - let process_started_file = { - let tmp_dir = make_temp_path("root_action_directory"); - fs::create_dir_all(&tmp_dir).await.unwrap(); - format!("{tmp_dir}/process_started") }; - ( - vec![ - "sh".to_string(), - "-c".to_string(), - format!("touch {process_started_file} && sleep infinity"), - ], - process_started_file, - ) - }; - #[cfg(target_family = "windows")] - // Windows is weird with timeout, so we use ping. See: - // https://www.ibm.com/support/pages/timeout-command-run-batch-job-exits-immediately-and-returns-error-input-redirection-not-supported-exiting-process-immediately - let arguments = vec![ - "cmd".to_string(), - "/C".to_string(), - "ping -n 99999 127.0.0.1".to_string(), - ]; - - let command = Command { - arguments, - output_paths: vec![], - working_directory: ".".to_string(), - environment_variables: vec![EnvironmentVariable { - name: "PATH".to_string(), - value: env::var("PATH").unwrap(), - }], - ..Default::default() - }; - let command_digest = serialize_and_upload_message( - &command, - cas_store.as_pin(), - &mut DigestHasherFunc::Sha256.hasher(), - ) - .await?; - let input_root_digest = serialize_and_upload_message( - &Directory::default(), - cas_store.as_pin(), - &mut DigestHasherFunc::Sha256.hasher(), - ) - .await?; - let action = Action { - command_digest: Some(command_digest.into()), - input_root_digest: Some(input_root_digest.into()), - ..Default::default() - }; - let action_digest = serialize_and_upload_message( - &action, - cas_store.as_pin(), - &mut DigestHasherFunc::Sha256.hasher(), - ) - .await?; - - let execute_request = ExecuteRequest { - action_digest: Some(action_digest.into()), - ..Default::default() - }; - let operation_id = OperationId::default().to_string(); - - let running_action_impl = running_actions_manager - .clone() - .create_and_add_action( - 
WORKER_ID.to_string(), - StartExecute { - execute_request: Some(execute_request), - operation_id, - queued_timestamp: Some(make_system_time(1000).into()), - platform: action.platform.clone(), - worker_id: WORKER_ID.to_string(), - }, - ) - .await?; + running_actions_manager + .cache_action_result(action_digest, &mut action_result, DigestHasherFunc::Sha256) + .await?; - let run_action_fut = run_action(running_action_impl); - tokio::pin!(run_action_fut); + let retrieved_result = + get_and_decode_digest::(ac_store.as_ref(), action_digest.into()) + .await?; - #[cfg(target_family = "unix")] - loop { - assert_eq!(futures::poll!(&mut run_action_fut), Poll::Pending); - tokio::task::yield_now().await; - match fs::metadata(&process_started_file).await { - Ok(_) => break, - Err(err) => { - assert_eq!(err.code, Code::NotFound, "Unknown error {err:?}"); - tokio::time::sleep(Duration::from_millis(1)).await; - } - } - } + let proto_result: ProtoActionResult = action_result.try_into()?; + assert_eq!(proto_result, retrieved_result); - let result = futures::join!(run_action_fut, running_actions_manager.kill_all()) - .0 - .unwrap(); - - // Check that the action was killed. - #[cfg(all(target_family = "unix", not(target_os = "macos")))] - assert_eq!(9, result.exit_code, "Wrong exit_code - {result:?}"); - // Mac for some reason sometimes returns 1 and 9. - #[cfg(all(target_family = "unix", target_os = "macos"))] - assert!( - 9 == result.exit_code || 1 == result.exit_code, - "Wrong exit_code - {result:?}" - ); - // Note: Windows kill command returns exit code 1. - #[cfg(target_family = "windows")] - assert_eq!(1, result.exit_code); - - Ok(()) -} + Ok(()) + } -// This script runs a command under a wrapper script set in a config. -// The wrapper script will print a constant string to stderr, and the test itself will -// print to stdout. We then check the results of both to make sure the shell script was -// invoked and the actual command was invoked under the shell script. 
-#[cfg_attr(feature = "nix", ignore)] -#[serial] -#[nativelink_test] -async fn entrypoint_does_invoke_if_set() -> Result<(), Box> { - #[cfg(target_family = "unix")] - const TEST_WRAPPER_SCRIPT_CONTENT: &str = "\ -#!/usr/bin/env bash -# Print some static text to stderr. This is what the test uses to -# make sure the script did run. ->&2 printf \"Wrapper script did run\" + #[nativelink_test] + async fn success_does_cache_in_historical_results() -> Result<(), Box> { + let (_, _, cas_store, ac_store) = setup_stores().await?; -# Now run the real command. -exec \"$@\" -"; - #[cfg(target_family = "windows")] - const TEST_WRAPPER_SCRIPT_CONTENT: &str = "\ -@echo off -:: Print some static text to stderr. This is what the test uses to -:: make sure the script did run. -echo | set /p=\"Wrapper script did run\" 1>&2 + let running_actions_manager = + Arc::new(RunningActionsManagerImpl::new(RunningActionsManagerArgs { + root_action_directory: String::new(), + execution_configuration: ExecutionConfiguration::default(), + cas_store: cas_store.clone(), + ac_store: Some(Store::new(ac_store.clone())), + historical_store: Store::new(cas_store.clone()), + upload_action_result_config: + &nativelink_config::cas_server::UploadActionResultConfig { + upload_historical_results_strategy: Some( + nativelink_config::cas_server::UploadCacheResultsStrategy::SuccessOnly, + ), + #[expect( + clippy::literal_string_with_formatting_args, + reason = "passed to `formatx` crate for runtime interpretation" + )] + success_message_template: + "{historical_results_hash}-{historical_results_size}".to_string(), + ..Default::default() + }, + max_action_timeout: Duration::MAX, + max_upload_timeout: Duration::from_secs(DEFAULT_MAX_UPLOAD_TIMEOUT), + timeout_handled_externally: false, + directory_cache: None, + })?); -:: Run command, but morph the echo to ensure it doesn't -:: add a new line to the end of the output. 
-%1 | set /p=%2 -exit 0 -"; - const WORKER_ID: &str = "foo_worker_id"; - const EXPECTED_STDOUT: &str = "Action did run"; + let action_digest = DigestInfo::new([2u8; 32], 32); + let mut action_result = ActionResult { + output_files: vec![FileInfo { + name_or_path: NameOrPath::Path("test.txt".to_string()), + digest: DigestInfo::try_new( + "a665a45920422f9d417e4867efdc4fb8a04a1f3fff1fa07e998e86f7f7a27ae3", + 3, + )?, + is_executable: false, + }], + stdout_digest: DigestInfo::try_new( + "426afaf613d8cfdd9fa8addcc030ae6c95a7950ae0301164af1d5851012081d5", + 10, + )?, + stderr_digest: DigestInfo::try_new( + "7b2e400d08b8e334e3172d105be308b506c6036c62a9bde5c509d7808b28b213", + 10, + )?, + exit_code: 0, + output_folders: vec![], + output_file_symlinks: vec![], + output_directory_symlinks: vec![], + server_logs: HashMap::new(), + execution_metadata: ExecutionMetadata { + worker: "WORKER_ID".to_string(), + queued_timestamp: SystemTime::UNIX_EPOCH, + worker_start_timestamp: make_system_time(0), + input_fetch_start_timestamp: make_system_time(1), + input_fetch_completed_timestamp: make_system_time(2), + execution_start_timestamp: make_system_time(3), + execution_completed_timestamp: make_system_time(4), + output_upload_start_timestamp: make_system_time(5), + output_upload_completed_timestamp: make_system_time(6), + worker_completed_timestamp: make_system_time(7), + }, + error: None, + message: String::new(), + }; + running_actions_manager + .cache_action_result(action_digest, &mut action_result, DigestHasherFunc::Sha256) + .await?; - let (_, _, cas_store, ac_store) = setup_stores().await?; - let root_action_directory = make_temp_path("root_action_directory"); - fs::create_dir_all(&root_action_directory).await?; + assert!(!action_result.message.is_empty(), "Message should be set"); - let test_wrapper_script = { - let test_wrapper_dir = make_temp_path("wrapper_dir"); - fs::create_dir_all(&test_wrapper_dir).await?; - #[cfg(target_family = "unix")] - let test_wrapper_script = 
OsString::from(test_wrapper_dir + "/test_wrapper_script.sh"); - #[cfg(target_family = "windows")] - let test_wrapper_script = OsString::from(test_wrapper_dir + "\\test_wrapper_script.bat"); - { - let mut file_options = std::fs::OpenOptions::new(); - file_options.create(true); - file_options.truncate(true); - file_options.write(true); - #[cfg(target_family = "unix")] - file_options.mode(0o777); - let mut test_wrapper_script_handle = file_options - .open(OsString::from(&test_wrapper_script)) - .unwrap(); - test_wrapper_script_handle - .write_all(TEST_WRAPPER_SCRIPT_CONTENT.as_bytes()) - .unwrap(); - test_wrapper_script_handle.sync_all().unwrap(); - // Note: Github runners appear to use some kind of filesystem driver - // that does not sync data as expected. This is the easiest solution. - // See: https://github.com/pantsbuild/pants/issues/10507 - // See: https://github.com/moby/moby/issues/9547 - std::process::Command::new("sync").output().unwrap(); - } - test_wrapper_script - }; + let historical_digest = { + let (historical_results_hash, historical_results_size) = action_result + .message + .split_once('-') + .expect("Message should be in format {hash}-{size}"); - let running_actions_manager = - Arc::new(RunningActionsManagerImpl::new(RunningActionsManagerArgs { - root_action_directory: root_action_directory.clone(), - execution_configuration: ExecutionConfiguration { - entrypoint: Some(test_wrapper_script.into_string().unwrap()), - additional_environment: None, - }, - cas_store: cas_store.clone(), - ac_store: Some(Store::new(ac_store.clone())), - historical_store: Store::new(cas_store.clone()), - upload_action_result_config: &nativelink_config::cas_server::UploadActionResultConfig { - upload_ac_results_strategy: - nativelink_config::cas_server::UploadCacheResultsStrategy::Never, - ..Default::default() - }, - max_action_timeout: Duration::MAX, - timeout_handled_externally: false, - })?); - #[cfg(target_family = "unix")] - let arguments = vec!["printf".to_string(), 
EXPECTED_STDOUT.to_string()]; - #[cfg(target_family = "windows")] - let arguments = vec!["echo".to_string(), EXPECTED_STDOUT.to_string()]; - let command = Command { - arguments, - working_directory: ".".to_string(), - environment_variables: vec![EnvironmentVariable { - name: "PATH".to_string(), - value: env::var("PATH").unwrap(), - }], - ..Default::default() - }; - let command_digest = serialize_and_upload_message( - &command, - cas_store.as_pin(), - &mut DigestHasherFunc::Sha256.hasher(), - ) - .await?; - let input_root_digest = serialize_and_upload_message( - &Directory::default(), - cas_store.as_pin(), - &mut DigestHasherFunc::Sha256.hasher(), - ) - .await?; - let action = Action { - command_digest: Some(command_digest.into()), - input_root_digest: Some(input_root_digest.into()), - ..Default::default() - }; - let action_digest = serialize_and_upload_message( - &action, - cas_store.as_pin(), - &mut DigestHasherFunc::Sha256.hasher(), - ) - .await?; - - let execute_request = ExecuteRequest { - action_digest: Some(action_digest.into()), - ..Default::default() - }; - let operation_id = OperationId::default().to_string(); - - let running_action_impl = running_actions_manager - .clone() - .create_and_add_action( - WORKER_ID.to_string(), - StartExecute { - execute_request: Some(execute_request), - operation_id, - queued_timestamp: Some(make_system_time(1000).into()), - platform: action.platform.clone(), - worker_id: WORKER_ID.to_string(), - }, + DigestInfo::try_new( + historical_results_hash, + historical_results_size.parse::()?, + )? 
+ }; + let retrieved_result = get_and_decode_digest::( + cas_store.as_ref(), + historical_digest.into(), ) .await?; - let result = run_action(running_action_impl).await?; - assert_eq!(result.exit_code, 0, "Exit code should be 0"); - - let expected_stdout = DigestHasherFunc::Sha256 - .hasher() - .compute_from_reader(Cursor::new(EXPECTED_STDOUT)) - .await?; - // Note: This string should match what is in worker_for_test.sh - let expected_stderr = DigestHasherFunc::Sha256 - .hasher() - .compute_from_reader(Cursor::new("Wrapper script did run")) - .await?; - assert_eq!(expected_stdout, result.stdout_digest); - assert_eq!(expected_stderr, result.stderr_digest); - - Ok(()) -} - -#[cfg_attr(feature = "nix", ignore)] -#[serial] -#[nativelink_test] -async fn entrypoint_injects_properties() -> Result<(), Box> { - #[cfg(target_family = "unix")] - const TEST_WRAPPER_SCRIPT_CONTENT: &str = "\ -#!/usr/bin/env bash -# Print some static text to stderr. This is what the test uses to -# make sure the script did run. ->&2 printf \"Wrapper script did run with property $PROPERTY $VALUE $INNER_TIMEOUT\" - -# Now run the real command. -exec \"$@\" -"; - #[cfg(target_family = "windows")] - const TEST_WRAPPER_SCRIPT_CONTENT: &str = "\ -@echo off -:: Print some static text to stderr. This is what the test uses to -:: make sure the script did run. -echo | set /p=\"Wrapper script did run with property %PROPERTY% %VALUE% %INNER_TIMEOUT%\" 1>&2 + assert_eq!( + HistoricalExecuteResponse { + action_digest: Some(action_digest.into()), + execute_response: Some(ExecuteResponse { + result: Some(action_result.try_into()?), + status: Some(Status::default()), + ..Default::default() + }), + }, + retrieved_result + ); -:: Run command, but morph the echo to ensure it doesn't -:: add a new line to the end of the output. 
-%1 | set /p=%2 -exit 0 -"; - const WORKER_ID: &str = "foo_worker_id"; - const EXPECTED_STDOUT: &str = "Action did run"; - const TASK_TIMEOUT: Duration = Duration::from_secs(122); + Ok(()) + } - let (_, _, cas_store, ac_store) = setup_stores().await?; - let root_action_directory = make_temp_path("root_action_directory"); - fs::create_dir_all(&root_action_directory).await?; + #[nativelink_test] + async fn failure_does_not_cache_in_historical_results() + -> Result<(), Box> { + let (_, _, cas_store, ac_store) = setup_stores().await?; - let test_wrapper_script = { - let test_wrapper_dir = make_temp_path("wrapper_dir"); - fs::create_dir_all(&test_wrapper_dir).await?; - #[cfg(target_family = "unix")] - let test_wrapper_script = OsString::from(test_wrapper_dir + "/test_wrapper_script.sh"); - #[cfg(target_family = "windows")] - let test_wrapper_script = OsString::from(test_wrapper_dir + "\\test_wrapper_script.bat"); - { - let mut file_options = std::fs::OpenOptions::new(); - file_options.create(true); - file_options.truncate(true); - file_options.write(true); - #[cfg(target_family = "unix")] - file_options.mode(0o777); - let mut test_wrapper_script_handle = file_options - .open(OsString::from(&test_wrapper_script)) - .unwrap(); - test_wrapper_script_handle - .write_all(TEST_WRAPPER_SCRIPT_CONTENT.as_bytes()) - .unwrap(); - test_wrapper_script_handle.sync_all().unwrap(); - // Note: Github runners appear to use some kind of filesystem driver - // that does not sync data as expected. This is the easiest solution. 
- // See: https://github.com/pantsbuild/pants/issues/10507 - // See: https://github.com/moby/moby/issues/9547 - std::process::Command::new("sync").output().unwrap(); - } - test_wrapper_script - }; + let running_actions_manager = + Arc::new(RunningActionsManagerImpl::new(RunningActionsManagerArgs { + root_action_directory: String::new(), + execution_configuration: ExecutionConfiguration::default(), + cas_store: cas_store.clone(), + ac_store: Some(Store::new(ac_store.clone())), + historical_store: Store::new(cas_store.clone()), + upload_action_result_config: + &nativelink_config::cas_server::UploadActionResultConfig { + upload_historical_results_strategy: Some( + nativelink_config::cas_server::UploadCacheResultsStrategy::SuccessOnly, + ), + success_message_template: + "{historical_results_hash}-{historical_results_size}".to_string(), + ..Default::default() + }, + max_action_timeout: Duration::MAX, + max_upload_timeout: Duration::from_secs(DEFAULT_MAX_UPLOAD_TIMEOUT), + timeout_handled_externally: false, + directory_cache: None, + })?); - let running_actions_manager = - Arc::new(RunningActionsManagerImpl::new(RunningActionsManagerArgs { - root_action_directory: root_action_directory.clone(), - execution_configuration: ExecutionConfiguration { - entrypoint: Some(test_wrapper_script.into_string().unwrap()), - additional_environment: Some(HashMap::from([ - ( - "PROPERTY".to_string(), - EnvironmentSource::Property("property_name".to_string()), - ), - ( - "VALUE".to_string(), - EnvironmentSource::Value("raw_value".to_string()), - ), - ( - "INNER_TIMEOUT".to_string(), - EnvironmentSource::TimeoutMillis, - ), - ( - "PATH".to_string(), - EnvironmentSource::Value(env::var("PATH").unwrap()), - ), - ])), - }, - cas_store: cas_store.clone(), - ac_store: Some(Store::new(ac_store.clone())), - historical_store: Store::new(cas_store.clone()), - upload_action_result_config: &nativelink_config::cas_server::UploadActionResultConfig { - upload_ac_results_strategy: - 
nativelink_config::cas_server::UploadCacheResultsStrategy::Never, - ..Default::default() - }, - max_action_timeout: Duration::MAX, - timeout_handled_externally: false, - })?); - #[cfg(target_family = "unix")] - let arguments = vec!["printf".to_string(), EXPECTED_STDOUT.to_string()]; - #[cfg(target_family = "windows")] - let arguments = vec!["echo".to_string(), EXPECTED_STDOUT.to_string()]; - let command = Command { - arguments, - working_directory: ".".to_string(), - environment_variables: vec![EnvironmentVariable { - name: "PATH".to_string(), - value: env::var("PATH").unwrap(), - }], - ..Default::default() - }; - let command_digest = serialize_and_upload_message( - &command, - cas_store.as_pin(), - &mut DigestHasherFunc::Sha256.hasher(), - ) - .await?; - let input_root_digest = serialize_and_upload_message( - &Directory::default(), - cas_store.as_pin(), - &mut DigestHasherFunc::Sha256.hasher(), - ) - .await?; - let action = Action { - command_digest: Some(command_digest.into()), - input_root_digest: Some(input_root_digest.into()), - platform: Some(Platform { - properties: vec![Property { - name: "property_name".into(), - value: "property_value".into(), - }], - }), - timeout: Some(prost_types::Duration { - seconds: TASK_TIMEOUT.as_secs() as i64, - nanos: 0, - }), - ..Default::default() - }; - let action_digest = serialize_and_upload_message( - &action, - cas_store.as_pin(), - &mut DigestHasherFunc::Sha256.hasher(), - ) - .await?; - - let execute_request = ExecuteRequest { - action_digest: Some(action_digest.into()), - ..Default::default() - }; - let operation_id = OperationId::default().to_string(); - - let running_action_impl = running_actions_manager - .clone() - .create_and_add_action( - WORKER_ID.to_string(), - StartExecute { - execute_request: Some(execute_request), - operation_id, - queued_timestamp: Some(make_system_time(1000).into()), - platform: action.platform.clone(), - worker_id: WORKER_ID.to_string(), - }, - ) - .await?; + let action_digest = 
DigestInfo::new([2u8; 32], 32); + let mut action_result = ActionResult { + exit_code: 1, + ..Default::default() + }; + running_actions_manager + .cache_action_result(action_digest, &mut action_result, DigestHasherFunc::Sha256) + .await?; - let result = run_action(running_action_impl).await?; - assert_eq!(result.exit_code, 0, "Exit code should be 0"); + assert!( + action_result.message.is_empty(), + "Message should not be set" + ); + Ok(()) + } - let expected_stdout = DigestHasherFunc::Sha256 - .hasher() - .compute_from_reader(Cursor::new(EXPECTED_STDOUT)) - .await?; - // Note: This string should match what is in worker_for_test.sh - let expected_stderr = "Wrapper script did run with property property_value raw_value 122000"; - let expected_stderr_digest = DigestHasherFunc::Sha256 - .hasher() - .compute_from_reader(Cursor::new(expected_stderr)) - .await?; + #[nativelink_test] + async fn infra_failure_does_cache_in_historical_results() + -> Result<(), Box> { + let (_, _, cas_store, ac_store) = setup_stores().await?; - let actual_stderr: bytes::Bytes = cas_store - .as_ref() - .get_part_unchunked(result.stderr_digest, 0, None) - .await?; - let actual_stderr_decoded = from_utf8(&actual_stderr)?; - assert_eq!(expected_stderr, actual_stderr_decoded); - assert_eq!(expected_stdout, result.stdout_digest); - assert_eq!(expected_stderr_digest, result.stderr_digest); + let running_actions_manager = + Arc::new(RunningActionsManagerImpl::new(RunningActionsManagerArgs { + root_action_directory: String::new(), + execution_configuration: ExecutionConfiguration::default(), + cas_store: cas_store.clone(), + ac_store: Some(Store::new(ac_store.clone())), + historical_store: Store::new(cas_store.clone()), + upload_action_result_config: + &nativelink_config::cas_server::UploadActionResultConfig { + upload_historical_results_strategy: Some( + nativelink_config::cas_server::UploadCacheResultsStrategy::FailuresOnly, + ), + #[expect( + clippy::literal_string_with_formatting_args, + reason = 
"passed to `formatx` crate for runtime interpretation" + )] + failure_message_template: + "{historical_results_hash}-{historical_results_size}".to_string(), + ..Default::default() + }, + max_action_timeout: Duration::MAX, + max_upload_timeout: Duration::from_secs(DEFAULT_MAX_UPLOAD_TIMEOUT), + timeout_handled_externally: false, + directory_cache: None, + })?); - Ok(()) -} + let action_digest = DigestInfo::new([2u8; 32], 32); + let mut action_result = ActionResult { + exit_code: 0, + error: Some(make_input_err!("test error")), + ..Default::default() + }; + running_actions_manager + .cache_action_result(action_digest, &mut action_result, DigestHasherFunc::Sha256) + .await?; -#[cfg_attr(feature = "nix", ignore)] -#[serial] -#[nativelink_test] -async fn entrypoint_sends_timeout_via_side_channel() -> Result<(), Box> { - #[cfg(target_family = "unix")] - const TEST_WRAPPER_SCRIPT_CONTENT: &str = "\ -#!/bin/bash -echo '{\"failure\":\"timeout\"}' > \"$SIDE_CHANNEL_FILE\" -exit 1 -"; - #[cfg(target_family = "windows")] - const TEST_WRAPPER_SCRIPT_CONTENT: &str = "\ -@echo off -echo | set /p={\"failure\":\"timeout\"} 1>&2 > %SIDE_CHANNEL_FILE% -exit 1 -"; - const WORKER_ID: &str = "foo_worker_id"; + assert!(!action_result.message.is_empty(), "Message should be set"); - let (_, _, cas_store, ac_store) = setup_stores().await?; - let root_action_directory = make_temp_path("root_action_directory"); - fs::create_dir_all(&root_action_directory).await?; + let historical_digest = { + let (historical_results_hash, historical_results_size) = action_result + .message + .split_once('-') + .expect("Message should be in format {hash}-{size}"); - let test_wrapper_script = { - let test_wrapper_dir = make_temp_path("wrapper_dir"); - fs::create_dir_all(&test_wrapper_dir).await?; - #[cfg(target_family = "unix")] - let test_wrapper_script = OsString::from(test_wrapper_dir + "/test_wrapper_script.sh"); - #[cfg(target_family = "windows")] - let test_wrapper_script = OsString::from(test_wrapper_dir 
+ "\\test_wrapper_script.bat"); - { - let mut file_options = std::fs::OpenOptions::new(); - file_options.create(true); - file_options.truncate(true); - file_options.write(true); - #[cfg(target_family = "unix")] - file_options.mode(0o777); - let mut test_wrapper_script_handle = file_options - .open(OsString::from(&test_wrapper_script)) - .unwrap(); - test_wrapper_script_handle - .write_all(TEST_WRAPPER_SCRIPT_CONTENT.as_bytes()) - .unwrap(); - test_wrapper_script_handle.sync_all().unwrap(); - // Note: Github runners appear to use some kind of filesystem driver - // that does not sync data as expected. This is the easiest solution. - // See: https://github.com/pantsbuild/pants/issues/10507 - // See: https://github.com/moby/moby/issues/9547 - std::process::Command::new("sync").output().unwrap(); - } - test_wrapper_script - }; + DigestInfo::try_new( + historical_results_hash, + historical_results_size.parse::()?, + )? + }; - let running_actions_manager = - Arc::new(RunningActionsManagerImpl::new(RunningActionsManagerArgs { - root_action_directory: root_action_directory.clone(), - execution_configuration: ExecutionConfiguration { - entrypoint: Some(test_wrapper_script.into_string().unwrap()), - additional_environment: Some(HashMap::from([( - "SIDE_CHANNEL_FILE".to_string(), - EnvironmentSource::SideChannelFile, - )])), - }, - cas_store: cas_store.clone(), - ac_store: Some(Store::new(ac_store.clone())), - historical_store: Store::new(cas_store.clone()), - upload_action_result_config: &nativelink_config::cas_server::UploadActionResultConfig { - upload_ac_results_strategy: - nativelink_config::cas_server::UploadCacheResultsStrategy::Never, - ..Default::default() - }, - max_action_timeout: Duration::MAX, - timeout_handled_externally: false, - })?); - let arguments = vec!["true".to_string()]; - let command = Command { - arguments, - working_directory: ".".to_string(), - environment_variables: vec![EnvironmentVariable { - name: "PATH".to_string(), - value: 
env::var("PATH").unwrap(), - }], - ..Default::default() - }; - let command_digest = serialize_and_upload_message( - &command, - cas_store.as_pin(), - &mut DigestHasherFunc::Sha256.hasher(), - ) - .await?; - let input_root_digest = serialize_and_upload_message( - &Directory::default(), - cas_store.as_pin(), - &mut DigestHasherFunc::Sha256.hasher(), - ) - .await?; - let action = Action { - command_digest: Some(command_digest.into()), - input_root_digest: Some(input_root_digest.into()), - ..Default::default() - }; - let action_digest = serialize_and_upload_message( - &action, - cas_store.as_pin(), - &mut DigestHasherFunc::Sha256.hasher(), - ) - .await?; - - let execute_request = ExecuteRequest { - action_digest: Some(action_digest.into()), - ..Default::default() - }; - let operation_id = OperationId::default().to_string(); - - let running_action_impl = running_actions_manager - .clone() - .create_and_add_action( - WORKER_ID.to_string(), - StartExecute { - execute_request: Some(execute_request), - operation_id, - queued_timestamp: Some(make_system_time(1000).into()), - platform: action.platform.clone(), - worker_id: WORKER_ID.to_string(), - }, + let retrieved_result = get_and_decode_digest::( + cas_store.as_ref(), + historical_digest.into(), ) .await?; - let result = run_action(running_action_impl).await?; - assert_eq!(result.exit_code, 1, "Exit code should be 1"); - assert_eq!( - result.error.err_tip(|| "Error should exist")?.code, - Code::DeadlineExceeded - ); - Ok(()) -} - -#[serial] -#[nativelink_test] -async fn caches_results_in_action_cache_store() -> Result<(), Box> { - let (_, _, cas_store, ac_store) = setup_stores().await?; - - let running_actions_manager = - Arc::new(RunningActionsManagerImpl::new(RunningActionsManagerArgs { - root_action_directory: String::new(), - execution_configuration: ExecutionConfiguration::default(), - cas_store: cas_store.clone(), - ac_store: Some(Store::new(ac_store.clone())), - historical_store: Store::new(cas_store.clone()), - 
upload_action_result_config: &nativelink_config::cas_server::UploadActionResultConfig { - upload_ac_results_strategy: - nativelink_config::cas_server::UploadCacheResultsStrategy::SuccessOnly, - ..Default::default() + assert_eq!( + HistoricalExecuteResponse { + action_digest: Some(action_digest.into()), + execute_response: Some(ExecuteResponse { + result: Some(action_result.try_into()?), + status: Some(make_input_err!("test error").into()), + ..Default::default() + }), }, - max_action_timeout: Duration::MAX, - timeout_handled_externally: false, - })?); - - let action_digest = DigestInfo::new([2u8; 32], 32); - let mut action_result = ActionResult { - output_files: vec![FileInfo { - name_or_path: NameOrPath::Path("test.txt".to_string()), - digest: DigestInfo::try_new( - "a665a45920422f9d417e4867efdc4fb8a04a1f3fff1fa07e998e86f7f7a27ae3", - 3, - )?, - is_executable: false, - }], - stdout_digest: DigestInfo::try_new( - "426afaf613d8cfdd9fa8addcc030ae6c95a7950ae0301164af1d5851012081d5", - 10, - )?, - stderr_digest: DigestInfo::try_new( - "7b2e400d08b8e334e3172d105be308b506c6036c62a9bde5c509d7808b28b213", - 10, - )?, - exit_code: 0, - output_folders: vec![], - output_file_symlinks: vec![], - output_directory_symlinks: vec![], - server_logs: HashMap::new(), - execution_metadata: ExecutionMetadata { - worker: "WORKER_ID".to_string(), - queued_timestamp: SystemTime::UNIX_EPOCH, - worker_start_timestamp: make_system_time(0), - input_fetch_start_timestamp: make_system_time(1), - input_fetch_completed_timestamp: make_system_time(2), - execution_start_timestamp: make_system_time(3), - execution_completed_timestamp: make_system_time(4), - output_upload_start_timestamp: make_system_time(5), - output_upload_completed_timestamp: make_system_time(6), - worker_completed_timestamp: make_system_time(7), - }, - error: None, - message: String::new(), - }; - running_actions_manager - .cache_action_result(action_digest, &mut action_result, DigestHasherFunc::Sha256) - .await?; - - let 
retrieved_result = - get_and_decode_digest::(ac_store.as_ref(), action_digest.into()).await?; + retrieved_result + ); + Ok(()) + } - let proto_result: ProtoActionResult = action_result.try_into()?; - assert_eq!(proto_result, retrieved_result); + #[nativelink_test] + async fn action_result_has_used_in_message() -> Result<(), Box> { + let (_, _, cas_store, ac_store) = setup_stores().await?; - Ok(()) -} + let running_actions_manager = + Arc::new(RunningActionsManagerImpl::new(RunningActionsManagerArgs { + root_action_directory: String::new(), + execution_configuration: ExecutionConfiguration::default(), + cas_store: cas_store.clone(), + ac_store: Some(Store::new(ac_store.clone())), + historical_store: Store::new(cas_store.clone()), + upload_action_result_config: + &nativelink_config::cas_server::UploadActionResultConfig { + upload_ac_results_strategy: + nativelink_config::cas_server::UploadCacheResultsStrategy::SuccessOnly, + success_message_template: "{action_digest_hash}-{action_digest_size}" + .to_string(), + ..Default::default() + }, + max_action_timeout: Duration::MAX, + max_upload_timeout: Duration::from_secs(DEFAULT_MAX_UPLOAD_TIMEOUT), + timeout_handled_externally: false, + directory_cache: None, + })?); -#[serial] -#[nativelink_test] -async fn failed_action_does_not_cache_in_action_cache() -> Result<(), Box> { - let (_, _, cas_store, ac_store) = setup_stores().await?; - - let running_actions_manager = - Arc::new(RunningActionsManagerImpl::new(RunningActionsManagerArgs { - root_action_directory: String::new(), - execution_configuration: ExecutionConfiguration::default(), - cas_store: cas_store.clone(), - ac_store: Some(Store::new(ac_store.clone())), - historical_store: Store::new(cas_store.clone()), - upload_action_result_config: &nativelink_config::cas_server::UploadActionResultConfig { - upload_ac_results_strategy: - nativelink_config::cas_server::UploadCacheResultsStrategy::Everything, - ..Default::default() - }, - max_action_timeout: Duration::MAX, - 
timeout_handled_externally: false, - })?); - - let action_digest = DigestInfo::new([2u8; 32], 32); - let mut action_result = ActionResult { - output_files: vec![FileInfo { - name_or_path: NameOrPath::Path("test.txt".to_string()), - digest: DigestInfo::try_new( - "a665a45920422f9d417e4867efdc4fb8a04a1f3fff1fa07e998e86f7f7a27ae3", - 3, - )?, - is_executable: false, - }], - stdout_digest: DigestInfo::try_new( - "426afaf613d8cfdd9fa8addcc030ae6c95a7950ae0301164af1d5851012081d5", - 10, - )?, - stderr_digest: DigestInfo::try_new( - "7b2e400d08b8e334e3172d105be308b506c6036c62a9bde5c509d7808b28b213", - 10, - )?, - exit_code: 1, - output_folders: vec![], - output_file_symlinks: vec![], - output_directory_symlinks: vec![], - server_logs: HashMap::new(), - execution_metadata: ExecutionMetadata { - worker: "WORKER_ID".to_string(), - queued_timestamp: SystemTime::UNIX_EPOCH, - worker_start_timestamp: make_system_time(0), - input_fetch_start_timestamp: make_system_time(1), - input_fetch_completed_timestamp: make_system_time(2), - execution_start_timestamp: make_system_time(3), - execution_completed_timestamp: make_system_time(4), - output_upload_start_timestamp: make_system_time(5), - output_upload_completed_timestamp: make_system_time(6), - worker_completed_timestamp: make_system_time(7), - }, - error: None, - message: String::new(), - }; - running_actions_manager - .cache_action_result(action_digest, &mut action_result, DigestHasherFunc::Sha256) - .await?; + let action_digest = DigestInfo::new([2u8; 32], 32); + let mut action_result = ActionResult { + exit_code: 0, + ..Default::default() + }; + running_actions_manager + .cache_action_result(action_digest, &mut action_result, DigestHasherFunc::Sha256) + .await?; - let retrieved_result = - get_and_decode_digest::(ac_store.as_ref(), action_digest.into()).await?; + assert!(!action_result.message.is_empty(), "Message should be set"); - let proto_result: ProtoActionResult = action_result.try_into()?; - assert_eq!(proto_result, 
retrieved_result); + let action_result_digest = { + let (action_result_hash, action_result_size) = action_result + .message + .split_once('-') + .expect("Message should be in format {hash}-{size}"); - Ok(()) -} + DigestInfo::try_new(action_result_hash, action_result_size.parse::()?)? + }; -#[serial] -#[nativelink_test] -async fn success_does_cache_in_historical_results() -> Result<(), Box> { - let (_, _, cas_store, ac_store) = setup_stores().await?; - - let running_actions_manager = - Arc::new(RunningActionsManagerImpl::new(RunningActionsManagerArgs { - root_action_directory: String::new(), - execution_configuration: ExecutionConfiguration::default(), - cas_store: cas_store.clone(), - ac_store: Some(Store::new(ac_store.clone())), - historical_store: Store::new(cas_store.clone()), - upload_action_result_config: &nativelink_config::cas_server::UploadActionResultConfig { - upload_historical_results_strategy: Some( - nativelink_config::cas_server::UploadCacheResultsStrategy::SuccessOnly, - ), - #[expect( - clippy::literal_string_with_formatting_args, - reason = "passed to `formatx` crate for runtime interpretation" - )] - success_message_template: "{historical_results_hash}-{historical_results_size}" - .to_string(), - ..Default::default() - }, - max_action_timeout: Duration::MAX, - timeout_handled_externally: false, - })?); - - let action_digest = DigestInfo::new([2u8; 32], 32); - let mut action_result = ActionResult { - output_files: vec![FileInfo { - name_or_path: NameOrPath::Path("test.txt".to_string()), - digest: DigestInfo::try_new( - "a665a45920422f9d417e4867efdc4fb8a04a1f3fff1fa07e998e86f7f7a27ae3", - 3, - )?, - is_executable: false, - }], - stdout_digest: DigestInfo::try_new( - "426afaf613d8cfdd9fa8addcc030ae6c95a7950ae0301164af1d5851012081d5", - 10, - )?, - stderr_digest: DigestInfo::try_new( - "7b2e400d08b8e334e3172d105be308b506c6036c62a9bde5c509d7808b28b213", - 10, - )?, - exit_code: 0, - output_folders: vec![], - output_file_symlinks: vec![], - 
output_directory_symlinks: vec![], - server_logs: HashMap::new(), - execution_metadata: ExecutionMetadata { - worker: "WORKER_ID".to_string(), - queued_timestamp: SystemTime::UNIX_EPOCH, - worker_start_timestamp: make_system_time(0), - input_fetch_start_timestamp: make_system_time(1), - input_fetch_completed_timestamp: make_system_time(2), - execution_start_timestamp: make_system_time(3), - execution_completed_timestamp: make_system_time(4), - output_upload_start_timestamp: make_system_time(5), - output_upload_completed_timestamp: make_system_time(6), - worker_completed_timestamp: make_system_time(7), - }, - error: None, - message: String::new(), - }; - running_actions_manager - .cache_action_result(action_digest, &mut action_result, DigestHasherFunc::Sha256) + let retrieved_result = get_and_decode_digest::( + ac_store.as_ref(), + action_result_digest.into(), + ) .await?; - assert!(!action_result.message.is_empty(), "Message should be set"); + let proto_result: ProtoActionResult = action_result.try_into()?; + assert_eq!(proto_result, retrieved_result); + Ok(()) + } - let historical_digest = { - let (historical_results_hash, historical_results_size) = action_result - .message - .split_once('-') - .expect("Message should be in format {hash}-{size}"); + #[nativelink_test] + async fn ensure_worker_timeout_chooses_correct_values() + -> Result<(), Box> { + const WORKER_ID: &str = "foo_worker_id"; - DigestInfo::try_new( - historical_results_hash, - historical_results_size.parse::()?, - )? 
- }; - let retrieved_result = get_and_decode_digest::( - cas_store.as_ref(), - historical_digest.into(), - ) - .await?; - - assert_eq!( - HistoricalExecuteResponse { - action_digest: Some(action_digest.into()), - execute_response: Some(ExecuteResponse { - result: Some(action_result.try_into()?), - status: Some(Status::default()), - ..Default::default() - }), - }, - retrieved_result - ); + fn test_monotonic_clock() -> SystemTime { + static CLOCK: AtomicU64 = AtomicU64::new(0); + monotonic_clock(&CLOCK) + } - Ok(()) -} + let root_action_directory = make_temp_path("root_action_directory"); + fs::create_dir_all(&root_action_directory).await?; -#[serial] -#[nativelink_test] -async fn failure_does_not_cache_in_historical_results() -> Result<(), Box> { - let (_, _, cas_store, ac_store) = setup_stores().await?; - - let running_actions_manager = - Arc::new(RunningActionsManagerImpl::new(RunningActionsManagerArgs { - root_action_directory: String::new(), - execution_configuration: ExecutionConfiguration::default(), - cas_store: cas_store.clone(), - ac_store: Some(Store::new(ac_store.clone())), - historical_store: Store::new(cas_store.clone()), - upload_action_result_config: &nativelink_config::cas_server::UploadActionResultConfig { - upload_historical_results_strategy: Some( - nativelink_config::cas_server::UploadCacheResultsStrategy::SuccessOnly, - ), - success_message_template: "{historical_results_hash}-{historical_results_size}" - .to_string(), - ..Default::default() - }, - max_action_timeout: Duration::MAX, - timeout_handled_externally: false, - })?); - - let action_digest = DigestInfo::new([2u8; 32], 32); - let mut action_result = ActionResult { - exit_code: 1, - ..Default::default() - }; - running_actions_manager - .cache_action_result(action_digest, &mut action_result, DigestHasherFunc::Sha256) - .await?; + let (_, _, cas_store, ac_store) = setup_stores().await?; - assert!( - action_result.message.is_empty(), - "Message should not be set" - ); - Ok(()) -} + 
#[cfg(target_family = "unix")] + let arguments = vec!["true".to_string()]; + #[cfg(target_family = "windows")] + let arguments = vec![ + "cmd".to_string(), + "/C".to_string(), + "exit".to_string(), + "0".to_string(), + ]; -#[serial] -#[nativelink_test] -async fn infra_failure_does_cache_in_historical_results() -> Result<(), Box> -{ - let (_, _, cas_store, ac_store) = setup_stores().await?; - - let running_actions_manager = - Arc::new(RunningActionsManagerImpl::new(RunningActionsManagerArgs { - root_action_directory: String::new(), - execution_configuration: ExecutionConfiguration::default(), - cas_store: cas_store.clone(), - ac_store: Some(Store::new(ac_store.clone())), - historical_store: Store::new(cas_store.clone()), - upload_action_result_config: &nativelink_config::cas_server::UploadActionResultConfig { - upload_historical_results_strategy: Some( - nativelink_config::cas_server::UploadCacheResultsStrategy::FailuresOnly, - ), - #[expect( - clippy::literal_string_with_formatting_args, - reason = "passed to `formatx` crate for runtime interpretation" - )] - failure_message_template: "{historical_results_hash}-{historical_results_size}" - .to_string(), - ..Default::default() - }, - max_action_timeout: Duration::MAX, - timeout_handled_externally: false, - })?); - - let action_digest = DigestInfo::new([2u8; 32], 32); - let mut action_result = ActionResult { - exit_code: 0, - error: Some(make_input_err!("test error")), - ..Default::default() - }; - running_actions_manager - .cache_action_result(action_digest, &mut action_result, DigestHasherFunc::Sha256) + let command = Command { + arguments, + output_paths: vec![], + working_directory: ".".to_string(), + environment_variables: vec![EnvironmentVariable { + name: "PATH".to_string(), + value: env::var("PATH").unwrap(), + }], + ..Default::default() + }; + let command_digest = serialize_and_upload_message( + &command, + cas_store.as_pin(), + &mut DigestHasherFunc::Sha256.hasher(), + ) + .await?; + let input_root_digest = 
serialize_and_upload_message( + &Directory::default(), + cas_store.as_pin(), + &mut DigestHasherFunc::Sha256.hasher(), + ) .await?; - assert!(!action_result.message.is_empty(), "Message should be set"); - - let historical_digest = { - let (historical_results_hash, historical_results_size) = action_result - .message - .split_once('-') - .expect("Message should be in format {hash}-{size}"); - - DigestInfo::try_new( - historical_results_hash, - historical_results_size.parse::()?, - )? - }; + { + // Test to ensure that the task timeout is chosen if it is less than the max timeout. + static SENT_TIMEOUT: AtomicI64 = AtomicI64::new(-1); + const MAX_TIMEOUT_DURATION: Duration = Duration::from_secs(100); + const TASK_TIMEOUT: Duration = Duration::from_secs(10); + + let action = Action { + command_digest: Some(command_digest.into()), + input_root_digest: Some(input_root_digest.into()), + timeout: Some(prost_types::Duration { + seconds: TASK_TIMEOUT.as_secs() as i64, + nanos: 0, + }), + ..Default::default() + }; + let action_digest = serialize_and_upload_message( + &action, + cas_store.as_pin(), + &mut DigestHasherFunc::Sha256.hasher(), + ) + .await?; - let retrieved_result = get_and_decode_digest::( - cas_store.as_ref(), - historical_digest.into(), - ) - .await?; + let running_actions_manager = Arc::new(RunningActionsManagerImpl::new_with_callbacks( + RunningActionsManagerArgs { + root_action_directory: root_action_directory.clone(), + execution_configuration: ExecutionConfiguration::default(), + cas_store: cas_store.clone(), + ac_store: Some(Store::new(ac_store.clone())), + historical_store: Store::new(cas_store.clone()), + upload_action_result_config: + &nativelink_config::cas_server::UploadActionResultConfig { + upload_ac_results_strategy: + nativelink_config::cas_server::UploadCacheResultsStrategy::Never, + ..Default::default() + }, + max_action_timeout: MAX_TIMEOUT_DURATION, + max_upload_timeout: Duration::from_secs(DEFAULT_MAX_UPLOAD_TIMEOUT), + 
timeout_handled_externally: false, + directory_cache: None, + }, + Callbacks { + now_fn: test_monotonic_clock, + sleep_fn: |duration| { + SENT_TIMEOUT.store( + i64::try_from(duration.as_millis()).unwrap_or(i64::MAX), + Ordering::Relaxed, + ); + Box::pin(future::pending()) + }, + }, + )?); - assert_eq!( - HistoricalExecuteResponse { - action_digest: Some(action_digest.into()), - execute_response: Some(ExecuteResponse { - result: Some(action_result.try_into()?), - status: Some(make_input_err!("test error").into()), + let execute_request = ExecuteRequest { + action_digest: Some(action_digest.into()), ..Default::default() - }), - }, - retrieved_result - ); - Ok(()) -} - -#[serial] -#[nativelink_test] -async fn action_result_has_used_in_message() -> Result<(), Box> { - let (_, _, cas_store, ac_store) = setup_stores().await?; - - let running_actions_manager = - Arc::new(RunningActionsManagerImpl::new(RunningActionsManagerArgs { - root_action_directory: String::new(), - execution_configuration: ExecutionConfiguration::default(), - cas_store: cas_store.clone(), - ac_store: Some(Store::new(ac_store.clone())), - historical_store: Store::new(cas_store.clone()), - upload_action_result_config: &nativelink_config::cas_server::UploadActionResultConfig { - upload_ac_results_strategy: - nativelink_config::cas_server::UploadCacheResultsStrategy::SuccessOnly, - success_message_template: "{action_digest_hash}-{action_digest_size}".to_string(), + }; + let operation_id = OperationId::default().to_string(); + + running_actions_manager + .create_and_add_action( + WORKER_ID.to_string(), + StartExecute { + execute_request: Some(execute_request), + operation_id, + queued_timestamp: Some(make_system_time(1000).into()), + platform: action.platform.clone(), + worker_id: WORKER_ID.to_string(), + }, + ) + .and_then(|action| { + action + .clone() + .prepare_action() + .and_then(RunningAction::execute) + .then(|result| async move { + if let Err(e) = action.cleanup().await { + return 
Result::::Err(e).merge(result); + } + result + }) + }) + .await?; + assert_eq!( + SENT_TIMEOUT.load(Ordering::Relaxed), + i64::try_from(TASK_TIMEOUT.as_millis()) + .expect("TASK_TIMEOUT.as_millis() exceeds i64::MAX") + ); + } + { + // Ensure if no timeout is set use max timeout. + static SENT_TIMEOUT: AtomicI64 = AtomicI64::new(-1); + const MAX_TIMEOUT_DURATION: Duration = Duration::from_secs(100); + const TASK_TIMEOUT: Duration = Duration::from_secs(0); + + let action = Action { + command_digest: Some(command_digest.into()), + input_root_digest: Some(input_root_digest.into()), + timeout: Some(prost_types::Duration { + seconds: TASK_TIMEOUT.as_secs() as i64, + nanos: 0, + }), ..Default::default() - }, - max_action_timeout: Duration::MAX, - timeout_handled_externally: false, - })?); - - let action_digest = DigestInfo::new([2u8; 32], 32); - let mut action_result = ActionResult { - exit_code: 0, - ..Default::default() - }; - running_actions_manager - .cache_action_result(action_digest, &mut action_result, DigestHasherFunc::Sha256) - .await?; - - assert!(!action_result.message.is_empty(), "Message should be set"); - - let action_result_digest = { - let (action_result_hash, action_result_size) = action_result - .message - .split_once('-') - .expect("Message should be in format {hash}-{size}"); + }; + let action_digest = serialize_and_upload_message( + &action, + cas_store.as_pin(), + &mut DigestHasherFunc::Sha256.hasher(), + ) + .await?; - DigestInfo::try_new(action_result_hash, action_result_size.parse::()?)? 
- }; + let running_actions_manager = Arc::new(RunningActionsManagerImpl::new_with_callbacks( + RunningActionsManagerArgs { + root_action_directory: root_action_directory.clone(), + execution_configuration: ExecutionConfiguration::default(), + cas_store: cas_store.clone(), + ac_store: Some(Store::new(ac_store.clone())), + historical_store: Store::new(cas_store.clone()), + upload_action_result_config: + &nativelink_config::cas_server::UploadActionResultConfig { + upload_ac_results_strategy: + nativelink_config::cas_server::UploadCacheResultsStrategy::Never, + ..Default::default() + }, + max_action_timeout: MAX_TIMEOUT_DURATION, + max_upload_timeout: Duration::from_secs(DEFAULT_MAX_UPLOAD_TIMEOUT), + timeout_handled_externally: false, + directory_cache: None, + }, + Callbacks { + now_fn: test_monotonic_clock, + sleep_fn: |duration| { + SENT_TIMEOUT.store( + i64::try_from(duration.as_millis()).unwrap_or(i64::MAX), + Ordering::Relaxed, + ); + Box::pin(future::pending()) + }, + }, + )?); - let retrieved_result = - get_and_decode_digest::(ac_store.as_ref(), action_result_digest.into()) + let execute_request = ExecuteRequest { + action_digest: Some(action_digest.into()), + ..Default::default() + }; + let operation_id = OperationId::default().to_string(); + + running_actions_manager + .create_and_add_action( + WORKER_ID.to_string(), + StartExecute { + execute_request: Some(execute_request), + operation_id, + queued_timestamp: Some(make_system_time(1000).into()), + platform: action.platform.clone(), + worker_id: WORKER_ID.to_string(), + }, + ) + .and_then(|action| { + action + .clone() + .prepare_action() + .and_then(RunningAction::execute) + .then(|result| async move { + if let Err(e) = action.cleanup().await { + return Result::::Err(e).merge(result); + } + result + }) + }) + .await?; + assert_eq!( + SENT_TIMEOUT.load(Ordering::Relaxed), + i64::try_from(MAX_TIMEOUT_DURATION.as_millis()) + .expect("MAX_TIMEOUT_DURATION.as_millis() exceeds i64::MAX") + ); + } + { + // Ensure 
we reject tasks that have a timeout set too high. + static SENT_TIMEOUT: AtomicI64 = AtomicI64::new(-1); + const MAX_TIMEOUT_DURATION: Duration = Duration::from_secs(100); + const TASK_TIMEOUT: Duration = Duration::from_secs(200); + + let action = Action { + command_digest: Some(command_digest.into()), + input_root_digest: Some(input_root_digest.into()), + timeout: Some(prost_types::Duration { + seconds: TASK_TIMEOUT.as_secs() as i64, + nanos: 0, + }), + ..Default::default() + }; + let action_digest = serialize_and_upload_message( + &action, + cas_store.as_pin(), + &mut DigestHasherFunc::Sha256.hasher(), + ) .await?; - let proto_result: ProtoActionResult = action_result.try_into()?; - assert_eq!(proto_result, retrieved_result); - Ok(()) -} - -#[serial] -#[nativelink_test] -async fn ensure_worker_timeout_chooses_correct_values() -> Result<(), Box> { - const WORKER_ID: &str = "foo_worker_id"; + let running_actions_manager = Arc::new(RunningActionsManagerImpl::new_with_callbacks( + RunningActionsManagerArgs { + root_action_directory: root_action_directory.clone(), + execution_configuration: ExecutionConfiguration::default(), + cas_store: cas_store.clone(), + ac_store: Some(Store::new(ac_store.clone())), + historical_store: Store::new(cas_store.clone()), + upload_action_result_config: + &nativelink_config::cas_server::UploadActionResultConfig { + upload_ac_results_strategy: + nativelink_config::cas_server::UploadCacheResultsStrategy::Never, + ..Default::default() + }, + max_action_timeout: MAX_TIMEOUT_DURATION, + max_upload_timeout: Duration::from_secs(DEFAULT_MAX_UPLOAD_TIMEOUT), + timeout_handled_externally: false, + directory_cache: None, + }, + Callbacks { + now_fn: test_monotonic_clock, + sleep_fn: |duration| { + SENT_TIMEOUT.store( + i64::try_from(duration.as_millis()).unwrap_or(i64::MAX), + Ordering::Relaxed, + ); + Box::pin(future::pending()) + }, + }, + )?); - fn test_monotonic_clock() -> SystemTime { - static CLOCK: AtomicU64 = AtomicU64::new(0); - 
monotonic_clock(&CLOCK) + let execute_request = ExecuteRequest { + action_digest: Some(action_digest.into()), + ..Default::default() + }; + let operation_id = OperationId::default().to_string(); + + let result = running_actions_manager + .create_and_add_action( + WORKER_ID.to_string(), + StartExecute { + execute_request: Some(execute_request), + operation_id, + queued_timestamp: Some(make_system_time(1000).into()), + platform: action.platform.clone(), + worker_id: WORKER_ID.to_string(), + }, + ) + .and_then(|action| { + action + .clone() + .prepare_action() + .and_then(RunningAction::execute) + .then(|result| async move { + if let Err(e) = action.cleanup().await { + return Result::::Err(e).merge(result); + } + result + }) + }) + .await; + assert_eq!(SENT_TIMEOUT.load(Ordering::Relaxed), -1); + assert_eq!(result.err().unwrap().code, Code::InvalidArgument); + } + Ok(()) } - let root_action_directory = make_temp_path("root_action_directory"); - fs::create_dir_all(&root_action_directory).await?; + #[nativelink_test] + async fn worker_times_out() -> Result<(), Box> { + const WORKER_ID: &str = "foo_worker_id"; - let (_, _, cas_store, ac_store) = setup_stores().await?; - - #[cfg(target_family = "unix")] - let arguments = vec!["true".to_string()]; - #[cfg(target_family = "windows")] - let arguments = vec![ - "cmd".to_string(), - "/C".to_string(), - "exit".to_string(), - "0".to_string(), - ]; - - let command = Command { - arguments, - output_paths: vec![], - working_directory: ".".to_string(), - environment_variables: vec![EnvironmentVariable { - name: "PATH".to_string(), - value: env::var("PATH").unwrap(), - }], - ..Default::default() - }; - let command_digest = serialize_and_upload_message( - &command, - cas_store.as_pin(), - &mut DigestHasherFunc::Sha256.hasher(), - ) - .await?; - let input_root_digest = serialize_and_upload_message( - &Directory::default(), - cas_store.as_pin(), - &mut DigestHasherFunc::Sha256.hasher(), - ) - .await?; - - { - // Test to ensure that the 
task timeout is chosen if it is less than the max timeout. - static SENT_TIMEOUT: AtomicI64 = AtomicI64::new(-1); - const MAX_TIMEOUT_DURATION: Duration = Duration::from_secs(100); - const TASK_TIMEOUT: Duration = Duration::from_secs(10); + fn test_monotonic_clock() -> SystemTime { + static CLOCK: AtomicU64 = AtomicU64::new(0); + monotonic_clock(&CLOCK) + } - let action = Action { - command_digest: Some(command_digest.into()), - input_root_digest: Some(input_root_digest.into()), - timeout: Some(prost_types::Duration { - seconds: TASK_TIMEOUT.as_secs() as i64, - nanos: 0, - }), - ..Default::default() - }; - let action_digest = serialize_and_upload_message( - &action, - cas_store.as_pin(), - &mut DigestHasherFunc::Sha256.hasher(), - ) - .await?; + type StaticOneshotTuple = + Mutex<(Option>, Option>)>; + static TIMEOUT_ONESHOT: LazyLock = LazyLock::new(|| { + let (tx, rx) = oneshot::channel(); + Mutex::new((Some(tx), Some(rx))) + }); + let root_action_directory = make_temp_path("root_action_directory"); + fs::create_dir_all(&root_action_directory).await?; + let (_, _, cas_store, ac_store) = setup_stores().await?; let running_actions_manager = Arc::new(RunningActionsManagerImpl::new_with_callbacks( RunningActionsManagerArgs { root_action_directory: root_action_directory.clone(), @@ -2495,25 +2811,78 @@ async fn ensure_worker_timeout_chooses_correct_values() -> Result<(), Box Result<(), Box::Err(e).merge(result); } result }) - }) - .await?; - assert_eq!( - SENT_TIMEOUT.load(Ordering::Relaxed), - TASK_TIMEOUT.as_millis() as i64 - ); + }); + + let (results, ()) = tokio::join!(execute_results_fut, async move { + tokio::task::yield_now().await; + let tx = TIMEOUT_ONESHOT.lock().unwrap().0.take().unwrap(); + tx.send(()).expect("Could not send timeout signal"); + }); + assert_eq!(results?.error.unwrap().code, Code::DeadlineExceeded); + + #[cfg(target_family = "unix")] + let command = "[\"sh\", \"-c\", \"sleep infinity\"]"; + #[cfg(target_family = "windows")] + let command = 
"[\"cmd\", \"/C\", \"ping -n 99999 127.0.0.1\"]"; + + assert!(logs_contain(&format!("Executing command args={command}"))); + assert!(logs_contain(&format!("Command complete args={command}"))); + + assert!(!logs_contain( + "Child process was not cleaned up before dropping the call to execute(), killing in background spawn" + )); + #[cfg(target_family = "unix")] + assert!(logs_contain( + "Command timed out seconds=0.0 command=sh -c sleep infinity" + )); + #[cfg(target_family = "windows")] + assert!(logs_contain( + "Command timed out seconds=0.0 command=cmd /C ping -n 99999 127.0.0.1" + )); + + Ok(()) } - { - // Ensure if no timeout is set use max timeout. - static SENT_TIMEOUT: AtomicI64 = AtomicI64::new(-1); - const MAX_TIMEOUT_DURATION: Duration = Duration::from_secs(100); - const TASK_TIMEOUT: Duration = Duration::from_secs(0); + #[nativelink_test] + async fn kill_all_waits_for_all_tasks_to_finish() -> Result<(), Box> { + const WORKER_ID: &str = "foo_worker_id"; + + fn test_monotonic_clock() -> SystemTime { + static CLOCK: AtomicU64 = AtomicU64::new(0); + monotonic_clock(&CLOCK) + } + + let root_action_directory = make_temp_path("root_action_directory"); + fs::create_dir_all(&root_action_directory).await?; + + let (_, _, cas_store, ac_store) = setup_stores().await?; + let running_actions_manager = Arc::new(RunningActionsManagerImpl::new_with_callbacks( + RunningActionsManagerArgs { + root_action_directory: root_action_directory.clone(), + execution_configuration: ExecutionConfiguration::default(), + cas_store: cas_store.clone(), + ac_store: Some(Store::new(ac_store.clone())), + historical_store: Store::new(cas_store.clone()), + upload_action_result_config: + &nativelink_config::cas_server::UploadActionResultConfig { + upload_ac_results_strategy: + nativelink_config::cas_server::UploadCacheResultsStrategy::Never, + ..Default::default() + }, + max_action_timeout: Duration::MAX, + max_upload_timeout: Duration::from_secs(DEFAULT_MAX_UPLOAD_TIMEOUT), + 
timeout_handled_externally: false, + directory_cache: None, + }, + Callbacks { + now_fn: test_monotonic_clock, + sleep_fn: |_duration| Box::pin(future::pending()), + }, + )?); + + #[cfg(target_family = "unix")] + let arguments = vec![ + "sh".to_string(), + "-c".to_string(), + "sleep infinity".to_string(), + ]; + #[cfg(target_family = "windows")] + // Windows is weird with timeout, so we use ping. See: + // https://www.ibm.com/support/pages/timeout-command-run-batch-job-exits-immediately-and-returns-error-input-redirection-not-supported-exiting-process-immediately + let arguments = vec![ + "cmd".to_string(), + "/C".to_string(), + "ping -n 99999 127.0.0.1".to_string(), + ]; + + let command = Command { + arguments, + output_paths: vec![], + working_directory: ".".to_string(), + environment_variables: vec![EnvironmentVariable { + name: "PATH".to_string(), + value: env::var("PATH").unwrap(), + }], + ..Default::default() + }; + let command_digest = serialize_and_upload_message( + &command, + cas_store.as_pin(), + &mut DigestHasherFunc::Sha256.hasher(), + ) + .await?; + let input_root_digest = serialize_and_upload_message( + &Directory::default(), + cas_store.as_pin(), + &mut DigestHasherFunc::Sha256.hasher(), + ) + .await?; let action = Action { command_digest: Some(command_digest.into()), input_root_digest: Some(input_root_digest.into()), - timeout: Some(prost_types::Duration { - seconds: TASK_TIMEOUT.as_secs() as i64, - nanos: 0, - }), ..Default::default() }; let action_digest = serialize_and_upload_message( @@ -2564,38 +3024,15 @@ async fn ensure_worker_timeout_chooses_correct_values() -> Result<(), Box Result<(), Box::Err(e).merge(result); - } - result - }) - }) .await?; - assert_eq!( - SENT_TIMEOUT.load(Ordering::Relaxed), - MAX_TIMEOUT_DURATION.as_millis() as i64 - ); + let execute_results_fut = action + .clone() + .prepare_action() + .and_then(RunningAction::execute) + .and_then(RunningAction::upload_results) + .and_then(RunningAction::get_finished_result) + 
.then(|result| async { + cleanup_was_requested.store(true, Ordering::Release); + cleanup_rx.await.expect("Could not receive cleanup signal"); + if let Err(e) = action.cleanup().await { + return Result::::Err(e).merge(result); + } + result + }); + + tokio::pin!(execute_results_fut); + { + // Advance the action as far as possible and ensure we are not waiting on cleanup. + for _ in 0..100 { + assert!(futures::poll!(&mut execute_results_fut).is_pending()); + tokio::task::yield_now().await; + } + assert_eq!(cleanup_was_requested.load(Ordering::Acquire), false); + } + + let kill_all_fut = running_actions_manager.kill_all(); + tokio::pin!(kill_all_fut); + + { + // * Advance the action as far as possible. + // * Ensure we are now waiting on cleanup. + // * Ensure our kill_action is still pending. + while !cleanup_was_requested.load(Ordering::Acquire) { + // Wait for cleanup to be triggered. + tokio::task::yield_now().await; + assert!(futures::poll!(&mut execute_results_fut).is_pending()); + assert!(futures::poll!(&mut kill_all_fut).is_pending()); + } + } + // Allow cleanup, which allows execute_results_fut to advance. + cleanup_tx.send(()).expect("Could not send cleanup signal"); + // Advance our two futures to completion now. + let result = execute_results_fut.await; + kill_all_fut.await; + { + // Ensure our results are correct. + let action_result = result?; + let err = action_result + .error + .as_ref() + .err_tip(|| format!("No error exists in result : {action_result:?}"))?; + assert_eq!( + err.code, + Code::Aborted, + "Expected Aborted : {action_result:?}" + ); + } + + Ok(()) } - { - // Ensure we reject tasks that have a timeout set too high. 
- static SENT_TIMEOUT: AtomicI64 = AtomicI64::new(-1); - const MAX_TIMEOUT_DURATION: Duration = Duration::from_secs(100); - const TASK_TIMEOUT: Duration = Duration::from_secs(200); - let action = Action { - command_digest: Some(command_digest.into()), - input_root_digest: Some(input_root_digest.into()), - timeout: Some(prost_types::Duration { - seconds: TASK_TIMEOUT.as_secs() as i64, - nanos: 0, - }), - ..Default::default() - }; - let action_digest = serialize_and_upload_message( - &action, - cas_store.as_pin(), - &mut DigestHasherFunc::Sha256.hasher(), - ) - .await?; + /// Regression Test for Issue #675 + #[cfg(target_family = "unix")] + #[nativelink_test] + async fn unix_executable_file_test() -> Result<(), Box> { + const WORKER_ID: &str = "foo_worker_id"; + const FILE_1_NAME: &str = "file1"; + + fn test_monotonic_clock() -> SystemTime { + static CLOCK: AtomicU64 = AtomicU64::new(0); + monotonic_clock(&CLOCK) + } + + let (_, _, cas_store, ac_store) = setup_stores().await?; + let root_action_directory = make_temp_path("root_action_directory"); + fs::create_dir_all(&root_action_directory).await?; let running_actions_manager = Arc::new(RunningActionsManagerImpl::new_with_callbacks( RunningActionsManagerArgs { - root_action_directory: root_action_directory.clone(), - execution_configuration: ExecutionConfiguration::default(), + root_action_directory, cas_store: cas_store.clone(), ac_store: Some(Store::new(ac_store.clone())), + execution_configuration: ExecutionConfiguration::default(), historical_store: Store::new(cas_store.clone()), upload_action_result_config: &nativelink_config::cas_server::UploadActionResultConfig { @@ -2659,17 +3134,152 @@ async fn ensure_worker_timeout_chooses_correct_values() -> Result<(), Box Result<(), Box> { + const WORKER_ID: &str = "foo_worker_id"; + + let (_, _, cas_store, ac_store) = setup_stores().await?; + let root_action_directory = make_temp_path("root_action_directory"); + fs::create_dir_all(&root_action_directory).await?; + let 
temp_action_directory = make_temp_path("root_action_directory/temp"); + fs::create_dir_all(&temp_action_directory).await?; + + let running_actions_manager = + Arc::new(RunningActionsManagerImpl::new(RunningActionsManagerArgs { + root_action_directory: root_action_directory.clone(), + execution_configuration: ExecutionConfiguration::default(), + cas_store: cas_store.clone(), + ac_store: Some(Store::new(ac_store.clone())), + historical_store: Store::new(cas_store.clone()), + upload_action_result_config: + &nativelink_config::cas_server::UploadActionResultConfig { + upload_ac_results_strategy: + nativelink_config::cas_server::UploadCacheResultsStrategy::Never, + ..Default::default() + }, + max_action_timeout: Duration::MAX, + max_upload_timeout: Duration::from_secs(DEFAULT_MAX_UPLOAD_TIMEOUT), + timeout_handled_externally: false, + directory_cache: None, + })?); + let queued_timestamp = make_system_time(1000); + + #[cfg(target_family = "unix")] + let arguments = vec!["sh".to_string(), "-c".to_string(), "exit 0".to_string()]; + #[cfg(target_family = "windows")] + let arguments = vec!["cmd".to_string(), "/C".to_string(), "exit 0".to_string()]; + + let command = Command { + arguments, + output_paths: vec![], + working_directory: ".".to_string(), + environment_variables: vec![EnvironmentVariable { + name: "PATH".to_string(), + value: env::var("PATH").unwrap(), + }], + ..Default::default() + }; + let command_digest = serialize_and_upload_message( + &command, + cas_store.as_pin(), + &mut DigestHasherFunc::Sha256.hasher(), + ) + .await?; + let input_root_digest = serialize_and_upload_message( + &Directory::default(), + cas_store.as_pin(), + &mut DigestHasherFunc::Sha256.hasher(), + ) + .await?; + let action = Action { + command_digest: Some(command_digest.into()), + input_root_digest: Some(input_root_digest.into()), + ..Default::default() + }; + let action_digest = serialize_and_upload_message( + &action, + cas_store.as_pin(), + &mut DigestHasherFunc::Sha256.hasher(), + ) + 
.await?; let execute_request = ExecuteRequest { action_digest: Some(action_digest.into()), @@ -2677,384 +3287,276 @@ async fn ensure_worker_timeout_chooses_correct_values() -> Result<(), Box::Err(e).merge(result); - } - result - }) - }) - .await; - assert_eq!(SENT_TIMEOUT.load(Ordering::Relaxed), -1); - assert_eq!(result.err().unwrap().code, Code::InvalidArgument); - } - Ok(()) -} + .await?; -#[serial] -#[nativelink_test] -async fn worker_times_out() -> Result<(), Box> { - const WORKER_ID: &str = "foo_worker_id"; + run_action(running_action_impl.clone()).await?; - fn test_monotonic_clock() -> SystemTime { - static CLOCK: AtomicU64 = AtomicU64::new(0); - monotonic_clock(&CLOCK) + let mut dir_stream = fs::read_dir(&root_action_directory).await?; + assert!( + dir_stream.as_mut().next_entry().await?.is_none(), + "Expected empty directory at {root_action_directory}" + ); + Ok(()) } - type StaticOneshotTuple = Mutex<(Option>, Option>)>; - static TIMEOUT_ONESHOT: LazyLock = LazyLock::new(|| { - let (tx, rx) = oneshot::channel(); - Mutex::new((Some(tx), Some(rx))) - }); - let root_action_directory = make_temp_path("root_action_directory"); - fs::create_dir_all(&root_action_directory).await?; - - let (_, _, cas_store, ac_store) = setup_stores().await?; - let running_actions_manager = Arc::new(RunningActionsManagerImpl::new_with_callbacks( - RunningActionsManagerArgs { - root_action_directory: root_action_directory.clone(), - execution_configuration: ExecutionConfiguration::default(), - cas_store: cas_store.clone(), - ac_store: Some(Store::new(ac_store.clone())), - historical_store: Store::new(cas_store.clone()), - upload_action_result_config: &nativelink_config::cas_server::UploadActionResultConfig { - upload_ac_results_strategy: - nativelink_config::cas_server::UploadCacheResultsStrategy::Never, - ..Default::default() - }, - max_action_timeout: Duration::MAX, - timeout_handled_externally: false, - }, - Callbacks { - now_fn: test_monotonic_clock, - sleep_fn: |_duration| { - 
Box::pin(async move { - let rx = TIMEOUT_ONESHOT.lock().unwrap().1.take().unwrap(); - rx.await.expect("Could not receive timeout signal"); - }) - }, - }, - )?); + // We've experienced deadlocks when uploading, so make only a single permit available and + // check it's able to handle uploading some directories with some files in. - #[cfg(target_family = "unix")] - let arguments = vec![ - "sh".to_string(), - "-c".to_string(), - "sleep infinity".to_string(), - ]; - #[cfg(target_family = "windows")] - // Windows is weird with timeout, so we use ping. See: - // https://www.ibm.com/support/pages/timeout-command-run-batch-job-exits-immediately-and-returns-error-input-redirection-not-supported-exiting-process-immediately - let arguments = vec![ - "cmd".to_string(), - "/C".to_string(), - "ping -n 99999 127.0.0.1".to_string(), - ]; - - let command = Command { - arguments, - output_paths: vec![], - working_directory: ".".to_string(), - environment_variables: vec![EnvironmentVariable { - name: "PATH".to_string(), - value: env::var("PATH").unwrap(), - }], - ..Default::default() - }; - let command_digest = serialize_and_upload_message( - &command, - cas_store.as_pin(), - &mut DigestHasherFunc::Sha256.hasher(), - ) - .await?; - let input_root_digest = serialize_and_upload_message( - &Directory::default(), - cas_store.as_pin(), - &mut DigestHasherFunc::Sha256.hasher(), - ) - .await?; - let action = Action { - command_digest: Some(command_digest.into()), - input_root_digest: Some(input_root_digest.into()), - ..Default::default() - }; - let action_digest = serialize_and_upload_message( - &action, - cas_store.as_pin(), - &mut DigestHasherFunc::Sha256.hasher(), - ) - .await?; - - let execute_request = ExecuteRequest { - action_digest: Some(action_digest.into()), - ..Default::default() - }; - let operation_id = OperationId::default().to_string(); - - let execute_results_fut = running_actions_manager - .create_and_add_action( - WORKER_ID.to_string(), - StartExecute { - execute_request: 
Some(execute_request), - operation_id, - queued_timestamp: Some(make_system_time(1000).into()), - platform: action.platform.clone(), - worker_id: WORKER_ID.to_string(), - }, - ) - .and_then(|action| { - action - .clone() - .prepare_action() - .and_then(RunningAction::execute) - .and_then(RunningAction::upload_results) - .and_then(RunningAction::get_finished_result) - .then(|result| async move { - if let Err(e) = action.cleanup().await { - return Result::::Err(e).merge(result); - } - result - }) - }); + // TODO(palfrey) This is unix only only because I was lazy and didn't spend the time to + // build the bash-like commands in windows as well. - let (results, ()) = tokio::join!(execute_results_fut, async move { - tokio::task::yield_now().await; - let tx = TIMEOUT_ONESHOT.lock().unwrap().0.take().unwrap(); - tx.send(()).expect("Could not send timeout signal"); - }); - assert_eq!(results?.error.unwrap().code, Code::DeadlineExceeded); + #[nativelink_test] + #[cfg(target_family = "unix")] + async fn upload_with_single_permit() -> Result<(), Box> { + const WORKER_ID: &str = "foo_worker_id"; - Ok(()) -} + fn test_monotonic_clock() -> SystemTime { + static CLOCK: AtomicU64 = AtomicU64::new(0); + monotonic_clock(&CLOCK) + } -#[serial] -#[nativelink_test] -async fn kill_all_waits_for_all_tasks_to_finish() -> Result<(), Box> { - const WORKER_ID: &str = "foo_worker_id"; + let (_, slow_store, cas_store, ac_store) = setup_stores().await?; + let root_action_directory = make_temp_path("root_action_directory"); + fs::create_dir_all(&root_action_directory).await?; - fn test_monotonic_clock() -> SystemTime { - static CLOCK: AtomicU64 = AtomicU64::new(0); - monotonic_clock(&CLOCK) - } + // Take all but one FD permit away. 
+ let _permits = stream::iter(1..fs::OPEN_FILE_SEMAPHORE.available_permits()) + .then(|_| fs::OPEN_FILE_SEMAPHORE.acquire()) + .try_collect::>() + .await?; + assert_eq!(1, fs::OPEN_FILE_SEMAPHORE.available_permits()); - let root_action_directory = make_temp_path("root_action_directory"); - fs::create_dir_all(&root_action_directory).await?; - - let (_, _, cas_store, ac_store) = setup_stores().await?; - let running_actions_manager = Arc::new(RunningActionsManagerImpl::new_with_callbacks( - RunningActionsManagerArgs { - root_action_directory: root_action_directory.clone(), - execution_configuration: ExecutionConfiguration::default(), - cas_store: cas_store.clone(), - ac_store: Some(Store::new(ac_store.clone())), - historical_store: Store::new(cas_store.clone()), - upload_action_result_config: &nativelink_config::cas_server::UploadActionResultConfig { - upload_ac_results_strategy: - nativelink_config::cas_server::UploadCacheResultsStrategy::Never, - ..Default::default() + let running_actions_manager = Arc::new(RunningActionsManagerImpl::new_with_callbacks( + RunningActionsManagerArgs { + root_action_directory, + execution_configuration: ExecutionConfiguration::default(), + cas_store: cas_store.clone(), + ac_store: Some(Store::new(ac_store.clone())), + historical_store: Store::new(cas_store.clone()), + upload_action_result_config: + &nativelink_config::cas_server::UploadActionResultConfig { + upload_ac_results_strategy: + nativelink_config::cas_server::UploadCacheResultsStrategy::Never, + ..Default::default() + }, + max_action_timeout: Duration::MAX, + max_upload_timeout: Duration::from_secs(DEFAULT_MAX_UPLOAD_TIMEOUT), + timeout_handled_externally: false, + directory_cache: None, }, - max_action_timeout: Duration::MAX, - timeout_handled_externally: false, - }, - Callbacks { - now_fn: test_monotonic_clock, - sleep_fn: |_duration| Box::pin(future::pending()), - }, - )?); - - #[cfg(target_family = "unix")] - let arguments = vec![ - "sh".to_string(), - "-c".to_string(), - 
"sleep infinity".to_string(), - ]; - #[cfg(target_family = "windows")] - // Windows is weird with timeout, so we use ping. See: - // https://www.ibm.com/support/pages/timeout-command-run-batch-job-exits-immediately-and-returns-error-input-redirection-not-supported-exiting-process-immediately - let arguments = vec![ - "cmd".to_string(), - "/C".to_string(), - "ping -n 99999 127.0.0.1".to_string(), - ]; - - let command = Command { - arguments, - output_paths: vec![], - working_directory: ".".to_string(), - environment_variables: vec![EnvironmentVariable { - name: "PATH".to_string(), - value: env::var("PATH").unwrap(), - }], - ..Default::default() - }; - let command_digest = serialize_and_upload_message( - &command, - cas_store.as_pin(), - &mut DigestHasherFunc::Sha256.hasher(), - ) - .await?; - let input_root_digest = serialize_and_upload_message( - &Directory::default(), - cas_store.as_pin(), - &mut DigestHasherFunc::Sha256.hasher(), - ) - .await?; - let action = Action { - command_digest: Some(command_digest.into()), - input_root_digest: Some(input_root_digest.into()), - ..Default::default() - }; - let action_digest = serialize_and_upload_message( - &action, - cas_store.as_pin(), - &mut DigestHasherFunc::Sha256.hasher(), - ) - .await?; - - let execute_request = ExecuteRequest { - action_digest: Some(action_digest.into()), - ..Default::default() - }; - let operation_id = OperationId::default().to_string(); - - let (cleanup_tx, cleanup_rx) = oneshot::channel(); - let cleanup_was_requested = AtomicBool::new(false); - let action = running_actions_manager - .create_and_add_action( - WORKER_ID.to_string(), - StartExecute { - execute_request: Some(execute_request), - operation_id, - queued_timestamp: Some(make_system_time(1000).into()), - platform: action.platform.clone(), - worker_id: WORKER_ID.to_string(), + Callbacks { + now_fn: test_monotonic_clock, + sleep_fn: |_duration| Box::pin(future::pending()), }, - ) - .await?; - let execute_results_fut = action - .clone() - 
.prepare_action() - .and_then(RunningAction::execute) - .and_then(RunningAction::upload_results) - .and_then(RunningAction::get_finished_result) - .then(|result| async { - cleanup_was_requested.store(true, Ordering::Release); - cleanup_rx.await.expect("Could not receive cleanup signal"); - if let Err(e) = action.cleanup().await { - return Result::::Err(e).merge(result); - } - result - }); - - tokio::pin!(execute_results_fut); - { - // Advance the action as far as possible and ensure we are not waiting on cleanup. - for _ in 0..100 { - assert!(futures::poll!(&mut execute_results_fut).is_pending()); - tokio::task::yield_now().await; - } - assert_eq!(cleanup_was_requested.load(Ordering::Acquire), false); - } - - let kill_all_fut = running_actions_manager.kill_all(); - tokio::pin!(kill_all_fut); + )?); + let action_result = { + let arguments = vec![ + "sh".to_string(), + "-c".to_string(), + "printf '123 ' > ./test.txt; mkdir ./tst; printf '456 ' > ./tst/tst.txt; printf 'foo-stdout '; >&2 printf 'bar-stderr '" + .to_string(), + ]; + let working_directory = "some_cwd"; + let command = Command { + arguments, + output_paths: vec!["test.txt".to_string(), "tst".to_string()], + working_directory: working_directory.to_string(), + environment_variables: vec![EnvironmentVariable { + name: "PATH".to_string(), + value: env::var("PATH").unwrap(), + }], + ..Default::default() + }; + let command_digest = serialize_and_upload_message( + &command, + cas_store.as_pin(), + &mut DigestHasherFunc::Sha256.hasher(), + ) + .await?; + let input_root_digest = serialize_and_upload_message( + &Directory { + directories: vec![DirectoryNode { + name: working_directory.to_string(), + digest: Some( + serialize_and_upload_message( + &Directory::default(), + cas_store.as_pin(), + &mut DigestHasherFunc::Sha256.hasher(), + ) + .await? 
+ .into(), + ), + }], + ..Default::default() + }, + cas_store.as_pin(), + &mut DigestHasherFunc::Sha256.hasher(), + ) + .await?; + let action = Action { + command_digest: Some(command_digest.into()), + input_root_digest: Some(input_root_digest.into()), + ..Default::default() + }; + let action_digest = serialize_and_upload_message( + &action, + cas_store.as_pin(), + &mut DigestHasherFunc::Sha256.hasher(), + ) + .await?; - { - // * Advance the action as far as possible. - // * Ensure we are now waiting on cleanup. - // * Ensure our kill_action is still pending. - while !cleanup_was_requested.load(Ordering::Acquire) { - // Wait for cleanup to be triggered. - tokio::task::yield_now().await; - assert!(futures::poll!(&mut execute_results_fut).is_pending()); - assert!(futures::poll!(&mut kill_all_fut).is_pending()); - } - } - // Allow cleanup, which allows execute_results_fut to advance. - cleanup_tx.send(()).expect("Could not send cleanup signal"); - // Advance our two futures to completion now. - let result = execute_results_fut.await; - kill_all_fut.await; - { - // Ensure our results are correct. - let action_result = result?; - let err = action_result - .error + let execute_request = ExecuteRequest { + action_digest: Some(action_digest.into()), + ..Default::default() + }; + let operation_id = OperationId::default().to_string(); + + let running_action_impl = running_actions_manager + .create_and_add_action( + WORKER_ID.to_string(), + StartExecute { + execute_request: Some(execute_request), + operation_id, + queued_timestamp: None, + platform: action.platform.clone(), + worker_id: WORKER_ID.to_string(), + }, + ) + .await?; + + run_action(running_action_impl.clone()).await? 
+ }; + let file_content = slow_store + .as_ref() + .get_part_unchunked(action_result.output_files[0].digest, 0, None) + .await?; + assert_eq!(from_utf8(&file_content)?, "123 "); + let stdout_content = slow_store + .as_ref() + .get_part_unchunked(action_result.stdout_digest, 0, None) + .await?; + assert_eq!(from_utf8(&stdout_content)?, "foo-stdout "); + let stderr_content = slow_store .as_ref() - .err_tip(|| format!("No error exists in result : {action_result:?}"))?; + .get_part_unchunked(action_result.stderr_digest, 0, None) + .await?; + assert_eq!(from_utf8(&stderr_content)?, "bar-stderr "); + let mut clock_time = make_system_time(0); assert_eq!( - err.code, - Code::Aborted, - "Expected Aborted : {action_result:?}" + action_result, + ActionResult { + output_files: vec![FileInfo { + name_or_path: NameOrPath::Path("test.txt".to_string()), + digest: DigestInfo::try_new( + "c69e10a5f54f4e28e33897fbd4f8701595443fa8c3004aeaa20dd4d9a463483b", + 4 + )?, + is_executable: false, + }], + stdout_digest: DigestInfo::try_new( + "15019a676f057d97d1ad3af86f3cc1e623cb33b18ff28422bbe3248d2471cc94", + 11 + )?, + stderr_digest: DigestInfo::try_new( + "2375ab8a01ca11e1ea7606dfb58756c153d49733cde1dbfb5a1e00f39afacf06", + 12 + )?, + exit_code: 0, + output_folders: vec![DirectoryInfo { + path: "tst".to_string(), + tree_digest: DigestInfo::try_new( + "95711c1905d4898a70209dd6e98241dcafb479c00241a1ea4ed8415710d706f3", + 166, + )?, + },], + output_file_symlinks: vec![], + output_directory_symlinks: vec![], + server_logs: HashMap::new(), + execution_metadata: ExecutionMetadata { + worker: WORKER_ID.to_string(), + queued_timestamp: SystemTime::UNIX_EPOCH, + worker_start_timestamp: increment_clock(&mut clock_time), + input_fetch_start_timestamp: increment_clock(&mut clock_time), + input_fetch_completed_timestamp: increment_clock(&mut clock_time), + execution_start_timestamp: increment_clock(&mut clock_time), + execution_completed_timestamp: increment_clock(&mut clock_time), + 
output_upload_start_timestamp: increment_clock(&mut clock_time), + output_upload_completed_timestamp: increment_clock(&mut clock_time), + worker_completed_timestamp: increment_clock(&mut clock_time), + }, + error: None, + message: String::new(), + } ); + Ok(()) } - Ok(()) -} + #[nativelink_test] + async fn running_actions_manager_respects_action_timeout() + -> Result<(), Box> { + const WORKER_ID: &str = "foo_worker_id"; -/// Regression Test for Issue #675 -#[cfg(target_family = "unix")] -#[serial] -#[nativelink_test] -async fn unix_executable_file_test() -> Result<(), Box> { - const WORKER_ID: &str = "foo_worker_id"; - const FILE_1_NAME: &str = "file1"; - - fn test_monotonic_clock() -> SystemTime { - static CLOCK: AtomicU64 = AtomicU64::new(0); - monotonic_clock(&CLOCK) - } + // Ignore the sleep and immediately timeout. + static ACTION_TIMEOUT: i64 = 1; + fn test_monotonic_clock() -> SystemTime { + static CLOCK: AtomicU64 = AtomicU64::new(0); + monotonic_clock(&CLOCK) + } - let (_, _, cas_store, ac_store) = setup_stores().await?; - let root_action_directory = make_temp_path("root_action_directory"); - fs::create_dir_all(&root_action_directory).await?; - - let running_actions_manager = Arc::new(RunningActionsManagerImpl::new_with_callbacks( - RunningActionsManagerArgs { - root_action_directory, - cas_store: cas_store.clone(), - ac_store: Some(Store::new(ac_store.clone())), - execution_configuration: ExecutionConfiguration::default(), - historical_store: Store::new(cas_store.clone()), - upload_action_result_config: &nativelink_config::cas_server::UploadActionResultConfig { - upload_ac_results_strategy: - nativelink_config::cas_server::UploadCacheResultsStrategy::Never, - ..Default::default() + let (_, _, cas_store, ac_store) = setup_stores().await?; + let root_action_directory = make_temp_path("root_work_directory"); + fs::create_dir_all(&root_action_directory).await?; + + let running_actions_manager = Arc::new(RunningActionsManagerImpl::new_with_callbacks( + 
RunningActionsManagerArgs { + root_action_directory, + execution_configuration: ExecutionConfiguration::default(), + cas_store: cas_store.clone(), + ac_store: Some(Store::new(ac_store.clone())), + historical_store: Store::new(cas_store.clone()), + upload_action_result_config: + &nativelink_config::cas_server::UploadActionResultConfig { + upload_ac_results_strategy: + nativelink_config::cas_server::UploadCacheResultsStrategy::Never, + ..Default::default() + }, + max_action_timeout: Duration::MAX, + max_upload_timeout: Duration::from_secs(DEFAULT_MAX_UPLOAD_TIMEOUT), + timeout_handled_externally: false, + directory_cache: None, + }, + Callbacks { + now_fn: test_monotonic_clock, + // If action_timeout is the passed duration then return immediately, + // which will cause the action to be killed and pass the test, + // otherwise return pending and fail the test. + sleep_fn: |duration| { + assert_eq!(duration.as_secs(), ACTION_TIMEOUT as u64); + Box::pin(future::ready(())) + }, }, - max_action_timeout: Duration::MAX, - timeout_handled_externally: false, - }, - Callbacks { - now_fn: test_monotonic_clock, - sleep_fn: |_duration| Box::pin(future::pending()), - }, - )?); - // Create and run an action which - // creates a file with owner executable permissions. 
- let action_result = { + )?); + #[cfg(target_family = "unix")] + let arguments = vec!["sh".to_string(), "-c".to_string(), "sleep 2".to_string()]; + #[cfg(target_family = "windows")] + let arguments = vec![ + "cmd".to_string(), + "/C".to_string(), + "ping -n 99999 127.0.0.1".to_string(), + ]; let command = Command { - arguments: vec![ - "sh".to_string(), - "-c".to_string(), - format!("touch {FILE_1_NAME} && chmod 700 {FILE_1_NAME}"), - ], - output_paths: vec![FILE_1_NAME.to_string()], + arguments, working_directory: ".".to_string(), environment_variables: vec![EnvironmentVariable { name: "PATH".to_string(), @@ -3077,6 +3579,16 @@ async fn unix_executable_file_test() -> Result<(), Box> let action = Action { command_digest: Some(command_digest.into()), input_root_digest: Some(input_root_digest.into()), + platform: Some(Platform { + properties: vec![Property { + name: "property_name".into(), + value: "property_value".into(), + }], + }), + timeout: Some(prost_types::Duration { + seconds: ACTION_TIMEOUT, + nanos: 0, + }), ..Default::default() }; let action_digest = serialize_and_upload_message( @@ -3093,186 +3605,62 @@ async fn unix_executable_file_test() -> Result<(), Box> let operation_id = OperationId::default().to_string(); let running_action_impl = running_actions_manager + .clone() .create_and_add_action( WORKER_ID.to_string(), StartExecute { execute_request: Some(execute_request), operation_id, - ..Default::default() + queued_timestamp: Some(make_system_time(1000).into()), + platform: action.platform.clone(), + worker_id: WORKER_ID.to_string(), }, ) .await?; - run_action(running_action_impl.clone()).await? - }; - // Ensure the file copied from worker to CAS is executable. 
- assert!( - action_result.output_files[0].is_executable, - "Expected output file to be executable" - ); - Ok(()) -} - -#[serial] -#[nativelink_test] -async fn action_directory_contents_are_cleaned() -> Result<(), Box> { - const WORKER_ID: &str = "foo_worker_id"; - - let (_, _, cas_store, ac_store) = setup_stores().await?; - let root_action_directory = make_temp_path("root_action_directory"); - fs::create_dir_all(&root_action_directory).await?; - let temp_action_directory = make_temp_path("root_action_directory/temp"); - fs::create_dir_all(&temp_action_directory).await?; - - let running_actions_manager = - Arc::new(RunningActionsManagerImpl::new(RunningActionsManagerArgs { - root_action_directory: root_action_directory.clone(), - execution_configuration: ExecutionConfiguration::default(), - cas_store: cas_store.clone(), - ac_store: Some(Store::new(ac_store.clone())), - historical_store: Store::new(cas_store.clone()), - upload_action_result_config: &nativelink_config::cas_server::UploadActionResultConfig { - upload_ac_results_strategy: - nativelink_config::cas_server::UploadCacheResultsStrategy::Never, - ..Default::default() - }, - max_action_timeout: Duration::MAX, - timeout_handled_externally: false, - })?); - let queued_timestamp = make_system_time(1000); - - #[cfg(target_family = "unix")] - let arguments = vec!["sh".to_string(), "-c".to_string(), "exit 0".to_string()]; - #[cfg(target_family = "windows")] - let arguments = vec!["cmd".to_string(), "/C".to_string(), "exit 0".to_string()]; - - let command = Command { - arguments, - output_paths: vec![], - working_directory: ".".to_string(), - environment_variables: vec![EnvironmentVariable { - name: "PATH".to_string(), - value: env::var("PATH").unwrap(), - }], - ..Default::default() - }; - let command_digest = serialize_and_upload_message( - &command, - cas_store.as_pin(), - &mut DigestHasherFunc::Sha256.hasher(), - ) - .await?; - let input_root_digest = serialize_and_upload_message( - &Directory::default(), - 
cas_store.as_pin(), - &mut DigestHasherFunc::Sha256.hasher(), - ) - .await?; - let action = Action { - command_digest: Some(command_digest.into()), - input_root_digest: Some(input_root_digest.into()), - ..Default::default() - }; - let action_digest = serialize_and_upload_message( - &action, - cas_store.as_pin(), - &mut DigestHasherFunc::Sha256.hasher(), - ) - .await?; - - let execute_request = ExecuteRequest { - action_digest: Some(action_digest.into()), - ..Default::default() - }; - let operation_id = OperationId::default().to_string(); - - let running_action_impl = running_actions_manager - .create_and_add_action( - WORKER_ID.to_string(), - StartExecute { - execute_request: Some(execute_request), - operation_id, - queued_timestamp: Some(queued_timestamp.into()), - platform: action.platform.clone(), - worker_id: WORKER_ID.to_string(), - }, - ) - .await?; + let result = run_action(running_action_impl).await?; - run_action(running_action_impl.clone()).await?; + #[cfg(target_family = "unix")] + assert_eq!(result.exit_code, 9, "Action process should be been killed"); + #[cfg(target_family = "windows")] + assert_eq!(result.exit_code, 1, "Action process should be been killed"); + Ok(()) + } - let mut dir_stream = fs::read_dir(&root_action_directory).await?; - assert!( - dir_stream.as_mut().next_entry().await?.is_none(), - "Expected empty directory at {root_action_directory}" - ); - Ok(()) -} + #[nativelink_test] + async fn test_handles_stale_directory_on_retry() -> Result<(), Error> { + const WORKER_ID: &str = "foo_worker_id"; + let (_, ac_store, cas_store, _) = setup_stores().await?; + let root_action_directory = make_temp_path("retry_work_directory"); -// We've experienced deadlocks when uploading, so make only a single permit available and -// check it's able to handle uploading some directories with some files in. -// Note: If this test is failing or timing out, check that other tests in this file -// are also `#[serial]`. 
-// TODO(palfrey) This is unix only only because I was lazy and didn't spend the time to -// build the bash-like commands in windows as well. -#[serial] -#[nativelink_test] -#[cfg(target_family = "unix")] -async fn upload_with_single_permit() -> Result<(), Box> { - const WORKER_ID: &str = "foo_worker_id"; - - fn test_monotonic_clock() -> SystemTime { - static CLOCK: AtomicU64 = AtomicU64::new(0); - monotonic_clock(&CLOCK) - } + // Ensure root directory exists + fs::create_dir_all(&root_action_directory).await?; - let (_, slow_store, cas_store, ac_store) = setup_stores().await?; - let root_action_directory = make_temp_path("root_action_directory"); - fs::create_dir_all(&root_action_directory).await?; + let running_actions_manager = + Arc::new(RunningActionsManagerImpl::new(RunningActionsManagerArgs { + root_action_directory: root_action_directory.clone(), + execution_configuration: ExecutionConfiguration { + entrypoint: None, + additional_environment: None, + }, + cas_store: cas_store.clone(), + ac_store: Some(Store::new(ac_store.clone())), + historical_store: Store::new(cas_store.clone()), + upload_action_result_config: + &nativelink_config::cas_server::UploadActionResultConfig { + upload_ac_results_strategy: + nativelink_config::cas_server::UploadCacheResultsStrategy::Never, + ..Default::default() + }, + max_action_timeout: Duration::MAX, + max_upload_timeout: Duration::from_secs(DEFAULT_MAX_UPLOAD_TIMEOUT), + timeout_handled_externally: false, + directory_cache: None, + })?); - // Take all but one FD permit away. 
- let _permits = stream::iter(1..fs::OPEN_FILE_SEMAPHORE.available_permits()) - .then(|_| fs::OPEN_FILE_SEMAPHORE.acquire()) - .try_collect::>() - .await?; - assert_eq!(1, fs::OPEN_FILE_SEMAPHORE.available_permits()); - - let running_actions_manager = Arc::new(RunningActionsManagerImpl::new_with_callbacks( - RunningActionsManagerArgs { - root_action_directory, - execution_configuration: ExecutionConfiguration::default(), - cas_store: cas_store.clone(), - ac_store: Some(Store::new(ac_store.clone())), - historical_store: Store::new(cas_store.clone()), - upload_action_result_config: &nativelink_config::cas_server::UploadActionResultConfig { - upload_ac_results_strategy: - nativelink_config::cas_server::UploadCacheResultsStrategy::Never, - ..Default::default() - }, - max_action_timeout: Duration::MAX, - timeout_handled_externally: false, - }, - Callbacks { - now_fn: test_monotonic_clock, - sleep_fn: |_duration| Box::pin(future::pending()), - }, - )?); - let action_result = { - let arguments = vec![ - "sh".to_string(), - "-c".to_string(), - "printf '123 ' > ./test.txt; mkdir ./tst; printf '456 ' > ./tst/tst.txt; printf 'foo-stdout '; >&2 printf 'bar-stderr '" - .to_string(), - ]; - let working_directory = "some_cwd"; + // Create a simple action let command = Command { - arguments, - output_paths: vec!["test.txt".to_string(), "tst".to_string()], - working_directory: working_directory.to_string(), - environment_variables: vec![EnvironmentVariable { - name: "PATH".to_string(), - value: env::var("PATH").unwrap(), - }], + arguments: vec!["echo".to_string(), "test".to_string()], ..Default::default() }; let command_digest = serialize_and_upload_message( @@ -3282,21 +3670,7 @@ async fn upload_with_single_permit() -> Result<(), Box> ) .await?; let input_root_digest = serialize_and_upload_message( - &Directory { - directories: vec![DirectoryNode { - name: working_directory.to_string(), - digest: Some( - serialize_and_upload_message( - &Directory::default(), - cas_store.as_pin(), 
- &mut DigestHasherFunc::Sha256.hasher(), - ) - .await? - .into(), - ), - }], - ..Default::default() - }, + &Directory::default(), cas_store.as_pin(), &mut DigestHasherFunc::Sha256.hasher(), ) @@ -3317,459 +3691,197 @@ async fn upload_with_single_permit() -> Result<(), Box> action_digest: Some(action_digest.into()), ..Default::default() }; - let operation_id = OperationId::default().to_string(); - let running_action_impl = running_actions_manager + // Use a fixed operation ID to simulate retry with same ID + let operation_id = "test-retry-operation-fixed-id".to_string(); + + // Create the directory manually to simulate a previous failed action + let action_directory = format!("{root_action_directory}/{operation_id}"); + eprintln!("Creating directory: {action_directory}"); + fs::create_dir_all(&action_directory).await?; + + // Also create the work subdirectory to ensure conflict + let work_directory = format!("{action_directory}/work"); + fs::create_dir_all(&work_directory).await?; + + // Add a marker file to detect if directory is deleted and recreated + let marker_file = format!("{action_directory}/marker.txt"); + tokio::fs::write(&marker_file, "test").await?; + + // Verify the directory was created + assert!( + tokio::fs::metadata(&action_directory).await.is_ok(), + "Directory should exist" + ); + assert!( + tokio::fs::metadata(&work_directory).await.is_ok(), + "Work directory should exist" + ); + assert!( + tokio::fs::metadata(&marker_file).await.is_ok(), + "Marker file should exist" + ); + + // Now try to create an action with the same operation ID + // This should fail with "File exists" error + eprintln!("Attempting to create action with existing directory..."); + let result = running_actions_manager .create_and_add_action( WORKER_ID.to_string(), StartExecute { execute_request: Some(execute_request), - operation_id, - queued_timestamp: None, - platform: action.platform.clone(), + operation_id: operation_id.clone(), + queued_timestamp: 
Some(SystemTime::now().into()), + platform: None, worker_id: WORKER_ID.to_string(), }, ) - .await?; + .await; - run_action(running_action_impl.clone()).await? - }; - let file_content = slow_store - .as_ref() - .get_part_unchunked(action_result.output_files[0].digest, 0, None) - .await?; - assert_eq!(from_utf8(&file_content)?, "123 "); - let stdout_content = slow_store - .as_ref() - .get_part_unchunked(action_result.stdout_digest, 0, None) - .await?; - assert_eq!(from_utf8(&stdout_content)?, "foo-stdout "); - let stderr_content = slow_store - .as_ref() - .get_part_unchunked(action_result.stderr_digest, 0, None) - .await?; - assert_eq!(from_utf8(&stderr_content)?, "bar-stderr "); - let mut clock_time = make_system_time(0); - assert_eq!( - action_result, - ActionResult { - output_files: vec![FileInfo { - name_or_path: NameOrPath::Path("test.txt".to_string()), - digest: DigestInfo::try_new( - "c69e10a5f54f4e28e33897fbd4f8701595443fa8c3004aeaa20dd4d9a463483b", - 4 - )?, - is_executable: false, - }], - stdout_digest: DigestInfo::try_new( - "15019a676f057d97d1ad3af86f3cc1e623cb33b18ff28422bbe3248d2471cc94", - 11 - )?, - stderr_digest: DigestInfo::try_new( - "2375ab8a01ca11e1ea7606dfb58756c153d49733cde1dbfb5a1e00f39afacf06", - 12 - )?, - exit_code: 0, - output_folders: vec![DirectoryInfo { - path: "tst".to_string(), - tree_digest: DigestInfo::try_new( - "95711c1905d4898a70209dd6e98241dcafb479c00241a1ea4ed8415710d706f3", - 166, - )?, - },], - output_file_symlinks: vec![], - output_directory_symlinks: vec![], - server_logs: HashMap::new(), - execution_metadata: ExecutionMetadata { - worker: WORKER_ID.to_string(), - queued_timestamp: SystemTime::UNIX_EPOCH, - worker_start_timestamp: increment_clock(&mut clock_time), - input_fetch_start_timestamp: increment_clock(&mut clock_time), - input_fetch_completed_timestamp: increment_clock(&mut clock_time), - execution_start_timestamp: increment_clock(&mut clock_time), - execution_completed_timestamp: increment_clock(&mut clock_time), 
- output_upload_start_timestamp: increment_clock(&mut clock_time), - output_upload_completed_timestamp: increment_clock(&mut clock_time), - worker_completed_timestamp: increment_clock(&mut clock_time), - }, - error: None, - message: String::new(), + // Verify the behavior - with the fix, it should succeed after removing stale directory + match result { + Ok(_) => { + // Check if the directory still exists and if marker file is gone + let dir_exists = tokio::fs::metadata(&action_directory).await.is_ok(); + let marker_exists = tokio::fs::metadata(&marker_file).await.is_ok(); + eprintln!( + "SUCCESS: Directory collision handled gracefully. Directory exists: {dir_exists}, Marker exists: {marker_exists}" + ); + assert!( + dir_exists, + "Directory should exist after successful creation" + ); + assert!( + !marker_exists, + "Marker file should be gone - stale directory was cleaned up" + ); + eprintln!( + "PASSED: The fix is working - stale directory was removed and action proceeded" + ); + } + Err(err) => { + panic!("Expected success after fix, but got error: {err}"); + } } - ); - Ok(()) -} -#[serial] -#[nativelink_test] -async fn running_actions_manager_respects_action_timeout() -> Result<(), Box> -{ - const WORKER_ID: &str = "foo_worker_id"; - - // Ignore the sleep and immediately timeout. 
- static ACTION_TIMEOUT: i64 = 1; - fn test_monotonic_clock() -> SystemTime { - static CLOCK: AtomicU64 = AtomicU64::new(0); - monotonic_clock(&CLOCK) + // Clean up + fs::remove_dir_all(&root_action_directory).await?; + Ok(()) } - let (_, _, cas_store, ac_store) = setup_stores().await?; - let root_action_directory = make_temp_path("root_work_directory"); - fs::create_dir_all(&root_action_directory).await?; - - let running_actions_manager = Arc::new(RunningActionsManagerImpl::new_with_callbacks( - RunningActionsManagerArgs { - root_action_directory, - execution_configuration: ExecutionConfiguration::default(), - cas_store: cas_store.clone(), - ac_store: Some(Store::new(ac_store.clone())), - historical_store: Store::new(cas_store.clone()), - upload_action_result_config: &nativelink_config::cas_server::UploadActionResultConfig { - upload_ac_results_strategy: - nativelink_config::cas_server::UploadCacheResultsStrategy::Never, - ..Default::default() - }, - max_action_timeout: Duration::MAX, - timeout_handled_externally: false, - }, - Callbacks { - now_fn: test_monotonic_clock, - // If action_timeout is the passed duration then return immediately, - // which will cause the action to be killed and pass the test, - // otherwise return pending and fail the test. 
- sleep_fn: |duration| { - assert_eq!(duration.as_secs(), ACTION_TIMEOUT as u64); - Box::pin(future::ready(())) - }, - }, - )?); - #[cfg(target_family = "unix")] - let arguments = vec!["sh".to_string(), "-c".to_string(), "sleep 2".to_string()]; - #[cfg(target_family = "windows")] - let arguments = vec![ - "cmd".to_string(), - "/C".to_string(), - "ping -n 99999 127.0.0.1".to_string(), - ]; - let command = Command { - arguments, - working_directory: ".".to_string(), - environment_variables: vec![EnvironmentVariable { - name: "PATH".to_string(), - value: env::var("PATH").unwrap(), - }], - ..Default::default() - }; - let command_digest = serialize_and_upload_message( - &command, - cas_store.as_pin(), - &mut DigestHasherFunc::Sha256.hasher(), - ) - .await?; - let input_root_digest = serialize_and_upload_message( - &Directory::default(), - cas_store.as_pin(), - &mut DigestHasherFunc::Sha256.hasher(), - ) - .await?; - let action = Action { - command_digest: Some(command_digest.into()), - input_root_digest: Some(input_root_digest.into()), - platform: Some(Platform { - properties: vec![Property { - name: "property_name".into(), - value: "property_value".into(), - }], - }), - timeout: Some(prost_types::Duration { - seconds: ACTION_TIMEOUT, - nanos: 0, - }), - ..Default::default() - }; - let action_digest = serialize_and_upload_message( - &action, - cas_store.as_pin(), - &mut DigestHasherFunc::Sha256.hasher(), - ) - .await?; - - let execute_request = ExecuteRequest { - action_digest: Some(action_digest.into()), - ..Default::default() - }; - let operation_id = OperationId::default().to_string(); - - let running_action_impl = running_actions_manager - .clone() - .create_and_add_action( - WORKER_ID.to_string(), - StartExecute { - execute_request: Some(execute_request), - operation_id, - queued_timestamp: Some(make_system_time(1000).into()), - platform: action.platform.clone(), - worker_id: WORKER_ID.to_string(), - }, - ) - .await?; - - let result = 
run_action(running_action_impl).await?; + #[nativelink_test] + async fn test_retry_after_cleanup_succeeds() -> Result<(), Error> { + const WORKER_ID: &str = "foo_worker_id"; + let (_, ac_store, cas_store, _) = setup_stores().await?; + let root_action_directory = make_temp_path("retry_after_cleanup_work_directory"); - #[cfg(target_family = "unix")] - assert_eq!(result.exit_code, 9, "Action process should be been killed"); - #[cfg(target_family = "windows")] - assert_eq!(result.exit_code, 1, "Action process should be been killed"); - Ok(()) -} + // Ensure root directory exists + fs::create_dir_all(&root_action_directory).await?; -#[nativelink_test] -async fn test_handles_stale_directory_on_retry() -> Result<(), Error> { - const WORKER_ID: &str = "foo_worker_id"; - let (_, ac_store, cas_store, _) = setup_stores().await?; - let root_action_directory = make_temp_path("retry_work_directory"); - - // Ensure root directory exists - fs::create_dir_all(&root_action_directory).await?; - - let running_actions_manager = - Arc::new(RunningActionsManagerImpl::new(RunningActionsManagerArgs { - root_action_directory: root_action_directory.clone(), - execution_configuration: ExecutionConfiguration { - entrypoint: None, - additional_environment: None, - }, - cas_store: cas_store.clone(), - ac_store: Some(Store::new(ac_store.clone())), - historical_store: Store::new(cas_store.clone()), - upload_action_result_config: &nativelink_config::cas_server::UploadActionResultConfig { - upload_ac_results_strategy: - nativelink_config::cas_server::UploadCacheResultsStrategy::Never, - ..Default::default() - }, - max_action_timeout: Duration::MAX, - timeout_handled_externally: false, - })?); - - // Create a simple action - let command = Command { - arguments: vec!["echo".to_string(), "test".to_string()], - ..Default::default() - }; - let command_digest = serialize_and_upload_message( - &command, - cas_store.as_pin(), - &mut DigestHasherFunc::Sha256.hasher(), - ) - .await?; - let input_root_digest = 
serialize_and_upload_message( - &Directory::default(), - cas_store.as_pin(), - &mut DigestHasherFunc::Sha256.hasher(), - ) - .await?; - let action = Action { - command_digest: Some(command_digest.into()), - input_root_digest: Some(input_root_digest.into()), - ..Default::default() - }; - let action_digest = serialize_and_upload_message( - &action, - cas_store.as_pin(), - &mut DigestHasherFunc::Sha256.hasher(), - ) - .await?; - - let execute_request = ExecuteRequest { - action_digest: Some(action_digest.into()), - ..Default::default() - }; + let running_actions_manager = + Arc::new(RunningActionsManagerImpl::new(RunningActionsManagerArgs { + root_action_directory: root_action_directory.clone(), + execution_configuration: ExecutionConfiguration { + entrypoint: None, + additional_environment: None, + }, + cas_store: cas_store.clone(), + ac_store: Some(Store::new(ac_store.clone())), + historical_store: Store::new(cas_store.clone()), + upload_action_result_config: + &nativelink_config::cas_server::UploadActionResultConfig { + upload_ac_results_strategy: + nativelink_config::cas_server::UploadCacheResultsStrategy::Never, + ..Default::default() + }, + max_action_timeout: Duration::MAX, + max_upload_timeout: Duration::from_secs(DEFAULT_MAX_UPLOAD_TIMEOUT), + timeout_handled_externally: false, + directory_cache: None, + })?); - // Use a fixed operation ID to simulate retry with same ID - let operation_id = "test-retry-operation-fixed-id".to_string(); - - // Create the directory manually to simulate a previous failed action - let action_directory = format!("{root_action_directory}/{operation_id}"); - eprintln!("Creating directory: {action_directory}"); - fs::create_dir_all(&action_directory).await?; - - // Also create the work subdirectory to ensure conflict - let work_directory = format!("{action_directory}/work"); - fs::create_dir_all(&work_directory).await?; - - // Add a marker file to detect if directory is deleted and recreated - let marker_file = 
format!("{action_directory}/marker.txt"); - tokio::fs::write(&marker_file, "test").await?; - - // Verify the directory was created - assert!( - tokio::fs::metadata(&action_directory).await.is_ok(), - "Directory should exist" - ); - assert!( - tokio::fs::metadata(&work_directory).await.is_ok(), - "Work directory should exist" - ); - assert!( - tokio::fs::metadata(&marker_file).await.is_ok(), - "Marker file should exist" - ); - - // Now try to create an action with the same operation ID - // This should fail with "File exists" error - eprintln!("Attempting to create action with existing directory..."); - let result = running_actions_manager - .create_and_add_action( - WORKER_ID.to_string(), - StartExecute { - execute_request: Some(execute_request), - operation_id: operation_id.clone(), - queued_timestamp: Some(SystemTime::now().into()), - platform: None, - worker_id: WORKER_ID.to_string(), - }, + // Create a simple action + let command = Command { + arguments: vec!["echo".to_string(), "test".to_string()], + ..Default::default() + }; + let command_digest = serialize_and_upload_message( + &command, + cas_store.as_pin(), + &mut DigestHasherFunc::Sha256.hasher(), ) - .await; - - // Verify the behavior - with the fix, it should succeed after removing stale directory - match result { - Ok(_) => { - // Check if the directory still exists and if marker file is gone - let dir_exists = tokio::fs::metadata(&action_directory).await.is_ok(); - let marker_exists = tokio::fs::metadata(&marker_file).await.is_ok(); - eprintln!( - "SUCCESS: Directory collision handled gracefully. 
Directory exists: {dir_exists}, Marker exists: {marker_exists}" - ); - assert!( - dir_exists, - "Directory should exist after successful creation" - ); - assert!( - !marker_exists, - "Marker file should be gone - stale directory was cleaned up" - ); - eprintln!( - "PASSED: The fix is working - stale directory was removed and action proceeded" - ); - } - Err(err) => { - panic!("Expected success after fix, but got error: {err}"); - } - } + .await?; + let input_root_digest = serialize_and_upload_message( + &Directory::default(), + cas_store.as_pin(), + &mut DigestHasherFunc::Sha256.hasher(), + ) + .await?; + let action = Action { + command_digest: Some(command_digest.into()), + input_root_digest: Some(input_root_digest.into()), + ..Default::default() + }; + let action_digest = serialize_and_upload_message( + &action, + cas_store.as_pin(), + &mut DigestHasherFunc::Sha256.hasher(), + ) + .await?; - // Clean up - fs::remove_dir_all(&root_action_directory).await?; - Ok(()) -} + let execute_request = ExecuteRequest { + action_digest: Some(action_digest.into()), + ..Default::default() + }; -#[nativelink_test] -async fn test_retry_after_cleanup_succeeds() -> Result<(), Error> { - const WORKER_ID: &str = "foo_worker_id"; - let (_, ac_store, cas_store, _) = setup_stores().await?; - let root_action_directory = make_temp_path("retry_after_cleanup_work_directory"); - - // Ensure root directory exists - fs::create_dir_all(&root_action_directory).await?; - - let running_actions_manager = - Arc::new(RunningActionsManagerImpl::new(RunningActionsManagerArgs { - root_action_directory: root_action_directory.clone(), - execution_configuration: ExecutionConfiguration { - entrypoint: None, - additional_environment: None, - }, - cas_store: cas_store.clone(), - ac_store: Some(Store::new(ac_store.clone())), - historical_store: Store::new(cas_store.clone()), - upload_action_result_config: &nativelink_config::cas_server::UploadActionResultConfig { - upload_ac_results_strategy: - 
nativelink_config::cas_server::UploadCacheResultsStrategy::Never, - ..Default::default() - }, - max_action_timeout: Duration::MAX, - timeout_handled_externally: false, - })?); - - // Create a simple action - let command = Command { - arguments: vec!["echo".to_string(), "test".to_string()], - ..Default::default() - }; - let command_digest = serialize_and_upload_message( - &command, - cas_store.as_pin(), - &mut DigestHasherFunc::Sha256.hasher(), - ) - .await?; - let input_root_digest = serialize_and_upload_message( - &Directory::default(), - cas_store.as_pin(), - &mut DigestHasherFunc::Sha256.hasher(), - ) - .await?; - let action = Action { - command_digest: Some(command_digest.into()), - input_root_digest: Some(input_root_digest.into()), - ..Default::default() - }; - let action_digest = serialize_and_upload_message( - &action, - cas_store.as_pin(), - &mut DigestHasherFunc::Sha256.hasher(), - ) - .await?; - - let execute_request = ExecuteRequest { - action_digest: Some(action_digest.into()), - ..Default::default() - }; + let operation_id = "test-retry-after-cleanup-fixed-id".to_string(); - let operation_id = "test-retry-after-cleanup-fixed-id".to_string(); - - // First, create and execute an action - let action1 = running_actions_manager - .create_and_add_action( - WORKER_ID.to_string(), - StartExecute { - execute_request: Some(execute_request.clone()), - operation_id: operation_id.clone(), - queued_timestamp: Some(SystemTime::now().into()), - platform: None, - worker_id: WORKER_ID.to_string(), - }, - ) - .await?; + // First, create and execute an action + let action1 = running_actions_manager + .create_and_add_action( + WORKER_ID.to_string(), + StartExecute { + execute_request: Some(execute_request.clone()), + operation_id: operation_id.clone(), + queued_timestamp: Some(SystemTime::now().into()), + platform: None, + worker_id: WORKER_ID.to_string(), + }, + ) + .await?; - // Clean up the action - action1.cleanup().await?; - - // Give cleanup a moment to complete - 
tokio::time::sleep(Duration::from_millis(100)).await; - - // Now try to create another action with the same operation ID - // This should succeed because the directory has been cleaned up - let result = running_actions_manager - .create_and_add_action( - WORKER_ID.to_string(), - StartExecute { - execute_request: Some(execute_request), - operation_id: operation_id.clone(), - queued_timestamp: Some(SystemTime::now().into()), - platform: None, - worker_id: WORKER_ID.to_string(), - }, - ) - .await; + // Clean up the action + action1.cleanup().await?; + + // Give cleanup a moment to complete + tokio::time::sleep(Duration::from_millis(100)).await; + + // Now try to create another action with the same operation ID + // This should succeed because the directory has been cleaned up + let result = running_actions_manager + .create_and_add_action( + WORKER_ID.to_string(), + StartExecute { + execute_request: Some(execute_request), + operation_id: operation_id.clone(), + queued_timestamp: Some(SystemTime::now().into()), + platform: None, + worker_id: WORKER_ID.to_string(), + }, + ) + .await; - assert!( - result.is_ok(), - "Expected success when creating action after cleanup, got: {:?}", - result.err() - ); + assert!( + result.is_ok(), + "Expected success when creating action after cleanup, got: {:?}", + result.err() + ); - // Clean up - if let Ok(action2) = result { - action2.cleanup().await?; + // Clean up + if let Ok(action2) = result { + action2.cleanup().await?; + } + fs::remove_dir_all(&root_action_directory).await?; + Ok(()) } - fs::remove_dir_all(&root_action_directory).await?; - Ok(()) } diff --git a/nativelink-worker/tests/utils/local_worker_test_utils.rs b/nativelink-worker/tests/utils/local_worker_test_utils.rs index 8edef5614..adbef171e 100644 --- a/nativelink-worker/tests/utils/local_worker_test_utils.rs +++ b/nativelink-worker/tests/utils/local_worker_test_utils.rs @@ -21,7 +21,8 @@ use hyper::body::Frame; use nativelink_config::cas_server::{EndpointConfig, 
LocalWorkerConfig, WorkerProperty}; use nativelink_error::Error; use nativelink_proto::com::github::trace_machina::nativelink::remote_execution::{ - ConnectWorkerRequest, ExecuteResult, GoingAwayRequest, KeepAliveRequest, UpdateForWorker, + ConnectWorkerRequest, ExecuteComplete, ExecuteResult, GoingAwayRequest, KeepAliveRequest, + UpdateForWorker, }; use nativelink_util::channel_body_for_tests::ChannelBody; use nativelink_util::shutdown_guard::ShutdownGuard; @@ -53,6 +54,7 @@ const BROADCAST_CAPACITY: usize = 1; enum WorkerClientApiCalls { ConnectWorker(ConnectWorkerRequest), ExecutionResponse(ExecuteResult), + GoingAway(GoingAwayRequest), } #[derive(Debug)] @@ -62,7 +64,8 @@ enum WorkerClientApiCalls { )] enum WorkerClientApiReturns { ConnectWorker(Result>, Status>), - ExecutionResponse(Result, Status>), + ExecutionResponse(Result<(), Error>), + GoingAway(Result<(), Error>), } #[derive(Clone)] @@ -107,6 +110,9 @@ impl MockWorkerApiClient { req @ WorkerClientApiCalls::ExecutionResponse(_) => { panic!("expect_connect_worker expected ConnectWorker, got : {req:?}") } + req @ WorkerClientApiCalls::GoingAway(_) => { + panic!("expect_connect_worker expected ConnectWorker, got : {req:?}") + } }; self.tx_resp .send(WorkerClientApiReturns::ConnectWorker(result)) @@ -116,7 +122,7 @@ impl MockWorkerApiClient { pub(crate) async fn expect_execution_response( &self, - result: Result, Status>, + result: Result<(), Error>, ) -> ExecuteResult { let mut rx_call_lock = self.rx_call.lock().await; let req = match rx_call_lock @@ -128,12 +134,39 @@ impl MockWorkerApiClient { req @ WorkerClientApiCalls::ConnectWorker(_) => { panic!("expect_execution_response expected ExecutionResponse, got : {req:?}") } + req @ WorkerClientApiCalls::GoingAway(_) => { + panic!("expect_execution_response expected ExecutionResponse, got : {req:?}") + } }; self.tx_resp .send(WorkerClientApiReturns::ExecutionResponse(result)) .expect("Could not send request to mpsc"); req } + + pub(crate) async fn 
expect_going_away( + &self, + result: Result<(), Error>, + ) -> GoingAwayRequest { + let mut rx_call_lock = self.rx_call.lock().await; + let req = match rx_call_lock + .recv() + .await + .expect("Could not receive msg in mpsc") + { + WorkerClientApiCalls::GoingAway(req) => req, + req @ WorkerClientApiCalls::ConnectWorker(_) => { + panic!("expect_going_away expected GoingAway, got : {req:?}") + } + req @ WorkerClientApiCalls::ExecutionResponse(_) => { + panic!("expect_going_away expected GoingAway, got : {req:?}") + } + }; + self.tx_resp + .send(WorkerClientApiReturns::GoingAway(result)) + .expect("Could not send request to mpsc"); + req + } } impl WorkerApiClientTrait for MockWorkerApiClient { @@ -153,19 +186,38 @@ impl WorkerApiClientTrait for MockWorkerApiClient { WorkerClientApiReturns::ConnectWorker(result) => result, resp @ WorkerClientApiReturns::ExecutionResponse(_) => { panic!("connect_worker expected ConnectWorker response, received {resp:?}") - } + }, + resp @ WorkerClientApiReturns::GoingAway(_) => { + panic!("connect_worker expected ConnectWorker response, received {resp:?}") + }, } } - async fn keep_alive(&mut self, _request: KeepAliveRequest) -> Result, Status> { + async fn keep_alive(&mut self, _request: KeepAliveRequest) -> Result<(), Error> { unreachable!(); } - async fn going_away(&mut self, _request: GoingAwayRequest) -> Result, Status> { - unreachable!(); + async fn going_away(&mut self, request: GoingAwayRequest) -> Result<(), Error> { + self.tx_call + .send(WorkerClientApiCalls::GoingAway(request)) + .expect("Could not send request to mpsc"); + let mut rx_resp_lock = self.rx_resp.lock().await; + match rx_resp_lock + .recv() + .await + .expect("Could not receive msg in mpsc") + { + WorkerClientApiReturns::GoingAway(result) => result, + resp @ WorkerClientApiReturns::ConnectWorker(_) => { + panic!("going_away expected GoingAway response, received {resp:?}") + } + resp @ WorkerClientApiReturns::ExecutionResponse(_) => { + panic!("going_away 
expected GoingAway response, received {resp:?}") + } + } } - async fn execution_response(&mut self, request: ExecuteResult) -> Result, Status> { + async fn execution_response(&mut self, request: ExecuteResult) -> Result<(), Error> { self.tx_call .send(WorkerClientApiCalls::ExecutionResponse(request)) .expect("Could not send request to mpsc"); @@ -179,8 +231,15 @@ impl WorkerApiClientTrait for MockWorkerApiClient { resp @ WorkerClientApiReturns::ConnectWorker(_) => { panic!("execution_response expected ExecutionResponse response, received {resp:?}") } + resp @ WorkerClientApiReturns::GoingAway(_) => { + panic!("execution_response expected ExecutionResponse response, received {resp:?}") + } } } + + async fn execution_complete(&mut self, _request: ExecuteComplete) -> Result<(), Error> { + Ok(()) + } } pub(crate) fn setup_grpc_stream() -> ( @@ -212,7 +271,7 @@ pub(crate) async fn setup_local_worker_with_config( let (shutdown_tx_test, _) = broadcast::channel::(BROADCAST_CAPACITY); let drop_guard = spawn!("local_worker_spawn", async move { - worker.run(shutdown_tx_test.subscribe()).await + worker.run(shutdown_tx_test.clone(), shutdown_tx_test.subscribe()).await }); let (tx_stream, streaming_response) = setup_grpc_stream(); diff --git a/nativelink-worker/tests/worker_utils_test.rs b/nativelink-worker/tests/worker_utils_test.rs new file mode 100644 index 000000000..62e16b574 --- /dev/null +++ b/nativelink-worker/tests/worker_utils_test.rs @@ -0,0 +1,34 @@ +#![cfg(target_family = "unix")] +use std::collections::HashMap; +use std::env; + +use nativelink_config::cas_server::WorkerProperty; +use nativelink_error::Error; +use nativelink_macro::nativelink_test; +use nativelink_proto::build::bazel::remote::execution::v2::platform::Property; +use nativelink_worker::worker_utils::make_connect_worker_request; + +#[nativelink_test] +async fn make_connect_worker_request_with_extra_envs() -> Result<(), Error> { + let mut worker_properties: HashMap = HashMap::new(); + 
worker_properties.insert( + "test".into(), + WorkerProperty::QueryCmd("bash -c \"echo $DEMO_ENV\"".to_string()), + ); + let mut extra_envs = HashMap::new(); + extra_envs.insert("DEMO_ENV".into(), "test_value_for_demo_env".into()); + + // So we have bash for nix cases, because the PATH gets reset + extra_envs.insert("PATH".into(), env::var("PATH").unwrap()); + + let res = + make_connect_worker_request("1234".to_string(), &worker_properties, &extra_envs, 1).await?; + assert_eq!( + res.properties.first(), + Some(&Property { + name: "test".into(), + value: "test_value_for_demo_env".into() + }) + ); + Ok(()) +} diff --git a/src/bin/cas_speed_check.rs b/src/bin/cas_speed_check.rs new file mode 100644 index 000000000..e731d8c61 --- /dev/null +++ b/src/bin/cas_speed_check.rs @@ -0,0 +1,105 @@ +use core::time::Duration; +use std::sync::Arc; + +use clap::Parser; +use nativelink_error::{Error, ResultExt}; +use nativelink_proto::build::bazel::remote::execution::v2::content_addressable_storage_client::ContentAddressableStorageClient; +use nativelink_proto::build::bazel::remote::execution::v2::{ + Digest, FindMissingBlobsRequest, digest_function, +}; +use nativelink_util::spawn; +use nativelink_util::telemetry::init_tracing; +use nativelink_util::tls_utils::endpoint_from; +use rand::{Rng, RngCore}; +use sha2::{Digest as _, Sha256}; +use tokio::sync::Mutex; +use tokio::time::Instant; +use tonic::Request; +use tonic::transport::ClientTlsConfig; +use tracing::info; + +#[derive(Parser, Debug)] +#[command(version, about)] +struct Args { + #[arg(short, long)] + endpoint: String, + + #[arg(short, long)] + nativelink_key: Option, +} + +fn main() -> Result<(), Box> { + let args = Args::parse(); + #[expect( + clippy::disallowed_methods, + reason = "It's the top-level, so we need the function" + )] + tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build() + .unwrap() + .block_on(async { + init_tracing().await?; + let timings = Arc::new(Mutex::new(Vec::new())); + let spawns: 
Vec<_> = (0..200) + .map(|_| { + let local_timings = timings.clone(); + let local_endpoint = args.endpoint.clone(); + let local_api_key = args.nativelink_key.clone(); + spawn!("CAS requester", async move { + let tls_config = ClientTlsConfig::new().with_enabled_roots(); + let endpoint = endpoint_from(&local_endpoint, Some(tls_config))?; + let channel = endpoint.connect().await.unwrap(); + + let mut client = ContentAddressableStorageClient::new(channel); + + for _ in 0..100 { + let raw_data: String = rand::rng() + .sample_iter::(rand::distr::StandardUniform) + .take(300) + .collect(); + let hashed = Sha256::digest(raw_data.as_bytes()); + let rand_hash = hex::encode(hashed); + let digest = Digest { + hash: rand_hash, + size_bytes: i64::from(rand::rng().next_u32()), + }; + + let mut request = Request::new(FindMissingBlobsRequest { + instance_name: String::new(), + blob_digests: vec![digest.clone()], + digest_function: digest_function::Value::Sha256.into(), + }); + if let Some(ref api_key) = local_api_key { + request + .metadata_mut() + .insert("x-nativelink-api-key", api_key.parse().unwrap()); + } + let start = Instant::now(); + client + .find_missing_blobs(request) + .await + .err_tip(|| "in find_missing_blobs")? 
+ .into_inner(); + let duration = Instant::now().checked_duration_since(start).unwrap(); + + // info!("response duration={duration:?} res={:?}", res); + local_timings.lock().await.push(duration); + } + Ok::<(), Error>(()) + }) + }) + .collect(); + for thread in spawns { + let res = thread.await; + res.err_tip(|| "with spawn")??; + } + let avg = Duration::from_secs_f64({ + let locked = timings.lock().await; + locked.iter().map(Duration::as_secs_f64).sum::() / locked.len() as f64 + }); + info!(?avg, "avg"); + Ok::<(), Error>(()) + })?; + Ok(()) +} diff --git a/src/bin/cluster.conf b/src/bin/cluster.conf new file mode 100644 index 000000000..78a45933d --- /dev/null +++ b/src/bin/cluster.conf @@ -0,0 +1,6 @@ +port 7000 +cluster-enabled yes +cluster-config-file nodes.conf +cluster-node-timeout 5000 +appendonly yes +bind 0.0.0.0 diff --git a/src/bin/docker-compose.store-tester.yaml b/src/bin/docker-compose.store-tester.yaml new file mode 100644 index 000000000..06256b314 --- /dev/null +++ b/src/bin/docker-compose.store-tester.yaml @@ -0,0 +1,145 @@ +services: + redis: + image: redis:8.4-alpine3.22 + ports: + - 6379:6379 + command: redis-server --loglevel debug --enable-debug-command yes + + redis-replica-1: + image: redis:8.4-alpine3.22 + depends_on: + - redis + command: redis-server --replicaof redis 6379 --enable-debug-command yes + + # Based on https://gregornovak.eu/setting-up-redis-sentinel-with-docker-compose + # To demo sentinel failover, run `redis-cli -p 6379 DEBUG sleep 30` + sentinel: + image: redis:8.4-alpine3.22 + depends_on: + - redis + - redis-replica-1 + ports: + - 26379:26379 + # Sentinel configuration is created dynamically and mounted by volume because Sentinel itself will modify the configuration + # once it is running. If master changes this will be reflected in all configurations and some additional things are added which are + # meant only for runtime use and not something that should be committed as base configuration. 
+ command: > + sh -c 'echo "sentinel resolve-hostnames yes" > /etc/sentinel.conf && + echo "sentinel monitor master redis 6379 1" >> /etc/sentinel.conf && + echo "sentinel down-after-milliseconds master 1000" >> /etc/sentinel.conf && + echo "sentinel failover-timeout master 5000" >> /etc/sentinel.conf && + echo "sentinel parallel-syncs master 1" >> /etc/sentinel.conf && + redis-server /etc/sentinel.conf --sentinel' + + cluster-node-1: + image: redis:8.4-alpine3.22 + ports: + - 7000:7000 + volumes: + - ./cluster.conf:/etc/cluster.conf + command: > + redis-server /etc/cluster.conf + healthcheck: + interval: 2s + retries: '3' + test: ["CMD", "redis-cli", "-p", "7000", "-c", "ping"] + timeout: 5s + + cluster-node-2: + image: redis:8.4-alpine3.22 + ports: + - 7001:7000 + volumes: + - ./cluster.conf:/etc/cluster.conf + command: > + redis-server /etc/cluster.conf + healthcheck: + interval: 2s + retries: '3' + test: ["CMD", "redis-cli", "-p", "7000", "-c", "ping"] + timeout: 5s + + cluster-node-3: + image: redis:8.4-alpine3.22 + ports: + - 7002:7000 + volumes: + - ./cluster.conf:/etc/cluster.conf + command: > + redis-server /etc/cluster.conf + healthcheck: + interval: 2s + retries: '3' + test: ["CMD", "redis-cli", "-p", "7000", "-c", "ping"] + timeout: 5s + + cluster-node-4: + image: redis:8.4-alpine3.22 + ports: + - 7003:7000 + volumes: + - ./cluster.conf:/etc/cluster.conf + command: > + redis-server /etc/cluster.conf + healthcheck: + interval: 2s + retries: '3' + test: ["CMD", "redis-cli", "-p", "7000", "-c", "ping"] + timeout: 5s + + cluster-node-5: + image: redis:8.4-alpine3.22 + ports: + - 7004:7000 + volumes: + - ./cluster.conf:/etc/cluster.conf + command: > + redis-server /etc/cluster.conf + healthcheck: + interval: 2s + retries: '3' + test: ["CMD", "redis-cli", "-p", "7000", "-c", "ping"] + timeout: 5s + + cluster-node-6: + image: redis:8.4-alpine3.22 + ports: + - 7005:7000 + volumes: + - ./cluster.conf:/etc/cluster.conf + command: > + redis-server 
/etc/cluster.conf + healthcheck: + interval: 2s + retries: '3' + test: ["CMD", "redis-cli", "-p", "7000", "-c", "ping"] + timeout: 5s + + cluster-creator: + command: + - redis-cli + - --cluster + - create + - cluster-node-1:7000 + - cluster-node-2:7000 + - cluster-node-3:7000 + - cluster-node-4:7000 + - cluster-node-5:7000 + - cluster-node-6:7000 + - --cluster-yes + - --cluster-replicas + - '1' + depends_on: + cluster-node-1: + condition: service_healthy + cluster-node-2: + condition: service_healthy + cluster-node-3: + condition: service_healthy + cluster-node-4: + condition: service_healthy + cluster-node-5: + condition: service_healthy + cluster-node-6: + condition: service_healthy + image: redis:8.4-alpine3.22 diff --git a/src/bin/nativelink.rs b/src/bin/nativelink.rs index 8c8257faa..c9578626c 100644 --- a/src/bin/nativelink.rs +++ b/src/bin/nativelink.rs @@ -61,12 +61,14 @@ use nativelink_util::task::TaskExecutor; use nativelink_util::telemetry::init_tracing; use nativelink_util::{background_spawn, fs, spawn}; use nativelink_worker::local_worker::new_local_worker; -use rustls_pemfile::{certs as extract_certs, crls as extract_crls}; +use rustls_pki_types::pem::PemObject; +use rustls_pki_types::{CertificateRevocationListDer, PrivateKeyDer}; use tokio::net::TcpListener; use tokio::select; #[cfg(target_family = "unix")] use tokio::signal::unix::{SignalKind, signal}; -use tokio::sync::{broadcast, mpsc}; +use tokio::sync::oneshot::Sender; +use tokio::sync::{broadcast, mpsc, oneshot}; use tokio_rustls::TlsAcceptor; use tokio_rustls::rustls::pki_types::CertificateDer; use tokio_rustls::rustls::server::WebPkiClientVerifier; @@ -145,9 +147,36 @@ impl RoutesExt for Routes { /// If this value changes update the documentation in the config definition. const DEFAULT_MAX_DECODING_MESSAGE_SIZE: usize = 4 * 1024 * 1024; +macro_rules! 
service_setup { + ($v: tt, $http_config: tt) => {{ + let mut service = $v.into_service(); + let max_decoding_message_size = if $http_config.max_decoding_message_size == 0 { + DEFAULT_MAX_DECODING_MESSAGE_SIZE + } else { + $http_config.max_decoding_message_size + }; + service = service.max_decoding_message_size(max_decoding_message_size); + let send_algo = &$http_config.compression.send_compression_algorithm; + if let Some(encoding) = into_encoding(send_algo.unwrap_or(HttpCompressionAlgorithm::None)) { + service = service.send_compressed(encoding); + } + for encoding in $http_config + .compression + .accepted_compression_algorithms + .iter() + // Filter None values. + .filter_map(|from: &HttpCompressionAlgorithm| into_encoding(*from)) + { + service = service.accept_compressed(encoding); + } + service + }}; +} + async fn inner_main( cfg: CasConfig, shutdown_tx: broadcast::Sender, + scheduler_shutdown_tx: Sender<()>, ) -> Result<(), Error> { const fn into_encoding(from: HttpCompressionAlgorithm) -> Option { match from { @@ -167,7 +196,7 @@ async fn inner_main( let health_component_name = format!("stores/{name}"); let mut health_register_store = health_registry_lock.sub_builder(&health_component_name); - let store = store_factory(&spec, &store_manager, Some(&mut health_register_store)) + let store = store_factory(&name, &spec, &store_manager, Some(&mut health_register_store)) .await .err_tip(|| format!("Failed to create store '{name}'"))?; store_manager.add_store(&name, store); @@ -205,6 +234,7 @@ async fn inner_main( for SchedulerConfig { name, spec } in cfg.schedulers.iter().flatten() { let (maybe_action_scheduler, maybe_worker_scheduler) = scheduler_factory(spec, &store_manager, maybe_origin_event_tx.as_ref()) + .await .err_tip(|| format!("Failed to create scheduler '{name}'"))?; if let Some(action_scheduler) = maybe_action_scheduler { action_schedulers.insert(name.clone(), action_scheduler.clone()); @@ -230,25 +260,8 @@ async fn inner_main( services .ac 
.map_or(Ok(None), |cfg| { - AcServer::new(&cfg, &store_manager).map(|v| { - let mut service = v.into_service(); - let send_algo = &http_config.compression.send_compression_algorithm; - if let Some(encoding) = - into_encoding(send_algo.unwrap_or(HttpCompressionAlgorithm::None)) - { - service = service.send_compressed(encoding); - } - for encoding in http_config - .compression - .accepted_compression_algorithms - .iter() - // Filter None values. - .filter_map(|from: &HttpCompressionAlgorithm| into_encoding(*from)) - { - service = service.accept_compressed(encoding); - } - Some(service) - }) + AcServer::new(&cfg, &store_manager) + .map(|v| Some(service_setup!(v, http_config))) }) .err_tip(|| "Could not create AC service")?, ) @@ -256,25 +269,8 @@ async fn inner_main( services .cas .map_or(Ok(None), |cfg| { - CasServer::new(&cfg, &store_manager).map(|v| { - let mut service = v.into_service(); - let send_algo = &http_config.compression.send_compression_algorithm; - if let Some(encoding) = - into_encoding(send_algo.unwrap_or(HttpCompressionAlgorithm::None)) - { - service = service.send_compressed(encoding); - } - for encoding in http_config - .compression - .accepted_compression_algorithms - .iter() - // Filter None values. 
- .filter_map(|from: &HttpCompressionAlgorithm| into_encoding(*from)) - { - service = service.accept_compressed(encoding); - } - Some(service) - }) + CasServer::new(&cfg, &store_manager) + .map(|v| Some(service_setup!(v, http_config))) }) .err_tip(|| "Could not create CAS service")?, ) @@ -282,25 +278,8 @@ async fn inner_main( services .execution .map_or(Ok(None), |cfg| { - ExecutionServer::new(&cfg, &action_schedulers, &store_manager).map(|v| { - let mut service = v.into_service(); - let send_algo = &http_config.compression.send_compression_algorithm; - if let Some(encoding) = - into_encoding(send_algo.unwrap_or(HttpCompressionAlgorithm::None)) - { - service = service.send_compressed(encoding); - } - for encoding in http_config - .compression - .accepted_compression_algorithms - .iter() - // Filter None values. - .filter_map(|from: &HttpCompressionAlgorithm| into_encoding(*from)) - { - service = service.accept_compressed(encoding); - } - Some(service) - }) + ExecutionServer::new(&cfg, &action_schedulers, &store_manager) + .map(|v| Some(service_setup!(v, http_config))) }) .err_tip(|| "Could not create Execution service")?, ) @@ -308,25 +287,8 @@ async fn inner_main( services .fetch .map_or(Ok(None), |cfg| { - FetchServer::new(&cfg, &store_manager).map(|v| { - let mut service = v.into_service(); - let send_algo = &http_config.compression.send_compression_algorithm; - if let Some(encoding) = - into_encoding(send_algo.unwrap_or(HttpCompressionAlgorithm::None)) - { - service = service.send_compressed(encoding); - } - for encoding in http_config - .compression - .accepted_compression_algorithms - .iter() - // Filter None values. 
- .filter_map(|from: &HttpCompressionAlgorithm| into_encoding(*from)) - { - service = service.accept_compressed(encoding); - } - Some(service) - }) + FetchServer::new(&cfg, &store_manager) + .map(|v| Some(service_setup!(v, http_config))) }) .err_tip(|| "Could not create Fetch service")?, ) @@ -334,25 +296,8 @@ async fn inner_main( services .push .map_or(Ok(None), |cfg| { - PushServer::new(&cfg, &store_manager).map(|v| { - let mut service = v.into_service(); - let send_algo = &http_config.compression.send_compression_algorithm; - if let Some(encoding) = - into_encoding(send_algo.unwrap_or(HttpCompressionAlgorithm::None)) - { - service = service.send_compressed(encoding); - } - for encoding in http_config - .compression - .accepted_compression_algorithms - .iter() - // Filter None values. - .filter_map(|from: &HttpCompressionAlgorithm| into_encoding(*from)) - { - service = service.accept_compressed(encoding); - } - Some(service) - }) + PushServer::new(&cfg, &store_manager) + .map(|v| Some(service_setup!(v, http_config))) }) .err_tip(|| "Could not create Push service")?, ) @@ -360,33 +305,8 @@ async fn inner_main( services .bytestream .map_or(Ok(None), |cfg| { - ByteStreamServer::new(&cfg, &store_manager).map(|v| { - let mut service = v.into_service(); - // TODO(palfrey): generalise this to all the services - let max_decoding_message_size = - if http_config.max_decoding_message_size == 0 { - DEFAULT_MAX_DECODING_MESSAGE_SIZE - } else { - http_config.max_decoding_message_size - }; - service = service.max_decoding_message_size(max_decoding_message_size); - let send_algo = &http_config.compression.send_compression_algorithm; - if let Some(encoding) = - into_encoding(send_algo.unwrap_or(HttpCompressionAlgorithm::None)) - { - service = service.send_compressed(encoding); - } - for encoding in http_config - .compression - .accepted_compression_algorithms - .iter() - // Filter None values. 
- .filter_map(|from: &HttpCompressionAlgorithm| into_encoding(*from)) - { - service = service.accept_compressed(encoding); - } - Some(service) - }) + ByteStreamServer::new(&cfg, &store_manager) + .map(|v| Some(service_setup!(v, http_config))) }) .err_tip(|| "Could not create ByteStream service")?, ) @@ -395,62 +315,21 @@ async fn inner_main( services .capabilities .as_ref() - // Borrow checker fighting here... - .map(|_| { - CapabilitiesServer::new( - services.capabilities.as_ref().unwrap(), - &action_schedulers, - ) - }), + .map(|cfg| CapabilitiesServer::new(cfg, &action_schedulers)), ) .await .map_or(Ok::, Error>(None), |server| { Ok(Some(server?)) }) .err_tip(|| "Could not create Capabilities service")? - .map(|v| { - let mut service = v.into_service(); - let send_algo = &http_config.compression.send_compression_algorithm; - if let Some(encoding) = - into_encoding(send_algo.unwrap_or(HttpCompressionAlgorithm::None)) - { - service = service.send_compressed(encoding); - } - for encoding in http_config - .compression - .accepted_compression_algorithms - .iter() - // Filter None values. - .filter_map(|from: &HttpCompressionAlgorithm| into_encoding(*from)) - { - service = service.accept_compressed(encoding); - } - service - }), + .map(|v| service_setup!(v, http_config)), ) .add_optional_service( services .worker_api .map_or(Ok(None), |cfg| { - WorkerApiServer::new(&cfg, &worker_schedulers).map(|v| { - let mut service = v.into_service(); - let send_algo = &http_config.compression.send_compression_algorithm; - if let Some(encoding) = - into_encoding(send_algo.unwrap_or(HttpCompressionAlgorithm::None)) - { - service = service.send_compressed(encoding); - } - for encoding in http_config - .compression - .accepted_compression_algorithms - .iter() - // Filter None values. 
- .filter_map(|from: &HttpCompressionAlgorithm| into_encoding(*from)) - { - service = service.accept_compressed(encoding); - } - Some(service) - }) + WorkerApiServer::new(&cfg, &worker_schedulers) + .map(|v| Some(service_setup!(v, http_config))) }) .err_tip(|| "Could not create WorkerApi service")?, ) @@ -458,25 +337,8 @@ async fn inner_main( services .experimental_bep .map_or(Ok(None), |cfg| { - BepServer::new(&cfg, &store_manager).map(|v| { - let mut service = v.into_service(); - let send_algo = &http_config.compression.send_compression_algorithm; - if let Some(encoding) = - into_encoding(send_algo.unwrap_or(HttpCompressionAlgorithm::None)) - { - service = service.send_compressed(encoding); - } - for encoding in http_config - .compression - .accepted_compression_algorithms - .iter() - // Filter None values. - .filter_map(|from: &HttpCompressionAlgorithm| into_encoding(*from)) - { - service = service.accept_compressed(encoding); - } - Some(service) - }) + BepServer::new(&cfg, &store_manager) + .map(|v| Some(service_setup!(v, http_config))) }) .err_tip(|| "Could not create BEP service")?, ); @@ -564,7 +426,7 @@ async fn inner_main( std::fs::File::open(cert_file) .err_tip(|| format!("Could not open cert file {cert_file}"))?, ); - let certs = extract_certs(&mut cert_reader) + let certs = CertificateDer::pem_reader_iter(&mut cert_reader) .collect::>, _>>() .err_tip(|| format!("Could not extract certs from file {cert_file}"))?; Ok(certs) @@ -574,12 +436,12 @@ async fn inner_main( std::fs::File::open(&tls_config.key_file) .err_tip(|| format!("Could not open key file {}", tls_config.key_file))?, ); - let key = match rustls_pemfile::read_one(&mut key_reader) + let key = match PrivateKeyDer::from_pem_reader(&mut key_reader) .err_tip(|| format!("Could not extract key(s) from file {}", tls_config.key_file))? 
{ - Some(rustls_pemfile::Item::Pkcs8Key(key)) => key.into(), - Some(rustls_pemfile::Item::Sec1Key(key)) => key.into(), - Some(rustls_pemfile::Item::Pkcs1Key(key)) => key.into(), + PrivateKeyDer::Pkcs8(key) => key.into(), + PrivateKeyDer::Sec1(key) => key.into(), + PrivateKeyDer::Pkcs1(key) => key.into(), _ => { return Err(make_err!( Code::Internal, @@ -588,7 +450,7 @@ async fn inner_main( )); } }; - if let Ok(Some(_)) = rustls_pemfile::read_one(&mut key_reader) { + if PrivateKeyDer::from_pem_reader(&mut key_reader).is_ok() { return Err(make_err!( Code::InvalidArgument, "Expected 1 key in file {}", @@ -607,7 +469,7 @@ async fn inner_main( std::fs::File::open(client_crl_file) .err_tip(|| format!("Could not open CRL file {client_crl_file}"))?, ); - extract_crls(&mut crl_reader) + CertificateRevocationListDer::pem_reader_iter(&mut crl_reader) .collect::>() .err_tip(|| format!("Could not extract CRLs from file {client_crl_file}"))? } else { @@ -816,9 +678,19 @@ async fn inner_main( } worker_names.insert(name.clone()); let shutdown_rx = shutdown_tx.subscribe(); + let worker_name = name.clone(); let fut = trace_span!("worker_ctx", worker_name = %name) - .in_scope(|| local_worker.run(shutdown_rx)); - spawn!("worker", fut, ?name) + .in_scope(|| local_worker.run(shutdown_tx.clone(), shutdown_rx)); + spawn!("worker", async move { + let result = fut.await; + if result.is_ok() { + // Worker completed successfully (graceful shutdown). + // Exit the process with code 0. 
+ info!(worker_name = %worker_name, "Worker completed successfully, exiting process"); + std::process::exit(0); + } + result + }, ?name) } }; root_futures.push(Box::pin(spawn_fut.map_ok_or_else(|e| Err(e.into()), |v| v))); @@ -829,6 +701,7 @@ async fn inner_main( let mut shutdown_rx = shutdown_tx.subscribe(); root_futures.push(Box::pin(async move { if let Ok(shutdown_guard) = shutdown_rx.recv().await { + let _ = scheduler_shutdown_tx.send(()); for (_name, scheduler) in worker_schedulers { scheduler.shutdown(shutdown_guard.clone()).await; } @@ -857,11 +730,7 @@ fn main() -> Result<(), Box> { // The OTLP exporters need to run in a Tokio context // Do this first so all the other logging works #[expect(clippy::disallowed_methods, reason = "tracing init on main runtime")] - runtime.block_on(async { tokio::spawn(async { init_tracing() }).await? })?; - - if cfg!(feature = "worker_find_logging") { - info!("worker_find_logging enabled"); - } + runtime.block_on(async { tokio::spawn(async { init_tracing().await }).await? 
})?; let mut cfg = get_config()?; @@ -907,6 +776,9 @@ fn main() -> Result<(), Box> { std::process::exit(130); }); + #[allow(unused_variables)] + let (scheduler_shutdown_tx, scheduler_shutdown_rx) = oneshot::channel(); + #[cfg(target_family = "unix")] #[expect(clippy::disallowed_methods, reason = "signal handler on main runtime")] runtime.spawn(async move { @@ -916,6 +788,9 @@ fn main() -> Result<(), Box> { .await; warn!("Process terminated via SIGTERM",); drop(shutdown_tx_clone.send(shutdown_guard.clone())); + scheduler_shutdown_rx + .await + .expect("Failed to receive scheduler shutdown"); let () = shutdown_guard.wait_for(Priority::P0).await; warn!("Successfully shut down nativelink.",); std::process::exit(143); @@ -925,7 +800,7 @@ fn main() -> Result<(), Box> { runtime .block_on(async { trace_span!("main") - .in_scope(|| async { inner_main(cfg, shutdown_tx).await }) + .in_scope(|| async { inner_main(cfg, shutdown_tx, scheduler_shutdown_tx).await }) .await }) .err_tip(|| "main() function failed")?; diff --git a/src/bin/redis_store_tester.rs b/src/bin/redis_store_tester.rs new file mode 100644 index 000000000..ee9073b18 --- /dev/null +++ b/src/bin/redis_store_tester.rs @@ -0,0 +1,344 @@ +use core::sync::atomic::{AtomicUsize, Ordering}; +use core::time::Duration; +use std::borrow::Cow; +use std::env; +use std::sync::{Arc, RwLock}; + +use bytes::Bytes; +use clap::{Parser, ValueEnum}; +use futures::TryStreamExt; +use nativelink_config::stores::{RedisMode, RedisSpec}; +use nativelink_error::{Code, Error, ResultExt}; +use nativelink_store::redis_store::RedisStore; +use nativelink_util::buf_channel::make_buf_channel_pair; +use nativelink_util::store_trait::{ + SchedulerCurrentVersionProvider, SchedulerIndexProvider, SchedulerStore, + SchedulerStoreDataProvider, SchedulerStoreDecodeTo, SchedulerStoreKeyProvider, StoreDriver, + StoreKey, StoreLike, TrueValue, UploadSizeInfo, +}; +use nativelink_util::telemetry::init_tracing; +use nativelink_util::{background_spawn, spawn}; 
+use rand::Rng; +use tokio::time::sleep; +use tracing::{error, info}; + +// Define test structures that implement the scheduler traits +#[derive(Debug, Clone, PartialEq)] +struct TestSchedulerData { + key: String, + content: String, + version: i64, +} + +#[derive(Debug)] +struct TestSchedulerReturn { + version: i64, +} + +impl SchedulerStoreKeyProvider for TestSchedulerData { + type Versioned = TrueValue; // Using versioned storage + + fn get_key(&self) -> StoreKey<'static> { + StoreKey::Str(Cow::Owned(self.key.clone())) + } +} + +impl SchedulerStoreDataProvider for TestSchedulerData { + fn try_into_bytes(self) -> Result { + Ok(Bytes::from(self.content.into_bytes())) + } + + fn get_indexes(&self) -> Result, Error> { + // Add some test indexes - need to use 'static strings + Ok(vec![ + ("test_index", Bytes::from("test_value")), + ( + "content_prefix", + Bytes::from(self.content.chars().take(10).collect::()), + ), + ]) + } +} + +impl SchedulerStoreDecodeTo for TestSchedulerData { + type DecodeOutput = TestSchedulerReturn; + + fn decode(version: i64, _data: Bytes) -> Result { + Ok(TestSchedulerReturn { version }) + } +} + +impl SchedulerCurrentVersionProvider for TestSchedulerData { + fn current_version(&self) -> i64 { + self.version + } +} + +struct SearchByContentPrefix { + prefix: String, +} + +impl SchedulerIndexProvider for SearchByContentPrefix { + const KEY_PREFIX: &'static str = "test:"; + const INDEX_NAME: &'static str = "content_prefix"; + type Versioned = TrueValue; + + fn index_value(&self) -> Cow<'_, str> { + Cow::Borrowed(&self.prefix) + } +} + +impl SchedulerStoreKeyProvider for SearchByContentPrefix { + type Versioned = TrueValue; + + fn get_key(&self) -> StoreKey<'static> { + StoreKey::Str(Cow::Owned("dummy_key".to_string())) + } +} + +impl SchedulerStoreDecodeTo for SearchByContentPrefix { + type DecodeOutput = TestSchedulerReturn; + + fn decode(version: i64, data: Bytes) -> Result { + TestSchedulerData::decode(version, data) + } +} + +const MAX_KEY: 
u16 = 1024; + +/// Wrapper type for CLI parsing since we can't implement foreign traits on foreign types. +#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, ValueEnum)] +enum RedisModeArg { + Cluster, + Sentinel, + #[default] + Standard, +} + +impl From for RedisMode { + fn from(arg: RedisModeArg) -> Self { + match arg { + RedisModeArg::Standard => Self::Standard, + RedisModeArg::Sentinel => Self::Sentinel, + RedisModeArg::Cluster => Self::Cluster, + } + } +} + +fn random_key() -> StoreKey<'static> { + let key = rand::rng().random_range(0..MAX_KEY); + StoreKey::new_str(&key.to_string()).into_owned() +} + +#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, ValueEnum)] +enum TestMode { + #[default] + Random, + Sequential, +} + +#[derive(Parser, Debug)] +#[command(version, about)] +struct Args { + #[arg(value_enum, short, long, default_value_t)] + redis_mode: RedisModeArg, + + #[arg(value_enum, short, long, default_value_t)] + mode: TestMode, +} + +async fn run( + store: Arc, + max_loops: usize, + failed: Arc>, + mode: TestMode, +) -> Result<(), Error> { + let mut count = 0; + let in_flight = Arc::new(AtomicUsize::new(0)); + + loop { + if count % 1000 == 0 { + info!( + "Loop count {count}. 
In flight: {}", + in_flight.load(Ordering::Relaxed) + ); + if *failed.read().unwrap() { + return Err(Error::new( + Code::Internal, + "Failed in redis_store_tester".to_string(), + )); + } + } + if count == max_loops { + loop { + let remaining = in_flight.load(Ordering::Relaxed); + if remaining == 0 { + return Ok(()); + } + info!(remaining, "Remaining"); + sleep(Duration::from_secs(1)).await; + } + } + count += 1; + in_flight.fetch_add(1, Ordering::Relaxed); + + let store_clone = store.clone(); + let local_fail = failed.clone(); + let local_in_flight = in_flight.clone(); + + let max_action_value = 7; + let action_value = match mode { + TestMode::Random => rand::rng().random_range(0..max_action_value), + TestMode::Sequential => count % max_action_value, + }; + + background_spawn!("action", async move { + async fn run_action( + action_value: usize, + store_clone: Arc, + ) -> Result<(), Error> { + match action_value { + 0 => { + store_clone.has(random_key()).await?; + } + 1 => { + let (mut tx, rx) = make_buf_channel_pair(); + tx.send(Bytes::from_static(b"12345")).await?; + tx.send_eof()?; + store_clone + .update(random_key(), rx, UploadSizeInfo::ExactSize(5)) + .await?; + } + 2 => { + let mut results = (0..MAX_KEY).map(|_| None).collect::>(); + + store_clone + .has_with_results( + &(0..MAX_KEY) + .map(|i| StoreKey::Str(Cow::Owned(i.to_string()))) + .collect::>(), + &mut results, + ) + .await?; + } + 3 => { + store_clone + .update_oneshot(random_key(), Bytes::from_static(b"1234")) + .await?; + } + 4 => { + let res = store_clone + .list(.., |_key| true) + .await + .err_tip(|| "In list")?; + info!(%res, "end list"); + } + 5 => { + let search_provider = SearchByContentPrefix { + prefix: "Searchable".to_string(), + }; + for i in 0..5 { + let data = TestSchedulerData { + key: format!("test:search_key_{i}"), + content: format!("Searchable content #{i}"), + version: 0, + }; + + store_clone.update_data(data).await?; + } + let search_results: Vec<_> = store_clone + 
.search_by_index_prefix(search_provider) + .await? + .try_collect() + .await?; + info!(?search_results, "search results"); + } + _ => { + let mut data = TestSchedulerData { + key: "test:scheduler_key_1".to_string(), + content: "Test scheduler data #1".to_string(), + version: 0, + }; + + let res = store_clone.get_and_decode(data.clone()).await?; + if let Some(existing_data) = res { + data.version = existing_data.version + 1; + } + + store_clone.update_data(data).await?; + } + } + Ok(()) + } + match run_action(action_value, store_clone).await { + Ok(()) => {} + Err(e) => { + error!(?e, "Error!"); + *local_fail.write().unwrap() = true; + } + } + local_in_flight.fetch_sub(1, Ordering::Relaxed); + }); + } +} + +fn main() -> Result<(), Box> { + let args = Args::parse(); + let redis_mode: RedisMode = args.redis_mode.into(); + + let failed = Arc::new(RwLock::new(false)); + let redis_host = env::var("REDIS_HOST").unwrap_or_else(|_| "127.0.0.1".to_string()); + let max_client_permits = env::var("MAX_REDIS_PERMITS") + .unwrap_or_else(|_| "100".to_string()) + .parse()?; + let max_loops: usize = env::var("MAX_LOOPS") + .unwrap_or_else(|_| "2000000".to_string()) + .parse()?; + + #[expect( + clippy::disallowed_methods, + reason = "`We need `tokio::runtime::Runtime::block_on` so we can get errors _after_ threads finished" + )] + tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build() + .unwrap() + .block_on(async { + // The OTLP exporters need to run in a Tokio context. + spawn!("init tracing", async { init_tracing().await }) + .await? 
+ .expect("Init tracing should work"); + + let redis_port = match redis_mode { + RedisMode::Standard => 6379, + RedisMode::Sentinel => 26379, + RedisMode::Cluster => 7000, + }; + let addr = match redis_mode { + RedisMode::Sentinel => format!("redis+sentinel://{redis_host}:{redis_port}/"), + _ => format!("redis://{redis_host}:{redis_port}/"), + }; + let spec = RedisSpec { + addresses: vec![addr], + connection_timeout_ms: 1000, + max_client_permits, + mode: redis_mode, + ..Default::default() + }; + match spec.mode { + RedisMode::Standard | RedisMode::Sentinel => { + let store = RedisStore::new_standard(spec).await?; + run(store, max_loops, failed.clone(), args.mode).await + } + RedisMode::Cluster => { + let store = RedisStore::new_cluster(spec).await?; + run(store, max_loops, failed.clone(), args.mode).await + } + } + }) + .unwrap(); + if *failed.read().unwrap() { + return Err(Error::new(Code::Internal, "Failed in redis_store_tester".to_string()).into()); + } + Ok(()) +} diff --git a/templates/README.md b/templates/README.md index 73ec81333..8cf83b26d 100644 --- a/templates/README.md +++ b/templates/README.md @@ -1,5 +1,4 @@ -NativeLink provides the following templates to use caching and remote execution -on the NativeLink cloud: +NativeLink provides the following templates to use caching and remote execution: - **`bazel`**: C++ with local remote execution using Bazel. 
Provides the same toolchain during local and remote execution to share cache diff --git a/templates/bazel/README.md b/templates/bazel/README.md index ac6c32859..1cca8b1e6 100644 --- a/templates/bazel/README.md +++ b/templates/bazel/README.md @@ -1,13 +1,9 @@ # Getting started -Get your credentials for the [NativeLink cloud](https://app.nativelink.com/) and -paste them into `user.bazelrc` +Get your credentials and paste them into `user.bazelrc` ``` build --remote_cache=grpcs://TODO -build --remote_header=x-nativelink-api-key=TODO build --bes_backend=grpcs://TODO -build --bes_header=x-nativelink-api-key=TODO -build --bes_results_url=TODO build --remote_timeout=600 build --remote_executor=grpcs://TODO ``` @@ -30,7 +26,7 @@ You're ready to build the provided example with `bazel build hello-world`. - **`user.bazelrc`**: Add Bazel flags to your builds, see [Command-Line Reference](https://bazel.build/reference/command-line-reference). - Don't forget to add your NativeLink cloud credentials or set `remote_cache` + Don't forget to set `remote_cache` and `remote_executor` to your on-prem solution, see [remote execution infrastructure](https://www.nativelink.com/docs/rbe/remote-execution-examples#preparing-the-remote-execution-infrastructure). @@ -40,7 +36,7 @@ You're ready to build the provided example with `bazel build hello-world`. - **`platforms/BUILD.bazel`**: The platform `lre-cc` specifies the URL of the `container-image` that gets - passed to the NativeLink cloud with `exec_properties`. + passed to your Nativelink instance with `exec_properties`. This platform inherits its properties from the LRE Bazel module. # Code quality and CI diff --git a/templates/bazel/user.bazelrc b/templates/bazel/user.bazelrc index 519dba0b0..04e01f944 100644 --- a/templates/bazel/user.bazelrc +++ b/templates/bazel/user.bazelrc @@ -1,8 +1,4 @@ -# Replace with credentials from https://app.nativelink.com/. 
build --remote_cache=grpcs://TODO -build --remote_header=x-nativelink-api-key=TODO build --bes_backend=grpcs://TODO -build --bes_header=x-nativelink-api-key=TODO -build --bes_results_url=TODO build --remote_timeout=600 build --remote_executor=grpcs://TODO diff --git a/toolchain-examples/.bazelrc b/toolchain-examples/.bazelrc index a55209d7a..6b8325de6 100644 --- a/toolchain-examples/.bazelrc +++ b/toolchain-examples/.bazelrc @@ -1,81 +1,22 @@ -# Don't use the host's default PATH and LD_LIBRARY_PATH. -build --incompatible_strict_action_env - # Use rules_python's builtin script to emulate a bootstrap python. build --@rules_python//python/config_settings:bootstrap_impl=script -# Toolchain to verify remote execution with zig-cc. -build:zig-cc --platforms @zig_sdk//platform:linux_amd64 -build:zig-cc --platforms @zig_sdk//platform:linux_amd64 - # These toolchains map out everything in # https://github.com/uber/hermetic_cc_toolchain/blob/bfc407599e503a44928a3cefad27421c9341eff0/MODULE.bazel#L44 -# -# TODO(palfrey): Change this after the next release that contains -# https://github.com/uber/hermetic_cc_toolchain/commit/892973baa37ee1cb7adc8e5b0f75e1966093b1d3 -build:zig-cc --extra_toolchains @zig_sdk-linux-amd64//toolchain:linux_amd64_gnu.2.28 -build:zig-cc --extra_toolchains @zig_sdk-linux-amd64//toolchain:linux_arm64_gnu.2.28 -build:zig-cc --extra_toolchains @zig_sdk-linux-amd64//toolchain:windows_amd64 -build:zig-cc --extra_toolchains @zig_sdk-linux-amd64//toolchain:windows_arm64 -# build:zig-cc --extra_toolchains @zig_sdk-linux-amd64//toolchain:darwin_amd64 -# build:zig-cc --extra_toolchains @zig_sdk-linux-amd64//toolchain:darwin_arm64 -build:zig-cc --extra_toolchains @zig_sdk-linux-amd64//libc_aware/toolchain:linux_amd64_gnu.2.28 -build:zig-cc --extra_toolchains @zig_sdk-linux-amd64//libc_aware/toolchain:linux_amd64_gnu.2.31 -build:zig-cc --extra_toolchains @zig_sdk-linux-amd64//libc_aware/toolchain:linux_amd64_musl -build:zig-cc --extra_toolchains 
@zig_sdk-linux-amd64//libc_aware/toolchain:linux_arm64_gnu.2.28 -build:zig-cc --extra_toolchains @zig_sdk-linux-amd64//libc_aware/toolchain:linux_arm64_musl -build:zig-cc --extra_toolchains @zig_sdk-linux-amd64//toolchain:wasip1_wasm -build:zig-cc --extra_toolchains @zig_sdk-linux-arm64//toolchain:linux_amd64_gnu.2.28 -build:zig-cc --extra_toolchains @zig_sdk-linux-arm64//toolchain:linux_arm64_gnu.2.28 -build:zig-cc --extra_toolchains @zig_sdk-linux-arm64//toolchain:windows_amd64 -build:zig-cc --extra_toolchains @zig_sdk-linux-arm64//toolchain:windows_arm64 -# build:zig-cc --extra_toolchains @zig_sdk-linux-arm64//toolchain:darwin_amd64 -# build:zig-cc --extra_toolchains @zig_sdk-linux-arm64//toolchain:darwin_arm64 -build:zig-cc --extra_toolchains @zig_sdk-linux-arm64//libc_aware/toolchain:linux_amd64_gnu.2.28 -build:zig-cc --extra_toolchains @zig_sdk-linux-arm64//libc_aware/toolchain:linux_amd64_gnu.2.31 -build:zig-cc --extra_toolchains @zig_sdk-linux-arm64//libc_aware/toolchain:linux_amd64_musl -build:zig-cc --extra_toolchains @zig_sdk-linux-arm64//libc_aware/toolchain:linux_arm64_gnu.2.28 -build:zig-cc --extra_toolchains @zig_sdk-linux-arm64//libc_aware/toolchain:linux_arm64_musl -build:zig-cc --extra_toolchains @zig_sdk-linux-arm64//toolchain:wasip1_wasm -build:zig-cc --extra_toolchains @zig_sdk-windows-amd64//toolchain:linux_amd64_gnu.2.28 -build:zig-cc --extra_toolchains @zig_sdk-windows-amd64//toolchain:linux_arm64_gnu.2.28 -build:zig-cc --extra_toolchains @zig_sdk-windows-amd64//toolchain:windows_amd64 -build:zig-cc --extra_toolchains @zig_sdk-windows-amd64//toolchain:windows_arm64 -# build:zig-cc --extra_toolchains @zig_sdk-windows-amd64//toolchain:darwin_amd64 -# build:zig-cc --extra_toolchains @zig_sdk-windows-amd64//toolchain:darwin_arm64 -build:zig-cc --extra_toolchains @zig_sdk-windows-amd64//libc_aware/toolchain:linux_amd64_gnu.2.28 -build:zig-cc --extra_toolchains @zig_sdk-windows-amd64//libc_aware/toolchain:linux_amd64_gnu.2.31 -build:zig-cc 
--extra_toolchains @zig_sdk-windows-amd64//libc_aware/toolchain:linux_amd64_musl -build:zig-cc --extra_toolchains @zig_sdk-windows-amd64//libc_aware/toolchain:linux_arm64_gnu.2.28 -build:zig-cc --extra_toolchains @zig_sdk-windows-amd64//libc_aware/toolchain:linux_arm64_musl -build:zig-cc --extra_toolchains @zig_sdk-windows-amd64//toolchain:wasip1_wasm -build:zig-cc --extra_toolchains @zig_sdk-macos-arm64//toolchain:linux_amd64_gnu.2.28 -build:zig-cc --extra_toolchains @zig_sdk-macos-arm64//toolchain:linux_arm64_gnu.2.28 -build:zig-cc --extra_toolchains @zig_sdk-macos-arm64//toolchain:windows_amd64 -build:zig-cc --extra_toolchains @zig_sdk-macos-arm64//toolchain:windows_arm64 -# build:zig-cc --extra_toolchains @zig_sdk-macos-arm64//toolchain:darwin_amd64 -# build:zig-cc --extra_toolchains @zig_sdk-macos-arm64//toolchain:darwin_arm64 -build:zig-cc --extra_toolchains @zig_sdk-macos-arm64//libc_aware/toolchain:linux_amd64_gnu.2.28 -build:zig-cc --extra_toolchains @zig_sdk-macos-arm64//libc_aware/toolchain:linux_amd64_gnu.2.31 -build:zig-cc --extra_toolchains @zig_sdk-macos-arm64//libc_aware/toolchain:linux_amd64_musl -build:zig-cc --extra_toolchains @zig_sdk-macos-arm64//libc_aware/toolchain:linux_arm64_gnu.2.28 -build:zig-cc --extra_toolchains @zig_sdk-macos-arm64//libc_aware/toolchain:linux_arm64_musl -build:zig-cc --extra_toolchains @zig_sdk-macos-arm64//toolchain:wasip1_wasm -build:zig-cc --extra_toolchains @zig_sdk-macos-amd64//toolchain:linux_amd64_gnu.2.28 -build:zig-cc --extra_toolchains @zig_sdk-macos-amd64//toolchain:linux_arm64_gnu.2.28 -build:zig-cc --extra_toolchains @zig_sdk-macos-amd64//toolchain:windows_amd64 -build:zig-cc --extra_toolchains @zig_sdk-macos-amd64//toolchain:windows_arm64 -# build:zig-cc --extra_toolchains @zig_sdk-macos-amd64//toolchain:darwin_amd64 -# build:zig-cc --extra_toolchains @zig_sdk-macos-amd64//toolchain:darwin_arm64 -build:zig-cc --extra_toolchains @zig_sdk-macos-amd64//libc_aware/toolchain:linux_amd64_gnu.2.28 -build:zig-cc 
--extra_toolchains @zig_sdk-macos-amd64//libc_aware/toolchain:linux_amd64_gnu.2.31 -build:zig-cc --extra_toolchains @zig_sdk-macos-amd64//libc_aware/toolchain:linux_amd64_musl -build:zig-cc --extra_toolchains @zig_sdk-macos-amd64//libc_aware/toolchain:linux_arm64_gnu.2.28 -build:zig-cc --extra_toolchains @zig_sdk-macos-amd64//libc_aware/toolchain:linux_arm64_musl -build:zig-cc --extra_toolchains @zig_sdk-macos-amd64//toolchain:wasip1_wasm +build:zig-cc --extra_toolchains @zig_sdk//toolchain:linux_amd64_gnu.2.28 +build:zig-cc --extra_toolchains @zig_sdk//toolchain:linux_arm64_gnu.2.28 +build:zig-cc --extra_toolchains @zig_sdk//toolchain:windows_amd64 +build:zig-cc --extra_toolchains @zig_sdk//toolchain:windows_arm64 +build:zig-cc --extra_toolchains @zig_sdk//toolchain:darwin_amd64 +build:zig-cc --extra_toolchains @zig_sdk//toolchain:darwin_arm64 +build:zig-cc --extra_toolchains @zig_sdk//libc_aware/toolchain:linux_amd64_gnu.2.28 +build:zig-cc --extra_toolchains @zig_sdk//libc_aware/toolchain:linux_amd64_gnu.2.31 +build:zig-cc --extra_toolchains @zig_sdk//libc_aware/toolchain:linux_amd64_musl +build:zig-cc --extra_toolchains @zig_sdk//libc_aware/toolchain:linux_arm64_gnu.2.28 +build:zig-cc --extra_toolchains @zig_sdk//libc_aware/toolchain:linux_arm64_musl +build:zig-cc --extra_toolchains @zig_sdk//toolchain:wasip1_wasm # Toolchain to verify remote execution with contrib/toolchains_llvm. -build:llvm --platforms=@toolchains_llvm//platforms:linux-x86_64 build:llvm --extra_toolchains=@llvm_toolchain//:cc-toolchain-x86_64-linux # Java runtime to ensure hermeticity on the remote. 
diff --git a/toolchain-examples/MODULE.bazel b/toolchain-examples/MODULE.bazel index 50465b54f..80c7b3881 100644 --- a/toolchain-examples/MODULE.bazel +++ b/toolchain-examples/MODULE.bazel @@ -4,10 +4,10 @@ module( compatibility_level = 0, ) -bazel_dep(name = "platforms", version = "0.0.11") +bazel_dep(name = "platforms", version = "1.0.0") # C++ -bazel_dep(name = "rules_cc", version = "0.1.1") +bazel_dep(name = "rules_cc", version = "0.1.5") # Java bazel_dep(name = "rules_java", version = "8.11.0") @@ -34,7 +34,16 @@ python.toolchain( use_repo(pip, "pip") # Go -bazel_dep(name = "rules_go", version = "0.53.0") +bazel_dep(name = "rules_go", version = "0.57.0") + +# Adds https://github.com/bazel-contrib/rules_go/commit/74199c92e20399b6ef46684b2c6fdd94b50a7892 +# to fix bash issues with Nix +archive_override( + module_name = "rules_go", + integrity = "sha256-ukyyC80j4VhRCD7DOaenkk41Vvnmsp7uAfHr4lxdXtQ=", + strip_prefix = "rules_go-74199c92e20399b6ef46684b2c6fdd94b50a7892", + urls = ["https://github.com/bazel-contrib/rules_go/archive/74199c92e20399b6ef46684b2c6fdd94b50a7892.zip"], +) # Rust bazel_dep(name = "rules_rust", version = "0.61.0") @@ -47,17 +56,12 @@ bazel_dep(name = "rules_rust", version = "0.61.0") # # To test this toolchain, use for use with the config flag `--config=zig-cc`. # -bazel_dep(name = "hermetic_cc_toolchain", version = "3.2.0") +bazel_dep(name = "hermetic_cc_toolchain", version = "4.0.1") zig = use_extension("@hermetic_cc_toolchain//toolchain:ext.bzl", "toolchains") use_repo( zig, "zig_sdk", - "zig_sdk-linux-amd64", - "zig_sdk-linux-arm64", - "zig_sdk-macos-amd64", - "zig_sdk-macos-arm64", - "zig_sdk-windows-amd64", ) # C++ toolchain via contrib/toolchains_llvm. 
@@ -84,13 +88,10 @@ bazel_dep(name = "curl", version = "8.8.0.bcr.3") bazel_dep(name = "zstd", version = "1.5.7") # Abseil for C++ -bazel_dep(name = "abseil-cpp", version = "20250127.0") +bazel_dep(name = "abseil-cpp", version = "20250512.1") # Abseil for python bazel_dep(name = "abseil-py", version = "2.1.0") -# GRPC -bazel_dep(name = "grpc", version = "1.71.0") - # Circl (Go, C++) bazel_dep(name = "circl", version = "1.3.8") diff --git a/toolchain-examples/nativelink-config.json5 b/toolchain-examples/nativelink-config.json5 index 7fdc425ba..7e40a65e4 100644 --- a/toolchain-examples/nativelink-config.json5 +++ b/toolchain-examples/nativelink-config.json5 @@ -1,16 +1,17 @@ { - stores: { - AC_MAIN_STORE: { + stores: [ + { + name: "AC_MAIN_STORE", filesystem: { content_path: "/tmp/nativelink/data-worker-test/content_path-ac", temp_path: "/tmp/nativelink/data-worker-test/tmp_path-ac", eviction_policy: { - // 1gb. - max_bytes: 1000000000, + max_bytes: "1gb", }, }, }, - WORKER_FAST_SLOW_STORE: { + { + name: "WORKER_FAST_SLOW_STORE", fast_slow: { // "fast" must be a "filesystem" store because the worker uses it to make // hardlinks on disk to a directory where the jobs are running. @@ -19,8 +20,7 @@ content_path: "/tmp/nativelink/data-worker-test/content_path-cas", temp_path: "/tmp/nativelink/data-worker-test/tmp_path-cas", eviction_policy: { - // 10gb. 
- max_bytes: 10000000000, + max_bytes: "10gb", }, }, }, @@ -34,9 +34,10 @@ }, }, }, - }, - schedulers: { - MAIN_SCHEDULER: { + ], + schedulers: [ + { + name: "MAIN_SCHEDULER", simple: { supported_platform_properties: { cpu_count: "minimum", @@ -48,7 +49,7 @@ }, }, }, - }, + ], workers: [ { local: { @@ -127,11 +128,11 @@ }, }, ], - bytestream: { - cas_stores: { - "": "WORKER_FAST_SLOW_STORE", + bytestream: [ + { + cas_store: "WORKER_FAST_SLOW_STORE", }, - }, + ], }, }, { diff --git a/toolchain-examples/rbe-toolchain-test.nix b/toolchain-examples/rbe-toolchain-test.nix new file mode 100644 index 000000000..dd89f16b6 --- /dev/null +++ b/toolchain-examples/rbe-toolchain-test.nix @@ -0,0 +1,75 @@ +{ + nativelink, + writeShellScriptBin, + bazelisk, +}: +writeShellScriptBin "rbe-toolchain-test" '' + set -uo pipefail + + cleanup() { + local pids=$(jobs -pr) + [ -n "$pids" ] && kill $pids + } + trap "cleanup" INT QUIT TERM EXIT + + NO_COLOR=true ${nativelink}/bin/nativelink -- toolchain-examples/nativelink-config.json5 | tee -i toolchain-examples/nativelink.log & + + CORE_BAZEL_ARGS="--check_direct_dependencies=error --remote_cache=grpc://localhost:50051 --remote_executor=grpc://localhost:50051" + + CPU_TYPE=$(uname -m) + + if [[ "$CPU_TYPE" == 'x86_64' ]]; then + PLATFORM='amd64' + else + PLATFORM='arm64' + fi + + LLVM_PLATFORM="--config=llvm --platforms=@toolchains_llvm//platforms:linux-''${CPU_TYPE}" + ZIG_PLATFORM="--config=zig-cc --platforms @zig_sdk//platform:linux_''${PLATFORM}" + + # As per https://nativelink.com/docs/rbe/remote-execution-examples#minimal-example-targets + COMMANDS=("test //cpp $ZIG_PLATFORM" + "test //cpp $LLVM_PLATFORM" + "test //python" + "test //go $ZIG_PLATFORM" + "test //rust $ZIG_PLATFORM" + "test //java:HelloWorld --config=java" + "build @curl//... $ZIG_PLATFORM" + "build @zstd//... $ZIG_PLATFORM" + # "test @abseil-cpp//... $ZIG_PLATFORM" # Buggy build due to google_benchmark errors + "test @abseil-py//..." + "test @circl//... 
$ZIG_PLATFORM" + ) + + echo "" > toolchain-examples/cmd.log + for cmd in "''${COMMANDS[@]}" + do + FULL_CMD="${bazelisk}/bin/bazelisk $cmd $CORE_BAZEL_ARGS" + echo $FULL_CMD + echo -e \\n$FULL_CMD\\n >> toolchain-examples/cmd.log + cmd_output=$(cd toolchain-examples && eval "$FULL_CMD" 2>&1 | tee -ai cmd.log) + cmd_exit_code=$? + case $cmd_exit_code in + 0 ) + echo "Saw a successful $cmd build" + ;; + *) + echo "Failed $cmd build:" + echo $cmd_output + exit 1 + ;; + esac + done + + nativelink_output=$(cat toolchain-examples/nativelink.log) + + case $nativelink_output in + *"ERROR "* ) + echo "Error in nativelink build" + exit 1 + ;; + *) + echo 'Successful nativelink build' + ;; + esac +'' diff --git a/tools/cargo-with-detailed-deps.json b/tools/cargo-with-detailed-deps.json new file mode 100644 index 000000000..fcd8e3841 --- /dev/null +++ b/tools/cargo-with-detailed-deps.json @@ -0,0 +1,2342 @@ +{ + "type": "object", + "$comment": "Derived from https://www.schemastore.org/cargo.json, with edits to force default-features=false", + "$id": "https://json.schemastore.org/cargo.json", + "$schema": "http://json-schema.org/draft-07/schema#", + "additionalProperties": false, + "definitions": { + "Authors": { + "type": "array", + "description": "The `authors` field lists people or organizations that are considered the\n\"authors\" of the package. The exact meaning is open to interpretation \u2014 it may\nlist the original or primary authors, current maintainers, or owners of the\npackage. These names will be listed on the crate's page on\n[crates.io](https://crates.io). An optional email address may be included within angled\nbrackets at the end of each author.\n\n> **Note**: [crates.io](https://crates.io) requires at least one author to be listed.", + "items": { + "type": "string", + "description": "The `authors` field lists people or organizations that are considered the\n\"authors\" of the package. 
The exact meaning is open to interpretation \u2014 it may\nlist the original or primary authors, current maintainers, or owners of the\npackage. These names will be listed on the crate's page on\n[crates.io](https://crates.io). An optional email address may be included within angled\nbrackets at the end of each author.\n\n> **Note**: [crates.io](https://crates.io) requires at least one author to be listed.", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/manifest.html#the-authors-field" + } + } + }, + "title": "Authors", + "uniqueItems": true, + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/manifest.html#the-authors-field" + } + }, + "x-tombi-array-values-order": "version-sort" + }, + "Build": { + "anyOf": [ + { + "type": "string", + "description": "Path to the build file." + }, + { + "type": "boolean", + "enum": [ + true, + false + ], + "x-taplo": { + "docs": { + "enumValues": [ + "Automatically detect the build file (`build.rs`).", + "Disable automatic detection of the build file." + ] + } + } + } + ], + "description": "The `build` field specifies a file in the package root which is a [build script](https://doc.rust-lang.org/cargo/reference/build-scripts.html) for building native code. More information can be found in the [build script guide](https://doc.rust-lang.org/cargo/reference/build-scripts.html).\n\n\n```toml\n[package]\n# ...\nbuild = \"build.rs\"\n```\n\nThe default is `\"build.rs\"`, which loads the script from a file named\n`build.rs` in the root of the package. 
Use `build = \"custom_build_name.rs\"` to\nspecify a path to a different file or `build = false` to disable automatic\ndetection of the build script.\n", + "title": "Build", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/manifest.html#the-build-field" + } + } + }, + "BuildOverride": { + "allOf": [ + { + "$ref": "#/definitions/Profile" + } + ], + "description": "Profile settings can be overridden for specific packages and build-time\ncrates. To override the settings for a specific package, use the `package`\ntable to change the settings for the named package:\n\n```toml\n# The `foo` package will use the -Copt-level=3 flag.\n[profile.dev.package.foo]\nopt-level = 3\n```\n\nThe package name is actually a [Package ID Spec](https://doc.rust-lang.org/cargo/reference/pkgid-spec.html), so you can\ntarget individual versions of a package with syntax such as\n`[profile.dev.package.\"foo:2.1.0\"]`.\n\nTo override the settings for all dependencies (but not any workspace member),\nuse the `\"*\"` package name:\n\n```toml\n# Set the default for dependencies.\n[profile.dev.package.\"*\"]\nopt-level = 2\n```\n\nTo override the settings for build scripts, proc macros, and their\ndependencies, use the `build-override` table:\n\n```toml\n# Set the settings for build scripts and proc-macros.\n[profile.dev.build-override]\nopt-level = 3\n```\n\n> Note: When a dependency is both a normal dependency and a build dependency,\n> Cargo will try to only build it once when `--target` is not specified. When\n> using `build-override`, the dependency may need to be built twice, once as a\n> normal dependency and once with the overridden build settings. This may\n> increase initial build times.\n", + "title": "Build Override", + "x-taplo": { + "docs": { + "main": "Profile settings can be overridden for specific packages and build-time\ncrates. 
To override the settings for a specific package, use the `package`\ntable to change the settings for the named package:\n\n```toml\n# The `foo` package will use the -Copt-level=3 flag.\n[profile.dev.package.foo]\nopt-level = 3\n```\n\nThe package name is actually a [Package ID Spec](https://doc.rust-lang.org/cargo/reference/pkgid-spec.html), so you can\ntarget individual versions of a package with syntax such as\n`[profile.dev.package.\"foo:2.1.0\"]`.\n\nTo override the settings for all dependencies (but not any workspace member),\nuse the `\"*\"` package name:\n\n```toml\n# Set the default for dependencies.\n[profile.dev.package.\"*\"]\nopt-level = 2\n```\n\nTo override the settings for build scripts, proc macros, and their\ndependencies, use the `build-override` table:\n\n```toml\n# Set the settings for build scripts and proc-macros.\n[profile.dev.build-override]\nopt-level = 3\n```\n\n> Note: When a dependency is both a normal dependency and a build dependency,\n> Cargo will try to only build it once when `--target` is not specified. When\n> using `build-override`, the dependency may need to be built twice, once as a\n> normal dependency and once with the overridden build settings. This may\n> increase initial build times.\n" + }, + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/profiles.html#overrides" + } + } + }, + "Categories": { + "type": "array", + "description": "The `categories` field is an array of strings of the categories this package\nbelongs to.\n\n```toml\ncategories = [\"command-line-utilities\", \"development-tools::cargo-plugins\"]\n```\n\n> **Note**: [crates.io](https://crates.io) has a maximum of 5 categories. 
Each category should\n> match one of the strings available at https://crates.io/category_slugs, and\n> must match exactly.", + "items": { + "type": "string", + "description": "The `categories` field is an array of strings of the categories this package\nbelongs to.\n\n```toml\ncategories = [\"command-line-utilities\", \"development-tools::cargo-plugins\"]\n```\n\n> **Note**: [crates.io](https://crates.io) has a maximum of 5 categories. Each category should\n> match one of the strings available at https://crates.io/category_slugs, and\n> must match exactly.", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/manifest.html#the-categories-field" + } + } + }, + "title": "Categories", + "uniqueItems": true, + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/manifest.html#the-categories-field" + } + }, + "x-tombi-array-values-order": "version-sort" + }, + "CodegenUnits": { + "type": "integer", + "description": "The `codegen-units` setting controls the [`-C codegen-units` flag](https://doc.rust-lang.org/rustc/codegen-options/index.html#codegen-units) which\ncontrols how many \"code generation units\" a crate will be split into. 
More\ncode generation units allows more of a crate to be processed in parallel\npossibly reducing compile time, but may produce slower code.\n\nThis option takes an integer greater than 0.\n\nThe default is 256 for [incremental](https://doc.rust-lang.org/cargo/reference/profiles.html#incremental) builds, and 16 for\nnon-incremental builds.", + "format": "uint32", + "minimum": 0, + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/profiles.html#codegen-units" + } + } + }, + "DebugAssertions": { + "type": "boolean", + "description": "The `debug-assertions` setting controls the [`-C debug-assertions` flag](https://doc.rust-lang.org/rustc/codegen-options/index.html#debug-assertions) which\nturns `cfg(debug_assertions)` [conditional compilation](https://doc.rust-lang.org/reference/conditional-compilation.html#debug_assertions) on or off. Debug\nassertions are intended to include runtime validation which is only available\nin debug/development builds. These may be things that are too expensive or\notherwise undesirable in a release build. Debug assertions enables the\n[`debug_assert!` macro](https://doc.rust-lang.org/std/macro.debug_assert.html) in the standard library.", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/profiles.html#debug-assertions" + } + } + }, + "DebugLevel": { + "description": "The `debug` setting controls the [`-C debuginfo` flag](https://doc.rust-lang.org/rustc/codegen-options/index.html#debuginfo) which controls the\namount of debug information included in the compiled binary.", + "oneOf": [ + { + "type": "string", + "enum": [ + "none", + "line-directives-only", + "line-tables-only", + "limited", + "full" + ] + }, + { + "type": "boolean" + }, + { + "type": "integer", + "enum": [ + 0, + 1, + 2 + ] + } + ], + "title": "Debug Level", + "x-taplo": { + "docs": { + "enumValues": [ + "No debug info at all, default for `release` profile", + "Debug info without type or variable-level information. 
Generates more detailed module-level info than `line-tables-only`.", + "Full debug info, default for `dev` profile", + "Full debug info, default for `dev` profile", + "No debug info at all, default for `release` profile", + "No debug info at all, default for `release` profile", + "Line info directives only. For the nvptx* targets this enables [profiling](https://reviews.llvm.org/D46061). For other use cases, `line-tables-only` is the better, more compatible choice.", + "Line tables only. Generates the minimal amount of debug info for backtraces with filename/line number info, but not anything else, i.e. no variable or function parameter info.", + "Debug info without type or variable-level information. Generates more detailed module-level info than `line-tables-only`.", + "Full debug info, default for `dev` profile" + ] + }, + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/profiles.html#debug" + } + } + }, + "Dependency": { + "$ref": "#/definitions/DetailedDependency", + "title": "Dependency" + }, + "Description": { + "type": "string", + "description": "The description is a short blurb about the package. [crates.io](https://crates.io) will display\nthis with your package. 
This should be plain text (not Markdown).\n\n```toml\n[package]\n# ...\ndescription = \"A short description of my package\"\n```\n\n> **Note**: [crates.io](https://crates.io) requires the `description` to be set.", + "title": "Description", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/manifest.html#the-description-field" + } + } + }, + "DetailedDependency": { + "type": "object", + "additionalProperties": false, + "dependencies": { + "version": [ + "default-features" + ] + }, + "minProperties": 1, + "properties": { + "branch": { + "type": "string", + "description": "Specify the Git branch to use in case of a [Git dependency](https://doc.rust-lang.org/cargo/reference/specifying-dependencies.html#specifying-dependencies-from-git-repositories).", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/specifying-dependencies.html#specifying-dependencies-from-git-repositories" + } + } + }, + "default-features": { + "type": "boolean", + "description": "Use the default features of the dependency.", + "enum": [ + false + ], + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/specifying-dependencies.html#choosing-features" + } + } + }, + "default_features": { + "type": "boolean", + "deprecated": true, + "description": "\"default_features\" is deprecated. 
Use \"default-features\" instead.", + "x-taplo": { + "hidden": true + } + }, + "features": { + "type": "array", + "description": "List of features to activate in the dependency.", + "items": { + "type": "string", + "description": "List of features to activate in the dependency.", + "x-taplo": { + "crates": { + "schemas": "feature" + }, + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/specifying-dependencies.html#choosing-features" + }, + "plugins": [ + "crates" + ] + } + }, + "uniqueItems": true, + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/specifying-dependencies.html#choosing-features" + } + }, + "x-tombi-array-values-order": "version-sort" + }, + "git": { + "type": "string", + "description": "To depend on a library located in a `git` repository, the minimum information\nyou need to specify is the location of the repository with the `git` key:\n\n```toml\n[dependencies]\nrand = { git = \"https://github.com/rust-lang-nursery/rand\" }\n```\n\nCargo will fetch the `git` repository at this location then look for a\n`Cargo.toml` for the requested crate anywhere inside the `git` repository\n(not necessarily at the root - for example, specifying a member crate name\nof a workspace and setting `git` to the repository containing the workspace).\n\nSince we haven't specified any other information, Cargo assumes that\nwe intend to use the latest commit on the main branch to build our package.\nYou can combine the `git` key with the `rev`, `tag`, or `branch` keys to\nspecify something else. 
Here's an example of specifying that you want to use\nthe latest commit on a branch named `next`:\n\n```toml\n[dependencies]\nrand = { git = \"https://github.com/rust-lang-nursery/rand\", branch = \"next\" }\n```\n\nSee [Git Authentication](https://doc.rust-lang.org/cargo/appendix/git-authentication.html) for help with git authentication for private repos.\n\n> **Note**: [crates.io](https://crates.io/) does not allow packages to be published with `git`\n> dependencies (`git` [dev-dependencies](https://doc.rust-lang.org/cargo/reference/specifying-dependencies.html#development-dependencies) are ignored). See the [Multiple\n> locations](https://doc.rust-lang.org/cargo/reference/specifying-dependencies.html#multiple-locations) section for a fallback alternative.\n", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/specifying-dependencies.html#specifying-dependencies-from-git-repositories" + } + } + }, + "optional": { + "type": "boolean", + "description": "Mark the dependency as optional.\n\nOptional dependencies can be activated through features.", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/specifying-dependencies.html#choosing-features" + } + } + }, + "package": { + "type": "string", + "description": "Specify the name of the package.\n\nWhen writing a `[dependencies]` section in `Cargo.toml` the key you write for a\ndependency typically matches up to the name of the crate you import from in the\ncode. For some projects, though, you may wish to reference the crate with a\ndifferent name in the code regardless of how it's published on crates.io. 
For\nexample you may wish to:\n\n* Avoid the need to `use foo as bar` in Rust source.\n* Depend on multiple versions of a crate.\n* Depend on crates with the same name from different registries.\n\nTo support this Cargo supports a `package` key in the `[dependencies]` section\nof which package should be depended on:\n\n```toml\n[package]\nname = \"mypackage\"\nversion = \"0.0.1\"\n\n[dependencies]\nfoo = \"0.1\"\nbar = { git = \"https://github.com/example/project\", package = \"foo\" }\nbaz = { version = \"0.1\", registry = \"custom\", package = \"foo\" }\n```\n\nIn this example, three crates are now available in your Rust code:\n\n```rust\nextern crate foo; // crates.io\nextern crate bar; // git repository\nextern crate baz; // registry `custom`\n```\n\nAll three of these crates have the package name of `foo` in their own\n`Cargo.toml`, so we're explicitly using the `package` key to inform Cargo that\nwe want the `foo` package even though we're calling it something else locally.\nThe `package` key, if not specified, defaults to the name of the dependency\nbeing requested.\n", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/specifying-dependencies.html#renaming-dependencies-in-cargotoml" + } + } + }, + "path": { + "type": "string", + "description": "Cargo supports **path dependencies** which are typically sub-crates that live within one repository.\nLet's start off by making a new crate inside of our `hello_world` package:\n\n```console\n# inside of hello_world/\n$ cargo new hello_utils\n```\n\nThis will create a new folder `hello_utils` inside of which a `Cargo.toml` and\n`src` folder are ready to be configured. 
In order to tell Cargo about this, open\nup `hello_world/Cargo.toml` and add `hello_utils` to your dependencies:\n\n```toml\n[dependencies]\nhello_utils = { path = \"hello_utils\" }\n```\n\nThis tells Cargo that we depend on a crate called `hello_utils` which is found\nin the `hello_utils` folder (relative to the `Cargo.toml` it's written in).", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/specifying-dependencies.html#specifying-path-dependencies" + } + } + }, + "public": { + "type": "boolean", + "x-taplo": { + "hidden": true + } + }, + "registry": { + "type": "string", + "description": "To specify a dependency from a registry other than [crates.io](https://crates.io), first the\nregistry must be configured in a `.cargo/config.toml` file. See the [registries\ndocumentation](https://doc.rust-lang.org/cargo/reference/registries.html) for more information. In the dependency, set the `registry` key\nto the name of the registry to use.\n\n```toml\n[dependencies]\nsome-crate = { version = \"1.0\", registry = \"my-registry\" }\n```\n\n> **Note**: [crates.io](https://crates.io) does not allow packages to be published with\n> dependencies on other registries.\n", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/specifying-dependencies.html#specifying-dependencies-from-other-registries" + } + } + }, + "registry-index": { + "type": "string", + "x-taplo": { + "hidden": true + } + }, + "rev": { + "type": "string", + "description": "Specify the Git revision to use in case of a [Git dependency](https://doc.rust-lang.org/cargo/reference/specifying-dependencies.html#choice-of-commit).\n\nThis can be a commit hash, or a named reference exposed by the remote repository. 
GitHub Pull Requests may be specified using the `refs/pull/ID/head` format.", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/specifying-dependencies.html#choice-of-commit" + } + } + }, + "tag": { + "type": "string", + "description": "Specify the Git tag to use in case of a [Git dependency](https://doc.rust-lang.org/cargo/reference/specifying-dependencies.html#specifying-dependencies-from-git-repositories).", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/specifying-dependencies.html#specifying-dependencies-from-git-repositories" + } + } + }, + "version": { + "$ref": "#/definitions/SemVerRequirement" + }, + "workspace": { + "type": "boolean", + "description": "Inherit this dependency from the workspace manifest.", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/specifying-dependencies.html#inheriting-a-dependency-from-a-workspace" + } + } + } + }, + "title": "Detailed Dependency", + "x-taplo": { + "initFields": [ + "version" + ] + }, + "x-tombi-table-keys-order": "schema" + }, + "DetailedLint": { + "type": "object", + "properties": { + "level": { + "$ref": "#/definitions/LintLevel" + }, + "priority": { + "type": "integer", + "description": "The priority that controls which lints or [lint groups](https://doc.rust-lang.org/rustc/lints/groups.html) override other lint groups. Lower (particularly negative) numbers have lower priority, being overridden by higher numbers, and show up first on the command-line to tools like rustc.", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/stable/cargo/reference/manifest.html#the-lints-section" + } + } + } + }, + "title": "Detailed Lint", + "x-tombi-table-keys-order": "version-sort" + }, + "Documentation": { + "type": "string", + "description": "\nThe `documentation` field specifies a URL to a website hosting the crate's\ndocumentation. 
If no URL is specified in the manifest file, [crates.io](https://crates.io) will\nautomatically link your crate to the corresponding [docs.rs](https://docs.rs) page.\n\n```toml\n[package]\n# ...\ndocumentation = \"https://docs.rs/bitflags\"\n```\n", + "title": "Documentation", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/manifest.html#the-documentation-field" + } + } + }, + "Edition": { + "type": "string", + "description": "The `edition` key affects which edition your package is compiled with. Cargo\nwill always generate packages via [`cargo new`](https://doc.rust-lang.org/cargo/commands/cargo-new.html) with the `edition` key set to the\nlatest edition. Setting the `edition` key in `[package]` will affect all\ntargets/crates in the package, including test suites, benchmarks, binaries,\nexamples, etc.", + "enum": [ + "2015", + "2018", + "2021", + "2024" + ], + "title": "Edition", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/stable/edition-guide/introduction.html" + } + } + }, + "Exclude": { + "type": "array", + "description": "You can explicitly specify that a set of file patterns should be ignored or\nincluded for the purposes of packaging. The patterns specified in the\n`exclude` field identify a set of files that are not included, and the\npatterns in `include` specify files that are explicitly included.\n\nThe patterns should be [gitignore](https://git-scm.com/docs/gitignore)-style patterns. Briefly:\n\n- `foo` matches any file or directory with the name `foo` anywhere in the\n package. This is equivalent to the pattern `**/foo`.\n- `/foo` matches any file or directory with the name `foo` only in the root of\n the package.\n- `foo/` matches any *directory* with the name `foo` anywhere in the package.\n- Common glob patterns like `*`, `?`, and `[]` are supported:\n - `*` matches zero or more characters except `/`. 
For example, `*.html`\n matches any file or directory with the `.html` extension anywhere in the\n package.\n - `?` matches any character except `/`. For example, `foo?` matches `food`,\n but not `foo`.\n - `[]` allows for matching a range of characters. For example, `[ab]`\n matches either `a` or `b`. `[a-z]` matches letters a through z.\n- `**/` prefix matches in any directory. For example, `**/foo/bar` matches the\n file or directory `bar` anywhere that is directly under directory `foo`.\n- `/**` suffix matches everything inside. For example, `foo/**` matches all\n files inside directory `foo`, including all files in subdirectories below\n `foo`.\n- `/**/` matches zero or more directories. For example, `a/**/b` matches\n `a/b`, `a/x/b`, `a/x/y/b`, and so on.\n- `!` prefix negates a pattern. For example, a pattern of `src/**.rs` and\n `!foo.rs` would match all files with the `.rs` extension inside the `src`\n directory, except for any file named `foo.rs`.\n\nIf git is being used for a package, the `exclude` field will be seeded with\nthe `gitignore` settings from the repository.\n\n```toml\n[package]\n# ...\nexclude = [\"build/**/*.o\", \"doc/**/*.html\"]\n```\n\n```toml\n[package]\n# ...\ninclude = [\"src/**/*\", \"Cargo.toml\"]\n```\n\nThe options are mutually exclusive: setting `include` will override an\n`exclude`. Note that `include` must be an exhaustive list of files as otherwise\nnecessary source files may not be included. The package's `Cargo.toml` is\nautomatically included.\n\nThe include/exclude list is also used for change tracking in some situations.\nFor targets built with `rustdoc`, it is used to determine the list of files to\ntrack to determine if the target should be rebuilt. 
If the package has a\n[build script](https://doc.rust-lang.org/cargo/reference/build-scripts.html) that does not emit any `rerun-if-*` directives, then the\ninclude/exclude list is used for tracking if the build script should be re-run\nif any of those files change.", + "items": { + "type": "string", + "description": "You can explicitly specify that a set of file patterns should be ignored or\nincluded for the purposes of packaging. The patterns specified in the\n`exclude` field identify a set of files that are not included, and the\npatterns in `include` specify files that are explicitly included.\n\nThe patterns should be [gitignore](https://git-scm.com/docs/gitignore)-style patterns. Briefly:\n\n- `foo` matches any file or directory with the name `foo` anywhere in the\n package. This is equivalent to the pattern `**/foo`.\n- `/foo` matches any file or directory with the name `foo` only in the root of\n the package.\n- `foo/` matches any *directory* with the name `foo` anywhere in the package.\n- Common glob patterns like `*`, `?`, and `[]` are supported:\n - `*` matches zero or more characters except `/`. For example, `*.html`\n matches any file or directory with the `.html` extension anywhere in the\n package.\n - `?` matches any character except `/`. For example, `foo?` matches `food`,\n but not `foo`.\n - `[]` allows for matching a range of characters. For example, `[ab]`\n matches either `a` or `b`. `[a-z]` matches letters a through z.\n- `**/` prefix matches in any directory. For example, `**/foo/bar` matches the\n file or directory `bar` anywhere that is directly under directory `foo`.\n- `/**` suffix matches everything inside. For example, `foo/**` matches all\n files inside directory `foo`, including all files in subdirectories below\n `foo`.\n- `/**/` matches zero or more directories. For example, `a/**/b` matches\n `a/b`, `a/x/b`, `a/x/y/b`, and so on.\n- `!` prefix negates a pattern. 
For example, a pattern of `src/**.rs` and\n `!foo.rs` would match all files with the `.rs` extension inside the `src`\n directory, except for any file named `foo.rs`.\n\nIf git is being used for a package, the `exclude` field will be seeded with\nthe `gitignore` settings from the repository.\n\n```toml\n[package]\n# ...\nexclude = [\"build/**/*.o\", \"doc/**/*.html\"]\n```\n\n```toml\n[package]\n# ...\ninclude = [\"src/**/*\", \"Cargo.toml\"]\n```\n\nThe options are mutually exclusive: setting `include` will override an\n`exclude`. Note that `include` must be an exhaustive list of files as otherwise\nnecessary source files may not be included. The package's `Cargo.toml` is\nautomatically included.\n\nThe include/exclude list is also used for change tracking in some situations.\nFor targets built with `rustdoc`, it is used to determine the list of files to\ntrack to determine if the target should be rebuilt. If the package has a\n[build script](https://doc.rust-lang.org/cargo/reference/build-scripts.html) that does not emit any `rerun-if-*` directives, then the\ninclude/exclude list is used for tracking if the build script should be re-run\nif any of those files change.", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/manifest.html#the-exclude-and-include-fields" + } + } + }, + "title": "Exclude", + "uniqueItems": true, + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/manifest.html#the-exclude-and-include-fields" + } + }, + "x-tombi-array-values-order": "version-sort" + }, + "Homepage": { + "type": "string", + "description": "The `homepage` field should be a URL to a site that is the home page for your\npackage.\n\n```toml\n[package]\n# ...\nhomepage = \"https://serde.rs/\"\n```", + "title": "Homepage", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/manifest.html#the-homepage-field" + } + } + }, + "Include": { + "type": "array", + "description": "You can explicitly 
specify that a set of file patterns should be ignored or\nincluded for the purposes of packaging. The patterns specified in the\n`exclude` field identify a set of files that are not included, and the\npatterns in `include` specify files that are explicitly included.\n\nThe patterns should be [gitignore](https://git-scm.com/docs/gitignore)-style patterns. Briefly:\n\n- `foo` matches any file or directory with the name `foo` anywhere in the\n package. This is equivalent to the pattern `**/foo`.\n- `/foo` matches any file or directory with the name `foo` only in the root of\n the package.\n- `foo/` matches any *directory* with the name `foo` anywhere in the package.\n- Common glob patterns like `*`, `?`, and `[]` are supported:\n - `*` matches zero or more characters except `/`. For example, `*.html`\n matches any file or directory with the `.html` extension anywhere in the\n package.\n - `?` matches any character except `/`. For example, `foo?` matches `food`,\n but not `foo`.\n - `[]` allows for matching a range of characters. For example, `[ab]`\n matches either `a` or `b`. `[a-z]` matches letters a through z.\n- `**/` prefix matches in any directory. For example, `**/foo/bar` matches the\n file or directory `bar` anywhere that is directly under directory `foo`.\n- `/**` suffix matches everything inside. For example, `foo/**` matches all\n files inside directory `foo`, including all files in subdirectories below\n `foo`.\n- `/**/` matches zero or more directories. For example, `a/**/b` matches\n `a/b`, `a/x/b`, `a/x/y/b`, and so on.\n- `!` prefix negates a pattern. 
For example, a pattern of `src/**.rs` and\n `!foo.rs` would match all files with the `.rs` extension inside the `src`\n directory, except for any file named `foo.rs`.\n\nIf git is being used for a package, the `exclude` field will be seeded with\nthe `gitignore` settings from the repository.\n\n```toml\n[package]\n# ...\nexclude = [\"build/**/*.o\", \"doc/**/*.html\"]\n```\n\n```toml\n[package]\n# ...\ninclude = [\"src/**/*\", \"Cargo.toml\"]\n```\n\nThe options are mutually exclusive: setting `include` will override an\n`exclude`. Note that `include` must be an exhaustive list of files as otherwise\nnecessary source files may not be included. The package's `Cargo.toml` is\nautomatically included.\n\nThe include/exclude list is also used for change tracking in some situations.\nFor targets built with `rustdoc`, it is used to determine the list of files to\ntrack to determine if the target should be rebuilt. If the package has a\n[build script](https://doc.rust-lang.org/cargo/reference/build-scripts.html) that does not emit any `rerun-if-*` directives, then the\ninclude/exclude list is used for tracking if the build script should be re-run\nif any of those files change.", + "items": { + "type": "string", + "description": "You can explicitly specify that a set of file patterns should be ignored or\nincluded for the purposes of packaging. The patterns specified in the\n`exclude` field identify a set of files that are not included, and the\npatterns in `include` specify files that are explicitly included.\n\nThe patterns should be [gitignore](https://git-scm.com/docs/gitignore)-style patterns. Briefly:\n\n- `foo` matches any file or directory with the name `foo` anywhere in the\n package. 
This is equivalent to the pattern `**/foo`.\n- `/foo` matches any file or directory with the name `foo` only in the root of\n the package.\n- `foo/` matches any *directory* with the name `foo` anywhere in the package.\n- Common glob patterns like `*`, `?`, and `[]` are supported:\n - `*` matches zero or more characters except `/`. For example, `*.html`\n matches any file or directory with the `.html` extension anywhere in the\n package.\n - `?` matches any character except `/`. For example, `foo?` matches `food`,\n but not `foo`.\n - `[]` allows for matching a range of characters. For example, `[ab]`\n matches either `a` or `b`. `[a-z]` matches letters a through z.\n- `**/` prefix matches in any directory. For example, `**/foo/bar` matches the\n file or directory `bar` anywhere that is directly under directory `foo`.\n- `/**` suffix matches everything inside. For example, `foo/**` matches all\n files inside directory `foo`, including all files in subdirectories below\n `foo`.\n- `/**/` matches zero or more directories. For example, `a/**/b` matches\n `a/b`, `a/x/b`, `a/x/y/b`, and so on.\n- `!` prefix negates a pattern. For example, a pattern of `src/**.rs` and\n `!foo.rs` would match all files with the `.rs` extension inside the `src`\n directory, except for any file named `foo.rs`.\n\nIf git is being used for a package, the `exclude` field will be seeded with\nthe `gitignore` settings from the repository.\n\n```toml\n[package]\n# ...\nexclude = [\"build/**/*.o\", \"doc/**/*.html\"]\n```\n\n```toml\n[package]\n# ...\ninclude = [\"src/**/*\", \"Cargo.toml\"]\n```\n\nThe options are mutually exclusive: setting `include` will override an\n`exclude`. Note that `include` must be an exhaustive list of files as otherwise\nnecessary source files may not be included. 
The package's `Cargo.toml` is\nautomatically included.\n\nThe include/exclude list is also used for change tracking in some situations.\nFor targets built with `rustdoc`, it is used to determine the list of files to\ntrack to determine if the target should be rebuilt. If the package has a\n[build script](https://doc.rust-lang.org/cargo/reference/build-scripts.html) that does not emit any `rerun-if-*` directives, then the\ninclude/exclude list is used for tracking if the build script should be re-run\nif any of those files change.", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/manifest.html#the-exclude-and-include-fields" + } + } + }, + "uniqueItems": true, + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/manifest.html#the-exclude-and-include-fields" + } + }, + "x-tombi-array-values-order": "version-sort" + }, + "Incremental": { + "type": "boolean", + "description": "The `incremental` setting controls the [`-C incremental` flag](https://doc.rust-lang.org/rustc/codegen-options/index.html#incremental) which controls\nwhether or not incremental compilation is enabled. Incremental compilation\ncauses `rustc` to save additional information to disk which will be reused\nwhen recompiling the crate, improving re-compile times. 
The additional\ninformation is stored in the `target` directory.\n\nThe valid options are:\n\n* `true`: enabled\n* `false`: disabled\n\nIncremental compilation is only used for workspace members and \"path\"\ndependencies.\n\nThe incremental value can be overridden globally with the `CARGO_INCREMENTAL`\n[environment variable](https://doc.rust-lang.org/cargo/reference/environment-variables.html) or the [`build.incremental`](https://doc.rust-lang.org/cargo/reference/config.html#buildincremental) config variable.", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/profiles.html#incremental" + } + } + }, + "Inherits": { + "type": "string", + "description": "In addition to the built-in profiles, additional custom profiles can be defined.", + "enum": [ + "dev", + "test", + "bench", + "release" + ], + "title": "Inherits", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/profiles.html#custom-profiles" + } + } + }, + "Keywords": { + "type": "array", + "description": "The `keywords` field is an array of strings that describe this package. This\ncan help when searching for the package on a registry, and you may choose any\nwords that would help someone find this crate.\n\n```toml\n[package]\n# ...\nkeywords = [\"gamedev\", \"graphics\"]\n```\n\n> **Note**: [crates.io](https://crates.io) has a maximum of 5 keywords. Each keyword must be\n> ASCII text, start with a letter, and only contain letters, numbers, `_` or\n> `-`, and have at most 20 characters.", + "items": { + "type": "string", + "description": "The `keywords` field is an array of strings that describe this package. This\ncan help when searching for the package on a registry, and you may choose any\nwords that would help someone find this crate.\n\n```toml\n[package]\n# ...\nkeywords = [\"gamedev\", \"graphics\"]\n```\n\n> **Note**: [crates.io](https://crates.io) has a maximum of 5 keywords. 
Each keyword must be\n> ASCII text, start with a letter, and only contain letters, numbers, `_` or\n> `-`, and have at most 20 characters.", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/manifest.html#the-keywords-field" + } + } + }, + "title": "Keywords", + "uniqueItems": true, + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/manifest.html#the-keywords-field" + } + }, + "x-tombi-array-values-order": "version-sort" + }, + "License": { + "type": "string", + "description": "The `license` field contains the name of the software license that the package\nis released under.\n\n[crates.io](https://crates.io/) interprets the `license` field as an [SPDX 2.1 license\nexpression](https://spdx.org/spdx-specification-21-web-version#h.jxpfx0ykyb60). The name must be a known license\nfrom the [SPDX license list 3.6](https://github.com/spdx/license-list-data/tree/v3.6). Parentheses are not\ncurrently supported. See the [SPDX site](https://spdx.org/license-list) for more information.\n\nSPDX license expressions support AND and OR operators to combine multiple\nlicenses.\n\n```toml\n[package]\n# ...\nlicense = \"MIT OR Apache-2.0\"\n```\n\nUsing `OR` indicates the user may choose either license. Using `AND` indicates\nthe user must comply with both licenses simultaneously. The `WITH` operator\nindicates a license with a special exception. 
Some examples:\n\n* `MIT OR Apache-2.0`\n* `LGPL-2.1 AND MIT AND BSD-2-Clause`\n* `GPL-2.0+ WITH Bison-exception-2.2`\n\nIf a package is using a nonstandard license, then the `license-file` field may\nbe specified in lieu of the `license` field.", + "title": "License", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/manifest.html#the-license-and-license-file-fields" + } + } + }, + "LicenseFile": { + "type": "string", + "description": "The `license-file` field contains the path to a file\ncontaining the text of the license (relative to this `Cargo.toml`).\n\n```toml\n[package]\n# ...\nlicense-file = \"LICENSE.txt\"\n```\n\n> **Note**: [crates.io](https://crates.io) requires either `license` or `license-file` to be set.", + "title": "LicenseFile", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/manifest.html#the-license-and-license-file-fields" + } + } + }, + "Lint": { + "anyOf": [ + { + "$ref": "#/definitions/LintLevel" + }, + { + "$ref": "#/definitions/DetailedLint" + } + ], + "title": "Lint" + }, + "LintLevel": { + "type": "string", + "description": "Specify the [lint level](https://doc.rust-lang.org/rustc/lints/levels.html) for a lint or lint group.", + "enum": [ + "forbid", + "deny", + "warn", + "allow" + ], + "title": "Lint Level", + "x-taplo": { + "docs": { + "enumValues": [ + "`forbid` is the same as `deny` in that a lint at this level will produce an error, but unlike the `deny` level, the `forbid` level can not be overridden to be anything lower than an error. However, lint levels may still be capped with [`--cap-lints`](https://doc.rust-lang.org/rustc/lints/levels.html#capping-lints) so `rustc --cap-lints warn` will make lints set to `forbid` just warn.", + "The `deny` lint level produces an error if you violate the lint.", + "The `warn` lint level produces a warning if you violate the lint.", + "The `allow` lint level ignores violations of the lint." 
+ ] + }, + "links": { + "key": "https://doc.rust-lang.org/rustc/lints/levels.html" + } + } + }, + "Lints": { + "type": "object", + "additionalProperties": false, + "properties": { + "clippy": { + "type": "object", + "additionalProperties": { + "$ref": "#/definitions/Lint" + }, + "description": "Lint settings for [Clippy](https://doc.rust-lang.org/clippy/). See Clippy's [individual lints](https://rust-lang.github.io/rust-clippy/master/index.html) or [lint groups](https://doc.rust-lang.org/clippy/lints.html) documentation.", + "x-tombi-table-keys-order": "version-sort" + }, + "rust": { + "type": "object", + "additionalProperties": { + "$ref": "#/definitions/Lint" + }, + "description": "Lint settings for the Rust compiler. See the Rust compiler's [individual lints](https://doc.rust-lang.org/rustc/lints/listing/index.html) or [lint groups](https://doc.rust-lang.org/rustc/lints/groups.html).", + "x-tombi-table-keys-order": "version-sort" + }, + "rustdoc": { + "type": "object", + "additionalProperties": { + "$ref": "#/definitions/Lint" + }, + "description": "Lint settings for [Rustdoc](https://doc.rust-lang.org/rustdoc/). See Rustdoc's [individual lints](https://doc.rust-lang.org/rustdoc/lints.html) (rustdoc does not have lint groups)", + "x-tombi-table-keys-order": "version-sort" + } + }, + "x-tombi-table-keys-order": "version-sort" + }, + "Lto": { + "description": "The `lto` setting controls the [`-C lto` flag](https://doc.rust-lang.org/rustc/codegen-options/index.html#lto) which controls LLVM's [link time optimizations](https://llvm.org/docs/LinkTimeOptimization.html). 
LTO can produce better optimized code, using\nwhole-program analysis, at the cost of longer linking time.\n \nSee also the [`-C linker-plugin-lto`](https://doc.rust-lang.org/rustc/codegen-options/index.html#linker-plugin-lto) `rustc` flag for cross-language LTO.", + "oneOf": [ + { + "type": "string", + "enum": [ + "fat", + "thin", + "off" + ] + }, + { + "type": "boolean" + } + ], + "title": "Lto", + "x-taplo": { + "docs": { + "enumValues": [ + "Performs \"fat\" LTO which attempts to perform optimizations across all crates within the dependency graph.", + "Performs [\"thin\" LTO](http://blog.llvm.org/2016/06/thinlto-scalable-and-incremental-lto.html). This is similar to \"fat\", but takes\nsubstantially less time to run while still achieving performance gains\nsimilar to \"fat\".", + "Disables LTO.", + "Performs \"fat\" LTO which attempts to perform optimizations across all crates within the dependency graph.", + "Performs \"thin local LTO\" which performs \"thin\" LTO on the local\ncrate only across its [codegen units](https://doc.rust-lang.org/cargo/reference/profiles.html#codegen-units). No LTO is performed\nif codegen units is 1 or [opt-level](https://doc.rust-lang.org/cargo/reference/profiles.html#opt-level) is 0." + ] + }, + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/profiles.html#lto" + } + } + }, + "MetaBuild": { + "type": "array", + "items": { + "type": "string" + }, + "title": "Meta Build", + "uniqueItems": true, + "x-tombi-array-values-order": "version-sort" + }, + "OptLevel": { + "description": "The `opt-level` setting controls the [`-C opt-level` flag](https://doc.rust-lang.org/rustc/codegen-options/index.html#opt-level) which controls the level\nof optimization. Higher optimization levels may produce faster runtime code at\nthe expense of longer compiler times. 
Higher levels may also change and\nrearrange the compiled code which may make it harder to use with a debugger.\n\nIt is recommended to experiment with different levels to find the right\nbalance for your project. There may be surprising results, such as level `3`\nbeing slower than `2`, or the `\"s\"` and `\"z\"` levels not being necessarily\nsmaller. You may also want to reevaluate your settings over time as newer\nversions of `rustc` changes optimization behavior.\n\nSee also [Profile Guided Optimization](https://doc.rust-lang.org/rustc/profile-guided-optimization.html) for more advanced optimization\ntechniques.", + "oneOf": [ + { + "type": "string", + "enum": [ + "s", + "z" + ] + }, + { + "type": "integer", + "enum": [ + 0, + 1, + 2, + 3 + ] + } + ], + "title": "Optimization Level", + "x-taplo": { + "docs": { + "enumValues": [ + "No optimizations, also turns on [`cfg(debug_assertions)`](https://doc.rust-lang.org/cargo/reference/profiles.html#debug-assertions).", + "Basic optimizations.", + "Some optimizations.", + "All optimizations.", + "Optimize for binary size.", + "Optimize for binary size, but also turn off loop vectorization." + ] + }, + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/profiles.html#opt-level" + } + } + }, + "OverflowChecks": { + "type": "boolean", + "description": "The `overflow-checks` setting controls the [`-C overflow-checks` flag](https://doc.rust-lang.org/rustc/codegen-options/index.html#overflow-checks) which\ncontrols the behavior of [runtime integer overflow](https://doc.rust-lang.org/reference/expressions/operator-expr.html#overflow). 
When overflow-checks are\nenabled, a panic will occur on overflow.", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/profiles.html#overflow-checks" + } + } + }, + "Package": { + "type": "object", + "additionalProperties": false, + "description": "The only field required by Cargo is [`name`](https://doc.rust-lang.org/cargo/reference/manifest.html#the-name-field).\n If publishing to a registry, the registry may\nrequire additional fields. See the notes below and [the publishing chapter](https://doc.rust-lang.org/cargo/reference/publishing.html) for requirements for publishing to [crates.io](https://crates.io/).", + "properties": { + "name": { + "type": "string", + "description": "The package name is an identifier used to refer to the package. It is used\nwhen listed as a dependency in another package, and as the default name of\ninferred lib and bin targets.\n\nThe name must use only [alphanumeric](https://doc.rust-lang.org/std/primitive.char.html#method.is_alphanumeric) characters or `-` or `_`, and cannot be empty.\nNote that [`cargo new`](https://doc.rust-lang.org/cargo/commands/cargo-new.html) and [`cargo init`](https://doc.rust-lang.org/cargo/commands/cargo-init.html) impose some additional restrictions on\nthe package name, such as enforcing that it is a valid Rust identifier and not\na keyword. [crates.io](https://crates.io) imposes even more restrictions, such as\nenforcing only ASCII characters, not a reserved name, not a special Windows\nname such as \"nul\", is not too long, etc.", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/manifest.html#the-name-field" + } + } + }, + "authors": { + "anyOf": [ + { + "$ref": "#/definitions/Authors" + }, + { + "$ref": "#/definitions/WorkspaceInheritance" + } + ], + "description": "The `authors` field lists people or organizations that are considered the\n\"authors\" of the package. 
The exact meaning is open to interpretation \u2014 it may\nlist the original or primary authors, current maintainers, or owners of the\npackage. These names will be listed on the crate's page on\n[crates.io](https://crates.io). An optional email address may be included within angled\nbrackets at the end of each author.\n\n> **Note**: [crates.io](https://crates.io) requires at least one author to be listed.", + "title": "Authors" + }, + "autobenches": { + "type": "boolean", + "description": "Disable automatic discovery of `bench` targets.\n\nDisabling automatic discovery should only be needed for specialized\nsituations. For example, if you have a library where you want a *module* named\n`bin`, this would present a problem because Cargo would usually attempt to\ncompile anything in the `bin` directory as an executable. Here is a sample\nlayout of this scenario:\n\n```\n\u251c\u2500\u2500 Cargo.toml\n\u2514\u2500\u2500 src\n \u251c\u2500\u2500 lib.rs\n \u2514\u2500\u2500 bin\n \u2514\u2500\u2500 mod.rs\n```\n", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/cargo-targets.html#target-auto-discovery" + } + } + }, + "autobins": { + "type": "boolean", + "description": "Disable automatic discovery of `bin` targets.\n\nDisabling automatic discovery should only be needed for specialized\nsituations. For example, if you have a library where you want a *module* named\n`bin`, this would present a problem because Cargo would usually attempt to\ncompile anything in the `bin` directory as an executable. 
Here is a sample\nlayout of this scenario:\n\n```\n\u251c\u2500\u2500 Cargo.toml\n\u2514\u2500\u2500 src\n \u251c\u2500\u2500 lib.rs\n \u2514\u2500\u2500 bin\n \u2514\u2500\u2500 mod.rs\n```\n\nTo prevent Cargo from inferring `src/bin/mod.rs` as an executable, set\nthis to `false` to disable auto-discovery.", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/cargo-targets.html#target-auto-discovery" + } + } + }, + "autoexamples": { + "type": "boolean", + "description": "Disable automatic discovery of `example` targets.\n\nDisabling automatic discovery should only be needed for specialized\nsituations. For example, if you have a library where you want a *module* named\n`bin`, this would present a problem because Cargo would usually attempt to\ncompile anything in the `bin` directory as an executable. Here is a sample\nlayout of this scenario:\n\n```\n\u251c\u2500\u2500 Cargo.toml\n\u2514\u2500\u2500 src\n \u251c\u2500\u2500 lib.rs\n \u2514\u2500\u2500 bin\n \u2514\u2500\u2500 mod.rs\n```\n", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/cargo-targets.html#target-auto-discovery" + } + } + }, + "autotests": { + "type": "boolean", + "description": "Disable automatic discovery of `test` targets.\n\nDisabling automatic discovery should only be needed for specialized\nsituations. For example, if you have a library where you want a *module* named\n`bin`, this would present a problem because Cargo would usually attempt to\ncompile anything in the `bin` directory as an executable. 
Here is a sample\nlayout of this scenario:\n\n```\n\u251c\u2500\u2500 Cargo.toml\n\u2514\u2500\u2500 src\n \u251c\u2500\u2500 lib.rs\n \u2514\u2500\u2500 bin\n \u2514\u2500\u2500 mod.rs\n```\n", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/cargo-targets.html#target-auto-discovery" + } + } + }, + "build": { + "$ref": "#/definitions/Build" + }, + "categories": { + "anyOf": [ + { + "$ref": "#/definitions/Categories" + }, + { + "$ref": "#/definitions/WorkspaceInheritance" + } + ], + "description": "The `categories` field is an array of strings of the categories this package\nbelongs to.\n\n```toml\ncategories = [\"command-line-utilities\", \"development-tools::cargo-plugins\"]\n```\n\n> **Note**: [crates.io](https://crates.io) has a maximum of 5 categories. Each category should\n> match one of the strings available at https://crates.io/category_slugs, and\n> must match exactly.", + "title": "Categories" + }, + "default-run": { + "type": "string", + "description": "The `default-run` field in the `[package]` section of the manifest can be used\nto specify a default binary picked by [`cargo run`](https://doc.rust-lang.org/cargo/commands/cargo-run.html). For example, when there is\nboth `src/bin/a.rs` and `src/bin/b.rs`:\n\n```toml\n[package]\ndefault-run = \"a\"\n```", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/manifest.html#the-default-run-field" + } + } + }, + "description": { + "anyOf": [ + { + "$ref": "#/definitions/Description" + }, + { + "$ref": "#/definitions/WorkspaceInheritance" + } + ], + "description": "The description is a short blurb about the package. [crates.io](https://crates.io) will display\nthis with your package. 
This should be plain text (not Markdown).\n\n```toml\n[package]\n# ...\ndescription = \"A short description of my package\"\n```\n\n> **Note**: [crates.io](https://crates.io) requires the `description` to be set.", + "title": "Description" + }, + "documentation": { + "anyOf": [ + { + "$ref": "#/definitions/Documentation" + }, + { + "$ref": "#/definitions/WorkspaceInheritance" + } + ], + "description": "\nThe `documentation` field specifies a URL to a website hosting the crate's\ndocumentation. If no URL is specified in the manifest file, [crates.io](https://crates.io) will\nautomatically link your crate to the corresponding [docs.rs](https://docs.rs) page.\n\n```toml\n[package]\n# ...\ndocumentation = \"https://docs.rs/bitflags\"\n```\n", + "title": "Documentation" + }, + "edition": { + "anyOf": [ + { + "$ref": "#/definitions/Edition" + }, + { + "$ref": "#/definitions/WorkspaceInheritance" + } + ], + "description": "The `edition` key affects which edition your package is compiled with. Cargo\nwill always generate packages via [`cargo new`](https://doc.rust-lang.org/cargo/commands/cargo-new.html) with the `edition` key set to the\nlatest edition. Setting the `edition` key in `[package]` will affect all\ntargets/crates in the package, including test suites, benchmarks, binaries,\nexamples, etc.", + "title": "Edition" + }, + "exclude": { + "anyOf": [ + { + "$ref": "#/definitions/Exclude" + }, + { + "$ref": "#/definitions/WorkspaceInheritance" + } + ], + "description": "You can explicitly specify that a set of file patterns should be ignored or\nincluded for the purposes of packaging. The patterns specified in the\n`exclude` field identify a set of files that are not included, and the\npatterns in `include` specify files that are explicitly included.\n\nThe patterns should be [gitignore](https://git-scm.com/docs/gitignore)-style patterns. Briefly:\n\n- `foo` matches any file or directory with the name `foo` anywhere in the\n package. 
This is equivalent to the pattern `**/foo`.\n- `/foo` matches any file or directory with the name `foo` only in the root of\n the package.\n- `foo/` matches any *directory* with the name `foo` anywhere in the package.\n- Common glob patterns like `*`, `?`, and `[]` are supported:\n - `*` matches zero or more characters except `/`. For example, `*.html`\n matches any file or directory with the `.html` extension anywhere in the\n package.\n - `?` matches any character except `/`. For example, `foo?` matches `food`,\n but not `foo`.\n - `[]` allows for matching a range of characters. For example, `[ab]`\n matches either `a` or `b`. `[a-z]` matches letters a through z.\n- `**/` prefix matches in any directory. For example, `**/foo/bar` matches the\n file or directory `bar` anywhere that is directly under directory `foo`.\n- `/**` suffix matches everything inside. For example, `foo/**` matches all\n files inside directory `foo`, including all files in subdirectories below\n `foo`.\n- `/**/` matches zero or more directories. For example, `a/**/b` matches\n `a/b`, `a/x/b`, `a/x/y/b`, and so on.\n- `!` prefix negates a pattern. For example, a pattern of `src/**.rs` and\n `!foo.rs` would match all files with the `.rs` extension inside the `src`\n directory, except for any file named `foo.rs`.\n\nIf git is being used for a package, the `exclude` field will be seeded with\nthe `gitignore` settings from the repository.\n\n```toml\n[package]\n# ...\nexclude = [\"build/**/*.o\", \"doc/**/*.html\"]\n```\n\n```toml\n[package]\n# ...\ninclude = [\"src/**/*\", \"Cargo.toml\"]\n```\n\nThe options are mutually exclusive: setting `include` will override an\n`exclude`. Note that `include` must be an exhaustive list of files as otherwise\nnecessary source files may not be included. 
The package's `Cargo.toml` is\nautomatically included.\n\nThe include/exclude list is also used for change tracking in some situations.\nFor targets built with `rustdoc`, it is used to determine the list of files to\ntrack to determine if the target should be rebuilt. If the package has a\n[build script](https://doc.rust-lang.org/cargo/reference/build-scripts.html) that does not emit any `rerun-if-*` directives, then the\ninclude/exclude list is used for tracking if the build script should be re-run\nif any of those files change.", + "title": "Exclude" + }, + "homepage": { + "anyOf": [ + { + "$ref": "#/definitions/Homepage" + }, + { + "$ref": "#/definitions/WorkspaceInheritance" + } + ], + "description": "The `homepage` field should be a URL to a site that is the home page for your\npackage.\n\n```toml\n[package]\n# ...\nhomepage = \"https://serde.rs/\"\n```", + "title": "Homepage" + }, + "im-a-teapot": { + "type": "boolean", + "description": "Sets whether the current package is a teapot or something else that is not capable of brewing tea.", + "x-taplo": { + "hidden": true + } + }, + "include": { + "anyOf": [ + { + "$ref": "#/definitions/Include" + }, + { + "$ref": "#/definitions/WorkspaceInheritance" + } + ], + "description": "You can explicitly specify that a set of file patterns should be ignored or\nincluded for the purposes of packaging. The patterns specified in the\n`exclude` field identify a set of files that are not included, and the\npatterns in `include` specify files that are explicitly included.\n\nThe patterns should be [gitignore](https://git-scm.com/docs/gitignore)-style patterns. Briefly:\n\n- `foo` matches any file or directory with the name `foo` anywhere in the\n package. 
This is equivalent to the pattern `**/foo`.\n- `/foo` matches any file or directory with the name `foo` only in the root of\n the package.\n- `foo/` matches any *directory* with the name `foo` anywhere in the package.\n- Common glob patterns like `*`, `?`, and `[]` are supported:\n - `*` matches zero or more characters except `/`. For example, `*.html`\n matches any file or directory with the `.html` extension anywhere in the\n package.\n - `?` matches any character except `/`. For example, `foo?` matches `food`,\n but not `foo`.\n - `[]` allows for matching a range of characters. For example, `[ab]`\n matches either `a` or `b`. `[a-z]` matches letters a through z.\n- `**/` prefix matches in any directory. For example, `**/foo/bar` matches the\n file or directory `bar` anywhere that is directly under directory `foo`.\n- `/**` suffix matches everything inside. For example, `foo/**` matches all\n files inside directory `foo`, including all files in subdirectories below\n `foo`.\n- `/**/` matches zero or more directories. For example, `a/**/b` matches\n `a/b`, `a/x/b`, `a/x/y/b`, and so on.\n- `!` prefix negates a pattern. For example, a pattern of `src/**.rs` and\n `!foo.rs` would match all files with the `.rs` extension inside the `src`\n directory, except for any file named `foo.rs`.\n\nIf git is being used for a package, the `exclude` field will be seeded with\nthe `gitignore` settings from the repository.\n\n```toml\n[package]\n# ...\nexclude = [\"build/**/*.o\", \"doc/**/*.html\"]\n```\n\n```toml\n[package]\n# ...\ninclude = [\"src/**/*\", \"Cargo.toml\"]\n```\n\nThe options are mutually exclusive: setting `include` will override an\n`exclude`. Note that `include` must be an exhaustive list of files as otherwise\nnecessary source files may not be included. 
The package's `Cargo.toml` is\nautomatically included.\n\nThe include/exclude list is also used for change tracking in some situations.\nFor targets built with `rustdoc`, it is used to determine the list of files to\ntrack to determine if the target should be rebuilt. If the package has a\n[build script](https://doc.rust-lang.org/cargo/reference/build-scripts.html) that does not emit any `rerun-if-*` directives, then the\ninclude/exclude list is used for tracking if the build script should be re-run\nif any of those files change." + }, + "keywords": { + "anyOf": [ + { + "$ref": "#/definitions/Keywords" + }, + { + "$ref": "#/definitions/WorkspaceInheritance" + } + ], + "description": "The `keywords` field is an array of strings that describe this package. This\ncan help when searching for the package on a registry, and you may choose any\nwords that would help someone find this crate.\n\n```toml\n[package]\n# ...\nkeywords = [\"gamedev\", \"graphics\"]\n```\n\n> **Note**: [crates.io](https://crates.io) has a maximum of 5 keywords. Each keyword must be\n> ASCII text, start with a letter, and only contain letters, numbers, `_` or\n> `-`, and have at most 20 characters.", + "title": "Keywords" + }, + "license": { + "anyOf": [ + { + "$ref": "#/definitions/License" + }, + { + "$ref": "#/definitions/WorkspaceInheritance" + } + ], + "description": "The `license` field contains the name of the software license that the package\nis released under.\n\n[crates.io](https://crates.io/) interprets the `license` field as an [SPDX 2.1 license\nexpression](https://spdx.org/spdx-specification-21-web-version#h.jxpfx0ykyb60). The name must be a known license\nfrom the [SPDX license list 3.6](https://github.com/spdx/license-list-data/tree/v3.6). Parentheses are not\ncurrently supported. 
See the [SPDX site](https://spdx.org/license-list) for more information.\n\nSPDX license expressions support AND and OR operators to combine multiple\nlicenses.\n\n```toml\n[package]\n# ...\nlicense = \"MIT OR Apache-2.0\"\n```\n\nUsing `OR` indicates the user may choose either license. Using `AND` indicates\nthe user must comply with both licenses simultaneously. The `WITH` operator\nindicates a license with a special exception. Some examples:\n\n* `MIT OR Apache-2.0`\n* `LGPL-2.1 AND MIT AND BSD-2-Clause`\n* `GPL-2.0+ WITH Bison-exception-2.2`\n\nIf a package is using a nonstandard license, then the `license-file` field may\nbe specified in lieu of the `license` field.", + "title": "License" + }, + "license-file": { + "anyOf": [ + { + "$ref": "#/definitions/LicenseFile" + }, + { + "$ref": "#/definitions/WorkspaceInheritance" + } + ], + "description": "The `license-file` field contains the path to a file\ncontaining the text of the license (relative to this `Cargo.toml`).\n\n```toml\n[package]\n# ...\nlicense-file = \"LICENSE.txt\"\n```\n\n> **Note**: [crates.io](https://crates.io) requires either `license` or `license-file` to be set.", + "title": "LicenseFile" + }, + "links": { + "type": "string", + "description": "The `links` field specifies the name of a native library that is being linked\nto. More information can be found in the [`links`](https://doc.rust-lang.org/cargo/reference/build-scripts.html#the-links-manifest-key) section of the build\nscript guide.\n\n```toml\n[package]\n# ...\nlinks = \"foo\"\n```", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/manifest.html#the-links-field" + } + } + }, + "metabuild": { + "$ref": "#/definitions/MetaBuild", + "x-taplo": { + "hidden": true + } + }, + "metadata": { + "type": "object", + "additionalProperties": true, + "description": "Cargo by default will warn about unused keys in `Cargo.toml` to assist in\ndetecting typos and such. 
The `package.metadata` table, however, is completely\nignored by Cargo and will not be warned about. This section can be used for\ntools which would like to store package configuration in `Cargo.toml`. For\nexample:\n\n```toml\n[package]\nname = \"...\"\n# ...\n\n# Metadata used when generating an Android APK, for example.\n[package.metadata.android]\npackage-name = \"my-awesome-android-app\"\nassets = \"path/to/static\"\n```\n", + "properties": { + "playdate": { + "$ref": "#/definitions/PlaydateMetadata" + } + }, + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/manifest.html#the-metadata-table" + } + }, + "x-tombi-table-keys-order": "schema" + }, + "namespaced-features": { + "type": "boolean", + "x-taplo": { + "hidden": true + } + }, + "publish": { + "anyOf": [ + { + "$ref": "#/definitions/Publish" + }, + { + "$ref": "#/definitions/WorkspaceInheritance" + } + ], + "description": "The `publish` field can be used to prevent a package from being published to a package registry (like *crates.io*) by mistake, for instance to keep a package\nprivate in a company.\n\n```toml\n[package]\n# ...\npublish = false\n```\n\nThe value may also be an array of strings which are registry names that are\nallowed to be published to.\n\n```toml\n[package]\n# ...\npublish = [\"some-registry-name\"]\n```", + "title": "Publish" + }, + "publish-lockfile": { + "type": "boolean", + "x-taplo": { + "hidden": true + } + }, + "readme": { + "anyOf": [ + { + "$ref": "#/definitions/Readme" + }, + { + "$ref": "#/definitions/WorkspaceInheritance" + } + ], + "description": "The `readme` field should be the path to a file in the package root (relative\nto this `Cargo.toml`) that contains general information about the package.\nThis file will be transferred to the registry when you publish. 
[crates.io](https://crates.io)\nwill interpret it as Markdown and render it on the crate's page.\n\n```toml\n[package]\n# ...\nreadme = \"README.md\"\n```\n\nIf no value is specified for this field, and a file named `README.md`,\n`README.txt` or `README` exists in the package root, then the name of that\nfile will be used. You can suppress this behavior by setting this field to\n`false`. If the field is set to `true`, a default value of `README.md` will\nbe assumed.\n", + "title": "Readme" + }, + "repository": { + "anyOf": [ + { + "$ref": "#/definitions/Repository" + }, + { + "$ref": "#/definitions/WorkspaceInheritance" + } + ], + "description": "The `repository` field should be a URL to the source repository for your\npackage.\n\n```toml\n[package]\n# ...\nrepository = \"https://github.com/rust-lang/cargo/\"\n```", + "title": "Repository" + }, + "resolver": { + "$ref": "#/definitions/Resolver" + }, + "rust-version": { + "anyOf": [ + { + "$ref": "#/definitions/RustVersion" + }, + { + "$ref": "#/definitions/WorkspaceInheritance" + } + ], + "description": "The `rust-version` field is an optional key that tells cargo what version of the\nRust language and compiler your package can be compiled with. If the currently\nselected version of the Rust compiler is older than the stated version, cargo\nwill exit with an error, telling the user what version is required.\n\nThe first version of Cargo that supports this field was released with Rust 1.56.0.\nIn older releases, the field will be ignored, and Cargo will display a warning.\n\n```toml\n[package]\n# ...\nrust-version = \"1.56\"\n```\n\nThe Rust version must be a bare version number with two or three components; it\ncannot include semver operators or pre-release identifiers. 
Compiler pre-release\nidentifiers such as -nightly will be ignored while checking the Rust version.\nThe `rust-version` must be equal to or newer than the version that first\nintroduced the configured `edition`.\n\nThe `rust-version` may be ignored using the `--ignore-rust-version` option.\n\nSetting the `rust-version` key in `[package]` will affect all targets/crates in\nthe package, including test suites, benchmarks, binaries, examples, etc.", + "title": "RustVersion" + }, + "version": { + "anyOf": [ + { + "$ref": "#/definitions/SemVer" + }, + { + "$ref": "#/definitions/WorkspaceInheritance" + } + ], + "description": "Cargo bakes in the concept of [Semantic Versioning](https://semver.org/), so make sure you follow some basic rules:\n\n* Before you reach 1.0.0, anything goes, but if you make breaking changes,\n increment the minor version. In Rust, breaking changes include adding fields to\n structs or variants to enums.\n* After 1.0.0, only make breaking changes when you increment the major version.\n Don't break the build.\n* After 1.0.0, don't add any new public API (no new `pub` anything) in patch-level\n versions. Always increment the minor version if you add any new `pub` structs,\n traits, fields, types, functions, methods or anything else.\n* Use version numbers with three numeric parts such as 1.0.0 rather than 1.0.", + "title": "Semantic Version" + }, + "workspace": { + "type": "string", + "description": "The `workspace` field can be used to configure the workspace that this package\nwill be a member of. If not specified this will be inferred as the first\nCargo.toml with `[workspace]` upwards in the filesystem. Setting this is\nuseful if the member is not inside a subdirectory of the workspace root.\n\n```toml\n[package]\n# ...\nworkspace = \"path/to/workspace/root\"\n```\n\nThis field cannot be specified if the manifest already has a `[workspace]`\ntable defined. 
That is, a crate cannot both be a root crate in a workspace\n(contain `[workspace]`) and also be a member crate of another workspace\n(contain `package.workspace`).\n\nFor more information, see the [workspaces chapter](https://doc.rust-lang.org/cargo/reference/workspaces.html).", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/manifest.html#the-workspace-field" + } + } + } + }, + "required": [ + "name" + ], + "title": "Package", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/manifest.html#the-package-section" + } + }, + "x-tombi-table-keys-order": "schema" + }, + "Panic": { + "type": "string", + "description": "The `panic` setting controls the [`-C panic` flag](https://doc.rust-lang.org/rustc/codegen-options/index.html#panic) which controls which panic\nstrategy to use.\n\nWhen set to `\"unwind\"`, the actual value depends on the default of the target\nplatform. For example, the NVPTX platform does not support unwinding, so it\nalways uses `\"abort\"`.\n\nTests, benchmarks, build scripts, and proc macros ignore the `panic` setting.\nThe `rustc` test harness currently requires `unwind` behavior. See the\n[`panic-abort-tests`](https://doc.rust-lang.org/cargo/reference/unstable.html#panic-abort-tests) unstable flag which enables `abort` behavior.\n\nAdditionally, when using the `abort` strategy and building a test, all of the\ndependencies will also be forced to built with the `unwind` strategy.", + "enum": [ + "unwind", + "abort" + ], + "title": "Panic", + "x-taplo": { + "docs": { + "enumValues": [ + "Unwind the stack upon panic.", + "Terminate the process upon panic." 
+ ] + }, + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/profiles.html#panic" + } + } + }, + "Platform": { + "type": "object", + "properties": { + "build-dependencies": { + "type": "object", + "additionalProperties": { + "$ref": "#/definitions/Dependency" + }, + "description": "You can depend on other Cargo-based crates for use in your build scripts.\nDependencies are declared through the `build-dependencies` section of the\nmanifest:\n\n```toml\n[build-dependencies]\ncc = \"1.0.3\"\n```\n\nThe build script **does not** have access to the dependencies listed\nin the `dependencies` or `dev-dependencies` section. Build\ndependencies will likewise not be available to the package itself\nunless listed under the `dependencies` section as well. A package\nitself and its build script are built separately, so their\ndependencies need not coincide. Cargo is kept simpler and cleaner by\nusing independent dependencies for independent purposes.", + "x-taplo": { + "crates": { + "schemas": "dependencies" + }, + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/specifying-dependencies.html#build-dependencies" + }, + "plugins": [ + "crates" + ] + }, + "x-tombi-table-keys-order": "version-sort" + }, + "build_dependencies": { + "type": "object", + "additionalProperties": { + "$ref": "#/definitions/Dependency" + }, + "x-taplo": { + "hidden": true + }, + "x-tombi-table-keys-order": "version-sort" + }, + "dependencies": { + "type": "object", + "additionalProperties": { + "$ref": "#/definitions/Dependency" + }, + "description": "Cargo is configured to look for dependencies on [crates.io](https://crates.io) by default. Only\nthe name and a version string are required in this case. 
In [the cargo\nguide](https://doc.rust-lang.org/cargo/guide/index.html), we specified a dependency on the `time` crate:\n\n```toml\n[dependencies]\ntime = \"0.1.12\"\n```\n\nThe string `\"0.1.12\"` is a [semver](https://github.com/steveklabnik/semver#requirements) version requirement. Since this\nstring does not have any operators in it, it is interpreted the same way as\nif we had specified `\"^0.1.12\"`, which is called a caret requirement.\n\nA dependency can also be defined by a table with additional options:\n\n```toml\n[dependencies]\ntime = { path = \"../time\", version = \"0.1.12\" }\n```", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/specifying-dependencies.html" + } + }, + "x-tombi-table-keys-order": "version-sort" + }, + "dev-dependencies": { + "type": "object", + "additionalProperties": { + "$ref": "#/definitions/Dependency" + }, + "description": "The format of `[dev-dependencies]` is equivalent to `[dependencies]`:\n\n```toml\n[dev-dependencies]\ntempdir = \"0.3\"\n```\n\nDev-dependencies are not used when compiling\na package for building, but are used for compiling tests, examples, and\nbenchmarks.\n\nThese dependencies are *not* propagated to other packages which depend on this\npackage.\n\nYou can also have target-specific development dependencies by using\n`dev-dependencies` in the target section header instead of `dependencies`. For\nexample:\n\n```toml\n[target.'cfg(unix)'.dev-dependencies]\nmio = \"0.0.1\"\n```\n\n> **Note**: When a package is published, only dev-dependencies that specify a\n> `version` will be included in the published crate. 
For most use cases,\n> dev-dependencies are not needed when published, though some users (like OS\n> packagers) may want to run tests within a crate, so providing a `version` if\n> possible can still be beneficial.\n", + "x-taplo": { + "crates": { + "schemas": "dependencies" + }, + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/specifying-dependencies.html#development-dependencies" + }, + "plugins": [ + "crates" + ] + }, + "x-tombi-table-keys-order": "version-sort" + }, + "dev_dependencies": { + "type": "object", + "additionalProperties": { + "$ref": "#/definitions/Dependency" + }, + "x-taplo": { + "hidden": true + }, + "x-tombi-table-keys-order": "version-sort" + } + }, + "title": "Platform", + "x-tombi-table-keys-order": "schema" + }, + "PlaydateMetadata": { + "type": "object", + "additionalProperties": false, + "description": "Metadata and build configuration.", + "properties": { + "name": { + "type": "string", + "description": "A game version number, formatted any way you wish, that is displayed to players. It is not used to compute when updates should occur.", + "x-taplo": { + "links": { + "key": "https://sdk.play.date/#pdxinfo" + } + } + }, + "assets": { + "anyOf": [ + { + "$ref": "#/definitions/PlaydateMetadataAssetsMap" + }, + { + "$ref": "#/definitions/PlaydateMetadataAssetsArray" + } + ] + }, + "author": { + "type": "string", + "x-taplo": { + "links": { + "key": "https://sdk.play.date/#pdxinfo" + } + } + }, + "build-number": { + "type": "integer", + "description": "A monotonically-increasing integer value used to indicate a unique version of your game. 
This can be set using an automated build process like Continuous Integration to avoid having to set the value by hand.\n\nFor sideloaded games, buildNumber is required and is used to determine when a newer version is available to download.", + "exclusiveMinimum": 0, + "x-taplo": { + "links": { + "key": "https://sdk.play.date/#pdxinfo" + } + } + }, + "bundle-id": { + "type": "string", + "description": "A unique identifier for your game, in reverse DNS notation.", + "x-taplo": { + "links": { + "key": "https://sdk.play.date/#pdxinfo" + } + } + }, + "content-warning": { + "type": "string", + "description": "Optional. A content warning that displays when the user launches your game for the first time. The user will have the option of backing out and not launching your game if they choose.", + "x-taplo": { + "links": { + "key": "https://sdk.play.date/#pdxinfo" + } + } + }, + "content-warning2": { + "type": "string", + "description": "Optional. A second content warning that displays on a second screen when the user launches your game for the first time. The user will have the option of backing out and not launching your game if they choose.\n\nNote: `content-warning2` will only display if a `content-warning` attribute is also specified.\n\nThe string displayed on the content warning screen can only be so long before it will be truncated with an \"\u2026\" character. 
Be sure to keep this in mind when designing your `content-warning` and `content-warning2` text.", + "x-taplo": { + "links": { + "key": "https://sdk.play.date/#pdxinfo" + } + } + }, + "description": { + "type": "string", + "x-taplo": { + "links": { + "key": "https://sdk.play.date/#pdxinfo" + } + } + }, + "dev-assets": { + "anyOf": [ + { + "$ref": "#/definitions/PlaydateMetadataAssetsMap" + }, + { + "$ref": "#/definitions/PlaydateMetadataAssetsArray" + } + ] + }, + "image-path": { + "type": "string", + "description": "A directory of images that will be used by the launcher.\n\nMore in [official documentation](https://sdk.play.date/#pdxinfo).", + "x-taplo": { + "links": { + "key": "https://sdk.play.date/#pdxinfo" + } + } + }, + "launch-sound-path": { + "type": "string", + "description": "Should point to the path of a short audio file to be played as the game launch animation is taking place.\n\nMore in [official documentation](https://sdk.play.date/#pdxinfo).", + "x-taplo": { + "links": { + "key": "https://sdk.play.date/#pdxinfo" + } + } + }, + "options": { + "$ref": "#/definitions/PlaydateMetadataOptions" + }, + "support": { + "type": "object", + "additionalProperties": true, + "properties": {} + }, + "version": { + "type": "string", + "x-taplo": { + "links": { + "key": "https://sdk.play.date/#pdxinfo" + } + } + } + }, + "required": [ + "bundle-id" + ], + "title": "Playdate Package Metadata", + "x-taplo": { + "initKeys": [ + "bundle-id", + "name", + "description", + "author", + "image-path", + "launch-sound-path" + ], + "links": { + "key": "https://sdk.play.date/#pdxinfo" + } + }, + "x-taplo-info": { + "authors": [ + "Alex Koz. 
(https://github.com/boozook)" + ] + }, + "x-tombi-table-keys-order": "schema" + }, + "PlaydateMetadataAssetsArray": { + "type": "array", + "description": "List of paths to include.", + "items": { + "type": "string", + "description": "Path to include.", + "title": "Path" + }, + "title": "Assets list", + "uniqueItems": true, + "x-taplo": { + "links": { + "key": "https://github.com/boozook/playdate/blob/main/support/build/README.md#assets-list" + } + }, + "x-tombi-array-values-order": "version-sort" + }, + "PlaydateMetadataAssetsMap": { + "type": "object", + "additionalProperties": { + "anyOf": [ + { + "type": "string", + "description": "Path of files to include. Can be absolute, relative to the crate root, or/and glob.\n\nLeft hand is where to put files, path in the resulting package.\n\nRight hand is a path or pattern to match files to include.", + "title": "Path" + }, + { + "type": "boolean", + "description": "Include or exclude the file or glob-pattern.", + "title": "Include" + } + ] + }, + "description": "Rules used to resolve paths to include.", + "properties": { + "options": { + "$ref": "#/definitions/PlaydateMetadataAssetsOptions" + } + }, + "title": "Assets rules", + "x-taplo": { + "links": { + "key": "https://github.com/boozook/playdate/blob/main/support/build/README.md#assets-table" + } + }, + "x-tombi-table-keys-order": "schema" + }, + "PlaydateMetadataAssetsOptions": { + "type": "object", + "additionalProperties": false, + "description": "Options for assets paths resolution and how to build assets collection", + "properties": { + "dependencies": { + "type": "boolean", + "description": "Allow build assets for dependencies." + }, + "follow-symlinks": { + "type": "boolean" + }, + "method": { + "type": "string", + "enum": [ + "copy", + "link" + ] + }, + "overwrite": { + "type": "boolean", + "description": "Allow overwriting existing files." 
+ } + }, + "title": "Assets Configuration", + "x-taplo": { + "links": { + "key": "https://github.com/boozook/playdate/blob/main/support/build/README.md#assets-options" + } + }, + "x-tombi-table-keys-order": "schema" + }, + "PlaydateMetadataOptions": { + "type": "object", + "additionalProperties": true, + "description": "Package build options.", + "properties": { + "assets": { + "$ref": "#/definitions/PlaydateMetadataAssetsOptions" + } + }, + "title": "Configuration", + "x-taplo": { + "links": { + "key": "https://github.com/boozook/playdate/blob/main/support/build/README.md#options" + } + }, + "x-tombi-table-keys-order": "schema" + }, + "Profile": { + "type": "object", + "properties": { + "codegen-units": { + "$ref": "#/definitions/CodegenUnits" + }, + "debug": { + "$ref": "#/definitions/DebugLevel" + }, + "debug-assertions": { + "$ref": "#/definitions/DebugAssertions" + }, + "dir-name": { + "type": "string", + "x-taplo": { + "hidden": true + } + }, + "incremental": { + "$ref": "#/definitions/Incremental" + }, + "inherits": { + "$ref": "#/definitions/Inherits" + }, + "lto": { + "$ref": "#/definitions/Lto" + }, + "opt-level": { + "$ref": "#/definitions/OptLevel" + }, + "overflow-checks": { + "$ref": "#/definitions/OverflowChecks" + }, + "package": { + "$ref": "#/definitions/ProfilePackageOverrides" + }, + "panic": { + "$ref": "#/definitions/Panic" + }, + "rpath": { + "$ref": "#/definitions/Rpath" + }, + "split-debuginfo": { + "$ref": "#/definitions/SplitDebuginfo" + }, + "strip": { + "$ref": "#/definitions/Strip" + } + }, + "title": "Profile", + "x-tombi-table-keys-order": "schema" + }, + "ProfilePackageOverrides": { + "type": "object", + "additionalProperties": { + "$ref": "#/definitions/Profile" + }, + "description": "Package-specific overrides.\n\nThe package name is a [Package ID Spec](https://doc.rust-lang.org/cargo/reference/pkgid-spec.html), so you can\ntarget individual versions of a package with syntax such as `[profile.dev.package.\"foo:2.1.0\"]`.", + 
"x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/profiles.html#overrides" + } + }, + "x-tombi-table-keys-order": "version-sort" + }, + "ProfileWithBuildOverride": { + "type": "object", + "properties": { + "build-override": { + "$ref": "#/definitions/Profile" + }, + "codegen-units": { + "$ref": "#/definitions/CodegenUnits" + }, + "debug": { + "$ref": "#/definitions/DebugLevel" + }, + "debug-assertions": { + "$ref": "#/definitions/DebugAssertions" + }, + "incremental": { + "$ref": "#/definitions/Incremental" + }, + "inherits": { + "$ref": "#/definitions/Inherits" + }, + "lto": { + "$ref": "#/definitions/Lto" + }, + "opt-level": { + "$ref": "#/definitions/OptLevel" + }, + "overflow-checks": { + "$ref": "#/definitions/OverflowChecks" + }, + "package": { + "$ref": "#/definitions/ProfilePackageOverrides" + }, + "panic": { + "$ref": "#/definitions/Panic" + }, + "rpath": { + "$ref": "#/definitions/Rpath" + }, + "split-debuginfo": { + "$ref": "#/definitions/SplitDebuginfo" + }, + "strip": { + "$ref": "#/definitions/Strip" + } + }, + "title": "Profile with Build Override", + "x-tombi-table-keys-order": "schema" + }, + "Profiles": { + "type": "object", + "additionalProperties": { + "$ref": "#/definitions/ProfileWithBuildOverride" + }, + "description": "Profiles provide a way to alter the compiler settings, influencing things like optimizations and debugging symbols.\n\nCargo has 4 built-in profiles: dev, release, test, and bench. 
It automatically chooses the profile based on which command is being run, the package and target that is being built, and command-line flags like --release.", + "properties": { + "bench": { + "$ref": "#/definitions/ProfileWithBuildOverride" + }, + "dev": { + "$ref": "#/definitions/ProfileWithBuildOverride" + }, + "release": { + "$ref": "#/definitions/ProfileWithBuildOverride" + }, + "test": { + "$ref": "#/definitions/ProfileWithBuildOverride" + } + }, + "title": "Profiles", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/profiles.html" + } + } + }, + "Publish": { + "anyOf": [ + { + "type": "boolean", + "default": true, + "description": "A boolean indicating whether the package can be published.", + "enum": [ + true, + false + ], + "x-taplo": { + "docs": { + "enumValues": [ + "The package can be published.", + "The package cannot be published." + ] + } + } + }, + { + "type": "array", + "description": "An array of registry names.", + "items": { + "type": "string" + }, + "uniqueItems": true, + "x-tombi-array-values-order": "version-sort" + } + ], + "description": "The `publish` field can be used to prevent a package from being published to a package registry (like *crates.io*) by mistake, for instance to keep a package\nprivate in a company.\n\n```toml\n[package]\n# ...\npublish = false\n```\n\nThe value may also be an array of strings which are registry names that are\nallowed to be published to.\n\n```toml\n[package]\n# ...\npublish = [\"some-registry-name\"]\n```", + "title": "Publish", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/manifest.html#the-publish-field" + } + } + }, + "Readme": { + "anyOf": [ + { + "type": "string", + "description": "The `readme` field should be the path to a file in the package root (relative\nto this `Cargo.toml`) that contains general information about the package." 
+ }, + { + "type": "boolean", + "enum": [ + true, + false + ], + "x-taplo": { + "docs": { + "enumValues": [ + "Use the `README.md` file.", + "Do not use the default `README.md` file" + ] + } + } + } + ], + "description": "The `readme` field should be the path to a file in the package root (relative\nto this `Cargo.toml`) that contains general information about the package.\nThis file will be transferred to the registry when you publish. [crates.io](https://crates.io)\nwill interpret it as Markdown and render it on the crate's page.\n\n```toml\n[package]\n# ...\nreadme = \"README.md\"\n```\n\nIf no value is specified for this field, and a file named `README.md`,\n`README.txt` or `README` exists in the package root, then the name of that\nfile will be used. You can suppress this behavior by setting this field to\n`false`. If the field is set to `true`, a default value of `README.md` will\nbe assumed.\n", + "title": "Readme", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/manifest.html#the-readme-field" + } + } + }, + "Repository": { + "type": "string", + "description": "The `repository` field should be a URL to the source repository for your\npackage.\n\n```toml\n[package]\n# ...\nrepository = \"https://github.com/rust-lang/cargo/\"\n```", + "title": "Repository", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/manifest.html#the-repository-field" + } + } + }, + "Resolver": { + "type": "string", + "description": "A different feature resolver algorithm can be used by specifying the resolver version in Cargo.toml like this:\n\n[package]\nname = \"my-package\"\nversion = \"1.0.0\"\nresolver = \"2\"\n\nThe version \"1\" resolver is the original resolver that shipped with Cargo up to version 1.50. The default is \"2\" if the root package specifies edition = \"2021\" or a newer edition. Otherwise the default is \"1\".\n\nThe version \"2\" resolver introduces changes in feature unification. 
See the features chapter for more details.\n\nThe resolver is a global option that affects the entire workspace. The resolver version in dependencies is ignored, only the value in the top-level package will be used. If using a virtual workspace, the version should be specified in the [workspace] table, for example:\n\n[workspace]\nmembers = [\"member1\", \"member2\"]\nresolver = \"2\"", + "enum": [ + "1", + "2", + "3" + ], + "title": "Resolver", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/resolver.html#resolver-versions" + } + } + }, + "Rpath": { + "type": "boolean", + "description": "The `rpath` setting controls the [`-C rpath` flag](https://doc.rust-lang.org/rustc/codegen-options/index.html#rpath) which controls\nwhether or not [`rpath`](https://en.wikipedia.org/wiki/Rpath) is enabled.", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/profiles.html#rpath" + } + } + }, + "RustVersion": { + "type": "string", + "description": "The `rust-version` field is an optional key that tells cargo what version of the\nRust language and compiler your package can be compiled with. If the currently\nselected version of the Rust compiler is older than the stated version, cargo\nwill exit with an error, telling the user what version is required.\n\nThe first version of Cargo that supports this field was released with Rust 1.56.0.\nIn older releases, the field will be ignored, and Cargo will display a warning.\n\n```toml\n[package]\n# ...\nrust-version = \"1.56\"\n```\n\nThe Rust version must be a bare version number with two or three components; it\ncannot include semver operators or pre-release identifiers. 
Compiler pre-release\nidentifiers such as -nightly will be ignored while checking the Rust version.\nThe `rust-version` must be equal to or newer than the version that first\nintroduced the configured `edition`.\n\nThe `rust-version` may be ignored using the `--ignore-rust-version` option.\n\nSetting the `rust-version` key in `[package]` will affect all targets/crates in\nthe package, including test suites, benchmarks, binaries, examples, etc.", + "title": "RustVersion", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/manifest.html#the-rust-version-field" + } + } + }, + "SemVer": { + "type": "string", + "default": "0.1.0", + "description": "Cargo bakes in the concept of [Semantic Versioning](https://semver.org/), so make sure you follow some basic rules:\n\n* Before you reach 1.0.0, anything goes, but if you make breaking changes,\n increment the minor version. In Rust, breaking changes include adding fields to\n structs or variants to enums.\n* After 1.0.0, only make breaking changes when you increment the major version.\n Don't break the build.\n* After 1.0.0, don't add any new public API (no new `pub` anything) in patch-level\n versions. 
Always increment the minor version if you add any new `pub` structs,\n traits, fields, types, functions, methods or anything else.\n* Use version numbers with three numeric parts such as 1.0.0 rather than 1.0.", + "format": "semver", + "title": "Semantic Version", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/manifest.html#the-version-field" + } + } + }, + "SemVerRequirement": { + "type": "string", + "default": "*", + "description": "The [version requirement](https://doc.rust-lang.org/cargo/reference/specifying-dependencies.html) of the target dependency.", + "format": "semver-requirement", + "title": "Semantic Version Requirement", + "x-taplo": { + "crates": { + "schemas": "version" + }, + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/specifying-dependencies.html" + }, + "plugins": [ + "crates" + ] + } + }, + "SplitDebuginfo": { + "description": "The split-debuginfo setting controls the -C split-debuginfo flag which controls whether debug information, if generated, is either placed in the executable itself or adjacent to it. This can be useful for reducing the size of the executable, but may make it harder to debug the executable.", + "oneOf": [ + { + "type": "string", + "description": "This is the default for platforms with ELF binaries and windows-gnu (not Windows MSVC and not macOS). This typically means that DWARF debug information can be found in the final artifact in sections of the executable. This option is not supported on Windows MSVC. On macOS this options prevents the final execution of dsymutil to generate debuginfo.", + "enum": [ + "off" + ] + }, + { + "type": "string", + "description": "This is the default for Windows MSVC and macOS. The term \"packed\" here means that all the debug information is packed into a separate file from the main executable. 
On Windows MSVC this is a *.pdb file, on macOS this is a *.dSYM folder, and on other platforms this is a *.dwp file.", + "enum": [ + "packed" + ] + }, + { + "type": "string", + "description": "This means that debug information will be found in separate files for each compilation unit (object file). This is not supported on Windows MSVC. On macOS this means the original object files will contain debug information. On other Unix platforms this means that *.dwo files will contain debug information.", + "enum": [ + "unpacked" + ] + } + ], + "title": "SplitDebuginfo", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/profiles.html#split-debuginfo" + } + } + }, + "Strip": { + "oneOf": [ + { + "type": "string", + "default": "none", + "description": "The strip option controls the -C strip flag, which directs rustc to strip either symbols or debuginfo from a binary.", + "enum": [ + "none", + "debuginfo", + "symbols" + ] + }, + { + "type": "boolean", + "description": "The strip option controls the -C strip flag, which directs rustc to strip either symbols or debuginfo from a binary.", + "enum": [ + true + ], + "title": "Equivalent to \"symbols\"" + }, + { + "type": "boolean", + "description": "The strip option controls the -C strip flag, which directs rustc to strip either symbols or debuginfo from a binary.", + "enum": [ + false + ], + "title": "Equivalent to \"none\"" + } + ], + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/profiles.html#strip" + } + } + }, + "Target": { + "type": "object", + "properties": { + "name": { + "type": "string", + "description": "The `name` field specifies the name of the target, which corresponds to the\nfilename of the artifact that will be generated. For a library, this is the\ncrate name that dependencies will use to reference it.\n\nFor the `[lib]` and the default binary (`src/main.rs`), this defaults to the\nname of the package, with any dashes replaced with underscores. 
For other\n[auto discovered](https://doc.rust-lang.org/cargo/reference/cargo-targets.html#target-auto-discovery) targets, it defaults to the\ndirectory or file name.\n\nThis is required for all targets except `[lib]`.", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/cargo-targets.html#the-name-field" + } + } + }, + "bench": { + "type": "boolean", + "description": "The `bench` field indicates whether or not the target is benchmarked by\ndefault by [`cargo bench`](https://doc.rust-lang.org/cargo/commands/cargo-bench.html). The default is `true` for lib, bins, and\nbenchmarks.", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/cargo-targets.html#the-bench-field" + } + } + }, + "crate-type": { + "type": "array", + "description": "The `crate-type` field defines the [crate types](https://doc.rust-lang.org/reference/linkage.html) that will be generated by the\ntarget. It is an array of strings, allowing you to specify multiple crate\ntypes for a single target. This can only be specified for libraries and\nexamples. Binaries, tests, and benchmarks are always the \"bin\" crate type.\n\nThe available options are `bin`, `lib`, `rlib`, `dylib`, `cdylib`,\n`staticlib`, and `proc-macro`. You can read more about the different crate\ntypes in the [Rust Reference Manual](https://doc.rust-lang.org/reference/linkage.html).", + "items": { + "type": "string", + "description": "The `crate-type` field defines the [crate types](https://doc.rust-lang.org/reference/linkage.html) that will be generated by the\ntarget. It is an array of strings, allowing you to specify multiple crate\ntypes for a single target. This can only be specified for libraries and\nexamples. Binaries, tests, and benchmarks are always the \"bin\" crate type.\n\nThe available options are `bin`, `lib`, `rlib`, `dylib`, `cdylib`,\n`staticlib`, and `proc-macro`. 
You can read more about the different crate\ntypes in the [Rust Reference Manual](https://doc.rust-lang.org/reference/linkage.html).", + "x-taplo": { + "docs": { + "enumValues": [ + "A runnable executable will be produced. This requires that there is a `main` function in the crate which\nwill be run when the program begins executing. This will link in all Rust and\nnative dependencies, producing a distributable binary.", + "A Rust library will be produced.\nThis is an ambiguous concept as to what exactly is produced because a library\ncan manifest itself in several forms. The purpose of this generic `lib` option\nis to generate the \"compiler recommended\" style of library. The output library\nwill always be usable by rustc, but the actual type of library may change from\ntime-to-time. The remaining output types are all different flavors of\nlibraries, and the `lib` type can be seen as an alias for one of them (but the\nactual one is compiler-defined).", + "A \"Rust library\" file will be produced. This is used as an intermediate artifact and can be thought of as a\n\"static Rust library\". These `rlib` files, unlike `staticlib` files, are\ninterpreted by the compiler in future linkage. This essentially means\nthat `rustc` will look for metadata in `rlib` files like it looks for metadata\nin dynamic libraries. This form of output is used to produce statically linked\nexecutables as well as `staticlib` outputs.", + "A dynamic Rust library will be produced. This is different from the `lib` output type in that this forces\ndynamic library generation. The resulting dynamic library can be used as a\ndependency for other libraries and/or executables. This output type will\ncreate `*.so` files on linux, `*.dylib` files on osx, and `*.dll` files on\nwindows.", + "A dynamic system library will be produced. This is used when compiling\na dynamic library to be loaded from another language. 
This output type will\ncreate `*.so` files on Linux, `*.dylib` files on macOS, and `*.dll` files on\nWindows.", + "A static system library will be produced. This is different from other library outputs in that\nthe compiler will never attempt to link to `staticlib` outputs. The\npurpose of this output type is to create a static library containing all of\nthe local crate's code along with all upstream dependencies. The static\nlibrary is actually a `*.a` archive on linux and osx and a `*.lib` file on\nwindows. This format is recommended for use in situations such as linking\nRust code into an existing non-Rust application because it will not have\ndynamic dependencies on other Rust code.", + "The output produced is not specified, but if a `-L` path is provided to it then the\ncompiler will recognize the output artifacts as a macro and it can be loaded\nfor a program. Crates compiled with this crate type must only export\n[procedural macros](https://doc.rust-lang.org/reference/procedural-macros.html). The compiler will automatically set the `proc_macro`\n[configuration option](https://doc.rust-lang.org/reference/conditional-compilation.html). The crates are always compiled with the same target\nthat the compiler itself was built with. For example, if you are executing\nthe compiler from Linux with an `x86_64` CPU, the target will be\n`x86_64-unknown-linux-gnu` even if the crate is a dependency of another crate\nbeing built for a different target." + ] + }, + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/cargo-targets.html#the-crate-type-field" + } + } + }, + "uniqueItems": true, + "x-taplo": { + "docs": { + "enumValues": [ + "A runnable executable will be produced. This requires that there is a `main` function in the crate which\nwill be run when the program begins executing. 
This will link in all Rust and\nnative dependencies, producing a distributable binary.", + "A Rust library will be produced.\nThis is an ambiguous concept as to what exactly is produced because a library\ncan manifest itself in several forms. The purpose of this generic `lib` option\nis to generate the \"compiler recommended\" style of library. The output library\nwill always be usable by rustc, but the actual type of library may change from\ntime-to-time. The remaining output types are all different flavors of\nlibraries, and the `lib` type can be seen as an alias for one of them (but the\nactual one is compiler-defined).", + "A \"Rust library\" file will be produced. This is used as an intermediate artifact and can be thought of as a\n\"static Rust library\". These `rlib` files, unlike `staticlib` files, are\ninterpreted by the compiler in future linkage. This essentially means\nthat `rustc` will look for metadata in `rlib` files like it looks for metadata\nin dynamic libraries. This form of output is used to produce statically linked\nexecutables as well as `staticlib` outputs.", + "A dynamic Rust library will be produced. This is different from the `lib` output type in that this forces\ndynamic library generation. The resulting dynamic library can be used as a\ndependency for other libraries and/or executables. This output type will\ncreate `*.so` files on linux, `*.dylib` files on osx, and `*.dll` files on\nwindows.", + "A dynamic system library will be produced. This is used when compiling\na dynamic library to be loaded from another language. This output type will\ncreate `*.so` files on Linux, `*.dylib` files on macOS, and `*.dll` files on\nWindows.", + "A static system library will be produced. This is different from other library outputs in that\nthe compiler will never attempt to link to `staticlib` outputs. The\npurpose of this output type is to create a static library containing all of\nthe local crate's code along with all upstream dependencies. 
The static\nlibrary is actually a `*.a` archive on linux and osx and a `*.lib` file on\nwindows. This format is recommended for use in situations such as linking\nRust code into an existing non-Rust application because it will not have\ndynamic dependencies on other Rust code.", + "The output produced is not specified, but if a `-L` path is provided to it then the\ncompiler will recognize the output artifacts as a macro and it can be loaded\nfor a program. Crates compiled with this crate type must only export\n[procedural macros](https://doc.rust-lang.org/reference/procedural-macros.html). The compiler will automatically set the `proc_macro`\n[configuration option](https://doc.rust-lang.org/reference/conditional-compilation.html). The crates are always compiled with the same target\nthat the compiler itself was built with. For example, if you are executing\nthe compiler from Linux with an `x86_64` CPU, the target will be\n`x86_64-unknown-linux-gnu` even if the crate is a dependency of another crate\nbeing built for a different target." + ] + }, + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/cargo-targets.html#the-crate-type-field" + } + }, + "x-tombi-array-values-order": "version-sort" + }, + "crate_type": { + "type": "array", + "items": { + "type": "string", + "x-taplo": { + "hidden": true + } + }, + "uniqueItems": true, + "x-taplo": { + "hidden": true + }, + "x-tombi-array-values-order": "version-sort" + }, + "doc": { + "type": "boolean", + "description": "The `doc` field indicates whether or not the target is included in the\ndocumentation generated by [`cargo doc`](https://doc.rust-lang.org/cargo/commands/cargo-doc.html) by default. 
The default is `true` for\nlibraries and binaries.\n\n> **Note**: The binary will be skipped if its name is the same as the lib\n> target.", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/cargo-targets.html#the-doc-field" + } + } + }, + "doctest": { + "type": "boolean", + "description": "The `doctest` field indicates whether or not [documentation examples](https://doc.rust-lang.org/rustdoc/documentation-tests.html) are\ntested by default by [`cargo test`](https://doc.rust-lang.org/cargo/commands/cargo-test.html). This is only relevant for libraries, it\nhas no effect on other sections. The default is `true` for the library.\n", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/cargo-targets.html#the-doctest-field" + } + } + }, + "edition": { + "$ref": "#/definitions/Edition" + }, + "harness": { + "type": "boolean", + "description": "The `harness` field indicates that the [`--test` flag](https://doc.rust-lang.org/rustc/command-line-arguments.html#option-test) will be passed to\n`rustc` which will automatically include the libtest library which is the\ndriver for collecting and running tests marked with the [`#[test]` attribute](https://doc.rust-lang.org/reference/attributes/testing.html#the-test-attribute) or benchmarks with the `#[bench]` attribute. 
The\ndefault is `true` for all targets.\n\nIf set to `false`, then you are responsible for defining a `main()` function\nto run tests and benchmarks.\n\nTests have the [`cfg(test)` conditional expression](https://doc.rust-lang.org/reference/conditional-compilation.html#test) enabled whether\nor not the harness is enabled.", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/cargo-targets.html#the-harness-field" + } + } + }, + "path": { + "type": "string", + "description": "The `path` field specifies where the source for the crate is located, relative\nto the `Cargo.toml` file.\n\nIf not specified, the [inferred path](https://doc.rust-lang.org/cargo/reference/cargo-targets.html#target-auto-discovery) is used based on\nthe target name.", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/cargo-targets.html#the-path-field" + } + } + }, + "plugin": { + "type": "boolean", + "x-taplo": { + "hidden": true + } + }, + "proc-macro": { + "type": "boolean", + "description": "The `proc-macro` field indicates that the library is a [procedural macro](https://doc.rust-lang.org/book/ch19-06-macros.html)\n([reference](https://doc.rust-lang.org/reference/procedural-macros.html)). This is only valid for the `[lib]`\ntarget.", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/cargo-targets.html#the-proc-macro-field" + } + } + }, + "proc_macro": { + "type": "boolean", + "x-taplo": { + "hidden": true + } + }, + "required-features": { + "type": "array", + "description": "The `required-features` field specifies which [features](https://doc.rust-lang.org/cargo/reference/features.html) the target needs in\norder to be built. If any of the required features are not enabled, the\ntarget will be skipped. 
This is only relevant for the `[[bin]]`, `[[bench]]`,\n`[[test]]`, and `[[example]]` sections, it has no effect on `[lib]`.\n\n```toml\n[features]\n# ...\npostgres = []\nsqlite = []\ntools = []\n\n[[bin]]\nname = \"my-pg-tool\"\nrequired-features = [\"postgres\", \"tools\"]\n```\n", + "items": { + "type": "string", + "description": "The `required-features` field specifies which [features](https://doc.rust-lang.org/cargo/reference/features.html) the target needs in\norder to be built. If any of the required features are not enabled, the\ntarget will be skipped. This is only relevant for the `[[bin]]`, `[[bench]]`,\n`[[test]]`, and `[[example]]` sections, it has no effect on `[lib]`.\n\n```toml\n[features]\n# ...\npostgres = []\nsqlite = []\ntools = []\n\n[[bin]]\nname = \"my-pg-tool\"\nrequired-features = [\"postgres\", \"tools\"]\n```\n", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/cargo-targets.html#the-required-features-field" + } + } + }, + "uniqueItems": true, + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/cargo-targets.html#the-required-features-field" + } + }, + "x-tombi-array-values-order": "version-sort" + }, + "test": { + "type": "boolean", + "description": "The `test` field indicates whether or not the target is tested by default by\n[`cargo test`](https://doc.rust-lang.org/cargo/commands/cargo-test.html). The default is `true` for lib, bins, and tests.\n\n> **Note**: Examples are built by [`cargo test`](https://doc.rust-lang.org/cargo/commands/cargo-test.html) by default to ensure they\n> continue to compile, but they are not *tested* by default. 
Setting `test =\n> true` for an example will also build it as a test and run any\n> [`#[test]`](https://doc.rust-lang.org/reference/attributes/testing.html#the-test-attribute) functions defined in the example.", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/cargo-targets.html#the-test-field" + } + } + } + }, + "title": "Target", + "x-tombi-table-keys-order": "schema" + }, + "Workspace": { + "type": "object", + "description": "The `[workspace]` table in `Cargo.toml` defines which packages are members of\nthe workspace:\n\n```toml\n[workspace]\nmembers = [\"member1\", \"path/to/member2\", \"crates/*\"]\nexclude = [\"crates/foo\", \"path/to/other\"]\n```\n\nAn empty `[workspace]` table can be used with a `[package]` to conveniently\ncreate a workspace with the package and all of its path dependencies.\n\nAll [`path` dependencies](https://doc.rust-lang.org/cargo/reference/specifying-dependencies.html#specifying-path-dependencies) residing in the workspace directory automatically\nbecome members. Additional members can be listed with the `members` key, which\nshould be an array of strings containing directories with `Cargo.toml` files.\n\nThe `members` list also supports [globs](https://docs.rs/glob/0.3.0/glob/struct.Pattern.html) to match multiple paths, using\ntypical filename glob patterns like `*` and `?`.\n\nThe `exclude` key can be used to prevent paths from being included in a\nworkspace. 
This can be useful if some path dependencies aren't desired to be\nin the workspace at all, or using a glob pattern and you want to remove a\ndirectory.\n\nAn empty `[workspace]` table can be used with a `[package]` to conveniently\ncreate a workspace with the package and all of its path dependencies.", + "properties": { + "default-members": { + "type": "array", + "description": "The optional `default-members` key can be specified to set the members to\noperate on when in the workspace root and the package selection flags are not\nused:\n\n```toml\n[workspace]\nmembers = [\"path/to/member1\", \"path/to/member2\", \"path/to/member3/*\"]\ndefault-members = [\"path/to/member2\", \"path/to/member3/foo\"]\n```\n\nWhen specified, `default-members` must expand to a subset of `members`.", + "items": { + "type": "string", + "description": "The optional `default-members` key can be specified to set the members to\noperate on when in the workspace root and the package selection flags are not\nused:\n\n```toml\n[workspace]\nmembers = [\"path/to/member1\", \"path/to/member2\", \"path/to/member3/*\"]\ndefault-members = [\"path/to/member2\", \"path/to/member3/foo\"]\n```\n\nWhen specified, `default-members` must expand to a subset of `members`.", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/workspaces.html#the-workspace-section" + } + } + }, + "uniqueItems": true, + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/workspaces.html#the-workspace-section" + } + }, + "x-tombi-array-values-order": "version-sort" + }, + "dependencies": { + "type": "object", + "additionalProperties": { + "$ref": "#/definitions/Dependency" + }, + "description": "The `workspace.dependencies` table is where you define dependencies to be\ninherited by members of a workspace.\n\nSpecifying a workspace dependency is similar to [package dependencies](https://doc.rust-lang.org/cargo/reference/specifying-dependencies.html) except:\n- Dependencies 
from this table cannot be declared as `optional`\n- [`features`][features] declared in this table are additive with the `features` from `[dependencies]`\n\nYou can then [inherit the workspace dependency as a package dependency](https://doc.rust-lang.org/cargo/reference/specifying-dependencies.html#inheriting-a-dependency-from-a-workspace)\n\nExample:\n```toml\n# [PROJECT_DIR]/Cargo.toml\n[workspace]\nmembers = [\"bar\"]\n\n[workspace.dependencies]\ncc = \"1.0.73\"\nrand = \"0.8.5\"\nregex = { version = \"1.6.0\", default-features = false, features = [\"std\"] }\n```\n\n```toml\n# [PROJECT_DIR]/bar/Cargo.toml\n[package]\nname = \"bar\"\nversion = \"0.2.0\"\n\n[dependencies]\nregex = { workspace = true, features = [\"unicode\"] }\n\n[build-dependencies]\ncc.workspace = true\n\n[dev-dependencies]\nrand.workspace = true\n```", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/workspaces.html#the-workspace-section" + } + }, + "x-tombi-table-keys-order": "version-sort" + }, + "exclude": { + "type": "array", + "description": "The `exclude` key can be used to prevent paths from being included in a\nworkspace. This can be useful if some path dependencies aren't desired to be\nin the workspace at all, or using a glob pattern and you want to remove a\ndirectory.", + "items": { + "type": "string", + "description": "The `exclude` key can be used to prevent paths from being included in a\nworkspace. 
This can be useful if some path dependencies aren't desired to be\nin the workspace at all, or using a glob pattern and you want to remove a\ndirectory.", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/workspaces.html#the-workspace-section" + } + } + }, + "uniqueItems": true, + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/workspaces.html#the-workspace-section" + } + }, + "x-tombi-array-values-order": "version-sort" + }, + "lints": { + "$ref": "#/definitions/Lints", + "description": "The `workspace.lints` table is where you define lint configuration to be inherited by members of a workspace.", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/workspaces.html#the-workspace-section" + } + } + }, + "members": { + "type": "array", + "description": "All [`path` dependencies] residing in the workspace directory automatically\nbecome members. Additional members can be listed with the `members` key, which\nshould be an array of strings containing directories with `Cargo.toml` files.\n\nThe `members` list also supports [globs] to match multiple paths, using\ntypical filename glob patterns like `*` and `?`.", + "items": { + "type": "string", + "description": "All [`path` dependencies] residing in the workspace directory automatically\nbecome members. 
Additional members can be listed with the `members` key, which\nshould be an array of strings containing directories with `Cargo.toml` files.\n\nThe `members` list also supports [globs] to match multiple paths, using\ntypical filename glob patterns like `*` and `?`.", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/workspaces.html#the-workspace-section" + } + } + }, + "uniqueItems": true, + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/workspaces.html#the-workspace-section" + } + }, + "x-tombi-array-values-order": "version-sort" + }, + "metadata": { + "type": "object", + "additionalProperties": true, + "description": "The `workspace.metadata` table is ignored by Cargo and will not be warned\nabout. This section can be used for tools that would like to store workspace\nconfiguration in `Cargo.toml`. For example:\n\n```toml\n[workspace]\nmembers = [\"member1\", \"member2\"]\n\n[workspace.metadata.webcontents]\nroot = \"path/to/webproject\"\ntool = [\"npm\", \"run\", \"build\"]\n# ...\n```\n\nThere is a similar set of tables at the package level at\n`package.metadata`. While cargo does not specify a\nformat for the content of either of these tables, it is suggested that\nexternal tools may wish to use them in a consistent fashion, such as referring\nto the data in `workspace.metadata` if data is missing from `package.metadata`,\nif that makes sense for the tool in question.\n", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/workspaces.html#the-workspace-section" + } + } + }, + "package": { + "type": "object", + "description": "The `workspace.package` table is where you define keys that can be\ninherited by members of a workspace. 
These keys can be inherited by\ndefining them in the member package with `{key}.workspace = true`.\n\nKeys that are supported:\n\n| | |\n|----------------|-----------------|\n| `authors` | `categories` |\n| `description` | `documentation` |\n| `edition` | `exclude` |\n| `homepage` | `include` |\n| `keywords` | `license` |\n| `license-file` | `publish` |\n| `readme` | `repository` |\n| `rust-version` | `version` |\n\n- `license-file` and `readme` are relative to the workspace root\n- `include` and `exclude` are relative to your package root\n\nExample:\n```toml\n# [PROJECT_DIR]/Cargo.toml\n[workspace]\nmembers = [\"bar\"]\n\n[workspace.package]\nversion = \"1.2.3\"\nauthors = [\"Nice Folks\"]\ndescription = \"A short description of my package\"\ndocumentation = \"https://example.com/bar\"\n```\n\n```toml\n# [PROJECT_DIR]/bar/Cargo.toml\n[package]\nname = \"bar\"\nversion.workspace = true\nauthors.workspace = true\ndescription.workspace = true\ndocumentation.workspace = true\n```", + "properties": { + "authors": { + "$ref": "#/definitions/Authors" + }, + "categories": { + "$ref": "#/definitions/Categories" + }, + "description": { + "$ref": "#/definitions/Description" + }, + "documentation": { + "$ref": "#/definitions/Documentation" + }, + "edition": { + "$ref": "#/definitions/Edition" + }, + "exclude": { + "$ref": "#/definitions/Exclude" + }, + "homepage": { + "$ref": "#/definitions/Homepage" + }, + "include": { + "$ref": "#/definitions/Include" + }, + "keywords": { + "$ref": "#/definitions/Keywords" + }, + "license": { + "$ref": "#/definitions/License" + }, + "license-file": { + "$ref": "#/definitions/LicenseFile" + }, + "publish": { + "$ref": "#/definitions/Publish" + }, + "readme": { + "$ref": "#/definitions/Readme" + }, + "repository": { + "$ref": "#/definitions/Repository" + }, + "rust-version": { + "$ref": "#/definitions/RustVersion" + }, + "version": { + "$ref": "#/definitions/SemVer" + } + }, + "x-taplo": { + "links": { + "key": 
"https://doc.rust-lang.org/cargo/reference/workspaces.html#the-package-table" + } + }, + "x-tombi-table-keys-order": "schema" + }, + "resolver": { + "$ref": "#/definitions/Resolver" + } + }, + "title": "Workspace", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/workspaces.html" + } + }, + "x-tombi-table-keys-order": "schema" + }, + "WorkspaceInheritance": { + "type": "object", + "additionalProperties": false, + "properties": { + "workspace": { + "type": "boolean", + "description": "The `workspace` field allow keys to be inherited by defining them in the member package with `{key}.workspace = true`", + "enum": [ + true + ], + "title": "Workspace" + } + }, + "required": [ + "workspace" + ], + "x-tombi-table-keys-order": "schema" + } + }, + "description": "A schema for Cargo.toml.", + "properties": { + "badges": { + "type": "object", + "additionalProperties": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "x-tombi-table-keys-order": "version-sort" + }, + "description": "[crates.io](https://crates.io) can display various badges for build status, test coverage, etc. for\neach crate. All badges are optional.\n\n- The badges pertaining to build status that are currently available are\n Appveyor, CircleCI, Cirrus CI, GitLab, Azure DevOps, Travis CI and Bitbucket\n Pipelines.\n- Available badges pertaining to code test coverage are Codecov and Coveralls.\n- There are also maintenance-related badges based on isitmaintained.com\n which state the issue resolution time, percent of open issues, and future\n maintenance intentions.\n\nMost badge specifications require a `repository` key. It is expected to be in\n`user/repo` format.\n\n```toml\n[badges]\n\n# Appveyor: `repository` is required. `branch` is optional; default is `master`\n# `service` is optional; valid values are `github` (default), `bitbucket`, and\n# `gitlab`; `id` is optional; you can specify the appveyor project id if you\n# want to use that instead. 
`project_name` is optional; use when the repository\n# name differs from the appveyor project name.\nappveyor = { repository = \"...\", branch = \"master\", service = \"github\" }\n\n# Circle CI: `repository` is required. `branch` is optional; default is `master`\ncircle-ci = { repository = \"...\", branch = \"master\" }\n\n# Cirrus CI: `repository` is required. `branch` is optional; default is `master`\ncirrus-ci = { repository = \"...\", branch = \"master\" }\n\n# GitLab: `repository` is required. `branch` is optional; default is `master`\ngitlab = { repository = \"...\", branch = \"master\" }\n\n# Azure DevOps: `project` is required. `pipeline` is required. `build` is optional; default is `1`\n# Note: project = `organization/project`, pipeline = `name_of_pipeline`, build = `definitionId`\nazure-devops = { project = \"...\", pipeline = \"...\", build=\"2\" }\n\n# Travis CI: `repository` in format \"<user>/<repo>\" is required.\n# `branch` is optional; default is `master`\ntravis-ci = { repository = \"...\", branch = \"master\" }\n\n# Bitbucket Pipelines: `repository` is required. `branch` is required\nbitbucket-pipelines = { repository = \"...\", branch = \"master\" }\n\n# Codecov: `repository` is required. `branch` is optional; default is `master`\n# `service` is optional; valid values are `github` (default), `bitbucket`, and\n# `gitlab`.\ncodecov = { repository = \"...\", branch = \"master\", service = \"github\" }\n\n# Coveralls: `repository` is required. `branch` is optional; default is `master`\n# `service` is optional; valid values are `github` (default) and `bitbucket`.\ncoveralls = { repository = \"...\", branch = \"master\", service = \"github\" }\n\n# Is it maintained resolution time: `repository` is required.\nis-it-maintained-issue-resolution = { repository = \"...\" }\n\n# Is it maintained percentage of open issues: `repository` is required.\nis-it-maintained-open-issues = { repository = \"...\" }\n\n# Maintenance: `status` is required. 
Available options are:\n# - `actively-developed`: New features are being added and bugs are being fixed.\n# - `passively-maintained`: There are no plans for new features, but the maintainer intends to\n# respond to issues that get filed.\n# - `as-is`: The crate is feature complete, the maintainer does not intend to continue working on\n# it or providing support, but it works for the purposes it was designed for.\n# - `experimental`: The author wants to share it with the community but is not intending to meet\n# anyone's particular use case.\n# - `looking-for-maintainer`: The current maintainer would like to transfer the crate to someone\n# else.\n# - `deprecated`: The maintainer does not recommend using this crate (the description of the crate\n# can describe why, there could be a better solution available or there could be problems with\n# the crate that the author does not want to fix).\n# - `none`: Displays no badge on crates.io, since the maintainer has not chosen to specify\n# their intentions, potential crate users will need to investigate on their own.\nmaintenance = { status = \"...\" }\n```", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/manifest.html#the-badges-section" + } + }, + "x-tombi-table-keys-order": "version-sort" + }, + "bench": { + "type": "array", + "description": "Benchmarks provide a way to test the performance of your code using the\n[`cargo bench`](https://doc.rust-lang.org/cargo/commands/cargo-bench.html) command. They follow the same structure as [tests](https://doc.rust-lang.org/cargo/reference/cargo-targets.html#tests),\nwith each benchmark function annotated with the `#[bench]` attribute.\nSimilarly to tests:\n\n* Benchmarks are placed in the [`benches` directory](https://doc.rust-lang.org/cargo/guide/project-layout.html).\n* Benchmark functions defined in libraries and binaries have access to the\n *private* API within the target they are defined in. 
Benchmarks in the\n `benches` directory may use the *public* API.\n* [The `bench` field](https://doc.rust-lang.org/cargo/reference/cargo-targets.html#the-bench-field) can be used to define which targets\n are benchmarked by default.\n* [The `harness` field](https://doc.rust-lang.org/cargo/reference/cargo-targets.html#the-harness-field) can be used to disable the\n built-in harness.\n\n> **Note**: The [`#[bench]`\n> attribute](https://doc.rust-lang.org/unstable-book/library-features/test.html) is currently\n> unstable and only available on the [nightly channel](https://doc.rust-lang.org/book/appendix-07-nightly-rust.html). There are some\n> packages available on [crates.io](https://crates.io/keywords/benchmark) that\n> may help with running benchmarks on the stable channel, such as\n> [Criterion](https://crates.io/crates/criterion).", + "items": { + "$ref": "#/definitions/Target", + "description": "Benchmarks provide a way to test the performance of your code using the\n[`cargo bench`](https://doc.rust-lang.org/cargo/commands/cargo-bench.html) command. They follow the same structure as [tests](https://doc.rust-lang.org/cargo/reference/cargo-targets.html#tests),\nwith each benchmark function annotated with the `#[bench]` attribute.\nSimilarly to tests:\n\n* Benchmarks are placed in the [`benches` directory](https://doc.rust-lang.org/cargo/guide/project-layout.html).\n* Benchmark functions defined in libraries and binaries have access to the\n *private* API within the target they are defined in. 
Benchmarks in the\n `benches` directory may use the *public* API.\n* [The `bench` field](https://doc.rust-lang.org/cargo/reference/cargo-targets.html#the-bench-field) can be used to define which targets\n are benchmarked by default.\n* [The `harness` field](https://doc.rust-lang.org/cargo/reference/cargo-targets.html#the-harness-field) can be used to disable the\n built-in harness.\n\n> **Note**: The [`#[bench]`\n> attribute](https://doc.rust-lang.org/unstable-book/library-features/test.html) is currently\n> unstable and only available on the [nightly channel](https://doc.rust-lang.org/book/appendix-07-nightly-rust.html). There are some\n> packages available on [crates.io](https://crates.io/keywords/benchmark) that\n> may help with running benchmarks on the stable channel, such as\n> [Criterion](https://crates.io/crates/criterion).", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/cargo-targets.html#benchmarks" + } + } + }, + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/cargo-targets.html#benchmarks" + } + }, + "x-tombi-array-values-order": "version-sort" + }, + "bin": { + "type": "array", + "description": "Binary targets are executable programs that can be run after being compiled.\nThe default binary filename is `src/main.rs`, which defaults to the name of\nthe package. Additional binaries are stored in the [`src/bin/`\ndirectory](https://doc.rust-lang.org/cargo/guide/project-layout.html). The settings for each binary can be [customized](https://doc.rust-lang.org/cargo/reference/cargo-targets.html#configuring-a-target) in the `[[bin]]` tables in `Cargo.toml`.\n\nBinaries can use the public API of the package's library. 
They are also linked\nwith the [`[dependencies]`](https://doc.rust-lang.org/cargo/reference/specifying-dependencies.html) defined in `Cargo.toml`.\n\nYou can run individual binaries with the [`cargo run`](https://doc.rust-lang.org/cargo/commands/cargo-run.html) command with the `--bin\n` option. [`cargo install`](https://doc.rust-lang.org/cargo/commands/cargo-install.html) can be used to copy the executable to a\ncommon location.\n\n```toml\n# Example of customizing binaries in Cargo.toml.\n[[bin]]\nname = \"cool-tool\"\ntest = false\nbench = false\n\n[[bin]]\nname = \"frobnicator\"\nrequired-features = [\"frobnicate\"]\n```", + "items": { + "$ref": "#/definitions/Target", + "description": "Binary targets are executable programs that can be run after being compiled.\nThe default binary filename is `src/main.rs`, which defaults to the name of\nthe package. Additional binaries are stored in the [`src/bin/`\ndirectory](https://doc.rust-lang.org/cargo/guide/project-layout.html). The settings for each binary can be [customized](https://doc.rust-lang.org/cargo/reference/cargo-targets.html#configuring-a-target) in the `[[bin]]` tables in `Cargo.toml`.\n\nBinaries can use the public API of the package's library. They are also linked\nwith the [`[dependencies]`](https://doc.rust-lang.org/cargo/reference/specifying-dependencies.html) defined in `Cargo.toml`.\n\nYou can run individual binaries with the [`cargo run`](https://doc.rust-lang.org/cargo/commands/cargo-run.html) command with the `--bin\n` option. 
[`cargo install`](https://doc.rust-lang.org/cargo/commands/cargo-install.html) can be used to copy the executable to a\ncommon location.\n\n```toml\n# Example of customizing binaries in Cargo.toml.\n[[bin]]\nname = \"cool-tool\"\ntest = false\nbench = false\n\n[[bin]]\nname = \"frobnicator\"\nrequired-features = [\"frobnicate\"]\n```", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/cargo-targets.html#binaries" + } + } + }, + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/cargo-targets.html#binaries" + } + }, + "x-tombi-array-values-order": "version-sort" + }, + "build-dependencies": { + "type": "object", + "additionalProperties": { + "$ref": "#/definitions/Dependency" + }, + "description": "You can depend on other Cargo-based crates for use in your build scripts.\nDependencies are declared through the `build-dependencies` section of the\nmanifest:\n\n```toml\n[build-dependencies]\ncc = \"1.0.3\"\n```\n\nThe build script **does not** have access to the dependencies listed\nin the `dependencies` or `dev-dependencies` section. Build\ndependencies will likewise not be available to the package itself\nunless listed under the `dependencies` section as well. A package\nitself and its build script are built separately, so their\ndependencies need not coincide. Cargo is kept simpler and cleaner by\nusing independent dependencies for independent purposes.", + "x-taplo": { + "crates": { + "schemas": "dependencies" + }, + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/specifying-dependencies.html#build-dependencies" + }, + "plugins": [ + "crates" + ] + }, + "x-tombi-table-keys-order": "version-sort" + }, + "build_dependencies": { + "type": "object", + "additionalProperties": { + "$ref": "#/definitions/Dependency" + }, + "deprecated": true, + "description": "[build_dependencies] is deprecated. 
Use [build-dependencies] instead.", + "x-taplo": { + "hidden": true + }, + "x-tombi-table-keys-order": "version-sort" + }, + "cargo-features": { + "type": "array", + "items": { + "type": "string" + }, + "uniqueItems": true, + "x-tombi-array-values-order": "version-sort" + }, + "dependencies": { + "type": "object", + "additionalProperties": { + "$ref": "#/definitions/Dependency" + }, + "description": "Cargo is configured to look for dependencies on [crates.io](https://crates.io) by default. Only\nthe name and a version string are required in this case. In [the cargo\nguide](https://doc.rust-lang.org/cargo/guide/index.html), we specified a dependency on the `time` crate:\n\n```toml\n[dependencies]\ntime = \"0.1.12\"\n```\n\nThe string `\"0.1.12\"` is a [semver](https://github.com/steveklabnik/semver#requirements) version requirement. Since this\nstring does not have any operators in it, it is interpreted the same way as\nif we had specified `\"^0.1.12\"`, which is called a caret requirement.\n\nA dependency can also be defined by a table with additional options:\n\n```toml\n[dependencies]\ntime = { path = \"../time\", version = \"0.1.12\" }\n```", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/specifying-dependencies.html" + } + }, + "x-tombi-table-keys-order": "version-sort" + }, + "dev-dependencies": { + "type": "object", + "additionalProperties": { + "$ref": "#/definitions/Dependency" + }, + "description": "The format of `[dev-dependencies]` is equivalent to `[dependencies]`:\n\n```toml\n[dev-dependencies]\ntempdir = \"0.3\"\n```\n\nDev-dependencies are not used when compiling\na package for building, but are used for compiling tests, examples, and\nbenchmarks.\n\nThese dependencies are *not* propagated to other packages which depend on this\npackage.\n\nYou can also have target-specific development dependencies by using\n`dev-dependencies` in the target section header instead of `dependencies`. 
For\nexample:\n\n```toml\n[target.'cfg(unix)'.dev-dependencies]\nmio = \"0.0.1\"\n```\n\n> **Note**: When a package is published, only dev-dependencies that specify a\n> `version` will be included in the published crate. For most use cases,\n> dev-dependencies are not needed when published, though some users (like OS\n> packagers) may want to run tests within a crate, so providing a `version` if\n> possible can still be beneficial.\n", + "x-taplo": { + "crates": { + "schemas": "dependencies" + }, + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/specifying-dependencies.html#development-dependencies" + }, + "plugins": [ + "crates" + ] + }, + "x-tombi-table-keys-order": "version-sort" + }, + "dev_dependencies": { + "type": "object", + "additionalProperties": { + "$ref": "#/definitions/Dependency" + }, + "deprecated": true, + "description": "[dev_dependencies] is deprecated. Use [dev-dependencies] instead.", + "x-taplo": { + "hidden": true + }, + "x-tombi-table-keys-order": "version-sort" + }, + "example": { + "type": "array", + "description": "Files located under the [examples directory](https://doc.rust-lang.org/cargo/guide/project-layout.html) are example uses of the functionality provided by the library. When compiled, they are placed in the [target/debug/examples directory](https://doc.rust-lang.org/cargo/guide/build-cache.html).\n\nExamples can use the public API of the package's library. They are also linked with the [dependencies](https://doc.rust-lang.org/cargo/reference/specifying-dependencies.html) and [dev-dependencies](https://doc.rust-lang.org/cargo/reference/specifying-dependencies.html#development-dependencies) defined in Cargo.toml.\n\nBy default, examples are executable binaries (with a `main()` function). 
You\ncan specify the [`crate-type` field](https://doc.rust-lang.org/cargo/reference/cargo-targets.html#the-crate-type-field) to make an example\nbe compiled as a library:\n\n```toml\n[[example]]\nname = \"foo\"\ncrate-type = [\"staticlib\"]\n```\n\nYou can run individual executable examples with the [`cargo run`](https://doc.rust-lang.org/cargo/commands/cargo-run.html) command with\nthe `--example ` option. Library examples can be built with\n[`cargo build`](https://doc.rust-lang.org/cargo/commands/cargo-build.html) with the `--example ` option. [`cargo install`](https://doc.rust-lang.org/cargo/commands/cargo-install.html)\nwith the `--example ` option can be used to copy executable\nbinaries to a common location. Examples are compiled by [`cargo test`](https://doc.rust-lang.org/cargo/commands/cargo-test.html) by\ndefault to protect them from bit-rotting. Set [the `test`\nfield](https://doc.rust-lang.org/cargo/reference/cargo-targets.html#the-test-field) to `true` if you have `#[test]` functions in the\nexample that you want to run with [`cargo test`](https://doc.rust-lang.org/cargo/commands/cargo-test.html).\n", + "items": { + "$ref": "#/definitions/Target", + "description": "Files located under the [examples directory](https://doc.rust-lang.org/cargo/guide/project-layout.html) are example uses of the functionality provided by the library. When compiled, they are placed in the[ target/debug/examples directory](https://doc.rust-lang.org/cargo/guide/build-cache.html).\n\nExamples can use the public API of the package's library. They are also linked with the [dependencies](https://doc.rust-lang.org/cargo/reference/specifying-dependencies.html) and [dev-dependencies](https://doc.rust-lang.org/cargo/reference/specifying-dependencies.html#development-dependencies) defined in Cargo.toml.\n\nBy default, examples are executable binaries (with a `main()` function). 
You\ncan specify the [`crate-type` field](https://doc.rust-lang.org/cargo/reference/cargo-targets.html#the-crate-type-field) to make an example\nbe compiled as a library:\n\n```toml\n[[example]]\nname = \"foo\"\ncrate-type = [\"staticlib\"]\n```\n\nYou can run individual executable examples with the [`cargo run`](https://doc.rust-lang.org/cargo/commands/cargo-run.html) command with\nthe `--example ` option. Library examples can be built with\n[`cargo build`](https://doc.rust-lang.org/cargo/commands/cargo-build.html) with the `--example ` option. [`cargo install`](https://doc.rust-lang.org/cargo/commands/cargo-install.html)\nwith the `--example ` option can be used to copy executable\nbinaries to a common location. Examples are compiled by [`cargo test`](https://doc.rust-lang.org/cargo/commands/cargo-test.html) by\ndefault to protect them from bit-rotting. Set [the `test`\nfield](https://doc.rust-lang.org/cargo/reference/cargo-targets.html#the-test-field) to `true` if you have `#[test]` functions in the\nexample that you want to run with [`cargo test`](https://doc.rust-lang.org/cargo/commands/cargo-test.html).\n", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/cargo-targets.html#examples" + } + } + }, + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/cargo-targets.html#examples" + } + }, + "x-tombi-array-values-order": "version-sort" + }, + "features": { + "type": "object", + "additionalProperties": { + "type": "array", + "items": { + "type": "string" + }, + "uniqueItems": true, + "x-tombi-array-values-order": "version-sort" + }, + "description": "Cargo supports features to allow expression of:\n\n* conditional compilation options (usable through `cfg` attributes);\n* optional dependencies, which enhance a package, but are not required; and\n* clusters of optional dependencies, such as `postgres-all`, that would include the\n `postgres` package, the `postgres-macros` package, and possibly other 
packages\n (such as development-time mocking libraries, debugging tools, etc.).\n\nA feature of a package is either an optional dependency, or a set of other\nfeatures.\n", + "properties": { + "default": { + "type": "array", + "description": "The default features of the crate.", + "items": { + "type": "string" + }, + "title": "Default Feature", + "uniqueItems": true, + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/features.html#the-default-feature" + } + }, + "x-tombi-array-values-order": "version-sort" + } + }, + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/features.html" + } + }, + "x-tombi-table-keys-order": { + "additionalProperties": "version-sort", + "properties": "schema" + } + }, + "lib": { + "$ref": "#/definitions/Target", + "x-taplo": { + "docs": { + "main": "The library target defines a \"library\" that can be used and linked by other\nlibraries and executables. The filename defaults to `src/lib.rs`, and the name\nof the library defaults to the name of the package. A package can have only\none library. The settings for the library can be [customized](https://doc.rust-lang.org/cargo/reference/cargo-targets.html#configuring-a-target) in the `[lib]`\ntable in `Cargo.toml`.\n\n```toml\n# Example of customizing the library in Cargo.toml.\n[lib]\ncrate-type = [\"cdylib\"]\nbench = false\n```\n" + }, + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/cargo-targets.html#library" + } + } + }, + "lints": { + "anyOf": [ + { + "$ref": "#/definitions/Lints" + }, + { + "type": "object", + "additionalProperties": false, + "properties": { + "workspace": { + "type": "boolean", + "description": "Inherit lints from the workspace manifest." 
+ } + }, + "required": [ + "workspace" + ], + "x-tombi-table-keys-order": "version-sort" + } + ], + "description": "Override the default level of lints from different tools by assigning them to a new level in a table.", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/stable/cargo/reference/manifest.html#the-lints-section" + } + } + }, + "package": { + "$ref": "#/definitions/Package" + }, + "patch": { + "type": "object", + "additionalProperties": { + "type": "object", + "additionalProperties": { + "$ref": "#/definitions/Dependency" + }, + "x-tombi-table-keys-order": "version-sort" + }, + "description": "The `[patch]` section of `Cargo.toml` can be used to override dependencies\nwith other copies. The syntax is similar to the\n[`[dependencies]`](https://doc.rust-lang.org/cargo/reference/specifying-dependencies.html) section.\n\n", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/overriding-dependencies.html#the-patch-section" + } + }, + "x-tombi-table-keys-order": "version-sort" + }, + "profile": { + "$ref": "#/definitions/Profiles" + }, + "project": { + "$ref": "#/definitions/Package", + "deprecated": true, + "description": "[project] is deprecated. Use [package] instead.", + "x-taplo": { + "hidden": true + } + }, + "replace": { + "type": "object", + "additionalProperties": { + "$ref": "#/definitions/Dependency" + }, + "x-taplo": { + "hidden": true + }, + "x-tombi-table-keys-order": "version-sort" + }, + "target": { + "type": "object", + "additionalProperties": { + "$ref": "#/definitions/Platform" + }, + "x-tombi-table-keys-order": "version-sort" + }, + "test": { + "type": "array", + "description": "Files located under the [`tests` directory](https://doc.rust-lang.org/cargo/guide/project-layout.html) are integration\ntests. 
When you run [`cargo test`](https://doc.rust-lang.org/cargo/commands/cargo-test.html), Cargo will compile each of these files as\na separate crate, and execute them.\n\nIntegration tests can use the public API of the package's library. They are\nalso linked with the [`[dependencies]`](https://doc.rust-lang.org/cargo/reference/specifying-dependencies.html) and\n[`[dev-dependencies]`](https://doc.rust-lang.org/cargo/reference/specifying-dependencies.html#development-dependencies) defined in `Cargo.toml`.\n\nIf you want to share code among multiple integration tests, you can place it\nin a separate module such as `tests/common/mod.rs` and then put `mod common;`\nin each test to import it.\n\nEach integration test results in a separate executable binary, and [`cargo\ntest`](https://doc.rust-lang.org/cargo/commands/cargo-test.html) will run them serially. In some cases this can be inefficient, as it\ncan take longer to compile, and may not make full use of multiple CPUs when\nrunning the tests. If you have a lot of integration tests, you may want to\nconsider creating a single integration test, and split the tests into multiple\nmodules. The libtest harness will automatically find all of the `#[test]`\nannotated functions and run them in parallel. You can pass module names to\n[`cargo test`](https://doc.rust-lang.org/cargo/commands/cargo-test.html) to only run the tests within that module.\n\nBinary targets are automatically built if there is an integration test. This\nallows an integration test to execute the binary to exercise and test its\nbehavior. 
The `CARGO_BIN_EXE_` [environment variable](https://doc.rust-lang.org/cargo/reference/environment-variables.html#environment-variables-cargo-sets-for-crates) is set when the\nintegration test is built so that it can use the [`env` macro](https://doc.rust-lang.org/std/macro.env.html) to locate the\nexecutable.", + "items": { + "$ref": "#/definitions/Target", + "description": "Files located under the [`tests` directory](https://doc.rust-lang.org/cargo/guide/project-layout.html) are integration\ntests. When you run [`cargo test`](https://doc.rust-lang.org/cargo/commands/cargo-test.html), Cargo will compile each of these files as\na separate crate, and execute them.\n\nIntegration tests can use the public API of the package's library. They are\nalso linked with the [`[dependencies]`](https://doc.rust-lang.org/cargo/reference/specifying-dependencies.html) and\n[`[dev-dependencies]`](https://doc.rust-lang.org/cargo/reference/specifying-dependencies.html#development-dependencies) defined in `Cargo.toml`.\n\nIf you want to share code among multiple integration tests, you can place it\nin a separate module such as `tests/common/mod.rs` and then put `mod common;`\nin each test to import it.\n\nEach integration test results in a separate executable binary, and [`cargo\ntest`](https://doc.rust-lang.org/cargo/commands/cargo-test.html) will run them serially. In some cases this can be inefficient, as it\ncan take longer to compile, and may not make full use of multiple CPUs when\nrunning the tests. If you have a lot of integration tests, you may want to\nconsider creating a single integration test, and split the tests into multiple\nmodules. The libtest harness will automatically find all of the `#[test]`\nannotated functions and run them in parallel. You can pass module names to\n[`cargo test`](https://doc.rust-lang.org/cargo/commands/cargo-test.html) to only run the tests within that module.\n\nBinary targets are automatically built if there is an integration test. 
This\nallows an integration test to execute the binary to exercise and test its\nbehavior. The `CARGO_BIN_EXE_` [environment variable](https://doc.rust-lang.org/cargo/reference/environment-variables.html#environment-variables-cargo-sets-for-crates) is set when the\nintegration test is built so that it can use the [`env` macro](https://doc.rust-lang.org/std/macro.env.html) to locate the\nexecutable.", + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/cargo-targets.html#integration-tests" + } + } + }, + "x-taplo": { + "links": { + "key": "https://doc.rust-lang.org/cargo/reference/cargo-targets.html#integration-tests" + } + }, + "x-tombi-array-values-order": "version-sort" + }, + "workspace": { + "$ref": "#/definitions/Workspace" + } + }, + "title": "Cargo.toml", + "x-taplo-info": { + "authors": [ + "tamasfe (https://github.com/tamasfe)" + ], + "patterns": [ + "^(.*(/|\\\\)Cargo\\.toml|Cargo\\.toml)$" + ] + }, + "x-tombi-table-keys-order": "schema", + "x-tombi-toml-version": "v1.0.0" +} diff --git a/tools/generate-bazel-rc/Cargo.lock b/tools/generate-bazel-rc/Cargo.lock index b5d7bd31e..6dfd5fc9c 100644 --- a/tools/generate-bazel-rc/Cargo.lock +++ b/tools/generate-bazel-rc/Cargo.lock @@ -2,12 +2,6 @@ # It is not intended for manual editing. 
version = 4 -[[package]] -name = "equivalent" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" - [[package]] name = "generate-bazel-rc" version = "0.1.0" @@ -15,28 +9,6 @@ dependencies = [ "toml", ] -[[package]] -name = "hashbrown" -version = "0.15.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "84b26c544d002229e640969970a2e74021aadf6e2f96372b9c58eff97de08eb3" - -[[package]] -name = "indexmap" -version = "2.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cea70ddb795996207ad57735b50c5982d8844f38ba9ee5f1aedcfb708a2aa11e" -dependencies = [ - "equivalent", - "hashbrown", -] - -[[package]] -name = "memchr" -version = "2.7.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" - [[package]] name = "proc-macro2" version = "1.0.95" @@ -56,19 +28,19 @@ dependencies = [ ] [[package]] -name = "serde" -version = "1.0.219" +name = "serde_core" +version = "1.0.228" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.219" +version = "1.0.228" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" dependencies = [ "proc-macro2", "quote", @@ -77,11 +49,11 @@ dependencies = [ [[package]] name = "serde_spanned" -version = "0.6.8" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87607cb1398ed59d48732e575a4c28a7a8ebf2454b964fe3f224f2afc07909e1" 
+checksum = "f8bbf91e5a4d6315eee45e704372590b30e260ee83af6639d64557f51b067776" dependencies = [ - "serde", + "serde_core", ] [[package]] @@ -97,44 +69,41 @@ dependencies = [ [[package]] name = "toml" -version = "0.8.22" +version = "1.0.1+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05ae329d1f08c4d17a59bed7ff5b5a769d062e64a62d34a3261b219e62cd5aae" +checksum = "bbe30f93627849fa362d4a602212d41bb237dc2bd0f8ba0b2ce785012e124220" dependencies = [ - "serde", + "serde_core", "serde_spanned", "toml_datetime", - "toml_edit", + "toml_parser", + "toml_writer", + "winnow", ] [[package]] name = "toml_datetime" -version = "0.6.9" +version = "1.0.0+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3da5db5a963e24bc68be8b17b6fa82814bb22ee8660f192bb182771d498f09a3" +checksum = "32c2555c699578a4f59f0cc68e5116c8d7cabbd45e1409b989d4be085b53f13e" dependencies = [ - "serde", + "serde_core", ] [[package]] -name = "toml_edit" -version = "0.22.26" +name = "toml_parser" +version = "1.0.8+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "310068873db2c5b3e7659d2cc35d21855dbafa50d1ce336397c666e3cb08137e" +checksum = "0742ff5ff03ea7e67c8ae6c93cac239e0d9784833362da3f9a9c1da8dfefcbdc" dependencies = [ - "indexmap", - "serde", - "serde_spanned", - "toml_datetime", - "toml_write", "winnow", ] [[package]] -name = "toml_write" -version = "0.1.1" +name = "toml_writer" +version = "1.0.6+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfb942dfe1d8e29a7ee7fcbde5bd2b9a25fb89aa70caea2eba3bee836ff41076" +checksum = "ab16f14aed21ee8bfd8ec22513f7287cd4a91aa92e44edfe2c17ddd004e92607" [[package]] name = "unicode-ident" @@ -144,9 +113,6 @@ checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" [[package]] name = "winnow" -version = "0.7.10" +version = "0.7.14" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "c06928c8748d81b05c9be96aad92e1b6ff01833332f281e8cfca3be4b35fc9ec" -dependencies = [ - "memchr", -] +checksum = "5a5364e9d77fcdeeaa6062ced926ee3381faa2ee02d3eb83a5c27a8825540829" diff --git a/tools/generate-bazel-rc/Cargo.toml b/tools/generate-bazel-rc/Cargo.toml index 0edab4184..f38bf0e82 100644 --- a/tools/generate-bazel-rc/Cargo.toml +++ b/tools/generate-bazel-rc/Cargo.toml @@ -1,9 +1,14 @@ +#:schema ../cargo-with-detailed-deps.json [package] edition = "2024" name = "generate-bazel-rc" version = "0.1.0" [dependencies] -toml = "0.8.22" +toml = { version = "1.0.0", default-features = false, features = [ + "display", + "parse", + "serde", +] } [workspace] diff --git a/tools/generate-bazel-rc/src/main.rs b/tools/generate-bazel-rc/src/main.rs index 83fee704e..a0d54021b 100644 --- a/tools/generate-bazel-rc/src/main.rs +++ b/tools/generate-bazel-rc/src/main.rs @@ -1,11 +1,14 @@ -use std::{collections::BTreeSet, env, fs}; -use toml::{Table, Value, map::Map}; +use std::collections::BTreeSet; +use std::{env, fs}; + +use toml::map::Map; +use toml::{Table, Value}; #[derive(PartialEq, PartialOrd, Clone)] enum LintLevel { Allow, Deny, - Warn + Warn, } impl LintLevel { @@ -64,8 +67,13 @@ fn get_lints_from_key(lints_table: &Map, key: &str) -> BTreeSet For more information, try '--help'. - EOF) + EOF + ) if [ "$nativelink_output" = "$print_error_output" ]; then echo "The output of nativelink matches the print_error output." 
diff --git a/tools/pre-commit-hooks.nix b/tools/pre-commit-hooks.nix index 7d81d5107..1cf94fd15 100644 --- a/tools/pre-commit-hooks.nix +++ b/tools/pre-commit-hooks.nix @@ -141,10 +141,21 @@ in { enable = true; packageOverrides.cargo = nightly-rust.cargo; packageOverrides.rustfmt = nightly-rust.rustfmt; + pass_filenames = true; + inherit excludes; }; + + # Taplo fmt taplo = { enable = true; - excludes = ["nativelink-proto"]; + types = ["toml"]; + }; + + # Taplo validate + taplo-validate = { + enable = true; + entry = "${pkgs.taplo}/bin/taplo validate"; + name = "taplo validate"; types = ["toml"]; }; diff --git a/tools/public/publish-ghcr.nix b/tools/public/publish-ghcr.nix index 4c9d39278..18d2b2341 100644 --- a/tools/public/publish-ghcr.nix +++ b/tools/public/publish-ghcr.nix @@ -27,25 +27,37 @@ writeShellScriptBin "publish-ghcr" '' nix run .#$1.copyTo docker://''${TAGGED_IMAGE} - echo $GHCR_PASSWORD | ${cosign}/bin/cosign \ - login \ - --username=$GHCR_USERNAME \ - --password-stdin \ - ghcr.io + # Skip signing if SKIP_SIGNING is set (useful for PR builds) + if [[ "''${SKIP_SIGNING:-false}" != "true" ]]; then + echo $GHCR_PASSWORD | ${cosign}/bin/cosign \ + login \ + --username=$GHCR_USERNAME \ + --password-stdin \ + ghcr.io + + ${cosign}/bin/cosign \ + sign \ + --yes \ + ''${GHCR_REGISTRY,,}/''${IMAGE_NAME}@$( \ + ${skopeo}/bin/skopeo \ + inspect \ + --format "{{ .Digest }}" \ + docker://''${TAGGED_IMAGE} \ + ) + else + echo "Skipping cosign signing (SKIP_SIGNING=true)" + fi + + # Skip trivy scan if SKIP_TRIVY is set + if [[ "''${SKIP_TRIVY:-false}" != "true" ]]; then + ${trivy}/bin/trivy \ + image \ + --format sarif \ + ''${TAGGED_IMAGE} \ + > trivy-results.sarif + else + echo "Skipping trivy scan (SKIP_TRIVY=true)" + fi - ${cosign}/bin/cosign \ - sign \ - --yes \ - ''${GHCR_REGISTRY,,}/''${IMAGE_NAME}@$( \ - ${skopeo}/bin/skopeo \ - inspect \ - --format "{{ .Digest }}" \ - docker://''${TAGGED_IMAGE} \ - ) - - ${trivy}/bin/trivy \ - image \ - --format sarif \ - 
''${TAGGED_IMAGE} \ - > trivy-results.sarif + echo "Published: ''${TAGGED_IMAGE}" '' diff --git a/tools/toolchain-buck2/Dockerfile b/tools/toolchain-buck2/Dockerfile index 055bfef3f..f33216b77 100644 --- a/tools/toolchain-buck2/Dockerfile +++ b/tools/toolchain-buck2/Dockerfile @@ -13,13 +13,13 @@ # limitations under the License. # https://hub.docker.com/layers/library/ubuntu/noble-20250925/images/sha256-78281ac7684a7caf02348780a1b5de85844548a3cc0505df924de98380a0eeea -FROM ubuntu:noble-20250925 AS dependencies +FROM ubuntu:noble-20250925@sha256:728785b59223d755e3e5c5af178fab1be7031f3522c5ccd7a0b32b80d8248123 AS dependencies RUN apt-get update && DEBIAN_FRONTEND=noninteractive \ apt-get install -y --no-install-recommends \ git=1:2.43.0-1ubuntu7.3 \ ca-certificates=20240203 \ - curl=8.5.0-2ubuntu10.6 \ - python3=3.12.3-0ubuntu2 \ + curl=8.5.0-2ubuntu10.8 \ + python3=3.12.3-0ubuntu2.1 \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* \ && update-ca-certificates diff --git a/tools/toolchain-drake/Dockerfile b/tools/toolchain-drake/Dockerfile index 40a36325f..dae04f59c 100644 --- a/tools/toolchain-drake/Dockerfile +++ b/tools/toolchain-drake/Dockerfile @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -FROM ubuntu:noble-20250925 AS dependencies +FROM ubuntu:noble-20250925@sha256:728785b59223d755e3e5c5af178fab1be7031f3522c5ccd7a0b32b80d8248123 AS dependencies RUN apt-get update && DEBIAN_FRONTEND=noninteractive \ apt-get install --no-install-recommends -y \ git=1:2.43.0-1ubuntu7.3 \ diff --git a/tools/toolchain-nativelink/Dockerfile b/tools/toolchain-nativelink/Dockerfile index ab006b2d4..e6beb857b 100644 --- a/tools/toolchain-nativelink/Dockerfile +++ b/tools/toolchain-nativelink/Dockerfile @@ -13,7 +13,7 @@ # limitations under the License. 
# https://hub.docker.com/layers/library/ubuntu/noble-20250925/images/sha256-78281ac7684a7caf02348780a1b5de85844548a3cc0505df924de98380a0eeea -FROM ubuntu:noble-20250925 +FROM ubuntu:noble-20250925@sha256:728785b59223d755e3e5c5af178fab1be7031f3522c5ccd7a0b32b80d8248123 # Set shell to bash and enable pipefail SHELL ["/bin/bash", "-o", "pipefail", "-c"] @@ -22,10 +22,8 @@ SHELL ["/bin/bash", "-o", "pipefail", "-c"] RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \ gcc=4:13.2.0-7ubuntu1 \ g++=4:13.2.0-7ubuntu1 \ - python3=3.12.3-0ubuntu2 \ - python3-minimal=3.12.3-0ubuntu2 \ - libpython3-stdlib=3.12.3-0ubuntu2 \ - curl=8.5.0-2ubuntu10.6 \ + python3=3.12.3-0ubuntu2.1 \ + curl=8.5.0-2ubuntu10.8 \ ca-certificates=20240203 \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* diff --git a/tools/updaters/cache/.gitignore b/tools/updaters/cache/.gitignore new file mode 100644 index 000000000..b5624b74f --- /dev/null +++ b/tools/updaters/cache/.gitignore @@ -0,0 +1 @@ +*.tar.xz diff --git a/tools/updaters/rewrite-module.nix b/tools/updaters/rewrite-module.nix new file mode 100644 index 000000000..4ea58591b --- /dev/null +++ b/tools/updaters/rewrite-module.nix @@ -0,0 +1,9 @@ +{ + python-with-requests, + writeShellScriptBin, +}: +writeShellScriptBin "update-module-hashes" '' + set -uo pipefail + + ${python-with-requests}/bin/python tools/updaters/rewrite-module.py MODULE.bazel +'' diff --git a/tools/updaters/rewrite-module.py b/tools/updaters/rewrite-module.py new file mode 100644 index 000000000..bc4cf7770 --- /dev/null +++ b/tools/updaters/rewrite-module.py @@ -0,0 +1,36 @@ +import re +import subprocess +import requests +import sys +import pathlib + +module_bazel_path = sys.argv[1] +cache_dir = pathlib.Path(__file__).parent.joinpath("cache") +cache_dir.mkdir(exist_ok=True) + +original = open(module_bazel_path).read() +begin_shas = re.search("# BEGIN SHAS\n", original).end() # pyright: ignore[reportOptionalMemberAccess] +end_shas 
= re.search("\n # END SHAS", original).start() # pyright: ignore[reportOptionalMemberAccess] +print(begin_shas, end_shas) +sha_pattern = re.compile(r"\"(.+\.tar\.xz)\": \"([0-9a-f]+)\"") + +results = "" + +for entry in sha_pattern.finditer(original, begin_shas, end_shas): + short_url, hash = entry.groups() + cache_path = cache_dir.joinpath(short_url.replace("/", "_")) + if not cache_path.exists(): + full_url = f"https://static.rust-lang.org/dist/{short_url}" + print("getting", full_url, cache_path) + req = requests.get(full_url) + with cache_path.open("wb") as f: + f.write(req.content) + sha256_cmd = subprocess.check_output(["sha256sum", cache_path.as_posix()], encoding="utf-8") + sha256 = sha256_cmd.split(" ")[0] + if results != "": + results += "\n" + results += f" \"{short_url}\": \"{sha256}\"," + +revised = original[:begin_shas] + results + original[end_shas:] +with open(module_bazel_path, "w") as f: + f.write(revised) diff --git a/web/platform/bun.lock b/web/platform/bun.lock index 580ac5dae..f71d5453e 100644 --- a/web/platform/bun.lock +++ b/web/platform/bun.lock @@ -1,5 +1,6 @@ { "lockfileVersion": 1, + "configVersion": 0, "workspaces": { "": { "name": "nativelink-web-platform", @@ -24,7 +25,7 @@ "@react-three/fiber": "^9.1.2", "@tailwindcss/vite": "^4.1.5", "@types/bun": "^1.2.12", - "astro": "5.14.3", + "astro": "5.15.9", "clsx": "^2.1.1", "dotenv": "^17.0.0", "framer-motion": "^12.9.4", @@ -62,7 +63,7 @@ "@astrojs/check": ["@astrojs/check@0.9.4", "", { "dependencies": { "@astrojs/language-server": "^2.15.0", "chokidar": "^4.0.1", "kleur": "^4.1.5", "yargs": "^17.7.2" }, "peerDependencies": { "typescript": "^5.0.0" }, "bin": { "astro-check": "dist/bin.js" } }, "sha512-IOheHwCtpUfvogHHsvu0AbeRZEnjJg3MopdLddkJE70mULItS/Vh37BHcI00mcOJcH1vhD3odbpvWokpxam7xA=="], - "@astrojs/compiler": ["@astrojs/compiler@2.12.2", "", {}, "sha512-w2zfvhjNCkNMmMMOn5b0J8+OmUaBL1o40ipMvqcG6NRpdC+lKxmTi48DT8Xw0SzJ3AfmeFLB45zXZXtmbsjcgw=="], + "@astrojs/compiler": 
["@astrojs/compiler@2.13.0", "", {}, "sha512-mqVORhUJViA28fwHYaWmsXSzLO9osbdZ5ImUfxBarqsYdMlPbqAqGJCxsNzvppp1BEzc1mJNjOVvQqeDN8Vspw=="], "@astrojs/internal-helpers": ["@astrojs/internal-helpers@0.7.2", "", {}, "sha512-KCkCqR3Goym79soqEtbtLzJfqhTWMyVaizUi35FLzgGSzBotSw8DB1qwsu7U96ihOJgYhDk2nVPz+3LnXPeX6g=="], @@ -104,13 +105,13 @@ "@babel/helper-string-parser": ["@babel/helper-string-parser@7.27.1", "", {}, "sha512-qMlSxKbpRlAridDExk92nSobyDdpPijUq2DW6oDnUqd0iOGxmQjyqhMIihI9+zv4LPyZdRje2cavWPbCbWm3eA=="], - "@babel/helper-validator-identifier": ["@babel/helper-validator-identifier@7.27.1", "", {}, "sha512-D2hP9eA+Sqx1kBZgzxZh0y1trbuU+JoDkiEwqhQ36nodYqJwyEIhPSdMNd7lOm/4io72luTPWH20Yda0xOuUow=="], + "@babel/helper-validator-identifier": ["@babel/helper-validator-identifier@7.28.5", "", {}, "sha512-qSs4ifwzKJSV39ucNjsvc6WVHs6b7S03sOh2OcHF9UHfVPqWWALUsNUVzhSBiItjRZoLHx7nIarVjqKVusUZ1Q=="], "@babel/helper-validator-option": ["@babel/helper-validator-option@7.27.1", "", {}, "sha512-YvjJow9FxbhFFKDSuFnVCe2WxXk1zWc22fFePVNEaWJEu8IrZVlda6N0uHwzZrUM1il7NC9Mlp4MaJYbYd9JSg=="], "@babel/helpers": ["@babel/helpers@7.27.1", "", { "dependencies": { "@babel/template": "^7.27.1", "@babel/types": "^7.27.1" } }, "sha512-FCvFTm0sWV8Fxhpp2McP5/W53GPllQ9QeQ7SiqGWjMf/LVG07lFa5+pgK05IRhVwtvafT22KF+ZSnM9I545CvQ=="], - "@babel/parser": ["@babel/parser@7.27.1", "", { "dependencies": { "@babel/types": "^7.27.1" }, "bin": "./bin/babel-parser.js" }, "sha512-I0dZ3ZpCrJ1c04OqlNsQcKiZlsrXf/kkE4FXzID9rIOYICsAbA8mMDzhW/luRNAHdCNt7os/u8wenklZDlUVUQ=="], + "@babel/parser": ["@babel/parser@7.28.5", "", { "dependencies": { "@babel/types": "^7.28.5" }, "bin": "./bin/babel-parser.js" }, "sha512-KKBU1VGYR7ORr3At5HAtUQ+TV3SzRCXmA/8OdDZiLDBIZxVyzXuztPjfLd3BV1PRAQGCMWWSHYhL0F8d5uHBDQ=="], "@babel/plugin-transform-react-jsx-self": ["@babel/plugin-transform-react-jsx-self@7.27.1", "", { "dependencies": { "@babel/helper-plugin-utils": "^7.27.1" }, "peerDependencies": { "@babel/core": "^7.0.0-0" } }, 
"sha512-6UzkCs+ejGdZ5mFFC/OCUrv028ab2fp1znZmCZjAOBKiBK2jXD1O+BPSfX8X2qjJ75fZBMSnQn3Rq2mrBJK2mw=="], @@ -122,7 +123,7 @@ "@babel/traverse": ["@babel/traverse@7.27.1", "", { "dependencies": { "@babel/code-frame": "^7.27.1", "@babel/generator": "^7.27.1", "@babel/parser": "^7.27.1", "@babel/template": "^7.27.1", "@babel/types": "^7.27.1", "debug": "^4.3.1", "globals": "^11.1.0" } }, "sha512-ZCYtZciz1IWJB4U61UPu4KEaqyfj+r5T1Q5mqPo+IBpcG9kHv30Z0aD8LXPgC1trYa6rK0orRyAhqUgk4MjmEg=="], - "@babel/types": ["@babel/types@7.27.1", "", { "dependencies": { "@babel/helper-string-parser": "^7.27.1", "@babel/helper-validator-identifier": "^7.27.1" } }, "sha512-+EzkxvLNfiUeKMgy/3luqfsCWFRXLb7U6wNQTk60tovuckwB15B191tJWvpp4HjiQWdJkCxO3Wbvc6jlk3Xb2Q=="], + "@babel/types": ["@babel/types@7.28.5", "", { "dependencies": { "@babel/helper-string-parser": "^7.27.1", "@babel/helper-validator-identifier": "^7.28.5" } }, "sha512-qQ5m48eI/MFLQ5PxQj4PFaprjyCTLI37ElWMmNs0K8Lk3dVeOdNpB3ks8jc7yM5CDmVC73eMVk/trk3fgmrUpA=="], "@bazel/bazelisk": ["@bazel/bazelisk@1.26.0", "", { "bin": { "bazelisk": "bazelisk.js", "bazel": "bazelisk.js" } }, "sha512-bTNcHdGyEQ9r7SczEYUa0gkEQhJo1ld2BjXI8fWBvsUeoHi03QpUs2HZgDbjjrpQFQqG2ZbO7ihZvH8MjhUTHw=="], @@ -158,7 +159,7 @@ "@builder.io/qwik": ["@builder.io/qwik@1.13.0", "", { "dependencies": { "csstype": "^3.1" }, "peerDependencies": { "vite": "^5" }, "bin": { "qwik": "qwik-cli.cjs" } }, "sha512-dElfs3V91h+x12ftGWzAKO0pbO36kohfDd9ukr+YFSb/CP66WnTgjTTXJjlzkmFw18O9Bh9ObjqShpkEz02+Kg=="], - "@capsizecss/unpack": ["@capsizecss/unpack@3.0.0", "", { "dependencies": { "fontkit": "^2.0.2" } }, "sha512-+ntATQe1AlL7nTOYjwjj6w3299CgRot48wL761TUGYpYgAou3AaONZazp0PKZyCyWhudWsjhq1nvRHOvbMzhTA=="], + "@capsizecss/unpack": ["@capsizecss/unpack@3.0.1", "", { "dependencies": { "fontkit": "^2.0.2" } }, "sha512-8XqW8xGn++Eqqbz3e9wKuK7mxryeRjs4LOHLxbh2lwKeSbuNR4NFifDZT4KzvjU6HMOPbiNTsWpniK5EJfTWkg=="], "@chevrotain/cst-dts-gen": ["@chevrotain/cst-dts-gen@11.0.3", "", { "dependencies": { 
"@chevrotain/gast": "11.0.3", "@chevrotain/types": "11.0.3", "lodash-es": "4.17.21" } }, "sha512-BvIKpRLeS/8UbfxXxgC33xOumsacaeCKAjAeLyOn7Pcp95HiRbrpl14S+9vaZLolnbssPIUuiUd8IvgkRyt6NQ=="], @@ -700,7 +701,7 @@ "astring": ["astring@1.9.0", "", { "bin": { "astring": "bin/astring" } }, "sha512-LElXdjswlqjWrPpJFg1Fx4wpkOCxj1TDHlSV4PlaRxHGWko024xICaa97ZkMfs6DRKlCguiAI+rbXv5GWwXIkg=="], - "astro": ["astro@5.14.3", "", { "dependencies": { "@astrojs/compiler": "^2.12.2", "@astrojs/internal-helpers": "0.7.4", "@astrojs/markdown-remark": "6.3.8", "@astrojs/telemetry": "3.3.0", "@capsizecss/unpack": "^3.0.0", "@oslojs/encoding": "^1.1.0", "@rollup/pluginutils": "^5.2.0", "acorn": "^8.15.0", "aria-query": "^5.3.2", "axobject-query": "^4.1.0", "boxen": "8.0.1", "ci-info": "^4.3.0", "clsx": "^2.1.1", "common-ancestor-path": "^1.0.1", "cookie": "^1.0.2", "cssesc": "^3.0.0", "debug": "^4.4.1", "deterministic-object-hash": "^2.0.2", "devalue": "^5.3.2", "diff": "^5.2.0", "dlv": "^1.1.3", "dset": "^3.1.4", "es-module-lexer": "^1.7.0", "esbuild": "^0.25.0", "estree-walker": "^3.0.3", "flattie": "^1.1.1", "fontace": "~0.3.0", "github-slugger": "^2.0.0", "html-escaper": "3.0.3", "http-cache-semantics": "^4.2.0", "import-meta-resolve": "^4.2.0", "js-yaml": "^4.1.0", "kleur": "^4.1.5", "magic-string": "^0.30.18", "magicast": "^0.3.5", "mrmime": "^2.0.1", "neotraverse": "^0.6.18", "p-limit": "^6.2.0", "p-queue": "^8.1.0", "package-manager-detector": "^1.3.0", "picomatch": "^4.0.3", "prompts": "^2.4.2", "rehype": "^13.0.2", "semver": "^7.7.2", "shiki": "^3.12.0", "smol-toml": "^1.4.2", "tinyexec": "^1.0.1", "tinyglobby": "^0.2.14", "tsconfck": "^3.1.6", "ultrahtml": "^1.6.0", "unifont": "~0.6.0", "unist-util-visit": "^5.0.0", "unstorage": "^1.17.0", "vfile": "^6.0.3", "vite": "^6.3.6", "vitefu": "^1.1.1", "xxhash-wasm": "^1.1.0", "yargs-parser": "^21.1.1", "yocto-spinner": "^0.2.3", "zod": "^3.25.76", "zod-to-json-schema": "^3.24.6", "zod-to-ts": "^1.2.0" }, "optionalDependencies": { 
"sharp": "^0.34.0" }, "bin": { "astro": "astro.js" } }, "sha512-iRvl3eEYYdSYA195eNREjh43hqMMwKY1uoHYiKfLCB9G+bjFtaBtDe8R0ip7AbTD69wyOKgUCOtMad+lkOnT/w=="], + "astro": ["astro@5.15.9", "", { "dependencies": { "@astrojs/compiler": "^2.13.0", "@astrojs/internal-helpers": "0.7.5", "@astrojs/markdown-remark": "6.3.9", "@astrojs/telemetry": "3.3.0", "@capsizecss/unpack": "^3.0.1", "@oslojs/encoding": "^1.1.0", "@rollup/pluginutils": "^5.3.0", "acorn": "^8.15.0", "aria-query": "^5.3.2", "axobject-query": "^4.1.0", "boxen": "8.0.1", "ci-info": "^4.3.1", "clsx": "^2.1.1", "common-ancestor-path": "^1.0.1", "cookie": "^1.0.2", "cssesc": "^3.0.0", "debug": "^4.4.3", "deterministic-object-hash": "^2.0.2", "devalue": "^5.5.0", "diff": "^5.2.0", "dlv": "^1.1.3", "dset": "^3.1.4", "es-module-lexer": "^1.7.0", "esbuild": "^0.25.0", "estree-walker": "^3.0.3", "flattie": "^1.1.1", "fontace": "~0.3.1", "github-slugger": "^2.0.0", "html-escaper": "3.0.3", "http-cache-semantics": "^4.2.0", "import-meta-resolve": "^4.2.0", "js-yaml": "^4.1.1", "magic-string": "^0.30.21", "magicast": "^0.5.1", "mrmime": "^2.0.1", "neotraverse": "^0.6.18", "p-limit": "^6.2.0", "p-queue": "^8.1.1", "package-manager-detector": "^1.5.0", "picocolors": "^1.1.1", "picomatch": "^4.0.3", "prompts": "^2.4.2", "rehype": "^13.0.2", "semver": "^7.7.3", "shiki": "^3.15.0", "smol-toml": "^1.5.0", "tinyexec": "^1.0.2", "tinyglobby": "^0.2.15", "tsconfck": "^3.1.6", "ultrahtml": "^1.6.0", "unifont": "~0.6.0", "unist-util-visit": "^5.0.0", "unstorage": "^1.17.2", "vfile": "^6.0.3", "vite": "^6.4.1", "vitefu": "^1.1.1", "xxhash-wasm": "^1.1.0", "yargs-parser": "^21.1.1", "yocto-spinner": "^0.2.3", "zod": "^3.25.76", "zod-to-json-schema": "^3.24.6", "zod-to-ts": "^1.2.0" }, "optionalDependencies": { "sharp": "^0.34.0" }, "bin": { "astro": "astro.js" } }, "sha512-XLDXxu0282cC/oYHswWZm3johGlRvk9rLRS7pWVWSne+HsZe9JgrpHI+vewAJSSNHBGd1aCyaQOElT5RNGe7IQ=="], "astro-expressive-code": ["astro-expressive-code@0.41.2", "", { 
"dependencies": { "rehype-expressive-code": "^0.41.2" }, "peerDependencies": { "astro": "^4.0.0-beta || ^5.0.0-beta || ^3.3.0" } }, "sha512-HN0jWTnhr7mIV/2e6uu4PPRNNo/k4UEgTLZqbp3MrHU+caCARveG2yZxaZVBmxyiVdYqW5Pd3u3n2zjnshixbw=="], @@ -974,7 +975,7 @@ "deterministic-object-hash": ["deterministic-object-hash@2.0.2", "", { "dependencies": { "base-64": "^1.0.0" } }, "sha512-KxektNH63SrbfUyDiwXqRb1rLwKt33AmMv+5Nhsw1kqZ13SJBRTgZHtGbE+hH3a1mVW1cz+4pqSWVPAtLVXTzQ=="], - "devalue": ["devalue@5.3.2", "", {}, "sha512-UDsjUbpQn9kvm68slnrs+mfxwFkIflOhkanmyabZ8zOYk8SMEIbJ3TK+88g70hSIeytu4y18f0z/hYHMTrXIWw=="], + "devalue": ["devalue@5.5.0", "", {}, "sha512-69sM5yrHfFLJt0AZ9QqZXGCPfJ7fQjvpln3Rq5+PS03LD32Ost1Q9N+eEnaQwGRIriKkMImXD56ocjQmfjbV3w=="], "devlop": ["devlop@1.1.0", "", { "dependencies": { "dequal": "^2.0.0" } }, "sha512-RWmIqhcFf1lRYBvNmr7qTNuyCt/7/ns2jbpp1+PalgE/rDQcBT0fioSMUpJ93irlUhC5hrg4cYqe6U+0ImW0rA=="], @@ -1110,7 +1111,7 @@ "flattie": ["flattie@1.1.1", "", {}, "sha512-9UbaD6XdAL97+k/n+N7JwX46K/M6Zc6KcFYskrYL8wbBV/Uyk0CTAMY0VT+qiK5PM7AIc9aTWYtq65U7T+aCNQ=="], - "fontace": ["fontace@0.3.0", "", { "dependencies": { "@types/fontkit": "^2.0.8", "fontkit": "^2.0.4" } }, "sha512-czoqATrcnxgWb/nAkfyIrRp6Q8biYj7nGnL6zfhTcX+JKKpWHFBnb8uNMw/kZr7u++3Y3wYSYoZgHkCcsuBpBg=="], + "fontace": ["fontace@0.3.1", "", { "dependencies": { "@types/fontkit": "^2.0.8", "fontkit": "^2.0.4" } }, "sha512-9f5g4feWT1jWT8+SbL85aLIRLIXUaDygaM2xPXRmzPYxrOMNok79Lr3FGJoKVNKibE0WCunNiEVG2mwuE+2qEg=="], "fontkit": ["fontkit@2.0.4", "", { "dependencies": { "@swc/helpers": "^0.5.12", "brotli": "^1.3.2", "clone": "^2.1.2", "dfa": "^1.2.0", "fast-deep-equal": "^3.1.3", "restructure": "^3.0.0", "tiny-inflate": "^1.0.3", "unicode-properties": "^1.4.0", "unicode-trie": "^2.0.0" } }, "sha512-syetQadaUEDNdxdugga9CpEYVaQIxOwk7GlwZWWZ19//qW4zE5bknOKeMBDYAASwnpaSHKJITRLMF9m1fp3s6g=="], @@ -1378,9 +1379,9 @@ "maath": ["maath@0.10.8", "", { "peerDependencies": { "@types/three": ">=0.134.0", "three": ">=0.134.0" } 
}, "sha512-tRvbDF0Pgqz+9XUa4jjfgAQ8/aPKmQdWXilFu2tMy4GWj4NOsx99HlULO4IeREfbO3a0sA145DZYyvXPkybm0g=="], - "magic-string": ["magic-string@0.30.19", "", { "dependencies": { "@jridgewell/sourcemap-codec": "^1.5.5" } }, "sha512-2N21sPY9Ws53PZvsEpVtNuSW+ScYbQdp4b9qUaL+9QkHUrGFKo56Lg9Emg5s9V/qrtNBmiR01sYhUOwu3H+VOw=="], + "magic-string": ["magic-string@0.30.21", "", { "dependencies": { "@jridgewell/sourcemap-codec": "^1.5.5" } }, "sha512-vd2F4YUyEXKGcLHoq+TEyCjxueSeHnFxyyjNp80yg0XV4vUhnDer/lvvlqM/arB5bXQN5K2/3oinyCRyx8T2CQ=="], - "magicast": ["magicast@0.3.5", "", { "dependencies": { "@babel/parser": "^7.25.4", "@babel/types": "^7.25.4", "source-map-js": "^1.2.0" } }, "sha512-L0WhttDl+2BOsybvEOLK7fW3UA0OQ0IQ2d6Zl2x/a6vVRs3bAY0ECOSHHeL5jD+SbOpOCUEi0y1DgHEn9Qn1AQ=="], + "magicast": ["magicast@0.5.1", "", { "dependencies": { "@babel/parser": "^7.28.5", "@babel/types": "^7.28.5", "source-map-js": "^1.2.1" } }, "sha512-xrHS24IxaLrvuo613F719wvOIv9xPHFWQHuvGUBmPnCA/3MQxKI3b+r7n1jAoDHmsbC5bRhTZYR77invLAxVnw=="], "make-dir": ["make-dir@3.1.0", "", { "dependencies": { "semver": "^6.0.0" } }, "sha512-g3FeP20LNwhALb/6Cz6Dd4F2ngze0jz7tbzrD2wAV+o9FeNHe4rL+yK2md0J/fiSf1sa1ADhXqi5+oVwOM/eGw=="], @@ -1608,7 +1609,7 @@ "p-locate": ["p-locate@4.1.0", "", { "dependencies": { "p-limit": "^2.2.0" } }, "sha512-R79ZZ/0wAxKGu3oYMlz8jy/kbhsNrS7SKZ7PxEHBgJ5+F2mtFW2fK2cOtBh1cHYkQsbzFV7I+EoRKe6Yt0oK7A=="], - "p-queue": ["p-queue@8.1.0", "", { "dependencies": { "eventemitter3": "^5.0.1", "p-timeout": "^6.1.2" } }, "sha512-mxLDbbGIBEXTJL0zEx8JIylaj3xQ7Z/7eEVjcF9fJX4DBiH9oqe+oahYnlKKxm0Ci9TlWTyhSHgygxMxjIB2jw=="], + "p-queue": ["p-queue@8.1.1", "", { "dependencies": { "eventemitter3": "^5.0.1", "p-timeout": "^6.1.2" } }, "sha512-aNZ+VfjobsWryoiPnEApGGmf5WmNsCo9xu8dfaYamG5qaLP7ClhLN6NgsFe6SwJ2UbLEBK5dv9x8Mn5+RVhMWQ=="], "p-timeout": ["p-timeout@6.1.4", "", {}, "sha512-MyIV3ZA/PmyBN/ud8vV9XzwTrNtR4jFrObymZYnZqMmW0zA8Z17vnT0rBgFE/TlohB+YCHqXMgZzb3Csp49vqg=="], @@ -1618,7 +1619,7 @@ "pac-resolver": 
["pac-resolver@7.0.1", "", { "dependencies": { "degenerator": "^5.0.0", "netmask": "^2.0.2" } }, "sha512-5NPgf87AT2STgwa2ntRMr45jTKrYBGkVU36yT0ig/n/GMAa3oPqhZfIQ2kMEimReg0+t9kZViDVZ83qfVUlckg=="], - "package-manager-detector": ["package-manager-detector@1.3.0", "", {}, "sha512-ZsEbbZORsyHuO00lY1kV3/t72yp6Ysay6Pd17ZAlNGuGwmWDLCJxFpRs0IzfXfj1o4icJOkUEioexFHzyPurSQ=="], + "package-manager-detector": ["package-manager-detector@1.5.0", "", {}, "sha512-uBj69dVlYe/+wxj8JOpr97XfsxH/eumMt6HqjNTmJDf/6NO9s+0uxeOneIz3AsPt2m6y9PqzDzd3ATcU17MNfw=="], "pagefind": ["pagefind@1.3.0", "", { "optionalDependencies": { "@pagefind/darwin-arm64": "1.3.0", "@pagefind/darwin-x64": "1.3.0", "@pagefind/linux-arm64": "1.3.0", "@pagefind/linux-x64": "1.3.0", "@pagefind/windows-x64": "1.3.0" }, "bin": { "pagefind": "lib/runner/bin.cjs" } }, "sha512-8KPLGT5g9s+olKMRTU9LFekLizkVIu9tes90O1/aigJ0T5LmyPqTzGJrETnSw3meSYg58YH7JTzhTTW/3z6VAw=="], @@ -1918,7 +1919,7 @@ "tiny-invariant": ["tiny-invariant@1.3.3", "", {}, "sha512-+FbBPE1o9QAYvviau/qC5SE3caw21q3xkvWKBtja5vgqOWIHHJ3ioaq1VPfn/Szqctz2bU/oYeKd9/z5BL+PVg=="], - "tinyexec": ["tinyexec@1.0.1", "", {}, "sha512-5uC6DDlmeqiOwCPmK9jMSdOuZTh8bU39Ys6yidB+UTt5hfZUPGAypSgFRiEp+jbi9qH40BLDvy85jIU88wKSqw=="], + "tinyexec": ["tinyexec@1.0.2", "", {}, "sha512-W/KYk+NFhkmsYpuHq5JykngiOCnxeVL8v8dFnqxSD8qEEdRfXk1SDM6JzNqcERbcGYj9tMrDQBYV9cjgnunFIg=="], "tinyglobby": ["tinyglobby@0.2.15", "", { "dependencies": { "fdir": "^6.5.0", "picomatch": "^4.0.3" } }, "sha512-j2Zq4NyQYG5XMST4cbs02Ak8iJUdxRM0XI5QyxXuZOzKOINmWurp3smXu3y5wDcJrptwpSjgXHzIQxR0omXljQ=="], @@ -2008,7 +2009,7 @@ "unpipe": ["unpipe@1.0.0", "", {}, "sha512-pjy2bYhSsufwWlKwPc+l3cN7+wuJlK6uz0YdJEOlQDbl6jo/YlPi4mb8agUkVC8BF7V8NuzeyPNqRksA3hztKQ=="], - "unstorage": ["unstorage@1.17.1", "", { "dependencies": { "anymatch": "^3.1.3", "chokidar": "^4.0.3", "destr": "^2.0.5", "h3": "^1.15.4", "lru-cache": "^10.4.3", "node-fetch-native": "^1.6.7", "ofetch": "^1.4.1", "ufo": "^1.6.1" }, "peerDependencies": { 
"@azure/app-configuration": "^1.8.0", "@azure/cosmos": "^4.2.0", "@azure/data-tables": "^13.3.0", "@azure/identity": "^4.6.0", "@azure/keyvault-secrets": "^4.9.0", "@azure/storage-blob": "^12.26.0", "@capacitor/preferences": "^6.0.3 || ^7.0.0", "@deno/kv": ">=0.9.0", "@netlify/blobs": "^6.5.0 || ^7.0.0 || ^8.1.0 || ^9.0.0 || ^10.0.0", "@planetscale/database": "^1.19.0", "@upstash/redis": "^1.34.3", "@vercel/blob": ">=0.27.1", "@vercel/functions": "^2.2.12 || ^3.0.0", "@vercel/kv": "^1.0.1", "aws4fetch": "^1.0.20", "db0": ">=0.2.1", "idb-keyval": "^6.2.1", "ioredis": "^5.4.2", "uploadthing": "^7.4.4" }, "optionalPeers": ["@azure/app-configuration", "@azure/cosmos", "@azure/data-tables", "@azure/identity", "@azure/keyvault-secrets", "@azure/storage-blob", "@capacitor/preferences", "@deno/kv", "@netlify/blobs", "@planetscale/database", "@upstash/redis", "@vercel/blob", "@vercel/functions", "@vercel/kv", "aws4fetch", "db0", "idb-keyval", "ioredis", "uploadthing"] }, "sha512-KKGwRTT0iVBCErKemkJCLs7JdxNVfqTPc/85ae1XES0+bsHbc/sFBfVi5kJp156cc51BHinIH2l3k0EZ24vOBQ=="], + "unstorage": ["unstorage@1.17.2", "", { "dependencies": { "anymatch": "^3.1.3", "chokidar": "^4.0.3", "destr": "^2.0.5", "h3": "^1.15.4", "lru-cache": "^10.4.3", "node-fetch-native": "^1.6.7", "ofetch": "^1.5.0", "ufo": "^1.6.1" }, "peerDependencies": { "@azure/app-configuration": "^1.8.0", "@azure/cosmos": "^4.2.0", "@azure/data-tables": "^13.3.0", "@azure/identity": "^4.6.0", "@azure/keyvault-secrets": "^4.9.0", "@azure/storage-blob": "^12.26.0", "@capacitor/preferences": "^6.0.3 || ^7.0.0", "@deno/kv": ">=0.9.0", "@netlify/blobs": "^6.5.0 || ^7.0.0 || ^8.1.0 || ^9.0.0 || ^10.0.0", "@planetscale/database": "^1.19.0", "@upstash/redis": "^1.34.3", "@vercel/blob": ">=0.27.1", "@vercel/functions": "^2.2.12 || ^3.0.0", "@vercel/kv": "^1.0.1", "aws4fetch": "^1.0.20", "db0": ">=0.2.1", "idb-keyval": "^6.2.1", "ioredis": "^5.4.2", "uploadthing": "^7.4.4" }, "optionalPeers": ["@azure/app-configuration", 
"@azure/cosmos", "@azure/data-tables", "@azure/identity", "@azure/keyvault-secrets", "@azure/storage-blob", "@capacitor/preferences", "@deno/kv", "@netlify/blobs", "@planetscale/database", "@upstash/redis", "@vercel/blob", "@vercel/functions", "@vercel/kv", "aws4fetch", "db0", "idb-keyval", "ioredis", "uploadthing"] }, "sha512-cKEsD6iBWJgOMJ6vW1ID/SYuqNf8oN4yqRk8OYqaVQ3nnkJXOT1PSpaMh2QfzLs78UN5kSNRD2c/mgjT8tX7+w=="], "update-browserslist-db": ["update-browserslist-db@1.1.3", "", { "dependencies": { "escalade": "^3.2.0", "picocolors": "^1.1.1" }, "peerDependencies": { "browserslist": ">= 4.21.0" }, "bin": { "update-browserslist-db": "cli.js" } }, "sha512-UxhIZQ+QInVdunkDAaiazvvT/+fXL5Osr0JZlJulepYu6Jd7qJtDZjlur0emRlT71EN3ScPoE7gvsuIKKNavKw=="], @@ -2150,12 +2151,36 @@ "@astrojs/telemetry/is-wsl": ["is-wsl@3.1.0", "", { "dependencies": { "is-inside-container": "^1.0.0" } }, "sha512-UcVfVfaK4Sc4m7X3dUSoHoozQGBEFeDC+zVo06t98xe8CzHSZZBekNXH+tu0NalHolcJ/QAGqS46Hef7QXBIMw=="], + "@babel/code-frame/@babel/helper-validator-identifier": ["@babel/helper-validator-identifier@7.27.1", "", {}, "sha512-D2hP9eA+Sqx1kBZgzxZh0y1trbuU+JoDkiEwqhQ36nodYqJwyEIhPSdMNd7lOm/4io72luTPWH20Yda0xOuUow=="], + + "@babel/core/@babel/parser": ["@babel/parser@7.27.1", "", { "dependencies": { "@babel/types": "^7.27.1" }, "bin": "./bin/babel-parser.js" }, "sha512-I0dZ3ZpCrJ1c04OqlNsQcKiZlsrXf/kkE4FXzID9rIOYICsAbA8mMDzhW/luRNAHdCNt7os/u8wenklZDlUVUQ=="], + + "@babel/core/@babel/types": ["@babel/types@7.27.1", "", { "dependencies": { "@babel/helper-string-parser": "^7.27.1", "@babel/helper-validator-identifier": "^7.27.1" } }, "sha512-+EzkxvLNfiUeKMgy/3luqfsCWFRXLb7U6wNQTk60tovuckwB15B191tJWvpp4HjiQWdJkCxO3Wbvc6jlk3Xb2Q=="], + "@babel/core/semver": ["semver@6.3.1", "", { "bin": { "semver": "bin/semver.js" } }, "sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA=="], + "@babel/generator/@babel/parser": ["@babel/parser@7.27.1", "", { "dependencies": { 
"@babel/types": "^7.27.1" }, "bin": "./bin/babel-parser.js" }, "sha512-I0dZ3ZpCrJ1c04OqlNsQcKiZlsrXf/kkE4FXzID9rIOYICsAbA8mMDzhW/luRNAHdCNt7os/u8wenklZDlUVUQ=="], + + "@babel/generator/@babel/types": ["@babel/types@7.27.1", "", { "dependencies": { "@babel/helper-string-parser": "^7.27.1", "@babel/helper-validator-identifier": "^7.27.1" } }, "sha512-+EzkxvLNfiUeKMgy/3luqfsCWFRXLb7U6wNQTk60tovuckwB15B191tJWvpp4HjiQWdJkCxO3Wbvc6jlk3Xb2Q=="], + "@babel/helper-compilation-targets/lru-cache": ["lru-cache@5.1.1", "", { "dependencies": { "yallist": "^3.0.2" } }, "sha512-KpNARQA3Iwv+jTA0utUVVbrh+Jlrr1Fv0e56GGzAFOXN7dk/FviaDW8LHmK52DlcH4WP2n6gI8vN1aesBFgo9w=="], "@babel/helper-compilation-targets/semver": ["semver@6.3.1", "", { "bin": { "semver": "bin/semver.js" } }, "sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA=="], + "@babel/helper-module-imports/@babel/types": ["@babel/types@7.27.1", "", { "dependencies": { "@babel/helper-string-parser": "^7.27.1", "@babel/helper-validator-identifier": "^7.27.1" } }, "sha512-+EzkxvLNfiUeKMgy/3luqfsCWFRXLb7U6wNQTk60tovuckwB15B191tJWvpp4HjiQWdJkCxO3Wbvc6jlk3Xb2Q=="], + + "@babel/helper-module-transforms/@babel/helper-validator-identifier": ["@babel/helper-validator-identifier@7.27.1", "", {}, "sha512-D2hP9eA+Sqx1kBZgzxZh0y1trbuU+JoDkiEwqhQ36nodYqJwyEIhPSdMNd7lOm/4io72luTPWH20Yda0xOuUow=="], + + "@babel/helpers/@babel/types": ["@babel/types@7.27.1", "", { "dependencies": { "@babel/helper-string-parser": "^7.27.1", "@babel/helper-validator-identifier": "^7.27.1" } }, "sha512-+EzkxvLNfiUeKMgy/3luqfsCWFRXLb7U6wNQTk60tovuckwB15B191tJWvpp4HjiQWdJkCxO3Wbvc6jlk3Xb2Q=="], + + "@babel/template/@babel/parser": ["@babel/parser@7.27.1", "", { "dependencies": { "@babel/types": "^7.27.1" }, "bin": "./bin/babel-parser.js" }, "sha512-I0dZ3ZpCrJ1c04OqlNsQcKiZlsrXf/kkE4FXzID9rIOYICsAbA8mMDzhW/luRNAHdCNt7os/u8wenklZDlUVUQ=="], + + "@babel/template/@babel/types": ["@babel/types@7.27.1", "", { "dependencies": { 
"@babel/helper-string-parser": "^7.27.1", "@babel/helper-validator-identifier": "^7.27.1" } }, "sha512-+EzkxvLNfiUeKMgy/3luqfsCWFRXLb7U6wNQTk60tovuckwB15B191tJWvpp4HjiQWdJkCxO3Wbvc6jlk3Xb2Q=="], + + "@babel/traverse/@babel/parser": ["@babel/parser@7.27.1", "", { "dependencies": { "@babel/types": "^7.27.1" }, "bin": "./bin/babel-parser.js" }, "sha512-I0dZ3ZpCrJ1c04OqlNsQcKiZlsrXf/kkE4FXzID9rIOYICsAbA8mMDzhW/luRNAHdCNt7os/u8wenklZDlUVUQ=="], + + "@babel/traverse/@babel/types": ["@babel/types@7.27.1", "", { "dependencies": { "@babel/helper-string-parser": "^7.27.1", "@babel/helper-validator-identifier": "^7.27.1" } }, "sha512-+EzkxvLNfiUeKMgy/3luqfsCWFRXLb7U6wNQTk60tovuckwB15B191tJWvpp4HjiQWdJkCxO3Wbvc6jlk3Xb2Q=="], + "@iconify/utils/globals": ["globals@15.15.0", "", {}, "sha512-7ACyT3wmyp3I61S4fG682L0VA2RGD9otkqGJIwNUMF1SWUombIIk+af1unuDYgMm082aHYwD+mzJvv9Iu8dsgg=="], "@lhci/cli/yargs": ["yargs@15.4.1", "", { "dependencies": { "cliui": "^6.0.0", "decamelize": "^1.2.0", "find-up": "^4.1.0", "get-caller-file": "^2.0.1", "require-directory": "^2.1.1", "require-main-filename": "^2.0.0", "set-blocking": "^2.0.0", "string-width": "^4.2.0", "which-module": "^2.0.0", "y18n": "^4.0.0", "yargs-parser": "^18.1.2" } }, "sha512-aePbxDmcYW++PaqBsJ+HYUFwCdv4LVvdnhBy78E57PIor8/OVvhMrADFFEDh8DHDFRv/O9i3lPhsENjO7QX0+A=="], @@ -2196,6 +2221,18 @@ "@tailwindcss/oxide-wasm32-wasi/tslib": ["tslib@2.8.1", "", { "bundled": true }, "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w=="], + "@types/babel__core/@babel/parser": ["@babel/parser@7.27.1", "", { "dependencies": { "@babel/types": "^7.27.1" }, "bin": "./bin/babel-parser.js" }, "sha512-I0dZ3ZpCrJ1c04OqlNsQcKiZlsrXf/kkE4FXzID9rIOYICsAbA8mMDzhW/luRNAHdCNt7os/u8wenklZDlUVUQ=="], + + "@types/babel__core/@babel/types": ["@babel/types@7.27.1", "", { "dependencies": { "@babel/helper-string-parser": "^7.27.1", "@babel/helper-validator-identifier": "^7.27.1" } }, 
"sha512-+EzkxvLNfiUeKMgy/3luqfsCWFRXLb7U6wNQTk60tovuckwB15B191tJWvpp4HjiQWdJkCxO3Wbvc6jlk3Xb2Q=="], + + "@types/babel__generator/@babel/types": ["@babel/types@7.27.1", "", { "dependencies": { "@babel/helper-string-parser": "^7.27.1", "@babel/helper-validator-identifier": "^7.27.1" } }, "sha512-+EzkxvLNfiUeKMgy/3luqfsCWFRXLb7U6wNQTk60tovuckwB15B191tJWvpp4HjiQWdJkCxO3Wbvc6jlk3Xb2Q=="], + + "@types/babel__template/@babel/parser": ["@babel/parser@7.27.1", "", { "dependencies": { "@babel/types": "^7.27.1" }, "bin": "./bin/babel-parser.js" }, "sha512-I0dZ3ZpCrJ1c04OqlNsQcKiZlsrXf/kkE4FXzID9rIOYICsAbA8mMDzhW/luRNAHdCNt7os/u8wenklZDlUVUQ=="], + + "@types/babel__template/@babel/types": ["@babel/types@7.27.1", "", { "dependencies": { "@babel/helper-string-parser": "^7.27.1", "@babel/helper-validator-identifier": "^7.27.1" } }, "sha512-+EzkxvLNfiUeKMgy/3luqfsCWFRXLb7U6wNQTk60tovuckwB15B191tJWvpp4HjiQWdJkCxO3Wbvc6jlk3Xb2Q=="], + + "@types/babel__traverse/@babel/types": ["@babel/types@7.27.1", "", { "dependencies": { "@babel/helper-string-parser": "^7.27.1", "@babel/helper-validator-identifier": "^7.27.1" } }, "sha512-+EzkxvLNfiUeKMgy/3luqfsCWFRXLb7U6wNQTk60tovuckwB15B191tJWvpp4HjiQWdJkCxO3Wbvc6jlk3Xb2Q=="], + "@types/fontkit/@types/node": ["@types/node@22.15.12", "", { "dependencies": { "undici-types": "~6.21.0" } }, "sha512-K0fpC/ZVeb8G9rm7bH7vI0KAec4XHEhBam616nVJCV51bKzJ6oA3luG4WdKoaztxe70QaNjS/xBmcDLmr4PiGw=="], "@types/sax/@types/node": ["@types/node@22.15.12", "", { "dependencies": { "undici-types": "~6.21.0" } }, "sha512-K0fpC/ZVeb8G9rm7bH7vI0KAec4XHEhBam616nVJCV51bKzJ6oA3luG4WdKoaztxe70QaNjS/xBmcDLmr4PiGw=="], @@ -2208,9 +2245,9 @@ "anymatch/picomatch": ["picomatch@2.3.1", "", {}, "sha512-JU3teHTNjmE2VCGFzuY8EXzCDVwEqB2a8fsIvwaStHhAWJEeVd1o1QD80CU6+ZdEXXSLbSsuLwJjkCBWqRQUVA=="], - "astro/@astrojs/internal-helpers": ["@astrojs/internal-helpers@0.7.4", "", {}, "sha512-lDA9MqE8WGi7T/t2BMi+EAXhs4Vcvr94Gqx3q15cFEz8oFZMO4/SFBqYr/UcmNlvW+35alowkVj+w9VhLvs5Cw=="], + 
"astro/@astrojs/internal-helpers": ["@astrojs/internal-helpers@0.7.5", "", {}, "sha512-vreGnYSSKhAjFJCWAwe/CNhONvoc5lokxtRoZims+0wa3KbHBdPHSSthJsKxPd8d/aic6lWKpRTYGY/hsgK6EA=="], - "astro/@astrojs/markdown-remark": ["@astrojs/markdown-remark@6.3.8", "", { "dependencies": { "@astrojs/internal-helpers": "0.7.4", "@astrojs/prism": "3.3.0", "github-slugger": "^2.0.0", "hast-util-from-html": "^2.0.3", "hast-util-to-text": "^4.0.2", "import-meta-resolve": "^4.2.0", "js-yaml": "^4.1.0", "mdast-util-definitions": "^6.0.0", "rehype-raw": "^7.0.0", "rehype-stringify": "^10.0.1", "remark-gfm": "^4.0.1", "remark-parse": "^11.0.0", "remark-rehype": "^11.1.2", "remark-smartypants": "^3.0.2", "shiki": "^3.13.0", "smol-toml": "^1.4.2", "unified": "^11.0.5", "unist-util-remove-position": "^5.0.0", "unist-util-visit": "^5.0.0", "unist-util-visit-parents": "^6.0.1", "vfile": "^6.0.3" } }, "sha512-uFNyFWadnULWK2cOw4n0hLKeu+xaVWeuECdP10cQ3K2fkybtTlhb7J7TcScdjmS8Yps7oje9S/ehYMfZrhrgCg=="], + "astro/@astrojs/markdown-remark": ["@astrojs/markdown-remark@6.3.9", "", { "dependencies": { "@astrojs/internal-helpers": "0.7.5", "@astrojs/prism": "3.3.0", "github-slugger": "^2.0.0", "hast-util-from-html": "^2.0.3", "hast-util-to-text": "^4.0.2", "import-meta-resolve": "^4.2.0", "js-yaml": "^4.1.0", "mdast-util-definitions": "^6.0.0", "rehype-raw": "^7.0.0", "rehype-stringify": "^10.0.1", "remark-gfm": "^4.0.1", "remark-parse": "^11.0.0", "remark-rehype": "^11.1.2", "remark-smartypants": "^3.0.2", "shiki": "^3.13.0", "smol-toml": "^1.4.2", "unified": "^11.0.5", "unist-util-remove-position": "^5.0.0", "unist-util-visit": "^5.0.0", "unist-util-visit-parents": "^6.0.2", "vfile": "^6.0.3" } }, "sha512-hX2cLC/KW74Io1zIbn92kI482j9J7LleBLGCVU9EP3BeH5MVrnFawOnqD0t/q6D1Z+ZNeQG2gNKMslCcO36wng=="], "astro/debug": ["debug@4.4.3", "", { "dependencies": { "ms": "^2.1.3" } }, "sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA=="], @@ -2218,11 +2255,13 @@ 
"astro/import-meta-resolve": ["import-meta-resolve@4.2.0", "", {}, "sha512-Iqv2fzaTQN28s/FwZAoFq0ZSs/7hMAHJVX+w8PZl3cY19Pxk6jFFalxQoIfW2826i/fDLXv8IiEZRIT0lDuWcg=="], - "astro/shiki": ["shiki@3.13.0", "", { "dependencies": { "@shikijs/core": "3.13.0", "@shikijs/engine-javascript": "3.13.0", "@shikijs/engine-oniguruma": "3.13.0", "@shikijs/langs": "3.13.0", "@shikijs/themes": "3.13.0", "@shikijs/types": "3.13.0", "@shikijs/vscode-textmate": "^10.0.2", "@types/hast": "^3.0.4" } }, "sha512-aZW4l8Og16CokuCLf8CF8kq+KK2yOygapU5m3+hoGw0Mdosc6fPitjM+ujYarppj5ZIKGyPDPP1vqmQhr+5/0g=="], + "astro/js-yaml": ["js-yaml@4.1.1", "", { "dependencies": { "argparse": "^2.0.1" }, "bin": { "js-yaml": "bin/js-yaml.js" } }, "sha512-qQKT4zQxXl8lLwBtHMWwaTcGfFOZviOJet3Oy/xmGk2gZH677CJM9EvtfdSkgWcATZhj/55JZ0rmy3myCT5lsA=="], - "astro/smol-toml": ["smol-toml@1.4.2", "", {}, "sha512-rInDH6lCNiEyn3+hH8KVGFdbjc099j47+OSgbMrfDYX1CmXLfdKd7qi6IfcWj2wFxvSVkuI46M+wPGYfEOEj6g=="], + "astro/shiki": ["shiki@3.15.0", "", { "dependencies": { "@shikijs/core": "3.15.0", "@shikijs/engine-javascript": "3.15.0", "@shikijs/engine-oniguruma": "3.15.0", "@shikijs/langs": "3.15.0", "@shikijs/themes": "3.15.0", "@shikijs/types": "3.15.0", "@shikijs/vscode-textmate": "^10.0.2", "@types/hast": "^3.0.4" } }, "sha512-kLdkY6iV3dYbtPwS9KXU7mjfmDm25f5m0IPNFnaXO7TBPcvbUOY72PYXSuSqDzwp+vlH/d7MXpHlKO/x+QoLXw=="], - "astro/vite": ["vite@6.3.6", "", { "dependencies": { "esbuild": "^0.25.0", "fdir": "^6.4.4", "picomatch": "^4.0.2", "postcss": "^8.5.3", "rollup": "^4.34.9", "tinyglobby": "^0.2.13" }, "optionalDependencies": { "fsevents": "~2.3.3" }, "peerDependencies": { "@types/node": "^18.0.0 || ^20.0.0 || >=22.0.0", "jiti": ">=1.21.0", "less": "*", "lightningcss": "^1.21.0", "sass": "*", "sass-embedded": "*", "stylus": "*", "sugarss": "*", "terser": "^5.16.0", "tsx": "^4.8.1", "yaml": "^2.4.2" }, "optionalPeers": ["@types/node", "jiti", "less", "lightningcss", "sass", "sass-embedded", "stylus", "sugarss", "terser", "tsx", 
"yaml"], "bin": { "vite": "bin/vite.js" } }, "sha512-0msEVHJEScQbhkbVTb/4iHZdJ6SXp/AvxL2sjwYQFfBqleHtnCqv1J3sa9zbWz/6kW1m9Tfzn92vW+kZ1WV6QA=="], + "astro/smol-toml": ["smol-toml@1.5.2", "", {}, "sha512-QlaZEqcAH3/RtNyet1IPIYPsEWAaYyXXv1Krsi+1L/QHppjX4Ifm8MQsBISz9vE8cHicIq3clogsheili5vhaQ=="], + + "astro/vite": ["vite@6.4.1", "", { "dependencies": { "esbuild": "^0.25.0", "fdir": "^6.4.4", "picomatch": "^4.0.2", "postcss": "^8.5.3", "rollup": "^4.34.9", "tinyglobby": "^0.2.13" }, "optionalDependencies": { "fsevents": "~2.3.3" }, "peerDependencies": { "@types/node": "^18.0.0 || ^20.0.0 || >=22.0.0", "jiti": ">=1.21.0", "less": "*", "lightningcss": "^1.21.0", "sass": "*", "sass-embedded": "*", "stylus": "*", "sugarss": "*", "terser": "^5.16.0", "tsx": "^4.8.1", "yaml": "^2.4.2" }, "optionalPeers": ["@types/node", "jiti", "less", "lightningcss", "sass", "sass-embedded", "stylus", "sugarss", "terser", "tsx", "yaml"], "bin": { "vite": "bin/vite.js" } }, "sha512-+Oxm7q9hDoLMyJOYfUYBuHQo+dkAloi33apOPP56pzj+vsdJDzr+j1NISE5pyaAuKL4A3UD34qd0lx5+kfKp2g=="], "astro/yargs-parser": ["yargs-parser@21.1.1", "", {}, "sha512-tVpsJW7DdjecAiFpbIB1e3qxIQsE6NoPc5/eTdrbbIC4h0LVsWhnoa3g+m2HclBIujHzsxZ4VJVA+GUuc2/LBw=="], @@ -2364,6 +2403,8 @@ "unstorage/lru-cache": ["lru-cache@10.4.3", "", {}, "sha512-JNAzZcXrCt42VGLuYz0zfAzDfAvJWW6AfYlDBQyDV5DClI2m5sAmK+OIO7s59XfsRsWHp02jAJrRadPRGTt6SQ=="], + "unstorage/ofetch": ["ofetch@1.5.1", "", { "dependencies": { "destr": "^2.0.5", "node-fetch-native": "^1.6.7", "ufo": "^1.6.1" } }, "sha512-2W4oUZlVaqAPAil6FUg/difl6YhqhUR7x2eZY4bQCko22UXg3hptq9KLQdqFClV+Wu85UX7hNtdGTngi/1BxcA=="], + "vite/esbuild": ["esbuild@0.25.4", "", { "optionalDependencies": { "@esbuild/aix-ppc64": "0.25.4", "@esbuild/android-arm": "0.25.4", "@esbuild/android-arm64": "0.25.4", "@esbuild/android-x64": "0.25.4", "@esbuild/darwin-arm64": "0.25.4", "@esbuild/darwin-x64": "0.25.4", "@esbuild/freebsd-arm64": "0.25.4", "@esbuild/freebsd-x64": "0.25.4", "@esbuild/linux-arm": "0.25.4", 
"@esbuild/linux-arm64": "0.25.4", "@esbuild/linux-ia32": "0.25.4", "@esbuild/linux-loong64": "0.25.4", "@esbuild/linux-mips64el": "0.25.4", "@esbuild/linux-ppc64": "0.25.4", "@esbuild/linux-riscv64": "0.25.4", "@esbuild/linux-s390x": "0.25.4", "@esbuild/linux-x64": "0.25.4", "@esbuild/netbsd-arm64": "0.25.4", "@esbuild/netbsd-x64": "0.25.4", "@esbuild/openbsd-arm64": "0.25.4", "@esbuild/openbsd-x64": "0.25.4", "@esbuild/sunos-x64": "0.25.4", "@esbuild/win32-arm64": "0.25.4", "@esbuild/win32-ia32": "0.25.4", "@esbuild/win32-x64": "0.25.4" }, "bin": { "esbuild": "bin/esbuild" } }, "sha512-8pgjLUcUjcgDg+2Q4NYXnPbo/vncAY4UmyaCm0jZevERqCHZIaWwdJHkf8XQtu4AxSKCdvrUbT0XUr1IdZzI8Q=="], "vite/fsevents": ["fsevents@2.3.3", "", { "os": "darwin" }, "sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw=="], @@ -2400,6 +2441,18 @@ "@astrojs/starlight/@astrojs/markdown-remark/@astrojs/prism": ["@astrojs/prism@3.2.0", "", { "dependencies": { "prismjs": "^1.29.0" } }, "sha512-GilTHKGCW6HMq7y3BUv9Ac7GMe/MO9gi9GW62GzKtth0SwukCu/qp2wLiGpEujhY+VVhaG9v7kv/5vFzvf4NYw=="], + "@babel/core/@babel/types/@babel/helper-validator-identifier": ["@babel/helper-validator-identifier@7.27.1", "", {}, "sha512-D2hP9eA+Sqx1kBZgzxZh0y1trbuU+JoDkiEwqhQ36nodYqJwyEIhPSdMNd7lOm/4io72luTPWH20Yda0xOuUow=="], + + "@babel/generator/@babel/types/@babel/helper-validator-identifier": ["@babel/helper-validator-identifier@7.27.1", "", {}, "sha512-D2hP9eA+Sqx1kBZgzxZh0y1trbuU+JoDkiEwqhQ36nodYqJwyEIhPSdMNd7lOm/4io72luTPWH20Yda0xOuUow=="], + + "@babel/helper-module-imports/@babel/types/@babel/helper-validator-identifier": ["@babel/helper-validator-identifier@7.27.1", "", {}, "sha512-D2hP9eA+Sqx1kBZgzxZh0y1trbuU+JoDkiEwqhQ36nodYqJwyEIhPSdMNd7lOm/4io72luTPWH20Yda0xOuUow=="], + + "@babel/helpers/@babel/types/@babel/helper-validator-identifier": ["@babel/helper-validator-identifier@7.27.1", "", {}, 
"sha512-D2hP9eA+Sqx1kBZgzxZh0y1trbuU+JoDkiEwqhQ36nodYqJwyEIhPSdMNd7lOm/4io72luTPWH20Yda0xOuUow=="], + + "@babel/template/@babel/types/@babel/helper-validator-identifier": ["@babel/helper-validator-identifier@7.27.1", "", {}, "sha512-D2hP9eA+Sqx1kBZgzxZh0y1trbuU+JoDkiEwqhQ36nodYqJwyEIhPSdMNd7lOm/4io72luTPWH20Yda0xOuUow=="], + + "@babel/traverse/@babel/types/@babel/helper-validator-identifier": ["@babel/helper-validator-identifier@7.27.1", "", {}, "sha512-D2hP9eA+Sqx1kBZgzxZh0y1trbuU+JoDkiEwqhQ36nodYqJwyEIhPSdMNd7lOm/4io72luTPWH20Yda0xOuUow=="], + "@lhci/cli/yargs/cliui": ["cliui@6.0.0", "", { "dependencies": { "string-width": "^4.2.0", "strip-ansi": "^6.0.0", "wrap-ansi": "^6.2.0" } }, "sha512-t6wbgtoCXvAzst7QgXxJYqPt0usEfbgQdftEPbLL/cvv6HPE5VgvqCuAIDR0NgU52ds6rFwqrgakNLrHEjCbrQ=="], "@lhci/cli/yargs/y18n": ["y18n@4.0.3", "", {}, "sha512-JKhqTOwSrqNA1NY5lSztJ1GrBiUodLMmIZuLiDaMRJ+itFd+ABVE8XBjOvIWL+rSqNDC74LCSFmlb/U4UZ4hJQ=="], @@ -2410,6 +2463,16 @@ "@sentry/node/https-proxy-agent/agent-base": ["agent-base@6.0.2", "", { "dependencies": { "debug": "4" } }, "sha512-RZNwNclF7+MS/8bDg70amg32dyeZGZxiDuQmZxKLAlQjr3jGyLx+4Kkk58UO7D2QdgFIQCovuSuZESne6RG6XQ=="], + "@types/babel__core/@babel/types/@babel/helper-validator-identifier": ["@babel/helper-validator-identifier@7.27.1", "", {}, "sha512-D2hP9eA+Sqx1kBZgzxZh0y1trbuU+JoDkiEwqhQ36nodYqJwyEIhPSdMNd7lOm/4io72luTPWH20Yda0xOuUow=="], + + "@types/babel__generator/@babel/types/@babel/helper-validator-identifier": ["@babel/helper-validator-identifier@7.27.1", "", {}, "sha512-D2hP9eA+Sqx1kBZgzxZh0y1trbuU+JoDkiEwqhQ36nodYqJwyEIhPSdMNd7lOm/4io72luTPWH20Yda0xOuUow=="], + + "@types/babel__template/@babel/types/@babel/helper-validator-identifier": ["@babel/helper-validator-identifier@7.27.1", "", {}, "sha512-D2hP9eA+Sqx1kBZgzxZh0y1trbuU+JoDkiEwqhQ36nodYqJwyEIhPSdMNd7lOm/4io72luTPWH20Yda0xOuUow=="], + + "@types/babel__traverse/@babel/types/@babel/helper-validator-identifier": ["@babel/helper-validator-identifier@7.27.1", "", {}, 
"sha512-D2hP9eA+Sqx1kBZgzxZh0y1trbuU+JoDkiEwqhQ36nodYqJwyEIhPSdMNd7lOm/4io72luTPWH20Yda0xOuUow=="], + + "astro/@astrojs/markdown-remark/unist-util-visit-parents": ["unist-util-visit-parents@6.0.2", "", { "dependencies": { "@types/unist": "^3.0.0", "unist-util-is": "^6.0.0" } }, "sha512-goh1s1TBrqSqukSc8wrjwWhL0hiJxgA8m4kFxGlQ+8FYQ3C/m11FcTs4YYem7V664AhHVvgoQLk890Ssdsr2IQ=="], + "astro/esbuild/@esbuild/aix-ppc64": ["@esbuild/aix-ppc64@0.25.4", "", { "os": "aix", "cpu": "ppc64" }, "sha512-1VCICWypeQKhVbE9oW/sJaAmjLxhVqacdkvPLEjwlttjfwENRSClS8EjBz0KzRyFSCPDIkuXW34Je/vk7zdB7Q=="], "astro/esbuild/@esbuild/android-arm": ["@esbuild/android-arm@0.25.4", "", { "os": "android", "cpu": "arm" }, "sha512-QNdQEps7DfFwE3hXiU4BZeOV68HHzYwGd0Nthhd3uCkkEKK7/R6MTgM0P7H7FAs5pU/DIWsviMmEGxEoxIZ+ZQ=="], @@ -2456,17 +2519,19 @@ "astro/esbuild/@esbuild/win32-x64": ["@esbuild/win32-x64@0.25.4", "", { "os": "win32", "cpu": "x64" }, "sha512-nOT2vZNw6hJ+z43oP1SPea/G/6AbN6X+bGNhNuq8NtRHy4wsMhw765IKLNmnjek7GvjWBYQ8Q5VBoYTFg9y1UQ=="], - "astro/shiki/@shikijs/core": ["@shikijs/core@3.13.0", "", { "dependencies": { "@shikijs/types": "3.13.0", "@shikijs/vscode-textmate": "^10.0.2", "@types/hast": "^3.0.4", "hast-util-to-html": "^9.0.5" } }, "sha512-3P8rGsg2Eh2qIHekwuQjzWhKI4jV97PhvYjYUzGqjvJfqdQPz+nMlfWahU24GZAyW1FxFI1sYjyhfh5CoLmIUA=="], + "astro/shiki/@shikijs/core": ["@shikijs/core@3.15.0", "", { "dependencies": { "@shikijs/types": "3.15.0", "@shikijs/vscode-textmate": "^10.0.2", "@types/hast": "^3.0.4", "hast-util-to-html": "^9.0.5" } }, "sha512-8TOG6yG557q+fMsSVa8nkEDOZNTSxjbbR8l6lF2gyr6Np+jrPlslqDxQkN6rMXCECQ3isNPZAGszAfYoJOPGlg=="], + + "astro/shiki/@shikijs/engine-javascript": ["@shikijs/engine-javascript@3.15.0", "", { "dependencies": { "@shikijs/types": "3.15.0", "@shikijs/vscode-textmate": "^10.0.2", "oniguruma-to-es": "^4.3.3" } }, "sha512-ZedbOFpopibdLmvTz2sJPJgns8Xvyabe2QbmqMTz07kt1pTzfEvKZc5IqPVO/XFiEbbNyaOpjPBkkr1vlwS+qg=="], - "astro/shiki/@shikijs/engine-javascript": 
["@shikijs/engine-javascript@3.13.0", "", { "dependencies": { "@shikijs/types": "3.13.0", "@shikijs/vscode-textmate": "^10.0.2", "oniguruma-to-es": "^4.3.3" } }, "sha512-Ty7xv32XCp8u0eQt8rItpMs6rU9Ki6LJ1dQOW3V/56PKDcpvfHPnYFbsx5FFUP2Yim34m/UkazidamMNVR4vKg=="], + "astro/shiki/@shikijs/engine-oniguruma": ["@shikijs/engine-oniguruma@3.15.0", "", { "dependencies": { "@shikijs/types": "3.15.0", "@shikijs/vscode-textmate": "^10.0.2" } }, "sha512-HnqFsV11skAHvOArMZdLBZZApRSYS4LSztk2K3016Y9VCyZISnlYUYsL2hzlS7tPqKHvNqmI5JSUJZprXloMvA=="], - "astro/shiki/@shikijs/engine-oniguruma": ["@shikijs/engine-oniguruma@3.13.0", "", { "dependencies": { "@shikijs/types": "3.13.0", "@shikijs/vscode-textmate": "^10.0.2" } }, "sha512-O42rBGr4UDSlhT2ZFMxqM7QzIU+IcpoTMzb3W7AlziI1ZF7R8eS2M0yt5Ry35nnnTX/LTLXFPUjRFCIW+Operg=="], + "astro/shiki/@shikijs/langs": ["@shikijs/langs@3.15.0", "", { "dependencies": { "@shikijs/types": "3.15.0" } }, "sha512-WpRvEFvkVvO65uKYW4Rzxs+IG0gToyM8SARQMtGGsH4GDMNZrr60qdggXrFOsdfOVssG/QQGEl3FnJ3EZ+8w8A=="], - "astro/shiki/@shikijs/langs": ["@shikijs/langs@3.13.0", "", { "dependencies": { "@shikijs/types": "3.13.0" } }, "sha512-672c3WAETDYHwrRP0yLy3W1QYB89Hbpj+pO4KhxK6FzIrDI2FoEXNiNCut6BQmEApYLfuYfpgOZaqbY+E9b8wQ=="], + "astro/shiki/@shikijs/themes": ["@shikijs/themes@3.15.0", "", { "dependencies": { "@shikijs/types": "3.15.0" } }, "sha512-8ow2zWb1IDvCKjYb0KiLNrK4offFdkfNVPXb1OZykpLCzRU6j+efkY+Y7VQjNlNFXonSw+4AOdGYtmqykDbRiQ=="], - "astro/shiki/@shikijs/themes": ["@shikijs/themes@3.13.0", "", { "dependencies": { "@shikijs/types": "3.13.0" } }, "sha512-Vxw1Nm1/Od8jyA7QuAenaV78BG2nSr3/gCGdBkLpfLscddCkzkL36Q5b67SrLLfvAJTOUzW39x4FHVCFriPVgg=="], + "astro/shiki/@shikijs/types": ["@shikijs/types@3.15.0", "", { "dependencies": { "@shikijs/vscode-textmate": "^10.0.2", "@types/hast": "^3.0.4" } }, "sha512-BnP+y/EQnhihgHy4oIAN+6FFtmfTekwOLsQbRw9hOKwqgNy8Bdsjq8B05oAt/ZgvIWWFrshV71ytOrlPfYjIJw=="], - "astro/shiki/@shikijs/types": ["@shikijs/types@3.13.0", "", { 
"dependencies": { "@shikijs/vscode-textmate": "^10.0.2", "@types/hast": "^3.0.4" } }, "sha512-oM9P+NCFri/mmQ8LoFGVfVyemm5Hi27330zuOBp0annwJdKH1kOLndw3zCtAVDehPLg9fKqoEx3Ht/wNZxolfw=="], + "astro/vite/fdir": ["fdir@6.5.0", "", { "peerDependencies": { "picomatch": "^3 || ^4" }, "optionalPeers": ["picomatch"] }, "sha512-tIbYtZbucOs0BRGqPJkshJUYdL+SDH7dVM8gjy+ERp3WAUjLEFJE+02kanyHtwjWOnwrKYBiwAmM0p4kLJAnXg=="], "astro/vite/fsevents": ["fsevents@2.3.3", "", { "os": "darwin" }, "sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw=="], diff --git a/web/platform/package.json b/web/platform/package.json index c34868fa3..9b31d8e09 100644 --- a/web/platform/package.json +++ b/web/platform/package.json @@ -22,7 +22,7 @@ "@react-three/fiber": "^9.1.2", "@tailwindcss/vite": "^4.1.5", "@types/bun": "^1.2.12", - "astro": "5.14.3", + "astro": "5.15.9", "clsx": "^2.1.1", "dotenv": "^17.0.0", "framer-motion": "^12.9.4", diff --git a/web/platform/src/components/media/icons/contributors.tsx b/web/platform/src/components/media/icons/contributors.tsx index 6c44ed6e9..0f1d6d601 100644 --- a/web/platform/src/components/media/icons/contributors.tsx +++ b/web/platform/src/components/media/icons/contributors.tsx @@ -705,63 +705,3 @@ export const Lastmile = (props: PropsOf<"svg">, key: string) => { ); }; - -export const Browserbase = (props: PropsOf<"svg">, key: string) => { - return ( - - - - - - - - - - - - - - ); -}; diff --git a/web/platform/src/components/qwik/components/cards.tsx b/web/platform/src/components/qwik/components/cards.tsx index 7d7a9dc25..72724a092 100644 --- a/web/platform/src/components/qwik/components/cards.tsx +++ b/web/platform/src/components/qwik/components/cards.tsx @@ -58,12 +58,9 @@ export const VideoCard = component$( const pricing = [ { - title: "Starter", + title: "Open Source", items: [ - "Starting at $29/month", - "SOC2 and ISO27001", - "1 TB of cache transfer", - "100 cores of remote builds", + "Free!", "Community 
Support", ], cta: { @@ -75,9 +72,7 @@ const pricing = [ title: "Enterprise", items: [ "Custom pricing", - "SOC2 and ISO27001", - "Unlimited cache transfer", - "Unlimited cores of remote builds", + "On premise only", "Dedicated enterprise support", ], cta: { diff --git a/web/platform/src/components/qwik/components/codeTabs.tsx b/web/platform/src/components/qwik/components/codeTabs.tsx index c60ed9fdb..be6752804 100644 --- a/web/platform/src/components/qwik/components/codeTabs.tsx +++ b/web/platform/src/components/qwik/components/codeTabs.tsx @@ -45,14 +45,14 @@ export const CodeTabs = component$(
               
                 curl -O \{"\n"}
-                https://raw.githubusercontent.com/TraceMachina/nativelink/v0.6.0/nativelink-config/examples/basic_cas.json5
+                https://raw.githubusercontent.com/TraceMachina/nativelink/v0.7.5/nativelink-config/examples/basic_cas.json5
                 {"\n\n"}# See{"\n"}
                 https://github.com/TraceMachina/nativelink/pkgs/container/nativelink
                 {"\n\n"}
                 docker run \{"\n"}
                 -v $(pwd)/basic_cas.json:/config \{"\n"}
                 -p 50051:50051 \{"\n"}
-                ghcr.io/tracemachina/nativelink:v0.6.0 \{"\n"}
+                ghcr.io/tracemachina/nativelink:v0.7.5 \{"\n"}
                 config
               
             
@@ -61,12 +61,12 @@ export const CodeTabs = component$(
               
                 curl.exe -O \{"\n"}
-                https://raw.githubusercontent.com/TraceMachina/nativelink/v0.6.0/nativelink-config/examples/basic_cas.json5
+                https://raw.githubusercontent.com/TraceMachina/nativelink/v0.7.5/nativelink-config/examples/basic_cas.json5
                 {"\n\n"}
                 docker run \{"\n"}
                 -v $(pwd)/basic_cas.json:/config \{"\n"}
                 -p 50051:50051 \{"\n"}
-                ghcr.io/tracemachina/nativelink:v0.6.0 \{"\n"}
+                ghcr.io/tracemachina/nativelink:v0.7.5 \{"\n"}
                 config
               
             
diff --git a/web/platform/src/components/qwik/components/header.tsx b/web/platform/src/components/qwik/components/header.tsx index a6f48975b..307d612be 100644 --- a/web/platform/src/components/qwik/components/header.tsx +++ b/web/platform/src/components/qwik/components/header.tsx @@ -179,7 +179,7 @@ const Widgets = component$(() => { - Demo now - ); }); diff --git a/web/platform/src/components/qwik/sections/hero.tsx b/web/platform/src/components/qwik/sections/hero.tsx index f9370557f..30f8b9bb5 100644 --- a/web/platform/src/components/qwik/sections/hero.tsx +++ b/web/platform/src/components/qwik/sections/hero.tsx @@ -75,14 +75,6 @@ export const Hero = component$(() => {
- - Sign up today - -