From d2b040e5c763e0e69f5dd62f8bab3f0dbdeafca1 Mon Sep 17 00:00:00 2001 From: Ben Frederickson Date: Tue, 10 Mar 2026 16:52:56 -0700 Subject: [PATCH] Coderabbit integration Following on the work of https://github.com/rapidsai/rmm/pull/2244 and https://github.com/rapidsai/cuml/pull/7725, this enables general instructions and configuration to enable coderabbit codereviews on cuvs. --- .coderabbit.yaml | 77 ++++++++++ .github/CODEOWNERS | 1 + .github/workflows/pr.yaml | 14 ++ cpp/agents.md | 311 ++++++++++++++++++++++++++++++++++++++ python/agents.md | 184 ++++++++++++++++++++++ 5 files changed, 587 insertions(+) create mode 100644 .coderabbit.yaml create mode 100644 cpp/agents.md create mode 100644 python/agents.md diff --git a/.coderabbit.yaml b/.coderabbit.yaml new file mode 100644 index 0000000000..89c7494129 --- /dev/null +++ b/.coderabbit.yaml @@ -0,0 +1,77 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +reviews: + profile: chill + high_level_summary: true + high_level_summary_in_walkthrough: true + poem: false + auto_review: + enabled: true + drafts: false + base_branches: + - "^main$" + - "^release/.*" + - "^hotfix/.*" + ignore_usernames: ["rapids-bot", "GPUtester", "nv-automation-bot", "copy-pr-bot"] + tools: + markdownlint: + enabled: true + shellcheck: + enabled: true + gitleaks: + enabled: true + sequence_diagrams: false + collapse_walkthrough: true + + # Reduce noise from status messages + request_changes_workflow: false + review_status: false + + # Path-specific review instructions + # Note: Detailed C++/CUDA and Python review guidelines are in cpp/agents.md and python/agents.md + path_instructions: + - path: "docs/**/*" + instructions: | + For documentation changes, focus on: + - Accuracy: Verify code examples compile and run correctly + - Completeness: Check if API changes (parameters, return values, errors) are documented + - Clarity: Flag confusing 
explanations, missing prerequisites, or unclear examples + - Consistency: Version numbers, parameter types, and terminology match code + - Missing docs: If PR changes public APIs without updating docs, flag as HIGH priority + + - path: "c/include/cuvs/**/*" + instructions: | + For public C API headers, additionally check: + - Doxygen documentation for all public functions/classes + - API changes flagged for docs/ updates + - Breaking changes require deprecation warnings and migration guide updates + + - path: "cpp/include/cuvs/**/*" + instructions: | + For public C++ API headers, additionally check: + - Doxygen documentation for all public functions/classes + - API changes flagged for docs/ updates + - Breaking changes require deprecation warnings and migration guide updates + + - path: "notebooks/**/*" + instructions: | + For example notebooks: + - Verify code examples match current API (parameter names, return types) + - Flag outdated imports or deprecated API usage + + - path: "ci/**/*" + instructions: | + For CI/build scripts: + - Check for proper conda environment handling + - Verify GPU availability checks before tests + - Check for proper error handling and meaningful error messages + +knowledge_base: + opt_out: false + code_guidelines: + filePatterns: + - "cpp/agents.md" + - "python/agents.md" + - "docs/source/contributing.md" + - "docs/source/developer_guide.md" diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 72a854b3df..a9c9d7a26e 100755 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -31,6 +31,7 @@ cpp/scripts/run-cmake-format.sh @rapidsai/cuvs-cmake-codeowners /.github/ @rapidsai/ci-codeowners /ci/ @rapidsai/ci-codeowners /.shellcheckrc @rapidsai/ci-codeowners +/.coderabbit.yaml @rapidsai/ci-codeowners #packaging code owners /.pre-commit-config.yaml @rapidsai/packaging-codeowners diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 7aec55ed37..da6e5544f0 100644 --- a/.github/workflows/pr.yaml +++ 
b/.github/workflows/pr.yaml @@ -67,6 +67,8 @@ jobs: files_yaml: | build_docs: - '**' + - '!**/*/agents.md' + - '!.coderabbit.yaml' - '!.devcontainer/**' - '!.github/CODEOWNERS' - '!.github/ISSUE_TEMPLATE/**' @@ -89,6 +91,8 @@ jobs: - '!thirdparty/LICENSES/**' test_cpp: - '**' + - '!**/*/agents.md' + - '!.coderabbit.yaml' - '!.devcontainer/**' - '!.github/CODEOWNERS' - '!.github/ISSUE_TEMPLATE/**' @@ -123,6 +127,8 @@ jobs: - '!thirdparty/LICENSES/**' test_java: - '**' + - '!**/*/agents.md' + - '!.coderabbit.yaml' - '!.devcontainer/**' - '!.github/CODEOWNERS' - '!.github/ISSUE_TEMPLATE/**' @@ -156,6 +162,8 @@ jobs: - '!thirdparty/LICENSES/**' test_python_conda: - '**' + - '!**/*/agents.md' + - '!.coderabbit.yaml' - '!.devcontainer/**' - '!.github/CODEOWNERS' - '!.github/ISSUE_TEMPLATE/**' @@ -193,6 +201,8 @@ jobs: - '!thirdparty/LICENSES/**' test_python_wheels: - '**' + - '!**/*/agents.md' + - '!.coderabbit.yaml' - '!.devcontainer/**' - '!.github/CODEOWNERS' - '!.github/ISSUE_TEMPLATE/**' @@ -235,6 +245,8 @@ jobs: - '!thirdparty/LICENSES/**' test_rust: - '**' + - '!**/*/agents.md' + - '!.coderabbit.yaml' - '!.devcontainer/**' - '!.github/CODEOWNERS' - '!.github/ISSUE_TEMPLATE/**' @@ -272,6 +284,8 @@ jobs: - '!thirdparty/LICENSES/**' test_go: - '**' + - '!**/*/agents.md' + - '!.coderabbit.yaml' - '!.devcontainer/**' - '!.github/CODEOWNERS' - '!.github/ISSUE_TEMPLATE/**' diff --git a/cpp/agents.md b/cpp/agents.md new file mode 100644 index 0000000000..b7182cc563 --- /dev/null +++ b/cpp/agents.md @@ -0,0 +1,311 @@ +# AI Code Review Guidelines - cuVS C++/CUDA + +**Role**: Act as a principal engineer with 10+ years experience in GPU computing and high-performance numerical computing. Focus ONLY on CRITICAL and HIGH issues. + +**Target**: Sub-3% false positive rate. Be direct, concise, minimal. + +**Context**: cuVS C++ layer provides GPU-accelerated vector search implementations using CUDA, with dependencies on RAFT, RMM, libcudacxx, thrust, and CUB. 
+ +## IGNORE These Issues + +- Style/formatting (clang-format handles this) +- Minor naming preferences (unless truly misleading) +- Personal taste on implementation (unless impacts maintainability) +- Nits that don't affect functionality +- Already-covered issues (one comment per root cause) + +## CRITICAL Issues (Always Comment) + +### GPU/CUDA Errors +- Unchecked CUDA errors (kernel launches, memory operations, synchronization) +- Race conditions in GPU kernels (shared memory, atomics, warps) +- Device memory leaks (cudaMalloc/cudaFree imbalance, leaked streams/events) +- Invalid memory access (out-of-bounds, use-after-free, host/device confusion) +- Missing CUDA synchronization causing non-deterministic failures +- Kernel launch with zero blocks/threads or invalid grid/block dimensions +- **Missing explicit stream creation for concurrent operations** (reusing default stream, missing stream isolation) +- **Incorrect stream lifecycle management** (using destroyed streams, not creating dedicated streams for concurrent ops) + +### Algorithm Correctness +- Logic errors in vector search kernels +- Numerical instability causing wrong results (overflow, underflow, precision loss) +- Incorrect gradient computations or convergence criteria +- **Data layout bugs** (incorrect row-major vs column-major assumptions) + +### Resource Management +- GPU memory leaks (device allocations, managed memory, pinned memory) +- CUDA stream/event leaks or improper cleanup +- Missing RAII or proper cleanup. Including in exception paths. 
+- Resource exhaustion (GPU memory) + +### API Breaking Changes +- C++ API changes without proper deprecation warnings +- Changes to data structures exposed in public headers (`cpp/include/cuvs/`) +- Breaking changes to algorithm behavior + +## HIGH Issues (Comment if Substantial) + +### Performance Issues +- Inefficient GPU kernel launches (low occupancy, poor memory access patterns) +- Unnecessary host-device synchronization blocking GPU pipeline +- Suboptimal memory access patterns (non-coalesced, strided, unaligned) +- Excessive memory allocations in hot paths +- Warp divergence in compute-heavy kernels +- Shared memory bank conflicts + +### Numerical Stability +- Floating-point operations prone to catastrophic cancellation +- Missing checks for division by zero or near-zero values +- Ill-conditioned matrix operations without preconditioning +- Accumulation errors in iterative algorithms +- Unsafe casting between numeric types (double→float with potential precision loss) +- Missing epsilon comparisons for floating-point equality checks +- **Numerical edge cases** (near-zero eigenvalues, degenerate matrices, extreme values) + +### Concurrency & Thread Safety +- Race conditions in multi-GPU operations +- Improper CUDA stream management causing false dependencies +- Deadlock potential in resource acquisition +- Thread-unsafe use of global/static variables +- **Concurrent operations sharing streams incorrectly** (multi-GPU without proper isolation) +- **Stream reuse across independent operations** (causing unwanted serialization or race conditions) + +### Design & Architecture +- Hard-coded GPU device IDs or resource limits +- Inappropriate use of exceptions in performance-critical paths +- Significant code duplication (3+ occurrences). Including in kernel logic. 
+- Reinventing functionality already available in RAFT, RMM, libcudacxx, thrust, or CUB + +### Test Quality +- Missing validation of numerical correctness +- **Using external datasets** (tests must not depend on external resources; use synthetic data or bundled datasets) + +## MEDIUM Issues (Comment Selectively) + +- Missing input validation (negative dimensions, null pointers) +- Deprecated CUDA API usage +- **Unclear data format in function parameters** (ambiguous row-major or column-major) + +## Review Protocol + +1. **CUDA correctness**: Errors checked? Memory safety? Race conditions? Synchronization? +2. **Algorithm correctness**: Does the kernel logic produce correct results? Numerical stability? +3. **Resource management**: GPU memory leaks? Stream/event cleanup? +4. **Performance**: GPU bottlenecks? Unnecessary sync? Memory access patterns? +5. **API stability**: Breaking changes to C++ APIs? +6. **Data layout**: Row/column major handled correctly? +7. **Stream lifecycle**: Are CUDA streams explicitly created/destroyed for concurrent operations? +8. **Ask, don't tell**: "Have you considered X?" not "You should do X" + +## Quality Threshold + +Before commenting, ask: +1. Is this actually wrong/risky, or just different? +2. Would this cause a real problem (crash, wrong results, leak)? +3. Does this comment add unique value? 
+**If no to any: Skip the comment.** + +## Output Format + +- Use severity labels: CRITICAL, HIGH, MEDIUM +- Be concise: One-line issue summary + one-line impact +- Provide code suggestions when you have concrete fixes +- No preamble or sign-off + +## Examples to Follow + +**CRITICAL** (GPU memory leak): +``` +CRITICAL: GPU memory leak in fit() + +Issue: Device memory allocated but never freed on error path +Why: Causes GPU OOM on repeated calls + +Suggested fix: +if (cudaMalloc(&d_data, size) != cudaSuccess) { + cudaFree(d_centroids); + return ERROR_CODE; +} +``` + +**CRITICAL** (unchecked CUDA error): +``` +CRITICAL: Unchecked kernel launch + +Issue: Kernel launch error not checked +Why: Subsequent operations assume success, causing silent corruption + +Suggested fix: +myKernel<<<grid, block>>>(args); +RAFT_CUDA_TRY(cudaGetLastError()); +``` + +**HIGH** (numerical stability): +``` +HIGH: Potential division by near-zero + +Issue: No epsilon check before division in distance computation +Why: Can produce Inf/NaN values corrupting results +Consider: Add epsilon threshold check or use safe division helper +``` + +**HIGH** (performance issue): +``` +HIGH: Unnecessary synchronization in hot path + +Issue: cudaDeviceSynchronize() or raft::resource::sync_stream() inside iteration loop +Why: Blocks GPU pipeline, 10x slowdown on benchmarks +Consider: Move sync outside loop or use streams with events +``` + +**CRITICAL** (data layout mismatch): +``` +CRITICAL: Incorrect memory layout assumption in kernel + +Issue: Kernel assumes row-major data but input is column-major +Why: Memory access pattern produces wrong results +Impact: Silent data corruption + +Suggested fix: +// Check and handle data layout explicitly +if (input.is_column_major()) { + // Use column-major kernel variant +} +``` + +**HIGH** (missing stream isolation): +``` +HIGH: Multi-GPU operation missing dedicated streams + +Issue: Multi-GPU operation uses default stream without per-device streams +Why: Can cause 
serialization across devices, race conditions, or deadlocks + +Suggested fix: +cudaStream_t per_device_stream; +cudaStreamCreate(&per_device_stream); +// Use per_device_stream for this GPU's operations +// cudaStreamDestroy(per_device_stream) in cleanup +``` + +## Examples to Avoid + +**Boilerplate** (avoid): +- "CUDA Best Practices: Using streams improves concurrency..." +- "Memory Management: Proper cleanup of GPU resources is important..." + +**Subjective style** (ignore): +- "Consider using auto here instead of explicit type" +- "This function could be split into smaller functions" + +--- + +## C++/CUDA-Specific Considerations + +**Error Handling**: +- Use RAFT macros: `RAFT_CUDA_TRY`, `RAFT_CUBLAS_TRY`, `RAFT_CUSOLVER_TRY` +- Every CUDA call must have error checking (kernel launches, memory ops, sync) +- Use `RAFT_CUDA_TRY_NO_THROW` in destructors + +**Memory Management**: +- Use RMM for device memory allocations where possible +- Use `raft::resources` for stream and allocator management +- Prefer RAII patterns (`rmm::device_uvector`, `rmm::device_buffer`) + +**Stream Management**: +- Get streams from `raft::resource::get_cuda_stream()` +- For multi-stream operations, use `handle.get_internal_stream(idx)` +- Concurrent operations (multi-GPU, async ops) must have dedicated streams +- Clearly document stream lifecycle (who creates, who destroys) + +**Threading**: +- Only OpenMP is allowed for host threading +- Algorithms should be thread-safe with different `raft::handle_t` instances +- Use `raft::stream_syncer` for proper stream ordering + +**Public API** (`cpp/include/cuvs/`): +- Functions must be stateless (POD types, `raft::resources`, pointers to POD, mdspan) +- Doxygen documentation required for all public functions +- API changes require deprecation warnings + +--- + +## Common Bug Patterns + +### 1. 
Memory Layout Confusion +**Pattern**: Incorrect row-major vs column-major assumptions + +**Red flags**: +- Direct pointer access without verifying data layout +- Kernel assuming row-major when data might be column-major +- Missing layout parameter in function signatures + +### 2. CUDA Stream Lifecycle Issues +**Pattern**: Missing explicit stream creation for concurrent operations + +**Red flags**: +- Multi-GPU operations without dedicated stream per device +- Stream creation inside loop but destruction outside loop +- Using `nullptr` or default stream for operations that need isolation +- Missing `cudaStreamDestroy` for explicitly created streams + +### 3. GPU Memory Leaks +**Pattern**: Device memory allocated but not properly freed + +**Red flags**: +- cudaMalloc without corresponding cudaFree +- Temporary GPU buffers allocated per iteration without cleanup +- Exception paths skipping memory cleanup +- Missing RAII or smart pointers for GPU memory + +### 4. Numerical Instability in Kernels +**Pattern**: Incorrect floating-point handling in distance/kernel computations + +**Red flags**: +- Division without epsilon check +- Not handling zero-norm vectors +- Accumulation without compensation (Kahan summation) +- Unsafe type casting (double→float) + +--- + +## Code Review Checklists + +### When Reviewing CUDA Kernels +- [ ] Are CUDA errors checked after kernel launch (with peek)? +- [ ] Is shared memory usage within limits and avoiding bank conflicts? +- [ ] Is shared memory used when clearly possible? +- [ ] Is thread synchronization done correctly? Are any __syncthreads calls unnecessary, misplaced or missing? +- [ ] Is memory access coalesced? +- [ ] Is memory aligned? +- [ ] Is there serial work inside a thread? +- [ ] Are warp divergence issues minimized? +- [ ] Are grid/block dimensions validated? + +### When Reviewing Multi-GPU Operations +- [ ] Is stream lifecycle clearly documented? +- [ ] Are independent GPU operations using dedicated streams? 
+- [ ] Is `cudaSetDevice` called before device-specific operations? +- [ ] Are stream errors checked? + +### When Reviewing Memory Operations +- [ ] Is data layout (row-major vs column-major) explicitly handled? +- [ ] Are device allocations paired with deallocations? +- [ ] Is RAII used for GPU resources? +- [ ] Are exception paths cleaning up resources? + +### When Reviewing Numerical Computations +- [ ] Are edge cases handled (zero-norm, identical points)? +- [ ] Are divisions protected against near-zero denominators? +- [ ] Are epsilon tolerances used for floating-point comparisons? +- [ ] Is numerical stability maintained (avoiding overflow/underflow)? + +### When Reviewing Tests +- [ ] Are all datasets synthetic or bundled (no external resource dependencies)? +- [ ] Is numerical correctness validated? +- [ ] Are edge cases tested (empty, single element, extreme values)? + +--- + +**Remember**: Focus on correctness and safety. Catch real bugs (crashes, wrong results, leaks), +ignore style preferences. For cuVS C++: CUDA correctness and numerical stability are paramount. diff --git a/python/agents.md b/python/agents.md new file mode 100644 index 0000000000..901b1df5c1 --- /dev/null +++ b/python/agents.md @@ -0,0 +1,184 @@ +# AI Code Review Guidelines - cuVS Python + +**Role**: Act as a principal engineer with 10+ years experience in Python systems programming and GPU memory management. Focus ONLY on CRITICAL and HIGH issues. + +**Target**: Sub-3% false positive rate. Be direct, concise, minimal. + +**Context**: cuVS Python layer provides pythonic interfaces for vector search on the GPU. 
+ +## IGNORE These Issues + +- Style/formatting (pre-commit hooks handle this) +- Minor naming preferences (unless truly misleading) +- Personal taste on implementation (unless impacts maintainability) +- Nits that don't affect functionality +- Already-covered issues (one comment per root cause) + +## CRITICAL Issues (Always Comment) + +### Memory Safety +- Memory leaks from improper resource management +- Use-after-free scenarios in device memory handling +- Incorrect lifetime management of memory resources +- **Cython memory management errors** (missing `del`, incorrect reference counting) + +### API Breaking Changes +- Python API changes breaking backward compatibility +- Changes to public interfaces +- Removing or renaming public methods/attributes without deprecation +- We usually require at least one release cycle for deprecations + +### Integration Errors +- Incorrect handling of CuPy/Numba/PyTorch array interfaces +- Silent data corruption from type coercion +- Missing validation causing crashes on invalid input +- **Incorrect CUDA stream handling in Python bindings** + +### Resource Management +- GPU memory leaks from Python objects +- Missing cleanup in `__del__` or context managers +- Circular references preventing garbage collection +- **Incorrect ownership semantics** between Python and C layer + +## HIGH Issues (Comment if Substantial) + +### Memory Resource Management +- Incorrect memory resource lifecycle +- Missing validation of memory resource parameters +- Improper upstream resource handling in Python + +### Input Validation +- Missing size/type checks +- Not handling edge cases + +### Test Quality +- Missing edge case coverage (zero-size, alignment) +- **Using external datasets** (tests must not depend on external resources) +- Missing tests for different array types (CuPy, Numba) +- **Using test classes instead of standalone functions** (cuVS prefers `test_foo_bar()` functions over `class TestFoo`) + +### Documentation +- Missing or incorrect 
docstrings for public methods +- Parameters not documented +- Missing usage examples for memory resources +- **New public API not added to docs** + +## MEDIUM Issues (Comment Selectively) + +- Edge cases not handled +- Missing input validation for edge cases +- Deprecated API usage +- Minor inefficiencies in non-critical code paths + +## Review Protocol + +1. **Memory safety**: Resource cleanup correct? Lifetime management? +2. **API stability**: Breaking changes to Python APIs? +3. **Integration**: CuPy/Numba compatibility maintained? +4. **Input validation**: Size/type checks present? +5. **Documentation**: Public API documented? +6. **Ask, don't tell**: "Have you considered X?" not "You should do X" + +## Quality Threshold + +Before commenting, ask: +1. Is this actually wrong/risky, or just different? +2. Would this cause a real problem (crash, leak, API break)? +3. Does this comment add unique value? + +**If no to any: Skip the comment.** + +## Output Format + +- Use severity labels: CRITICAL, HIGH, MEDIUM +- Be concise: One-line issue summary + one-line impact +- Provide code suggestions when you have concrete fixes +- No preamble or sign-off + +## Python-Specific Considerations + +**Memory Resource Lifecycle**: +- Memory resources should use context managers where appropriate +- Ensure proper cleanup in `__del__` methods +- Document ownership semantics clearly + +**Cython Bindings**: +- Use proper memory management (`__dealloc__`) +- Handle exceptions correctly across Python/C++ boundary +- Ensure GIL handling is correct for CUDA operations + +**Array Interfaces**: +- Support `__cuda_array_interface__` for interoperability with CuPy and PyTorch +- Handle different array types (CuPy, Numba DeviceNDArray) +- Preserve array attributes where appropriate + +**Error Messages**: +- Error messages must be clear and actionable for users +- Include expected vs actual values where helpful + +**Testing**: +- Test different array types +- Use standalone `test_foo_bar()` 
functions, not test classes +- Use synthetic data, never external resources + +--- + +## Common Bug Patterns + +### 1. Resource Cleanup Issues +**Pattern**: GPU memory not properly released + +**Red flags**: +- Missing `__del__` or `__dealloc__` methods +- Cleanup not happening on exception paths +- Circular references preventing garbage collection + +### 2. Array Interface Errors +**Pattern**: Incorrect `__cuda_array_interface__` implementation + +**Red flags**: +- Wrong shape/strides in interface dict +- Missing required keys +- Incorrect data pointer + +### 3. Lifetime Management +**Pattern**: Python object outliving underlying C++ resource + +**Red flags**: +- Weak references to memory resources +- Callbacks holding stale references +- Missing ref-counting in Cython + +### 4. Stream Handling +**Pattern**: Incorrect stream semantics in Python bindings + +**Red flags**: +- Missing stream parameter propagation +- Incorrect stream synchronization +- Default stream used when explicit stream expected + +--- + +## Code Review Checklists + +### When Reviewing Memory Resource Classes +- [ ] Is cleanup implemented in `__del__` or context manager? +- [ ] Is ownership clearly documented? +- [ ] Are edge cases handled (zero-size)? +- [ ] Is the API consistent with existing cuVS patterns? + +### When Reviewing Cython Code +- [ ] Is `__dealloc__` implemented correctly? +- [ ] Are exceptions handled across the Python/C++ boundary? +- [ ] Is GIL handling correct? +- [ ] Is memory management correct (no leaks, no double-free)? +- [ ] Are errors from cuVS C functions being checked with `check_cuvs`? + +### When Reviewing Tests +- [ ] Are different array types tested? +- [ ] Are tests written as standalone functions? + +--- + +**Remember**: Focus on correctness and API compatibility. Catch real bugs (leaks, crashes, API +breaks), ignore style preferences. For cuVS Python: memory safety and proper resource lifecycle are paramount.